Source code for dmelon.utils
"""
Helper functions that fit into a more general category
"""
import json
import os
import warnings
from typing import Optional
import geopandas as gpd
import numpy as np
import pandas as pd
from geopandas.tools import sjoin
def check_folder(base_path: str, name: Optional[str] = None) -> None:
    """
    Create a folder if it does not exist
    """
    if name is not None:
        out_path = os.path.join(base_path, str(name))
    else:
        out_path = base_path

    if not os.path.exists(out_path):
        os.makedirs(out_path)
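# Example usage (a minimal sketch; the "outputs"/"exp01" paths are hypothetical):
#
#     check_folder("outputs")           # creates ./outputs if missing
#     check_folder("outputs", "exp01")  # creates ./outputs/exp01 if missing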
def load_json(path: str) -> dict:
    """
    Load the contents of a JSON file into a Python dictionary
    """
    with open(path) as f:
        content = json.load(f)
    return content
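# Example usage (a sketch; "config.json" and the "datasets" key are hypothetical):
#
#     settings = load_json("config.json")
#     print(settings["datasets"])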
def findPointsInPolys(
    pandas_df: pd.DataFrame,
    shape_df: gpd.GeoDataFrame,
    crs: str = "EPSG:4326",
) -> gpd.GeoDataFrame:
    """
    Filter the rows of a DataFrame by their spatial location within a
    GeoDataFrame
    """
    argo_geodf = gpd.GeoDataFrame(
        pandas_df,
        geometry=gpd.points_from_xy(pandas_df.longitude, pandas_df.latitude, crs=crs),
    )
    # Return the spatial join to filter out points that fall outside the shapefile
    return sjoin(argo_geodf, shape_df, predicate="within", how="inner")
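# Example usage (a sketch, assuming a DataFrame with `longitude` and `latitude`
# columns and any polygon GeoDataFrame; the sample data and shapefile path
# below are hypothetical):
#
#     floats = pd.DataFrame({"longitude": [-77.0, -150.0], "latitude": [-12.0, 0.0]})
#     polys = gpd.read_file("coastline_polygons.shp")
#     inside = findPointsInPolys(floats, polys)  # keeps only the rows whose
#                                                # point falls within a polygon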
# Piece of code from xmip that is still being tested;
# it is not clear how it affects other CMIP6 models.
def _interp_nominal_lon(lon_1d):
    """Interpolate the nominal longitude values to remove NaNs"""
    x = np.arange(len(lon_1d))
    idx = np.isnan(lon_1d)
    # The periodicity of the coordinates should be the length of the array,
    # not a fixed 360, since the base coordinate is constructed as a range
    # from 0 to the length of the array. This is the behavior under test.
    return np.interp(x, x[~idx], lon_1d[~idx], period=len(lon_1d))
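# Illustration: an interior NaN is filled by interpolating over the index,
# treating the array as periodic with period equal to its length:
#
#     _interp_nominal_lon(np.array([0.0, 1.0, np.nan, 3.0, 4.0]))
#     # -> array([0., 1., 2., 3., 4.])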
def replace_x_y_nominal_lat_lon(ds):
    """Approximate the dimensional values of x and y with mean lat and lon at the equator"""
    ds = ds.copy()

    def maybe_fix_non_unique(data, pad=False):
        """Remove duplicate values by linear interpolation if values are
        non-unique. If `pad` is True and the two points at either end are the
        same, pad with -90 or 90. This is only applicable to lat values."""
        if len(data) == len(np.unique(data)):
            return data
        else:
            # if a duplicate sits at an end, replace it with the pole value
            if pad:
                if len(np.unique([data[0:2]])) < 2:
                    data[0] = -90
                if len(np.unique([data[-2:]])) < 2:
                    data[-1] = 90

            ii_range = np.arange(len(data))
            _, indices = np.unique(data, return_index=True)
            double_idx = np.array([ii not in indices for ii in ii_range])
            # print(f"non-unique values found at: {ii_range[double_idx]}")
            data[double_idx] = np.interp(
                ii_range[double_idx],
                ii_range[~double_idx],
                data[~double_idx],
            )
            return data
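    # Illustration: a repeated interior value is replaced by linear
    # interpolation over the index, e.g.
    #     maybe_fix_non_unique(np.array([0.0, 1.0, 1.0, 3.0]))
    #     # -> array([0., 1., 2., 3.])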
if "x" in ds.dims and "y" in ds.dims:
# define 'nominal' longitude/latitude values
# latitude is defined as the max value of `lat` in the zonal direction
# longitude is taken from the `middle` of the meridonal direction, to
# get values close to the equator
# pick the nominal lon/lat values from the eastern
# and southern edge, and
eq_idx = len(ds.y) // 2
nominal_x = ds.isel(y=eq_idx).lon.load()
nominal_y = ds.lat.max("x").load()
# interpolate nans
# Special treatment for gaps in longitude
nominal_x = _interp_nominal_lon(nominal_x.data)
nominal_y = nominal_y.interpolate_na("y").data
# eliminate non unique values
# these occour e.g. in "MPI-ESM1-2-HR"
nominal_y = maybe_fix_non_unique(nominal_y)
nominal_x = maybe_fix_non_unique(nominal_x)
ds = ds.assign_coords(x=nominal_x, y=nominal_y)
ds = ds.sortby("x")
ds = ds.sortby("y")
# do one more interpolation for the x values, in case the boundary values were
# affected
ds = ds.assign_coords(
x=maybe_fix_non_unique(ds.x.load().data),
y=maybe_fix_non_unique(ds.y.load().data, pad=True),
)
else:
warnings.warn(
"No x and y found in dimensions for source_id:%s. This likely means that you forgot to rename the dataset or this is the German unstructured model"
% ds.attrs["source_id"],
)
return ds
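# Example usage (a sketch; `ds` stands for any CMIP6 dataset whose horizontal
# dimensions are already named `x`/`y` and that carries 2-D `lon`/`lat`
# coordinates; the selection bounds are arbitrary):
#
#     ds = replace_x_y_nominal_lat_lon(ds)
#     ds.sel(x=slice(120, 150), y=slice(-20, 0))  # select by nominal lon/lat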