Source code for dmelon.utils

"""
Helper functions that fit into a more general category
"""

import json
import os
import warnings
from typing import Optional

import geopandas as gpd
import numpy as np
import pandas as pd
from geopandas.tools import sjoin


def check_folder(base_path: str, name: Optional[str] = None) -> None:
    """
    Create a folder if it does not exist
    """
    if name is not None:
        out_path = os.path.join(base_path, str(name))
    else:
        out_path = base_path
    if not os.path.exists(out_path):
        os.makedirs(out_path)
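
# Usage sketch (hypothetical paths, for illustration only):
#
#     check_folder("./data")              # creates ./data if it is missing
#     check_folder("./data", name="raw")  # creates ./data/raw
#     check_folder("./data", name=2024)   # non-str names are cast via str()
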
def load_json(path: str) -> dict:
    """
    Load the contents of a JSON file into a python dictionary
    """
    with open(path) as f:
        content = json.load(f)
    return content
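
# Usage sketch (assuming a hypothetical config.json containing
# {"lat": -12.0, "lon": -77.0}):
#
#     config = load_json("config.json")
#     config["lat"]    # -> -12.0
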
def findPointsInPolys(
    pandas_df: pd.DataFrame,
    shape_df: gpd.GeoDataFrame,
    crs: str = "EPSG:4326",
) -> gpd.GeoDataFrame:
    """
    Filter a DataFrame by the spatial location of its points
    within a GeoDataFrame
    """
    argo_geodf = gpd.GeoDataFrame(
        pandas_df,
        geometry=gpd.points_from_xy(
            pandas_df.longitude, pandas_df.latitude, crs=crs
        ),
    )
    # Return the spatial join to filter out values outside the shapefile
    return sjoin(argo_geodf, shape_df, predicate="within", how="inner")
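
# Usage sketch: keep only the points that fall inside the polygons of a
# shapefile. The DataFrame must have `longitude` and `latitude` columns;
# the shapefile path is hypothetical.
#
#     floats = pd.DataFrame(
#         {"longitude": [-85.0, -120.0], "latitude": [-10.0, 40.0]}
#     )
#     region = gpd.read_file("region.shp")
#     inside = findPointsInPolys(floats, region)
#     # `inside` keeps the rows whose points lie within `region`, joined
#     # with the attributes of the containing polygon.
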
# Piece of code from xmip that I am testing
# not sure how it affects other cmip6 models
def _interp_nominal_lon(lon_1d):
    """Interpolate the nominal longitude values to remove nans"""
    x = np.arange(len(lon_1d))
    idx = np.isnan(lon_1d)
    # the periodicity of the coordinates should be the length of the array,
    # not a fixed 360, since the base coordinate is constructed as a range
    # from 0 to the length of the array
    # this is what I am testing
    return np.interp(x, x[~idx], lon_1d[~idx], period=len(lon_1d))
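
# Minimal sketch of the periodic gap filling with made-up values:
#
#     lon = np.array([0.0, 1.0, np.nan, 3.0, np.nan])
#     _interp_nominal_lon(lon)
#     # -> array([0. , 1. , 2. , 3. , 1.5])
#     # position 2 is filled between its neighbours; position 4 wraps
#     # around (period=5) and is interpolated between (3, 3.0) and the
#     # first point shifted to (5, 0.0).
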
def replace_x_y_nominal_lat_lon(ds):
    """Approximate the dimensional values of x and y with the mean lat and
    lon at the equator"""
    ds = ds.copy()

    def maybe_fix_non_unique(data, pad=False):
        """Remove duplicate values by linear interpolation if values are
        non-unique. If `pad` is True and the last two points are the same,
        pad with -90 or 90. This is only applicable to lat values"""
        if len(data) == len(np.unique(data)):
            return data
        else:
            # pad each end with the other end.
            if pad:
                if len(np.unique([data[0:2]])) < 2:
                    data[0] = -90
                if len(np.unique([data[-2:]])) < 2:
                    data[-1] = 90

            ii_range = np.arange(len(data))
            _, indices = np.unique(data, return_index=True)
            double_idx = np.array([ii not in indices for ii in ii_range])
            # print(f"non-unique values found at:{ii_range[double_idx]})")
            data[double_idx] = np.interp(
                ii_range[double_idx],
                ii_range[~double_idx],
                data[~double_idx],
            )
        return data

    if "x" in ds.dims and "y" in ds.dims:
        # define 'nominal' longitude/latitude values
        # latitude is defined as the max value of `lat` in the zonal direction
        # longitude is taken from the `middle` of the meridional direction, to
        # get values close to the equator
        # pick the nominal lon/lat values from the eastern and southern edge
        eq_idx = len(ds.y) // 2

        nominal_x = ds.isel(y=eq_idx).lon.load()
        nominal_y = ds.lat.max("x").load()

        # interpolate nans
        # Special treatment for gaps in longitude
        nominal_x = _interp_nominal_lon(nominal_x.data)
        nominal_y = nominal_y.interpolate_na("y").data

        # eliminate non-unique values
        # these occur e.g. in "MPI-ESM1-2-HR"
        nominal_y = maybe_fix_non_unique(nominal_y)
        nominal_x = maybe_fix_non_unique(nominal_x)

        ds = ds.assign_coords(x=nominal_x, y=nominal_y)
        ds = ds.sortby("x")
        ds = ds.sortby("y")

        # do one more interpolation for the x values, in case the boundary
        # values were affected
        ds = ds.assign_coords(
            x=maybe_fix_non_unique(ds.x.load().data),
            y=maybe_fix_non_unique(ds.y.load().data, pad=True),
        )
    else:
        warnings.warn(
            "No x and y found in dimensions for source_id: %s. This likely "
            "means that you forgot to rename the dataset or this is the "
            "German unstructured model" % ds.attrs["source_id"],
        )
    return ds
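
# Usage sketch on a synthetic curvilinear grid (in practice `ds` would be a
# CMIP6 dataset whose dims were already renamed to `x`/`y`, with 2-D
# `lon`/`lat` coordinates and a `source_id` attribute):
#
#     import xarray as xr
#     lon2d, lat2d = np.meshgrid(
#         np.linspace(0, 359, 360), np.linspace(-89, 89, 180)
#     )
#     ds = xr.Dataset(
#         coords={"lon": (("y", "x"), lon2d), "lat": (("y", "x"), lat2d)},
#         attrs={"source_id": "SYNTHETIC"},
#     )
#     ds = replace_x_y_nominal_lat_lon(ds)
#     # `x` now holds nominal longitudes taken near the equator and `y`
#     # the zonal-max latitudes, both sorted and de-duplicated.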