""" toolBox - submodule of SLOTH
"""
import sys
import os
import glob
import numpy as np
import datetime
import matplotlib.pyplot as plt
from calendar import monthrange
from . import IO as io
from scipy import ndimage as nd
import geopandas as gpd
import xarray as xr
import pandas as pd
from shapely.geometry import Polygon
import sloth.slothHelper as slothHelper
def calc_catchment(slopex, slopey, x, y):
""" Calculate the catchment area associated with a given outlet pixel
using x- and y-slope data.
This function implements a simple yet efficient algorithm to determine the
catchment area by starting with a list of pixels that initially contains
only the outlet pixel(s). It iteratively processes the pixels in the list,
marking them as part of the catchment and identifying the surrounding pixels
that drain into the current pixel and are not already included in the
catchment. Found pixels are added to the list, and the algorithm continues
until all pixels belonging to the catchment area are discovered.
Parameters
----------
slopex : ndarray
2D slopes in x-direction
slopey : ndarray
2D slopes in y-direction
x : int
index in x-direction to calculate the catchment from
y : int
index in y-direction to calculate the catchment from
Returns
-------
catchment : ndarray
2D ndarray of the same size as slopex/y. 0 = not part of catchment; 1 = part of catchment
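Examples
--------
A minimal synthetic sketch on a 3x3 grid, following the sign convention
implemented below (slopex < 0 drains east, slopex > 0 drains west, zero
slope drains nowhere): only the two pixels next to the outlet at
(x=1, y=0) drain into it, the rest of the grid is flat.
>>> import numpy as np
>>> slopex = np.array([[-0.1, 0.0, 0.1],
...                    [ 0.0, 0.0, 0.0],
...                    [ 0.0, 0.0, 0.0]])
>>> slopey = np.zeros((3, 3))
>>> calc_catchment(slopex, slopey, x=1, y=0)
array([[1., 1., 1.],
       [0., 0., 0.],
       [0., 0., 0.]])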
"""
dims = slopey.shape
nx = dims[1]
# convert slopes to: 'in which 1D index I do drain'
drainx = np.zeros_like(slopex)
drainx = np.where(slopex<0, 1, drainx)
drainx = np.where(slopex>0, -1, drainx)
drainx = np.where(slopex==0, -999, drainx)
drainy = np.zeros_like(slopey)
drainy = np.where(slopey<0, nx, drainy)
drainy = np.where(slopey>0, -nx, drainy)
drainy = np.where(slopey==0, -999, drainy)
# plt.imshow(drainx)
# plt.show()
# FlatDrainX
fdx = drainx.ravel(order='C')
# FlatDrainY
fdy = drainy.ravel(order='C')
# FlatCatchment
fc = np.zeros_like(fdx)
# flat_idx order='C': idx = (y * nx) + x
start = (y * nx ) + x
openEnds = [start]
while openEnds:
# Get one open end
step = openEnds.pop()
# Mark open end as catchment
fc[step] = 1
# GET surrounding Pixel
# North = step - nx; East = step + 1
# West = step - 1; South = step + nx
# NEWS = [step-nx, step+1, step-1, step+nx]
NS = [step-nx, step+nx]
EW = [step+1, step-1]
# CHECK if surrounding pixels drain to step (D2S) and are NOT part of the catchment already.
# Step is the currently handled open end.
try:
NSD2S = [ idx for idx in NS if (idx + fdy[idx] == step and not fc[idx] ) ]
EWD2S = [ idx for idx in EW if (idx + fdx[idx] == step and not fc[idx] ) ]
except IndexError:
print('ERROR: index out of bounds')
continue
D2S = NSD2S + EWD2S
# add all found pixels to openEnds
openEnds += D2S
return fc.reshape(dims)
def get_intervalSlice(dates, sliceInterval='month'):
''' Calculate interval slices of a given time-series.
This function calculates interval slices of a given time-series,
aiming to mimic Python's default slice notation `array[start:stop:step]`
to also enable multi-dimensional slicing.
The calculation takes the model's dump interval into account, which is why
this function works for (nearly) every time resolution.
The calculation is based on datetime objects and therefore really
calculates the slices of individual intervals, even if the time-series
does not start or end at the first or last of a given interval.
Use case:
You have a time-series of hourly data points and want to calculate the
monthly mean. This function returns the slices needed to divide your
data into monthly pieces.
Parameters
----------
dates : ndarray
the time-series as datetime objects.
sliceInterval : str
defining the interval. Supported are: 'day', 'month'
Returns
-------
Slices : list
A list of slices, one for each interval found.
Notes
-----
This function assumes:
i) The passed time-series covers at least one interval!
ii) At least hourly steps / dumpIntervals!
iii) No seconds in dates - model output is on full minutes at least!
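Examples
--------
A minimal sketch with hourly time-steps covering two full days (the
printed progress information of the function is omitted here):
>>> import datetime
>>> import numpy as np
>>> dates = np.array([datetime.datetime(2023, 1, 1) + datetime.timedelta(hours=h)
...                   for h in range(48)])
>>> slices = get_intervalSlice(dates, sliceInterval='day')
>>> slices
[slice(0, 24, None), slice(24, 48, None)]
>>> data = np.arange(48.)
>>> [float(data[s].mean()) for s in slices]
[11.5, 35.5]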
'''
# Check if the passed sliceInterval is supported and exit if not.
supportedSliceInterval = ['day', 'month']
if sliceInterval not in supportedSliceInterval:
print(f'ERROR: sliceInterval "{sliceInterval}" is not supported.')
print(f'--- supported values are: {supportedSliceInterval}')
print(f'--- EXIT program')
return False
print(f'########################################################################')
print(f'#### calculate dumpInterval and check if equal for all data-points')
print(f'########################################################################')
# Calculating dumpInterval
tmp_dumpInterval = np.diff(dates, n=1)
# Check if the dumpInterval is equal for all data-points
# and if so, set the dumpInterval
if not np.all(tmp_dumpInterval == tmp_dumpInterval[0]):
print('ERROR: calculated dumpInterval is not equal for all data-points')
# In case of error: break function
return False
else:
dumpInterval = tmp_dumpInterval[0]
print(f'DONE --> DumpInterval: {dumpInterval}')
# Find the first time-step of an interval by looping over the time-series and
# checking for each time-step if it is the first of the interval (break loop if found).
print(f'########################################################################')
print(f'#### Finding first of month')
print(f'########################################################################')
Offset = 0
while True:
# verify first date is first of interval.
# idea explained with sliceInterval='month':
# the first step (or the first step - 0.5*dumpInterval) is the first of the month at 00 UTC.
# By checking the exact time and 0.5*dumpInterval before it, we take into account that
# the time-axis of averaged model output might have been shifted in between the
# time-bounds.
print(f'Offset: {Offset}')
tmp_first = dates[Offset]
if sliceInterval == 'day':
# using midnight as reference
tmp_first_ref = tmp_first.replace(hour=0, minute=0, second=0)
elif sliceInterval == 'month':
# using the first of current month at midnight as reference
tmp_first_ref = tmp_first.replace(day=1, hour=0, minute=0, second=0)
# The first time-step of dates is already the first of an interval
if (tmp_first == tmp_first_ref):
print(f'check step {Offset} is first of a month at midnight')
break
#if (tmp_first - dumpInterval) == tmp_first_ref:
# print(f'check step {Offset} is first of a month at midnight')
# break
# The first time-step is not the exact first of an interval but set in the
# middle of the time-bounds and therefore half a dumpInterval away from
# the first date of an interval
elif (tmp_first - (0.5*dumpInterval)) == tmp_first_ref:
print(f'check step {Offset} is first of a month at midnight')
break
else:
print(f'ERROR: step {Offset} is not first step of month at midnight!')
print(f'step: {tmp_first}')
Offset += 1
# hard break of loop if no 'beginning' is found after 50 time-steps
if Offset > 50:
break
continue
# Calculating the slices by looping over all time-steps and checking if the
# next time-step belongs to the next interval.
# NWR 20210422:
# there should be some clever and short solution for below loop ...!
# now that we know the dumpInterval and first step is first of interval...
print(f'########################################################################')
print(f'#### getting month series / slices')
print(f'########################################################################')
t_lower = Offset
Slices = []
for t in range(Offset, dates.shape[0]):
# I decided to go for the solution checking the current and the next interval
# to catch the case that the dataset contains one interval only!
if sliceInterval == 'day':
currInterval = dates[t].day
nextInterval = (dates[t] + dumpInterval).day
elif sliceInterval == 'month':
currInterval = dates[t].month
nextInterval = (dates[t] + dumpInterval).month
if nextInterval != currInterval:
Slices.append(slice(t_lower,t+1,None))
t_lower = t+1
return Slices
def spher_dist_v1(lon1, lat1, lon2, lat2, Rearth=6373):
""" calculate the spherical / haversine distance
This function calculates the real distance (in [km]) between two points on
earth given in lat/lon coordinates.
Parameters
----------
lon1 : ndarray
Longitude of the first point in [rad]. Can be of any dimension.
lat1 : ndarray
Latitude of the first point in [rad]. Can be of any dimension.
lon2 : ndarray
Longitude of the second point in [rad]. Can be of any dimension.
lat2 : ndarray
Latitude of the second point in [rad]. Can be of any dimension.
Rearth : int or float
The earth radius in [km].
Returns
-------
ndarray
The distance is returned in [km]
Notes
-----
Source: https://www.kompf.de/gps/distcalc.html
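Examples
--------
A minimal sketch; note that the coordinates have to be passed in [rad]
already. Two points on the equator, a quarter turn of the earth apart:
>>> import numpy as np
>>> round(float(spher_dist_v1(0.0, 0.0, np.pi/2, 0.0, Rearth=6373)), 1)
10010.7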
"""
term1 = np.sin(lat1) * np.sin(lat2)
term2 = np.cos(lat1) * np.cos(lat2)
term3 = np.cos(lon2 - lon1)
return Rearth * np.arccos(term1+term2*term3)
def find_nearest_Index_2D(point, coord2D):
"""
Find the nearest index in a 2D array.
Given a 2D array `coord2D` and a specific `point` value, this function returns
the index of the 2D array that holds the closest values to the `point`.
Parameters
----------
point : scalar
The value for which to find the nearest index in the 2D array.
coord2D : ndarray
The 2D array in which to search for the nearest index.
Returns
-------
(int, int)
The index in the first and second dimensions of `coord2D` that holds the
closest values to the `point`.
Notes
-----
This function calculates the absolute differences between `coord2D` and `point`
using `np.abs()`. It then determines the index of the minimum value in the
differences array using `np.argmin()`. The resulting flattened index is converted
into a tuple of indices representing the original shape of `coord2D` using
`np.unravel_index()`.
Examples
--------
>>> import numpy as np
>>> arr = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
>>> find_nearest_Index_2D(5, arr)
(1, 1)
>>> arr = np.array([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6], [0.7, 0.8, 0.9]])
>>> find_nearest_Index_2D(0.25, arr)
(0, 1)
"""
dist = np.abs(coord2D - point)
idx = np.unravel_index(np.argmin(dist, axis=None),dist.shape)
return idx[0], idx[1]
def plusOneMonth(currDate):
"""
Return the passed date incremented by one month.
This function provides a simple way to calculate a date that is one month ahead
of the given `currDate`. Unlike the bash command-line tool `date`, which can
handle this calculation easily, Python's `timedelta` objects are based on days
or seconds and do not know calendar months, requiring special treatment due to
varying month lengths. This function serves as a wrapper to handle this task in a clean
manner, facilitating usage in other code snippets.
Parameters
----------
currDate : datetime
An arbitrary date.
Returns
-------
datetime
The passed date incremented by one month.
Notes
-----
This function determines the number of days in the month of `currDate` using
`monthrange()` from the `calendar` module. It then returns `currDate` incremented
by a timedelta calculated as `24 * num_days` hours. This ensures that the returned
date correctly reflects one month ahead while accounting for varying month lengths.
Examples
--------
>>> import datetime
>>> currDate = datetime.datetime(2023, 5, 15)
>>> plusOneMonth(currDate)
datetime.datetime(2023, 6, 15, 0, 0)
>>> currDate = datetime.datetime(2023, 12, 31)
>>> plusOneMonth(currDate)
datetime.datetime(2024, 1, 31, 0, 0)
"""
num_days = monthrange(currDate.year, currDate.month)[1]
return currDate + datetime.timedelta(hours=24*num_days)
def trunc(values, decs=0):
"""
Truncate a passed float value or floating ndarray.
This function truncates a given float value or floating ndarray by a
specified truncation precision (decimal digits).
Parameters
----------
values : ndarray
A ndarray of any dimension (including scalar) containing float values to be truncated.
decs : int, optional
An integer value defining the truncation precision. It specifies the number of decimal places to keep (default is 0).
Returns
-------
ndarray
A ndarray of the same type as the input, with the specified number of decimal places truncated from the original values.
Examples
--------
>>> import numpy as np
>>> values = np.array([2.12345, 3.45678, 4.56789])
>>> trunc(values, decs=3)
array([2.123, 3.456, 4.567])
>>> value = 2.98765
>>> trunc(value, decs=2)
2.98
"""
return np.trunc(values*10**decs)/(10**decs)
def fill(data, invalid=None, transkargs={}):
"""
Fill invalid data points with nearest neighbor interpolation.
This function replaces invalid data points in the input array (`data`) with the
value of the nearest valid data point. Invalid data points are indicated by the
`invalid` array or, if not provided, by NaN (Not a Number) values in `data`.
Parameters
----------
data : ndarray
An array containing the data to be filled.
invalid : ndarray, optional
A binary array of the same shape as `data`. Data values are replaced where
`invalid` is True. If not provided, invalid data points are identified using
`np.isnan(data)` (default).
transkargs : dict, optional
Additional arguments to pass to the `distance_transform_edt()` function.
Default is an empty dictionary.
Returns
-------
ndarray
A filled ndarray, where invalid data points have been replaced by the value
of the nearest valid data point.
Notes
-----
This function uses nearest neighbor interpolation to fill invalid data points in
the input array. It calculates the Euclidean distance transform of the `invalid`
array to find the indices of the nearest valid data points. The original `data`
array is then updated with the values from the nearest valid data points.
If the shapes of `data` and `invalid` are not equal, an error is raised, as filling
is not possible.
See also: https://stackoverflow.com/questions/5551286/filling-gaps-in-a-numpy-array
Example
-------
>>> import numpy as np
>>> data = np.array([1.0, 2.0, np.nan, 4.0, np.nan])
>>> fill(data)
array([1., 2., 4., 4., 4.])
>>> invalid = np.isnan(data)
>>> fill(data, invalid)
array([1., 2., 4., 4., 4.])
"""
# if no invalid-mask is passed, use NaN as marker for invalid data points
if invalid is None: invalid = np.isnan(data)
# check if data and invalid shape are equal, as otherwise filling is not
# possible
if data.shape != invalid.shape:
print(f'ERROR: data.shape {data.shape} != invalid.shape {invalid.shape}')
return None
ind = nd.distance_transform_edt(invalid,
return_distances=False,
return_indices=True,
**transkargs)
return data[tuple(ind)]
def get_prudenceMask(lat2D, lon2D, prudName):
"""
Return a Prudence mask based on latitude and longitude values.
This function generates a boolean mask array where True represents masked areas
and False represents non-masked areas. The mask is determined based on a set of
latitude and longitude values and the name of the Prudence region.
Parameters
----------
lat2D : ndarray
2D array containing latitude information for each pixel.
lon2D : ndarray
2D array containing longitude information for each pixel.
prudName : str
Short name of the Prudence region.
Returns
-------
prudMask : ndarray
Boolean ndarray of the same shape as `lat2D`, indicating masked and
non-masked areas.
True = masked; False = not masked.
Notes
-----
The Prudence mask is created based on specific latitude and longitude ranges
for each Prudence region. The function checks the `prudName` parameter and
generates the corresponding mask using NumPy's `np.where()` function.
The available Prudence region names and their corresponding latitude and
longitude ranges are as follows (True is masked, False is not masked):
- 'BI': Latitude: <50.0 or >59.0, Longitude: <-10.0 or >2.0
- 'IP': Latitude: <36.0 or >44.0, Longitude: <-10.0 or >3.0
- 'FR': Latitude: <44.0 or >50.0, Longitude: <-5.0 or >5.0
- 'ME': Latitude: <48.0 or >55.0, Longitude: <2.0 or >16.0
- 'SC': Latitude: <55.0 or >70.0, Longitude: <5.0 or >30.0
- 'AL': Latitude: <44.0 or >48.0, Longitude: <5.0 or >15.0
- 'MD': Latitude: <36.0 or >44.0, Longitude: <3.0 or >25.0
- 'EA': Latitude: <44.0 or >55.0, Longitude: <16.0 or >30.0
If an unsupported Prudence region name is provided, an error message is printed,
and the program exits.
For reference see also: http://prudence.dmi.dk/public/publications/PSICC/Christensen&Christensen.pdf p.38
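Examples
--------
A minimal sketch with two illustrative coordinates; the first point lies
inside the Alps ('AL') region, the second one outside of it:
>>> import numpy as np
>>> lat2D = np.array([[46.0, 52.0]])
>>> lon2D = np.array([[10.0, 10.0]])
>>> get_prudenceMask(lat2D, lon2D, 'AL').tolist()
[[False, True]]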
"""
if (prudName=='BI'):
prudMask = np.where((lat2D < 50.0) | (lat2D > 59.0) | (lon2D < -10.0) | (lon2D > 2.0), True, False)
elif (prudName=='IP'):
prudMask = np.where((lat2D < 36.0) | (lat2D > 44.0) | (lon2D < -10.0) | (lon2D > 3.0), True, False)
elif (prudName=='FR'):
prudMask = np.where((lat2D < 44.0) | (lat2D > 50.0) | (lon2D < -5.0) | (lon2D > 5.0), True, False)
elif (prudName=='ME'):
prudMask = np.where((lat2D < 48.0) | (lat2D > 55.0) | (lon2D < 2.0) | (lon2D > 16.0), True, False)
elif (prudName=='SC'):
prudMask = np.where((lat2D < 55.0) | (lat2D > 70.0) | (lon2D < 5.0) | (lon2D > 30.0), True, False)
elif (prudName=='AL'):
prudMask = np.where((lat2D < 44.0) | (lat2D > 48.0) | (lon2D < 5.0) | (lon2D > 15.0), True, False)
elif (prudName=='MD'):
prudMask = np.where((lat2D < 36.0) | (lat2D > 44.0) | (lon2D < 3.0) | (lon2D > 25.0), True, False)
elif (prudName=='EA'):
prudMask = np.where((lat2D < 44.0) | (lat2D > 55.0) | (lon2D < 16.0) | (lon2D > 30.0), True, False)
else:
print(f'Prudence region {prudName} not found --> EXIT')
sys.exit(1)
return prudMask
def stampLLSM(data, invalid, LLSM, LLSMThreshold=2):
""" stamps a LLSM to passed data
Some times its needed to force different datasets to use the same
Land-Lake-Sea-Mask (LLSM). One example is combining different data sets to
force a model with. If the datasets are taken fomr different sources, most
propably the used LLSM is different too. This, for exmple, could lead to the
situation along the coastline that one dataset is indicating water (ocean)
while the other is indicating land. This inconsistency has to be fixe,
especially within the realm of coupled models.
To achive this, this function is\n
i) masking invalid pixels within the passed data set
ii) interpolate remaining data over the maske regions
iii) set pixel to water value according to a passed LLSM
This guaranthees each land pixel is holding land informtion and each
water pixel is marked as water.
Parameters
----------
data : ndarray
A 2D nd array holding the datset to stamp the LLSM
invalid : scalar
A scalar witht the value indicating invalid pixel
LLSM : ndarray
A ndarray of the same shape as data holding the LLSM
(Land=2 Lake=1 Sea=0)
Returns
-------
ndarray
A (MASKED!) ndarray of the same shape as data, with stamped LLSM
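Examples
--------
A minimal sketch on a 2x2 field with illustrative values: the invalid
land pixel is filled from its neighbours, and the sea pixel (LLSM < 2)
ends up masked:
>>> import numpy as np
>>> data = np.array([[1.0, -9999.0], [3.0, 4.0]])
>>> LLSM = np.array([[2, 2], [2, 0]])
>>> out = stampLLSM(data, invalid=-9999.0, LLSM=LLSM)
>>> out.mask.tolist()
[[False, False], [False, True]]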
"""
# some checks to verify we can progress
if not isinstance(data, np.ndarray):
print(f'data is of type {type(data)} but <class "numpy.ndarray"> is required!')
sys.exit(1)
if not data.ndim == 2:
print(f'data is of dimension {data.ndim} but dimension 2 is required!')
sys.exit(1)
# i)
out = np.ma.masked_where(data==invalid, data)
# ii)
out = fill(data=out, invalid=out.mask)
# iii)
out = np.ma.masked_where((LLSM < LLSMThreshold), out)
return out
def mapDataRange_lin(X, y_min=0, y_max=1,
x_min=None, x_max=None,
cutMinMax=False):
"""Map a source data range linearly to a target data range.
Perform a linear mapping of a source data range (X) to a target data range (Y). The
function calculates the mapping using the formula: y = (y_max - y_min) / (x_max - x_min) * (x - x_min) + y_min.
Parameters
----------
X : ndarray
Source data range to remap.
y_min : scalar, optional
Minimum value of the target data range (default is 0).
y_max : scalar, optional
Maximum value of the target data range (default is 1).
x_min : scalar, optional
Minimum value of the source data range. If not provided, it is calculated based on X.
x_max : scalar, optional
Maximum value of the source data range. If not provided, it is calculated based on X.
cutMinMax : bool, optional
If True, cut the mapped values outside the target data range to the minimum and maximum values.
Returns
-------
ndarray
The target data range after linear mapping.
Notes
-----
This function is useful for mapping a source data range to a different target data range. It can be
used for various purposes, such as normalizing data ranges to facilitate comparison with other ranges.
As an intermediate step, both data ranges (X and Y) are first transformed
into data ranges starting from zero (X' and Y'), as those can easily be
mapped with
i) y' = y'_max / x'_max * x'
Substituting
ii) y' = y - y_min AND x' = x - x_min
into i) yields the mapping used above:
iii) y = (y_max - y_min) / (x_max - x_min) * (x - x_min) + y_min
Examples
--------
>>> X = np.array([0, 1, 2, 3, 4, 5])
>>> mapped = mapDataRange_lin(X, y_min=10, y_max=20)
>>> print(mapped)
[10. 12. 14. 16. 18. 20.]
>>> X = np.array([100, 200, 300, 400, 500])
>>> mapped = mapDataRange_lin(X, y_min=-1, y_max=1, x_min=100, x_max=500)
>>> print(mapped)
[-1. -0.5 0. 0.5 1. ]
"""
# Calculate x_min and x_max if not passed
if x_min is None:
x_min = np.min(X)
if x_max is None:
x_max = np.max(X)
# Map X to new data range
Y = (y_max-y_min) / (x_max-x_min) * (X-x_min) + y_min
# Cut Y for min and max values if wanted
if cutMinMax:
Y = np.where(Y<=y_min, y_min, Y)
Y = np.where(Y>=y_max, y_max, Y)
return Y
def intersection_calculations(df_data, corners, area_of_interest, Name_area, crs_utm, nr_yr, nr_entries, save_dir):
""" Calculate spatial mean values for a shapefile in interest.
This function calculates spatial mean values (example for a specific
region or province) taking into account how much (ratio) of the
gridpoint is actually being intersected inside our region of interest.
Parameters
-------------
df_data: dataframe
dataframe containing infromation about each gridpoint
corners: dataframe (can also be a nectdf or any other data type)
containing infromation on the four longitudes and latitudes
that surrounds each gridpoint in the source dataframe
area_of_interest: a shapefile or geodataframe
shapefile of the area of interest
Name_area: the field name in the shapefile, that the dissolve will be based on
crs_utm: projected coordinate reference system (utm)
nr_yr: variable
number of years of interest
nr_entries:variable
number of hours/days/or months etc..
save_dir: path
path for saving the output
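Example
-------
A hypothetical call; all file names, the field name and the CRS below
are placeholders and have to be adapted to the data at hand:
>>> intersection_calculations(
...     df_data='gridpoint_values.csv', corners='grid_corners.nc',
...     area_of_interest='regions.shp', Name_area='NAME',
...     crs_utm='EPSG:25832', nr_yr=1, nr_entries=12,
...     save_dir='regional_means.shp')   # doctest: +SKIP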
"""
# create a geodataframe from the dataframe that we want to work with
df_data = pd.read_csv(df_data)
print("creating a geodataframe of the given dataframe")
gdf_data = gpd.GeoDataFrame(df_data, geometry=gpd.points_from_xy(df_data.lon,
df_data.lat))
ds_corners = xr.open_dataset(corners)
# convert the netcdf to dataframe in order to be able
# to convert it to geopandas and create polygons
df_corners = ds_corners.to_dataframe()
df_corners = df_corners.reset_index()
df_corners = df_corners.iloc[:, 4:]
# create a list of the four pairs (lon,lat) that creates the polygons
print("creating a list of four pairs (lon,lat) to create polygons")
list_poly = []
polygon_geom = list(zip(df_corners.grid_corner_lon, df_corners.grid_corner_lat))
for i in range(0, len(polygon_geom), 4):
list_poly.append(Polygon(polygon_geom[i:i + 4]))
# create a geodataframe from the polygon list that was created
print("creating a geodataframe with the polygons")
gdf_polygons = gpd.GeoDataFrame(geometry=list_poly)
# for some reason the polygons were doubled so a drop_duplicates() was used
gdf_polygons = gdf_polygons.drop_duplicates()
# if the two geodataframes do not have a coordinate system,
# we have to set it first before going further
gdf_data_wgs = gdf_data.set_crs('EPSG:4326') # setting the crs to WGS84
# in order to calculate the area later, it is needed to convert
# the geodataframe to UTM
gdf_data_utm32N = gdf_data_wgs.to_crs(crs_utm)
gdf_polygons_wgs = gdf_polygons.set_crs('EPSG:4326') # setting the crs to WGS84
gdf_polygons_utm32N = gdf_polygons_wgs.to_crs(crs_utm)
print("Geodataframes are created")
# calculate the area of the polygons.
# it is important for calculating the ratio (weight) of the area of
# each polygon intersected
gdf_polygons_utm32N['area'] = gdf_polygons_utm32N.area
# after calculating the area, in some cases (when calculating for a whole country)
# the area of the polygons will differ; for example, when comparing cities along
# the meridian lines there will be a slight difference in the area
# joining the two geodataframes using sjoin, in order to join the dataframe
# that has the values with its corresponding polygons
print("Joining the two geodataframes")
join_within_right_gdf_utm32N = gdf_data_utm32N.sjoin(gdf_polygons_utm32N, how="right", predicate="within")
shapefile_gdf = gpd.read_file(area_of_interest)
print(shapefile_gdf.crs) # to check whether it has the same crs as the geodataframe that we created
# if not convert it to utm (to be able to calculate the area)
print("intersecting")
overlay_gdf = gpd.overlay(join_within_right_gdf_utm32N, shapefile_gdf, how='intersection')
overlay_gdf['area_intersected'] = overlay_gdf.area
# the weight calculated below represents how much area from each polygon
# is inside the shapefile after intersection
overlay_gdf['weight'] = overlay_gdf['area_intersected']/overlay_gdf['area']
# after this step we should have a geodataframe for the intersected polygons
# that are inside the shapefile of interest; the geodataframe will have
# the lon,lat, with the corresponding values for each date (each date in a column),
# area of the polygon before intersection, area after intersection,
# and the ratio (area_intersected/area) and it can also include the names of the
# regions (in order to be able to dissolve it into mean values for each region)
# in the next step, a dataframe was created to add the values of each
# polygon (gridpoint) after calculating its weight inside
# the intersection for each date
# nr_entries: e.g. number of dates (hourly, daily, monthly, etc.)
nr_column = nr_yr*nr_entries
df_data_inter = overlay_gdf.iloc[:,3:nr_column+2].multiply(overlay_gdf['weight'], axis="index")
# copy the geometry column (important for creating a geodataframe) and other
# relevant data to the new dataframe
# like the name of the regions (important for calculating the mean values for each region)
df_data_inter[Name_area] = overlay_gdf[Name_area] # this should be adapted depending on the shapefile available
df_data_inter['geometry'] = overlay_gdf['geometry']
geometry = df_data_inter['geometry']
gdf_data_inter = gpd.GeoDataFrame(df_data_inter, crs=crs_utm, geometry=geometry)
print("calculating the mean value for each region of interest")
dissolve_gdf = gdf_data_inter.dissolve(by=Name_area, aggfunc='mean', as_index=True, level=None, sort=True, observed=False, dropna=True)
dissolve_gdf_wgs = dissolve_gdf.to_crs('EPSG:4326') # convert back to wgs 1984
print("saving as a shapefile")
dissolve_gdf_wgs.to_file(save_dir)
if __name__ == '__main__':
print("I'm there!")