Source code for primap2.csg._strategies.local_trends

import numpy as np
import xarray as xr
from attrs import field, frozen

import primap2
from primap2.csg._strategies.gaps import FitParameters, calculate_scaling_factor, fill_gap, get_gaps

from .exceptions import StrategyUnableToProcess



[docs]
@frozen
class LocalTrendsStrategy:
    """Fill missing data using local trends or single overlap values.

    The NaNs in the first timeseries :math:`\\textrm{ts}(t)` are filled using harmonized data
    from the lower priority timeseries :math:`\\textrm{fill_ts}(t)`.

    For gaps in the data and missing data at the boundaries of time series different
    treatments are used.

    For boundaries the strategy uses

    .. math::

       \\textrm{fill_ts}_h(t) = \\textrm{fill_ts}(t) \\times a,

    where :math:`\\textrm{fill_ts}_h(t)` is the harmonized dataset and :math:`a` is determined
    by

    .. math::

        a = \\textrm{fill_ts}_t(t_b) / \\textrm{ts}_t(t_b),

    where :math:`\\textrm{fill_ts}_t(t_b)` is the trend value calculated for
    :math:`\\textrm{fill_ts}(t_b)` and equally for :math:`\\textrm{ts}_t(t_b)`.
    :math:`t_b` is the last (in case of a right boundary) or first (in case of a left
    boundary) non-NaN data point in :math:`\\textrm{ts}`. The trend value is calculated
    using a linear trend of length `trend_length` or less data points if a time-series
    does not cover the full period. By setting `min_trend_points` a minimal number of
    points necessary for the trend calculation can be set. If less points are available a
    :py:class:`StrategyUnableToProcess` error will be raised. This enables the user to
    define a fallback strategy, e.g. single point matching.

    For the case of gaps this leads to the situation that we can't use trends on
    one side of the gap and single year matching as fallback on the other. Left and right
    scaling factors are always calculated using the same method.

    By setting `trend_length` to 1 single year matching is used.

    For gaps the left (:math:`t_{bl}`) and right (:math:`t_{br}`) end have to be considered.
    The data is harmonized using

    .. math::

       \\textrm{fill_ts}_h(t) = \\textrm{fill_ts}(t) \\times
       \\frac{a_l(t_{br}-t)+a_r(t-t_{bl})}{t_{br}-t_{bl}},

    where

    .. math::

        a_l = \\textrm{fill_ts}_t(t_{bl}) / \\textrm{ts}_t(t_{bl}),

    and

    .. math::

        a_r = \\textrm{fill_ts}_t(t_b) / \\textrm{ts}_t(t_b).

    If only one of the ends of the gap has an overlap with :math:`\\textrm{fill_ts}(t)`,
    we use the harmonization factor of this side for the whole gap (we treat the gap like
    a boundary).

    If there is no overlap in non-NaN data between :math:`\\textrm{ts}(t)` and
    :math:`\\textrm{fill_ts}(t)` a :py:class:`StrategyUnableToProcess` error will be raised.

    If ``allow_negative = False`` and the harmonized time-series :math:`\\textrm{fill_ts}_h(t)`
    contains negative data a :py:class:`StrategyUnableToProcess` error will be raised.

    Filling multiple gaps and boundaries with this function is scientifically questionable
    as they will all use different scaling factors and thus don't use a consistent model to
    harmonize one time-series :math:`\\textrm{fill_ts}(t)` to :math:`\\textrm{ts}(t)`.
    Use with care.

    Attributes
    ----------
    fit_params
        Instance of the FitParameters class defining the parameters for the fits on the
        boundaries of the time-series. The default values are
            trend_length=10,  # ten years if default unit for trend length is used)
            min_trend_points=5,   # minimal data points necessary for trend calculation
            trend_length_unit="YS",  # year start datapoint
            fit_degree=1,  # linear trend by default
            fallback_degree=0,  # constant
    allow_negative
        Allow the filling time series to contain negative data initially.
    """

    fit_params: FitParameters = field()
    allow_negative: bool = field()
    type = "localTrends"

    @fit_params.default
    def _set_def_fit_params(self):
        return FitParameters(
            trend_length=10,
            min_trend_points=5,
            trend_length_unit="YS",
            fit_degree=1,  # linear trend by default
            fallback_degree=0,  # take average as fallback
        )

    @allow_negative.default
    def _set_allow_negative(self):
        return False

    def fill(
        self,
        *,
        ts: xr.DataArray,
        fill_ts: xr.DataArray,
        fill_ts_repr: str,
    ) -> tuple[xr.DataArray, list[primap2.ProcessingStepDescription]]:
        """Fill missing data using matching of local trends on the boundaries.

        For a description of the algorithm, see the documentation of this class.

        Parameters
        ----------
            ts
                Base timeseries. Missing data (NaNs) in this timeseries will be filled.
                This function does not modify the data in ts.
            fill_ts
                Fill timeseries. Data from this timeseries will be used (possibly after
                modification) to fill missing data in the base timeseries.
                This function does not modify the data in fill_ts.
            fill_ts_repr
                String representation of fill_ts. Human-readable short representation of
                the fill_ts (e.g. the source).

        Returns
        -------
            filled_ts, descriptions.
                filled_ts contains the result, where missing
                data in ts is (partly) filled using scaled data from fill_ts.
                descriptions contains information about which years were affected and
                filled how.
        """
        filled_mask = ts.isnull() & ~fill_ts.isnull()
        time_fillable = filled_mask["time"][filled_mask].to_numpy()

        if time_fillable.any():
            any_filled = False
            time_filled = np.array([], dtype=np.datetime64)
            description = (
                f"filled with local trend matched data from {fill_ts_repr}. "
                f"The following gaps have been filled:"
            )
            gaps = get_gaps(ts)
            filled_ts = ts.copy()
            for gap in gaps:
                gap_description = (
                    f" gap {np.datetime_as_string(gap.left, unit='h')}"
                    f" - {np.datetime_as_string(gap.right, unit='h')}:"
                )
                # check if we have information for the specific gap
                filled_mask_gap = filled_mask.pr.loc[gap.get_date_slice()]
                time_filled_gap = filled_mask_gap["time"][filled_mask_gap].to_numpy()

                if time_filled_gap.any():
                    # get factor
                    factor = calculate_scaling_factor(
                        ts=ts,
                        fill_ts=fill_ts,
                        gap=gap,
                        fit_params=self.fit_params,
                    )
                    # check if positive or negative allowed. if true proceed, if false
                    # use fallback

                    # it would be more consistent to handle the negative value fallback in
                    # calculate_scaling_factor as well. It comes with the drawback that
                    # it can't be controlled from the filling function and that we have
                    # to deal with different return values here

                    if any(factor < 0) and not self.allow_negative:
                        factor = calculate_scaling_factor(
                            ts=ts,
                            fill_ts=fill_ts,
                            gap=gap,
                            fit_params=self.fit_params.get_fallback(),
                        )
                        gap_description = (
                            gap_description + f" negative scaling factor - use fallback degree "
                            f"{self.fit_params.fallback_degree}"
                        )

                    if any(factor < 0) and not self.allow_negative:
                        # negative with fallback. fail to fill gap
                        gap_description = (
                            gap_description
                            + " negative scaling after fallback - failed to fill gap;"
                        )
                    else:
                        if any(np.isnan(factor)):
                            # fail because no factor can be calculated
                            gap_description = (
                                gap_description + " scaling factor is nan - failed to fill gap;"
                            )
                        else:
                            any_filled = True
                            time_filled = np.concatenate((time_filled, time_filled_gap))

                            # fill nans (in the gap only)
                            filled_ts = fill_gap(
                                ts=filled_ts, fill_ts=fill_ts, gap=gap, factor=factor
                            )

                            if factor[0] == factor[1]:
                                gap_description = (
                                    gap_description + f" filled for times "
                                    f"{np.datetime_as_string(time_filled_gap, unit='h')} "
                                    f"using factor {factor[0]:.2f};"
                                )
                            else:
                                gap_description = (
                                    gap_description + f" filled for times "
                                    f"{np.datetime_as_string(time_filled_gap, unit='h')} "
                                    f"using factors {factor[0]:.2f} and {factor[1]:.2f};"
                                )

                    # update description
                    description = description + gap_description

            if any_filled:
                descriptions = [
                    primap2.ProcessingStepDescription(
                        time=time_filled,
                        description=description,
                        function=self.type,
                        source=fill_ts_repr,
                    )
                ]
            else:
                raise StrategyUnableToProcess(
                    reason="No overlap between timeseries for any gap. Can't match"
                )

        else:
            # if we don't have anything to fill we don't need to calculate anything
            filled_ts = ts
            descriptions = [
                primap2.ProcessingStepDescription(
                    time=np.array([], dtype=np.datetime64),
                    description=f"no additional data in {fill_ts_repr}",
                    function=self.type,
                    source=fill_ts_repr,
                )
            ]

        return filled_ts, descriptions