Source code for wind_validation.validation

import warnings
import xarray as xr
import numpy as np
from typing import Union

from wind_validation.ts.stats import TimeSeriesStats
from wind_validation.hist.stats import HistogramStats
from wind_validation.weib.stats import WeibStats
from wind_validation.ts.metrics import TimeSeriesMetrics
from wind_validation.hist.metrics import HistogramMetrics
from wind_validation.weib.metrics import WeibMetrics
from wind_validation.constants import (
    DIM_TIME,
    VAR_POWER_DENS,
    VAR_WIND_SPEED,
    VAR_AIR_DENS,
    VAR_WD_FREQ,
    VAR_WV_COUNT,
)

from windkit.time_series_wind_climate import ts_validate
from windkit.binned_wind_climate import bwc_validate, count_to_ws_freq_by_sector


def wwc_validate(wwc):
    """This one is required because windkit has only decorator
    for validating the wwc and here a simple function is needed
    that you can call on observations and model datasets. The checks
    are just copied from the decorator function in windkit:

    from windkit.weibull_wind_climate import wwc_validate

    Parameters
    ----------
    wwc : [type]
        [description]

    Raises
    ------
    ValueError
        [description]
    ValueError
        [description]
    """

    if not all(v in wwc.data_vars for v in ["A", "k", "wdfreq"]):
        raise ValueError("wwc dataset should contain A, k and wdfreq!")
    if "sector" not in wwc["wdfreq"].dims:
        raise ValueError("Expected sector in wdfreq dims!")


def bwc_validate_mod(bwc: xr.Dataset):
    """Convert binned wind climate format from wind vector count
    to ws freq by sector before validation and validates afterwards

    Parameters
    ----------
    bwc : [xr.Dataset]
        [bwc xarray dataset to validate]
    """
    if "wsfreq" in bwc.data_vars:
        return

    bwc = count_to_ws_freq_by_sector(bwc["wv_count"])

    bwc_validate(bwc)
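
# Illustrative sketch of the two accepted inputs (the dataset variables are
# assumptions based on the checks above, not a full windkit specification):
#
#   bwc_validate_mod(bwc_wsfreq)    # already holds "wsfreq"; returns early
#   bwc_validate_mod(bwc_wv_count)  # "wv_count" is converted, then validated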


"""all stats & metrics obj are created beforehand
in this way we can select whichever fits the need
depending on the type of data during the validation"""
STATS = {
    "ts": TimeSeriesStats(),
    "hist": HistogramStats(),
    "weib": WeibStats(),
}
METRICS = {
    "ts": TimeSeriesMetrics(),
    "hist": HistogramMetrics(),
    "weib": WeibMetrics(),
}


def validate(
    obs: xr.Dataset,
    mod: xr.Dataset,
    dtype: str = None,
    stats: str = "basic",
    metrics: str = "basic",
    **kwargs,
) -> xr.Dataset:
    """Validate modelled wind data against observations.

    The validation calculates "stats" and "metrics" and outputs them as the
    results. Stats are calculated on the modelled and observed data
    separately, for example the mean of the wind speed or the variance of the
    wind direction. Metrics are measures calculated between the observed and
    modelled wind data, for example the Pearson correlation coefficient, the
    mean absolute error, or the mean wind direction error.

    The data variables in the output xr.Dataset are named with the convention
    `obs_VARIABLE_STATISTIC` for observations and `mod_VARIABLE_STATISTIC` for
    modelled data. The metrics are labelled as `VARIABLE_METRIC`.

    Parameters
    ----------
    obs, mod : xarray.Dataset
        Observed and modelled data to validate.
    dtype : str, optional
        Explicitly state the data format. Possible options: ts, hist, weib.
        By default None.
    stats : str or list, optional
        Stats to be calculated. If a string is used, it should be the name of
        a suite of stats. The available suites and their included stats are:

        "basic":
            - wind speed mean
            - wind speed standard deviation
            - wind direction circular mean
            - wind direction circular standard deviation
            - etc.
        "all":
            - every available stat (see documentation for stats)

        If a list is used, it must contain tuples of the form
        ('variable', 'stat'), for example:
        [('wind_speed', 'mean'), ('power_density', 'mean'), ...].
        All available options can be found under the respective format folder
        in the stats.py file.
    metrics : str or list, optional
        Metrics to be calculated. The metrics suites are:

        "basic":
            - wind speed Pearson correlation coefficient
            - wind speed Spearman correlation coefficient
            - wind speed coefficient of determination
            - wind speed mean error
            - wind speed root-mean-square error
            - wind speed mean-absolute error
            - wind speed mean-absolute-percentage error
            - wind direction circular mean error
            - wind direction circular mean-absolute error
        "all":
            - every available metric (see documentation for metrics)

        If a list is used, it must contain tuples of the form
        (('variable', 'stat'), 'metric'), for example:
        [(('wind_speed', 'mean'), 'me'), (('power_density', 'mean'), 'mpe')].
        All available options can be found under the respective format folder
        in the metrics.py file.
    dim : str, optional
        Dimension to calculate time series stats and metrics along, by default
        "time". Note that this is only applicable to time series data.
    by : numpy.array or xarray.DataArray, optional
        Optional grouper array to calculate stats and metrics in groups based
        on the unique values in the array.

    Returns
    -------
    xarray.Dataset
        Validation results of stats and metrics.

    Raises
    ------
    ValueError
        Raised if the automatic inference of the data format ("hist", "ts",
        "weib") is unsuccessful; in that case the user should provide the
        format explicitly.

    Examples
    --------
    >>> results = validate(obs, mod, stats='all', metrics='all')
    """
    dtype = _infer_data_type(obs, mod) if dtype is None else dtype

    if dtype not in STATS.keys():
        raise ValueError(
            f"Failed to infer known format for both datasets. "
            f"Known types: {list(STATS.keys())}. "
            f"While provided one is: {str(dtype)}."
        )

    stats = STATS[dtype]._prepare_stats_from_user_input(stats)
    metrics = METRICS[dtype]._prepare_metrics_from_user_input(metrics)

    result = {}
    datasets = [("obs", obs), ("mod", mod)]

    # TODO: This is ugly, but for now it will do the job.
    # Time series validation is handled a bit differently.
    if dtype == "ts":
        kwargs["dim"] = DIM_TIME
        for name, dataset in datasets:
            # Add the power density variable; it can always be calculated from
            # the wind speed. The air density is optionally used if available.
            dataset = _add_power_density(dataset, dtype)
            for var, stat, func in stats:
                label = f"{name}_{var}_{stat}"
                da = dataset[var].copy()
                da.name = label
                result[label] = _calc(func, da, **kwargs)
        for var, metric, func in metrics:
            label = f"{var}_{metric}"
            da_obs, da_mod = obs[var].copy(), mod[var].copy()
            da_obs.name, da_mod.name = label, label
            result[label] = _calc(func, da_obs, da_mod, **kwargs)
    else:
        for name, dataarr in datasets:
            dataarr = _add_power_density(dataarr, dtype)
            for var, stat, func in stats:
                label = f"{name}_{var}_{stat}"
                da = dataarr.copy()
                da_out = _calc(func, da, **kwargs)
                da_out.name = label
                result[label] = da_out
        for var, metric, func in metrics:
            label = f"{var}_{metric}"
            if isinstance(var, tuple):
                # Calculate the metric on a specific stat.
                label = f"{var[0]}_{metric}"
                stat_var = "_".join([*var])
                obs_label = "_".join(["obs", stat_var])
                mod_label = "_".join(["mod", stat_var])
                try:
                    da_obs, da_mod = result[obs_label], result[mod_label]
                except KeyError:
                    warnings.warn(
                        f"The required stat {stat_var} is not present"
                        f" to calculate {'_'.join(var)}_{metric} metric."
                    )
                    continue
            elif isinstance(var, str):
                # Calculate the metric on the raw datasets.
                label = f"{var}_{metric}"
                da_obs, da_mod = obs.copy(), mod.copy()
            else:  # pragma: no cover
                raise RuntimeError("Unknown internal type of metric variable")
            da_out = _calc(func, da_obs, da_mod, **kwargs)
            da_out.name = label
            result[label] = da_out

    validation_result = xr.merge(result.values(), compat="override")
    validation_result = validation_result.assign_attrs(mod.attrs)
    validation_result = validation_result.assign_attrs(obs.attrs)
    validation_result = validation_result.assign_attrs(
        {"format": _get_full_name_of_data_format(dtype)}
    )
    return validation_result
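

# A hedged usage sketch of the list form of ``stats`` and ``metrics`` (the
# available variable/stat/metric names depend on the data format; dtype="weib"
# is assumed here for illustration):
#
#   stats = [("wind_speed", "mean"), ("power_density", "mean")]
#   metrics = [(("wind_speed", "mean"), "me"), (("power_density", "mean"), "mpe")]
#   results = validate(obs, mod, dtype="weib", stats=stats, metrics=metrics)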


def _calc(func, *args, **kwargs):
    # Without a grouper, apply the stat/metric function directly.
    if kwargs.get("by", None) is None:
        return func(*args, **kwargs)

    # With a grouper, compute the stat/metric per unique group value and
    # stack the results along a new dimension named after the grouper.
    result = []
    by = kwargs.get("by")
    del kwargs["by"]
    for cat in np.unique(by):
        if np.isnan(cat):
            continue
        args_cat = tuple(arg.where(by == cat) for arg in args)
        da_out_cat = func(*args_cat, **kwargs)
        da_out_cat = da_out_cat.expand_dims(**{by.name: [cat]})
        result.append(da_out_cat)
    return xr.merge(result)


def _get_full_name_of_data_format(dtype):
    if dtype == "hist":
        return "histogram"
    if dtype == "ts":
        return "timeseries"
    if dtype == "weib":
        return "weibull"
    return ""  # pragma: no cover


def _infer_data_type(obs: xr.Dataset, mod: xr.Dataset) -> Union[str, None]:
    """Try to infer the data format of both the observation and model data.

    Parameters
    ----------
    obs : xarray.Dataset
        Observations dataset.
    mod : xarray.Dataset
        Model dataset.

    Returns
    -------
    Union[str, None]
        The detected data format as a string, or None if no format could be
        detected.
    """
    # The sequence of validation is meant to be maintained as it is now,
    # because weibull is the most mainstream format of the data.
    try:
        if "sector" not in obs.dims or "sector" not in mod.dims:
            raise ValueError
        wwc_validate(obs)
        wwc_validate(mod)
        return "weib"
    except ValueError:
        pass

    try:
        if "sector" not in obs.dims or "sector" not in mod.dims:
            raise ValueError
        bwc_validate_mod(obs)
        bwc_validate_mod(mod)
        return "hist"
    except ValueError:
        pass
    except KeyError:
        pass

    try:
        ts_validate(obs)
        ts_validate(mod)
        return "ts"
    except ValueError:
        pass

    return None


def detect_spatial_dims(ds):
    # Pick a data variable whose dimensions can host a constant air density field.
    if VAR_WIND_SPEED in ds.data_vars:
        air_density_dims = ds[VAR_WIND_SPEED]
    elif VAR_WD_FREQ in ds.data_vars:
        air_density_dims = ds[VAR_WD_FREQ].isel(sector=0)
    elif VAR_WV_COUNT in ds.data_vars:
        air_density_dims = ds[VAR_WV_COUNT].isel(sector=0, wsbin=0)
    else:
        raise ValueError("Could not detect structure for adding air density!")
    return air_density_dims


def _add_power_density(ds, dtype):
    try:
        _ = ds[VAR_AIR_DENS]
    except KeyError:
        # Fall back to a standard air density of 1.225 kg/m^3 on the detected structure.
        air_density_dims = detect_spatial_dims(ds)
        ds[VAR_AIR_DENS] = xr.full_like(air_density_dims, 1.225)

    if dtype == "ts":
        # Power density: 0.5 * air density * wind speed cubed.
        ds[VAR_POWER_DENS] = 0.5 * ds[VAR_AIR_DENS] * ds[VAR_WIND_SPEED] ** 3

    return ds
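

# A hedged sketch of grouped validation for time series data. The grouper must
# be a named DataArray aligned with the data, since _calc masks each group with
# ``by == cat`` and stacks the results along a new dimension named after
# ``by.name``. The monthly grouper below is an assumption, not a library API:
#
#   month = obs["time"].dt.month.rename("month")
#   monthly_results = validate(obs, mod, dtype="ts", by=month)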