Source code for primap2.pm2io._GHG_inventory_reading

"""This file contains functions for reading of country GHG inventories
from National Inventory Reports (NIR), biannual Update Reports (BUR),
and other official country emissions inventories
Most of the functions in this file are exposed to the outside yet they
currently do not undergo the strict testing applied to the rest of PRIMAP2 as
they are added during the process of reading an preparing data for the PRIMAP-hist
update. Testing will be added in the future."""

import re
from typing import Optional, Union

import pandas as pd



[docs]
def nir_add_unit_information(
    df_nir: pd.DataFrame,
    *,
    unit_row: Union[str, int],
    entity_row: Optional[Union[str, int]] = None,
    regexp_entity: str,
    regexp_unit: Optional[str] = None,
    manual_repl_unit: Optional[dict[str, str]] = None,
    manual_repl_entity: Optional[dict[str, str]] = None,
    default_unit: str,
) -> pd.DataFrame:
    """Add unit information to a National Inventory Report (NIR) style DataFrame.

    Add unit information to the header of an "entity-wide" file as
    present in the standard table format of National Inventory Reports (NIRs). The
    unit and entity information is extracted from combined unit and entity information
    in the row defined by `unit_row`. The parameters `regexp_unit` and `regexp_entity`
    determines how this is done by regular expressions for unit and entity.
    Additionally, manual mappings can be defined in the `manual_repl_unit` and
    `manual_repl_entity` dicts. For each column the routine tries to extract a unit
    using the regular expression. If this fails it looks in the `manual_repl_unit`
    dict for unit and in `manual_repl_entity` for entity information. If there is no
    information the default unit given in `default_unit` is used. In this case the
    analyzed value is used as entity unchanged.

    Parameters
    ----------
    df_nir : pd.DataFrame
        Pandas DataFrame with the table to process
    unit_row : str or int
        String "header" to indicate that the column header should be used to derive the
        unit information or an integer specifying the row to use for unit information.
        If entity and unit information are given in the same row use only unit_row.
    entity_row : str or int
        String "header" to indicate that the column header should be used to derive the
        unit information or an integer specifying the row to use for entity information.
        If entity and unit information are given in the same row use only unit_row
    regexp_entity : str
        regular expression that extracts the entity from the cell value
    regexp_unit : str (optional)
        regular expression that extracts the unit from the cell value
    manual_repl_unit : dict (optional)
        dict defining unit for given cell values
    manual_repl_entity : dict (optional)
        dict defining entity for given cell values
    default_unit : str
        unit to be used if no unit can be extracted an no unit is given

    Returns
    -------
    pd.DataFrame
        DataFrame with explicit unit information (as column header)
    """

    if manual_repl_unit is None:
        manual_repl_unit = {}

    if manual_repl_entity is None:
        manual_repl_entity = {}

    cols_to_drop = []

    # get the data to extract the units and entities from
    # can be either the header row or a regular row
    if unit_row == "header":
        values_for_units = list(df_nir.columns)
    else:
        # unit_row must be an integer
        values_for_units = list(df_nir.iloc[unit_row])
        cols_to_drop.append(unit_row)

    if entity_row is not None:
        if entity_row == "header":
            values_for_entities = list(df_nir.columns)
        else:
            values_for_entities = list(df_nir.iloc[entity_row])
            if entity_row != unit_row:
                cols_to_drop.append(entity_row)
    else:
        values_for_entities = values_for_units

    if regexp_unit is not None:
        re_unit = re.compile(regexp_unit)
    re_entity = re.compile(regexp_entity)

    units = values_for_units.copy()
    entities = values_for_entities.copy()

    for idx, value in enumerate(values_for_units):
        if str(value) in manual_repl_unit:
            units[idx] = manual_repl_unit[str(value)]
        else:
            if regexp_unit is not None:
                unit = re_unit.findall(str(value))
            else:
                unit = False

            if unit:
                units[idx] = unit[0]
            else:
                units[idx] = default_unit

    for idx, value in enumerate(values_for_entities):
        if str(value) in manual_repl_entity:
            entities[idx] = manual_repl_entity[str(value)]
        else:
            entity = re_entity.findall(str(value))
            if entity:
                entities[idx] = entity[0]
            else:
                entities[idx] = value

    new_header = [entities, units]

    df_out = df_nir.copy()
    df_out.columns = new_header
    if cols_to_drop:
        df_out = df_out.drop(df_out.index[cols_to_drop])

    return df_out




[docs]
def nir_convert_df_to_long(
    df_nir: pd.DataFrame, year: int, header_long: Optional[list[str]] = None
) -> pd.DataFrame:
    """Convert an entity-wide NIR table for a single year to a long format
    DataFrame.

    The input DataFrame is required to have the following structure:
    * Columns for category, original category name, and data in this order, where
    category and original category name form a multiindex.
    * Column header as multiindex for entity and unit
    A column for the year is added during the conversion.

    Parameters
    ----------
    df_nir: pd.DataFrame
        Pandas DataFrame with the NIR table to be converted
    year: int
        Year of the given data
    header_long: list, optional
        specify a non-standard column header, e.g. with only category code
        or orig_cat_name

    Returns
    -------
    pd.DataFrame
        converted DataFrame
    """
    if header_long is None:
        header_long = ["category", "orig_cat_name", "entity", "unit", "time", "data"]

    df_stacked = df_nir.stack([0, 1], dropna=True).to_frame()
    df_stacked.insert(0, "year", str(year))
    df_stacked = df_stacked.reset_index()
    df_stacked.columns = header_long
    return df_stacked
PRIMAP2

Navigation

Related Topics

Source code for primap2.pm2io._GHG_inventory_reading