Source code for primap2.pm2io._GHG_inventory_reading

"""This file contains functions for reading country GHG inventories
from National Inventory Reports (NIR), Biennial Update Reports (BUR),
and other official country emissions inventories.

Most of the functions in this file are exposed to the outside, yet they
currently do not undergo the strict testing applied to the rest of PRIMAP2 as
they are added during the process of reading and preparing data for the
PRIMAP-hist update. Testing will be added in the future."""

import re
from typing import Optional, Union

import pandas as pd


def nir_add_unit_information(
    df_nir: pd.DataFrame,
    *,
    unit_row: Union[str, int],
    entity_row: Optional[Union[str, int]] = None,
    regexp_entity: str,
    regexp_unit: Optional[str] = None,
    manual_repl_unit: Optional[dict[str, str]] = None,
    manual_repl_entity: Optional[dict[str, str]] = None,
    default_unit: str,
) -> pd.DataFrame:
    """Add unit information to a National Inventory Report (NIR) style DataFrame.

    Add unit information to the header of an "entity-wide" file as present in
    the standard table format of National Inventory Reports (NIRs). The unit and
    entity information is extracted from the row defined by `unit_row` (and,
    if given, `entity_row`). The parameters `regexp_unit` and `regexp_entity`
    determine how this is done by regular expressions for unit and entity.
    Additionally, manual mappings can be defined in the `manual_repl_unit` and
    `manual_repl_entity` dicts.

    For each column the routine first looks up the cell value (converted to a
    string) in `manual_repl_unit` / `manual_repl_entity`. If there is no manual
    mapping, the corresponding regular expression is applied and its first
    match is used. If no unit can be determined that way, `default_unit` is
    used. If no entity can be determined, the analyzed value is used as entity
    unchanged.

    Parameters
    ----------
    df_nir : pd.DataFrame
        Pandas DataFrame with the table to process
    unit_row : str or int
        String "header" to indicate that the column header should be used to
        derive the unit information or an integer specifying the (positional)
        row to use for unit information. If entity and unit information are
        given in the same row use only unit_row.
    entity_row : str or int, optional
        String "header" to indicate that the column header should be used to
        derive the entity information or an integer specifying the (positional)
        row to use for entity information. If entity and unit information are
        given in the same row use only unit_row.
    regexp_entity : str
        regular expression that extracts the entity from the cell value
    regexp_unit : str, optional
        regular expression that extracts the unit from the cell value
    manual_repl_unit : dict, optional
        dict defining unit for given cell values
    manual_repl_entity : dict, optional
        dict defining entity for given cell values
    default_unit : str
        unit to be used if no unit can be extracted and no unit is given

    Returns
    -------
    pd.DataFrame
        Copy of the input with a two-level column header (entity, unit). Rows
        that were used as source for unit or entity information are removed.
    """
    if manual_repl_unit is None:
        manual_repl_unit = {}
    if manual_repl_entity is None:
        manual_repl_entity = {}

    # Positions of data rows that only carry header information and therefore
    # have to be removed from the result.
    rows_to_drop = []

    # The raw values can come either from the column header or a regular row.
    if unit_row == "header":
        values_for_units = list(df_nir.columns)
    else:
        # unit_row must be an integer
        values_for_units = list(df_nir.iloc[unit_row])
        rows_to_drop.append(unit_row)

    if entity_row is None:
        # entity and unit are encoded in the same cells
        values_for_entities = values_for_units
    elif entity_row == "header":
        values_for_entities = list(df_nir.columns)
    else:
        values_for_entities = list(df_nir.iloc[entity_row])
        if entity_row != unit_row:
            rows_to_drop.append(entity_row)

    re_unit = re.compile(regexp_unit) if regexp_unit is not None else None
    re_entity = re.compile(regexp_entity)

    units = []
    for value in values_for_units:
        key = str(value)
        if key in manual_repl_unit:
            units.append(manual_repl_unit[key])
            continue
        matches = re_unit.findall(key) if re_unit is not None else []
        units.append(matches[0] if matches else default_unit)

    entities = []
    for value in values_for_entities:
        key = str(value)
        if key in manual_repl_entity:
            entities.append(manual_repl_entity[key])
            continue
        matches = re_entity.findall(key)
        # keep the original (unconverted) value if nothing could be extracted
        entities.append(matches[0] if matches else value)

    df_out = df_nir.copy()
    df_out.columns = [entities, units]

    if rows_to_drop:
        # Drop by position, not by index label: with duplicate index labels
        # (e.g. tables concatenated from several pages) a label-based drop
        # would also remove data rows sharing the header row's label.
        n_rows = len(df_out)
        drop_positions = {row % n_rows for row in rows_to_drop}
        keep_positions = [
            pos for pos in range(n_rows) if pos not in drop_positions
        ]
        df_out = df_out.iloc[keep_positions]

    return df_out
def nir_convert_df_to_long(
    df_nir: pd.DataFrame, year: int, header_long: Optional[list[str]] = None
) -> pd.DataFrame:
    """Convert an entity-wide NIR table for a single year to a long format DataFrame.

    The input DataFrame is required to have the following structure:

    * Columns for category, original category name, and data in this order,
      where category and original category name form a multiindex.
    * Column header as multiindex for entity and unit

    A column holding the year (as a string) is added during the conversion.
    Cells without a value (NaN) do not appear in the result.

    Parameters
    ----------
    df_nir : pd.DataFrame
        Pandas DataFrame with the NIR table to be converted
    year : int
        Year of the given data
    header_long : list, optional
        specify a non-standard column header, e.g. with only category code
        or orig_cat_name. Defaults to
        ["category", "orig_cat_name", "entity", "unit", "time", "data"].

    Returns
    -------
    pd.DataFrame
        converted DataFrame
    """
    if header_long is None:
        header_long = ["category", "orig_cat_name", "entity", "unit", "time", "data"]

    # Stack both column levels (entity, unit) into the row index. Missing
    # values are removed explicitly instead of via the `dropna` keyword: the
    # result equals the legacy `dropna=True` behaviour, and it also works with
    # the newer pandas stack implementation, which rejects an explicit `dropna`.
    df_stacked = df_nir.stack([0, 1]).dropna().to_frame()

    # The year column is inserted before the data column; after resetting the
    # index it ends up between the former index levels and the data.
    df_stacked.insert(0, "year", str(year))
    df_stacked = df_stacked.reset_index()
    df_stacked.columns = header_long
    return df_stacked