Source code for pipelog.frame_log

from collections import OrderedDict
from typing import Any, Dict, Tuple, Union

import pandas as pd

# Bound ``str.format`` used to build automatic log keys; call it as _NEW_LOG_KEY(0) to get "df_0".
_NEW_LOG_KEY = "df_{}".format

# Name assigned to the index (or index level) that holds the log keys in the view DataFrames.
_LOG_KEY = "log_key"

# Name of the index level that holds the aggregation function names in the ``agg`` view.
_AGG_FUNC_NAME = "agg_func"
# Name assigned to the column axis of the view DataFrames.
_COL_NAME = "col_name"
# Column labels of the ``shape`` view.
_N_ROWS = "n_rows"
_N_COLS = "n_cols"


class FrameLog:
    """Plain container for the values recorded about one DataFrame.

    All attributes default to ``None``. Instances compare equal when every
    attribute matches; DataFrame attributes are compared with
    ``pd.testing.assert_frame_equal``.
    """

    def __init__(
        self,
        agg: pd.DataFrame = None,
        agg_axis: int = None,
        dtypes: dict = None,
        shape: Tuple[int, int] = None,
        column_names: list = None,
        copy: pd.DataFrame = None,
    ) -> None:
        """Init empty FrameLog; every argument is stored unchanged as an attribute of the same name."""
        self.agg = agg
        self.agg_axis = agg_axis
        self.dtypes = dtypes
        self.shape = shape
        self.column_names = column_names
        self.copy = copy

    def __eq__(self, o: object) -> bool:
        """Compare two FrameLogs attribute by attribute.

        Non-DataFrame attributes are compared with ``!=``. DataFrame
        attributes must be exactly equal according to
        ``pd.testing.assert_frame_equal``. An attribute that is a DataFrame on
        one side only makes the objects unequal.

        Returns:
            True if ``o`` is a FrameLog and all attributes match, else False.
        """
        if not isinstance(o, FrameLog):
            return False

        for attr in vars(self):
            a1, a2 = getattr(self, attr), getattr(o, attr)
            a1_is_frame = isinstance(a1, pd.DataFrame)
            a2_is_frame = isinstance(a2, pd.DataFrame)

            if a1_is_frame or a2_is_frame:
                # A DataFrame can never equal a non-DataFrame. This must be decided
                # here, because ``frame != other`` returns an element-wise DataFrame
                # whose truth value is ambiguous and would raise a ValueError.
                if not (a1_is_frame and a2_is_frame):
                    return False
                try:
                    pd.testing.assert_frame_equal(a1, a2)
                except AssertionError:
                    return False
            elif a1 != a2:
                return False
        return True

    def __repr__(self) -> str:
        """Create an easy to read representation of a FrameLog.

        None values will not be added, to make the representation shorter,
        and DataFrames are abbreviated instead of printed in full.

        Example:
            FrameLog(agg=DataFrame(...), agg_axis=1)
        """
        repr_str = []
        for k, v in vars(self).items():
            if v is None:
                continue
            if isinstance(v, pd.DataFrame):
                v = "DataFrame(...)"
            repr_str.append(f"{k}={v}")
        return f"FrameLog({', '.join(repr_str)})"
class FrameLogCollection(OrderedDict):
    """An OrderedDict, which supports slicing, integer access and some custom functionality.

    Keys must be strings, so that integers and slices can be used unambiguously
    for positional access (``logs[0]``, ``logs[1:3]``).
    """

    def __init__(self, *args, **kwargs) -> None:
        """Overwritten, to initialise additional parameters that should be tracked."""
        # It is important to assign _assignment_counter before super().__init__ because the instantiation might
        # call __setitem__ and will result in not finding this attribute.
        self._assignment_counter = 0
        super().__init__(*args, **kwargs)

    def __setitem__(self, key: str, value: Any) -> None:
        """Overwrites the original version, to be able to count assignments.

        Raises:
            ValueError: If ``key`` is not a string.
        """
        if not isinstance(key, str):
            raise ValueError("Keys should always be a string to enable unambiguous integer access, e.g. logs[0]")
        super().__setitem__(key, value)
        # Counts every assignment, including overwrites of an existing key.
        self._assignment_counter += 1

    def __getitem__(self, k: Union[slice, int, str]) -> Any:
        """Overwrites the original version, to be able to get a list like slice with frame_logs[1:3].

        Args:
            k: A slice (returns a new FrameLogCollection with the selected
                entries), an int (positional access) or a key.

        Raises:
            IndexError: If an integer position is out of range.
            KeyError: If a key does not exist.
        """
        if isinstance(k, slice):
            log_slice = FrameLogCollection()
            for _k in list(self.keys())[k]:
                log_slice[_k] = super().__getitem__(_k)
            return log_slice
        if isinstance(k, int):
            return super().__getitem__(list(self.keys())[k])
        return super().__getitem__(k)

    def append(self, value: "FrameLog", key: str = None) -> str:
        """Append new entry and return the key it was stored under.

        If key is not given a new one will be created based on the internal
        assignment counter. Candidates that are already in use are skipped, so
        appending never overwrites an existing entry.

        Raises:
            KeyError: If an explicitly given ``key`` already exists.
        """
        if key is None:
            counter = self._assignment_counter
            key = _NEW_LOG_KEY(counter)
            # A manually assigned key may collide with the generated one.
            while key in self:
                counter += 1
                key = _NEW_LOG_KEY(counter)
        elif key in self:
            raise KeyError(f"Key '{key}' already exists!")
        self[key] = value
        return key

    def _get_attr_dict(self, attr: str) -> Dict[str, Any]:
        """Map every log key to the attribute ``attr`` of its entry, keeping insertion order."""
        attr_dict = OrderedDict()
        for k, v in self.items():
            attr_dict[k] = getattr(v, attr)
        return attr_dict

    def agg(self, agg_func_first: bool = False) -> pd.DataFrame:
        """View agg values as a multi index DataFrame.

        Args:
            agg_func_first: If True the aggregation function name becomes the
                outer index level and the log key the inner one.

        Returns:
            The concatenated ``agg`` frames with a two level index.
        """
        agg_dict = self._get_attr_dict("agg")

        # Concat with "keys" will result in a multi index for the index
        agg_concat = pd.concat(list(agg_dict.values()), axis=0, keys=list(agg_dict.keys()))

        # Rename indices
        agg_concat.columns.name = _COL_NAME
        agg_concat.index.names = (_LOG_KEY, _AGG_FUNC_NAME)

        if agg_func_first:
            agg_concat = agg_concat.swaplevel()
            # swaplevel() leaves the sorting of both index levels the same, so the new outer index is not grouped.
            # We need to group same keys, but do want the groups to be ordered by first occurrence, not alphabetically.
            # ["sum", "min", "sum", "min"] should become ["sum", "sum", "min", "min"] not ["min", "min", "sum", "sum"]
            agg_func_names = agg_concat.index.get_level_values(_AGG_FUNC_NAME)
            # categories should be: Categories ['sum' < 'min'], so the integer codes follow first occurrence
            first_seen = pd.Categorical(agg_func_names, agg_func_names.unique(), ordered=True)
            # A stable sort keeps the log keys inside every group in their original order.
            sorted_indexer = first_seen.codes.argsort(kind="stable")
            agg_concat = agg_concat.iloc[sorted_indexer]  # Will return copy not view

        return agg_concat

    def dtypes(self) -> pd.DataFrame:
        """View dtypes values as a DataFrame with one row per log key."""
        dtypes_dict = self._get_attr_dict("dtypes")
        df_dtypes = pd.DataFrame(dtypes_dict).T
        df_dtypes.index.name = _LOG_KEY
        df_dtypes.columns.name = _COL_NAME
        return df_dtypes

    def shape(self) -> pd.DataFrame:
        """View shape values as a DataFrame with one row per log key."""
        shape_dict = self._get_attr_dict("shape")
        # Passing the labels to the constructor also works for an empty collection.
        df_shape = pd.DataFrame(
            list(shape_dict.values()),
            index=list(shape_dict.keys()),
            columns=[_N_ROWS, _N_COLS],
        )
        df_shape.index.name = _LOG_KEY
        return df_shape

    def column_names(self) -> pd.DataFrame:
        """View column names as a boolean DataFrame.

        Returns:
            A frame with one row per log key and one column per column name
            (ordered by first occurrence). A cell is True if the logged frame
            contained that column.
        """
        columns_names_dict = self._get_attr_dict("column_names")

        # Union of all column names, ordered by first occurrence.
        all_cols = list(OrderedDict.fromkeys(name for names in columns_names_dict.values() for name in names))
        present = [set(names) for names in columns_names_dict.values()]

        # Built in one step; DataFrame.append was removed in pandas 2.0.
        df_cols = pd.DataFrame(
            {col: [col in names for names in present] for col in all_cols},
            index=list(columns_names_dict.keys()),
            dtype=bool,
        )
        df_cols.columns.name = _COL_NAME
        df_cols.index.name = _LOG_KEY
        return df_cols