Source code for paradance.dataloader.base

from abc import ABC, abstractmethod
from typing import Dict, List, Optional, Tuple, Union

import pandas as pd
from pydantic import BaseModel


class BaseDataLoaderConfig(BaseModel):
    """Validated settings shared by data loaders.

    Every field is optional and carries a default, so the model can be built
    from any subset of the settings. Validation is delegated to Pydantic's
    ``BaseModel``.

    Attributes:
        file_path (Optional[str]): Path of the file to load. Defaults to None.
        file_name (Optional[str]): Name of the file to load. Defaults to None.
        file_type (Optional[str]): Type of the file to load. Defaults to
            'csv'.
        max_rows (Optional[int]): Maximum number of rows to load from the
            file. Defaults to None, indicating no limit.
        clean_zero_columns (Optional[Union[bool, List]]): Column selection
            that ``BaseDataLoader`` forwards to ``clean_columns_zero`` once
            the data is loaded. Defaults to None, which skips that step.
    """

    file_path: Optional[str] = None
    file_name: Optional[str] = None
    file_type: Optional[str] = "csv"
    max_rows: Optional[int] = None
    clean_zero_columns: Optional[Union[bool, List]] = None
class BaseDataLoader(ABC):
    """Base class for data loaders.

    Subclasses implement :meth:`load_data`. Constructing a loader resolves the
    configuration, calls :meth:`load_data`, shortens the column names with
    :meth:`column_name_spliting` and, when ``clean_zero_columns`` is set,
    filters the rows with :meth:`clean_columns_zero`.
    """

    def __init__(
        self,
        file_path: Optional[str] = None,
        file_name: Optional[str] = None,
        file_type: Optional[str] = "csv",
        max_rows: Optional[int] = None,
        clean_zero_columns: Optional[Union[bool, List]] = None,
        config: Optional[Dict] = None,
    ) -> None:
        """Resolve the configuration and load the data.

        :param file_path: path of the file to load
        :param file_name: name of the file to load
        :param file_type: type of the file to load
        :param max_rows: maximum number of rows to load
        :param clean_zero_columns: column selection passed to
            :meth:`clean_columns_zero`; ``None`` skips the cleaning step
        :param config: mapping with the same settings; when given it is the
            only source of configuration and the individual keyword arguments
            are ignored
        """
        if config is not None:
            self.config = BaseDataLoaderConfig(**config)
        else:
            self.config = BaseDataLoaderConfig(
                file_path=file_path,
                file_name=file_name,
                file_type=file_type,
                max_rows=max_rows,
                clean_zero_columns=clean_zero_columns,
            )

        # Mirror the validated settings as plain attributes.
        self.file_path = self.config.file_path
        self.file_name = self.config.file_name
        self.file_type = self.config.file_type
        self.max_rows = self.config.max_rows
        self.clean_zero_columns = self.config.clean_zero_columns

        self.df = self.load_data()
        self.column_name_spliting()
        if self.clean_zero_columns is not None:
            self.clean_columns_zero(self.clean_zero_columns)

    @abstractmethod
    def load_data(self) -> pd.DataFrame:
        """Load data from file."""
        raise NotImplementedError("load_data() not implemented")

    def column_name_spliting(self, delimiter: str = ".") -> None:
        """Keep only the part of each column name after the last delimiter.

        Labels that are not strings are left untouched. Does nothing when no
        data frame is loaded.

        :param delimiter: separator between a column name and its prefix
        """
        if self.df is None:
            return
        self.df.columns = pd.Index(
            [
                label.split(delimiter)[-1] if isinstance(label, str) else label
                for label in self.df.columns
            ]
        )

    def add_one_smoothing(self, column: str) -> None:
        """Add one smoothing to a column.

        :param column: column name
        """
        if self.df is not None:
            self.df[column] = self.df[column] + 1

    def clean_columns_zero(self, columns: Union[bool, List] = False) -> None:
        """Keep only the rows that are strictly positive in ``columns``.

        Missing values in the selected columns count as 0, so those rows are
        dropped as well. Afterwards the index is reset and every remaining
        missing value in the frame is replaced by 0.

        :param columns: list of column names to check; ``True`` selects every
            numeric column; ``False`` (the default) leaves the data untouched
        """
        if self.df is None or columns is False:
            return
        if columns is True:
            # ``df[True]`` would be a lookup of a column literally named True,
            # so expand the flag into the numeric columns explicitly.
            columns = list(self.df.select_dtypes(include="number").columns)

        selected = self.df[columns].fillna(0)
        self.df[columns] = selected
        keep = (selected > 0).all(axis=1)
        self.df = self.df[keep].reset_index(drop=True).fillna(0)

    @staticmethod
    def clip_and_sum_with_group(
        df: pd.DataFrame, groupby: str, clip_column: str
    ) -> Optional[pd.Series]:
        """Clip a column at its 0.999 quantile and sum it per group.

        Note that the clipped values are stored in ``df`` itself, in a column
        named ``"clipped"``.

        :param df: dataframe
        :param groupby: groupby column
        :param clip_column: column to clip
        :return: per-group sum of the clipped values plus one, or ``None``
            when ``df`` is ``None``
        """
        if df is None:
            return None
        threshold = df[clip_column].quantile(0.999)
        df["clipped"] = df[clip_column].clip(upper=threshold)
        return df.groupby(groupby)["clipped"].sum() + 1  # add one smoothing

    @staticmethod
    def clean_one_label_users(
        df: pd.DataFrame,
        user_column: str = "user_id",
        label_column: str = "label",
    ) -> pd.DataFrame:
        """Remove users with only one label.

        Missing values are replaced by 0 before grouping. The input frame is
        not modified; the result is a copy with a fresh 0..n-1 index.

        :param df: dataframe
        :param user_column: user column name
        :param label_column: label column name
        :return: rows of users that have more than one distinct label
        """
        df = df.fillna(0)
        # Number of distinct labels of each row's user, aligned with the rows.
        label_variety = df.groupby(user_column)[label_column].transform(
            "nunique"
        )
        # Positional mask: independent of the index, which may hold duplicates.
        valid_df = df[(label_variety > 1).to_numpy()].copy()
        valid_df.reset_index(drop=True, inplace=True)
        return valid_df

    @staticmethod
    def clip_clean_count_with_group(
        df: pd.DataFrame,
        groupby: str,
        label_column: str,
        clip_column: Optional[str] = None,
    ) -> Tuple[pd.DataFrame, Optional[pd.Series]]:
        """Clip and count with group.

        Users with a single label are removed first. When ``clip_column`` is
        given, the cleaned frame is passed to
        :meth:`clip_and_sum_with_group`, which also adds a ``"clipped"``
        column to it.

        :param df: dataframe
        :param groupby: groupby column
        :param label_column: label column
        :param clip_column: column to clip
        :return: the cleaned dataframe and the per-group counts (``None``
            when ``clip_column`` is ``None``)
        """
        df_clean = BaseDataLoader.clean_one_label_users(
            df=df, user_column=groupby, label_column=label_column
        )
        counts_df = (
            None
            if clip_column is None
            else BaseDataLoader.clip_and_sum_with_group(
                df=df_clean, groupby=groupby, clip_column=clip_column
            )
        )
        return df_clean, counts_df