Source code for paradance.evaluation.calculator

from typing import Dict, List, Optional

import numpy as np
import pandas as pd

from .base_calculator import BaseCalculator
from .calculate_json_formula import JSONFormula, calculate_formula_scores



[docs]
class Calculator(BaseCalculator):
    """Compute a per-row ``overall_score`` for a DataFrame from weighted column formulas.

    The score formula is chosen by ``equation_type`` and evaluated by
    :meth:`get_overall_score`; the remaining methods derive helper columns and
    sampling metadata from score columns.

    Attributes:
        df (pd.DataFrame): The DataFrame to perform calculations on.
        df_len (int): Number of rows in ``df`` when the calculator was created.
        selected_columns (List[str]): The names of the columns to include in calculations.
        overall_score_lower_bound (Optional[float]): The lower bound for overall scores.
        overall_score_upper_bound (Optional[float]): The upper bound for overall scores.
        equation_eval_str (Optional[str]): A string representing a custom equation to evaluate.
        rerank_eval_str (Optional[str]): A string representing a custom reranking equation.
        equation_type (str): The type of equation to use for calculations ("product", "sum", "free_style", or "json").
        equation_json (JSONFormula): Formula object built from the ``equation_json`` constructor argument,
            used when ``equation_type`` is "json".
        delimiter (Optional[str]): Delimiter forwarded to the JSON formula evaluation.
        selected_values (np.ndarray): The values of the selected columns in the DataFrame.
        value_scales (np.ndarray): The negative average log10 magnitude of absolute values for selected columns
            (populated by :meth:`value_scale`).
        weights_for_groups (pd.Series): A Series containing weights for different groups within the DataFrame.
    """


[docs]
    def __init__(
        self,
        df: pd.DataFrame,
        selected_columns: List[str],
        overall_score_lower_bound: Optional[float] = None,
        overall_score_upper_bound: Optional[float] = None,
        equation_type: str = "product",
        weights_for_groups: Optional[pd.Series] = None,
        equation_eval_str: Optional[str] = None,
        equation_json: Optional[Dict] = None,
        delimiter: Optional[str] = "#",
        rerank_eval_str: Optional[str] = None,
    ) -> None:
        """Initializes the Calculator object.

        Args:
            df (pd.DataFrame): The DataFrame to perform calculations on.
            selected_columns (List[str]): The names of the columns to include in calculations.
            overall_score_lower_bound (Optional[float], optional): Lower bound for overall scores,
                forwarded to the base class. Defaults to None.
            overall_score_upper_bound (Optional[float], optional): Upper bound for overall scores,
                forwarded to the base class. Defaults to None.
            equation_type (str, optional): The type of equation to use for score calculation. Defaults to "product".
            weights_for_groups (Optional[pd.Series], optional): A Series containing weights for different groups.
                Defaults to None, which sets equal weights (all ones, indexed like ``df``).
            equation_eval_str (Optional[str], optional): A string representing a custom equation for
                free-style calculations. Defaults to None.
            equation_json (Optional[Dict], optional): Keyword arguments used to build a ``JSONFormula``
                for the "json" equation type. Defaults to None.
            delimiter (Optional[str], optional): Delimiter passed along to the JSON formula evaluation.
                Defaults to "#".
            rerank_eval_str (Optional[str], optional): A string representing a custom equation for reranking.
                Defaults to None.
        """
        super().__init__(
            df=df,
            selected_columns=selected_columns,
            overall_score_lower_bound=overall_score_lower_bound,
            overall_score_upper_bound=overall_score_upper_bound,
            rerank_eval_str=rerank_eval_str,
        )
        self.df = df
        self.df_len = len(self.df)
        self.equation_eval_str = equation_eval_str
        self.rerank_eval_str = rerank_eval_str

        # Always define the attribute: get_overall_score tests
        # `self.equation_json is not None`, which would raise AttributeError
        # if the attribute only existed when a formula had been supplied.
        self.equation_json = (
            JSONFormula(**equation_json) if equation_json is not None else None
        )

        self.delimiter = delimiter
        self.equation_type = equation_type
        self.selected_values = self.df[selected_columns].values

        if weights_for_groups is None:
            # Equal weighting: one unit weight per row, aligned on df's index.
            self.weights_for_groups = pd.Series(
                np.ones(len(self.df)), index=self.df.index
            )
        else:
            self.weights_for_groups = weights_for_groups



[docs]
    def value_scale(self) -> None:
        """
        Computes, per selected column, the negated mean of log10(|value| + 1e-10)
        (NaN entries are ignored by the mean) and stores the resulting array in
        `self.value_scales`.
        """
        abs_values = self.df[self.selected_columns].abs().values
        # The small offset keeps log10 finite for exact zeros.
        log_magnitudes = np.log10(abs_values + 1e-10)
        mean_log_magnitude = np.nanmean(log_magnitudes, axis=0)

        self.value_scales = np.asarray(-mean_log_magnitude)



[docs]
    def get_overall_score(
        self,
        weights_for_equation: List[float],
    ) -> None:
        """Calculates the overall score for each row and stores it in ``self.df["overall_score"]``.

        The formula is selected as follows (first match wins):

        * ``equation_type == "product"`` with one weight per selected column:
          product over columns of ``value ** weight``.
        * ``equation_type == "sum"``: weighted sum of the selected columns.
        * ``equation_type == "free_style"``: ``equation_eval_str`` is evaluated
          against the names returned by ``initialize_local_dict``.
        * ``equation_type == "json"`` with a JSON formula available: delegated
          to ``calculate_formula_scores``.
        * otherwise, when twice as many weights as selected columns are given:
          the first half are powers ``p`` and the second half first-order
          weights ``w``; the score is the product over columns of
          ``(1 + w * value) ** p``.

        After the score is written, ``_clip_overall_score`` and
        ``rerank_with_side_information`` are invoked.

        Args:
            weights_for_equation (List[float]): A list of weights to apply to each selected column for the calculation.

        Raises:
            ValueError: If ``equation_type`` is "free_style" but no
                ``equation_eval_str`` is defined, or if no formula matches the
                equation type and number of weights (previously this case
                silently left ``overall_score`` missing or stale).
        """
        n_columns = len(self.selected_columns)
        n_weights = len(weights_for_equation)
        # Read defensively: the attribute may be absent on instances that were
        # built without a JSON formula.
        equation_json = getattr(self, "equation_json", None)

        if self.equation_type == "product" and n_weights == n_columns:
            exponents = np.asarray(weights_for_equation)
            self.df["overall_score"] = np.prod(
                self.selected_values**exponents, axis=1
            )

        elif self.equation_type == "sum":
            # A 1-D weight vector yields a 1-D score per row, which is the
            # natural shape for a single DataFrame column.
            self.df["overall_score"] = self.selected_values @ np.asarray(
                weights_for_equation
            )

        elif self.equation_type == "free_style":
            if self.equation_eval_str is None:
                raise ValueError("equation_eval_str is not defined.")
            columns = [
                self.selected_values[:, i] for i in range(self.selected_values.shape[1])
            ]
            local_dict = self.initialize_local_dict(weights_for_equation, columns)
            # NOTE(security): clearing __builtins__ is not a sandbox; eval can
            # still be abused through object introspection. equation_eval_str
            # must only ever come from trusted configuration.
            self.df["overall_score"] = eval(
                self.equation_eval_str, {"__builtins__": None}, local_dict
            )

        elif self.equation_type == "json" and equation_json is not None:
            self.df["overall_score"] = calculate_formula_scores(
                equation_json=equation_json,
                selected_values=self.df[self.selected_columns],
                weights=weights_for_equation,
                delimiter=self.delimiter,
            )

        elif n_weights == 2 * n_columns:
            powers_for_equation = np.asarray(weights_for_equation[:n_columns])
            first_order_weights = np.asarray(weights_for_equation[n_columns:])
            self.df["overall_score"] = np.prod(
                (1 + first_order_weights * np.asarray(self.selected_values))
                ** powers_for_equation,
                axis=1,
            )

        else:
            # Fail loudly instead of clipping/reranking a stale or missing score.
            raise ValueError(
                f"Cannot compute overall score: equation_type={self.equation_type!r} "
                f"with {n_weights} weights for {n_columns} selected columns."
            )

        self._clip_overall_score()
        self.rerank_with_side_information()



[docs]
    def create_score_columns(
        self, boundary_dict: dict, score_column: str = "score"
    ) -> None:
        """Adds one 0/1 indicator column to the DataFrame per score boundary.

        For every key ``k`` of ``boundary_dict`` a column named
        ``"{score_column}_lt_{k}"`` is written, holding 1 where the score is
        greater than or equal to ``k`` and 0 otherwise. Only the keys are used;
        the dictionary values are ignored.

        Args:
            boundary_dict (Dict): A dictionary whose keys are the score boundaries.
            score_column (str, optional): The name of the column to apply the boundaries to. Defaults to "score".
        """
        scores = self.df[score_column]
        for boundary in boundary_dict:
            flag_name = f"{score_column}_lt_{boundary}"
            reaches_boundary = scores >= boundary
            self.df[flag_name] = reaches_boundary.astype(int)



[docs]
    def initialize_fq_sampler(
        self,
        sample_size: int,
        score_column: str,
        slice_from: Optional[float] = None,
        slice_to: Optional[float] = None,
        log_scale: Optional[bool] = True,
        laplace_smoothing: Optional[bool] = True,
    ) -> None:
        """Builds a frequency sampler for a score column and records its results.

        The sampler's sampled boundaries are turned into indicator columns via
        ``create_score_columns``, the sampler itself is stored in
        ``self.samplers[score_column]``, and the index of the rows whose score
        lies within ``[slice_from, slice_to]`` (an unset bound is unbounded) is
        stored in ``self.woauc_dict[score_column]``.

        Args:
            sample_size (int): The size of the sample to generate.
            score_column (str): The name of the score column to sample from.
            slice_from (Optional[float], optional): The lower bound of the score range to sample. Defaults to None.
            slice_to (Optional[float], optional): The upper bound of the score range to sample. Defaults to None.
            log_scale (Optional[bool], optional): Whether to use logarithmic scaling for sampling. Defaults to True.
            laplace_smoothing (Optional[bool], optional): Whether to apply Laplace smoothing to the sampling. Defaults to True.
        """
        from ..sampling.frequency_sampler import FrequencySampler

        sampler = FrequencySampler(
            sample_size=sample_size,
            data=self.df[score_column],
            slice_from=slice_from,
            slice_to=slice_to,
            log_scale=log_scale,
            laplace_smoothing=laplace_smoothing,
        )
        sampled_boundaries = sampler.sample()
        self.create_score_columns(
            boundary_dict=sampled_boundaries, score_column=score_column
        )
        self.samplers[score_column] = sampler

        # Start from "every row qualifies" and narrow by each bound that is set.
        in_range = pd.Series(True, index=self.df.index)
        if slice_from is not None:
            in_range = in_range & (self.df[score_column] >= slice_from)
        if slice_to is not None:
            in_range = in_range & (self.df[score_column] <= slice_to)

        self.woauc_dict[score_column] = self.df[in_range].index