from typing import Dict, List, Optional
import numpy as np
import pandas as pd
from .base_calculator import BaseCalculator
from .calculate_json_formula import JSONFormula, calculate_formula_scores
class Calculator(BaseCalculator):
    """Score the rows of a DataFrame by combining selected columns with an equation.

    Attributes:
        df (pd.DataFrame): The DataFrame to perform calculations on. Result
            columns (e.g. ``overall_score``) are written into it in place.
        df_len (int): Number of rows in ``df`` at construction time.
        selected_columns (List[str]): The names of the columns to include in
            calculations (forwarded to ``BaseCalculator.__init__``).
        overall_score_lower_bound (Optional[float]): The lower bound for overall scores.
        overall_score_upper_bound (Optional[float]): The upper bound for overall scores.
        equation_eval_str (Optional[str]): A string representing a custom equation to evaluate.
        rerank_eval_str (Optional[str]): A string representing a custom equation for reranking.
        equation_json (Optional[JSONFormula]): Parsed JSON formula, or None when
            no ``equation_json`` was supplied.
        delimiter (Optional[str]): Delimiter forwarded to ``calculate_formula_scores``.
        equation_type (str): The type of equation to use for calculations
            ("product", "sum", "free_style", or "json").
        selected_values (np.ndarray): The values of the selected columns in the DataFrame.
        value_scales (np.ndarray): The negative average log10 magnitude of absolute
            values for selected columns. Only set after calling ``value_scale()``.
        weights_for_groups (pd.Series): A Series containing weights for different
            groups within the DataFrame.
    """

    def __init__(
        self,
        df: pd.DataFrame,
        selected_columns: List[str],
        overall_score_lower_bound: Optional[float] = None,
        overall_score_upper_bound: Optional[float] = None,
        equation_type: str = "product",
        weights_for_groups: Optional[pd.Series] = None,
        equation_eval_str: Optional[str] = None,
        equation_json: Optional[Dict] = None,
        delimiter: Optional[str] = "#",
        rerank_eval_str: Optional[str] = None,
    ) -> None:
        """Initializes the Calculator object.

        Args:
            df (pd.DataFrame): The DataFrame to perform calculations on.
            selected_columns (List[str]): The names of the columns to include in calculations.
            overall_score_lower_bound (Optional[float], optional): Lower bound for
                overall scores, forwarded to the base class. Defaults to None.
            overall_score_upper_bound (Optional[float], optional): Upper bound for
                overall scores, forwarded to the base class. Defaults to None.
            equation_type (str, optional): The type of equation to use for score
                calculation. Defaults to "product".
            weights_for_groups (Optional[pd.Series], optional): A Series containing
                weights for different groups. Defaults to None, which sets equal
                weights (a Series of ones aligned to ``df.index``).
            equation_eval_str (Optional[str], optional): A string representing a
                custom equation for free-style calculations. Defaults to None.
            equation_json (Optional[Dict], optional): Keyword arguments used to
                build a ``JSONFormula`` for the "json" equation type. Defaults to None.
            delimiter (Optional[str], optional): Delimiter passed to
                ``calculate_formula_scores``. Defaults to "#".
            rerank_eval_str (Optional[str], optional): A string representing a
                custom equation for reranking. Defaults to None.
        """
        super().__init__(
            df=df,
            selected_columns=selected_columns,
            overall_score_lower_bound=overall_score_lower_bound,
            overall_score_upper_bound=overall_score_upper_bound,
            rerank_eval_str=rerank_eval_str,
        )
        self.df = df
        self.df_len = len(self.df)
        self.equation_eval_str = equation_eval_str
        self.rerank_eval_str = rerank_eval_str
        # Always define the attribute so that get_overall_score can test it
        # against None without risking an AttributeError.
        self.equation_json = (
            JSONFormula(**equation_json) if equation_json is not None else None
        )
        self.delimiter = delimiter
        self.equation_type = equation_type
        self.selected_values = self.df[selected_columns].values
        if weights_for_groups is None:
            # Equal weight for every row.
            self.weights_for_groups = pd.Series(
                np.ones(len(self.df)), index=self.df.index
            )
        else:
            self.weights_for_groups = weights_for_groups

    def value_scale(self) -> None:
        """Compute the per-column scale of the selected columns.

        Calculates the negative average log10 magnitude of absolute values for
        the selected columns in the dataframe (NaNs are ignored by the mean),
        storing the result in `self.value_scales`.
        """
        abs_values = self.df[self.selected_columns].abs().values
        # The small epsilon keeps log10 finite for exact zeros.
        log_magnitudes = np.log10(abs_values + 1e-10)
        self.value_scales = -np.nanmean(log_magnitudes, axis=0)

    def get_overall_score(
        self,
        weights_for_equation: List[float],
    ) -> None:
        """Calculates the overall score for each row and stores it in ``df["overall_score"]``.

        The equation is chosen as follows (first match wins):

        * ``"product"`` with one weight per selected column:
          product over columns of ``value ** weight``.
        * ``"sum"``: matrix product of the selected values with the weights.
        * ``"free_style"``: ``equation_eval_str`` evaluated against the dict
          returned by ``self.initialize_local_dict``.
        * ``"json"`` with a parsed ``equation_json``: ``calculate_formula_scores``.
        * otherwise, if exactly two weights per selected column are given: the
          first half are powers ``p`` and the second half first-order weights
          ``w``; the score is the product over columns of ``(1 + w * value) ** p``.

        Afterwards the score is post-processed by ``self._clip_overall_score()``
        and ``self.rerank_with_side_information()``.

        Args:
            weights_for_equation (List[float]): A list of weights to apply to
                each selected column for the calculation.

        Raises:
            ValueError: If ``equation_type`` is "free_style" but
                ``equation_eval_str`` is None, or if no equation matches the
                combination of ``equation_type`` and number of weights.
        """
        n_columns = len(self.selected_columns)
        n_weights = len(weights_for_equation)

        if self.equation_type == "product" and n_weights == n_columns:
            self.df["overall_score"] = np.prod(
                self.selected_values**weights_for_equation, axis=1
            )
        elif self.equation_type == "sum":
            weights_array = np.array(weights_for_equation).reshape(-1, 1)
            self.df["overall_score"] = self.selected_values @ weights_array
        elif self.equation_type == "free_style":
            if self.equation_eval_str is None:
                raise ValueError("equation_eval_str is not defined.")
            # One 1-D array per selected column.
            columns = list(self.selected_values.T)
            local_dict = self.initialize_local_dict(weights_for_equation, columns)
            # NOTE(security): eval() runs a caller-supplied expression. Builtins
            # are disabled, but this is not a sandbox; only pass trusted strings.
            self.df["overall_score"] = eval(
                self.equation_eval_str, {"__builtins__": None}, local_dict
            )
        elif self.equation_type == "json" and self.equation_json is not None:
            self.df["overall_score"] = calculate_formula_scores(
                equation_json=self.equation_json,
                selected_values=self.df[self.selected_columns],
                weights=weights_for_equation,
                delimiter=self.delimiter,
            )
        elif n_weights == 2 * n_columns:
            # Fallback form: first half of the weights are exponents, second
            # half are first-order (linear) weights.
            powers_for_equation = weights_for_equation[:n_columns]
            first_order_weights = weights_for_equation[n_columns:]
            self.df["overall_score"] = np.prod(
                (1 + np.asarray(first_order_weights) * np.asarray(self.selected_values))
                ** powers_for_equation,
                axis=1,
            )
        else:
            # Previously this case fell through silently and the clipping and
            # reranking below ran on a missing or stale "overall_score" column.
            raise ValueError(
                f"Cannot compute overall_score: equation_type={self.equation_type!r} "
                f"with {n_weights} weights for {n_columns} selected columns "
                "matches no supported equation."
            )

        self._clip_overall_score()
        self.rerank_with_side_information()

    def create_score_columns(
        self, boundary_dict: dict, score_column: str = "score"
    ) -> None:
        """Creates one 0/1 indicator column in the DataFrame per score boundary.

        For every key ``k`` of ``boundary_dict`` a column named
        ``f"{score_column}_lt_{k}"`` is written, holding 1 where
        ``df[score_column] >= k`` and 0 otherwise. Note that despite the
        ``_lt_`` suffix the comparison is "greater than or equal"; the name is
        kept because other code may refer to these columns.

        Args:
            boundary_dict (Dict): A dictionary whose keys are the score
                boundaries. The values are not used here.
            score_column (str, optional): The name of the column to apply the
                boundaries to. Defaults to "score".
        """
        for boundary in boundary_dict:
            self.df[f"{score_column}_lt_{boundary}"] = (
                self.df[score_column] >= boundary
            ).astype(int)

    def initialize_fq_sampler(
        self,
        sample_size: int,
        score_column: str,
        slice_from: Optional[float] = None,
        slice_to: Optional[float] = None,
        log_scale: Optional[bool] = True,
        laplace_smoothing: Optional[bool] = True,
    ) -> None:
        """Initializes a frequency sampler for a score column and records its results.

        Builds a ``FrequencySampler`` over ``df[score_column]``, turns its
        sampled boundaries into indicator columns via ``create_score_columns``,
        stores the sampler in ``self.samplers[score_column]`` and stores the
        index of the rows whose score lies inside ``[slice_from, slice_to]`` in
        ``self.woauc_dict[score_column]``. ``self.samplers`` and
        ``self.woauc_dict`` are not created in this class; they are presumably
        initialized by ``BaseCalculator``.

        Args:
            sample_size (int): The size of the sample to generate.
            score_column (str): The name of the score column to sample from.
            slice_from (Optional[float], optional): The lower bound (inclusive)
                of the score range to sample. Defaults to None (no lower bound).
            slice_to (Optional[float], optional): The upper bound (inclusive)
                of the score range to sample. Defaults to None (no upper bound).
            log_scale (Optional[bool], optional): Whether to use logarithmic
                scaling for sampling. Defaults to True.
            laplace_smoothing (Optional[bool], optional): Whether to apply
                Laplace smoothing to the sampling. Defaults to True.
        """
        # Imported lazily, as in the original code.
        from ..sampling.frequency_sampler import FrequencySampler

        sampler = FrequencySampler(
            sample_size=sample_size,
            data=self.df[score_column],
            slice_from=slice_from,
            slice_to=slice_to,
            log_scale=log_scale,
            laplace_smoothing=laplace_smoothing,
        )
        self.create_score_columns(
            boundary_dict=sampler.sample(), score_column=score_column
        )
        self.samplers[score_column] = sampler

        # Start from "all rows" and narrow by whichever bounds were given.
        in_slice = pd.Series(True, index=self.df.index)
        if slice_from is not None:
            in_slice = in_slice & (self.df[score_column] >= slice_from)
        if slice_to is not None:
            in_slice = in_slice & (self.df[score_column] <= slice_to)
        self.woauc_dict[score_column] = self.df[in_slice].index