Skip to content

tea_tasting.aggr #

Module for working with aggregated statistics: count, mean, var, cov.

Aggregates(count_=None, mean_={}, var_={}, cov_={}) #

Bases: ReprMixin

Aggregated statistics.

Parameters:

Name Type Description Default
count_ int | None

Sample size (number of observations).

None
mean_ dict[str, float | int]

Dictionary of sample means with variable names as keys.

{}
var_ dict[str, float | int]

Dictionary of sample variances with variable names as keys.

{}
cov_ dict[tuple[str, str], float | int]

Dictionary of sample covariances with pairs of variable names as keys.

{}
Source code in src/tea_tasting/aggr.py
def __init__(
    self,
    count_: int | None = None,
    mean_: dict[str, float | int] = {},  # noqa: B006
    var_: dict[str, float | int] = {},  # noqa: B006
    cov_: dict[tuple[str, str], float | int] = {},  # noqa: B006
) -> None:
    """Aggregated statistics.

    The passed mappings are copied, so later changes to an instance's
    statistics affect neither the caller's dictionaries nor the shared
    default arguments.

    Args:
        count_: Sample size (number of observations).
        mean_: Dictionary of sample means with variable names as keys.
        var_: Dictionary of sample variances with variable names as keys.
        cov_: Dictionary of sample covariances with pairs of variable names as keys.
            Keys are normalized with `_sorted_tuple`, the same helper
            the `cov` method applies before a lookup.
    """
    self.count_ = count_
    # The `{}` defaults are created once and shared by every call. Storing
    # them directly would let a mutation of one instance's `mean_`/`var_`
    # leak into all other instances built with the defaults, so copy them
    # (`cov_` is already rebuilt by the comprehension below).
    self.mean_ = dict(mean_)
    self.var_ = dict(var_)
    self.cov_ = {_sorted_tuple(*k): v for k, v in cov_.items()}

count() #

Sample size (number of observations).

Raises:

Type Description
RuntimeError

Count is None (if it was not defined during initialization).

Returns:

Type Description
int

Sample size (number of observations).

Source code in src/tea_tasting/aggr.py
def count(self) -> int:
    """Return the sample size (number of observations).

    Raises:
        RuntimeError: The count is `None`, i.e. it was not provided
            during initialization.

    Returns:
        Sample size (number of observations).
    """
    sample_size = self.count_
    if sample_size is not None:
        return sample_size
    raise RuntimeError("Count is None.")

cov(left, right) #

Sample covariance.

Assume the variable is a constant if the variable name is None.

Parameters:

Name Type Description Default
left str | None

First variable name.

required
right str | None

Second variable name.

required

Returns:

Type Description
float | int

Sample covariance.

Source code in src/tea_tasting/aggr.py
def cov(self, left: str | None, right: str | None) -> float | int:
    """Sample covariance.

    Assume the variable is a constant if the variable name is `None`.

    Args:
        left: First variable name.
        right: Second variable name.

    Returns:
        Sample covariance.
    """
    if left is None or right is None:
        return 0
    return self.cov_[_sorted_tuple(left, right)]

mean(name) #

Sample mean.

Assume the variable is a constant 1 if the variable name is None.

Parameters:

Name Type Description Default
name str | None

Variable name.

required

Returns:

Type Description
float | int

Sample mean.

Source code in src/tea_tasting/aggr.py
def mean(self, name: str | None) -> float | int:
    """Sample mean.

    Assume the variable is a constant `1` if the variable name is `None`.

    Args:
        name: Variable name.

    Returns:
        Sample mean.
    """
    if name is None:
        return 1
    return self.mean_[name]

ratio_cov(left_numer, left_denom, right_numer, right_denom) #

Sample covariance of the ratios of variables using the Delta method.

Parameters:

Name Type Description Default
left_numer str | None

First numerator variable name.

required
left_denom str | None

First denominator variable name.

required
right_numer str | None

Second numerator variable name.

required
right_denom str | None

Second denominator variable name.

required

Returns:

Type Description
float | int

Sample covariance of the ratios of variables.

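References: Delta method (https://en.wikipedia.org/wiki/Delta_method); Taylor expansions for the moments of functions of random variables (https://en.wikipedia.org/wiki/Taylor_expansions_for_the_moments_of_functions_of_random_variables).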
Source code in src/tea_tasting/aggr.py
def ratio_cov(
    self,
    left_numer: str | None,
    left_denom: str | None,
    right_numer: str | None,
    right_denom: str | None,
) -> float | int:
    """Sample covariance of the ratios of variables using the Delta method.

    Args:
        left_numer: First numerator variable name.
        left_denom: First denominator variable name.
        right_numer: Second numerator variable name.
        right_denom: Second denominator variable name.

    Returns:
        Sample covariance of the ratios of variables.

    References:
        - [Delta method](https://en.wikipedia.org/wiki/Delta_method).
        - [Taylor expansions for the moments of functions of random variables](https://en.wikipedia.org/wiki/Taylor_expansions_for_the_moments_of_functions_of_random_variables).
    """
    left_ratio_of_means = self.mean(left_numer) / self.mean(left_denom)
    right_ratio_of_means = self.mean(right_numer) / self.mean(right_denom)
    return (
        self.cov(left_numer, right_numer)
        - self.cov(left_numer, right_denom) * right_ratio_of_means
        - self.cov(left_denom, right_numer) * left_ratio_of_means
        + self.cov(left_denom, right_denom)
            * left_ratio_of_means * right_ratio_of_means
    ) / self.mean(left_denom) / self.mean(right_denom)

ratio_var(numer, denom) #

Sample variance of the ratio of two variables using the Delta method.

Parameters:

Name Type Description Default
numer str | None

Numerator variable name.

required
denom str | None

Denominator variable name.

required

Returns:

Type Description
float | int

Sample variance of the ratio of two variables.

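References: Delta method (https://en.wikipedia.org/wiki/Delta_method); Taylor expansions for the moments of functions of random variables (https://en.wikipedia.org/wiki/Taylor_expansions_for_the_moments_of_functions_of_random_variables).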
Source code in src/tea_tasting/aggr.py
def ratio_var(
    self,
    numer: str | None,
    denom: str | None,
) -> float | int:
    """Sample variance of the ratio of two variables using the Delta method.

    Args:
        numer: Numerator variable name.
        denom: Denominator variable name.

    Returns:
        Sample variance of the ratio of two variables.

    References:
        - [Delta method](https://en.wikipedia.org/wiki/Delta_method).
        - [Taylor expansions for the moments of functions of random variables](https://en.wikipedia.org/wiki/Taylor_expansions_for_the_moments_of_functions_of_random_variables).
    """
    numer_mean_sq = self.mean(numer) * self.mean(numer)
    denom_mean_sq = self.mean(denom) * self.mean(denom)
    return (
        self.var(numer)
        - 2 * self.cov(numer, denom) * self.mean(numer) / self.mean(denom)
        + self.var(denom) * numer_mean_sq / denom_mean_sq
    ) / denom_mean_sq

var(name) #

Sample variance.

Assume the variable is a constant if the variable name is None.

Parameters:

Name Type Description Default
name str | None

Variable name.

required

Returns:

Type Description
float | int

Sample variance.

Source code in src/tea_tasting/aggr.py
def var(self, name: str | None) -> float | int:
    """Sample variance.

    Assume the variable is a constant if the variable name is `None`.

    Args:
        name: Variable name.

    Returns:
        Sample variance.
    """
    if name is None:
        return 0
    return self.var_[name]

with_zero_div() #

Return aggregates that do not raise an error on division by zero.

Division by zero returns:
  • nan if numerator is equal to 0,
  • inf if numerator is greater than 0,
  • -inf if numerator is less than 0.
Source code in src/tea_tasting/aggr.py
def with_zero_div(self) -> Aggregates:
    """Return aggregates that do not raise an error on division by zero.

    The count is wrapped with `tea_tasting.utils.Int` (when it is not
    `None`) and every mean, variance, and covariance value is passed through
    `tea_tasting.utils.numeric`. A new `Aggregates` object is returned.

    Division by zero returns:
        - `nan` if numerator is equal to `0`,
        - `inf` if numerator is greater than `0`,
        - `-inf` if numerator is less than `0`.
    """
    def _wrap_values(stats):
        # Same keys, values converted by the project's numeric wrapper.
        return {key: tea_tasting.utils.numeric(value) for key, value in stats.items()}

    if self.count_ is None:
        wrapped_count = None
    else:
        wrapped_count = tea_tasting.utils.Int(self.count_)

    wrapped_mean = _wrap_values(self.mean_)
    wrapped_var = _wrap_values(self.var_)
    wrapped_cov = _wrap_values(self.cov_)
    return Aggregates(
        count_=wrapped_count,
        mean_=wrapped_mean,
        var_=wrapped_var,
        cov_=wrapped_cov,
    )

read_aggregates(data, group_col, *, has_count, mean_cols, var_cols, cov_cols) #

Extract aggregated statistics from an Ibis Table or a Pandas DataFrame.

Parameters:

Name Type Description Default
data Table | DataFrame

Granular data.

required
group_col str | None

Column name to group by before aggregation. If None, total aggregates are calculated.

required
has_count bool

If True, calculate the sample size.

required
mean_cols Sequence[str]

Column names for calculation of sample means.

required
var_cols Sequence[str]

Column names for calculation of sample variances.

required
cov_cols Sequence[tuple[str, str]]

Pairs of column names for calculation of sample covariances.

required

Returns:

Type Description
dict[Any, Aggregates] | Aggregates

Aggregated statistics.

Source code in src/tea_tasting/aggr.py
def read_aggregates(
    data: ibis.expr.types.Table | pd.DataFrame,
    group_col: str | None,
    *,
    has_count: bool,
    mean_cols: Sequence[str],
    var_cols: Sequence[str],
    cov_cols: Sequence[tuple[str, str]],
) -> dict[Any, Aggregates] | Aggregates:
    """Extract aggregated statistics from an Ibis Table or a Pandas DataFrame.

    Variances and covariances are computed from demeaned columns as the sum
    of products divided by `count - 1`. The aggregation is expressed with
    Ibis and the result is materialized as a Pandas DataFrame before being
    converted to `Aggregates` objects.

    Args:
        data: Granular data.
        group_col: Column name to group by before aggregation.
            If `None`, total aggregates are calculated.
        has_count: If `True`, calculate the sample size.
        mean_cols: Column names for calculation of sample means.
        var_cols: Column names for calculation of sample variances.
        cov_cols: Pairs of column names for calculation of sample covariances.

    Returns:
        Aggregated statistics: a single `Aggregates` object if `group_col`
        is `None`, otherwise a dictionary with one `Aggregates` object per
        group value.
    """
    # A Pandas DataFrame is registered as a table in an Ibis pandas backend,
    # so the rest of the function works with Ibis expressions only.
    if isinstance(data, pd.DataFrame):
        con = ibis.pandas.connect()
        data = con.create_table("data", data)

    # NOTE(review): `_validate_aggr_cols` is defined elsewhere in the module;
    # presumably it validates/normalizes the column collections — confirm.
    mean_cols, var_cols, cov_cols = _validate_aggr_cols(mean_cols, var_cols, cov_cols)

    # Every column that takes part in a variance or a covariance needs a
    # demeaned counterpart; the set removes duplicated column names.
    demean_cols = tuple({*var_cols, *itertools.chain(*cov_cols)})
    if len(demean_cols) > 0:
        demean_expr = {
            _DEMEAN.format(col): data[col] - data[col].mean()  # type: ignore
            for col in demean_cols
        }
        # With a group column the demeaned columns are added through the
        # grouped table — presumably so that the mean is taken within each
        # group rather than over the whole table (TODO confirm against Ibis).
        grouped_data = data.group_by(group_col) if group_col is not None else data  # type: ignore
        data = grouped_data.mutate(**demean_expr)  # type: ignore

    # Aggregate expressions are built from `data` AFTER the mutate above,
    # because the variance/covariance expressions read the demeaned columns.
    count_expr = {_COUNT: data.count()} if has_count else {}
    mean_expr = {_MEAN.format(col): data[col].mean() for col in mean_cols}  # type: ignore
    # Sample variance: sum of squared deviations divided by (count - 1).
    # The sum is cast to float before the division.
    var_expr = {
        _VAR.format(col): (
            data[_DEMEAN.format(col)] * data[_DEMEAN.format(col)]
        ).sum().cast("float") / (data.count() - 1)  # type: ignore
        for col in var_cols
    }
    # Sample covariance: sum of products of deviations divided by (count - 1).
    cov_expr = {
        _COV.format(left, right): (
            data[_DEMEAN.format(left)] * data[_DEMEAN.format(right)]
        ).sum().cast("float") / (data.count() - 1)  # type: ignore
        for left, right in cov_cols
    }

    # Regroup the (possibly mutated) table and materialize all aggregates
    # into a Pandas DataFrame in a single aggregation.
    grouped_data = data.group_by(group_col) if group_col is not None else data  # type: ignore
    aggr_data = grouped_data.aggregate(
        **count_expr,  # type: ignore
        **mean_expr,  # type: ignore
        **var_expr,
        **cov_expr,
    ).to_pandas()

    # Without grouping the whole result is converted to a single object.
    if group_col is None:
        return _get_aggregates(
            aggr_data,
            has_count=has_count,
            mean_cols=mean_cols,
            var_cols=var_cols,
            cov_cols=cov_cols,
        )

    # With grouping the aggregated frame is split by group value and each
    # part is converted separately.
    return {
        group: _get_aggregates(
            group_data,
            has_count=has_count,
            mean_cols=mean_cols,
            var_cols=var_cols,
            cov_cols=cov_cols,
        )
        for group, group_data in aggr_data.groupby(group_col)
    }