Skip to content

tea_tasting.aggr #

Module for working with aggregated statistics: count, mean, var, cov.

Aggregates(count_=None, mean_={}, var_={}, cov_={}) #

Bases: ReprMixin

Aggregated statistics.

Parameters:

Name Type Description Default
count_ int | None

Sample size (number of observations).

None
mean_ dict[str, float | int]

Dictionary of sample means with variable names as keys.

{}
var_ dict[str, float | int]

Dictionary of sample variances with variable names as keys.

{}
cov_ dict[tuple[str, str], float | int]

Dictionary of sample covariances with pairs of variable names as keys.

{}
Source code in src/tea_tasting/aggr.py
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
def __init__(
    self,
    count_: int | None = None,
    mean_: dict[str, float | int] = {},  # noqa: B006
    var_: dict[str, float | int] = {},  # noqa: B006
    cov_: dict[tuple[str, str], float | int] = {},  # noqa: B006
) -> None:
    """Aggregated statistics.

    The dictionaries passed in are copied, so the instance never shares
    state with the caller's objects or with the default `{}` arguments.

    Args:
        count_: Sample size (number of observations).
        mean_: Dictionary of sample means with variable names as keys.
        var_: Dictionary of sample variances with variable names as keys.
        cov_: Dictionary of sample covariances with pairs of variable names as keys.
    """
    self.count_ = count_
    # Shallow copies: the `{}` defaults are created once at definition time,
    # so storing them directly would let a mutation on one instance leak
    # into every other instance built with the defaults.
    self.mean_ = dict(mean_)
    self.var_ = dict(var_)
    # Keys are normalized with `_sorted_tuple`, so `cov` can look a pair up
    # regardless of the order in which the two names are given.
    self.cov_ = {_sorted_tuple(*k): v for k, v in cov_.items()}

count() #

Sample size (number of observations).

Returns:

Type Description
int

Sample size (number of observations).

Source code in src/tea_tasting/aggr.py
70
71
72
73
74
75
76
77
78
def count(self) -> int:
    """Return the stored sample size.

    Returns:
        Sample size (number of observations).

    Raises:
        RuntimeError: If no sample size was stored (`count_` is `None`).
    """
    size = self.count_
    if size is not None:
        return size
    raise RuntimeError("Count is None.")

cov(left, right) #

Sample covariance.

Assume the variable is a constant if the variable name is None.

Parameters:

Name Type Description Default
left str | None

First variable name.

required
right str | None

Second variable name.

required

Returns:

Type Description
float | int

Sample covariance.

Source code in src/tea_tasting/aggr.py
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
def cov(self, left: str | None, right: str | None) -> float | int:
    """Look up the sample covariance of two variables.

    A variable whose name is `None` is treated as a constant, so the
    covariance with it is `0`.

    Args:
        left: First variable name.
        right: Second variable name.

    Returns:
        Sample covariance.
    """
    if left is not None and right is not None:
        # The pair is normalized with `_sorted_tuple` before the lookup.
        pair = _sorted_tuple(left, right)
        return self.cov_[pair]
    return 0

mean(name) #

Sample mean.

Assume the variable is a constant 1 if the variable name is None.

Parameters:

Name Type Description Default
name str | None

Variable name.

required

Returns:

Type Description
float | int

Sample mean.

Source code in src/tea_tasting/aggr.py
80
81
82
83
84
85
86
87
88
89
90
91
92
93
def mean(self, name: str | None) -> float | int:
    """Look up the sample mean of a variable.

    A variable whose name is `None` is treated as the constant `1`.

    Args:
        name: Variable name.

    Returns:
        Sample mean.
    """
    if name is not None:
        return self.mean_[name]
    return 1

ratio_cov(left_numer, left_denom, right_numer, right_denom) #

Sample covariance of the ratios of variables using the Delta method.

Parameters:

Name Type Description Default
left_numer str | None

First numerator variable name.

required
left_denom str | None

First denominator variable name.

required
right_numer str | None

Second numerator variable name.

required
right_denom str | None

Second denominator variable name.

required

Returns:

Type Description
float | int

Sample covariance of the ratios of variables.

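References:

  • Delta method (https://en.wikipedia.org/wiki/Delta_method).
  • Taylor expansions for the moments of functions of random variables (https://en.wikipedia.org/wiki/Taylor_expansions_for_the_moments_of_functions_of_random_variables).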
Source code in src/tea_tasting/aggr.py
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
def ratio_cov(
    self,
    left_numer: str | None,
    left_denom: str | None,
    right_numer: str | None,
    right_denom: str | None,
) -> float | int:
    """Estimate the covariance of two ratios of variables (Delta method).

    Args:
        left_numer: First numerator variable name.
        left_denom: First denominator variable name.
        right_numer: Second numerator variable name.
        right_denom: Second denominator variable name.

    Returns:
        Sample covariance of the ratios of variables.

    References:
        - [Delta method](https://en.wikipedia.org/wiki/Delta_method).
        - [Taylor expansions for the moments of functions of random variables](https://en.wikipedia.org/wiki/Taylor_expansions_for_the_moments_of_functions_of_random_variables).
    """
    # Ratio of means on each side; the denominator means are reused for the
    # final scaling.
    left_numer_mean = self.mean(left_numer)
    left_denom_mean = self.mean(left_denom)
    left_ratio = left_numer_mean / left_denom_mean

    right_numer_mean = self.mean(right_numer)
    right_denom_mean = self.mean(right_denom)
    right_ratio = right_numer_mean / right_denom_mean

    # The four covariance terms of the first-order expansion.
    numer_numer = self.cov(left_numer, right_numer)
    numer_denom = self.cov(left_numer, right_denom) * right_ratio
    denom_numer = self.cov(left_denom, right_numer) * left_ratio
    denom_denom = self.cov(left_denom, right_denom) * left_ratio * right_ratio

    combined = numer_numer - numer_denom - denom_numer + denom_denom
    return combined / left_denom_mean / right_denom_mean

ratio_var(numer, denom) #

Sample variance of the ratio of two variables using the Delta method.

Parameters:

Name Type Description Default
numer str | None

Numerator variable name.

required
denom str | None

Denominator variable name.

required

Returns:

Type Description
float | int

Sample variance of the ratio of two variables.

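References:

  • Delta method (https://en.wikipedia.org/wiki/Delta_method).
  • Taylor expansions for the moments of functions of random variables (https://en.wikipedia.org/wiki/Taylor_expansions_for_the_moments_of_functions_of_random_variables).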
Source code in src/tea_tasting/aggr.py
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
def ratio_var(
    self,
    numer: str | None,
    denom: str | None,
) -> float | int:
    """Sample variance of the ratio of two variables using the Delta method.

    Args:
        numer: Numerator variable name.
        denom: Denominator variable name.

    Returns:
        Sample variance of the ratio of two variables.

    References:
        - [Delta method](https://en.wikipedia.org/wiki/Delta_method).
        - [Taylor expansions for the moments of functions of random variables](https://en.wikipedia.org/wiki/Taylor_expansions_for_the_moments_of_functions_of_random_variables).
    """
    numer_mean_sq = self.mean(numer) * self.mean(numer)
    denom_mean_sq = self.mean(denom) * self.mean(denom)
    return (
        self.var(numer)
        - 2 * self.cov(numer, denom) * self.mean(numer) / self.mean(denom)
        + self.var(denom) * numer_mean_sq / denom_mean_sq
    ) / denom_mean_sq

var(name) #

Sample variance.

Assume the variable is a constant if the variable name is None.

Parameters:

Name Type Description Default
name str | None

Variable name.

required

Returns:

Type Description
float | int

Sample variance.

Source code in src/tea_tasting/aggr.py
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
def var(self, name: str | None) -> float | int:
    """Look up the sample variance of a variable.

    A variable whose name is `None` is treated as a constant, so its
    variance is `0`.

    Args:
        name: Variable name.

    Returns:
        Sample variance.
    """
    if name is not None:
        return self.var_[name]
    return 0

with_zero_div() #

Return aggregates that do not raise an error on division by zero.

Division by zero returns:

  • inf if numerator is greater than 0,
  • nan if numerator is equal to or less than 0.
Source code in src/tea_tasting/aggr.py
55
56
57
58
59
60
61
62
63
64
65
66
67
68
def with_zero_div(self) -> Aggregates:
    """Build a copy whose values do not raise an error on division by zero.

    Division by zero returns:

    - `inf` if numerator is greater than `0`,
    - `nan` if numerator is equal to or less than `0`.

    Returns:
        New `Aggregates` with the count wrapped in `tea_tasting.utils.Int`
        (or left as `None`) and every mean, variance, and covariance passed
        through `tea_tasting.utils.numeric`.
    """
    wrapped_count = None
    if self.count_ is not None:
        wrapped_count = tea_tasting.utils.Int(self.count_)

    to_numeric = tea_tasting.utils.numeric
    wrapped_means = {name: to_numeric(value) for name, value in self.mean_.items()}
    wrapped_vars = {name: to_numeric(value) for name, value in self.var_.items()}
    wrapped_covs = {pair: to_numeric(value) for pair, value in self.cov_.items()}

    return Aggregates(
        count_=wrapped_count,
        mean_=wrapped_means,
        var_=wrapped_vars,
        cov_=wrapped_covs,
    )

read_aggregates(data, group_col, *, has_count, mean_cols, var_cols, cov_cols) #

Extract aggregated statistics.

Parameters:

Name Type Description Default
data Table | IntoFrame

Granular data.

required
group_col str | None

Column name to group by before aggregation. If None, total aggregates are calculated.

required
has_count bool

If True, calculate the sample size.

required
mean_cols Sequence[str]

Column names for calculation of sample means.

required
var_cols Sequence[str]

Column names for calculation of sample variances.

required
cov_cols Sequence[tuple[str, str]]

Pairs of column names for calculation of sample covariances.

required

Returns:

Type Description
dict[Any, Aggregates] | Aggregates

Aggregated statistics.

Source code in src/tea_tasting/aggr.py
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
def read_aggregates(
    data: ibis.expr.types.Table | narwhals.typing.IntoFrame,
    group_col: str | None,
    *,
    has_count: bool,
    mean_cols: Sequence[str],
    var_cols: Sequence[str],
    cov_cols: Sequence[tuple[str, str]],
) -> dict[Any, Aggregates] | Aggregates:
    """Compute aggregated statistics from granular data.

    Args:
        data: Granular data.
        group_col: Column name to group by before aggregation.
            If `None`, total aggregates are calculated.
        has_count: If `True`, calculate the sample size.
        mean_cols: Column names for calculation of sample means.
        var_cols: Column names for calculation of sample variances.
        cov_cols: Pairs of column names for calculation of sample covariances.

    Returns:
        A single `Aggregates` object when `group_col` is `None`; otherwise
        a dictionary of `Aggregates` keyed by the values of `group_col`.
    """
    mean_cols, var_cols, cov_cols = _validate_aggr_cols(mean_cols, var_cols, cov_cols)

    # The same column specification is passed to the reader and to the
    # per-row conversion, so it is collected once.
    spec = {
        "has_count": has_count,
        "mean_cols": mean_cols,
        "var_cols": var_cols,
        "cov_cols": cov_cols,
    }

    # Ibis tables have a dedicated reader; everything else goes through the
    # narwhals reader.
    reader = (
        _read_aggr_ibis
        if isinstance(data, ibis.expr.types.Table)
        else _read_aggr_narwhals
    )
    aggr_data = reader(data=data, group_col=group_col, **spec)

    if group_col is not None:
        return {
            row[group_col]: _get_aggregates(row, **spec)
            for row in aggr_data
        }

    # Without grouping, only the first element of the result is used.
    return _get_aggregates(aggr_data[0], **spec)