
tea_tasting.metrics.resampling #

Metrics analyzed using resampling methods.

Bootstrap(columns, statistic, *, alternative=None, confidence_level=None, n_resamples=None, method='bca', batch=None, random_state=None) #

Bases: MetricBaseGranular[BootstrapResult]

Metric for analysis of a statistic using bootstrap resampling.

If columns is a sequence of strings, then the sample passed to the statistic callable contains an extra dimension in the first axis. See examples below.

Parameters:

- columns (str | Sequence[str], required): Names of the columns to be used in the analysis.
- statistic (Callable[..., NDArray[number[Any]]], required): Statistic. It must be a vectorized callable that accepts a NumPy array as the first argument and returns the resulting statistic. It must also accept a keyword argument axis and be vectorized to compute the statistic along the provided axis.
- alternative (Literal['two-sided', 'greater', 'less'] | None, default None): Alternative hypothesis:
  • "two-sided": the means are unequal,
  • "greater": the mean in the treatment variant is greater than the mean in the control variant,
  • "less": the mean in the treatment variant is less than the mean in the control variant.
- confidence_level (float | None, default None): Confidence level for the confidence interval.
- n_resamples (int | None, default None): The number of resamples performed to form the bootstrap distribution of the statistic.
- method (Literal['percentile', 'basic', 'bca'], default 'bca'): Whether to return the "percentile" bootstrap confidence interval ("percentile"), the "basic" (AKA "reverse") bootstrap confidence interval ("basic"), or the bias-corrected and accelerated bootstrap confidence interval ("bca").
- batch (int | None, default None): The number of resamples to process in each vectorized call to statistic. Memory usage is O(batch * n), where n is the sample size. Default is None, in which case batch = n_resamples (or batch = max(n_resamples, n) for method="bca").
- random_state (int | Generator | SeedSequence | None, default None): Pseudorandom number generator state used to generate resamples.
Parameter defaults

Defaults for parameters alternative, confidence_level, and n_resamples can be changed using the config_context and set_context functions. See the Global configuration reference for details.

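For example, a minimal sketch of overriding the defaults with config_context, assuming (as described in the Global configuration reference) that it is available at the package top level and accepts option values as keyword arguments:

>>> import numpy as np
>>> import tea_tasting as tt

>>> # Defaults are read when the metric is constructed (see __init__ below),
>>> # so a metric created inside the context picks up the overridden values.
>>> with tt.config_context(n_resamples=5_000, confidence_level=0.9):
...     orders_per_user = tt.Bootstrap("orders", np.mean, random_state=42)
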
References

- Bootstrapping (statistics) - Wikipedia: https://en.wikipedia.org/wiki/Bootstrapping_(statistics)
- scipy.stats.bootstrap - SciPy Manual: https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.bootstrap.html

Examples:

>>> import numpy as np
>>> import tea_tasting as tt

>>> experiment = tt.Experiment(
...     orders_per_user=tt.Bootstrap("orders", np.mean, random_state=42),
... )
>>> data = tt.make_users_data(seed=42)
>>> result = experiment.analyze(data)
>>> print(result)
         metric control treatment rel_effect_size rel_effect_size_ci pvalue
orders_per_user   0.530     0.573            8.0%       [-1.8%, 19%]      -

With multiple columns:

>>> def ratio_of_means(sample, axis):
...     means = np.mean(sample, axis=axis)
...     return means[0] / means[1]

>>> experiment = tt.Experiment(
...     orders_per_session=tt.Bootstrap(
...         ("orders", "sessions"),
...         ratio_of_means,
...         random_state=42,
...     ),
... )
>>> data = tt.make_users_data(seed=42)
>>> result = experiment.analyze(data)
>>> print(result)
            metric control treatment rel_effect_size rel_effect_size_ci pvalue
orders_per_session   0.266     0.289            8.8%      [-0.61%, 20%]      -
Source code in src/tea_tasting/metrics/resampling.py
def __init__(
    self,
    columns: str | Sequence[str],
    statistic: Callable[..., npt.NDArray[np.number[Any]]],
    *,
    alternative: Literal["two-sided", "greater", "less"] | None = None,
    confidence_level: float | None = None,
    n_resamples: int | None = None,
    method: Literal["percentile", "basic", "bca"] = "bca",
    batch: int | None = None,
    random_state: int | np.random.Generator | np.random.SeedSequence | None = None,
) -> None:
    """Metric for analysis of a statistic using bootstrap resampling.

    If `columns` is a sequence of strings, then the sample passed
    to the statistic callable contains an extra dimension in the first axis.
    See examples below.

    Args:
        columns: Names of the columns to be used in the analysis.
        statistic: Statistic. It must be a vectorized callable
            that accepts a NumPy array as the first argument and returns
            the resulting statistic.
            It must also accept a keyword argument `axis` and be vectorized
            to compute the statistic along the provided axis.
        alternative: Alternative hypothesis:

            - `"two-sided"`: the means are unequal,
            - `"greater"`: the mean in the treatment variant is greater than the mean
                in the control variant,
            - `"less"`: the mean in the treatment variant is less than the mean
                in the control variant.

        confidence_level: Confidence level for the confidence interval.
        n_resamples: The number of resamples performed to form
            the bootstrap distribution of the statistic.
        method: Whether to return the "percentile" bootstrap confidence
            interval (`"percentile"`), the "basic" (AKA "reverse") bootstrap
            confidence interval (`"basic"`), or the bias-corrected
            and accelerated bootstrap confidence interval (`"bca"`).
        batch: The number of resamples to process in each vectorized call
            to statistic. Memory usage is O(`batch * n`), where `n` is
            the sample size. Default is `None`, in which case `batch = n_resamples`
            (or `batch = max(n_resamples, n)` for method="bca").
        random_state: Pseudorandom number generator state used
            to generate resamples.

    Parameter defaults:
        Defaults for parameters `alternative`, `confidence_level`,
        and `n_resamples` can be changed using the
        `config_context` and `set_context` functions.
        See the [Global configuration](https://tea-tasting.e10v.me/api/config/)
        reference for details.

    References:
        - [Bootstrapping (statistics) — Wikipedia](https://en.wikipedia.org/wiki/Bootstrapping_(statistics)).
        - [scipy.stats.bootstrap — SciPy Manual](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.bootstrap.html#scipy-stats-bootstrap).

    Examples:
        ```pycon
        >>> import numpy as np
        >>> import tea_tasting as tt

        >>> experiment = tt.Experiment(
        ...     orders_per_user=tt.Bootstrap("orders", np.mean, random_state=42),
        ... )
        >>> data = tt.make_users_data(seed=42)
        >>> result = experiment.analyze(data)
        >>> print(result)
                 metric control treatment rel_effect_size rel_effect_size_ci pvalue
        orders_per_user   0.530     0.573            8.0%       [-1.8%, 19%]      -

        ```

        With multiple columns:

        ```pycon
        >>> def ratio_of_means(sample, axis):
        ...     means = np.mean(sample, axis=axis)
        ...     return means[0] / means[1]

        >>> experiment = tt.Experiment(
        ...     orders_per_session=tt.Bootstrap(
        ...         ("orders", "sessions"),
        ...         ratio_of_means,
        ...         random_state=42,
        ...     ),
        ... )
        >>> data = tt.make_users_data(seed=42)
        >>> result = experiment.analyze(data)
        >>> print(result)
                    metric control treatment rel_effect_size rel_effect_size_ci pvalue
        orders_per_session   0.266     0.289            8.8%      [-0.61%, 20%]      -

        ```
    """  # noqa: E501
    tea_tasting.utils.check_scalar(columns, "columns", typ=str | Sequence)
    if not isinstance(columns, str):
        for col in columns:
            tea_tasting.utils.check_scalar(col, "column", typ=str)
    self.columns = columns

    self.statistic = tea_tasting.utils.check_scalar(
        statistic, "statistic", typ=Callable)

    self.alternative = (
        tea_tasting.utils.auto_check(alternative, "alternative")
        if alternative is not None
        else tea_tasting.config.get_config("alternative")
    )

    self.confidence_level = (
        tea_tasting.utils.auto_check(confidence_level, "confidence_level")
        if confidence_level is not None
        else tea_tasting.config.get_config("confidence_level")
    )

    self.n_resamples = (
        tea_tasting.utils.auto_check(n_resamples, "n_resamples")
        if n_resamples is not None
        else tea_tasting.config.get_config("n_resamples")
    )

    self.method = tea_tasting.utils.check_scalar(
        method, "method", typ=str, in_={"percentile", "basic", "bca"})

    self.batch = tea_tasting.utils.check_scalar(batch, "batch", typ=int | None)

    self.random_state = tea_tasting.utils.check_scalar(
        random_state,
        "random_state",
        typ=int | np.random.Generator | np.random.SeedSequence | None,
    )

cols: Sequence[str] property #

Columns to be fetched for a metric analysis.

analyze(data, control, treatment, variant=None) #

Analyze a metric in an experiment.

Parameters:

- data (IntoFrame | Table | dict[Any, Table], required): Experimental data.
- control (Any, required): Control variant.
- treatment (Any, required): Treatment variant.
- variant (str | None, default None): Variant column name.

Returns:

- R: Analysis result.

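A minimal usage sketch for calling the method directly, assuming (as in make_users_data) that the variant column is named "variant" with values 0 for control and 1 for treatment:

>>> import numpy as np
>>> import tea_tasting as tt

>>> metric = tt.Bootstrap("orders", np.mean, random_state=42)
>>> data = tt.make_users_data(seed=42)
>>> result = metric.analyze(data, control=0, treatment=1, variant="variant")
>>> # result is a BootstrapResult named tuple, described below.
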
Source code in src/tea_tasting/metrics/base.py
def analyze(
    self,
    data: (
        narwhals.typing.IntoFrame |
        ibis.expr.types.Table |
        dict[Any, pa.Table]
    ),
    control: Any,
    treatment: Any,
    variant: str | None = None,
) -> R:
    """Analyze a metric in an experiment.

    Args:
        data: Experimental data.
        control: Control variant.
        treatment: Treatment variant.
        variant: Variant column name.

    Returns:
        Analysis result.
    """
    dfs = read_granular(
        data,
        cols=self.cols,
        variant=variant,
    )
    return self.analyze_granular(
        control=dfs[control],
        treatment=dfs[treatment],
    )

analyze_granular(control, treatment) #

Analyze a metric in an experiment using granular data.

Parameters:

- control (Table, required): Control data.
- treatment (Table, required): Treatment data.

Returns:

- BootstrapResult: Analysis result.

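A minimal sketch with pre-split granular data as PyArrow tables; the column values here are illustrative only, not drawn from the packaged example dataset:

>>> import numpy as np
>>> import pyarrow as pa
>>> import tea_tasting as tt

>>> metric = tt.Bootstrap("orders", np.mean, random_state=42)
>>> control = pa.table({"orders": [0, 1, 2, 1, 0, 3, 1, 2]})
>>> treatment = pa.table({"orders": [1, 2, 2, 0, 1, 3, 2, 2]})
>>> result = metric.analyze_granular(control, treatment)
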
Source code in src/tea_tasting/metrics/resampling.py
def analyze_granular(
    self,
    control: pa.Table,
    treatment: pa.Table,
) -> BootstrapResult:
    """Analyze metric in an experiment using granular data.

    Args:
        control: Control data.
        treatment: Treatment data.

    Returns:
        Analysis result.
    """
    def statistic(
        contr: npt.NDArray[np.number[Any]],
        treat: npt.NDArray[np.number[Any]],
        axis: int = -1,
    ) -> npt.NDArray[np.number[Any]]:
        contr_stat = self.statistic(contr, axis=axis)
        treat_stat = self.statistic(treat, axis=axis)

        effect_size = treat_stat - contr_stat
        with np.errstate(divide="ignore", invalid="ignore"):
            rel_effect_size = np.divide(treat_stat, contr_stat) - 1

        return np.stack((effect_size, rel_effect_size), axis=0)

    contr = _select_as_numpy(control, self.columns)
    treat = _select_as_numpy(treatment, self.columns)
    stat = statistic(contr, treat, axis=0)

    result = scipy.stats.bootstrap(
        (contr, treat),
        statistic,
        n_resamples=self.n_resamples,
        batch=self.batch,
        axis=0,
        confidence_level=self.confidence_level,
        alternative=self.alternative,
        method=self.method,
        random_state=self.random_state,  # type: ignore
    )
    ci = result.confidence_interval

    return BootstrapResult(
        control=self.statistic(contr, axis=0),  # type: ignore
        treatment=self.statistic(treat, axis=0),  # type: ignore
        effect_size=stat[0],
        effect_size_ci_lower=ci.low[0],
        effect_size_ci_upper=ci.high[0],
        rel_effect_size=stat[1],
        rel_effect_size_ci_lower=ci.low[1],
        rel_effect_size_ci_upper=ci.high[1],
    )

BootstrapResult #

Bases: NamedTuple

Result of the analysis using bootstrap resampling.

Attributes:

- control (float): Control statistic value.
- treatment (float): Treatment statistic value.
- effect_size (float): Absolute effect size. Difference between the two statistic values.
- effect_size_ci_lower (float): Lower bound of the absolute effect size confidence interval.
- effect_size_ci_upper (float): Upper bound of the absolute effect size confidence interval.
- rel_effect_size (float): Relative effect size. Difference between the two statistic values, divided by the control statistic value.
- rel_effect_size_ci_lower (float): Lower bound of the relative effect size confidence interval.
- rel_effect_size_ci_upper (float): Upper bound of the relative effect size confidence interval.
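
Fields can be read as regular named-tuple attributes. A minimal sketch, assuming the same example data and variant column naming as above:

>>> import numpy as np
>>> import tea_tasting as tt

>>> metric = tt.Bootstrap("orders", np.mean, random_state=42)
>>> data = tt.make_users_data(seed=42)
>>> result = metric.analyze(data, 0, 1, variant="variant")
>>> lift = result.rel_effect_size
>>> rel_ci = (result.rel_effect_size_ci_lower, result.rel_effect_size_ci_upper)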

Quantile(column, q=0.5, *, alternative=None, confidence_level=None, n_resamples=None, method='basic', batch=None, random_state=None) #

Bases: Bootstrap

Metric for the analysis of quantiles using bootstrap resampling.

Parameters:

- column (str, required): Name of the column for the quantiles to compute.
- q (float, default 0.5): Probability for the quantiles to compute.
- alternative (Literal['two-sided', 'greater', 'less'] | None, default None): Alternative hypothesis:
  • "two-sided": the means are unequal,
  • "greater": the mean in the treatment variant is greater than the mean in the control variant,
  • "less": the mean in the treatment variant is less than the mean in the control variant.
- confidence_level (float | None, default None): Confidence level for the confidence interval.
- n_resamples (int | None, default None): The number of resamples performed to form the bootstrap distribution of the statistic.
- method (Literal['percentile', 'basic', 'bca'], default 'basic'): Whether to return the "percentile" bootstrap confidence interval ("percentile"), the "basic" (AKA "reverse") bootstrap confidence interval ("basic"), or the bias-corrected and accelerated bootstrap confidence interval ("bca"). The default method is "basic", which differs from the default "bca" in Bootstrap. The "bca" confidence intervals cannot be calculated when the bootstrap distribution is degenerate (e.g. all elements are identical), which is often the case for quantile metrics.
- batch (int | None, default None): The number of resamples to process in each vectorized call to statistic. Memory usage is O(batch * n), where n is the sample size. Default is None, in which case batch = n_resamples (or batch = max(n_resamples, n) for method="bca").
- random_state (int | Generator | SeedSequence | None, default None): Pseudorandom number generator state used to generate resamples.
Parameter defaults

Defaults for parameters alternative, confidence_level, and n_resamples can be changed using the config_context and set_context functions. See the Global configuration reference for details.

Examples:

>>> import tea_tasting as tt

>>> experiment = tt.Experiment(
...     revenue_per_user_p80=tt.Quantile("revenue", 0.8, random_state=42),
... )
>>> data = tt.make_users_data(seed=42)
>>> result = experiment.analyze(data)
>>> print(result)
              metric control treatment rel_effect_size rel_effect_size_ci pvalue
revenue_per_user_p80    10.6      11.6            9.1%       [-1.2%, 21%]      -
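
Under the hood (see the source below), Quantile passes functools.partial(np.nanquantile, q=q) to Bootstrap as the statistic. A rough sketch of an equivalent construction for the metric above:

>>> import functools
>>> import numpy as np
>>> import tea_tasting as tt

>>> # Roughly equivalent to tt.Quantile("revenue", 0.8, random_state=42);
>>> # note that Quantile also switches the default method from "bca" to "basic".
>>> revenue_per_user_p80 = tt.Bootstrap(
...     "revenue",
...     functools.partial(np.nanquantile, q=0.8),
...     method="basic",
...     random_state=42,
... )
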
Source code in src/tea_tasting/metrics/resampling.py
def __init__(
    self,
    column: str,
    q: float = 0.5,
    *,
    alternative: Literal["two-sided", "greater", "less"] | None = None,
    confidence_level: float | None = None,
    n_resamples: int | None = None,
    method: Literal["percentile", "basic", "bca"] = "basic",
    batch: int | None = None,
    random_state: int | np.random.Generator | np.random.SeedSequence | None = None,
) -> None:
    """Metric for the analysis of quantiles using bootstrap resampling.

    Args:
        column: Name of the column for the quantiles to compute.
        q: Probability for the quantiles to compute.
        alternative: Alternative hypothesis:

            - `"two-sided"`: the means are unequal,
            - `"greater"`: the mean in the treatment variant is greater than the mean
                in the control variant,
            - `"less"`: the mean in the treatment variant is less than the mean
                in the control variant.

        confidence_level: Confidence level for the confidence interval.
        n_resamples: The number of resamples performed to form
            the bootstrap distribution of the statistic.
        method: Whether to return the "percentile" bootstrap confidence
            interval (`"percentile"`), the "basic" (AKA "reverse") bootstrap
            confidence interval (`"basic"`), or the bias-corrected
            and accelerated bootstrap confidence interval (`"bca"`).

            Default method is "basic" which is different from default
            method "bca" in `Bootstrap`. The "bca" confidence intervals cannot
            be calculated when the bootstrap distribution is degenerate
            (e.g. all elements are identical). This is often the case for the
            quantile metrics.

        batch: The number of resamples to process in each vectorized call
            to statistic. Memory usage is O(`batch * n`), where `n` is
            the sample size. Default is `None`, in which case `batch = n_resamples`
            (or `batch = max(n_resamples, n)` for method="bca").
        random_state: Pseudorandom number generator state used
            to generate resamples.

    Parameter defaults:
        Defaults for parameters `alternative`, `confidence_level`,
        and `n_resamples` can be changed using the
        `config_context` and `set_context` functions.
        See the [Global configuration](https://tea-tasting.e10v.me/api/config/)
        reference for details.

    Examples:
        ```pycon
        >>> import tea_tasting as tt

        >>> experiment = tt.Experiment(
        ...     revenue_per_user_p80=tt.Quantile("revenue", 0.8, random_state=42),
        ... )
        >>> data = tt.make_users_data(seed=42)
        >>> result = experiment.analyze(data)
        >>> print(result)
                      metric control treatment rel_effect_size rel_effect_size_ci pvalue
        revenue_per_user_p80    10.6      11.6            9.1%       [-1.2%, 21%]      -

        ```
    """  # noqa: E501
    self.column = tea_tasting.utils.check_scalar(column, "column", typ=str)
    self.q = tea_tasting.utils.check_scalar(q, "q", typ=float, ge=0, le=1)
    super().__init__(
        columns=column,
        statistic=functools.partial(np.nanquantile, q=q),
        alternative=alternative,
        confidence_level=confidence_level,
        n_resamples=n_resamples,
        method=method,
        batch=batch,
        random_state=random_state,
    )

cols: Sequence[str] property #

Columns to be fetched for a metric analysis.

analyze(data, control, treatment, variant=None) #

Analyze a metric in an experiment.

Parameters:

- data (IntoFrame | Table | dict[Any, Table], required): Experimental data.
- control (Any, required): Control variant.
- treatment (Any, required): Treatment variant.
- variant (str | None, default None): Variant column name.

Returns:

- R: Analysis result.

Source code in src/tea_tasting/metrics/base.py
def analyze(
    self,
    data: (
        narwhals.typing.IntoFrame |
        ibis.expr.types.Table |
        dict[Any, pa.Table]
    ),
    control: Any,
    treatment: Any,
    variant: str | None = None,
) -> R:
    """Analyze a metric in an experiment.

    Args:
        data: Experimental data.
        control: Control variant.
        treatment: Treatment variant.
        variant: Variant column name.

    Returns:
        Analysis result.
    """
    dfs = read_granular(
        data,
        cols=self.cols,
        variant=variant,
    )
    return self.analyze_granular(
        control=dfs[control],
        treatment=dfs[treatment],
    )

analyze_granular(control, treatment) #

Analyze a metric in an experiment using granular data.

Parameters:

- control (Table, required): Control data.
- treatment (Table, required): Treatment data.

Returns:

- BootstrapResult: Analysis result.

Source code in src/tea_tasting/metrics/resampling.py
def analyze_granular(
    self,
    control: pa.Table,
    treatment: pa.Table,
) -> BootstrapResult:
    """Analyze metric in an experiment using granular data.

    Args:
        control: Control data.
        treatment: Treatment data.

    Returns:
        Analysis result.
    """
    def statistic(
        contr: npt.NDArray[np.number[Any]],
        treat: npt.NDArray[np.number[Any]],
        axis: int = -1,
    ) -> npt.NDArray[np.number[Any]]:
        contr_stat = self.statistic(contr, axis=axis)
        treat_stat = self.statistic(treat, axis=axis)

        effect_size = treat_stat - contr_stat
        with np.errstate(divide="ignore", invalid="ignore"):
            rel_effect_size = np.divide(treat_stat, contr_stat) - 1

        return np.stack((effect_size, rel_effect_size), axis=0)

    contr = _select_as_numpy(control, self.columns)
    treat = _select_as_numpy(treatment, self.columns)
    stat = statistic(contr, treat, axis=0)

    result = scipy.stats.bootstrap(
        (contr, treat),
        statistic,
        n_resamples=self.n_resamples,
        batch=self.batch,
        axis=0,
        confidence_level=self.confidence_level,
        alternative=self.alternative,
        method=self.method,
        random_state=self.random_state,  # type: ignore
    )
    ci = result.confidence_interval

    return BootstrapResult(
        control=self.statistic(contr, axis=0),  # type: ignore
        treatment=self.statistic(treat, axis=0),  # type: ignore
        effect_size=stat[0],
        effect_size_ci_lower=ci.low[0],
        effect_size_ci_upper=ci.high[0],
        rel_effect_size=stat[1],
        rel_effect_size_ci_lower=ci.low[1],
        rel_effect_size_ci_upper=ci.high[1],
    )