`tea_tasting.multiplicity` #

Multiple hypothesis testing.

`MultipleComparisonsResults` #

Bases: DictsReprMixin, UserDict[object, ExperimentResult]

Multiple comparisons result.

`to_arrow()` #

Convert the object to a PyArrow Table.

Source code in src/tea_tasting/utils.py

@_cache_method
def to_arrow(self) -> pa.Table:
    """Convert the object to a PyArrow Table."""
    return pa.Table.from_pylist(self.to_dicts())

`to_dicts()` #

Convert the result to a sequence of dictionaries.

Source code in src/tea_tasting/multiplicity.py

@tea_tasting.utils._cache_method
def to_dicts(self) -> tuple[dict[str, object], ...]:
    """Convert the result to a sequence of dictionaries."""
    return tuple(
        {"comparison": str(comparison)} | metric_result
        for comparison, experiment_result in self.items()
        for metric_result in experiment_result.to_dicts()
    )

`to_html(keys=None, formatter=get_and_format_num, *, max_rows=None, indent=None)` #

Convert the object to HTML.

Default formatting rules:

If a name starts with "rel_" or equals to "power" consider it a percentage value. Round percentage values to 2 significant digits, multiply by 100 and add "%".
Round other values to 3 significant values.
If value is less than 0.001 or is greater than or equal to 10_000_000, format it in exponential presentation.
If a name ends with "_ci", consider it a confidence interval. Look up for attributes "{name}_lower" and "{name}_upper", and format the interval as "[{lower_bound}, {upper_bound}]".

Parameters:

Name	Type	Description	Default
`keys`	`Sequence[str] \| None`	Keys to convert. If a key is not defined in the dictionary it's assumed to be `None`.	`None`
`formatter`	`Callable[[dict[str, object], str], str]`	Custom formatter function. It should accept a dictionary of metric result attributes and an attribute name, and return a formatted attribute value.	`get_and_format_num`
`max_rows`	`int \| None`	Maximum number of rows to convert. If `None`, the default value will be used. If `0` or less, all rows will be converted.	`None`
`indent`	`str \| None`	Whitespace to insert for each indentation level. If `None`, do not indent.	`None`

Returns:

Type	Description
`str`	A table with results rendered as HTML.

Source code in src/tea_tasting/utils.py

def to_html(
    self,
    keys: Sequence[str] | None = None,
    formatter: Callable[[dict[str, object], str], str] = get_and_format_num,
    *,
    max_rows: int | None = None,
    indent: str | None = None,
) -> str:
    """Convert the object to HTML.

    Default formatting rules:

    - If a name starts with `"rel_"` or equals to `"power"` consider it
        a percentage value. Round percentage values to 2 significant digits,
        multiply by `100` and add `"%"`.
    - Round other values to 3 significant values.
    - If value is less than `0.001` or is greater than or equal to `10_000_000`,
        format it in exponential presentation.
    - If a name ends with `"_ci"`, consider it a confidence interval.
        Look up for attributes `"{name}_lower"` and `"{name}_upper"`,
        and format the interval as `"[{lower_bound}, {upper_bound}]"`.

    Args:
        keys: Keys to convert. If a key is not defined in the dictionary
            it's assumed to be `None`.
        formatter: Custom formatter function. It should accept a dictionary
            of metric result attributes and an attribute name, and return
            a formatted attribute value.
        max_rows: Maximum number of rows to convert.
            If `None`, the default value will be used.
            If `0` or less, all rows will be converted.
        indent: Whitespace to insert for each indentation level. If `None`,
            do not indent.

    Returns:
        A table with results rendered as HTML.
    """
    if keys is None:
        keys = self.default_keys
    if max_rows is None:
        max_rows = self.default_max_rows

    table = ET.Element(
        "table",
        {"class": "dataframe", "style": "text-align: right;"},
    )
    thead = ET.SubElement(table, "thead")
    thead_tr = ET.SubElement(thead, "tr")
    for key in keys:
        th = ET.SubElement(thead_tr, "th")
        th.text = key
    tbody = ET.SubElement(table, "tbody")
    for pretty_dict in self.to_pretty_dicts(keys, formatter, max_rows=max_rows):
        tr = ET.SubElement(tbody, "tr")
        for key in keys:
            td = ET.SubElement(tr, "td")
            td.text = pretty_dict[key]
    if indent is not None:
        ET.indent(table, space=indent)
    return ET.tostring(table, encoding="unicode", method="html")

`to_pandas()` #

Convert the object to a Pandas DataFrame.

Source code in src/tea_tasting/utils.py

@_cache_method
def to_pandas(self) -> pd.DataFrame:
    """Convert the object to a Pandas DataFrame."""
    import pandas as pd
    return pd.DataFrame.from_records(self.to_dicts())

`to_polars()` #

Convert the object to a Polars DataFrame.

Source code in src/tea_tasting/utils.py

@_cache_method
def to_polars(self) -> pl.DataFrame:
    """Convert the object to a Polars DataFrame."""
    import polars as pl
    return pl.from_dicts(self.to_dicts())

`to_pretty_dicts(keys=None, formatter=get_and_format_num, *, max_rows=None)` #

Convert the object to a list of dictionaries with formatted values.

Default formatting rules:

If a name starts with "rel_" or equals to "power" consider it a percentage value. Round percentage values to 2 significant digits, multiply by 100 and add "%".
Round other values to 3 significant values.
If value is less than 0.001 or is greater than or equal to 10_000_000, format it in exponential presentation.
If a name ends with "_ci", consider it a confidence interval. Look up for attributes "{name}_lower" and "{name}_upper", and format the interval as "[{lower_bound}, {upper_bound}]".

Parameters:

Name	Type	Description	Default
`keys`	`Sequence[str] \| None`	Keys to convert. If a key is not defined in the dictionary it's assumed to be `None`.	`None`
`formatter`	`Callable[[dict[str, object], str], str]`	Custom formatter function. It should accept a dictionary of metric result attributes and an attribute name, and return a formatted attribute value.	`get_and_format_num`
`max_rows`	`int \| None`	Maximum number of rows to convert. If `None`, the default value will be used. If `0` or less, all rows will be converted.	`None`

Returns:

Type	Description
`list[dict[str, str]]`	List of dictionaries with formatted values.

Source code in src/tea_tasting/utils.py

def to_pretty_dicts(
    self,
    keys: Sequence[str] | None = None,
    formatter: Callable[[dict[str, object], str], str] = get_and_format_num,
    *,
    max_rows: int | None = None,
) -> list[dict[str, str]]:
    """Convert the object to a list of dictionaries with formatted values.

    Default formatting rules:

    - If a name starts with `"rel_"` or equals to `"power"` consider it
        a percentage value. Round percentage values to 2 significant digits,
        multiply by `100` and add `"%"`.
    - Round other values to 3 significant values.
    - If value is less than `0.001` or is greater than or equal to `10_000_000`,
        format it in exponential presentation.
    - If a name ends with `"_ci"`, consider it a confidence interval.
        Look up for attributes `"{name}_lower"` and `"{name}_upper"`,
        and format the interval as `"[{lower_bound}, {upper_bound}]"`.

    Args:
        keys: Keys to convert. If a key is not defined in the dictionary
            it's assumed to be `None`.
        formatter: Custom formatter function. It should accept a dictionary
            of metric result attributes and an attribute name, and return
            a formatted attribute value.
        max_rows: Maximum number of rows to convert.
            If `None`, the default value will be used.
            If `0` or less, all rows will be converted.

    Returns:
        List of dictionaries with formatted values.
    """
    if keys is None:
        keys = self.default_keys
    if max_rows is None:
        max_rows = self.default_max_rows

    dicts = self.to_dicts()
    if max_rows <= 0 or len(dicts) <= max_rows:
        return [{key: formatter(data, key) for key in keys} for data in dicts]

    bottom = max_rows // 2
    top = max_rows - bottom
    return (
        [{key: formatter(data, key) for key in keys} for data in dicts[:top]] +
        [dict.fromkeys(keys, "…")] +
        [{key: formatter(data, key) for key in keys} for data in dicts[-bottom:]]
    )

`to_string(keys=None, formatter=get_and_format_num, *, max_rows=None)` #

Convert the object to a string.

Default formatting rules:

If a name starts with "rel_" or equals to "power" consider it a percentage value. Round percentage values to 2 significant digits, multiply by 100 and add "%".
Round other values to 3 significant values.
If value is less than 0.001 or is greater than or equal to 10_000_000, format it in exponential presentation.
If a name ends with "_ci", consider it a confidence interval. Look up for attributes "{name}_lower" and "{name}_upper", and format the interval as "[{lower_bound}, {upper_bound}]".

Parameters:

Name	Type	Description	Default
`keys`	`Sequence[str] \| None`	Keys to convert. If a key is not defined in the dictionary it's assumed to be `None`.	`None`
`formatter`	`Callable[[dict[str, object], str], str]`	Custom formatter function. It should accept a dictionary of metric result attributes and an attribute name, and return a formatted attribute value.	`get_and_format_num`
`max_rows`	`int \| None`	Maximum number of rows to convert. If `None`, the default value will be used. If `0` or less, all rows will be converted.	`None`

Returns:

Type	Description
`str`	A table with results rendered as string.

Source code in src/tea_tasting/utils.py

def to_string(
    self,
    keys: Sequence[str] | None = None,
    formatter: Callable[[dict[str, object], str], str] = get_and_format_num,
    *,
    max_rows: int | None = None,
) -> str:
    """Convert the object to a string.

    Default formatting rules:

    - If a name starts with `"rel_"` or equals to `"power"` consider it
        a percentage value. Round percentage values to 2 significant digits,
        multiply by `100` and add `"%"`.
    - Round other values to 3 significant values.
    - If value is less than `0.001` or is greater than or equal to `10_000_000`,
        format it in exponential presentation.
    - If a name ends with `"_ci"`, consider it a confidence interval.
        Look up for attributes `"{name}_lower"` and `"{name}_upper"`,
        and format the interval as `"[{lower_bound}, {upper_bound}]"`.

    Args:
        keys: Keys to convert. If a key is not defined in the dictionary
            it's assumed to be `None`.
        formatter: Custom formatter function. It should accept a dictionary
            of metric result attributes and an attribute name, and return
            a formatted attribute value.
        max_rows: Maximum number of rows to convert.
            If `None`, the default value will be used.
            If `0` or less, all rows will be converted.

    Returns:
        A table with results rendered as string.
    """
    if keys is None:
        keys = self.default_keys
    if max_rows is None:
        max_rows = self.default_max_rows

    pretty_dicts = self.to_pretty_dicts(keys, formatter, max_rows=max_rows)
    widths = {key: len(key) for key in keys}
    for pretty_dict in pretty_dicts:
        for key in keys:
            widths[key] = max(widths[key], len(pretty_dict[key]))

    sep = " "
    rows = [sep.join(key.rjust(widths[key]) for key in keys)]
    rows.extend(
        sep.join(pretty_dict[key].rjust(widths[key]) for key in keys)
        for pretty_dict in pretty_dicts
    )
    return "\n".join(rows)

`with_defaults(*, keys=None, max_rows=None)` #

Copies the object and sets the new default parameters.

Parameters:

Name	Type	Description	Default
`keys`	`Sequence[str] \| None`	New default `keys` for the methods `to_pretty_dicts`, `to_string`, and `to_html`.	`None`
`max_rows`	`int \| None`	New default `max_rows` for the methods `to_pretty_dicts`, `to_string`, and `to_html`.	`None`

Returns:

Type	Description
`DictsReprMixinT`	A copy of the object with the new default keys.

Source code in src/tea_tasting/utils.py

def with_defaults(
    self: DictsReprMixinT,
    *,
    keys: Sequence[str] | None = None,
    max_rows: int | None = None,
) -> DictsReprMixinT:
    """Copies the object and sets the new default parameters.

    Args:
        keys: New default `keys` for the methods `to_pretty_dicts`, `to_string`,
            and `to_html`.
        max_rows: New default `max_rows` for the methods `to_pretty_dicts`,
            `to_string`, and `to_html`.

    Returns:
        A copy of the object with the new default keys.
    """
    new_instance = self.__class__.__new__(self.__class__)
    new_instance.__dict__.update(self.__dict__)
    new_instance._cache = None
    if keys is not None:
        new_instance.default_keys = keys
    if max_rows is not None:
        new_instance.default_max_rows = max_rows
    return new_instance

`with_keys(keys)` #

Copies the object and sets the new default keys.

Parameters:

Name	Type	Description	Default
`keys`	`Sequence[str]`	New default `keys` for the methods `to_pretty_dicts`, `to_string`, and `to_html`.	required

Returns:

Type	Description
`DictsReprMixinT`	A copy of the object with the new default `keys`.

Source code in src/tea_tasting/utils.py

def with_keys(self: DictsReprMixinT, keys: Sequence[str]) -> DictsReprMixinT:
    """Copies the object and sets the new default `keys`.

    Args:
        keys: New default `keys` for the methods `to_pretty_dicts`, `to_string`,
            and `to_html`.

    Returns:
        A copy of the object with the new default `keys`.
    """
    return self.with_defaults(keys=keys)

`with_max_rows(max_rows)` #

Copies the object and sets the new default max_rows.

Parameters:

Name	Type	Description	Default
`max_rows`	`int`	New default `max_rows` for the methods `to_pretty_dicts`, `to_string`, and `to_html`.	required

Returns:

Type	Description
`DictsReprMixinT`	A copy of the object with the new default `max_rows`.

Source code in src/tea_tasting/utils.py

def with_max_rows(self: DictsReprMixinT, max_rows: int) -> DictsReprMixinT:
    """Copies the object and sets the new default `max_rows`.

    Args:
        max_rows: New default `max_rows` for the methods `to_pretty_dicts`,
            `to_string`, and `to_html`.

    Returns:
        A copy of the object with the new default `max_rows`.
    """
    return self.with_defaults(max_rows=max_rows)

`adjust_fdr(experiment_results, metrics=None, *, alpha=None, arbitrary_dependence=False)` #

Adjust p-value and alpha to control the false discovery rate (FDR).

The number of hypotheses tested is the total number of metrics included in the comparison in all experiment results. For example, if there are 3 experiments with 2 metrics in each, the number of hypotheses is 6.

The function performs one of the following corrections, depending on parameters:

Benjamini-Hochberg procedure, assuming non-negative correlation between hypotheses (arbitrary_dependence=False).
Benjamini-Yekutieli procedure, assuming arbitrary dependence between hypotheses (arbitrary_dependence=True).

The function adds the following attributes to the results:

pvalue_adj: The adjusted p-value, which should be compared with the unadjusted FDR (alpha).
alpha_adj: The adjusted FDR, which should be compared with the unadjusted p-value (pvalue).
null_rejected: A binary indicator (0 or 1) that shows whether the null hypothesis is rejected.

Parameters:

Name	Type	Description	Default
`experiment_results`	`ExperimentResult \| Mapping[object, ExperimentResult]`	Experiment results.	required
`metrics`	`str \| set[str] \| Sequence[str] \| None`	Metrics included in the comparison. If `None`, all metrics are included.	`None`
`alpha`	`float \| None`	Significance level. If `None`, the value from global settings is used.	`None`
`arbitrary_dependence`	`bool`	If `True`, arbitrary dependence between hypotheses is assumed and Benjamini-Yekutieli procedure is performed. If `False`, non-negative correlation between hypotheses is assumed and Benjamini-Hochberg procedure is performed.	`False`

Returns:

Type	Description
`MultipleComparisonsResults`	The experiments results with adjusted p-values and alphas.

Parameter defaults

Default for parameter alpha can be changed using the config_context and set_context functions. See the Global configuration reference for details.

References

Examples:

>>> import polars as pl
>>> import tea_tasting as tt

>>> data = pl.concat((
...     tt.make_users_data(
...         seed=42,
...         orders_uplift=0.10,
...         revenue_uplift=0.15,
...         return_type="polars",
...     ),
...     tt.make_users_data(
...         seed=21,
...         orders_uplift=0.15,
...         revenue_uplift=0.20,
...         return_type="polars",
...     )
...         .filter(pl.col("variant").eq(1))
...         .with_columns(variant=pl.lit(2, pl.Int64)),
... ))
>>> data
shape: (6_046, 5)
┌──────┬─────────┬──────────┬────────┬─────────┐
│ user ┆ variant ┆ sessions ┆ orders ┆ revenue │
│ ---  ┆ ---     ┆ ---      ┆ ---    ┆ ---     │
│ i64  ┆ i64     ┆ i64      ┆ i64    ┆ f64     │
╞══════╪═════════╪══════════╪════════╪═════════╡
│ 0    ┆ 1       ┆ 2        ┆ 1      ┆ 9.58    │
│ 1    ┆ 0       ┆ 2        ┆ 1      ┆ 6.43    │
│ 2    ┆ 1       ┆ 2        ┆ 1      ┆ 8.3     │
│ 3    ┆ 1       ┆ 2        ┆ 1      ┆ 16.65   │
│ 4    ┆ 0       ┆ 1        ┆ 1      ┆ 7.14    │
│ …    ┆ …       ┆ …        ┆ …      ┆ …       │
│ 3989 ┆ 2       ┆ 4        ┆ 4      ┆ 34.93   │
│ 3991 ┆ 2       ┆ 1        ┆ 0      ┆ 0.0     │
│ 3992 ┆ 2       ┆ 3        ┆ 3      ┆ 27.96   │
│ 3994 ┆ 2       ┆ 2        ┆ 1      ┆ 17.22   │
│ 3998 ┆ 2       ┆ 3        ┆ 0      ┆ 0.0     │
└──────┴─────────┴──────────┴────────┴─────────┘

>>> experiment = tt.Experiment(
...     sessions_per_user=tt.Mean("sessions"),
...     orders_per_session=tt.RatioOfMeans("orders", "sessions"),
...     orders_per_user=tt.Mean("orders"),
...     revenue_per_user=tt.Mean("revenue"),
... )

>>> # Results without correction.
>>> results = experiment.analyze(data, control=0, all_variants=True)
>>> results
variants             metric control treatment rel_effect_size rel_effect_size_ci  pvalue
  (0, 1)  sessions_per_user    2.00      1.98          -0.66%      [-3.7%, 2.5%]   0.674
  (0, 1) orders_per_session   0.266     0.289            8.8%      [-0.89%, 19%]  0.0762
  (0, 1)    orders_per_user   0.530     0.573            8.0%       [-2.0%, 19%]   0.118
  (0, 1)   revenue_per_user    5.24      5.99             14%        [2.1%, 28%]  0.0211
  (0, 2)  sessions_per_user    2.00      2.02           0.98%      [-2.1%, 4.1%]   0.532
  (0, 2) orders_per_session   0.266     0.295             11%        [1.2%, 22%]  0.0273
  (0, 2)    orders_per_user   0.530     0.594             12%        [1.7%, 23%]  0.0213
  (0, 2)   revenue_per_user    5.24      6.25             19%        [6.6%, 33%] 0.00218

>>> # Success metrics.
>>> metrics = {"orders_per_user", "revenue_per_user"}

>>> # Benjamini-Hochberg procedure,
>>> # assuming non-negative correlation between hypotheses.
>>> adjusted_results_fdr = tt.adjust_fdr(results, metrics)
>>> adjusted_results_fdr
comparison           metric control treatment rel_effect_size  pvalue pvalue_adj
    (0, 1)  orders_per_user   0.530     0.573            8.0%   0.118      0.118
    (0, 1) revenue_per_user    5.24      5.99             14%  0.0211     0.0284
    (0, 2)  orders_per_user   0.530     0.594             12%  0.0213     0.0284
    (0, 2) revenue_per_user    5.24      6.25             19% 0.00218    0.00872

>>> # The adjusted confidence level alpha.
>>> adjusted_results_fdr.with_keys((
...     "comparison",
...     "metric",
...     "control",
...     "treatment",
...     "rel_effect_size",
...     "pvalue",
...     "alpha_adj",
... ))
comparison           metric control treatment rel_effect_size  pvalue alpha_adj
    (0, 1)  orders_per_user   0.530     0.573            8.0%   0.118    0.0500
    (0, 1) revenue_per_user    5.24      5.99             14%  0.0211    0.0375
    (0, 2)  orders_per_user   0.530     0.594             12%  0.0213    0.0375
    (0, 2) revenue_per_user    5.24      6.25             19% 0.00218    0.0375

>>> # Benjamini-Yekutieli procedure,
>>> # assuming arbitrary dependence between hypotheses.
>>> tt.adjust_fdr(results, metrics, arbitrary_dependence=True)
comparison           metric control treatment rel_effect_size  pvalue pvalue_adj
    (0, 1)  orders_per_user   0.530     0.573            8.0%   0.118      0.245
    (0, 1) revenue_per_user    5.24      5.99             14%  0.0211     0.0592
    (0, 2)  orders_per_user   0.530     0.594             12%  0.0213     0.0592
    (0, 2) revenue_per_user    5.24      6.25             19% 0.00218     0.0182

Source code in src/tea_tasting/multiplicity.py

def adjust_fdr(
    experiment_results: tea_tasting.experiment.ExperimentResult | Mapping[
        object, tea_tasting.experiment.ExperimentResult],
    metrics: str | set[str] | Sequence[str] | None = None,
    *,
    alpha: float | None = None,
    arbitrary_dependence: bool = False,
) -> MultipleComparisonsResults:
    """Adjust p-value and alpha to control the false discovery rate (FDR).

    The number of hypotheses tested is the total number of metrics included in
    the comparison in all experiment results. For example, if there are
    3 experiments with 2 metrics in each, the number of hypotheses is 6.

    The function performs one of the following corrections, depending on parameters:

    - Benjamini-Hochberg procedure, assuming non-negative correlation between
        hypotheses (`arbitrary_dependence=False`).
    - Benjamini-Yekutieli procedure, assuming arbitrary dependence between
        hypotheses (`arbitrary_dependence=True`).

    The function adds the following attributes to the results:

    - `pvalue_adj`: The adjusted p-value, which should be compared with
        the unadjusted FDR (`alpha`).
    - `alpha_adj`: The adjusted FDR, which should be compared with the unadjusted
        p-value (`pvalue`).
    - `null_rejected`: A binary indicator (`0` or `1`) that shows whether
        the null hypothesis is rejected.

    Args:
        experiment_results: Experiment results.
        metrics: Metrics included in the comparison.
            If `None`, all metrics are included.
        alpha: Significance level. If `None`, the value from global settings is used.
        arbitrary_dependence: If `True`, arbitrary dependence between hypotheses
            is assumed and Benjamini-Yekutieli procedure is performed.
            If `False`, non-negative correlation between hypotheses is assumed
            and Benjamini-Hochberg procedure is performed.

    Returns:
        The experiments results with adjusted p-values and alphas.

    Parameter defaults:
        Default for parameter `alpha` can be changed using the `config_context`
        and `set_context` functions.
        See the [Global configuration](https://tea-tasting.e10v.me/api/config/)
        reference for details.

    References:
        - [Multiple comparisons problem](https://en.wikipedia.org/wiki/Multiple_comparisons_problem).
        - [False discovery rate](https://en.wikipedia.org/wiki/False_discovery_rate).

    Examples:
        ```pycon
        >>> import polars as pl
        >>> import tea_tasting as tt

        >>> data = pl.concat((
        ...     tt.make_users_data(
        ...         seed=42,
        ...         orders_uplift=0.10,
        ...         revenue_uplift=0.15,
        ...         return_type="polars",
        ...     ),
        ...     tt.make_users_data(
        ...         seed=21,
        ...         orders_uplift=0.15,
        ...         revenue_uplift=0.20,
        ...         return_type="polars",
        ...     )
        ...         .filter(pl.col("variant").eq(1))
        ...         .with_columns(variant=pl.lit(2, pl.Int64)),
        ... ))
        >>> data
        shape: (6_046, 5)
        ┌──────┬─────────┬──────────┬────────┬─────────┐
        │ user ┆ variant ┆ sessions ┆ orders ┆ revenue │
        │ ---  ┆ ---     ┆ ---      ┆ ---    ┆ ---     │
        │ i64  ┆ i64     ┆ i64      ┆ i64    ┆ f64     │
        ╞══════╪═════════╪══════════╪════════╪═════════╡
        │ 0    ┆ 1       ┆ 2        ┆ 1      ┆ 9.58    │
        │ 1    ┆ 0       ┆ 2        ┆ 1      ┆ 6.43    │
        │ 2    ┆ 1       ┆ 2        ┆ 1      ┆ 8.3     │
        │ 3    ┆ 1       ┆ 2        ┆ 1      ┆ 16.65   │
        │ 4    ┆ 0       ┆ 1        ┆ 1      ┆ 7.14    │
        │ …    ┆ …       ┆ …        ┆ …      ┆ …       │
        │ 3989 ┆ 2       ┆ 4        ┆ 4      ┆ 34.93   │
        │ 3991 ┆ 2       ┆ 1        ┆ 0      ┆ 0.0     │
        │ 3992 ┆ 2       ┆ 3        ┆ 3      ┆ 27.96   │
        │ 3994 ┆ 2       ┆ 2        ┆ 1      ┆ 17.22   │
        │ 3998 ┆ 2       ┆ 3        ┆ 0      ┆ 0.0     │
        └──────┴─────────┴──────────┴────────┴─────────┘

        >>> experiment = tt.Experiment(
        ...     sessions_per_user=tt.Mean("sessions"),
        ...     orders_per_session=tt.RatioOfMeans("orders", "sessions"),
        ...     orders_per_user=tt.Mean("orders"),
        ...     revenue_per_user=tt.Mean("revenue"),
        ... )

        >>> # Results without correction.
        >>> results = experiment.analyze(data, control=0, all_variants=True)
        >>> results
        variants             metric control treatment rel_effect_size rel_effect_size_ci  pvalue
          (0, 1)  sessions_per_user    2.00      1.98          -0.66%      [-3.7%, 2.5%]   0.674
          (0, 1) orders_per_session   0.266     0.289            8.8%      [-0.89%, 19%]  0.0762
          (0, 1)    orders_per_user   0.530     0.573            8.0%       [-2.0%, 19%]   0.118
          (0, 1)   revenue_per_user    5.24      5.99             14%        [2.1%, 28%]  0.0211
          (0, 2)  sessions_per_user    2.00      2.02           0.98%      [-2.1%, 4.1%]   0.532
          (0, 2) orders_per_session   0.266     0.295             11%        [1.2%, 22%]  0.0273
          (0, 2)    orders_per_user   0.530     0.594             12%        [1.7%, 23%]  0.0213
          (0, 2)   revenue_per_user    5.24      6.25             19%        [6.6%, 33%] 0.00218

        >>> # Success metrics.
        >>> metrics = {"orders_per_user", "revenue_per_user"}

        >>> # Benjamini-Hochberg procedure,
        >>> # assuming non-negative correlation between hypotheses.
        >>> adjusted_results_fdr = tt.adjust_fdr(results, metrics)
        >>> adjusted_results_fdr
        comparison           metric control treatment rel_effect_size  pvalue pvalue_adj
            (0, 1)  orders_per_user   0.530     0.573            8.0%   0.118      0.118
            (0, 1) revenue_per_user    5.24      5.99             14%  0.0211     0.0284
            (0, 2)  orders_per_user   0.530     0.594             12%  0.0213     0.0284
            (0, 2) revenue_per_user    5.24      6.25             19% 0.00218    0.00872

        >>> # The adjusted confidence level alpha.
        >>> adjusted_results_fdr.with_keys((
        ...     "comparison",
        ...     "metric",
        ...     "control",
        ...     "treatment",
        ...     "rel_effect_size",
        ...     "pvalue",
        ...     "alpha_adj",
        ... ))
        comparison           metric control treatment rel_effect_size  pvalue alpha_adj
            (0, 1)  orders_per_user   0.530     0.573            8.0%   0.118    0.0500
            (0, 1) revenue_per_user    5.24      5.99             14%  0.0211    0.0375
            (0, 2)  orders_per_user   0.530     0.594             12%  0.0213    0.0375
            (0, 2) revenue_per_user    5.24      6.25             19% 0.00218    0.0375

        >>> # Benjamini-Yekutieli procedure,
        >>> # assuming arbitrary dependence between hypotheses.
        >>> tt.adjust_fdr(results, metrics, arbitrary_dependence=True)
        comparison           metric control treatment rel_effect_size  pvalue pvalue_adj
            (0, 1)  orders_per_user   0.530     0.573            8.0%   0.118      0.245
            (0, 1) revenue_per_user    5.24      5.99             14%  0.0211     0.0592
            (0, 2)  orders_per_user   0.530     0.594             12%  0.0213     0.0592
            (0, 2) revenue_per_user    5.24      6.25             19% 0.00218     0.0182

        ```
    """  # noqa: E501
    alpha = (
        tea_tasting.utils.auto_check(alpha, "alpha")
        if alpha is not None
        else tea_tasting.config.get_config("alpha")
    )
    arbitrary_dependence = tea_tasting.utils.check_scalar(
        arbitrary_dependence, "arbitrary_dependence", typ=bool)

    # results and metric_results refer to the same dicts.
    results, metric_results = _copy_results(experiment_results, metrics)
    method = _Benjamini(
        alpha=alpha,  # type: ignore
        m=len(metric_results),
        arbitrary_dependence=arbitrary_dependence,
    )
    # In-place update.
    _hochberg_stepup(metric_results, method.adjust)

    return MultipleComparisonsResults(results)

`adjust_fwer(experiment_results, metrics=None, *, alpha=None, arbitrary_dependence=False, method='sidak')` #

Adjust p-value and alpha to control the family-wise error rate (FWER).

The number of hypotheses tested is the total number of metrics included in the comparison in all experiment results. For example, if there are 3 experiments with 2 metrics in each, the number of hypotheses is 6.

The function performs one of the following procedures, depending on parameters:

Hochberg's step-up procedure, assuming non-negative correlation between hypotheses (arbitrary_dependence=False).
Holm's step-down procedure, assuming arbitrary dependence between hypotheses (arbitrary_dependence=True).

The function adds the following attributes to the results:

pvalue_adj: The adjusted p-value, which should be compared with the unadjusted FDR (alpha).
alpha_adj: The adjusted FWER, which should be compared with the unadjusted p-value (pvalue).
null_rejected: A binary indicator (0 or 1) that shows whether the null hypothesis is rejected.

Parameters:

Name	Type	Description	Default
`experiment_results`	`ExperimentResult \| Mapping[object, ExperimentResult]`	Experiment results.	required
`metrics`	`str \| set[str] \| Sequence[str] \| None`	Metrics included in the comparison. If `None`, all metrics are included.	`None`
`alpha`	`float \| None`	Significance level. If `None`, the value from global settings is used.	`None`
`arbitrary_dependence`	`bool`	If `True`, arbitrary dependence between hypotheses is assumed and Holm's step-down procedure is performed. If `False`, non-negative correlation between hypotheses is assumed and Hochberg's step-up procedure is performed.	`False`
`method`	`Literal['bonferroni', 'sidak']`	Correction method, Bonferroni (`"bonferroni"`) or Šidák (`"sidak"`).	`'sidak'`

Returns:

Type	Description
`MultipleComparisonsResults`	The experiments results with adjusted p-values and alphas.

Parameter defaults

Default for parameter alpha can be changed using the config_context and set_context functions. See the Global configuration reference for details.

References

Examples:

>>> import polars as pl
>>> import tea_tasting as tt

>>> data = pl.concat((
...     tt.make_users_data(
...         seed=42,
...         orders_uplift=0.10,
...         revenue_uplift=0.15,
...         return_type="polars",
...     ),
...     tt.make_users_data(
...         seed=21,
...         orders_uplift=0.15,
...         revenue_uplift=0.20,
...         return_type="polars",
...     )
...         .filter(pl.col("variant").eq(1))
...         .with_columns(variant=pl.lit(2, pl.Int64)),
... ))
>>> data
shape: (6_046, 5)
┌──────┬─────────┬──────────┬────────┬─────────┐
│ user ┆ variant ┆ sessions ┆ orders ┆ revenue │
│ ---  ┆ ---     ┆ ---      ┆ ---    ┆ ---     │
│ i64  ┆ i64     ┆ i64      ┆ i64    ┆ f64     │
╞══════╪═════════╪══════════╪════════╪═════════╡
│ 0    ┆ 1       ┆ 2        ┆ 1      ┆ 9.58    │
│ 1    ┆ 0       ┆ 2        ┆ 1      ┆ 6.43    │
│ 2    ┆ 1       ┆ 2        ┆ 1      ┆ 8.3     │
│ 3    ┆ 1       ┆ 2        ┆ 1      ┆ 16.65   │
│ 4    ┆ 0       ┆ 1        ┆ 1      ┆ 7.14    │
│ …    ┆ …       ┆ …        ┆ …      ┆ …       │
│ 3989 ┆ 2       ┆ 4        ┆ 4      ┆ 34.93   │
│ 3991 ┆ 2       ┆ 1        ┆ 0      ┆ 0.0     │
│ 3992 ┆ 2       ┆ 3        ┆ 3      ┆ 27.96   │
│ 3994 ┆ 2       ┆ 2        ┆ 1      ┆ 17.22   │
│ 3998 ┆ 2       ┆ 3        ┆ 0      ┆ 0.0     │
└──────┴─────────┴──────────┴────────┴─────────┘

>>> experiment = tt.Experiment(
...     sessions_per_user=tt.Mean("sessions"),
...     orders_per_session=tt.RatioOfMeans("orders", "sessions"),
...     orders_per_user=tt.Mean("orders"),
...     revenue_per_user=tt.Mean("revenue"),
... )

>>> # Results without correction.
>>> results = experiment.analyze(data, control=0, all_variants=True)
>>> results
variants             metric control treatment rel_effect_size rel_effect_size_ci  pvalue
  (0, 1)  sessions_per_user    2.00      1.98          -0.66%      [-3.7%, 2.5%]   0.674
  (0, 1) orders_per_session   0.266     0.289            8.8%      [-0.89%, 19%]  0.0762
  (0, 1)    orders_per_user   0.530     0.573            8.0%       [-2.0%, 19%]   0.118
  (0, 1)   revenue_per_user    5.24      5.99             14%        [2.1%, 28%]  0.0211
  (0, 2)  sessions_per_user    2.00      2.02           0.98%      [-2.1%, 4.1%]   0.532
  (0, 2) orders_per_session   0.266     0.295             11%        [1.2%, 22%]  0.0273
  (0, 2)    orders_per_user   0.530     0.594             12%        [1.7%, 23%]  0.0213
  (0, 2)   revenue_per_user    5.24      6.25             19%        [6.6%, 33%] 0.00218

>>> # Success metrics.
>>> metrics = {"orders_per_user", "revenue_per_user"}

>>> # Hochberg's step-up procedure with Šidák correction,
>>> # assuming non-negative correlation between hypotheses.
>>> adjusted_results_fwer = tt.adjust_fwer(results, metrics)
>>> adjusted_results_fwer
comparison           metric control treatment rel_effect_size  pvalue pvalue_adj
    (0, 1)  orders_per_user   0.530     0.573            8.0%   0.118      0.118
    (0, 1) revenue_per_user    5.24      5.99             14%  0.0211     0.0422
    (0, 2)  orders_per_user   0.530     0.594             12%  0.0213     0.0422
    (0, 2) revenue_per_user    5.24      6.25             19% 0.00218    0.00869

>>> # The adjusted confidence level alpha.
>>> adjusted_results_fwer.with_keys((
...     "comparison",
...     "metric",
...     "control",
...     "treatment",
...     "rel_effect_size",
...     "pvalue",
...     "alpha_adj",
... ))
comparison           metric control treatment rel_effect_size  pvalue alpha_adj
    (0, 1)  orders_per_user   0.530     0.573            8.0%   0.118    0.0500
    (0, 1) revenue_per_user    5.24      5.99             14%  0.0211    0.0253
    (0, 2)  orders_per_user   0.530     0.594             12%  0.0213    0.0253
    (0, 2) revenue_per_user    5.24      6.25             19% 0.00218    0.0253

>>> # Holm's step-down procedure with Bonferroni correction,
>>> # assuming arbitrary dependence between hypotheses.
>>> tt.adjust_fwer(
...     results,
...     metrics,
...     arbitrary_dependence=True,
...     method="bonferroni",
... )
comparison           metric control treatment rel_effect_size  pvalue pvalue_adj
    (0, 1)  orders_per_user   0.530     0.573            8.0%   0.118      0.118
    (0, 1) revenue_per_user    5.24      5.99             14%  0.0211     0.0634
    (0, 2)  orders_per_user   0.530     0.594             12%  0.0213     0.0634
    (0, 2) revenue_per_user    5.24      6.25             19% 0.00218    0.00872

Source code in src/tea_tasting/multiplicity.py

def adjust_fwer(
    experiment_results: tea_tasting.experiment.ExperimentResult | Mapping[
        object, tea_tasting.experiment.ExperimentResult],
    metrics: str | set[str] | Sequence[str] | None = None,
    *,
    alpha: float | None = None,
    arbitrary_dependence: bool = False,
    method: Literal["bonferroni", "sidak"] = "sidak",
) -> MultipleComparisonsResults:
    """Adjust p-value and alpha to control the family-wise error rate (FWER).

    The number of hypotheses tested is the total number of metrics included in
    the comparison in all experiment results. For example, if there are
    3 experiments with 2 metrics in each, the number of hypotheses is 6.

    The function performs one of the following procedures, depending on parameters:

    - Hochberg's step-up procedure, assuming non-negative correlation between
        hypotheses (`arbitrary_dependence=False`).
    - Holm's step-down procedure, assuming arbitrary dependence between
        hypotheses (`arbitrary_dependence=True`).

    The function adds the following attributes to the results:

    - `pvalue_adj`: The adjusted p-value, which should be compared with
        the unadjusted FDR (`alpha`).
    - `alpha_adj`: The adjusted FWER, which should be compared with the unadjusted
        p-value (`pvalue`).
    - `null_rejected`: A binary indicator (`0` or `1`) that shows whether
        the null hypothesis is rejected.

    Args:
        experiment_results: Experiment results.
        metrics: Metrics included in the comparison.
            If `None`, all metrics are included.
        alpha: Significance level. If `None`, the value from global settings is used.
        arbitrary_dependence: If `True`, arbitrary dependence between hypotheses
            is assumed and Holm's step-down procedure is performed.
            If `False`, non-negative correlation between hypotheses is assumed
            and Hochberg's step-up procedure is performed.
        method: Correction method, Bonferroni (`"bonferroni"`) or Šidák (`"sidak"`).

    Returns:
        The experiments results with adjusted p-values and alphas.

    Parameter defaults:
        Default for parameter `alpha` can be changed using the `config_context`
        and `set_context` functions.
        See the [Global configuration](https://tea-tasting.e10v.me/api/config/)
        reference for details.

    References:
        - [Multiple comparisons problem](https://en.wikipedia.org/wiki/Multiple_comparisons_problem).
        - [Family-wise error rate](https://en.wikipedia.org/wiki/Family-wise_error_rate).
        - [Holm–Bonferroni method](https://en.wikipedia.org/wiki/Holm%E2%80%93Bonferroni_method).

    Examples:
        ```pycon
        >>> import polars as pl
        >>> import tea_tasting as tt

        >>> data = pl.concat((
        ...     tt.make_users_data(
        ...         seed=42,
        ...         orders_uplift=0.10,
        ...         revenue_uplift=0.15,
        ...         return_type="polars",
        ...     ),
        ...     tt.make_users_data(
        ...         seed=21,
        ...         orders_uplift=0.15,
        ...         revenue_uplift=0.20,
        ...         return_type="polars",
        ...     )
        ...         .filter(pl.col("variant").eq(1))
        ...         .with_columns(variant=pl.lit(2, pl.Int64)),
        ... ))
        >>> data
        shape: (6_046, 5)
        ┌──────┬─────────┬──────────┬────────┬─────────┐
        │ user ┆ variant ┆ sessions ┆ orders ┆ revenue │
        │ ---  ┆ ---     ┆ ---      ┆ ---    ┆ ---     │
        │ i64  ┆ i64     ┆ i64      ┆ i64    ┆ f64     │
        ╞══════╪═════════╪══════════╪════════╪═════════╡
        │ 0    ┆ 1       ┆ 2        ┆ 1      ┆ 9.58    │
        │ 1    ┆ 0       ┆ 2        ┆ 1      ┆ 6.43    │
        │ 2    ┆ 1       ┆ 2        ┆ 1      ┆ 8.3     │
        │ 3    ┆ 1       ┆ 2        ┆ 1      ┆ 16.65   │
        │ 4    ┆ 0       ┆ 1        ┆ 1      ┆ 7.14    │
        │ …    ┆ …       ┆ …        ┆ …      ┆ …       │
        │ 3989 ┆ 2       ┆ 4        ┆ 4      ┆ 34.93   │
        │ 3991 ┆ 2       ┆ 1        ┆ 0      ┆ 0.0     │
        │ 3992 ┆ 2       ┆ 3        ┆ 3      ┆ 27.96   │
        │ 3994 ┆ 2       ┆ 2        ┆ 1      ┆ 17.22   │
        │ 3998 ┆ 2       ┆ 3        ┆ 0      ┆ 0.0     │
        └──────┴─────────┴──────────┴────────┴─────────┘

        >>> experiment = tt.Experiment(
        ...     sessions_per_user=tt.Mean("sessions"),
        ...     orders_per_session=tt.RatioOfMeans("orders", "sessions"),
        ...     orders_per_user=tt.Mean("orders"),
        ...     revenue_per_user=tt.Mean("revenue"),
        ... )

        >>> # Results without correction.
        >>> results = experiment.analyze(data, control=0, all_variants=True)
        >>> results
        variants             metric control treatment rel_effect_size rel_effect_size_ci  pvalue
          (0, 1)  sessions_per_user    2.00      1.98          -0.66%      [-3.7%, 2.5%]   0.674
          (0, 1) orders_per_session   0.266     0.289            8.8%      [-0.89%, 19%]  0.0762
          (0, 1)    orders_per_user   0.530     0.573            8.0%       [-2.0%, 19%]   0.118
          (0, 1)   revenue_per_user    5.24      5.99             14%        [2.1%, 28%]  0.0211
          (0, 2)  sessions_per_user    2.00      2.02           0.98%      [-2.1%, 4.1%]   0.532
          (0, 2) orders_per_session   0.266     0.295             11%        [1.2%, 22%]  0.0273
          (0, 2)    orders_per_user   0.530     0.594             12%        [1.7%, 23%]  0.0213
          (0, 2)   revenue_per_user    5.24      6.25             19%        [6.6%, 33%] 0.00218

        >>> # Success metrics.
        >>> metrics = {"orders_per_user", "revenue_per_user"}

        >>> # Hochberg's step-up procedure with Šidák correction,
        >>> # assuming non-negative correlation between hypotheses.
        >>> adjusted_results_fwer = tt.adjust_fwer(results, metrics)
        >>> adjusted_results_fwer
        comparison           metric control treatment rel_effect_size  pvalue pvalue_adj
            (0, 1)  orders_per_user   0.530     0.573            8.0%   0.118      0.118
            (0, 1) revenue_per_user    5.24      5.99             14%  0.0211     0.0422
            (0, 2)  orders_per_user   0.530     0.594             12%  0.0213     0.0422
            (0, 2) revenue_per_user    5.24      6.25             19% 0.00218    0.00869

        >>> # The adjusted confidence level alpha.
        >>> adjusted_results_fwer.with_keys((
        ...     "comparison",
        ...     "metric",
        ...     "control",
        ...     "treatment",
        ...     "rel_effect_size",
        ...     "pvalue",
        ...     "alpha_adj",
        ... ))
        comparison           metric control treatment rel_effect_size  pvalue alpha_adj
            (0, 1)  orders_per_user   0.530     0.573            8.0%   0.118    0.0500
            (0, 1) revenue_per_user    5.24      5.99             14%  0.0211    0.0253
            (0, 2)  orders_per_user   0.530     0.594             12%  0.0213    0.0253
            (0, 2) revenue_per_user    5.24      6.25             19% 0.00218    0.0253

        >>> # Holm's step-down procedure with Bonferroni correction,
        >>> # assuming arbitrary dependence between hypotheses.
        >>> tt.adjust_fwer(
        ...     results,
        ...     metrics,
        ...     arbitrary_dependence=True,
        ...     method="bonferroni",
        ... )
        comparison           metric control treatment rel_effect_size  pvalue pvalue_adj
            (0, 1)  orders_per_user   0.530     0.573            8.0%   0.118      0.118
            (0, 1) revenue_per_user    5.24      5.99             14%  0.0211     0.0634
            (0, 2)  orders_per_user   0.530     0.594             12%  0.0213     0.0634
            (0, 2) revenue_per_user    5.24      6.25             19% 0.00218    0.00872

        ```
    """  # noqa: E501, RUF002
    alpha = (
        tea_tasting.utils.auto_check(alpha, "alpha")
        if alpha is not None
        else tea_tasting.config.get_config("alpha")
    )
    method = tea_tasting.utils.check_scalar(
        method, "method", typ=str, in_={"sidak", "bonferroni"})
    arbitrary_dependence = tea_tasting.utils.check_scalar(
        arbitrary_dependence, "arbitrary_dependence", typ=bool)

    # results and metric_results refer to the same dicts.
    results, metric_results = _copy_results(experiment_results, metrics)
    method_cls = _Sidak if method == "sidak" else _Bonferroni
    method_ = method_cls(alpha=alpha, m=len(metric_results))  # type: ignore
    procedure = _holm_stepdown if arbitrary_dependence else _hochberg_stepup
    # In-place update.
    procedure(metric_results, method_.adjust)

    return MultipleComparisonsResults(results)

tea_tasting.multiplicity #

MultipleComparisonsResults #

to_arrow() #

to_dicts() #

to_html(keys=None, formatter=get_and_format_num, *, max_rows=None, indent=None) #

to_pandas() #

to_polars() #

to_pretty_dicts(keys=None, formatter=get_and_format_num, *, max_rows=None) #

to_string(keys=None, formatter=get_and_format_num, *, max_rows=None) #

with_defaults(*, keys=None, max_rows=None) #

with_keys(keys) #

with_max_rows(max_rows) #

adjust_fdr(experiment_results, metrics=None, *, alpha=None, arbitrary_dependence=False) #

adjust_fwer(experiment_results, metrics=None, *, alpha=None, arbitrary_dependence=False, method='sidak') #

`tea_tasting.multiplicity` #

`MultipleComparisonsResults` #

`to_arrow()` #

`to_dicts()` #

`to_html(keys=None, formatter=get_and_format_num, *, max_rows=None, indent=None)` #

`to_pandas()` #

`to_polars()` #

`to_pretty_dicts(keys=None, formatter=get_and_format_num, *, max_rows=None)` #

`to_string(keys=None, formatter=get_and_format_num, *, max_rows=None)` #

`with_defaults(*, keys=None, max_rows=None)` #

`with_keys(keys)` #

`with_max_rows(max_rows)` #

`adjust_fdr(experiment_results, metrics=None, *, alpha=None, arbitrary_dependence=False)` #

`adjust_fwer(experiment_results, metrics=None, *, alpha=None, arbitrary_dependence=False, method='sidak')` #