Skip to content

tea_tasting.datasets #

Example datasets.

make_users_data(*, covariates=False, seed=None, n_users=4000, ratio=1, sessions_uplift=0.0, orders_uplift=0.1, revenue_uplift=0.1, avg_sessions=2, avg_orders_per_session=0.25, avg_revenue_per_order=10, to_ibis=False) #

Generate simulated data for A/B testing scenarios.

Data mimics what you might encounter in an A/B test for an online store, with a user-level randomization. Each row represents an individual user with information about:

  • user: User identifier.
  • variant: Variant of the test. 0 is control, 1 is treatment.
  • sessions: Number of user's sessions.
  • orders: Number of user's orders.
  • revenue: Revenue generated by the user.

Optionally, pre-experimental data can be generated as well:

  • sessions_covariate: Number of user's sessions before the experiment.
  • orders_covariate: Number of user's orders before the experiment.
  • revenue_covariate: Revenue generated by the user before the experiment.

Parameters:

Name Type Description Default
covariates bool

If True, generates pre-experimental data as the covariates in addition to default columns.

False
seed int | Generator | SeedSequence | None

Random seed.

None
n_users int

Number of users.

4000
ratio float | int

Ratio of the number of users in treatment relative to control.

1
sessions_uplift float | int

Sessions uplift in the treatment variant, relative to control.

0.0
orders_uplift float

Orders uplift in the treatment variant, relative to control.

0.1
revenue_uplift float

Revenue uplift in the treatment variant, relative to control.

0.1
avg_sessions float | int

Average number of sessions per user.

2
avg_orders_per_session float

Average number of orders per session. Should be less than 1.

0.25
avg_revenue_per_order float | int

Average revenue per order.

10
to_ibis bool

If set to True, returns an Ibis Table; otherwise, returns a Pandas DataFrame.

False

Returns:

Type Description
Table | DataFrame

Simulated data for A/B testing scenarios.

Examples:

import tea_tasting as tt


data = tt.make_users_data(seed=42)
data
#>       user  variant  sessions  orders    revenue
#> 0        0        1         2       1   9.166147
#> 1        1        0         2       1   6.434079
#> 2        2        1         2       1   7.943873
#> 3        3        1         2       1  15.928675
#> 4        4        0         1       1   7.136917
#> ...    ...      ...       ...     ...        ...
#> 3995  3995        0         2       0   0.000000
#> 3996  3996        0         2       0   0.000000
#> 3997  3997        0         3       0   0.000000
#> 3998  3998        0         1       0   0.000000
#> 3999  3999        0         5       2  17.162459
#>
#> [4000 rows x 5 columns]

With covariates:

data = tt.make_users_data(seed=42, covariates=True)
data
#>       user  variant  sessions  orders    revenue  sessions_covariate  orders_covariate  revenue_covariate
#> 0        0        1         2       1   9.166147                   3                 2          19.191712
#> 1        1        0         2       1   6.434079                   4                 1           2.770749
#> 2        2        1         2       1   7.943873                   4                 2          22.568422
#> 3        3        1         2       1  15.928675                   1                 0           0.000000
#> 4        4        0         1       1   7.136917                   1                 1          13.683796
#> ...    ...      ...       ...     ...        ...                 ...               ...                ...
#> 3995  3995        0         2       0   0.000000                   1                 0           0.000000
#> 3996  3996        0         2       0   0.000000                   3                 1          13.517967
#> 3997  3997        0         3       0   0.000000                   2                 0           0.000000
#> 3998  3998        0         1       0   0.000000                   1                 0           0.000000
#> 3999  3999        0         5       2  17.162459                   5                 0           0.000000
#>
#> [4000 rows x 8 columns]
Source code in src/tea_tasting/datasets.py
def make_users_data(
    *,
    covariates: bool = False,
    seed: int | np.random.Generator | np.random.SeedSequence | None = None,
    n_users: int = 4000,
    ratio: float | int = 1,
    sessions_uplift: float | int = 0.0,
    orders_uplift: float = 0.1,
    revenue_uplift: float = 0.1,
    avg_sessions: float | int = 2,
    avg_orders_per_session: float = 0.25,
    avg_revenue_per_order: float | int = 10,
    to_ibis: bool = False,
) -> ibis.expr.types.Table | pd.DataFrame:
    """Simulate user-level data from an A/B test.

    The result resembles what an online store would collect in an A/B test
    where users are randomly assigned to variants. Every row describes
    one user:

    - `user`: User identifier.
    - `variant`: Test variant: 0 for control, 1 for treatment.
    - `sessions`: Number of the user's sessions.
    - `orders`: Number of the user's orders.
    - `revenue`: Revenue generated by the user.

    When `covariates` is `True`, pre-experimental columns are added:

    - `sessions_covariate`: Number of the user's sessions
        before the experiment.
    - `orders_covariate`: Number of the user's orders before the experiment.
    - `revenue_covariate`: Revenue generated by the user
        before the experiment.

    Args:
        covariates: Whether to add the pre-experimental covariate columns
            to the default columns.
        seed: Random seed.
        n_users: Number of users.
        ratio: Number of users in treatment relative to the number of users
            in control.
        sessions_uplift: Relative uplift in sessions, treatment vs. control.
        orders_uplift: Relative uplift in orders, treatment vs. control.
        revenue_uplift: Relative uplift in revenue, treatment vs. control.
        avg_sessions: Average number of sessions per user.
        avg_orders_per_session: Average number of orders per session.
            Should be less than `1`.
        avg_revenue_per_order: Average revenue per order.
        to_ibis: If `True`, the result is an Ibis Table;
            otherwise, it is a Pandas DataFrame.

    Returns:
        Simulated data for A/B testing scenarios.

    Examples:
        ```python
        import tea_tasting as tt


        data = tt.make_users_data(seed=42)
        data
        #>       user  variant  sessions  orders    revenue
        #> 0        0        1         2       1   9.166147
        #> 1        1        0         2       1   6.434079
        #> 2        2        1         2       1   7.943873
        #> 3        3        1         2       1  15.928675
        #> 4        4        0         1       1   7.136917
        #> ...    ...      ...       ...     ...        ...
        #> 3995  3995        0         2       0   0.000000
        #> 3996  3996        0         2       0   0.000000
        #> 3997  3997        0         3       0   0.000000
        #> 3998  3998        0         1       0   0.000000
        #> 3999  3999        0         5       2  17.162459
        #>
        #> [4000 rows x 5 columns]
        ```

        With covariates:

        ```python
        data = tt.make_users_data(seed=42, covariates=True)
        data
        #>       user  variant  sessions  orders    revenue  sessions_covariate  orders_covariate  revenue_covariate
        #> 0        0        1         2       1   9.166147                   3                 2          19.191712
        #> 1        1        0         2       1   6.434079                   4                 1           2.770749
        #> 2        2        1         2       1   7.943873                   4                 2          22.568422
        #> 3        3        1         2       1  15.928675                   1                 0           0.000000
        #> 4        4        0         1       1   7.136917                   1                 1          13.683796
        #> ...    ...      ...       ...     ...        ...                 ...               ...                ...
        #> 3995  3995        0         2       0   0.000000                   1                 0           0.000000
        #> 3996  3996        0         2       0   0.000000                   3                 1          13.517967
        #> 3997  3997        0         3       0   0.000000                   2                 0           0.000000
        #> 3998  3998        0         1       0   0.000000                   1                 0           0.000000
        #> 3999  3999        0         5       2  17.162459                   5                 0           0.000000
        #>
        #> [4000 rows x 8 columns]
        ```
    """  # noqa: E501
    # All generation is delegated to the shared `_make_data` helper;
    # `explode_sessions=False` is the only setting fixed by this wrapper.
    return _make_data(
        explode_sessions=False,
        # Output shape and reproducibility.
        covariates=covariates,
        to_ibis=to_ibis,
        seed=seed,
        # Sample size and split between variants.
        n_users=n_users,
        ratio=ratio,
        # Baseline averages.
        avg_sessions=avg_sessions,
        avg_orders_per_session=avg_orders_per_session,
        avg_revenue_per_order=avg_revenue_per_order,
        # Treatment effects.
        sessions_uplift=sessions_uplift,
        orders_uplift=orders_uplift,
        revenue_uplift=revenue_uplift,
    )

make_sessions_data(*, covariates=False, seed=None, n_users=4000, ratio=1, sessions_uplift=0.0, orders_uplift=0.1, revenue_uplift=0.1, avg_sessions=2, avg_orders_per_session=0.25, avg_revenue_per_order=10, to_ibis=False) #

Generate simulated session-level data for A/B testing scenarios.

Data mimics what you might encounter in an A/B test for an online store, with a user-level randomization. Each row represents a user's session with information about:

  • user: User identifier.
  • variant: Variant of the test. 0 is control, 1 is treatment.
  • sessions: Number of sessions in the row (1 in the examples below, since each row is a single session).
  • orders: Number of orders in the session.
  • revenue: Revenue generated in the session.

Optionally, pre-experimental data can be generated as well:

  • sessions_covariate: Number of user's sessions before the experiment.
  • orders_covariate: Number of user's orders before the experiment.
  • revenue_covariate: Revenue generated by the user before the experiment.

Parameters:

Name Type Description Default
covariates bool

If True, generates pre-experimental data as the covariates in addition to default columns.

False
seed int | Generator | SeedSequence | None

Random seed.

None
n_users int

Number of users.

4000
ratio float | int

Ratio of the number of users in treatment relative to control.

1
sessions_uplift float | int

Sessions uplift in the treatment variant, relative to control.

0.0
orders_uplift float

Orders uplift in the treatment variant, relative to control.

0.1
revenue_uplift float

Revenue uplift in the treatment variant, relative to control.

0.1
avg_sessions float | int

Average number of sessions per user.

2
avg_orders_per_session float

Average number of orders per session. Should be less than 1.

0.25
avg_revenue_per_order float | int

Average revenue per order.

10
to_ibis bool

If set to True, returns an Ibis Table; otherwise, returns a Pandas DataFrame.

False

Returns:

Type Description
Table | DataFrame

Simulated data for A/B testing scenarios.

Examples:

import tea_tasting as tt


data = tt.make_sessions_data(seed=42)
data
#>       user  variant  sessions  orders    revenue
#> 0        0        1         1       1   5.887178
#> 1        0        1         1       1   6.131080
#> 2        1        0         1       1   2.614675
#> 3        1        0         1       1  12.296075
#> 4        2        1         1       1  11.573409
#> ...    ...      ...       ...     ...        ...
#> 7953  3999        0         1       1  23.634941
#> 7954  3999        0         1       0   0.000000
#> 7955  3999        0         1       1   2.396078
#> 7956  3999        0         1       1  24.538111
#> 7957  3999        0         1       0   0.000000
#>
#> [7958 rows x 5 columns]

With covariates:

data = tt.make_sessions_data(seed=42, covariates=True)
data
#>       user  variant  sessions  orders    revenue  sessions_covariate  orders_covariate  revenue_covariate
#> 0        0        1         1       1   5.887178                 1.5               0.5           1.236732
#> 1        0        1         1       1   6.131080                 1.5               0.5           1.236732
#> 2        1        0         1       1   2.614675                 0.0               0.0           0.000000
#> 3        1        0         1       1  12.296075                 0.0               0.0           0.000000
#> 4        2        1         1       1  11.573409                 1.5               1.5          12.324434
#> ...    ...      ...       ...     ...        ...                 ...               ...                ...
#> 7953  3999        0         1       1  23.634941                 0.2               0.0           0.000000
#> 7954  3999        0         1       0   0.000000                 0.2               0.0           0.000000
#> 7955  3999        0         1       1   2.396078                 0.2               0.0           0.000000
#> 7956  3999        0         1       1  24.538111                 0.2               0.0           0.000000
#> 7957  3999        0         1       0   0.000000                 0.2               0.0           0.000000
#>
#> [7958 rows x 8 columns]
Source code in src/tea_tasting/datasets.py
def make_sessions_data(
    *,
    covariates: bool = False,
    seed: int | np.random.Generator | np.random.SeedSequence | None = None,
    n_users: int = 4000,
    ratio: float | int = 1,
    sessions_uplift: float | int = 0.0,
    orders_uplift: float = 0.1,
    revenue_uplift: float = 0.1,
    avg_sessions: float | int = 2,
    avg_orders_per_session: float = 0.25,
    avg_revenue_per_order: float | int = 10,
    to_ibis: bool = False,
) -> ibis.expr.types.Table | pd.DataFrame:
    """Simulate session-level data from an A/B test.

    The result resembles what an online store would collect in an A/B test
    where users are randomly assigned to variants. Every row describes
    one session of a user, so a user can appear in several rows:

    - `user`: User identifier.
    - `variant`: Test variant: 0 for control, 1 for treatment.
    - `sessions`: Number of sessions in the row
        (`1` in the example output below).
    - `orders`: Number of orders in the session.
    - `revenue`: Revenue generated in the session.

    When `covariates` is `True`, pre-experimental columns are added:

    - `sessions_covariate`: Sessions of the user before the experiment.
    - `orders_covariate`: Orders of the user before the experiment.
    - `revenue_covariate`: Revenue generated by the user
        before the experiment.

    In the example output below, the covariate values are the same on every
    row of a user and can be fractional.

    Args:
        covariates: Whether to add the pre-experimental covariate columns
            to the default columns.
        seed: Random seed.
        n_users: Number of users.
        ratio: Number of users in treatment relative to the number of users
            in control.
        sessions_uplift: Relative uplift in sessions, treatment vs. control.
        orders_uplift: Relative uplift in orders, treatment vs. control.
        revenue_uplift: Relative uplift in revenue, treatment vs. control.
        avg_sessions: Average number of sessions per user.
        avg_orders_per_session: Average number of orders per session.
            Should be less than `1`.
        avg_revenue_per_order: Average revenue per order.
        to_ibis: If `True`, the result is an Ibis Table;
            otherwise, it is a Pandas DataFrame.

    Returns:
        Simulated data for A/B testing scenarios.

    Examples:
        ```python
        import tea_tasting as tt


        data = tt.make_sessions_data(seed=42)
        data
        #>       user  variant  sessions  orders    revenue
        #> 0        0        1         1       1   5.887178
        #> 1        0        1         1       1   6.131080
        #> 2        1        0         1       1   2.614675
        #> 3        1        0         1       1  12.296075
        #> 4        2        1         1       1  11.573409
        #> ...    ...      ...       ...     ...        ...
        #> 7953  3999        0         1       1  23.634941
        #> 7954  3999        0         1       0   0.000000
        #> 7955  3999        0         1       1   2.396078
        #> 7956  3999        0         1       1  24.538111
        #> 7957  3999        0         1       0   0.000000
        #>
        #> [7958 rows x 5 columns]
        ```

        With covariates:

        ```python
        data = tt.make_sessions_data(seed=42, covariates=True)
        data
        #>       user  variant  sessions  orders    revenue  sessions_covariate  orders_covariate  revenue_covariate
        #> 0        0        1         1       1   5.887178                 1.5               0.5           1.236732
        #> 1        0        1         1       1   6.131080                 1.5               0.5           1.236732
        #> 2        1        0         1       1   2.614675                 0.0               0.0           0.000000
        #> 3        1        0         1       1  12.296075                 0.0               0.0           0.000000
        #> 4        2        1         1       1  11.573409                 1.5               1.5          12.324434
        #> ...    ...      ...       ...     ...        ...                 ...               ...                ...
        #> 7953  3999        0         1       1  23.634941                 0.2               0.0           0.000000
        #> 7954  3999        0         1       0   0.000000                 0.2               0.0           0.000000
        #> 7955  3999        0         1       1   2.396078                 0.2               0.0           0.000000
        #> 7956  3999        0         1       1  24.538111                 0.2               0.0           0.000000
        #> 7957  3999        0         1       0   0.000000                 0.2               0.0           0.000000
        #>
        #> [7958 rows x 8 columns]
        ```
    """  # noqa: E501
    # All generation is delegated to the shared `_make_data` helper;
    # `explode_sessions=True` is the only setting fixed by this wrapper.
    return _make_data(
        explode_sessions=True,
        # Output shape and reproducibility.
        covariates=covariates,
        to_ibis=to_ibis,
        seed=seed,
        # Sample size and split between variants.
        n_users=n_users,
        ratio=ratio,
        # Baseline averages.
        avg_sessions=avg_sessions,
        avg_orders_per_session=avg_orders_per_session,
        avg_revenue_per_order=avg_revenue_per_order,
        # Treatment effects.
        sessions_uplift=sessions_uplift,
        orders_uplift=orders_uplift,
        revenue_uplift=revenue_uplift,
    )