
#   Copyright 2022 - 2025 The PyMC Labs Developers
#
#   Licensed under the Apache License, Version 2.0 (the "License");
#   you may not use this file except in compliance with the License.
#   You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS,
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.
"""Beta-Geometric Negative Binomial Distribution (BG/NBD) model for a non-contractual customer population across continuous time."""  # noqa: E501

from collections.abc import Sequence
from typing import Literal

import numpy as np
import pandas as pd
import pymc as pm
import xarray
from pymc.util import RandomState
from pymc_extras.prior import Prior
from scipy.special import betaln, expit, hyp2f1

from pymc_marketing.clv.distributions import BetaGeoNBD
from pymc_marketing.clv.models.basic import CLVModel
from pymc_marketing.clv.utils import to_xarray
from pymc_marketing.model_config import ModelConfig


class BetaGeoModel(CLVModel):
    r"""Beta-Geometric Negative Binomial Distribution (BG/NBD) model for a non-contractual customer population across continuous time.

    First introduced by Fader, Hardie & Lee [1]_, with additional predictive methods
    and enhancements in [2]_, [3]_, [4]_ and [5]_.

    The BG/NBD model assumes dropout probabilities for the customer population are Beta distributed,
    and time between transactions follows a Gamma distribution while the customer is still active.

    This model requires data to be summarized by *recency*, *frequency*, and *T* for each customer,
    using `clv.utils.rfm_summary()` or equivalent. Modeling assumptions require *T >= recency*.

    Predictive methods have been adapted from the *BetaGeoFitter* class in the legacy
    *lifetimes* library (see https://github.com/CamDavidsonPilon/lifetimes/).

    Parameters
    ----------
    data : ~pandas.DataFrame
        DataFrame containing the following columns:

        * `customer_id`: Unique customer identifier
        * `frequency`: Number of repeat purchases
        * `recency`: Time between the first and the last purchase
        * `T`: Time between the first purchase and the end of the observation period

    model_config : dict, optional
        Dictionary of model prior parameters:

        * `alpha`: Scale parameter for time between purchases; defaults to `Prior("Weibull", alpha=2, beta=10)`
        * `r`: Shape parameter for time between purchases; defaults to `Prior("Weibull", alpha=2, beta=1)`
        * `a`: Shape parameter of dropout process; defaults to `phi_dropout` * `kappa_dropout`
        * `b`: Shape parameter of dropout process; defaults to `(1 - phi_dropout)` * `kappa_dropout`
        * `phi_dropout`: Nested prior for `a` and `b` priors; defaults to `Prior("Uniform", lower=0, upper=1)`
        * `kappa_dropout`: Nested prior for `a` and `b` priors; defaults to `Prior("Pareto", alpha=1, m=1)`
        * `purchase_coefficient`: Coefficients for purchase rate covariates; defaults to `Prior("Normal", mu=0, sigma=1)`
        * `dropout_coefficient`: Coefficients for dropout covariates; defaults to `Prior("Normal", mu=0, sigma=1)`
        * `purchase_covariate_cols`: List containing column names of covariates for customer purchase rates.
        * `dropout_covariate_cols`: List containing column names of covariates for customer dropouts.

    sampler_config : dict, optional
        Dictionary of sampler parameters. Defaults to *None*.

    Examples
    --------
    .. code-block:: python

        import pandas as pd

        from pymc_extras.prior import Prior
        from pymc_marketing.clv import BetaGeoModel, rfm_summary

        # customer identifiers and purchase datetimes
        # are all that's needed to start modeling
        data = [
            [1, "2024-01-01"],
            [1, "2024-02-06"],
            [2, "2024-01-01"],
            [3, "2024-01-02"],
            [3, "2024-01-05"],
            [4, "2024-01-16"],
            [4, "2024-02-05"],
            [5, "2024-01-17"],
            [5, "2024-01-18"],
            [5, "2024-01-19"],
        ]
        raw_data = pd.DataFrame(data, columns=["id", "date"])

        # preprocess data
        rfm_df = rfm_summary(raw_data, "id", "date")

        # model_config and sampler_config are optional
        model = BetaGeoModel(
            data=rfm_df,
            model_config={
                "r": Prior("Weibull", alpha=2, beta=1),
                "alpha": Prior("HalfFlat"),
                "a": Prior("Beta", alpha=2, beta=3),
                "b": Prior("Beta", alpha=3, beta=2),
            },
            sampler_config={
                "draws": 1000,
                "tune": 1000,
                "chains": 2,
                "cores": 2,
            },
        )

        # The default 'mcmc' fit_method provides informative predictions
        # and reliable performance on small datasets
        model.fit()
        print(model.fit_summary())

        # Maximum a Posteriori can quickly fit a model to large datasets,
        # but will give limited insights into predictive uncertainty.
        model.fit(fit_method="map")
        print(model.fit_summary())

        # Predict number of purchases for current customers
        # over the next 10 time periods
        expected_purchases = model.expected_purchases(future_t=10)

        # Predict probability customers are still active
        probability_alive = model.expected_probability_alive()

        # Predict number of purchases for a new customer over 't' time periods
        expected_purchases_new_customer = model.expected_purchases_new_customer(t=10)

    References
    ----------
    .. [1] Fader, P. S., Hardie, B. G., & Lee, K. L. (2005). "Counting your customers
           the easy way: An alternative to the Pareto/NBD model." Marketing Science,
           24(2), 275-284. http://brucehardie.com/papers/018/fader_et_al_mksc_05.pdf
    .. [2] Fader, P. S., Hardie, B. G., & Lee, K. L. (2008). "Computing P(alive) using
           the BG/NBD model." http://www.brucehardie.com/notes/021/palive_for_BGNBD.pdf
    .. [3] Fader, P. S. & Hardie, B. G. (2013). "Overcoming the BG/NBD Model's #NUM!
           Error Problem." http://brucehardie.com/notes/027/bgnbd_num_error.pdf
    .. [4] Fader, P. S. & Hardie, B. G. (2019). "A Step-by-Step Derivation of the
           BG/NBD Model."
           https://www.brucehardie.com/notes/039/bgnbd_derivation__2019-11-06.pdf
    .. [5] Fader, P. S. & Hardie, B. G. (2007). "Incorporating Time-Invariant Covariates
           into the Pareto/NBD and BG/NBD Models."
           https://www.brucehardie.com/notes/019/time_invariant_covariates.pdf
    """  # noqa: E501

    _model_type = "BG/NBD"  # Beta-Geometric Negative Binomial Distribution
    def __init__(
        self,
        data: pd.DataFrame,
        model_config: dict | None = None,
        sampler_config: dict | None = None,
    ):
        super().__init__(
            data=data,
            model_config=model_config,
            sampler_config=sampler_config,
            non_distributions=["purchase_covariate_cols", "dropout_covariate_cols"],
        )
        self.purchase_covariate_cols = list(
            self.model_config["purchase_covariate_cols"]
        )
        self.dropout_covariate_cols = list(self.model_config["dropout_covariate_cols"])
        self.covariate_cols = self.purchase_covariate_cols + self.dropout_covariate_cols
        self._validate_cols(
            data,
            required_cols=[
                "customer_id",
                "frequency",
                "recency",
                "T",
                *self.covariate_cols,
            ],
            must_be_unique=["customer_id"],
        )
    @property
    def default_model_config(self) -> ModelConfig:
        """Default model configuration."""
        return {
            "alpha": Prior("Weibull", alpha=2, beta=10),
            "r": Prior("Weibull", alpha=2, beta=1),
            "phi_dropout": Prior("Uniform", lower=0, upper=1),
            "kappa_dropout": Prior("Pareto", alpha=1, m=1),
            "purchase_coefficient": Prior("Normal", mu=0, sigma=1),
            "dropout_coefficient": Prior("Normal", mu=0, sigma=1),
            "purchase_covariate_cols": [],
            "dropout_covariate_cols": [],
        }
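    # Note on the hierarchical dropout parameterization used by default:
    # a = phi_dropout * kappa_dropout and b = (1 - phi_dropout) * kappa_dropout,
    # so phi_dropout = a / (a + b) is the mean of the Beta dropout distribution
    # and kappa_dropout = a + b acts as its concentration. Supplying explicit
    # "a" and "b" priors in model_config bypasses this pooling entirely (see
    # the branching in `build_model` below).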
    def build_model(self) -> None:  # type: ignore[override]
        """Build the model."""
        coords = {
            "purchase_covariate": self.purchase_covariate_cols,
            "dropout_covariate": self.dropout_covariate_cols,
            "customer_id": self.data["customer_id"],
            "obs_var": ["recency", "frequency"],
        }
        with pm.Model(coords=coords) as self.model:
            # purchase rate priors
            if self.purchase_covariate_cols:
                purchase_data = pm.Data(
                    "purchase_data",
                    self.data[self.purchase_covariate_cols],
                    dims=["customer_id", "purchase_covariate"],
                )
                self.model_config["purchase_coefficient"].dims = "purchase_covariate"
                purchase_coefficient_alpha = self.model_config[
                    "purchase_coefficient"
                ].create_variable("purchase_coefficient_alpha")

                alpha_scale = self.model_config["alpha"].create_variable("alpha_scale")
                alpha = pm.Deterministic(
                    "alpha",
                    (
                        alpha_scale
                        * pm.math.exp(
                            -pm.math.dot(purchase_data, purchase_coefficient_alpha)
                        )
                    ),
                    dims="customer_id",
                )
            else:
                alpha = self.model_config["alpha"].create_variable("alpha")

            # dropout priors
            if "a" in self.model_config and "b" in self.model_config:
                if self.dropout_covariate_cols:
                    dropout_data = pm.Data(
                        "dropout_data",
                        self.data[self.dropout_covariate_cols],
                        dims=["customer_id", "dropout_covariate"],
                    )
                    self.model_config["dropout_coefficient"].dims = "dropout_covariate"
                    dropout_coefficient_a = self.model_config[
                        "dropout_coefficient"
                    ].create_variable("dropout_coefficient_a")
                    dropout_coefficient_b = self.model_config[
                        "dropout_coefficient"
                    ].create_variable("dropout_coefficient_b")

                    a_scale = self.model_config["a"].create_variable("a_scale")
                    b_scale = self.model_config["b"].create_variable("b_scale")
                    a = pm.Deterministic(
                        "a",
                        a_scale
                        * pm.math.exp(pm.math.dot(dropout_data, dropout_coefficient_a)),
                        dims="customer_id",
                    )
                    b = pm.Deterministic(
                        "b",
                        b_scale
                        * pm.math.exp(pm.math.dot(dropout_data, dropout_coefficient_b)),
                        dims="customer_id",
                    )
                else:
                    a = self.model_config["a"].create_variable("a")
                    b = self.model_config["b"].create_variable("b")
            else:
                # hierarchical pooling of dropout rate priors
                if self.dropout_covariate_cols:
                    dropout_data = pm.Data(
                        "dropout_data",
                        self.data[self.dropout_covariate_cols],
                        dims=["customer_id", "dropout_covariate"],
                    )
                    self.model_config["dropout_coefficient"].dims = "dropout_covariate"
                    dropout_coefficient_a = self.model_config[
                        "dropout_coefficient"
                    ].create_variable("dropout_coefficient_a")
                    dropout_coefficient_b = self.model_config[
                        "dropout_coefficient"
                    ].create_variable("dropout_coefficient_b")

                    phi_dropout = self.model_config["phi_dropout"].create_variable(
                        "phi_dropout"
                    )
                    kappa_dropout = self.model_config["kappa_dropout"].create_variable(
                        "kappa_dropout"
                    )

                    a_scale = pm.Deterministic(
                        "a_scale",
                        phi_dropout * kappa_dropout,
                    )
                    b_scale = pm.Deterministic(
                        "b_scale",
                        (1.0 - phi_dropout) * kappa_dropout,
                    )
                    a = pm.Deterministic(
                        "a",
                        a_scale
                        * pm.math.exp(pm.math.dot(dropout_data, dropout_coefficient_a)),
                        dims="customer_id",
                    )
                    b = pm.Deterministic(
                        "b",
                        b_scale
                        * pm.math.exp(pm.math.dot(dropout_data, dropout_coefficient_b)),
                        dims="customer_id",
                    )
                else:
                    phi_dropout = self.model_config["phi_dropout"].create_variable(
                        "phi_dropout"
                    )
                    kappa_dropout = self.model_config["kappa_dropout"].create_variable(
                        "kappa_dropout"
                    )

                    a = pm.Deterministic("a", phi_dropout * kappa_dropout)
                    b = pm.Deterministic("b", (1.0 - phi_dropout) * kappa_dropout)

            # r remains unchanged with or without covariates
            r = self.model_config["r"].create_variable("r")

            BetaGeoNBD(
                name="recency_frequency",
                a=a,
                b=b,
                r=r,
                alpha=alpha,
                T=self.data["T"],
                observed=np.stack(
                    (self.data["recency"], self.data["frequency"]), axis=1
                ),
                dims=["customer_id", "obs_var"],
            )
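    # Covariate link functions used in `build_model` above, following the
    # time-invariant covariate formulation in reference [5] of the class
    # docstring. Each customer's parameters are the population-level scale
    # times an exponential tilt of their covariate vector x_i:
    #
    #     alpha_i = alpha_scale * exp(-x_i . purchase_coefficient_alpha)
    #     a_i     = a_scale     * exp( x_i . dropout_coefficient_a)
    #     b_i     = b_scale     * exp( x_i . dropout_coefficient_b)
    #
    # The negative sign on the purchase term means a positive coefficient
    # shrinks alpha (the scale of time between purchases), i.e. a higher
    # purchase rate.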
    # TODO: delete this utility after API standardization is completed
    def _unload_params(self):
        trace = self.idata.posterior
        a = trace["a"]
        b = trace["b"]
        alpha = trace["alpha"]
        r = trace["r"]
        return a, b, alpha, r

    def _extract_predictive_variables(
        self,
        data: pd.DataFrame,
        customer_varnames: Sequence[str] = (),
    ) -> xarray.Dataset:
        """Extract predictive variables from the data.

        Utility function assigning default customer arguments for predictive
        methods and converting to xarrays.
        """
        self._validate_cols(
            data,
            required_cols=[
                "customer_id",
                *customer_varnames,
                *self.purchase_covariate_cols,
                *self.dropout_covariate_cols,
            ],
            must_be_unique=["customer_id"],
        )

        customer_id = data["customer_id"]
        model_coords = self.model.coords
        if self.purchase_covariate_cols:
            purchase_xarray = xarray.DataArray(
                data[self.purchase_covariate_cols],
                dims=["customer_id", "purchase_covariate"],
                coords=[customer_id, list(model_coords["purchase_covariate"])],
            )
            alpha_scale = self.fit_result["alpha_scale"]
            purchase_coefficient_alpha = self.fit_result["purchase_coefficient_alpha"]
            alpha = alpha_scale * np.exp(
                -xarray.dot(
                    purchase_xarray,
                    purchase_coefficient_alpha,
                    dim="purchase_covariate",
                )
            )
            alpha.name = "alpha"
        else:
            alpha = self.fit_result["alpha"]

        if self.dropout_covariate_cols:
            dropout_xarray = xarray.DataArray(
                data[self.dropout_covariate_cols],
                dims=["customer_id", "dropout_covariate"],
                coords=[customer_id, list(model_coords["dropout_covariate"])],
            )
            a_scale = self.fit_result["a_scale"]
            dropout_coefficient_a = self.fit_result["dropout_coefficient_a"]
            a = a_scale * np.exp(
                xarray.dot(
                    dropout_xarray, dropout_coefficient_a, dim="dropout_covariate"
                )
            )
            a.name = "a"

            b_scale = self.fit_result["b_scale"]
            dropout_coefficient_b = self.fit_result["dropout_coefficient_b"]
            b = b_scale * np.exp(
                xarray.dot(
                    dropout_xarray, dropout_coefficient_b, dim="dropout_covariate"
                )
            )
            b.name = "b"
        else:
            a = self.fit_result["a"]
            b = self.fit_result["b"]

        r = self.fit_result["r"]

        customer_vars = to_xarray(
            data["customer_id"],
            *[data[customer_varname] for customer_varname in customer_varnames],
        )
        if len(customer_varnames) == 1:
            customer_vars = [customer_vars]

        return xarray.combine_by_coords(
            (
                a,
                b,
                alpha,
                r,
                *customer_vars,
            )
        )
    def expected_purchases(
        self,
        data: pd.DataFrame | None = None,
        *,
        future_t: int | np.ndarray | pd.Series | None = None,
    ) -> xarray.DataArray:
        r"""Compute the expected number of future purchases across *future_t* time periods given *recency*, *frequency*, and *T* for each customer.

        The *data* parameter is only required for out-of-sample customers.

        Adapted from equation (10) in [1]_, and *lifetimes* package:
        https://github.com/CamDavidsonPilon/lifetimes/blob/41e394923ad72b17b5da93e88cfabab43f51abe2/lifetimes/fitters/beta_geo_fitter.py#L201

        Parameters
        ----------
        future_t : int, array_like
            Number of time periods to predict expected purchases.
        data : ~pandas.DataFrame, optional
            Optional dataframe containing the following columns:

            * `customer_id`: Unique customer identifier
            * `frequency`: Number of repeat purchases
            * `recency`: Time between the first and the last purchase
            * `T`: Time between first purchase and end of observation period;
              model assumptions require T >= recency

        References
        ----------
        .. [1] Fader, Peter S., Bruce G. S. Hardie, and Ka Lok Lee (2005),
               "Counting Your Customers the Easy Way: An Alternative to the
               Pareto/NBD Model," Marketing Science, 24 (2), 275-84.
               https://www.brucehardie.com/papers/bgnbd_2004-04-20.pdf
        """  # noqa: E501
        if data is None:
            data = self.data

        if future_t is not None:
            data = data.assign(future_t=future_t)

        dataset = self._extract_predictive_variables(
            data, customer_varnames=["frequency", "recency", "T", "future_t"]
        )
        a = dataset["a"]
        b = dataset["b"]
        alpha = dataset["alpha"]
        r = dataset["r"]
        x = dataset["frequency"]
        t_x = dataset["recency"]
        T = dataset["T"]
        t = dataset["future_t"]

        numerator = 1 - ((alpha + T) / (alpha + T + t)) ** (r + x) * hyp2f1(
            r + x,
            b + x,
            a + b + x - 1,
            t / (alpha + T + t),
        )
        numerator *= (a + b + x - 1) / (a - 1)
        denominator = 1 + (x > 0) * (a / (b + x - 1)) * (
            (alpha + T) / (alpha + t_x)
        ) ** (r + x)

        return (numerator / denominator).transpose(
            "chain", "draw", "customer_id", missing_dims="ignore"
        )
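    # A minimal usage sketch (assumes `model` has already been fit; the
    # 10-period horizon is illustrative, not a default):
    #
    #     purchases = model.expected_purchases(future_t=10)
    #     purchases.mean(("chain", "draw"))  # posterior mean per customer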
    def expected_probability_alive(
        self,
        data: pd.DataFrame | None = None,
    ) -> xarray.DataArray:
        r"""Compute the probability a customer with history *frequency*, *recency*, and *T* is currently active.

        The *data* parameter is only required for out-of-sample customers.

        Adapted from page 2 in Bruce Hardie's notes [1]_, and *lifetimes* package:
        https://github.com/CamDavidsonPilon/lifetimes/blob/41e394923ad72b17b5da93e88cfabab43f51abe2/lifetimes/fitters/beta_geo_fitter.py#L260

        Parameters
        ----------
        data : ~pandas.DataFrame, optional
            Optional dataframe containing the following columns:

            * `customer_id`: Unique customer identifier
            * `frequency`: Number of repeat purchases
            * `recency`: Time between the first and the last purchase
            * `T`: Time between first purchase and end of observation period;
              model assumptions require T >= recency

        References
        ----------
        .. [1] Fader, P. S., Hardie, B. G., & Lee, K. L. (2008). "Computing
               P(alive) using the BG/NBD model."
               http://www.brucehardie.com/notes/021/palive_for_BGNBD.pdf
        """
        if data is None:
            data = self.data

        dataset = self._extract_predictive_variables(
            data, customer_varnames=["frequency", "recency", "T"]
        )
        a = dataset["a"]
        b = dataset["b"]
        alpha = dataset["alpha"]
        r = dataset["r"]
        x = dataset["frequency"]
        t_x = dataset["recency"]
        T = dataset["T"]

        log_div = (r + x) * np.log((alpha + T) / (alpha + t_x)) + np.log(
            a / (b + np.maximum(x, 1) - 1)
        )

        return xarray.where(x == 0, 1.0, expit(-log_div)).transpose(
            "chain", "draw", "customer_id", missing_dims="ignore"
        )
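    # Note on the computation above: P(alive) = 1 / (1 + exp(log_div)), which
    # is exactly expit(-log_div). Working with the log of the ratio keeps the
    # power term ((alpha + T) / (alpha + t_x)) ** (r + x) from overflowing for
    # large r + x; customers with no repeat purchases (x == 0) are alive with
    # probability 1 by construction.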
    def expected_probability_no_purchase(
        self,
        t: int,
        data: pd.DataFrame | None = None,
    ) -> xarray.DataArray:
        r"""Compute the probability a customer with history *frequency*, *recency*, and *T* will make no purchases in the period (T, T+t].

        The *data* parameter is only required for out-of-sample customers.

        Adapted from Section 5.3, Equation 34 in Bruce Hardie's notes [1]_.

        Parameters
        ----------
        t : int
            Number of time periods after T defining the range (T, T+t].
        data : ~pandas.DataFrame, optional
            Optional dataframe containing the following columns:

            * `customer_id`: Unique customer identifier
            * `frequency`: Number of repeat purchases
            * `recency`: Time between the first and the last purchase
            * `T`: Time between first purchase and end of observation period;
              model assumptions require T >= recency

        References
        ----------
        .. [1] Fader, P. S. & Hardie, B. G. (2019). "A Step-by-Step Derivation
               of the BG/NBD Model."
               https://www.brucehardie.com/notes/039/bgnbd_derivation__2019-11-06.pdf
        """  # noqa: D205
        if data is None:
            data = self.data

        dataset = self._extract_predictive_variables(
            data, customer_varnames=["frequency", "recency", "T"]
        )
        a = dataset["a"]
        b = dataset["b"]
        alpha = dataset["alpha"]
        r = dataset["r"]
        x = dataset["frequency"]
        t_x = dataset["recency"]
        T = dataset["T"]

        E = alpha + t_x
        F = alpha + T + t
        M = alpha + T

        beta_rep = betaln(a, b + x)
        K_E = betaln(a + 1, b + x - 1) - (r + x) * np.log(E)
        K_F = beta_rep - (r + x) * np.log(F)
        K_M = beta_rep - (r + x) * np.log(M)

        K1 = np.maximum(K_E, K_F)
        K2 = np.maximum(K_E, K_M)

        numer = np.exp(K_E - K1) + np.exp(K_F - K1)
        denom = np.exp(K_E - K2) + np.exp(K_M - K2)
        prob_no_purchase = np.exp(K1 - K2) * numer / denom

        return prob_no_purchase.transpose(
            "chain", "draw", "customer_id", missing_dims="ignore"
        )
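    # The K_E / K_F / K_M quantities above are the logarithms of the
    # beta-function ratios appearing in Equation 34; subtracting the running
    # maxima K1 and K2 before exponentiating is the standard log-sum-exp
    # stabilization, keeping the ratio finite even when the individual terms
    # would under- or overflow in double precision.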
    def expected_purchases_new_customer(
        self,
        data: pd.DataFrame | None = None,
        *,
        t: int | np.ndarray | pd.Series | None = None,
    ) -> xarray.DataArray:
        r"""Compute the expected number of purchases for a new customer across *t* time periods.

        Adapted from equation (9) in [1]_, and *lifetimes* library:
        https://github.com/CamDavidsonPilon/lifetimes/blob/41e394923ad72b17b5da93e88cfabab43f51abe2/lifetimes/fitters/beta_geo_fitter.py#L328

        Parameters
        ----------
        t : array_like
            Number of time periods over which to estimate purchases.

        References
        ----------
        .. [1] Fader, Peter S., Bruce G. S. Hardie, and Ka Lok Lee (2005),
               "Counting Your Customers the Easy Way: An Alternative to the
               Pareto/NBD Model," Marketing Science, 24 (2), 275-84.
               https://www.brucehardie.com/papers/bgnbd_2004-04-20.pdf
        """
        # TODO: This is extraneous now, but needed for future covariate support.
        if data is None:
            data = self.data

        if t is not None:
            data = data.assign(t=t)

        dataset = self._extract_predictive_variables(data, customer_varnames=["t"])
        a = dataset["a"]
        b = dataset["b"]
        alpha = dataset["alpha"]
        r = dataset["r"]
        t = dataset["t"]

        first_term = (a + b - 1) / (a - 1)
        second_term = 1 - (alpha / (alpha + t)) ** r * hyp2f1(
            r, b, a + b - 1, t / (alpha + t)
        )

        return (first_term * second_term).transpose(
            "chain", "draw", "customer_id", missing_dims="ignore"
        )
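    # Equation (9) as computed above:
    #
    #     E[X(t)] = (a + b - 1) / (a - 1)
    #               * [1 - (alpha / (alpha + t)) ** r
    #                      * 2F1(r, b; a + b - 1; t / (alpha + t))]
    #
    # where 2F1 is the Gaussian hypergeometric function
    # (scipy.special.hyp2f1).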
    def distribution_new_customer(
        self,
        data: pd.DataFrame | None = None,
        *,
        T: int | np.ndarray | pd.Series | None = None,
        random_seed: RandomState | None = None,
        var_names: Sequence[
            Literal["dropout", "purchase_rate", "recency_frequency"]
        ] = ("dropout", "purchase_rate", "recency_frequency"),
        n_samples: int = 1000,
    ) -> xarray.Dataset:
        """Compute posterior predictive samples of dropout, purchase rate and frequency/recency of new customers.

        In a model with covariates, if `data` is not specified, the dataset used for
        fitting will be used and a prediction will be computed for a *new customer*
        with each set of covariates. *This is not a conditional prediction for
        observed customers!*

        Parameters
        ----------
        data : ~pandas.DataFrame, optional
            DataFrame containing the following columns:

            * `customer_id`: Unique customer identifier
            * `T`: Time between the first purchase and the end of the observation period

            If not provided, predictions will be run with the data used to fit the model.
        T : array_like, optional
            Time between the first purchase and the end of the observation period.
            Not needed if the `data` parameter is provided with a `T` column.
        random_seed : ~numpy.random.RandomState, optional
            Random state to use for sampling.
        var_names : sequence of str, optional
            Names of the variables to sample from. Defaults to
            ["dropout", "purchase_rate", "recency_frequency"].
        n_samples : int, optional
            Number of samples to generate. Defaults to 1000.
        """
        if data is None:
            data = self.data

        if T is not None:
            data = data.assign(T=T)

        dataset = self._extract_predictive_variables(data, customer_varnames=["T"])
        T = dataset["T"].values
        # Delete "T" so we can pass dataset directly to `sample_posterior_predictive`
        del dataset["T"]

        if dataset.sizes["chain"] == 1 and dataset.sizes["draw"] == 1:
            # For MAP fits, add a dummy draw dimension
            dataset = dataset.squeeze("draw").expand_dims(draw=range(n_samples))

        coords = self.model.coords.copy()  # type: ignore
        coords["customer_id"] = data["customer_id"]

        with pm.Model(coords=coords) as pred_model:
            if self.purchase_covariate_cols:
                alpha = pm.Flat("alpha", dims=["customer_id"])
            else:
                alpha = pm.Flat("alpha")
            if self.dropout_covariate_cols:
                a = pm.Flat("a", dims=["customer_id"])
                b = pm.Flat("b", dims=["customer_id"])
            else:
                a = pm.Flat("a")
                b = pm.Flat("b")
            r = pm.Flat("r")

            pm.Beta(
                "dropout", alpha=a, beta=b, dims=pred_model.named_vars_to_dims.get("a")
            )
            pm.Gamma(
                "purchase_rate",
                alpha=r,
                beta=alpha,
                dims=pred_model.named_vars_to_dims.get("alpha"),
            )

            BetaGeoNBD(
                name="recency_frequency",
                a=a,
                b=b,
                r=r,
                alpha=alpha,
                T=T,
                dims=["customer_id", "obs_var"],
            )

            return pm.sample_posterior_predictive(
                dataset,
                var_names=var_names,
                random_seed=random_seed,
                predictions=True,
            ).predictions
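    # A minimal usage sketch (assumes a fitted model; T=52 is an illustrative
    # observation horizon, not an API default):
    #
    #     preds = model.distribution_new_customer(T=52, random_seed=42)
    #     preds["dropout"]        # Beta dropout probabilities for new customers
    #     preds["purchase_rate"]  # Gamma purchase rates for new customers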
    def distribution_new_customer_dropout(
        self,
        data: pd.DataFrame | None = None,
        *,
        random_seed: RandomState | None = None,
    ) -> xarray.Dataset:
        """Sample the Beta distribution for the population-level dropout rate.

        This is the probability that a new customer will "drop out" and make no
        further purchases.

        Parameters
        ----------
        random_seed : RandomState, optional
            Random state to use for sampling.

        Returns
        -------
        xarray.Dataset
            Dataset containing the posterior samples for the population-level
            dropout rate.
        """
        return self.distribution_new_customer(
            data=data,
            random_seed=random_seed,
            var_names=["dropout"],
        )["dropout"]
    def distribution_new_customer_purchase_rate(
        self,
        data: pd.DataFrame | None = None,
        *,
        random_seed: RandomState | None = None,
    ) -> xarray.Dataset:
        """Sample the Gamma distribution for the population-level purchase rate.

        This is the purchase rate for a new customer and determines the time
        between purchases for any new customer.

        Parameters
        ----------
        random_seed : RandomState, optional
            Random state to use for sampling.

        Returns
        -------
        xarray.Dataset
            Dataset containing the posterior samples for the population-level
            purchase rate.
        """
        return self.distribution_new_customer(
            data=data,
            random_seed=random_seed,
            var_names=["purchase_rate"],
        )["purchase_rate"]
    def distribution_new_customer_recency_frequency(
        self,
        data: pd.DataFrame | None = None,
        *,
        T: int | np.ndarray | pd.Series | None = None,
        random_seed: RandomState | None = None,
        n_samples: int = 1000,
    ) -> xarray.Dataset:
        """BG/NBD process representing purchases across the customer population.

        This is the distribution of purchase frequencies given 'T' observation
        periods for each customer.

        Parameters
        ----------
        data : ~pandas.DataFrame, optional
            DataFrame containing the following columns:

            * `customer_id`: Unique customer identifier
            * `T`: Time between the first purchase and the end of the observation period.
            * All covariate columns specified when model was initialized.

            If not provided, the method will use the fit dataset.
        T : array_like, optional
            Number of observation periods for each customer. If not provided,
            T values from the fit dataset will be used. Not required if the
            `data` DataFrame contains a `T` column.
        random_seed : ~numpy.random.RandomState, optional
            Random state to use for sampling.
        n_samples : int, optional
            Number of samples to generate. Defaults to 1000.

        Returns
        -------
        ~xarray.Dataset
            Dataset containing the posterior samples for the customer population.
        """
        return self.distribution_new_customer(
            data=data,
            T=T,
            random_seed=random_seed,
            var_names=["recency_frequency"],
            n_samples=n_samples,
        )["recency_frequency"]
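    # A minimal usage sketch (hypothetical 52-period horizon; n_samples reduced
    # for speed):
    #
    #     rf = model.distribution_new_customer_recency_frequency(T=52, n_samples=500)
    #     rf.sel(obs_var="frequency").mean(("chain", "draw"))  # expected repeat purchases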