# Source code for pymc_marketing.clv.models.shifted_beta_geo

#   Copyright 2022 - 2025 The PyMC Labs Developers
#
#   Licensed under the Apache License, Version 2.0 (the "License");
#   you may not use this file except in compliance with the License.
#   You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS,
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.
"""Shifted Beta Geometric model."""

from collections.abc import Sequence

import numpy as np
import pandas as pd
import pymc as pm
from pymc.util import RandomState
from pymc_extras.prior import Prior
from xarray import DataArray, Dataset

from pymc_marketing.clv.models import CLVModel
from pymc_marketing.model_config import ModelConfig


class ShiftedBetaGeoModelIndividual(CLVModel):
    """Shifted Beta Geometric model.

    Model for customer behavior in a discrete contractual setting. It assumes that:

    * At the end of each period, a customer has a probability `theta` of renewing
      the contract and `1-theta` of cancelling
    * The probability `theta` does not change over time for a given customer
    * The probability `theta` varies across customers according to a Beta prior
      distribution with hyperparameters `alpha` and `beta`.

    Based on [1]_.

    Parameters
    ----------
    data: pd.DataFrame
        DataFrame containing the following columns:

        * `customer_id`: Customer labels. There should be one unique label for
          each customer
        * `t_churn`: Time at which the customer cancelled the contract
          (starting at 0). It should equal T for users that have not cancelled
          by the end of the observation period
        * `T`: Maximum observed time period (starting at 0)
    model_config: dict, optional
        Dictionary of model prior parameters. If not provided, the model will
        use default priors specified in the `default_model_config` class
        attribute.
    sampler_config: dict, optional
        Dictionary of sampler parameters. Defaults to None.

    Examples
    --------
    .. code-block:: python

        import pandas as pd
        from pymc_extras.prior import Prior

        from pymc_marketing.clv import ShiftedBetaGeoModelIndividual

        customer_id = [0, 1, 2, 3, 4]

        model = ShiftedBetaGeoModelIndividual(
            data=pd.DataFrame(
                {
                    "customer_id": customer_id,
                    "t_churn": [1, 2, 8, 4, 8],
                    "T": [8] * len(customer_id),
                }
            ),
            model_config={
                "alpha": Prior("HalfNormal", sigma=10),
                "beta": Prior("HalfStudentT", nu=4, sigma=10),
            },
            sampler_config={
                "draws": 1000,
                "tune": 1000,
                "chains": 2,
                "cores": 2,
                "nuts_kwargs": {"target_accept": 0.95},
            },
        )

        model.fit()
        print(model.fit_summary())

        # Predict how many periods in the future are existing customers
        # likely to cancel (ignoring that some may already have cancelled)
        expected_churn_time = model.distribution_customer_churn_time(
            customer_id=customer_id,
        )
        print(expected_churn_time.mean("customer_id"))

        # Predict churn time for 10 new customers, conditioned on data
        new_customers_churn_time = model.distribution_new_customer_churn_time(n=10)
        print(new_customers_churn_time.mean("new_customer_id"))

    References
    ----------
    .. [1] Fader, P. S., & Hardie, B. G. (2007). How to project customer retention.
           Journal of Interactive Marketing, 21(1), 76-90.
           https://journals.sagepub.com/doi/pdf/10.1002/dir.20074

    """

    _model_type = "Shifted-Beta-Geometric Model (Individual Customers)"

    def __init__(
        self,
        data: pd.DataFrame,
        model_config: ModelConfig | None = None,
        sampler_config: dict | None = None,
    ):
        """Validate `data` and initialise the underlying CLV model.

        Parameters
        ----------
        data: pd.DataFrame
            Must provide the columns `customer_id`, `t_churn` and `T`.
        model_config: ModelConfig, optional
            Prior configuration forwarded to the parent class.
        sampler_config: dict, optional
            Sampler configuration forwarded to the parent class.

        Raises
        ------
        ValueError
            If any `t_churn` value is negative, greater than the matching `T`,
            or NaN.

        """
        # Inherited helper; presumably rejects missing columns and duplicated
        # customer ids -- its implementation is not visible from this class.
        self._validate_cols(
            data,
            required_cols=["customer_id", "t_churn", "T"],
            must_be_unique=["customer_id"],
        )

        # NOTE(review): this check lets t_churn == 0 through, while the error
        # message below states the contract as 0 < t_churn. Confirm which of
        # the two is intended before tightening the condition.
        invalid_t_churn = (
            (data["t_churn"] < 0)
            | (data["t_churn"] > data["T"])
            | np.isnan(data["t_churn"])
        )
        if np.any(invalid_t_churn):
            # A single message string: adjacent literals are concatenated, so
            # str(exc) is the readable two-line text rather than a tuple repr.
            raise ValueError(
                "t_churn must respect 0 < t_churn <= T.\n"
                "Customers that are still alive should have t_churn = T"
            )

        super().__init__(
            data=data, model_config=model_config, sampler_config=sampler_config
        )

    @property
    def default_model_config(self) -> dict:
        """Default model configuration: `HalfFlat` priors on `alpha` and `beta`."""
        return {
            "alpha": Prior("HalfFlat"),
            "beta": Prior("HalfFlat"),
        }

    def build_model(self) -> None:  # type: ignore[override]
        """Build the model.

        Creates `self.model` with the configured `alpha` and `beta` priors, one
        Beta-distributed `theta` per customer, and a Geometric churn time that
        is right-censored at each customer's `T` and observed as `t_churn`.
        """
        coords = {"customer_id": self.data["customer_id"]}
        with pm.Model(coords=coords) as self.model:
            alpha = self.model_config["alpha"].create_variable("alpha")
            beta = self.model_config["beta"].create_variable("beta")

            theta = pm.Beta("theta", alpha, beta, dims=("customer_id",))

            # Unregistered base distribution; pm.Censored wraps it below.
            churn_raw = pm.Geometric.dist(theta)
            pm.Censored(
                "churn_censored",
                churn_raw,
                lower=None,
                upper=self.data["T"],
                observed=self.data["t_churn"],
                dims=("customer_id",),
            )

    def _sample_churn_predictive(
        self,
        dim: str,
        coord_values,
        var_names: Sequence[str],
        random_seed: RandomState = None,
    ) -> Dataset:
        """Sample `var_names` from an uncensored churn model given `self.idata`.

        Parameters
        ----------
        dim: str
            Name of the dimension indexing `theta` and `churn`.
        coord_values
            Coordinate labels for `dim`.
        var_names: Sequence[str]
            Variables to sample; the model defines `theta` and `churn`.
        random_seed: RandomState, optional
            Seed forwarded to `pm.sample_posterior_predictive`.

        Returns
        -------
        Dataset
            The `posterior_predictive` group of the sampling result.

        """
        with pm.Model(coords={dim: coord_values}):
            # HalfFlat placeholders: they are never listed in `var_names`, so
            # their draws are expected to be read from `self.idata`.
            alpha = pm.HalfFlat("alpha")
            beta = pm.HalfFlat("beta")

            theta = pm.Beta("theta", alpha, beta, dims=(dim,))
            pm.Geometric("churn", theta, dims=(dim,))

            return pm.sample_posterior_predictive(
                self.idata,
                var_names=var_names,
                random_seed=random_seed,
            ).posterior_predictive

    def distribution_customer_churn_time(
        self, customer_id: np.ndarray | pd.Series, random_seed: RandomState = None
    ) -> DataArray:
        """Sample distribution of churn time for existing customers.

        The draws represent the number of periods into the future after which
        a customer cancels their contract.

        It ignores that some customers may have already cancelled.

        Parameters
        ----------
        customer_id: np.ndarray or pd.Series
            Labels used as the `customer_id` coordinate of the result.
        random_seed: RandomState, optional
            Seed for the posterior predictive sampling.

        Returns
        -------
        DataArray
            Posterior predictive draws of `churn`.

        """
        return self._sample_churn_predictive(
            dim="customer_id",
            coord_values=customer_id,
            var_names=["churn"],
            random_seed=random_seed,
        )["churn"]

    def _distribution_new_customer(
        self,
        n: int = 1,
        random_seed: RandomState = None,
        var_names: Sequence[str] = ("theta", "churn"),
    ) -> Dataset:
        """Sample `var_names` for `n` new customers labelled `0..n-1`.

        The draws are indexed by a `new_customer_id` dimension.
        """
        return self._sample_churn_predictive(
            dim="new_customer_id",
            coord_values=np.arange(n),
            var_names=var_names,
            random_seed=random_seed,
        )

    def distribution_new_customer_churn_time(
        self, n: int = 1, random_seed: RandomState = None
    ) -> DataArray:
        """Sample distribution of churn time for new customers.

        The draws represent the number of periods into the future after which
        a customer cancels their contract.

        Use `n > 1` to simulate multiple identically distributed users.

        Parameters
        ----------
        n: int
            Number of new customers to simulate. Defaults to 1.
        random_seed: RandomState, optional
            Seed for the posterior predictive sampling.

        Returns
        -------
        DataArray
            Posterior predictive draws of `churn`.

        """
        return self._distribution_new_customer(
            n=n, random_seed=random_seed, var_names=["churn"]
        )["churn"]

    def distribution_new_customer_theta(
        self, n: int = 1, random_seed: RandomState = None
    ) -> DataArray:
        """Sample distribution of theta parameter for new customers.

        Use `n > 1` to simulate multiple identically distributed users.

        Parameters
        ----------
        n: int
            Number of new customers to simulate. Defaults to 1.
        random_seed: RandomState, optional
            Seed for the posterior predictive sampling.

        Returns
        -------
        DataArray
            Posterior predictive draws of `theta`.

        """
        return self._distribution_new_customer(
            n=n, random_seed=random_seed, var_names=["theta"]
        )["theta"]