# Copyright 2022 - 2025 The PyMC Labs Developers
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Shifted Beta Geometric model."""
from collections.abc import Sequence
import numpy as np
import pandas as pd
import pymc as pm
from pymc.util import RandomState
from pymc_extras.prior import Prior
from xarray import DataArray, Dataset
from pymc_marketing.clv.models import CLVModel
from pymc_marketing.model_config import ModelConfig
[docs]
class ShiftedBetaGeoModelIndividual(CLVModel):
    """Shifted Beta Geometric model.

    Model for customer behavior in a discrete contractual setting. It assumes that:

    * At the end of each period, a customer has a probability `theta` of cancelling
      the contract and `1-theta` of renewing it
    * The probability `theta` does not change over time for a given customer
    * The probability `theta` varies across customers according to a Beta prior
      distribution with hyperparameters `alpha` and `beta`.

    Based on [1]_.

    Parameters
    ----------
    data: pd.DataFrame
        DataFrame containing the following columns:

        * `customer_id`: Customer labels. There should be one unique label for each customer
        * `t_churn`: Time at which the customer cancelled the contract (starting at 0).
          It should equal T for users that have not cancelled by the end of the
          observation period
        * `T`: Maximum observed time period (starting at 0)
    model_config: dict, optional
        Dictionary of model prior parameters. If not provided, the model will use
        default priors specified in the `default_model_config` class attribute.
    sampler_config: dict, optional
        Dictionary of sampler parameters. Defaults to None.

    Examples
    --------
    .. code-block:: python

        import pandas as pd
        from pymc_extras.prior import Prior

        from pymc_marketing.clv import ShiftedBetaGeoModelIndividual

        customer_id = [0, 1, 2, 3, 4]

        model = ShiftedBetaGeoModelIndividual(
            data=pd.DataFrame(
                {
                    "customer_id": customer_id,
                    "t_churn": [1, 2, 8, 4, 8],
                    "T": [8 for _ in customer_id],
                }
            ),
            model_config={
                "alpha": Prior("HalfNormal", sigma=10),
                "beta": Prior("HalfStudentT", nu=4, sigma=10),
            },
            sampler_config={
                "draws": 1000,
                "tune": 1000,
                "chains": 2,
                "cores": 2,
                "nuts_kwargs": {"target_accept": 0.95},
            },
        )

        model.fit()
        print(model.fit_summary())

        # Predict how many periods in the future are existing customers
        # likely to cancel (ignoring that some may already have cancelled)
        expected_churn_time = model.distribution_customer_churn_time(
            customer_id=customer_id,
        )
        print(expected_churn_time.mean("customer_id"))

        # Predict churn time for 10 new customers, conditioned on data
        new_customers_churn_time = model.distribution_new_customer_churn_time(n=10)
        print(new_customers_churn_time.mean("new_customer_id"))

    References
    ----------
    .. [1] Fader, P. S., & Hardie, B. G. (2007). How to project customer retention.
           Journal of Interactive Marketing, 21(1), 76-90.
           https://journals.sagepub.com/doi/pdf/10.1002/dir.20074

    """

    _model_type = "Shifted-Beta-Geometric Model (Individual Customers)"

    def __init__(
        self,
        data: pd.DataFrame,
        model_config: ModelConfig | None = None,
        sampler_config: dict | None = None,
    ):
        """Validate the input data and initialise the model.

        Parameters
        ----------
        data: pd.DataFrame
            Must contain the columns `customer_id` (unique), `t_churn` and `T`.
        model_config: ModelConfig, optional
            Prior configuration forwarded to the parent class.
        sampler_config: dict, optional
            Sampler configuration forwarded to the parent class.

        Raises
        ------
        ValueError
            If any `t_churn` is negative, greater than `T`, or NaN.

        """
        self._validate_cols(
            data,
            required_cols=["customer_id", "t_churn", "T"],
            must_be_unique=["customer_id"],
        )

        # NOTE(review): this check admits t_churn == 0 while the message states
        # a strict lower bound; confirm which of the two is intended.
        if np.any(
            (data["t_churn"] < 0)
            | (data["t_churn"] > data["T"])
            | np.isnan(data["t_churn"])
        ):
            # Adjacent literals are concatenated into ONE message. Separating
            # them with a comma would hand ValueError two arguments and make
            # str(exc) render as a tuple.
            raise ValueError(
                "t_churn must respect 0 < t_churn <= T.\n"
                "Customers that are still alive should have t_churn = T"
            )

        super().__init__(
            data=data, model_config=model_config, sampler_config=sampler_config
        )

    @property
    def default_model_config(self) -> dict:
        """Default model configuration: `HalfFlat` priors on `alpha` and `beta`."""
        return {
            "alpha": Prior("HalfFlat"),
            "beta": Prior("HalfFlat"),
        }

    def build_model(self) -> None:  # type: ignore[override]
        """Build the model and store it in `self.model`.

        Each customer gets its own `theta`, drawn from a Beta distribution with
        hyperparameters `alpha` and `beta`. The observed `t_churn` is modelled
        as a Geometric variable censored from above at `T`.
        """
        coords = {"customer_id": self.data["customer_id"]}
        with pm.Model(coords=coords) as self.model:
            alpha = self.model_config["alpha"].create_variable("alpha")
            beta = self.model_config["beta"].create_variable("beta")

            theta = pm.Beta("theta", alpha, beta, dims=("customer_id",))

            churn_raw = pm.Geometric.dist(theta)
            # No lower censoring; values are censored above at each customer's T.
            pm.Censored(
                "churn_censored",
                churn_raw,
                lower=None,
                upper=self.data["T"],
                observed=self.data["t_churn"],
                dims=("customer_id",),
            )

    def distribution_customer_churn_time(
        self, customer_id: np.ndarray | pd.Series, random_seed: RandomState = None
    ) -> DataArray:
        """Sample distribution of churn time for existing customers.

        The draws represent the number of periods into the future after which
        a customer cancels their contract.

        It ignores that some customers may have already cancelled.

        Parameters
        ----------
        customer_id: np.ndarray or pd.Series
            Labels used as coordinates of the `customer_id` dimension.
        random_seed: RandomState, optional
            Seed forwarded to `pm.sample_posterior_predictive`.

        Returns
        -------
        DataArray
            The `churn` variable of the posterior predictive group.

        """
        coords = {"customer_id": customer_id}
        with pm.Model(coords=coords):
            # HalfFlat cannot be forward-sampled; presumably alpha and beta are
            # placeholders whose values come from `self.idata` -- TODO confirm.
            alpha = pm.HalfFlat("alpha")
            beta = pm.HalfFlat("beta")

            theta = pm.Beta("theta", alpha, beta, dims=("customer_id",))
            pm.Geometric("churn", theta, dims=("customer_id",))

            return pm.sample_posterior_predictive(
                self.idata,
                var_names=["churn"],
                random_seed=random_seed,
            ).posterior_predictive["churn"]

    def _distribution_new_customer(
        self,
        n: int = 1,
        random_seed: RandomState = None,
        var_names: Sequence[str] = ("theta", "churn"),
    ) -> Dataset:
        """Sample `var_names` for `n` new customers.

        Parameters
        ----------
        n: int
            Number of new customers; sets the length of the `new_customer_id`
            dimension.
        random_seed: RandomState, optional
            Seed forwarded to `pm.sample_posterior_predictive`.
        var_names: Sequence[str]
            Variables to sample, among `theta` and `churn`.

        Returns
        -------
        Dataset
            The posterior predictive group containing the requested variables.

        """
        coords = {"new_customer_id": np.arange(n)}
        with pm.Model(coords=coords):
            # HalfFlat cannot be forward-sampled; presumably alpha and beta are
            # placeholders whose values come from `self.idata` -- TODO confirm.
            alpha = pm.HalfFlat("alpha")
            beta = pm.HalfFlat("beta")

            theta = pm.Beta("theta", alpha, beta, dims=("new_customer_id",))
            pm.Geometric("churn", theta, dims=("new_customer_id",))

            return pm.sample_posterior_predictive(
                self.idata,
                var_names=var_names,
                random_seed=random_seed,
            ).posterior_predictive

    def distribution_new_customer_churn_time(
        self, n: int = 1, random_seed: RandomState = None
    ) -> DataArray:
        """Sample distribution of churn time for new customers.

        The draws represent the number of periods into the future after which
        a customer cancels their contract.

        Use `n > 1` to simulate multiple identically distributed users.
        """
        return self._distribution_new_customer(
            n=n, random_seed=random_seed, var_names=["churn"]
        )["churn"]

    def distribution_new_customer_theta(
        self, n: int = 1, random_seed: RandomState = None
    ) -> DataArray:
        """Sample distribution of theta parameter for new customers.

        Use `n > 1` to simulate multiple identically distributed users.
        """
        return self._distribution_new_customer(
            n=n, random_seed=random_seed, var_names=["theta"]
        )["theta"]