Source code for anesthetic.tension

"""Tension statistics between two or more datasets."""
import numpy as np
from scipy.stats import chi2
from scipy.special import erfcinv
from anesthetic.samples import Samples



[docs]
def tension_stats(joint, *separate):
    r"""Compute tension statistics between two or more samples.

    With the Bayesian (log-)evidence ``logZ``, Kullback--Leibler divergence
    ``D_KL``, posterior average of the log-likelihood ``logL_P``, Gaussian
    model dimensionality ``d_G``, we can compute tension statistics between
    two or more samples (example here for simplicity just with two datasets
    A and B):

    - ``logR``: R statistic for dataset consistency.

      .. math::
        \ln R = \ln Z_{AB} - \ln Z_{A} - \ln Z_{B}

    - ``I``: Mutual information estimate between data and params:
      :math:`I(\Theta,A,B)`.

      .. math::
        \hat{I} = D_{KL}^{A} + D_{KL}^{B} - D_{KL}^{AB}

    - ``logS``: Suspiciousness.

      .. math::
        \ln S = \ln L_{AB} - \ln L_{A} - \ln L_{B}

    - ``d_G``: Gaussian model dimensionality of shared constrained parameters.

      .. math::
        d = d_{A} + d_{B} - d_{AB}

    - ``p``: p-value for the tension between two samples based on `logS`.

      .. math::
        p = \int_{d-2\ln{S}}^{\infty} \chi^2_d(x) dx

    - ``sigma``: Tension quantification in terms of numbers of sigma
      calculated from `p`.

      .. math::
        \sqrt{2} {\rm erfc}^{-1}(p)

    Parameters
    ----------
    joint : :class:`anesthetic.samples.Samples`
        Bayesian stats from a nested sampling run using all the datasets from
        the list in ``separate`` jointly. This should be a ``stats`` object
        with columns ['logZ', 'D_KL', 'logL_P', 'd_G'] as returned by
        :meth:`anesthetic.samples.NestedSamples.stats`.

    *separate
        A variable number of Bayesian stats from independent nested sampling
        runs using various datasets (A, B, ...) separately. Each should be a
        ``stats`` object with the columns ['logZ', 'D_KL', 'logL_P', 'd_G']
        as returned by :meth:`anesthetic.samples.NestedSamples.stats`.

    Returns
    -------
    samples : :class:`anesthetic.samples.Samples`
        DataFrame containing the following tension statistics in columns:
        ['logR', 'I', 'logS', 'd_G', 'p', 'sigma']
    """
    columns = ["logL_P", "d_G"]
    if "logZ" in joint.drop_labels().columns:
        columns += ["logZ"]
    if "D_KL" in joint.drop_labels().columns:
        columns += ["D_KL"]
    if not set(columns).issubset(joint.drop_labels().columns):
        raise ValueError(
            "The DataFrame passed to `joint` needs to contain"
            "the columns 'logZ', 'D_KL', 'logL_P', and 'd_G'."
        )
    for s in separate:
        if not set(columns).issubset(s.drop_labels().columns):
            raise ValueError(
                "The DataFrames passed to `separate` need to "
                "contain the columns 'logZ', 'D_KL', 'logL_P', "
                "and 'd_G'."
            )
    separate_stats = separate[0][columns].copy()
    for s in separate[1:]:
        separate_stats += s
    joint_stats = joint[columns].copy()

    samples = Samples(index=joint_stats.index)

    if "logZ" in joint_stats.drop_labels().columns:
        samples["logR"] = joint_stats["logZ"] - separate_stats["logZ"]
        samples.set_label("logR", r"$\ln\mathcal{R}$")

    if "D_KL" in joint_stats.drop_labels().columns:
        samples["I"] = separate_stats["D_KL"] - joint_stats["D_KL"]
        samples.set_label("I", r"$\mathcal{I}$")

    samples["logS"] = joint_stats["logL_P"] - separate_stats["logL_P"]
    samples.set_label("logS", r"$\ln\mathcal{S}$")

    samples["d_G"] = separate_stats["d_G"] - joint_stats["d_G"]
    samples.set_label("d_G", r"$d_\mathrm{G}$")

    p = chi2.sf(samples["d_G"] - 2 * samples["logS"], df=samples["d_G"])
    samples["p"] = p
    samples.set_label("p", "$p$")

    samples["sigma"] = erfcinv(p) * np.sqrt(2)
    samples.set_label("sigma", r"$\sigma$")

    return samples