Source code for anesthetic.tension

"""Tension statistics between two or more datasets."""
import numpy as np
from scipy.stats import chi2
from scipy.special import erfcinv
from anesthetic.samples import Samples


[docs] def tension_stats(joint, *separate): r"""Compute tension statistics between two or more samples. With the Bayesian (log-)evidence ``logZ``, Kullback--Leibler divergence ``D_KL``, posterior average of the log-likelihood ``logL_P``, Gaussian model dimensionality ``d_G``, we can compute tension statistics between two or more samples (example here for simplicity just with two datasets A and B): - ``logR``: R statistic for dataset consistency. .. math:: \ln R = \ln Z_{AB} - \ln Z_{A} - \ln Z_{B} - ``I``: Mutual information estimate between data and params: :math:`I(\Theta,A,B)`. .. math:: \hat{I} = D_{KL}^{A} + D_{KL}^{B} - D_{KL}^{AB} - ``logS``: Suspiciousness. .. math:: \ln S = \ln L_{AB} - \ln L_{A} - \ln L_{B} - ``d_G``: Gaussian model dimensionality of shared constrained parameters. .. math:: d = d_{A} + d_{B} - d_{AB} - ``p``: p-value for the tension between two samples based on `logS`. .. math:: p = \int_{d-2\ln{S}}^{\infty} \chi^2_d(x) dx - ``sigma``: Tension quantification in terms of numbers of sigma calculated from `p`. .. math:: \sqrt{2} {\rm erfc}^{-1}(p) Parameters ---------- joint : :class:`anesthetic.samples.Samples` Bayesian stats from a nested sampling run using all the datasets from the list in ``separate`` jointly. This should be a ``stats`` object with columns ['logZ', 'D_KL', 'logL_P', 'd_G'] as returned by :meth:`anesthetic.samples.NestedSamples.stats`. *separate A variable number of Bayesian stats from independent nested sampling runs using various datasets (A, B, ...) separately. Each should be a ``stats`` object with the columns ['logZ', 'D_KL', 'logL_P', 'd_G'] as returned by :meth:`anesthetic.samples.NestedSamples.stats`. Returns ------- samples : :class:`anesthetic.samples.Samples` DataFrame containing the following tension statistics in columns: ['logR', 'I', 'logS', 'd_G', 'p', 'sigma'] """ columns = ["logL_P", "d_G"] if "logZ" in joint.drop_labels().columns: columns += ["logZ"] if "D_KL" in joint.drop_labels().columns: columns += ["D_KL"] if not set(columns).issubset(joint.drop_labels().columns): raise ValueError( "The DataFrame passed to `joint` needs to contain" "the columns 'logZ', 'D_KL', 'logL_P', and 'd_G'." ) for s in separate: if not set(columns).issubset(s.drop_labels().columns): raise ValueError( "The DataFrames passed to `separate` need to " "contain the columns 'logZ', 'D_KL', 'logL_P', " "and 'd_G'." ) separate_stats = separate[0][columns].copy() for s in separate[1:]: separate_stats += s joint_stats = joint[columns].copy() samples = Samples(index=joint_stats.index) if "logZ" in joint_stats.drop_labels().columns: samples["logR"] = joint_stats["logZ"] - separate_stats["logZ"] samples.set_label("logR", r"$\ln\mathcal{R}$") if "D_KL" in joint_stats.drop_labels().columns: samples["I"] = separate_stats["D_KL"] - joint_stats["D_KL"] samples.set_label("I", r"$\mathcal{I}$") samples["logS"] = joint_stats["logL_P"] - separate_stats["logL_P"] samples.set_label("logS", r"$\ln\mathcal{S}$") samples["d_G"] = separate_stats["d_G"] - joint_stats["d_G"] samples.set_label("d_G", r"$d_\mathrm{G}$") p = chi2.sf(samples["d_G"] - 2 * samples["logS"], df=samples["d_G"]) samples["p"] = p samples.set_label("p", "$p$") samples["sigma"] = erfcinv(p) * np.sqrt(2) samples.set_label("sigma", r"$\sigma$") return samples