# Statistical tests and tools
#
# pytest icefit/statstools.py -rP
#
# m.mieskolainen@imperial.ac.uk, 2023
import numpy as np
import numba
import copy
import scipy.stats as stats
from scipy import special
def covmat2corrmat(C, EPS=1e-12):
"""
Covariance matrix to correlation matrix
Args:
C: Covariance matrix
Returns:
Correlation matrix
"""
v = np.sqrt(np.diag(C))
outer_v = np.outer(v, v)
corr = C / (outer_v + EPS)
corr[C == 0] = np.nan # Not defined
return corr
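
# Usage sketch (illustrative toy values, not part of the original module):
#
#   C = np.array([[4.0, 2.0], [2.0, 9.0]])   # standard deviations 2 and 3
#   covmat2corrmat(C)                        # off-diagonals ~ 2/(2*3) = 1/3
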
def correlation_matrix(X, weights=None, return_cov=False):
"""
Compute a (sample weighted) correlation or covariance matrix
Args:
X: Data matrix (rows as sample points, columns as variables)
weights: Weights per sample
return_cov: Return covariance matrix (not correlation)
Returns:
correlation matrix
"""
if weights is None:
C = np.cov(X, rowvar=False)
if return_cov: # Return covariance
return C
# Turn into a correlation matrix
C = covmat2corrmat(C=C)
else:
# Zero-mean each column (variable), repeat weights
        Q = X.astype(float) # copy; the float cast avoids integer truncation when centering
W = np.zeros(X.shape)
for j in range(Q.shape[1]):
Q[:,j] = Q[:,j] - np.average(X[:,j], weights=weights)
W[:,j] = weights
# Compute a weighted covariance matrix
QW = Q*W
C = QW.T.dot(QW) / W.T.dot(W)
if return_cov: # Return covariance
return C
# Turn into a correlation matrix
C = covmat2corrmat(C=C)
return C
def error_on_mu(sigma, n):
""" Standard error estimate on a sample mean.
https://web.eecs.umich.edu/~fessler/papers/files/tr/stderr.pdf
Args:
sigma : sample standard deviation
n : sample size
"""
return sigma / np.sqrt(n)
def error_on_std(sigma, n):
"""
Standard error estimate on a sample standard deviation.
https://web.eecs.umich.edu/~fessler/papers/files/tr/stderr.pdf
Args:
sigma : sample standard deviation
n : sample size
"""
Kn = np.sqrt((n-1)/2) * np.exp(special.loggamma((n-1) / 2) - special.loggamma(n/2))
Vn = 2 * ((n-1)/2 - (special.gamma(n/2) / special.gamma((n-1)/2))**2)
return sigma * Kn * np.sqrt(Vn / (n-1))
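
# Sanity check sketch (assumed values): for large n the exact expression
# approaches the familiar large-sample approximation sigma / sqrt(2*(n-1)),
#
#   error_on_std(sigma=1.0, n=100)   # ~ 0.071
#   1.0 / np.sqrt(2*(100 - 1))       # ~ 0.071
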
def geom_mean_2D(x, y, x_err, y_err, flip_vertical=False):
"""
    Element-wise geometric 2D mean of x and y (understood e.g. as 1D histograms),
    and its error-propagated uncertainty (x and y treated as element-wise independent)
Math note: outer(u,v)_ij = u_i v_j, with outer(u,v) = uv^T
Args:
x, y : values (numpy arrays)
x_err, y_err : uncertainties (numpy arrays)
flip_vertical: flip the values over the vertical (y) dimension
Returns:
z, z_err : 2D-matrix (with rank 1) and its uncertainty
"""
z = np.sqrt(np.outer(y, x))
z_err = 0.5 * np.sqrt(np.outer(y, x_err**2/x) + np.outer(y_err**2/y, x))
if flip_vertical:
z = np.flip(z,0)
z_err = np.flip(z_err,0)
return z, z_err
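
# Usage sketch (toy values): the outer product structure yields a rank-1 matrix,
#
#   z, z_err = geom_mean_2D(x=np.array([1.0, 4.0]), y=np.array([9.0]),
#                           x_err=np.array([0.1, 0.1]), y_err=np.array([0.3]))
#   # z = sqrt(outer(y, x)) = [[3.0, 6.0]]
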
def tpratio_taylor(x,y, x_err,y_err, xy_err=0.0):
"""
Error propagation on a ratio
r = x/(x+y)
If x and y are e.g. counts from stat. independent histograms,
then xy_err = 0 (~ tag & probe style ratios).
Args:
x,y : values
x_err,y_err : uncertainties
xy_err : covariance (err_x * err_y * rho_xy)
Returns:
Error propagated ratio error
"""
dx = y/(x + y)**2
dy = -x/(x + y)**2
r2 = dx**2*x_err**2 + dy**2*y_err**2 + 2*dx*dy*xy_err
return np.sqrt(r2)
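
# Usage sketch (toy tag & probe counts, statistically independent x and y):
#
#   # 5 passing, 15 failing -> r = 5/(5+15) = 0.25
#   tpratio_taylor(x=5, y=15, x_err=np.sqrt(5), y_err=np.sqrt(15))   # ~ 0.097
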
def prod_eprop(A, B, sigmaA, sigmaB, sigmaAB=0):
"""
Error propagation (Taylor expansion) of a product A*B
Args:
A : Value A
B : Value B
sigmaA : 1 sigma uncertainty on A
sigmaB : 1 sigma uncertainty on B
sigmaAB : Covariance of A,B
Returns:
Uncertainty
"""
f = A * B
return np.sqrt(f**2*((sigmaA/A)**2 + (sigmaB/B)**2 + 2*sigmaAB/(A*B)))
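
# Usage sketch (toy values, zero covariance): relative errors add in quadrature,
#
#   prod_eprop(A=2.0, B=3.0, sigmaA=0.1, sigmaB=0.2)
#   # = 6 * sqrt((0.1/2)**2 + (0.2/3)**2) = 0.5
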
def ratio_eprop(A, B, sigmaA, sigmaB, sigmaAB=0):
"""
Error propagation (Taylor expansion) of a ratio A/B
Args:
A : Value A
B : Value B
sigmaA : 1 sigma uncertainty on A
sigmaB : 1 sigma uncertainty on B
sigmaAB : Covariance of A,B
Returns:
Uncertainty
"""
f = A / B
return np.sqrt(f**2*((sigmaA/A)**2 + (sigmaB/B)**2 - 2*sigmaAB/(A*B)))
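
# Usage sketch (toy values): the same quadrature structure as prod_eprop, but
# the covariance term enters with a minus sign,
#
#   ratio_eprop(A=2.0, B=3.0, sigmaA=0.1, sigmaB=0.2)   # ~ 0.056
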
def weighted_binomial_err(b1: float, b2: float, e1: float, e2: float):
"""
Binomial counting efficiency error with weights
For the reference, see:
https://root.cern/doc/master/TH1_8cxx_source.html#l02962
If e1 ~ sqrt(b1) and e2 ~ sqrt(b2), one should recover results of this
function by using the proper covariance with:
prodratio_eprop(A=b1, B=b2, sigma_A=e1, sigma_B=e2, sigmaAB=b1, mode='ratio)
N.B. If b1 == b2, will return 0
Args:
b1: binomial numerator (e.g. count or sum of weights)
b2: binomial denominator
e1: error on the numerator (e.g. \sqrt(\sum_i(w_i^2))
e2: error on the denominator
"""
if b1 == b2:
return 0.0
r = b1 / b2
err = np.sqrt(np.abs( ((1.0 - 2.0*r))*e1**2 + (r*e2)**2) / (b2**2))
return err
def columnar_mask_efficiency(num_mask: np.ndarray, den_mask: np.ndarray,
y_true: np.ndarray, y_target: int, weights: np.ndarray):
"""
Masked columnar selection efficiency with event weights and classes
Args:
num_mask: numerator mask vector (Nx1, bool)
den_mask: denominator mask (Nx1, bool)
y_true: label vector (Nx1, int)
y_target: target label (int)
weights: event weights (Nx1, float)
Returns:
binomial efficiency, weighted binomial stat. error
"""
w1 = weights[y_true == y_target]*num_mask[y_true == y_target]
w2 = weights[y_true == y_target]*den_mask[y_true == y_target]
b1 = np.sum(w1)
b2 = np.sum(w2)
e1 = np.sqrt(np.sum(w1**2))
e2 = np.sqrt(np.sum(w2**2))
eff = b1 / b2
eff_err = weighted_binomial_err(b1=b1, b2=b2, e1=e1, e2=e2)
return eff, eff_err
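
# Usage sketch (hypothetical toy masks, labels and unit weights):
#
#   y   = np.array([1, 1, 1, 0])
#   num = np.array([True, False, True, False])
#   den = np.array([True, True, True, True])
#   columnar_mask_efficiency(num_mask=num, den_mask=den, y_true=y,
#                            y_target=1, weights=np.ones(4))   # eff = 2/3
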
def ADS(s, b, sigma_b, EPS=1e-6):
"""
Asimov discovery significance with background uncertainty
https://indico.cern.ch/event/708041/papers/3272129/files/9437-acat19DirectOpti.pdf
when sigma_b --> 0, this gives np.sqrt(2*((s+b)*np.log(1.0 + s/b) - s))
Args:
s: expected signal count
b: expected background count
sigma_b: error on the background estimate
EPS: regularization constant
"""
if s + b == 0:
return 0
if b == 0:
b = EPS
if sigma_b == 0:
sigma_b = EPS
T1 = ((s+b)*(b+sigma_b**2)) / (b**2+(s+b)*sigma_b**2)
T2 = 1 + (sigma_b**2*s) / (b*(b+sigma_b**2))
return np.sqrt(2*((s+b)*np.log(T1) - b**2/(sigma_b**2) * np.log(T2)))
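
# Usage sketch (toy counts): with a small background uncertainty the expression
# approaches the standard Asimov significance (very small sigma_b values can
# hit floating point precision limits in the log terms),
#
#   s, b = 10.0, 100.0
#   ADS(s=s, b=b, sigma_b=1e-3)                   # ~ 0.98
#   np.sqrt(2*((s + b)*np.log(1.0 + s/b) - s))    # ~ 0.98 (sigma_b -> 0 limit)
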
def welch_ttest(X1, X2, s1, s2, n1, n2):
"""
Welch's two sample t-test for normally distributed variables.
https://en.wikipedia.org/wiki/Welch%27s_t-test
Args:
X1 : Mean of sample 1
X2 : Mean of sample 2
s1 : Standard deviation of sample 1
s2 : Standard deviation of sample 2
n1 : Counts in sample 1
n2 : Counts in sample 2
Returns:
t : test statistic
p : p-value from t-distribution
"""
# Satterthwaite Approximation
sdelta = np.sqrt(s1**2/n1 + s2**2/n2)
# Test statistic
t = (X1 - X2) / sdelta
# Degrees of freedom
nu1 = n1-1
nu2 = n2-1
df = np.ceil((s1**2/n1 + s2**2/n2)**2 / (s1**4/(n1**2*nu1) + s2**4/(n2**2*nu2)))
# p-value, two-tailed test gives factor 2
p = (1 - stats.t.cdf(np.abs(t), df=df)) * 2
return t,p
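
# Cross-check sketch against scipy (an assumed external check; note that this
# implementation ceils the Welch-Satterthwaite degrees of freedom, so p-values
# agree only approximately),
#
#   t, p = welch_ttest(X1=5.0, X2=4.0, s1=1.0, s2=1.2, n1=30, n2=30)  # t ~ 3.5
#   stats.ttest_ind_from_stats(5.0, 1.0, 30, 4.0, 1.2, 30, equal_var=False)
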
def p2zscore(p):
"""
p-value to z-score (standard normal sigmas)
Example:
p-value is divided by two such that
p = 0.05 gives z_{1-\\alpha/2} = z_{1-0.025} = 1.96
"""
z = np.abs( stats.norm.ppf(p / 2) )
if np.isinf(z): # overflow
z = 99
return z
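
# Usage sketch: standard two-sided reference points,
#
#   p2zscore(0.05)     # ~ 1.96
#   p2zscore(0.0027)   # ~ 3.0
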
def clopper_pearson_err(k, n, CL=[0.025, 0.975]):
""" Clopper-Pearson binomial proportion confidence interval.
Below, beta.ppf (percent point functions) returns inverse CDF for quantiles.
Args:
k : observed success counts
n : number of trials
CL : confidence levels [lower, upper]
Returns:
confidence interval [lower, upper]
"""
# Special case must be treated separately
if k == 0:
lower = 0
upper = 1 - (1-CL[1])**(1/n)
# Special case must be treated separately
elif k == n:
lower = CL[0]**(1/n)
upper = 1
# Normal case
else:
lower = stats.beta.ppf(q=CL[0], a=k, b=n-k+1)
upper = stats.beta.ppf(q=CL[1], a=k+1, b=n-k)
return np.array([lower, upper])
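
# Usage sketch (toy counts): 95% CL interval for 8 successes in 10 trials,
#
#   clopper_pearson_err(k=8, n=10)   # ~ [0.444, 0.975]
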
def poisson_ratio(k1,k2, CL=np.array([0.025, 0.975])):
"""
Poisson ratio uncertainty via conditional ratio
theta = lambda1 / (lambda1 + lambda2)
See e.g. "Confidence limits on the ratio of two Poisson variables, 1974"
E[k1] = lambda1 (Poisson mean)
E[k2] = lambda2 (Poisson mean)
Args:
k1 : observations k1
k2 : observations k2
CL : 2-array of lower and upper quantiles
Returns:
ratio confidence interval endpoints
"""
theta = clopper_pearson_err(k=k1, n=k1+k2, CL=CL)
    if np.all(1 - theta > 0):
        R = theta / (1 - theta)
    else:
        R = np.array([0,0])
return R
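
# Usage sketch (toy counts): equal counts give a ratio interval around 1,
#
#   poisson_ratio(k1=50, k2=50)   # roughly [0.66, 1.51]
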
def poisson_tail(k1, k2):
"""
    Single Poisson CDF tail integral. Counts k2 act as the reference sample.
Args:
k1 : poisson counts 1
k2 : poisson counts 2 (reference sample)
Returns:
p value
"""
P = stats.poisson.cdf(k1, mu=k2)
if k1 > k2:
return 1 - P
else:
return P
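
# Usage sketch (toy counts):
#
#   poisson_tail(k1=3, k2=10)   # P(K <= 3 | mu = 10) ~ 0.010
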
def mc_extreme_npdf(x, N, mu=0, sigma=1, trials=int(1e6)):
"""
Extreme value distribution for normal pdf via Monte Carlo
Args:
        x      : value at which to evaluate the extreme value pdf upper tail integral
        N      : sample size
        mu     : mean of the normal pdf
        sigma  : std of the normal pdf
        trials : number of Monte Carlo samples (as many as the CPU budget allows)
Returns:
p_value : probability for the value x or more extreme
maxvals : full MC distribution of extreme values obtained
"""
sample = np.random.normal(mu, sigma, size=(trials, N))
maxvals = np.amax(sample, 1) # max per each row (trial)
p_value = np.sum(maxvals >= x) / trials
return p_value, maxvals
def mc_extreme_multivariate_npdf(x, mu, cov, trials=int(1e6)):
"""
    Extreme value distribution for a multivariate normal pdf via Monte Carlo,
    e.g. for "correlated measurements or bins" -- cast as a univariate problem
    such that the maximum over the components is taken (not a "vector maximum")
Args:
x : scalar value to evaluate the extreme value pdf upper tail integral
mu : mean vector (N)
cov : covariance matrix (NxN)
trials : number of Monte Carlo samples, as large as possible CPU wise
Returns:
p_value : probability for the value x or more extreme
maxvals : full MC distribution of extreme values obtained
"""
sample = np.random.multivariate_normal(mean=mu, cov=cov, size=trials)
maxvals = np.amax(sample, 1) # max per each row (trial)
p_value = np.sum(maxvals >= x) / trials
return p_value, maxvals
def analytical_extreme_npdf(N, mu=0, sigma=1):
"""
Analytical expectation (mean) value approximation (based on an expansion)
for a normal pdf max (extreme) values with a sample size of N.
References:
https://en.wikipedia.org/wiki/Extreme_value_theory
https://en.wikipedia.org/wiki/Gumbel_distribution
"""
# Ref: https://en.wikipedia.org/wiki/Generalized_extreme_value_distribution
# y = 0.5772156649 # Euler–Mascheroni constant
# return np.sqrt(np.log(N**2/(2*np.pi*np.log(N**2/(2*np.pi)) ) ) ) *(1+y/np.log(N))
# Ref: Youtube, <From one extreme to another: the statistics of extreme events, Jon Keating, Oxford>
return mu + sigma*np.sqrt(2*np.log(N)) - 0.5*sigma*(np.log(np.log(N))) / (np.sqrt(2*np.log(N)))
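
# Usage sketch: for N = 100 the expansion gives
#
#   analytical_extreme_npdf(N=100)   # ~ 2.78
#
# which is leading-order only; the exact expectation of the maximum of 100
# standard normals is closer to ~2.5 (cf. the Monte Carlo in test_extreme_npdf).
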
def cormat2covmat(R,s):
""" Convert a correlation matrix to a covariance matrix.
Args:
R : correlation matrix (NxN)
s : vector of standard deviations (N)
Returns:
covariance matrix
"""
if np.any(s <= 0):
raise Exception('cormat2covmat: s vector elements must be positive')
if np.any(np.abs(R) > 1):
raise Exception('cormat2covmat: R matrix elements must be between [-1,1]')
return np.diag(s) @ R @ np.diag(s)
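
# Usage sketch (toy values), the inverse operation of covmat2corrmat:
#
#   R = np.array([[1.0, 0.5], [0.5, 1.0]])
#   s = np.array([2.0, 3.0])
#   cormat2covmat(R=R, s=s)   # -> [[4, 3], [3, 9]]
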
def test_extreme_npdf():
mu = 0 # standard normal
sigma = 1
x = 3.0 # we test for "x sigma deviation"
N = 100 # data sample size
# -------
print('Independent standard normal PDF')
p_value, maxvals = mc_extreme_npdf(x=x, N=N, mu=mu, sigma=sigma)
print(f' Extreme value p-value for x >= {x} is {p_value:0.2E} [with N = {N}]')
print(f' Monte Carlo expectation: <extreme> = {np.mean(maxvals):0.3f}')
print(f' Analytical expectation: <extreme> = {analytical_extreme_npdf(N=N, mu=mu, sigma=sigma):0.3f}')
# -------
# Uncorrelated case, will return the same as independent Gaussians
print('\nIndependent standard N-dim normal PDF')
mu = np.zeros(N) # Mean vector
cov = np.eye(N) # Covariance matrix
p_value, maxvals = mc_extreme_multivariate_npdf(x=x, mu=mu, cov=cov)
print(f' Extreme value p-value for x >= {x} is {p_value:0.2E}')
print(f' Monte Carlo expectation: <extreme> = {np.mean(maxvals):0.3f}')
# -------
# Correlated bins
ccoef = 0.7
print(f'\nCorrelated N-dim normal PDF [ccoef = {ccoef:0.3f}]')
mu = np.zeros(N) # Mean vector
R = np.ones((N,N)) * ccoef # Correlation matrix
np.fill_diagonal(R, 1.0)
s = np.ones(N) # Vector of standard deviations
cov = cormat2covmat(R=R,s=s) # --> Covariance matrix
p_value, maxvals = mc_extreme_multivariate_npdf(x=x, mu=mu, cov=cov)
print(f' Extreme value p-value for x >= {x} is {p_value:0.2E}')
print(f' Monte Carlo expectation: <extreme> = {np.mean(maxvals):0.3f}')
def test_efficiency_ratio(EPS=1e-6):
import pytest
    # Ratio error, where b1 is the numerator (number of successes) in a
    # binomial ratio and b2 is the denominator (total number of trials).
    # We allow both the numerator and the denominator to fluctuate, i.e.
    # we take Poisson errors for both.
b1 = 5
b2 = 20
e1 = np.sqrt(b1)
e2 = np.sqrt(b2)
eff = b1 / b2
eff_err_A = weighted_binomial_err(b1=b1, b2=b2, e1=e1, e2=e2)
eff_err_B = ratio_eprop(A=b1, B=b2, sigmaA=e1, sigmaB=e2, sigmaAB=b1)
eff_err_C = tpratio_taylor(x=b1, y=b2-b1, x_err=e1, y_err=np.sqrt(b2 - b1))
print(f'eff = {eff}')
print(f'err_eff = {eff_err_A:0.6g} (A)')
print(f'err_eff = {eff_err_B:0.6g} (B)')
print(f'err_eff = {eff_err_C:0.6g} (C)')
assert eff_err_A == pytest.approx(eff_err_B, abs=EPS)
assert eff_err_B == pytest.approx(eff_err_C, abs=EPS)
assert eff_err_A == pytest.approx(eff_err_C, abs=EPS)
def test_ratios():
"""
Test function
"""
# Quantile values
q68 = np.array([0.16, 0.84])
q95 = np.array([0.025, 0.975])
# --------------------------------------------
# INPUT TEST DATA
# Observed Poisson counts
k_obs = np.array([17, 10, 100, 20, 100, 30, 400, 400])
# Predicted Poisson counts
k_pre = np.array([7, 15, 101, 25, 105, 10, 380, 200])
# Poisson uncertainties
s_obs = np.sqrt(k_obs)
s_pre = np.sqrt(k_pre)
# --------------------------------------------
for i in range(len(k_obs)):
        print(f'>> Case {i} pre/obs = {k_pre[i]} / {k_obs[i]} = {k_pre[i] / k_obs[i]:0.3f}')
# Conditional Poisson ratios
ppois68 = poisson_ratio(k1=k_pre[i], k2=k_obs[i], CL=q68)
ppois95 = poisson_ratio(k1=k_pre[i], k2=k_obs[i], CL=q95)
print(f'Poisson ratio uncertainty CI68: {ppois68}')
print(f'Poisson ratio uncertainty CI95: {ppois95}')
# Systematic residual from target ratio = 1 at CI95 level
covers = lambda n,a,b : (n - a)*(n - b) <= 0
res = 0 if covers(1, ppois95[0], ppois95[1]) else np.min(np.abs(ppois95 - 1))
print(f'Systematic residual: {res:0.2f}')
# Single sample Poisson tail integral
ppois = poisson_tail(k1=k_pre[i], k2=k_obs[i])
print(f'Poisson tail p = {ppois:0.3E} ({p2zscore(ppois):0.2f} sigmas)')
# Error propagation
errAB = ratio_eprop(A=k_pre[i], sigmaA=s_pre[i], B=k_obs[i], sigmaB=s_obs[i])
PRE_over_OBS = k_pre[i] / k_obs[i]
print(f' error propagated rel.uncertainty on the ratio: err / (pre/obs) = {errAB / PRE_over_OBS:0.3f}')
print('')