Source code for icedqcd.optimize

# Efficiency flows ++ (Work-in-progress)
# 
# m.mieskolainen@imperial.ac.uk, 2024

import numpy as np
import pickle
import os
from datetime import datetime
from scanf import scanf
import copy

import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
from scipy import interpolate
from scipy.stats import ncx2,norm

from icenet.tools import aux
from icefit import cortools, statstools
from icedqcd import limits

# ------------------------------------------
from icenet import print
# --------------------------------------------

latex_header = \
"""
\\documentclass{article}
% General document formatting
\\usepackage[margin=0.7in]{geometry}
\\usepackage[parfill]{parskip}
\\usepackage[utf8]{inputenc}

% Related to math
\\usepackage{amsmath,amssymb,amsfonts,amsthm}

\\begin{document}
"""


[docs]
def func_binormal2(x, a, b):
    """
    Formulas 4, (14):
    https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5570585/pdf/nihms-736507.pdf
    """
    F_inv = (1/a) * ncx2.ppf(1-x,  df=1, nc=b)
    return 1 - ncx2.cdf(F_inv, df=1, nc=a*b)




[docs]
def func_binormal(x, a, b):  
    """
    binormal-function
    https://dpc10ster.github.io/RJafrocBook/binormal-model.html
    
    b = sd0 / sd1
    a = (mu1 - mu0) / sd1
    """
    return norm.cdf(a + b*norm.ppf(x))



[docs]
def plot_ROC_fit(i, fpr, tpr, tpr_err, fpr_err, roc_obj, roc_label, names, args, savename):
    """
    Plot ROC fits
    """
    
    xval = np.logspace(-8, 0, int(1e4))
    
    # Fit the ROC-curve
    try:
        popt, pcov = curve_fit(f=func_binormal, xdata=fpr, ydata=tpr)#, sigma=tpr_err)
    except:
        print(f"Problem with curve_fit", 'red')
        print(f'fpr = {fpr}')
        print(f'tpr = {tpr}')
        return
    
    # -----------------
    # Create parameter estimate text label
    param     = np.round(np.array(popt),2)
    param_err = np.round(np.sqrt(np.diag(np.array(pcov))), 2)
    fit_label = f'fit: $\\Phi(a_0 + a_1\\Phi^{{-1}}(x)), ('
    for k in range(len(param)):
        fit_label += f'a_{{{k}}} = {param[k]} \\pm {param_err[k]}'
        if k < len(param)-1: fit_label += ','
    fit_label += ')$'
    
    # ------------------------------------
    fig,ax = plt.subplots(1,2, figsize=(8,4))
    alpha  = 0.32
    
    for k in [0,1]:

        plt.sca(ax[k])

        auc_CI  = cortools.prc_CI(x=roc_obj.auc_bootstrap, alpha=alpha)
        auc_err = np.abs(auc_CI - roc_obj.auc)
        plt.plot(fpr, tpr, drawstyle='steps-mid', color=f'C{i}', label=f'{roc_label}, AUC: ${roc_obj.auc:0.3f} \\pm_{{{auc_err[0]:0.3f}}}^{{{auc_err[1]:0.3f}}}$')

        tpr_lo = cortools.percentile_per_dim(x=roc_obj.tpr_bootstrap, q=100*(alpha/2))
        tpr_hi = cortools.percentile_per_dim(x=roc_obj.tpr_bootstrap, q=100*(1-alpha/2))
        plt.fill_between(fpr, tpr_lo, tpr_hi, step='mid', alpha=0.4, color=f'C{i}', edgecolor='none') # vertical

        plt.plot(xval, func_binormal(xval, *popt), linestyle='-', color=(0.35,0.35,0.35), label='binormal fit')
        plt.plot(xval, xval, color=(0.5,0.5,0.5), linestyle=':')

        path = aux.makedir(f'{args["plotdir"]}/optimize/ROC_fit/{roc_label}')
        
        if k == 0:
            plt.xscale('linear')
            plt.xlim([0, 1])
            ax[k].set_aspect('equal', 'box')
            plt.ylabel('True positive rate $1 - \\beta$ (signal efficiency)', fontsize=9)
            plt.title(f'{names[i]}', fontsize=5)
            plt.legend(loc='lower right', fontsize=7)

        if k == 1:
            plt.xscale('log')
            xmin = 1e-6
            plt.xlim([xmin, 1])
            ax[k].set_aspect(-np.log10(xmin))
            plt.title(fit_label, fontsize=6)
            plt.xlabel('False positive rate $\\alpha$ (background efficiency)', fontsize=9)

        plt.ylim([0,1])

    pdf_filename = f'{path}/{savename}.pdf'
    plt.savefig(pdf_filename, bbox_inches='tight')
    plt.close()




[docs]
def find_filter(rd, model_param, args):
    """
    Find filter strings
    """
    model_POI      = args['limits_param']['model_POI']
    box_POI        = args['limits_param']['box_POI'] 
    
    GEN_filter_tag = args['limits_param']['GEN_filter_tag']
    BOX_filter_tag = args['limits_param']['BOX_filter_tag']
    GBX_filter_tag = args['limits_param']['GBX_filter_tag']
    
    GEN_filter_key = None
    BOX_filter_key = None
    GBX_filter_key = None
    param_point    = {}
    
    print(f'Signal: {model_param}', 'yellow')

    # --------------
    
    for filter in rd['roc_mstats'].keys():
        matches = 0
        for param_name in model_POI:
            
            if f'{param_name} = {model_param[param_name]:g}' in filter and GEN_filter_tag in filter:
                matches += 1
                param_point[param_name] = model_param[param_name]
        
        if matches == len(model_POI): # all parameters match
            GEN_filter_key = filter
            print(f'Found GEN filter: {GEN_filter_key}' , 'green')
            break
    
    if GEN_filter_key is None:
        print(f'ERROR: No matching GEN filter for the signal: {model_param}, exit', 'red')
        exit()
    
    # --------------
    
    for filter in rd['roc_mstats'].keys():
        
        if BOX_filter_tag in filter:
            param_value = scanf(f"{BOX_filter_tag}: box({box_POI[0]} ~ %f)", filter)[0]
            
            if np.round(param_value, 3) == np.round(param_point[box_POI[1]], 3):
                BOX_filter_key = filter
                print(f'Found BOX filter: {BOX_filter_key}' , 'green')
                break
    
    if BOX_filter_key is None:
        print(f'ERROR: No matching BOX filter for the signal: {model_param}, exit', 'red')
        exit()
    
    # --------------
    
    for filter in rd['roc_mstats'].keys():
        matches = 0
        for param_name in model_POI :
            
            if f'{param_name} = {model_param[param_name]:g}' in filter and GBX_filter_tag in filter:
                matches += 1
        
        if matches == len(model_POI): # all parameters match
            GBX_filter_key = filter
            
            print(f'Found GBX filter: {GBX_filter_key}' , 'green')
            break
    
    if GBX_filter_key is None:
        print(f'ERROR: No matching GBX filter for the signal: {model_param}, exit', 'red')
        exit()
    
    # Sort alphabetically by parameter name
    param_point = dict(sorted(param_point.items()))
    
    return GEN_filter_key, BOX_filter_key, GBX_filter_key, param_point




[docs]
def optimize_selection(args):
    """
    Main program
    """
    
    def dprint(string, mode='a'):
        print(string)
        with open(latex_filename, mode) as f:
            f.write(string + '\n')
    
    latex_path     = aux.makedir(f'{args["plotdir"]}/optimize')
    latex_filename = f'{latex_path}/optimize.tex'
    
    # -----------------
    ## Main parameters

    # Integrated luminosity
    L_int          = args['limits_param']['luminosity'] # in pb^{-1}
    
    # Signal production cross section
    signal_xs      = args['limits_param']['signal_xs']  # in pb
    
    # Benchmark BR
    BR             = args['limits_param']['target_r']
    
    # Targets
    const_mode     = args['limits_param']['const_mode']
    const_cut      = args['limits_param']['const_cut']
    const_bgk_eff  = args['limits_param']['const_bgk_eff']
    
    # BOX emulation
    commutate_BOX  = args['limits_param']['commutate_BOX']
    
    # -----------------
    
    dprint('', 'w')
    dprint(latex_header)
    
    dprint('\\Large')
    dprint('Cut efficiencies and discovery significance')
    dprint('\\small')
    dprint('\\\\')
    dprint(f'github.com/mieskolainen/icenet')
    dprint('\\\\')
    dprint(f"{datetime.now():%d-%m-%Y %H:%M}")
    dprint('\\\\')
    dprint('\\vspace{0.5em}')
    dprint('\\\\')
    
    dprint(f'$L = {L_int} \\; \\text{{pb}}^{{-1}}$ \\\\')
    dprint(f'$\\sigma_S$ = {signal_xs} \\text{{pb}} \\\\')
    dprint(f'$\\text{{BR}} = {BR}$ (benchmark) \\\\')
    dprint(f'$\\langle S \\rangle \\equiv L \\times (\\sigma_S \\times \\text{{BR}}) \\times \\epsilon A_S (\\text{{trg}}) \\times \\epsilon_S (\\text{{pre-cut}}) \\times \\epsilon_S (\\text{{ML}}) \\times \\epsilon_S (M\\text{{-box}})$ (cut ordered) \\\\')
    dprint('\\vspace{1em}')
    dprint('\\\\')
    
    if   const_mode == 'cut':
        dprint(f'Const mode: ML score cut threshold fixed at $c > {const_cut:0.2f}$ \\\\')
    elif const_mode == 'bgk_eff':
        dprint(f'Const mode: Bgk-efficiency target fixed at {const_bgk_eff:0.1E} (after trg + pre-selection) \\\\')
    else:
        raise Exception(__name__ + f".optimize_selection: Unknown const mode '{const_mode}'")
    
    dprint(f'commutate BOX cut: {commutate_BOX} (low MC stats treatment) \\\\')
    dprint(f'Signal scenario: {args["limits_param"]["cosmetics"]["portal"]} \\\\')
    dprint('\\vspace{1em}')
    dprint('\\\\')
    
    dprint('\\tiny')
    dprint('\\begin{tabular}{l|l|ccc|c}')
    dprint('Background process & input $\\sigma_B$ [pb] & $\\epsilon A$(trg) & $\\epsilon$(pre-cut) & $\\epsilon A$(trg $\\times$ cut) & $\\langle B \\rangle$ \\\\')
    dprint('\\hline')
    
    # -----------------
    # Load evaluation data
    
    targetfile = f'{args["plotdir"]}/eval/eval_results.pkl'
    print(f'Loading "{targetfile}" ...', 'yellow')
    
    with open(targetfile, 'rb') as file:
        rd = pickle.load(file)
    info = rd['info']

    # MVA-model labels
    ML_model_names = rd['roc_labels']['inclusive']

    # background, signal
    class_ids = args['primary_classes']

    # -----------------
    # Background estimate
    
    B_tot        = 0
    B_xs_tot     = 0
    B_trg_eA_xs  = 0
    B_cut_eA_xs  = 0
    B_acc_eff_xs = 0

    # Loop over background sub-processes
    for name in info[f"class_{class_ids[0]}"].keys():
        
        proc          = info[f"class_{class_ids[0]}"][name]
        
        xs            = proc['yaml']["xs"]            # Cross-section
        trg_eA        = proc['cut_stats']['filterfunc']['after'] / proc['cut_stats']['filterfunc']['before']
        cut_eA        = proc['cut_stats']['cutfunc']['after']    / proc['cut_stats']['cutfunc']['before']
        
        N             = trg_eA  * cut_eA * xs * L_int # Number of events expected
        B_tot        += N
        
        B_trg_eA_xs  += trg_eA * xs
        B_cut_eA_xs  += cut_eA * xs
        B_acc_eff_xs += trg_eA * cut_eA * xs
        B_xs_tot     += xs
        
        name = name.replace("_", "-")
        name = (name[:65] + '..') if len(name) > 65 else name # Truncate maximum length, add ..
        
        dprint(f'{name} & {xs:0.1f} & {trg_eA:0.3f} & {cut_eA:0.3f} & {trg_eA*cut_eA:0.3f} & {N:0.1E} \\\\')
    
    B_trg_eA  = B_trg_eA_xs  / B_xs_tot
    B_cut_eA  = B_cut_eA_xs  / B_xs_tot
    B_acc_eff = B_acc_eff_xs / B_xs_tot
    
    dprint(f'\\hline')
    dprint(f'Total & {B_xs_tot:0.1f} & {B_trg_eA:0.3f} & {B_cut_eA:0.3f} & {B_acc_eff:0.3f} & {B_tot:0.1E} \\\\')
    dprint('\\end{tabular}')
    dprint('\\\\')
    dprint('')
    
    
    # -----------------
    # Signal estimate per model point

    keys = list(rd['roc_mstats'].keys())
    num_ML_models = len(rd['roc_mstats'][keys[0]])
    
    for ML_idx in range(num_ML_models):
        
        S           = np.zeros(len(info[f"class_{class_ids[1]}"].keys()))
        sigma_S     = np.zeros(len(S))
        B           = np.zeros(len(S))
        sigma_B     = np.zeros(len(S))
        ADS         = np.zeros(len(S))
        ds          = np.zeros(len(S))
        
        S_trg_eA    = np.zeros(len(S))          # Trigger eff x acceptance
        S_cut_eA    = np.zeros(len(S))          # Basic cuts
        
        ML_eff         = np.zeros((len(S), 2))  # ML cut
        ML_eff_err     = np.zeros((len(S), 2))  # ML cut
        
        BOX_eff        = np.zeros((len(S), 2))  # Box cut
        BOX_eff_err    = np.zeros((len(S), 2))  # Box cut
        
        ML_BOX_eff     = np.zeros((len(S), 2))  # ML x Box cut
        ML_BOX_eff_err = np.zeros((len(S), 2))  # ML x Box cut
        
        thr         = np.zeros(len(S))
        names       = len(S) * [None]
        
        param_values = []
        output       = {}
        i = 0
        
        # Loop over different signal model point processes
        for name in info[f"class_{class_ids[1]}"].keys():
            
            names[i]  = name.replace('_', '-')
            proc      = info[f"class_{class_ids[1]}"][name]
            
            # -----------------
            # Pick model parameters
            model_param = proc['yaml']['model_param']
            
            # Find the matching filter strings
            GEN_filter_key, BOX_filter_key, GBX_filter_key, param_point \
                = find_filter(rd, model_param, args)
            
            param_values.append(param_point)
            
            # -----------------
            # Get the pre-computed MVA-ROC results after combined GEN & BOX cut
            # (background uses "diplomat_class" filter under plots)
            
            if commutate_BOX:           # Low MC stats treatment
                fKEY = GEN_filter_key # Gen
            else:
                fKEY = GBX_filter_key # Gen & Box
            
            roc_obj   = rd['roc_mstats'][fKEY][ML_idx]
            roc_label = rd['roc_labels'][fKEY][ML_idx]
            roc_path  = rd['roc_paths'][fKEY][ML_idx]
            
            fpr, tpr, thresholds = roc_obj.fpr, roc_obj.tpr, roc_obj.thresholds
            fpr_err, tpr_err     = 0,0
            
            plot_ROC_fit(i=i, fpr=fpr, tpr=tpr, tpr_err=tpr_err, fpr_err=fpr_err,
                roc_obj=roc_obj, roc_label=roc_label, names=names, args=args, savename=roc_path)
            
            # -----------------
            # Construct box cut efficiency prior ML cut
            
            BOX_cut_mask = rd['roc_filters'][BOX_filter_key][ML_idx] & rd['roc_filters'][GEN_filter_key][ML_idx]
            den_mask     = copy.deepcopy(rd['roc_filters'][GEN_filter_key][ML_idx]) #! deepcopy
            
            for C in range(2):

                BOX_eff[i,C], BOX_eff_err[i,C] = statstools.columnar_mask_efficiency(
                    num_mask=BOX_cut_mask, den_mask=den_mask,
                    y_true=rd['data'].y, y_target=class_ids[C], weights=rd['data'].w)
            
            # -----------------
            # Construct ML cut efficiency such that the combined BOX & ML efficiency
            # matches the FPR level we want for the background (or use a constant cut)
            
            if   const_mode == 'cut':
                thr[i] = const_cut
            elif const_mode == 'bgk_eff':
                func   = interpolate.interp1d(fpr * BOX_eff[i,0], thresholds, 'linear')
                thr[i] = func(const_bgk_eff)
            else:
                raise Exception(__name__ + f'.optimize_selection: Unknown const_mode "{const_mode}"')
            
            # -----------------
            # ML cut
            
            ML_cut_mask = (rd['y_preds'][ML_idx] > thr[i]) & rd['roc_filters'][GEN_filter_key][ML_idx]
            den_mask    = copy.deepcopy(rd['roc_filters'][GEN_filter_key][ML_idx]) #! deepcopy
            
            for C in range(2):
                
                ML_eff[i,C], ML_eff_err[i,C] = statstools.columnar_mask_efficiency(num_mask=ML_cut_mask, den_mask=den_mask,
                    y_true=rd['data'].y, y_target=class_ids[C], weights=rd['data'].w)
            
            # -----------------
            # BOX cut

            BOX_cut_mask = ML_cut_mask & rd['roc_filters'][BOX_filter_key][ML_idx]
            den_mask     = copy.deepcopy(ML_cut_mask) #! deepcopy
            
            for C in range(2):
                
                if commutate_BOX and C == 0:
                    continue # Low MC stats treatment, computed above
                
                BOX_eff[i,C], BOX_eff_err[i,C] = statstools.columnar_mask_efficiency(
                    num_mask=BOX_cut_mask, den_mask=den_mask,
                    y_true=rd['data'].y, y_target=class_ids[C], weights=rd['data'].w)
            
            # ----------------
            # Total ML x BOX cut
            
            ML_BOX_cut_mask = ML_cut_mask & BOX_cut_mask
            den_mask        = copy.deepcopy(rd['roc_filters'][GEN_filter_key][ML_idx]) #! deepcopy
            
            for C in range(2):
                
                if commutate_BOX and C == 0: # Low MC stats treatment
                    ML_BOX_eff[i,C]     = ML_eff[i,C] * BOX_eff[i,C]
                    ML_BOX_eff_err[i,C] = statstools.prod_eprop(A=ML_eff[i,C], B=BOX_eff[i,C],
                        sigmaA=ML_eff_err[i,C], sigmaB=BOX_eff_err[i,C], sigmaAB=0)
                    continue
                
                ML_BOX_eff[i,C], ML_BOX_eff_err[i,C] = statstools.columnar_mask_efficiency(
                        num_mask=ML_BOX_cut_mask, den_mask=den_mask,
                        y_true=rd['data'].y, y_target=class_ids[C], weights=rd['data'].w)
            
            # ----------------
            # Compute values
            
            B[i]        = B_tot * ML_BOX_eff[i,0]
            sigma_B[i]  = B_tot * ML_BOX_eff_err[i,0]
            
            S_trg_eA[i] = proc['cut_stats']['filterfunc']['after'] / proc['cut_stats']['filterfunc']['before']
            S_cut_eA[i] = proc['cut_stats']['cutfunc']['after']    / proc['cut_stats']['cutfunc']['before']
            
            # Expected signal event count (without BR applied here!)
            S_tot       = (signal_xs * L_int) * S_trg_eA[i] * S_cut_eA[i]
            S[i]        = S_tot * ML_BOX_eff[i,1]
            sigma_S[i]  = S_tot * ML_BOX_eff_err[i,1]
            
            # Asimov Discovery Significance
            ADS[i] = statstools.ADS(s=BR * S[i], b=B[i], sigma_b=sigma_B[i])

            # Naive
            ds[i]  = BR * S[i] / np.sqrt(B[i])

            output[names[i]] = {'ML_model':     ML_model_names[ML_idx],
                                'model_param':  param_values[i],
                                'signal':       {'counts': S[i], 'systematic': sigma_S[i]},
                                'background':   {'counts': B[i], 'systematic': sigma_B[i]}}
            
            # ----------------
            # Diagnostics
            print(f'ML_eff:  {np.round(ML_eff[i,:], 6)}  | ML_eff_err  = {np.round(100 * ML_eff_err[i,:]  / ML_eff[i,:], 1)} %')
            print(f'BOX_eff: {np.round(BOX_eff[i,:], 6)} | BOX_eff_err = {np.round(100 * BOX_eff_err[i,:] / BOX_eff[i,:], 1)} %')
            print(f'tot_eff: {np.round(ML_eff[i,:] * BOX_eff[i,:], 6)} | tot_eff_err = {np.round(100 * ML_BOX_eff_err[i,:] / (ML_BOX_eff[i,:]), 1)} %')
            
            i += 1
            
            # << == Loop over signal points ends
        
        dprint(f'\\textbf{{ML model: {ML_model_names[ML_idx]} }} \\\\')
        dprint('')
        dprint('\\tiny')
        dprint('\\begin{tabular}{l|c|cc|cccc|cc|cc}')
        dprint('Signal model point & ML-cut & $B$: $\\epsilon$(ML) & $\\epsilon$($M$-box) & $S$: $\\epsilon A$(trg) & $\\epsilon$(pre-cut) & $\\epsilon$(ML) & $\\epsilon$($M$-box) & $\\langle B \\rangle \\pm$ [\\%] & $\\langle S \\rangle \\pm$ [\\%] & ADS & $S / \\sqrt{B}$ \\\\')
        dprint('\\hline')
        
        # -----------------
        # Plot summary table
        for i in range(len(S)):
            
            line = f'{param_values[i]} & {thr[i]:0.3f} &{ML_eff[i,0]:0.1E} & {BOX_eff[i,0]:0.1E} & {S_trg_eA[i]:0.2g} & {S_cut_eA[i]:0.2g} & {ML_eff[i,1]:0.2g} & {BOX_eff[i,1]:0.2g} & {B[i]:0.1E} $\\pm$ {100*sigma_B[i]/B[i]:0.1f} & {BR*S[i]:0.1E} $\\pm$ {100*sigma_S[i]/S[i]:0.1f} & {ADS[i]:0.2g} & {ds[i]:0.2g} \\\\'
            dprint(line)
        
        dprint('\\end{tabular}')
        dprint('')
        
        # -----------------
        # Compute upper limits
        
        limits.run_limits_vector(data=output, param=args['limits_param'],
            savepath = f'{args["plotdir"]}/optimize/limits/{ML_model_names[ML_idx]}')
        
        # << == Loop over models ends
    
    # -----------------
    # End the document and compile latex
    dprint('\\end{document}')
    
    os.system(f'pdflatex -output-directory {latex_path} {latex_filename}')
    
    print('[Done]')