# Efficiency flows ++ (Work-in-progress)
#
# m.mieskolainen@imperial.ac.uk, 2024
import numpy as np
import pickle
import os
from datetime import datetime
from scanf import scanf
import copy
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
from scipy import interpolate
from scipy.stats import ncx2,norm
from icenet.tools import aux
from icefit import cortools, statstools
from icedqcd import limits
# ------------------------------------------
from icenet import print
# --------------------------------------------
latex_header = \
"""
\\documentclass{article}
% General document formatting
\\usepackage[margin=0.7in]{geometry}
\\usepackage[parfill]{parskip}
\\usepackage[utf8]{inputenc}
% Related to math
\\usepackage{amsmath,amssymb,amsfonts,amsthm}
\\begin{document}
"""
[docs]
def func_binormal2(x, a, b):
"""
Formulas 4, (14):
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5570585/pdf/nihms-736507.pdf
"""
F_inv = (1/a) * ncx2.ppf(1-x, df=1, nc=b)
return 1 - ncx2.cdf(F_inv, df=1, nc=a*b)
[docs]
def func_binormal(x, a, b):
"""
binormal-function
https://dpc10ster.github.io/RJafrocBook/binormal-model.html
b = sd0 / sd1
a = (mu1 - mu0) / sd1
"""
return norm.cdf(a + b*norm.ppf(x))
[docs]
def plot_ROC_fit(i, fpr, tpr, tpr_err, fpr_err, roc_obj, roc_label, names, args, savename):
"""
Plot ROC fits
"""
xval = np.logspace(-8, 0, int(1e4))
# Fit the ROC-curve
try:
popt, pcov = curve_fit(f=func_binormal, xdata=fpr, ydata=tpr)#, sigma=tpr_err)
except:
print(f"Problem with curve_fit", 'red')
print(f'fpr = {fpr}')
print(f'tpr = {tpr}')
return
# -----------------
# Create parameter estimate text label
param = np.round(np.array(popt),2)
param_err = np.round(np.sqrt(np.diag(np.array(pcov))), 2)
fit_label = f'fit: $\\Phi(a_0 + a_1\\Phi^{{-1}}(x)), ('
for k in range(len(param)):
fit_label += f'a_{{{k}}} = {param[k]} \\pm {param_err[k]}'
if k < len(param)-1: fit_label += ','
fit_label += ')$'
# ------------------------------------
fig,ax = plt.subplots(1,2, figsize=(8,4))
alpha = 0.32
for k in [0,1]:
plt.sca(ax[k])
auc_CI = cortools.prc_CI(x=roc_obj.auc_bootstrap, alpha=alpha)
auc_err = np.abs(auc_CI - roc_obj.auc)
plt.plot(fpr, tpr, drawstyle='steps-mid', color=f'C{i}', label=f'{roc_label}, AUC: ${roc_obj.auc:0.3f} \\pm_{{{auc_err[0]:0.3f}}}^{{{auc_err[1]:0.3f}}}$')
tpr_lo = cortools.percentile_per_dim(x=roc_obj.tpr_bootstrap, q=100*(alpha/2))
tpr_hi = cortools.percentile_per_dim(x=roc_obj.tpr_bootstrap, q=100*(1-alpha/2))
plt.fill_between(fpr, tpr_lo, tpr_hi, step='mid', alpha=0.4, color=f'C{i}', edgecolor='none') # vertical
plt.plot(xval, func_binormal(xval, *popt), linestyle='-', color=(0.35,0.35,0.35), label='binormal fit')
plt.plot(xval, xval, color=(0.5,0.5,0.5), linestyle=':')
path = aux.makedir(f'{args["plotdir"]}/optimize/ROC_fit/{roc_label}')
if k == 0:
plt.xscale('linear')
plt.xlim([0, 1])
ax[k].set_aspect('equal', 'box')
plt.ylabel('True positive rate $1 - \\beta$ (signal efficiency)', fontsize=9)
plt.title(f'{names[i]}', fontsize=5)
plt.legend(loc='lower right', fontsize=7)
if k == 1:
plt.xscale('log')
xmin = 1e-6
plt.xlim([xmin, 1])
ax[k].set_aspect(-np.log10(xmin))
plt.title(fit_label, fontsize=6)
plt.xlabel('False positive rate $\\alpha$ (background efficiency)', fontsize=9)
plt.ylim([0,1])
pdf_filename = f'{path}/{savename}.pdf'
plt.savefig(pdf_filename, bbox_inches='tight')
plt.close()
[docs]
def find_filter(rd, model_param, args):
"""
Find filter strings
"""
model_POI = args['limits_param']['model_POI']
box_POI = args['limits_param']['box_POI']
GEN_filter_tag = args['limits_param']['GEN_filter_tag']
BOX_filter_tag = args['limits_param']['BOX_filter_tag']
GBX_filter_tag = args['limits_param']['GBX_filter_tag']
GEN_filter_key = None
BOX_filter_key = None
GBX_filter_key = None
param_point = {}
print(f'Signal: {model_param}', 'yellow')
# --------------
for filter in rd['roc_mstats'].keys():
matches = 0
for param_name in model_POI:
if f'{param_name} = {model_param[param_name]:g}' in filter and GEN_filter_tag in filter:
matches += 1
param_point[param_name] = model_param[param_name]
if matches == len(model_POI): # all parameters match
GEN_filter_key = filter
print(f'Found GEN filter: {GEN_filter_key}' , 'green')
break
if GEN_filter_key is None:
print(f'ERROR: No matching GEN filter for the signal: {model_param}, exit', 'red')
exit()
# --------------
for filter in rd['roc_mstats'].keys():
if BOX_filter_tag in filter:
param_value = scanf(f"{BOX_filter_tag}: box({box_POI[0]} ~ %f)", filter)[0]
if np.round(param_value, 3) == np.round(param_point[box_POI[1]], 3):
BOX_filter_key = filter
print(f'Found BOX filter: {BOX_filter_key}' , 'green')
break
if BOX_filter_key is None:
print(f'ERROR: No matching BOX filter for the signal: {model_param}, exit', 'red')
exit()
# --------------
for filter in rd['roc_mstats'].keys():
matches = 0
for param_name in model_POI :
if f'{param_name} = {model_param[param_name]:g}' in filter and GBX_filter_tag in filter:
matches += 1
if matches == len(model_POI): # all parameters match
GBX_filter_key = filter
print(f'Found GBX filter: {GBX_filter_key}' , 'green')
break
if GBX_filter_key is None:
print(f'ERROR: No matching GBX filter for the signal: {model_param}, exit', 'red')
exit()
# Sort alphabetically by parameter name
param_point = dict(sorted(param_point.items()))
return GEN_filter_key, BOX_filter_key, GBX_filter_key, param_point
[docs]
def optimize_selection(args):
"""
Main program
"""
def dprint(string, mode='a'):
print(string)
with open(latex_filename, mode) as f:
f.write(string + '\n')
latex_path = aux.makedir(f'{args["plotdir"]}/optimize')
latex_filename = f'{latex_path}/optimize.tex'
# -----------------
## Main parameters
# Integrated luminosity
L_int = args['limits_param']['luminosity'] # in pb^{-1}
# Signal production cross section
signal_xs = args['limits_param']['signal_xs'] # in pb
# Benchmark BR
BR = args['limits_param']['target_r']
# Targets
const_mode = args['limits_param']['const_mode']
const_cut = args['limits_param']['const_cut']
const_bgk_eff = args['limits_param']['const_bgk_eff']
# BOX emulation
commutate_BOX = args['limits_param']['commutate_BOX']
# -----------------
dprint('', 'w')
dprint(latex_header)
dprint('\\Large')
dprint('Cut efficiencies and discovery significance')
dprint('\\small')
dprint('\\\\')
dprint(f'github.com/mieskolainen/icenet')
dprint('\\\\')
dprint(f"{datetime.now():%d-%m-%Y %H:%M}")
dprint('\\\\')
dprint('\\vspace{0.5em}')
dprint('\\\\')
dprint(f'$L = {L_int} \\; \\text{{pb}}^{{-1}}$ \\\\')
dprint(f'$\\sigma_S$ = {signal_xs} \\text{{pb}} \\\\')
dprint(f'$\\text{{BR}} = {BR}$ (benchmark) \\\\')
dprint(f'$\\langle S \\rangle \\equiv L \\times (\\sigma_S \\times \\text{{BR}}) \\times \\epsilon A_S (\\text{{trg}}) \\times \\epsilon_S (\\text{{pre-cut}}) \\times \\epsilon_S (\\text{{ML}}) \\times \\epsilon_S (M\\text{{-box}})$ (cut ordered) \\\\')
dprint('\\vspace{1em}')
dprint('\\\\')
if const_mode == 'cut':
dprint(f'Const mode: ML score cut threshold fixed at $c > {const_cut:0.2f}$ \\\\')
elif const_mode == 'bgk_eff':
dprint(f'Const mode: Bgk-efficiency target fixed at {const_bgk_eff:0.1E} (after trg + pre-selection) \\\\')
else:
raise Exception(__name__ + f".optimize_selection: Unknown const mode '{const_mode}'")
dprint(f'commutate BOX cut: {commutate_BOX} (low MC stats treatment) \\\\')
dprint(f'Signal scenario: {args["limits_param"]["cosmetics"]["portal"]} \\\\')
dprint('\\vspace{1em}')
dprint('\\\\')
dprint('\\tiny')
dprint('\\begin{tabular}{l|l|ccc|c}')
dprint('Background process & input $\\sigma_B$ [pb] & $\\epsilon A$(trg) & $\\epsilon$(pre-cut) & $\\epsilon A$(trg $\\times$ cut) & $\\langle B \\rangle$ \\\\')
dprint('\\hline')
# -----------------
# Load evaluation data
targetfile = f'{args["plotdir"]}/eval/eval_results.pkl'
print(f'Loading "{targetfile}" ...', 'yellow')
with open(targetfile, 'rb') as file:
rd = pickle.load(file)
info = rd['info']
# MVA-model labels
ML_model_names = rd['roc_labels']['inclusive']
# background, signal
class_ids = args['primary_classes']
# -----------------
# Background estimate
B_tot = 0
B_xs_tot = 0
B_trg_eA_xs = 0
B_cut_eA_xs = 0
B_acc_eff_xs = 0
# Loop over background sub-processes
for name in info[f"class_{class_ids[0]}"].keys():
proc = info[f"class_{class_ids[0]}"][name]
xs = proc['yaml']["xs"] # Cross-section
trg_eA = proc['cut_stats']['filterfunc']['after'] / proc['cut_stats']['filterfunc']['before']
cut_eA = proc['cut_stats']['cutfunc']['after'] / proc['cut_stats']['cutfunc']['before']
N = trg_eA * cut_eA * xs * L_int # Number of events expected
B_tot += N
B_trg_eA_xs += trg_eA * xs
B_cut_eA_xs += cut_eA * xs
B_acc_eff_xs += trg_eA * cut_eA * xs
B_xs_tot += xs
name = name.replace("_", "-")
name = (name[:65] + '..') if len(name) > 65 else name # Truncate maximum length, add ..
dprint(f'{name} & {xs:0.1f} & {trg_eA:0.3f} & {cut_eA:0.3f} & {trg_eA*cut_eA:0.3f} & {N:0.1E} \\\\')
B_trg_eA = B_trg_eA_xs / B_xs_tot
B_cut_eA = B_cut_eA_xs / B_xs_tot
B_acc_eff = B_acc_eff_xs / B_xs_tot
dprint(f'\\hline')
dprint(f'Total & {B_xs_tot:0.1f} & {B_trg_eA:0.3f} & {B_cut_eA:0.3f} & {B_acc_eff:0.3f} & {B_tot:0.1E} \\\\')
dprint('\\end{tabular}')
dprint('\\\\')
dprint('')
# -----------------
# Signal estimate per model point
keys = list(rd['roc_mstats'].keys())
num_ML_models = len(rd['roc_mstats'][keys[0]])
for ML_idx in range(num_ML_models):
S = np.zeros(len(info[f"class_{class_ids[1]}"].keys()))
sigma_S = np.zeros(len(S))
B = np.zeros(len(S))
sigma_B = np.zeros(len(S))
ADS = np.zeros(len(S))
ds = np.zeros(len(S))
S_trg_eA = np.zeros(len(S)) # Trigger eff x acceptance
S_cut_eA = np.zeros(len(S)) # Basic cuts
ML_eff = np.zeros((len(S), 2)) # ML cut
ML_eff_err = np.zeros((len(S), 2)) # ML cut
BOX_eff = np.zeros((len(S), 2)) # Box cut
BOX_eff_err = np.zeros((len(S), 2)) # Box cut
ML_BOX_eff = np.zeros((len(S), 2)) # ML x Box cut
ML_BOX_eff_err = np.zeros((len(S), 2)) # ML x Box cut
thr = np.zeros(len(S))
names = len(S) * [None]
param_values = []
output = {}
i = 0
# Loop over different signal model point processes
for name in info[f"class_{class_ids[1]}"].keys():
names[i] = name.replace('_', '-')
proc = info[f"class_{class_ids[1]}"][name]
# -----------------
# Pick model parameters
model_param = proc['yaml']['model_param']
# Find the matching filter strings
GEN_filter_key, BOX_filter_key, GBX_filter_key, param_point \
= find_filter(rd, model_param, args)
param_values.append(param_point)
# -----------------
# Get the pre-computed MVA-ROC results after combined GEN & BOX cut
# (background uses "diplomat_class" filter under plots)
if commutate_BOX: # Low MC stats treatment
fKEY = GEN_filter_key # Gen
else:
fKEY = GBX_filter_key # Gen & Box
roc_obj = rd['roc_mstats'][fKEY][ML_idx]
roc_label = rd['roc_labels'][fKEY][ML_idx]
roc_path = rd['roc_paths'][fKEY][ML_idx]
fpr, tpr, thresholds = roc_obj.fpr, roc_obj.tpr, roc_obj.thresholds
fpr_err, tpr_err = 0,0
plot_ROC_fit(i=i, fpr=fpr, tpr=tpr, tpr_err=tpr_err, fpr_err=fpr_err,
roc_obj=roc_obj, roc_label=roc_label, names=names, args=args, savename=roc_path)
# -----------------
# Construct box cut efficiency prior ML cut
BOX_cut_mask = rd['roc_filters'][BOX_filter_key][ML_idx] & rd['roc_filters'][GEN_filter_key][ML_idx]
den_mask = copy.deepcopy(rd['roc_filters'][GEN_filter_key][ML_idx]) #! deepcopy
for C in range(2):
BOX_eff[i,C], BOX_eff_err[i,C] = statstools.columnar_mask_efficiency(
num_mask=BOX_cut_mask, den_mask=den_mask,
y_true=rd['data'].y, y_target=class_ids[C], weights=rd['data'].w)
# -----------------
# Construct ML cut efficiency such that the combined BOX & ML efficiency
# matches the FPR level we want for the background (or use a constant cut)
if const_mode == 'cut':
thr[i] = const_cut
elif const_mode == 'bgk_eff':
func = interpolate.interp1d(fpr * BOX_eff[i,0], thresholds, 'linear')
thr[i] = func(const_bgk_eff)
else:
raise Exception(__name__ + f'.optimize_selection: Unknown const_mode "{const_mode}"')
# -----------------
# ML cut
ML_cut_mask = (rd['y_preds'][ML_idx] > thr[i]) & rd['roc_filters'][GEN_filter_key][ML_idx]
den_mask = copy.deepcopy(rd['roc_filters'][GEN_filter_key][ML_idx]) #! deepcopy
for C in range(2):
ML_eff[i,C], ML_eff_err[i,C] = statstools.columnar_mask_efficiency(num_mask=ML_cut_mask, den_mask=den_mask,
y_true=rd['data'].y, y_target=class_ids[C], weights=rd['data'].w)
# -----------------
# BOX cut
BOX_cut_mask = ML_cut_mask & rd['roc_filters'][BOX_filter_key][ML_idx]
den_mask = copy.deepcopy(ML_cut_mask) #! deepcopy
for C in range(2):
if commutate_BOX and C == 0:
continue # Low MC stats treatment, computed above
BOX_eff[i,C], BOX_eff_err[i,C] = statstools.columnar_mask_efficiency(
num_mask=BOX_cut_mask, den_mask=den_mask,
y_true=rd['data'].y, y_target=class_ids[C], weights=rd['data'].w)
# ----------------
# Total ML x BOX cut
ML_BOX_cut_mask = ML_cut_mask & BOX_cut_mask
den_mask = copy.deepcopy(rd['roc_filters'][GEN_filter_key][ML_idx]) #! deepcopy
for C in range(2):
if commutate_BOX and C == 0: # Low MC stats treatment
ML_BOX_eff[i,C] = ML_eff[i,C] * BOX_eff[i,C]
ML_BOX_eff_err[i,C] = statstools.prod_eprop(A=ML_eff[i,C], B=BOX_eff[i,C],
sigmaA=ML_eff_err[i,C], sigmaB=BOX_eff_err[i,C], sigmaAB=0)
continue
ML_BOX_eff[i,C], ML_BOX_eff_err[i,C] = statstools.columnar_mask_efficiency(
num_mask=ML_BOX_cut_mask, den_mask=den_mask,
y_true=rd['data'].y, y_target=class_ids[C], weights=rd['data'].w)
# ----------------
# Compute values
B[i] = B_tot * ML_BOX_eff[i,0]
sigma_B[i] = B_tot * ML_BOX_eff_err[i,0]
S_trg_eA[i] = proc['cut_stats']['filterfunc']['after'] / proc['cut_stats']['filterfunc']['before']
S_cut_eA[i] = proc['cut_stats']['cutfunc']['after'] / proc['cut_stats']['cutfunc']['before']
# Expected signal event count (without BR applied here!)
S_tot = (signal_xs * L_int) * S_trg_eA[i] * S_cut_eA[i]
S[i] = S_tot * ML_BOX_eff[i,1]
sigma_S[i] = S_tot * ML_BOX_eff_err[i,1]
# Asimov Discovery Significance
ADS[i] = statstools.ADS(s=BR * S[i], b=B[i], sigma_b=sigma_B[i])
# Naive
ds[i] = BR * S[i] / np.sqrt(B[i])
output[names[i]] = {'ML_model': ML_model_names[ML_idx],
'model_param': param_values[i],
'signal': {'counts': S[i], 'systematic': sigma_S[i]},
'background': {'counts': B[i], 'systematic': sigma_B[i]}}
# ----------------
# Diagnostics
print(f'ML_eff: {np.round(ML_eff[i,:], 6)} | ML_eff_err = {np.round(100 * ML_eff_err[i,:] / ML_eff[i,:], 1)} %')
print(f'BOX_eff: {np.round(BOX_eff[i,:], 6)} | BOX_eff_err = {np.round(100 * BOX_eff_err[i,:] / BOX_eff[i,:], 1)} %')
print(f'tot_eff: {np.round(ML_eff[i,:] * BOX_eff[i,:], 6)} | tot_eff_err = {np.round(100 * ML_BOX_eff_err[i,:] / (ML_BOX_eff[i,:]), 1)} %')
i += 1
# << == Loop over signal points ends
dprint(f'\\textbf{{ML model: {ML_model_names[ML_idx]} }} \\\\')
dprint('')
dprint('\\tiny')
dprint('\\begin{tabular}{l|c|cc|cccc|cc|cc}')
dprint('Signal model point & ML-cut & $B$: $\\epsilon$(ML) & $\\epsilon$($M$-box) & $S$: $\\epsilon A$(trg) & $\\epsilon$(pre-cut) & $\\epsilon$(ML) & $\\epsilon$($M$-box) & $\\langle B \\rangle \\pm$ [\\%] & $\\langle S \\rangle \\pm$ [\\%] & ADS & $S / \\sqrt{B}$ \\\\')
dprint('\\hline')
# -----------------
# Plot summary table
for i in range(len(S)):
line = f'{param_values[i]} & {thr[i]:0.3f} &{ML_eff[i,0]:0.1E} & {BOX_eff[i,0]:0.1E} & {S_trg_eA[i]:0.2g} & {S_cut_eA[i]:0.2g} & {ML_eff[i,1]:0.2g} & {BOX_eff[i,1]:0.2g} & {B[i]:0.1E} $\\pm$ {100*sigma_B[i]/B[i]:0.1f} & {BR*S[i]:0.1E} $\\pm$ {100*sigma_S[i]/S[i]:0.1f} & {ADS[i]:0.2g} & {ds[i]:0.2g} \\\\'
dprint(line)
dprint('\\end{tabular}')
dprint('')
# -----------------
# Compute upper limits
limits.run_limits_vector(data=output, param=args['limits_param'],
savepath = f'{args["plotdir"]}/optimize/limits/{ML_model_names[ML_idx]}')
# << == Loop over models ends
# -----------------
# End the document and compile latex
dprint('\\end{document}')
os.system(f'pdflatex -output-directory {latex_path} {latex_filename}')
print('[Done]')