# Plotting functions
#
# m.mieskolainen@imperial.ac.uk, 2024
import matplotlib.pyplot as plt
import numpy as np
import awkward as ak
import torch
import gc
import os
from pprint import pprint
import copy
from prettytable import PrettyTable
import multiprocessing
import time
from tqdm import tqdm
import pandas as pd
from iceplot import iceplot
from icefit import statstools
from icefit import cortools
from icenet.tools import aux
from icenet.tools import reweight
from icenet.tools import prints
# ------------------------------------------
from icenet import print
# ------------------------------------------
def binengine(bindef, x):
"""
Binning processor function
Args:
bindef: binning definition
x: data input array
Examples:
50 (number of bins, integer)
[1.0, 40.0, 50.0] (list of explicit edges)
{'nbin': 30, 'q': [0.0, 0.95], 'space': 'linear'} (automatic with quantiles)
{'nbin': 30, 'minmax': [2.0, 50.0], 'space': 'log10'} (automatic with boundaries)
Returns:
edges: binning edges
"""
if len(x) == 0:
print(f'Input is zero-array, returning [0,0]')
return np.array([0,0])
def mylinspace(minval, maxval, nbin):
if 'space' in bindef and bindef['space'] == 'log10':
if minval <= 0:
raise Exception(__name__ + f'.binengine: Trying log10 binning with minimum = {minval} <= 0')
return np.logspace(np.log10(minval), np.log10(maxval), nbin + 1)
else:
return np.linspace(minval, maxval, nbin + 1)
# Integer
if type(bindef) is int or type(bindef) is float:
return np.linspace(np.min(x), np.max(x), int(bindef) + 1)
# Bin edges given directly
elif type(bindef) is list:
return np.array(bindef)
elif type(bindef) is np.ndarray:
return bindef
# Automatic with quantiles
elif type(bindef) is dict and 'q' in bindef:
return mylinspace(minval=np.percentile(x, 100*bindef['q'][0]), maxval=np.percentile(x, 100*bindef['q'][1]), nbin=bindef['nbin'])
# Automatic with boundaries
elif type(bindef) is dict and 'minmax' in bindef:
return mylinspace(minval=bindef['minmax'][0], maxval=bindef['minmax'][1], nbin=bindef['nbin'])
else:
raise Exception(__name__ + f'.binengine: Unknown binning description in {bindef}')
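# Usage sketch (illustrative only; assumes a 1D numpy array of positive values):
#
#   x = np.random.exponential(scale=5.0, size=10000)
#   edges_fix   = binengine(50, x)                                                 # 50 uniform bins over [min(x), max(x)]
#   edges_list  = binengine([0.0, 1.0, 2.0, 5.0, 10.0], x)                         # explicit edges passed through
#   edges_quant = binengine({'nbin': 30, 'q': [0.0, 0.95], 'space': 'linear'}, x)  # clip the upper 5% quantile tail
#   edges_log   = binengine({'nbin': 30, 'minmax': [0.1, 50.0], 'space': 'log10'}, x)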
def plot_selection(X, mask, ids, plotdir, label, varlist, density=True, library='np'):
"""
Plot selection before / after type histograms against all chosen variables
Args:
X : data array (N events x D dimensions)
mask : boolean selection indices (N)
ids : variable string array (D)
plotdir : plotting directory
label : a string label
varlist : a list of variables to be plotted (from ids)
density : normalize all histograms to unit density
library : 'np' or 'ak'
"""
for var in tqdm(varlist):
if var not in ids:
continue
# Histogram (autobinning)
if library == 'np':
before = np.asarray(X[:, ids.index(var)])
after = np.asarray(X[mask, ids.index(var)])
elif library == 'ak':
before = ak.to_numpy(X[:][var])
after = ak.to_numpy(X[mask][var])
else:
raise Exception(__name__ + f'.plot_selection: Unknown library: {library}')
counts1, errs1, bins, cbins = iceplot.hist(before, bins=100, density=density)
counts2, errs2, bins, cbins = iceplot.hist(after, bins=bins, density=density)
# Plot
obs_x = {
'xlim' : (np.min(bins), np.max(bins)),
'ylim' : None,
'ylim_ratio': (0.7, 1.3),
'xlabel' : var,
'ylabel' : r'Counts',
'units' : {'x': None, 'y' : r'counts'},
'density' : density,
'figsize' : (4, 3.75)
}
fig, ax = iceplot.create_axes(**obs_x, ratio_plot=True)
label1 = f'before cuts'
label2 = f'after cuts'
ax[0].hist(x=cbins, bins=bins, weights=counts1, color=(0,0,0), label=label1, **iceplot.hist_style_step)
ax[0].hist(x=cbins, bins=bins, weights=counts2, color=(1,0,0), alpha=0.5, label=label2, **iceplot.hist_style_step)
iceplot.ordered_legend(ax=ax[0], order=[label1, label2])
iceplot.plot_horizontal_line(ax[1])
ax[1].hist(x=cbins, bins=bins, weights=counts2 / (counts1 + 1E-30), color=(1,0,0), alpha=0.5, label='ratio', **iceplot.hist_style_step)
# Save it
targetdir = aux.makedir(f'{plotdir}/cuts/{label}')
fig.savefig(f'{targetdir}/{var}.pdf', bbox_inches='tight')
ax[0].set_yscale('log')
fig.savefig(f'{targetdir}/{var}--log.pdf', bbox_inches='tight')
# --------
fig.clf()
plt.close()
gc.collect()
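# Usage sketch (illustrative, with hypothetical variable names; assumes X is an (N x D) numpy
# array, ids lists its D column names and the mask is a boolean selection of length N):
#
#   mask = X[:, ids.index('trk_pt')] > 1.0
#   plot_selection(X=X, mask=mask, ids=ids, plotdir='./figs', label='pt_cut',
#                  varlist=['trk_pt', 'trk_eta'], density=True, library='np')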
def plot_matrix(XY, x_bins, y_bins, vmin=0, vmax=None, cmap='RdBu', figsize=(4,3), grid_on=False):
"""
Visualize matrix.
"""
fig,ax = plt.subplots(figsize=figsize)
ax.grid(grid_on)
x,y = np.meshgrid(x_bins, y_bins)
# Note transpose
c = ax.pcolormesh(x, y, XY.T, cmap=cmap, vmin=vmin, vmax=vmax, antialiased=True, linewidth=0.0)
ax.axis([x.min(), x.max(), y.min(), y.max()])
return fig,ax,c
def plot_train_evolution_multi(losses, trn_aucs, val_aucs, label, aspect=0.85,
yscale='linear', xscale='linear'):
""" Training evolution plots.
Args:
losses: loss values in a dictionary
trn_aucs: training metrics
val_aucs: validation metrics
Returns:
fig: figure handle
ax: figure axis
"""
fig,ax = plt.subplots(1,2, figsize=(10, 7.5))
# How many loss terms
N_terms = 0
for key in losses.keys():
if 'eval' in key:
N_terms += 1
for key in losses.keys():
if ('sum' in key) and (N_terms == 2):
continue # Skip the 'sum' curve when it consists of only a single loss term
ax[0].plot(losses[key], label=key)
ax[0].set_xlabel('k (epoch)')
ax[0].set_ylabel('loss')
ax[0].legend(fontsize=6)
ax[0].set_title(f'{label}', fontsize=10)
plt.sca(ax[0])
plt.autoscale(enable=True, axis='x', tight=True)
plt.yscale(yscale)
plt.xscale(xscale)
ax[1].plot(trn_aucs)
ax[1].plot(val_aucs)
ax[1].legend(['train','validation'], fontsize=8)
ax[1].set_xlabel('k (epoch)')
ax[1].set_ylabel('AUC')
ax[1].grid(True)
plt.sca(ax[1])
plt.autoscale(enable=True, axis='x', tight=True)
plt.yscale(yscale)
plt.xscale(xscale)
ax[0].set_aspect(1.0/ax[0].get_data_ratio()*aspect)
ax[1].set_ylim([np.min([np.min(trn_aucs), np.min(val_aucs)]), 1.0])
ax[1].set_aspect(1.0/ax[1].get_data_ratio()*aspect)
return fig,ax
def binned_2D_AUC(y_pred, y, X_kin, ids_kin, X, ids, edges, label, weights=None, VAR:list=['trk_pt', 'trk_eta']):
"""
Evaluate AUC per 2D-bin.
Args:
y_pred : MVA algorithm output
y : Output (truth level target) data
X_kin : Data
ids_kin : Variables
X : Data
ids : Variables
edges : Edges of the A,B-space cells (2D array)
label : Label of the classifier (string)
weights : Sample weights
VAR : Variable identifiers (two)
Returns:
fig,ax : Figure handle and axis
met : Metrics object
"""
edges_A = edges[0]
edges_B = edges[1]
AUC = np.zeros((len(edges_A)-1, len(edges_B)-1))
if len(y_pred) != len(y):
raise Exception(__name__ + f'.binned_2D_AUC: len(y_pred) = {len(y_pred)} != len(y) = {len(y)}')
# Loop over bins
for i in range(len(edges_A)-1):
for j in range(len(edges_B)-1):
range_A = [edges_A[i], edges_A[i+1]]
range_B = [edges_B[j], edges_B[j+1]]
# Indices
try:
ind = np.logical_and(aux.pick_ind(X_kin[:, ids_kin.index(VAR[0])], range_A),
aux.pick_ind(X_kin[:, ids_kin.index(VAR[1])], range_B))
except:
try:
ind = np.logical_and(aux.pick_ind(X[:, ids.index(VAR[0])], range_A),
aux.pick_ind(X[:, ids.index(VAR[1])], range_B))
except:
raise Exception(__name__ + f'.binned_2D_AUC: Cannot find the variables {VAR}')
string = f'{VAR[0]} = [{range_A[0]:10.3f},{range_A[1]:10.3f}), {VAR[1]} = [{range_B[0]:10.3f},{range_B[1]:10.3f})'
if np.sum(ind) > 0: # Do we have any events in this cell
# Evaluate metric
if weights is not None:
met = aux.Metric(y_true=y[ind], y_pred=y_pred[ind], weights=weights[ind])
else:
met = aux.Metric(y_true=y[ind], y_pred=y_pred[ind])
print(f'{string} | AUC = {met.auc:.5f}')
AUC[i,j] = met.auc
else:
print(f'{string} | No events found in this cell!')
# Evaluate total performance
met = aux.Metric(y_true=y, y_pred=y_pred, weights=weights)
# Finally plot it
fig,ax = plot_AUC_matrix(AUC=AUC, edges_A=edges_A, edges_B=edges_B)
ax.set_title(f'{label} | AUC = {met.auc:.3f} (integrated)', fontsize=9)
ax.set_xlabel(f"{VAR[0]}")
ax.set_ylabel(f"{VAR[1]}")
return fig, ax, met
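# Usage sketch (illustrative; assumes a binary classification setup with 'trk_pt' and
# 'trk_eta' available in ids_kin, and aux.Metric computing the ROC-AUC internally):
#
#   edges = [np.array([0.0, 1.0, 5.0, 20.0]), np.array([-2.5, 0.0, 2.5])]
#   fig, ax, met = binned_2D_AUC(y_pred=y_pred, y=y, X_kin=X_kin, ids_kin=ids_kin,
#                                X=X, ids=ids, edges=edges, label='XGB',
#                                weights=None, VAR=['trk_pt', 'trk_eta'])
#   fig.savefig('AUC_2D.pdf', bbox_inches='tight')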
def binned_1D_AUC(y_pred, y, X_kin, ids_kin, X, ids, edges, weights=None,
VAR:str='trk_pt', num_bootstrap=0):
"""
Evaluate AUC & ROC per 1D-bin.
Args:
y_pred : MVA algorithm output
y : Output (truth level target) data
X_kin : Data
ids_kin : Variables (strings)
X : Data
ids : Variables (strings)
edges : Edges of the space cells
weights : Sample weights
VAR : Variable identifier to pick (one)
num_bootstrap : Number of bootstrap samples for the Metric uncertainty estimate
Returns:
METS : List of per-bin Metric objects (None for empty bins)
LABELS : List of per-bin label strings
"""
AUC = np.zeros((len(edges)-1, 1))
if len(y_pred) != len(y):
raise Exception(__name__ + f'.binned_1D_AUC: len(y_pred) = {len(y_pred)} != len(y) = {len(y)}')
METS = []
LABELS = []
# Loop over bins
for i in range(len(edges)-1):
range_ = [edges[i], edges[i+1]]
# Indices (either from X_kin or X)
try:
ind = aux.pick_ind(X_kin[:, ids_kin.index(VAR)], range_)
except:
try:
ind = aux.pick_ind(X[:, ids.index(VAR)], range_)
except:
raise Exception(__name__ + f'.binned_1D_AUC: Cannot find the variable {VAR}')
string = f'{VAR} = [{range_[0]:10.3f},{range_[1]:10.3f})'
if np.sum(ind) > 0: # Do we have any events in this cell
# Evaluate metric
if weights is not None:
met = aux.Metric(y_true=y[ind], y_pred=y_pred[ind], weights=weights[ind], num_bootstrap=num_bootstrap)
else:
met = aux.Metric(y_true=y[ind], y_pred=y_pred[ind], num_bootstrap=num_bootstrap)
AUC[i] = met.auc
METS.append(met)
print(f'{string} | AUC = {AUC[i]}')
else:
METS.append(None)
print(f'{string} | No events found in this cell!')
LABELS.append(f'{VAR}$ \\in [{range_[0]:.1f},{range_[1]:.1f})$')
return METS, LABELS
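# Usage sketch (illustrative; the returned per-bin Metric objects and label strings can be
# passed e.g. to ROC_plot() defined further below):
#
#   METS, LABELS = binned_1D_AUC(y_pred=y_pred, y=y, X_kin=X_kin, ids_kin=ids_kin,
#                                X=X, ids=ids, edges=np.linspace(0.0, 20.0, 6),
#                                weights=None, VAR='trk_pt', num_bootstrap=100)
#   ROC_plot(metrics=METS, labels=LABELS, title='Binned ROC', filename='ROC_binned')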
def density_MVA_wclass(y_pred, y, label, weights=None, class_ids=None, edges=80, path='', **kwargs):
"""
Evaluate MVA output (1D) density per class.
Args:
y_pred : MVA algorithm output
y : Output (truth level target) data
label : Label of the MVA model (string)
weights : Sample weights
class_ids : Class IDs to plot
edges : Histogram edges list (or number of bins, as an alternative)
Returns:
Plot pdf saved directly
"""
if class_ids is None:
class_ids = np.unique(y.astype(int))
# Make sure it is 1-dim array of length N (not N x num classes)
if (weights is not None) and len(weights.shape) > 1:
weights = np.sum(weights, axis=1)
if weights is not None:
classlegs = [f'$\\mathcal{{C}} = {k}$, $N={np.sum(y == k)}$ ($\\Sigma_w = {np.sum(weights[y == k]):0.1f}$)' for k in class_ids]
else:
classlegs = [f'$\\mathcal{{C}} = {k}$, $N={np.sum(y == k)}$ (no weights)' for k in class_ids]
# Handle logits vs probabilities
THRESH = 1E-5
if np.min(y_pred) < (-THRESH) or np.max(y_pred) > (1.0 + THRESH):
logit = copy.deepcopy(y_pred)
prob = aux.sigmoid(logit)
else:
prob = copy.deepcopy(y_pred)
logit = aux.inverse_sigmoid(prob)
# Plot both
for mode in ['logit', 'prob']:
# Over classes
fig,ax = plt.subplots()
for k in class_ids:
ind = (y == k)
if np.sum(ind) == 0:
print(f'No samples for class {k} -- continue')
continue
w = weights[ind] if weights is not None else None
if mode == 'logit':
x = logit[ind]
else:
x = prob[ind]
hI, bins, patches = plt.hist(x=x, bins=binengine(bindef=edges, x=x), weights=w,
density = True, histtype = 'step', fill = False, linewidth = 2, label = 'inverse')
plt.legend(classlegs, loc='upper center', fontsize=7)
plt.xlabel(f'MVA output $f(\\mathbf{{x}})$ [{mode}]')
plt.ylabel('density')
plt.title(f'{label}', fontsize=9)
for scale in ['linear', 'log']:
try:
ax.set_yscale(scale)
outputdir = aux.makedir(f'{path}')
savepath = f'{outputdir}/MVA-output__{mode}__{scale}.pdf'
plt.savefig(savepath, bbox_inches='tight')
print(f'Saved: "{savepath}"', 'green')
except Exception as e:
print(e, 'red')
print(f'Plotting failed for {label} -- continue', 'red')
continue
# --------
fig.clf()
plt.close(fig)
gc.collect()
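# Usage sketch (illustrative; raw logits outside [0,1] are detected automatically and both
# the logit and the probability (sigmoid) representations are plotted):
#
#   density_MVA_wclass(y_pred=y_pred, y=y, label='XGB', weights=weights,
#                      class_ids=None, edges=80, path='./figs/MVA')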
def plot_correlation_comparison(corr_mstats, targetdir, xlim=None):
"""
Plot collected correlation metrics from density_COR_wclass()
Args:
corr_mstats: statistics dictionary
targetdir: output directory
xlim: plot limits dictionary per class
Returns:
plots saved to a directory
"""
print(f'Plotting correlation comparisons ...')
class_ids = {}
## Find all variables
all_var = []
for model in corr_mstats.keys():
for category in corr_mstats[model].keys():
for c in corr_mstats[model][category].keys():
class_ids = corr_mstats[model][category].keys() # This is overwritten (all same)
for var in corr_mstats[model][category][c].keys():
all_var.append(var)
all_var = set(all_var) # Reduce to a set (values appear only once)
## Over all variables
for var in all_var:
# Over classes
for c in class_ids:
# Over different statistical metrics
for stats in ['pearson', 'abs_pearson', 'disco', 'MI']:
fig,ax = plt.subplots()
# Per model
for model in corr_mstats.keys():
categories = list(corr_mstats[model].keys())
values = np.nan * np.ones(len(categories))
lower = np.nan * np.ones(len(categories))
upper = np.nan * np.ones(len(categories))
## Over each powerset category
for i in range(len(categories)):
x = corr_mstats[model][categories[i]][c]
if x != {}: # We have some stats
values[i] = x[var][f'{stats}']
lower[i] = x[var][f'{stats}_CI'][0]
upper[i] = x[var][f'{stats}_CI'][1]
error = (upper - lower)/2 # Simple symmetric errors
#lower = np.abs(values - np.array(lower))
#upper = np.abs(values - np.array(upper))
#asymmetric_error = np.array(list(zip(lower, upper))).T
# Vertical line at zero
plt.plot(np.zeros(len(values)), np.arange(len(values)), color=np.ones(3)*0.5, label=None)
## Plot horizontal plot i.e. values +- (lower, upper) on x-axis, category on y-axis
legend_label = f'{model} [{np.median(values):0.3f}]'
plt.errorbar(values, np.arange(len(values)), xerr=error,
fmt='s', capsize=5.0, label=legend_label)
title = f'$\\mathcal{{C}} = {c}$'
plt.title(title)
plt.xlabel(f'{stats}$_{{XY}}$ (MVA score, {var}) (68CL)')
if xlim is not None:
ax.set_xlim(xlim[stats][c])
ax.set_yticks(np.arange(len(values)))
ax.set_yticklabels(categories)
ax.invert_yaxis()
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
filename = aux.makedir(targetdir + f'/COR/')
plt.savefig(filename + f'var-{var}--stats-{stats}--class-{c}.pdf',
bbox_inches='tight')
plt.close()
def density_COR_wclass(y_pred, y, X, ids, label, \
weights=None, class_ids=None, edges=[[50], [50]], density=True,
path='', cmap='Oranges', **kwargs):
"""
Evaluate the 2D-density of the MVA algorithm output vs other variables per class.
Args:
y_pred : MVA algorithm output
y : Output (truth level target) data
X : Variables to be plotted
ids : Identifiers of the variables in X
label : Label of the MVA model (string)
weights : Sample weights
class_ids : Class ids to plot
edges : Histogram edges list (or number of bins, as an alternative) (2D)
density : Normalize to density
path : Save path
cmap : Color map
Returns:
correlation values in a dictionary (per variable, per class)
plots are saved directly
"""
if class_ids is None:
class_ids = np.unique(y.astype(int))
# Make sure it is 1-dim array of length N (not N x num classes)
if (weights is not None) and len(weights.shape) > 1:
weights = np.sum(weights, axis=1)
if weights is not None:
classlegs = [f'$\\mathcal{{C}} = {k}$, $N={np.sum(y == k)}$ (weighted {np.sum(weights[y == k]):0.1f})' for k in class_ids]
else:
classlegs = [f'$\\mathcal{{C}} = {k}$, $N={np.sum(y == k)}$ (no weights)' for k in class_ids]
output = {}
# Over classes
for k in class_ids:
output[k] = {}
ind = (y == k)
w = weights[ind] if weights is not None else None
if np.sum(ind) == 0:
print(f'No samples for class {k} -- continue')
continue
# Loop over variables
for var in ids:
xx = y_pred[ind]
yy = X[ind, ids.index(var)]
bins = [binengine(bindef=edges[0], x=xx), binengine(bindef=edges[1], x=yy)]
# Apply histogram bounds
box = np.logical_and(np.logical_and(xx > bins[0][0], xx < bins[0][-1]),
np.logical_and(yy > bins[1][0], yy < bins[1][-1]))
xx = xx[box]
yy = yy[box]
ww = w[box] if w is not None else None
# Pearson correlation coefficient
cc,cc_CI,_ = cortools.pearson_corr(x=xx, y=yy, weights=ww)
# Absolute Pearson correlation coefficient
cc_abs,cc_abs_CI,_ = cortools.pearson_corr(x=xx, y=yy, weights=ww, return_abs=True)
# Distance correlation
disco,disco_CI = cortools.distance_corr(x=xx, y=yy, weights=ww)
# Histogram based MI
MI,MI_CI = cortools.mutual_information(x=xx, y=yy, weights=ww, automethod='Scott2D')
# Neural Mutual Information [cannot use, unreliable to compute for small samples]
#from icefit import mine
#MI,MI_err = mine.estimate(X=xx, Z=yy, weights=ww)
#MI_CI = np.array([MI-MI_err, MI+MI_err])
# Save output
output[k][var] = {}
output[k][var]['pearson'] = cc
output[k][var]['pearson_CI'] = cc_CI
output[k][var]['abs_pearson'] = cc_abs
output[k][var]['abs_pearson_CI'] = cc_abs_CI
output[k][var]['disco'] = disco
output[k][var]['disco_CI'] = disco_CI
output[k][var]['MI'] = MI
output[k][var]['MI_CI'] = MI_CI
for scale in ['linear', 'log']:
fig,ax = plt.subplots()
outputdir = aux.makedir(f'{path}')
savepath = f'{outputdir}/var-{var}--class-{k}--{scale}.pdf'
try:
if scale == 'log':
import matplotlib as mpl
h2,xedges,yedges,im = plt.hist2d(x=xx, y=yy, bins=bins, weights=ww, density=density, norm=mpl.colors.LogNorm(), cmap=plt.get_cmap(cmap))
else:
h2,xedges,yedges,im = plt.hist2d(x=xx, y=yy, bins=bins, weights=ww, density=density, cmap=plt.get_cmap(cmap))
fig.colorbar(im)
plt.xlabel(f'MVA output $f(\\mathbf{{x}})$')
plt.ylabel(f'{var}')
rho_value = f'$\\rho_{{XY}} = {cc:0.2f}_{{-{cc-cc_CI[0]:0.2f}}}^{{+{cc_CI[1]-cc:0.2f}}}$'
MI_value = f'$\\mathcal{{I}}_{{XY}} = ({MI_CI[0]:0.3f}, {MI_CI[1]:0.3f})$'
plt.title(f'{label} | $\\mathcal{{C}} = {k}$ | {rho_value} | {MI_value}', fontsize=9)
# -----
plt.savefig(savepath, bbox_inches='tight')
print(f'Saved: "{savepath}"', 'green')
# -----
fig.clf()
plt.close()
gc.collect()
except Exception as e: # Matplotlib LogNorm() can be buggy
print(f'Failed to produce plot {savepath} ({e})', 'red')
pprint(output)
return output
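# Usage sketch (illustrative; the returned nested dictionary {class: {var: {metric: value}}}
# is the per-category input expected by plot_correlation_comparison() above):
#
#   stats = density_COR_wclass(y_pred=y_pred, y=y, X=X_kin, ids=['trk_pt', 'trk_eta'],
#                              label='XGB', weights=weights, edges=[50, 50],
#                              path='./figs/COR')
#   pearson_0  = stats[0]['trk_pt']['pearson']     # point estimate for class 0
#   pearson_CI = stats[0]['trk_pt']['pearson_CI']  # confidence interval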
def density_COR(y_pred, X, ids, label, weights=None, hist_edges=[[50], [50]], path='', cmap='Oranges'):
"""
Evaluate the 2D-density of the MVA algorithm output vs other variables.
Args:
y_pred : MVA algorithm output
X : Variables to be plotted
ids : Identifiers of the variables in X
label : Label of the MVA model (string)
weights : Sample weights
hist_edges : Histogram edges list (or number of bins, as an alternative) (2D)
path : Save path
cmap : Color map
Returns:
Plot pdf saved directly
"""
# Make sure it is 1-dim array of length N (not N x num classes)
if (weights is not None) and len(weights.shape) > 1:
weights = np.sum(weights, axis=1)
# Loop over variables
for var in ids:
fig,ax = plt.subplots()
# Plot 2D
xx = y_pred
yy = X[:, ids.index(var)]
bins = [binengine(bindef=hist_edges[0], x=xx), binengine(bindef=hist_edges[1], x=yy)]
h2,xedges,yedges,im = plt.hist2d(x=xx, y=yy, bins=bins, weights=weights, cmap=plt.get_cmap(cmap))
fig.colorbar(im)
plt.xlabel(f'MVA output $f(\\mathbf{{x}})$')
plt.ylabel(f'{var}')
plt.title(f'{label}', fontsize=10)
# -----
outputdir = aux.makedir(f'{path}/{label}')
savepath = f'{outputdir}/var-{var}.pdf'
plt.savefig(savepath, bbox_inches='tight')
print(f'Saved: "{savepath}"', 'green')
# -----
fig.clf()
plt.close()
gc.collect()
def annotate_heatmap(X, ax, xlabels, ylabels, x_rot = 90, y_rot = 0, decimals = 1, color = "w"):
""" Add text annotations to a matrix heatmap plot
"""
ax.set_xticks(np.arange(0, len(xlabels), 1))
ax.set_yticks(np.arange(0, len(ylabels), 1))
ax.set_xticklabels(labels=xlabels, rotation=x_rot, fontsize='xx-small')
ax.set_yticklabels(labels=ylabels, rotation=y_rot, fontsize='xx-small')
# Loop over data dimensions and create text annotations.
for i in range(len(ylabels)):
for j in range(len(xlabels)):
text = ax.text(j, i, f'{X[i,j]:.{decimals}f}', ha="center", va="center", color=color)
return ax
def plot_AUC_matrix(AUC, edges_A, edges_B):
""" Plot AUC matrix.
Args:
AUC: AUC-ROC matrix
edges_A: Histogram edges of variable A
edges_B: Histogram edges of variable B
Returns:
fig: figure handle
ax: figure axis
"""
fig, ax = plt.subplots()
xlabels = [f'[{edges_A[k]},{edges_A[k+1]})' for k in range(len(edges_A) - 1)]
ylabels = [f'[{edges_B[k]},{edges_B[k+1]})' for k in range(len(edges_B) - 1)]
ax.imshow(AUC.transpose(), origin = 'lower')
ax.set_title('AUC x 100', fontsize=10)
ax = annotate_heatmap(X = AUC.transpose() * 100, ax = ax, xlabels = xlabels,
ylabels = ylabels, decimals = 0, x_rot = 0, y_rot = 0, color = "w")
return fig, ax
def multiprocess_plot_wrapper(p):
"""
Multiprocessing wrapper
"""
i = p['i']
# Exclude special values
ind = np.ones(len(p['X']), dtype=bool)
for k in range(len(p['exclude_vals'])):
ind = np.logical_and(ind, (p['X'][:,i] != p['exclude_vals'][k]))
plotvar(x = p['X'][ind,i],
y = p['y'][ind],
weights = p['weights'][ind],
var = p['ids'][i],
nbins = p['nbins'],
percentile_range = p['percentile_range'],
title = p['title'],
targetdir = p['targetdir'],
plot_unweighted = p['plot_unweighted'])
def plotvars(X, y, ids, weights, nbins=70, percentile_range=[0.5, 99.5],
exclude_vals=[None], plot_unweighted=True, title = '', targetdir = '.',
num_cpus: int=0):
"""
Plot all variables.
"""
print(f'Creating plots | exclude_vals = {exclude_vals} ...')
# Chunked loop [parallel processing]
param = {
'i': None,
'X': None,
'y': None,
'weights': None,
'ids': ids,
'nbins': nbins,
'percentile_range': percentile_range,
'exclude_vals': exclude_vals,
'plot_unweighted': plot_unweighted,
'title': title,
'targetdir': targetdir
}
# Create parameter dictionaries for each process
paramlist = []
for i in range(X.shape[1]):
paramlist.append(copy.deepcopy(param))
paramlist[-1]['i'] = i
paramlist[-1]['X'] = X # big numpy arrays should share the memory (i.e. no copy of X)
paramlist[-1]['y'] = y
paramlist[-1]['weights'] = weights
# Start multiprocessing
print(f'Multiprocessing {X.shape[1]} plots', 'yellow')
pool = multiprocessing.Pool(processes=multiprocessing.cpu_count() // 2 if num_cpus == 0 else num_cpus)
tic = time.time()
pool.map(multiprocess_plot_wrapper, paramlist)
pool.close() # no more tasks
pool.join() # wrap up current tasks
toc = time.time()
print(f'Took {toc-tic:0.2f} sec')
def plotvar(x, y, var, weights, nbins=70, percentile_range=[0.5, 99.5],
plot_unweighted=True, title='', targetdir='.'):
"""
Plot a single variable.
"""
try:
binrange = (np.percentile(x, percentile_range[0]), np.percentile(x, percentile_range[1]))
fig, axs = plot_reweight_result(X=x, y=y, nbins=nbins, binrange=binrange, weights=weights,
title=title, xlabel=var, plot_unweighted=plot_unweighted)
plt.savefig(f'{targetdir}/var-{var}.pdf', bbox_inches='tight')
fig.clf()
plt.close(fig)
gc.collect()
except Exception as e:
print(e, 'red')
print(f'Problem plotting variable "{var}"', 'red')
def plot_reweight_result(X, y, nbins, binrange, weights, title = '', xlabel = 'x', linewidth=1.5,
plot_unweighted=True):
"""
Plot pure event counts,
so that one can see whether the integrated class fractions are also equalized (or not) after weighting.
"""
fig,ax = plt.subplots(1, 2, figsize = (10, 4.25))
class_ids = np.unique(y.astype(int))
legends = []
min_x = 1e30
max_x = -1e30
# Loop over classes
for c in class_ids:
ind = (y == c)
if np.sum(ind) == 0:
print(f'No samples for class {c} -- continue')
continue
# Compute histograms with numpy (we use nbins and range() for speed)
if plot_unweighted:
counts, edges = np.histogram(X[ind], bins=nbins, range=binrange, weights=None)
counts_w, edges = np.histogram(X[ind], bins=nbins, range=binrange, weights=weights[ind])
mu, std = aux.weighted_avg_and_std(values=X[ind], weights=weights[ind])
min_x = edges[0] if edges[0] < min_x else min_x
max_x = edges[-1] if edges[-1] > max_x else max_x
# Linear and log scale scale (left and right plots)
for i in range(2):
plt.sca(ax[i])
if plot_unweighted:
plt.stairs(counts, edges, fill=False, linewidth = linewidth+0.75, linestyle='--') # bigger linewidth first
plt.stairs(counts_w, edges, fill=False, linewidth = linewidth, linestyle='-')
if i == 0:
if plot_unweighted:
legends.append(f'$\\mathcal{{C}} = {c}$ (unweighted)')
legends.append(f'$\\mathcal{{C}} = {c}$ [$\\mu={mu:0.2f}, \\sigma={std:0.2f}$]')
ax[0].set_ylabel('[weighted] counts')
ax[0].set_xlabel(xlabel)
ax[0].set_ylim([0,None])
ax[0].set_xlim([min_x, max_x])
ax[1].set_xlabel(xlabel)
ax[1].set_title(title, fontsize=10)
ax[1].legend(legends, fontsize=8)
ax[1].set_yscale('log')
plt.tight_layout()
return fig,ax
def plot_correlations(X, ids, weights=None, y=None, round_threshold=0.0, targetdir=None, colorbar = False):
"""
Plot a cross-correlation matrix of vector data
Args:
X: Data matrix (N x D)
ids: Variable names (list of length D)
weights: Event weights
y: Class labels per event (list of length N)
round_threshold: Correlation matrix |C| < threshold to set matrix elements to zero
targetdir: Output plot directory
colorbar: Colorbar on the plot
Returns:
figs, axs: Figures, axes (per class)
"""
N = X.shape[0]
if y is None:
y = np.zeros(N)
class_ids = [0]
else:
class_ids = np.unique(y.astype(int))
figs = {}
axs = {}
for k in class_ids:
label = f'all' if (len(class_ids) == 1) else f'class_{k}'
# Compute correlation matrix
tic = time.time()
w = weights[y==k] if weights is not None else None
C = statstools.correlation_matrix(X=X[y==k,:], weights=w)
C[np.abs(C) < round_threshold] = np.nan
C *= 100
toc = time.time()
print(f'Computing C-matrix (took {toc-tic:0.2f} sec)')
# Compute suitable figsize
size = np.ceil(C.shape[0] / 3)
# Plot it
figs[label], axs[label] = plt.subplots(1,1, figsize=(size,size))
axs[label].imshow(C)
axs[label] = annotate_heatmap(X = C, ax = axs[label], xlabels = ids,
ylabels = ids, decimals = 0, x_rot = 90, y_rot = 0, color = "w")
axs[label].set_title(f'{label}: linear correlation $\\in$ [-100,100]', fontsize=10)
if colorbar:
cb = plt.colorbar()
if targetdir is not None:
tic = time.time()
if X.shape[1] <= 100: # Not too many dimensions --> pdf OK
fname = targetdir + f'{label}-correlation-matrix.pdf'
plt.savefig(fname=fname, pad_inches=0.2, bbox_inches='tight')
else:
fname = targetdir + f'{label}-correlation-matrix.png'
plt.savefig(fname=fname, pad_inches=0.2, dpi=300, bbox_inches='tight')
toc = time.time()
print(f'Saved: "{fname} (took {toc-tic:0.2f} sec)"')
return figs, axs
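# Usage sketch (illustrative; round_threshold acts on the raw correlation in [-1,1] before
# the matrix is scaled to [-100,100] for display):
#
#   figs, axs = plot_correlations(X=X, ids=ids, weights=weights, y=y,
#                                 round_threshold=0.05, targetdir='./figs/', colorbar=False)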
def draw_error_band(ax, x, y, x_err, y_err, **kwargs):
"""
Calculate normals via centered finite differences (except the first point
which uses a forward difference and the last point which uses a backward
difference).
https://matplotlib.org/stable/gallery/lines_bars_and_markers/curve_error_band.html
"""
from matplotlib.path import Path
from matplotlib.patches import PathPatch
dx = np.concatenate([[x[1] - x[0]], x[2:] - x[:-2], [x[-1] - x[-2]]])
dy = np.concatenate([[y[1] - y[0]], y[2:] - y[:-2], [y[-1] - y[-2]]])
l = np.hypot(dx, dy)
nx = dy / l
ny = -dx / l
# End points of errors
xp = x + nx * x_err
yp = y + ny * y_err
xn = x - nx * x_err
yn = y - ny * y_err
vertices = np.block([[xp, xn[::-1]],
[yp, yn[::-1]]]).T
codes = np.full(len(vertices), Path.LINETO)
codes[0] = codes[len(xp)] = Path.MOVETO
path = Path(vertices, codes)
ax.add_patch(PathPatch(path, **kwargs))
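# Usage sketch (illustrative; draws a band of half-width ~err in the direction normal to the curve):
#
#   fig, ax = plt.subplots()
#   x = np.linspace(0.0, 2*np.pi, 100)
#   y = np.sin(x)
#   ax.plot(x, y, color='C0')
#   draw_error_band(ax=ax, x=x, y=y, x_err=0.05*np.ones_like(x), y_err=0.05*np.ones_like(x),
#                   facecolor='C0', edgecolor='none', alpha=0.3)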
def ROC_plot(metrics, labels, title = '', plot_thresholds=True, \
thr_points_signal = [0.05, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.85, 0.9, 0.95], \
filename = 'ROC', legend_fontsize=7, xmin=1.0e-4, alpha=0.32):
"""
Receiver Operating Characteristics i.e. False positive (x) vs True positive (y)
Args:
metrics: list of aux.Metric objects (one per curve, None entries are skipped)
labels: list of legend labels (one per curve)
title: plot title
plot_thresholds: annotate chosen signal efficiency working points with their MVA thresholds
thr_points_signal: signal efficiency (TPR) working points to annotate
filename: output filename stem (without extension)
legend_fontsize: legend font size
xmin: minimum x-axis (FPR) value for the log-scale plot
alpha: bootstrap uncertainty band parameter (1 - alpha central interval)
"""
for k in [0,1]: # linear & log
fig,ax = plt.subplots()
xx = np.logspace(-5, 0, 100)
plt.plot(xx, xx, linestyle='--', color='black', linewidth=1) # ROC diagonal
for i in range(len(metrics)):
if i < 10:
linestyle = '-'
elif 10 <= i < 20:
linestyle = '--'
else:
linestyle = ':'
marker = 'None'
if metrics[i] is None:
print(f'metrics[{i}] ({labels[i]}) is None, continue without')
continue
fpr = metrics[i].fpr
tpr = metrics[i].tpr
thresholds = metrics[i].thresholds
if metrics[i].tpr_bootstrap is not None:
q = [100*(alpha/2), 100*(1-alpha/2)]
# Percentile bootstrap based uncertainties
tpr_CI = cortools.percentile_per_dim(x=metrics[i].tpr_bootstrap, q=q)
tpr_lo = tpr_CI[0]
tpr_hi = tpr_CI[1]
fpr_CI = cortools.percentile_per_dim(x=metrics[i].fpr_bootstrap, q=q)
fpr_lo = fpr_CI[0]
fpr_hi = fpr_CI[1]
# Autodetect a ROC point triangle (instead of a curve)
if len(np.unique(fpr)) == 3:
tpr = tpr[1:-1] # Remove first and last
fpr = fpr[1:-1] # Remove first and last
linestyle = "None"
marker = 'o'
"""
# ROC-curve
elif not (isinstance(fpr, int) or isinstance(fpr, float)):
fpr = fpr[1:] # Remove always the first element for log-plot reasons
tpr = tpr[1:]
if metrics[i].tpr_bootstrap is not None:
tpr_lo = tpr_lo[1:]
tpr_hi = tpr_hi[1:]
fpr_lo = fpr_lo[1:]
fpr_hi = fpr_hi[1:]
"""
## Plot it
plt.plot(fpr, tpr, drawstyle='steps-mid', color=f'C{i}', linestyle=linestyle, marker=marker, label = f'{labels[i]}: AUC = {metrics[i].auc:.4f}')
# Uncertainty band
if marker == 'None' and (metrics[i].tpr_bootstrap is not None):
plt.fill_between(fpr, tpr_lo, tpr_hi, step='mid', alpha=0.2, color=f'C{i}', edgecolor='none') # vertical
plt.fill_betweenx(tpr, fpr_lo, fpr_hi, step='mid', alpha=0.2, color=f'C{i}', edgecolor='none') # horizontal
# draw_error_band(ax=ax, x=fpr, y=tpr, \
# x_err=np.std(metrics[i].fpr_bootstrap, axis=0)[1:], \
# y_err=np.std(metrics[i].tpr_bootstrap, axis=0)[1:], \
# facecolor=f'C{i}', edgecolor="none", alpha=0.2)
# Plot corresponding threshold points
if plot_thresholds:
try:
for eff in thr_points_signal:
index = np.argmin(np.abs(tpr - eff))
if fpr[index] >= xmin and fpr[index] <= 1.0:
plt.plot(fpr[index], tpr[index], '.', color=f'C{i}')
t = plt.text(x=fpr[index], y=tpr[index], s=f'{thresholds[index]:0.4g}', fontsize=5, color=f'C{i}')
t.set_bbox(dict(facecolor='white', alpha=0.5, linewidth=0))
except: # If failed, skip the threshold annotations
pass
ax.set_xlabel('False Positive Rate $\\alpha$ (background efficiency)')
ax.set_ylabel('True Positive Rate $1-\\beta$ (signal efficiency)')
ax.set_title(title, fontsize=10)
if k == 0: # Linear-Linear
# Legend
if len(metrics) > 12: # Put outside the figure
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5), fontsize=legend_fontsize)
else:
plt.legend(loc='lower right', fontsize=legend_fontsize)
plt.ylim(0.0, 1.0)
plt.xlim(0.0, 1.0)
plt.locator_params(axis="x", nbins=11)
plt.locator_params(axis="y", nbins=11)
ax.set_aspect(1.0 / ax.get_data_ratio() * 1.0)
plt.savefig(filename + '.pdf', bbox_inches='tight')
print(f'Saved: ' + filename + '.pdf','green')
if k == 1: # Log-Linear
# Legend
if len(metrics) > 12: # Put outside the figure
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5), fontsize=legend_fontsize)
else:
plt.legend(fontsize=legend_fontsize) # Automatic positioning
plt.ylim(0.0, 1.0)
plt.xlim(xmin, 1.0)
plt.locator_params(axis="x", nbins=int(-np.log10(xmin) + 1))
plt.locator_params(axis="y", nbins=11)
plt.gca().set_xscale('log')
ax.set_aspect(1.0 / ax.get_data_ratio() * 0.75)
plt.savefig(filename + '--log.pdf', bbox_inches='tight')
plt.close()
def MVA_plot(metrics, labels, title='', filename='MVA', density=True, legend_fontsize=7) :
"""
MVA output plots
"""
# Check input
for i in range(len(metrics)):
if metrics[i] is None:
print(f'Error: metrics[{i}] ({labels[i]}) is None (check per class statistics), return -1')
return -1
else:
class_ids = metrics[i].class_ids
if len(metrics[i].mva_hist) != len(class_ids):
print(f'Error: num_classes != len(metrics[{i}].mva_hist) (check per class statistics), return -1')
return -1
for k in [0,1]: # linear & log
fig,ax = plt.subplots(1, len(class_ids), figsize=(7*len(class_ids), 5))
# Loop over classes
for c in class_ids:
# Loop over cells
N = len(metrics[0].mva_hist[c]) # Number of bins
counts = np.zeros((N, len(metrics)))
cbins = np.zeros((N, len(metrics)))
for i in range(len(metrics)):
if metrics[i] is None:
print(f'metrics[{i}] is None, continue without')
continue
if len(metrics[i].mva_bins) == 0:
print(f'len(metrics[{i}].mva_bins) == 0, continue without')
continue
bins = metrics[i].mva_bins
cbins[:,i] = (bins[0:-1] + bins[1:]) / 2
if metrics[i].mva_hist[c] != []:
counts[:,i] = metrics[i].mva_hist[c]
plt.sca(ax[c])
plt.hist(x=cbins, bins=bins, weights=counts, histtype='bar', stacked=True, \
density=density, label=labels, linewidth=2.0)
# Adjust
plt.sca(ax[c])
plt.legend(fontsize=legend_fontsize, loc='upper center')
ax[c].set_xlabel('MVA output $f(\\mathbf{x})$')
ax[c].set_ylabel('counts' if not density else 'density')
ax[c].set_title(f'{title} | class = {c}', fontsize=10)
if k == 0:
#plt.ylim(0.0, 1.0)
#plt.xlim(0.0, 1.0)
#ax.set_aspect(1.0/ax.get_data_ratio() * 1.0)
plt.savefig(filename + '.pdf', bbox_inches='tight')
print(f'Saved: ' + filename + '.pdf','green')
if k == 1:
#plt.ylim(0.0, 1.0)
#plt.xlim(1e-4, 1.0)
for i in range(len(class_ids)):
plt.sca(ax[i])
plt.gca().set_yscale('log')
#ax.set_aspect(1.0/ax.get_data_ratio() * 0.75)
plt.savefig(filename + '--log.pdf', bbox_inches='tight')
# --------
fig.clf()
plt.close()
gc.collect()
def plot_contour_grid(pred_func, X, y, ids, targetdir = '.', transform = 'numpy', reso=50, npoints=400):
"""
Classifier decision contour evaluated pairwise for each pair of dimensions,
with the remaining dimensions fixed at zero = (0,0,...,0) (hence z-score normalized input with zero mean is suitable)
Args:
pred_func: prediction function handle
X: input matrix
y: class targets
ids: variable label strings
targetdir: output directory
transform: 'numpy', 'torch'
reso: evaluation resolution
npoints: number of points to draw
"""
print(f'Evaluating, X.shape = {X.shape}...')
MAXP = min(npoints, X.shape[0])
D = X.shape[1]
pad = 0.5
for dim1 in tqdm(range(D)) :
x_min, x_max = X[:, dim1].min() - pad, X[:, dim1].max() + pad
for dim2 in range(D) :
if dim2 <= dim1 :
continue
# (x,y)-plane limits
y_min, y_max = X[:, dim2].min() - pad, X[:, dim2].max() + pad
# Grid points
PX,PY = np.meshgrid(np.linspace(x_min, x_max, reso), np.linspace(y_min, y_max, reso))
# -------------------------------------
## Evaluate function values
XX = np.zeros((reso*reso, D))
XX[:, dim1] = PX.ravel()
XX[:, dim2] = PY.ravel()
if transform == 'torch':
Z = pred_func(torch.from_numpy(XX).type(torch.FloatTensor))
elif transform == 'numpy':
Z = pred_func(XX)
else:
raise Exception(__name__ + f'.plot_contour_grid: Unknown matrix type = {transform}')
Z = Z.reshape(PX.shape)
# --------------------------------------
## Plot
fig, axs = plt.subplots()
# Contour
cs = plt.contourf(PX, PY, Z, cmap = plt.cm.Spectral)
# Events as dots
plt.scatter(X[0:MAXP, dim1], X[0:MAXP, dim2], c = y[0:MAXP], cmap = plt.cm.binary)
plt.xlabel(f'x[{dim1}] {ids[dim1]}')
plt.ylabel(f'x[{dim2}] {ids[dim2]}')
plt.colorbar(cs, ticks = np.linspace(0.0, 1.0, 11))
filename = targetdir + f'{dim1}-{dim2}.pdf'
plt.savefig(filename, bbox_inches='tight')
print(f'Saved: ' + filename,'green')
# --------
fig.clf()
plt.close()
gc.collect()
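# Usage sketch (illustrative; assumes z-score normalized inputs and a vectorized prediction
# function, e.g. a hypothetical `lambda x: model.predict_proba(x)[:, 1]` for scikit-learn style models):
#
#   plot_contour_grid(pred_func=lambda x: model.predict_proba(x)[:, 1],
#                     X=X_norm, y=y, ids=ids, targetdir='./figs/', transform='numpy')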
def plot_xgb_importance(model, tick_label, importance_type='gain', label=None, sort=True, default_names=False):
"""
Plot XGBoost model feature importance
Args:
model: xgboost model object
tick_label: feature names
importance_type: type of importance metric ['weight', 'gain', 'cover', 'total_gain', 'total_cover']
label: plot title label
sort: sort the features by their importance score
default_names: True for xgboost default feature names 'f0', 'f1', ..., else False (uses tick_label)
Returns
fig, ax
"""
fscores = model.get_score(importance_type=importance_type)
dim = len(tick_label)
xx = np.arange(dim)
yy = np.zeros(dim)
labels = []
# try/except needed because xgboost does not always return a score for every feature
for i in range(dim):
try:
if default_names:
yy[i] = fscores[f'f{i}'] # XGBoost default feature name 'f{i}'
else:
yy[i] = fscores[tick_label[i]] # Definite names
except:
yy[i] = 0.0
labels.append(f'{tick_label[i]} ({yy[i]:0.1f}) [{i}]')
# Sort them
if sort:
s_ind = np.array(np.argsort(yy), dtype=int)
yy = yy[s_ind]
labels = [labels[i] for i in s_ind]
# Plot
fig,ax = plt.subplots(figsize=(0.5 * (np.ceil(dim/6) + 2), np.ceil(dim/6) + 2))
plt.barh(xx, yy, align='center', height=0.5, tick_label=labels)
plt.xlabel(f'Score ({importance_type})')
plt.title(f'[{label}]')
return fig, ax
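# Usage sketch (illustrative; assumes a trained xgboost.Booster object):
#
#   fig, ax = plot_xgb_importance(model=booster, tick_label=ids, importance_type='gain',
#                                 label='XGB', sort=True, default_names=False)
#   fig.savefig('xgb_importance.pdf', bbox_inches='tight')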
def table_writer(filename, label, sublabel, tau, chi2_table, print_to_screen=False):
"""
Helper function to write a chi2 table to a file
"""
with open(filename, "a") as text_file:
text_file.write(f'category: {sublabel} | AI model: {label} | tau: {tau:0.2f}')
text_file.write('\n')
text_file.write(chi2_table.get_string())
text_file.write('\n')
text_file.write('\n')
if print_to_screen:
print(chi2_table, 'magenta')
print('')
def plot_AIRW(X, y, ids, weights, y_pred, pick_ind,
label, sublabel, param, tau=1.0, targetdir=None, num_cpus=0):
"""
Plot AI based reweighting results
"""
print(f'label = {label} | sublabel = {sublabel} | tau = {tau}', 'green')
## Print stats
output_file = os.path.join(targetdir, 'stats_AIRW_weight_flow.log')
prints.print_weights(weights=weights, y=y, output_file=output_file,
header='Step 1. Input event weights', write_mode='w')
# ---------------------------------------------------
## 1. Transform model output scores to AI weights
EPS = param['EPS']
maxW = param['maxW']
C0 = param['C0']
C1 = param['C1']
RN_ID = param['renorm_origin']
mode = param['transform_mode']
# Handle logits vs probabilities
min_y_pred, max_y_pred = np.min(y_pred), np.max(y_pred)
THRESH = 1E-5
if min_y_pred < (-THRESH) or max_y_pred > (1.0 + THRESH):
print(f'Detected raw logit output [{min_y_pred:0.4f}, {max_y_pred:0.4f}] from the model')
logits = y_pred[y == C0]
probs = aux.sigmoid(logits)
print(f'Corresponding probability output [{np.min(probs):0.4f}, {np.max(probs):0.4f}]')
else:
print(f'Detected probability output [{min_y_pred:0.4f}, {max_y_pred:0.4f}] from the model')
logits = aux.inverse_sigmoid(p = y_pred[y == C0], EPS=EPS)
print(f'Corresponding logit output [{np.min(logits):0.4f}, {np.max(logits):0.4f}]')
# 1. Apply temperature scaling
logits /= tau
# 2. Get weights after the re-weighting transform
AIw0 = reweight.rw_transform_with_logits(logits=logits, mode=mode)
## Print stats
prints.print_weights(weights=AIw0, y=np.zeros(len(AIw0)), output_file=output_file,
header='Step 2. AI weights [raw]', write_mode='a')
# 3. Cut-off regularize anomalous high weights before event weights
AIw0 = np.clip(AIw0, 0.0, maxW)
## Print stats
prints.print_weights(weights=AIw0, y=np.zeros(len(AIw0)), output_file=output_file,
header=f'Step 3. AI weights [cutoff regularized with maxW = {maxW}]', write_mode='a')
# 4. Apply multiplicatively to event weights (which can be negative)
AIw0 = AIw0 * weights[y == C0]
## Print stats
prints.print_weights(weights=AIw0, y=np.zeros(len(AIw0)), output_file=output_file,
header=f'Step 4. Total weights [input weights x AI weights]', write_mode='a')
# ---------------------------------------------------
## 2. Renormalize (optional) (e.g. we want fixed overall normalization, or debug)
if RN_ID is not None:
print(f'Renormalizing with class [{RN_ID}] weight sum', 'yellow')
sum_before = AIw0.sum()
print(f'Sum(before): {sum_before:0.1f}')
AIw0 /= np.sum(AIw0) # Normalize
AIw0 *= np.sum(weights[y == RN_ID]) # Scale
sum_after = AIw0.sum()
print(f'Sum(after): {sum_after:0.1f}')
ratio = f'Ratio = Sum(after) / Sum(before) = {sum_after/sum_before:0.2f}'
print(ratio)
print('')
# Print stats
prints.print_weights(weights=AIw0, y=np.zeros(len(AIw0)), output_file=output_file,
header=f'Step 5. Total weights sum renormalized [wrt. class ID = {RN_ID}] | {ratio}', write_mode='a')
# ===================================================
# Save data to a parquet file
tau_str = str(np.round(tau, 3))
# Create DataFrame
w_post_S2 = copy.deepcopy(weights)
w_post_S2[y == C0] = AIw0 # only C0
df = pd.DataFrame(w_post_S2, columns=[f'w_post_S2__tau_{tau_str}'])
# ===================================================
# Visualization
# ---------------------------------------------------
chi2_table = PrettyTable(["observable", "ndf", "chi2 / ndf", "(AI) chi2 / ndf"])
# ---------------------------------------------------
param = {
'i': None,
'X': None,
'y': None,
'weights': None,
'ids': None,
'AIw0': None,
'C0': C0,
'C1': C1,
'label': label,
'param': param,
'local_dir': targetdir
}
# Loop over each observable (specific column of X)
paramlist = []
for i in pick_ind:
paramlist.append(copy.deepcopy(param))
# big numpy arrays should share the memory (i.e. no copy of X)
paramlist[-1]['i'] = i
paramlist[-1]['X'] = X
paramlist[-1]['y'] = y
paramlist[-1]['weights'] = weights
paramlist[-1]['ids'] = ids
paramlist[-1]['AIw0'] = AIw0
# Start multiprocessing
num_workers = multiprocessing.cpu_count() // 2 if num_cpus == 0 else num_cpus
print(f'Multiprocessing {len(pick_ind)} plots with {num_workers} processes', 'yellow')
pool = multiprocessing.Pool(processes=num_workers)
tic = time.time()
chi2 = pool.map(multiprocess_AIRW_wrapper, paramlist)
pool.close() # no more tasks
pool.join() # wrap up current tasks
toc = time.time()
print(f'Took {toc-tic:0.2f} sec')
# Collect summary statistics from the pool
total_ndf = 0
total_ndf_AI = 0
total_chi2 = 0.0
total_chi2_AI = 0.0
print('chi2 after pool.map: ')
print(chi2)
for i in range(len(chi2)):
# Add table row
row = [chi2[i]['id'], chi2[i]['ndf_0'], chi2[i]['chi2_0'] / chi2[i]['ndf_0'], chi2[i]['chi2_0_AI'] / chi2[i]['ndf_0_AI']]
chi2_table.add_row(row, divider=True if i == len(chi2)-1 else False)
total_ndf += chi2[i]['ndf_0']
total_ndf_AI += chi2[i]['ndf_0_AI']
total_chi2 += chi2[i]['chi2_0']
total_chi2_AI += chi2[i]['chi2_0_AI']
# Add rows
chi2_table.add_row(['total', f'{total_ndf}', f'{total_chi2/total_ndf:0.1f}', f'{total_chi2_AI/total_ndf_AI:0.1f}'])
chi2.append({'id': 'total', 'ndf_0': total_ndf, 'ndf_0_AI': total_ndf_AI, 'chi2_0': total_chi2, 'chi2_0_AI': total_chi2_AI})
# -----------------------------------------------
filename = os.path.join(targetdir, f"stats_chi2.log")
open(filename, 'w').close() # Clear content
table_writer(filename=filename, label=label, sublabel=sublabel, tau=tau, chi2_table=chi2_table, print_to_screen=True)
# -----------------------------------------------
return chi2_table, df, chi2
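# Weight-transform sketch (illustrative numpy-only outline of the weight-transform chain in
# plot_AIRW above, not the exact library call; with a density-ratio classifier the plain
# likelihood-ratio weight is w = p / (1 - p) = exp(logit), which
# reweight.rw_transform_with_logits generalizes via `mode`; EPS value below is hypothetical):
#
#   logits = aux.inverse_sigmoid(p=y_pred[y == C0], EPS=1e-12)  # if y_pred are probabilities
#   logits = logits / tau                                       # temperature scaling
#   AIw0   = np.exp(logits)                                     # plain likelihood-ratio weights
#   AIw0   = np.clip(AIw0, 0.0, maxW)                           # regularize anomalous tails
#   AIw0   = AIw0 * weights[y == C0]                            # combine with input event weights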
def multiprocess_AIRW_wrapper(p):
"""
Multiprocessing plots
"""
i = p['i'] # Column index
X = p['X']
y = p['y']
w = p['weights']
ids = p['ids']
AIw0 = p['AIw0']
C0 = p['C0']
C1 = p['C1']
label = p['label']
param = p['param']
local_dir = p['local_dir']
X_col = copy.deepcopy(X[:, i])
bins = binengine(bindef=param['edges'], x=X_col)
# ----------------------------------------------------
obs_X = {
# Axis limits
'xlim' : (bins[0], bins[-1]),
'ylim' : None,
'xlabel' : f'{ids[i]}',
'ylabel' : r'Weighted counts',
'units' : {'x': r'a.u.', 'y' : r'1'},
'label' : f'{label}',
'figsize' : (4, 3.75),
# Ratio
'ylim_ratio' : (0.75, 1.25),
# Histogramming
'bins' : None,
'density' : False,
# Function to calculate
'func' : None
}
data_template = {
'data' : None,
'weights': None,
'label' : 'Data',
'hfunc' : 'errorbar',
'style' : iceplot.errorbar_style,
'obs' : obs_X,
'hdata' : None,
'color' : None
}
# Data source <-> Observable collections
class1 = data_template.copy() # Shallow copies of the template (fields overwritten below)
class0 = data_template.copy()
class0_ML = data_template.copy()
class1.update({
'label' : '$C_1$',
'hfunc' : 'hist',
'style' : iceplot.hist_style_step,
'color' : (0,0,0)
})
class0.update({
'label' : '$C_0$',
'hfunc' : 'hist',
'style' : iceplot.hist_style_step,
'color' : iceplot.imperial_green
})
class0_ML.update({
'label' : '$C_0$ (AI)',
'hfunc' : 'hist',
'style' : iceplot.hist_style_step,
'color' : iceplot.imperial_dark_red
})
data = [class1, class0, class0_ML]
data[0]['hdata'] = iceplot.hist_obj(X_col[y == C1], bins=bins, weights=w[y == C1])
data[1]['hdata'] = iceplot.hist_obj(X_col[y == C0], bins=bins, weights=w[y == C0])
data[2]['hdata'] = iceplot.hist_obj(X_col[y == C0], bins=bins, weights=AIw0)
# Update labels with chi2
chi2_0, ndf_0 = iceplot.chi2_cost(h_mc=data[1]['hdata'], h_data=data[0]['hdata'], return_nbins=True)
chi2_0_AI, ndf_0_AI = iceplot.chi2_cost(h_mc=data[2]['hdata'], h_data=data[0]['hdata'], return_nbins=True)
data[1]['label'] += f' | $\\chi^2 = {chi2_0:0.0f} \\, / \\, {ndf_0} = {chi2_0/ndf_0:0.1f}$'
data[2]['label'] += f' | $\\chi^2 = {chi2_0_AI:0.0f} \\, / \\, {ndf_0_AI} = {chi2_0_AI/ndf_0_AI:0.1f}$'
# --------------------
# Plot it
fig0, ax0 = iceplot.superplot(data, ratio_plot=True, yscale='log', ratio_error_plot=True)
fig1, ax1 = iceplot.superplot(data, ratio_plot=True, yscale='linear', ratio_error_plot=True)
fig0.savefig(aux.makedir(local_dir + '/pdf/') + f'/reweight_[{ids[i]}]__log.pdf', bbox_inches='tight')
fig0.savefig(aux.makedir(local_dir + '/png/') + f'/reweight_[{ids[i]}]__log.png', bbox_inches='tight', dpi=300)
fig1.savefig(aux.makedir(local_dir + '/pdf/') + f'/reweight_[{ids[i]}]__linear.pdf', bbox_inches='tight')
fig1.savefig(aux.makedir(local_dir + '/png/') + f'/reweight_[{ids[i]}]__linear.png', bbox_inches='tight', dpi=300)
fig0.clf()
fig1.clf()
plt.close(fig0)
plt.close(fig1)
return {'id': ids[i], 'ndf_0': ndf_0, 'ndf_0_AI': ndf_0_AI, 'chi2_0': chi2_0, 'chi2_0_AI': chi2_0_AI}