Source code for icenet.tools.plots

# Plotting functions
#
# m.mieskolainen@imperial.ac.uk, 2025

import gc
import os
import copy
import multiprocessing
import time

from pprint import pprint
from prettytable import PrettyTable

import matplotlib.pyplot as plt
from matplotlib.colors import to_hex
from matplotlib.gridspec import GridSpec

import numpy as np
import awkward as ak
import torch
from tqdm import tqdm
import pandas as pd

from iceplot import iceplot
from icefit import statstools, cortools
from icenet.tools import aux, reweight, prints

# ------------------------------------------
from icenet import print
# ------------------------------------------


def binengine(bindef, x):
    """
    Binning processor function

    Args:
        bindef: binning definition
        x:      data input array

    Examples:
        50                                                     (number of bins, integer)
        [1.0, 40.0, 50.0]                                      (list of explicit edges)
        {'nbin': 30, 'q': [0.0, 0.95], 'space': 'linear'}      (automatic with quantiles)
        {'nbin': 30, 'minmax': [2.0, 50.0], 'space': 'log10'}  (automatic with boundaries)

    Returns:
        edges: binning edges
    """
    if len(x) == 0:
        print(f'Input is zero-array, returning [0,0]')
        return np.array([0, 0])

    def mylinspace(minval, maxval, nbin):
        if 'space' in bindef and bindef['space'] == 'log10':
            if minval <= 0:
                raise Exception(__name__ + f'.binengine: Trying log10 binning with minimum = {minval} <= 0')
            return np.logspace(np.log10(minval), np.log10(maxval), nbin + 1)
        else:
            return np.linspace(minval, maxval, nbin + 1)

    # Integer or float: number of bins over the data range
    if type(bindef) is int or type(bindef) is float:
        return np.linspace(np.min(x), np.max(x), int(bindef) + 1)

    # Bin edges given directly
    elif type(bindef) is list:
        return np.array(bindef)

    elif type(bindef) is np.ndarray:
        return bindef

    # Automatic with quantiles
    elif type(bindef) is dict and 'q' in bindef:
        return mylinspace(minval=np.percentile(x, 100 * bindef['q'][0]),
                          maxval=np.percentile(x, 100 * bindef['q'][1]),
                          nbin=bindef['nbin'])

    # Automatic with boundaries
    elif type(bindef) is dict and 'minmax' in bindef:
        return mylinspace(minval=bindef['minmax'][0],
                          maxval=bindef['minmax'][1],
                          nbin=bindef['nbin'])

    else:
        raise Exception(__name__ + f'.binengine: Unknown binning description in {bindef}')

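# Illustrative usage sketch of binengine (not part of the original module);
# the binning definitions below follow the docstring examples above:
#
#   x = np.random.exponential(scale=5.0, size=10_000)
#   binengine(50, x)                                    # 51 linear edges over [min(x), max(x)]
#   binengine([0.0, 1.0, 5.0, 20.0], x)                 # explicit edges, returned as np.ndarray
#   binengine({'nbin': 30, 'q': [0.0, 0.95]}, x)        # 31 linear edges between the 0% and 95% quantiles
#   binengine({'nbin': 30, 'minmax': [2.0, 50.0], 'space': 'log10'}, x)  # 31 log10-spaced edges
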
def plot_selection(X, mask, ids, plotdir, label, varlist, density=True, library='np'):
    """
    Plot selection before / after type histograms against all chosen variables

    Args:
        X       : data array (N events x D dimensions)
        mask    : boolean selection indices (N)
        ids     : variable string array (D)
        plotdir : plotting directory
        label   : a string label
        varlist : a list of variables to be plotted (from ids)
        density : normalize all histograms to unit density
        library : 'np' or 'ak'
    """
    for var in tqdm(varlist):

        if var not in ids:
            continue

        # Histogram (autobinning)
        if library == 'np':
            before = np.asarray(X[:, ids.index(var)])
            after  = np.asarray(X[mask, ids.index(var)])
        elif library == 'ak':
            before = ak.to_numpy(X[:][var])
            after  = ak.to_numpy(X[mask][var])
        else:
            raise Exception(__name__ + f'.plot_selection: Unknown library: {library}')

        counts1, errs1, bins, cbins = iceplot.hist(before, bins=100,  density=density)
        counts2, errs2, bins, cbins = iceplot.hist(after,  bins=bins, density=density)

        # Plot
        obs_x = {
            'xlim'       : (np.min(bins), np.max(bins)),
            'ylim'       : None,
            'ylim_ratio' : (0.7, 1.3),
            'xlabel'     : var,
            'ylabel'     : r'Counts',
            'units'      : {'x': None, 'y': r'counts'},
            'density'    : density,
            'figsize'    : (4, 3.75)
        }
        fig, ax = iceplot.create_axes(**obs_x, ratio_plot=True)

        label1 = 'before cuts'
        label2 = 'after cuts'

        ax[0].hist(x=cbins, bins=bins, weights=counts1, color=(0,0,0),
                   label=label1, **iceplot.hist_style_step)
        ax[0].hist(x=cbins, bins=bins, weights=counts2, color=(1,0,0), alpha=0.5,
                   label=label2, **iceplot.hist_style_step)

        iceplot.ordered_legend(ax=ax[0], order=[label1, label2])
        iceplot.plot_horizontal_line(ax[1])

        ax[1].hist(x=cbins, bins=bins, weights=counts2 / (counts1 + 1E-30),
                   color=(1,0,0), alpha=0.5, label='ratio', **iceplot.hist_style_step)

        # Save it
        targetdir = aux.makedir(f'{plotdir}/cuts/{label}')
        fig.savefig(f'{targetdir}/{var}.pdf', bbox_inches='tight')

        ax[0].set_yscale('log')
        fig.savefig(f'{targetdir}/{var}--log.pdf', bbox_inches='tight')

        # --------
        fig.clf()
        plt.close()
        gc.collect()

def plot_matrix(XY, x_bins, y_bins, vmin=0, vmax=None, cmap='RdBu', figsize=(4,3), grid_on=False):
    """
    Visualize matrix.
    """
    fig, ax = plt.subplots(figsize=figsize)
    ax.grid(grid_on)

    x, y = np.meshgrid(x_bins, y_bins)

    # Note transpose
    c = ax.pcolormesh(x, y, XY.T, cmap=cmap, vmin=vmin, vmax=vmax,
                      antialiased=True, linewidth=0.0)
    ax.axis([x.min(), x.max(), y.min(), y.max()])

    return fig, ax, c

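# Illustrative usage sketch of plot_matrix (not part of the original module).
# XY is indexed as XY[i,j] with i along x_bins and j along y_bins, hence the
# transpose inside the function before pcolormesh:
#
#   XY     = np.random.rand(10, 8)
#   x_bins = np.linspace(0.0, 1.0, 11)   # 10 bins -> 11 edges
#   y_bins = np.linspace(0.0, 2.0, 9)    #  8 bins ->  9 edges
#   fig, ax, c = plot_matrix(XY, x_bins, y_bins, vmax=1.0)
#   fig.colorbar(c, ax=ax)
#   fig.savefig('matrix.pdf', bbox_inches='tight')
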
def plot_train_evolution_multi(losses, trn_aucs, val_aucs, label,
                               aspect=0.85, yscale='linear', xscale='linear'):
    """
    Training evolution plots.

    Args:
        losses:   loss values in a dictionary
        trn_aucs: training metrics (or None)
        val_aucs: validation metrics (or None)
        label:    plot title label

    Returns:
        fig: figure handle
        ax:  figure axis
    """
    fig, ax = plt.subplots(1, 2, figsize=(10, 7.5))
    fig.subplots_adjust(wspace=0.3)  # Add horizontal space between subplots

    # Count how many loss terms
    N_terms = sum(1 for key in losses if 'eval' in key)

    for key in losses:
        if ('sum' in key) and (N_terms == 2):
            continue  # Skip sum if redundant
        txt = key.replace('eval', 'validate') if 'eval' in key else key
        ax[0].plot(losses[key], label=txt)

    ax[0].set_xlabel('k (epoch)')
    ax[0].set_ylabel('loss')
    ax[0].legend(fontsize=6)
    ax[0].set_title(f'{label}', fontsize=10)
    ax[0].set_yscale(yscale)
    ax[0].set_xscale(xscale)
    ax[0].set_aspect(1.0 / ax[0].get_data_ratio() * aspect)
    ax[0].autoscale(enable=True, axis='x', tight=True)

    # Plot AUCs only if they are not None
    auc_labels = []
    if trn_aucs is not None:
        ax[1].plot(trn_aucs)
        auc_labels.append('train')
    if val_aucs is not None:
        ax[1].plot(val_aucs)
        auc_labels.append('validate')

    if auc_labels:
        ax[1].legend(auc_labels, fontsize=8)
        ax[1].set_ylabel('AUC')

        ymin = 0.0
        ymax = 1.0
        if trn_aucs is not None and len(trn_aucs) > 0:
            ymin = min(ymin, np.min(trn_aucs))
        if val_aucs is not None and len(val_aucs) > 0:
            ymin = min(ymin, np.min(val_aucs))
        ax[1].set_ylim([ymin, ymax])
    else:
        ax[1].set_ylabel('AUC (no data)')

    ax[1].set_xlabel('k (epoch)')
    ax[1].grid(True)
    ax[1].set_yscale(yscale)
    ax[1].set_xscale(xscale)
    ax[1].autoscale(enable=True, axis='x', tight=True)
    ax[1].set_aspect(1.0 / ax[1].get_data_ratio() * aspect)

    return fig, ax

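# Illustrative usage sketch of plot_train_evolution_multi (not part of the
# original module); keys containing 'eval' are renamed to 'validate' in the legend:
#
#   losses = {'train: BCE': [0.9, 0.6, 0.5], 'eval: BCE': [0.95, 0.7, 0.65]}
#   fig, ax = plot_train_evolution_multi(losses=losses,
#                                        trn_aucs=[0.60, 0.72, 0.78],
#                                        val_aucs=[0.58, 0.70, 0.75],
#                                        label='XGB model')
#   fig.savefig('train_evolution.pdf', bbox_inches='tight')
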
def binned_2D_AUC(y_pred, y, X_kin, ids_kin, X, ids, edges, label,
                  weights=None, VAR: list=['trk_pt', 'trk_eta']):
    """
    Evaluate AUC per 2D-bin.

    Args:
        y_pred  : MVA algorithm output
        y       : Output (truth level target) data
        X_kin   : Data
        ids_kin : Variables
        X       : Data
        ids     : Variables
        edges   : Edges of the A,B-space cells (2D array)
        label   : Label of the classifier (string)
        weights : Sample weights
        VAR     : Variable identifiers (two)

    Returns:
        fig,ax  : Figure handle and axis
        met     : Metrics object
    """
    edges_A = edges[0]
    edges_B = edges[1]

    AUC = np.zeros((len(edges_A)-1, len(edges_B)-1))

    if len(y_pred) != len(y):
        raise Exception(__name__ + f'.binned_2D_AUC: len(y_pred) = {len(y_pred)} != len(y) = {len(y)}')

    # Loop over bins
    for i in range(len(edges_A)-1):
        for j in range(len(edges_B)-1):

            range_A = [edges_A[i], edges_A[i+1]]
            range_B = [edges_B[j], edges_B[j+1]]

            # Indices
            try:
                ind = np.logical_and(aux.pick_ind(X_kin[:, ids_kin.index(VAR[0])], range_A),
                                     aux.pick_ind(X_kin[:, ids_kin.index(VAR[1])], range_B))
            except:
                try:
                    ind = np.logical_and(aux.pick_ind(X[:, ids.index(VAR[0])], range_A),
                                         aux.pick_ind(X[:, ids.index(VAR[1])], range_B))
                except:
                    raise Exception(__name__ + f'.binned_2D_AUC: Cannot find the variables {VAR}')

            string = f'{VAR[0]} = [{range_A[0]:10.3f},{range_A[1]:10.3f}), {VAR[1]} = [{range_B[0]:10.3f},{range_B[1]:10.3f})'

            if np.sum(ind) > 0:  # Do we have any events in this cell

                # Evaluate metric
                if weights is not None:
                    met = aux.Metric(y_true=y[ind], y_pred=y_pred[ind], weights=weights[ind])
                else:
                    met = aux.Metric(y_true=y[ind], y_pred=y_pred[ind])

                print(f'{string} | AUC = {met.auc:.5f}')
                AUC[i,j] = met.auc
            else:
                print(f'{string} | No events found in this cell!')

    # Evaluate total performance
    met = aux.Metric(y_true=y, y_pred=y_pred, weights=weights)

    # Finally plot it
    fig, ax = plot_AUC_matrix(AUC=AUC, edges_A=edges_A, edges_B=edges_B)
    ax.set_title(f'{label} | AUC = {met.auc:.3f} (integrated)', fontsize=9)
    ax.set_xlabel(f"{VAR[0]}")
    ax.set_ylabel(f"{VAR[1]}")

    return fig, ax, met

def binned_1D_AUC(y_pred, y, X_kin, ids_kin, X, ids, edges,
                  weights=None, VAR: str='trk_pt', num_bootstrap=0):
    """
    Evaluate AUC & ROC per 1D-bin.

    Args:
        y_pred        : MVA algorithm output
        y             : Output (truth level target) data
        X_kin         : Data
        ids_kin       : Variables (strings)
        X             : Data
        ids           : Variables (strings)
        edges         : Edges of the space cells
        weights       : Sample weights
        VAR           : Variable identifier to pick (one)
        num_bootstrap : Number of bootstrap samples

    Returns:
        METS          : List of Metric objects (per bin, None for empty bins)
        LABELS        : List of bin label strings
    """
    AUC = np.zeros((len(edges)-1, 1))

    if len(y_pred) != len(y):
        raise Exception(__name__ + f'.binned_1D_AUC: len(y_pred) = {len(y_pred)} != len(y) = {len(y)}')

    METS   = []
    LABELS = []

    # Loop over bins
    for i in range(len(edges)-1):

        range_ = [edges[i], edges[i+1]]

        # Indices (either from X_kin or X)
        try:
            ind = aux.pick_ind(X_kin[:, ids_kin.index(VAR)], range_)
        except:
            try:
                ind = aux.pick_ind(X[:, ids.index(VAR)], range_)
            except:
                raise Exception(__name__ + f'.binned_1D_AUC: Cannot find the variable {VAR}')

        string = f'{VAR} = [{range_[0]:10.3f},{range_[1]:10.3f})'

        if np.sum(ind) > 0:  # Do we have any events in this cell

            # Evaluate metric
            if weights is not None:
                met = aux.Metric(y_true=y[ind], y_pred=y_pred[ind], weights=weights[ind],
                                 num_bootstrap=num_bootstrap)
            else:
                met = aux.Metric(y_true=y[ind], y_pred=y_pred[ind],
                                 num_bootstrap=num_bootstrap)

            AUC[i] = met.auc
            METS.append(met)
            print(f'{string} | AUC = {AUC[i]}')
        else:
            METS.append(None)
            print(f'{string} | No events found in this cell!')

        LABELS.append(f'{VAR}$ \\in [{range_[0]:.1f},{range_[1]:.1f})$')

    return METS, LABELS

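# Illustrative usage sketch of binned_1D_AUC (not part of the original module),
# assuming pre-existing arrays y_pred, y, X_kin, ids_kin, X, ids; METS[i] is
# None for empty bins, which ROC_plot below tolerates:
#
#   edges = np.array([0.0, 10.0, 20.0, 50.0])
#   METS, LABELS = binned_1D_AUC(y_pred=y_pred, y=y, X_kin=X_kin, ids_kin=ids_kin,
#                                X=X, ids=ids, edges=edges, VAR='trk_pt')
#   ROC_plot(metrics=METS, labels=LABELS, filename='ROC_binned')
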
def density_MVA_wclass(y_pred, y, label, weights=None, class_ids=None, edges=80, path='', **kwargs):
    """
    Evaluate MVA output (1D) density per class.

    Args:
        y_pred    : MVA algorithm output
        y         : Output (truth level target) data
        label     : Label of the MVA model (string)
        weights   : Sample weights
        class_ids : Class IDs to plot
        edges     : Histogram edges list (or number of bins, as an alternative)
        path      : Save path

    Returns:
        Plot pdf saved directly
    """
    if class_ids is None:
        class_ids = np.unique(y.astype(int))

    # Make sure it is 1-dim array of length N (not N x num classes)
    if (weights is not None) and len(weights.shape) > 1:
        weights = np.sum(weights, axis=1)

    if weights is not None:
        classlegs = [f'$\\mathcal{{C}} = {k}$, $N={np.sum(y == k)}$ ($\\Sigma_w = {np.sum(weights[y == k]):0.1f}$)' for k in class_ids]
    else:
        classlegs = [f'$\\mathcal{{C}} = {k}$, $N={np.sum(y == k)}$ (no weights)' for k in class_ids]

    # Handle logits vs probabilities
    THRESH = 1E-5
    if np.min(y_pred) < (-THRESH) or np.max(y_pred) > (1.0 + THRESH):
        logit = copy.deepcopy(y_pred)
        prob  = aux.sigmoid(logit)
    else:
        prob  = copy.deepcopy(y_pred)
        logit = aux.inverse_sigmoid(prob)

    # Plot both
    for mode in ['logit', 'prob']:

        # Over classes
        fig, ax = plt.subplots()

        for k in class_ids:
            ind = (y == k)

            if np.sum(ind) == 0:
                print(f'No samples for class {k} -- continue')
                continue

            w = weights[ind] if weights is not None else None
            x = logit[ind] if mode == 'logit' else prob[ind]

            hI, bins, patches = plt.hist(x=x, bins=binengine(bindef=edges, x=x), weights=w,
                                         density=True, histtype='step', fill=False, linewidth=2)

        plt.legend(classlegs, loc='upper center', fontsize=7)
        plt.xlabel(f'MVA output $f(\\mathbf{{x}})$ [{mode}]')
        plt.ylabel('density')
        plt.title(f'{label}', fontsize=9)

        for scale in ['linear', 'log']:
            try:
                ax.set_yscale(scale)
                outputdir = aux.makedir(f'{path}')
                savepath  = f'{outputdir}/MVA-output__{mode}__{scale}.pdf'
                plt.savefig(savepath, bbox_inches='tight')
                print(f'Saved: "{savepath}"', 'green')
            except Exception as e:
                print(e, 'red')
                print(f'Plotting failed for {label} -- continue', 'red')
                continue

        # --------
        fig.clf()
        plt.close(fig)
        gc.collect()

def plot_correlation_comparison(corr_mstats, targetdir, xlim=None):
    """
    Plot collected correlation metrics from density_COR_wclass()

    Args:
        corr_mstats: statistics dictionary
        targetdir:   output directory
        xlim:        plot limits dictionary per class

    Returns:
        plots saved to a directory
    """
    print(f'Plotting correlation comparisons ...')

    class_ids = {}

    ## Find all variables
    all_var = []
    for model in corr_mstats.keys():
        for category in corr_mstats[model].keys():
            for c in corr_mstats[model][category].keys():
                class_ids = corr_mstats[model][category].keys()  # This is overwritten (all same)
                for var in corr_mstats[model][category][c].keys():
                    all_var.append(var)

    all_var = set(all_var)  # Reduce to a set (values appear only once)

    ## Over all variables
    for var in all_var:

        # Over classes
        for c in class_ids:

            # Over different statistical metrics
            for stats in ['pearson', 'abs_pearson', 'disco', 'MI']:

                fig, ax = plt.subplots()

                # Per model
                for model in corr_mstats.keys():

                    categories = list(corr_mstats[model].keys())

                    values = np.nan * np.ones(len(categories))
                    lower  = np.nan * np.ones(len(categories))
                    upper  = np.nan * np.ones(len(categories))

                    ## Over each powerset category
                    for i in range(len(categories)):
                        x = corr_mstats[model][categories[i]][c]

                        if x:  # We have some stats
                            values[i] = x[var][f'{stats}']
                            lower[i]  = x[var][f'{stats}_CI'][0]
                            upper[i]  = x[var][f'{stats}_CI'][1]

                    error = (upper - lower) / 2  # Simple symmetric errors

                    #lower = np.abs(values - np.array(lower))
                    #upper = np.abs(values - np.array(upper))
                    #asymmetric_error = np.array(list(zip(lower, upper))).T

                    # Vertical line at zero
                    plt.plot(np.zeros(len(values)), np.arange(len(values)),
                             color=np.ones(3)*0.5, label=None)

                    ## Plot horizontal plot i.e. values +- (lower, upper) on x-axis, category on y-axis
                    legend_label = f'{model} [{np.median(values):0.3f}]'
                    plt.errorbar(values, np.arange(len(values)), xerr=error,
                                 fmt='s', capsize=5.0, label=legend_label)

                title = f'$\\mathcal{{C}} = {c}$'
                plt.title(title)
                plt.xlabel(f'{stats}$_{{XY}}$ (MVA score, {var}) (68CL)')

                if xlim is not None:
                    ax.set_xlim(xlim[stats][c])

                ax.set_yticks(np.arange(len(values)))
                ax.set_yticklabels(categories)
                ax.invert_yaxis()

                plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))

                filename = aux.makedir(targetdir + f'/COR/')
                plt.savefig(filename + f'var-{var}--stats-{stats}--class-{c}.pdf',
                            bbox_inches='tight')
                plt.close()

def density_COR_wclass(y_pred, y, X, ids, label, weights=None, class_ids=None,
                       edges=[[50], [50]], density=True, path='', cmap='Oranges', **kwargs):
    """
    Evaluate the 2D-density of the MVA algorithm output vs other variables per class.

    Args:
        y_pred    : MVA algorithm output
        y         : Output (truth level target) data
        X         : Variables to be plotted
        ids       : Identifiers of the variables in X
        label     : Label of the MVA model (string)
        weights   : Sample weights
        class_ids : Class ids to plot
        edges     : Histogram edges list (or number of bins, as an alternative) (2D)
        density   : Normalize to density
        path      : Save path
        cmap      : Color map

    Returns:
        correlation values in a dictionary (per variable, per class)
        plots are saved directly
    """
    if class_ids is None:
        class_ids = np.unique(y.astype(int))

    # Make sure it is 1-dim array of length N (not N x num classes)
    if (weights is not None) and len(weights.shape) > 1:
        weights = np.sum(weights, axis=1)

    if weights is not None:
        classlegs = [f'$\\mathcal{{C}} = {k}$, $N={np.sum(y == k)}$ (weighted {np.sum(weights[y == k]):0.1f})' for k in class_ids]
    else:
        classlegs = [f'$\\mathcal{{C}} = {k}$, $N={np.sum(y == k)}$ (no weights)' for k in class_ids]

    output = {}

    # Over classes
    for k in class_ids:

        output[k] = {}

        ind = (y == k)
        w   = weights[ind] if weights is not None else None

        if np.sum(ind) == 0:
            print(f'No samples for class {k} -- continue')
            continue

        # Loop over variables
        for var in ids:

            xx   = y_pred[ind]
            yy   = X[ind, ids.index(var)]
            bins = [binengine(bindef=edges[0], x=xx), binengine(bindef=edges[1], x=yy)]

            # Apply histogram bounds
            box = np.logical_and(np.logical_and(xx > bins[0][0], xx < bins[0][-1]),
                                 np.logical_and(yy > bins[1][0], yy < bins[1][-1]))
            xx = xx[box]
            yy = yy[box]
            ww = w[box] if w is not None else None

            # Pearson correlation coefficient
            cc, cc_CI, _ = cortools.pearson_corr(x=xx, y=yy, weights=ww)

            # Absolute Pearson correlation coefficient
            cc_abs, cc_abs_CI, _ = cortools.pearson_corr(x=xx, y=yy, weights=ww, return_abs=True)

            # Distance correlation
            disco, disco_CI = cortools.distance_corr(x=xx, y=yy, weights=ww)

            # Histogram based MI
            MI, MI_CI = cortools.mutual_information(x=xx, y=yy, weights=ww, automethod='Scott2D')

            # Neural Mutual Information [cannot use, unreliable to compute for small samples]
            #from icefit import mine
            #MI,MI_err = mine.estimate(X=xx, Z=yy, weights=ww)
            #MI_CI = np.array([MI-MI_err, MI+MI_err])

            # Save output
            output[k][var] = {}
            output[k][var]['pearson']        = cc
            output[k][var]['pearson_CI']     = cc_CI
            output[k][var]['abs_pearson']    = cc_abs
            output[k][var]['abs_pearson_CI'] = cc_abs_CI
            output[k][var]['disco']          = disco
            output[k][var]['disco_CI']       = disco_CI
            output[k][var]['MI']             = MI
            output[k][var]['MI_CI']          = MI_CI

            for scale in ['linear', 'log']:

                fig, ax = plt.subplots()

                outputdir = aux.makedir(f'{path}')
                savepath  = f'{outputdir}/var-{var}--class-{k}--{scale}.pdf'

                try:
                    if scale == 'log':
                        import matplotlib as mpl
                        h2, xedges, yedges, im = plt.hist2d(x=xx, y=yy, bins=bins, weights=ww,
                                                            density=density, norm=mpl.colors.LogNorm(),
                                                            cmap=plt.get_cmap(cmap))
                    else:
                        h2, xedges, yedges, im = plt.hist2d(x=xx, y=yy, bins=bins, weights=ww,
                                                            density=density, cmap=plt.get_cmap(cmap))

                    fig.colorbar(im)
                    plt.xlabel(f'MVA output $f(\\mathbf{{x}})$')
                    plt.ylabel(f'{var}')

                    rho_value = f'$\\rho_{{XY}} = {cc:0.2f}_{{-{cc-cc_CI[0]:0.2f}}}^{{+{cc_CI[1]-cc:0.2f}}}$'
                    MI_value  = f'$\\mathcal{{I}}_{{XY}} = ({MI_CI[0]:0.3f}, {MI_CI[1]:0.3f})$'

                    plt.title(f'{label} | $\\mathcal{{C}} = {k}$ | {rho_value} | {MI_value}', fontsize=9)

                    # -----
                    plt.savefig(savepath, bbox_inches='tight')
                    print(f'Saved: "{savepath}"', 'green')
                    # -----

                    fig.clf()
                    plt.close()
                    gc.collect()

                except:  # Matplotlib LogNorm() can be buggy
                    print(f'Failed to produce plot {savepath}')

    pprint(output)
    return output

def density_COR(y_pred, X, ids, label, weights=None, hist_edges=[[50], [50]], path='', cmap='Oranges'):
    """
    Evaluate the 2D-density of the MVA algorithm output vs other variables.

    Args:
        y_pred     : MVA algorithm output
        X          : Variables to be plotted
        ids        : Identifiers of the variables in X
        label      : Label of the MVA model (string)
        weights    : Sample weights
        hist_edges : Histogram edges list (or number of bins, as an alternative) (2D)
        path       : Save path
        cmap       : Color map

    Returns:
        Plot pdf saved directly
    """
    # Make sure it is 1-dim array of length N (not N x num classes)
    if (weights is not None) and len(weights.shape) > 1:
        weights = np.sum(weights, axis=1)

    # Loop over variables
    for var in ids:

        fig, ax = plt.subplots()

        # Plot 2D
        xx   = y_pred
        yy   = X[:, ids.index(var)]
        bins = [binengine(bindef=hist_edges[0], x=xx), binengine(bindef=hist_edges[1], x=yy)]

        h2, xedges, yedges, im = plt.hist2d(x=xx, y=yy, bins=bins, weights=weights,
                                            cmap=plt.get_cmap(cmap))
        fig.colorbar(im)
        plt.xlabel(f'MVA output $f(\\mathbf{{x}})$')
        plt.ylabel(f'{var}')
        plt.title(f'{label}', fontsize=10)

        # -----
        outputdir = aux.makedir(f'{path}/{label}')
        savepath  = f'{outputdir}/var-{var}.pdf'
        plt.savefig(savepath, bbox_inches='tight')
        print(f'Saved: "{savepath}"', 'green')
        # -----

        fig.clf()
        plt.close()
        gc.collect()

def annotate_heatmap(X, ax, xlabels, ylabels, x_rot=90, y_rot=0, decimals=1, color="w"):
    """
    Add text annotations to a matrix heatmap plot
    """
    ax.set_xticks(np.arange(0, len(xlabels), 1))
    ax.set_yticks(np.arange(0, len(ylabels), 1))

    ax.set_xticklabels(labels=xlabels, rotation=x_rot, fontsize='xx-small')
    ax.set_yticklabels(labels=ylabels, rotation=y_rot, fontsize='xx-small')

    # Loop over data dimensions and create text annotations.
    for i in range(len(ylabels)):
        for j in range(len(xlabels)):
            if decimals == 0:
                text = ax.text(j, i, f'{X[i,j]:.0f}', ha="center", va="center", color=color)
            if decimals == 1:
                text = ax.text(j, i, f'{X[i,j]:.1f}', ha="center", va="center", color=color)
            if decimals == 2:
                text = ax.text(j, i, f'{X[i,j]:.2f}', ha="center", va="center", color=color)

    return ax

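# Illustrative usage sketch of annotate_heatmap (not part of the original module);
# overlay numeric values on top of an imshow heatmap:
#
#   C = np.random.rand(3, 4) * 100
#   fig, ax = plt.subplots()
#   ax.imshow(C)
#   ax = annotate_heatmap(X=C, ax=ax, xlabels=['a','b','c','d'],
#                         ylabels=['u','v','w'], decimals=0, x_rot=0)
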
def plot_AUC_matrix(AUC, edges_A, edges_B):
    """
    Plot AUC matrix.

    Args:
        AUC:     AUC-ROC matrix
        edges_A: Histogram edges of variable A
        edges_B: Histogram edges of variable B

    Returns:
        fig:     figure handle
        ax:      figure axis
    """
    fig, ax = plt.subplots()

    xlabels = [f'[{edges_A[k]},{edges_A[k+1]})' for k in range(len(edges_A) - 1)]
    ylabels = [f'[{edges_B[k]},{edges_B[k+1]})' for k in range(len(edges_B) - 1)]

    ax.imshow(AUC.transpose(), origin='lower')
    ax.set_title('AUC x 100', fontsize=10)

    ax = annotate_heatmap(X=AUC.transpose() * 100, ax=ax,
                          xlabels=xlabels, ylabels=ylabels,
                          decimals=0, x_rot=0, y_rot=0, color="w")
    return fig, ax

def multiprocess_plot_wrapper(p):
    """
    Multiprocessing wrapper
    """
    i = p['i']

    # Exclude special values
    ind = np.ones(len(p['X']), dtype=bool)
    for k in range(len(p['exclude_vals'])):
        ind = np.logical_and(ind, (p['X'][:,i] != p['exclude_vals'][k]))

    plotvar(x                = p['X'][ind,i],
            y                = p['y'][ind],
            weights          = p['weights'][ind],
            var              = p['ids'][i],
            nbins            = p['nbins'],
            percentile_range = p['percentile_range'],
            title            = p['title'],
            targetdir        = p['targetdir'],
            plot_unweighted  = p['plot_unweighted'])

def plotvars(X, y, ids, weights, nbins=70, percentile_range=[0.5, 99.5],
             exclude_vals=[None], plot_unweighted=True, title='', targetdir='.', num_cpus: int=0):
    """
    Plot all variables.
    """
    print(f'Creating plots | exclude_vals = {exclude_vals} ...')

    # Chunked loop [parallel processing]
    param = {
        'i': None, 'X': None, 'y': None, 'weights': None,
        'ids': ids, 'nbins': nbins, 'percentile_range': percentile_range,
        'exclude_vals': exclude_vals, 'plot_unweighted': plot_unweighted,
        'title': title, 'targetdir': targetdir
    }

    # Create parameter dictionaries for each process
    paramlist = []
    for i in range(X.shape[1]):
        paramlist.append(copy.deepcopy(param))
        paramlist[-1]['i']       = i
        paramlist[-1]['X']       = X  # big numpy arrays should share the memory (i.e. no copy of X)
        paramlist[-1]['y']       = y
        paramlist[-1]['weights'] = weights

    # Start multiprocessing
    print(f'Multiprocessing {X.shape[1]} plots', 'yellow')
    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count() // 2 if num_cpus == 0 else num_cpus)

    tic = time.time()
    pool.map(multiprocess_plot_wrapper, paramlist)
    pool.close()  # no more tasks
    pool.join()   # wrap up current tasks
    toc = time.time()

    print(f'Took {toc-tic:0.2f} sec')

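# Illustrative usage sketch of plotvars (not part of the original module);
# one PDF per column of X is written under targetdir (assumed to exist) via a
# process pool:
#
#   X       = np.random.randn(5000, 4)
#   y       = np.random.randint(0, 2, size=5000)
#   ids     = ['pt', 'eta', 'phi', 'mass']
#   weights = np.ones(5000)
#   plotvars(X=X, y=y, ids=ids, weights=weights, targetdir='./plots', num_cpus=4)
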
def plotvar(x, y, var, weights, nbins=70, percentile_range=[0.5, 99.5],
            plot_unweighted=True, title='', targetdir='.'):
    """
    Plot a single variable.
    """
    try:
        binrange = (np.percentile(x, percentile_range[0]),
                    np.percentile(x, percentile_range[1]))

        fig, axs = plot_reweight_result(X=x, y=y, nbins=nbins, binrange=binrange,
                                        weights=weights, title=title, xlabel=var,
                                        plot_unweighted=plot_unweighted)
        plt.savefig(f'{targetdir}/var-{var}.pdf', bbox_inches='tight')

        fig.clf()
        plt.close(fig)
        gc.collect()

    except Exception as e:
        print(e, 'red')
        print(f'Problem plotting variable "{var}"', 'red')

def plot_reweight_result(X, y, nbins, binrange, weights, title='', xlabel='x',
                         linewidth=1.5, plot_unweighted=True, cmap='Set1',
                         ylim_ratio=[0.7, 1.3]):
    """
    Plot class-wise weighted and unweighted histograms and their ratio
    to the last class, including propagated statistical uncertainties
    and shaded error bands.
    """
    class_ids = np.unique(y.astype(int))
    legends   = []

    scheme    = plt.get_cmap(cmap)
    color_unw = {}
    color_w   = {}
    for i, c in enumerate(class_ids):
        color_unw[c] = to_hex(scheme(2 * i % 20))
        color_w[c]   = to_hex(scheme((2 * i + 1) % 20))

    fig = plt.figure(figsize=(11, 6))
    gs  = GridSpec(2, 2, height_ratios=[2.5, 1], hspace=0.0)  # with tight vertical spacing

    ax_main  = [fig.add_subplot(gs[0, 0]), fig.add_subplot(gs[0, 1])]
    ax_ratio = [fig.add_subplot(gs[1, 0], sharex=ax_main[0]),
                fig.add_subplot(gs[1, 1], sharex=ax_main[1])]

    ref_counts = [None, None]
    ref_errs   = [None, None]
    bin_edges  = None
    color_map  = {}
    hist_data  = {0: {}, 1: {}}
    err_data   = {0: {}, 1: {}}

    min_x = +1e30
    max_x = -1e30

    # ------------------------------------------------------------
    # Main plots (linear, log)
    for idx, c in enumerate(class_ids):
        ind = (y == c)

        if np.sum(ind) == 0:
            print(f'No samples for class {c} -- continue')
            continue

        X_c  = X[ind]
        w_c  = weights[ind]
        w2_c = weights[ind] ** 2

        if plot_unweighted:
            counts_unw, edges = np.histogram(X_c, bins=nbins, range=binrange, weights=None)

        counts_w,  edges = np.histogram(X_c, bins=nbins, range=binrange, weights=w_c)
        counts_w2, _     = np.histogram(X_c, bins=nbins, range=binrange, weights=w2_c)
        errs = np.sqrt(counts_w2)

        bin_edges = edges
        min_x = min(min_x, edges[0])
        max_x = max(max_x, edges[-1])

        mu, std = aux.weighted_avg_and_std(values=X_c, weights=w_c)

        for i in range(2):
            ax = ax_main[i]
            cu = color_unw[c]
            cw = color_w[c]

            if plot_unweighted:
                ax.stairs(counts_unw, edges, fill=False, linewidth=linewidth + 0.75,
                          linestyle='--', color=cu)

            line_w = ax.stairs(counts_w, edges, fill=False, linewidth=linewidth,
                               linestyle='-', color=cw)

            color_map[c]    = cw  # map weighted color for later reuse in ratio plot
            hist_data[i][c] = counts_w
            err_data[i][c]  = errs

            if idx == len(class_ids) - 1:
                ref_counts[i] = counts_w.astype(np.float64) + 1e-10
                ref_errs[i]   = errs
                ref_color     = cw

            if i == 0:
                if plot_unweighted:
                    legends.append(f'$\\mathcal{{C}} = {c}$ (unweighted)')
                legends.append(f'$\\mathcal{{C}} = {c}$ [$\\mu={mu:0.2f}, \\sigma={std:0.2f}$]')

    # Formatting
    for i in range(2):
        ax_main[i].tick_params(labelbottom=False)
        ax_main[i].xaxis.set_visible(False)  # remove space due to tick padding
        ax_ratio[i].spines['top'].set_visible(False)
        ax_main[i].set_xlim([min_x, max_x])
        ax_main[i].set_xlabel('')

    ax_main[0].set_ylabel('[weighted] counts')
    ax_main[0].set_ylim(bottom=0)
    ax_main[1].set_title(title, fontsize=10)
    ax_main[1].set_yscale('log')
    ax_main[1].legend(legends, fontsize=8)

    # ------------------------------------------------------------
    # Ratio plots
    for i in range(2):
        ref     = ref_counts[i]
        ref_err = ref_errs[i]

        with np.errstate(divide='ignore', invalid='ignore'):
            rel_ref_err = ref_err / ref
        rel_ref_err = np.nan_to_num(rel_ref_err, nan=0.0, posinf=0.0, neginf=0.0)

        # Extend to match N+1 bin edges
        y_ref_line_ext = np.append(np.ones_like(ref), 1.0)
        y_ref_err_ext  = np.append(rel_ref_err, rel_ref_err[-1])

        ax_ratio[i].step(bin_edges, y_ref_line_ext, where='post', color='black', linewidth=1)
        ax_ratio[i].fill_between(bin_edges, 1 - y_ref_err_ext, 1 + y_ref_err_ext,
                                 step='post', color=ref_color, alpha=0.35)

        # Prepare legend handles and labels
        legend_handles = []
        legend_labels  = []

        # Classes
        for idx, c in enumerate(class_ids[:-1]):
            num      = hist_data[i][c]
            numerr   = err_data[i][c]
            denom    = ref
            denomerr = ref_err

            with np.errstate(divide='ignore', invalid='ignore'):
                ratio   = num / denom
                rel_err = np.sqrt((numerr / num) ** 2 + (denomerr / denom) ** 2)
            rel_err = np.nan_to_num(rel_err, nan=0.0, posinf=0.0, neginf=0.0)
            err     = ratio * rel_err

            ratio_ext = np.append(ratio, ratio[-1])
            err_ext   = np.append(err, err[-1])

            line = ax_ratio[i].step(bin_edges, ratio_ext, where='post',
                                    label=f'Class {c}', lw=linewidth, color=color_map[c])
            ax_ratio[i].fill_between(bin_edges, ratio_ext - err_ext, ratio_ext + err_ext,
                                     step='post', color=color_map[c], alpha=0.3, edgecolor=None)

            # Add chi2 (hybrid type)
            sigma2 = numerr**2 + denomerr**2
            with np.errstate(divide='ignore', invalid='ignore'):
                chi2 = np.nansum(((num - denom) ** 2) / sigma2)
            ndf = np.sum((num > 0) & (denom > 0))

            if ndf > 0:
                chi2_str = rf"$\mathcal{{C}} = {c}$: $\,\chi_{{H}}^2 = {chi2:.1f} \, / \, {ndf} = {chi2/ndf:.2f}$"
                legend_handles.append(line[0])  # step() returns a list
                legend_labels.append(chi2_str)

        # Ratio axis formatting
        ax_ratio[i].set_ylim([ylim_ratio[0], ylim_ratio[1]])
        yticks = np.arange(ylim_ratio[0], ylim_ratio[1] + 0.001, 0.1)
        ax_ratio[i].set_yticks(yticks[:-1])  # Remove topmost tick

        ax_ratio[i].set_xlim([min_x, max_x])
        ax_ratio[i].set_xlabel(xlabel)
        ax_ratio[i].grid(True, linestyle=':', alpha=0.5)

        if i == 0:
            ax_ratio[i].set_ylabel('Ratio')
        if i == 1:
            if legend_handles:
                ax_ratio[i].legend(legend_handles, legend_labels, loc='upper right', fontsize=7)

    # Final spacing
    fig.subplots_adjust(hspace=0.02)

    return fig, (ax_main, ax_ratio)

def plot_correlations(X, ids, weights=None, y=None, round_threshold=0.0, targetdir=None, colorbar=False):
    """
    Plot a cross-correlation matrix of vector data

    Args:
        X:               Data matrix (N x D)
        ids:             Variable names (list of length D)
        weights:         Event weights
        y:               Class labels per event (list of length N)
        round_threshold: Correlation matrix |C| < threshold to set matrix elements to zero
        targetdir:       Output plot directory
        colorbar:        Colorbar on the plot

    Returns:
        figs, axs:       Figures, axes (per class)
    """
    N = X.shape[0]

    if y is None:
        y = np.zeros(N)
        class_ids = [0]
    else:
        class_ids = np.unique(y.astype(int))

    figs = {}
    axs  = {}

    for k in class_ids:

        label = 'all' if (len(class_ids) == 1) else f'class_{k}'

        # Compute correlation matrix
        tic = time.time()
        w = weights[y==k] if weights is not None else None
        C = statstools.correlation_matrix(X=X[y==k,:], weights=w)
        C[np.abs(C) < round_threshold] = np.nan
        C *= 100
        toc = time.time()
        print(f'Computing C-matrix (took {toc-tic:0.2f} sec)')

        # Compute suitable figsize
        size = np.ceil(C.shape[0] / 3)

        # Plot it
        figs[label], axs[label] = plt.subplots(1, 1, figsize=(size, size))
        axs[label].imshow(C)

        axs[label] = annotate_heatmap(X=C, ax=axs[label], xlabels=ids, ylabels=ids,
                                      decimals=0, x_rot=90, y_rot=0, color="w")
        axs[label].set_title(f'{label}: linear correlation $\\in$ [-100,100]', fontsize=10)

        if colorbar:
            cb = plt.colorbar()

        if targetdir is not None:
            tic = time.time()
            if X.shape[1] <= 100:  # Not too many dimensions --> pdf OK
                fname = targetdir + f'{label}-correlation-matrix.pdf'
                plt.savefig(fname=fname, pad_inches=0.2, bbox_inches='tight')
            else:
                fname = targetdir + f'{label}-correlation-matrix.png'
                plt.savefig(fname=fname, pad_inches=0.2, dpi=300, bbox_inches='tight')
            toc = time.time()
            print(f'Saved: "{fname}" (took {toc-tic:0.2f} sec)')

    return figs, axs

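# Illustrative usage sketch of plot_correlations (not part of the original module).
# Note that targetdir, if given, is used as a raw filename prefix (no '/' is added):
#
#   X   = np.random.randn(1000, 3)
#   ids = ['var0', 'var1', 'var2']
#   y   = np.random.randint(0, 2, size=1000)
#   figs, axs = plot_correlations(X=X, ids=ids, y=y, targetdir='./plots/')
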
def draw_error_band(ax, x, y, x_err, y_err, **kwargs):
    """
    Calculate normals via centered finite differences (except the first point
    which uses a forward difference and the last point which uses a backward
    difference).

    https://matplotlib.org/stable/gallery/lines_bars_and_markers/curve_error_band.html
    """
    from matplotlib.path import Path
    from matplotlib.patches import PathPatch

    dx = np.concatenate([[x[1] - x[0]], x[2:] - x[:-2], [x[-1] - x[-2]]])
    dy = np.concatenate([[y[1] - y[0]], y[2:] - y[:-2], [y[-1] - y[-2]]])
    l  = np.hypot(dx, dy)
    nx = dy / l
    ny = -dx / l

    # End points of errors
    xp = x + nx * x_err
    yp = y + ny * y_err
    xn = x - nx * x_err
    yn = y - ny * y_err

    vertices = np.block([[xp, xn[::-1]], [yp, yn[::-1]]]).T
    codes    = np.full(len(vertices), Path.LINETO)
    codes[0] = codes[len(xp)] = Path.MOVETO
    path     = Path(vertices, codes)

    ax.add_patch(PathPatch(path, **kwargs))

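# Illustrative usage sketch of draw_error_band (not part of the original module),
# following the matplotlib gallery example linked in the docstring: the band is
# drawn perpendicular (normal) to the curve, not merely in the y-direction:
#
#   t = np.linspace(0, 2*np.pi, 200)
#   x, y = np.cos(t), np.sin(t)
#   fig, ax = plt.subplots()
#   ax.plot(x, y, color='C0')
#   draw_error_band(ax=ax, x=x, y=y, x_err=0.05*np.ones_like(x),
#                   y_err=0.05*np.ones_like(y), facecolor='C0', alpha=0.3, edgecolor='none')
#   ax.set_aspect('equal')
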
def ROC_plot(metrics, labels, title='', plot_thresholds=True,
             thr_points_signal=[0.05, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.85, 0.9, 0.95],
             filename='ROC', legend_fontsize=7, xmin=1.0e-4, alpha=0.32):
    """
    Receiver Operating Characteristics i.e. False positive (x) vs True positive (y)

    Args:
        metrics:           list of Metric objects (None entries are skipped)
        labels:            list of legend labels (one per metric)
        title:             plot title
        plot_thresholds:   annotate MVA threshold values at chosen signal efficiencies
        thr_points_signal: signal efficiency (TPR) points for threshold annotation
        filename:          output filename stem (without extension)
        legend_fontsize:   legend font size
        xmin:              minimum false positive rate shown on the log-x plot
        alpha:             two-sided quantile level for the bootstrap uncertainty bands
    """
    for k in [0,1]:  # linear & log

        fig, ax = plt.subplots()
        xx = np.logspace(-5, 0, 100)
        plt.plot(xx, xx, linestyle='--', color='black', linewidth=1)  # ROC diagonal

        for i in range(len(metrics)):

            if i < 10:
                linestyle = '-'
            elif 10 <= i < 20:
                linestyle = '--'
            else:
                linestyle = ':'
            marker = 'None'

            if metrics[i] is None:
                print(f'metrics[{i}] ({labels[i]}) is None, continue without')
                continue

            fpr        = metrics[i].fpr
            tpr        = metrics[i].tpr
            thresholds = metrics[i].thresholds

            if metrics[i].tpr_bootstrap is not None:

                q = [100*(alpha/2), 100*(1-alpha/2)]

                # Percentile bootstrap based uncertainties
                tpr_CI = cortools.percentile_per_dim(x=metrics[i].tpr_bootstrap, q=q)
                tpr_lo = tpr_CI[0]
                tpr_hi = tpr_CI[1]

                fpr_CI = cortools.percentile_per_dim(x=metrics[i].fpr_bootstrap, q=q)
                fpr_lo = fpr_CI[0]
                fpr_hi = fpr_CI[1]

            # Autodetect a ROC point triangle (instead of a curve)
            if len(np.unique(fpr)) == 3:
                tpr = tpr[1:-1]  # Remove first and last
                fpr = fpr[1:-1]  # Remove first and last
                linestyle = "None"
                marker = 'o'

            """
            # ROC-curve
            elif not (isinstance(fpr, int) or isinstance(fpr, float)):
                fpr = fpr[1:]  # Remove always the first element for log-plot reasons
                tpr = tpr[1:]

                if metrics[i].tpr_bootstrap is not None:
                    tpr_lo = tpr_lo[1:]
                    tpr_hi = tpr_hi[1:]
                    fpr_lo = fpr_lo[1:]
                    fpr_hi = fpr_hi[1:]
            """

            ## Plot it
            plt.plot(fpr, tpr, drawstyle='steps-mid', color=f'C{i}', linestyle=linestyle,
                     marker=marker, label=f'{labels[i]}: AUC = {metrics[i].auc:.4f}')

            # Uncertainty band
            if marker == 'None' and (metrics[i].tpr_bootstrap is not None):
                plt.fill_between(fpr, tpr_lo, tpr_hi, step='mid', alpha=0.2,
                                 color=f'C{i}', edgecolor='none')   # vertical
                plt.fill_betweenx(tpr, fpr_lo, fpr_hi, step='mid', alpha=0.2,
                                  color=f'C{i}', edgecolor='none')  # horizontal

                # draw_error_band(ax=ax, x=fpr, y=tpr, \
                #     x_err=np.std(metrics[i].fpr_bootstrap, axis=0)[1:], \
                #     y_err=np.std(metrics[i].tpr_bootstrap, axis=0)[1:], \
                #     facecolor=f'C{i}', edgecolor="none", alpha=0.2)

            # Plot corresponding threshold points
            if plot_thresholds:
                try:
                    for eff in thr_points_signal:
                        index = np.argmin(np.abs(tpr - eff))
                        if fpr[index] >= xmin and fpr[index] <= 1.0:
                            plt.plot(fpr[index], tpr[index], '.', color=f'C{i}')
                            t = plt.text(x=fpr[index], y=tpr[index], s=f'{thresholds[index]:0.4g}',
                                         fontsize=5, color=f'C{i}')
                            t.set_bbox(dict(facecolor='white', alpha=0.5, linewidth=0))
                except:  # If failed
                    pass

        ax.set_xlabel('False Positive Rate $\\alpha$ (background efficiency)')
        ax.set_ylabel('True Positive Rate $1-\\beta$ (signal efficiency)')
        ax.set_title(title, fontsize=10)

        if k == 0:  # Linear-Linear

            # Legend
            if len(metrics) > 12:  # Put outside the figure
                plt.legend(loc='center left', bbox_to_anchor=(1, 0.5), fontsize=legend_fontsize)
            else:
                plt.legend(loc='lower right', fontsize=legend_fontsize)

            plt.ylim(0.0, 1.0)
            plt.xlim(0.0, 1.0)
            plt.locator_params(axis="x", nbins=11)
            plt.locator_params(axis="y", nbins=11)

            ax.set_aspect(1.0 / ax.get_data_ratio() * 1.0)
            plt.savefig(filename + '.pdf', bbox_inches='tight')
            print(f'Saved: ' + filename + '.pdf', 'green')

        if k == 1:  # Log-Linear

            # Legend
            if len(metrics) > 12:  # Put outside the figure
                plt.legend(loc='center left', bbox_to_anchor=(1, 0.5), fontsize=legend_fontsize)
            else:
                plt.legend(fontsize=legend_fontsize)  # Automatic positioning

            plt.ylim(0.0, 1.0)
            plt.xlim(xmin, 1.0)
            plt.locator_params(axis="x", nbins=int(-np.log10(xmin) + 1))
            plt.locator_params(axis="y", nbins=11)

            plt.gca().set_xscale('log')
            ax.set_aspect(1.0 / ax.get_data_ratio() * 0.75)
            plt.savefig(filename + '--log.pdf', bbox_inches='tight')

        plt.close()

def MVA_plot(metrics, labels, title='', filename='MVA', density=True, legend_fontsize=7):
    """
    MVA output plots
    """
    # Check input
    for i in range(len(metrics)):
        if metrics[i] is None:
            print(f'Error: metrics[{i}] ({labels[i]}) is None (check per class statistics), return -1')
            return -1
        else:
            class_ids = metrics[i].class_ids
            if len(metrics[i].mva_hist) != len(class_ids):
                print(f'Error: num_classes != len(metrics[{i}].mva_hist) (check per class statistics), return -1')
                return -1

    for k in [0,1]:  # linear & log

        fig, ax = plt.subplots(1, len(class_ids), figsize=(7*len(class_ids), 5))

        # Loop over classes
        for c in class_ids:

            # Loop over cells
            N      = len(metrics[0].mva_hist[c])  # Number of bins
            counts = np.zeros((N, len(metrics)))
            cbins  = np.zeros((N, len(metrics)))

            for i in range(len(metrics)):
                if metrics[i] is None:
                    print(f'metrics[{i}] is None, continue without')
                    continue
                if len(metrics[i].mva_bins) == 0:
                    print(f'len(metrics[{i}].mva_bins) == 0, continue without')
                    continue

                bins       = metrics[i].mva_bins
                cbins[:,i] = (bins[0:-1] + bins[1:]) / 2

                if metrics[i].mva_hist[c] != []:
                    counts[:,i] = metrics[i].mva_hist[c]

            plt.sca(ax[c])
            plt.hist(x=cbins, bins=bins, weights=counts, histtype='bar', stacked=True,
                     density=density, label=labels, linewidth=2.0)

            # Adjust
            plt.sca(ax[c])
            plt.legend(fontsize=legend_fontsize, loc='upper center')
            ax[c].set_xlabel('MVA output $f(\\mathbf{x})$')
            ax[c].set_ylabel('counts' if not density else 'density')
            ax[c].set_title(f'{title} | class = {c}', fontsize=10)

        if k == 0:
            #plt.ylim(0.0, 1.0)
            #plt.xlim(0.0, 1.0)
            #ax.set_aspect(1.0/ax.get_data_ratio() * 1.0)
            plt.savefig(filename + '.pdf', bbox_inches='tight')
            print(f'Saved: ' + filename + '.pdf', 'green')

        if k == 1:
            #plt.ylim(0.0, 1.0)
            #plt.xlim(1e-4, 1.0)
            for i in range(len(class_ids)):
                plt.sca(ax[i])
                plt.gca().set_yscale('log')
            #ax.set_aspect(1.0/ax.get_data_ratio() * 0.75)
            plt.savefig(filename + '--log.pdf', bbox_inches='tight')

        # --------
        fig.clf()
        plt.close()
        gc.collect()

def plot_contour_grid(pred_func, X, y, ids, targetdir='.', transform='numpy', reso=50, npoints=400):
    """
    Classifier decision contour evaluated pairwise for each dimension,
    other dimensions evaluated at zero=(0,0,...0)
    (thus z-normalized with 0-mean is suitable)

    Args:
        pred_func: prediction function handle
        X:         input matrix
        y:         class targets
        ids:       variable label strings
        targetdir: output directory
        transform: 'numpy', 'torch'
        reso:      evaluation resolution
        npoints:   number of points to draw
    """
    print(f'Evaluating, X.shape = {X.shape}...')

    MAXP = min(npoints, X.shape[0])
    D    = X.shape[1]
    pad  = 0.5

    for dim1 in tqdm(range(D)):
        x_min, x_max = X[:, dim1].min() - pad, X[:, dim1].max() + pad

        for dim2 in range(D):
            if dim2 <= dim1:
                continue

            # (x,y)-plane limits
            y_min, y_max = X[:, dim2].min() - pad, X[:, dim2].max() + pad

            # Grid points
            PX, PY = np.meshgrid(np.linspace(x_min, x_max, reso),
                                 np.linspace(y_min, y_max, reso))

            # -------------------------------------
            ## Evaluate function values
            XX = np.zeros((reso*reso, D))
            XX[:, dim1] = PX.ravel()
            XX[:, dim2] = PY.ravel()

            if transform == 'torch':
                Z = pred_func(torch.from_numpy(XX).type(torch.FloatTensor))
            elif transform == 'numpy':
                Z = pred_func(XX)
            else:
                raise Exception(__name__ + f'.plot_contour_grid: Unknown matrix type = {transform}')

            Z = Z.reshape(PX.shape)
            # --------------------------------------

            ## Plot
            fig, axs = plt.subplots()

            # Contour
            cs = plt.contourf(PX, PY, Z, cmap=plt.cm.Spectral)

            # Events as dots
            plt.scatter(X[0:MAXP, dim1], X[0:MAXP, dim2], c=y[0:MAXP], cmap=plt.cm.binary)

            plt.xlabel(f'x[{dim1}] {ids[dim1]}')
            plt.ylabel(f'x[{dim2}] {ids[dim2]}')
            plt.colorbar(cs, ticks=np.linspace(0.0, 1.0, 11))

            filename = targetdir + f'{dim1}-{dim2}.pdf'
            plt.savefig(filename, bbox_inches='tight')
            print(f'Saved: ' + filename, 'green')

            # --------
            fig.clf()
            plt.close()
            gc.collect()

def plot_xgb_importance(model, tick_label, importance_type='gain', label=None, sort=True, default_names=False):
    """
    Plot XGBoost model feature importance

    Args:
        model:           xgboost model object
        tick_label:      feature names
        importance_type: type of importance metric ['weight', 'gain', 'cover', 'total_gain', 'total_cover']
        label:           plot title label
        sort:            sort features by importance
        default_names:   True for xgboost default feature names, else False (uses tick_label)

    Returns:
        fig, ax
    """
    fscores = model.get_score(importance_type=importance_type)

    dim    = len(tick_label)
    xx     = np.arange(dim)
    yy     = np.zeros(dim)
    labels = []

    # Try, Except needed because xgb does Not return (always) for all of them
    for i in range(dim):
        try:
            if default_names:
                yy[i] = fscores[f'f{i}']        # XGBoost default feature name 'f{i}'
            else:
                yy[i] = fscores[tick_label[i]]  # Definite names
        except:
            yy[i] = 0.0

        labels.append(f'{tick_label[i]} ({yy[i]:0.1f}) [{i}]')

    # Sort them
    if sort:
        s_ind  = np.array(np.argsort(yy), dtype=int)
        yy     = yy[s_ind]
        labels = [labels[i] for i in s_ind]

    # Plot
    fig, ax = plt.subplots(figsize=(0.5 * (np.ceil(dim/6) + 2), np.ceil(dim/6) + 2))
    plt.barh(xx, yy, align='center', height=0.5, tick_label=labels)
    plt.xlabel(f'Score ({importance_type})')
    plt.title(f'[{label}]')

    return fig, ax

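# Illustrative usage sketch of plot_xgb_importance (not part of the original
# module); 'booster' is assumed to be a trained xgboost.Booster (e.g. from
# xgboost.train):
#
#   fig, ax = plot_xgb_importance(model=booster, tick_label=['pt', 'eta', 'phi'],
#                                 importance_type='gain', label='XGB', sort=True)
#   fig.savefig('xgb_importance.pdf', bbox_inches='tight')
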
def table_writer(filename, label, table, print_to_screen=False):
    """
    Helper function to write a table to a file
    """
    with open(filename, "a") as text_file:
        text_file.write(label)
        text_file.write('\n')
        text_file.write(table.get_string())
        text_file.write('\n')
        text_file.write('\n')

    if print_to_screen:
        print(label)
        print(table, 'magenta')
        print('')

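# Illustrative usage sketch of table_writer (not part of the original module);
# the file is opened in append mode, so clear it first for a fresh log (as
# plot_AIRW below does):
#
#   table = PrettyTable(['metric', 'value'])
#   table.add_row(['AUC', 0.91])
#   open('metrics.log', 'w').close()  # clear previous content
#   table_writer(filename='metrics.log', label='Run 1', table=table, print_to_screen=True)
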
def plot_AIRW(X, y, ids, weights, y_pred, pick_ind, label, sublabel, param,
              tau=1.0, X_new=None, targetdir=None, num_cpus=0, map_mode='RW'):
    """
    Plot AI based reweighting results

    Args:
        map_mode: 'RW' (Stage-2 reweight) or 'FT' (Stage-2 feature transport)
    """
    prints.printbar()
    print(f'label = {label} | sublabel = {sublabel} | tau = {tau}', 'green')

    ## Print stats
    output_file = os.path.join(targetdir, 'stats_AIRW_weight_flow.log')

    EPS           = param['EPS']
    maxW          = param['maxW']
    lambda_shrink = param['lambda_shrink']
    C0            = param['C0']
    C1            = param['C1']
    RN_ID         = param['renorm_origin']
    mode          = param['transform_mode']

    # ===============================================================
    ## 0. Extract out Stage-1 AI weights
    ## Remember that we can have negative weights here

    try:
        idx, _ = aux.pick_index(all_ids=ids, vars=['raw_weight'])
        w0_raw = X[y == C0, idx]

        # Normalize to the class 1 weight sum
        w0_raw = w0_raw / np.sum(w0_raw) * np.sum(weights[y == C1])

        w0_S1_only = weights[y == C0] / w0_raw
    except:
        raise Exception('plot_AIRW: Input data X must contain "raw_weight" column')

    # ===============================================================
    # Option: S2-Reweight
    if map_mode == 'RW':

        # ---------------------------------------------------
        ## 1. Transform model output scores to Stage-2 AI weights

        # Handle logits vs probabilities
        min_y_pred, max_y_pred = np.min(y_pred), np.max(y_pred)

        THRESH = 1E-5
        if min_y_pred < (-THRESH) or max_y_pred > (1.0 + THRESH):
            print(f'Detected raw logit output [{min_y_pred:0.4f}, {max_y_pred:0.4f}] from the model')
            logits = y_pred[y == C0]
            probs  = aux.sigmoid(logits)
            print(f'Corresponding probability output [{np.min(probs):0.4f}, {np.max(probs):0.4f}]')
        else:
            print(f'Detected probability output [{min_y_pred:0.4f}, {max_y_pred:0.4f}] from the model')
            logits = aux.inverse_sigmoid(p=y_pred[y == C0], EPS=EPS)
            print(f'Corresponding logit output [{np.min(logits):0.4f}, {np.max(logits):0.4f}]')

        # Apply temperature scaling
        logits /= tau

        # Get weights after the re-weighting transform
        w0_S2_only = reweight.rw_transform_with_logits(logits=logits, mode=mode)

        prints.print_weights(weights=weights, y=y, output_file=output_file,
                             header='Step 1: Input event weights [raw x S1]', write_mode='w')
        prints.print_weights(weights=w0_S2_only, y=np.zeros(len(w0_S2_only)), output_file=output_file,
                             header='Step 2: S2 only weights', write_mode='a')

        # Cut-off regularize anomalous high weights
        w0_S2_only = np.clip(w0_S2_only, 0.0, maxW)

        prints.print_weights(weights=w0_S2_only, y=np.zeros(len(w0_S2_only)), output_file=output_file,
                             header=f'Step 3: S2 only weights [cutoff regularized with maxW = {maxW}]', write_mode='a')

        # Log-space smoothen weights
        w0_S2_only = aux.log_space_weight_smooth(w0_S2_only, lambda_shrink=lambda_shrink)

        prints.print_weights(weights=w0_S2_only, y=np.zeros(len(w0_S2_only)), output_file=output_file,
                             header=f'Step 4: S2 only weights [log-space smoothing with lambda_shrink = {lambda_shrink}]', write_mode='a')

        # Apply multiplicatively to event weights (which can be negative)
        w0_tot = weights[y == C0] * w0_S2_only

        prints.print_weights(weights=w0_tot, y=np.zeros(len(w0_tot)), output_file=output_file,
                             header=f'Step 5: Total weights [input weights x S2 weights]', write_mode='a')

        # ---------------------------------------------------
        ## 2. Renormalize (optional) (e.g. we want fixed overall normalization, or debug)

        if RN_ID is not None:
            print(f'Renormalizing with class [renorm_origin = {RN_ID}] weight sum', 'yellow')

            sum_before = w0_tot.sum()
            print(f'Sum(before): {sum_before:0.1f}')

            w0_tot /= np.sum(w0_tot)               # Normalize
            w0_tot *= np.sum(weights[y == RN_ID])  # Scale

            sum_after = w0_tot.sum()
            print(f'Sum(after): {sum_after:0.1f}')

            ratio = f'Ratio = Sum(after) / Sum(before) = {sum_after/sum_before:0.5f}'
            print(ratio)
            print('')

            # Print stats
            prints.print_weights(weights=w0_tot, y=np.zeros(len(w0_tot)), output_file=output_file,
                                 header=f'Step 6: Total weights sum renormalized [wrt. class ID = {RN_ID}] | {ratio}', write_mode='a')

        # ===================================================
        # Add data to a DataFrame

        tau_str = str(np.round(tau, 3))

        # Create DataFrame
        w_post_S2 = copy.deepcopy(weights)
        w_post_S2[y == C0] = w0_tot  # only C0
        df = pd.DataFrame(w_post_S2, columns=[f'w_post_S2__tau_{tau_str}'])

    # Option: S2-Transport
    elif map_mode == 'FT':
        w0_S2_only = None
        w0_tot     = weights[y == C0]
        df         = None

    else:
        raise Exception(f'plot_AIRW: Unknown "map_mode" = {map_mode}')

    # ===================================================
    # Visualization
    # ---------------------------------------------------

    param = {
        'i': None, 'i_new': None, 'X': None, 'X_new': None, 'y': None,
        'weights': None, 'ids': None, 'w0_raw': None, 'w0_S1_only': None,
        'w0_S2_only': None, 'w0_tot': None, 'C0': C0, 'C1': C1,
        'label': label, 'param': param, 'local_dir': targetdir, 'map_mode': map_mode
    }

    # Loop over each observable (specific column of X)
    paramlist = []
    k = 0
    for i in pick_ind:
        paramlist.append(copy.deepcopy(param))

        # Column indexing
        paramlist[-1]['i']     = i
        paramlist[-1]['i_new'] = k  # linear running

        # big numpy arrays should share the memory (i.e. no copy of X)
        paramlist[-1]['X']          = X
        paramlist[-1]['X_new']      = X_new
        paramlist[-1]['y']          = y
        paramlist[-1]['weights']    = weights
        paramlist[-1]['ids']        = ids
        paramlist[-1]['w0_raw']     = w0_raw
        paramlist[-1]['w0_S1_only'] = w0_S1_only
        paramlist[-1]['w0_S2_only'] = w0_S2_only
        paramlist[-1]['w0_tot']     = w0_tot

        k += 1

    # ---------------------------------------------------------------
    ## Start multiprocessing
    num_workers = multiprocessing.cpu_count() // 2 if num_cpus == 0 else num_cpus
    print(f'Multiprocessing {len(pick_ind)} plots with {num_workers} processes', 'yellow')
    pool = multiprocessing.Pool(processes=num_workers)

    tic  = time.time()
    mets = pool.map(multiprocess_AIRW_wrapper, paramlist)
    pool.close()  # no more tasks
    pool.join()   # wrap up current tasks
    toc  = time.time()
    print(f'Took {toc-tic:0.2f} sec')

    # ---------------------------------------------------------------
    # Collect summary statistics from the pool

    total_ndf = 0

    # Initialize total ESS (flat structure)
    total_ESS = dict.fromkeys(mets[0]['ESS'], 0.0)

    # Initialize total chi2 (nested structure)
    total_chi2 = {
        chi2_type: dict.fromkeys(mets[0]['chi2'][chi2_type], 0.0)
        for chi2_type in mets[0]['chi2']
    }

    ## Collect for the total metrics
    for i in range(len(mets)):
        total_ndf += mets[i]['ndf']

        for key in mets[i]['ESS'].keys():
            total_ESS[key] += mets[i]['ESS'][key]

        for dt in mets[i]['chi2'].keys():
            for obj in mets[i]['chi2'][dt].keys():
                total_chi2[dt][obj] += mets[i]['chi2'][dt][obj]

    # Append metrics
    mets.append({
        'id':   'total',
        'ndf':  total_ndf,
        'ESS':  total_ESS,
        'chi2': total_chi2
    })

    # ---------------------------------------------------------------
    # Tables
    metric_table = {'hybrid': None, 'pearson': None, 'neyman': None}

    for mname in metric_table.keys():

        L = mname[0].upper()

        metric_table[mname] = PrettyTable([
            "observable",
            "ndf",
            "ESS(C0xS1)/ESS(C0)",
            "ESS(C0xS1xS2)/ESS(C0xS1)",
            f"C0 | chi2_{{{L},C1}}/ndf",
            f"C0xS1 | chi2_{{{L},C1}}/ndf",
            f"C0xS2 | chi2_{{{L},C0}}/ndf",
            f"C0xS1xS2 | chi2_{{{L},C1}}/ndf",
            f"C0xS1xS2 | chi2_{{{L},C0xS1}}/ndf"
        ])

        for i in range(len(mets)):
            row = [
                mets[i]['id'],
                mets[i]['ndf'],
                np.round(mets[i]['ESS']['C0S1']  / mets[i]['ESS']['C0'],   2),
                np.round(mets[i]['ESS']['C0S12'] / mets[i]['ESS']['C0S1'], 2),
                np.round(mets[i]['chi2'][mname]['C1_C0']      / mets[i]['ndf'], 2),
                np.round(mets[i]['chi2'][mname]['C1_C0S1']    / mets[i]['ndf'], 2),
                np.round(mets[i]['chi2'][mname]['C0_C0S2']    / mets[i]['ndf'], 2),
                np.round(mets[i]['chi2'][mname]['C1_C0S12']   / mets[i]['ndf'], 2),
                np.round(mets[i]['chi2'][mname]['C0S1_C0S12'] / mets[i]['ndf'], 2),
            ]
            metric_table[mname].add_row(row, divider=True if (i == len(mets)-1 or i == len(mets)-2) else False)

        # -----------------------------------------------
        # Save it to the disk
        filename = os.path.join(targetdir, f"stats_chi2_{mname}.log")
        open(filename, 'w').close()  # Clear content

        table_writer(
            filename        = filename,
            label           = f'category: {sublabel} | AI model: {label} | tau: {tau:0.2f} | chi2: {mname}',
            table           = metric_table[mname],
            print_to_screen = True
        )
        # -----------------------------------------------

    return metric_table, df, mets

def multiprocess_AIRW_wrapper(p):
    """
    Multiprocessing plots
    """
    i          = p['i']           # Column index
    i_new      = p['i_new']
    X          = p['X']
    X_new      = p['X_new']
    y          = p['y']
    w          = p['weights']
    ids        = p['ids']
    w0_raw     = p['w0_raw']      # Weights before AI
    w0_S1_only = p['w0_S1_only']  # Only Stage-1 AI weight
    w0_S2_only = p['w0_S2_only']  # Only Stage-2 AI weight
    w0_tot     = p['w0_tot']      # Raw x S1 x S2 (mod renormalization)
    C0         = p['C0']          # Class indices
    C1         = p['C1']
    label      = p['label']
    param      = p['param']
    local_dir  = p['local_dir']
    map_mode   = p['map_mode']    # 'RW', or 'FT'

    X_col = copy.deepcopy(X[:, i])

    # Transported
    if map_mode == 'FT':
        X_col_new = copy.deepcopy(X_new[:, i_new])

    bins = binengine(bindef=param['edges'], x=X_col)

    # ----------------------------------------------------
    obs_X = {
        # Axis limits
        'xlim'    : (bins[0], bins[-1]),
        'ylim'    : None,
        'xlabel'  : f'{ids[i]}',
        'ylabel'  : r'Weighted counts',
        'units'   : {'x': r'a.u.', 'y': 'counts'},
        'label'   : f'{label}',
        'figsize' : (4, 3.75),

        # Ratio
        'ylim_ratio' : (0.75, 1.25),

        # Histogramming
        'bins'    : None,
        'density' : False,

        # Function to calculate
        'func'    : None
    }

    data_template = {
        'data'    : None,
        'weights' : None,
        'label'   : 'Data',
        'hfunc'   : 'errorbar',
        'style'   : iceplot.errorbar_style,
        'obs'     : obs_X,
        'hdata'   : None,
        'color'   : None
    }

    # Data source <-> Observable collections
    class1     = copy.deepcopy(data_template)
    class0     = copy.deepcopy(data_template)
    class0_S1  = copy.deepcopy(data_template)
    class0_S2  = copy.deepcopy(data_template)
    class0_S12 = copy.deepcopy(data_template)

    class1.update({
        'label' : '$C_1$',
        'hfunc' : 'hist',
        'style' : iceplot.hist_style_step,
        'color' : (0,0,0)
    })
    class0.update({
        'label' : '$C_0$',
        'hfunc' : 'hist',
        'style' : iceplot.hist_style_step,
        'color' : iceplot.imperial_light_brown
    })
    class0_S1.update({
        'label' : '$C_0 \\times S_1$',
        'hfunc' : 'hist',
        'style' : iceplot.hist_style_step,
        'color' : iceplot.imperial_green
    })
    class0_S2.update({
        'label' : '$C_0 \\times S_2$',
        'hfunc' : 'hist',
        'style' : iceplot.hist_style_step,
        'color' : iceplot.imperial_very_light_blue
    })
    class0_S12.update({
        'label' : '$C_0 \\times S_1 S_2$',
        'hfunc' : 'hist',
        'style' : iceplot.hist_style_step,
        'color' : iceplot.imperial_dark_red
    })

    data = [class1, class0, class0_S1, class0_S2, class0_S12]

    # C1 (class 1)
    data[0]['hdata'] = iceplot.hist_obj(X_col[y == C1], bins=bins, weights=w[y == C1])

    # C0 (class 0)
    data[1]['hdata'] = iceplot.hist_obj(X_col[y == C0], bins=bins, weights=w0_raw)

    # C0 x S1
    data[2]['hdata'] = iceplot.hist_obj(X_col[y == C0], bins=bins, weights=w[y == C0])

    # Stage-2 reweight
    if map_mode == 'RW':

        # C0 x S2
        data[3]['hdata'] = iceplot.hist_obj(X_col[y == C0], bins=bins, weights=w0_raw * w0_S2_only)

        # C0 x S1 x S2
        data[4]['hdata'] = iceplot.hist_obj(X_col[y == C0], bins=bins, weights=w0_tot)

    # Stage-2 transport (no weights for S2)
    elif map_mode == 'FT':

        # C0 x S2(T)
        data[3]['hdata'] = iceplot.hist_obj(X_col_new[y == C0], bins=bins, weights=w0_raw)

        # C0 x S1 x S2(T)
        data[4]['hdata'] = iceplot.hist_obj(X_col_new[y == C0], bins=bins, weights=w0_tot)

    ## Chi2 vs Data
    chi2 = {'hybrid': {}, 'pearson': {}, 'neyman': {}}

    for dt in chi2.keys():
        chi2[dt]['C1_C0'],      ndf = iceplot.chi2_cost(h_mc=data[1]['hdata'], h_data=data[0]['hdata'], vartype=dt, return_nbins=True)
        chi2[dt]['C1_C0S1'],    _   = iceplot.chi2_cost(h_mc=data[2]['hdata'], h_data=data[0]['hdata'], vartype=dt, return_nbins=True)
        chi2[dt]['C0_C0S2'],    _   = iceplot.chi2_cost(h_mc=data[3]['hdata'], h_data=data[1]['hdata'], vartype=dt, return_nbins=True)
        chi2[dt]['C1_C0S12'],   _   = iceplot.chi2_cost(h_mc=data[4]['hdata'], h_data=data[0]['hdata'], vartype=dt, return_nbins=True)
        chi2[dt]['C0S1_C0S12'], _   = iceplot.chi2_cost(h_mc=data[4]['hdata'], h_data=data[2]['hdata'], vartype=dt, return_nbins=True)  # Not plotted

    ## Effective Sample Sizes
    binESS_C0,    entries_C0    = iceplot.ess_metric(h=data[1]['hdata'])
    binESS_C0S1,  entries_C0S1  = iceplot.ess_metric(h=data[2]['hdata'])
    binESS_C0S2,  entries_C0S2  = iceplot.ess_metric(h=data[3]['hdata'])
    binESS_C0S12, entries_C0S12 = iceplot.ess_metric(h=data[4]['hdata'])

    ESS = {}
    ESS['C0']    = binESS_C0.sum()    / entries_C0.sum()
    ESS['C0S1']  = binESS_C0S1.sum()  / entries_C0S1.sum()
    ESS['C0S2']  = binESS_C0S2.sum()  / entries_C0S2.sum()
    ESS['C0S12'] = binESS_C0S12.sum() / entries_C0S12.sum()

    mets = {
        'id':   ids[i],
        'ndf':  ndf,
        'ESS':  ESS,
        'chi2': chi2
    }

    # Legends
    data[1]['label'] += f' | $\\chi_{{H,C_1}}^2 = {chi2["hybrid"]["C1_C0"]:0.0f} \\, / \\, {ndf} = {chi2["hybrid"]["C1_C0"]/ndf:0.1f}$, $\\bar{{\\mathcal{{E}}}}={ESS["C0"]:0.2f}$'
    data[2]['label'] += f' | $\\chi_{{H,C_1}}^2 = {chi2["hybrid"]["C1_C0S1"]:0.0f} \\, / \\, {ndf} = {chi2["hybrid"]["C1_C0S1"]/ndf:0.1f}$, $\\bar{{\\mathcal{{E}}}}={ESS["C0S1"]:0.2f}$'
    data[3]['label'] += f' | $\\chi_{{H,C_0}}^2 = {chi2["hybrid"]["C0_C0S2"]:0.0f} \\, / \\, {ndf} = {chi2["hybrid"]["C0_C0S2"]/ndf:0.1f}$, $\\bar{{\\mathcal{{E}}}}={ESS["C0S2"]:0.2f}$'
    data[4]['label'] += f' | $\\chi_{{H,C_1}}^2 = {chi2["hybrid"]["C1_C0S12"]:0.0f} \\, / \\, {ndf} = {chi2["hybrid"]["C1_C0S12"]/ndf:0.1f}$, $\\bar{{\\mathcal{{E}}}}={ESS["C0S12"]:0.2f}$'

    # Plot it
    for mode in ['log', 'linear']:

        fig0, ax0 = iceplot.superplot(data, ratio_plot=True, yscale=mode,
                                      ratio_error_plot=True, legend_properties={'fontsize': 5.5})

        EPS = 1e-9
        binESS_list = [None, None,
                       binESS_C0S1  / np.clip(binESS_C0,   EPS, None),
                       None,
                       binESS_C0S12 / np.clip(binESS_C0S1, EPS, None)]

        colors = [entry['color'] for entry in data]

        ax_ess = ess_subplot(
            fig         = fig0,
            ax_ratio    = ax0[1],
            bin_edges   = data[1]['hdata'].bins,
            binESS_list = binESS_list,
            colors      = colors,
            labels      = [None, None,
                           '$\\mathcal{E}(C_0 \\times S_1)\\, / \\, \\mathcal{E}(C_0)$',
                           None,
                           '$\\mathcal{E}(C_0 \\times S_1 S_2)\\, / \\, \\mathcal{E}(C_0 \\times S_1)$'],
            ylabel      = '$\\mathcal{E}_i \\, / \\, \\mathcal{E}_j$'
        )

        fig0.savefig(aux.makedir(local_dir + '/pdf/') + f'/reweight_[{ids[i]}]__{mode}.pdf',
                     bbox_inches='tight')
        fig0.savefig(aux.makedir(local_dir + '/png/') + f'/reweight_[{ids[i]}]__{mode}.png',
                     bbox_inches='tight', dpi=300)
        fig0.clf()
        plt.close(fig0)

    return mets

def ess_subplot(
        fig,
        ax_ratio,
        bin_edges,      # array (n+1)
        binESS_list,    # list of arrays (n)
        *,
        colors=None,    # for color reuse
        ylabel="ESS",
        gap=0.01,       # vertical gap below ratio (figure fraction)
        labels=None,
        linestyle='-',
        lw=1.0
    ):
    """
    Append an ESS axis (same height as 'ax_ratio') directly underneath.
    """
    # ------------------------------------------------------------------
    # Validate edges
    # ------------------------------------------------------------------
    edges = np.asarray(bin_edges, dtype=float)
    if edges.ndim != 1 or edges.size < 2:
        raise ValueError("'bin_edges' must be 1D with at least 2 entries.")
    n_bins = edges.size - 1

    # ------------------------------------------------------------------
    # 1. Compute positions; enlarge figure if needed
    # ------------------------------------------------------------------
    pos_ratio  = ax_ratio.get_position()
    ess_height = pos_ratio.height
    ess_y0     = pos_ratio.y0 - gap - ess_height

    if ess_y0 < 0.0:  # not enough room -> enlarge fig
        grow_frac = -ess_y0
        fig.set_figheight(fig.get_figheight() * (1 + grow_frac))

        # after resizing, axes keep their coords, recompute y0
        ess_y0 = pos_ratio.y0 - gap - ess_height

    # ------------------------------------------------------------------
    # 2. Add ESS axis
    # ------------------------------------------------------------------
    ax_ess = fig.add_axes([pos_ratio.x0, ess_y0, pos_ratio.width, ess_height],
                          sharex=ax_ratio)

    # ------------------------------------------------------------------
    # 3. Plot each ESS curve (histogram style, edges + where='post')
    # ------------------------------------------------------------------
    for i, ess in enumerate(binESS_list):
        if ess is None:
            continue

        y = np.asarray(ess, dtype=float)
        if y.size == 0 or np.all(np.isnan(y)):
            continue
        if y.size != n_bins:
            raise ValueError(f"ESS curve #{i} has length {y.size}; expected {n_bins}.")

        # duplicate last entry so the final step spans the last bin
        y_plot = np.append(y, y[-1])
        label  = labels[i] if labels and i < len(labels) else None

        ax_ess.step(edges, y_plot, where='post', color=colors[i],
                    linestyle=linestyle, label=label, lw=lw)

    # ------------------------------------------------------------------
    # 4. Visuals
    # ------------------------------------------------------------------
    # Copy font sizes from ratio axis (fallback to rcParams)
    tick_font  = (ax_ratio.xaxis.get_ticklabels()[0].get_fontsize()
                  if ax_ratio.xaxis.get_ticklabels() else plt.rcParams['xtick.labelsize'])
    label_font = ax_ratio.xaxis.label.get_size()

    # show x-labels only on ESS axis
    ax_ratio.tick_params(labelbottom=False)

    # remove external y-tick marks, keep numeric labels
    ax_ess.tick_params(axis='y', length=0, labelsize=tick_font)
    ax_ess.tick_params(axis='x', labelsize=tick_font)

    ax_ess.set_xlabel(ax_ratio.get_xlabel(), fontsize=label_font)
    ax_ess.set_ylabel(ylabel, fontsize=label_font)

    # fixed y-range, ticks, and grid
    ax_ess.set_ylim(0, 1)
    ax_ess.set_yticks([0.0, 0.25, 0.5, 0.75])
    ax_ess.grid(True, axis='y', linestyle='--', alpha=0.4)

    if labels:
        ax_ess.legend(fontsize=tick_font * 0.65, frameon=False)

    return ax_ess

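# Illustrative usage sketch of ess_subplot (not part of the original module);
# append a per-bin ESS-ratio axis below an existing ratio panel, as done in
# multiprocess_AIRW_wrapper above:
#
#   fig, (ax_top, ax_ratio) = plt.subplots(2, 1, sharex=True)
#   edges = np.linspace(0.0, 1.0, 11)             # 10 bins
#   ess   = np.random.uniform(0.2, 0.9, size=10)  # per-bin ESS ratio values
#   ax_es = ess_subplot(fig=fig, ax_ratio=ax_ratio, bin_edges=edges,
#                       binESS_list=[ess], colors=['C0'], labels=['ESS ratio'])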