Source code for icezee.common

# Common input & data reading routines for ZEE
#
# m.mieskolainen@imperial.ac.uk, 2024

import numpy as np
import copy
from importlib import import_module
import pandas as pd
import os

from icenet.tools import io
from icenet.tools import aux

from icefit import dequantize

# ------------------------------------------
from icenet import print
# ------------------------------------------

# GLOBALS
#from configs.zee.cuts import *
#from configs.zee.filter import *

def truncate(X, Y, W, maxevents):

    # Apply maxevents cutoff
    maxevents = np.min([maxevents, len(X)])

    if maxevents < len(X):
        print(f'Applying maxevents cutoff {maxevents}')
        X, Y, W = X[0:maxevents], Y[0:maxevents], W[0:maxevents]

    return X, Y, W

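# A minimal usage sketch of `truncate` with toy arrays (illustrative only;
# the shapes and values below are assumptions, not part of the module):
def _example_truncate():
    X = np.random.rand(1000, 4)       # toy feature matrix
    Y = np.zeros(1000, dtype=int)     # toy labels
    W = np.ones(1000)                 # toy weights

    X, Y, W = truncate(X=X, Y=Y, W=W, maxevents=100)

    # Only the first 100 rows of each array survive
    assert X.shape == (100, 4) and len(Y) == 100 and len(W) == 100
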
def load_helper(mcfiles, datafiles, maxevents, args):

    inputvars = import_module("configs." + args["rootname"] + "." + args["inputvars"])
    LOAD_VARS = inputvars.LOAD_VARS

    # -------------------------------------------------------------------------
    # *** MC ***

    frames = []
    for f in mcfiles:
        new_frame = copy.deepcopy(pd.read_parquet(f))
        frames.append(new_frame)
        print(f'{f} | N = {len(new_frame)}', 'yellow')

        ids = list(new_frame.keys()); ids.sort()
        print(ids)

    frame_mc = pd.concat(frames)
    print(f'Total number of events: {len(frame_mc):0.1E}')

    X_MC = frame_mc[LOAD_VARS].to_numpy()

    # Label = 0
    Y_MC = np.zeros(len(X_MC)).astype(int)

    ## Pre-computed weights (gen event weight x CMS weights)
    W_MC = frame_mc[['weight']].to_numpy().squeeze()

    print(f'X_MC.shape = {X_MC.shape}')
    print(f'W_MC.shape = {W_MC.shape}')

    # Apply maxevents cutoff
    X_MC, Y_MC, W_MC = truncate(X=X_MC, Y=Y_MC, W=W_MC, maxevents=maxevents)

    # -------------------------------------------------------------------------
    # *** Data ***

    frames = []
    for f in datafiles:
        new_frame = copy.deepcopy(pd.read_parquet(f))
        frames.append(new_frame)
        print(f'{f} | N = {len(new_frame)}', 'yellow')

        ids = list(new_frame.keys()); ids.sort()
        print(ids)

    frame_data = pd.concat(frames)
    print(f'Total number of events: {len(frame_data):0.1E}')

    X_data = frame_data[LOAD_VARS].to_numpy()
    W_data = np.ones(len(X_data))

    # Label = 1
    Y_data = np.ones(len(X_data)).astype(int)

    print(f'X_data.shape = {X_data.shape}')
    print(f'W_data.shape = {W_data.shape}')

    # Apply maxevents cutoff
    X_data, Y_data, W_data = truncate(X=X_data, Y=Y_data, W=W_data, maxevents=maxevents)

    # -------------------------------------------------------------------------
    # Combine MC and Data samples

    X = np.vstack((X_MC, X_data))
    Y = np.concatenate((Y_MC, Y_data))
    W = np.concatenate((W_MC, W_data))

    ids = LOAD_VARS # We use these

    ## -------------------------------------------------
    # ** Drop negative weight (MC) events **

    if args['drop_negative']:
        ind = W < 0
        if np.sum(ind) > 0:
            print(f'Dropping negative weight events ({np.sum(ind)/len(ind):0.3f})', 'red')
            X = X[~ind] # Boolean NOT
            W = W[~ind]
            Y = Y[~ind]

        # Re-normalize MC to the event count
        ind = (Y == 0)
        W[ind] = W[ind] / np.sum(W[ind]) * len(W[ind])
    ## -------------------------------------------------

    # -------------------------------------------------------------------------
    # ** Randomize MC vs Data order to avoid problems with other functions **
    # ** No need to randomize in this application. We have a fixed (eval, validate, test) file structure **

    #rand = np.random.permutation(len(X))
    #X    = X[rand].squeeze() # Squeeze removes additional [] dimension
    #Y    = Y[rand].squeeze()
    #W    = W[rand].squeeze()
    # -------------------------------------------------------------------------

    # -------------------------------------------------------------------------
    ## Add event weights as an aux variable called `raw_weight` for plots etc.

    X   = np.hstack((X, W[None].T))
    ids = ids + ['raw_weight']
    # -------------------------------------------------------------------------

    ## Print some diagnostics
    print(f'Total number of events: {len(X)}')
    for c in np.unique(Y):
        print(f'class[{c}] | N = {np.sum(Y == c)} | weight[mean,std,min,max] = {np.mean(W[Y==c]):0.3E}, {np.std(W[Y==c]):0.3E}, {np.min(W[Y==c]):0.3E}, {np.max(W[Y==c]):0.3E}')

    return X, Y, W, ids

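# A hedged usage sketch for `load_helper`. The file names and the minimal
# `args` dictionary below are hypothetical; in practice they come from the
# steering configuration:
def _example_load_helper():
    args = {
        'rootname'     : 'zee',       # resolves the configs.zee.<inputvars> module
        'inputvars'    : 'mvavars',   # hypothetical inputvars module name
        'drop_negative': True
    }
    X, Y, W, ids = load_helper(mcfiles   = ['MC_train.parquet'],    # hypothetical path
                               datafiles = ['Data_train.parquet'],  # hypothetical path
                               maxevents = int(1e6),
                               args      = args)
    return X, Y, W, ids
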
def load_root_file(root_path, ids=None, entry_start=0, entry_stop=None, maxevents=None, args=None):
    """ Loads the root file.

    Args:
        root_path:   paths to root files (list)
        ids:         variable names (not used; the loaded variables come from the config)
        entry_start: first event index (not used here)
        entry_stop:  last event index (not used here)
        maxevents:   maximum number of events per (trn, val, tst) mode
        args:        arguments dictionary

    Returns:
        dictionary with:
            X:    columnar data
            Y:    class labels
            W:    event weights
            ids:  columnar variable string (list)
            info: auxiliary information (dict), here the running (trn, val, tst) split indices
    """

    if type(root_path) is list:
        root_path = root_path[0] # Remove [] list, we expect only the path here

    # -----------------------------------------------
    #param = {
    #    'entry_start': entry_start,
    #    "entry_stop":  entry_stop,
    #    "maxevents":   maxevents,
    #    "args": args
    #}

    if maxevents is None:
        maxevents = int(1e10)
        print(f'"maxevents" is None, set {maxevents}', 'yellow')

    # -------------------------------------------------------------------------
    # We do splitting according to the dictionary (trn, val, tst) structure

    running_split = {}
    X = {}
    Y = {}
    W = {}
    N_prev = 0

    for mode in ['trn', 'val', 'tst']:

        # Glob expansion type (be careful not to have "label noise" under the folder or subfolders)
        mc_files = io.glob_expand_files(datasets=args["mcfile"][mode],   datapath=root_path)
        da_files = io.glob_expand_files(datasets=args["datafile"][mode], datapath=root_path)

        # Simple fixed one file
        #mc_files = [os.path.join(root_path, args['mcfile'][mode][0])]
        #da_files = [os.path.join(root_path, args['mcfile'][mode][0])]

        print(f'Found mcfiles:   {mc_files}')
        print(f'Found datafiles: {da_files}')

        X[mode], Y[mode], W[mode], ids = load_helper(mcfiles=mc_files, datafiles=da_files, maxevents=maxevents, args=args)

        running_split[mode] = np.arange(N_prev, len(X[mode]) + N_prev)
        N_prev += len(X[mode]) #!

    # Combine
    X = np.vstack((X['trn'], X['val'], X['tst']))
    Y = np.concatenate((Y['trn'], Y['val'], Y['tst']))
    W = np.concatenate((W['trn'], W['val'], W['tst']))

    # Aux info here
    info = {'running_split': running_split}

    return {'X': X, 'Y': Y, 'W': W, 'ids': ids, 'info': info}

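# A hedged sketch of how `load_root_file` might be driven. The paths, globs
# and module name below are hypothetical; the real ones live in the steering
# configuration. It mainly illustrates the `args` keys the loader reads:
def _example_load_root_file():
    args = {
        'rootname'     : 'zee',
        'inputvars'    : 'mvavars',                        # hypothetical
        'drop_negative': True,
        'mcfile'       : {'trn': ['MC_train*.parquet'],    # hypothetical globs
                          'val': ['MC_val*.parquet'],
                          'tst': ['MC_test*.parquet']},
        'datafile'     : {'trn': ['Data_train*.parquet'],
                          'val': ['Data_val*.parquet'],
                          'tst': ['Data_test*.parquet']}
    }
    out = load_root_file(root_path=['/path/to/data'], maxevents=None, args=args)

    # out['info']['running_split'] holds the row indices of each (trn, val, tst) block
    return out
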
def splitfactor(x, y, w, ids, args):
    """ Transform data into different datatypes.

    Args:
        x:    columnar input data
        y:    class labels
        w:    event weights
        ids:  variable names (list)
        args: arguments dictionary

    Returns:
        dictionary with different data representations
    """
    inputvars = import_module("configs." + args["rootname"] + "." + args["inputvars"])

    data = io.IceXYW(x=x, y=y, w=w, ids=ids)

    # -------------------------------------------------------------------------
    ### Pick kinematic variables out (note also 'raw_weight')
    data_kin = None

    if inputvars.KINEMATIC_VARS is not None:
        vars = aux.process_regexp_ids(all_ids=data.ids, ids=inputvars.KINEMATIC_VARS + ['raw_weight'])
        data_kin   = data[vars]
        data_kin.x = data_kin.x.astype(np.float32)

    # -------------------------------------------------------------------------
    data_deps = None

    # -------------------------------------------------------------------------
    data_tensor = None

    # -------------------------------------------------------------------------
    data_graph = None

    # -------------------------------------------------------------------------
    # Mutual information regularization targets
    data_MI = None

    if inputvars.MI_VARS is not None:
        vars = aux.process_regexp_ids(all_ids=data.ids, ids=inputvars.MI_VARS)
        data_MI = data[vars].x.astype(np.float32)

    # -------------------------------------------------------------------------
    ### Finally pick active scalar variables out
    vars = aux.process_regexp_ids(all_ids=data.ids, ids=eval('inputvars.' + args['inputvar_scalar']))

    data   = data[vars]
    data.x = data.x.astype(np.float32)

    # -------------------------------------------------------------------------
    ### ** Special transform for specific MVA input variables **
    if args['pre_transform']['active']:

        var  = args['pre_transform']['var']
        func = args['pre_transform']['func']

        for i, v in enumerate(var):
            try:
                ind = data.find_ind(v)
            except Exception:
                print(f'Could not find variable "{v}" -- continue', 'red')
                continue

            print(f'Pre-transform: {func[i]} of variable "{v}"', 'magenta')

            x = data.x[:, ind] # x used in the transform
            data.x[:, ind] = eval(func[i])

            # Change the variable name [+ have the same original variables in data_kin]
            data.ids[ind] = f'TRF__{v}'

    return {'data': data, 'data_MI': data_MI, 'data_kin': data_kin, 'data_deps': data_deps,
            'data_tensor': data_tensor, 'data_graph': data_graph}

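# A hedged end-to-end sketch chaining the loader and `splitfactor`. It assumes
# an `args` dictionary as in the examples above, extended with the
# 'inputvar_scalar' and 'pre_transform' keys that `splitfactor` reads:
def _example_splitfactor(args):
    out  = load_root_file(root_path=['/path/to/data'], maxevents=None, args=args)
    reps = splitfactor(x=out['X'], y=out['Y'], w=out['W'], ids=out['ids'], args=args)

    # reps['data'] holds the active scalar inputs; reps['data_kin'] the kinematic
    # variables (plus 'raw_weight'); reps['data_MI'] the MI regularization targets
    return reps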