Source code for icebrk.loop

# B/RK analyzer main event loop (protocode)
#
# [CODE IS ROTTEN / UNFIXED due to awkward0 --> awkward1 changes]
#
#
# Mikael Mieskolainen, 2022
# m.mieskolainen@imperial.ac.uk


import copy
import uproot
import h5py
import awkward

import numpy as np
from tqdm import tqdm
from termcolor import colored

from iceplot import iceplot
from icenet.tools.io import *
from icenet.tools import aux
from icenet.tools import prints

import icebrk.features as features
import icebrk.tools as tools
import icebrk.histos as histos
import icebrk.cutstats as cutstats
import icebrk.fasthistos as fasthistos


def hdf5_write_handles(filename, N_weights, rwmode='w'):
    """ Create HDF5 file handles.

    Args:
        filename:  output filename
        N_weights: number of weight datasets to create
        rwmode:    read/write mode

    Returns:
        f:        HDF5 file handle
        datasets: dictionary of HDF5 datasets
    """
    f = h5py.File(filename, rwmode)

    # Keys: Observables
    datasets   = dict.fromkeys(histos.obs_all.keys(), 0)
    init_shape = (0,)

    for key in datasets.keys():
        datasets[key] = f.create_dataset(key, shape=init_shape, maxshape=(None,),
            dtype=np.float32, compression='gzip', compression_opts=9)

    # Keys: Weights
    for i in range(N_weights):
        key = 'W' + str(i)
        datasets[key] = f.create_dataset(key, shape=init_shape, maxshape=(None,),
            dtype=np.float32, compression='gzip', compression_opts=9)

    return f, datasets
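# A minimal usage sketch of the handle creation above (hypothetical filename;
# assumes histos.obs_all is populated):
#
#   f, datasets = hdf5_write_handles(filename='output.hd5', N_weights=2)
#   print(list(datasets.keys()))  # observable keys plus 'W0', 'W1'
#   f.close()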
def hdf5_append(datasets, key, chunk):
    """ Append a chunk of data to an HDF5 dataset.

    Args:
        datasets: dictionary of HDF5 datasets
        key:      dataset key
        chunk:    1D array of values to append

    Returns:
        row_count: new total number of rows in the dataset
    """
    row_count = datasets[key].size

    # Resize the dataset to accommodate the next chunk of rows
    datasets[key].resize((row_count + chunk.shape[0],))

    # Write the next chunk
    datasets[key][row_count:] = chunk

    # Increment the row count
    row_count += chunk.shape[0]

    return row_count
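# Usage sketch of the resize-and-append pattern above (hypothetical chunk
# values; 'W0' exists because N_weights >= 1 was requested at creation):
#
#   chunk  = np.random.rand(128).astype(np.float32)
#   n_rows = hdf5_append(datasets=datasets, key='W0', chunk=chunk)
#   # The 'W0' dataset has grown by 128 rows; n_rows is its new total size.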
'''
--------------------------------------------------------------------------
y_tru = [1,   0,    0,      0,       0]
y_est = [0.9, 0.05, 0.0001, 0.00001, 0.05]

Scheme 1: MAXN = 1

BMAT = [0 0 0 0]  = class 0 (no signal)
       [0 0 0 1]  = class 1 (signal in last slot)
       [0 0 1 0]  = class 2 ...
       [0 1 0 0]  = class 3 ...
       [1 0 0 0]  = class 4 (signal in first slot)

--------------------------------------------------------------------------

Scheme 2: MAXN <= MAXT3

BMAT = [0 0 0 0]  = class 0 (no signal)
       [0 0 0 1]  = class 1 (signal in last slot)
       [0 0 1 0]  = class 2 (signal in second last slot)
       [0 0 1 1]  = class 3 (signal objects in two slots)
       ...
'''
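# A sketch of how a Scheme 2 style class matrix could be enumerated
# (illustration only; the production BMAT is constructed elsewhere and
# passed in to process() below):
#
#   import itertools
#   MAXT3, MAXN = 4, 2
#   rows = [v for v in itertools.product([0, 1], repeat=MAXT3) if sum(v) <= MAXN]
#   BMAT = np.array(rows)  # shape (NCLASS, MAXT3); row 0 = [0 0 0 0] (no signal)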
def poweranalysis(evt_index, batch_obs, obs, func_predict, x, y, qsets,
                  MAXT3, MAXN, isMC, reco, BMAT, WNORM):
    """ Powerset analysis of the event.

    N.B. this is already CONDITIONED on selecting at most MAXT3 triplets!

    Args:
        evt_index:    event index within the batch
        batch_obs:    batch (vectorized) observables
        obs:          per-event observables
        func_predict: list of prediction functions
        x:            input vector
        y:            MC truth output vector
        qsets:        triplet (super)sets
        MAXT3:        maximum number of triplets
        MAXN:         maximum number of signal objects
        isMC:         Monte Carlo mode
        reco:         reconstruction buffers
        BMAT:         powerset binary (class) matrix
        WNORM:        weight normalization scheme

    Returns:
        w: event weights per class
    """
    # Create weights
    w = dict()
    if isMC:
        w['0'] = np.zeros(BMAT.shape[0])
        for c in range(BMAT.shape[0]):
            w['0'][c] = 1 if aux.binvec_are_equal(y, BMAT[c,:]) else 0
    else:
        for i in range(len(func_predict)):
            w[str(i)] = (func_predict[i](np.array([x]))).flatten()

    # Get indices we use for the kinematic reconstruction [UPDATE HERE **** !!!]
    set_ind = tools.get_first_indices(qsets, MAXT3=MAXT3)

    # Over all powerset combinations (one object, two objects, ...)
    # with the total number being dependent on the number of rows in BMAT
    wnorm = {'S': 0, 'B': 0}

    for c in range(BMAT.shape[0]):

        # Event (object) normalization strategy
        if   WNORM == 'event':
            wnorm['S'] = np.sum(BMAT[c,:] == 1) # Number of signal objects
            wnorm['B'] = np.sum(BMAT[c,:] == 0) # Number of background objects
        elif WNORM == 'unit':
            wnorm['S'] = wnorm['B'] = 1.0
        else:
            raise Exception(__name__ + '.poweranalysis: unknown WNORM parameter')

        # Sweep over all triplets of this combination
        for j in range(len(BMAT[c,:])):

            # SIGNAL / BACKGROUND COMPONENT ACTIVE (= POWERSET DEFINITION)
            ID = 'S' if BMAT[c,j] else 'B'

            # Computed observables
            for key in histos.obs_all.keys():

                if key in batch_obs:
                    xx = awkward.to_list( batch_obs[key][evt_index][set_ind[j]] )

                    # ** SOMETHING IS BROKEN HERE, WE SHOULD NOT HAVE A LIST AT THIS POINT **
                    if isinstance(xx, list):
                        reco[ID]['x'][key].add( xx[0] )
                    else:
                        reco[ID]['x'][key].add( xx )

                elif key in obs:
                    reco[ID]['x'][key].add( obs[key] )

            # Weights
            if isMC:
                reco[ID]['w']['0'].add(w['0'][c] / wnorm[ID])
            else:
                for algo in w.keys():
                    reco[ID]['w'][algo].add(w[algo][c] / wnorm[ID])

    return w
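# For MC, the weight vector w['0'] computed above is a one-hot indicator of
# which BMAT row matches the truth vector y (hypothetical values):
#
#   BMAT = np.array([[0,0], [0,1], [1,0], [1,1]])
#   y    = np.array([1, 0])
#   w0   = np.array([1.0 if np.array_equal(y, BMAT[c,:]) else 0.0
#                    for c in range(BMAT.shape[0])])
#   # -> [0., 0., 1., 0.]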
def hist_flush(reco, hobj, h5datasets=None):
    """ Histogram observables with accumulation over previous histograms,
    and flush (reset) the buffer arrays.

    Args:
        reco:       reconstruction buffers
        hobj:       histogram objects
        h5datasets: HDF5 datasets (optional)
    """
    # ====================================================================
    ### Accumulate histograms

    for obs in reco['x'].keys():
        if (reco['x'][obs].size > 0): # Histogram only non-empty buffer
            for algo in reco['w'].keys():
                hobj[algo][obs] += iceplot.hist_obj(x=reco['x'][obs].values(),
                    weights=reco['w'][algo].values(), bins=histos.obs_all[obs]['bins'])

    # ====================================================================
    ### Dump data to HDF5 files

    if h5datasets is not None:
        for obs in reco['x'].keys():
            if histos.obs_all[obs]['pickle']:
                hdf5_append(datasets=h5datasets, key=obs, chunk=reco['x'][obs].values())

        for algo in reco['w'].keys():
            hdf5_append(datasets=h5datasets, key='W'+algo, chunk=reco['w'][algo].values())

    # ====================================================================
    ### Flush buffers

    # Observables
    for obs in reco['x'].keys():
        reco['x'][obs].reset()

    # Weights
    for algo in reco['w'].keys():
        reco['w'][algo].reset()
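# Flush pattern used by process() below: flush whenever the chunk buffer
# fills, and once more after the event loop so tail events are not lost:
#
#   if passcount > 0 and passcount % (CHUNKBUFFER - 1) == 0:
#       for ID in reco.keys():
#           hist_flush(reco=reco[ID], hobj=hobj[ID], h5datasets=h5datasets[ID])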
# ====================================================================
def initarrays(BUFFER, func_predict, isMC):
    """ Initialize histogramming arrays and objects.

    Args:
        BUFFER:       buffer size
        func_predict: list of prediction functions
        isMC:         Monte Carlo mode

    Returns:
        reco: reconstruction buffers
        hobj: histogram objects
    """
    reco_EMPTY = {
        'x' : dict.fromkeys(histos.obs_all.keys()),
        'w' : dict()
    }
    for obs in histos.obs_all.keys():
        reco_EMPTY['x'][obs] = fastarray1(BUFFER)

    if isMC:
        reco_EMPTY['w']['0'] = fastarray1(BUFFER)
    else:
        for i in range(len(func_predict)):
            reco_EMPTY['w'][str(i)] = fastarray1(BUFFER)

    ## Final state event buffers
    reco = {'S': None, 'B': None}
    reco['S'] = copy.deepcopy(reco_EMPTY)
    reco['B'] = copy.deepcopy(reco_EMPTY)

    ## Final state histogram objects
    # N.B. dict comprehensions here on purpose: dict.fromkeys(keys, iceplot.hobj())
    # would make every key alias one single hobj instance
    hobj = {'S': dict(), 'B': dict()}

    if isMC:
        hobj['S']['0'] = {key: iceplot.hobj() for key in histos.obs_all.keys()}
        hobj['B']['0'] = {key: iceplot.hobj() for key in histos.obs_all.keys()}
    else:
        for i in range(len(func_predict)):
            algo = str(i)
            hobj['S'][algo] = {key: iceplot.hobj() for key in histos.obs_all.keys()}
            hobj['B'][algo] = {key: iceplot.hobj() for key in histos.obs_all.keys()}

    return reco, hobj
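# Why the dict comprehensions above instead of dict.fromkeys: fromkeys
# evaluates its value argument once, so every key would alias the same
# object. Minimal demonstration of the pitfall:
#
#   shared = dict.fromkeys(['a', 'b'], [])
#   shared['a'].append(1)
#   # shared['b'] is now [1] as well -- both keys point to one list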
def process(paths=[], func_predict=None, isMC=True, MAXT3=5, MAXN=2,
            maxevents=10000000000, EVTGROUPSIZE=1024, CHUNKBUFFER=512,
            VERBOSE=False, BMAT=[], WNORM=[], SUPERSETS=True, hd5dir=None,
            outputXY=False, outputP=False, **kwargs):
    """ Main event processing loop.

    Args:
        paths:        list of input ROOT file paths
        func_predict: list of prediction functions
        isMC:         Monte Carlo mode
        MAXT3:        maximum number of triplets
        MAXN:         maximum number of signal objects
        maxevents:    maximum number of events to process
        EVTGROUPSIZE: event group (iteration step) size
        CHUNKBUFFER:  chunk buffer size
        VERBOSE:      verbose output
        BMAT:         powerset binary (class) matrix
        WNORM:        weight normalization scheme
        SUPERSETS:    fuse connected triplets into supersets
        hd5dir:       HDF5 output directory (optional)
        outputXY:     return AI/ML input (X) and output (Y) arrays
        outputP:      return prediction (P) arrays

    Returns:
        output: dictionary with keys 'X', 'Y', 'P' and 'hobj'
    """
    # Check critical inputs
    if isMC is False and not func_predict:
        raise Exception(__name__ + ".process: input 'func_predict' should not be empty")

    # --------------------------------------------------------------------
    # @ Basic variables @

    NCLASS  = BMAT.shape[0]            # Number of classes
    D       = features.getdimension()  # Number of dimensions per triplet
    DTOT    = D * MAXT3                # Total number of dimensions
    SORTKEY = 'BToKEE_fit_cos2D'       # Triplet ranking variable

    # --------------------------------------------------------------------
    # @ HDF5 dump @

    h5file     = {'S': None, 'B': None}
    h5datasets = {'S': None, 'B': None}

    if hd5dir is not None:
        if isMC:
            h5file['S'], h5datasets['S'] = hdf5_write_handles(filename=hd5dir + '/MC_S.hd5', N_weights=1)
            h5file['B'], h5datasets['B'] = hdf5_write_handles(filename=hd5dir + '/MC_B.hd5', N_weights=1)
        else:
            h5file['S'], h5datasets['S'] = hdf5_write_handles(filename=hd5dir + '/DA_S.hd5', N_weights=len(func_predict))
            h5file['B'], h5datasets['B'] = hdf5_write_handles(filename=hd5dir + '/DA_B.hd5', N_weights=len(func_predict))

    # --------------------------------------------------------------------
    # @ MC AI/ML-training arrays @

    X = None
    Y = None
    P = None
    x = None
    y = None

    if outputXY:
        X = np.zeros(shape=(maxevents, DTOT))
    if outputP:
        P = dict()
        for i in range(len(func_predict)):
            P[str(i)] = np.zeros(shape=(maxevents, NCLASS))
    if isMC:
        Y = np.zeros(shape=(maxevents, MAXT3))

    # --------------------------------------------------------------------
    # @ CUTS AND INFOFLOW @

    cutflow, infostats_BC, mcinfostats_BC = cutstats.init_stat_objects()
    infostats_AC   = copy.deepcopy(infostats_BC)
    mcinfostats_AC = copy.deepcopy(mcinfostats_BC)

    # --------------------------------------------------------------------
    # @ WEIGHTED OBSERVABLE CONTAINERS @

    T = CHUNKBUFFER * (BMAT.shape[0] * BMAT.shape[1])
    reco, hobj = initarrays(BUFFER=T, func_predict=func_predict, isMC=isMC)

    # --------------------------------------------------------------------
    # @ FAST HISTOGRAMS @

    #hobjraw_MC = fasthistos.initialize()

    # --------------------------------------------------------------------
    # @ LOOP VARIABLES @

    passcount = 0
    evtcount  = 0

    l1_p4 = dict()
    l2_p4 = dict()
    k_p4  = dict()

    # ==================================================================== ROOT file loop >>

    def print_input(events):
        prints.printbar()
        print(__name__ + f'.process: Input: {paths[z]}')
        print(__name__ + f'.process: {events.name} {events.title}')
        print(__name__ + f'.process: D      = {features.getdimension()}')
        print(__name__ + f'.process: DTOT   = {DTOT} (D x MAXT3)')
        print(__name__ + f'.process: NCLASS = {NCLASS}')
        prints.printbar()
        print(__name__ + f'.process: MAXT3     = {MAXT3}')
        print(__name__ + f'.process: MAXN      = {MAXN}')
        print(__name__ + f'.process: WNORM     = {WNORM}')
        print(__name__ + f'.process: SUPERSETS = {SUPERSETS}')
        print('')
        print(__name__ + f'.process: maxevents = {maxevents}')
        prints.printbar()

    # Loop over different files
    for z in range(len(paths)):

        file = uproot.open(paths[z])
        print(file)
        events = file['Events']
        if z == 0:
            events.show()
        print_input(events)

        ## Loop over this file
        skips = 0
        for evt_i, evtgroup in enumerate(events.iterate(step_size=EVTGROUPSIZE)):
            print(f'evtgroup = {evt_i} [EVTGROUPSIZE = {EVTGROUPSIZE}]')
            d = copy.deepcopy(evtgroup)

            """
            if not isMC:
                if skips < 1: # Skip events hack
                    print('Skipping evtgroup due to simulation')
                    skips += 1
                    continue
            """

            # --------------------------------------------------------------------
            # *** FULLY VECTORIZED KINEMATICS CONSTRUCTION (JAGGED/COLUMNAR) ***

            ## Construct new branches
            features.construct_new_branches(d)

            ## Construct kinematics (4-vectors)
            tools.construct_kinematics(d, l1_p4, l2_p4, k_p4)

            ## Construct final state observables
            batch_obs = histos.calc_batch_observables(l1_p4, l2_p4, k_p4)

            if isMC:
                tools.construct_MC_tree(d)
                tools.construct_MC_truth(d)
                batch_MC_obs = histos.calc_batch_MC_observables(d, l1_p4, l2_p4, k_p4)

            # --------------------------------------------------------------------
            # Loop over events of this group
            N_EVENTS = len(d['event'])
            iterator = tqdm(range(N_EVENTS))

            for evt_index in iterator:

                if (evtcount == maxevents):
                    print(colored(__name__ + f': Maximum event count {maxevents} reached', 'red'))
                    iterator.close() # So we don't get ghost messages
                    break # this event group

                # ====================================================================
                ## EVENT SELECTION AND COMBINATORICS
                # ====================================================================
                evtcount += 1

                ## Find triplets merged together
                if SUPERSETS:
                    qsets = aux.los2lol(tools.find_connected_triplets(evt_index, l1_p4, l2_p4, k_p4))
                ## All individually, no fusion
                else:
                    qsets = []
                    for kk in range( len(awkward.to_list(l1_p4['e'][evt_index])) ):
                        qsets.append( [kk] )

                # Sort the triplets
                def svprob_rank(tripletset):
                    # Return first of the (possible) superset [** SOMETHING BROKEN HERE PROBABLY **]
                    for ind in tripletset:
                        return d[SORTKEY][evt_index][ind]

                qsets.sort(key=svprob_rank, reverse=True) # Reverse gives the biggest values first!

                # ====================================================================
                # FILL FAST HISTOGRAMS
                '''
                # Get indices we use for the kinematic reconstruction [UPDATE HERE **** !!!]
                set_ind = tools.get_first_indices(qsets, MAXT3=MAXT3)

                for obs in fasthistos['S'].keys():
                    for set_ind
                '''
                # ====================================================================

                ## Construct input (and output) vectors
                x = tools.construct_input_vec(evt_index, d, l1_p4=l1_p4, l2_p4=l2_p4, k_p4=k_p4, qsets=qsets, MAXT3=MAXT3)
                if outputXY:
                    X[passcount,:] = copy.deepcopy(x)

                if isMC:
                    y = tools.construct_output_vec(evt_index, d, qsets=qsets, MAXT3=MAXT3)
                    if outputXY:
                        Y[passcount,:] = copy.deepcopy(y)

                # ====================================================================
                ## CUTS
                # ====================================================================
                cutstats.collect_info_stats(d=d, evt_index=evt_index, infostats=infostats_BC)
                if isMC:
                    cutstats.collect_mcinfo_stats(d=d, evt_index=evt_index, y=y, qsets=qsets,
                        MAXT3=MAXT3, mcinfostats=mcinfostats_BC)
                # --------------------------------------------------------------------
                if not cutstats.apply_cuts(d, evt_index, cutflow):
                    continue
                # --------------------------------------------------------------------
                cutstats.collect_info_stats(d=d, evt_index=evt_index, infostats=infostats_AC)
                if isMC:
                    cutstats.collect_mcinfo_stats(d=d, evt_index=evt_index, y=y, qsets=qsets,
                        MAXT3=MAXT3, mcinfostats=mcinfostats_AC)

                # ====================================================================
                # Event-by-event (non-vectorizable) observables
                obs = histos.calc_observables(evt_index, d, l1_p4, l2_p4, k_p4, qsets, MAXT3)
                if isMC:
                    MC_obs = histos.calc_MC_observables(evt_index, d, l1_p4, l2_p4, k_p4, qsets, MAXT3)

                if VERBOSE:
                    tools.print_MC_event(evt_index, d, l1_p4, l2_p4, k_p4, qsets)

                # ====================================================================
                ## ANALYSIS
                # ====================================================================
                if func_predict is not None:

                    ## Analyze
                    w = poweranalysis(evt_index=evt_index, batch_obs=batch_obs, obs=obs,
                        func_predict=func_predict, x=x, y=y, qsets=qsets, MAXT3=MAXT3, MAXN=MAXN,
                        reco=reco, isMC=isMC, BMAT=BMAT, WNORM=WNORM)

                    if outputP:
                        for i in range(len(func_predict)):
                            P[str(i)][passcount,:] = w[str(i)]

                # Fill histograms & flush buffers
                if passcount > 0 and passcount % (CHUNKBUFFER-1) == 0:
                    for ID in reco.keys():
                        hist_flush(reco=reco[ID], hobj=hobj[ID], h5datasets=h5datasets[ID])

                ## Update counts
                passcount += 1

    # Last fill histograms & flush buffers
    for ID in reco.keys():
        hist_flush(reco=reco[ID], hobj=hobj[ID], h5datasets=h5datasets[ID])

    # ==================================================================== << ROOT file loop

    # Close HDF5 files
    if hd5dir is not None:
        for ID in reco.keys():
            h5file[ID].close()

    ## Remove (unused) buffer space; rows 0 ... passcount-1 are filled
    if outputXY:
        X = X[0:passcount,:]
        if isMC:
            Y = Y[0:passcount,:]
    if outputP:
        for i in range(len(func_predict)):
            P[str(i)] = P[str(i)][0:passcount,:]

    ## Update total event counts
    cutflow['total']      = evtcount
    infostats_BC['total'] = evtcount
    infostats_AC['total'] = evtcount
    if isMC:
        mcinfostats_BC['total'] = evtcount
        mcinfostats_AC['total'] = evtcount

    ## Print flows
    print('\n<< INFOSTATS [BC] >>');              prints.print_flow(infostats_BC)
    if isMC:
        print('\n<< MC-ONLY-INFOSTATS [BC] >>');  prints.print_flow(mcinfostats_BC)
    print('\n<< CUTFLOW >>');                     prints.print_flow(cutflow)
    print('\n<< INFOSTATS [AC] >>');              prints.print_flow(infostats_AC)
    if isMC:
        print('\n<< MC-ONLY-INFOSTATS [AC] >>');  prints.print_flow(mcinfostats_AC)
    print('\n')

    output = {
        'X'    : X,
        'Y'    : Y,
        'P'    : P,
        'hobj' : hobj
    }
    return output
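# A minimal invocation sketch (hypothetical path and settings; BMAT as
# enumerated in the scheme comment above):
#
#   output = process(paths=['events.root'], func_predict=None, isMC=True,
#                    BMAT=BMAT, WNORM='unit', outputXY=True)
#   X, Y, hobj = output['X'], output['Y'], output['hobj']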