# B/RK analyzer main event loop (protocode)
#
# [CODE IS ROTTEN / UNFIXED due to awkward0 --> awkward1 changes]
#
#
# Mikael Mieskolainen, 2022
# m.mieskolainen@imperial.ac.uk
import copy
import uproot
import h5py
import awkward
import numpy as np
from tqdm import tqdm
from termcolor import colored
from iceplot import iceplot
from icenet.tools.io import *   # provides e.g. fastarray1
from icenet.tools import aux
from icenet.tools import prints
import icebrk.features as features
import icebrk.tools as tools
import icebrk.histos as histos
import icebrk.cutstats as cutstats
import icebrk.fasthistos as fasthistos
def hdf5_write_handles(filename, N_weights, rwmode='w'):
    """Create HDF5 file handles.

    Args:
        filename:  output HDF5 filename
        N_weights: number of weight datasets ('W0', 'W1', ...) to create
        rwmode:    h5py file mode (default 'w')

    Returns:
        f:        h5py file handle
        datasets: dictionary of resizable HDF5 datasets
    """
    f = h5py.File(filename, rwmode)

    # Keys: Observables
    datasets = dict.fromkeys(histos.obs_all.keys(), 0)
    init_shape = (0,)
    for key in datasets.keys():
        datasets[key] = f.create_dataset(key, shape=init_shape, maxshape=(None,),
            dtype=np.float32, compression='gzip', compression_opts=9)

    # Keys: Weights
    for i in range(N_weights):
        key = 'W' + str(i)
        datasets[key] = f.create_dataset(key, shape=init_shape, maxshape=(None,),
            dtype=np.float32, compression='gzip', compression_opts=9)

    return f, datasets
def hdf5_append(datasets, key, chunk):
    """Append a chunk of data to an HDF5 dataset.

    Args:
        datasets: dictionary of HDF5 datasets
        key:      target dataset key
        chunk:    1D array of values to append

    Returns:
        row_count: total number of rows after the append
    """
    row_count = datasets[key].size

    # Resize the dataset to accommodate the next chunk of rows
    datasets[key].resize((row_count + chunk.shape[0],))

    # Write the next chunk
    datasets[key][row_count:] = chunk

    # Return the updated row count
    row_count += chunk.shape[0]
    return row_count
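# Usage sketch of the two helpers above (illustrative only: the real call
# sites are in process() below, and the filename and chunk values here are
# made up):
#
#   f, dsets = hdf5_write_handles(filename='out.hd5', N_weights=1)
#   hdf5_append(datasets=dsets, key='W0', chunk=np.array([0.5, 1.0], dtype=np.float32))
#   f.close()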
'''
--------------------------------------------------------------------------
Powerset class encoding: each row of BMAT is a class, each column a
triplet slot (example with MAXT3 = 4 slots).

y_tru = [1, 0, 0, 0, 0]                     # true class (one-hot over classes)
y_est = [0.9, 0.05, 0.0001, 0.00001, 0.05]  # estimated class probabilities

Scheme 1: MAXN = 1 (at most one signal slot)
BMAT =
[0 0 0 0] = class 0 (no signal)
[0 0 0 1] = class 1 (signal in last slot)
[0 0 1 0] = class 2 ...
[0 1 0 0] = class 3 ...
[1 0 0 0] = class 4 (signal in first slot)
--------------------------------------------------------------------------
Scheme 2: MAXN <= MAXT3 (up to MAXN signal slots)
BMAT =
[0 0 0 0] = class 0 (no signal)
[0 0 0 1] = class 1 (signal in last slot)
[0 0 1 0] = class 2 (signal in second-to-last slot)
[0 0 1 1] = class 3 (signal objects in two slots)
...
--------------------------------------------------------------------------
'''
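# Illustrative sketch (an assumption, not this repository's construction;
# the actual BMAT is built elsewhere in icenet/icebrk): both schemes above
# correspond to enumerating all binary vectors of length MAXT3 with at most
# MAXN active slots, in lexicographic order.
def _bmat_sketch(MAXT3, MAXN):
    """Sketch: class-to-binary-slot matrix as described above."""
    import itertools
    rows = [v for v in itertools.product([0, 1], repeat=MAXT3) if sum(v) <= MAXN]
    return np.array(rows)

# e.g. _bmat_sketch(4, 1) reproduces the 5 rows of Scheme 1 above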
def poweranalysis(evt_index, batch_obs, obs, func_predict, x, y, qsets, MAXT3, MAXN, isMC, reco, BMAT, WNORM):
    """Powerset analysis of the event.

    N.B. this is already CONDITIONED on selecting at most MAXT3 triplets!

    Args:
        evt_index:    event index within the batch
        batch_obs:    batch (vectorized) observables
        obs:          event-by-event observables
        func_predict: list of prediction functions (used for data)
        x:            input vector of the event
        y:            MC truth output vector of the event
        qsets:        triplet (super)sets
        MAXT3:        maximum number of triplets
        MAXN:         maximum number of signal objects
        isMC:         Monte Carlo (truth available) or data
        reco:         weighted observable containers (filled here)
        BMAT:         class-to-binary-slot matrix
        WNORM:        weight normalization scheme ('event' or 'unit')

    Returns:
        w: dictionary of per-class weights
    """
    # Create weights
    w = dict()
    if isMC:
        # MC: pure one-hot class indicator from the truth vector y
        w['0'] = np.zeros(BMAT.shape[0])
        for c in range(BMAT.shape[0]):
            w['0'][c] = 1 if aux.binvec_are_equal(y, BMAT[c,:]) else 0
    else:
        # Data: per-class probabilities from each prediction function
        for i in range(len(func_predict)):
            w[str(i)] = (func_predict[i](np.array([x]))).flatten()

    # Get indices we use for the kinematic reconstruction [TODO: UPDATE HERE]
    set_ind = tools.get_first_indices(qsets, MAXT3=MAXT3)

    # Over all powerset combinations (one object, two objects, ...),
    # with the total number given by the number of rows in BMAT
    wnorm = {'S': 0, 'B': 0}
    for c in range(BMAT.shape[0]):

        # Event (object) normalization strategy
        if   WNORM == 'event':
            wnorm['S'] = np.sum(BMAT[c,:] == 1) # Number of signal objects
            wnorm['B'] = np.sum(BMAT[c,:] == 0) # Number of background objects
        elif WNORM == 'unit':
            wnorm['S'] = wnorm['B'] = 1.0
        else:
            raise Exception(__name__ + '.poweranalysis: unknown WNORM parameter')

        # Sweep over all triplets of this combination
        for j in range(len(BMAT[c,:])):

            # SIGNAL / BACKGROUND COMPONENT ACTIVE (= POWERSET DEFINITION)
            ID = 'S' if BMAT[c,j] else 'B'

            # Computed observables
            for key in histos.obs_all.keys():
                if key in batch_obs:
                    xx = awkward.to_list( batch_obs[key][evt_index][set_ind[j]] )

                    # [BROKEN: awkward1 can return a (length-1) list here
                    #  where awkward0 returned a scalar; unwrap as a workaround]
                    if isinstance(xx, list):
                        reco[ID]['x'][key].add( xx[0] )
                    else:
                        reco[ID]['x'][key].add( xx )

                elif key in obs:
                    reco[ID]['x'][key].add( obs[key] )

            # Weights
            if isMC:
                reco[ID]['w']['0'].add(w['0'][c] / wnorm[ID])
            else:
                for algo in w.keys():
                    reco[ID]['w'][algo].add(w[algo][c] / wnorm[ID])

    return w
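# Worked example of the MC weight logic above (values follow from the code;
# Scheme 1 BMAT with MAXT3 = 4 assumed): if the truth vector is
# y = [0, 0, 0, 1], then w['0'] = [0, 1, 0, 0, 0] (one-hot on class 1).
# With WNORM = 'event' and c = 1, the signal slot (j = 3) is filled with
# weight 1/1 and each background slot (j = 0, 1, 2) with weight 1/3; all
# other classes contribute zero-weight entries.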
def hist_flush(reco, hobj, h5datasets=None):
    """Histogram observables, accumulate onto previous histograms, and flush the buffer arrays.

    Args:
        reco:       weighted observable containers (buffers, flushed here)
        hobj:       histogram objects (accumulated here)
        h5datasets: optional HDF5 datasets to dump the buffers into
    """
    # ====================================================================
    ### Accumulate histograms
    for obs in reco['x'].keys():
        if (reco['x'][obs].size > 0): # Histogram only non-empty buffer
            for algo in reco['w'].keys():
                hobj[algo][obs] += iceplot.hist_obj(x=reco['x'][obs].values(),
                    weights=reco['w'][algo].values(), bins=histos.obs_all[obs]['bins'])

    # ====================================================================
    ### Dump data to HDF5 files
    if h5datasets is not None:
        for obs in reco['x'].keys():
            if histos.obs_all[obs]['pickle']:
                hdf5_append(datasets=h5datasets, key=obs, chunk=reco['x'][obs].values())
        for algo in reco['w'].keys():
            hdf5_append(datasets=h5datasets, key='W'+algo, chunk=reco['w'][algo].values())

    # ====================================================================
    ### Flush buffers

    # Observables
    for obs in reco['x'].keys():
        reco['x'][obs].reset()

    # Weights
    for algo in reco['w'].keys():
        reco['w'][algo].reset()
    # ====================================================================
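# Usage sketch (mirrors the call sites in process() below): flush each
# final-state container periodically and once more at the very end, e.g.
#
#   for ID in reco.keys():
#       hist_flush(reco=reco[ID], hobj=hobj[ID], h5datasets=h5datasets[ID])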
def initarrays(BUFFER, func_predict, isMC):
    """Initialize histogramming arrays and objects.

    Args:
        BUFFER:       buffer (array) allocation size
        func_predict: list of prediction functions (used for data)
        isMC:         Monte Carlo (truth available) or data

    Returns:
        reco: weighted observable buffer containers for 'S' and 'B'
        hobj: histogram objects for 'S' and 'B'
    """
    reco_EMPTY = {
        'x' : dict.fromkeys(histos.obs_all.keys()),
        'w' : dict()
    }
    for obs in histos.obs_all.keys():
        reco_EMPTY['x'][obs] = fastarray1(BUFFER)

    if isMC:
        reco_EMPTY['w']['0'] = fastarray1(BUFFER)
    else:
        for i in range(len(func_predict)):
            reco_EMPTY['w'][str(i)] = fastarray1(BUFFER)

    ## Final state event buffers
    reco = {'S': None, 'B': None}
    reco['S'] = copy.deepcopy(reco_EMPTY)
    reco['B'] = copy.deepcopy(reco_EMPTY)

    ## Final state histogram objects
    # N.B. use a comprehension, not dict.fromkeys(..., iceplot.hobj()):
    # fromkeys would make every observable share one histogram instance
    def new_hobjs():
        return {key: iceplot.hobj() for key in histos.obs_all.keys()}

    hobj = {'S': dict(), 'B': dict()}
    if isMC:
        hobj['S']['0'] = new_hobjs()
        hobj['B']['0'] = new_hobjs()
    else:
        for i in range(len(func_predict)):
            algo = str(i)
            hobj['S'][algo] = new_hobjs()
            hobj['B'][algo] = new_hobjs()

    return reco, hobj
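# Resulting layout (follows directly from the code above), e.g. for MC:
#
#   reco['S']['x'][obs]  -> fastarray1 buffer of observable values
#   reco['S']['w']['0']  -> fastarray1 buffer of event weights
#   hobj['S']['0'][obs]  -> iceplot.hobj histogram accumulator
#
# with 'S' (signal) mirrored by 'B' (background), and, for data, the single
# MC weight key '0' replaced by one key per prediction function.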
def process(paths=[], func_predict=None, isMC=True, MAXT3=5, MAXN=2, maxevents=10000000000,
            EVTGROUPSIZE=1024, CHUNKBUFFER=512, VERBOSE=False, BMAT=[], WNORM=[], SUPERSETS=True,
            hd5dir=None, outputXY=False, outputP=False, **kwargs):
    """Main event processing loop.

    Args:
        paths:        list of input ROOT file paths
        func_predict: list of prediction functions (required when isMC is False)
        isMC:         Monte Carlo (truth available) or data
        MAXT3:        maximum number of triplets
        MAXN:         maximum number of signal objects
        maxevents:    maximum number of events to process
        EVTGROUPSIZE: uproot iteration step size
        CHUNKBUFFER:  histogram buffer flush period (in passed events)
        VERBOSE:      verbose event printing
        BMAT:         class-to-binary-slot matrix (see the schemes above)
        WNORM:        weight normalization scheme ('event' or 'unit')
        SUPERSETS:    fuse connected triplets into supersets
        hd5dir:       HDF5 output directory (None disables dumping)
        outputXY:     collect training arrays X (and Y for MC)
        outputP:      collect prediction arrays P

    Returns:
        output: dictionary with keys 'X', 'Y', 'P', 'hobj'
    """
    # Check critical inputs
    if isMC is False and (func_predict is None or len(func_predict) == 0):
        raise Exception(__name__ + ".process: input 'func_predict' should not be empty")

    # --------------------------------------------------------------------
    # @ Basic variables @

    NCLASS  = BMAT.shape[0]            # Number of classes
    D       = features.getdimension()  # Number of dimensions per triplet
    DTOT    = D * MAXT3                # Total number of dimensions
    SORTKEY = 'BToKEE_fit_cos2D'       # Triplet ranking variable

    # --------------------------------------------------------------------
    # @ HDF5 dump @

    h5file     = {'S': None, 'B': None}
    h5datasets = {'S': None, 'B': None}

    if hd5dir is not None:
        if isMC:
            h5file['S'], h5datasets['S'] = hdf5_write_handles(filename=hd5dir + '/MC_S.hd5', N_weights=1)
            h5file['B'], h5datasets['B'] = hdf5_write_handles(filename=hd5dir + '/MC_B.hd5', N_weights=1)
        else:
            h5file['S'], h5datasets['S'] = hdf5_write_handles(filename=hd5dir + '/DA_S.hd5', N_weights=len(func_predict))
            h5file['B'], h5datasets['B'] = hdf5_write_handles(filename=hd5dir + '/DA_B.hd5', N_weights=len(func_predict))
    # --------------------------------------------------------------------
    # @ MC AI/ML-training arrays @

    X = None
    Y = None
    P = None

    x = None
    y = None

    if outputXY:
        X = np.zeros(shape=(maxevents, DTOT))
    if outputP:
        P = dict()
        for i in range(len(func_predict)): P[str(i)] = np.zeros(shape=(maxevents, NCLASS))
    if isMC:
        Y = np.zeros(shape=(maxevents, MAXT3))

    # --------------------------------------------------------------------
    # @ CUTS AND INFOFLOW @

    cutflow, infostats_BC, mcinfostats_BC = cutstats.init_stat_objects()
    infostats_AC   = copy.deepcopy(infostats_BC)
    mcinfostats_AC = copy.deepcopy(mcinfostats_BC)

    # --------------------------------------------------------------------
    # @ WEIGHTED OBSERVABLE CONTAINERS @

    T = CHUNKBUFFER * (BMAT.shape[0] * BMAT.shape[1])
    reco, hobj = initarrays(BUFFER=T, func_predict=func_predict, isMC=isMC)

    # --------------------------------------------------------------------
    # @ FAST HISTOGRAMS @
    #hobjraw_MC = fasthistos.initialize()

    # --------------------------------------------------------------------
    # @ LOOP VARIABLES @

    passcount = 0
    evtcount  = 0

    l1_p4 = dict()
    l2_p4 = dict()
    k_p4  = dict()
    # ==================================================================== ROOT file loop >>
    def print_input(events, path):
        prints.printbar()
        print(__name__ + f'.process: Input: {path}')
        print(__name__ + f'.process: {events.name} {events.title}')
        print(__name__ + f'.process: D = {features.getdimension()}')
        print(__name__ + f'.process: DTOT = {DTOT} (D x MAXT3)')
        print(__name__ + f'.process: NCLASS = {NCLASS}')
        prints.printbar()
        print(__name__ + f'.process: MAXT3 = {MAXT3}')
        print(__name__ + f'.process: MAXN = {MAXN}')
        print(__name__ + f'.process: WNORM = {WNORM}')
        print(__name__ + f'.process: SUPERSETS = {SUPERSETS}')
        print('')
        print(__name__ + f'.process: maxevents = {maxevents}')
        prints.printbar()

    # Loop over different files
    for z in range(len(paths)):

        file = uproot.open(paths[z])
        print(file)
        events = file['Events']

        if z == 0:
            events.show()
            print_input(events, paths[z])

        ## Loop over this file
        for evt_i, evtgroup in enumerate(events.iterate(step_size=EVTGROUPSIZE)):
            print(f'evtgroup = {evt_i} [EVTGROUPSIZE = {EVTGROUPSIZE}]')

            d = copy.deepcopy(evtgroup)
            # --------------------------------------------------------------------
            # *** FULLY VECTORIZED KINEMATICS CONSTRUCTION (JAGGED/COLUMNAR) ***

            ## Construct new branches
            features.construct_new_branches(d)

            ## Construct kinematics (4-vectors)
            tools.construct_kinematics(d, l1_p4, l2_p4, k_p4)

            ## Construct final state observables
            batch_obs = histos.calc_batch_observables(l1_p4, l2_p4, k_p4)

            if isMC:
                tools.construct_MC_tree(d)
                tools.construct_MC_truth(d)
                batch_MC_obs = histos.calc_batch_MC_observables(d, l1_p4, l2_p4, k_p4)

            # --------------------------------------------------------------------
            # Loop over events of this group
            N_EVENTS = len(d['event'])
            iterator = tqdm(range(N_EVENTS))
            for evt_index in iterator:

                if (evtcount == maxevents):
                    print(colored(__name__ + f': Maximum event count {maxevents} reached', 'red'))
                    iterator.close() # So we don't get ghost messages
                    break # this event group
                # ====================================================================
                ## EVENT SELECTION AND COMBINATORICS
                # ====================================================================
                evtcount += 1

                ## Find triplets merged together
                if SUPERSETS:
                    qsets = aux.los2lol(tools.find_connected_triplets(evt_index, l1_p4, l2_p4, k_p4))
                # All individually, no fusion
                else:
                    qsets = []
                    for kk in range( len(awkward.to_list(l1_p4['e'][evt_index])) ):
                        qsets.append( [kk] )

                # Sort the triplets
                def svprob_rank(tripletset):
                    # Rank by the first triplet of the (possible) superset
                    # [N.B. possibly broken after the awkward0 -> awkward1 migration]
                    return d[SORTKEY][evt_index][tripletset[0]]

                qsets.sort(key=svprob_rank, reverse=True) # Reverse gives the biggest values first!

                # ====================================================================
                # FILL FAST HISTOGRAMS [TODO: unfinished]
                '''
                # Get indices we use for the kinematic reconstruction
                set_ind = tools.get_first_indices(qsets, MAXT3=MAXT3)
                for obs in fasthistos['S'].keys():
                    ...
                '''
                # ====================================================================
                ## Construct input (and output) vectors
                x = tools.construct_input_vec(evt_index, d, l1_p4=l1_p4, l2_p4=l2_p4, k_p4=k_p4, qsets=qsets, MAXT3=MAXT3)
                if outputXY:
                    X[passcount,:] = copy.deepcopy(x)

                if isMC:
                    y = tools.construct_output_vec(evt_index, d, qsets=qsets, MAXT3=MAXT3)
                    if outputXY:
                        Y[passcount,:] = copy.deepcopy(y)

                # ====================================================================
                ## CUTS
                # ====================================================================
                cutstats.collect_info_stats(d=d, evt_index=evt_index, infostats=infostats_BC)
                if isMC:
                    cutstats.collect_mcinfo_stats(d=d, evt_index=evt_index, y=y,
                        qsets=qsets, MAXT3=MAXT3, mcinfostats=mcinfostats_BC)

                # --------------------------------------------------------------------
                if not cutstats.apply_cuts(d, evt_index, cutflow):
                    continue
                # --------------------------------------------------------------------

                cutstats.collect_info_stats(d=d, evt_index=evt_index, infostats=infostats_AC)
                if isMC:
                    cutstats.collect_mcinfo_stats(d=d, evt_index=evt_index, y=y,
                        qsets=qsets, MAXT3=MAXT3, mcinfostats=mcinfostats_AC)

                # ====================================================================
                # Event-by-event (non-vectorizable) observables
                obs = histos.calc_observables(evt_index, d, l1_p4, l2_p4, k_p4, qsets, MAXT3)

                if isMC:
                    MC_obs = histos.calc_MC_observables(evt_index, d, l1_p4, l2_p4, k_p4, qsets, MAXT3)
                    if VERBOSE:
                        tools.print_MC_event(evt_index, d, l1_p4, l2_p4, k_p4, qsets)

                # ====================================================================
                ## ANALYSIS
                # ====================================================================
                if func_predict is not None:

                    ## Analyze
                    w = poweranalysis(evt_index=evt_index, batch_obs=batch_obs, obs=obs, func_predict=func_predict,
                        x=x, y=y, qsets=qsets, MAXT3=MAXT3, MAXN=MAXN, reco=reco, isMC=isMC, BMAT=BMAT, WNORM=WNORM)

                    if outputP:
                        for i in range(len(func_predict)): P[str(i)][passcount,:] = w[str(i)]

                # Fill histograms & flush buffers
                if (passcount > 0) and (passcount % (CHUNKBUFFER-1) == 0):
                    for ID in reco.keys(): hist_flush(reco=reco[ID], hobj=hobj[ID], h5datasets=h5datasets[ID])

                ## Update counts
                passcount += 1

    # Last fill histograms & flush buffers
    for ID in reco.keys(): hist_flush(reco=reco[ID], hobj=hobj[ID], h5datasets=h5datasets[ID])
    # ==================================================================== << ROOT file loop

    # Close HDF5 files
    if hd5dir is not None:
        for ID in reco.keys(): h5file[ID].close()

    ## Remove unused buffer space (rows 0 .. passcount-1 are filled)
    if outputXY:
        X = X[0:passcount,:]
        if isMC:
            Y = Y[0:passcount,:]
    if outputP:
        for i in range(len(func_predict)):
            P[str(i)] = P[str(i)][0:passcount,:]

    ## Update total event counts
    cutflow['total']      = evtcount
    infostats_BC['total'] = evtcount
    infostats_AC['total'] = evtcount

    if isMC:
        mcinfostats_BC['total'] = evtcount
        mcinfostats_AC['total'] = evtcount

    ## Print flows
    print('\n<< INFOSTATS [BC] >>'); prints.print_flow(infostats_BC)
    if isMC: print('\n<< MC-ONLY-INFOSTATS [BC] >>'); prints.print_flow(mcinfostats_BC)
    print('\n<< CUTFLOW >>'); prints.print_flow(cutflow)
    print('\n<< INFOSTATS [AC] >>'); prints.print_flow(infostats_AC)
    if isMC: print('\n<< MC-ONLY-INFOSTATS [AC] >>'); prints.print_flow(mcinfostats_AC)
    print('\n')

    output = {
        'X'    : X,
        'Y'    : Y,
        'P'    : P,
        'hobj' : hobj
    }
    return output
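# Usage sketch of process() (illustrative only: the path and the parameter
# values here are hypothetical placeholders, and _bmat_sketch is the example
# helper defined above, not this repository's BMAT construction):
#
#   BMAT   = _bmat_sketch(MAXT3=5, MAXN=2)
#   output = process(paths=['nanoaod.root'], isMC=True, MAXT3=5, MAXN=2,
#                    maxevents=100000, BMAT=BMAT, WNORM='event',
#                    outputXY=True, hd5dir=None)
#   X, Y, hobj = output['X'], output['Y'], output['hobj']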