Source code for iceid.common

# Common input & data reading routines for the electron ID
#
# m.mieskolainen@imperial.ac.uk, 2024

import numpy as np
import uproot
from importlib import import_module
import time
import ray
import os

from icenet.tools import io, aux, prints, iceroot
from iceid import graphio

# ------------------------------------------
from icenet import print
# ------------------------------------------

# Globals
from configs.eid.mctargets import *
from configs.eid.mcfilter  import *
from configs.eid.cuts import *



[docs]
def load_root_file(root_path, ids=None, entry_start=0, entry_stop=None, maxevents=None, args=None, library='np'):
    """ Loads the root files.
    
    Args:
        root_path : paths to root files (list)
    
    Returns:
        X:     columnar data
        Y:     class labels
        W:     event weights
        ids:   columnar variable string (list)
        info:  trigger and pre-selection acceptance x efficiency information (dict)
    """
    inputvars = import_module("configs." + args["rootname"] + "." + args["inputvars"])
    
    if type(root_path) is not list:
        root_path = [root_path] # Make sure it is a list, even if one file only
    
    # -----------------------------------------------
    CUTFUNC    = globals()[args['cutfunc']]
    TARFUNC    = globals()[args['targetfunc']]
    FILTERFUNC = globals()[args['filterfunc']]
    # -----------------------------------------------

    print('')
    print(f'Loading root file {root_path}', 'yellow')
    
    # Check is it MC (based on the first file and first event)
    file   = uproot.open(root_path[0])
    events = file[args['tree_name']]
    isMC   = bool(events.arrays('is_mc')[0]['is_mc'])

    # --------------------------------------------------------------
    # Load all files

    LOAD_VARS = inputvars.LOAD_VARS # Which variables do we read
    
    X,ids = iceroot.load_tree(rootfile=root_path, tree=args['tree_name'],
        entry_start=entry_start, entry_stop=entry_stop, maxevents=maxevents, ids=LOAD_VARS, library=library,
        num_cpus=args['num_cpus'])
    Y = None
    # --------------------------------------------------------------
    
    print(f'X.shape = {X.shape}')
    io.showmem()
    prints.printbar()

    # =================================================================
    # *** MC ONLY ***

    if isMC:

        # @@ MC class target definition here @@
        print(f'Computing MC <targetfunc> ...', 'yellow')
        Y = TARFUNC(X=X, ids=ids, xcorr_flow=args['xcorr_flow']).astype(np.int32)
        print(__name__ + f'Y.shape = {Y.shape}')
        
        # @@ MC filtering done here @@
        print(f'Computing MC <filterfunc> ...', 'yellow')
        mask_mc = FILTERFUNC(X=X, ids=ids, xcorr_flow=args['xcorr_flow'])
        print(f'<filterfunc> | before: {len(X)}, after: {sum(mask_mc)} events', 'green')
        prints.printbar()
        
        X = X[mask_mc]
        Y = Y[mask_mc].squeeze() # Remove useless dimension
    
    # =================================================================

    # @@ Observable cut selections done here @@
    print(f'Computing <cutfunc> ...', 'yellow')
    cmask = CUTFUNC(X=X, ids=ids, xcorr_flow=args['xcorr_flow'])
    print(f"<cutfunc> | before: {len(X)}, after: {np.sum(cmask)} events \n", 'green')
    
    X = X[cmask]
    if isMC: Y = Y[cmask]
    
    io.showmem()
    prints.printbar()
    file.close()

    # Trivial weights
    W = np.ones(len(X))

    # TBD add cut statistics etc. info here
    info = {}
    
    # ** Crucial -- randomize order to avoid problems with other functions **
    rand = np.random.permutation(len(X))
    X    = X[rand].squeeze() # Squeeze removes additional [] dimension
    Y    = Y[rand].squeeze()
    W    = W[rand].squeeze()

    return {'X':X, 'Y':Y, 'W':W, 'ids':ids, 'info':info}




[docs]
def splitfactor(x, y, w, ids, args):
    """
    Transform data into different datatypes.
    
    Args:
        data:  jagged arrays
        args:  arguments dictionary
    
    Returns:
        dictionary with different data representations
    """
    inputvars = import_module("configs." + args["rootname"] + "." + args["inputvars"])
    
    data = io.IceXYW(x=x, y=y, w=w, ids=ids)
    
    ### Pick active variables out
    scalar_vars = aux.process_regexp_ids(all_ids=ids, ids=eval('inputvars.' + args['inputvar_scalar']))

    if args['inputvar_image'] is not None:
        image_vars = aux.process_regexp_ids(all_ids=ids, ids=eval('inputvars.' + args['inputvar_image']))
    else:
        image_vars = None

    # -------------------------------------------------------------------------
    ### Pick kinematic variables out
    data_kin = None
    
    if inputvars.KINEMATIC_VARS is not None:

        vars       = aux.process_regexp_ids(all_ids=data.ids, ids=inputvars.KINEMATIC_VARS)
        data_kin   = data[vars]
        data_kin.x = data_kin.x.astype(np.float32)
    
    # -------------------------------------------------------------------------
    ### MI variables
    data_MI = None
    
    # -------------------------------------------------------------------------
    ### DeepSets representation
    data_deps = None
    
    # -------------------------------------------------------------------------
    ### Tensor representation
    data_tensor = None
    
    if image_vars is not None:
        data_tensor = graphio.parse_tensor_data(X=data.x, ids=ids, image_vars=image_vars, args=args)
    
    # -------------------------------------------------------------------------
    ## Graph representation
    data_graph = None

    if args['graph_param']['num_workers'] == 1:
        
        data_graph = graphio.parse_graph_data(X=data.x, ids=data.ids, features=scalar_vars,
            graph_param=args['graph_param'], Y=data.y, weights=data.w, entry_start=None, entry_stop=None)
    
    else:
        
        # Parallel processing of graph objects with Ray
        
        start_time  = time.time()
        
        num_workers = args['graph_param']['num_workers']
        chunk_ind   = aux.split_start_end(range(len(data.x)), num_workers)
        print(chunk_ind)

        data_graph = []
        job_index  = 0
        
        ray.init(num_cpus=num_workers, _temp_dir=f'{os.getcwd()}/tmp/')
        
        graph_futures = []
        obj_ref       = ray.put(data.x) # ** Ray seems not able to handle numpy(object) array without copy ..

        for _ in range(num_workers):
            
            entry_start, entry_stop = chunk_ind[job_index][0], chunk_ind[job_index][-1]
            
            graph_futures.append( \
                graphio.parse_graph_data_ray.remote( \
                    obj_ref, data.ids, scalar_vars, args['graph_param'],
                    data.y, data.w, entry_start, entry_stop)
            )

            job_index += 1

        data_graph += sum(ray.get(graph_futures), []) # Join split array results
        ray.shutdown()

        print(f'ray_results: {time.time() - start_time:0.1f} sec')
        io.showmem()
    
    # --------------------------------------------------------------------
    ### Finally pick active scalar variables out
    
    vars   = aux.process_regexp_ids(all_ids=data.ids, ids=scalar_vars)
    data   = data[vars]
    data.x = data.x.astype(np.float32)
    
    return {'data':        data,
            'data_MI':     data_MI,
            'data_kin':    data_kin,
            'data_deps':   data_deps,
            'data_tensor': data_tensor,
            'data_graph':  data_graph}