Source code for icezee.common

# Common input & data reading routines for ZEE
#
# m.mieskolainen@imperial.ac.uk, 2024

import numpy as np
import copy
from importlib import import_module
import pandas as pd
import os

from icenet.tools import io
from icenet.tools import aux

from icefit import dequantize

# ------------------------------------------
from icenet import print
# ------------------------------------------

# GLOBALS
#from configs.zee.cuts import *
#from configs.zee.filter import *

def truncate(X, Y, W, maxevents):

    # Apply maxevents cutoff
    maxevents = np.min([maxevents, len(X)])

    if maxevents < len(X):
        print(f'Applying maxevents cutoff {maxevents}')
        X, Y, W = X[0:maxevents], Y[0:maxevents], W[0:maxevents]

    return X, Y, W

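# A minimal usage sketch of `truncate` with toy arrays (illustrative only;
# the shapes and values below are assumptions, not part of the module):
def _example_truncate():
    X = np.random.rand(1000, 4)       # toy feature matrix
    Y = np.zeros(1000, dtype=int)     # toy labels
    W = np.ones(1000)                 # toy weights

    X, Y, W = truncate(X=X, Y=Y, W=W, maxevents=100)

    # Only the first 100 rows of each array survive
    assert X.shape == (100, 4) and len(Y) == 100 and len(W) == 100
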
def load_helper(mcfiles, datafiles, maxevents, args):

    inputvars = import_module("configs." + args["rootname"] + "." + args["inputvars"])
    LOAD_VARS = inputvars.LOAD_VARS

    # -------------------------------------------------------------------------
    # *** MC ***

    frames = []
    for f in mcfiles:
        new_frame = copy.deepcopy(pd.read_parquet(f))
        frames.append(new_frame)
        print(f'{f} | N = {len(new_frame)}', 'yellow')

        ids = list(new_frame.keys()); ids.sort()
        print(ids)

    frame_mc = pd.concat(frames)
    print(f'Total number of events: {len(frame_mc):0.1E}')

    X_MC = frame_mc[LOAD_VARS].to_numpy()

    # Label = 0
    Y_MC = np.zeros(len(X_MC)).astype(int)

    ## Pre-computed weights (gen event weight x CMS weights)
    W_MC = frame_mc[['weight']].to_numpy().squeeze()

    print(f'X_MC.shape = {X_MC.shape}')
    print(f'W_MC.shape = {W_MC.shape}')

    # Apply maxevents cutoff
    X_MC, Y_MC, W_MC = truncate(X=X_MC, Y=Y_MC, W=W_MC, maxevents=maxevents)

    # -------------------------------------------------------------------------
    # *** Data ***

    frames = []
    for f in datafiles:
        new_frame = copy.deepcopy(pd.read_parquet(f))
        frames.append(new_frame)
        print(f'{f} | N = {len(new_frame)}', 'yellow')

        ids = list(new_frame.keys()); ids.sort()
        print(ids)

    frame_data = pd.concat(frames)
    print(f'Total number of events: {len(frame_data):0.1E}')

    X_data = frame_data[LOAD_VARS].to_numpy()
    W_data = np.ones(len(X_data))

    # Label = 1
    Y_data = np.ones(len(X_data)).astype(int)

    print(f'X_data.shape = {X_data.shape}')
    print(f'W_data.shape = {W_data.shape}')

    # Apply maxevents cutoff
    X_data, Y_data, W_data = truncate(X=X_data, Y=Y_data, W=W_data, maxevents=maxevents)

    # -------------------------------------------------------------------------
    # Combine MC and Data samples

    X = np.vstack((X_MC, X_data))
    Y = np.concatenate((Y_MC, Y_data))
    W = np.concatenate((W_MC, W_data))

    ids = LOAD_VARS # We use these

    ## -------------------------------------------------
    # ** Drop negative weight (MC) events **

    if args['drop_negative']:
        ind = W < 0
        if np.sum(ind) > 0:
            print(f'Dropping negative weight events ({np.sum(ind)/len(ind):0.3f})', 'red')
            X = X[~ind] # Boolean NOT
            W = W[~ind]
            Y = Y[~ind]

        # Re-normalize MC to the event count
        ind = (Y == 0)
        W[ind] = W[ind] / np.sum(W[ind]) * len(W[ind])
    ## -------------------------------------------------

    # -------------------------------------------------------------------------
    # ** Randomize MC vs Data order to avoid problems with other functions **
    # ** No need to randomize in this application. We have a fixed (eval, validate, test) file structure **

    #rand = np.random.permutation(len(X))
    #X    = X[rand].squeeze() # Squeeze removes additional [] dimension
    #Y    = Y[rand].squeeze()
    #W    = W[rand].squeeze()
    # -------------------------------------------------------------------------

    # -------------------------------------------------------------------------
    ## Add event weights as an aux variable called `raw_weight` for plots etc.

    X   = np.hstack((X, W[None].T))
    ids = ids + ['raw_weight']
    # -------------------------------------------------------------------------

    ## Print some diagnostics
    print(f'Total number of events: {len(X)}')
    for c in np.unique(Y):
        print(f'class[{c}] | N = {np.sum(Y == c)} | weight[mean,std,min,max] = {np.mean(W[Y==c]):0.3E}, {np.std(W[Y==c]):0.3E}, {np.min(W[Y==c]):0.3E}, {np.max(W[Y==c]):0.3E}')

    return X, Y, W, ids

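# A hedged usage sketch for `load_helper`. The file names and the minimal
# `args` dictionary below are hypothetical; in practice they come from the
# steering configuration:
def _example_load_helper():
    args = {
        'rootname'     : 'zee',       # resolves the configs.zee.<inputvars> module
        'inputvars'    : 'mvavars',   # hypothetical inputvars module name
        'drop_negative': True
    }
    X, Y, W, ids = load_helper(mcfiles   = ['MC_train.parquet'],    # hypothetical path
                               datafiles = ['Data_train.parquet'],  # hypothetical path
                               maxevents = int(1e6),
                               args      = args)
    return X, Y, W, ids
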
def load_root_file(root_path, ids=None, entry_start=0, entry_stop=None, maxevents=None, args=None):
    """ Loads the root file.

    Args:
        root_path:   paths to root files (list)
        ids:         variable names (not used; the loaded variables come from the config)
        entry_start: first event index (not used here)
        entry_stop:  last event index (not used here)
        maxevents:   maximum number of events per (trn, val, tst) mode
        args:        arguments dictionary

    Returns:
        dictionary with:
            X:    columnar data
            Y:    class labels
            W:    event weights
            ids:  columnar variable string (list)
            info: auxiliary information (dict), here the running (trn, val, tst) split indices
    """

    if type(root_path) is list:
        root_path = root_path[0] # Remove [] list, we expect only the path here

    # -----------------------------------------------
    #param = {
    #    'entry_start': entry_start,
    #    "entry_stop":  entry_stop,
    #    "maxevents":   maxevents,
    #    "args": args
    #}

    if maxevents is None:
        maxevents = int(1e10)
        print(f'"maxevents" is None, set {maxevents}', 'yellow')

    # -------------------------------------------------------------------------
    # We do splitting according to the dictionary (trn, val, tst) structure

    running_split = {}
    X = {}
    Y = {}
    W = {}
    N_prev = 0

    for mode in ['trn', 'val', 'tst']:

        # Glob expansion type (be careful not to have "label noise" under the folder or subfolders)
        mc_files = io.glob_expand_files(datasets=args["mcfile"][mode],   datapath=root_path)
        da_files = io.glob_expand_files(datasets=args["datafile"][mode], datapath=root_path)

        # Simple fixed one file
        #mc_files = [os.path.join(root_path, args['mcfile'][mode][0])]
        #da_files = [os.path.join(root_path, args['mcfile'][mode][0])]

        print(f'Found mcfiles:   {mc_files}')
        print(f'Found datafiles: {da_files}')

        X[mode], Y[mode], W[mode], ids = load_helper(mcfiles=mc_files, datafiles=da_files, maxevents=maxevents, args=args)

        running_split[mode] = np.arange(N_prev, len(X[mode]) + N_prev)
        N_prev += len(X[mode]) #!

    # Combine
    X = np.vstack((X['trn'], X['val'], X['tst']))
    Y = np.concatenate((Y['trn'], Y['val'], Y['tst']))
    W = np.concatenate((W['trn'], W['val'], W['tst']))

    # Aux info here
    info = {'running_split': running_split}

    return {'X': X, 'Y': Y, 'W': W, 'ids': ids, 'info': info}

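# A hedged sketch of how `load_root_file` might be driven. The paths, globs
# and module name below are hypothetical; the real ones live in the steering
# configuration. It mainly illustrates the `args` keys the loader reads:
def _example_load_root_file():
    args = {
        'rootname'     : 'zee',
        'inputvars'    : 'mvavars',                        # hypothetical
        'drop_negative': True,
        'mcfile'       : {'trn': ['MC_train*.parquet'],    # hypothetical globs
                          'val': ['MC_val*.parquet'],
                          'tst': ['MC_test*.parquet']},
        'datafile'     : {'trn': ['Data_train*.parquet'],
                          'val': ['Data_val*.parquet'],
                          'tst': ['Data_test*.parquet']}
    }
    out = load_root_file(root_path=['/path/to/data'], maxevents=None, args=args)

    # out['info']['running_split'] holds the row indices of each (trn, val, tst) block
    return out
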
def splitfactor(x, y, w, ids, args):
    """ Transform data into different datatypes.

    Args:
        x:    columnar input data
        y:    class labels
        w:    event weights
        ids:  variable names (list)
        args: arguments dictionary

    Returns:
        dictionary with different data representations
    """
    inputvars = import_module("configs." + args["rootname"] + "." + args["inputvars"])

    data = io.IceXYW(x=x, y=y, w=w, ids=ids)

    # -------------------------------------------------------------------------
    ### Pick kinematic variables out (note also 'raw_weight')
    data_kin = None

    if inputvars.KINEMATIC_VARS is not None:
        vars = aux.process_regexp_ids(all_ids=data.ids, ids=inputvars.KINEMATIC_VARS + ['raw_weight'])
        data_kin   = data[vars]
        data_kin.x = data_kin.x.astype(np.float32)

    # -------------------------------------------------------------------------
    data_deps = None

    # -------------------------------------------------------------------------
    data_tensor = None

    # -------------------------------------------------------------------------
    data_graph = None

    # -------------------------------------------------------------------------
    # Mutual information regularization targets
    data_MI = None

    if inputvars.MI_VARS is not None:
        vars = aux.process_regexp_ids(all_ids=data.ids, ids=inputvars.MI_VARS)
        data_MI = data[vars].x.astype(np.float32)

    # -------------------------------------------------------------------------
    ### Finally pick active scalar variables out
    vars = aux.process_regexp_ids(all_ids=data.ids, ids=eval('inputvars.' + args['inputvar_scalar']))

    data   = data[vars]
    data.x = data.x.astype(np.float32)

    # -------------------------------------------------------------------------
    ### ** Special transform for specific MVA input variables **
    if args['pre_transform']['active']:

        var  = args['pre_transform']['var']
        func = args['pre_transform']['func']

        for i, v in enumerate(var):
            try:
                ind = data.find_ind(v)
            except Exception:
                print(f'Could not find variable "{v}" -- continue', 'red')
                continue

            print(f'Pre-transform: {func[i]} of variable "{v}"', 'magenta')

            x = data.x[:, ind] # x used in the transform
            data.x[:, ind] = eval(func[i])

            # Change the variable name [+ have the same original variables in data_kin]
            data.ids[ind] = f'TRF__{v}'

    return {'data': data, 'data_MI': data_MI, 'data_kin': data_kin, 'data_deps': data_deps,
            'data_tensor': data_tensor, 'data_graph': data_graph}

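# A hedged end-to-end sketch chaining the loader and `splitfactor`. It assumes
# an `args` dictionary as in the examples above, extended with the
# 'inputvar_scalar' and 'pre_transform' keys that `splitfactor` reads:
def _example_splitfactor(args):
    out  = load_root_file(root_path=['/path/to/data'], maxevents=None, args=args)
    reps = splitfactor(x=out['X'], y=out['Y'], w=out['W'], ids=out['ids'], args=args)

    # reps['data'] holds the active scalar inputs; reps['data_kin'] the kinematic
    # variables (plus 'raw_weight'); reps['data_MI'] the MI regularization targets
    return reps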