# Event sample re-weighting tools
#
# m.mieskolainen@imperial.ac.uk, 2025
import os
import numpy as np
import awkward as ak
import torch
import matplotlib.pyplot as plt
import copy
from tqdm import tqdm
from icenet.tools import aux, io, prints
from icenet.deep import iceboost, predict
# ------------------------------------------
from icenet import print
# ------------------------------------------
[docs]
def histogram_helper(x, y, w, ids, pdf, args, EPS):
    """
    Helper function for histogram based reweighting

    Args:
        x:    input data (awkward array or 2D numpy array)
        y:    class labels per event
        w:    event weights (or None)
        ids:  variable names of the columns of x
        pdf:  pre-computed pdfs (None --> computed here from the data)
        args: steering arguments dictionary
        EPS:  numerical protection epsilon

    Returns:
        weights_doublet: per-class re-weighting coefficient arrays
        pdf:             computed (or passed-through) pdfs
    """
    class_ids       = np.unique(y.astype(int))
    reference_class = args['reweight_param']['reference_class']
    diff_args       = args['reweight_param']['diff_param']
    h_args          = args['reweight_param']['diff_param']['hist_param']
    variables       = diff_args['var']

    print(f"Reference (target) class [{reference_class}] | Found classes: {class_ids} from y")

    ### Collect re-weighting variables
    RV = {}
    for i, var in enumerate(variables):
        if isinstance(x, ak.Array):
            RV[i] = ak.to_numpy(x[var]).astype(np.float32)
        else:
            RV[i] = x[:, ids.index(var)].astype(np.float32)

    ### Pre-transform (values and bin range definitions transformed consistently)
    for i, var in enumerate(variables):
        mode = h_args['transform'][i]

        if h_args['binmode'][i] == 'edges':
            raise Exception(__name__ + '.histogram_helper: Cannot transform "edges" type')

        # NB. 'd' aliases h_args['bins'][i]; in-place writes below mutate the args copy
        d = h_args['bins'][i]

        if mode == 'log10':
            if np.any(RV[i] <= 0):
                ind = (RV[i] <= 0)
                print(f'Variable {var} < 0 (in {np.sum(ind)} elements) in log10 -- truncating to zero', 'red')

            # Transform values
            RV[i] = np.log10(np.maximum(RV[i], EPS))

            # Transform bin range
            h_args['bins'][i][0] = np.log10(max(d[0], EPS))
            h_args['bins'][i][1] = np.log10(d[1])

        elif mode == 'sqrt':
            # Values
            RV[i] = np.sqrt(np.maximum(RV[i], EPS))

            # Bins
            h_args['bins'][i][0] = np.sqrt(max(d[0], EPS))
            h_args['bins'][i][1] = np.sqrt(d[1])

        elif mode == 'square':
            # Values
            RV[i] = RV[i]**2

            # Bins
            h_args['bins'][i][0] = d[0]**2
            h_args['bins'][i][1] = d[1]**2

        elif mode is None:  # fix: identity comparison for None; explicit no-op
            pass

        else:
            raise Exception(__name__ + '.histogram_helper: Unknown pre-transform')

    # Binning setup
    binedges = {}
    for i, var in enumerate(variables):
        d = h_args['bins'][i]

        if h_args['binmode'][i] == 'linear':
            binedges[i] = np.linspace(d[0], d[1], d[2])

        elif h_args['binmode'][i] == 'log10':
            binedges[i] = np.logspace(np.log10(max(d[0], EPS)), np.log10(d[1]), d[2], base=10)

        elif h_args['binmode'][i] == 'edges':
            binedges[i] = np.array(d)

        else:
            raise Exception(__name__ + ': Unknown reweight binning mode')

    rwparam = {
        'y':               y,
        'reference_class': reference_class,
        'max_reg':         h_args['max_reg']
    }

    ### Compute 2D-PDFs for each class
    if diff_args['type'] == '2D':
        print(f'2D re-weighting with: [{variables[0]}, {variables[1]}] ...')

        if pdf is None:  # Not given by user
            pdf = {}
            for c in class_ids:
                ind = (y == c)
                sample_weights = w[ind] if w is not None else None  # Feed in the input weights !
                pdf[c] = pdf_2D_hist(X_A=RV[0][ind], X_B=RV[1][ind], w=sample_weights,
                                     binedges_A=binedges[0], binedges_B=binedges[1])

            pdf['binedges']  = binedges
            pdf['class_ids'] = class_ids

        weights_doublet = reweightcoeff2D(X_A=RV[0], X_B=RV[1], pdf=pdf, **rwparam)

    ### Compute factorized 1D x 1D product
    elif diff_args['type'] == 'pseudo-ND':
        print(f'pseudo-ND (2D for now) reweighting with: [{variables[0]}, {variables[1]}] ...')

        if pdf is None:  # Not given by user
            pdf = {}
            for c in class_ids:
                ind = (y == c)
                sample_weights = w[ind] if w is not None else None  # Feed in the input weights !
                pdf_0 = pdf_1D_hist(X=RV[0][ind], w=sample_weights, binedges=binedges[0])
                pdf_1 = pdf_1D_hist(X=RV[1][ind], w=sample_weights, binedges=binedges[1])

                if h_args['pseudo_type'] == 'geometric_mean':
                    pdf[c] = np.sqrt(np.outer(pdf_0, pdf_1))  # (A,B) order gives normal matrix indexing
                elif h_args['pseudo_type'] == 'product':
                    pdf[c] = np.outer(pdf_0, pdf_1)
                else:
                    raise Exception(__name__ + '.histogram_helper: Unknown "pseudo_type"')

                # Normalize to discrete density
                pdf[c] /= np.sum(pdf[c].flatten())

            pdf['binedges']  = binedges
            pdf['class_ids'] = class_ids

        weights_doublet = reweightcoeff2D(X_A=RV[0], X_B=RV[1], pdf=pdf, **rwparam)

    ### Compute 1D-PDFs for each class
    elif diff_args['type'] == '1D':
        print(f'1D reweighting with: {variables[0]} ...')

        if pdf is None:  # Not given by user
            pdf = {}
            for c in class_ids:
                ind = (y == c)
                sample_weights = w[ind] if w is not None else None  # Feed in the input weights !
                pdf[c] = pdf_1D_hist(X=RV[0][ind], w=sample_weights, binedges=binedges[0])

            pdf['binedges']  = binedges[0]
            pdf['class_ids'] = class_ids

        weights_doublet = reweightcoeff1D(X=RV[0], pdf=pdf, **rwparam)

    else:
        raise Exception(__name__ + f'.histogram_helper: Unsupported dimensionality mode "{diff_args["type"]}"')

    pdf['vars'] = variables

    return weights_doublet, pdf
[docs]
def map_xyw(x, y, w, vars, c, reference_class):
    """
    For AIRW helper

    Select events of class 'c' (source) and 'reference_class' (target),
    relabel them to the binary pair (0, 1) and equalize the class balance.
    """
    # Pick only the source and target populations
    mask = (y == c) | (y == reference_class)
    data = io.IceXYW(x=x[mask], y=y[mask], w=w[mask], ids=vars)

    # ----------------------------
    # Relabel: source class --> 0, reference (target) class --> 1
    relabeled = np.where(data.y == reference_class, 1, 0).astype(np.int32)
    data.y    = relabeled  # !

    # Reference scale for the weight normalization
    n_tilde = max(np.sum(relabeled == 0), np.sum(relabeled == 1))

    # ----------------------------
    # Equalize class balance for the training
    for label in (0, 1):
        sel = (relabeled == label)
        data.w[sel] = n_tilde * data.w[sel] / np.sum(data.w[sel])

    return data
[docs]
def AIRW_helper(x, y, w, ids, pdf, args, x_val, y_val, w_val, EPS=1e-12):
    """
    Helper function for ML based reweighting

    Trains (when pdf is None) a binary xgboost discriminator per class
    against the reference class, then turns the model logits into
    multiplicative event re-weights.

    Args:
        x, y, w:             training sample data, labels and weights
        ids:                 variable names of the columns of x
        pdf:                 pre-trained model dictionary (None --> train here)
        args:                steering arguments dictionary
        x_val, y_val, w_val: validation sample (used only in training mode)
        EPS:                 numerical protection epsilon

    Returns:
        weights_doublet: per-class re-weighted event weight arrays
        pdf:             model dictionary (trained or passed through)
    """
    class_ids = np.unique(y.astype(int))

    reference_class = args['reweight_param']['reference_class']
    diff_args       = args['reweight_param']['diff_param']
    variables       = diff_args['var']

    ID      = diff_args['AIRW_param']['active_model']
    RW_mode = diff_args['AIRW_param']['mode']
    MAX_REG = diff_args['AIRW_param']['max_reg']
    param   = args['models'][ID]

    # ----------------------------------------
    # Conversions and pick variables of interest
    if isinstance(x, ak.Array):
        x = aux.ak2numpy(x=x, fields=variables).astype(np.float32)
        if x_val is not None:
            x_val = aux.ak2numpy(x=x_val, fields=variables).astype(np.float32)
    else:
        x = x[:, io.index_list(ids, variables)].astype(np.float32)
        if x_val is not None:
            x_val = x_val[:, io.index_list(ids, variables)].astype(np.float32)
    # ----------------------------------------

    print('Training N-dim reweighting', 'magenta')
    print(f'x.shape = {x.shape}', 'magenta')
    print(f'variables = {variables}', 'magenta')
    print('')

    # Train model per class pair (will skip if we use this function to evaluate)
    if pdf is None:
        print('Training mode:')

        # All diagnostics logs go under one directory (hoisted, was repeated per call)
        plot_path = os.path.join(args["plotdir"], 'train/AIRW_S1')
        aux.makedir(plot_path)

        # Training sample diagnostics
        print('Training sample RAW weights:', 'magenta')
        output_file = os.path.join(plot_path, 'stats_train_input_weights_raw.log')
        prints.print_weights(weights=w, y=y, output_file=output_file)

        print('Training sample RAW data:', 'magenta')
        output_file = os.path.join(plot_path, 'stats_train_input_data_raw.log')
        prints.print_variables(x, variables, w, output_file=output_file)

        # Validation sample diagnostics
        print('Validation sample RAW weights:', 'magenta')
        output_file = os.path.join(plot_path, 'stats_validate_input_weights_raw.log')
        prints.print_weights(weights=w_val, y=y_val, output_file=output_file)

        print('Validation sample RAW data:', 'magenta')
        output_file = os.path.join(plot_path, 'stats_validate_input_data_raw.log')
        prints.print_variables(x_val, variables, w_val, output_file=output_file)

        # --------------------------------------------------
        # Optimal Dequantization
        if 'optimal_dequantize' in diff_args['AIRW_param'] and diff_args['AIRW_param']['optimal_dequantize'] > 0:
            print('Optimal Dequantization for training data based on mantissa bit depth', 'green')

            for j in tqdm(range(x.shape[1])):
                out = io.infer_precision(arr=x[:,j])
                p   = out.get('mantissa_bits_eff', None)

                if p is None:
                    name = variables[j]
                    print(f"infer_precision failed to return mantissa_bits_eff for variable {name} -- skip")
                else:
                    scale  = diff_args['AIRW_param']['optimal_dequantize']  # Additional scale boost
                    x[:,j] = io.optimal_dequantize(x=x[:,j], p=p, scale=scale)

            print('Training data after dequantization:')
            output_file = os.path.join(plot_path, 'stats_train_input_data_dequant.log')
            prints.print_variables(x, variables, w, output_file=output_file)

        # Here one could add Z-standardization (but not needed with xgboost)

        pdf = {'ID': ID, 'param': param, 'model': {}, 'vars': variables}

        for c in class_ids:
            if c != reference_class:
                data_trn = map_xyw(x=x, y=y, w=w, vars=variables, c=c, reference_class=reference_class)
                data_val = map_xyw(x=x_val, y=y_val, w=w_val, vars=variables, c=c, reference_class=reference_class)

                # Train
                print('Training sample weights going into training:', 'magenta')
                output_file = os.path.join(plot_path, 'stats_train_input_weights.log')
                prints.print_weights(weights=data_trn.w, y=data_trn.y, output_file=output_file)

                # Validate
                print('Validation sample weights going into training:', 'magenta')
                output_file = os.path.join(plot_path, 'stats_validate_input_weights.log')
                prints.print_weights(weights=data_val.w, y=data_val.y, output_file=output_file)

                inputs = {'data_trn':    data_trn,
                          'data_val':    data_val,
                          'args':        args,
                          'data_trn_MI': None,
                          'data_val_MI': None,
                          'param':       param}

                # Take in the output
                # (this is not used but the model is re-loaded from the disk,
                # according the best model [epoch] criteria in the model card)
                pdf['model'][c] = iceboost.train_xgb(**inputs)

    # ------------------------------------------------
    ## Now apply the model

    param = pdf['param']

    # Start from the input weights (or unit weights)
    wnew = copy.deepcopy(w) if w is not None else np.ones(len(y))

    for c in pdf['model'].keys():
        print(f'Applying AIRW reweighter to class [{c}]')

        if param['predict'] == 'xgb':
            func_predict = predict.pred_xgb(args=args, param=param, feature_names=variables)
        elif param['predict'] == 'xgb_logistic':
            func_predict = predict.pred_xgb_logistic(args=args, param=param, feature_names=variables)
        else:
            raise Exception(__name__ + '.AIRW_helper: Unsupported model -- "predict" field should be "xgb" or "xgb_logistic"')

        # -----------------------------------------------
        # Predict for events of this class
        ind  = (y == c)
        pred = func_predict(x[ind])

        # Handle logits vs probabilities (detected from the output value range)
        min_pred, max_pred = np.min(pred), np.max(pred)
        THRESH = 1E-5

        if min_pred < (-THRESH) or max_pred > (1.0 + THRESH):
            print(f'Detected raw logit output [{min_pred:0.4f}, {max_pred:0.4f}] from the model')
            logits = pred
            p = aux.sigmoid(logits)
            print(f'Corresponding probability output [{np.min(p):0.4f}, {np.max(p):0.4f}]')
        else:
            print(f'Detected probability output [{min_pred:0.4f}, {max_pred:0.4f}] from the model')
            logits = aux.inverse_sigmoid(pred)
            print(f'Corresponding logit output [{np.min(logits):0.4f}, {np.max(logits):0.4f}]')

        # Get weights after the re-weighting transform
        AI_w = rw_transform_with_logits(logits=logits, mode=RW_mode)

        # Apply maximum weight regularization
        AI_w = np.clip(AI_w, 0.0, MAX_REG)

        # Apply multiplicatively to event weights
        wnew[ind] = wnew[ind] * AI_w
    # -----------------------------------------------

    # Transform weights into weights doublet
    weights_doublet = {}
    for c in class_ids:
        ind = (y == c)
        weights_doublet[c]      = np.zeros(x.shape[0])
        weights_doublet[c][ind] = wnew[ind]

    return weights_doublet, pdf
[docs]
def doublet_helper(x, y, w, class_ids):
    """
    Map per-event weights into a per-class dict of weight vectors
    (zeros outside each class; unit weights when w is None).
    """
    n_events = len(x)
    doublet  = {}

    for c in class_ids:
        vec = np.zeros(n_events)           # init with zeros
        sel = (y == c)
        vec[sel]   = w[sel] if w is not None else 1.0  # Feed in the input weights
        doublet[c] = vec

    return doublet
[docs]
def compute_ND_reweights(x, y, w, ids, args, pdf=None, EPS=1e-12,
                         x_val=None, y_val=None, w_val=None, skip_reweights=False):
    """
    Compute N-dim reweighting coefficients

    Supports 'ML' (ND), 'pseudo-ND' (1D x 1D ... x 1D), '2D', '1D'

    For 'args' dictionary structure, see steering cards.

    Args:
        x    : training sample input
        y    : training sample (class) labels
        w    : training sample weights (None --> unit weights)
        ids  : variable names of columns of x
        pdf  : pre-computed pdfs (default None)
        args : reweighting parameters in a dictionary
        x_val, y_val, w_val : validation sample (used by AIRW differential mode)
        skip_reweights      : train the differential model but do not apply it
    Returns:
        weights : 1D-array of re-weights
        pdf     : computed pdfs
    """
    use_ak = isinstance(x, ak.Array)

    if use_ak:
        y = copy.deepcopy(ak.to_numpy(y).astype(int))
        # Fix: tolerate w=None also for awkward input
        w = copy.deepcopy(ak.to_numpy(w).astype(float)) if w is not None else None

    class_ids = np.unique(y.astype(int))

    # Make sure we make a copy, because we modify args here
    args = copy.deepcopy(args)

    ## Differential reweighting
    if args['reweight_param']['differential']:
        print('Differential reweighting')

        # Histogram based
        if args['reweight_param']['diff_param']['type'] != 'AIRW':
            weights_doublet, pdf = histogram_helper(x=x, y=y, w=w, ids=ids,
                                                    pdf=pdf, args=args, EPS=EPS)
        # AIRW based
        else:
            weights_doublet, pdf = AIRW_helper(x=x, y=y, w=w, ids=ids, pdf=pdf, args=args,
                                               x_val=x_val, y_val=y_val, w_val=w_val, EPS=EPS)

        # Renormalize integral (sum) to the event counts per class
        if args['reweight_param']['diff_param']['renorm_weight_to_count']:
            print('Renormalizing sum(weights) == sum(count) per class')

            for c in class_ids:
                ind = (y == c)
                weights_doublet[c][ind] /= np.sum(weights_doublet[c][ind])
                weights_doublet[c][ind] *= np.sum(ind)

        # ---------------------------------------------------------------
        # Special mode -- allows to only train pdf but then do not apply
        if skip_reweights:
            print('Special mode [skip_reweights] active: differential reweight model not applied', 'red')
            weights_doublet = doublet_helper(x=x, y=y, w=w, class_ids=class_ids)
        # ---------------------------------------------------------------

    # No differential re-weighting
    else:
        print("No differential reweighting")
        print(f"Reference [target] class: [{args['reweight_param']['reference_class']}] | Found classes {class_ids} from y")

        weights_doublet = doublet_helper(x=x, y=y, w=w, class_ids=class_ids)

    # --------------------------------------------------------
    ### Apply class balance equalizing weight
    if (args['reweight_param']['equal_frac'] == True):
        print("Equalizing class fractions", "green")
        weights_doublet = balanceweights(weights_doublet=weights_doublet,
                                         reference_class=args['reweight_param']['reference_class'], y=y)

    ### Finally map back to 1D-array
    # Fix: size by len(y), not len(w) -- w may be None (doublet_helper supports it)
    weights = np.zeros(len(y))
    for c in class_ids:
        weights = weights + weights_doublet[c]

    ### Print weights
    prints.print_weights(weights=weights, y=y)

    if use_ak:
        return ak.Array(weights), pdf
    else:
        return weights, pdf
[docs]
def reweightcoeff1D(X, y, pdf, reference_class, max_reg = 1e3, EPS=1e-12):
    """ Compute N-class density reweighting coefficients.

    Args:
        X:               Observable of interest (N x 1)
        y:               Class labels (N x 1)
        pdf:             PDF for each class
        reference_class: e.g. 0 (background) or 1 (signal)
        max_reg:         Maximum weight regularization cut-off
        EPS:             Numerical protection epsilon

    Returns:
        weights for each event (dict of per-class arrays)
    """
    class_ids = pdf['class_ids']

    # Re-weighting weights
    weights_doublet = {}  # Init with zeros!!
    for c in class_ids:
        weights_doublet[c] = np.zeros(X.shape[0])

    # Weight each class against the reference class
    for c in class_ids:
        inds = aux.x2ind(X[y == c], pdf['binedges'])

        # Fix: use '!=' (value equality); 'is not' is an identity check which is
        # always True when c is a numpy integer and reference_class a Python int
        if c != reference_class:
            weights_doublet[c][y == c] = pdf[reference_class][inds] / np.clip(pdf[c][inds], EPS, None)
        else:
            weights_doublet[c][y == c] = 1.0  # Reference class stays intact

        # Maximum weight cut-off regularization
        weights_doublet[c][weights_doublet[c] > max_reg] = max_reg

    return weights_doublet
[docs]
def reweightcoeff2D(X_A, X_B, y, pdf, reference_class, max_reg = 1e3, EPS=1E-12):
    """
    Compute N-class density reweighting coefficients.

    Operates in full 2D without any factorization.

    Args:
        X_A :             First observable of interest (N x 1)
        X_B :             Second observable of interest (N x 1)
        y   :             Class labels (N x 1)
        pdf :             Density histograms for each class
        reference_class : e.g. Background (0) or signal (1)
        max_reg :         Regularize the maximum reweight coefficient
        EPS :             Numerical protection epsilon

    Returns:
        weights for each event (dict of per-class arrays)
    """
    class_ids = pdf['class_ids']

    # Re-weighting weights
    weights_doublet = {}  # Init with zeros!!
    for c in class_ids:
        weights_doublet[c] = np.zeros(X_A.shape[0])

    # Weight each class against the reference class
    for c in class_ids:
        inds_0 = aux.x2ind(X_A[y == c], pdf['binedges'][0])  # variable 0
        inds_1 = aux.x2ind(X_B[y == c], pdf['binedges'][1])  # variable 1

        # Fix: use '!=' (value equality); 'is not' is an identity check which is
        # always True when c is a numpy integer and reference_class a Python int
        if c != reference_class:
            weights_doublet[c][y == c] = pdf[reference_class][inds_0, inds_1] / np.clip(pdf[c][inds_0, inds_1], EPS, None)
        else:
            weights_doublet[c][y == c] = 1.0  # Reference class stays intact

        # Maximum weight cut-off regularization
        weights_doublet[c][weights_doublet[c] > max_reg] = max_reg

    return weights_doublet
[docs]
def pdf_1D_hist(X, w, binedges):
    """
    Compute re-weighting 1D pdfs.

    Args:
        X:        values of the observable
        w:        event weights (or None for unit weights)
        binedges: histogram bin edges

    Returns:
        normalized (discrete density) histogram array
    """
    # Fix: use np.histogram instead of plt.hist -- identical counts, but no
    # side effect of drawing into the current matplotlib figure
    counts, _ = np.histogram(X, bins=binedges, weights=w)

    # Make them densities (float cast: unweighted counts are integers)
    pdf = counts.astype(np.float64)
    pdf /= np.sum(pdf)

    return pdf
[docs]
def pdf_2D_hist(X_A, X_B, w, binedges_A, binedges_B):
    """
    Compute re-weighting 2D pdfs.

    Args:
        X_A, X_B:               values of the two observables
        w:                      event weights (or None for unit weights)
        binedges_A, binedges_B: histogram bin edges per observable

    Returns:
        normalized (discrete density) 2D histogram array
    """
    # Fix: use np.histogram2d instead of plt.hist2d -- identical counts
    # (plt.hist2d wraps np.histogram2d), but no figure drawing side effect
    counts, _, _ = np.histogram2d(x=X_A, y=X_B, bins=[binedges_A, binedges_B], weights=w)

    # Make them densities
    pdf = counts.astype(np.float64)
    pdf /= np.sum(pdf)

    return pdf
[docs]
def balanceweights(weights_doublet, reference_class, y, EPS=1e-12):
    """ Balance N-class weights to sum to equal counts.

    Args:
        weights_doublet: N-class event weights (events x classes)
        reference_class: which class gives the reference (integer)
        y:               class targets
        EPS:             numerical protection epsilon

    Returns:
        weights doublet with new weights per event
    """
    class_ids = np.unique(y).astype(int)
    ref_sum   = np.sum(weights_doublet[reference_class][y == reference_class])

    for c in class_ids:
        # Fix: use '!=' (value equality); 'is not' is an identity check which is
        # always True when c is a numpy integer and reference_class a Python int
        if c != reference_class:
            EQ = ref_sum / np.clip(np.sum(weights_doublet[c][y == c]), EPS, None)
            weights_doublet[c][y == c] *= EQ

    return weights_doublet