Source code for iceid.graphio

# Graph and tensor data readers and parsers for electron ID
#
# Mikael Mieskolainen, 2023
# m.mieskolainen@imperial.ac.uk

import numpy as np
from   tqdm import tqdm
import ray

import torch
from   torch_geometric.data import Data

import icenet.algo.analytic as analytic
from   icenet.tools import aux
from   icenet.tools.icevec import vec4

# ------------------------------------------
from icenet import print
# ------------------------------------------

[docs] def parse_tensor_data(X, ids, image_vars, args): """ Args: X : Jagged array of variables ids : Variable names as an array of strings args : Arguments Returns: Tensor of pytorch-geometric Data objects """ newind = np.where(np.isin(ids, image_vars)) newind = np.array(newind).flatten() newvars = [] for i in newind : newvars.append(ids[i]) # Pick image data X_image = X[:, newind] # Use single channel tensors if args['image_param']['channels'] == 1: xyz = [['image_clu_eta', 'image_clu_phi', 'image_clu_e']] # Use multichannel tensors elif args['image_param']['channels'] == 2: xyz = [['image_clu_eta', 'image_clu_phi', 'image_clu_e'], ['image_pf_eta', 'image_pf_phi', 'image_pf_p']] else: raise Exception(__name__ + f'.splitfactor: Unknown [image_param][channels] parameter') eta_binedges = args['image_param']['eta_bins'] phi_binedges = args['image_param']['phi_bins'] # Pick tensor data out print(f'jagged2tensor processing ...', 'yellow') tensor = aux.jagged2tensor(X=X_image, ids=newvars, xyz=xyz, x_binedges=eta_binedges, y_binedges=phi_binedges) return tensor
@ray.remote def parse_graph_data_ray(X, ids, features, graph_param, Y=None, weights=None, entry_start=None, entry_stop=None): return parse_graph_data(X, ids, features, graph_param, Y, weights, entry_start, entry_stop)
[docs] def parse_graph_data(X, ids, features, graph_param, Y=None, weights=None, entry_start=None, entry_stop=None, EPS=1e-12, null_value=-999.0): """ Jagged array data into pytorch-geometric style Data format array. Args: X : Jagged array of variables ids : Variable names as an array of strings features : List of active global feature strings graph_param: Graph construction parameters dict Y : Target class array (if any, typically MC only) weights : (Re-)weighting array (if any, typically MC only) Returns: List of pytorch-geometric Data objects """ M_PION = 0.13957 # charged pion mass (GeV) global_on = graph_param['global_on'] coord = graph_param['coord'] directed = graph_param['directed'] self_loops = graph_param['self_loops'] # ---------------------------------------------- num_node_features = 5 num_edge_features = 4 entry_start, entry_stop, num_events = aux.slice_range(start=entry_start, stop=entry_stop, N=len(X)) dataset = [] print(__name__ + f'.parse_graph_data: Converting {num_events} events into graphs ...') # Collect feature indices feature_ind = np.zeros(len(features), dtype=np.int32) for i in range(len(features)): feature_ind[i] = ids.index(features[i]) # Collect indices ind__trk_pt = ids.index('trk_pt') ind__trk_eta = ids.index('trk_eta') ind__trk_phi = ids.index('trk_phi') ind__image_clu_e = ids.index('image_clu_e') ind__image_clu_eta = ids.index('image_clu_eta') ind__image_clu_phi = ids.index('image_clu_phi') num_empty_ECAL = 0 # Loop over events for ev in tqdm(range(entry_start, entry_stop)): num_clu = len(X[ev, ind__image_clu_eta]) num_nodes = num_clu + 1 # +1 for the virtual node (empty data) num_edges = analytic.count_simple_edges(num_nodes=num_nodes, directed=directed, self_loops=self_loops) # Construct 4-vector for the track, with pion mass p4track = vec4() p4track.setPtEtaPhiM(X[ev, ind__trk_pt], X[ev, ind__trk_eta], X[ev, ind__trk_phi], M_PION) # Construct 4-vector for each ECAL cluster p4vec = [] if num_clu > 0: for k in range(num_clu): pt = X[ev, ind__image_clu_e][k] / np.cosh(X[ev, ind__image_clu_eta][k]) # Massless approx. eta = X[ev, ind__image_clu_eta][k] phi = X[ev, ind__image_clu_phi][k] v = vec4() v.setPtEtaPhiM(pt, eta, phi, 0) p4vec.append(v) # Empty ECAL cluster information else: num_empty_ECAL += 1 # However, never skip empty ECAL cluster events here!!, do pre-filtering before this function if needed p4vec.append(vec4()) # Add empty 4-vector (for all events, due to empty events) # ==================================================================== # CONSTRUCT TENSORS # Construct output class, note [] is important to have for right dimensions if Y is None: y = torch.tensor([0], dtype=torch.long) else: y = torch.tensor([Y[ev]], dtype=torch.long) # Training weights, note [] is important to have for right dimensions if weights is None: w = torch.tensor([1.0], dtype=torch.float) else: w = torch.tensor([weights[ev]], dtype=torch.float) ## Construct node features x = get_node_features(p4vec=p4vec, p4track=p4track, X=X[ev, ...], ids=ids, num_nodes=num_nodes, num_node_features=num_node_features, coord=coord) x[~np.isfinite(x)] = null_value # Input protection x = torch.tensor(x, dtype=torch.float) ## Construct edge features edge_attr = analytic.get_Lorentz_edge_features(p4vec=p4vec, num_nodes=num_nodes, \ num_edges=num_edges, num_edge_features=num_edge_features, self_loops=self_loops, directed=directed) edge_attr[~np.isfinite(edge_attr)] = null_value # Input protection edge_attr = torch.tensor(edge_attr, dtype=torch.float) ## Construct edge connectivity edge_index = analytic.get_simple_edge_index(num_nodes=num_nodes, num_edges=num_edges, self_loops=self_loops, directed=directed) edge_index = torch.tensor(edge_index, dtype=torch.long) ## Construct global feature vector if global_on == False: # Null the global features u = torch.tensor([], dtype=torch.float) else: u = (X[ev, feature_ind]).astype(float) u[~np.isfinite(u)] = null_value # Input protection u = torch.tensor(u, dtype=torch.float) dataset.append(Data(num_nodes=x.shape[0], x=x, edge_index=edge_index, edge_attr=edge_attr, y=y, w=w, u=u)) print(f'Empty ECAL events: {num_empty_ECAL} / {num_events} = {num_empty_ECAL/num_events:0.5f} (using only global data u)') return dataset
[docs] def get_node_features(p4vec, p4track, X, ids, num_nodes, num_node_features, coord): # Node feature matrix x = np.zeros((num_nodes, num_node_features), dtype=float) for i in range(num_nodes - 1): # -1, because last one is the dummy (empty) node if coord == 'ptetaphim': x[i,0] = p4vec[i].pt x[i,1] = p4vec[i].eta x[i,2] = p4vec[i].phi x[i,3] = p4vec[i].m elif coord == 'pxpypze': x[i,0] = p4vec[i].px x[i,1] = p4vec[i].py x[i,2] = p4vec[i].pz x[i,3] = p4vec[i].e else: raise Exception(__name__ + f'.get_node_features: Unknown coordinate representation') # Other features x[i,4] = p4track.deltaR(p4vec[i]) # Add more here ... from X # Cast x = x.astype(float) return x