# Graph and tensor data readers and parsers for electron ID
#
# Mikael Mieskolainen, 2023
# m.mieskolainen@imperial.ac.uk
import numpy as np
from tqdm import tqdm
import ray
import torch
from torch_geometric.data import Data
import icenet.algo.analytic as analytic
from icenet.tools import aux
from icenet.tools.icevec import vec4
# ------------------------------------------
from icenet import print
# ------------------------------------------
[docs]
def parse_tensor_data(X, ids, image_vars, args):
"""
Args:
X : Jagged array of variables
ids : Variable names as an array of strings
args : Arguments
Returns:
Tensor of pytorch-geometric Data objects
"""
newind = np.where(np.isin(ids, image_vars))
newind = np.array(newind).flatten()
newvars = []
for i in newind :
newvars.append(ids[i])
# Pick image data
X_image = X[:, newind]
# Use single channel tensors
if args['image_param']['channels'] == 1:
xyz = [['image_clu_eta', 'image_clu_phi', 'image_clu_e']]
# Use multichannel tensors
elif args['image_param']['channels'] == 2:
xyz = [['image_clu_eta', 'image_clu_phi', 'image_clu_e'],
['image_pf_eta', 'image_pf_phi', 'image_pf_p']]
else:
raise Exception(__name__ + f'.splitfactor: Unknown [image_param][channels] parameter')
eta_binedges = args['image_param']['eta_bins']
phi_binedges = args['image_param']['phi_bins']
# Pick tensor data out
print(f'jagged2tensor processing ...', 'yellow')
tensor = aux.jagged2tensor(X=X_image, ids=newvars, xyz=xyz, x_binedges=eta_binedges, y_binedges=phi_binedges)
return tensor
@ray.remote
def parse_graph_data_ray(X, ids, features, graph_param, Y=None, weights=None,
entry_start=None, entry_stop=None):
return parse_graph_data(X, ids, features, graph_param,
Y, weights, entry_start, entry_stop)
[docs]
def parse_graph_data(X, ids, features, graph_param, Y=None, weights=None, entry_start=None, entry_stop=None, EPS=1e-12, null_value=-999.0):
"""
Jagged array data into pytorch-geometric style Data format array.
Args:
X : Jagged array of variables
ids : Variable names as an array of strings
features : List of active global feature strings
graph_param: Graph construction parameters dict
Y : Target class array (if any, typically MC only)
weights : (Re-)weighting array (if any, typically MC only)
Returns:
List of pytorch-geometric Data objects
"""
M_PION = 0.13957 # charged pion mass (GeV)
global_on = graph_param['global_on']
coord = graph_param['coord']
directed = graph_param['directed']
self_loops = graph_param['self_loops']
# ----------------------------------------------
num_node_features = 5
num_edge_features = 4
entry_start, entry_stop, num_events = aux.slice_range(start=entry_start, stop=entry_stop, N=len(X))
dataset = []
print(__name__ + f'.parse_graph_data: Converting {num_events} events into graphs ...')
# Collect feature indices
feature_ind = np.zeros(len(features), dtype=np.int32)
for i in range(len(features)):
feature_ind[i] = ids.index(features[i])
# Collect indices
ind__trk_pt = ids.index('trk_pt')
ind__trk_eta = ids.index('trk_eta')
ind__trk_phi = ids.index('trk_phi')
ind__image_clu_e = ids.index('image_clu_e')
ind__image_clu_eta = ids.index('image_clu_eta')
ind__image_clu_phi = ids.index('image_clu_phi')
num_empty_ECAL = 0
# Loop over events
for ev in tqdm(range(entry_start, entry_stop)):
num_clu = len(X[ev, ind__image_clu_eta])
num_nodes = num_clu + 1 # +1 for the virtual node (empty data)
num_edges = analytic.count_simple_edges(num_nodes=num_nodes, directed=directed, self_loops=self_loops)
# Construct 4-vector for the track, with pion mass
p4track = vec4()
p4track.setPtEtaPhiM(X[ev, ind__trk_pt], X[ev, ind__trk_eta], X[ev, ind__trk_phi], M_PION)
# Construct 4-vector for each ECAL cluster
p4vec = []
if num_clu > 0:
for k in range(num_clu):
pt = X[ev, ind__image_clu_e][k] / np.cosh(X[ev, ind__image_clu_eta][k]) # Massless approx.
eta = X[ev, ind__image_clu_eta][k]
phi = X[ev, ind__image_clu_phi][k]
v = vec4()
v.setPtEtaPhiM(pt, eta, phi, 0)
p4vec.append(v)
# Empty ECAL cluster information
else:
num_empty_ECAL += 1
# However, never skip empty ECAL cluster events here!!, do pre-filtering before this function if needed
p4vec.append(vec4()) # Add empty 4-vector (for all events, due to empty events)
# ====================================================================
# CONSTRUCT TENSORS
# Construct output class, note [] is important to have for right dimensions
if Y is None:
y = torch.tensor([0], dtype=torch.long)
else:
y = torch.tensor([Y[ev]], dtype=torch.long)
# Training weights, note [] is important to have for right dimensions
if weights is None:
w = torch.tensor([1.0], dtype=torch.float)
else:
w = torch.tensor([weights[ev]], dtype=torch.float)
## Construct node features
x = get_node_features(p4vec=p4vec, p4track=p4track, X=X[ev, ...], ids=ids, num_nodes=num_nodes, num_node_features=num_node_features, coord=coord)
x[~np.isfinite(x)] = null_value # Input protection
x = torch.tensor(x, dtype=torch.float)
## Construct edge features
edge_attr = analytic.get_Lorentz_edge_features(p4vec=p4vec, num_nodes=num_nodes, \
num_edges=num_edges, num_edge_features=num_edge_features, self_loops=self_loops, directed=directed)
edge_attr[~np.isfinite(edge_attr)] = null_value # Input protection
edge_attr = torch.tensor(edge_attr, dtype=torch.float)
## Construct edge connectivity
edge_index = analytic.get_simple_edge_index(num_nodes=num_nodes, num_edges=num_edges, self_loops=self_loops, directed=directed)
edge_index = torch.tensor(edge_index, dtype=torch.long)
## Construct global feature vector
if global_on == False: # Null the global features
u = torch.tensor([], dtype=torch.float)
else:
u = (X[ev, feature_ind]).astype(float)
u[~np.isfinite(u)] = null_value # Input protection
u = torch.tensor(u, dtype=torch.float)
dataset.append(Data(num_nodes=x.shape[0], x=x, edge_index=edge_index, edge_attr=edge_attr, y=y, w=w, u=u))
print(f'Empty ECAL events: {num_empty_ECAL} / {num_events} = {num_empty_ECAL/num_events:0.5f} (using only global data u)')
return dataset
[docs]
def get_node_features(p4vec, p4track, X, ids, num_nodes, num_node_features, coord):
# Node feature matrix
x = np.zeros((num_nodes, num_node_features), dtype=float)
for i in range(num_nodes - 1): # -1, because last one is the dummy (empty) node
if coord == 'ptetaphim':
x[i,0] = p4vec[i].pt
x[i,1] = p4vec[i].eta
x[i,2] = p4vec[i].phi
x[i,3] = p4vec[i].m
elif coord == 'pxpypze':
x[i,0] = p4vec[i].px
x[i,1] = p4vec[i].py
x[i,2] = p4vec[i].pz
x[i,3] = p4vec[i].e
else:
raise Exception(__name__ + f'.get_node_features: Unknown coordinate representation')
# Other features
x[i,4] = p4track.deltaR(p4vec[i])
# Add more here ... from X
# Cast
x = x.astype(float)
return x