Source code for icefit.jacobian

# Torch autograd Jacobians, Hessians and related
# 
# m.mieskolainen@imperial.ac.uk, 2023

import numpy as np
import torch
from torch import nn
from torch.autograd import Variable
from torch.autograd.functional import jacobian
from tqdm import tqdm


def get_gradient(inputs, func, **kwargs):
    """
    Return derivative df(x)/dx or gradient vectors for f: R^n -> R
    """
    x = Variable(inputs.data, requires_grad=True) # We start to track gradients from here
    z = func(x, **kwargs)

    if len(z.size()) > 1: # Higher dimensional output
        return get_jacobian(inputs=inputs, func=func, **kwargs)

    z.backward([torch.ones_like(z)], create_graph=True)

    return x.grad

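# Illustrative usage sketch (not part of the original module): gradient of a
# batched scalar function f: R^3 -> R via get_gradient. The helper name
# `_example_gradient_usage` and the quadratic test function are assumptions
# added here for demonstration only.
def _example_gradient_usage():

    def f(x):
        # f: R^3 -> R, evaluated row-wise over the batch
        return (x**2).sum(dim=-1)

    x = torch.randn(5, 3)                # 5 batch samples in R^3
    g = get_gradient(inputs=x, func=f)   # shape (5, 3)

    # For f(x) = sum_i x_i^2 the gradient is 2x
    assert torch.allclose(g, 2*x, atol=1e-6)
    return g
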
def get_jacobian(inputs, func, **kwargs):
    """
    Return Jacobian matrix of size m x n of a function f: R^n -> R^m
    """
    size = inputs.size()
    n    = size[0] # Number of batch samples

    # Input dim
    if len(size) > 1:
        dim_input = size[1]
    else:
        dim_input = 1

    test = func(inputs, **kwargs)

    # Output dim
    if len(test.size()) > 1:
        dim_output = test.size()[1]
    else:
        dim_output = 1

    J = torch.zeros((n, dim_output, dim_input))

    # Loop over all batch samples
    for i in range(n):
        x = inputs[i,...].squeeze()
        x = x.repeat(dim_output, 1)
        x.requires_grad_(True)

        z = func(x, **kwargs)

        if dim_output == 1:
            z.backward([torch.ones_like(z)])
        else:
            z.backward(torch.eye(dim_output))

        J[i,:,:] = x.grad.data

    return J

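# Illustrative usage sketch (not part of the original module): per-sample
# Jacobian of f: R^2 -> R^2 via get_jacobian. The helper name
# `_example_jacobian_usage` and the test function are assumptions added here
# for demonstration only.
def _example_jacobian_usage():

    def f(x):
        # f: R^2 -> R^2, evaluated row-wise over the batch (n_batches x 2)
        return torch.vstack((x[:,0]*x[:,1], x[:,0] + x[:,1])).transpose(0,1)

    x = torch.randn(4, 2)               # 4 batch samples in R^2
    J = get_jacobian(inputs=x, func=f)  # shape (4, 2, 2)

    # Analytic Jacobian of the first sample: [[x_1, x_0], [1, 1]]
    J_analytic = torch.tensor([[x[0,1], x[0,0]],
                               [1.0,    1.0   ]])
    assert torch.allclose(J[0], J_analytic, atol=1e-5)
    return J
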
def get_full_hessian(loss, param):
    """
    Compute the full Hessian matrix of a neural net (can be very slow)

    https://www.cs.toronto.edu/~rgrosse/courses/csc2541_2021/readings/L02_Taylor_approximations.pdf

    Args:
        loss:  neural net loss (scalar tensor with autograd graph)
        param: neural net parameters
    """
    N = param.numel()
    H = torch.zeros(N, N)

    loss_grad = torch.autograd.grad(loss, param, create_graph=True, retain_graph=True, only_inputs=True)[0].view(-1)

    for i in tqdm(range(N)):
        grad2 = torch.autograd.grad(loss_grad[i], param, create_graph=False, retain_graph=True, only_inputs=True)
        H[i]  = grad2[0].view(-1)

    return H

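# Illustrative usage sketch (not part of the original module): full Hessian of
# an MSE loss w.r.t. the weights of a single linear layer via get_full_hessian.
# The helper name `_example_full_hessian_usage` and the toy model/data are
# assumptions added here for demonstration only.
def _example_full_hessian_usage():

    model = nn.Linear(3, 1, bias=False)
    X     = torch.randn(16, 3)
    y     = torch.randn(16, 1)

    loss = torch.mean((model(X) - y)**2)
    H    = get_full_hessian(loss=loss, param=model.weight)  # shape (3, 3)

    # For a linear model under MSE loss, the Hessian is (2/N) X^T X
    H_analytic = 2.0 * X.T @ X / X.shape[0]
    assert torch.allclose(H, H_analytic, atol=1e-4)
    return H
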
""" def get_hessian(inputs, func, **kwargs): The Hessian matrix of a function of is the Jacobian matrix of the gradient of the function f. H(f(x)) = J(grad(f(x)) Note that sometimes Hessian is approximated using a product of Jacobians as JJ^T TBD: Do we assume derivatives commute here (c.f. Grasmann variables) ? """
def observed_fisher_info(logL, inputs, **kwargs):
    """
    Observed Fisher information -- the negative of the second derivative
    (Hessian matrix) of the log-likelihood function.

    The observed information is typically evaluated at the maximum-likelihood
    estimate of theta.

    'Fisher information represents the curvature of the relative entropy'

    Args:
        logL:   The log-likelihood function
                l(theta | X_1, ..., X_n) = sum_{i=1}^n log f(X_i | theta),
                with X_i being the independent random observations
        inputs: log-likelihood function input (parameter values theta)

    References:
        https://en.wikipedia.org/wiki/Observed_information
        https://en.wikipedia.org/wiki/Optimal_design

        Efron, B.; Hinkley, D.V. (1978). "Assessing the accuracy of the maximum
        likelihood estimator: Observed versus expected Fisher Information".
        Biometrika. 65 (3): 457–487.
    """
    I = -torch.autograd.functional.hessian(func=logL, inputs=inputs).squeeze()

    return I

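# Illustrative usage sketch (not part of the original module): observed Fisher
# information for the mean of a unit-variance Gaussian, for which the analytic
# value at the ML estimate is n (the number of observations). The helper name
# `_example_observed_fisher_usage` and the toy data are assumptions added here
# for demonstration only.
def _example_observed_fisher_usage():

    X = torch.randn(100)   # observations X_1, ..., X_n ~ N(0, 1)

    def logL(mu):
        # log-likelihood of N(mu, 1): sum_i [ -0.5*(X_i - mu)^2 - 0.5*log(2*pi) ]
        return torch.sum(-0.5*(X - mu)**2 - 0.5*np.log(2*np.pi))

    mu_hat = torch.mean(X).reshape(1)   # ML estimate of mu (1-element tensor)
    I      = observed_fisher_info(logL=logL, inputs=mu_hat)

    # Analytic observed information: I(mu) = n
    assert torch.allclose(I, torch.tensor(float(len(X))), atol=1e-3)
    return I
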
def test_hessians(EPS=1e-4):
    """
    Test Hessian matrix computations
    """

    def func_f1(x):
        # R^3 -> R
        #
        # Input dim: n_batches x 3
        return x[:,0]**2 + x[:,1]**2 + x[:,2]**2

    def H_1(x):
        # Hessian for func_f1
        return torch.tensor([[2, 0, 0],
                             [0, 2, 0],
                             [0, 0, 2]])

    def func_f2(x):
        # R^3 -> R
        #
        # Input dim: n_batches x 3
        return torch.sin(x[:,0] * x[:,1] * x[:,2])

    def H_2(X):
        # Hessian for func_f2
        x = X[:,0]
        y = X[:,1]
        z = X[:,2]

        sin = torch.sin
        cos = torch.cos

        return torch.tensor([
            [-y**2*z**2*sin(x*y*z),               -x*y*z**2*sin(x*y*z) + z*cos(x*y*z), -x*y**2*z*sin(x*y*z) + y*cos(x*y*z)],
            [-x*y*z**2*sin(x*y*z) + z*cos(x*y*z), -x**2*z**2*sin(x*y*z),               -x**2*y*z*sin(x*y*z) + x*cos(x*y*z)],
            [-x*y**2*z*sin(x*y*z) + y*cos(x*y*z), -x**2*y*z*sin(x*y*z) + x*cos(x*y*z), -x**2*y**2*sin(x*y*z)]])

    for i in range(1000):

        ### TEST CASE 1
        x          = Variable(torch.randn(size=(1,3)))
        H_auto     = torch.autograd.functional.hessian(func=func_f1, inputs=x).squeeze()
        H_analytic = H_1(x)

        assert torch.norm(H_auto - H_analytic) < EPS

        ### TEST CASE 2
        x          = Variable(torch.randn(size=(1,3)))
        H_auto     = torch.autograd.functional.hessian(func=func_f2, inputs=x).squeeze()
        H_analytic = H_2(x)

        assert torch.norm(H_auto - H_analytic) < EPS

def test_dimension_interfaces():
    # Test different dimensional input-output combinations.
    import pytest

    def fx_R_R(x, param=None):
        # Function f: R -> R
        return x**2

    def fx_R3_R(x, param=None):
        # Function f: R^3 -> R
        return (x[:,0] + x[:,1]**2 + x[:,2]**4)

    def fx_R_R3(x, param=None):
        # Function f: R -> R^3
        y = torch.hstack((x, x**4, x**2))
        return y

    def fx_R3_R3(x, param=None):
        # Function f: R^3 -> R^3
        y = torch.vstack((x[:,0], x[:,1]**4, x[:,2])).transpose(0,1)
        return y

    def fx_R3_R4(x, param=None):
        # Function f: R^3 -> R^4
        y = torch.vstack((x[:,0], x[:,1]**4, x[:,2]-x[:,0], x[:,2]*x[:,0])).transpose(0,1)
        return y

    def fx_R4_R3(x, param=None):
        # Function f: R^4 -> R^3
        y = torch.vstack((x[:,1]**4, x[:,2]-x[:,0], x[:,2]*x[:,0])).transpose(0,1)
        return y

    # Number of samples
    n = 2

    # Dimensions
    dim_domain = [1,3,4]
    dim_range  = [1,3,4]

    for d_in in dim_domain:
        for d_out in dim_range:

            # Test input vector with arbitrary 1,2,3,4... elements
            x = Variable(torch.ones(n, d_in).cumsum(0))

            if   d_in == 1 and d_out == 1:
                func = fx_R_R
            elif d_in == 1 and d_out == 3:
                func = fx_R_R3
            elif d_in == 3 and d_out == 1:
                func = fx_R3_R
            elif d_in == 3 and d_out == 3:
                func = fx_R3_R3
            elif d_in == 3 and d_out == 4:
                func = fx_R3_R4
            elif d_in == 4 and d_out == 3:
                func = fx_R4_R3
            else:
                continue # No matching test case

            print(f'----------------------------------------------------------')
            print(f'f: R^{d_in} -> R^{d_out}')
            print(f'x = {x}')
            print(f'f(x) = {func(x)}')
            print(f'df(x)/dx = {get_gradient(func=func, inputs=x)} (get_gradient)')
            print(f'df(x)/dx = {get_jacobian(func=func, inputs=x)} (get_jacobian)')
            print(f'df(x)/dx = {jacobian(func=func, inputs=x)} (jacobian)')
            print('\n')

def test_jacobians(EPS=1e-4):
    # Test Jacobians from:
    # https://en.wikipedia.org/wiki/Jacobian_matrix_and_determinant
    import pytest

    def func_f1(x):
        # R^2 -> R^2
        #
        # n_batches x 2
        return torch.vstack((x[:,0]**2 * x[:,1], 5*x[:,0] + torch.sin(x[:,1]))).transpose(0,1)

    def J_f1(x):
        # This will accept only one sample (batch size = 1)
        J = torch.tensor([[2*x[0]*x[1], x[0]**2],
                          [5,           torch.cos(x[1])]])
        return J

    def func_f2(x):
        # Polar-Cartesian
        # R_+ x [0, 2\pi) -> R^2
        #
        # n_batches x 2
        return torch.vstack((x[:,0]*torch.cos(x[:,1]), x[:,0]*torch.sin(x[:,1]))).transpose(0,1)

    def J_f2(x):
        # This will accept only one sample (batch size = 1)
        J = torch.tensor([[torch.cos(x[1]), -x[0]*torch.sin(x[1])],
                          [torch.sin(x[1]),  x[0]*torch.cos(x[1])]])
        return J

    def func_f3(x):
        # Spherical-Cartesian
        # R_+ x [0, 2\pi) x [0, 2\pi) -> R^3
        #
        # n_batches x 3
        return torch.vstack((x[:,0]*torch.sin(x[:,1])*torch.cos(x[:,2]),
                             x[:,0]*torch.sin(x[:,1])*torch.sin(x[:,2]),
                             x[:,0]*torch.cos(x[:,1]))).transpose(0,1)

    def J_f3(x):
        # This will accept only one sample (batch size = 1)
        J = torch.tensor([[torch.sin(x[1])*torch.cos(x[2]), x[0]*torch.cos(x[1])*torch.cos(x[2]), -x[0]*torch.sin(x[1])*torch.sin(x[2])],
                          [torch.sin(x[1])*torch.sin(x[2]), x[0]*torch.cos(x[1])*torch.sin(x[2]),  x[0]*torch.sin(x[1])*torch.cos(x[2])],
                          [torch.cos(x[1]),                -x[0]*torch.sin(x[1]),                  0]])
        return J

    def func_f4(x):
        # R^3 -> R^4
        #
        # n_batches x 3
        return torch.vstack((x[:,0], 5*x[:,2], 4*x[:,1]**2 - 2*x[:,2], x[:,2]*torch.sin(x[:,0]))).transpose(0,1)

    def J_f4(x):
        # This will accept only one sample (batch size = 1)
        J = torch.tensor([[1,                    0,       0              ],
                          [0,                    0,       5              ],
                          [0,                    8*x[1], -2              ],
                          [x[2]*torch.cos(x[0]), 0,       torch.sin(x[0])]])
        return J

    # Test with random input
    for k in range(100):

        ### TEST CASE 1
        x = Variable(torch.randn(size=(1,2)))

        J_auto     = get_jacobian(func=func_f1, inputs=x).squeeze()
        J_analytic = J_f1(x[0,...])

        assert torch.norm(J_auto - J_analytic) < EPS

        ### TEST CASE 2
        x = Variable(torch.rand(size=(1,2)))
        x[0,0] *= 1000
        x[0,1] *= 2*np.pi

        J_auto     = get_jacobian(func=func_f2, inputs=x).squeeze()
        J_analytic = J_f2(x[0,...])

        assert torch.norm(J_auto - J_analytic) < EPS

        ### TEST CASE 3
        x = Variable(torch.rand(size=(1,3)))
        x[0,0] *= 1000
        x[0,1] *= 2*np.pi
        x[0,2] *= 2*np.pi

        J_auto     = get_jacobian(func=func_f3, inputs=x).squeeze()
        J_analytic = J_f3(x[0,...])

        assert torch.norm(J_auto - J_analytic) < EPS

        ### TEST CASE 4
        x = Variable(torch.rand(size=(1,3)))

        J_auto     = get_jacobian(func=func_f4, inputs=x).squeeze()
        J_analytic = J_f4(x[0,...])

        print(J_auto)
        print(J_analytic)

        assert torch.norm(J_auto - J_analytic) < EPS