Refresher Course on Matrix Analysis and OptimizationPart II: Numerical Optimization |
Franck Iutzeler Jerome Malick |
We consider two $\mathbb{R}^2 \to \mathbb{R}$ convex functions with the same global minimizer $(3,1)$ but quite different shapes and see how this impacts the performance of gradient-based algorithms. The considered functions $f$ and $g$ and their 3D are:
\begin{array}{rrcll} f: & \mathbb{R}^2 & \to &\mathbb{R}\\ & (x_1,x_2) & \mapsto & 4 (x_1-3)^2 + 2(x_2-1)^2 \end{array} | \begin{array}{rrcll} g: & \mathbb{R}^2 & \to &\mathbb{R}\\ & (x_1,x_2) & \mapsto & \log( 1 + \exp(4 (x_1-3)^2 ) + \exp( 2(x_2-1)^2 ) ) - \log(3) . \end{array} |
---|
In numerical optimization, it is customary to define oracles for the functions. These are functions that take a point of the ambiant space as an input and return the value of the function at this point (or its gradient, Hessian, etc.)
# Oracle for function f
def f(x):
"""
Function f
"""
x1 = x[0]
x2 = x[1]
return 4*(x1-3)**2+2*(x2-1)**2
Plotting the level sets of the functions can be useful in 2D.
import numpy as np
import matplotlib.pyplot as plt
npoints = 200
xmin = -0.5
xmax = 5.5
ymin = -0.5
ymax = 5.5
X,Y = np.meshgrid(np.linspace(xmin,xmax,npoints),np.linspace(ymin,ymax,npoints))
Points = [ np.array([x,y]) for (x,y) in zip(X.ravel(), Y.ravel())]
Z = np.array([ f(p) for p in Points ]).reshape(X.shape)
plt.contour(X,Y,Z,levels=[0.1,1,2,5,10,25])
Question a: Construct a function
g
that will be the oracle for the second function \begin{array}{rrcll} g: & \mathbb{R}^2 & \to &\mathbb{R}\\ & (x_1,x_2) & \mapsto & \log( 1 + \exp(4 (x_1-3)^2 ) + \exp( 2(x_2-1)^2 ) ) - \log(3) . \end{array}
def g(x):
"""
Function g
"""
x1 = x[0]
x2 = x[1]
return np.log( 1 + np.exp(4*(x1-3)**2) + np.exp(2*(x2-1)**2) ) - np.log(3)
Question b: Create a function
f_grad
that returns $\nabla f(x)$ (as a Numpy array) from an input vector $x$. Do the same for function $g$
def f_grad(x):
x1 = x[0]
x2 = x[1]
gx = 8*(x1-3)
gy = 4*(x2-1)
return np.array( [ gx , gy ] )
def g_grad(x):
x1 = x[0]
x2 = x[1]
gx = 8*(x1-3)*np.exp(4*(x1-3)**2)/( 1 + np.exp(4*(x1-3)**2) + np.exp(2*(x2-1)**2) )
gy = 4*(x2-1)*np.exp(2*(x2-1)**2)/( 1 + np.exp(4*(x1-3)**2) + np.exp(2*(x2-1)**2) )
return np.array( [ gx , gy ] )
Question c: Implement a constant stepsize gradient method
gradient_algorithm(f,f_grad,x0,step,PREC,ITE_MAX)
that takes:
f
andf_grad
: respectively functions and gradient oraclesx0
: a starting pointstep
: a (real) stepsizePREC
andITE_MAX
: two stopping criteria for i) sought precision (in gradient norm); and ii) maximum number of iterations;and returns a tuple with
x
the final point andx_tab
the matrix of all vectors stacked vertically
import numpy as np
import timeit
def gradient_algorithm(f , f_grad , x0 , step , PREC , ITE_MAX ):
x = np.copy(x0)
stop = PREC*np.linalg.norm(f_grad(x0) )
x_tab = np.copy(x)
print("------------------------------------\n Constant Stepsize gradient\n------------------------------------\nSTART -- stepsize = {:0}".format(step))
t_s = timeit.default_timer()
for k in range(ITE_MAX):
g = f_grad(x)
x = x - step*g
x_tab = np.vstack((x_tab,x))
if np.linalg.norm(g) < stop:
break
t_e = timeit.default_timer()
print("FINISHED -- {:d} iterations / {:.6f}s -- final value: {:f} at point ({:.2f},{:.2f})\n\n".format(k,t_e-t_s,f(x),x[0],x[1]))
return x,x_tab
Question d: Test your gradient descent function on $f$ and $g$:
- Verify that the final point is close to the sought minimizer $(3,1)$
- Observe the behavior of the iterates (you can plot them on top of the contour plot)
- Change the stepsize and give the values for which the algorithm i) diverges; and ii) oscillates. Compare with theoretical limits by computing the Lipschitz constant of the gradients
def level_points_plot(f, x_tab, xmin, xmax, ymin, ymax, npoints, levels):
X,Y = np.meshgrid(np.linspace(xmin,xmax,npoints),np.linspace(ymin,ymax,npoints))
Points = [ np.array([x,y]) for (x,y) in zip(X.ravel(), Y.ravel())]
Z = np.array([ f(p) for p in Points ]).reshape(X.shape)
graph = plt.contour(X,Y,Z,levels=[0.1,1,2,5,10,25])
if x_tab.shape[0] > 40:
sub = int(x_tab.shape[0]/40.0)
x_tab = x_tab[::sub]
delay = 2.0/x_tab.shape[0]
for k in range(x_tab.shape[0]):
plt.plot(x_tab[k,0], x_tab[k,1], '*b', markersize=10)
plt.show()
PREC = 1e-8
ITE_MAX = 100
x0 = np.array([0, 0])
step = 0.1
x_f, x_tab_f = gradient_algorithm(f, f_grad, x0, step, PREC, ITE_MAX)
npoints = 200
xmin = -0.5
xmax = 5.5
ymin = -1
ymax = 5.5
levels = [0.1,1,2,5,10,25]
level_points_plot(f, x_tab_f, xmin, xmax, ymin, ymax, npoints, levels)
PREC = 1e-8
ITE_MAX = 100
x0 = np.array([0, 0])
step = 0.1
x, x_tab = gradient_algorithm(g, g_grad, x0, step, PREC, ITE_MAX)
npoints = 200
xmin = -0.5
xmax = 5.5
ymin = -1
ymax = 5.5
levels = [0.1,1,2,5,10,25]
level_points_plot(g, x_tab, xmin, xmax, ymin, ymax, npoints, levels)
We now get back to the problem of predicting the final grade of a student from various features treated in the Matrix part of the course.
We remind that mathematically, from the $m_{learn} \times (n+1)$ learning matrix $A_{learn}$ ($m_{learn} = 300$, $n=27$) comprising of the features values of each training student in line, and the vector of the values of the target features $b_{learn}$; we seek a size-$(n+1)$ regression vector that minimizes the squared error between $A_{learn} x$ and $b_{learn}$. This problem boils down to the following least square problem: $$ \min_{x\in\mathbb{R}^{n+1}} s(x) = \frac{1}{2} \| A_{learn} x - b_{learn} \|_2^2 . $$
import numpy as np
# File reading
dat_file = np.load('data/student.npz')
A_learn = dat_file['A_learn']
b_learn = dat_file['b_learn']
A_test = dat_file['A_test']
b_test = dat_file['b_test']
m = 395 # number of read examples (total:395)
n = 27 # features
m_learn = 300
Question a: Construct the oracles for function $s$ and its gradient as in the previous section
def s(x):
return 0.5 * np.linalg.norm(np.dot(A_learn, x) - b_learn)**2
def s_grad(x):
return np.dot(A_learn.T, (np.dot(A_learn, x) - b_learn))
Question b: Compute the Lipschitz constant of the gradient of $s$. Find a solution to the minimization of $s$ using your gradient algorithm. Compare with Numpy's Least Square routine.
L = np.linalg.norm(np.dot(A_learn.T, A_learn))
print(L)
PREC = 1e-12
ITE_MAX = 5000
x0 = np.zeros(n+1)
step = 1 / L
x, x_tab = gradient_algorithm(s, s_grad, x0, step, PREC, ITE_MAX)
x_lstsq = np.linalg.lstsq(A_learn, b_learn,rcond=None)[0]
print(np.linalg.norm(x - x_lstsq))
ys = [ np.linalg.norm(x_tab[k, :] - x_lstsq) for k in range(x_tab.shape[0])]
fig, ax = plt.subplots()
ax.semilogy(range(x_tab.shape[0]), ys)
ax.set(xlabel='iteration k', ylabel='||x_k - x_lstsq ||')
Question c: Generate random Gaussian matrix/vector couples $A,b$ with different sizes, change the shape of $A$ from tall (nb. of rows >> nb. of cols.) to fat (nb. of rows << nb. of cols.). Compare the execution time of constant stepsize gradient with fixed precision for the different couples.
import numpy as np
n_new = 100
m_new = 10000
A_learn = np.random.randn(m_new,n_new)
b_learn = np.random.randn(m_new)
PREC = 1e-12
ITE_MAX = 5000
x0 = np.zeros(n_new)
step = 1 / np.linalg.norm(np.dot(A_learn.T, A_learn))
def s(x):
return 0.5 * np.linalg.norm(np.dot(A_learn, x) - b_learn)**2
def s_grad(x):
return np.dot(A_learn.T, (np.dot(A_learn, x) - b_learn))
x, x_tab = gradient_algorithm(s, s_grad, x0, step, PREC, ITE_MAX)
The Rosenbrock function $r$ writes
\begin{array}{rrcll} r: & \mathbb{R}^2 & \to &\mathbb{R}\\ & (x_1,x_2) & \mapsto & (1-x_1)^2 + 100(x_2-x_1^2)^2 . \end{array} |
---|
This function has a global minimum at $(1,1)$ with value $0$.
Question a: Construct function and gradient oracles for $r$.
def r(x):
x1 = x[0]
x2 = x[1]
return (1-x1)**2 + 100*(x2-x1**2)**2
def r_grad(x):
x1 = x[0]
x2 = x[1]
g1 = 2*(x1-1) + 100*2*(2*x1)*(x1**2-x2)
g2 = 100*2*(x2-x1**2)
return np.array([g1,g2])
Question b: Try to minimize $r$ using your constant stepsize gradient function. Can you find a stepsize for which the algorithm converges?
PREC = 1e-6
ITE_MAX = 1000
x0 = np.zeros(2)
step = 0.001
x, x_tab = gradient_algorithm(r, r_grad, x0, step, PREC, ITE_MAX)
npoints = 200
xmin = -1.5
xmax = 2.5
ymin = -1.5
ymax = 2.5
levels = [0.1,1,2,5,10,25]
level_points_plot(r, x_tab, xmin, xmax, ymin, ymax, npoints, levels)
The observed problem is typical of poorly conditionned functions. At the beggining, gradients are large and so a small stepsize has to be employed but near the optimal value, the gradients are very small and thus the progress is very slow.
Question c: Try to devise a gradient method where you change the stepsize during the algorithms to mend this problem.
A systematic approch to stepsize tuning is the use of a line-search method.
Question d: Implement a gradient algorithm with Armijo-Wolfe linesearch using Scipy's line_search routine.
Warning: Since you are doing gradient descent, your search direction is the opposite of the gradient.
import numpy as np
import timeit
from scipy.optimize import line_search
def gradient_Wolfe(f , f_grad , x0 , PREC , ITE_MAX ):
x = np.copy(x0)
g = f_grad(x0)
stop = PREC*np.linalg.norm( g )
x_tab = np.copy(x)
print("------------------------------------\n Gradient with Wolfe line search\n------------------------------------\nSTART")
t_s = timeit.default_timer()
for k in range(ITE_MAX):
########### TO FILL
g = f_grad(x)
stepsize = line_search(f, f_grad, x, -g)[0]
x = x - stepsize*g ###### ITERATION
x_tab = np.vstack((x_tab,x))
if np.linalg.norm(g) < stop:
break
t_e = timeit.default_timer()
print("FINISHED -- {:d} iterations / {:.6f}s -- final value: {:f} at point ({:.2f},{:.2f})\n\n".format(k,t_e-t_s,f(x),x[0],x[1]))
return x,x_tab
x, x_tab = gradient_algorithm(r, r_grad, x0, step, PREC, ITE_MAX)
x, x_tab = gradient_Wolfe(r, r_grad, x0, PREC, ITE_MAX)
Binary classification is another popular problem in machine learning. Instead of predicting a numerical value, the goal is now to classify the student into two classes: $+1$ -- pass i.e. final grade $\geq 10$; and $-1$ -- fail.
To this purpose, we create a class vector $c_{learn}$ from the observation vector $b_{learn}$ by simply setting $c_{learn}(i) = +1 $ if $b_{learn}(i)\geq10$ and $-1$ otherwise.
Then, the most common approach is to minimize the logistic loss regularized by a squared norm: \begin{equation} \min_{x\in\mathbb{R}^{n+1}} \ell(x) = \sum_{i=1}^{m_{learn} } \log\left( 1 + \exp\left( -c_{learn}(i) a_i^{\mathrm{T}} x \right) \right) + \frac{1}{m}\|x\|^2 \end{equation} where $a^\mathrm{T}_i$ is the $i$-th row of $A_{learn}$.
Finally, from a solution $x^\star$ of this problem, one can classify a new example, represented by its feature vector $a$, as such: the quantity $p(a) = \frac{1}{1+\exp(- a^\mathrm{T} x^\star)}$ estimates the probability of belonging to class $1$; thus, one can decide class $+1$ if for instance $ p(a) \geq 0.5$; otherwise, decide class $-1$.
import numpy as np
# File reading
dat_file = np.load('data/student.npz')
A_learn = dat_file['A_learn']
b_learn = dat_file['b_learn']
A_test = dat_file['A_test']
b_test = dat_file['b_test']
m = 395 # number of read examples (total:395)
n = 27 # features
m_learn = 300
c_learn = np.sign(b_learn-(10-1e-15))
c_learn
Question a: Compute the gradient of $q(t) = \log(1+\exp(t))$. Is the function is convex? Deduce that $\ell$ is convex and compute its gradient.
def q(t):
return np.log(1+np.exp(t))
def q_prime(t):
return 1/(1+np.exp(-t))
As you can see if you plot it, this kind of function has steep slopes but also very flat valleys. Hence, constant stepsize might not perform well.
Question b: Construct the suitable function and gradient simulators in order to use your gradient descent function with Armijo-Wolfe linesearch.
def l(x):
l = 0.0
for i in range(m_learn):
l += q(c_learn[i]*A_learn[i,:].dot(x))
return l + 1/m_learn*np.linalg.norm(x)**2
def l_grad(x):
g = np.zeros(x.shape)
for i in range(m_learn):
g += c_learn[i]*A_learn[i,:]*q_prime(c_learn[i]*A_learn[i,:].dot(x))
return g + 2/m_learn*x
x, x_tab = gradient_Wolfe(l, l_grad, np.random.randn(n+1), 1e-8, 500)
x
Question c: From a final point of the optimization algorithm above, generate a decision vector corresponding to the testing set $A_{test}$. Evaluate the classification error.
c_test = np.sign(b_test-(10-1e-15))
y_pred = np.zeros(c_test.shape)
for i in range(y_pred.size):
y_pred[i] = np.sign(1/(1+np.exp(A_test[i,:].dot(x)))-0.5)
print("Pred. \t True")
acc = 0
for i in range(y_pred.size):
print("{}\t{}".format(y_pred[i],c_test[i]))
if y_pred[i] == c_test[i]:
acc += 1.0/y_pred.size
print("Accuracy: {:.2f}%".format(acc*100))