\documentclass[12pt]{article}
\usepackage{amsmath}
\begin{document}

Gradient Methods for Chapter 8 of Chong/Zak

Author: Robert Beezer

History: 1999/03/22 \quad First version.

\subsubsection{Properties of the gradient vector}

\begin{enumerate}
\item Consider a level curve $x(t)$, parameterized as a function of one
variable $t$. So, by definition, $f(x(t))=c$ where $c$ is some constant.

\item Let $t_{0}$ be such that $x(t_{0})=x_{0}$, a point of interest.

\item Differentiating with respect to $t$ and using the appropriate form of
the chain rule gives:
\[
\frac{d}{dt}f(x(t))=\frac{d}{dt}c
\]
\[
\nabla f(x(t))\cdot x^{\prime }(t)=0
\]

\item Evaluating at $t_{0}$ gives
\[
\nabla f(x_{0})\cdot x^{\prime }(t_{0})=0
\]

\item So the vector $x^{\prime }(t_{0})$ (which is tangent to the curve at
$x_{0}$) is orthogonal to the gradient $\nabla f(x_{0})$ there.

\item Now consider a directional derivative at $x_{0}$. Its value in the
direction of a unit vector $d$ is $\nabla f(x_{0})\cdot d$. This quantity is
maximized (resp.\ minimized) when $d$ has the same direction as
$\nabla f(x_{0})$ (resp.\ the same direction as $-\nabla f(x_{0})$).

\item So this maximum (resp.\ minimum) value of the directional derivative
is achieved in the direction of the unit vector
$d=\frac{1}{\Vert \nabla f(x_{0})\Vert }\nabla f(x_{0})$
(resp.\ $d=\frac{-1}{\Vert \nabla f(x_{0})\Vert }\nabla f(x_{0})$), which is
perpendicular to the level curve. Notice that this is the actual rate of
increase, since we have employed a unit vector for the direction.

\item This maximum value is then
\[
\nabla f(x_{0})\cdot d=\nabla f(x_{0})\cdot \frac{1}{\Vert \nabla
f(x_{0})\Vert }\nabla f(x_{0})=\frac{\Vert \nabla f(x_{0})\Vert ^{2}}{\Vert
\nabla f(x_{0})\Vert }=\Vert \nabla f(x_{0})\Vert
\]
(for the minimum value it is just $-\Vert \nabla f(x_{0})\Vert $).

\item Discuss hiking and topographic maps: the gradient is perpendicular to
the contour lines, so the steepest route uphill crosses the contours at
right angles.
\end{enumerate}
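As a quick sanity check of this orthogonality property, take
$f(x_{1},x_{2})=x_{1}^{2}+x_{2}^{2}$, whose level curve $f=r^{2}$ is the
circle $x(t)=(r\cos t,r\sin t)$. Then
\[
\nabla f(x(t))\cdot x^{\prime }(t)=(2r\cos t,2r\sin t)\cdot (-r\sin t,r\cos
t)=-2r^{2}\cos t\sin t+2r^{2}\sin t\cos t=0
\]
so the gradient $(2x_{1},2x_{2})$ points radially, perpendicular to the
level circles, and the greatest rate of increase is
$\Vert \nabla f\Vert =2r$, in that radial direction.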
\subsubsection{Employing the gradient for searching}

\begin{enumerate}
\item Taylor's Theorem, taken out one term, built at a starting point
$x^{(0)}$, says (i.e.\ this is a tangent plane approximation)
\[
f(x)\approx f(x^{(0)})+\nabla f(x^{(0)})\cdot (x-x^{(0)})
\]
Evaluated at $x=x^{(0)}-\alpha \nabla f(x^{(0)})$ (for $\alpha >0$) we get
\[
f(x^{(0)}-\alpha \nabla f(x^{(0)}))\approx f(x^{(0)})+\nabla f(x^{(0)})\cdot
(x^{(0)}-\alpha \nabla f(x^{(0)})-x^{(0)})
\]
\[
=f(x^{(0)})+\nabla f(x^{(0)})\cdot (-\alpha \nabla f(x^{(0)}))
=f(x^{(0)})-\alpha \Vert \nabla f(x^{(0)})\Vert ^{2}
\]
So, if we move a ``little bit'' (made precise by $\alpha $) in the direction
of $-\nabla f(x^{(0)})$, we will \emph{decrease} the value of the function
below $f(x^{(0)})$, since the squared length is positive (provided
$\nabla f(x^{(0)})\neq 0$).

\item So set $x^{(1)}=x^{(0)}-\alpha \nabla f(x^{(0)})$ for some $\alpha $.
How should we choose $\alpha $ (known as the step size)? Consider the
function of just one variable
$\phi (\alpha )=f(x^{(0)}-\alpha \nabla f(x^{(0)}))$
and find the value of $\alpha $ that \emph{minimizes} this expression, using
standard techniques for a function of one variable (i.e.\ first-semester
calculus). This is called the method of steepest descent.

\item Theorem: The method of steepest descent moves in orthogonal steps.

Proof: Let $\alpha _{k}$ be the value of $\alpha $ that minimizes
$\phi _{k}(\alpha )=f(x^{(k)}-\alpha \nabla f(x^{(k)}))$. Then
\[
0=\phi _{k}^{\prime }(\alpha _{k})=\left. \frac{d}{d\alpha }f(x^{(k)}-\alpha
\nabla f(x^{(k)}))\right\vert _{\alpha =\alpha _{k}}
\]
\[
=\nabla f(x^{(k)}-\alpha _{k}\nabla f(x^{(k)}))\cdot \left( -\nabla
f(x^{(k)})\right) =-\nabla f(x^{(k+1)})\cdot \nabla f(x^{(k)})
\]
Now consider the dot product of successive steps:
\[
(x^{(k+1)}-x^{(k)})\cdot (x^{(k+2)}-x^{(k+1)})=\left( -\alpha _{k}\nabla
f(x^{(k)})\right) \cdot \left( -\alpha _{k+1}\nabla f(x^{(k+1)})\right)
=\alpha _{k}\alpha _{k+1}(0)=0
\]
This says that the movements from point to point are perpendicular.

\item This ``zig-zag'' behavior is interesting, but not really desirable,
as it leads to poor performance for this algorithm. (A code sketch of the
method follows this list.)
\end{enumerate}
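Here is a minimal code sketch of the method in Python; it is an
illustration, not from the text. It assumes NumPy and SciPy are available,
uses \texttt{scipy.optimize.minimize\_scalar} in place of the by-hand
minimization of $\phi $, and the step-size bracket $[0,1]$ is an assumption
that happens to cover the example in the next section.

\begin{verbatim}
import numpy as np
from scipy.optimize import minimize_scalar

def steepest_descent(f, grad, x0, steps=3):
    # Repeatedly move from x to x - alpha * grad f(x), choosing alpha
    # by minimizing phi(alpha) = f(x - alpha * grad f(x)).
    x = np.asarray(x0, dtype=float)
    for k in range(steps):
        g = grad(x)
        phi = lambda a: f(x - a * g)  # one-variable function of the step size
        alpha = minimize_scalar(phi, bounds=(0.0, 1.0), method="bounded").x
        x = x - alpha * g
        print(f"x({k+1}) = {x},  f = {f(x):.8f}")
    return x

# The example worked in the next section:
# f(x1, x2) = 2 x1^4 + x2^2 - 4 x1 x2 + 5 x2.
f = lambda x: 2*x[0]**4 + x[1]**2 - 4*x[0]*x[1] + 5*x[1]
grad = lambda x: np.array([8*x[0]**3 - 4*x[1], 2*x[1] - 4*x[0] + 5])
steepest_descent(f, grad, (0.0, 0.0))
\end{verbatim}

Running this reproduces, up to the line-search tolerance, the iterates
computed by hand below: $x^{(1)}=(0,-2.5)$, $x^{(2)}\approx (-1.077,-2.5)$,
$x^{(3)}\approx (-1.077,-4.654)$.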
\subsubsection{An example of steepest descent}

\begin{enumerate}
\item (Peressini/Sullivan/Uhl, Problem 3.8)
$f(x_{1},x_{2})=2x_{1}^{4}+x_{2}^{2}-4x_{1}x_{2}+5x_{2}$, starting with
$x^{(0)}=(0,0)$.

\item $\nabla f(x_{1},x_{2})=\left( 8x_{1}^{3}-4x_{2},\;2x_{2}-4x_{1}+5\right) $

\item First iteration:
\[
\phi _{0}(\alpha )=f(x^{(0)}-\alpha \nabla f(x^{(0)}))=f((0,0)-\alpha \nabla
f(0,0))=f((0,0)-\alpha (0,5))=f((0,-5\alpha ))=-25\alpha +25\alpha ^{2}
\]
This function is minimized at $\alpha =\frac{1}{2}$ (verify that
$\phi _{0}^{\prime \prime }(\frac{1}{2})=50>0$).

So $x^{(1)}=x^{(0)}-\frac{1}{2}\nabla f(x^{(0)})=(0,0)-\frac{1}{2}
(0,5)=(0,-\frac{5}{2})=(0,-2.5)$.

\item Second iteration:
\[
\phi _{1}(\alpha )=f(x^{(1)}-\alpha \nabla f(x^{(1)}))
=f((0,-\tfrac{5}{2})-\alpha \nabla f(0,-\tfrac{5}{2}))
=f((0,-\tfrac{5}{2})-\alpha (10,0))
=f((-10\alpha ,-\tfrac{5}{2}))
=-\tfrac{25}{4}-100\alpha +20000\alpha ^{4}
\]
The derivative $\phi _{1}^{\prime }(\alpha )=-100+80000\alpha ^{3}$ has just
one real root, $\alpha =\frac{1}{2(10^{2/3})}$. This is a minimum since
$\phi _{1}^{\prime \prime }\left( \frac{1}{2(10^{2/3})}\right)
=600(10^{2/3})>0$.

So $x^{(2)}=x^{(1)}-\frac{1}{2(10^{2/3})}\nabla f(x^{(1)})
=(0,-\tfrac{5}{2})-\frac{1}{2(10^{2/3})}(10,0)
=(-\tfrac{5^{1/3}}{2^{2/3}},-\tfrac{5}{2})\approx (-1.077,-2.5)$.

\item Third iteration:
\[
\phi _{2}(\alpha )=f(x^{(2)}-\alpha \nabla f(x^{(2)}))
=f((-\tfrac{5^{1/3}}{2^{2/3}},-\tfrac{5}{2})-\alpha \nabla
f(-\tfrac{5^{1/3}}{2^{2/3}},-\tfrac{5}{2}))
\]
\[
=f((-\tfrac{5^{1/3}}{2^{2/3}},-\tfrac{5}{2})-\alpha (0,2(10^{1/3})))
=-\tfrac{25}{4}-\tfrac{15(5^{1/3})}{2(2^{2/3})}-4(10^{2/3})\alpha
+4(10^{2/3})\alpha ^{2}
\]
This function is minimized at $\alpha =\frac{1}{2}$ (verify that it is an
upward-opening parabola).

So $x^{(3)}=x^{(2)}-\frac{1}{2}\nabla f(x^{(2)})
=(-\tfrac{5^{1/3}}{2^{2/3}},-\tfrac{5}{2})-\frac{1}{2}(0,2(10^{1/3}))
=(-\tfrac{5^{1/3}}{2^{2/3}},-\tfrac{5}{2}-10^{1/3})$.

\item We could continue in this vein for as long as we pleased. Let's stop
here and see how we are doing. Numerically we are at
\[
x^{(3)}=(-\tfrac{5^{1/3}}{2^{2/3}},-\tfrac{5}{2}-10^{1/3})\approx
(-1.077217345,-4.654434690)
\]
We can obtain a possible minimizer by solving the equation
$\nabla f(x)=0$. This yields the one real solution (Mathematica gives an
exact solution, but it is gruesome looking)
$x^{\ast }\approx (-1.38041,-5.26082)$. Evaluating the objective function,
we have $f(x^{(3)})=-18.97071892$ versus $f(x^{\ast })=-20.41412443394550$,
so we appear to be close.
\end{enumerate}

Problems: minimize $2x_{1}^{3}+3x_{2}^{4}-6x_{1}x_{2}^{2}$, starting at
$x^{(0)}=(2,2)$; do three iterations using steepest descent.

\end{document}