by: Tomasz Romańczukiewicz
web: th.if.uj.edu.pl/~trom/
rm B-2-03
Basically, there are three types of minimization techniques:
In principle if $f(x)=0$ for $x=x_*$ then $g(x)=[f(x)]^2$ has a minimum at $x_*$.
So it may seem that finding a root of $f(x)$ is equivalent to finding a minimum of $g(x)$.
Analytically it might be true but numerically they are different procedures.
All methods start with points satisfying the relations:
$$a<b<c \qquad f(a)>f(b) \;\wedge\; f(b)<f(c)$$
Then we may expect (if the function is regular enough — continuous $f$ and $f'$) that $f$ has a minimum in $(a, c)$
If we don't know such three points we search for them:
If we have three points bracketing the minimum we can use some interval divisions to localize the minimum
Interval $[a, c]$ can be divided into three subintervals (equal — ternary search — or unequal, e.g. using the golden ratio)
%matplotlib inline
import matplotlib.pyplot as pl
import numpy as np
def f(x):
    """Test function with a single minimum on the plotted range; used by the bracketing demos."""
    numerator = 1 - 2*x
    denominator = 2*x + 10
    return np.exp(-x) * numerator / denominator
def show_minimization_steps(F, x1, x2, step=0, plot=True, prnt=False):
    """
    Demonstrate `step` iterations of interval reduction for minimizing F.

    The bracket [x1, x2] is sampled at 4 equidistant points; depending on
    which interior point is lower, the bracket shrinks to its left or right
    two-thirds. Function not optimized — used only for presentation purposes.

    F     : callable accepting a numpy array
    x1,x2 : initial bracket endpoints
    step  : number of reduction iterations to perform
    plot  : draw the function and the current 4 sample points
    prnt  : print the sample points after each iteration
    """
    x = np.linspace(x1, x2, 1000)
    y = F(x)
    X = np.linspace(x1, x2, 4)
    Y = F(X)
    for n in range(step):
        # Keep the two-thirds of the bracket that contains the lower interior point.
        if Y[1] < Y[2]:
            x1, x2 = X[0], X[2]
        else:
            x1, x2 = X[1], X[3]
        X = np.linspace(x1, x2, 4)
        Y = F(X)
        if prnt:
            print("step :", n, X)
    if plot:
        # Figure is created only when actually plotting (the original opened
        # an empty figure even with plot=False).
        pl.figure(figsize=(6, 3), dpi=150)
        pl.grid(True)
        pl.plot(x, y)
        pl.plot(X, Y, 'o')
        pl.show()
# Show the first few reduction steps, then print ten steps without plotting.
for demo_steps in (0, 1, 2, 3, 4):
    show_minimization_steps(f, 0, 6, demo_steps)
show_minimization_steps(f, 0, 6, 10, plot=False, prnt=True)
Each iteration uses new middle points. Very inefficient.
In each iteration we reduce the segment to $2/3$ of its length at the cost of two evaluations of $f(x)$
Perhaps we could reuse three points from the previous iteration and calculate $f(x)$ only once
But where to put the fourth point?
Best strategy: divide the interval into three parts $|\,a\,|\,b\,|\,a\,|$ in such a way that each of the two overlapping brackets (of length $a+b$) is $\tau$ times the length of the whole interval, and $a$ is in turn $\tau$ times $a+b$.
$$|\;\; a\;\; |\;b\;|\;\; a\;\; |$$$$a+b=(2a+b)\tau \;\wedge\; a=(a+b)\tau$$$$\tau^2+\tau-1=0\;\;\Rightarrow\;\; \tau=\frac{\sqrt{5}-1}{2}=0.61803398875$$Golden ratio minimizes number of steps on average (larger vs smaller segment)
We converge as $\tau^n$ (linear convergence) with $n$ evaluations of $f$
Slower than bisection $(1/2)^n$ but faster than ternary search
Slightly better but far more complicated algorithm: Fibonacci search
The lengths of the intervals (from smallest to largest) form a Fibonacci series
How accurately can we find $x_{min}$?
Around the minimum usually:
$$f(x)=f(x_{min})+\frac12f''(x_{min})(x-x_{min})^2$$
But $f$ can only be calculated with an error $f\pm\epsilon|f|$, which limits the precision of $x_{min}$ to
def plot_accuracy_comparision():
    """Side-by-side comparison: locating a minimum of f vs. a zero of f' in float16 precision."""
    pl.figure(figsize=(10, 3), dpi=150)
    grid = np.linspace(0, 6, 1000)
    # Both curves are rounded to float16 to make the round-off plateau visible.
    flat_minimum = np.float16(1 + 0.001*(grid - 2)**2 - 2e-4*(grid - 2.5)**3)
    steep_zero = np.float16(0.002*(grid - 2) - 6e-4*(grid - 2.5)**2)
    pl.subplot(1, 2, 1)
    pl.grid(True)
    pl.title('Minimum')
    pl.plot(grid, flat_minimum)
    pl.legend(['f(x)'])
    pl.subplot(1, 2, 2)
    pl.grid(True)
    pl.title('zero')
    pl.plot(grid, steep_zero, 'red')
    pl.legend(["f'(x)"])
    pl.show()
Expected:
# float16 carries 11 significant bits of mantissa → relative error ~ 2^-11.
ε = 2**(-11)
print("Accuracy of minimum: ", np.sqrt(ε*1/0.002))
plot_accuracy_comparision()
should approximate a minimum of $f(x)$ if we are close enough to the minimum.
def plot_parabola_method(f, a, b, c, step):
    """
    Run `step` iterations of successive parabolic interpolation on the
    bracket (a, b, c) and plot f together with the last fitted parabola.

    Function not optimized — used only for presentation purposes.

    f       : callable accepting scalars and numpy arrays
    a, b, c : initial bracketing points, a < b < c expected
    step    : number of interpolation iterations
    """
    def lagrange_parabola(x):
        # Parabola through (a,f(a)), (b,f(b)), (c,f(c)); reads the *current*
        # bracket, so it must be re-evaluated after each update.
        return ((x-b)*(x-c)*f(a)/(a-b)/(a-c)
                + (x-a)*(x-c)*f(b)/(b-a)/(b-c)
                + (x-b)*(x-a)*f(c)/(c-b)/(c-a))

    pl.figure(figsize=(6, 3), dpi=150)
    x = np.linspace(0, 6, 1000)
    X = np.array([a, b, c])
    parabola = lagrange_parabola(x)
    for n in range(step):
        parabola = lagrange_parabola(x)
        # d = abscissa of the parabola's vertex (standard 3-point formula).
        d = b - 0.5 * (((b-a)**2*(f(b)-f(c)) - (b-c)**2*(f(b)-f(a)))
                       / ((b-a)*(f(b)-f(c)) - (b-c)*(f(b)-f(a))))
        X = np.array([a, b, c, d])
        X.sort()
        F = f(X)
        # Keep the three consecutive points that still bracket the minimum.
        if F[1] < F[2]:
            a, b, c = X[0], X[1], X[2]
        else:
            a, b, c = X[1], X[2], X[3]
        # print(X, " ", d, f(d))
    pl.plot(x, f(x), 'black')
    pl.ylim([-0.08, 0.1])
    pl.plot(x, parabola, linestyle='dashed')
    pl.grid(True)
    pl.plot(X, f(X), 'o', ms=3)
    pl.show()
# Parabolic interpolation: a few early iterations, then a long run.
for n_iter in (0, 1, 2, 3, 10):
    plot_parabola_method(f, 0, 1, 3, n_iter)
def plot_brent_method(f, a, b, c, step):
    """
    Demonstrate a Brent-style minimization: parabolic interpolation with a
    bisection fallback, plotting f and the last fitted parabola.

    Function not optimized
    used only for presentation purposes

    f       : callable accepting scalars and numpy arrays
    a, b, c : initial bracketing points
    step    : number of iterations to perform
    """
    pl.figure(figsize=(6,3), dpi=150)
    x = np.linspace(0, 6, 1000)
    X = np.array([a, b, c])
    # Parabola through (a,f(a)), (b,f(b)), (c,f(c)) — Lagrange form.
    parabola = (x-b)*(x-c)*f(a)/(a-b)/(a-c) + (x-a)*(x-c)*f(b)/(b-a)/(b-c) + (x-b)*(x-a)*f(c)/(c-b)/(c-a)
    bisect = False
    for n in range(step):
        length = c-a          # bracket width before this iteration
        XXX = (a, b, c)       # saved bracket, restored if the step shrinks too slowly
        bisect = False
        parabola = (x-b)*(x-c)*f(a)/(a-b)/(a-c) + (x-a)*(x-c)*f(b)/(b-a)/(b-c) + (x-b)*(x-a)*f(c)/(c-b)/(c-a)
        # Candidate point: abscissa of the parabola's vertex.
        d = b - 0.5 * ((b-a)**2*(f(b)-f(c))-(b-c)**2*(f(b)-f(a)) ) / \
            ((b-a)*(f(b)-f(c))-(b-c)*(f(b)-f(a)) )
        if d<a or d>c:
            # Vertex fell outside the bracket — fall back to bisection.
            # print("Bisection outside")
            d=0.5*(a+c)
            bisect = True
        X = np.array([a, b, c, d])
        X.sort()
        F = f(X)
        # Keep the three consecutive points that still bracket the minimum.
        if F[1]<F[2]: a, b, c = X[0], X[1], X[2]
        else: a, b, c = X[1], X[2], X[3]
        if c-a>0.5*length:
            # Bracket shrank by less than half — discard the parabolic step,
            # restore the previous bracket and bisect instead.
            a, b, c = XXX
            d=0.5*(a+c)
            X = np.array([a, b, c, d])
            X.sort()
            F = f(X)
            bisect = True
            # print("Bisection length")
        # Re-select the bracketing triple (no-op after a normal step,
        # required after the length-based bisection above).
        if F[1]<F[2]: a, b, c = X[0], X[1], X[2]
        else: a, b, c = X[1], X[2], X[3]
        # print(X, " ", d, f(d))
    pl.plot(x, f(x), 'black')
    pl.ylim([-0.08,0.1])
    if bisect==True: pl.text(4, 0.06, "Bisection")
    pl.plot(x, parabola, linestyle='dashed')
    pl.grid(True)
    pl.plot(X,f(X), 'o', ms=3)
    pl.show()
# Brent-style iteration demo, step by step.
for n_iter in range(6):
    plot_brent_method(f, 0, 1, 3, n_iter)
Knowing the derivative of $f(x)$ allows to find a zero of $f'(x)$ which might be slightly more difficult but can be done more precisely
Simulated annealing reproduces the physical process of heating a body followed by slow cooling, often used in metallurgy.
Algorithm (details may vary):
if $f(y)>f(x)$ we accept $y$ as a new solution with probability $P(f(y)-f(x), T)$; in physics $P$ follows the Boltzmann distribution
$$P(\Delta E, T) = e^{-\Delta E/T}$$
Differential evolution: starting from a set of agents (test solutions), in each iteration we create a new set by mutating the values. In 1-dim:
In higher dimensions we also choose which coordinates are mutated with cross over probability
def f(x):
    """Multimodal test function for the global-minimization demos."""
    oscillation = 4*np.sin(0.5*x**2)
    modulated_well = np.cos(10/(x**2 + 0.5))*(x - 5)**2
    return oscillation + modulated_well
def show_current_function(X, G, x, g, position, step):
    """Draw one panel of the evolution: the target curve (X, G) and the current agents (x, g)."""
    pl.subplot(3, 2, position)
    pl.plot(X, G, 'black')
    pl.grid(True)
    pl.plot(x, g, "o", color='orange', ms=5)
    label = "step {:d}\nmin={:.6f}".format(step, np.amin(g))
    pl.text(1.2, 22, label)
def diff_evolution_example():
    """
    Minimize f on [-3, 3] with a minimal 1-D differential evolution
    (10 agents, 10 generations) and plot the population every second
    generation via show_current_function.

    Fixes over the original: X and G (the background curve passed to
    show_current_function) were never defined, raising NameError; the
    subplot position 2+step/2 was a float where an int is required.
    """
    x1, x2, N = -3, 3, 10
    pl.figure(figsize=(12, 7), dpi=120)
    # Dense sampling of f used as the background curve in every panel.
    X = np.linspace(x1, x2, 1000)
    G = f(X)
    # initial random choice of agents
    x = x1 + (x2 - x1)*np.random.rand(N)
    g = f(x)
    show_current_function(X, G, x, g, 1, 0)
    F = 0.2  # weight (differential mutation factor)
    for step in range(10):
        for n in range(N):
            # Mutate: combine three randomly chosen agents.
            (a, b, c) = np.random.choice(x, 3)
            y = a + F*(b - c)
            gy = f(y)
            # Greedy selection: accept only improving, in-range mutants.
            if gy < g[n] and x1 < y < x2:
                x[n], g[n] = y, gy
        if step % 2 == 0:
            # Panels 2..6; subplot index must be an integer.
            show_current_function(X, G, x, g, 2 + step//2, step + 1)
    pl.show()


diff_evolution_example()