In [None]:
from platform import python_version
print(python_version())
import torch
import torch.optim as optim

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import json
import numpy as np
import scipy as sp
import scipy.stats as st
import scipy.integrate as integrate
from scipy.stats import multivariate_normal
from sklearn import linear_model
from sklearn.exceptions import ConvergenceWarning
from matplotlib.colors import LogNorm

# Global plotting configuration shared by every figure in the notebook.
sns.set_style("whitegrid")
sns.set_palette("colorblind")
palette = sns.color_palette()
figsize = (15,8)
legend_fontsize = 16

from matplotlib import rc
rc('font',**{'family':'sans-serif'})
rc('text', usetex=True)
# Each rc('text.latex', preamble=...) call REPLACES the previous preamble, so
# two separate calls would keep only the last package. Both \usepackage lines
# therefore go into a single preamble string.
rc('text.latex', preamble=r'\usepackage[utf8]{inputenc}' r'\usepackage[russian]{babel}')
rc('figure', **{'dpi': 300})

In [None]:
# Classic activations: threshold, logistic sigmoid, tanh and ReLU on [-5, 5].
fig, ax = plt.subplots(figsize=(10,5))
xs = np.linspace(-5, 5, 500)
lw = 1.5


@np.vectorize
def relu(x):
    """Scalar ReLU, max(0, x); np.vectorize applies it elementwise."""
    return max(0.0, x)


@np.vectorize
def thresh(x):
    """Scalar unit step: 1 for x >= 0, otherwise 0."""
    return 1 if x >= 0 else 0


# (values, legend label) in drawing order; the order fixes the colour of each curve.
curves = [
    (thresh(xs), "Threshold activation"),
    (1. / (1 + np.exp(-xs)), "Logistic sigmoid"),
    (np.tanh(xs), "Hyperbolic tangent $\\tanh$"),
    (relu(xs), "ReLU activation"),
]
for values, label in curves:
    ax.plot(xs, values, linewidth=lw, label=label)

ax.set_ylim((-1., 2.))
ax.set_xlim((-5., 5.))
ax.legend(loc="upper left")

In [None]:
# ReLU and its smooth / leaky relatives, zoomed in on the negative half-axis.
fig, ax = plt.subplots(figsize=(10,5))
xs = np.linspace(-5, 5, 500)
lw = 1.5


def make_leaky_relu(slope):
    """Vectorized leaky ReLU: identity for x >= 0, slope * x otherwise."""
    return np.vectorize(lambda x: x if x >= 0 else slope * x)


def make_elu(alpha):
    """Vectorized ELU: identity for x >= 0, alpha * (exp(x) - 1) otherwise."""
    return np.vectorize(lambda x: x if x >= 0 else alpha * (np.exp(x) - 1))


@np.vectorize
def relu(x):
    """Scalar ReLU, max(0, x)."""
    return max(0.0, x)


@np.vectorize
def softplus(x):
    """Scalar softplus, log(1 + exp(x))."""
    return np.log(1 + np.exp(x))


@np.vectorize
def thresh(x):
    """Scalar unit step (defined here but not drawn in this figure)."""
    return 1 if x >= 0 else 0


lrelu, lrelu2 = make_leaky_relu(0.2), make_leaky_relu(0.05)
elu, elu2 = make_elu(1.0), make_elu(0.2)

# (function, legend label) in drawing order.
curves = [
    (relu, "ReLU activation"),
    (softplus, "Softplus"),
    (lrelu, "Leaky ReLU, $a=\\frac{1}{5}$"),
    (lrelu2, "Leaky ReLU, $a=\\frac{1}{20}$"),
    (elu, "Exponential linear unit, $\\alpha=1.0$"),
    (elu2, "Exponential linear unit, $\\alpha=\\frac{1}{5}$"),
]
for activation, label in curves:
    ax.plot(xs, activation(xs), linewidth=lw, label=label)

ax.set_ylim((-1., 1.))
ax.set_xlim((-5., 1.))
ax.legend(loc="upper left")

In [None]:
# ReLU compared with the self-gated activations Swish (two betas) and Mish.
fig, ax = plt.subplots(figsize=(10,5))
xs = np.linspace(-5, 5, 500)
lw = 1.5


def make_swish(beta):
    """Vectorized Swish: x / (1 + exp(-beta * x))."""
    return np.vectorize(lambda x: x / (1. + np.exp(-beta * x)))


@np.vectorize
def relu(x):
    """Scalar ReLU, max(0, x)."""
    return max(0.0, x)


@np.vectorize
def mish(x):
    """Scalar Mish: x * tanh(log(1 + exp(x)))."""
    return x * np.tanh(np.log(1 + np.exp(x)))


swish, swish2 = make_swish(1), make_swish(5)

# (function, legend label) in drawing order.
curves = [
    (relu, "ReLU activation"),
    (swish, "Swish activation, $\\beta=1$"),
    (swish2, "Swish activation, $\\beta=5$"),
    (mish, "Mish activation"),
]
for activation, label in curves:
    ax.plot(xs, activation(xs), linewidth=lw, label=label)

ax.set_ylim((-.5, 2.))
ax.set_xlim((-4., 2.))
ax.legend(loc="upper left")

In [None]:
# Fresh figure for the ACON-C curves, plus the sample grid and line width
# used by the plotting calls later in this cell.
fig, ax = plt.subplots(figsize=(10,5))
xs = np.linspace(-5, 5, 500)  # 500 evenly spaced points on [-5, 5]
lw = 1.5

def aconc(a1, a2, beta):
    """Build an ACON-C activation with fixed parameters.

    Returns a one-argument function computing
    (a1 - a2) * x / (1 + exp(-x * beta * (a1 - a2))) + a2 * x.
    """
    gap = a1 - a2  # slope difference shared by the gate and its argument

    def activation(x):
        gate_arg = -x * beta * gap
        return gap * x / (1.0 + np.exp(gate_arg)) + a2 * x

    return activation

# (a1, a2, beta) triples for the five ACON-C curves.
acon_params = [
    (1.0, 0.0, 1.0),
    (1.2, -0.1, 1.0),
    (1.0, -0.8, 1.0),
    (1.0, -0.8, 0.1),
    (1.0, -0.8, 0.01),
]
acon1, acon2, acon3, acon4, acon5 = [np.vectorize(aconc(*params)) for params in acon_params]

# (function, legend label, extra line style). The last three curves share the
# colour C3 and differ only in line style, so the effect of beta is easy to compare.
plot_specs = [
    (acon1, "ACON-C, $a_1=1$, $a_2=0$, $\\beta=1$", {}),
    (acon2, "ACON-C, $a_1=1.2$, $a_2=-0.1$, $\\beta=1$", {}),
    (acon3, "ACON-C, $a_1=1$, $a_2=-0.8$, $\\beta=1$", {"color": "C3"}),
    (acon4, "ACON-C, $a_1=1$, $a_2=-0.8$, $\\beta=0.1$", {"color": "C3", "linestyle": "dashed"}),
    (acon5, "ACON-C, $a_1=1$, $a_2=-0.8$, $\\beta=0.01$", {"color": "C3", "linestyle": "dotted"}),
]
for activation, label, style in plot_specs:
    ax.plot(xs, activation(xs), linewidth=lw, label=label, **style)

ax.set_ylim((-.5, 4.))
ax.set_xlim((-4., 4.))
ax.legend(loc="upper center")
# plt.savefig('act4.pdf', bbox_inches='tight')

In [None]:
# Two one-element tensors, both initialised to 1.0 and created with
# requires_grad=True so that autograd records the operations applied to them.
x = torch.tensor([1.0], requires_grad=True)
y = torch.tensor([1.0], requires_grad=True)

In [None]:
# Weight of the y**2 term in the quadratic below.
rho = 1.0
# f = x^2 + rho * y^2, built from the tensors x and y of the previous cell.
f = x * x + rho * y * y

In [None]:
# Backpropagate through f so that its gradients are accumulated into x.grad and y.grad.
# NOTE(review): re-running this cell without first rebuilding f is expected to fail,
# since the graph is not retained after backward() - confirm.
f.backward()

In [None]:
def my_func(x, y, a=1, b=10):
    """Beale-type test surface: sum of three squared residuals in x and y.

    Computes (1.5 - x + x*y)^2 + (2.25 - x + x*y^2)^2 + (2.625 - x + x*y^3)^2,
    which is zero at (x, y) = (3, 0.5). Works on scalars and on anything that
    supports elementwise arithmetic. The parameters `a` and `b` are accepted
    but not used.
    """
    residual_1 = 1.5 - x + x * y
    residual_2 = 2.25 - x + x * y**2
    residual_3 = 2.625 - x + x * y**3
    return residual_1**2 + residual_2**2 + residual_3**2

In [None]:
def _leaf_tensor(value):
    """Wrap a scalar in a one-element leaf tensor that tracks gradients."""
    tensor = torch.tensor([value])
    # Integer (or bool) tensors cannot require gradients, so promote them to the
    # default floating dtype. Float inputs keep whatever dtype torch inferred.
    if not (tensor.is_floating_point() or tensor.is_complex()):
        tensor = tensor.to(torch.get_default_dtype())
    return tensor.requires_grad_()


def compute_results(f, Optimizer, momentum=0, nesterov=False, n=100, lrs=(0.01, 0.001, 0.0001), x0=-2.0, y0=-2.0):
    """Minimise a two-variable function with a torch optimizer and record the path.

    For every learning rate in `lrs`, a fresh pair of one-element tensors is
    started at (x0, y0) and `n` optimizer steps are taken on `f(x, y)`.

    Parameters
    ----------
    f : callable
        Called as `f(x, y)` with two one-element tensors. Its result must
        support `.backward()`.
    Optimizer : type
        Optimizer class, constructed as `Optimizer([x, y], lr=lr, ...)`.
    momentum, nesterov :
        Forwarded to the constructor only when `Optimizer` is `optim.SGD`.
        Ignored for every other optimizer.
    n : int
        Number of optimizer steps per learning rate.
    lrs : iterable of float
        Learning rates to try. Duplicates overwrite each other in the result.
    x0, y0 : number
        Start coordinates. Integers are accepted and promoted to floating point.

    Returns
    -------
    dict
        Maps each learning rate to `(x_hist, y_hist)`, two lists of length
        `n + 1`. Element 0 is the start value as passed in; the remaining
        elements are the coordinates after each step.
    """
    results = {}
    for lr in lrs:
        x = _leaf_tensor(x0)
        y = _leaf_tensor(y0)
        x_hist, y_hist = [x0], [y0]
        if Optimizer is optim.SGD:
            optimizer = Optimizer([x, y], lr=lr, momentum=momentum, nesterov=nesterov)
        else:
            optimizer = Optimizer([x, y], lr=lr)

        def closure():
            # Re-evaluate the objective and its gradients at the current point;
            # passed to optimizer.step(), which calls it to obtain the loss.
            optimizer.zero_grad()
            ff = f(x, y)
            ff.backward()
            return ff

        for _ in range(n):
            optimizer.step(closure)
            x_hist.append(x.detach().numpy()[0])
            y_hist.append(y.detach().numpy()[0])
        results[lr] = (x_hist, y_hist)
    return results

In [None]:
def my_plot(ax, my_func, results, x0=-2.0, y0=-2.0, xopt=1, yopt=1, delta = 0.02, my_limx = 2.5, my_limy = 2.5, legend=True, legendloc="lower left"):
    """Draw optimizer trajectories on top of a log-scaled contour map.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes to draw on.
    my_func : callable
        Objective, evaluated as `my_func(Xs, Ys)` on a meshgrid of arrays.
    results : iterable of (label, hist) pairs, or a mapping label -> hist
        Each `hist` is indexable as `hist[0]` (x coordinates) and `hist[1]`
        (y coordinates). Both must be non-empty.
    x0, y0 : number
        Start point, marked with a green star.
    xopt, yopt : number
        Optimum, marked with a red star.
    delta : float
        Grid spacing used for the contour map.
    my_limx, my_limy : float
        Half-widths of the plotted window; the axes span [-lim, lim].
    legend : bool
        Whether to draw a legend.
    legendloc : str
        Legend location passed to `ax.legend`.
    """
    xs = np.arange(-my_limx, my_limx, delta)
    ys = np.arange(-my_limy, my_limy, delta)
    Xs, Ys = np.meshgrid(xs, ys)
    Zs = my_func(Xs, Ys)
    # 100 log-spaced levels between 1 and 1e5, coloured on a logarithmic scale.
    ax.contour(Xs, Ys, Zs, levels=np.logspace(0, 5, 100), norm=LogNorm(), cmap=plt.cm.jet, linewidths=.1)
    ax.scatter([x0], [y0], marker='*', s=40, color='g')
    ax.scatter([xopt], [yopt], marker='*', s=40, color='r')
    # Accept a {label: hist} mapping as well as a sequence of (label, hist) pairs.
    curves = results.items() if hasattr(results, "items") else results
    for label, hist in curves:
        ax.plot(hist[0], hist[1], linewidth=1.0, label=label)
        # Star at the last recorded point of the trajectory.
        ax.scatter(hist[0][-1], hist[1][-1], marker='*', s=20)
    ax.set_xlim((-my_limx, my_limx))
    ax.set_ylim((-my_limy, my_limy))
    if legend:
        ax.legend(framealpha=1, loc=legendloc)

In [None]:
# SGD on a strongly elongated quadratic bowl for three learning rates.
x0, y0, rho = 1., 1.5, 0.001
n = 200  # optimizer steps per run; also interpolated into the legend

def my_func2(x, y, rho=rho):
    """Quadratic bowl x^2 + rho * y^2.

    `rho` is bound when this cell runs, so a later reassignment of the global
    `rho` cannot silently change the surface.
    """
    return x**2 + rho * (y**2)

results = {}
results['sgd'] = compute_results(my_func2, optim.SGD, x0=x0, y0=y0, lrs=[.1, .5, 0.99], n=n)
fig, ax = plt.subplots(figsize=(8,5))

to_plot = [ ("SGD, $\\alpha=%s$, $n=%d$" % (lr, n), hist) for lr, hist in results['sgd'].items() ]
my_plot(ax, my_func2, to_plot, x0=x0, y0=y0, xopt=0, yopt=0, my_limx=4, my_limy=3)
ax.set_ylim((-.5, 2.))
# plt.savefig('sgd0.pdf', bbox_inches='tight')

In [None]:
# Plain SGD, heavy-ball momentum and Nesterov momentum on the same quadratic bowl.
x0, y0 = 1., 1.5

def my_func2(x, y):
    """Elongated quadratic bowl: x^2 + 0.01 * y^2."""
    return x**2 + 0.01 * (y**2)

# Settings common to all three runs.
shared = dict(x0=x0, y0=y0, lrs=[0.1], n=200)
results = {
    'sgd': compute_results(my_func2, optim.SGD, **shared),
    'mom': compute_results(my_func2, optim.SGD, momentum=0.95, **shared),
    'nag': compute_results(my_func2, optim.SGD, momentum=0.95, nesterov=True, **shared),
}

# (results key, legend template) in drawing order: NAG first, then SGD, then momentum.
label_templates = [
    ('nag', "SGD with NAG, $\\gamma=0.95$, $\\alpha=%s$, $n=200$"),
    ('sgd', "SGD, $\\alpha=%s$, $n=200$"),
    ('mom', "SGD with momentum, $\\gamma=0.95$, $\\alpha=%s$, $n=200$"),
]
to_plot = [
    (template % lr, hist)
    for key, template in label_templates
    for lr, hist in results[key].items()
]

fig, ax = plt.subplots(figsize=(8,5))
my_plot(ax, my_func2, to_plot, x0=x0, y0=y0, xopt=0, yopt=0, my_limx=4, my_limy=3)
plt.ylim((-1.5, 1.75))
plt.xlim((-.5, .5))

In [None]:
x0, y0 = 1., 1.5

def my_func1(x, y):
    return x**2 + 1. * (y**2)

def my_func2(x, y):
    return x**2 + 0.1 * (y**2)

def my_func3(x, y):
    return x**2 + 0.01 * (y**2)

fig, axs = plt.subplots(1, 3, figsize=(8,5), sharey=True)
my_funcs = [my_func1, my_func2, my_func3]

for i in range(3):
    results = {}
    results['sgd'] = compute_results(my_funcs[i], optim.SGD, x0=x0, y0=y0, lrs=[0.05, 0.01], n=500)
#     results['mom'] = compute_results(my_funcs[i], optim.SGD, momentum=0.95, x0=x0, y0=y0, lrs=[0.05], n=200)
#     results['nag'] = compute_results(my_funcs[i], optim.SGD, momentum=0.95, nesterov=True, x0=x0, y0=y0, lrs=[0.05], n=200)

    to_plot = [ ("SGD, $\\alpha=%s$" % lr, hist) for lr, hist in results['sgd'].items() ] 
#       [ ("SGD with momentum, $\\gamma=0.95$, $\\alpha=%s$, $n=200$" % lr, hist) for lr, hist in results['mom'].items() ] + \
#       [ ("SGD with NAG, $\\gamma=0.95$, $\\alpha=%s$, $n=200$" % lr, hist) for lr, hist in results['nag'].items() ]

    my_plot(axs[i], my_funcs[i], to_plot, x0=x0, y0=y0, xopt=0, yopt=0, my_limx=4, my_limy=3)

plt.ylim((-1., 2.))

In [None]:
# SGD on my_func for four learning rates.
x0, y0 = 1., 1.5
n = 500  # optimizer steps per run; also interpolated into the legend
results = {}
results['sgd'] = compute_results(my_func, optim.SGD, x0=x0, y0=y0, lrs=[0.05, 0.01, 0.005, 0.001], n=n)
fig, ax = plt.subplots(figsize=(8,5))

to_plot = [ ("SGD, $\\alpha=%s$, $n=%d$" % (lr, n), hist) for lr, hist in results['sgd'].items() ]
my_plot(ax, my_func, to_plot, x0=x0, y0=y0, xopt=3, yopt=0.5, my_limx=4, my_limy=3)
ax.set_ylim((-.5, 2.))

In [None]:
x0, y0 = 1., 1.5
results = {}
results['sgd'] = compute_results(my_func, optim.SGD, x0=x0, y0=y0, lrs=[0.002], n=300)
results['mom'] = compute_results(my_func, optim.SGD, momentum=0.95, x0=x0, y0=y0, lrs=[0.002], n=300)
results['nag'] = compute_results(my_func, optim.SGD, momentum=0.95, nesterov=True, x0=x0, y0=y0, lrs=[0.002], n=300)
fig, ax = plt.subplots(figsize=(8,5))

to_plot = [ ("SGD, $\\alpha=%s$, $n=200$" % lr, hist) for lr, hist in results['sgd'].items() ] + \
  [ ("SGD with momentum, $\\gamma=0.95$, $\\alpha=%s$, $n=200$" % lr, hist) for lr, hist in results['mom'].items() ] + \
  [ ("SGD with NAG, $\\gamma=0.95$, $\\alpha=%s$, $n=200$" % lr, hist) for lr, hist in results['nag'].items() ]
my_plot(ax, my_func, to_plot, x0=x0, y0=y0, xopt=3, yopt=0.5, my_limx=4, my_limy=3, legendloc="upper right")
plt.ylim((-2., 2.))
plt.xlim((0., 4.))

In [None]:
# SGD against three adaptive optimizers on my_func, all from the same start point.
x0, y0 = 1., 1.5
n = 500

# (results key, optimizer class, learning rate, legend template) in drawing order.
runs = [
    ('sgd', optim.SGD, 0.001, "SGD, $\\alpha=%s$, $n=%d$"),
    ('adagrad', optim.Adagrad, 1., "Adagrad, $\\alpha=%s$, $n=%d$"),
    ('adadelta', optim.Adadelta, 1., "Adadelta, $\\alpha=%s$, $n=%d$"),
    ('adam', optim.Adam, 0.1, "Adam, $\\alpha=%s$, $n=%d$"),
]

results = {}
for key, optimizer_cls, step_size, _ in runs:
    results[key] = compute_results(my_func, optimizer_cls, x0=x0, y0=y0, lrs=[step_size], n=n)
fig, ax = plt.subplots(figsize=(8,5))

to_plot = [
    (template % (lr, n), hist)
    for key, _, _, template in runs
    for lr, hist in results[key].items()
]
my_plot(ax, my_func, to_plot, x0=x0, y0=y0, xopt=3, yopt=0.5, my_limx=4, my_limy=3, legendloc="upper right")
plt.ylim((-.5, 1.6))
plt.xlim((0, 3.5))