In [None]:
import gym

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import json
import math
import random
import numpy as np
import scipy as sp
import scipy.stats as st
import scipy.integrate as integrate
from scipy.stats import multivariate_normal
from sklearn import linear_model
from sklearn.exceptions import ConvergenceWarning
import statsmodels.api as sm
from matplotlib.colors import LogNorm
import pickle

from joblib import Parallel, delayed
import multiprocessing
from collections import namedtuple
from itertools import count

import cProfile
from datetime import datetime

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
# import torch. as T
from torch.autograd import Variable

# Global plotting configuration shared by every figure in the notebook.
sns.set_style("whitegrid")
sns.set_palette("colorblind")
palette = sns.color_palette()
figsize = (15,8)
legend_fontsize = 16

from matplotlib import rc
rc('font',**{'family':'sans-serif'})
# Text is rendered through LaTeX so that the Cyrillic axis labels typeset correctly.
rc('text', usetex=True)
# Both packages must live in ONE preamble string: every rc('text.latex', preamble=...)
# call assigns rcParams['text.latex.preamble'], so issuing two separate calls makes the
# second silently replace the first.
rc('text.latex',preamble=r'\usepackage[utf8]{inputenc}\usepackage[russian]{babel}')
rc('figure', **{'dpi': 300})

## Cartpole

In [None]:
class ReplayMemory:
    """Fixed-capacity experience buffer that overwrites its oldest entries.

    Attributes:
        capacity: maximum number of stored items.
        memory: list holding the stored items (grows until ``capacity``).
        position: index of the slot the next ``store`` call writes to.
    """

    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def store(self, exptuple):
        """Save ``exptuple``, replacing the item at the write cursor once full."""
        buffer = self.memory
        cursor = self.position
        # While the buffer is still growing, reserve a slot first so that the
        # indexed assignment below works the same way in both phases.
        if len(buffer) < self.capacity:
            buffer.append(None)
        buffer[cursor] = exptuple
        # Advance the cursor and wrap around at the end of the buffer.
        self.position = (cursor + 1) % self.capacity

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored items chosen uniformly at random."""
        return random.sample(self.memory, batch_size)

    def __len__(self):
        """Number of items currently stored."""
        return len(self.memory)

In [None]:
class Network(nn.Module):
    """Two-layer perceptron that maps a state vector to one Q-value per action.

    Args:
        layer_size: width of the hidden layer.
        state_dim: number of input features (defaults to 4, as used for CartPole here).
        n_actions: number of outputs (defaults to 2, as used for CartPole here).
    """

    def __init__(self, layer_size=256, state_dim=4, n_actions=2):
        super().__init__()
        self.l1 = nn.Linear(state_dim, layer_size)
        self.l2 = nn.Linear(layer_size, n_actions)

    def forward(self, x):
        """Return the raw (unnormalised) output of the second linear layer."""
        hidden = F.relu(self.l1(x))
        return self.l2(hidden)

In [None]:
def plot_durations(xs, labels):
    """Draw one line per series in ``xs`` on a new 10x5 figure.

    Args:
        xs: iterable of sequences to plot (here: per-episode step counts).
        labels: legend label for each series, indexed in the same order as ``xs``.
    """
    axes = plt.figure(figsize=(10, 5)).gca()
    axes.set_xlabel('Номер эпизода')
    axes.set_ylabel('Число шагов')
    for series_idx, series in enumerate(xs):
        axes.plot(series, label=labels[series_idx])
    axes.legend(loc="upper left")

In [None]:
class CartpoleDQN():
    """Q-learning agent for ``CartPole-v0`` with experience replay and epsilon-greedy exploration.

    Transitions are stored in ``self.memory`` as 5-tuples
    ``(state, action, next_state, reward, terminal)`` of tensors, where
    ``terminal`` is a boolean tensor marking transitions after which the
    environment reported termination.
    """

    def __init__(self):
        self.env = gym.make('CartPole-v0')
        self.model = Network()
        self.memory = ReplayMemory(10000)
        self.optimizer = optim.Adam(self.model.parameters(), 0.001)
        self.steps_done = 0  # not used inside this class; kept for existing readers
        self.episode_durations = []

        self.gamma = 0.8
        self.batch_size = 64

        # Exploration rate decays exponentially from eps_init to eps_final
        # with time constant eps_decay, measured in select_action calls.
        self.eps_init, self.eps_final, self.eps_decay = 0.9, 0.05, 200
        self.num_step = 0

    def select_greedy_action(self, state):
        """Return the highest-valued action for ``state`` as a ``(1, 1)`` int64 tensor."""
        with torch.no_grad():
            return self.model(state).max(1)[1].view(1, 1)

    def select_action(self, state):
        """Epsilon-greedy action for ``state``; advances the exploration schedule by one step."""
        sample = random.random()
        self.num_step += 1
        eps_threshold = self.eps_final + (self.eps_init - self.eps_final) * math.exp(-1. * self.num_step / self.eps_decay)
        if sample > eps_threshold:
            return self.select_greedy_action(state)
        return torch.tensor([[random.randrange(2)]], dtype=torch.int64)

    def run_episode(self, e=0, do_learning=True, greedy=False, render=False):
        """Play one episode, storing every transition and optionally learning after each step.

        Args:
            e: episode index, used only in the progress message.
            do_learning: call ``learn()`` after every environment step.
            greedy: always take the greedy action instead of epsilon-greedy.
            render: call ``env.render()`` before every step.

        The episode length is appended to ``self.episode_durations``.
        """
        state, num_step = self.env.reset()[0], 0
        while True:
            if render:
                self.env.render()

            # torch.tensor copies the observation, so stored transitions never alias env buffers.
            state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
            if greedy:
                action = self.select_greedy_action(state_tensor)
            else:
                action = self.select_action(state_tensor)

            # The 5-value step API reports time-limit truncation separately from
            # termination; both end the episode, otherwise a good policy never stops.
            next_state, reward, terminated, truncated, _ = self.env.step(action.item())
            terminated = bool(terminated)
            done = terminated or bool(truncated)
            next_state_tensor = torch.tensor(next_state, dtype=torch.float32).unsqueeze(0)

            # Penalise only real failures, not episodes cut short by the time limit.
            if terminated:
                reward = -1

            transition = (
                state_tensor,
                action,
                next_state_tensor,
                torch.tensor([reward], dtype=torch.float32),
                torch.tensor([terminated], dtype=torch.bool),
            )
            self.memory.store(transition)

            if do_learning:
                self.learn()

            state = next_state
            num_step += 1

            if done:
                print("\tepisode %d finished after %d steps" % (e, num_step))
                self.episode_durations.append(num_step)
                break

    def learn(self):
        """Run one optimisation step on a random mini-batch from the replay memory.

        Does nothing until the memory holds at least ``batch_size`` transitions.
        """
        if len(self.memory) < self.batch_size:
            return

        # Draw a mini-batch from memory and stack each field into one tensor.
        transitions = self.memory.sample(self.batch_size)
        batch_state, batch_action, batch_next_state, batch_reward, batch_terminal = zip(*transitions)

        batch_state = torch.cat(batch_state)
        batch_action = torch.cat(batch_action)
        batch_reward = torch.cat(batch_reward)
        batch_next_state = torch.cat(batch_next_state)
        batch_terminal = torch.cat(batch_terminal)

        # Q-values of the actions that were actually taken.
        Q = self.model(batch_state).gather(1, batch_action).reshape([self.batch_size])

        # Bootstrapped target: reward plus the discounted best next-state value.
        # Terminal transitions have no future, so their bootstrap term is zeroed.
        with torch.no_grad():
            Qmax = self.model(batch_next_state).max(1)[0]
            Qmax = Qmax.masked_fill(batch_terminal, 0.0)
            Qnext = batch_reward + (self.gamma * Qmax)

        # Pull Q towards the target -- this is the essence of Q-learning.
        loss = F.smooth_l1_loss(Q, Qnext)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

In [None]:
# Train a fresh DQN agent; every run_episode call prints its own progress line.
dqn = CartpoleDQN()

print("%s\tStarting training for 300 episodes..." % datetime.now().time())
for e in range(300):
    dqn.run_episode(e=e)
print("%s\t\t...done!" % datetime.now().time())

In [None]:
# Episode lengths recorded by the DQN agent during training.
plot_durations(xs=[dqn.episode_durations], labels=['DQN'])

In [None]:
class PolicyNetwork(nn.Module):
    """Two-layer perceptron that maps a state vector to action probabilities.

    Args:
        layer_size: width of the hidden layer.
        state_dim: number of input features (defaults to 4, as used for CartPole here).
        n_actions: number of actions (defaults to 2, as used for CartPole here).
    """

    def __init__(self, layer_size=256, state_dim=4, n_actions=2):
        super().__init__()
        self.l1 = nn.Linear(state_dim, layer_size)
        self.l2 = nn.Linear(layer_size, n_actions)

    def forward(self, x):
        """Return a probability distribution over actions along the last axis."""
        hidden = F.relu(self.l1(x))
        # Softmax over the last axis: identical to dim=1 for (batch, features)
        # input, and also valid for a single unbatched state vector.
        return F.softmax(self.l2(hidden), dim=-1)

In [None]:
class CartpolePolicyGradient():
    """Policy-gradient agent for ``CartPole-v0`` that updates once per finished episode."""

    def __init__(self):
        self.env = gym.make('CartPole-v0')
        self.model = PolicyNetwork()
        self.memory = ReplayMemory(10000)  # not used inside this class; kept for existing readers
        self.optimizer = optim.Adam(self.model.parameters(), 0.001)
        self.steps_done = 0  # not used inside this class; kept for existing readers
        self.episode_durations = []

        self.gamma = 0.8

    def discount_rewards(self, r):
        """Return the discounted reward-to-go for every step.

        Args:
            r: tensor of per-step rewards, indexed by time along its first axis.

        Returns:
            Tensor of the same size where entry ``t`` equals
            ``r[t] + gamma * r[t+1] + gamma**2 * r[t+2] + ...``.
        """
        discounted_r = torch.zeros(r.size())
        running_add = 0
        # Walk backwards so each step reuses the already-accumulated future return.
        for t in reversed(range(len(r))):
            running_add = running_add * self.gamma + r[t]
            discounted_r[t] = running_add

        return discounted_r

    @staticmethod
    def _normalize(targets):
        """Centre ``targets`` and scale them to unit standard deviation.

        A single-element tensor is only centred: its sample standard deviation
        is NaN and dividing by it would turn the loss, and then the weights, into NaN.
        """
        centered = targets - targets.mean()
        if targets.numel() < 2:
            return centered
        return centered / (targets.std() + 1e-6)

    def run_episode(self, e=0):
        """Sample one episode from the current policy, then take one gradient step on it.

        Args:
            e: episode index, used only in the progress message.

        The episode length is appended to ``self.episode_durations``.
        """
        state = self.env.reset()[0]
        # Collected in lists and concatenated once at the end; growing a tensor
        # with torch.cat on every step is quadratic in the episode length.
        states, chosen, rewards = [], [], []
        num_step = 0

        while True:
            x = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
            states.append(x)

            # Compute action probabilities and sample one of the two actions.
            # No graph is needed here: learn() recomputes the forward pass.
            with torch.no_grad():
                action_prob = self.model(x)
            action = 0 if random.random() < action_prob[0][0].item() else 1

            # One-hot encoding of the action that was taken.
            chosen.append(torch.tensor([[1, 0]] if action == 0 else [[0, 1]], dtype=torch.float32))

            # The 5-value step API reports time-limit truncation separately from
            # termination; both end the episode.
            state, reward, terminated, truncated, _ = self.env.step(action)
            rewards.append(torch.tensor([[reward]], dtype=torch.float32))
            num_step += 1

            # The 500-step cap stays as a safety net for environments without a time limit.
            if terminated or truncated or num_step >= 500:
                xs = torch.cat(states)
                ys = torch.cat(chosen)

                # Discounted rewards, then normalisation (this acts as the baseline).
                targets = self.discount_rewards(torch.cat(rewards))
                targets = self._normalize(targets)

                self.learn(xs, ys, targets)
                print("\tepisode %d finished after %d steps" % (e, num_step))
                self.episode_durations.append(num_step)
                break

    def learn(self, x, y, targets):
        """Take one policy-gradient step and return the detached loss.

        Args:
            x: visited states, one row per step.
            y: one-hot encoding of the action taken at each step.
            targets: per-step weights (normalised discounted returns), one row per step.
        """
        # Predicted action probabilities for every visited state.
        action_pred = self.model(x)
        # Clamp before the log: a saturated softmax can output exactly 0, and
        # 0 * log(0) would otherwise evaluate to NaN.
        log_lik = -y * torch.log(torch.clamp(action_pred, min=1e-8))
        log_lik_adv = log_lik * targets
        loss = torch.sum(log_lik_adv, 1).mean()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss.detach()

In [None]:
# Train a fresh policy-gradient agent; every run_episode call prints its own progress line.
pg = CartpolePolicyGradient()

print("%s\tStarting training for 300 episodes..." % datetime.now().time())
for e in range(300):
    pg.run_episode(e=e)
print("%s\t\t...done!" % datetime.now().time())

In [None]:
# Compare the episode lengths of both agents on one figure.
plot_durations(
    xs=[dqn.episode_durations, pg.episode_durations],
    labels=['DQN', 'Policy Gradient'],
)