# coding: utf-8
# In[1]:
#Based on the code from dennybritz
#https://github.com/dennybritz/reinforcement-learning
#Adapted to the blocksworld environment
get_ipython().magic('matplotlib inline')
import gym
from gym.wrappers import Monitor
import itertools
import numpy as np
import os
import random
import sys
import psutil
import tensorflow as tf
import datetime
if "../" not in sys.path:
sys.path.append("../")
# Write the path to the repository from dennybritz, we'll use some helper functions
if "../reinforcement-learning/lib" not in sys.path:
sys.path.append("../reinforcement-learning/lib")
import plotting
from collections import deque, namedtuple
from StateProcessor import StateProcessor
from Estimator import Estimator
# In[2]:
# Build the name of the folder where we'll store the results
basename = "DQN_BlocksWorld"
suffix = datetime.datetime.now().strftime("%y%m%d_%H%M%S")
filename = "_".join([basename, suffix]) # e.g. 'BlocksWorld_120508_171442'
# In[3]:
# Define the parameters of the problem and of the Q-network
# Number of blocks in the blocksworld environment
numBlocks = 4
# Dimension of the network input: we concatenate the current state and the goal state.
# For example, with two blocks, [0,1] [2,0] (current state, goal state) has dimension 2*2
n_input = numBlocks*2
# Dimension of the output: one entry per [block to be moved][destination of the block].
# The block to be moved has numBlocks possibilities and the destination has
# numBlocks+1, because a block can be moved either on top of another block (numBlocks)
# or onto the table (+1).
#n_output = numBlocks*(numBlocks+1)
n_output = (numBlocks+1)*(numBlocks+1)
# All possible actions in the environment, each encoded as an integer.
VALID_ACTIONS = np.array(range(n_output))
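# Illustrative only: the true integer -> move mapping lives inside the BlocksWorld
# environment, so the helper below is just a sketch of one plausible row-major
# encoding over the (numBlocks+1) x (numBlocks+1) action grid (the extra index
# standing for the table). It is not used by the training code.
def decode_action_sketch(action):
    """Sketch: split a flat action id into (block_to_move, destination) indices."""
    block_to_move = action // (numBlocks + 1)
    destination = action % (numBlocks + 1)
    return block_to_move, destination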
# To track the convergence of the algorithm we also record, for each state, the
# max Q-values predicted during training (see add_q_value below)
q_value_dict = {}
# In[4]:
env = gym.envs.make("BlocksWorld-v0")
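# Quick sanity check of the environment interface (a sketch; it assumes
# BlocksWorld-v0 follows the standard gym API where reset() returns an observation
# and step(action) returns (observation, reward, done, info); the reward scale and
# rendering are environment specific).
_obs = env.reset()
_obs, _reward, _done, _info = env.step(np.random.choice(VALID_ACTIONS))
print('Sanity check -> obs: {}, reward: {}, done: {}'.format(_obs, _reward, _done))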
# In[5]:
# In[6]:
#class Estimator():
# """Q-Value Estimator neural network.
#
# This network is used for both the Q-Network and the Target Network.
# """
#
# def __init__(self, scope="estimator", summaries_dir=None):
# self.scope = scope
# # Writes Tensorboard summaries to disk
# self.summary_writer = None
# with tf.variable_scope(scope):
# # Build the graph
# self._build_model()
# if summaries_dir:
# summary_dir = os.path.join(summaries_dir, "summaries_{}".format(scope))
# if not os.path.exists(summary_dir):
# os.makedirs(summary_dir)
# self.summary_writer = tf.summary.FileWriter(summary_dir)
#
# def _build_model(self):
# """
# Builds the Tensorflow graph.
# """
#
# weights = {
# 'out': tf.Variable(tf.random_normal([len(VALID_ACTIONS), len(VALID_ACTIONS)]))
# }
# biases = {
# 'out': tf.Variable(tf.random_normal([len(VALID_ACTIONS)]))
# }
#
# # Placeholders for our input
# # Our input are 4 RGB frames of shape 160, 160 each
# self.X_pl = tf.placeholder(shape=[None,n_input],dtype=tf.float32,name = "X")
# #self.X_pl = tf.placeholder(shape=[None, 84, 84, 4], dtype=tf.uint8, name="X")
# # The TD target value
# self.y_pl = tf.placeholder(shape=[None], dtype=tf.float32, name="y")
# # Integer id of which action was selected
# self.actions_pl = tf.placeholder(shape=[None], dtype=tf.int32, name="actions")
#
# #X = tf.to_float(self.X_pl) / 255.0
# batch_size = tf.shape(self.X_pl)[0]
#
# # Three convolutional layers
# #conv1 = tf.contrib.layers.conv2d(
# # X, 32, 8, 4, activation_fn=tf.nn.relu)
# #conv2 = tf.contrib.layers.conv2d(
# # conv1, 64, 4, 2, activation_fn=tf.nn.relu)
# #conv3 = tf.contrib.layers.conv2d(
# # conv2, 64, 3, 1, activation_fn=tf.nn.relu)
#
# # Fully connected layers
# #flattened = tf.contrib.layers.flatten(conv3)
# fc1 = tf.contrib.layers.fully_connected(self.X_pl, 96)
# fc2 = tf.contrib.layers.fully_connected(fc1, 96)
## fc3 = tf.contrib.layers.fully_connected(fc2, 12)
# last = tf.contrib.layers.fully_connected(fc2, len(VALID_ACTIONS))
#
# # We need the network to output negative numbers (Rewards are negative or zero, so we add another final linear layer)
# self.predictions = tf.matmul(last, weights['out']) + biases['out']
# #print (self.predictions.shape)
#
# # Get the predictions for the chosen actions only
# gather_indices = tf.range(batch_size) * tf.shape(self.predictions)[1] + self.actions_pl
# self.action_predictions = tf.gather(tf.reshape(self.predictions, [-1]), gather_indices)
#
# # Calculate the loss
# self.losses = tf.squared_difference(self.y_pl, self.action_predictions)
# self.loss = tf.reduce_mean(self.losses)
#
# # Optimizer Parameters from original paper
## self.optimizer = tf.train.RMSPropOptimizer(0.00025, 0.99, 0.0, 1e-6)
## self.optimizer = tf.train.RMSPropOptimizer(0.0025, 0.99, 0.0, 1e-6)
# self.optimizer = tf.train.AdamOptimizer(0.0005)
# self.train_op = self.optimizer.minimize(self.loss, global_step=tf.contrib.framework.get_global_step())
#
# self.max_q_value = tf.reduce_max(self.predictions,1)
# #print (self.max_q_value.shape)
# # Summaries for Tensorboard
# self.summaries = tf.summary.merge([
# tf.summary.scalar("loss", self.loss),
# tf.summary.histogram("loss_hist", self.losses),
# tf.summary.histogram("q_values_hist", self.predictions),
# tf.summary.scalar("max_q_value", tf.reduce_max(self.predictions)),
# tf.summary.histogram("fc1",fc1),
# tf.summary.histogram("fc2",fc2),
# tf.summary.histogram("last",last)
# ])
#
# def predict(self, sess, s):
# """
# Predicts action values.
#
# Args:
# sess: Tensorflow session
# s: State input of shape [batch_size, 4, 160, 160, 3]
#
# Returns:
# Tensor of shape [batch_size, NUM_VALID_ACTIONS] containing the estimated
# action values.
# """
# print ('State')
# print (s)
# ret = sess.run(self.predictions, { self.X_pl: s })
# print ('Actions')
# print (ret)
# return ret
#
# def update(self, sess, s, a, y):
# """
# Updates the estimator towards the given targets.
#
# Args:
# sess: Tensorflow session object
# s: State input of shape [batch_size, 4, 160, 160, 3]
# a: Chosen actions of shape [batch_size]
# y: Targets of shape [batch_size]
#
# Returns:
# The calculated loss on the batch.
# """
# feed_dict = { self.X_pl: s, self.y_pl: y, self.actions_pl: a }
# summaries, global_step, _, loss, max_q_value = sess.run(
# [self.summaries, tf.contrib.framework.get_global_step(), self.train_op, self.loss,self.max_q_value],
# feed_dict)
# if self.summary_writer:
# self.summary_writer.add_summary(summaries, global_step)
# return loss,max_q_value
# In[7]:
# For Testing....
tf.reset_default_graph()
global_step = tf.Variable(0, name="global_step", trainable=False)
e = Estimator(scope="test",valid_actions = VALID_ACTIONS,n_input = n_input)
sp = StateProcessor(n_input)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    # Example observation batch
    observation = env.reset()
    # observation_p = sp.process(sess, observation)
    observation_p = sp.process_with_normalization(sess, observation)
    #print (observation_p.shape)
    observations = np.reshape(observation_p, [1, n_input])
    #print (observations.shape)
    #observation = np.stack([observation_p] * 4, axis=2)
    observations = np.array([observation_p] * 2)
    #print (observations.shape)

    # Test Prediction
    #print(e.predict(sess, observations))

    # Test training step
    y = np.array([10.0, 10.0])
    a = np.array([1, 3])
    #print(e.update(sess, observations, a, y))
# In[8]:
class ModelParametersCopier():
    """
    Copy model parameters of one estimator to another.
    """

    def __init__(self, estimator1, estimator2):
        """
        Defines copy-work operation graph.

        Args:
            estimator1: Estimator to copy the parameters from
            estimator2: Estimator to copy the parameters to
        """
        e1_params = [t for t in tf.trainable_variables() if t.name.startswith(estimator1.scope)]
        e1_params = sorted(e1_params, key=lambda v: v.name)
        e2_params = [t for t in tf.trainable_variables() if t.name.startswith(estimator2.scope)]
        e2_params = sorted(e2_params, key=lambda v: v.name)

        self.update_ops = []
        for e1_v, e2_v in zip(e1_params, e2_params):
            op = e2_v.assign(e1_v)
            self.update_ops.append(op)

    def make(self, sess):
        """
        Makes copy.

        Args:
            sess: Tensorflow session instance
        """
        sess.run(self.update_ops)
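# Usage sketch (this is how the copier is used in deep_q_learning below): build it
# once after both estimators exist, then run it whenever the target network should
# be refreshed with the online network's weights.
#   estimator_copy = ModelParametersCopier(q_estimator, target_estimator)
#   estimator_copy.make(sess)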
# In[9]:
def make_epsilon_greedy_policy(estimator, nA):
    """
    Creates an epsilon-greedy policy based on a given Q-function approximator and epsilon.

    Args:
        estimator: An estimator that returns q values for a given state
        nA: Number of actions in the environment.

    Returns:
        A function that takes (sess, observation, epsilon) as arguments and returns
        the probabilities for each action in the form of a numpy array of length nA.
    """
    def policy_fn(sess, observation, epsilon):
        print('observation')
        print(observation)
        print('Epsilon')
        print(epsilon)
        A = np.ones(nA, dtype=float) * epsilon / nA
        q_values = estimator.predict(sess, np.expand_dims(observation, 0))[0]
        print('Q_values')
        print(q_values)
        best_action = np.argmax(q_values)
        A[best_action] += (1.0 - epsilon)
        print('A')
        print(A)
        return A
    return policy_fn
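# Usage sketch (mirrors how the policy is consumed inside deep_q_learning below):
# policy_fn returns a probability for every action, so the agent samples from that
# distribution instead of always taking the greedy action.
#   policy = make_epsilon_greedy_policy(q_estimator, len(VALID_ACTIONS))
#   action_probs = policy(sess, state, epsilon)
#   action = np.random.choice(np.arange(len(action_probs)), p=action_probs)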
# In[10]:
def add_q_value(state, q_value):
    # state must be a string representing the state, q_value must be a float
    if state not in q_value_dict:
        q_value_dict[state] = []
    q_value_dict[state].append(q_value)
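# Example with made-up values: repeated calls accumulate the Q-values observed for
# a state so that a (smoothed) convergence curve can be derived later.
#   add_q_value("[0 1 2 0]", -3.5)
#   add_q_value("[0 1 2 0]", -3.1)
#   np.mean(q_value_dict["[0 1 2 0]"])  # -> -3.3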
# In[ ]:
def deep_q_learning(sess,
                    env,
                    q_estimator,
                    target_estimator,
                    state_processor,
                    num_episodes,
                    experiment_dir,
                    replay_memory_size=500000,
                    replay_memory_init_size=50000,
                    update_target_estimator_every=10000,
                    discount_factor=0.99,
                    epsilon_start=1.0,
                    epsilon_end=0.1,
                    epsilon_decay_steps=500000,
                    batch_size=32,
                    record_video_every=50):
"""
Q-Learning algorithm for off-policy TD control using Function Approximation.
Finds the optimal greedy policy while following an epsilon-greedy policy.
Args:
sess: Tensorflow Session object
env: OpenAI environment
q_estimator: Estimator object used for the q values
target_estimator: Estimator object used for the targets
state_processor: A StateProcessor object
num_episodes: Number of episodes to run for
experiment_dir: Directory to save Tensorflow summaries in
replay_memory_size: Size of the replay memory
replay_memory_init_size: Number of random experiences to sampel when initializing
the reply memory.
update_target_estimator_every: Copy parameters from the Q estimator to the
target estimator every N steps
discount_factor: Gamma discount factor
epsilon_start: Chance to sample a random action when taking an action.
Epsilon is decayed over time and this is the start value
epsilon_end: The final minimum value of epsilon after decaying is done
epsilon_decay_steps: Number of steps to decay epsilon over
batch_size: Size of batches to sample from the replay memory
record_video_every: Record a video every N episodes
Returns:
An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
"""
    Transition = namedtuple("Transition", ["state", "action", "reward", "next_state", "done"])
    # Q values dictionary, one key per state (q_value_dict, defined at module level)

    # The replay memory
    replay_memory = []

    # Make model copier object
    estimator_copy = ModelParametersCopier(q_estimator, target_estimator)

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes))

    # For 'system/' summaries, useful to check if the current process looks healthy
    current_process = psutil.Process()

    # Create directories for checkpoints and summaries
    checkpoint_dir = os.path.join(experiment_dir, "checkpoints")
    checkpoint_path = os.path.join(checkpoint_dir, "model")
    monitor_path = os.path.join(experiment_dir, "monitor")

    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    if not os.path.exists(monitor_path):
        os.makedirs(monitor_path)

    saver = tf.train.Saver()
    # Load a previous checkpoint if we find one
    latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
    if latest_checkpoint:
        print("Loading model checkpoint {}...\n".format(latest_checkpoint))
        saver.restore(sess, latest_checkpoint)

    # Get the current time step
    total_t = sess.run(tf.contrib.framework.get_global_step())

    # The epsilon decay schedule
    epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps)

    # The policy we're following
    policy = make_epsilon_greedy_policy(
        q_estimator,
        len(VALID_ACTIONS))

    # Populate the replay memory with initial experience
    print("Populating replay memory...")
    state = env.reset()
    state = state_processor.process_with_normalization(sess, state)
    #state = np.stack([state] * 4, axis=2)
    for i in range(replay_memory_init_size):
        action_probs = policy(sess, state, epsilons[min(total_t, epsilon_decay_steps-1)])
        action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
        next_state, reward, done, _ = env.step(VALID_ACTIONS[action])
        #env.render()
        next_state = state_processor.process_with_normalization(sess, next_state)
        #next_state = np.append(state[:,:,1:], np.expand_dims(next_state, 2), axis=2)
        replay_memory.append(Transition(state, action, reward, next_state, done))
        if done:
            state = env.reset()
            state = state_processor.process_with_normalization(sess, state)
            #state = np.stack([state] * 4, axis=2)
        else:
            state = next_state

    # Record videos
    # Add env Monitor wrapper
    #env = Monitor(env, directory=monitor_path, video_callable=lambda count: count % record_video_every == 0, resume=True)
    qvalue_summary = tf.Summary()
    for i_episode in range(num_episodes):

        # Save the current checkpoint
        saver.save(tf.get_default_session(), checkpoint_path)

        # Reset the environment
        state = env.reset()
        state = state_processor.process_with_normalization(sess, state)
        #state = np.stack([state] * 4, axis=2)
        loss = None

        # One step in the environment
        for t in itertools.count():

            # Epsilon for this time step
            epsilon = epsilons[min(total_t, epsilon_decay_steps-1)]

            # Maybe update the target estimator
            if total_t % update_target_estimator_every == 0:
                estimator_copy.make(sess)
                print("\nCopied model parameters to target network.")

            # Print out which step we're on, useful for debugging.
            print("\rStep {} ({}) @ Episode {}/{}, loss: {}".format(
                t, total_t, i_episode + 1, num_episodes, loss), end="")
            sys.stdout.flush()

            # Take a step
            action_probs = policy(sess, state, epsilon)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            #print ('State')
            #print (state)
            print('Best action ' + str(action))
            next_state, reward, done, _ = env.step(VALID_ACTIONS[action])
            env.render()
            next_state = state_processor.process_with_normalization(sess, next_state)
            #next_state = np.append(state[:,:,1:], np.expand_dims(next_state, 2), axis=2)

            # If our replay memory is full, pop the first element
            if len(replay_memory) == replay_memory_size:
                replay_memory.pop(0)

            # Save transition to replay memory
            replay_memory.append(Transition(state, action, reward, next_state, done))

            # Update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            # Sample a minibatch from the replay memory
            samples = random.sample(replay_memory, batch_size)
            states_batch, action_batch, reward_batch, next_states_batch, done_batch = map(np.array, zip(*samples))

            # Calculate q values and targets
            q_values_next = target_estimator.predict(sess, next_states_batch)
            #print ('q_values_next')
            #print (q_values_next)
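            # Bellman target: y = r + gamma * max_a Q_target(next_state, a) for
            # non-terminal transitions; np.invert(done_batch) zeroes the bootstrap
            # term when the episode ended, leaving y = r.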
            targets_batch = reward_batch + np.invert(done_batch).astype(np.float32) * discount_factor * np.amax(q_values_next, axis=1)
            #print ('targets_batch')
            #print (targets_batch)

            # Perform gradient descent update
            states_batch = np.array(states_batch)
            #print ('states_batch')
            #print (states_batch)
            #print ('Targets batch')
            #print (targets_batch)
            loss, max_q_values = q_estimator.update(sess, states_batch, action_batch, targets_batch)

            # Optional per-state tracking of the max Q-values (disabled):
            # n = 0
            # for q in max_q_values:
            #     #print (states_batch[n])
            #     #print (q)
            #     s_str = np.array2string(states_batch[n])
            #     add_q_value(s_str, q)
            #     # Ideally we compute the average of max q values of the last steps in order
            #     # to have a less noisy metric of the convergence of the algorithm
            #     if len(q_value_dict[s_str]) > 32:
            #         val = np.mean(q_value_dict[s_str][-32:])
            #     else:
            #         val = np.mean(q_value_dict[s_str])
            #     #qvalue_summary.value.add(simple_value=val, tag="avg_max_q_value " + s_str)
            #     n = n + 1

            if done:
                break

            state = next_state
            total_t += 1

        # Add summaries to tensorboard
        episode_summary = tf.Summary()
        episode_summary.value.add(simple_value=epsilon, tag="episode/epsilon")
        episode_summary.value.add(simple_value=stats.episode_rewards[i_episode], tag="episode/reward")
        episode_summary.value.add(simple_value=stats.episode_lengths[i_episode], tag="episode/length")
        episode_summary.value.add(simple_value=current_process.cpu_percent(), tag="system/cpu_usage_percent")
        episode_summary.value.add(simple_value=current_process.memory_percent(memtype="vms"), tag="system/v_memory_usage_percent")
        q_estimator.summary_writer.add_summary(episode_summary, i_episode)
        q_estimator.summary_writer.add_summary(qvalue_summary, i_episode)
        q_estimator.summary_writer.flush()

        yield total_t, plotting.EpisodeStats(
            episode_lengths=stats.episode_lengths[:i_episode+1],
            episode_rewards=stats.episode_rewards[:i_episode+1])

    return stats
# In[ ]:
tf.reset_default_graph()
# Where we save our checkpoints and graphs
experiment_dir = os.path.abspath("./experiments/{}".format(filename))
#experiment_dir = os.path.abspath("./experiments/{}".format(env.spec.id))
# Create a global step variable
global_step = tf.Variable(0, name='global_step', trainable=False)
# Create estimators
q_estimator = Estimator(scope="q_estimator", valid_actions=VALID_ACTIONS, n_input=n_input, summaries_dir=experiment_dir)
target_estimator = Estimator(scope="target_q", valid_actions=VALID_ACTIONS, n_input=n_input)
# State processor
state_processor = StateProcessor(n_input)
# Run it!
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for t, stats in deep_q_learning(sess,
                                    env,
                                    q_estimator=q_estimator,
                                    target_estimator=target_estimator,
                                    state_processor=state_processor,
                                    experiment_dir=experiment_dir,
                                    num_episodes=2000,
                                    replay_memory_size=50000,
                                    replay_memory_init_size=10000,
                                    update_target_estimator_every=10000,
                                    epsilon_start=1.0,
                                    epsilon_end=0.1,
                                    epsilon_decay_steps=200000,
                                    discount_factor=0.99,
                                    batch_size=32):
        print("\nEpisode Reward: {}".format(stats.episode_rewards[-1]))

ep_length, ep_reward, t_steps = plotting.plot_episode_stats(stats, smoothing_window=5, noshow=True)
ep_length.savefig(experiment_dir + '/ep_length.png')
ep_reward.savefig(experiment_dir + '/ep_reward.png')
t_steps.savefig(experiment_dir + '/t_steps.png')
# In[ ]:
# ###