kvfrans · dexhunter · Oct 20, 2017
diff --git a/README.md b/README.md
@@ -4,7 +4,7 @@ Simple reinforcement learning algorithms implemented for CartPole on OpenAI gym.
 
 This code goes along with [my post about learning CartPole](http://kvfrans.com/simple-algoritms-for-solving-cartpole/), which is inspired by [an OpenAI request for research](https://openai.com/requests-for-research/#cartpole).
 
-##Algorithms implemented
+## Algorithms implemented
 
 **Random Search**: Keep trying random weights between [-1,1] and greedily keep the best set.
 

diff --git a/cartpole-hill.py b/cartpole-hill.py
@@ -19,7 +19,7 @@ def run_episode(env, parameters):
 def train(submit):
     env = gym.make('CartPole-v0')
     if submit:
-        env.monitor.start('cartpole-hill/', force=True)
+        env = gym.wrappers.Monitor(env, 'cartpole-hill/', force=True)
 
     episodes_per_update = 5
     noise_scaling = 0.1
@@ -47,7 +47,6 @@ def train(submit):
     if submit:
         for _ in xrange(100):
             run_episode(env,parameters)
-        env.monitor.close()
     return counter
 
 

diff --git a/cartpole-policygradient.py b/cartpole-policygradient.py
@@ -20,7 +20,7 @@ def policy_gradient():
         advantages = tf.placeholder("float",[None,1])
         linear = tf.matmul(state,params)
         probabilities = tf.nn.softmax(linear)
-        good_probabilities = tf.reduce_sum(tf.mul(probabilities, actions),reduction_indices=[1])
+        good_probabilities = tf.reduce_sum(tf.multiply(probabilities, actions),reduction_indices=[1])
         eligibility = tf.log(good_probabilities) * advantages
         loss = -tf.reduce_sum(eligibility)
         optimizer = tf.train.AdamOptimizer(0.01).minimize(loss)
@@ -102,7 +102,7 @@ def run_episode(env, policy_grad, value_grad, sess):
 
 
 env = gym.make('CartPole-v0')
-env.monitor.start('cartpole-hill/', force=True)
+env = gym.wrappers.Monitor(env, 'cartpole-hill/', force=True)
 policy_grad = policy_gradient()
 value_grad = value_gradient()
 sess = tf.InteractiveSession()
@@ -118,4 +118,3 @@ def run_episode(env, policy_grad, value_grad, sess):
     reward = run_episode(env, policy_grad, value_grad, sess)
     t += reward
 print t / 1000
-env.monitor.close()
diff --git a/cartpole-random.py b/cartpole-random.py
@@ -16,7 +16,7 @@ def run_episode(env, parameters):
 def train(submit):
     env = gym.make('CartPole-v0')
     if submit:
-        env.monitor.start('cartpole-experiments/', force=True)
+        env = gym.wrappers.Monitor(env, 'cartpole-experiments/', force=True)
 
     counter = 0
     bestparams = None
@@ -34,12 +34,11 @@ def train(submit):
     if submit:
         for _ in xrange(100):
             run_episode(env,bestparams)
-        env.monitor.close()
 
     return counter
 
-# train an agent to submit to openai gym
-# train(submit=True)
+# train an agent to submit to OpenAI Gym
+train(submit=True)
 
 # create graphs
 results = []