Added multi cartpole example #48

Open · wants to merge 20 commits into base: dev
@@ -0,0 +1,124 @@
from deepbots.robots.controllers.robot_emitter_receiver_csv import RobotEmitterReceiverCSV


class CartPoleRobot(RobotEmitterReceiverCSV):
    """
    The CartPole robot has four wheels and a pole connected to its body by an unactuated hinge.
    The hinge contains a Position Sensor device to measure the angle from vertical needed in the observation.
    Hinge: https://cyberbotics.com/doc/reference/hingejoint
    Position Sensor: https://cyberbotics.com/doc/reference/positionsensor
    """

    def __init__(self):
        """
        The constructor gets the Position Sensor reference and enables it, and also initializes the wheels.
        """
        super().__init__()

        self.robot_num = int(self.robot.getName()[-1])
        self.timestep = int(self.robot.getBasicTimeStep())

        # The emitter and receiver channels are set to the robot's index, so each robot
        # only exchanges messages with the supervisor devices set to the same channel.
        self.emitter, self.receiver = self.initialize_comms()
        self.emitter.setChannel(self.robot_num)
        self.receiver.setChannel(self.robot_num)

        self.positionSensor = self.robot.getDevice("polePosSensor")
        self.positionSensor.enable(self.timestep)

        self.wheels = [None for _ in range(4)]
        self.setup_motors()

    def initialize_comms(self, emitter_name="emitter", receiver_name="receiver"):
        emitter = self.robot.getDevice(emitter_name)
        receiver = self.robot.getDevice(receiver_name)
        receiver.enable(self.timestep)

        return emitter, receiver

    def setup_motors(self):
        """
        This method initializes the four wheels, storing the references inside a list and setting the starting
        positions and velocities.
        """
        self.wheels[0] = self.robot.getDevice('wheel1')
        self.wheels[1] = self.robot.getDevice('wheel2')
        self.wheels[2] = self.robot.getDevice('wheel3')
        self.wheels[3] = self.robot.getDevice('wheel4')
        for i in range(len(self.wheels)):
            self.wheels[i].setPosition(float('inf'))
            self.wheels[i].setVelocity(0.0)

    def create_message(self):
        """
        This method packs the robot's observation into a list to be sent to the supervisor.
        The message contains only the Position Sensor value, i.e. the angle from the vertical position in radians.
        From the Webots documentation:
        'The getValue function returns the most recent value measured by the specified position sensor. Depending on
        the type, it will return a value in radians (angular position sensor) or in meters (linear position sensor).'

        :return: A list containing the robot's observation.
        :rtype: list
        """
        message = [self.positionSensor.getValue()]
        return message

    def handle_receiver(self):
        """
        Modified handle_receiver from the basic implementation of deepbots.
        This one consumes all available messages in the queue during the step it is called.
        """
        while self.receiver.getQueueLength() > 0:
            # Receive and decode message from supervisor
            message = self.receiver.getData().decode("utf-8")
            # Convert string message into a list
            message = message.split(",")

            self.use_message_data(message)

            self.receiver.nextPacket()

    def use_message_data(self, message):
        """
        This method unpacks the supervisor's message, which contains the next action to be executed by the robot.
        In this case it contains an integer denoting the action, either 0 or 1, with 0 being forward and
        1 being backward movement. The corresponding motorSpeed value is applied to the wheels.

        :param message: The message the supervisor sent containing the next action.
        :type message: list of strings
        """
        action = int(message[0])

        assert action == 0 or action == 1, "CartPoleRobot controller got incorrect action value: " + str(action)

        if action == 0:
            motorSpeed = 5.0
        else:
            motorSpeed = -5.0

        for i in range(len(self.wheels)):
            self.wheels[i].setPosition(float('inf'))
            self.wheels[i].setVelocity(motorSpeed)

    def handle_emitter(self):
        data = self.create_message()
        string_message = ""

        if type(data) is list:
            string_message = ",".join(map(str, data))
        elif type(data) is str:
            string_message = data
        else:
            raise TypeError(
                "message must be either a comma-separated string or a 1D list")

        string_message = string_message.encode("utf-8")
        self.emitter.send(string_message)

    def run(self):
        while self.robot.step(self.timestep) != -1:
            self.handle_receiver()
            self.handle_emitter()


# Create the robot controller object and run it
robot_controller = CartPoleRobot()
robot_controller.run()
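For context, the channel scheme above implies a supervisor-side counterpart with one emitter/receiver pair per robot, each pair set to that robot's index. Below is a minimal sketch under that assumption; the device names ("emitter0", "receiver0", ...), the NUM_ROBOTS constant and the two helper functions are illustrative only, and the PR's actual supervisor_controller.py (not shown in this hunk) may be organized differently.

from controller import Supervisor

NUM_ROBOTS = 9  # assumption: matches the nine robots iterated over in the runner below

supervisor = Supervisor()
timestep = int(supervisor.getBasicTimeStep())

emitters, receivers = [], []
for i in range(NUM_ROBOTS):
    # "emitter0" ... "receiver8" are hypothetical device names on the supervisor node
    emitter = supervisor.getDevice(f"emitter{i}")
    receiver = supervisor.getDevice(f"receiver{i}")
    emitter.setChannel(i)
    receiver.setChannel(i)
    receiver.enable(timestep)
    emitters.append(emitter)
    receivers.append(receiver)


def send_action(i, action):
    # Send an action (0 or 1) to robot i as a CSV string, mirroring use_message_data() above
    emitters[i].send(str(action).encode("utf-8"))


def read_pole_angle(i):
    # Drain robot i's queue and return the latest reported pole angle (see create_message() above),
    # or None if nothing has arrived yet; meant to be called inside the supervisor's step loop
    angle = None
    while receivers[i].getQueueLength() > 0:
        angle = float(receivers[i].getData().decode("utf-8"))
        receivers[i].nextPacket()
    return angle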
@@ -0,0 +1,88 @@
from numpy import convolve, ones, mean

from supervisor_controller import CartPoleSupervisor
from agent.PPO_agent import PPOAgent, Transition
from utilities import plotData


def run():
    # Initialize supervisor object
    supervisorEnv = CartPoleSupervisor()

    # The agent used here is trained with the PPO algorithm (https://arxiv.org/abs/1707.06347).
    agent = PPOAgent(supervisorEnv.observationSpace, supervisorEnv.actionSpace)

    episodeCount = 0
    episodeLimit = 2000
    solved = False  # Whether the solved requirement is met

    # Run the outer loop until the episode limit is reached or the task is solved
    while not solved and episodeCount < episodeLimit:
        state = supervisorEnv.reset()  # Reset robots and get the starting observation
        supervisorEnv.episodeScore = 0
        actionProbs = [[] for i in range(9)]  # One list of action probabilities per robot

        # Inner loop is the episode loop
        for step in range(supervisorEnv.stepsPerEpisode):
            # In training mode the agent samples from the probability distribution, naturally implementing exploration
            selectedActions = []
            for i in range(9):
                selectedAction, actionProb = agent.work(state[i], type_="selectAction")
                actionProbs[i].append(actionProb)
                selectedActions.append(selectedAction)

            # Step the supervisor to get the current selectedAction reward, the new state and whether we reached the
            # done condition
            newState, reward, done, info = supervisorEnv.step([*selectedActions])

            # Save the current state transitions from all robots in the agent's memory
            for i in range(9):
                trans = Transition(state[i], selectedActions[i], actionProbs[i][-1], reward, newState[i])
                agent.storeTransition(trans)

            supervisorEnv.episodeScore += reward  # Accumulate episode reward
            if done:
                # Save the episode's score
                supervisorEnv.episodeScoreList.append(supervisorEnv.episodeScore)
                agent.trainStep(batchSize=step + 1)
                solved = supervisorEnv.solved()  # Check whether the task is solved
                break

            state = newState  # state for next step is current step's newState

        if supervisorEnv.test:  # If test flag is externally set to True, agent is deployed
            break

        avgActionProb = [round(mean(actionProbs[i]), 4) for i in range(9)]

        # The average action probability tells us how confident the agent was of its actions.
        # By looking at this we can check whether the agent is converging to a certain policy.
        print(f"Episode: {episodeCount} Score = {supervisorEnv.episodeScore} | Average Action Probabilities = {avgActionProb}")

        episodeCount += 1  # Increment episode counter

    if not solved and not supervisorEnv.test:
        print("Reached episode limit and task was not solved.")
    else:
        if not solved:
            print("Task is not solved, deploying agent for testing...")
        elif solved:
            print("Task is solved, deploying agent for testing...")

    state = supervisorEnv.reset()
    supervisorEnv.test = True
    supervisorEnv.episodeScore = 0
    while True:
        selectedActions = []
        for i in range(9):
            selectedAction, actionProb = agent.work(state[i], type_="selectAction")
            actionProbs[i].append(actionProb)
            selectedActions.append(selectedAction)

        state, reward, done, _ = supervisorEnv.step(selectedActions)
        supervisorEnv.episodeScore += reward  # Accumulate episode reward

        if done:
            print("Reward accumulated =", supervisorEnv.episodeScore)
            supervisorEnv.episodeScore = 0
            state = supervisorEnv.reset()
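Of the numpy imports at the top of this file, only mean is used above; convolve and ones (together with utilities.plotData) would typically serve a moving-average plot of the episode scores. A minimal sketch of that use follows, assuming a plotData(values, x_label, y_label, title) signature, which may not match the real helper.

def plot_training_curve(scores, window=10):
    # Moving average of the episode scores over `window` episodes (needs len(scores) >= window)
    smoothed = convolve(scores, ones((window,)) / window, mode='valid')
    # Assumed plotData signature; adjust the call to whatever utilities.plotData actually expects
    plotData(smoothed, "episode", "episode score", "Moving average of episode scores")

# For example, plot_training_curve(supervisorEnv.episodeScoreList) could be called at the end
# of run(), once training has finished.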