1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
|
__author__ = 'Thomas Rueckstiess, ruecksti@in.tum.de'
from pybrain.rl.environments.cartpole import CartPoleEnvironment, DiscreteBalanceTask, CartPoleRenderer
from pybrain.rl.agents import LearningAgent
from pybrain.rl.experiments import EpisodicExperiment
from pybrain.rl.learners.valuebased import NFQ, ActionValueNetwork
#,ActionValueLSTMNetwork
from pybrain.rl.explorers import BoltzmannExplorer
from numpy import array, arange, meshgrid, pi, zeros, mean
from matplotlib import pyplot as plt
# switch this to True if you want to see the cart balancing the pole (slower)
render = False #True #
plt.ion()
env = CartPoleEnvironment()
if render:
renderer = CartPoleRenderer()
env.setRenderer(renderer)
renderer.start()
# balancetask. py inside only used 2 sensors, so here can not use(4,3), just use (2,3)
# there is a debug in vesion 0.30, now, new version 0.33 had correct it!!
module = ActionValueNetwork(4,3) #(4,3) # 0.33 had correct it
#module = ActionValueLSTMNetwork(2,3)
task = DiscreteBalanceTask(env, 100)
learner = NFQ()
learner.explorer.epsilon = 0.4
agent = LearningAgent(module, learner)
testagent = LearningAgent(module, None)
experiment = EpisodicExperiment(task, agent)
def plotPerformance(values, fig):
plt.figure(fig.number)
plt.clf()
plt.plot(values, 'o-')
plt.gcf().canvas.draw()
performance = []
if not render:
pf_fig = plt.figure()
#while (True):
for _ in xrange(60): #60
# one learning step after one episode of world-interaction!!!
experiment.doEpisodes(1)
agent.learn(2) # 5
# test performance (these real-world experiences are not used for training)
if render:
env.delay = True
experiment.agent = testagent
#r = mean([sum(x) for x in experiment.doEpisodes(5)])
env.delay = False
testagent.reset()
experiment.agent = agent
#performance.append(r)
print "update step", len(performance)
#print "reward avg", r
print "explorer epsilon", learner.explorer.epsilon
print "num episodes", agent.history.getNumSequences()
print "update step", len(performance)
if not render:
plotPerformance(performance, pf_fig)
str = raw_input("please input sth to end!")
print "you put :",str
|