 from machin.utils.conf import Config
 from machin.env.utils.openai_gym import disable_view_window
 from torch.nn.functional import softplus
-from torch.distributions import Normal
+from torch.distributions import Normal, Categorical
 
 import pytest
 import torch as t
@@ -33,11 +33,29 @@ def forward(self, state, action=None):
         a_dist = Normal(a_mu, a_sigma)
         a = action if action is not None else a_dist.sample()
         a_entropy = a_dist.entropy()
-        a = a.clamp(-self.action_range, self.action_range)
         a_log_prob = a_dist.log_prob(a)
         return a, a_log_prob, a_entropy
 
 
+# class Actor(nn.Module):
+#     def __init__(self, state_dim, action_num):
+#         super(Actor, self).__init__()
+#
+#         self.fc1 = nn.Linear(state_dim, 16)
+#         self.fc2 = nn.Linear(16, 16)
+#         self.fc3 = nn.Linear(16, action_num)
+#
+#     def forward(self, state, action=None):
+#         a = t.relu(self.fc1(state))
+#         a = t.relu(self.fc2(a))
+#         probs = t.softmax(self.fc3(a), dim=1)
+#         dist = Categorical(probs=probs)
+#         act = (action if action is not None else dist.sample())
+#         act_entropy = dist.entropy()
+#         act_log_prob = dist.log_prob(act)
+#         return act, act_log_prob, act_entropy
+
+
 class Critic(nn.Module):
     def __init__(self, state_dim):
         super(Critic, self).__init__()
@@ -58,20 +76,40 @@ class TestPPO(object):
     @pytest.fixture(scope="class")
     def train_config(self, pytestconfig):
         disable_view_window()
+        t.manual_seed(0)
         c = Config()
         c.env_name = "Pendulum-v0"
         c.env = unwrap_time_limit(gym.make(c.env_name))
+        c.env.seed(0)
         c.observe_dim = 3
         c.action_dim = 1
         c.action_range = 2
         c.max_episodes = 1000
-        c.max_steps = 200
+        c.max_steps = 500
         c.replay_size = 10000
         c.solved_reward = -150
         c.solved_repeat = 5
         c.device = "cpu"
         return c
 
+    # @pytest.fixture(scope="class")
+    # def train_config(self, pytestconfig):
+    #     disable_view_window()
+    #     c = Config()
+    #     # Note: A2C is not sample efficient, so it will not work very well
+    #     # in continuous action spaces such as "Pendulum-v0"; PPO is better.
+    #     c.env_name = "CartPole-v1"
+    #     c.env = unwrap_time_limit(gym.make(c.env_name))
+    #     c.observe_dim = 4
+    #     c.action_num = 2
+    #     c.max_episodes = 1000
+    #     c.max_steps = 500
+    #     c.replay_size = 10000
+    #     c.solved_reward = 190
+    #     c.solved_repeat = 5
+    #     c.device = "cpu"
+    #     return c
+
     @pytest.fixture(scope="function")
     def ppo(self, train_config):
         c = train_config
@@ -86,6 +124,20 @@ def ppo(self, train_config):
                   replay_size=c.replay_size)
         return ppo
 
+    # @pytest.fixture(scope="function")
+    # def ppo(self, train_config):
+    #     c = train_config
+    #     actor = smw(Actor(c.observe_dim, c.action_num)
+    #                 .to(c.device), c.device, c.device)
+    #     critic = smw(Critic(c.observe_dim)
+    #                  .to(c.device), c.device, c.device)
+    #     ppo = PPO(actor, critic,
+    #               t.optim.Adam,
+    #               nn.MSELoss(reduction='sum'),
+    #               replay_device=c.device,
+    #               replay_size=c.replay_size)
+    #     return ppo
+
     @pytest.fixture(scope="function")
     def ppo_vis(self, train_config, tmpdir):
         # not used for training, only used for testing apis
@@ -169,15 +221,16 @@ def test_update(self, train_config, ppo_vis):
     ########################################################################
     def test_full_train(self, train_config, ppo):
         c = train_config
-
         # begin training
         episode, step = Counter(), Counter()
         reward_fulfilled = Counter()
         smoother = Smooth()
         terminal = False
 
         env = c.env
-        ppo.grad_max = 0.1
+        ppo.gae_lambda = 1.0
+        ppo.update_times = 20
+        ppo.entropy_weight = 1
         while episode < c.max_episodes:
             episode.count()
 
@@ -192,7 +245,11 @@ def test_full_train(self, train_config, ppo):
                     old_state = state
                     # agent model inference
                     action = ppo.act({"state": old_state.unsqueeze(0)})[0]
-                    state, reward, terminal, _ = env.step(action.cpu().numpy())
+                    state, reward, terminal, _ = env.step(
+                        action.clamp(-c.action_range, c.action_range).cpu()
+                        .numpy()
+                    )
+                    # state, reward, terminal, _ = env.step(action.item())
                     state = t.tensor(state, dtype=t.float32, device=c.device) \
                         .flatten()
                     total_reward += float(reward)
@@ -207,7 +264,8 @@ def test_full_train(self, train_config, ppo):
 
             # update
             ppo.store_episode(tmp_observations)
-            logger.info("{:.6f}, {:.0f}".format(*ppo.update()))
+            if episode.get() % 5 == 0:
+                logger.info("{:.6f}, {:.2f}".format(*ppo.update()))
 
             smoother.update(total_reward)
             step.reset()
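
A side note on the Actor change above: the clamp was dropped from forward(), so log_prob is now evaluated on the raw Normal sample, and the clamp is applied only when the action is handed to the environment in the training loop. A minimal standalone sketch of that pattern, using only torch (the action_range value mirrors the Pendulum-v0 config above):

    import torch as t
    from torch.distributions import Normal

    action_range = 2
    mu, sigma = t.zeros(1, 1), t.ones(1, 1)
    dist = Normal(mu, sigma)

    raw_action = dist.sample()            # unclamped sample kept for training
    log_prob = dist.log_prob(raw_action)  # log-prob of the action actually drawn
    entropy = dist.entropy()

    # clamp only at the environment boundary, not before log_prob,
    # so the stored log-prob still matches the sampled action
    env_action = raw_action.clamp(-action_range, action_range)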
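
For the commented-out discrete variant, the newly imported Categorical plays the same role as Normal: sample(), log_prob() and entropy() let the discrete actor return the same (action, log_prob, entropy) tuple. A small standalone sketch (the probs tensor here is illustrative, not taken from the test):

    import torch as t
    from torch.distributions import Categorical

    probs = t.softmax(t.randn(1, 2), dim=1)  # e.g. two discrete actions, batch of 1
    dist = Categorical(probs=probs)

    act = dist.sample()                      # shape [1], integer action index
    act_log_prob = dist.log_prob(act)        # shape [1]
    act_entropy = dist.entropy()             # shape [1]

    # a gym environment such as CartPole-v1 expects a plain int:
    # env.step(act.item())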