From a1a33ef6674f27d894716dbdaa12be52c99de42d Mon Sep 17 00:00:00 2001
From: Phil <s8phsaue@stud.uni-saarland.de>
Date: Mon, 24 Feb 2025 13:59:21 +0100
Subject: [PATCH] Removed train files in cql_sac

---
 src/cql_sac/train.py         | 133 -----------------------------
 src/cql_sac/train_offline.py | 160 -----------------------------------
 2 files changed, 293 deletions(-)
 delete mode 100644 src/cql_sac/train.py
 delete mode 100644 src/cql_sac/train_offline.py

diff --git a/src/cql_sac/train.py b/src/cql_sac/train.py
deleted file mode 100644
index 8f083aa..0000000
--- a/src/cql_sac/train.py
+++ /dev/null
@@ -1,133 +0,0 @@
-
-
-import gym
-import pybullet_envs
-import numpy as np
-from collections import deque
-import torch
-import wandb
-import argparse
-from buffer import ReplayBuffer
-import glob
-from utils import save, collect_random, evaluate
-import random
-from agent import CQLSAC
-
-def get_config():
-    parser = argparse.ArgumentParser(description='RL')
-    parser.add_argument("--run_name", type=str, default="CQL-SAC", help="Run name, default: CQL-SAC")
-    parser.add_argument("--env", type=str, default="Pendulum-v1", help="Gym environment name, default: Pendulum-v0")
-    parser.add_argument("--episodes", type=int, default=300, help="Number of episodes, default: 200")
-    parser.add_argument("--buffer_size", type=int, default=100_000, help="Maximal training dataset size, default: 100_000")
-    parser.add_argument("--seed", type=int, default=1, help="Seed, default: 1")
-    parser.add_argument("--log_video", type=int, default=0, help="Log agent behaviour to wanbd when set to 1, default: 0")
-    parser.add_argument("--save_every", type=int, default=100, help="Saves the network every x epochs, default: 25")
-    parser.add_argument("--batch_size", type=int, default=256, help="Batch size, default: 256")
-    parser.add_argument("--hidden_size", type=int, default=256, help="")
-    parser.add_argument("--learning_rate", type=float, default=3e-4, help="")
-    parser.add_argument("--temperature", type=float, default=1.0, help="")
-    parser.add_argument("--cql_weight", type=float, default=1.0, help="")
-    parser.add_argument("--target_action_gap", type=float, default=10, help="")
-    parser.add_argument("--with_lagrange", type=int, default=0, help="")
-    parser.add_argument("--tau", type=float, default=5e-3, help="")
-    parser.add_argument("--eval_every", type=int, default=1, help="")
-
-    args = parser.parse_args()
-    return args
-
-def train(config):
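-    # Seed all random number generators so runs are reproducible.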
-    np.random.seed(config.seed)
-    random.seed(config.seed)
-    torch.manual_seed(config.seed)
-    env = gym.make(config.env)
-    eval_env = gym.make(config.env)
-    
-    env.seed(config.seed)
-    eval_env.seed(config.seed + 123)
-    env.action_space.seed(config.seed)
-
-    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-    
-    steps = 0
-    average10 = deque(maxlen=10)
-    total_steps = 0
-    
-    with wandb.init(project="CQL", name=config.run_name, config=config):
-        
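-        # Build the CQL-SAC agent; state and action sizes are read from the environment spaces.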
-        agent = CQLSAC(state_size=env.observation_space.shape[0],
-                        action_size=env.action_space.shape[0],
-                        tau=config.tau,
-                        hidden_size=config.hidden_size,
-                        learning_rate=config.learning_rate,
-                        temp=config.temperature,
-                        with_lagrange=config.with_lagrange,
-                        cql_weight=config.cql_weight,
-                        target_action_gap=config.target_action_gap,
-                        device=device)
-
-        wandb.watch(agent, log="gradients", log_freq=10)
-
-        buffer = ReplayBuffer(buffer_size=config.buffer_size, batch_size=config.batch_size, device=device)
-        
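-        # Warm-up: pre-fill the replay buffer with 10k transitions from a random policy before learning starts.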
-        collect_random(env=env, dataset=buffer, num_samples=10000)
-        
-        if config.log_video:
-            env = gym.wrappers.Monitor(env, './video', video_callable=lambda x: x%10==0, force=True)
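-        # Evaluate the untrained agent once so the learning curve starts at episode 0.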
-        eval_reward = evaluate(eval_env, agent)
-        wandb.log({"Test Reward": eval_reward, "Episode": 0, "Steps": steps}, step=steps)
-        for i in range(1, config.episodes+1):
-            state = env.reset()
-            episode_steps = 0
-            rewards = 0
-            while True:
-                action = agent.get_action(state)
-                steps += 1
-                next_state, reward, done, _ = env.step(action)
-                buffer.add(state, action, reward, next_state, done)
-                policy_loss, alpha_loss, bellmann_error1, bellmann_error2, cql1_loss, cql2_loss, current_alpha, lagrange_alpha_loss, lagrange_alpha = agent.learn(buffer.sample())
-                state = next_state
-                rewards += reward
-                episode_steps += 1
-                if done:
-                    break
-
-
-            average10.append(rewards)
-            total_steps += episode_steps
-            print("Episode: {} | Reward: {} | Polciy Loss: {} | Steps: {}".format(i, rewards, policy_loss, steps,))
-            
-            wandb.log({"Reward": rewards,
-                       "Average10": np.mean(average10),
-                       "Steps": total_steps,
-                       "Policy Loss": policy_loss,
-                       "Alpha Loss": alpha_loss,
-                       "Lagrange Alpha Loss": lagrange_alpha_loss,
-                       "CQL1 Loss": cql1_loss,
-                       "CQL2 Loss": cql2_loss,
-                       "Bellman error 1": bellmann_error1,
-                       "Bellman error 2": bellmann_error2,
-                       "Alpha": current_alpha,
-                       "Lagrange Alpha": lagrange_alpha,
-                       "Steps": steps,
-                       "Episode": i,
-                       "Buffer size": buffer.__len__()})
-        
-            if i % config.eval_every == 0:
-                eval_reward = evaluate(eval_env, agent)
-                wandb.log({"Test Reward": eval_reward, "Episode": i, "Steps": steps}, step=steps)
-
-            if i % 10 == 0 and config.log_video:
-                mp4list = glob.glob('video/*.mp4')
-                if len(mp4list) > 1:
-                    mp4 = mp4list[-2]
-                    wandb.log({"gameplays": wandb.Video(mp4, caption='episode: '+str(i-10), fps=4, format="gif"), "Episode": i})
-
-            if i % config.save_every == 0:
-                save(config, save_name="CQL-SAC", model=agent.actor_local, wandb=wandb, ep=0)
-
-if __name__ == "__main__":
-    config = get_config()
-    train(config)
diff --git a/src/cql_sac/train_offline.py b/src/cql_sac/train_offline.py
deleted file mode 100644
index 159ba5f..0000000
--- a/src/cql_sac/train_offline.py
+++ /dev/null
@@ -1,160 +0,0 @@
-import gym
-import d4rl
-import numpy as np
-from collections import deque
-import torch
-import wandb
-import argparse
-import glob
-from utils import save, collect_random
-import random
-from agent import CQLSAC
-from torch.utils.data import DataLoader, TensorDataset
-
-def get_config():
-    parser = argparse.ArgumentParser(description='RL')
-    parser.add_argument("--run_name", type=str, default="CQL", help="Run name, default: CQL")
-    parser.add_argument("--env", type=str, default="halfcheetah-medium-v2", help="Gym environment name, default: Pendulum-v0")
-    parser.add_argument("--episodes", type=int, default=100, help="Number of episodes, default: 100")
-    parser.add_argument("--seed", type=int, default=1, help="Seed, default: 1")
-    parser.add_argument("--log_video", type=int, default=0, help="Log agent behaviour to wanbd when set to 1, default: 0")
-    parser.add_argument("--save_every", type=int, default=100, help="Saves the network every x epochs, default: 25")
-    parser.add_argument("--batch_size", type=int, default=512, help="Batch size, default: 256")
-    parser.add_argument("--hidden_size", type=int, default=256, help="")
-    parser.add_argument("--learning_rate", type=float, default=3e-4, help="")
-    parser.add_argument("--temperature", type=float, default=1.0, help="")
-    parser.add_argument("--cql_weight", type=float, default=1.0, help="")
-    parser.add_argument("--target_action_gap", type=float, default=10, help="")
-    parser.add_argument("--with_lagrange", type=int, default=0, help="")
-    parser.add_argument("--tau", type=float, default=5e-3, help="")
-    parser.add_argument("--eval_every", type=int, default=1, help="")
-    
-    args = parser.parse_args()
-    return args
-
-def prep_dataloader(env_id="halfcheetah-medium-v2", batch_size=256, seed=1):
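-    """Load the D4RL dataset for env_id into a shuffled DataLoader and build a matching evaluation env."""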
-    env = gym.make(env_id)
-    dataset = env.get_dataset()
-    tensors = {}
-    for k, v in dataset.items():
-        if k in ["actions", "observations", "next_observations", "rewards", "terminals"]:
-            if k != "terminals":
-                tensors[k] = torch.from_numpy(v).float()
-            else:
-                tensors[k] = torch.from_numpy(v).long()
-
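-    # Rewards and terminals get a trailing unit dimension so each sample has shape (1,).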
-    tensordata = TensorDataset(tensors["observations"],
-                               tensors["actions"],
-                               tensors["rewards"][:, None],
-                               tensors["next_observations"],
-                               tensors["terminals"][:, None])
-    dataloader = DataLoader(tensordata, batch_size=batch_size, shuffle=True)
-    
-    if "halfcheetah" in env_id:
-        eval_env = gym.make("HalfCheetah-v2")
-    eval_env.seed(seed)
-    return dataloader, eval_env
-
-def evaluate(env, policy, eval_runs=5): 
-    """
-    Makes an evaluation run with the current policy
-    """
-    reward_batch = []
-    for i in range(eval_runs):
-        state = env.reset()
-
-        rewards = 0
-        while True:
-            action = policy.get_action(state, eval=True)
-
-            state, reward, done, _ = env.step(action)
-            rewards += reward
-            if done:
-                break
-        reward_batch.append(rewards)
-    return np.mean(reward_batch)
-
-def train(config):
-    np.random.seed(config.seed)
-    random.seed(config.seed)
-    torch.manual_seed(config.seed)
-
-    dataloader, env = prep_dataloader(env_id=config.env, batch_size=config.batch_size, seed=config.seed)
-
-    env.action_space.seed(config.seed)
-
-    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-    
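-    # "batches" counts gradient updates across all epochs and is used as the global wandb step.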
-    batches = 0
-    average10 = deque(maxlen=10)
-    
-    with wandb.init(project="CQL-offline", name=config.run_name, config=config):
-        
-        agent = CQLSAC(state_size=env.observation_space.shape[0],
-                        action_size=env.action_space.shape[0],
-                        tau=config.tau,
-                        hidden_size=config.hidden_size,
-                        learning_rate=config.learning_rate,
-                        temp=config.temperature,
-                        with_lagrange=config.with_lagrange,
-                        cql_weight=config.cql_weight,
-                        target_action_gap=config.target_action_gap,
-                        device=device)
-
-        wandb.watch(agent, log="gradients", log_freq=10)
-        if config.log_video:
-            env = gym.wrappers.Monitor(env, './video', video_callable=lambda x: x%10==0, force=True)
-
-        eval_reward = evaluate(env, agent)
-        wandb.log({"Test Reward": eval_reward, "Episode": 0, "Batches": batches}, step=batches)
-        for i in range(1, config.episodes+1):
-
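-            # One "episode" is a full pass over the offline dataset; the agent never acts in the env during training.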
-            for batch_idx, experience in enumerate(dataloader):
-                states, actions, rewards, next_states, dones = experience
-                states = states.to(device)
-                actions = actions.to(device)
-                rewards = rewards.to(device)
-                next_states = next_states.to(device)
-                dones = dones.to(device)
-                policy_loss, alpha_loss, bellmann_error1, bellmann_error2, cql1_loss, cql2_loss, current_alpha, lagrange_alpha_loss, lagrange_alpha = agent.learn((states, actions, rewards, next_states, dones))
-                batches += 1
-
-            if i % config.eval_every == 0:
-                eval_reward = evaluate(env, agent)
-                wandb.log({"Test Reward": eval_reward, "Episode": i, "Batches": batches}, step=batches)
-
-                average10.append(eval_reward)
-                print("Episode: {} | Reward: {} | Polciy Loss: {} | Batches: {}".format(i, eval_reward, policy_loss, batches,))
-            
-            wandb.log({
-                       "Average10": np.mean(average10),
-                       "Policy Loss": policy_loss,
-                       "Alpha Loss": alpha_loss,
-                       "Lagrange Alpha Loss": lagrange_alpha_loss,
-                       "CQL1 Loss": cql1_loss,
-                       "CQL2 Loss": cql2_loss,
-                       "Bellman error 1": bellmann_error1,
-                       "Bellman error 2": bellmann_error2,
-                       "Alpha": current_alpha,
-                       "Lagrange Alpha": lagrange_alpha,
-                       "Batches": batches,
-                       "Episode": i})
-
-            if i % 10 == 0 and config.log_video:
-                mp4list = glob.glob('video/*.mp4')
-                if len(mp4list) > 1:
-                    mp4 = mp4list[-2]
-                    wandb.log({"gameplays": wandb.Video(mp4, caption='episode: '+str(i-10), fps=4, format="gif"), "Episode": i})
-
-            if i % config.save_every == 0:
-                save(config, save_name="CQL", model=agent.actor_local, wandb=wandb, ep=0)
-
-if __name__ == "__main__":
-    config = get_config()
-    train(config)
-- 
GitLab