Module deepcomp.main
Main execution script used for experimentation
"""Main execution script used for experimentation"""
import os
import logging
import structlog
from deepcomp.util.simulation import Simulation
from deepcomp.util.logs import config_logging
from deepcomp.util.env_setup import create_env_config
from deepcomp.util.cli import setup_cli
log = structlog.get_logger()


def main():
    config_logging()
    args = setup_cli()
    # can't use args.continue: https://stackoverflow.com/a/63266666/2745116
    args_continue = getattr(args, 'continue')
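    # ('continue' is a reserved keyword in Python, so the attribute access
    # args.continue would be a SyntaxError; getattr() is the standard workaround)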
    # stop training when any of the criteria is met
    stop_criteria = dict()
    if args.train_steps is not None:
        stop_criteria['timesteps_total'] = args.train_steps
    if args.train_iter is not None:
        stop_criteria['training_iteration'] = args.train_iter
    if args.target_reward is not None:
        stop_criteria['episode_reward_mean'] = args.target_reward
    if args.target_utility is not None:
        stop_criteria['custom_metrics/sum_utility_mean'] = args.target_utility
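    # e.g., args.train_steps=500000 and args.target_reward=100 yield
    # {'timesteps_total': 500000, 'episode_reward_mean': 100}; Tune then
    # stops the trial as soon as any one criterion is reached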
    # train or load a trained agent; train is only used for the PPO agent below
    train = args.test is None
    agent_path = None
    if args.test is not None:
        agent_path = os.path.abspath(args.test)
    agent_path_continue = None
    if args_continue is not None:
        agent_path_continue = os.path.abspath(args_continue)

    # create RLlib config (with env inside) & simulator
    config = create_env_config(args)
    # for sequential multi-agent env
    # config['no_done_at_end'] = True
    # for continuous training without any resets between episodes
    if args.cont_train:
        config['soft_horizon'] = True
        config['no_done_at_end'] = True
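        # (in the RLlib versions this script targets, soft_horizon=True keeps
        # the env running past the horizon without calling reset(), and
        # no_done_at_end=True suppresses the final done=True)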
    # default PPO params: https://docs.ray.io/en/latest/rllib-algorithms.html#proximal-policy-optimization-ppo
    # config['entropy_coeff'] = 0.01
    # defaults: lr=5e-5, lr_schedule=None, GAE lambda=1.0, kl_coeff=0.2
    # config['lr'] = ray.tune.uniform(1e-6, 1e-4)
    # config['gamma'] = ray.tune.uniform(0.9, 0.99)
    # config['lambda'] = ray.tune.uniform(0.7, 1.0)
    # lr_schedule: https://github.com/ray-project/ray/issues/7912#issuecomment-609833914
    # e.g., [[0, 0.01], [1000, 0.0001]] starts at lr=0.01 (t=0) and linearly decreases to lr=0.0001 at t=1000
    # config['lr_schedule'] = [[0, 0.01], [50000, 1e-5]]
    # import hyperopt as hp
    # from ray.tune.suggest.hyperopt import HyperOptSearch
    # hyperopt = HyperOptSearch(metric='episode_reward_mean', mode='max')
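    # (if enabled, the searcher would be passed to Tune via
    # tune.run(..., search_alg=hyperopt) alongside the sampled
    # config['lr'] / config['gamma'] / config['lambda'] ranges above)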
    # pass the CLI args into the simulation so the inputs are saved alongside the results
    sim = Simulation(config=config, agent_name=args.alg, cli_args=args, debug=False)

    # train
    if train and args.alg == 'ppo':
        agent_path, analysis = sim.train(stop_criteria, restore_path=agent_path_continue)

    # load & test agent
    sim.load_agent(rllib_dir=agent_path, rand_seed=args.seed, fixed_action=[1, 1], explore=False)
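    # explore=False makes the loaded policy act deterministically (greedy
    # actions instead of sampling) during testing and evaluation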
    # simulate one episode and render
    log_dict = {
        'deepcomp.util.simulation': logging.DEBUG,
        # 'deepcomp.env.entities.user': logging.DEBUG,
        # 'deepcomp.env.entities.station': logging.DEBUG
    }
    # episode randomization for testing and evaluation is set according to the CLI args
    sim.run(render=args.video, log_dict=log_dict)

    # evaluate over multiple episodes
    if args.eval > 0:
        sim.run(num_episodes=args.eval, write_results=True)

        # evaluate again with toggled episode randomization if --fixed-rand-eval
        if args.fixed_rand_eval:
            log.info('Evaluating again with toggled episode randomization', rand_episodes=not args.rand_test)
            # flip the testing mode, which is then saved to the results data frame
            sim.cli_args.rand_test = not args.rand_test
            # use a new result filename to avoid overwriting the existing one
            sim.set_result_filename()
            sim.run(num_episodes=args.eval, write_results=True)

    log.info('Finished', agent=agent_path)

if __name__ == '__main__':
    main()
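Note on the stop criteria built in main(): Ray Tune treats the dict passed as a stop condition as a disjunction and ends a trial as soon as any metric reaches its threshold. Below is a minimal standalone sketch of that behavior; the toy trainable and its reported values are hypothetical (not part of deepcomp), and it assumes the Ray 1.x function API with tune.report that this module's imports suggest.

from ray import tune

def toy_trainable(config):
    # Hypothetical trainable that reports the same metric names
    # main() uses as stop criteria.
    for step in range(1, 10 ** 6):
        tune.report(timesteps_total=step, episode_reward_mean=step / 100)

# The trial stops when EITHER key first reaches its value: here at
# episode_reward_mean == 50.0 (step 5000), long before 100000 timesteps.
analysis = tune.run(
    toy_trainable,
    stop={'timesteps_total': 100000, 'episode_reward_mean': 50.0},
)
print(analysis.trials[0].last_result['episode_reward_mean'])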
Functions
def main()
Entry point: configures logging, parses the CLI arguments, trains or loads an RLlib agent, then runs and evaluates simulation episodes.