"""Utility functions for NumPy-based Reinforcement learning algorithms."""
import numpy as np
from metarl.misc import tensor_utils
[docs]def paths_to_tensors(paths, max_path_length, baseline_predictions, discount):
"""Return processed sample data based on the collected paths.
Args:
paths (list[dict]): A list of collected paths.
max_path_length (int): Maximum length of a single rollout.
baseline_predictions(numpy.ndarray): : Predicted value of GAE
(Generalized Advantage Estimation) Baseline.
discount (float): Environment reward discount.
Returns:
dict: Processed sample data, with key
* observations (numpy.ndarray): Padded array of the observations of
the environment
* actions (numpy.ndarray): Padded array of the actions fed to the
the environment
* rewards (numpy.ndarray): Padded array of the acquired rewards
* agent_infos (dict): a dictionary of {stacked tensors or
dictionary of stacked tensors}
* env_infos (dict): a dictionary of {stacked tensors or
dictionary of stacked tensors}
* rewards (numpy.ndarray): Padded array of the validity information
"""
baselines = []
returns = []
for idx, path in enumerate(paths):
# baselines
path['baselines'] = baseline_predictions[idx]
baselines.append(path['baselines'])
# returns
path['returns'] = tensor_utils.discount_cumsum(path['rewards'],
discount)
returns.append(path['returns'])
obs = [path['observations'] for path in paths]
obs = tensor_utils.pad_tensor_n(obs, max_path_length)
actions = [path['actions'] for path in paths]
actions = tensor_utils.pad_tensor_n(actions, max_path_length)
rewards = [path['rewards'] for path in paths]
rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)
agent_infos = [path['agent_infos'] for path in paths]
agent_infos = tensor_utils.stack_tensor_dict_list([
tensor_utils.pad_tensor_dict(p, max_path_length) for p in agent_infos
])
env_infos = [path['env_infos'] for path in paths]
env_infos = tensor_utils.stack_tensor_dict_list(
[tensor_utils.pad_tensor_dict(p, max_path_length) for p in env_infos])
valids = [np.ones_like(path['returns']) for path in paths]
valids = tensor_utils.pad_tensor_n(valids, max_path_length)
samples_data = dict(observations=obs,
actions=actions,
rewards=rewards,
agent_infos=agent_infos,
env_infos=env_infos,
valids=valids)
return samples_data