From 16914cd61e9d2da006cf6235a47aae0c70df9cb2 Mon Sep 17 00:00:00 2001 From: Eisinger Date: Fri, 19 Jun 2026 07:46:12 +0200 Subject: [PATCH 1/5] Add crane wire length and pendulum q_factor to the environment configuration --- src/crane_controller/envs/controlled_crane_pendulum.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/crane_controller/envs/controlled_crane_pendulum.py b/src/crane_controller/envs/controlled_crane_pendulum.py index f3c68d5..6fe9387 100644 --- a/src/crane_controller/envs/controlled_crane_pendulum.py +++ b/src/crane_controller/envs/controlled_crane_pendulum.py @@ -53,6 +53,8 @@ class AntiPendulumConfig: continuous_actions: If True, the action space is ``Box([-1], [1])`` and an action value in ``[-1, 1]`` is scaled by ``acc`` to produce the crane acceleration. If False, the action space is ``Discrete(3)`` with mapping``0=-acc, 1=0, 2=+acc`` (Q-agent compatible). + length: the length of the crane wire (and the pedestal) + q_factor: the damping factor of the pendulum action """ acc: float = 0.1 @@ -67,6 +69,8 @@ class AntiPendulumConfig: reward_fac: RewardConfig | None = None continuous_actions: bool = False discount: float = 0.8 + length: float = 10.0 + q_factor: float = 50.0 class AntiPendulumEnv(gym.Env[tuple[int, ...] | np.ndarray, int]): @@ -117,7 +121,7 @@ def __init__(self, crane: Callable[..., Crane], conf: AntiPendulumConfig | None self.crane_maker = crane self.conf = AntiPendulumConfig() if conf is None else conf self.render_mode: str | None = self.conf.render_mode # gymnasium convention: expose as direct attribute - self.crane: Crane = crane() + self.crane: Crane = crane(length=self.conf.length, q_factor=self.conf.q_factor) self.wire: Wire = self.crane.boom_by_name("wire") # type: ignore[assignment] # Wire is a sub-class of Boom assert isinstance(self.wire, Wire), "Need a crane wire!" 
assert self.conf.render_mode in AntiPendulumEnv.metadata["render_modes"], ( # type: ignore[operator] # metadata values are typed as object From 9036f2b1ac3df6044b395ae61774f9d4fdbf2265 Mon Sep 17 00:00:00 2001 From: Eisinger Date: Fri, 19 Jun 2026 07:52:37 +0200 Subject: [PATCH 2/5] Adapt use_q_ide to usage of configurations and a better way to manage experiments. --- scripts/use_q_ide.py | 134 ++++++++++++++++++++++++++----------------- 1 file changed, 82 insertions(+), 52 deletions(-) diff --git a/scripts/use_q_ide.py b/scripts/use_q_ide.py index 32dd7f6..57b41d1 100644 --- a/scripts/use_q_ide.py +++ b/scripts/use_q_ide.py @@ -11,10 +11,10 @@ from typing import Any from crane_controller.crane_factory import build_crane -from crane_controller.envs.controlled_crane_pendulum import AntiPendulumConfig, AntiPendulumEnv +from crane_controller.envs.controlled_crane_pendulum import AntiPendulumEnv, AntiPendulumConfig from crane_controller.envs.simple_test_env import SimpleTestEnv from crane_controller.experiment_config import RewardConfig -from crane_controller.q_agent import QLearningAgent +from crane_controller.q_agent import QLearningAgent, QLearningConfig logging.basicConfig(level=logging.INFO, format="%(message)s") LOGGER = logging.getLogger(__name__) @@ -38,81 +38,104 @@ class Config: r_fac: optional weight factors (RewardConfig) for reward r_limit: optional reward limit disc: discount rate of acceleration history to include in observation - lr: optionally change the learning rate + learning_rate: optionally change the learning rate seed: optionally change the start seed """ - - v0: float = 1.0 + # AntiPendulumConfig + acc: float = 0.1 + start_speed: float = 1.0 randomize_start: bool = False - render: str = "none" - discretization: str = "energy" + render_mode: str = "none" + rail_limit: float = 10.0 + seed: int | None = 43 + reward_limit: float | None = None + dt: float = 1.0 + discrete: dict[str, tuple[float | int, ...]] | str = "none" + reward_fac: RewardConfig 
| None = None + continuous_actions: bool = False + discount: float = 0.8 + # agent + learning_rate: float = 0.1 + epsilon_decay: float = 1e-4 + final_epsilon: float = 0.1 + discount_factor: float = 0.95 + # additional for do_episodes file: str | None = None use_file: str = "r" episodes: int = 10000 steps: int = 1000 - dt: float = 1.0 - rc: RewardConfig | None = None - r_limit: float | None = None - discount: float = 0.8 - seed: int = 1 strategy: str = "default" - lr: float = 0.1 eps: float = 1e-10 - if rc is None: - rc = RewardConfig(energy=1.0, positional=1.0, crane_velocity=0.5) + if reward_fac is None: + reward_fac = RewardConfig(energy=1.0, positional=1.0, crane_velocity=0.5) -def do_use(conf: Config | dict[str, Any] | None = None) -> None: +def do_use(conf: Config | dict[str, Any]) -> None: """Perform training on the (Anti-)Pendulum environment using q-learning. Args: conf: Configuration data set. See Config class for all definitions. """ - _conf = Config() if conf is None else (Config(**conf) if isinstance(conf, dict) else conf) + _e_conf = AntiPendulumConfig() # default values + e_conf = AntiPendulumConfig( + acc = conf.get('acc', _e_conf.acc), + start_speed = conf.get('start_speed', _e_conf.start_speed), + randomize_start = conf.get('randomize_start', _e_conf.randomize_start), + render_mode = conf.get('render_mode', _e_conf.render_mode), + rail_limit = conf.get('rail_limit', _e_conf.rail_limit), + seed = conf.get('seed', _e_conf.seed), + reward_limit = conf.get('reward_limit', _e_conf.reward_limit), + dt = conf.get('dt', _e_conf.dt), + discrete = conf.get('discrete', _e_conf.discrete), + reward_fac = conf.get('reward_fac', _e_conf.reward_fac), + continuous_actions = conf.get('continuous_actions', _e_conf.continuous_actions), + length = conf.get('length', _e_conf.length), + q_factor = conf.get('q_factor', _e_conf.q_factor), + ) env = AntiPendulumEnv( build_crane, - conf=AntiPendulumConfig( - start_speed=_conf.v0, - randomize_start=_conf.randomize_start, - 
seed=_conf.seed, - dt=_conf.dt, - render_mode=_conf.render, - discrete=_conf.discretization, - reward_fac=_conf.rc, - reward_limit=_conf.r_limit, - discount=_conf.discount, - ), + conf = e_conf ) - - filename = Path(_conf.file) if _conf.file is not None else None + _a_conf = QLearningConfig() # default values + a_conf = QLearningConfig( + learning_rate = conf.get('learning_rate', _a_conf.learning_rate), + epsilon_decay = conf.get('epsilon_decay', _a_conf.epsilon_decay), + final_epsilon = conf.get('final_epsilon', _a_conf.final_epsilon), + discount_factor = conf.get('discount_factor', _a_conf.discount_factor), + ) + filename = conf.get('file', None) if filename is not None: - filename.parent.mkdir(parents=True, exist_ok=True) - agent = QLearningAgent(env, filename=filename, use_file=_conf.use_file, strategy=_conf.strategy) + Path(filename).parent.mkdir(parents=True, exist_ok=True) + agent = QLearningAgent(env, + conf = a_conf, + filename = filename, + use_file = conf.get('use_file', 'w'), + strategy = conf.get('strategy', 'default')) LOGGER.info(f"DISCRETE: {agent.env.discrete}") - agent.do_episodes(n_episodes=_conf.episodes, max_steps=_conf.steps, show=0) + agent.do_episodes(n_episodes=conf.get("episodes", 10), max_steps=conf.get("steps", 1000), show=0) if filename is not None and "w" in agent.use_file: LOGGER.info(f"Model saved to {filename}") -def simple_env(episodes: int, render: str, file: str, use: str, r_limit: float | None, steps: int) -> None: +def simple_env(episodes: int, render_mode: str, file: str, use: str, reward_limit: float | None, steps: int) -> None: """Define a SimpleTest environment. 
Args: episodes: number of episodes - render: render mode + render_mode: render mode file: Optional definition of model-save file use: How 'file' is used (if exists): 'r', 'w', 'rw' - r_limit: optional reward limit + reward_limit: optional reward limit steps: number of steps per episodes (if not terminated or truncated) """ env = SimpleTestEnv( reward_fac=(1.0, 1.0), - reward_limit=r_limit, + reward_limit=reward_limit, dt=1.0, - render_mode=render, + render_mode=render_mode, ) - agent = QLearningAgent(env, filename=Path(file), use_file=use) + agent = QLearningAgent(env, filename=file, use_file=use) agent.do_episodes(n_episodes=episodes, max_steps=steps) @@ -125,24 +148,31 @@ def update_conf(conf: dict["str", Any], updates: dict["str", Any]) -> dict["str" if __name__ == "__main__": # ruff: disable[ERA001] ## we intentionally work with commenting out lines here - # do_use( v0, render, file, use_file, episodes, steps, rc, reward, s, seed, ) + # do_use( start_speed, render_mode, file, use_file, episodes, steps, reward_fac, reward, s, seed, ) ## Anti-pendulum training and results: conf1 = { - "discretization": "phase", - "v0": 2.0, - "render": "data", - "file": MODELS / "q_anti-pendulum_2.json", + "discrete": "phase", + "start_speed": 2.0, + "randomize_start":False, + "render_mode": "data", + "file": MODELS / "q_anti-pendulum1.json", "use_file": "rw", - "episodes": 3000, - "r_limit": -0.1, + "steps":1000, + "episodes": 50000, + "reward_fac": RewardConfig(energy=1.0, positional=1.0, crane_velocity=0.5), + "reward_limit": -0.001, "seed": 43, + "q_factor":500, } - # do_use(conf1) - do_use(update_conf(conf1, {"use_file": "r", "episodes": 10, "render": "plot"})) + #do_use(conf1) + #do_use(update_conf(conf1, {"use_file": "r", "episodes": 10, "render_mode": "plot"})) + conf2 = update_conf( conf1, {'file':MODELS / "q_anti-pendulum2.json", 'randomize_start':True}) + #do_use(conf2) + do_use(update_conf(conf1, {"use_file": "r", "episodes": 10, "render_mode": "plot", })) ##
Pendulum training and results: - # conf0 = update_conf(conf1, {'v0':0.0,'file':MODELS / "q_pendulum.json",'r_limit':1000.0}) # start a pendulum - # do_use( update_conf( conf0, {'use_file':"r", 'episodes':10,'render':'plot'})) + # conf0 = update_conf(conf1, {'start_speed':0.0,'file':MODELS / "q_pendulum.json",'reward_limit':1000.0}) # start a pendulum + # do_use( update_conf( conf0, {'use_file':"r", 'episodes':10,'render_mode':'plot'})) # do_use(conf0) - # simple_env(episodes=50000, render="none", file=models/"q_simple.json", use="w", r_limit=29.4, steps=200) - # simple_env(episodes=10, render="plot", file=models/"q_simple.json", use="r", r_limit=29.7, steps=20) + # simple_env(episodes=50000, render_mode="none", file=models/"q_simple.json", use="w", reward_limit=29.4, steps=200) + # simple_env(episodes=10, render_mode="plot", file=models/"q_simple.json", use="r", reward_limit=29.7, steps=20) # ruff: enable[ERA001] From fe59873998e8cbaa3ee112c47a12d662413e6732 Mon Sep 17 00:00:00 2001 From: Eisinger Date: Fri, 19 Jun 2026 08:03:00 +0200 Subject: [PATCH 3/5] Optimised the calculation of the reward, not taking into account rc-factors which are set to 0.0 + including t_min_crane as an optional alternative to reward calculation --- .../envs/controlled_crane_pendulum.py | 21 ++++++++++++------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/src/crane_controller/envs/controlled_crane_pendulum.py b/src/crane_controller/envs/controlled_crane_pendulum.py index 6fe9387..0a14fb7 100644 --- a/src/crane_controller/envs/controlled_crane_pendulum.py +++ b/src/crane_controller/envs/controlled_crane_pendulum.py @@ -387,14 +387,19 @@ def _get_obs(self, acc: float = 0.0) -> tuple[np.ndarray | tuple[int, ...], floa position = -abs(self.crane.position[0]) acc_penalty = -abs(acc) rc = self.reward_fac - self.reward = ( - rc.energy * energy - + rc.positional * positional - + rc.time * (-self.time) - + rc.position * position - + rc.acceleration * acc_penalty - + 
rc.crane_velocity * self.crane.velocity[0] ** 2 - ) + self.reward = rc.energy * energy + if rc.positional != 0.0: + self.reward += rc.positional * positional + if rc.time != 0.0: + self.reward += rc.time * (-self.time) + if rc.position != 0.0: + self.reward += rc.position * position + if rc.acceleration != 0.0: + self.reward += rc.acceleration * acc_penalty + if rc.crane_velocity != 0.0: + self.reward += rc.crane_velocity * self.crane.velocity[0] ** 2 + if rc.t_min_crane != 0.0: + self.reward += rc.t_min_crane * self._t_min_crane() if len(self.discrete): self.obs, truncate = self._get_discrete_obs(energy, acc) From c66d986b74013031f7e7ebec7b1bf83a8f9205c5 Mon Sep 17 00:00:00 2001 From: Eisinger Date: Fri, 19 Jun 2026 08:23:29 +0200 Subject: [PATCH 4/5] Adhere to ruff check and ruff format comments --- scripts/use_q_ide.py | 80 ++++++++++--------- .../envs/controlled_crane_pendulum.py | 22 +++-- 2 files changed, 51 insertions(+), 51 deletions(-) diff --git a/scripts/use_q_ide.py b/scripts/use_q_ide.py index 57b41d1..98b5bf8 100644 --- a/scripts/use_q_ide.py +++ b/scripts/use_q_ide.py @@ -11,7 +11,7 @@ from typing import Any from crane_controller.crane_factory import build_crane -from crane_controller.envs.controlled_crane_pendulum import AntiPendulumEnv, AntiPendulumConfig +from crane_controller.envs.controlled_crane_pendulum import AntiPendulumConfig, AntiPendulumEnv from crane_controller.envs.simple_test_env import SimpleTestEnv from crane_controller.experiment_config import RewardConfig from crane_controller.q_agent import QLearningAgent, QLearningConfig @@ -42,6 +42,7 @@ class Config: seed: optionally change the start seed """ + # AntiPendulumConfig acc: float = 0.1 start_speed: float = 1.0 @@ -77,41 +78,40 @@ def do_use(conf: Config | dict[str, Any]) -> None: Args: conf: Configuration data set. See Config class for all definitions. 
""" - _e_conf = AntiPendulumConfig() # default values + _e_conf = AntiPendulumConfig() # default values e_conf = AntiPendulumConfig( - acc = conf.get('acc', _e_conf.acc), - start_speed = conf.get('start_speed', _e_conf.start_speed), - randomize_start = conf.get('randomize_start', _e_conf.randomize_start), - render_mode = conf.get('render_mode', _e_conf.render_mode), - rail_limit = conf.get('rail_limit', _e_conf.rail_limit), - seed = conf.get('seed', _e_conf.seed), - reward_limit = conf.get('reward_limit', _e_conf.reward_limit), - dt = conf.get('dt', _e_conf.dt), - discrete = conf.get('discrete', _e_conf.discrete), - reward_fac = conf.get('reward_fac', _e_conf.reward_fac), - continuous_actions = conf.get('continuous_actions', _e_conf.continuous_actions), - length = conf.get('length', _e_conf.length), - q_factor = conf.get('q_factor', _e_conf.q_factor), - ) - env = AntiPendulumEnv( - build_crane, - conf = e_conf + acc=conf.get("acc", _e_conf.acc), + start_speed=conf.get("start_speed", _e_conf.start_speed), + randomize_start=conf.get("randomize_start", _e_conf.randomize_start), + render_mode=conf.get("render_mode", _e_conf.render_mode), + rail_limit=conf.get("rail_limit", _e_conf.rail_limit), + seed=conf.get("seed", _e_conf.seed), + reward_limit=conf.get("reward_limit", _e_conf.reward_limit), + dt=conf.get("dt", _e_conf.dt), + discrete=conf.get("discrete", _e_conf.discrete), + reward_fac=conf.get("reward_fac", _e_conf.reward_fac), + continuous_actions=conf.get("continuous_actions", _e_conf.continuous_actions), + length=conf.get("length", _e_conf.length), + q_factor=conf.get("q_factor", _e_conf.q_factor), ) - _a_conf = QLearningConfig() # default values + env = AntiPendulumEnv(build_crane, conf=e_conf) + _a_conf = QLearningConfig() # default values a_conf = QLearningConfig( - learning_rate = conf.get('learning_rate', _a_conf.learning_rate), - epsilon_decay = conf.get('epsilon_decay', _a_conf.epsilon_decay), - final_epsilon = conf.get('final_epsilon', 
_a_conf.final_epsilon), - discount_factor = conf.get('discount_factor', _a_conf.discount_factor), + learning_rate=conf.get("learning_rate", _a_conf.learning_rate), + epsilon_decay=conf.get("epsilon_decay", _a_conf.epsilon_decay), + final_epsilon=conf.get("final_epsilon", _a_conf.final_epsilon), + discount_factor=conf.get("discount_factor", _a_conf.discount_factor), ) - filename = conf.get('file', None) + filename = conf.get("file", None) if filename is not None: Path(filename).parent.mkdir(parents=True, exist_ok=True) - agent = QLearningAgent(env, - conf = a_conf, - filename = filename, - use_file = conf.get('use_file', 'w'), - strategy = conf.get('strategy', 'default')) + agent = QLearningAgent( + env, + conf=a_conf, + filename=filename, + use_file=conf.get("use_file", "w"), + strategy=conf.get("strategy", "default"), + ) LOGGER.info(f"DISCRETE: {agent.env.discrete}") agent.do_episodes(n_episodes=conf.get("episodes", 10), max_steps=conf.get("steps", 1000), show=0) if filename is not None and "w" in agent.use_file: @@ -153,24 +153,26 @@ def update_conf(conf: dict["str", Any], updates: dict["str", Any]) -> dict["str" conf1 = { "discrete": "phase", "start_speed": 2.0, - "randomize_start":False, + "randomize_start": False, "render_mode": "data", "file": MODELS / "q_anti-pendulum1.json", "use_file": "rw", - "steps":1000, + "steps": 1000, "episodes": 50000, "reward_fac": RewardConfig(energy=1.0, positional=1.0, crane_velocity=0.5), "reward_limit": -0.001, "seed": 43, - "q_factor":500, + "q_factor": 500, } - #do_use(conf1) - #do_use(update_conf(conf1, {"use_file": "r", "episodes": 10, "render_mode": "plot"})) - conf2 = update_conf( conf1, {'file':MODELS / "q_anti-pendulum2.json", 'randomize_start':True}) - #do_use(conf2) - do_use(update_conf(conf1, {"use_file": "r", "episodes": 10, "render_mode": "plot", })) + _conf1 = update_conf(conf1, {"use_file": "r", "episodes": 10, "render_mode": "plot"}) + # do_use(conf1) + do_use(_conf1) + # do_use(update_conf(conf1, {"use_file": 
"r", "episodes": 10, "render_mode": "plot"})) + # conf2 = update_conf(conf1, {"file": MODELS / "q_anti-pendulum2.json", "randomize_start": True}) + # do_use(conf2) + ## Pendulum training and results: - # conf0 = update_conf(conf1, {'start_speed':0.0,'file':MODELS / "q_pendulum.json",'reward_limit':1000.0}) # start a pendulum + # conf0 = update_conf(conf1, {'start_speed':0.0,'file':MODELS / "q_pendulum.json",'reward_limit':1000.0}) # do_use( update_conf( conf0, {'use_file':"r", 'episodes':10,'render_mode':'plot'})) # do_use(conf0) # simple_env(episodes=50000, render_mode="none", file=models/"q_simple.json", use="w", reward_limit=29.4, steps=200) diff --git a/src/crane_controller/envs/controlled_crane_pendulum.py b/src/crane_controller/envs/controlled_crane_pendulum.py index 0a14fb7..62c51ff 100644 --- a/src/crane_controller/envs/controlled_crane_pendulum.py +++ b/src/crane_controller/envs/controlled_crane_pendulum.py @@ -388,18 +388,16 @@ def _get_obs(self, acc: float = 0.0) -> tuple[np.ndarray | tuple[int, ...], floa acc_penalty = -abs(acc) rc = self.reward_fac self.reward = rc.energy * energy - if rc.positional != 0.0: - self.reward += rc.positional * positional - if rc.time != 0.0: - self.reward += rc.time * (-self.time) - if rc.position != 0.0: - self.reward += rc.position * position - if rc.acceleration != 0.0: - self.reward += rc.acceleration * acc_penalty - if rc.crane_velocity != 0.0: - self.reward += rc.crane_velocity * self.crane.velocity[0] ** 2 - if rc.t_min_crane != 0.0: - self.reward += rc.t_min_crane * self._t_min_crane() + for rc_fac, rc_base in { + rc.positional: positional, + rc.time: (-self.time), + rc.position: position, + rc.acceleration: acc_penalty, + rc.crane_velocity: self.crane.velocity[0] ** 2, + rc.t_min_crane: self._t_min_crane(), + }.items(): + if rc_fac != 0.0: + self.reward += rc_fac * rc_base if len(self.discrete): self.obs, truncate = self._get_discrete_obs(energy, acc) From 0a6089e2a6135ee8056b217edc4a2286e5f26962 Mon Sep 17 
00:00:00 2001 From: Eisinger Date: Fri, 19 Jun 2026 08:28:20 +0200 Subject: [PATCH 5/5] Remove the no-longer-needed Config dataclass from use_q_ide --- scripts/use_q_ide.py | 55 ++------------------------------------------ 1 file changed, 2 insertions(+), 53 deletions(-) diff --git a/scripts/use_q_ide.py b/scripts/use_q_ide.py index 98b5bf8..ba42d92 100644 --- a/scripts/use_q_ide.py +++ b/scripts/use_q_ide.py @@ -6,7 +6,6 @@ """ import logging -from dataclasses import dataclass from pathlib import Path from typing import Any @@ -22,57 +21,7 @@ USE_DISCRETE2 = 2 -@dataclass(kw_only=True, frozen=True, slots=True) -class Config: - """Data for experiments performed in this module. - - Args: - v0: start speed of load in x-direction. 0: Pendulum mode, >/< 0 same/random start at every episode - randomize_start: Optionally randomize the start speed within +/- v0. Default: False - render: render mode of environment - file: Optional definition of model-save file - use_file: How 'file' is used (if exists): 'r', 'w', 'rw' - episodes: nnumber of episodes run in the training - steps: number of steps per episodes (if not terminated or truncated) - dt: step-size per time step - r_fac: optional weight factors (RewardConfig) for reward - r_limit: optional reward limit - disc: discount rate of acceleration history to include in observation - learning_rate: optionally change the learning rate - seed: optionally change the start seed - - """ - - # AntiPendulumConfig - acc: float = 0.1 - start_speed: float = 1.0 - randomize_start: bool = False - render_mode: str = "none" - rail_limit: float = 10.0 - seed: int | None = 43 - reward_limit: float | None = None - dt: float = 1.0 - discrete: dict[str, tuple[float | int, ...]] | str = "none" - reward_fac: RewardConfig | None = None - continuous_actions: bool = False - discount: float = 0.8 - # agent - learning_rate: float = 0.1 - epsilon_decay: float = 1e-4 - final_epsilon: float = 0.1 - discount_factor: float = 0.95 - # additional for do_episodes -
file: str | None = None - use_file: str = "r" - episodes: int = 10000 - steps: int = 1000 - strategy: str = "default" - eps: float = 1e-10 - if reward_fac is None: - reward_fac = RewardConfig(energy=1.0, positional=1.0, crane_velocity=0.5) - - -def do_use(conf: Config | dict[str, Any]) -> None: +def do_use(conf: dict[str, Any]) -> None: """Perform training on the (Anti-)Pendulum environment using q-learning. Args: @@ -102,7 +51,7 @@ def do_use(conf: Config | dict[str, Any]) -> None: final_epsilon=conf.get("final_epsilon", _a_conf.final_epsilon), discount_factor=conf.get("discount_factor", _a_conf.discount_factor), ) - filename = conf.get("file", None) + filename = conf.get("file") if filename is not None: Path(filename).parent.mkdir(parents=True, exist_ok=True) agent = QLearningAgent(