# Add UNet 1d for RL model for planning + colab #105
Changes from 54 commits.
#### `.gitignore`

```diff
@@ -163,4 +163,6 @@ tags
 *.lock

 # DS_Store (MacOS)
 .DS_Store
+# RL pipelines may produce mp4 outputs
+*.mp4
```
#### `examples/rl/README.md` (new file)

# Overview

These examples show how to run [Diffuser](https://arxiv.org/abs/2205.09991) in Diffusers.
There are two scripts:
1. `run_diffuser_locomotion.py` to sample actions and run them in the environment, and
2. `run_diffuser_gen_trajectories.py` to just sample actions from the pre-trained diffusion model.

You will need some RL-specific requirements to run the examples:

```
pip install -f https://download.pytorch.org/whl/torch_stable.html \
    free-mujoco-py \
    einops \
    gym \
    protobuf==3.20.1 \
    git+https://github.com/rail-berkeley/d4rl.git \
    mediapy \
    Pillow==9.0.0
```
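A quick way to verify the installation is a short import check. This is a sketch, not part of the PR; it assumes MuJoCo is set up for `free-mujoco-py` and reuses the environment name from the example scripts below:

```python
# Hypothetical smoke test for the RL requirements above -- not part of this PR.
import d4rl  # noqa: F401  (importing d4rl registers the offline RL envs with gym)
import gym

env = gym.make("hopper-medium-v2")  # environment used by the example scripts
print(env.observation_space, env.action_space)
```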
#### `examples/rl/run_diffuser_gen_trajectories.py` (new file)

```python
import d4rl  # noqa
import gym
import tqdm

from diffusers import ValueGuidedRLPipeline


# Sampling hyperparameters, kept here for reference; note that `config` is not
# passed to the pipeline call below. `n_guide_steps=0` means no value guidance.
config = dict(
    n_samples=64,
    horizon=32,
    num_inference_steps=20,
    n_guide_steps=0,
    scale_grad_by_std=True,
    scale=0.1,
    eta=0.0,
    t_grad_cutoff=2,
    device="cpu",
)


if __name__ == "__main__":
    env_name = "hopper-medium-v2"
    env = gym.make(env_name)

    pipeline = ValueGuidedRLPipeline.from_pretrained(
        "bglick13/hopper-medium-v2-value-function-hor32",
        env=env,
    )

    env.seed(0)
    obs = env.reset()
    total_reward = 0
    total_score = 0
    T = 1000
    rollout = [obs.copy()]
    try:
        for t in tqdm.tqdm(range(T)):
            # call the policy
            denorm_actions = pipeline(obs, planning_horizon=32)

            # execute action in environment
            next_observation, reward, terminal, _ = env.step(denorm_actions)
            score = env.get_normalized_score(total_reward)
            # update return
            total_reward += reward
            total_score += score
            print(
                f"Step: {t}, Reward: {reward}, Total Reward: {total_reward}, Score: {score}, Total Score:"
                f" {total_score}"
            )
            # save observations for rendering
            rollout.append(next_observation.copy())

            obs = next_observation
    except KeyboardInterrupt:
        pass

    print(f"Total reward: {total_reward}")
```
#### `examples/rl/run_diffuser_locomotion.py` (new file)

```python
import d4rl  # noqa
import gym
import tqdm

from diffusers import ValueGuidedRLPipeline


# Sampling hyperparameters, kept here for reference; note that `config` is not
# passed to the pipeline call below. Unlike the trajectory-generation script,
# this one sets `n_guide_steps=2`, i.e. value-guided sampling.
config = dict(
    n_samples=64,
    horizon=32,
    num_inference_steps=20,
    n_guide_steps=2,
    scale_grad_by_std=True,
    scale=0.1,
    eta=0.0,
    t_grad_cutoff=2,
    device="cpu",
)


if __name__ == "__main__":
    env_name = "hopper-medium-v2"
    env = gym.make(env_name)

    pipeline = ValueGuidedRLPipeline.from_pretrained(
        "bglick13/hopper-medium-v2-value-function-hor32",
        env=env,
    )

    env.seed(0)
    obs = env.reset()
    total_reward = 0
    total_score = 0
    T = 1000
    rollout = [obs.copy()]
    try:
        for t in tqdm.tqdm(range(T)):
            # call the policy
            denorm_actions = pipeline(obs, planning_horizon=32)

            # execute action in environment
            next_observation, reward, terminal, _ = env.step(denorm_actions)
            score = env.get_normalized_score(total_reward)
            # update return
            total_reward += reward
            total_score += score
            print(
                f"Step: {t}, Reward: {reward}, Total Reward: {total_reward}, Score: {score}, Total Score:"
                f" {total_score}"
            )
            # save observations for rendering
            rollout.append(next_observation.copy())

            obs = next_observation
    except KeyboardInterrupt:
        pass

    print(f"Total reward: {total_reward}")
```
#### `scripts/convert_models_diffuser_to_diffusers.py` (new file)

```python
import json
import os

import torch

from diffusers import UNet1DModel


os.makedirs("hub/hopper-medium-v2/unet/hor32", exist_ok=True)
os.makedirs("hub/hopper-medium-v2/unet/hor128", exist_ok=True)

os.makedirs("hub/hopper-medium-v2/value_function", exist_ok=True)


def unet(hor):
    if hor == 128:
        down_block_types = ("DownResnetBlock1D", "DownResnetBlock1D", "DownResnetBlock1D")
        block_out_channels = (32, 128, 256)
        up_block_types = ("UpResnetBlock1D", "UpResnetBlock1D")
    elif hor == 32:
        down_block_types = ("DownResnetBlock1D", "DownResnetBlock1D", "DownResnetBlock1D", "DownResnetBlock1D")
        block_out_channels = (32, 64, 128, 256)
        up_block_types = ("UpResnetBlock1D", "UpResnetBlock1D", "UpResnetBlock1D")
    else:
        raise ValueError(f"Unsupported horizon: {hor}")

    # Load the original Diffuser checkpoint (local path on the author's machine).
    model = torch.load(f"/Users/bglickenhaus/Documents/diffuser/temporal_unet-hopper-mediumv2-hor{hor}.torch")
    state_dict = model.state_dict()
    config = dict(
        down_block_types=down_block_types,
        block_out_channels=block_out_channels,
        up_block_types=up_block_types,
        layers_per_block=1,
    )
    hf_value_function = UNet1DModel(**config)
    print(f"length of state dict: {len(state_dict.keys())}")
    print(f"length of value function dict: {len(hf_value_function.state_dict().keys())}")

    # Rename the original keys to the diffusers naming scheme; this relies on
    # both state dicts listing their parameters in the same order.
    mapping = dict(zip(model.state_dict().keys(), hf_value_function.state_dict().keys()))
    for k, v in mapping.items():
        state_dict[v] = state_dict.pop(k)
    hf_value_function.load_state_dict(state_dict)

    torch.save(hf_value_function.state_dict(), f"hub/hopper-medium-v2/unet/hor{hor}/diffusion_pytorch_model.bin")
    with open(f"hub/hopper-medium-v2/unet/hor{hor}/config.json", "w") as f:
        json.dump(config, f)


def value_function():
    config = dict(
        in_channels=14,
        down_block_types=("DownResnetBlock1D", "DownResnetBlock1D", "DownResnetBlock1D", "DownResnetBlock1D"),
        up_block_types=(),
        out_block_type="ValueFunction",
        mid_block_type="ValueFunctionMidBlock1D",
        block_out_channels=(32, 64, 128, 256),
        layers_per_block=1,
        always_downsample=True,
    )

    # Unlike the UNet checkpoint, the value-function checkpoint is saved as a
    # plain state dict, so no .state_dict() call is needed.
    model = torch.load("/Users/bglickenhaus/Documents/diffuser/value_function-hopper-mediumv2-hor32.torch")
    state_dict = model
    hf_value_function = UNet1DModel(**config)
    print(f"length of state dict: {len(state_dict.keys())}")
    print(f"length of value function dict: {len(hf_value_function.state_dict().keys())}")

    mapping = dict(zip(state_dict.keys(), hf_value_function.state_dict().keys()))
    for k, v in mapping.items():
        state_dict[v] = state_dict.pop(k)

    hf_value_function.load_state_dict(state_dict)

    torch.save(hf_value_function.state_dict(), "hub/hopper-medium-v2/value_function/diffusion_pytorch_model.bin")
    with open("hub/hopper-medium-v2/value_function/config.json", "w") as f:
        json.dump(config, f)


if __name__ == "__main__":
    unet(32)
    # unet(128)
    value_function()
```
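Since the script writes a `config.json` next to each `diffusion_pytorch_model.bin`, the converted weights should be reloadable through the standard `from_pretrained` path. A sketch of such a sanity check (not part of the script):

```python
# Hypothetical sanity check, not part of this PR: reload the converted
# UNet from the local hub directory written above.
from diffusers import UNet1DModel

unet_hor32 = UNet1DModel.from_pretrained("hub/hopper-medium-v2/unet/hor32")
print(sum(p.numel() for p in unet_hor32.parameters()), "parameters")
```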
#### `src/diffusers/__init__.py`

```diff
@@ -17,6 +17,7 @@

 if is_torch_available():
+    from .experimental import ValueGuidedRLPipeline
     from .modeling_utils import ModelMixin
     from .models import AutoencoderKL, Transformer2DModel, UNet1DModel, UNet2DConditionModel, UNet2DModel, VQModel
     from .optimization import (
```

> **Review comment:** Good idea to add. But we should not import `from diffusers.experimental import ...`.

> **Review comment:** I like this convention.
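As the review comments note, users should import from the top-level package rather than from `diffusers.experimental`; the re-export added in the diff above makes the following one-liner work (assuming torch is installed):

```python
# The pipeline is implemented under diffusers.experimental but re-exported
# at the top level, so user code imports it directly from `diffusers`:
from diffusers import ValueGuidedRLPipeline
```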
#### `src/diffusers/experimental/README.md` (new file)

# 🧨 Diffusers Experimental

We are adding experimental code to support novel applications and usages of the Diffusers library.
Currently, the following experiments are supported:
* Reinforcement learning via an implementation of the [Diffuser](https://arxiv.org/abs/2205.09991) model.
#### `src/diffusers/experimental/__init__.py` (new file)

```python
from .rl import ValueGuidedRLPipeline
```
#### `src/diffusers/experimental/rl/__init__.py` (new file)

```python
from .value_guided_sampling import ValueGuidedRLPipeline
```
> **Review comment:** Very cool!