 from tqdm import tqdm


-def mock_wrapper(f):
-    import sys
+class MockCtx(object):
+    def __enter__(self):
+        flow.mock_torch.enable(lazy=True)

-    flow.mock_torch.enable(lazy=True)
-    ret = f()
-    flow.mock_torch.disable()
-    # TODO: this trick of py mod purging will be removed
-    tmp = sys.modules.copy()
-    for x in tmp:
-        if x.startswith("diffusers"):
-            del sys.modules[x]
-    return ret
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        flow.mock_torch.disable()


-class UNetGraph(flow.nn.Graph):
+def get_unet(token):
+    from diffusers import UNet2DConditionModel
+
+    unet = UNet2DConditionModel.from_pretrained(
+        "runwayml/stable-diffusion-v1-5",
+        use_auth_token=token,
+        revision="fp16",
+        torch_dtype=flow.float16,
+        subfolder="unet",
+    )
+    with flow.no_grad():
+        unet = unet.to("cuda")
+    return unet
+
+
+class UNetGraphWithCache(flow.nn.Graph):
+    @flow.nn.Graph.with_dynamic_input_shape(size=9)
     def __init__(self, unet):
-        super().__init__()
+        super().__init__(enable_get_runtime_state_dict=True)
         self.unet = unet
         self.config.enable_cudnn_conv_heuristic_search_algo(False)
         self.config.allow_fuse_add_to_output(True)
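For readers unfamiliar with OneFlow's torch mocking: inside a `MockCtx` block, `flow.mock_torch.enable(lazy=True)` makes subsequent `import torch` statements resolve to OneFlow, so the stock diffusers `UNet2DConditionModel` above is built directly on OneFlow tensors, and `__exit__` restores the real torch. A minimal usage sketch (not part of the diff, assuming `flow.mock_torch` behaves as it is used here):

```python
import oneflow as flow


class MockCtx(object):
    def __enter__(self):
        flow.mock_torch.enable(lazy=True)

    def __exit__(self, exc_type, exc_val, exc_tb):
        flow.mock_torch.disable()


with MockCtx():
    # Inside the context, "torch" is mocked by OneFlow, so torch-style
    # model code (e.g. diffusers) produces OneFlow tensors and modules.
    import torch

    x = torch.ones(2, 2)  # under the mock this is a flow.Tensor

print(type(x))
```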
@@ -51,58 +61,108 @@ def build(self, latent_model_input, t, text_embeddings):
             latent_model_input, t, encoder_hidden_states=text_embeddings
         ).sample

+    def warmup_with_arg(self, arg_meta_of_sizes):
+        for arg_metas in arg_meta_of_sizes:
+            print(f"warmup {arg_metas=}")
+            arg_tensors = [flow.empty(a[0], dtype=a[1]).to("cuda") for a in arg_metas]
+            self(*arg_tensors)  # build and warmup

-def get_graph(token):
-    from diffusers import UNet2DConditionModel
+    def warmup_with_load(self, file_path):
+        state_dict = flow.load(file_path)
+        self.load_runtime_state_dict(state_dict)

-    with flow.no_grad():
-        unet = UNet2DConditionModel.from_pretrained(
-            "runwayml/stable-diffusion-v1-5",
-            use_auth_token=token,
-            revision="fp16",
-            torch_dtype=flow.float16,
-            subfolder="unet",
-        )
-        unet = unet.to("cuda")
-        return UNetGraph(unet)
+    def save_graph(self, file_path):
+        state_dict = self.runtime_state_dict()
+        flow.save(state_dict, file_path)
+
+
+def image_dim(i):
+    return 768 + 128 * i
+
+
+def noise_shape(batch_size, num_channels, image_w, image_h):
+    sizes = (image_w // 8, image_h // 8)
+    return (batch_size, num_channels) + sizes
+
+
+def get_arg_meta_of_sizes(batch_sizes, resolution_scales, num_channels):
+    return [
+        [
+            (
+                noise_shape(batch_size, num_channels, image_dim(i), image_dim(j)),
+                flow.float16,
+            ),
+            ((1,), flow.int64),
+            ((batch_size, 77, 768), flow.float16),
+        ]
+        for batch_size in batch_sizes
+        for i in resolution_scales
+        for j in resolution_scales
+    ]


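To make the warmup grid concrete (this note and the snippet below are not part of the diff): with the defaults used in `benchmark` below, `BATCH_SIZES = [2]` and `RESOLUTION_SCALES = [2, 1, 0]`, `image_dim` maps the scales to 1024, 896 and 768 pixels, and `noise_shape` divides by 8, so warmup covers nine latent shapes from `(2, 4, 128, 128)` down to `(2, 4, 96, 96)`, which presumably is why the graph is declared with `with_dynamic_input_shape(size=9)`.

```python
# Standalone sanity check, assuming the same helpers as in the diff above.
def image_dim(i):
    return 768 + 128 * i


def noise_shape(batch_size, num_channels, image_w, image_h):
    return (batch_size, num_channels) + (image_w // 8, image_h // 8)


shapes = [
    noise_shape(2, 4, image_dim(i), image_dim(j))
    for i in [2, 1, 0]
    for j in [2, 1, 0]
]
print(len(shapes))            # 9 -> matches with_dynamic_input_shape(size=9)
print(shapes[0], shapes[-1])  # (2, 4, 128, 128) (2, 4, 96, 96)
```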
 @click.command()
 @click.option("--token")
-@click.option("--repeat", default=1000)
+@click.option("--repeat", default=100)
 @click.option("--sync_interval", default=50)
-def benchmark(token, repeat, sync_interval):
+@click.option("--save", is_flag=True)
+@click.option("--load", is_flag=True)
+@click.option("--file", type=str, default="./unet_graphs")
+def benchmark(token, repeat, sync_interval, save, load, file):
+    RESOLUTION_SCALES = [2, 1, 0]
+    BATCH_SIZES = [2]
+    # TODO: reproduce bug caused by changing batch
+    # BATCH_SIZES = [4, 2]
+
     # create a mocked unet graph
-    unet_graph = mock_wrapper(lambda: get_graph(token))
+    num_channels = 4
+
+    warmup_meta_of_sizes = get_arg_meta_of_sizes(BATCH_SIZES, RESOLUTION_SCALES, num_channels)
+    for (i, m) in enumerate(warmup_meta_of_sizes):
+        print(f"warmup case #{i + 1}:", m)
+    with MockCtx():
+        unet = get_unet(token)
+        unet_graph = UNetGraphWithCache(unet)
+        if load == True:
+            print("loading graphs...")
+            unet_graph.warmup_with_load(file)
+        else:
+            print("warmup with arguments...")
+            unet_graph.warmup_with_arg(warmup_meta_of_sizes)

     # generate inputs with torch
     from diffusers.utils import floats_tensor
     import torch

-    batch_size = 2
-    num_channels = 4
-    sizes = (64, 64)
-    noise = (
-        floats_tensor((batch_size, num_channels) + sizes).to("cuda").to(torch.float16)
-    )
-    print(f"{type(noise)=}")
     time_step = torch.tensor([10]).to("cuda")
-    encoder_hidden_states = (
-        floats_tensor((batch_size, 77, 768)).to("cuda").to(torch.float16)
-    )
-
-    # convert to oneflow tensors
-    [noise, time_step, encoder_hidden_states] = [
-        flow.utils.tensor.from_torch(x)
-        for x in [noise, time_step, encoder_hidden_states]
+    encoder_hidden_states_of_sizes = {
+        batch_size: floats_tensor((batch_size, 77, 768)).to("cuda").to(torch.float16)
+        for batch_size in BATCH_SIZES
+    }
+    noise_of_sizes = [
+        floats_tensor(noise_shape(batch_size, num_channels, image_dim(i), image_dim(j)))
+        .to("cuda")
+        .to(torch.float16)
+        for batch_size in BATCH_SIZES
+        for i in RESOLUTION_SCALES
+        for j in RESOLUTION_SCALES
     ]
-    unet_graph(noise, time_step, encoder_hidden_states)
+    noise_of_sizes = [flow.utils.tensor.from_torch(x) for x in noise_of_sizes]
+    encoder_hidden_states_of_sizes = {
+        k: flow.utils.tensor.from_torch(v) for k, v in encoder_hidden_states_of_sizes.items()
+    }
+    # convert to oneflow tensors
+    time_step = flow.utils.tensor.from_torch(time_step)

     flow._oneflow_internal.eager.Sync()
     import time

     t0 = time.time()
     for r in tqdm(range(repeat)):
+        import random
+
+        noise = random.choice(noise_of_sizes)
+        encoder_hidden_states = encoder_hidden_states_of_sizes[noise.shape[0]]
         out = unet_graph(noise, time_step, encoder_hidden_states)
         # convert to torch tensors
         out = flow.utils.tensor.to_torch(out)
@@ -116,6 +176,10 @@ def benchmark(token, repeat, sync_interval):
         f"Finish {repeat} steps in {duration:.3f} seconds, average {throughput:.2f} it/s"
     )

+    if save:
+        print("saving graphs...")
+        unet_graph.save_graph(file)
+

 if __name__ == "__main__":
     print(f"{flow.__path__=}")
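Presumably the intended workflow with the new flags is: run the benchmark once with `--save` so the graphs compiled for all nine warmup shapes are persisted via `runtime_state_dict()` to the `--file` path (default `./unet_graphs`), then start subsequent runs with `--load` so `warmup_with_load` restores the cached graphs instead of recompiling, e.g. `python <this script> --token <HF_TOKEN> --save` followed by `python <this script> --token <HF_TOKEN> --load`.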