From 7b1fe47199e96e52aef5f32f1b0ee93343f18bf1 Mon Sep 17 00:00:00 2001 From: Artiprocher Date: Mon, 5 Feb 2024 13:59:56 +0800 Subject: [PATCH] add diffutoon editing example --- README.md | 2 +- diffsynth/pipelines/stable_diffusion_video.py | 28 ++- diffsynth/processors/sequencial_processor.py | 32 ++- ...utoon_toon_shading_with_editing_signals.py | 196 ++++++++++++++++++ 4 files changed, 250 insertions(+), 8 deletions(-) create mode 100644 examples/diffutoon_toon_shading_with_editing_signals.py diff --git a/README.md b/README.md index 20b9adf..3778c1e 100644 --- a/README.md +++ b/README.md @@ -62,7 +62,7 @@ https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/b54c05c5-d747-47 ### Example 5: Toon Shading with Editing Signals (Diffutoon) -Coming soon. +This example is implemented based on [Diffutoon](https://arxiv.org/abs/2401.16224), supporting video editing signals. See `examples/diffutoon_toon_shading_with_editing_signals.py`. https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/20528af5-5100-474a-8cdc-440b9efdd86c diff --git a/diffsynth/pipelines/stable_diffusion_video.py b/diffsynth/pipelines/stable_diffusion_video.py index 36c3127..eed4da3 100644 --- a/diffsynth/pipelines/stable_diffusion_video.py +++ b/diffsynth/pipelines/stable_diffusion_video.py @@ -4,6 +4,7 @@ from ..prompts import SDPrompter from ..schedulers import EnhancedDDIMScheduler from ..data import VideoData, save_frames, save_video from .dancer import lets_dance +from ..processors.sequencial_processor import SequencialProcessor from typing import List import torch, os, json from tqdm import tqdm @@ -251,6 +252,10 @@ class SDVideoPipeline(torch.nn.Module): # Decode image output_frames = self.decode_images(latents) + # Post-process + if smoother is not None and (num_inference_steps in smoother_progress_ids or -1 in smoother_progress_ids): + output_frames = smoother(output_frames, original_frames=input_frames) + return output_frames @@ -278,21 +283,30 @@ class 
SDVideoPipelineRunner: return model_manager, pipe - def synthesize_video(self, model_manager, pipe, seed, **pipeline_inputs): + def load_smoother(self, model_manager, smoother_configs): + smoother = SequencialProcessor.from_model_manager(model_manager, smoother_configs) + return smoother + + + def synthesize_video(self, model_manager, pipe, seed, smoother, **pipeline_inputs): torch.manual_seed(seed) if self.in_streamlit: import streamlit as st progress_bar_st = st.progress(0.0) - output_video = pipe(**pipeline_inputs, progress_bar_st=progress_bar_st) + output_video = pipe(**pipeline_inputs, smoother=smoother, progress_bar_st=progress_bar_st) progress_bar_st.progress(1.0) else: - output_video = pipe(**pipeline_inputs) + output_video = pipe(**pipeline_inputs, smoother=smoother) model_manager.to("cpu") return output_video def load_video(self, video_file, image_folder, height, width, start_frame_id, end_frame_id): video = VideoData(video_file=video_file, image_folder=image_folder, height=height, width=width) + if start_frame_id is None: + start_frame_id = 0 + if end_frame_id is None: + end_frame_id = len(video) frames = [video[i] for i in range(start_frame_id, end_frame_id)] return frames @@ -325,8 +339,14 @@ class SDVideoPipelineRunner: if self.in_streamlit: st.markdown("Loading models ...") model_manager, pipe = self.load_pipeline(**config["models"]) if self.in_streamlit: st.markdown("Loading models ... done!") + if "smoother_configs" in config: + if self.in_streamlit: st.markdown("Loading smoother ...") + smoother = self.load_smoother(model_manager, config["smoother_configs"]) + if self.in_streamlit: st.markdown("Loading smoother ... 
done!") + else: + smoother = None if self.in_streamlit: st.markdown("Synthesizing videos ...") - output_video = self.synthesize_video(model_manager, pipe, config["pipeline"]["seed"], **config["pipeline"]["pipeline_inputs"]) + output_video = self.synthesize_video(model_manager, pipe, config["pipeline"]["seed"], smoother, **config["pipeline"]["pipeline_inputs"]) if self.in_streamlit: st.markdown("Synthesizing videos ... done!") if self.in_streamlit: st.markdown("Saving videos ...") self.save_output(output_video, config["data"]["output_folder"], config["data"]["fps"], config) diff --git a/diffsynth/processors/sequencial_processor.py b/diffsynth/processors/sequencial_processor.py index 6f6b440..9b5bc94 100644 --- a/diffsynth/processors/sequencial_processor.py +++ b/diffsynth/processors/sequencial_processor.py @@ -1,15 +1,41 @@ from .base import VideoProcessor +class AutoVideoProcessor(VideoProcessor): + def __init__(self): + pass + + @staticmethod + def from_model_manager(model_manager, processor_type, **kwargs): + if processor_type == "FastBlend": + from .FastBlend import FastBlendSmoother + return FastBlendSmoother.from_model_manager(model_manager, **kwargs) + elif processor_type == "Contrast": + from .PILEditor import ContrastEditor + return ContrastEditor.from_model_manager(model_manager, **kwargs) + elif processor_type == "Sharpness": + from .PILEditor import SharpnessEditor + return SharpnessEditor.from_model_manager(model_manager, **kwargs) + elif processor_type == "RIFE": + from .RIFE import RIFESmoother + return RIFESmoother.from_model_manager(model_manager, **kwargs) + else: + raise ValueError(f"invalid processor_type: {processor_type}") + + class SequencialProcessor(VideoProcessor): def __init__(self, processors=[]): self.processors = processors @staticmethod - def from_model_manager(model_manager, **kwargs): - return SequencialProcessor(**kwargs) + def from_model_manager(model_manager, configs): + processors = [ + 
AutoVideoProcessor.from_model_manager(model_manager, config["processor_type"], **config["config"]) + for config in configs + ] + return SequencialProcessor(processors) def __call__(self, rendered_frames, **kwargs): for processor in self.processors: rendered_frames = processor(rendered_frames, **kwargs) - return rendered_frames \ No newline at end of file + return rendered_frames diff --git a/examples/diffutoon_toon_shading_with_editing_signals.py b/examples/diffutoon_toon_shading_with_editing_signals.py new file mode 100644 index 0000000..428867e --- /dev/null +++ b/examples/diffutoon_toon_shading_with_editing_signals.py @@ -0,0 +1,196 @@ +from diffsynth import SDVideoPipelineRunner +import os + + +# Download models +# `models/stable_diffusion/aingdiffusion_v12.safetensors`: [link](https://civitai.com/api/download/models/229575) +# `models/AnimateDiff/mm_sd_v15_v2.ckpt`: [link](https://huggingface.co/guoyww/animatediff/resolve/main/mm_sd_v15_v2.ckpt) +# `models/ControlNet/control_v11p_sd15_lineart.pth`: [link](https://huggingface.co/lllyasviel/ControlNet-v1-1/resolve/main/control_v11p_sd15_lineart.pth) +# `models/ControlNet/control_v11f1e_sd15_tile.pth`: [link](https://huggingface.co/lllyasviel/ControlNet-v1-1/resolve/main/control_v11f1e_sd15_tile.pth) +# `models/ControlNet/control_v11f1p_sd15_depth.pth`: [link](https://huggingface.co/lllyasviel/ControlNet-v1-1/resolve/main/control_v11f1p_sd15_depth.pth) +# `models/ControlNet/control_v11p_sd15_softedge.pth`: [link](https://huggingface.co/lllyasviel/ControlNet-v1-1/resolve/main/control_v11p_sd15_softedge.pth) +# `models/Annotators/dpt_hybrid-midas-501f0c75.pt`: [link](https://huggingface.co/lllyasviel/Annotators/resolve/main/dpt_hybrid-midas-501f0c75.pt) +# `models/Annotators/ControlNetHED.pth`: [link](https://huggingface.co/lllyasviel/Annotators/resolve/main/ControlNetHED.pth) +# `models/Annotators/sk_model.pth`: [link](https://huggingface.co/lllyasviel/Annotators/resolve/main/sk_model.pth) +# 
`models/Annotators/sk_model2.pth`: [link](https://huggingface.co/lllyasviel/Annotators/resolve/main/sk_model2.pth) +# `models/textual_inversion/verybadimagenegative_v1.3.pt`: [link](https://civitai.com/api/download/models/25820?type=Model&format=PickleTensor&size=full&fp=fp16) + +# The original video in the example is https://www.bilibili.com/video/BV1zu4y1s7Ec/. + +config_stage_1 = { + "models": { + "model_list": [ + "models/stable_diffusion/aingdiffusion_v12.safetensors", + "models/ControlNet/control_v11p_sd15_softedge.pth", + "models/ControlNet/control_v11f1p_sd15_depth.pth" + ], + "textual_inversion_folder": "models/textual_inversion", + "device": "cuda", + "lora_alphas": [], + "controlnet_units": [ + { + "processor_id": "softedge", + "model_path": "models/ControlNet/control_v11p_sd15_softedge.pth", + "scale": 0.5 + }, + { + "processor_id": "depth", + "model_path": "models/ControlNet/control_v11f1p_sd15_depth.pth", + "scale": 0.5 + } + ] + }, + "data": { + "input_frames": { + "video_file": "data/examples/diffutoon_edit/input_video.mp4", + "image_folder": None, + "height": 512, + "width": 512, + "start_frame_id": 0, + "end_frame_id": 30 + }, + "controlnet_frames": [ + { + "video_file": "data/examples/diffutoon_edit/input_video.mp4", + "image_folder": None, + "height": 512, + "width": 512, + "start_frame_id": 0, + "end_frame_id": 30 + }, + { + "video_file": "data/examples/diffutoon_edit/input_video.mp4", + "image_folder": None, + "height": 512, + "width": 512, + "start_frame_id": 0, + "end_frame_id": 30 + } + ], + "output_folder": "data/examples/diffutoon_edit/color_video", + "fps": 25 + }, + "smoother_configs": [ + { + "processor_type": "FastBlend", + "config": {} + } + ], + "pipeline": { + "seed": 0, + "pipeline_inputs": { + "prompt": "best quality, perfect anime illustration, orange clothes, night, a girl is dancing, smile, solo, black silk stockings", + "negative_prompt": "verybadimagenegative_v1.3", + "cfg_scale": 7.0, + "clip_skip": 1, + 
"denoising_strength": 0.9, + "num_inference_steps": 20, + "animatediff_batch_size": 8, + "animatediff_stride": 4, + "unet_batch_size": 8, + "controlnet_batch_size": 8, + "cross_frame_attention": True, + "smoother_progress_ids": [-1], + # The following parameters will be overwritten. You don't need to modify them. + "input_frames": [], + "num_frames": 30, + "width": 512, + "height": 512, + "controlnet_frames": [] + } + } +} + + +config_stage_2 = { + "models": { + "model_list": [ + "models/stable_diffusion/aingdiffusion_v12.safetensors", + "models/AnimateDiff/mm_sd_v15_v2.ckpt", + "models/ControlNet/control_v11f1e_sd15_tile.pth", + "models/ControlNet/control_v11p_sd15_lineart.pth" + ], + "textual_inversion_folder": "models/textual_inversion", + "device": "cuda", + "lora_alphas": [], + "controlnet_units": [ + { + "processor_id": "tile", + "model_path": "models/ControlNet/control_v11f1e_sd15_tile.pth", + "scale": 0.5 + }, + { + "processor_id": "lineart", + "model_path": "models/ControlNet/control_v11p_sd15_lineart.pth", + "scale": 0.5 + } + ] + }, + "data": { + "input_frames": { + "video_file": "data/examples/diffutoon_edit/input_video.mp4", + "image_folder": None, + "height": 1536, + "width": 1536, + "start_frame_id": 0, + "end_frame_id": 30 + }, + "controlnet_frames": [ + { + "video_file": "data/examples/diffutoon_edit/input_video.mp4", + "image_folder": None, + "height": 1536, + "width": 1536, + "start_frame_id": 0, + "end_frame_id": 30 + }, + { + "video_file": "data/examples/diffutoon_edit/input_video.mp4", + "image_folder": None, + "height": 1536, + "width": 1536, + "start_frame_id": 0, + "end_frame_id": 30 + } + ], + "output_folder": "data/examples/diffutoon_edit/output", + "fps": 30 + }, + "pipeline": { + "seed": 0, + "pipeline_inputs": { + "prompt": "best quality, perfect anime illustration, light, a girl is dancing, smile, solo", + "negative_prompt": "verybadimagenegative_v1.3", + "cfg_scale": 7.0, + "clip_skip": 2, + "denoising_strength": 1.0, + 
"num_inference_steps": 10, + "animatediff_batch_size": 16, + "animatediff_stride": 8, + "unet_batch_size": 1, + "controlnet_batch_size": 1, + "cross_frame_attention": False, + # The following parameters will be overwritten. You don't need to modify them. + "input_frames": [], + "num_frames": 30, + "width": 1536, + "height": 1536, + "controlnet_frames": [] + } + } +} + + +runner = SDVideoPipelineRunner() +runner.run(config_stage_1) + +# Replace the color video with the synthesized video +config_stage_2["data"]["controlnet_frames"][0] = { + "video_file": os.path.join(config_stage_1["data"]["output_folder"], "video.mp4"), + "image_folder": None, + "height": config_stage_2["data"]["input_frames"]["height"], + "width": config_stage_2["data"]["input_frames"]["width"], + "start_frame_id": None, + "end_frame_id": None +} +runner.run(config_stage_2)