Merge remote-tracking branch 'upstream/main'

2026-03-24 18:28:10 +00:00 · 2025-02-27 15:23:55 +08:00
parent 0dbb3d333f c760208614
commit 0aca943a39
4 changed files with 32 additions and 12 deletions
--- a/README.md
+++ b/README.md
@@ -17,6 +17,7 @@ DiffSynth Studio is a Diffusion engine. We have restructured architectures inclu
 Until now, DiffSynth Studio has supported the following models:
 * [Wan-Video](https://github.com/Wan-Video/Wan2.1)
 * [StepVideo](https://github.com/stepfun-ai/Step-Video-T2V)
 * [HunyuanVideo](https://github.com/Tencent/HunyuanVideo)
 * [CogVideoX](https://huggingface.co/THUDM/CogVideoX-5b)
@@ -36,7 +37,7 @@ Until now, DiffSynth Studio has supported the following models:
 ## News
- **February 25, 2025** We support Wan-Video, a collection of video synthesis models open-sourced by Alibaba. See [./examples/wanvideo/](./examples/wanvideo/).
+- **February 25, 2025** We support Wan-Video, a collection of SOTA video synthesis models open-sourced by Alibaba. See [./examples/wanvideo/](./examples/wanvideo/).
 - **February 17, 2025** We support [StepVideo](https://modelscope.cn/models/stepfun-ai/stepvideo-t2v/summary)! State-of-the-art video synthesis model! See [./examples/stepvideo](./examples/stepvideo/).
--- a/diffsynth/pipelines/wan_video.py
+++ b/diffsynth/pipelines/wan_video.py
@@ -95,7 +95,7 @@ class WanVideoPipeline(BasePipeline):
                offload_dtype=dtype,
                offload_device="cpu",
                onload_dtype=dtype,
-                onload_device="cpu",
+                onload_device=self.device,
                computation_dtype=self.torch_dtype,
                computation_device=self.device,
            ),
@@ -203,8 +203,8 @@ class WanVideoPipeline(BasePipeline):
        cfg_scale=5.0,
        num_inference_steps=50,
        tiled=True,
-        tile_size=(34, 34),
+        tile_size=(30, 52),
-        tile_stride=(18, 16),
+        tile_stride=(15, 26),
        progress_bar_cmd=tqdm,
        progress_bar_st=None,
    ):
--- a/examples/wanvideo/README.md
+++ b/examples/wanvideo/README.md
@@ -2,6 +2,14 @@
 Wan-Video is a collection of video synthesis models open-sourced by Alibaba.
 Before using this model, please install DiffSynth-Studio from **source code**.
 ```shell
 git clone https://github.com/modelscope/DiffSynth-Studio.git
 cd DiffSynth-Studio
 pip install -e .
 ```
 ## Inference
 ### Wan-Video-1.3B-T2V
--- a/examples/wanvideo/train_wan_t2v.py
+++ b/examples/wanvideo/train_wan_t2v.py
@@ -23,8 +23,8 @@ class TextVideoDataset(torch.utils.data.Dataset):
        self.width = width
        self.frame_process = v2.Compose([
            v2.Resize(size=(height, width), antialias=True),
            v2.CenterCrop(size=(height, width)),
            v2.Resize(size=(height, width), antialias=True),
            v2.ToTensor(),
            v2.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
        ])
@@ -68,16 +68,27 @@ class TextVideoDataset(torch.utils.data.Dataset):
        return frames
-    def load_text_video_raw_data(self, data_id):
+    def is_image(self, file_path):
-        text = self.path[data_id]
+        file_ext_name = file_path.split(".")[-1]
-        video = self.load_video(self.path[data_id])
+        if file_ext_name.lower() in ["jpg", "png", "webp"]:
-        data = {"text": text, "video": video}
+            return True
-        return data
+        return False
    def load_image(self, file_path):
        frame = Image.open(file_path).convert("RGB")
        frame = self.crop_and_resize(frame)
        frame = self.frame_process(frame)
        frame = rearrange(frame, "C H W -> C 1 H W")
        return frame
    def __getitem__(self, data_id):
-        text = self.path[data_id]
+        text = self.text[data_id]
        path = self.path[data_id]
        if self.is_image(path):
            video = self.load_image(path)
        else:
            video = self.load_video(path)
        data = {"text": text, "video": video, "path": path}
        return data