From 7df73ceaaf0e32b266219f7fcff0dc76ef16b623 Mon Sep 17 00:00:00 2001 From: Zhongjie Duan <35051019+Artiprocher@users.noreply.github.com> Date: Wed, 26 Feb 2025 20:03:26 +0800 Subject: [PATCH 1/5] Fix Wan VAE device --- diffsynth/pipelines/wan_video.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/diffsynth/pipelines/wan_video.py b/diffsynth/pipelines/wan_video.py index f43d559..82ad76c 100644 --- a/diffsynth/pipelines/wan_video.py +++ b/diffsynth/pipelines/wan_video.py @@ -95,7 +95,7 @@ class WanVideoPipeline(BasePipeline): offload_dtype=dtype, offload_device="cpu", onload_dtype=dtype, - onload_device="cpu", + onload_device=self.device, computation_dtype=self.torch_dtype, computation_device=self.device, ), @@ -203,8 +203,8 @@ class WanVideoPipeline(BasePipeline): cfg_scale=5.0, num_inference_steps=50, tiled=True, - tile_size=(34, 34), - tile_stride=(18, 16), + tile_size=(30, 52), + tile_stride=(15, 26), progress_bar_cmd=tqdm, progress_bar_st=None, ): From 0219e8d2f32ed297db190e2c70ec5e96bc8ebbb9 Mon Sep 17 00:00:00 2001 From: Zhongjie Duan <35051019+Artiprocher@users.noreply.github.com> Date: Wed, 26 Feb 2025 22:53:07 +0800 Subject: [PATCH 2/5] Update README.md --- examples/wanvideo/README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/examples/wanvideo/README.md b/examples/wanvideo/README.md index 656d7c9..6df9b43 100644 --- a/examples/wanvideo/README.md +++ b/examples/wanvideo/README.md @@ -2,6 +2,14 @@ Wan-Video is a collection of video synthesis models open-sourced by Alibaba. +Before using this model, please install DiffSynth-Studio from **source code**. + +```shell +git clone https://github.com/modelscope/DiffSynth-Studio.git +cd DiffSynth-Studio +pip install -e . +``` + ## Inference ### Wan-Video-1.3B-T2V From 7792017a020389f7e87b4a458af596b441f0cfd1 Mon Sep 17 00:00:00 2001 From: Zhongjie Duan <35051019+Artiprocher@users.noreply.github.com> Date: Thu, 27 Feb 2025 10:52:47 +0800 Subject: [PATCH 3/5] Update README.md --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 89035f0..a5fe0e3 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,7 @@ DiffSynth Studio is a Diffusion engine. We have restructured architectures inclu Until now, DiffSynth Studio has supported the following models: +* [Wan-Video](https://github.com/Wan-Video/Wan2.1) * [StepVideo](https://github.com/stepfun-ai/Step-Video-T2V) * [HunyuanVideo](https://github.com/Tencent/HunyuanVideo) * [CogVideoX](https://huggingface.co/THUDM/CogVideoX-5b) @@ -36,7 +37,7 @@ Until now, DiffSynth Studio has supported the following models: ## News -- **February 25, 2025** We support Wan-Video, a collection of video synthesis models open-sourced by Alibaba. See [./examples/wanvideo/](./examples/wanvideo/). +- **February 25, 2025** We support Wan-Video, a collection of SOTA video synthesis models open-sourced by Alibaba. See [./examples/wanvideo/](./examples/wanvideo/). - **February 17, 2025** We support [StepVideo](https://modelscope.cn/models/stepfun-ai/stepvideo-t2v/summary)! State-of-the-art video synthesis model! See [./examples/stepvideo](./examples/stepvideo/). From 25a247dd3f8623bfba124fe16b7e70437cf4b53a Mon Sep 17 00:00:00 2001 From: Zhongjie Duan <35051019+Artiprocher@users.noreply.github.com> Date: Thu, 27 Feb 2025 11:06:10 +0800 Subject: [PATCH 4/5] bugfix --- examples/wanvideo/train_wan_t2v.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/wanvideo/train_wan_t2v.py b/examples/wanvideo/train_wan_t2v.py index 53de964..a96c0aa 100644 --- a/examples/wanvideo/train_wan_t2v.py +++ b/examples/wanvideo/train_wan_t2v.py @@ -76,7 +76,7 @@ class TextVideoDataset(torch.utils.data.Dataset): def __getitem__(self, data_id): - text = self.path[data_id] + text = self.text[data_id] path = self.path[data_id] video = self.load_video(path) data = {"text": text, "video": video, "path": path} From fad7aea58a4bc107045172d504876311fad9c934 Mon Sep 17 00:00:00 2001 From: Artiprocher Date: Thu, 27 Feb 2025 12:56:55 +0800 Subject: [PATCH 5/5] support wan image training --- examples/wanvideo/train_wan_t2v.py | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/examples/wanvideo/train_wan_t2v.py b/examples/wanvideo/train_wan_t2v.py index 53de964..817fd5c 100644 --- a/examples/wanvideo/train_wan_t2v.py +++ b/examples/wanvideo/train_wan_t2v.py @@ -23,8 +23,8 @@ class TextVideoDataset(torch.utils.data.Dataset): self.width = width self.frame_process = v2.Compose([ - v2.Resize(size=(height, width), antialias=True), v2.CenterCrop(size=(height, width)), + v2.Resize(size=(height, width), antialias=True), v2.ToTensor(), v2.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), ]) @@ -68,17 +68,28 @@ class TextVideoDataset(torch.utils.data.Dataset): return frames - def load_text_video_raw_data(self, data_id): - text = self.path[data_id] - video = self.load_video(self.path[data_id]) - data = {"text": text, "video": video} - return data + def is_image(self, file_path): + file_ext_name = file_path.split(".")[-1] + if file_ext_name.lower() in ["jpg", "png", "webp"]: + return True + return False + + + def load_image(self, file_path): + frame = Image.open(file_path).convert("RGB") + frame = self.crop_and_resize(frame) + frame = self.frame_process(frame) + frame = rearrange(frame, "C H W -> C 1 H W") + return frame def __getitem__(self, data_id): - text = self.path[data_id] + text = self.text[data_id] path = self.path[data_id] - video = self.load_video(path) + if self.is_image(path): + video = self.load_image(path) + else: + video = self.load_video(path) data = {"text": text, "video": video, "path": path} return data