mirror of
https://github.com/modelscope/DiffSynth-Studio.git
synced 2026-03-24 18:28:10 +00:00
Merge remote-tracking branch 'upstream/main'
This commit is contained in:
@@ -17,6 +17,7 @@ DiffSynth Studio is a Diffusion engine. We have restructured architectures inclu
|
|||||||
|
|
||||||
So far, DiffSynth Studio supports the following models:
|
So far, DiffSynth Studio supports the following models:
|
||||||
|
|
||||||
|
* [Wan-Video](https://github.com/Wan-Video/Wan2.1)
|
||||||
* [StepVideo](https://github.com/stepfun-ai/Step-Video-T2V)
|
* [StepVideo](https://github.com/stepfun-ai/Step-Video-T2V)
|
||||||
* [HunyuanVideo](https://github.com/Tencent/HunyuanVideo)
|
* [HunyuanVideo](https://github.com/Tencent/HunyuanVideo)
|
||||||
* [CogVideoX](https://huggingface.co/THUDM/CogVideoX-5b)
|
* [CogVideoX](https://huggingface.co/THUDM/CogVideoX-5b)
|
||||||
@@ -36,7 +37,7 @@ Until now, DiffSynth Studio has supported the following models:
|
|||||||
|
|
||||||
## News
|
## News
|
||||||
|
|
||||||
- **February 25, 2025** We support Wan-Video, a collection of video synthesis models open-sourced by Alibaba. See [./examples/wanvideo/](./examples/wanvideo/).
|
- **February 25, 2025** We support Wan-Video, a collection of SOTA video synthesis models open-sourced by Alibaba. See [./examples/wanvideo/](./examples/wanvideo/).
|
||||||
|
|
||||||
- **February 17, 2025** We support [StepVideo](https://modelscope.cn/models/stepfun-ai/stepvideo-t2v/summary), a state-of-the-art video synthesis model! See [./examples/stepvideo](./examples/stepvideo/).
|
- **February 17, 2025** We support [StepVideo](https://modelscope.cn/models/stepfun-ai/stepvideo-t2v/summary), a state-of-the-art video synthesis model! See [./examples/stepvideo](./examples/stepvideo/).
|
||||||
|
|
||||||
|
|||||||
@@ -95,7 +95,7 @@ class WanVideoPipeline(BasePipeline):
|
|||||||
offload_dtype=dtype,
|
offload_dtype=dtype,
|
||||||
offload_device="cpu",
|
offload_device="cpu",
|
||||||
onload_dtype=dtype,
|
onload_dtype=dtype,
|
||||||
onload_device="cpu",
|
onload_device=self.device,
|
||||||
computation_dtype=self.torch_dtype,
|
computation_dtype=self.torch_dtype,
|
||||||
computation_device=self.device,
|
computation_device=self.device,
|
||||||
),
|
),
|
||||||
@@ -203,8 +203,8 @@ class WanVideoPipeline(BasePipeline):
|
|||||||
cfg_scale=5.0,
|
cfg_scale=5.0,
|
||||||
num_inference_steps=50,
|
num_inference_steps=50,
|
||||||
tiled=True,
|
tiled=True,
|
||||||
tile_size=(34, 34),
|
tile_size=(30, 52),
|
||||||
tile_stride=(18, 16),
|
tile_stride=(15, 26),
|
||||||
progress_bar_cmd=tqdm,
|
progress_bar_cmd=tqdm,
|
||||||
progress_bar_st=None,
|
progress_bar_st=None,
|
||||||
):
|
):
|
||||||
|
|||||||
@@ -2,6 +2,14 @@
|
|||||||
|
|
||||||
Wan-Video is a collection of video synthesis models open-sourced by Alibaba.
|
Wan-Video is a collection of video synthesis models open-sourced by Alibaba.
|
||||||
|
|
||||||
|
Before using this model, please install DiffSynth-Studio from **source code**.
|
||||||
|
|
||||||
|
```shell
|
||||||
|
git clone https://github.com/modelscope/DiffSynth-Studio.git
|
||||||
|
cd DiffSynth-Studio
|
||||||
|
pip install -e .
|
||||||
|
```
|
||||||
|
|
||||||
## Inference
|
## Inference
|
||||||
|
|
||||||
### Wan-Video-1.3B-T2V
|
### Wan-Video-1.3B-T2V
|
||||||
|
|||||||
@@ -23,8 +23,8 @@ class TextVideoDataset(torch.utils.data.Dataset):
|
|||||||
self.width = width
|
self.width = width
|
||||||
|
|
||||||
self.frame_process = v2.Compose([
|
self.frame_process = v2.Compose([
|
||||||
v2.Resize(size=(height, width), antialias=True),
|
|
||||||
v2.CenterCrop(size=(height, width)),
|
v2.CenterCrop(size=(height, width)),
|
||||||
|
v2.Resize(size=(height, width), antialias=True),
|
||||||
v2.ToTensor(),
|
v2.ToTensor(),
|
||||||
v2.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
|
v2.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
|
||||||
])
|
])
|
||||||
@@ -68,16 +68,27 @@ class TextVideoDataset(torch.utils.data.Dataset):
|
|||||||
return frames
|
return frames
|
||||||
|
|
||||||
|
|
||||||
def load_text_video_raw_data(self, data_id):
|
def is_image(self, file_path):
|
||||||
text = self.path[data_id]
|
file_ext_name = file_path.split(".")[-1]
|
||||||
video = self.load_video(self.path[data_id])
|
if file_ext_name.lower() in ["jpg", "png", "webp"]:
|
||||||
data = {"text": text, "video": video}
|
return True
|
||||||
return data
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def load_image(self, file_path):
|
||||||
|
frame = Image.open(file_path).convert("RGB")
|
||||||
|
frame = self.crop_and_resize(frame)
|
||||||
|
frame = self.frame_process(frame)
|
||||||
|
frame = rearrange(frame, "C H W -> C 1 H W")
|
||||||
|
return frame
|
||||||
|
|
||||||
|
|
||||||
def __getitem__(self, data_id):
|
def __getitem__(self, data_id):
|
||||||
text = self.path[data_id]
|
text = self.text[data_id]
|
||||||
path = self.path[data_id]
|
path = self.path[data_id]
|
||||||
|
if self.is_image(path):
|
||||||
|
video = self.load_image(path)
|
||||||
|
else:
|
||||||
video = self.load_video(path)
|
video = self.load_video(path)
|
||||||
data = {"text": text, "video": video, "path": path}
|
data = {"text": text, "video": video, "path": path}
|
||||||
return data
|
return data
|
||||||
|
|||||||
Reference in New Issue
Block a user