From 9cea10cc69a9035b8c13a4167b61cd783a6f7e17 Mon Sep 17 00:00:00 2001
From: mi804 <1576993271@qq.com>
Date: Thu, 28 Aug 2025 10:13:52 +0800
Subject: [PATCH] minor fix

---
 README.md                                     |  2 +-
 README_zh.md                                  |  2 +-
 diffsynth/models/wan_video_dit_s2v.py         |  6 +-
 examples/wanvideo/README.md                   |  2 +-
 examples/wanvideo/README_zh.md                |  2 +-
 .../model_inference/Wan2.1-S2V-14B.py         | 69 -------------------
 6 files changed, 7 insertions(+), 76 deletions(-)
 delete mode 100644 examples/wanvideo/model_inference/Wan2.1-S2V-14B.py

diff --git a/README.md b/README.md
index 127467d..ce3fea8 100644
--- a/README.md
+++ b/README.md
@@ -201,7 +201,7 @@ save_video(video, "video1.mp4", fps=15, quality=5)
 
 | Model ID | Extra Parameters | Inference | Full Training | Validate After Full Training | LoRA Training | Validate After LoRA Training |
 |-|-|-|-|-|-|-|
-|[Wan-AI/Wan2.2-S2V-14B](https://www.modelscope.cn/models/Wan-AI/Wan2.2-S2V-14B)|`input_image`, `input_audio`, `audio_sample_rate`, `s2v_pose_video`|[code](./examples/wanvideo/model_inference/Wan2.1-S2V-14B.py)|-|-|-|-|
+|[Wan-AI/Wan2.2-S2V-14B](https://www.modelscope.cn/models/Wan-AI/Wan2.2-S2V-14B)|`input_image`, `input_audio`, `audio_sample_rate`, `s2v_pose_video`|[code](./examples/wanvideo/model_inference/Wan2.2-S2V-14B.py)|-|-|-|-|
 |[Wan-AI/Wan2.2-I2V-A14B](https://modelscope.cn/models/Wan-AI/Wan2.2-I2V-A14B)|`input_image`|[code](./examples/wanvideo/model_inference/Wan2.2-I2V-A14B.py)|[code](./examples/wanvideo/model_training/full/Wan2.2-I2V-A14B.sh)|[code](./examples/wanvideo/model_training/validate_full/Wan2.2-I2V-A14B.py)|[code](./examples/wanvideo/model_training/lora/Wan2.2-I2V-A14B.sh)|[code](./examples/wanvideo/model_training/validate_lora/Wan2.2-I2V-A14B.py)|
 |[Wan-AI/Wan2.2-T2V-A14B](https://modelscope.cn/models/Wan-AI/Wan2.2-T2V-A14B)||[code](./examples/wanvideo/model_inference/Wan2.2-T2V-A14B.py)|[code](./examples/wanvideo/model_training/full/Wan2.2-T2V-A14B.sh)|[code](./examples/wanvideo/model_training/validate_full/Wan2.2-T2V-A14B.py)|[code](./examples/wanvideo/model_training/lora/Wan2.2-T2V-A14B.sh)|[code](./examples/wanvideo/model_training/validate_lora/Wan2.2-T2V-A14B.py)|
 |[Wan-AI/Wan2.2-TI2V-5B](https://modelscope.cn/models/Wan-AI/Wan2.2-TI2V-5B)|`input_image`|[code](./examples/wanvideo/model_inference/Wan2.2-TI2V-5B.py)|[code](./examples/wanvideo/model_training/full/Wan2.2-TI2V-5B.sh)|[code](./examples/wanvideo/model_training/validate_full/Wan2.2-TI2V-5B.py)|[code](./examples/wanvideo/model_training/lora/Wan2.2-TI2V-5B.sh)|[code](./examples/wanvideo/model_training/validate_lora/Wan2.2-TI2V-5B.py)|
diff --git a/README_zh.md b/README_zh.md
index ba8197f..08f62e9 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -201,7 +201,7 @@ save_video(video, "video1.mp4", fps=15, quality=5)
 
 |模型 ID|额外参数|推理|全量训练|全量训练后验证|LoRA 训练|LoRA 训练后验证|
 |-|-|-|-|-|-|-|
-|[Wan-AI/Wan2.2-S2V-14B](https://www.modelscope.cn/models/Wan-AI/Wan2.2-S2V-14B)|`input_image`, `input_audio`, `audio_sample_rate`, `s2v_pose_video`|[code](./examples/wanvideo/model_inference/Wan2.1-S2V-14B.py)|-|-|-|-|
+|[Wan-AI/Wan2.2-S2V-14B](https://www.modelscope.cn/models/Wan-AI/Wan2.2-S2V-14B)|`input_image`, `input_audio`, `audio_sample_rate`, `s2v_pose_video`|[code](./examples/wanvideo/model_inference/Wan2.2-S2V-14B.py)|-|-|-|-|
 |[Wan-AI/Wan2.2-I2V-A14B](https://modelscope.cn/models/Wan-AI/Wan2.2-I2V-A14B)|`input_image`|[code](./examples/wanvideo/model_inference/Wan2.2-I2V-A14B.py)|[code](./examples/wanvideo/model_training/full/Wan2.2-I2V-A14B.sh)|[code](./examples/wanvideo/model_training/validate_full/Wan2.2-I2V-A14B.py)|[code](./examples/wanvideo/model_training/lora/Wan2.2-I2V-A14B.sh)|[code](./examples/wanvideo/model_training/validate_lora/Wan2.2-I2V-A14B.py)|
 |[Wan-AI/Wan2.2-T2V-A14B](https://modelscope.cn/models/Wan-AI/Wan2.2-T2V-A14B)||[code](./examples/wanvideo/model_inference/Wan2.2-T2V-A14B.py)|[code](./examples/wanvideo/model_training/full/Wan2.2-T2V-A14B.sh)|[code](./examples/wanvideo/model_training/validate_full/Wan2.2-T2V-A14B.py)|[code](./examples/wanvideo/model_training/lora/Wan2.2-T2V-A14B.sh)|[code](./examples/wanvideo/model_training/validate_lora/Wan2.2-T2V-A14B.py)|
 |[Wan-AI/Wan2.2-TI2V-5B](https://modelscope.cn/models/Wan-AI/Wan2.2-TI2V-5B)|`input_image`|[code](./examples/wanvideo/model_inference/Wan2.2-TI2V-5B.py)|[code](./examples/wanvideo/model_training/full/Wan2.2-TI2V-5B.sh)|[code](./examples/wanvideo/model_training/validate_full/Wan2.2-TI2V-5B.py)|[code](./examples/wanvideo/model_training/lora/Wan2.2-TI2V-5B.sh)|[code](./examples/wanvideo/model_training/validate_lora/Wan2.2-TI2V-5B.py)|
diff --git a/diffsynth/models/wan_video_dit_s2v.py b/diffsynth/models/wan_video_dit_s2v.py
index 75b19a4..fa54591 100644
--- a/diffsynth/models/wan_video_dit_s2v.py
+++ b/diffsynth/models/wan_video_dit_s2v.py
@@ -562,7 +562,7 @@ class WanS2VModel(torch.nn.Module):
                     context,
                     t_mod,
                     seq_len_x,
-                    pre_compute_freqs,
+                    pre_compute_freqs[0],
                     use_reentrant=False,
                 )
                 x = torch.utils.checkpoint.checkpoint(
@@ -577,7 +577,7 @@
                     context,
                     t_mod,
                     seq_len_x,
-                    pre_compute_freqs,
+                    pre_compute_freqs[0],
                     use_reentrant=False,
                 )
                 x = torch.utils.checkpoint.checkpoint(
@@ -586,7 +586,7 @@
                     use_reentrant=False,
                 )
             else:
-                x = block(x, context, t_mod, seq_len_x, pre_compute_freqs)
+                x = block(x, context, t_mod, seq_len_x, pre_compute_freqs[0])
                 x = self.after_transformer_block(block_id, x, audio_emb_global, merged_audio_emb, seq_len_x)
         x = x[:, :seq_len_x]
 
diff --git a/examples/wanvideo/README.md b/examples/wanvideo/README.md
index 456d957..add9fa5 100644
--- a/examples/wanvideo/README.md
+++ b/examples/wanvideo/README.md
@@ -48,7 +48,7 @@ save_video(video, "video1.mp4", fps=15, quality=5)
 
 | Model ID | Extra Parameters | Inference | Full Training | Full Training Validation | LoRA Training | LoRA Training Validation |
 |-|-|-|-|-|-|-|
-|[Wan-AI/Wan2.2-S2V-14B](https://www.modelscope.cn/models/Wan-AI/Wan2.2-S2V-14B)|`input_image`, `input_audio`, `audio_sample_rate`, `s2v_pose_video`|[code](./model_inference/Wan2.1-S2V-14B.py)|-|-|-|-|
+|[Wan-AI/Wan2.2-S2V-14B](https://www.modelscope.cn/models/Wan-AI/Wan2.2-S2V-14B)|`input_image`, `input_audio`, `audio_sample_rate`, `s2v_pose_video`|[code](./model_inference/Wan2.2-S2V-14B.py)|-|-|-|-|
 |[Wan-AI/Wan2.2-I2V-A14B](https://modelscope.cn/models/Wan-AI/Wan2.2-I2V-A14B)|`input_image`|[code](./model_inference/Wan2.2-I2V-A14B.py)|[code](./model_training/full/Wan2.2-I2V-A14B.sh)|[code](./model_training/validate_full/Wan2.2-I2V-A14B.py)|[code](./model_training/lora/Wan2.2-I2V-A14B.sh)|[code](./model_training/validate_lora/Wan2.2-I2V-A14B.py)|
 |[Wan-AI/Wan2.2-T2V-A14B](https://modelscope.cn/models/Wan-AI/Wan2.2-T2V-A14B)||[code](./model_inference/Wan2.2-T2V-A14B.py)|[code](./model_training/full/Wan2.2-T2V-A14B.sh)|[code](./model_training/validate_full/Wan2.2-T2V-A14B.py)|[code](./model_training/lora/Wan2.2-T2V-A14B.sh)|[code](./model_training/validate_lora/Wan2.2-T2V-A14B.py)|
 |[Wan-AI/Wan2.2-TI2V-5B](https://modelscope.cn/models/Wan-AI/Wan2.2-TI2V-5B)|`input_image`|[code](./model_inference/Wan2.2-TI2V-5B.py)|[code](./model_training/full/Wan2.2-TI2V-5B.sh)|[code](./model_training/validate_full/Wan2.2-TI2V-5B.py)|[code](./model_training/lora/Wan2.2-TI2V-5B.sh)|[code](./model_training/validate_lora/Wan2.2-TI2V-5B.py)|
diff --git a/examples/wanvideo/README_zh.md b/examples/wanvideo/README_zh.md
index 1ac53ca..57a36c7 100644
--- a/examples/wanvideo/README_zh.md
+++ b/examples/wanvideo/README_zh.md
@@ -48,7 +48,7 @@ save_video(video, "video1.mp4", fps=15, quality=5)
 
 |模型 ID|额外参数|推理|全量训练|全量训练后验证|LoRA 训练|LoRA 训练后验证|
 |-|-|-|-|-|-|-|
-|[Wan-AI/Wan2.2-S2V-14B](https://www.modelscope.cn/models/Wan-AI/Wan2.2-S2V-14B)|`input_image`, `input_audio`, `audio_sample_rate`, `s2v_pose_video`|[code](./model_inference/Wan2.1-S2V-14B.py)|-|-|-|-|
+|[Wan-AI/Wan2.2-S2V-14B](https://www.modelscope.cn/models/Wan-AI/Wan2.2-S2V-14B)|`input_image`, `input_audio`, `audio_sample_rate`, `s2v_pose_video`|[code](./model_inference/Wan2.2-S2V-14B.py)|-|-|-|-|
 |[Wan-AI/Wan2.2-I2V-A14B](https://modelscope.cn/models/Wan-AI/Wan2.2-I2V-A14B)|`input_image`|[code](./model_inference/Wan2.2-I2V-A14B.py)|[code](./model_training/full/Wan2.2-I2V-A14B.sh)|[code](./model_training/validate_full/Wan2.2-I2V-A14B.py)|[code](./model_training/lora/Wan2.2-I2V-A14B.sh)|[code](./model_training/validate_lora/Wan2.2-I2V-A14B.py)|
 |[Wan-AI/Wan2.2-T2V-A14B](https://modelscope.cn/models/Wan-AI/Wan2.2-T2V-A14B)||[code](./model_inference/Wan2.2-T2V-A14B.py)|[code](./model_training/full/Wan2.2-T2V-A14B.sh)|[code](./model_training/validate_full/Wan2.2-T2V-A14B.py)|[code](./model_training/lora/Wan2.2-T2V-A14B.sh)|[code](./model_training/validate_lora/Wan2.2-T2V-A14B.py)|
 |[Wan-AI/Wan2.2-TI2V-5B](https://modelscope.cn/models/Wan-AI/Wan2.2-TI2V-5B)|`input_image`|[code](./model_inference/Wan2.2-TI2V-5B.py)|[code](./model_training/full/Wan2.2-TI2V-5B.sh)|[code](./model_training/validate_full/Wan2.2-TI2V-5B.py)|[code](./model_training/lora/Wan2.2-TI2V-5B.sh)|[code](./model_training/validate_lora/Wan2.2-TI2V-5B.py)|
diff --git a/examples/wanvideo/model_inference/Wan2.1-S2V-14B.py b/examples/wanvideo/model_inference/Wan2.1-S2V-14B.py
deleted file mode 100644
index bb93871..0000000
--- a/examples/wanvideo/model_inference/Wan2.1-S2V-14B.py
+++ /dev/null
@@ -1,69 +0,0 @@
-import torch
-from PIL import Image
-import librosa
-from diffsynth import VideoData, save_video_with_audio
-from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig
-from modelscope import dataset_snapshot_download
-
-pipe = WanVideoPipeline.from_pretrained(
-    torch_dtype=torch.bfloat16,
-    device="cuda",
-    model_configs=[
-        ModelConfig(model_id="Wan-AI/Wan2.2-S2V-14B", origin_file_pattern="diffusion_pytorch_model*.safetensors"),
-        ModelConfig(model_id="Wan-AI/Wan2.2-S2V-14B", origin_file_pattern="wav2vec2-large-xlsr-53-english/model.safetensors"),
-        ModelConfig(model_id="Wan-AI/Wan2.2-S2V-14B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"),
-        ModelConfig(model_id="Wan-AI/Wan2.2-S2V-14B", origin_file_pattern="Wan2.1_VAE.pth"),
-    ],
-    audio_processor_config=ModelConfig(model_id="Wan-AI/Wan2.2-S2V-14B", origin_file_pattern="wav2vec2-large-xlsr-53-english/"),
-)
-dataset_snapshot_download(
-    dataset_id="DiffSynth-Studio/example_video_dataset",
-    local_dir="./data/example_video_dataset",
-    allow_file_pattern=f"wans2v/*"
-)
-
-num_frames = 81 # 4n+1
-height = 448
-width = 832
-
-prompt = "a person is singing"
-negative_prompt = "画面模糊,最差质量,画面模糊,细节模糊不清,情绪激动剧烈,手快速抖动,字幕,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走"
-input_image = Image.open("data/example_video_dataset/wans2v/pose.png").convert("RGB").resize((width, height))
-# s2v audio input, recommend 16kHz sampling rate
-audio_path = 'data/example_video_dataset/wans2v/sing.MP3'
-input_audio, sample_rate = librosa.load(audio_path, sr=16000)
-
-# Speech-to-video
-video = pipe(
-    prompt=prompt,
-    input_image=input_image,
-    negative_prompt=negative_prompt,
-    seed=0,
-    num_frames=num_frames,
-    height=height,
-    width=width,
-    audio_sample_rate=sample_rate,
-    input_audio=input_audio,
-    num_inference_steps=40,
-)
-save_video_with_audio(video[1:], "video_with_audio.mp4", audio_path, fps=16, quality=5)
-
-# s2v will use the first (num_frames) frames as reference. height and width must be the same as input_image. And fps should be 16, the same as output video fps.
-pose_video_path = 'data/example_video_dataset/wans2v/pose.mp4'
-pose_video = VideoData(pose_video_path, height=height, width=width)
-
-# Speech-to-video with pose
-video = pipe(
-    prompt=prompt,
-    input_image=input_image,
-    negative_prompt=negative_prompt,
-    seed=0,
-    num_frames=num_frames,
-    height=height,
-    width=width,
-    audio_sample_rate=sample_rate,
-    input_audio=input_audio,
-    s2v_pose_video=pose_video,
-    num_inference_steps=40,
-)
-save_video_with_audio(video[1:], "video_pose_with_audio.mp4", audio_path, fps=16, quality=5)