diff --git a/diffsynth/pipelines/wan_video_new.py b/diffsynth/pipelines/wan_video_new.py
index c9342ea..141660f 100644
--- a/diffsynth/pipelines/wan_video_new.py
+++ b/diffsynth/pipelines/wan_video_new.py
@@ -1028,8 +1028,8 @@ class WanVideoUnit_S2V(PipelineUnit):
         if (inputs_shared.get("input_audio") is None and inputs_shared.get("audio_embeds") is None) or pipe.audio_encoder is None or pipe.audio_processor is None:
             return inputs_shared, inputs_posi, inputs_nega
         num_frames, height, width, tiled, tile_size, tile_stride = inputs_shared.get("num_frames"), inputs_shared.get("height"), inputs_shared.get("width"), inputs_shared.get("tiled"), inputs_shared.get("tile_size"), inputs_shared.get("tile_stride")
-        input_audio, audio_embeds, audio_sample_rate = inputs_shared.pop("input_audio"), inputs_shared.pop("audio_embeds"), inputs_shared.get("audio_sample_rate")
-        s2v_pose_video, s2v_pose_latents, motion_video = inputs_shared.pop("s2v_pose_video"), inputs_shared.pop("s2v_pose_latents"), inputs_shared.pop("motion_video")
+        input_audio, audio_embeds, audio_sample_rate = inputs_shared.pop("input_audio", None), inputs_shared.pop("audio_embeds", None), inputs_shared.get("audio_sample_rate", 16000)
+        s2v_pose_video, s2v_pose_latents, motion_video = inputs_shared.pop("s2v_pose_video", None), inputs_shared.pop("s2v_pose_latents", None), inputs_shared.pop("motion_video", None)
         audio_input_positive = self.process_audio(pipe, input_audio, audio_sample_rate, num_frames, audio_embeds=audio_embeds)
         inputs_posi.update(audio_input_positive)
 
diff --git a/diffsynth/trainers/unified_dataset.py b/diffsynth/trainers/unified_dataset.py
index feea784..c98a160 100644
--- a/diffsynth/trainers/unified_dataset.py
+++ b/diffsynth/trainers/unified_dataset.py
@@ -231,7 +231,7 @@ class LoadAudio(DataProcessingOperator):
     def __call__(self, data: str):
        import librosa
        input_audio, sample_rate = librosa.load(data, sr=self.sr)
-        return {'input_audio':input_audio, 'sample_rate':sample_rate}
+        return input_audio
 
 
 class UnifiedDataset(torch.utils.data.Dataset):
diff --git a/examples/wanvideo/model_training/train.py b/examples/wanvideo/model_training/train.py
index 010f581..643c8e2 100644
--- a/examples/wanvideo/model_training/train.py
+++ b/examples/wanvideo/model_training/train.py
@@ -54,9 +54,6 @@ class WanTrainingModule(DiffusionTrainingModule):
             "height": data["video"][0].size[1],
             "width": data["video"][0].size[0],
             "num_frames": len(data["video"]),
-            "audio_embeds":None,
-            "s2v_pose_latents":None,
-            "motion_video":None,
             # Please do not modify the following parameters
             # unless you clearly know what this will cause.
             "cfg_scale": 1,
@@ -78,9 +75,6 @@ class WanTrainingModule(DiffusionTrainingModule):
                 inputs_shared["end_image"] = data["video"][-1]
             elif extra_input == "reference_image" or extra_input == "vace_reference_image":
                 inputs_shared[extra_input] = data[extra_input][0]
-            elif extra_input == "input_audio":
-                inputs_shared['input_audio'] = data['input_audio']['input_audio']
-                inputs_shared['sample_rate'] = data['input_audio']['sample_rate']
             else:
                 inputs_shared[extra_input] = data[extra_input]
 
@@ -118,7 +112,7 @@ if __name__ == "__main__":
         ),
         special_operator_map={
             "animate_face_video": ToAbsolutePath(args.dataset_base_path) >> LoadVideo(args.num_frames, 4, 1, frame_processor=ImageCropAndResize(512, 512, None, 16, 16)),
-            'input_audio': ToAbsolutePath(args.dataset_base_path) >> LoadAudio(sr=16000),
+            "input_audio": ToAbsolutePath(args.dataset_base_path) >> LoadAudio(sr=16000),
         }
     )
     model = WanTrainingModule(
diff --git a/examples/wanvideo/model_training/validate_full/Wan2.2-S2V-14B.py b/examples/wanvideo/model_training/validate_full/Wan2.2-S2V-14B.py
index b69a575..2df08d2 100644
--- a/examples/wanvideo/model_training/validate_full/Wan2.2-S2V-14B.py
+++ b/examples/wanvideo/model_training/validate_full/Wan2.2-S2V-14B.py
@@ -50,4 +50,4 @@ video = pipe(
     s2v_pose_video=pose_video,
     num_inference_steps=40,
 )
-save_video_with_audio(video[1:], "video_pose_with_audio_full.mp4", audio_path, fps=16, quality=5)
+save_video_with_audio(video[1:], "video_pose_with_audio.mp4", audio_path, fps=16, quality=5)
diff --git a/examples/wanvideo/model_training/validate_lora/Wan2.2-S2V-14B.py b/examples/wanvideo/model_training/validate_lora/Wan2.2-S2V-14B.py
index f8245d1..a6166b9 100644
--- a/examples/wanvideo/model_training/validate_lora/Wan2.2-S2V-14B.py
+++ b/examples/wanvideo/model_training/validate_lora/Wan2.2-S2V-14B.py
@@ -6,7 +6,7 @@ from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig
 
 pipe = WanVideoPipeline.from_pretrained(
     torch_dtype=torch.bfloat16,
-    device="cuda:0",
+    device="cuda",
     model_configs=[
         ModelConfig(model_id="Wan-AI/Wan2.2-S2V-14B", origin_file_pattern="diffusion_pytorch_model*.safetensors"),
         ModelConfig(model_id="Wan-AI/Wan2.2-S2V-14B", origin_file_pattern="wav2vec2-large-xlsr-53-english/model.safetensors"),
@@ -48,4 +48,4 @@ video = pipe(
     s2v_pose_video=pose_video,
     num_inference_steps=40,
 )
-save_video_with_audio(video[1:], "video_pose_with_audio_lora.mp4", audio_path, fps=16, quality=5)
+save_video_with_audio(video[1:], "video_pose_with_audio.mp4", audio_path, fps=16, quality=5)