Mirror of https://github.com/modelscope/DiffSynth-Studio.git (synced 2026-03-18 22:08:13 +00:00)
Commit: update Wan2.2-S2V training
This commit is contained in:
@@ -1028,8 +1028,8 @@ class WanVideoUnit_S2V(PipelineUnit):
|
||||
if (inputs_shared.get("input_audio") is None and inputs_shared.get("audio_embeds") is None) or pipe.audio_encoder is None or pipe.audio_processor is None:
|
||||
return inputs_shared, inputs_posi, inputs_nega
|
||||
num_frames, height, width, tiled, tile_size, tile_stride = inputs_shared.get("num_frames"), inputs_shared.get("height"), inputs_shared.get("width"), inputs_shared.get("tiled"), inputs_shared.get("tile_size"), inputs_shared.get("tile_stride")
|
||||
input_audio, audio_embeds, audio_sample_rate = inputs_shared.pop("input_audio"), inputs_shared.pop("audio_embeds"), inputs_shared.get("audio_sample_rate")
|
||||
s2v_pose_video, s2v_pose_latents, motion_video = inputs_shared.pop("s2v_pose_video"), inputs_shared.pop("s2v_pose_latents"), inputs_shared.pop("motion_video")
|
||||
input_audio, audio_embeds, audio_sample_rate = inputs_shared.pop("input_audio", None), inputs_shared.pop("audio_embeds", None), inputs_shared.get("audio_sample_rate", 16000)
|
||||
s2v_pose_video, s2v_pose_latents, motion_video = inputs_shared.pop("s2v_pose_video", None), inputs_shared.pop("s2v_pose_latents", None), inputs_shared.pop("motion_video", None)
|
||||
|
||||
audio_input_positive = self.process_audio(pipe, input_audio, audio_sample_rate, num_frames, audio_embeds=audio_embeds)
|
||||
inputs_posi.update(audio_input_positive)
|
||||
|
||||
@@ -231,7 +231,7 @@ class LoadAudio(DataProcessingOperator):
|
||||
def __call__(self, data: str):
|
||||
import librosa
|
||||
input_audio, sample_rate = librosa.load(data, sr=self.sr)
|
||||
return {'input_audio':input_audio, 'sample_rate':sample_rate}
|
||||
return input_audio
|
||||
|
||||
|
||||
class UnifiedDataset(torch.utils.data.Dataset):
|
||||
|
||||
@@ -54,9 +54,6 @@ class WanTrainingModule(DiffusionTrainingModule):
|
||||
"height": data["video"][0].size[1],
|
||||
"width": data["video"][0].size[0],
|
||||
"num_frames": len(data["video"]),
|
||||
"audio_embeds":None,
|
||||
"s2v_pose_latents":None,
|
||||
"motion_video":None,
|
||||
# Please do not modify the following parameters
|
||||
# unless you clearly know what this will cause.
|
||||
"cfg_scale": 1,
|
||||
@@ -78,9 +75,6 @@ class WanTrainingModule(DiffusionTrainingModule):
|
||||
inputs_shared["end_image"] = data["video"][-1]
|
||||
elif extra_input == "reference_image" or extra_input == "vace_reference_image":
|
||||
inputs_shared[extra_input] = data[extra_input][0]
|
||||
elif extra_input == "input_audio":
|
||||
inputs_shared['input_audio'] = data['input_audio']['input_audio']
|
||||
inputs_shared['sample_rate'] = data['input_audio']['sample_rate']
|
||||
else:
|
||||
inputs_shared[extra_input] = data[extra_input]
|
||||
|
||||
@@ -118,7 +112,7 @@ if __name__ == "__main__":
|
||||
),
|
||||
special_operator_map={
|
||||
"animate_face_video": ToAbsolutePath(args.dataset_base_path) >> LoadVideo(args.num_frames, 4, 1, frame_processor=ImageCropAndResize(512, 512, None, 16, 16)),
|
||||
'input_audio': ToAbsolutePath(args.dataset_base_path) >> LoadAudio(sr=16000),
|
||||
"input_audio": ToAbsolutePath(args.dataset_base_path) >> LoadAudio(sr=16000),
|
||||
}
|
||||
)
|
||||
model = WanTrainingModule(
|
||||
|
||||
@@ -50,4 +50,4 @@ video = pipe(
|
||||
s2v_pose_video=pose_video,
|
||||
num_inference_steps=40,
|
||||
)
|
||||
save_video_with_audio(video[1:], "video_pose_with_audio_full.mp4", audio_path, fps=16, quality=5)
|
||||
save_video_with_audio(video[1:], "video_pose_with_audio.mp4", audio_path, fps=16, quality=5)
|
||||
|
||||
@@ -6,7 +6,7 @@ from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig
|
||||
|
||||
pipe = WanVideoPipeline.from_pretrained(
|
||||
torch_dtype=torch.bfloat16,
|
||||
device="cuda:0",
|
||||
device="cuda",
|
||||
model_configs=[
|
||||
ModelConfig(model_id="Wan-AI/Wan2.2-S2V-14B", origin_file_pattern="diffusion_pytorch_model*.safetensors"),
|
||||
ModelConfig(model_id="Wan-AI/Wan2.2-S2V-14B", origin_file_pattern="wav2vec2-large-xlsr-53-english/model.safetensors"),
|
||||
@@ -48,4 +48,4 @@ video = pipe(
|
||||
s2v_pose_video=pose_video,
|
||||
num_inference_steps=40,
|
||||
)
|
||||
save_video_with_audio(video[1:], "video_pose_with_audio_lora.mp4", audio_path, fps=16, quality=5)
|
||||
save_video_with_audio(video[1:], "video_pose_with_audio.mp4", audio_path, fps=16, quality=5)
|
||||
|
||||
Reference in New Issue
Block a user