mirror of
https://github.com/modelscope/DiffSynth-Studio.git
synced 2026-03-24 18:28:10 +00:00
wan-refactor
This commit is contained in:
@@ -10,4 +10,4 @@ accelerate launch examples/wanvideo/model_training/train.py \
|
||||
--remove_prefix_in_ckpt "pipe.motion_controller." \
|
||||
--output_path "./models/train/Wan2.1-1.3b-speedcontrol-v1_full" \
|
||||
--trainable_models "motion_controller" \
|
||||
--input_contains_motion_bucket_id
|
||||
--extra_inputs "motion_bucket_id"
|
||||
@@ -10,5 +10,4 @@ accelerate launch --config_file examples/wanvideo/model_training/full/accelerate
|
||||
--remove_prefix_in_ckpt "pipe.dit." \
|
||||
--output_path "./models/train/Wan2.1-FLF2V-14B-720P_full" \
|
||||
--trainable_models "dit" \
|
||||
--input_contains_input_image \
|
||||
--input_contains_end_image
|
||||
--extra_inputs "input_image,end_image"
|
||||
@@ -11,4 +11,4 @@ accelerate launch examples/wanvideo/model_training/train.py \
|
||||
--remove_prefix_in_ckpt "pipe.dit." \
|
||||
--output_path "./models/train/Wan2.1-Fun-1.3B-Control_full" \
|
||||
--trainable_models "dit" \
|
||||
--input_contains_control_video
|
||||
--extra_inputs "control_video"
|
||||
@@ -10,5 +10,4 @@ accelerate launch examples/wanvideo/model_training/train.py \
|
||||
--remove_prefix_in_ckpt "pipe.dit." \
|
||||
--output_path "./models/train/Wan2.1-Fun-1.3B-InP_full" \
|
||||
--trainable_models "dit" \
|
||||
--input_contains_input_image \
|
||||
--input_contains_end_image
|
||||
--extra_inputs "input_image,end_image"
|
||||
@@ -11,4 +11,4 @@ accelerate launch --config_file examples/wanvideo/model_training/full/accelerate
|
||||
--remove_prefix_in_ckpt "pipe.dit." \
|
||||
--output_path "./models/train/Wan2.1-Fun-14B-Control_full" \
|
||||
--trainable_models "dit" \
|
||||
--input_contains_control_video
|
||||
--extra_inputs "control_video"
|
||||
@@ -10,5 +10,4 @@ accelerate launch --config_file examples/wanvideo/model_training/full/accelerate
|
||||
--remove_prefix_in_ckpt "pipe.dit." \
|
||||
--output_path "./models/train/Wan2.1-Fun-14B-InP_full" \
|
||||
--trainable_models "dit" \
|
||||
--input_contains_input_image \
|
||||
--input_contains_end_image
|
||||
--extra_inputs "input_image,end_image"
|
||||
@@ -0,0 +1,13 @@
|
||||
accelerate launch examples/wanvideo/model_training/train.py \
|
||||
--dataset_base_path data/example_video_dataset \
|
||||
--dataset_metadata_path data/example_video_dataset/metadata_camera_control.csv \
|
||||
--height 480 \
|
||||
--width 832 \
|
||||
--dataset_repeat 100 \
|
||||
--model_id_with_origin_paths "PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera:diffusion_pytorch_model*.safetensors,PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera:models_t5_umt5-xxl-enc-bf16.pth,PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera:Wan2.1_VAE.pth,PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera:models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth" \
|
||||
--learning_rate 1e-5 \
|
||||
--num_epochs 2 \
|
||||
--remove_prefix_in_ckpt "pipe.dit." \
|
||||
--output_path "./models/train/Wan2.1-Fun-V1.1-1.3B-Control-Camera_full" \
|
||||
--trainable_models "dit" \
|
||||
--extra_inputs "input_image,camera_control_direction,camera_control_speed"
|
||||
@@ -11,5 +11,4 @@ accelerate launch examples/wanvideo/model_training/train.py \
|
||||
--remove_prefix_in_ckpt "pipe.dit." \
|
||||
--output_path "./models/train/Wan2.1-Fun-V1.1-1.3B-Control_full" \
|
||||
--trainable_models "dit" \
|
||||
--input_contains_control_video \
|
||||
--input_contains_reference_image
|
||||
--extra_inputs "control_video,reference_image"
|
||||
@@ -10,5 +10,4 @@ accelerate launch examples/wanvideo/model_training/train.py \
|
||||
--remove_prefix_in_ckpt "pipe.dit." \
|
||||
--output_path "./models/train/Wan2.1-Fun-V1.1-1.3B-InP_full" \
|
||||
--trainable_models "dit" \
|
||||
--input_contains_input_image \
|
||||
--input_contains_end_image
|
||||
--extra_inputs "input_image,end_image"
|
||||
@@ -0,0 +1,13 @@
|
||||
accelerate launch --config_file examples/wanvideo/model_training/full/accelerate_config_14B.yaml examples/wanvideo/model_training/train.py \
|
||||
--dataset_base_path data/example_video_dataset \
|
||||
--dataset_metadata_path data/example_video_dataset/metadata_camera_control.csv \
|
||||
--height 480 \
|
||||
--width 832 \
|
||||
--dataset_repeat 100 \
|
||||
--model_id_with_origin_paths "PAI/Wan2.1-Fun-V1.1-14B-Control-Camera:diffusion_pytorch_model*.safetensors,PAI/Wan2.1-Fun-V1.1-14B-Control-Camera:models_t5_umt5-xxl-enc-bf16.pth,PAI/Wan2.1-Fun-V1.1-14B-Control-Camera:Wan2.1_VAE.pth,PAI/Wan2.1-Fun-V1.1-14B-Control-Camera:models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth" \
|
||||
--learning_rate 1e-5 \
|
||||
--num_epochs 2 \
|
||||
--remove_prefix_in_ckpt "pipe.dit." \
|
||||
--output_path "./models/train/Wan2.1-Fun-V1.1-14B-Control-Camera_full" \
|
||||
--trainable_models "dit" \
|
||||
--extra_inputs "input_image,camera_control_direction,camera_control_speed"
|
||||
@@ -11,5 +11,4 @@ accelerate launch --config_file examples/wanvideo/model_training/full/accelerate
|
||||
--remove_prefix_in_ckpt "pipe.dit." \
|
||||
--output_path "./models/train/Wan2.1-Fun-V1.1-14B-Control_full" \
|
||||
--trainable_models "dit" \
|
||||
--input_contains_control_video \
|
||||
--input_contains_reference_image
|
||||
--extra_inputs "control_video,reference_image"
|
||||
@@ -10,5 +10,4 @@ accelerate launch --config_file examples/wanvideo/model_training/full/accelerate
|
||||
--remove_prefix_in_ckpt "pipe.dit." \
|
||||
--output_path "./models/train/Wan2.1-Fun-V1.1-14B-InP_full" \
|
||||
--trainable_models "dit" \
|
||||
--input_contains_input_image \
|
||||
--input_contains_end_image
|
||||
--extra_inputs "input_image,end_image"
|
||||
@@ -10,4 +10,4 @@ accelerate launch --config_file examples/wanvideo/model_training/full/accelerate
|
||||
--remove_prefix_in_ckpt "pipe.dit." \
|
||||
--output_path "./models/train/Wan2.1-I2V-14B-480P_full" \
|
||||
--trainable_models "dit" \
|
||||
--input_contains_input_image
|
||||
--extra_inputs "input_image"
|
||||
@@ -10,4 +10,4 @@ accelerate launch --config_file examples/wanvideo/model_training/full/accelerate
|
||||
--remove_prefix_in_ckpt "pipe.dit." \
|
||||
--output_path "./models/train/Wan2.1-I2V-14B-720P_full" \
|
||||
--trainable_models "dit" \
|
||||
--input_contains_input_image
|
||||
--extra_inputs "input_image"
|
||||
@@ -12,6 +12,5 @@ accelerate launch examples/wanvideo/model_training/train.py \
|
||||
--remove_prefix_in_ckpt "pipe.vace." \
|
||||
--output_path "./models/train/Wan2.1-VACE-1.3B-Preview_full" \
|
||||
--trainable_models "vace" \
|
||||
--input_contains_vace_video \
|
||||
--input_contains_vace_reference_image \
|
||||
--extra_inputs "vace_video,vace_reference_image" \
|
||||
--use_gradient_checkpointing_offload
|
||||
@@ -12,6 +12,5 @@ accelerate launch examples/wanvideo/model_training/train.py \
|
||||
--remove_prefix_in_ckpt "pipe.vace." \
|
||||
--output_path "./models/train/Wan2.1-VACE-1.3B_full" \
|
||||
--trainable_models "vace" \
|
||||
--input_contains_vace_video \
|
||||
--input_contains_vace_reference_image \
|
||||
--extra_inputs "vace_video,vace_reference_image" \
|
||||
--use_gradient_checkpointing_offload
|
||||
@@ -12,6 +12,5 @@ accelerate launch --config_file examples/wanvideo/model_training/full/accelerate
|
||||
--remove_prefix_in_ckpt "pipe.vace." \
|
||||
--output_path "./models/train/Wan2.1-VACE-14B_full" \
|
||||
--trainable_models "vace" \
|
||||
--input_contains_vace_video \
|
||||
--input_contains_vace_reference_image \
|
||||
--extra_inputs "vace_video,vace_reference_image" \
|
||||
--use_gradient_checkpointing_offload
|
||||
@@ -12,4 +12,4 @@ accelerate launch examples/wanvideo/model_training/train.py \
|
||||
--lora_base_model "dit" \
|
||||
--lora_target_modules "q,k,v,o,ffn.0,ffn.2" \
|
||||
--lora_rank 32 \
|
||||
--input_contains_motion_bucket_id
|
||||
--extra_inputs "motion_bucket_id"
|
||||
@@ -12,5 +12,4 @@ accelerate launch examples/wanvideo/model_training/train.py \
|
||||
--lora_base_model "dit" \
|
||||
--lora_target_modules "q,k,v,o,ffn.0,ffn.2" \
|
||||
--lora_rank 32 \
|
||||
--input_contains_input_image \
|
||||
--input_contains_end_image
|
||||
--extra_inputs "input_image,end_image"
|
||||
@@ -13,4 +13,4 @@ accelerate launch examples/wanvideo/model_training/train.py \
|
||||
--lora_base_model "dit" \
|
||||
--lora_target_modules "q,k,v,o,ffn.0,ffn.2" \
|
||||
--lora_rank 32 \
|
||||
--input_contains_control_video
|
||||
--extra_inputs "control_video"
|
||||
@@ -12,5 +12,4 @@ accelerate launch examples/wanvideo/model_training/train.py \
|
||||
--lora_base_model "dit" \
|
||||
--lora_target_modules "q,k,v,o,ffn.0,ffn.2" \
|
||||
--lora_rank 32 \
|
||||
--input_contains_input_image \
|
||||
--input_contains_end_image
|
||||
--extra_inputs "input_image,end_image"
|
||||
@@ -13,4 +13,4 @@ accelerate launch examples/wanvideo/model_training/train.py \
|
||||
--lora_base_model "dit" \
|
||||
--lora_target_modules "q,k,v,o,ffn.0,ffn.2" \
|
||||
--lora_rank 32 \
|
||||
--input_contains_control_video
|
||||
--extra_inputs "control_video"
|
||||
@@ -12,5 +12,4 @@ accelerate launch examples/wanvideo/model_training/train.py \
|
||||
--lora_base_model "dit" \
|
||||
--lora_target_modules "q,k,v,o,ffn.0,ffn.2" \
|
||||
--lora_rank 32 \
|
||||
--input_contains_input_image \
|
||||
--input_contains_end_image
|
||||
--extra_inputs "input_image,end_image"
|
||||
@@ -0,0 +1,15 @@
|
||||
accelerate launch examples/wanvideo/model_training/train.py \
|
||||
--dataset_base_path data/example_video_dataset \
|
||||
--dataset_metadata_path data/example_video_dataset/metadata_camera_control.csv \
|
||||
--height 480 \
|
||||
--width 832 \
|
||||
--dataset_repeat 100 \
|
||||
--model_id_with_origin_paths "PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera:diffusion_pytorch_model*.safetensors,PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera:models_t5_umt5-xxl-enc-bf16.pth,PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera:Wan2.1_VAE.pth,PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera:models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth" \
|
||||
--learning_rate 1e-5 \
|
||||
--num_epochs 5 \
|
||||
--remove_prefix_in_ckpt "pipe.dit." \
|
||||
--output_path "./models/train/Wan2.1-Fun-V1.1-1.3B-Control-Camera_full" \
|
||||
--lora_base_model "dit" \
|
||||
--lora_target_modules "q,k,v,o,ffn.0,ffn.2" \
|
||||
--lora_rank 32 \
|
||||
--extra_inputs "input_image,camera_control_direction,camera_control_speed"
|
||||
@@ -13,5 +13,4 @@ accelerate launch examples/wanvideo/model_training/train.py \
|
||||
--lora_base_model "dit" \
|
||||
--lora_target_modules "q,k,v,o,ffn.0,ffn.2" \
|
||||
--lora_rank 32 \
|
||||
--input_contains_control_video \
|
||||
--input_contains_reference_image
|
||||
--extra_inputs "control_video,reference_image"
|
||||
@@ -12,5 +12,4 @@ accelerate launch examples/wanvideo/model_training/train.py \
|
||||
--lora_base_model "dit" \
|
||||
--lora_target_modules "q,k,v,o,ffn.0,ffn.2" \
|
||||
--lora_rank 32 \
|
||||
--input_contains_input_image \
|
||||
--input_contains_end_image
|
||||
--extra_inputs "input_image,end_image"
|
||||
@@ -0,0 +1,15 @@
|
||||
accelerate launch examples/wanvideo/model_training/train.py \
|
||||
--dataset_base_path data/example_video_dataset \
|
||||
--dataset_metadata_path data/example_video_dataset/metadata_camera_control.csv \
|
||||
--height 480 \
|
||||
--width 832 \
|
||||
--dataset_repeat 100 \
|
||||
--model_id_with_origin_paths "PAI/Wan2.1-Fun-V1.1-14B-Control-Camera:diffusion_pytorch_model*.safetensors,PAI/Wan2.1-Fun-V1.1-14B-Control-Camera:models_t5_umt5-xxl-enc-bf16.pth,PAI/Wan2.1-Fun-V1.1-14B-Control-Camera:Wan2.1_VAE.pth,PAI/Wan2.1-Fun-V1.1-14B-Control-Camera:models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth" \
|
||||
--learning_rate 1e-5 \
|
||||
--num_epochs 5 \
|
||||
--remove_prefix_in_ckpt "pipe.dit." \
|
||||
--output_path "./models/train/Wan2.1-Fun-V1.1-14B-Control-Camera_full" \
|
||||
--lora_base_model "dit" \
|
||||
--lora_target_modules "q,k,v,o,ffn.0,ffn.2" \
|
||||
--lora_rank 32 \
|
||||
--extra_inputs "input_image,camera_control_direction,camera_control_speed"
|
||||
@@ -13,5 +13,4 @@ accelerate launch examples/wanvideo/model_training/train.py \
|
||||
--lora_base_model "dit" \
|
||||
--lora_target_modules "q,k,v,o,ffn.0,ffn.2" \
|
||||
--lora_rank 32 \
|
||||
--input_contains_control_video \
|
||||
--input_contains_reference_image
|
||||
--extra_inputs "control_video,reference_image"
|
||||
@@ -12,5 +12,4 @@ accelerate launch examples/wanvideo/model_training/train.py \
|
||||
--lora_base_model "dit" \
|
||||
--lora_target_modules "q,k,v,o,ffn.0,ffn.2" \
|
||||
--lora_rank 32 \
|
||||
--input_contains_input_image \
|
||||
--input_contains_end_image
|
||||
--extra_inputs "input_image,end_image"
|
||||
@@ -12,4 +12,4 @@ accelerate launch examples/wanvideo/model_training/train.py \
|
||||
--lora_base_model "dit" \
|
||||
--lora_target_modules "q,k,v,o,ffn.0,ffn.2" \
|
||||
--lora_rank 32 \
|
||||
--input_contains_input_image
|
||||
--extra_inputs "input_image"
|
||||
@@ -12,4 +12,4 @@ accelerate launch examples/wanvideo/model_training/train.py \
|
||||
--lora_base_model "dit" \
|
||||
--lora_target_modules "q,k,v,o,ffn.0,ffn.2" \
|
||||
--lora_rank 32 \
|
||||
--input_contains_input_image
|
||||
--extra_inputs "input_image"
|
||||
@@ -13,6 +13,5 @@ accelerate launch examples/wanvideo/model_training/train.py \
|
||||
--lora_base_model "vace" \
|
||||
--lora_target_modules "q,k,v,o,ffn.0,ffn.2" \
|
||||
--lora_rank 32 \
|
||||
--input_contains_vace_video \
|
||||
--input_contains_vace_reference_image \
|
||||
--extra_inputs "vace_video,vace_reference_image" \
|
||||
--use_gradient_checkpointing_offload
|
||||
@@ -13,6 +13,5 @@ accelerate launch examples/wanvideo/model_training/train.py \
|
||||
--lora_base_model "vace" \
|
||||
--lora_target_modules "q,k,v,o,ffn.0,ffn.2" \
|
||||
--lora_rank 32 \
|
||||
--input_contains_vace_video \
|
||||
--input_contains_vace_reference_image \
|
||||
--extra_inputs "vace_video,vace_reference_image" \
|
||||
--use_gradient_checkpointing_offload
|
||||
@@ -14,6 +14,5 @@ accelerate launch examples/wanvideo/model_training/train.py \
|
||||
--lora_base_model "vace" \
|
||||
--lora_target_modules "q,k,v,o,ffn.0,ffn.2" \
|
||||
--lora_rank 32 \
|
||||
--input_contains_vace_video \
|
||||
--input_contains_vace_reference_image \
|
||||
--extra_inputs "vace_video,vace_reference_image" \
|
||||
--use_gradient_checkpointing_offload
|
||||
@@ -13,14 +13,7 @@ class WanTrainingModule(DiffusionTrainingModule):
|
||||
lora_base_model=None, lora_target_modules="q,k,v,o,ffn.0,ffn.2", lora_rank=32,
|
||||
use_gradient_checkpointing=True,
|
||||
use_gradient_checkpointing_offload=False,
|
||||
# Extra inputs
|
||||
input_contains_input_image=False,
|
||||
input_contains_end_image=False,
|
||||
input_contains_control_video=False,
|
||||
input_contains_reference_image=False,
|
||||
input_contains_vace_video=False,
|
||||
input_contains_vace_reference_image=False,
|
||||
input_contains_motion_bucket_id=False,
|
||||
extra_inputs=None,
|
||||
):
|
||||
super().__init__()
|
||||
# Load models
|
||||
@@ -51,13 +44,7 @@ class WanTrainingModule(DiffusionTrainingModule):
|
||||
# Store other configs
|
||||
self.use_gradient_checkpointing = use_gradient_checkpointing
|
||||
self.use_gradient_checkpointing_offload = use_gradient_checkpointing_offload
|
||||
self.input_contains_input_image = input_contains_input_image
|
||||
self.input_contains_end_image = input_contains_end_image
|
||||
self.input_contains_control_video = input_contains_control_video
|
||||
self.input_contains_reference_image = input_contains_reference_image
|
||||
self.input_contains_vace_video = input_contains_vace_video
|
||||
self.input_contains_vace_reference_image = input_contains_vace_reference_image
|
||||
self.input_contains_motion_bucket_id = input_contains_motion_bucket_id
|
||||
self.extra_inputs = extra_inputs.split(",") if extra_inputs is not None else []
|
||||
|
||||
|
||||
def forward_preprocess(self, data):
|
||||
@@ -85,13 +72,13 @@ class WanTrainingModule(DiffusionTrainingModule):
|
||||
}
|
||||
|
||||
# Extra inputs
|
||||
if self.input_contains_input_image: inputs_shared["input_image"] = data["video"][0]
|
||||
if self.input_contains_end_image: inputs_shared["end_image"] = data["video"][-1]
|
||||
if self.input_contains_control_video: inputs_shared["control_video"] = data["control_video"]
|
||||
if self.input_contains_reference_image: inputs_shared["reference_image"] = data["reference_image"]
|
||||
if self.input_contains_vace_video: inputs_shared["vace_video"] = data["vace_video"]
|
||||
if self.input_contains_vace_reference_image: inputs_shared["vace_reference_image"] = data["vace_reference_image"]
|
||||
if self.input_contains_motion_bucket_id: inputs_shared["motion_bucket_id"] = data["motion_bucket_id"]
|
||||
for extra_input in self.extra_inputs:
|
||||
if extra_input == "input_image":
|
||||
inputs_shared["input_image"] = data["video"][0]
|
||||
elif extra_input == "end_image":
|
||||
inputs_shared["end_image"] = data["video"][-1]
|
||||
else:
|
||||
inputs_shared[extra_input] = data[extra_input]
|
||||
|
||||
# Pipeline units will automatically process the input parameters.
|
||||
for unit in self.pipe.units:
|
||||
@@ -118,12 +105,6 @@ if __name__ == "__main__":
|
||||
lora_target_modules=args.lora_target_modules,
|
||||
lora_rank=args.lora_rank,
|
||||
use_gradient_checkpointing_offload=args.use_gradient_checkpointing_offload,
|
||||
input_contains_input_image=args.input_contains_input_image,
|
||||
input_contains_end_image=args.input_contains_end_image,
|
||||
input_contains_control_video=args.input_contains_control_video,
|
||||
input_contains_reference_image=args.input_contains_reference_image,
|
||||
input_contains_vace_video=args.input_contains_vace_video,
|
||||
input_contains_vace_reference_image=args.input_contains_vace_reference_image,
|
||||
input_contains_motion_bucket_id=args.input_contains_motion_bucket_id,
|
||||
extra_inputs=args.extra_inputs,
|
||||
)
|
||||
launch_training_task(model, dataset, args=args)
|
||||
|
||||
@@ -0,0 +1,32 @@
|
||||
import torch
|
||||
from PIL import Image
|
||||
from diffsynth import save_video, VideoData, load_state_dict
|
||||
from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig
|
||||
from modelscope import dataset_snapshot_download
|
||||
|
||||
|
||||
pipe = WanVideoPipeline.from_pretrained(
|
||||
torch_dtype=torch.bfloat16,
|
||||
device="cuda",
|
||||
model_configs=[
|
||||
ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"),
|
||||
ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"),
|
||||
ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"),
|
||||
ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", offload_device="cpu"),
|
||||
],
|
||||
)
|
||||
state_dict = load_state_dict("models/train/Wan2.1-Fun-V1.1-1.3B-Control-Camera_full/epoch-1.safetensors")
|
||||
pipe.dit.load_state_dict(state_dict)
|
||||
pipe.enable_vram_management()
|
||||
|
||||
video = VideoData("data/example_video_dataset/video1.mp4", height=480, width=832)
|
||||
|
||||
# First and last frame to video
|
||||
video = pipe(
|
||||
prompt="from sunset to night, a small town, light, house, river",
|
||||
negative_prompt="色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走",
|
||||
input_image=video[0],
|
||||
camera_control_direction="Left", camera_control_speed=0.0,
|
||||
seed=0, tiled=True
|
||||
)
|
||||
save_video(video, "video_Wan2.1-Fun-V1.1-1.3B-Control-Camera.mp4", fps=15, quality=5)
|
||||
@@ -0,0 +1,32 @@
|
||||
import torch
|
||||
from PIL import Image
|
||||
from diffsynth import save_video, VideoData, load_state_dict
|
||||
from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig
|
||||
from modelscope import dataset_snapshot_download
|
||||
|
||||
|
||||
pipe = WanVideoPipeline.from_pretrained(
|
||||
torch_dtype=torch.bfloat16,
|
||||
device="cuda",
|
||||
model_configs=[
|
||||
ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-Control-Camera", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"),
|
||||
ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-Control-Camera", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"),
|
||||
ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-Control-Camera", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"),
|
||||
ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-Control-Camera", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", offload_device="cpu"),
|
||||
],
|
||||
)
|
||||
state_dict = load_state_dict("models/train/Wan2.1-Fun-V1.1-14B-Control-Camera_full/epoch-1.safetensors")
|
||||
pipe.dit.load_state_dict(state_dict)
|
||||
pipe.enable_vram_management()
|
||||
|
||||
video = VideoData("data/example_video_dataset/video1.mp4", height=480, width=832)
|
||||
|
||||
# First and last frame to video
|
||||
video = pipe(
|
||||
prompt="from sunset to night, a small town, light, house, river",
|
||||
negative_prompt="色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走",
|
||||
input_image=video[0],
|
||||
camera_control_direction="Left", camera_control_speed=0.0,
|
||||
seed=0, tiled=True
|
||||
)
|
||||
save_video(video, "video_Wan2.1-Fun-V1.1-14B-Control-Camera.mp4", fps=15, quality=5)
|
||||
@@ -0,0 +1,31 @@
|
||||
import torch
|
||||
from PIL import Image
|
||||
from diffsynth import save_video, VideoData, load_state_dict
|
||||
from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig
|
||||
from modelscope import dataset_snapshot_download
|
||||
|
||||
|
||||
pipe = WanVideoPipeline.from_pretrained(
|
||||
torch_dtype=torch.bfloat16,
|
||||
device="cuda",
|
||||
model_configs=[
|
||||
ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"),
|
||||
ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"),
|
||||
ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"),
|
||||
ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", offload_device="cpu"),
|
||||
],
|
||||
)
|
||||
pipe.load_lora(pipe.dit, "models/train/Wan2.1-Fun-V1.1-1.3B-Control-Camera_lora/epoch-4.safetensors", alpha=1)
|
||||
pipe.enable_vram_management()
|
||||
|
||||
video = VideoData("data/example_video_dataset/video1.mp4", height=480, width=832)
|
||||
|
||||
# First and last frame to video
|
||||
video = pipe(
|
||||
prompt="from sunset to night, a small town, light, house, river",
|
||||
negative_prompt="色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走",
|
||||
input_image=video[0],
|
||||
camera_control_direction="Left", camera_control_speed=0.0,
|
||||
seed=0, tiled=True
|
||||
)
|
||||
save_video(video, "video_Wan2.1-Fun-V1.1-1.3B-Control-Camera.mp4", fps=15, quality=5)
|
||||
@@ -0,0 +1,31 @@
|
||||
import torch
|
||||
from PIL import Image
|
||||
from diffsynth import save_video, VideoData, load_state_dict
|
||||
from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig
|
||||
from modelscope import dataset_snapshot_download
|
||||
|
||||
|
||||
pipe = WanVideoPipeline.from_pretrained(
|
||||
torch_dtype=torch.bfloat16,
|
||||
device="cuda",
|
||||
model_configs=[
|
||||
ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-Control-Camera", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"),
|
||||
ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-Control-Camera", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"),
|
||||
ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-Control-Camera", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"),
|
||||
ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-Control-Camera", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", offload_device="cpu"),
|
||||
],
|
||||
)
|
||||
pipe.load_lora(pipe.dit, "models/train/Wan2.1-Fun-V1.1-1.3B-Control-Camera_lora/epoch-4.safetensors", alpha=1)
|
||||
pipe.enable_vram_management()
|
||||
|
||||
video = VideoData("data/example_video_dataset/video1.mp4", height=480, width=832)
|
||||
|
||||
# First and last frame to video
|
||||
video = pipe(
|
||||
prompt="from sunset to night, a small town, light, house, river",
|
||||
negative_prompt="色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走",
|
||||
input_image=video[0],
|
||||
camera_control_direction="Left", camera_control_speed=0.0,
|
||||
seed=0, tiled=True
|
||||
)
|
||||
save_video(video, "video_Wan2.1-Fun-V1.1-14B-Control-Camera.mp4", fps=15, quality=5)
|
||||
Reference in New Issue
Block a user