From ea0a5c5908d3a94447112ed8cb461de4d0ed24d9 Mon Sep 17 00:00:00 2001
From: Artiprocher
Date: Sat, 15 Nov 2025 16:47:13 +0800
Subject: [PATCH] bug fix

---
 .gitignore | 1 +
 .../configs/vram_management_module_maps.py | 2 +-
 diffsynth/core/vram/layers.py | 64 +++++++++++++++++++
 diffsynth/diffusion/base_pipeline.py | 7 +-
 diffsynth/pipelines/wan_video.py | 8 +--
 examples/test/run.py | 17 ++++-
 .../model_inference/Wan2.2-Animate-14B.py | 2 +-
 .../Wan2.2-Animate-14B.py | 2 +-
 .../validate_full/LongCat-Video.py | 12 ++--
 .../Video-As-Prompt-Wan2.1-14B.py | 14 ++--
 .../Wan2.1-1.3b-speedcontrol-v1.py | 14 ++--
 .../validate_full/Wan2.1-FLF2V-14B-720P.py | 14 ++--
 .../validate_full/Wan2.1-Fun-1.3B-Control.py | 14 ++--
 .../validate_full/Wan2.1-Fun-1.3B-InP.py | 14 ++--
 .../validate_full/Wan2.1-Fun-14B-Control.py | 14 ++--
 .../validate_full/Wan2.1-Fun-14B-InP.py | 14 ++--
 .../Wan2.1-Fun-V1.1-1.3B-Control-Camera.py | 14 ++--
 .../Wan2.1-Fun-V1.1-1.3B-Control.py | 14 ++--
 .../validate_full/Wan2.1-Fun-V1.1-1.3B-InP.py | 14 ++--
 .../Wan2.1-Fun-V1.1-14B-Control-Camera.py | 14 ++--
 .../Wan2.1-Fun-V1.1-14B-Control.py | 14 ++--
 .../validate_full/Wan2.1-Fun-V1.1-14B-InP.py | 14 ++--
 .../validate_full/Wan2.1-I2V-14B-480P.py | 14 ++--
 .../validate_full/Wan2.1-I2V-14B-720P.py | 14 ++--
 .../validate_full/Wan2.1-T2V-1.3B.py | 12 ++--
 .../validate_full/Wan2.1-T2V-14B.py | 12 ++--
 .../validate_full/Wan2.1-VACE-1.3B-Preview.py | 12 ++--
 .../validate_full/Wan2.1-VACE-1.3B.py | 12 ++--
 .../validate_full/Wan2.1-VACE-14B.py | 12 ++--
 .../validate_full/Wan2.2-Animate-14B.py | 14 ++--
 .../Wan2.2-Fun-A14B-Control-Camera.py | 14 ++--
 .../validate_full/Wan2.2-Fun-A14B-Control.py | 14 ++--
 .../validate_full/Wan2.2-Fun-A14B-InP.py | 14 ++--
 .../validate_full/Wan2.2-I2V-A14B.py | 14 ++--
 .../validate_full/Wan2.2-S2V-14B.py | 5 +-
 .../validate_full/Wan2.2-T2V-A14B.py | 14 ++--
 .../validate_full/Wan2.2-TI2V-5B.py | 12 ++--
 .../validate_full/Wan2.2-VACE-Fun-A14B.py | 14 ++--
 .../validate_full/krea-realtime-video.py | 14 ++--
 .../model_training/validate_full/run_test.py | 25 --------
 .../validate_lora/LongCat-Video.py | 11 ++--
 .../Video-As-Prompt-Wan2.1-14B.py | 14 ++--
 .../Wan2.1-1.3b-speedcontrol-v1.py | 14 ++--
 .../validate_lora/Wan2.1-FLF2V-14B-720P.py | 13 ++--
 .../validate_lora/Wan2.1-Fun-1.3B-Control.py | 13 ++--
 .../validate_lora/Wan2.1-Fun-1.3B-InP.py | 13 ++--
 .../validate_lora/Wan2.1-Fun-14B-Control.py | 13 ++--
 .../validate_lora/Wan2.1-Fun-14B-InP.py | 13 ++--
 .../Wan2.1-Fun-V1.1-1.3B-Control-Camera.py | 14 ++--
 .../Wan2.1-Fun-V1.1-1.3B-Control.py | 13 ++--
 .../validate_lora/Wan2.1-Fun-V1.1-1.3B-InP.py | 13 ++--
 .../Wan2.1-Fun-V1.1-14B-Control-Camera.py | 14 ++--
 .../Wan2.1-Fun-V1.1-14B-Control.py | 13 ++--
 .../validate_lora/Wan2.1-Fun-V1.1-14B-InP.py | 13 ++--
 .../validate_lora/Wan2.1-I2V-14B-480P.py | 13 ++--
 .../validate_lora/Wan2.1-I2V-14B-720P.py | 13 ++--
 .../validate_lora/Wan2.1-T2V-1.3B.py | 11 ++--
 .../validate_lora/Wan2.1-T2V-14B.py | 11 ++--
 .../validate_lora/Wan2.1-VACE-1.3B-Preview.py | 11 ++--
 .../validate_lora/Wan2.1-VACE-1.3B.py | 11 ++--
 .../validate_lora/Wan2.1-VACE-14B.py | 11 ++--
 .../validate_lora/Wan2.2-Animate-14B.py | 14 ++--
 .../Wan2.2-Fun-A14B-Control-Camera.py | 14 ++--
 .../validate_lora/Wan2.2-Fun-A14B-Control.py | 13 ++--
 .../validate_lora/Wan2.2-Fun-A14B-InP.py | 13 ++--
 .../validate_lora/Wan2.2-I2V-A14B.py | 13 ++--
 .../validate_lora/Wan2.2-S2V-14B.py | 5 +-
 .../validate_lora/Wan2.2-T2V-A14B.py | 13 ++--
 .../validate_lora/Wan2.2-TI2V-5B.py | 12 ++--
 .../validate_lora/Wan2.2-VACE-Fun-A14B.py | 13 ++--
 .../validate_lora/krea-realtime-video.py | 14 ++--
 .../model_training/validate_lora/run_test.py | 25 --------
 72 files changed, 481 insertions(+), 472 deletions(-)
 delete mode 100644 examples/wanvideo/model_training/validate_full/run_test.py
 delete mode 100644 examples/wanvideo/model_training/validate_lora/run_test.py

diff --git a/.gitignore b/.gitignore
index 5c082a5..ca34bed 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,6 +9,7 @@
 *.DS_Store
 *.msc
 *.mv
+log*.txt
 
 # Byte-compiled / optimized / DLL files
 __pycache__/
diff --git a/diffsynth/configs/vram_management_module_maps.py b/diffsynth/configs/vram_management_module_maps.py
index 0868e8b..8eb169a 100644
--- a/diffsynth/configs/vram_management_module_maps.py
+++ b/diffsynth/configs/vram_management_module_maps.py
@@ -46,7 +46,7 @@ VRAM_MANAGEMENT_MODULE_MAPS = {
     },
     "diffsynth.models.wan_video_dit.WanModel": {
         "diffsynth.models.wan_video_dit.MLP": "diffsynth.core.vram.layers.AutoWrappedModule",
-        "diffsynth.models.wan_video_dit.DiTBlock": "diffsynth.core.vram.layers.AutoWrappedModule",
+        "diffsynth.models.wan_video_dit.DiTBlock": "diffsynth.core.vram.layers.AutoWrappedNonRecurseModule",
         "diffsynth.models.wan_video_dit.Head": "diffsynth.core.vram.layers.AutoWrappedModule",
         "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
         "torch.nn.Conv3d": "diffsynth.core.vram.layers.AutoWrappedModule",
diff --git a/diffsynth/core/vram/layers.py b/diffsynth/core/vram/layers.py
index 5ab9a3a..74b7c4d 100644
--- a/diffsynth/core/vram/layers.py
+++ b/diffsynth/core/vram/layers.py
@@ -198,6 +198,66 @@ class AutoWrappedModule(AutoTorchModule):
             return getattr(self.module, name)
 
 
+class AutoWrappedNonRecurseModule(AutoWrappedModule):
+
+    def __init__(
+        self,
+        module: torch.nn.Module,
+        offload_dtype: torch.dtype = None,
+        offload_device: Union[str, torch.device] = None,
+        onload_dtype: torch.dtype = None,
+        onload_device: Union[str, torch.device] = None,
+        preparing_dtype: torch.dtype = None,
+        preparing_device: Union[str, torch.device] = None,
+        computation_dtype: torch.dtype = None,
+        computation_device: Union[str, torch.device] = None,
+        vram_limit: float = None,
+        name: str = "",
+        disk_map: DiskMap = None,
+        **kwargs
+    ):
+        super().__init__(
+            module,
+            offload_dtype,
+            offload_device,
+            onload_dtype,
+            onload_device,
+            preparing_dtype,
+            preparing_device,
+            computation_dtype,
+            computation_device,
+            vram_limit,
+            name,
+            disk_map,
+            **kwargs
+        )
+        if self.disk_offload:
+            self.required_params = [name for name, _ in self.module.named_parameters(recurse=False)]
+
+    def load_from_disk(self, torch_dtype, device, copy_module=False):
+        if copy_module:
+            module = copy.deepcopy(self.module)
+        else:
+            module = self.module
+        state_dict = {}
+        for name in self.required_params:
+            param = self.disk_map[self.param_name(name)]
+            param = param.to(dtype=torch_dtype, device=device)
+            state_dict[name] = param
+        module.load_state_dict(state_dict, assign=True, strict=False)
+        return module
+
+    def offload_to_disk(self, model: torch.nn.Module):
+        for name in self.required_params:
+            getattr(self, name).to("meta")
+
+    def __getattr__(self, name):
+        if name in self.__dict__ or name == "module":
+            return super().__getattr__(name)
+        else:
+            return getattr(self.module, name)
+
+
 class AutoWrappedLinear(torch.nn.Linear, AutoTorchModule):
     def __init__(
         self,
@@ -366,11 +426,15 @@ class AutoWrappedLinear(torch.nn.Linear, AutoTorchModule):
 
 def enable_vram_management_recursively(model: torch.nn.Module, module_map: dict, vram_config: dict, vram_limit=None,
                                        name_prefix="", disk_map=None, **kwargs):
+    if isinstance(model, AutoWrappedNonRecurseModule):
+        model = model.module
     for name, module in model.named_children():
         layer_name = name if name_prefix == "" else name_prefix + "." + name
         for source_module, target_module in module_map.items():
             if isinstance(module, source_module):
                 module_ = target_module(module, **vram_config, vram_limit=vram_limit, name=layer_name, disk_map=disk_map, **kwargs)
+                if isinstance(module_, AutoWrappedNonRecurseModule):
+                    enable_vram_management_recursively(module_, module_map, vram_config, vram_limit=vram_limit, name_prefix=layer_name, disk_map=disk_map, **kwargs)
                 setattr(model, name, module_)
                 break
         else:
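The module-map change above only pays off together with the new `AutoWrappedNonRecurseModule` and the recursion tweak at the end of the `layers.py` hunk: a `DiTBlock` is now wrapped as a unit that owns only its direct parameters (`named_parameters(recurse=False)`), while `enable_vram_management_recursively` unwraps it and keeps descending, so the block's `torch.nn.Linear` children still become `AutoWrappedLinear`. A minimal illustrative sketch of that split, using a hypothetical `ToyBlock` rather than the real `DiTBlock`:

```python
# Illustrative only: which parameters a non-recursing wrapper would own
# versus what is left for the recursive pass over child modules.
import torch

class ToyBlock(torch.nn.Module):  # stand-in for DiTBlock, not the real class
    def __init__(self):
        super().__init__()
        self.scale = torch.nn.Parameter(torch.ones(8))  # direct parameter -> block-level wrapper
        self.proj = torch.nn.Linear(8, 8)               # child module -> wrapped separately

block = ToyBlock()
print([n for n, _ in block.named_parameters(recurse=False)])  # ['scale']
print([n for n, _ in block.named_parameters()])               # ['scale', 'proj.weight', 'proj.bias']
```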
diff --git a/diffsynth/diffusion/base_pipeline.py b/diffsynth/diffusion/base_pipeline.py
index 55e1f75..8961284 100644
--- a/diffsynth/diffusion/base_pipeline.py
+++ b/diffsynth/diffusion/base_pipeline.py
@@ -220,7 +220,7 @@ class BasePipeline(torch.nn.Module):
         module: torch.nn.Module,
         lora_config: Union[ModelConfig, str] = None,
         alpha=1,
-        hotload=False,
+        hotload=None,
         state_dict=None,
     ):
         if state_dict is None:
@@ -233,12 +233,15 @@ class BasePipeline(torch.nn.Module):
             lora = state_dict
         lora_loader = self.lora_loader(torch_dtype=self.torch_dtype, device=self.device)
         lora = lora_loader.convert_state_dict(lora)
+        if hotload is None:
+            hotload = hasattr(module, "vram_management_enabled") and getattr(module, "vram_management_enabled")
         if hotload:
             if not (hasattr(module, "vram_management_enabled") and getattr(module, "vram_management_enabled")):
                 raise ValueError("VRAM Management is not enabled. LoRA hotloading is not supported.")
            updated_num = 0
-            for name, module in module.named_modules():
+            for _, module in module.named_modules():
                 if isinstance(module, AutoWrappedLinear):
+                    name = module.name
                     lora_a_name = f'{name}.lora_A.weight'
                     lora_b_name = f'{name}.lora_B.weight'
                     if lora_a_name in lora and lora_b_name in lora:
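Two behaviours change in `load_lora` here: `hotload` now defaults to `None` and is switched on automatically when the target module already has VRAM management enabled, and LoRA keys are matched against the full parameter path each `AutoWrappedLinear` recorded at wrap time (`module.name`) rather than the name reported by `named_modules()`, which the old loop also clobbered by reusing the variable `module`. A hedged sketch of the matching step (simplified; `layer_names` and the example key are illustrative, not the library's API):

```python
# Simplified illustration of matching LoRA A/B pairs by stored layer names.
def match_lora_pairs(layer_names, lora_state_dict):
    pairs = {}
    for name in layer_names:  # e.g. "blocks.0.self_attn.q" as recorded on the wrapped linear
        a_key, b_key = f"{name}.lora_A.weight", f"{name}.lora_B.weight"
        if a_key in lora_state_dict and b_key in lora_state_dict:
            pairs[name] = (lora_state_dict[a_key], lora_state_dict[b_key])
    return pairs
```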
diff --git a/diffsynth/pipelines/wan_video.py b/diffsynth/pipelines/wan_video.py
index e764fb2..0c9ce6f 100644
--- a/diffsynth/pipelines/wan_video.py
+++ b/diffsynth/pipelines/wan_video.py
@@ -15,10 +15,10 @@
 from ..diffusion import FlowMatchScheduler
 from ..core import ModelConfig, gradient_checkpoint_forward
 from ..diffusion.base_pipeline import BasePipeline, PipelineUnit
-from ..models.wan_video_dit import WanModel, RMSNorm, sinusoidal_embedding_1d
+from ..models.wan_video_dit import WanModel, sinusoidal_embedding_1d
 from ..models.wan_video_dit_s2v import rope_precompute
 from ..models.wan_video_text_encoder import WanTextEncoder, HuggingfaceTokenizer
-from ..models.wan_video_vae import WanVideoVAE, RMS_norm, CausalConv3d, Upsample
+from ..models.wan_video_vae import WanVideoVAE
 from ..models.wan_video_image_encoder import WanImageEncoder
 from ..models.wan_video_vace import VaceWanModel
 from ..models.wan_video_motion_controller import WanMotionControllerModel
@@ -526,13 +526,13 @@ class WanVideoUnit_FunReference(PipelineUnit):
         super().__init__(
             input_params=("reference_image", "height", "width", "reference_image"),
             output_params=("reference_latents", "clip_feature"),
-            onload_model_names=("vae",)
+            onload_model_names=("vae", "image_encoder")
         )
 
     def process(self, pipe: WanVideoPipeline, reference_image, height, width):
         if reference_image is None:
             return {}
-        pipe.load_models_to_device(["vae"])
+        pipe.load_models_to_device(self.onload_model_names)
         reference_image = reference_image.resize((width, height))
         reference_latents = pipe.preprocess_video([reference_image])
         reference_latents = pipe.vae.encode(reference_latents, device=pipe.device)
diff --git a/examples/test/run.py b/examples/test/run.py
index c35e780..0f84bb4 100644
--- a/examples/test/run.py
+++ b/examples/test/run.py
@@ -29,7 +29,7 @@ def run_tasks_on_single_GPU(script_path, gpu_id, num_gpu):
             os.makedirs(target_path, exist_ok=True)
             if script.endswith(".sh"):
                 cmd = f"CUDA_VISIBLE_DEVICES={gpu_id} bash {source_path} > {target_path}/log.txt 2>&1"
-            else:
+            elif script.endswith(".py"):
                 cmd = f"CUDA_VISIBLE_DEVICES={gpu_id} python {source_path} > {target_path}/log.txt 2>&1"
             print(cmd, flush=True)
             os.system(cmd)
@@ -58,6 +58,12 @@ def run_train_single_GPU(script_path):
         p.join()
 
 
+def move_files(prefix, target_folder):
+    os.makedirs(target_folder, exist_ok=True)
+    os.system(f"cp -r {prefix}* {target_folder}")
+    os.system(f"rm -rf {prefix}*")
+
+
 if __name__ == "__main__":
     # run_train_multi_GPU("examples/qwen_image/model_training/full")
     # run_train_single_GPU("examples/qwen_image/model_training/lora")
@@ -66,3 +72,12 @@ if __name__ == "__main__":
     # run_inference("examples/qwen_image/model_training/validate_full")
     # run_inference("examples/qwen_image/model_training/validate_lora")
     run_train_single_GPU("examples/wanvideo/model_inference_low_vram")
+    move_files("video_", "data/output/model_inference_low_vram")
+    run_train_single_GPU("examples/wanvideo/model_inference")
+    move_files("video_", "data/output/model_inference")
+    run_train_single_GPU("examples/wanvideo/model_training/lora")
+    run_train_single_GPU("examples/wanvideo/model_training/validate_lora")
+    move_files("video_", "data/output/validate_lora")
+    run_train_multi_GPU("examples/wanvideo/model_training/full")
+    run_train_single_GPU("examples/wanvideo/model_training/validate_full")
+    move_files("video_", "data/output/validate_full")
diff --git a/examples/wanvideo/model_inference/Wan2.2-Animate-14B.py b/examples/wanvideo/model_inference/Wan2.2-Animate-14B.py
index ec02e46..d435b68 100644
--- a/examples/wanvideo/model_inference/Wan2.2-Animate-14B.py
+++ b/examples/wanvideo/model_inference/Wan2.2-Animate-14B.py
@@ -41,7 +41,7 @@ save_video(video, "video_1_Wan2.2-Animate-14B.mp4", fps=15, quality=5)
 
 # Replace
 snapshot_download("Wan-AI/Wan2.2-Animate-14B", allow_file_pattern="relighting_lora.ckpt", local_dir="models/Wan-AI/Wan2.2-Animate-14B")
-lora_state_dict = load_state_dict("models/Wan-AI/Wan2.2-Animate-14B/relighting_lora.ckpt", torch_dtype=torch.float32, device="cuda")["state_dict"]
+lora_state_dict = load_state_dict("models/Wan-AI/Wan2.2-Animate-14B/relighting_lora.ckpt", torch_dtype=torch.bfloat16, device="cuda")["state_dict"]
 pipe.load_lora(pipe.dit, state_dict=lora_state_dict)
 input_image = Image.open("data/examples/wan/animate/replace_input_image.png")
 animate_pose_video = VideoData("data/examples/wan/animate/replace_pose_video.mp4").raw_data()[:81-4]
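The inference and validation scripts in the remaining hunks all move to the same slimmer pattern: imports come from `diffsynth.utils.data`, `diffsynth.core`, and `diffsynth.pipelines.wan_video`, the per-model `offload_device="cpu"` arguments are dropped, and the explicit `pipe.enable_vram_management()` call is removed. A minimal sketch of that pattern, assembled from the Wan2.1-T2V-1.3B hunks in this patch (sampling arguments are trimmed to the prompt only, and the output filename is arbitrary):

```python
import torch
from diffsynth.utils.data import save_video
from diffsynth.core import load_state_dict
from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig

pipe = WanVideoPipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=[
        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="diffusion_pytorch_model*.safetensors"),
        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"),
        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="Wan2.1_VAE.pth"),
    ],
)
state_dict = load_state_dict("models/train/Wan2.1-T2V-1.3B_full/epoch-1.safetensors")
pipe.dit.load_state_dict(state_dict)  # no pipe.enable_vram_management() in the updated scripts

video = pipe(prompt="from sunset to night, a small town, light, house, river")
save_video(video, "video_Wan2.1-T2V-1.3B_full.mp4", fps=15, quality=5)
```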
load_state_dict("models/Wan-AI/Wan2.2-Animate-14B/relighting_lora.ckpt", torch_dtype=torch.bfloat16, device="cuda")["state_dict"] pipe.load_lora(pipe.dit, state_dict=lora_state_dict) input_image = Image.open("data/examples/wan/animate/replace_input_image.png") animate_pose_video = VideoData("data/examples/wan/animate/replace_pose_video.mp4").raw_data()[:81-4] diff --git a/examples/wanvideo/model_training/validate_full/LongCat-Video.py b/examples/wanvideo/model_training/validate_full/LongCat-Video.py index 31ee9a7..62e62e2 100644 --- a/examples/wanvideo/model_training/validate_full/LongCat-Video.py +++ b/examples/wanvideo/model_training/validate_full/LongCat-Video.py @@ -1,21 +1,21 @@ import torch from PIL import Image -from diffsynth import save_video, VideoData, load_state_dict -from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig +from diffsynth.utils.data import save_video, VideoData +from diffsynth.core import load_state_dict +from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig pipe = WanVideoPipeline.from_pretrained( torch_dtype=torch.bfloat16, device="cuda", model_configs=[ - ModelConfig(model_id="meituan-longcat/LongCat-Video", origin_file_pattern="dit/diffusion_pytorch_model*.safetensors", offload_device="cpu"), - ModelConfig(model_id="Wan-AI/Wan2.1-T2V-14B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"), - ModelConfig(model_id="Wan-AI/Wan2.1-T2V-14B", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"), + ModelConfig(model_id="meituan-longcat/LongCat-Video", origin_file_pattern="dit/diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="Wan-AI/Wan2.1-T2V-14B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"), + ModelConfig(model_id="Wan-AI/Wan2.1-T2V-14B", origin_file_pattern="Wan2.1_VAE.pth"), ], ) state_dict = load_state_dict("models/train/LongCat-Video_full/epoch-1.safetensors") pipe.dit.load_state_dict(state_dict) -pipe.enable_vram_management() video = pipe( prompt="from sunset to night, a small town, light, house, river", diff --git a/examples/wanvideo/model_training/validate_full/Video-As-Prompt-Wan2.1-14B.py b/examples/wanvideo/model_training/validate_full/Video-As-Prompt-Wan2.1-14B.py index 97af5bb..757b2d9 100644 --- a/examples/wanvideo/model_training/validate_full/Video-As-Prompt-Wan2.1-14B.py +++ b/examples/wanvideo/model_training/validate_full/Video-As-Prompt-Wan2.1-14B.py @@ -1,22 +1,22 @@ import torch from PIL import Image -from diffsynth import save_video, VideoData, load_state_dict -from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig +from diffsynth.utils.data import save_video, VideoData +from diffsynth.core import load_state_dict +from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig pipe = WanVideoPipeline.from_pretrained( torch_dtype=torch.bfloat16, device="cuda", model_configs=[ - ModelConfig(model_id="ByteDance/Video-As-Prompt-Wan2.1-14B", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors", offload_device="cpu"), - ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-720P", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"), - ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-720P", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"), - ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-720P", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", offload_device="cpu"), + ModelConfig(model_id="ByteDance/Video-As-Prompt-Wan2.1-14B", 
origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-720P", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"), + ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-720P", origin_file_pattern="Wan2.1_VAE.pth"), + ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-720P", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth"), ], ) state_dict = load_state_dict("models/train/Video-As-Prompt-Wan2.1-14B_full/epoch-1.safetensors") pipe.vap.load_state_dict(state_dict) -pipe.enable_vram_management() ref_video_path = 'data/example_video_dataset/wanvap/vap_ref.mp4' target_image_path = 'data/example_video_dataset/wanvap/input_image.jpg' diff --git a/examples/wanvideo/model_training/validate_full/Wan2.1-1.3b-speedcontrol-v1.py b/examples/wanvideo/model_training/validate_full/Wan2.1-1.3b-speedcontrol-v1.py index 124749a..8aef365 100644 --- a/examples/wanvideo/model_training/validate_full/Wan2.1-1.3b-speedcontrol-v1.py +++ b/examples/wanvideo/model_training/validate_full/Wan2.1-1.3b-speedcontrol-v1.py @@ -1,22 +1,22 @@ import torch from PIL import Image -from diffsynth import save_video, VideoData, load_state_dict -from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig +from diffsynth.utils.data import save_video, VideoData +from diffsynth.core import load_state_dict +from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig pipe = WanVideoPipeline.from_pretrained( torch_dtype=torch.bfloat16, device="cuda", model_configs=[ - ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"), - ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"), - ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"), - ModelConfig(model_id="DiffSynth-Studio/Wan2.1-1.3b-speedcontrol-v1", origin_file_pattern="model.safetensors", offload_device="cpu"), + ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"), + ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="Wan2.1_VAE.pth"), + ModelConfig(model_id="DiffSynth-Studio/Wan2.1-1.3b-speedcontrol-v1", origin_file_pattern="model.safetensors"), ], ) state_dict = load_state_dict("models/train/Wan2.1-1.3b-speedcontrol-v1_full/epoch-1.safetensors") pipe.motion_controller.load_state_dict(state_dict) -pipe.enable_vram_management() # Text-to-video video = pipe( diff --git a/examples/wanvideo/model_training/validate_full/Wan2.1-FLF2V-14B-720P.py b/examples/wanvideo/model_training/validate_full/Wan2.1-FLF2V-14B-720P.py index 41a67ed..12ac162 100644 --- a/examples/wanvideo/model_training/validate_full/Wan2.1-FLF2V-14B-720P.py +++ b/examples/wanvideo/model_training/validate_full/Wan2.1-FLF2V-14B-720P.py @@ -1,7 +1,8 @@ import torch from PIL import Image -from diffsynth import save_video, VideoData, load_state_dict -from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig +from diffsynth.utils.data import save_video, VideoData +from diffsynth.core import load_state_dict +from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig from modelscope import dataset_snapshot_download @@ -9,15 +10,14 @@ pipe = WanVideoPipeline.from_pretrained( torch_dtype=torch.bfloat16, device="cuda", 
model_configs=[ - ModelConfig(model_id="Wan-AI/Wan2.1-FLF2V-14B-720P", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"), - ModelConfig(model_id="Wan-AI/Wan2.1-FLF2V-14B-720P", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"), - ModelConfig(model_id="Wan-AI/Wan2.1-FLF2V-14B-720P", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"), - ModelConfig(model_id="Wan-AI/Wan2.1-FLF2V-14B-720P", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", offload_device="cpu"), + ModelConfig(model_id="Wan-AI/Wan2.1-FLF2V-14B-720P", origin_file_pattern="diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="Wan-AI/Wan2.1-FLF2V-14B-720P", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"), + ModelConfig(model_id="Wan-AI/Wan2.1-FLF2V-14B-720P", origin_file_pattern="Wan2.1_VAE.pth"), + ModelConfig(model_id="Wan-AI/Wan2.1-FLF2V-14B-720P", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth"), ], ) state_dict = load_state_dict("models/train/Wan2.1-FLF2V-14B-720P_full/epoch-1.safetensors") pipe.dit.load_state_dict(state_dict) -pipe.enable_vram_management() video = VideoData("data/example_video_dataset/video1.mp4", height=480, width=832) diff --git a/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-1.3B-Control.py b/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-1.3B-Control.py index 6726e9c..c6de603 100644 --- a/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-1.3B-Control.py +++ b/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-1.3B-Control.py @@ -1,7 +1,8 @@ import torch from PIL import Image -from diffsynth import save_video, VideoData, load_state_dict -from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig +from diffsynth.utils.data import save_video, VideoData +from diffsynth.core import load_state_dict +from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig from modelscope import dataset_snapshot_download @@ -9,15 +10,14 @@ pipe = WanVideoPipeline.from_pretrained( torch_dtype=torch.bfloat16, device="cuda", model_configs=[ - ModelConfig(model_id="PAI/Wan2.1-Fun-1.3B-Control", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.1-Fun-1.3B-Control", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.1-Fun-1.3B-Control", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.1-Fun-1.3B-Control", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", offload_device="cpu"), + ModelConfig(model_id="PAI/Wan2.1-Fun-1.3B-Control", origin_file_pattern="diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="PAI/Wan2.1-Fun-1.3B-Control", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"), + ModelConfig(model_id="PAI/Wan2.1-Fun-1.3B-Control", origin_file_pattern="Wan2.1_VAE.pth"), + ModelConfig(model_id="PAI/Wan2.1-Fun-1.3B-Control", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth"), ], ) state_dict = load_state_dict("models/train/Wan2.1-Fun-1.3B-Control_full/epoch-1.safetensors") pipe.dit.load_state_dict(state_dict) -pipe.enable_vram_management() video = VideoData("data/example_video_dataset/video1_softedge.mp4", height=480, width=832) video = [video[i] for i in range(81)] diff --git a/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-1.3B-InP.py 
b/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-1.3B-InP.py index 3e1e6f3..766d436 100644 --- a/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-1.3B-InP.py +++ b/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-1.3B-InP.py @@ -1,7 +1,8 @@ import torch from PIL import Image -from diffsynth import save_video, VideoData, load_state_dict -from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig +from diffsynth.utils.data import save_video, VideoData +from diffsynth.core import load_state_dict +from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig from modelscope import dataset_snapshot_download @@ -9,15 +10,14 @@ pipe = WanVideoPipeline.from_pretrained( torch_dtype=torch.bfloat16, device="cuda", model_configs=[ - ModelConfig(model_id="PAI/Wan2.1-Fun-1.3B-InP", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.1-Fun-1.3B-InP", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.1-Fun-1.3B-InP", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.1-Fun-1.3B-InP", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", offload_device="cpu"), + ModelConfig(model_id="PAI/Wan2.1-Fun-1.3B-InP", origin_file_pattern="diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="PAI/Wan2.1-Fun-1.3B-InP", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"), + ModelConfig(model_id="PAI/Wan2.1-Fun-1.3B-InP", origin_file_pattern="Wan2.1_VAE.pth"), + ModelConfig(model_id="PAI/Wan2.1-Fun-1.3B-InP", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth"), ], ) state_dict = load_state_dict("models/train/Wan2.1-Fun-1.3B-InP_full/epoch-1.safetensors") pipe.dit.load_state_dict(state_dict) -pipe.enable_vram_management() video = VideoData("data/example_video_dataset/video1.mp4", height=480, width=832) diff --git a/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-14B-Control.py b/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-14B-Control.py index 08b0acb..d05da17 100644 --- a/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-14B-Control.py +++ b/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-14B-Control.py @@ -1,7 +1,8 @@ import torch from PIL import Image -from diffsynth import save_video, VideoData, load_state_dict -from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig +from diffsynth.utils.data import save_video, VideoData +from diffsynth.core import load_state_dict +from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig from modelscope import dataset_snapshot_download @@ -9,15 +10,14 @@ pipe = WanVideoPipeline.from_pretrained( torch_dtype=torch.bfloat16, device="cuda", model_configs=[ - ModelConfig(model_id="PAI/Wan2.1-Fun-14B-Control", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.1-Fun-14B-Control", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.1-Fun-14B-Control", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.1-Fun-14B-Control", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", offload_device="cpu"), + ModelConfig(model_id="PAI/Wan2.1-Fun-14B-Control", origin_file_pattern="diffusion_pytorch_model*.safetensors"), + 
ModelConfig(model_id="PAI/Wan2.1-Fun-14B-Control", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"), + ModelConfig(model_id="PAI/Wan2.1-Fun-14B-Control", origin_file_pattern="Wan2.1_VAE.pth"), + ModelConfig(model_id="PAI/Wan2.1-Fun-14B-Control", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth"), ], ) state_dict = load_state_dict("models/train/Wan2.1-Fun-14B-Control_full/epoch-1.safetensors") pipe.dit.load_state_dict(state_dict) -pipe.enable_vram_management() video = VideoData("data/example_video_dataset/video1_softedge.mp4", height=480, width=832) video = [video[i] for i in range(81)] diff --git a/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-14B-InP.py b/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-14B-InP.py index d7e39d7..f1953ae 100644 --- a/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-14B-InP.py +++ b/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-14B-InP.py @@ -1,7 +1,8 @@ import torch from PIL import Image -from diffsynth import save_video, VideoData, load_state_dict -from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig +from diffsynth.utils.data import save_video, VideoData +from diffsynth.core import load_state_dict +from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig from modelscope import dataset_snapshot_download @@ -9,15 +10,14 @@ pipe = WanVideoPipeline.from_pretrained( torch_dtype=torch.bfloat16, device="cuda", model_configs=[ - ModelConfig(model_id="PAI/Wan2.1-Fun-14B-InP", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.1-Fun-14B-InP", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.1-Fun-14B-InP", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.1-Fun-14B-InP", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", offload_device="cpu"), + ModelConfig(model_id="PAI/Wan2.1-Fun-14B-InP", origin_file_pattern="diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="PAI/Wan2.1-Fun-14B-InP", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"), + ModelConfig(model_id="PAI/Wan2.1-Fun-14B-InP", origin_file_pattern="Wan2.1_VAE.pth"), + ModelConfig(model_id="PAI/Wan2.1-Fun-14B-InP", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth"), ], ) state_dict = load_state_dict("models/train/Wan2.1-Fun-14B-InP_full/epoch-1.safetensors") pipe.dit.load_state_dict(state_dict) -pipe.enable_vram_management() video = VideoData("data/example_video_dataset/video1.mp4", height=480, width=832) diff --git a/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-1.3B-Control-Camera.py b/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-1.3B-Control-Camera.py index c5a210a..ee547dc 100644 --- a/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-1.3B-Control-Camera.py +++ b/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-1.3B-Control-Camera.py @@ -1,7 +1,8 @@ import torch from PIL import Image -from diffsynth import save_video, VideoData, load_state_dict -from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig +from diffsynth.utils.data import save_video, VideoData +from diffsynth.core import load_state_dict +from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig from modelscope import dataset_snapshot_download @@ -9,15 +10,14 @@ pipe = 
WanVideoPipeline.from_pretrained( torch_dtype=torch.bfloat16, device="cuda", model_configs=[ - ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", offload_device="cpu"), + ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera", origin_file_pattern="diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"), + ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera", origin_file_pattern="Wan2.1_VAE.pth"), + ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth"), ], ) state_dict = load_state_dict("models/train/Wan2.1-Fun-V1.1-1.3B-Control-Camera_full/epoch-1.safetensors") pipe.dit.load_state_dict(state_dict) -pipe.enable_vram_management() video = VideoData("data/example_video_dataset/video1.mp4", height=480, width=832) diff --git a/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-1.3B-Control.py b/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-1.3B-Control.py index 6497e1b..34a2591 100644 --- a/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-1.3B-Control.py +++ b/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-1.3B-Control.py @@ -1,7 +1,8 @@ import torch from PIL import Image -from diffsynth import save_video, VideoData, load_state_dict -from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig +from diffsynth.utils.data import save_video, VideoData +from diffsynth.core import load_state_dict +from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig from modelscope import dataset_snapshot_download @@ -9,15 +10,14 @@ pipe = WanVideoPipeline.from_pretrained( torch_dtype=torch.bfloat16, device="cuda", model_configs=[ - ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", offload_device="cpu"), + ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control", origin_file_pattern="diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"), + ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control", origin_file_pattern="Wan2.1_VAE.pth"), + ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth"), ], ) state_dict = load_state_dict("models/train/Wan2.1-Fun-V1.1-1.3B-Control_full/epoch-1.safetensors") pipe.dit.load_state_dict(state_dict) -pipe.enable_vram_management() 
video = VideoData("data/example_video_dataset/video1_softedge.mp4", height=480, width=832) video = [video[i] for i in range(81)] diff --git a/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-1.3B-InP.py b/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-1.3B-InP.py index cd8ee20..106d403 100644 --- a/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-1.3B-InP.py +++ b/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-1.3B-InP.py @@ -1,7 +1,8 @@ import torch from PIL import Image -from diffsynth import save_video, VideoData, load_state_dict -from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig +from diffsynth.utils.data import save_video, VideoData +from diffsynth.core import load_state_dict +from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig from modelscope import dataset_snapshot_download @@ -9,15 +10,14 @@ pipe = WanVideoPipeline.from_pretrained( torch_dtype=torch.bfloat16, device="cuda", model_configs=[ - ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-InP", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-InP", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-InP", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-InP", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", offload_device="cpu"), + ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-InP", origin_file_pattern="diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-InP", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"), + ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-InP", origin_file_pattern="Wan2.1_VAE.pth"), + ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-InP", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth"), ], ) state_dict = load_state_dict("models/train/Wan2.1-Fun-V1.1-1.3B-InP_full/epoch-1.safetensors") pipe.dit.load_state_dict(state_dict) -pipe.enable_vram_management() video = VideoData("data/example_video_dataset/video1.mp4", height=480, width=832) diff --git a/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-14B-Control-Camera.py b/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-14B-Control-Camera.py index 2de5102..4a14a5d 100644 --- a/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-14B-Control-Camera.py +++ b/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-14B-Control-Camera.py @@ -1,7 +1,8 @@ import torch from PIL import Image -from diffsynth import save_video, VideoData, load_state_dict -from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig +from diffsynth.utils.data import save_video, VideoData +from diffsynth.core import load_state_dict +from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig from modelscope import dataset_snapshot_download @@ -9,15 +10,14 @@ pipe = WanVideoPipeline.from_pretrained( torch_dtype=torch.bfloat16, device="cuda", model_configs=[ - ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-Control-Camera", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-Control-Camera", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-Control-Camera", 
origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-Control-Camera", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", offload_device="cpu"), + ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-Control-Camera", origin_file_pattern="diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-Control-Camera", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"), + ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-Control-Camera", origin_file_pattern="Wan2.1_VAE.pth"), + ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-Control-Camera", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth"), ], ) state_dict = load_state_dict("models/train/Wan2.1-Fun-V1.1-14B-Control-Camera_full/epoch-1.safetensors") pipe.dit.load_state_dict(state_dict) -pipe.enable_vram_management() video = VideoData("data/example_video_dataset/video1.mp4", height=480, width=832) diff --git a/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-14B-Control.py b/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-14B-Control.py index 0dd2516..dc91697 100644 --- a/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-14B-Control.py +++ b/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-14B-Control.py @@ -1,7 +1,8 @@ import torch from PIL import Image -from diffsynth import save_video, VideoData, load_state_dict -from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig +from diffsynth.utils.data import save_video, VideoData +from diffsynth.core import load_state_dict +from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig from modelscope import dataset_snapshot_download @@ -9,15 +10,14 @@ pipe = WanVideoPipeline.from_pretrained( torch_dtype=torch.bfloat16, device="cuda", model_configs=[ - ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-Control", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-Control", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-Control", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-Control", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", offload_device="cpu"), + ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-Control", origin_file_pattern="diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-Control", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"), + ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-Control", origin_file_pattern="Wan2.1_VAE.pth"), + ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-Control", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth"), ], ) state_dict = load_state_dict("models/train/Wan2.1-Fun-V1.1-14B-Control_full/epoch-1.safetensors") pipe.dit.load_state_dict(state_dict) -pipe.enable_vram_management() video = VideoData("data/example_video_dataset/video1_softedge.mp4", height=480, width=832) video = [video[i] for i in range(81)] diff --git a/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-14B-InP.py b/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-14B-InP.py index 7e944b0..5571b5d 100644 --- a/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-14B-InP.py +++ 
b/examples/wanvideo/model_training/validate_full/Wan2.1-Fun-V1.1-14B-InP.py @@ -1,7 +1,8 @@ import torch from PIL import Image -from diffsynth import save_video, VideoData, load_state_dict -from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig +from diffsynth.utils.data import save_video, VideoData +from diffsynth.core import load_state_dict +from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig from modelscope import dataset_snapshot_download @@ -9,15 +10,14 @@ pipe = WanVideoPipeline.from_pretrained( torch_dtype=torch.bfloat16, device="cuda", model_configs=[ - ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-InP", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-InP", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-InP", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-InP", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", offload_device="cpu"), + ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-InP", origin_file_pattern="diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-InP", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"), + ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-InP", origin_file_pattern="Wan2.1_VAE.pth"), + ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-InP", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth"), ], ) state_dict = load_state_dict("models/train/Wan2.1-Fun-V1.1-14B-InP_full/epoch-1.safetensors") pipe.dit.load_state_dict(state_dict) -pipe.enable_vram_management() video = VideoData("data/example_video_dataset/video1.mp4", height=480, width=832) diff --git a/examples/wanvideo/model_training/validate_full/Wan2.1-I2V-14B-480P.py b/examples/wanvideo/model_training/validate_full/Wan2.1-I2V-14B-480P.py index c1c8615..7562b4c 100644 --- a/examples/wanvideo/model_training/validate_full/Wan2.1-I2V-14B-480P.py +++ b/examples/wanvideo/model_training/validate_full/Wan2.1-I2V-14B-480P.py @@ -1,7 +1,8 @@ import torch from PIL import Image -from diffsynth import save_video, VideoData, load_state_dict -from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig +from diffsynth.utils.data import save_video, VideoData +from diffsynth.core import load_state_dict +from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig from modelscope import dataset_snapshot_download @@ -9,15 +10,14 @@ pipe = WanVideoPipeline.from_pretrained( torch_dtype=torch.bfloat16, device="cuda", model_configs=[ - ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-480P", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"), - ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-480P", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"), - ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-480P", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"), - ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-480P", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", offload_device="cpu"), + ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-480P", origin_file_pattern="diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-480P", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"), + ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-480P", 
origin_file_pattern="Wan2.1_VAE.pth"), + ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-480P", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth"), ], ) state_dict = load_state_dict("models/train/Wan2.1-I2V-14B-480P_full/epoch-1.safetensors") pipe.dit.load_state_dict(state_dict) -pipe.enable_vram_management() input_image = VideoData("data/example_video_dataset/video1.mp4", height=480, width=832)[0] diff --git a/examples/wanvideo/model_training/validate_full/Wan2.1-I2V-14B-720P.py b/examples/wanvideo/model_training/validate_full/Wan2.1-I2V-14B-720P.py index d9d39c8..3ca5a38 100644 --- a/examples/wanvideo/model_training/validate_full/Wan2.1-I2V-14B-720P.py +++ b/examples/wanvideo/model_training/validate_full/Wan2.1-I2V-14B-720P.py @@ -1,7 +1,8 @@ import torch from PIL import Image -from diffsynth import save_video, VideoData, load_state_dict -from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig +from diffsynth.utils.data import save_video, VideoData +from diffsynth.core import load_state_dict +from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig from modelscope import dataset_snapshot_download @@ -9,15 +10,14 @@ pipe = WanVideoPipeline.from_pretrained( torch_dtype=torch.bfloat16, device="cuda", model_configs=[ - ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-720P", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"), - ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-720P", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"), - ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-720P", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"), - ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-720P", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", offload_device="cpu"), + ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-720P", origin_file_pattern="diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-720P", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"), + ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-720P", origin_file_pattern="Wan2.1_VAE.pth"), + ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-720P", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth"), ], ) state_dict = load_state_dict("models/train/Wan2.1-I2V-14B-720P_full/epoch-1.safetensors") pipe.dit.load_state_dict(state_dict) -pipe.enable_vram_management() input_image = VideoData("data/example_video_dataset/video1.mp4", height=720, width=1280)[0] diff --git a/examples/wanvideo/model_training/validate_full/Wan2.1-T2V-1.3B.py b/examples/wanvideo/model_training/validate_full/Wan2.1-T2V-1.3B.py index 1420514..17f87fa 100644 --- a/examples/wanvideo/model_training/validate_full/Wan2.1-T2V-1.3B.py +++ b/examples/wanvideo/model_training/validate_full/Wan2.1-T2V-1.3B.py @@ -1,21 +1,21 @@ import torch from PIL import Image -from diffsynth import save_video, VideoData, load_state_dict -from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig +from diffsynth.utils.data import save_video, VideoData +from diffsynth.core import load_state_dict +from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig pipe = WanVideoPipeline.from_pretrained( torch_dtype=torch.bfloat16, device="cuda", model_configs=[ - ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"), - ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", 
offload_device="cpu"), - ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"), + ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"), + ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="Wan2.1_VAE.pth"), ], ) state_dict = load_state_dict("models/train/Wan2.1-T2V-1.3B_full/epoch-1.safetensors") pipe.dit.load_state_dict(state_dict) -pipe.enable_vram_management() video = pipe( prompt="from sunset to night, a small town, light, house, river", diff --git a/examples/wanvideo/model_training/validate_full/Wan2.1-T2V-14B.py b/examples/wanvideo/model_training/validate_full/Wan2.1-T2V-14B.py index a0107ae..8b9032d 100644 --- a/examples/wanvideo/model_training/validate_full/Wan2.1-T2V-14B.py +++ b/examples/wanvideo/model_training/validate_full/Wan2.1-T2V-14B.py @@ -1,21 +1,21 @@ import torch from PIL import Image -from diffsynth import save_video, VideoData, load_state_dict -from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig +from diffsynth.utils.data import save_video, VideoData +from diffsynth.core import load_state_dict +from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig pipe = WanVideoPipeline.from_pretrained( torch_dtype=torch.bfloat16, device="cuda", model_configs=[ - ModelConfig(model_id="Wan-AI/Wan2.1-T2V-14B", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"), - ModelConfig(model_id="Wan-AI/Wan2.1-T2V-14B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"), - ModelConfig(model_id="Wan-AI/Wan2.1-T2V-14B", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"), + ModelConfig(model_id="Wan-AI/Wan2.1-T2V-14B", origin_file_pattern="diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="Wan-AI/Wan2.1-T2V-14B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"), + ModelConfig(model_id="Wan-AI/Wan2.1-T2V-14B", origin_file_pattern="Wan2.1_VAE.pth"), ], ) state_dict = load_state_dict("models/train/Wan2.1-T2V-14B_full/epoch-1.safetensors") pipe.dit.load_state_dict(state_dict) -pipe.enable_vram_management() video = pipe( prompt="from sunset to night, a small town, light, house, river", diff --git a/examples/wanvideo/model_training/validate_full/Wan2.1-VACE-1.3B-Preview.py b/examples/wanvideo/model_training/validate_full/Wan2.1-VACE-1.3B-Preview.py index a916745..8ba0067 100644 --- a/examples/wanvideo/model_training/validate_full/Wan2.1-VACE-1.3B-Preview.py +++ b/examples/wanvideo/model_training/validate_full/Wan2.1-VACE-1.3B-Preview.py @@ -1,21 +1,21 @@ import torch from PIL import Image -from diffsynth import save_video, VideoData, load_state_dict -from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig +from diffsynth.utils.data import save_video, VideoData +from diffsynth.core import load_state_dict +from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig pipe = WanVideoPipeline.from_pretrained( torch_dtype=torch.bfloat16, device="cuda", model_configs=[ - ModelConfig(model_id="iic/VACE-Wan2.1-1.3B-Preview", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"), - ModelConfig(model_id="iic/VACE-Wan2.1-1.3B-Preview", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"), - ModelConfig(model_id="iic/VACE-Wan2.1-1.3B-Preview", origin_file_pattern="Wan2.1_VAE.pth", 
offload_device="cpu"), + ModelConfig(model_id="iic/VACE-Wan2.1-1.3B-Preview", origin_file_pattern="diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="iic/VACE-Wan2.1-1.3B-Preview", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"), + ModelConfig(model_id="iic/VACE-Wan2.1-1.3B-Preview", origin_file_pattern="Wan2.1_VAE.pth"), ], ) state_dict = load_state_dict("models/train/Wan2.1-VACE-1.3B-Preview_full/epoch-1.safetensors") pipe.vace.load_state_dict(state_dict) -pipe.enable_vram_management() video = VideoData("data/example_video_dataset/video1_softedge.mp4", height=480, width=832) video = [video[i] for i in range(49)] diff --git a/examples/wanvideo/model_training/validate_full/Wan2.1-VACE-1.3B.py b/examples/wanvideo/model_training/validate_full/Wan2.1-VACE-1.3B.py index 5a371e7..add5ff9 100644 --- a/examples/wanvideo/model_training/validate_full/Wan2.1-VACE-1.3B.py +++ b/examples/wanvideo/model_training/validate_full/Wan2.1-VACE-1.3B.py @@ -1,21 +1,21 @@ import torch from PIL import Image -from diffsynth import save_video, VideoData, load_state_dict -from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig +from diffsynth.utils.data import save_video, VideoData +from diffsynth.core import load_state_dict +from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig pipe = WanVideoPipeline.from_pretrained( torch_dtype=torch.bfloat16, device="cuda", model_configs=[ - ModelConfig(model_id="Wan-AI/Wan2.1-VACE-1.3B", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"), - ModelConfig(model_id="Wan-AI/Wan2.1-VACE-1.3B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"), - ModelConfig(model_id="Wan-AI/Wan2.1-VACE-1.3B", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"), + ModelConfig(model_id="Wan-AI/Wan2.1-VACE-1.3B", origin_file_pattern="diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="Wan-AI/Wan2.1-VACE-1.3B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"), + ModelConfig(model_id="Wan-AI/Wan2.1-VACE-1.3B", origin_file_pattern="Wan2.1_VAE.pth"), ], ) state_dict = load_state_dict("models/train/Wan2.1-VACE-1.3B_full/epoch-1.safetensors") pipe.vace.load_state_dict(state_dict) -pipe.enable_vram_management() video = VideoData("data/example_video_dataset/video1_softedge.mp4", height=480, width=832) video = [video[i] for i in range(49)] diff --git a/examples/wanvideo/model_training/validate_full/Wan2.1-VACE-14B.py b/examples/wanvideo/model_training/validate_full/Wan2.1-VACE-14B.py index 5553471..a861fa3 100644 --- a/examples/wanvideo/model_training/validate_full/Wan2.1-VACE-14B.py +++ b/examples/wanvideo/model_training/validate_full/Wan2.1-VACE-14B.py @@ -1,21 +1,21 @@ import torch from PIL import Image -from diffsynth import save_video, VideoData, load_state_dict -from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig +from diffsynth.utils.data import save_video, VideoData +from diffsynth.core import load_state_dict +from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig pipe = WanVideoPipeline.from_pretrained( torch_dtype=torch.bfloat16, device="cuda", model_configs=[ - ModelConfig(model_id="Wan-AI/Wan2.1-VACE-14B", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"), - ModelConfig(model_id="Wan-AI/Wan2.1-VACE-14B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"), - ModelConfig(model_id="Wan-AI/Wan2.1-VACE-14B", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"), 
+ ModelConfig(model_id="Wan-AI/Wan2.1-VACE-14B", origin_file_pattern="diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="Wan-AI/Wan2.1-VACE-14B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"), + ModelConfig(model_id="Wan-AI/Wan2.1-VACE-14B", origin_file_pattern="Wan2.1_VAE.pth"), ], ) state_dict = load_state_dict("models/train/Wan2.1-VACE-14B_full/epoch-1.safetensors") pipe.vace.load_state_dict(state_dict) -pipe.enable_vram_management() video = VideoData("data/example_video_dataset/video1_softedge.mp4", height=480, width=832) video = [video[i] for i in range(17)] diff --git a/examples/wanvideo/model_training/validate_full/Wan2.2-Animate-14B.py b/examples/wanvideo/model_training/validate_full/Wan2.2-Animate-14B.py index d6fbfc1..0cdce06 100644 --- a/examples/wanvideo/model_training/validate_full/Wan2.2-Animate-14B.py +++ b/examples/wanvideo/model_training/validate_full/Wan2.2-Animate-14B.py @@ -1,22 +1,22 @@ import torch from PIL import Image -from diffsynth import save_video, VideoData, load_state_dict -from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig +from diffsynth.utils.data import save_video, VideoData +from diffsynth.core import load_state_dict +from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig pipe = WanVideoPipeline.from_pretrained( torch_dtype=torch.bfloat16, device="cuda", model_configs=[ - ModelConfig(model_id="Wan-AI/Wan2.2-Animate-14B", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"), - ModelConfig(model_id="Wan-AI/Wan2.2-Animate-14B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"), - ModelConfig(model_id="Wan-AI/Wan2.2-Animate-14B", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"), - ModelConfig(model_id="Wan-AI/Wan2.2-Animate-14B", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", offload_device="cpu"), + ModelConfig(model_id="Wan-AI/Wan2.2-Animate-14B", origin_file_pattern="diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="Wan-AI/Wan2.2-Animate-14B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"), + ModelConfig(model_id="Wan-AI/Wan2.2-Animate-14B", origin_file_pattern="Wan2.1_VAE.pth"), + ModelConfig(model_id="Wan-AI/Wan2.2-Animate-14B", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth"), ], ) state_dict = load_state_dict("models/train/Wan2.2-Animate-14B_full/epoch-1.safetensors") pipe.animate_adapter.load_state_dict(state_dict, strict=False) -pipe.enable_vram_management() input_image = VideoData("data/example_video_dataset/animate/animate_output.mp4", height=480, width=832)[0] animate_pose_video = VideoData("data/examples/wan/animate/animate_pose_video.mp4", height=480, width=832).raw_data()[:81-4] diff --git a/examples/wanvideo/model_training/validate_full/Wan2.2-Fun-A14B-Control-Camera.py b/examples/wanvideo/model_training/validate_full/Wan2.2-Fun-A14B-Control-Camera.py index 700d503..ce52086 100644 --- a/examples/wanvideo/model_training/validate_full/Wan2.2-Fun-A14B-Control-Camera.py +++ b/examples/wanvideo/model_training/validate_full/Wan2.2-Fun-A14B-Control-Camera.py @@ -1,7 +1,8 @@ import torch from PIL import Image -from diffsynth import save_video, VideoData, load_state_dict -from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig +from diffsynth.utils.data import save_video, VideoData +from diffsynth.core import load_state_dict +from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig from modelscope import 
dataset_snapshot_download @@ -9,17 +10,16 @@ pipe = WanVideoPipeline.from_pretrained( torch_dtype=torch.bfloat16, device="cuda", model_configs=[ - ModelConfig(model_id="PAI/Wan2.2-Fun-A14B-Control-Camera", origin_file_pattern="high_noise_model/diffusion_pytorch_model*.safetensors", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.2-Fun-A14B-Control-Camera", origin_file_pattern="low_noise_model/diffusion_pytorch_model*.safetensors", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.2-Fun-A14B-Control-Camera", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.2-Fun-A14B-Control-Camera", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"), + ModelConfig(model_id="PAI/Wan2.2-Fun-A14B-Control-Camera", origin_file_pattern="high_noise_model/diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="PAI/Wan2.2-Fun-A14B-Control-Camera", origin_file_pattern="low_noise_model/diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="PAI/Wan2.2-Fun-A14B-Control-Camera", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"), + ModelConfig(model_id="PAI/Wan2.2-Fun-A14B-Control-Camera", origin_file_pattern="Wan2.1_VAE.pth"), ], ) state_dict = load_state_dict("models/train/Wan2.2-Fun-A14B-Control-Camera_high_noise_full/epoch-1.safetensors") pipe.dit.load_state_dict(state_dict) state_dict = load_state_dict("models/train/Wan2.2-Fun-A14B-Control-Camera_low_noise_full/epoch-1.safetensors") pipe.dit2.load_state_dict(state_dict) -pipe.enable_vram_management() video = VideoData("data/example_video_dataset/video1.mp4", height=480, width=832) diff --git a/examples/wanvideo/model_training/validate_full/Wan2.2-Fun-A14B-Control.py b/examples/wanvideo/model_training/validate_full/Wan2.2-Fun-A14B-Control.py index 685b48c..ee32d9c 100644 --- a/examples/wanvideo/model_training/validate_full/Wan2.2-Fun-A14B-Control.py +++ b/examples/wanvideo/model_training/validate_full/Wan2.2-Fun-A14B-Control.py @@ -1,7 +1,8 @@ import torch from PIL import Image -from diffsynth import save_video, VideoData, load_state_dict -from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig +from diffsynth.utils.data import save_video, VideoData +from diffsynth.core import load_state_dict +from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig from modelscope import dataset_snapshot_download @@ -9,17 +10,16 @@ pipe = WanVideoPipeline.from_pretrained( torch_dtype=torch.bfloat16, device="cuda", model_configs=[ - ModelConfig(model_id="PAI/Wan2.2-Fun-A14B-Control", origin_file_pattern="high_noise_model/diffusion_pytorch_model*.safetensors", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.2-Fun-A14B-Control", origin_file_pattern="low_noise_model/diffusion_pytorch_model*.safetensors", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.2-Fun-A14B-Control", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.2-Fun-A14B-Control", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"), + ModelConfig(model_id="PAI/Wan2.2-Fun-A14B-Control", origin_file_pattern="high_noise_model/diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="PAI/Wan2.2-Fun-A14B-Control", origin_file_pattern="low_noise_model/diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="PAI/Wan2.2-Fun-A14B-Control", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"), + ModelConfig(model_id="PAI/Wan2.2-Fun-A14B-Control", origin_file_pattern="Wan2.1_VAE.pth"), ], ) 
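# A consolidated, readable sketch of what the Wan2.2-Fun-A14B-Control validation script
# looks like after this patch, assembled only from calls visible in the surrounding hunks.
# The generation call and the save_video(...) step continue in the hunk below and are
# therefore not reproduced here.
import torch
from diffsynth.utils.data import VideoData
from diffsynth.core import load_state_dict
from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig

pipe = WanVideoPipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=[
        # offload_device="cpu" is dropped from every ModelConfig in this patch.
        ModelConfig(model_id="PAI/Wan2.2-Fun-A14B-Control", origin_file_pattern="high_noise_model/diffusion_pytorch_model*.safetensors"),
        ModelConfig(model_id="PAI/Wan2.2-Fun-A14B-Control", origin_file_pattern="low_noise_model/diffusion_pytorch_model*.safetensors"),
        ModelConfig(model_id="PAI/Wan2.2-Fun-A14B-Control", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"),
        ModelConfig(model_id="PAI/Wan2.2-Fun-A14B-Control", origin_file_pattern="Wan2.1_VAE.pth"),
    ],
)

# Two-expert loading: the high-noise checkpoint goes to pipe.dit, the low-noise one to
# pipe.dit2. The explicit pipe.enable_vram_management() call is removed throughout this patch.
state_dict = load_state_dict("models/train/Wan2.2-Fun-A14B-Control_high_noise_full/epoch-1.safetensors")
pipe.dit.load_state_dict(state_dict)
state_dict = load_state_dict("models/train/Wan2.2-Fun-A14B-Control_low_noise_full/epoch-1.safetensors")
pipe.dit2.load_state_dict(state_dict)

# 81 softedge control frames at 480x832, as in the hunk below.
video = VideoData("data/example_video_dataset/video1_softedge.mp4", height=480, width=832)
video = [video[i] for i in range(81)]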
state_dict = load_state_dict("models/train/Wan2.2-Fun-A14B-Control_high_noise_full/epoch-1.safetensors") pipe.dit.load_state_dict(state_dict) state_dict = load_state_dict("models/train/Wan2.2-Fun-A14B-Control_low_noise_full/epoch-1.safetensors") pipe.dit2.load_state_dict(state_dict) -pipe.enable_vram_management() video = VideoData("data/example_video_dataset/video1_softedge.mp4", height=480, width=832) video = [video[i] for i in range(81)] diff --git a/examples/wanvideo/model_training/validate_full/Wan2.2-Fun-A14B-InP.py b/examples/wanvideo/model_training/validate_full/Wan2.2-Fun-A14B-InP.py index 154c4e4..678fb55 100644 --- a/examples/wanvideo/model_training/validate_full/Wan2.2-Fun-A14B-InP.py +++ b/examples/wanvideo/model_training/validate_full/Wan2.2-Fun-A14B-InP.py @@ -1,7 +1,8 @@ import torch from PIL import Image -from diffsynth import save_video, VideoData, load_state_dict -from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig +from diffsynth.utils.data import save_video, VideoData +from diffsynth.core import load_state_dict +from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig from modelscope import dataset_snapshot_download @@ -9,17 +10,16 @@ pipe = WanVideoPipeline.from_pretrained( torch_dtype=torch.bfloat16, device="cuda", model_configs=[ - ModelConfig(model_id="PAI/Wan2.2-Fun-A14B-InP", origin_file_pattern="high_noise_model/diffusion_pytorch_model*.safetensors", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.2-Fun-A14B-InP", origin_file_pattern="low_noise_model/diffusion_pytorch_model*.safetensors", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.2-Fun-A14B-InP", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.2-Fun-A14B-InP", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"), + ModelConfig(model_id="PAI/Wan2.2-Fun-A14B-InP", origin_file_pattern="high_noise_model/diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="PAI/Wan2.2-Fun-A14B-InP", origin_file_pattern="low_noise_model/diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="PAI/Wan2.2-Fun-A14B-InP", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"), + ModelConfig(model_id="PAI/Wan2.2-Fun-A14B-InP", origin_file_pattern="Wan2.1_VAE.pth"), ], ) state_dict = load_state_dict("models/train/Wan2.2-Fun-A14B-InP_high_noise_full/epoch-1.safetensors") pipe.dit.load_state_dict(state_dict) state_dict = load_state_dict("models/train/Wan2.2-Fun-A14B-InP_low_noise_full/epoch-1.safetensors") pipe.dit2.load_state_dict(state_dict) -pipe.enable_vram_management() video = VideoData("data/example_video_dataset/video1.mp4", height=480, width=832) # First and last frame to video diff --git a/examples/wanvideo/model_training/validate_full/Wan2.2-I2V-A14B.py b/examples/wanvideo/model_training/validate_full/Wan2.2-I2V-A14B.py index 3f6d253..2624341 100644 --- a/examples/wanvideo/model_training/validate_full/Wan2.2-I2V-A14B.py +++ b/examples/wanvideo/model_training/validate_full/Wan2.2-I2V-A14B.py @@ -1,7 +1,8 @@ import torch from PIL import Image -from diffsynth import save_video, VideoData, load_state_dict -from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig +from diffsynth.utils.data import save_video, VideoData +from diffsynth.core import load_state_dict +from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig from modelscope import dataset_snapshot_download @@ -9,17 +10,16 @@ pipe = WanVideoPipeline.from_pretrained( torch_dtype=torch.bfloat16, 
device="cuda", model_configs=[ - ModelConfig(model_id="Wan-AI/Wan2.2-I2V-A14B", origin_file_pattern="high_noise_model/diffusion_pytorch_model*.safetensors", offload_device="cpu"), - ModelConfig(model_id="Wan-AI/Wan2.2-I2V-A14B", origin_file_pattern="low_noise_model/diffusion_pytorch_model*.safetensors", offload_device="cpu"), - ModelConfig(model_id="Wan-AI/Wan2.2-I2V-A14B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"), - ModelConfig(model_id="Wan-AI/Wan2.2-I2V-A14B", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"), + ModelConfig(model_id="Wan-AI/Wan2.2-I2V-A14B", origin_file_pattern="high_noise_model/diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="Wan-AI/Wan2.2-I2V-A14B", origin_file_pattern="low_noise_model/diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="Wan-AI/Wan2.2-I2V-A14B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"), + ModelConfig(model_id="Wan-AI/Wan2.2-I2V-A14B", origin_file_pattern="Wan2.1_VAE.pth"), ], ) state_dict = load_state_dict("models/train/Wan2.2-I2V-A14B_high_noise_full/epoch-1.safetensors") pipe.dit.load_state_dict(state_dict) state_dict = load_state_dict("models/train/Wan2.2-I2V-A14B_low_noise_full/epoch-1.safetensors") pipe.dit2.load_state_dict(state_dict) -pipe.enable_vram_management() input_image = VideoData("data/example_video_dataset/video1.mp4", height=480, width=832)[0] diff --git a/examples/wanvideo/model_training/validate_full/Wan2.2-S2V-14B.py b/examples/wanvideo/model_training/validate_full/Wan2.2-S2V-14B.py index 2df08d2..9e914ff 100644 --- a/examples/wanvideo/model_training/validate_full/Wan2.2-S2V-14B.py +++ b/examples/wanvideo/model_training/validate_full/Wan2.2-S2V-14B.py @@ -2,7 +2,7 @@ import torch from PIL import Image import librosa from diffsynth import VideoData, save_video_with_audio, load_state_dict -from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig +from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig pipe = WanVideoPipeline.from_pretrained( @@ -19,7 +19,6 @@ pipe = WanVideoPipeline.from_pretrained( state_dict = load_state_dict("models/train/Wan2.2-S2V-14B_full/epoch-0.safetensors") pipe.dit.load_state_dict(state_dict, strict=False) -pipe.enable_vram_management() num_frames = 81 # 4n+1 @@ -50,4 +49,4 @@ video = pipe( s2v_pose_video=pose_video, num_inference_steps=40, ) -save_video_with_audio(video[1:], "video_pose_with_audio.mp4", audio_path, fps=16, quality=5) +save_video_with_audio(video[1:], "video_Wan2.2-S2V-14B.mp4", audio_path, fps=16, quality=5) diff --git a/examples/wanvideo/model_training/validate_full/Wan2.2-T2V-A14B.py b/examples/wanvideo/model_training/validate_full/Wan2.2-T2V-A14B.py index be0e000..34453b8 100644 --- a/examples/wanvideo/model_training/validate_full/Wan2.2-T2V-A14B.py +++ b/examples/wanvideo/model_training/validate_full/Wan2.2-T2V-A14B.py @@ -1,24 +1,24 @@ import torch from PIL import Image -from diffsynth import save_video, VideoData, load_state_dict -from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig +from diffsynth.utils.data import save_video, VideoData +from diffsynth.core import load_state_dict +from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig pipe = WanVideoPipeline.from_pretrained( torch_dtype=torch.bfloat16, device="cuda", model_configs=[ - ModelConfig(model_id="Wan-AI/Wan2.2-T2V-A14B", origin_file_pattern="high_noise_model/diffusion_pytorch_model*.safetensors", offload_device="cpu"), - 
ModelConfig(model_id="Wan-AI/Wan2.2-T2V-A14B", origin_file_pattern="low_noise_model/diffusion_pytorch_model*.safetensors", offload_device="cpu"), - ModelConfig(model_id="Wan-AI/Wan2.2-T2V-A14B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"), - ModelConfig(model_id="Wan-AI/Wan2.2-T2V-A14B", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"), + ModelConfig(model_id="Wan-AI/Wan2.2-T2V-A14B", origin_file_pattern="high_noise_model/diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="Wan-AI/Wan2.2-T2V-A14B", origin_file_pattern="low_noise_model/diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="Wan-AI/Wan2.2-T2V-A14B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"), + ModelConfig(model_id="Wan-AI/Wan2.2-T2V-A14B", origin_file_pattern="Wan2.1_VAE.pth"), ], ) state_dict = load_state_dict("models/train/Wan2.2-T2V-A14B_high_noise_full/epoch-1.safetensors") pipe.dit.load_state_dict(state_dict) state_dict = load_state_dict("models/train/Wan2.2-T2V-A14B_low_noise_full/epoch-1.safetensors") pipe.dit2.load_state_dict(state_dict) -pipe.enable_vram_management() video = pipe( prompt="from sunset to night, a small town, light, house, river", diff --git a/examples/wanvideo/model_training/validate_full/Wan2.2-TI2V-5B.py b/examples/wanvideo/model_training/validate_full/Wan2.2-TI2V-5B.py index 8a3d36c..75e108f 100644 --- a/examples/wanvideo/model_training/validate_full/Wan2.2-TI2V-5B.py +++ b/examples/wanvideo/model_training/validate_full/Wan2.2-TI2V-5B.py @@ -1,7 +1,8 @@ import torch from PIL import Image -from diffsynth import save_video, VideoData, load_state_dict -from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig +from diffsynth.utils.data import save_video, VideoData +from diffsynth.core import load_state_dict +from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig from modelscope import dataset_snapshot_download @@ -9,14 +10,13 @@ pipe = WanVideoPipeline.from_pretrained( torch_dtype=torch.bfloat16, device="cuda", model_configs=[ - ModelConfig(model_id="Wan-AI/Wan2.2-TI2V-5B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"), - ModelConfig(model_id="Wan-AI/Wan2.2-TI2V-5B", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"), - ModelConfig(model_id="Wan-AI/Wan2.2-TI2V-5B", origin_file_pattern="Wan2.2_VAE.pth", offload_device="cpu"), + ModelConfig(model_id="Wan-AI/Wan2.2-TI2V-5B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"), + ModelConfig(model_id="Wan-AI/Wan2.2-TI2V-5B", origin_file_pattern="diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="Wan-AI/Wan2.2-TI2V-5B", origin_file_pattern="Wan2.2_VAE.pth"), ], ) state_dict = load_state_dict("models/train/Wan2.2-TI2V-5B_full/epoch-1.safetensors") pipe.dit.load_state_dict(state_dict) -pipe.enable_vram_management() input_image = VideoData("data/example_video_dataset/video1.mp4", height=480, width=832)[0] diff --git a/examples/wanvideo/model_training/validate_full/Wan2.2-VACE-Fun-A14B.py b/examples/wanvideo/model_training/validate_full/Wan2.2-VACE-Fun-A14B.py index e566dba..33ac71a 100644 --- a/examples/wanvideo/model_training/validate_full/Wan2.2-VACE-Fun-A14B.py +++ b/examples/wanvideo/model_training/validate_full/Wan2.2-VACE-Fun-A14B.py @@ -1,24 +1,24 @@ import torch from PIL import Image -from diffsynth import save_video, VideoData, load_state_dict -from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig +from diffsynth.utils.data import save_video, 
VideoData +from diffsynth.core import load_state_dict +from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig pipe = WanVideoPipeline.from_pretrained( torch_dtype=torch.bfloat16, device="cuda", model_configs=[ - ModelConfig(model_id="PAI/Wan2.2-VACE-Fun-A14B", origin_file_pattern="high_noise_model/diffusion_pytorch_model*.safetensors", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.2-VACE-Fun-A14B", origin_file_pattern="low_noise_model/diffusion_pytorch_model*.safetensors", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.2-VACE-Fun-A14B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.2-VACE-Fun-A14B", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"), + ModelConfig(model_id="PAI/Wan2.2-VACE-Fun-A14B", origin_file_pattern="high_noise_model/diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="PAI/Wan2.2-VACE-Fun-A14B", origin_file_pattern="low_noise_model/diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="PAI/Wan2.2-VACE-Fun-A14B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"), + ModelConfig(model_id="PAI/Wan2.2-VACE-Fun-A14B", origin_file_pattern="Wan2.1_VAE.pth"), ], ) state_dict = load_state_dict("models/train/Wan2.2-VACE-Fun-A14B_high_noise_full/epoch-1.safetensors") pipe.vace.load_state_dict(state_dict) state_dict = load_state_dict("models/train/Wan2.2-VACE-Fun-A14B_low_noise_full/epoch-1.safetensors") pipe.vace2.load_state_dict(state_dict) -pipe.enable_vram_management() video = VideoData("data/example_video_dataset/video1_softedge.mp4", height=480, width=832) video = [video[i] for i in range(17)] diff --git a/examples/wanvideo/model_training/validate_full/krea-realtime-video.py b/examples/wanvideo/model_training/validate_full/krea-realtime-video.py index d675773..660ac70 100644 --- a/examples/wanvideo/model_training/validate_full/krea-realtime-video.py +++ b/examples/wanvideo/model_training/validate_full/krea-realtime-video.py @@ -1,21 +1,21 @@ import torch from PIL import Image -from diffsynth import save_video, VideoData, load_state_dict -from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig +from diffsynth.utils.data import save_video, VideoData +from diffsynth.core import load_state_dict +from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig pipe = WanVideoPipeline.from_pretrained( torch_dtype=torch.bfloat16, device="cuda", model_configs=[ - ModelConfig(model_id="krea/krea-realtime-video", origin_file_pattern="krea-realtime-video-14b.safetensors", offload_device="cpu"), - ModelConfig(model_id="Wan-AI/Wan2.1-T2V-14B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"), - ModelConfig(model_id="Wan-AI/Wan2.1-T2V-14B", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"), + ModelConfig(model_id="krea/krea-realtime-video", origin_file_pattern="krea-realtime-video-14b.safetensors"), + ModelConfig(model_id="Wan-AI/Wan2.1-T2V-14B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"), + ModelConfig(model_id="Wan-AI/Wan2.1-T2V-14B", origin_file_pattern="Wan2.1_VAE.pth"), ], ) state_dict = load_state_dict("models/train/krea-realtime-video_full/epoch-1.safetensors") pipe.dit.load_state_dict(state_dict) -pipe.enable_vram_management() # Text-to-video video = pipe( @@ -25,4 +25,4 @@ video = pipe( cfg_scale=1, sigma_shift=20, ) -save_video(video, "output.mp4", fps=15, quality=5) +save_video(video, "video_krea-realtime-video.mp4", fps=15, quality=5) diff --git 
a/examples/wanvideo/model_training/validate_full/run_test.py b/examples/wanvideo/model_training/validate_full/run_test.py deleted file mode 100644 index a4e3203..0000000 --- a/examples/wanvideo/model_training/validate_full/run_test.py +++ /dev/null @@ -1,25 +0,0 @@ -import multiprocessing, os - - -def run_task(scripts, thread_id, thread_num): - for script_id, script in enumerate(scripts): - if script_id % thread_num == thread_id: - log_file_name = script.replace("/", "_") + ".txt" - cmd = f"CUDA_VISIBLE_DEVICES={thread_id} python -u {script} > data/log/{log_file_name} 2>&1" - os.makedirs("data/log", exist_ok=True) - print(cmd, flush=True) - os.system(cmd) - - -if __name__ == "__main__": - scripts = [] - for file_name in os.listdir("examples/wanvideo/model_training/validate_full"): - if file_name != "run_test.py": - scripts.append(os.path.join("examples/wanvideo/model_training/validate_full", file_name)) - - processes = [multiprocessing.Process(target=run_task, args=(scripts, i, 8)) for i in range(8)] - for p in processes: - p.start() - for p in processes: - p.join() - print("Done!") \ No newline at end of file diff --git a/examples/wanvideo/model_training/validate_lora/LongCat-Video.py b/examples/wanvideo/model_training/validate_lora/LongCat-Video.py index 45c1ddb..bf5b5dc 100644 --- a/examples/wanvideo/model_training/validate_lora/LongCat-Video.py +++ b/examples/wanvideo/model_training/validate_lora/LongCat-Video.py @@ -1,20 +1,19 @@ import torch from PIL import Image -from diffsynth import save_video, VideoData -from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig +from diffsynth.utils.data import save_video, VideoData +from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig pipe = WanVideoPipeline.from_pretrained( torch_dtype=torch.bfloat16, device="cuda", model_configs=[ - ModelConfig(model_id="meituan-longcat/LongCat-Video", origin_file_pattern="dit/diffusion_pytorch_model*.safetensors", offload_device="cpu"), - ModelConfig(model_id="Wan-AI/Wan2.1-T2V-14B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"), - ModelConfig(model_id="Wan-AI/Wan2.1-T2V-14B", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"), + ModelConfig(model_id="meituan-longcat/LongCat-Video", origin_file_pattern="dit/diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="Wan-AI/Wan2.1-T2V-14B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"), + ModelConfig(model_id="Wan-AI/Wan2.1-T2V-14B", origin_file_pattern="Wan2.1_VAE.pth"), ], ) pipe.load_lora(pipe.dit, "models/train/LongCat-Video_lora/epoch-4.safetensors", alpha=1) -pipe.enable_vram_management() video = pipe( prompt="from sunset to night, a small town, light, house, river", diff --git a/examples/wanvideo/model_training/validate_lora/Video-As-Prompt-Wan2.1-14B.py b/examples/wanvideo/model_training/validate_lora/Video-As-Prompt-Wan2.1-14B.py index 13da5c0..ee962fc 100644 --- a/examples/wanvideo/model_training/validate_lora/Video-As-Prompt-Wan2.1-14B.py +++ b/examples/wanvideo/model_training/validate_lora/Video-As-Prompt-Wan2.1-14B.py @@ -1,21 +1,21 @@ import torch from PIL import Image -from diffsynth import save_video, VideoData, load_state_dict -from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig +from diffsynth.utils.data import save_video, VideoData +from diffsynth.core import load_state_dict +from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig pipe = WanVideoPipeline.from_pretrained( torch_dtype=torch.bfloat16, device="cuda", 
model_configs=[ - ModelConfig(model_id="ByteDance/Video-As-Prompt-Wan2.1-14B", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors", offload_device="cpu"), - ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-720P", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"), - ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-720P", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"), - ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-720P", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", offload_device="cpu"), + ModelConfig(model_id="ByteDance/Video-As-Prompt-Wan2.1-14B", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-720P", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"), + ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-720P", origin_file_pattern="Wan2.1_VAE.pth"), + ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-720P", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth"), ], ) pipe.load_lora(pipe.dit, "models/train/Video-As-Prompt-Wan2.1-14B_lora/epoch-4.safetensors", alpha=1) -pipe.enable_vram_management() ref_video_path = 'data/example_video_dataset/wanvap/vap_ref.mp4' target_image_path = 'data/example_video_dataset/wanvap/input_image.jpg' diff --git a/examples/wanvideo/model_training/validate_lora/Wan2.1-1.3b-speedcontrol-v1.py b/examples/wanvideo/model_training/validate_lora/Wan2.1-1.3b-speedcontrol-v1.py index 167b871..4137b49 100644 --- a/examples/wanvideo/model_training/validate_lora/Wan2.1-1.3b-speedcontrol-v1.py +++ b/examples/wanvideo/model_training/validate_lora/Wan2.1-1.3b-speedcontrol-v1.py @@ -1,21 +1,21 @@ import torch from PIL import Image -from diffsynth import save_video, VideoData, load_state_dict -from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig +from diffsynth.utils.data import save_video, VideoData +from diffsynth.core import load_state_dict +from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig pipe = WanVideoPipeline.from_pretrained( torch_dtype=torch.bfloat16, device="cuda", model_configs=[ - ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"), - ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"), - ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"), - ModelConfig(model_id="DiffSynth-Studio/Wan2.1-1.3b-speedcontrol-v1", origin_file_pattern="model.safetensors", offload_device="cpu"), + ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"), + ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="Wan2.1_VAE.pth"), + ModelConfig(model_id="DiffSynth-Studio/Wan2.1-1.3b-speedcontrol-v1", origin_file_pattern="model.safetensors"), ], ) pipe.load_lora(pipe.dit, "models/train/Wan2.1-1.3b-speedcontrol-v1_lora/epoch-4.safetensors", alpha=1) -pipe.enable_vram_management() # Text-to-video video = pipe( diff --git a/examples/wanvideo/model_training/validate_lora/Wan2.1-FLF2V-14B-720P.py b/examples/wanvideo/model_training/validate_lora/Wan2.1-FLF2V-14B-720P.py index cd68f0e..62f8c0c 100644 --- a/examples/wanvideo/model_training/validate_lora/Wan2.1-FLF2V-14B-720P.py +++ 
b/examples/wanvideo/model_training/validate_lora/Wan2.1-FLF2V-14B-720P.py @@ -1,7 +1,7 @@ import torch from PIL import Image -from diffsynth import save_video, VideoData -from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig +from diffsynth.utils.data import save_video, VideoData +from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig from modelscope import dataset_snapshot_download @@ -9,14 +9,13 @@ pipe = WanVideoPipeline.from_pretrained( torch_dtype=torch.bfloat16, device="cuda", model_configs=[ - ModelConfig(model_id="Wan-AI/Wan2.1-FLF2V-14B-720P", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"), - ModelConfig(model_id="Wan-AI/Wan2.1-FLF2V-14B-720P", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"), - ModelConfig(model_id="Wan-AI/Wan2.1-FLF2V-14B-720P", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"), - ModelConfig(model_id="Wan-AI/Wan2.1-FLF2V-14B-720P", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", offload_device="cpu"), + ModelConfig(model_id="Wan-AI/Wan2.1-FLF2V-14B-720P", origin_file_pattern="diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="Wan-AI/Wan2.1-FLF2V-14B-720P", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"), + ModelConfig(model_id="Wan-AI/Wan2.1-FLF2V-14B-720P", origin_file_pattern="Wan2.1_VAE.pth"), + ModelConfig(model_id="Wan-AI/Wan2.1-FLF2V-14B-720P", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth"), ], ) pipe.load_lora(pipe.dit, "models/train/Wan2.1-FLF2V-14B-720P_lora/epoch-4.safetensors", alpha=1) -pipe.enable_vram_management() video = VideoData("data/example_video_dataset/video1.mp4", height=480, width=832) diff --git a/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-1.3B-Control.py b/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-1.3B-Control.py index 7270c38..dc7f9dc 100644 --- a/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-1.3B-Control.py +++ b/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-1.3B-Control.py @@ -1,7 +1,7 @@ import torch from PIL import Image -from diffsynth import save_video, VideoData -from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig +from diffsynth.utils.data import save_video, VideoData +from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig from modelscope import dataset_snapshot_download @@ -9,14 +9,13 @@ pipe = WanVideoPipeline.from_pretrained( torch_dtype=torch.bfloat16, device="cuda", model_configs=[ - ModelConfig(model_id="PAI/Wan2.1-Fun-1.3B-Control", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.1-Fun-1.3B-Control", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.1-Fun-1.3B-Control", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.1-Fun-1.3B-Control", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", offload_device="cpu"), + ModelConfig(model_id="PAI/Wan2.1-Fun-1.3B-Control", origin_file_pattern="diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="PAI/Wan2.1-Fun-1.3B-Control", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"), + ModelConfig(model_id="PAI/Wan2.1-Fun-1.3B-Control", origin_file_pattern="Wan2.1_VAE.pth"), + ModelConfig(model_id="PAI/Wan2.1-Fun-1.3B-Control", 
origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth"), ], ) pipe.load_lora(pipe.dit, "models/train/Wan2.1-Fun-1.3B-Control_lora/epoch-4.safetensors", alpha=1) -pipe.enable_vram_management() video = VideoData("data/example_video_dataset/video1_softedge.mp4", height=480, width=832) video = [video[i] for i in range(81)] diff --git a/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-1.3B-InP.py b/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-1.3B-InP.py index c904dfa..c4134a0 100644 --- a/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-1.3B-InP.py +++ b/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-1.3B-InP.py @@ -1,7 +1,7 @@ import torch from PIL import Image -from diffsynth import save_video, VideoData -from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig +from diffsynth.utils.data import save_video, VideoData +from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig from modelscope import dataset_snapshot_download @@ -9,14 +9,13 @@ pipe = WanVideoPipeline.from_pretrained( torch_dtype=torch.bfloat16, device="cuda", model_configs=[ - ModelConfig(model_id="PAI/Wan2.1-Fun-1.3B-InP", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.1-Fun-1.3B-InP", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.1-Fun-1.3B-InP", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.1-Fun-1.3B-InP", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", offload_device="cpu"), + ModelConfig(model_id="PAI/Wan2.1-Fun-1.3B-InP", origin_file_pattern="diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="PAI/Wan2.1-Fun-1.3B-InP", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"), + ModelConfig(model_id="PAI/Wan2.1-Fun-1.3B-InP", origin_file_pattern="Wan2.1_VAE.pth"), + ModelConfig(model_id="PAI/Wan2.1-Fun-1.3B-InP", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth"), ], ) pipe.load_lora(pipe.dit, "models/train/Wan2.1-Fun-1.3B-InP_lora/epoch-4.safetensors", alpha=1) -pipe.enable_vram_management() video = VideoData("data/example_video_dataset/video1.mp4", height=480, width=832) diff --git a/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-14B-Control.py b/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-14B-Control.py index 8631d05..1dfec52 100644 --- a/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-14B-Control.py +++ b/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-14B-Control.py @@ -1,7 +1,7 @@ import torch from PIL import Image -from diffsynth import save_video, VideoData -from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig +from diffsynth.utils.data import save_video, VideoData +from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig from modelscope import dataset_snapshot_download @@ -9,14 +9,13 @@ pipe = WanVideoPipeline.from_pretrained( torch_dtype=torch.bfloat16, device="cuda", model_configs=[ - ModelConfig(model_id="PAI/Wan2.1-Fun-14B-Control", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.1-Fun-14B-Control", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.1-Fun-14B-Control", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"), - 
ModelConfig(model_id="PAI/Wan2.1-Fun-14B-Control", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", offload_device="cpu"), + ModelConfig(model_id="PAI/Wan2.1-Fun-14B-Control", origin_file_pattern="diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="PAI/Wan2.1-Fun-14B-Control", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"), + ModelConfig(model_id="PAI/Wan2.1-Fun-14B-Control", origin_file_pattern="Wan2.1_VAE.pth"), + ModelConfig(model_id="PAI/Wan2.1-Fun-14B-Control", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth"), ], ) pipe.load_lora(pipe.dit, "models/train/Wan2.1-Fun-14B-Control_lora/epoch-4.safetensors", alpha=1) -pipe.enable_vram_management() video = VideoData("data/example_video_dataset/video1_softedge.mp4", height=480, width=832) video = [video[i] for i in range(81)] diff --git a/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-14B-InP.py b/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-14B-InP.py index e020aac..92b90a9 100644 --- a/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-14B-InP.py +++ b/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-14B-InP.py @@ -1,7 +1,7 @@ import torch from PIL import Image -from diffsynth import save_video, VideoData -from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig +from diffsynth.utils.data import save_video, VideoData +from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig from modelscope import dataset_snapshot_download @@ -9,14 +9,13 @@ pipe = WanVideoPipeline.from_pretrained( torch_dtype=torch.bfloat16, device="cuda", model_configs=[ - ModelConfig(model_id="PAI/Wan2.1-Fun-14B-InP", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.1-Fun-14B-InP", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.1-Fun-14B-InP", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.1-Fun-14B-InP", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", offload_device="cpu"), + ModelConfig(model_id="PAI/Wan2.1-Fun-14B-InP", origin_file_pattern="diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="PAI/Wan2.1-Fun-14B-InP", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"), + ModelConfig(model_id="PAI/Wan2.1-Fun-14B-InP", origin_file_pattern="Wan2.1_VAE.pth"), + ModelConfig(model_id="PAI/Wan2.1-Fun-14B-InP", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth"), ], ) pipe.load_lora(pipe.dit, "models/train/Wan2.1-Fun-14B-InP_lora/epoch-4.safetensors", alpha=1) -pipe.enable_vram_management() video = VideoData("data/example_video_dataset/video1.mp4", height=480, width=832) diff --git a/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-1.3B-Control-Camera.py b/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-1.3B-Control-Camera.py index 3023692..d3c44ed 100644 --- a/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-1.3B-Control-Camera.py +++ b/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-1.3B-Control-Camera.py @@ -1,7 +1,8 @@ import torch from PIL import Image -from diffsynth import save_video, VideoData, load_state_dict -from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig +from diffsynth.utils.data import save_video, VideoData +from diffsynth.core import load_state_dict +from 
diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig from modelscope import dataset_snapshot_download @@ -9,14 +10,13 @@ pipe = WanVideoPipeline.from_pretrained( torch_dtype=torch.bfloat16, device="cuda", model_configs=[ - ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", offload_device="cpu"), + ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera", origin_file_pattern="diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"), + ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera", origin_file_pattern="Wan2.1_VAE.pth"), + ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth"), ], ) pipe.load_lora(pipe.dit, "models/train/Wan2.1-Fun-V1.1-1.3B-Control-Camera_lora/epoch-4.safetensors", alpha=1) -pipe.enable_vram_management() video = VideoData("data/example_video_dataset/video1.mp4", height=480, width=832) diff --git a/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-1.3B-Control.py b/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-1.3B-Control.py index ebcfd2f..2a113e7 100644 --- a/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-1.3B-Control.py +++ b/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-1.3B-Control.py @@ -1,7 +1,7 @@ import torch from PIL import Image -from diffsynth import save_video, VideoData -from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig +from diffsynth.utils.data import save_video, VideoData +from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig from modelscope import dataset_snapshot_download @@ -9,14 +9,13 @@ pipe = WanVideoPipeline.from_pretrained( torch_dtype=torch.bfloat16, device="cuda", model_configs=[ - ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", offload_device="cpu"), + ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control", origin_file_pattern="diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"), + ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control", origin_file_pattern="Wan2.1_VAE.pth"), + ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth"), ], ) pipe.load_lora(pipe.dit, "models/train/Wan2.1-Fun-V1.1-1.3B-Control_lora/epoch-4.safetensors", alpha=1) 
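# For the LoRA variants, the pipeline setup is the same, but trained weights are merged
# with pipe.load_lora(...) instead of being loaded via load_state_dict. A minimal sketch,
# assembled only from calls visible in these hunks; the control-specific generation
# arguments are not part of this hunk, so the pipe(...) call is not reproduced here.
import torch
from diffsynth.utils.data import VideoData
from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig

pipe = WanVideoPipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=[
        ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control", origin_file_pattern="diffusion_pytorch_model*.safetensors"),
        ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"),
        ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control", origin_file_pattern="Wan2.1_VAE.pth"),
        ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth"),
    ],
)
# Merge the trained LoRA into the DiT at alpha=1; for the two-expert Wan2.2 A14B LoRAs,
# a second call targets pipe.dit2 with the low-noise checkpoint, as shown later in this patch.
pipe.load_lora(pipe.dit, "models/train/Wan2.1-Fun-V1.1-1.3B-Control_lora/epoch-4.safetensors", alpha=1)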
-pipe.enable_vram_management() video = VideoData("data/example_video_dataset/video1_softedge.mp4", height=480, width=832) video = [video[i] for i in range(81)] diff --git a/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-1.3B-InP.py b/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-1.3B-InP.py index 99eb2b4..8bca71a 100644 --- a/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-1.3B-InP.py +++ b/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-1.3B-InP.py @@ -1,7 +1,7 @@ import torch from PIL import Image -from diffsynth import save_video, VideoData -from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig +from diffsynth.utils.data import save_video, VideoData +from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig from modelscope import dataset_snapshot_download @@ -9,14 +9,13 @@ pipe = WanVideoPipeline.from_pretrained( torch_dtype=torch.bfloat16, device="cuda", model_configs=[ - ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-InP", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-InP", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-InP", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-InP", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", offload_device="cpu"), + ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-InP", origin_file_pattern="diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-InP", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"), + ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-InP", origin_file_pattern="Wan2.1_VAE.pth"), + ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-InP", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth"), ], ) pipe.load_lora(pipe.dit, "models/train/Wan2.1-Fun-V1.1-1.3B-InP_lora/epoch-4.safetensors", alpha=1) -pipe.enable_vram_management() video = VideoData("data/example_video_dataset/video1.mp4", height=480, width=832) diff --git a/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-14B-Control-Camera.py b/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-14B-Control-Camera.py index 0edb2d0..353e19e 100644 --- a/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-14B-Control-Camera.py +++ b/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-14B-Control-Camera.py @@ -1,7 +1,8 @@ import torch from PIL import Image -from diffsynth import save_video, VideoData, load_state_dict -from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig +from diffsynth.utils.data import save_video, VideoData +from diffsynth.core import load_state_dict +from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig from modelscope import dataset_snapshot_download @@ -9,14 +10,13 @@ pipe = WanVideoPipeline.from_pretrained( torch_dtype=torch.bfloat16, device="cuda", model_configs=[ - ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-Control-Camera", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-Control-Camera", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-Control-Camera", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"), - 
ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-Control-Camera", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", offload_device="cpu"), + ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-Control-Camera", origin_file_pattern="diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-Control-Camera", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"), + ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-Control-Camera", origin_file_pattern="Wan2.1_VAE.pth"), + ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-Control-Camera", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth"), ], ) pipe.load_lora(pipe.dit, "models/train/Wan2.1-Fun-V1.1-1.3B-Control-Camera_lora/epoch-4.safetensors", alpha=1) -pipe.enable_vram_management() video = VideoData("data/example_video_dataset/video1.mp4", height=480, width=832) diff --git a/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-14B-Control.py b/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-14B-Control.py index 6b11098..fae3f8c 100644 --- a/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-14B-Control.py +++ b/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-14B-Control.py @@ -1,7 +1,7 @@ import torch from PIL import Image -from diffsynth import save_video, VideoData -from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig +from diffsynth.utils.data import save_video, VideoData +from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig from modelscope import dataset_snapshot_download @@ -9,14 +9,13 @@ pipe = WanVideoPipeline.from_pretrained( torch_dtype=torch.bfloat16, device="cuda", model_configs=[ - ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-Control", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-Control", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-Control", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-Control", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", offload_device="cpu"), + ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-Control", origin_file_pattern="diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-Control", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"), + ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-Control", origin_file_pattern="Wan2.1_VAE.pth"), + ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-Control", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth"), ], ) pipe.load_lora(pipe.dit, "models/train/Wan2.1-Fun-V1.1-14B-Control_lora/epoch-4.safetensors", alpha=1) -pipe.enable_vram_management() video = VideoData("data/example_video_dataset/video1_softedge.mp4", height=480, width=832) video = [video[i] for i in range(81)] diff --git a/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-14B-InP.py b/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-14B-InP.py index 35088fb..6c3298d 100644 --- a/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-14B-InP.py +++ b/examples/wanvideo/model_training/validate_lora/Wan2.1-Fun-V1.1-14B-InP.py @@ -1,7 +1,7 @@ import torch from PIL import Image -from diffsynth import save_video, VideoData -from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig +from 
diffsynth.utils.data import save_video, VideoData +from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig from modelscope import dataset_snapshot_download @@ -9,14 +9,13 @@ pipe = WanVideoPipeline.from_pretrained( torch_dtype=torch.bfloat16, device="cuda", model_configs=[ - ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-InP", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-InP", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-InP", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"), - ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-InP", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", offload_device="cpu"), + ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-InP", origin_file_pattern="diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-InP", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"), + ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-InP", origin_file_pattern="Wan2.1_VAE.pth"), + ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-InP", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth"), ], ) pipe.load_lora(pipe.dit, "models/train/Wan2.1-Fun-V1.1-14B-InP_lora/epoch-4.safetensors", alpha=1) -pipe.enable_vram_management() video = VideoData("data/example_video_dataset/video1.mp4", height=480, width=832) diff --git a/examples/wanvideo/model_training/validate_lora/Wan2.1-I2V-14B-480P.py b/examples/wanvideo/model_training/validate_lora/Wan2.1-I2V-14B-480P.py index 1687e36..fa2d149 100644 --- a/examples/wanvideo/model_training/validate_lora/Wan2.1-I2V-14B-480P.py +++ b/examples/wanvideo/model_training/validate_lora/Wan2.1-I2V-14B-480P.py @@ -1,7 +1,7 @@ import torch from PIL import Image -from diffsynth import save_video, VideoData -from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig +from diffsynth.utils.data import save_video, VideoData +from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig from modelscope import dataset_snapshot_download @@ -9,14 +9,13 @@ pipe = WanVideoPipeline.from_pretrained( torch_dtype=torch.bfloat16, device="cuda", model_configs=[ - ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-480P", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"), - ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-480P", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"), - ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-480P", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"), - ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-480P", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", offload_device="cpu"), + ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-480P", origin_file_pattern="diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-480P", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"), + ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-480P", origin_file_pattern="Wan2.1_VAE.pth"), + ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-480P", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth"), ], ) pipe.load_lora(pipe.dit, "models/train/Wan2.1-I2V-14B-480P_lora/epoch-4.safetensors", alpha=1) -pipe.enable_vram_management() input_image = VideoData("data/example_video_dataset/video1.mp4", height=480, width=832)[0] diff --git 
a/examples/wanvideo/model_training/validate_lora/Wan2.1-I2V-14B-720P.py b/examples/wanvideo/model_training/validate_lora/Wan2.1-I2V-14B-720P.py index cd60f37..aa52055 100644 --- a/examples/wanvideo/model_training/validate_lora/Wan2.1-I2V-14B-720P.py +++ b/examples/wanvideo/model_training/validate_lora/Wan2.1-I2V-14B-720P.py @@ -1,7 +1,7 @@ import torch from PIL import Image -from diffsynth import save_video, VideoData -from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig +from diffsynth.utils.data import save_video, VideoData +from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig from modelscope import dataset_snapshot_download @@ -9,14 +9,13 @@ pipe = WanVideoPipeline.from_pretrained( torch_dtype=torch.bfloat16, device="cuda", model_configs=[ - ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-720P", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"), - ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-720P", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"), - ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-720P", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"), - ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-720P", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", offload_device="cpu"), + ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-720P", origin_file_pattern="diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-720P", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"), + ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-720P", origin_file_pattern="Wan2.1_VAE.pth"), + ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-720P", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth"), ], ) pipe.load_lora(pipe.dit, "models/train/Wan2.1-I2V-14B-720P_lora/epoch-4.safetensors", alpha=1) -pipe.enable_vram_management() input_image = VideoData("data/example_video_dataset/video1.mp4", height=720, width=1280)[0] diff --git a/examples/wanvideo/model_training/validate_lora/Wan2.1-T2V-1.3B.py b/examples/wanvideo/model_training/validate_lora/Wan2.1-T2V-1.3B.py index 7cb6c02..8d316b5 100644 --- a/examples/wanvideo/model_training/validate_lora/Wan2.1-T2V-1.3B.py +++ b/examples/wanvideo/model_training/validate_lora/Wan2.1-T2V-1.3B.py @@ -1,20 +1,19 @@ import torch from PIL import Image -from diffsynth import save_video, VideoData -from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig +from diffsynth.utils.data import save_video, VideoData +from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig pipe = WanVideoPipeline.from_pretrained( torch_dtype=torch.bfloat16, device="cuda", model_configs=[ - ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"), - ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"), - ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"), + ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="diffusion_pytorch_model*.safetensors"), + ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"), + ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="Wan2.1_VAE.pth"), ], ) pipe.load_lora(pipe.dit, "models/train/Wan2.1-T2V-1.3B_lora/epoch-4.safetensors", alpha=1) -pipe.enable_vram_management() video = pipe( prompt="from 
sunset to night, a small town, light, house, river",
diff --git a/examples/wanvideo/model_training/validate_lora/Wan2.1-T2V-14B.py b/examples/wanvideo/model_training/validate_lora/Wan2.1-T2V-14B.py
index 3b66a49..4b2a445 100644
--- a/examples/wanvideo/model_training/validate_lora/Wan2.1-T2V-14B.py
+++ b/examples/wanvideo/model_training/validate_lora/Wan2.1-T2V-14B.py
@@ -1,20 +1,19 @@
 import torch
 from PIL import Image
-from diffsynth import save_video, VideoData
-from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig
+from diffsynth.utils.data import save_video, VideoData
+from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig
 
 pipe = WanVideoPipeline.from_pretrained(
     torch_dtype=torch.bfloat16,
     device="cuda",
     model_configs=[
-        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-14B", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"),
-        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-14B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"),
-        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-14B", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"),
+        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-14B", origin_file_pattern="diffusion_pytorch_model*.safetensors"),
+        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-14B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"),
+        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-14B", origin_file_pattern="Wan2.1_VAE.pth"),
     ],
 )
 pipe.load_lora(pipe.dit, "models/train/Wan2.1-T2V-14B_lora/epoch-4.safetensors", alpha=1)
-pipe.enable_vram_management()
 
 video = pipe(
     prompt="from sunset to night, a small town, light, house, river",
diff --git a/examples/wanvideo/model_training/validate_lora/Wan2.1-VACE-1.3B-Preview.py b/examples/wanvideo/model_training/validate_lora/Wan2.1-VACE-1.3B-Preview.py
index 91cbf92..7a401d8 100644
--- a/examples/wanvideo/model_training/validate_lora/Wan2.1-VACE-1.3B-Preview.py
+++ b/examples/wanvideo/model_training/validate_lora/Wan2.1-VACE-1.3B-Preview.py
@@ -1,20 +1,19 @@
 import torch
 from PIL import Image
-from diffsynth import save_video, VideoData
-from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig
+from diffsynth.utils.data import save_video, VideoData
+from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig
 
 pipe = WanVideoPipeline.from_pretrained(
     torch_dtype=torch.bfloat16,
     device="cuda",
     model_configs=[
-        ModelConfig(model_id="iic/VACE-Wan2.1-1.3B-Preview", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"),
-        ModelConfig(model_id="iic/VACE-Wan2.1-1.3B-Preview", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"),
-        ModelConfig(model_id="iic/VACE-Wan2.1-1.3B-Preview", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"),
+        ModelConfig(model_id="iic/VACE-Wan2.1-1.3B-Preview", origin_file_pattern="diffusion_pytorch_model*.safetensors"),
+        ModelConfig(model_id="iic/VACE-Wan2.1-1.3B-Preview", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"),
+        ModelConfig(model_id="iic/VACE-Wan2.1-1.3B-Preview", origin_file_pattern="Wan2.1_VAE.pth"),
     ],
 )
 pipe.load_lora(pipe.vace, "models/train/Wan2.1-VACE-1.3B-Preview_lora/epoch-4.safetensors", alpha=1)
-pipe.enable_vram_management()
 
 video = VideoData("data/example_video_dataset/video1_softedge.mp4", height=480, width=832)
 video = [video[i] for i in range(49)]
diff --git a/examples/wanvideo/model_training/validate_lora/Wan2.1-VACE-1.3B.py b/examples/wanvideo/model_training/validate_lora/Wan2.1-VACE-1.3B.py
index b5fd203..fe5e19b 100644
--- a/examples/wanvideo/model_training/validate_lora/Wan2.1-VACE-1.3B.py
+++ b/examples/wanvideo/model_training/validate_lora/Wan2.1-VACE-1.3B.py
@@ -1,20 +1,19 @@
 import torch
 from PIL import Image
-from diffsynth import save_video, VideoData
-from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig
+from diffsynth.utils.data import save_video, VideoData
+from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig
 
 pipe = WanVideoPipeline.from_pretrained(
     torch_dtype=torch.bfloat16,
     device="cuda",
     model_configs=[
-        ModelConfig(model_id="Wan-AI/Wan2.1-VACE-1.3B", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"),
-        ModelConfig(model_id="Wan-AI/Wan2.1-VACE-1.3B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"),
-        ModelConfig(model_id="Wan-AI/Wan2.1-VACE-1.3B", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"),
+        ModelConfig(model_id="Wan-AI/Wan2.1-VACE-1.3B", origin_file_pattern="diffusion_pytorch_model*.safetensors"),
+        ModelConfig(model_id="Wan-AI/Wan2.1-VACE-1.3B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"),
+        ModelConfig(model_id="Wan-AI/Wan2.1-VACE-1.3B", origin_file_pattern="Wan2.1_VAE.pth"),
     ],
 )
 pipe.load_lora(pipe.vace, "models/train/Wan2.1-VACE-1.3B_lora/epoch-4.safetensors", alpha=1)
-pipe.enable_vram_management()
 
 video = VideoData("data/example_video_dataset/video1_softedge.mp4", height=480, width=832)
 video = [video[i] for i in range(49)]
diff --git a/examples/wanvideo/model_training/validate_lora/Wan2.1-VACE-14B.py b/examples/wanvideo/model_training/validate_lora/Wan2.1-VACE-14B.py
index bec5df3..5bbe945 100644
--- a/examples/wanvideo/model_training/validate_lora/Wan2.1-VACE-14B.py
+++ b/examples/wanvideo/model_training/validate_lora/Wan2.1-VACE-14B.py
@@ -1,20 +1,19 @@
 import torch
 from PIL import Image
-from diffsynth import save_video, VideoData
-from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig
+from diffsynth.utils.data import save_video, VideoData
+from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig
 
 pipe = WanVideoPipeline.from_pretrained(
     torch_dtype=torch.bfloat16,
     device="cuda",
     model_configs=[
-        ModelConfig(model_id="Wan-AI/Wan2.1-VACE-14B", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"),
-        ModelConfig(model_id="Wan-AI/Wan2.1-VACE-14B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"),
-        ModelConfig(model_id="Wan-AI/Wan2.1-VACE-14B", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"),
+        ModelConfig(model_id="Wan-AI/Wan2.1-VACE-14B", origin_file_pattern="diffusion_pytorch_model*.safetensors"),
+        ModelConfig(model_id="Wan-AI/Wan2.1-VACE-14B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"),
+        ModelConfig(model_id="Wan-AI/Wan2.1-VACE-14B", origin_file_pattern="Wan2.1_VAE.pth"),
     ],
 )
 pipe.load_lora(pipe.vace, "models/train/Wan2.1-VACE-14B_lora/epoch-4.safetensors", alpha=1)
-pipe.enable_vram_management()
 
 video = VideoData("data/example_video_dataset/video1_softedge.mp4", height=480, width=832)
 video = [video[i] for i in range(17)]
diff --git a/examples/wanvideo/model_training/validate_lora/Wan2.2-Animate-14B.py b/examples/wanvideo/model_training/validate_lora/Wan2.2-Animate-14B.py
index 9f6d7c4..79326cd 100644
--- a/examples/wanvideo/model_training/validate_lora/Wan2.2-Animate-14B.py
+++ b/examples/wanvideo/model_training/validate_lora/Wan2.2-Animate-14B.py
@@ -1,21 +1,21 @@
 import torch
 from PIL import Image
-from diffsynth import save_video, VideoData, load_state_dict
-from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig
+from diffsynth.utils.data import save_video, VideoData
+from diffsynth.core import load_state_dict
+from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig
 
 pipe = WanVideoPipeline.from_pretrained(
     torch_dtype=torch.bfloat16,
     device="cuda",
     model_configs=[
-        ModelConfig(model_id="Wan-AI/Wan2.2-Animate-14B", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"),
-        ModelConfig(model_id="Wan-AI/Wan2.2-Animate-14B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"),
-        ModelConfig(model_id="Wan-AI/Wan2.2-Animate-14B", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"),
-        ModelConfig(model_id="Wan-AI/Wan2.2-Animate-14B", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth", offload_device="cpu"),
+        ModelConfig(model_id="Wan-AI/Wan2.2-Animate-14B", origin_file_pattern="diffusion_pytorch_model*.safetensors"),
+        ModelConfig(model_id="Wan-AI/Wan2.2-Animate-14B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"),
+        ModelConfig(model_id="Wan-AI/Wan2.2-Animate-14B", origin_file_pattern="Wan2.1_VAE.pth"),
+        ModelConfig(model_id="Wan-AI/Wan2.2-Animate-14B", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth"),
     ],
 )
 pipe.load_lora(pipe.dit, "models/train/Wan2.2-Animate-14B_lora/epoch-4.safetensors", alpha=1)
-pipe.enable_vram_management()
 
 input_image = VideoData("data/example_video_dataset/animate/animate_output.mp4", height=480, width=832)[0]
 animate_pose_video = VideoData("data/examples/wan/animate/animate_pose_video.mp4", height=480, width=832).raw_data()[:81-4]
diff --git a/examples/wanvideo/model_training/validate_lora/Wan2.2-Fun-A14B-Control-Camera.py b/examples/wanvideo/model_training/validate_lora/Wan2.2-Fun-A14B-Control-Camera.py
index f1e1840..d1967f7 100644
--- a/examples/wanvideo/model_training/validate_lora/Wan2.2-Fun-A14B-Control-Camera.py
+++ b/examples/wanvideo/model_training/validate_lora/Wan2.2-Fun-A14B-Control-Camera.py
@@ -1,7 +1,8 @@
 import torch
 from PIL import Image
-from diffsynth import save_video, VideoData, load_state_dict
-from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig
+from diffsynth.utils.data import save_video, VideoData
+from diffsynth.core import load_state_dict
+from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig
 from modelscope import dataset_snapshot_download
 
 
@@ -9,15 +10,14 @@ pipe = WanVideoPipeline.from_pretrained(
     torch_dtype=torch.bfloat16,
     device="cuda",
     model_configs=[
-        ModelConfig(model_id="PAI/Wan2.2-Fun-A14B-Control-Camera", origin_file_pattern="high_noise_model/diffusion_pytorch_model*.safetensors", offload_device="cpu"),
-        ModelConfig(model_id="PAI/Wan2.2-Fun-A14B-Control-Camera", origin_file_pattern="low_noise_model/diffusion_pytorch_model*.safetensors", offload_device="cpu"),
-        ModelConfig(model_id="PAI/Wan2.2-Fun-A14B-Control-Camera", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"),
-        ModelConfig(model_id="PAI/Wan2.2-Fun-A14B-Control-Camera", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.2-Fun-A14B-Control-Camera", origin_file_pattern="high_noise_model/diffusion_pytorch_model*.safetensors"),
+        ModelConfig(model_id="PAI/Wan2.2-Fun-A14B-Control-Camera", origin_file_pattern="low_noise_model/diffusion_pytorch_model*.safetensors"),
+        ModelConfig(model_id="PAI/Wan2.2-Fun-A14B-Control-Camera", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"),
+        ModelConfig(model_id="PAI/Wan2.2-Fun-A14B-Control-Camera", origin_file_pattern="Wan2.1_VAE.pth"),
     ],
 )
 pipe.load_lora(pipe.dit, "models/train/Wan2.2-Fun-A14B-Control-Camera_high_noise_lora/epoch-4.safetensors", alpha=1)
 pipe.load_lora(pipe.dit2, "models/train/Wan2.2-Fun-A14B-Control-Camera_low_noise_lora/epoch-4.safetensors", alpha=1)
-pipe.enable_vram_management()
 
 video = VideoData("data/example_video_dataset/video1.mp4", height=480, width=832)
diff --git a/examples/wanvideo/model_training/validate_lora/Wan2.2-Fun-A14B-Control.py b/examples/wanvideo/model_training/validate_lora/Wan2.2-Fun-A14B-Control.py
index 11dda5b..0af942f 100644
--- a/examples/wanvideo/model_training/validate_lora/Wan2.2-Fun-A14B-Control.py
+++ b/examples/wanvideo/model_training/validate_lora/Wan2.2-Fun-A14B-Control.py
@@ -1,7 +1,7 @@
 import torch
 from PIL import Image
-from diffsynth import save_video, VideoData
-from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig
+from diffsynth.utils.data import save_video, VideoData
+from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig
 from modelscope import dataset_snapshot_download
 
 
@@ -9,15 +9,14 @@ pipe = WanVideoPipeline.from_pretrained(
     torch_dtype=torch.bfloat16,
     device="cuda",
     model_configs=[
-        ModelConfig(model_id="PAI/Wan2.2-Fun-A14B-Control", origin_file_pattern="high_noise_model/diffusion_pytorch_model*.safetensors", offload_device="cpu"),
-        ModelConfig(model_id="PAI/Wan2.2-Fun-A14B-Control", origin_file_pattern="low_noise_model/diffusion_pytorch_model*.safetensors", offload_device="cpu"),
-        ModelConfig(model_id="PAI/Wan2.2-Fun-A14B-Control", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"),
-        ModelConfig(model_id="PAI/Wan2.2-Fun-A14B-Control", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.2-Fun-A14B-Control", origin_file_pattern="high_noise_model/diffusion_pytorch_model*.safetensors"),
+        ModelConfig(model_id="PAI/Wan2.2-Fun-A14B-Control", origin_file_pattern="low_noise_model/diffusion_pytorch_model*.safetensors"),
+        ModelConfig(model_id="PAI/Wan2.2-Fun-A14B-Control", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"),
+        ModelConfig(model_id="PAI/Wan2.2-Fun-A14B-Control", origin_file_pattern="Wan2.1_VAE.pth"),
     ],
 )
 pipe.load_lora(pipe.dit, "models/train/Wan2.2-Fun-A14B-Control_high_noise_lora/epoch-4.safetensors", alpha=1)
 pipe.load_lora(pipe.dit2, "models/train/Wan2.2-Fun-A14B-Control_low_noise_lora/epoch-4.safetensors", alpha=1)
-pipe.enable_vram_management()
 
 video = VideoData("data/example_video_dataset/video1_softedge.mp4", height=480, width=832)
 video = [video[i] for i in range(81)]
diff --git a/examples/wanvideo/model_training/validate_lora/Wan2.2-Fun-A14B-InP.py b/examples/wanvideo/model_training/validate_lora/Wan2.2-Fun-A14B-InP.py
index 90afa46..0838c41 100644
--- a/examples/wanvideo/model_training/validate_lora/Wan2.2-Fun-A14B-InP.py
+++ b/examples/wanvideo/model_training/validate_lora/Wan2.2-Fun-A14B-InP.py
@@ -1,7 +1,7 @@
 import torch
 from PIL import Image
-from diffsynth import save_video, VideoData
-from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig
+from diffsynth.utils.data import save_video, VideoData
+from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig
 from modelscope import dataset_snapshot_download
 
 
@@ -9,15 +9,14 @@ pipe = WanVideoPipeline.from_pretrained(
     torch_dtype=torch.bfloat16,
     device="cuda",
     model_configs=[
-        ModelConfig(model_id="PAI/Wan2.2-Fun-A14B-InP", origin_file_pattern="high_noise_model/diffusion_pytorch_model*.safetensors", offload_device="cpu"),
-        ModelConfig(model_id="PAI/Wan2.2-Fun-A14B-InP", origin_file_pattern="low_noise_model/diffusion_pytorch_model*.safetensors", offload_device="cpu"),
-        ModelConfig(model_id="PAI/Wan2.2-Fun-A14B-InP", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"),
-        ModelConfig(model_id="PAI/Wan2.2-Fun-A14B-InP", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.2-Fun-A14B-InP", origin_file_pattern="high_noise_model/diffusion_pytorch_model*.safetensors"),
+        ModelConfig(model_id="PAI/Wan2.2-Fun-A14B-InP", origin_file_pattern="low_noise_model/diffusion_pytorch_model*.safetensors"),
+        ModelConfig(model_id="PAI/Wan2.2-Fun-A14B-InP", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"),
+        ModelConfig(model_id="PAI/Wan2.2-Fun-A14B-InP", origin_file_pattern="Wan2.1_VAE.pth"),
     ],
 )
 pipe.load_lora(pipe.dit, "models/train/Wan2.2-Fun-A14B-InP_high_noise_lora/epoch-4.safetensors", alpha=1)
 pipe.load_lora(pipe.dit2, "models/train/Wan2.2-Fun-A14B-InP_low_noise_lora/epoch-4.safetensors", alpha=1)
-pipe.enable_vram_management()
 
 video = VideoData("data/example_video_dataset/video1.mp4", height=480, width=832)
diff --git a/examples/wanvideo/model_training/validate_lora/Wan2.2-I2V-A14B.py b/examples/wanvideo/model_training/validate_lora/Wan2.2-I2V-A14B.py
index f221ef7..ca32a92 100644
--- a/examples/wanvideo/model_training/validate_lora/Wan2.2-I2V-A14B.py
+++ b/examples/wanvideo/model_training/validate_lora/Wan2.2-I2V-A14B.py
@@ -1,7 +1,7 @@
 import torch
 from PIL import Image
-from diffsynth import save_video, VideoData
-from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig
+from diffsynth.utils.data import save_video, VideoData
+from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig
 from modelscope import dataset_snapshot_download
 
 
@@ -9,15 +9,14 @@ pipe = WanVideoPipeline.from_pretrained(
     torch_dtype=torch.bfloat16,
     device="cuda",
     model_configs=[
-        ModelConfig(model_id="Wan-AI/Wan2.2-I2V-A14B", origin_file_pattern="high_noise_model/diffusion_pytorch_model*.safetensors", offload_device="cpu"),
-        ModelConfig(model_id="Wan-AI/Wan2.2-I2V-A14B", origin_file_pattern="low_noise_model/diffusion_pytorch_model*.safetensors", offload_device="cpu"),
-        ModelConfig(model_id="Wan-AI/Wan2.2-I2V-A14B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"),
-        ModelConfig(model_id="Wan-AI/Wan2.2-I2V-A14B", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"),
+        ModelConfig(model_id="Wan-AI/Wan2.2-I2V-A14B", origin_file_pattern="high_noise_model/diffusion_pytorch_model*.safetensors"),
+        ModelConfig(model_id="Wan-AI/Wan2.2-I2V-A14B", origin_file_pattern="low_noise_model/diffusion_pytorch_model*.safetensors"),
+        ModelConfig(model_id="Wan-AI/Wan2.2-I2V-A14B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"),
+        ModelConfig(model_id="Wan-AI/Wan2.2-I2V-A14B", origin_file_pattern="Wan2.1_VAE.pth"),
     ],
 )
 pipe.load_lora(pipe.dit, "models/train/Wan2.2-I2V-A14B_high_noise_lora/epoch-4.safetensors", alpha=1)
 pipe.load_lora(pipe.dit2, "models/train/Wan2.2-I2V-A14B_low_noise_lora/epoch-4.safetensors", alpha=1)
-pipe.enable_vram_management()
 
 input_image = VideoData("data/example_video_dataset/video1.mp4", height=480, width=832)[0]
diff --git a/examples/wanvideo/model_training/validate_lora/Wan2.2-S2V-14B.py b/examples/wanvideo/model_training/validate_lora/Wan2.2-S2V-14B.py
index a6166b9..d936ca3 100644
--- a/examples/wanvideo/model_training/validate_lora/Wan2.2-S2V-14B.py
+++ b/examples/wanvideo/model_training/validate_lora/Wan2.2-S2V-14B.py
@@ -2,7 +2,7 @@ import torch
 from PIL import Image
 import librosa
 from diffsynth import VideoData, save_video_with_audio
-from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig
+from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig
 
 pipe = WanVideoPipeline.from_pretrained(
     torch_dtype=torch.bfloat16,
@@ -17,7 +17,6 @@ pipe = WanVideoPipeline.from_pretrained(
 )
 pipe.load_lora(pipe.dit, "models/train/Wan2.2-S2V-14B_lora/epoch-4.safetensors", alpha=1)
-pipe.enable_vram_management()
 
 num_frames = 81 # 4n+1
@@ -48,4 +47,4 @@ video = pipe(
     s2v_pose_video=pose_video,
     num_inference_steps=40,
 )
-save_video_with_audio(video[1:], "video_pose_with_audio.mp4", audio_path, fps=16, quality=5)
+save_video_with_audio(video[1:], "video_Wan2.2-S2V-14B.mp4", audio_path, fps=16, quality=5)
diff --git a/examples/wanvideo/model_training/validate_lora/Wan2.2-T2V-A14B.py b/examples/wanvideo/model_training/validate_lora/Wan2.2-T2V-A14B.py
index ab43927..f2fe595 100644
--- a/examples/wanvideo/model_training/validate_lora/Wan2.2-T2V-A14B.py
+++ b/examples/wanvideo/model_training/validate_lora/Wan2.2-T2V-A14B.py
@@ -1,7 +1,7 @@
 import torch
 from PIL import Image
-from diffsynth import save_video, VideoData
-from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig
+from diffsynth.utils.data import save_video, VideoData
+from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig
 from modelscope import dataset_snapshot_download
 
 
@@ -9,15 +9,14 @@ pipe = WanVideoPipeline.from_pretrained(
     torch_dtype=torch.bfloat16,
     device="cuda",
     model_configs=[
-        ModelConfig(model_id="Wan-AI/Wan2.2-T2V-A14B", origin_file_pattern="high_noise_model/diffusion_pytorch_model*.safetensors", offload_device="cpu"),
-        ModelConfig(model_id="Wan-AI/Wan2.2-T2V-A14B", origin_file_pattern="low_noise_model/diffusion_pytorch_model*.safetensors", offload_device="cpu"),
-        ModelConfig(model_id="Wan-AI/Wan2.2-T2V-A14B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"),
-        ModelConfig(model_id="Wan-AI/Wan2.2-T2V-A14B", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"),
+        ModelConfig(model_id="Wan-AI/Wan2.2-T2V-A14B", origin_file_pattern="high_noise_model/diffusion_pytorch_model*.safetensors"),
+        ModelConfig(model_id="Wan-AI/Wan2.2-T2V-A14B", origin_file_pattern="low_noise_model/diffusion_pytorch_model*.safetensors"),
+        ModelConfig(model_id="Wan-AI/Wan2.2-T2V-A14B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"),
+        ModelConfig(model_id="Wan-AI/Wan2.2-T2V-A14B", origin_file_pattern="Wan2.1_VAE.pth"),
     ],
 )
 pipe.load_lora(pipe.dit, "models/train/Wan2.2-T2V-A14B_high_noise_lora/epoch-4.safetensors", alpha=1)
 pipe.load_lora(pipe.dit2, "models/train/Wan2.2-T2V-A14B_low_noise_lora/epoch-4.safetensors", alpha=1)
-pipe.enable_vram_management()
 
 video = pipe(
     prompt="from sunset to night, a small town, light, house, river",
diff --git a/examples/wanvideo/model_training/validate_lora/Wan2.2-TI2V-5B.py b/examples/wanvideo/model_training/validate_lora/Wan2.2-TI2V-5B.py
index e5b16c8..f8f6f23 100644
--- a/examples/wanvideo/model_training/validate_lora/Wan2.2-TI2V-5B.py
+++ b/examples/wanvideo/model_training/validate_lora/Wan2.2-TI2V-5B.py
@@ -1,7 +1,8 @@
 import torch
 from PIL import Image
-from diffsynth import save_video, VideoData, load_state_dict
-from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig
+from diffsynth.utils.data import save_video, VideoData
+from diffsynth.core import load_state_dict
+from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig
 from modelscope import dataset_snapshot_download
 
 
@@ -9,13 +10,12 @@ pipe = WanVideoPipeline.from_pretrained(
     torch_dtype=torch.bfloat16,
     device="cuda",
     model_configs=[
-        ModelConfig(model_id="Wan-AI/Wan2.2-TI2V-5B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"),
-        ModelConfig(model_id="Wan-AI/Wan2.2-TI2V-5B", origin_file_pattern="diffusion_pytorch_model*.safetensors", offload_device="cpu"),
-        ModelConfig(model_id="Wan-AI/Wan2.2-TI2V-5B", origin_file_pattern="Wan2.2_VAE.pth", offload_device="cpu"),
+        ModelConfig(model_id="Wan-AI/Wan2.2-TI2V-5B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"),
+        ModelConfig(model_id="Wan-AI/Wan2.2-TI2V-5B", origin_file_pattern="diffusion_pytorch_model*.safetensors"),
+        ModelConfig(model_id="Wan-AI/Wan2.2-TI2V-5B", origin_file_pattern="Wan2.2_VAE.pth"),
     ],
 )
 pipe.load_lora(pipe.dit, "models/train/Wan2.2-TI2V-5B_lora/epoch-4.safetensors", alpha=1)
-pipe.enable_vram_management()
 
 input_image = VideoData("data/example_video_dataset/video1.mp4", height=480, width=832)[0]
diff --git a/examples/wanvideo/model_training/validate_lora/Wan2.2-VACE-Fun-A14B.py b/examples/wanvideo/model_training/validate_lora/Wan2.2-VACE-Fun-A14B.py
index b6e6aff..6270249 100644
--- a/examples/wanvideo/model_training/validate_lora/Wan2.2-VACE-Fun-A14B.py
+++ b/examples/wanvideo/model_training/validate_lora/Wan2.2-VACE-Fun-A14B.py
@@ -1,22 +1,21 @@
 import torch
 from PIL import Image
-from diffsynth import save_video, VideoData
-from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig
+from diffsynth.utils.data import save_video, VideoData
+from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig
 
 pipe = WanVideoPipeline.from_pretrained(
     torch_dtype=torch.bfloat16,
     device="cuda",
     model_configs=[
-        ModelConfig(model_id="PAI/Wan2.2-VACE-Fun-A14B", origin_file_pattern="high_noise_model/diffusion_pytorch_model*.safetensors", offload_device="cpu"),
-        ModelConfig(model_id="PAI/Wan2.2-VACE-Fun-A14B", origin_file_pattern="low_noise_model/diffusion_pytorch_model*.safetensors", offload_device="cpu"),
-        ModelConfig(model_id="PAI/Wan2.2-VACE-Fun-A14B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"),
-        ModelConfig(model_id="PAI/Wan2.2-VACE-Fun-A14B", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"),
+        ModelConfig(model_id="PAI/Wan2.2-VACE-Fun-A14B", origin_file_pattern="high_noise_model/diffusion_pytorch_model*.safetensors"),
+        ModelConfig(model_id="PAI/Wan2.2-VACE-Fun-A14B", origin_file_pattern="low_noise_model/diffusion_pytorch_model*.safetensors"),
+        ModelConfig(model_id="PAI/Wan2.2-VACE-Fun-A14B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"),
+        ModelConfig(model_id="PAI/Wan2.2-VACE-Fun-A14B", origin_file_pattern="Wan2.1_VAE.pth"),
     ],
 )
 pipe.load_lora(pipe.vace, "models/train/Wan2.2-VACE-Fun-A14B_high_noise_lora/epoch-4.safetensors", alpha=1)
 pipe.load_lora(pipe.vace2, "models/train/Wan2.2-VACE-Fun-A14B_low_noise_lora/epoch-4.safetensors", alpha=1)
-pipe.enable_vram_management()
 
 video = VideoData("data/example_video_dataset/video1_softedge.mp4", height=480, width=832)
 video = [video[i] for i in range(17)]
diff --git a/examples/wanvideo/model_training/validate_lora/krea-realtime-video.py b/examples/wanvideo/model_training/validate_lora/krea-realtime-video.py
index 0742d5b..8aa44c6 100644
--- a/examples/wanvideo/model_training/validate_lora/krea-realtime-video.py
+++ b/examples/wanvideo/model_training/validate_lora/krea-realtime-video.py
@@ -1,21 +1,21 @@
 import torch
 from PIL import Image
-from diffsynth import save_video, VideoData, load_state_dict
-from diffsynth.pipelines.wan_video_new import WanVideoPipeline, ModelConfig
+from diffsynth.utils.data import save_video, VideoData
+from diffsynth.core import load_state_dict
+from diffsynth.pipelines.wan_video import WanVideoPipeline, ModelConfig
 
 pipe = WanVideoPipeline.from_pretrained(
     torch_dtype=torch.bfloat16,
     device="cuda",
     model_configs=[
-        ModelConfig(model_id="krea/krea-realtime-video", origin_file_pattern="krea-realtime-video-14b.safetensors", offload_device="cpu"),
-        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-14B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth", offload_device="cpu"),
-        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-14B", origin_file_pattern="Wan2.1_VAE.pth", offload_device="cpu"),
+        ModelConfig(model_id="krea/krea-realtime-video", origin_file_pattern="krea-realtime-video-14b.safetensors"),
+        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-14B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth"),
+        ModelConfig(model_id="Wan-AI/Wan2.1-T2V-14B", origin_file_pattern="Wan2.1_VAE.pth"),
     ],
 )
 pipe.load_lora(pipe.dit, "models/train/krea-realtime-video_lora/epoch-4.safetensors", alpha=1)
-pipe.enable_vram_management()
 
 # Text-to-video
 video = pipe(
@@ -25,4 +25,4 @@ video = pipe(
     cfg_scale=1,
     sigma_shift=20,
 )
-save_video(video, "output.mp4", fps=15, quality=5)
+save_video(video, "video_krea-realtime-video.mp4", fps=15, quality=5)
diff --git a/examples/wanvideo/model_training/validate_lora/run_test.py b/examples/wanvideo/model_training/validate_lora/run_test.py
deleted file mode 100644
index 367ee9d..0000000
--- a/examples/wanvideo/model_training/validate_lora/run_test.py
+++ /dev/null
@@ -1,25 +0,0 @@
-import multiprocessing, os
-
-
-def run_task(scripts, thread_id, thread_num):
-    for script_id, script in enumerate(scripts):
-        if script_id % thread_num == thread_id:
-            log_file_name = script.replace("/", "_") + ".txt"
-            cmd = f"CUDA_VISIBLE_DEVICES={thread_id} python -u {script} > data/log/{log_file_name} 2>&1"
-            os.makedirs("data/log", exist_ok=True)
-            print(cmd, flush=True)
-            os.system(cmd)
-
-
-if __name__ == "__main__":
-    scripts = []
-    for file_name in os.listdir("examples/wanvideo/model_training/validate_lora"):
-        if file_name != "run_test.py":
-            scripts.append(os.path.join("examples/wanvideo/model_training/validate_lora", file_name))
-
-    processes = [multiprocessing.Process(target=run_task, args=(scripts, i, 8)) for i in range(8)]
-    for p in processes:
-        p.start()
-    for p in processes:
-        p.join()
-    print("Done!")
\ No newline at end of file