Merge branch 'modelscope:main' into main

Furkan Gözükara, 2025-03-13 14:22:34 +03:00 (committed by GitHub)
2 changed files with 52 additions and 9 deletions


@@ -157,6 +157,8 @@ CUDA_VISIBLE_DEVICES="0" python examples/wanvideo/train_wan_t2v.py \
If you wish to train the 14B model, separate the safetensors file paths with commas. For example: `models/Wan-AI/Wan2.1-T2V-14B/diffusion_pytorch_model-00001-of-00006.safetensors,models/Wan-AI/Wan2.1-T2V-14B/diffusion_pytorch_model-00002-of-00006.safetensors,models/Wan-AI/Wan2.1-T2V-14B/diffusion_pytorch_model-00003-of-00006.safetensors,models/Wan-AI/Wan2.1-T2V-14B/diffusion_pytorch_model-00004-of-00006.safetensors,models/Wan-AI/Wan2.1-T2V-14B/diffusion_pytorch_model-00005-of-00006.safetensors,models/Wan-AI/Wan2.1-T2V-14B/diffusion_pytorch_model-00006-of-00006.safetensors`.
If you wish to train the image-to-video model, add the extra parameter `--image_encoder_path "models/Wan-AI/Wan2.1-I2V-14B-480P/models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth"`.
For LoRA training, the Wan-1.3B-T2V model requires 16 GB of VRAM to process 81 frames at 480P, while the Wan-14B-T2V model requires 60 GB of VRAM for the same configuration. To further reduce VRAM requirements by 20%-30%, add the parameter `--use_gradient_checkpointing_offload`.
Step 5: Test


@@ -7,11 +7,12 @@ from diffsynth import WanVideoPipeline, ModelManager, load_state_dict
from peft import LoraConfig, inject_adapter_in_model
import torchvision
from PIL import Image
import numpy as np


class TextVideoDataset(torch.utils.data.Dataset):
    def __init__(self, base_path, metadata_path, max_num_frames=81, frame_interval=1, num_frames=81, height=480, width=832):
    def __init__(self, base_path, metadata_path, max_num_frames=81, frame_interval=1, num_frames=81, height=480, width=832, is_i2v=False):
        metadata = pd.read_csv(metadata_path)
        self.path = [os.path.join(base_path, "train", file_name) for file_name in metadata["file_name"]]
        self.text = metadata["text"].to_list()
@@ -21,6 +22,7 @@ class TextVideoDataset(torch.utils.data.Dataset):
        self.num_frames = num_frames
        self.height = height
        self.width = width
        self.is_i2v = is_i2v

        self.frame_process = v2.Compose([
            v2.CenterCrop(size=(height, width)),
@@ -48,10 +50,13 @@ class TextVideoDataset(torch.utils.data.Dataset):
            return None

        frames = []
        first_frame = None
        for frame_id in range(num_frames):
            frame = reader.get_data(start_frame_id + frame_id * interval)
            frame = Image.fromarray(frame)
            frame = self.crop_and_resize(frame)
            if first_frame is None:
                first_frame = np.array(frame)
            frame = frame_process(frame)
            frames.append(frame)
        reader.close()
@@ -59,7 +64,10 @@ class TextVideoDataset(torch.utils.data.Dataset):
        frames = torch.stack(frames, dim=0)
        frames = rearrange(frames, "T C H W -> C T H W")
        return frames
        if self.is_i2v:
            return frames, first_frame
        else:
            return frames

    def load_video(self, file_path):
@@ -78,6 +86,7 @@ class TextVideoDataset(torch.utils.data.Dataset):
    def load_image(self, file_path):
        frame = Image.open(file_path).convert("RGB")
        frame = self.crop_and_resize(frame)
        first_frame = frame
        frame = self.frame_process(frame)
        frame = rearrange(frame, "C H W -> C 1 H W")
        return frame
@@ -87,10 +96,16 @@ class TextVideoDataset(torch.utils.data.Dataset):
        text = self.text[data_id]
        path = self.path[data_id]
        if self.is_image(path):
            if self.is_i2v:
                raise ValueError(f"{path} is not a video. I2V model doesn't support image-to-image training.")
            video = self.load_image(path)
        else:
            video = self.load_video(path)
        data = {"text": text, "video": video, "path": path}
        if self.is_i2v:
            video, first_frame = video
            data = {"text": text, "video": video, "path": path, "first_frame": first_frame}
        else:
            data = {"text": text, "video": video, "path": path}
        return data
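
For context, a minimal usage sketch of the updated dataset (not part of the commit; the dataset root and metadata CSV paths are assumptions). With `is_i2v=True`, each item carries the raw first frame alongside the processed video tensor so the data-process stage can encode it with the image encoder later.

```python
# Minimal sketch, assuming a dataset laid out as train/<file_name> plus a metadata CSV
# with "file_name" and "text" columns, as the dataset class expects.
dataset = TextVideoDataset(
    base_path="data/example_dataset",                   # assumed dataset root
    metadata_path="data/example_dataset/metadata.csv",  # assumed metadata file
    num_frames=81, height=480, width=832,
    is_i2v=True,
)
sample = dataset[0]
print(sample["video"].shape)        # processed clip, shape (C, T, H, W)
print(sample["first_frame"].shape)  # raw first frame as an (H, W, 3) uint8 array
print(sample["text"], sample["path"])
```
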
@@ -100,22 +115,35 @@ class TextVideoDataset(torch.utils.data.Dataset):
class LightningModelForDataProcess(pl.LightningModule):
    def __init__(self, text_encoder_path, vae_path, tiled=False, tile_size=(34, 34), tile_stride=(18, 16)):
    def __init__(self, text_encoder_path, vae_path, image_encoder_path=None, tiled=False, tile_size=(34, 34), tile_stride=(18, 16)):
        super().__init__()
        model_path = [text_encoder_path, vae_path]
        if image_encoder_path is not None:
            model_path.append(image_encoder_path)
        model_manager = ModelManager(torch_dtype=torch.bfloat16, device="cpu")
        model_manager.load_models([text_encoder_path, vae_path])
        model_manager.load_models(model_path)
        self.pipe = WanVideoPipeline.from_model_manager(model_manager)

        self.tiler_kwargs = {"tiled": tiled, "tile_size": tile_size, "tile_stride": tile_stride}

    def test_step(self, batch, batch_idx):
        text, video, path = batch["text"][0], batch["video"], batch["path"][0]
        self.pipe.device = self.device
        if video is not None:
            # prompt
            prompt_emb = self.pipe.encode_prompt(text)
            # video
            video = video.to(dtype=self.pipe.torch_dtype, device=self.pipe.device)
            latents = self.pipe.encode_video(video, **self.tiler_kwargs)[0]
            data = {"latents": latents, "prompt_emb": prompt_emb}
            # image
            if "first_frame" in batch:
                first_frame = Image.fromarray(batch["first_frame"][0].cpu().numpy())
                _, _, num_frames, height, width = video.shape
                image_emb = self.pipe.encode_image(first_frame, num_frames, height, width)
            else:
                image_emb = {}
            data = {"latents": latents, "prompt_emb": prompt_emb, "image_emb": image_emb}
            torch.save(data, path + ".tensors.pth")
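
After the data-process stage, each clip has a cached tensor file saved next to it; a quick way to check what was written is sketched below (the file name is hypothetical, and shapes depend on the model configuration).

```python
import torch

# Hypothetical file name; the script saves "<original video path>.tensors.pth" per clip.
data = torch.load("data/example_dataset/train/clip_0001.mp4.tensors.pth", map_location="cpu")
print(data["latents"].shape)      # VAE latents of the clip
print(data["prompt_emb"].keys())  # text-encoder outputs, including "context"
print(data["image_emb"].keys())   # {} for T2V; image-conditioning entries for I2V
```
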
@@ -224,7 +252,12 @@ class LightningModelForTrain(pl.LightningModule):
        latents = batch["latents"].to(self.device)
        prompt_emb = batch["prompt_emb"]
        prompt_emb["context"] = prompt_emb["context"][0].to(self.device)
        image_emb = batch["image_emb"]
        if "clip_feature" in image_emb:
            image_emb["clip_feature"] = image_emb["clip_feature"][0].to(self.device)
        if "y" in image_emb:
            image_emb["y"] = image_emb["y"][0].to(self.device)

        # Loss
        self.pipe.device = self.device
        noise = torch.randn_like(latents)
@@ -236,7 +269,7 @@ class LightningModelForTrain(pl.LightningModule):
        # Compute loss
        noise_pred = self.pipe.denoising_model()(
            noisy_latents, timestep=timestep, **prompt_emb, **extra_input,
            noisy_latents, timestep=timestep, **prompt_emb, **extra_input, **image_emb,
            use_gradient_checkpointing=self.use_gradient_checkpointing,
            use_gradient_checkpointing_offload=self.use_gradient_checkpointing_offload
        )
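
The same call now serves both modes because of plain keyword unpacking: for T2V clips the cached `image_emb` is an empty dict, so `**image_emb` contributes nothing, while for I2V clips it passes through the `clip_feature` and `y` tensors moved to the device above. A tiny self-contained illustration (the `fake_model` function below is hypothetical, not the repository's denoising model):

```python
def fake_model(latents, timestep=None, context=None, clip_feature=None, y=None, **kwargs):
    # Returns True only when image conditioning arrived through keyword unpacking.
    return clip_feature is not None or y is not None

t2v_image_emb = {}                               # what the cache holds for a T2V clip
i2v_image_emb = {"clip_feature": 0.0, "y": 0.0}  # keys the I2V branch caches
print(fake_model(None, timestep=0, **t2v_image_emb))  # False
print(fake_model(None, timestep=0, **i2v_image_emb))  # True
```
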
@@ -296,6 +329,12 @@ def parse_args():
        default=None,
        help="Path of text encoder.",
    )
    parser.add_argument(
        "--image_encoder_path",
        type=str,
        default=None,
        help="Path of image encoder.",
    )
    parser.add_argument(
        "--vae_path",
        type=str,
@@ -466,7 +505,8 @@ def data_process(args):
        frame_interval=1,
        num_frames=args.num_frames,
        height=args.height,
        width=args.width
        width=args.width,
        is_i2v=args.image_encoder_path is not None
    )
    dataloader = torch.utils.data.DataLoader(
        dataset,
@@ -476,6 +516,7 @@ def data_process(args):
    )
    model = LightningModelForDataProcess(
        text_encoder_path=args.text_encoder_path,
        image_encoder_path=args.image_encoder_path,
        vae_path=args.vae_path,
        tiled=args.tiled,
        tile_size=(args.tile_size_height, args.tile_size_width),