From d3d3556ff6440956d84244447cbdf07b4c7b6e28 Mon Sep 17 00:00:00 2001
From: Zhongjie Duan <35051019+Artiprocher@users.noreply.github.com>
Date: Fri, 28 Feb 2025 10:09:48 +0800
Subject: [PATCH 1/3] Update README.md

---
 examples/wanvideo/README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/examples/wanvideo/README.md b/examples/wanvideo/README.md
index 16e4f5d..ecfc536 100644
--- a/examples/wanvideo/README.md
+++ b/examples/wanvideo/README.md
@@ -44,6 +44,8 @@ https://github.com/user-attachments/assets/3908bc64-d451-485a-8b61-28f6d32dd92f
 
 Wan-Video-14B-I2V adds the functionality of image-to-video based on Wan-Video-14B-T2V. The model size remains the same, therefore the speed and VRAM requirements are also consistent. See [`./wan_14b_image_to_video.py`](./wan_14b_image_to_video.py).
 
+**In the sample code, we use the same settings as the T2V 14B model, with FP8 quantization enabled by default. However, we found that this model is more sensitive to precision. If the generated video shows problems such as artifacts, switch to bfloat16 precision and use the `num_persistent_param_in_dit` parameter to control VRAM usage.**
+
 ![Image](https://github.com/user-attachments/assets/adf8047f-7943-4aaa-a555-2b32dc415f39)
 
 https://github.com/user-attachments/assets/c0bdd5ca-292f-45ed-b9bc-afe193156e75

From b6c3d2b74a9bde1b362b104f2b1ab20895676d04 Mon Sep 17 00:00:00 2001
From: ZeYi Lin <944270057@qq.com>
Date: Fri, 28 Feb 2025 12:51:58 +0800
Subject: [PATCH 2/3] fix: logger

---
 diffsynth/trainers/text_to_image.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/diffsynth/trainers/text_to_image.py b/diffsynth/trainers/text_to_image.py
index d55d792..7b2690f 100644
--- a/diffsynth/trainers/text_to_image.py
+++ b/diffsynth/trainers/text_to_image.py
@@ -294,7 +294,7 @@ def launch_training_task(model, args):
             mode=args.swanlab_mode,
             logdir=args.output_path,
         )
-        logger = [swanlab_config]
+        logger = [swanlab_logger]
     else:
         logger = []
 

From 61a30673c2530c98fff6751ff92ce74974df88a6 Mon Sep 17 00:00:00 2001
From: Artiprocher <wangye87v5@hotmail.com>
Date: Fri, 28 Feb 2025 15:50:07 +0800
Subject: [PATCH 3/3] align wan tokenizer to official

---
 diffsynth/models/wan_video_vae.py   | 1 +
 diffsynth/prompters/wan_prompter.py | 5 +++++
 requirements.txt                    | 1 +
 3 files changed, 7 insertions(+)

diff --git a/diffsynth/models/wan_video_vae.py b/diffsynth/models/wan_video_vae.py
index ebbee9d..01b5484 100644
--- a/diffsynth/models/wan_video_vae.py
+++ b/diffsynth/models/wan_video_vae.py
@@ -785,6 +785,7 @@ class WanVideoVAE(nn.Module):
                 video = self.single_decode(hidden_state, device)
             video = video.squeeze(0)
             videos.append(video)
+        videos = torch.stack(videos)
         return videos
 
 
diff --git a/diffsynth/prompters/wan_prompter.py b/diffsynth/prompters/wan_prompter.py
index d2c578d..f8c924a 100644
--- a/diffsynth/prompters/wan_prompter.py
+++ b/diffsynth/prompters/wan_prompter.py
@@ -2,20 +2,24 @@ from .base_prompter import BasePrompter
 from ..models.wan_video_text_encoder import WanTextEncoder
 from transformers import AutoTokenizer
 import os, torch
+import ftfy
 import html
 import string
 import regex as re
 
 
 def basic_clean(text):
+    text = ftfy.fix_text(text)
     text = html.unescape(html.unescape(text))
     return text.strip()
 
+
 def whitespace_clean(text):
     text = re.sub(r'\s+', ' ', text)
     text = text.strip()
     return text
 
+
 def canonicalize(text, keep_punctuation_exact_string=None):
     text = text.replace('_', ' ')
     if keep_punctuation_exact_string:
@@ -28,6 +32,7 @@ def canonicalize(text, keep_punctuation_exact_string=None):
     text = re.sub(r'\s+', ' ', text)
     return text.strip()
 
+
 class HuggingfaceTokenizer:
 
     def __init__(self, name, seq_len=None, clean=None, **kwargs):
diff --git a/requirements.txt b/requirements.txt
index 2e958ea..63a871b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,3 +10,4 @@ einops
 sentencepiece
 protobuf
 modelscope
+ftfy