Align Wan tokenizer text cleaning with the official implementation

2026-03-23 17:38:10 +00:00 · 2025-02-28 15:50:07 +08:00
parent 6fa8dbe077
commit 61a30673c2
3 changed files with 7 additions and 0 deletions
--- a/diffsynth/prompters/wan_prompter.py
+++ b/diffsynth/prompters/wan_prompter.py
@@ -2,20 +2,24 @@ from .base_prompter import BasePrompter
 from ..models.wan_video_text_encoder import WanTextEncoder
 from transformers import AutoTokenizer
 import os, torch
+import ftfy
 import html
 import string
 import regex as re


 def basic_clean(text):
+    text = ftfy.fix_text(text)
    text = html.unescape(html.unescape(text))
    return text.strip()

+
 def whitespace_clean(text):
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    return text

+
 def canonicalize(text, keep_punctuation_exact_string=None):
    text = text.replace('_', ' ')
    if keep_punctuation_exact_string:
@@ -28,6 +32,7 @@ def canonicalize(text, keep_punctuation_exact_string=None):
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

+
 class HuggingfaceTokenizer:

    def __init__(self, name, seq_len=None, clean=None, **kwargs):