mirror of
https://github.com/modelscope/DiffSynth-Studio.git
synced 2026-03-23 17:38:10 +00:00
align wan tokenizer to official
This commit is contained in:
@@ -2,20 +2,24 @@ from .base_prompter import BasePrompter
|
||||
from ..models.wan_video_text_encoder import WanTextEncoder
|
||||
from transformers import AutoTokenizer
|
||||
import os, torch
|
||||
import ftfy
|
||||
import html
|
||||
import string
|
||||
import regex as re
|
||||
|
||||
|
||||
def basic_clean(text):
|
||||
text = ftfy.fix_text(text)
|
||||
text = html.unescape(html.unescape(text))
|
||||
return text.strip()
|
||||
|
||||
|
||||
def whitespace_clean(text):
|
||||
text = re.sub(r'\s+', ' ', text)
|
||||
text = text.strip()
|
||||
return text
|
||||
|
||||
|
||||
def canonicalize(text, keep_punctuation_exact_string=None):
|
||||
text = text.replace('_', ' ')
|
||||
if keep_punctuation_exact_string:
|
||||
@@ -28,6 +32,7 @@ def canonicalize(text, keep_punctuation_exact_string=None):
|
||||
text = re.sub(r'\s+', ' ', text)
|
||||
return text.strip()
|
||||
|
||||
|
||||
class HuggingfaceTokenizer:
|
||||
|
||||
def __init__(self, name, seq_len=None, clean=None, **kwargs):
|
||||
|
||||
Reference in New Issue
Block a user