Merge pull request #374 from modelscope/wan-tokenizer-bugfix

align wan tokenizer to official
This commit is contained in:
Zhongjie Duan
2025-02-28 16:05:36 +08:00
committed by GitHub
3 changed files with 7 additions and 0 deletions

View File

@@ -785,6 +785,7 @@ class WanVideoVAE(nn.Module):
            video = self.single_decode(hidden_state, device)
            video = video.squeeze(0)
            videos.append(video)
        videos = torch.stack(videos)
        return videos

View File

@@ -2,20 +2,24 @@ from .base_prompter import BasePrompter
from ..models.wan_video_text_encoder import WanTextEncoder
from transformers import AutoTokenizer
import os, torch
import ftfy
import html
import string
import regex as re
def basic_clean(text):
    text = ftfy.fix_text(text)
    text = html.unescape(html.unescape(text))
    return text.strip()
def whitespace_clean(text):
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    return text
def canonicalize(text, keep_punctuation_exact_string=None):
    text = text.replace('_', ' ')
    if keep_punctuation_exact_string:
@@ -28,6 +32,7 @@ def canonicalize(text, keep_punctuation_exact_string=None):
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
class HuggingfaceTokenizer:
    def __init__(self, name, seq_len=None, clean=None, **kwargs):

View File

@@ -10,3 +10,4 @@ einops
sentencepiece
protobuf
modelscope
ftfy