Add support for custom tokenizer `.txt` vocabulary files

This commit is contained in:
josc146 2023-09-18 17:20:55 +08:00
parent f8388a0527
commit 5e5e1e9651

View File

@ -33,7 +33,7 @@ class PIPELINE_ARGS:
class PIPELINE:
def __init__(self, model, WORD_NAME):
def __init__(self, model, WORD_NAME: str):
self.model = model
if WORD_NAME == "cl100k_base":
import tiktoken
@ -46,6 +46,12 @@ class PIPELINE:
self.tokenizer = TRIE_TOKENIZER(
os.path.dirname(os.path.abspath(__file__)) + "/rwkv_vocab_v20230424.txt"
)
else:
if WORD_NAME.endswith(".txt"):
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from rwkv_tokenizer import TRIE_TOKENIZER
self.tokenizer = TRIE_TOKENIZER(WORD_NAME)
else:
from tokenizers import Tokenizer