# Copyright (c) 2021, EleutherAI
# This file is based on code by the authors denoted below and has been modified from its original version.
#
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Megatron tokenizers."""

from abc import ABC
from abc import abstractmethod

from tokenizers import Tokenizer
from rwkv_tokenizer import RWKV_TOKENIZER, TRIE_TOKENIZER

from typing import List, Union


def build_tokenizer(args):
    """Initialize tokenizer."""
    if args.rank == 0:
        print("> building {} tokenizer ...".format(args.tokenizer_type), flush=True)

    # Select and instantiate the tokenizer.
    if args.tokenizer_type.lower() == "HFTokenizer".lower():
        assert args.vocab_file is not None
        tokenizer = HFTokenizer(args.vocab_file)
    elif args.tokenizer_type.lower() == "RWKVTokenizer".lower():
        assert args.vocab_file is not None
        tokenizer = RWKVTokenizer(args.vocab_file)
    else:
        raise NotImplementedError(
            "{} tokenizer is not " "implemented.".format(args.tokenizer_type)
        )

    # Add vocab size.
    args.padded_vocab_size = _vocab_size_with_padding(tokenizer.vocab_size, args)

    return tokenizer


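# A minimal usage sketch of build_tokenizer (illustrative only: SimpleNamespace
# stands in for the real argument namespace, and the vocab path is hypothetical):
#
#     from types import SimpleNamespace
#
#     args = SimpleNamespace(
#         rank=0,
#         tokenizer_type="HFTokenizer",
#         vocab_file="/path/to/tokenizer.json",
#         make_vocab_size_divisible_by=128,
#         model_parallel_size=1,
#     )
#     tokenizer = build_tokenizer(args)  # also sets args.padded_vocab_size
#     ids = tokenizer.tokenize("Hello world")
#     text = tokenizer.detokenize(ids)

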
def _vocab_size_with_padding(orig_vocab_size, args):
    """Pad the vocab size so that it is divisible by the model parallel size
    and still has a GPU-friendly size."""

    after = orig_vocab_size
    multiple = args.make_vocab_size_divisible_by * args.model_parallel_size
    while (after % multiple) != 0:
        after += 1
    if args.rank == 0:
        print(
            " > padded vocab (size: {}) with {} dummy tokens "
            "(new size: {})".format(orig_vocab_size, after - orig_vocab_size, after),
            flush=True,
        )
    return after


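# Worked example (illustrative numbers only): with orig_vocab_size=50254,
# make_vocab_size_divisible_by=128, and model_parallel_size=2, the multiple is
# 256, so the vocab is padded with 178 dummy tokens to a new size of 50432.

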
class AbstractTokenizer(ABC):
    """Abstract class for tokenizer."""

    def __init__(self, name):
        self.name = name
        super().__init__()

    @property
    @abstractmethod
    def vocab_size(self):
        pass

    @property
    @abstractmethod
    def vocab(self):
        """Dictionary from vocab text token to id token."""
        pass

    @property
    @abstractmethod
    def inv_vocab(self):
        """Dictionary from vocab id token to text token."""
        pass

    @abstractmethod
    def tokenize(self, text):
        pass

    def detokenize(self, token_ids):
        raise NotImplementedError(
            "detokenizer is not implemented for {} " "tokenizer".format(self.name)
        )

    @property
    def cls(self):
        raise NotImplementedError(
            "CLS is not provided for {} " "tokenizer".format(self.name)
        )

    @property
    def sep(self):
        raise NotImplementedError(
            "SEP is not provided for {} " "tokenizer".format(self.name)
        )

    @property
    def pad(self):
        raise NotImplementedError(
            "PAD is not provided for {} " "tokenizer".format(self.name)
        )

    @property
    def eod(self):
        raise NotImplementedError(
            "EOD is not provided for {} " "tokenizer".format(self.name)
        )

    @property
    def mask(self):
        raise NotImplementedError(
            "MASK is not provided for {} " "tokenizer".format(self.name)
        )


class HFTokenizer(AbstractTokenizer):
    """Designed to integrate HF's Tokenizer library."""

    def __init__(self, vocab_file):
        name = "HFTokenizer"
        super().__init__(name)

        self.tokenizer = Tokenizer.from_file(vocab_file)
        self.eod_id = self.tokenizer.token_to_id("<|endoftext|>")
        self.pad_id = self.tokenizer.token_to_id("<|padding|>")

    @property
    def vocab_size(self):
        return self.tokenizer.get_vocab_size()

    @property
    def vocab(self):
        return self.tokenizer.get_vocab()

    @property
    def inv_vocab(self):
        return self.tokenizer.decoder

    def tokenize(self, text: str):
        return self.tokenizer.encode(text).ids

    def tokenize_batch(self, text_batch: Union[List[str], str]):
        return self.tokenizer.encode_batch(text_batch)

    def detokenize(self, token_ids):
        return self.tokenizer.decode(token_ids)

    @property
    def eod(self):
        return self.eod_id


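# Example round-trip with HFTokenizer used directly (a sketch; the
# tokenizer.json path is hypothetical and must point to a file saved by the
# Hugging Face `tokenizers` library):
#
#     tok = HFTokenizer("/path/to/tokenizer.json")
#     ids = tok.tokenize("Hello world")  # list of token ids
#     text = tok.detokenize(ids)
#     print(tok.vocab_size, tok.eod)

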
class RWKVTokenizer(AbstractTokenizer):
    """RWKV World tokenizer."""

    def __init__(self, vocab_file="rwkv_vocab_v20230424.txt"):
        name = "RWKVTokenizer"
        super().__init__(name)

        self.tokenizer = TRIE_TOKENIZER(vocab_file)
        self.eod_id = 0  # self.tokenizer.token_to_id("<|endoftext|>")
        # self.pad_id = self.tokenizer.token_to_id("<|padding|>")

    @property
    def vocab_size(self):
        return self.tokenizer.get_vocab_size()

    @property
    def vocab(self):
        return self.tokenizer.get_vocab()

    @property
    def inv_vocab(self):
        return self.tokenizer.decode

    def tokenize(self, text: str):
        return self.tokenizer.encode(text)

    def tokenize_batch(self, text_batch: Union[List[str], str]):
        return self.tokenizer.encode_batch(text_batch)

    def detokenize(self, token_ids):
        return self.tokenizer.decode(token_ids)

    @property
    def eod(self):
        return self.eod_id


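# Note: unlike HFTokenizer, RWKVTokenizer.tokenize returns the id list directly
# from TRIE_TOKENIZER.encode (there is no `.ids` attribute to unwrap). The
# default vocab filename above is a plain relative path, so it is presumably
# resolved against the current working directory unless rwkv_tokenizer handles
# the lookup itself.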