# Copyright (c) 2021, EleutherAI
# This file is based on code by the authors denoted below and has been modified from its original version.
#
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Megatron tokenizers."""

from abc import ABC, abstractmethod
from typing import List, Union

from tokenizers import Tokenizer

from rwkv_tokenizer import TRIE_TOKENIZER


def build_tokenizer(args):
    """Initialize tokenizer."""
    if args.rank == 0:
        print("> building {} tokenizer ...".format(args.tokenizer_type), flush=True)

    # Select and instantiate the tokenizer.
    if args.tokenizer_type.lower() == "HFTokenizer".lower():
        assert args.vocab_file is not None
        tokenizer = HFTokenizer(args.vocab_file)
    elif args.tokenizer_type.lower() == "RWKVTokenizer".lower():
        assert args.vocab_file is not None
        tokenizer = RWKVTokenizer(args.vocab_file)
    else:
        raise NotImplementedError(
            "{} tokenizer is not implemented.".format(args.tokenizer_type)
        )

    # Add vocab size.
    args.padded_vocab_size = _vocab_size_with_padding(tokenizer.vocab_size, args)

    return tokenizer


def _vocab_size_with_padding(orig_vocab_size, args):
    """Pad the vocab size so that it is divisible by the model parallel size
    and still has a GPU-friendly size."""
    after = orig_vocab_size
    multiple = args.make_vocab_size_divisible_by * args.model_parallel_size
    # Round up to the next multiple.
    while (after % multiple) != 0:
        after += 1
    if args.rank == 0:
        print(
            " > padded vocab (size: {}) with {} dummy tokens "
            "(new size: {})".format(orig_vocab_size, after - orig_vocab_size, after),
            flush=True,
        )
    return after
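
# Worked example for the padding above (illustrative numbers, not taken from
# any particular config): with orig_vocab_size=50277,
# make_vocab_size_divisible_by=128 and model_parallel_size=2, the multiple is
# 128 * 2 = 256, so the loop rounds 50277 up to 50432, i.e. 155 dummy tokens
# are added.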

class AbstractTokenizer(ABC):
    """Abstract class for tokenizer."""

    def __init__(self, name):
        self.name = name
        super().__init__()

    @property
    @abstractmethod
    def vocab_size(self):
        pass

    @property
    @abstractmethod
    def vocab(self):
        """Dictionary from vocab text token to id token."""
        pass

    @property
    @abstractmethod
    def inv_vocab(self):
        """Dictionary from vocab id token to text token."""
        pass

    @abstractmethod
    def tokenize(self, text):
        pass

    def detokenize(self, token_ids):
        raise NotImplementedError(
            "detokenizer is not implemented for {} tokenizer".format(self.name)
        )

    @property
    def cls(self):
        raise NotImplementedError(
            "CLS is not provided for {} tokenizer".format(self.name)
        )

    @property
    def sep(self):
        raise NotImplementedError(
            "SEP is not provided for {} tokenizer".format(self.name)
        )

    @property
    def pad(self):
        raise NotImplementedError(
            "PAD is not provided for {} tokenizer".format(self.name)
        )

    @property
    def eod(self):
        raise NotImplementedError(
            "EOD is not provided for {} tokenizer".format(self.name)
        )

    @property
    def mask(self):
        raise NotImplementedError(
            "MASK is not provided for {} tokenizer".format(self.name)
        )


class HFTokenizer(AbstractTokenizer):
    """Designed to integrate HF's Tokenizer library."""

    def __init__(self, vocab_file):
        name = "HFTokenizer"
        super().__init__(name)
        self.tokenizer = Tokenizer.from_file(vocab_file)
        self.eod_id = self.tokenizer.token_to_id("<|endoftext|>")
        self.pad_id = self.tokenizer.token_to_id("<|padding|>")

    @property
    def vocab_size(self):
        return self.tokenizer.get_vocab_size()

    @property
    def vocab(self):
        return self.tokenizer.get_vocab()

    @property
    def inv_vocab(self):
        # Note: returns the underlying decoder object rather than an
        # id -> token dict as the abstract docstring suggests.
        return self.tokenizer.decoder

    def tokenize(self, text: str):
        return self.tokenizer.encode(text).ids

    def tokenize_batch(self, text_batch: Union[List[str], str]):
        return self.tokenizer.encode_batch(text_batch)

    def detokenize(self, token_ids):
        return self.tokenizer.decode(token_ids)

    @property
    def eod(self):
        return self.eod_id


class RWKVTokenizer(AbstractTokenizer):
    """RWKV World tokenizer (trie-based)."""

    def __init__(self, vocab_file="rwkv_vocab_v20230424.txt"):
        name = "RWKVTokenizer"
        super().__init__(name)
        self.tokenizer = TRIE_TOKENIZER(vocab_file)
        # No special-token lookup here (the token_to_id calls used by
        # HFTokenizer are not available); token id 0 is used as
        # end-of-document, and no pad id is set.
        self.eod_id = 0

    @property
    def vocab_size(self):
        return self.tokenizer.get_vocab_size()

    @property
    def vocab(self):
        return self.tokenizer.get_vocab()

    @property
    def inv_vocab(self):
        # Note: returns the decode method rather than an id -> token dict.
        return self.tokenizer.decode

    def tokenize(self, text: str):
        return self.tokenizer.encode(text)

    def tokenize_batch(self, text_batch: Union[List[str], str]):
        return self.tokenizer.encode_batch(text_batch)

    def detokenize(self, token_ids):
        return self.tokenizer.decode(token_ids)

    @property
    def eod(self):
        return self.eod_id
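
# A minimal usage sketch, assuming a Hugging Face tokenizer JSON exists at the
# (hypothetical) path below; the `args` fields mirror exactly what
# build_tokenizer() reads and writes.
if __name__ == "__main__":
    from types import SimpleNamespace

    args = SimpleNamespace(
        rank=0,
        tokenizer_type="HFTokenizer",
        vocab_file="20B_tokenizer.json",  # hypothetical path to a tokenizers JSON
        make_vocab_size_divisible_by=128,
        model_parallel_size=1,
    )
    tokenizer = build_tokenizer(args)  # also sets args.padded_vocab_size
    ids = tokenizer.tokenize("Hello, world!")
    print(ids)
    print(tokenizer.detokenize(ids))
    print("padded vocab size:", args.padded_vocab_size)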