import os
import json
import argparse
from typing import Any, Tuple

from tokenizers import (SentencePieceBPETokenizer, ByteLevelBPETokenizer,
                        CharBPETokenizer, BertWordPieceTokenizer)
from transformers import AutoTokenizer


def load_huggingface_tokenizer(tokenizer_path: str) -> Tuple[Any, bool]:
    """Load a locally trained `tokenizers` model (directory) or a pretrained
    `transformers` tokenizer (model name / hub id)."""
    if os.path.isdir(tokenizer_path):
        with open(os.path.join(tokenizer_path, 'config.json'), 'r') as f:
            config = json.load(f)
        tokenizer_type = config['tokenizer_type']
        # 'BPE' is the key written by the training entry point below; 'SP' is kept
        # as an alias for the same SentencePiece-style BPE tokenizer.
        tokenizer = {'SP': SentencePieceBPETokenizer,
                     'BPE': SentencePieceBPETokenizer,
                     'BBPE': ByteLevelBPETokenizer,
                     'CharBPE': CharBPETokenizer,
                     'BERT': BertWordPieceTokenizer}[tokenizer_type]
        if tokenizer_type in ['SP', 'BPE', 'BBPE', 'CharBPE']:
            # BPE-style tokenizers are serialized as a vocab.json / merges.txt pair.
            vocab_file = [x for x in os.listdir(tokenizer_path) if 'vocab.json' in x][0]
            merges_file = [x for x in os.listdir(tokenizer_path) if 'merges.txt' in x][0]
            tokenizer = tokenizer(vocab_file=os.path.join(tokenizer_path, vocab_file),
                                  merges_file=os.path.join(tokenizer_path, merges_file))
        else:
            # WordPiece (BERT) tokenizers only need a vocab.txt file.
            vocab_file = [x for x in os.listdir(tokenizer_path) if 'vocab.txt' in x][0]
            tokenizer = tokenizer(vocab_file=os.path.join(tokenizer_path, vocab_file))
        is_transformers_tokenizer = False
    else:
        # Not a local directory: treat the path as a transformers model identifier.
        is_transformers_tokenizer = True
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    return tokenizer, is_transformers_tokenizer
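
# A minimal usage sketch (the './bbpe_tokenizer' directory and the sample sentence
# are illustrative; it assumes a directory produced by the training entry point below):
#
#   tokenizer, is_transformers_tokenizer = load_huggingface_tokenizer('./bbpe_tokenizer')
#   if is_transformers_tokenizer:
#       ids = tokenizer('hello world')['input_ids']
#   else:
#       ids = tokenizer.encode('hello world').ids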
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_file", required=True,
                        help="path to the input text file (training corpus)")
    parser.add_argument("--tokenizer_type", type=str, required=True,
                        choices=['BPE', 'BBPE', 'BERT'], help='one of BPE, BBPE, BERT')
    parser.add_argument("--serialization_dir", required=True,
                        help='output directory for the trained tokenizer')
    parser.add_argument("--vocab_size", help='target vocabulary size', type=int, default=10000)
    args = parser.parse_args()

    # Initialize a tokenizer of the requested type
    tokenizer = {
        'BPE': SentencePieceBPETokenizer,
        'BBPE': ByteLevelBPETokenizer,
        'BERT': BertWordPieceTokenizer
    }[args.tokenizer_type]
    tokenizer = tokenizer()

    # Then train it on the input corpus
    tokenizer.train([args.input_file], vocab_size=args.vocab_size)

    # Serialize the trained model files together with the CLI arguments so that
    # load_huggingface_tokenizer above can rebuild the same tokenizer later.
    if not os.path.isdir(args.serialization_dir):
        os.makedirs(args.serialization_dir)
    tokenizer.save(args.serialization_dir, 'tokenizer')
    with open(os.path.join(args.serialization_dir, "config.json"), "w") as f:
        json.dump(vars(args), f)
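
# Example invocation, assuming this file is saved as train_tokenizer.py (the corpus
# and output paths are illustrative):
#
#   python train_tokenizer.py --input_file corpus.txt --tokenizer_type BBPE \
#       --serialization_dir ./bbpe_tokenizer --vocab_size 10000
#
# The serialization directory then holds the tokenizer's vocab/merges files plus
# config.json, which load_huggingface_tokenizer above reads back.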