import argparse
import json
import os
from typing import Any, Tuple

import numpy as np
import torch
from tokenizers import (BertWordPieceTokenizer, ByteLevelBPETokenizer,
                        CharBPETokenizer, SentencePieceBPETokenizer)
from transformers import AutoTokenizer
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_file", required=False, help="input text file, use '-' for stdin")
    parser.add_argument("--tokenizer_type", type=str, choices=['BPE', 'BBPE', 'BERT'], help='one of BPE, BBPE, BERT')
    parser.add_argument("--serialization_dir", help='path to the output tokenizer model')
    parser.add_argument("--vocab_size", help='vocabulary size', type=int, default=10000)
    args = parser.parse_args()

    # Pick the tokenizer implementation matching the requested type
    tokenizer = {
        'BPE': SentencePieceBPETokenizer,
        'BBPE': ByteLevelBPETokenizer,
        'BERT': BertWordPieceTokenizer,
    }[args.tokenizer_type]
    tokenizer = tokenizer()

    # Train it on the input file (a list is accepted across tokenizers versions)
    tokenizer.train([args.input_file], vocab_size=args.vocab_size)

    # Serialize the trained tokenizer and the training configuration
    if not os.path.isdir(args.serialization_dir):
        os.makedirs(args.serialization_dir)
    # Writes tokenizer-vocab.* / tokenizer-merges.txt; on recent tokenizers
    # releases this method is called save_model
    tokenizer.save(args.serialization_dir, 'tokenizer')
    with open(os.path.join(args.serialization_dir, "config.json"), "w") as f:
        json.dump(vars(args), f)
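
# Example invocation (hypothetical file name and paths; a sketch, assuming the
# script is saved as train_tokenizer.py):
#
#   python train_tokenizer.py --input_file corpus.txt --tokenizer_type BBPE \
#       --serialization_dir ./bbpe_model --vocab_size 30000
#
# For BPE/BBPE this should leave tokenizer-vocab.json and tokenizer-merges.txt
# (tokenizer-vocab.txt for BERT) in --serialization_dir, alongside the
# config.json that load_huggingface_tokenizer below reads back.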
def load(self, model='lstm', tokenizer_name='bbpe'):
    """Loads the model and config for this meta annotation."""
    # Load the tokenizer if it has not been set yet
    if self.tokenizer is None:
        vocab_file = os.path.join(self.save_dir, "{}-vocab.json".format(tokenizer_name))
        merges_file = os.path.join(self.save_dir, "{}-merges.txt".format(tokenizer_name))
        self.tokenizer = ByteLevelBPETokenizer(vocab_file=vocab_file, merges_file=merges_file, lowercase=True)

    # Load the embeddings if they have not been set yet
    if self.embeddings is None:
        embeddings = np.load(os.path.join(self.save_dir, "embeddings.npy"))
        self.embeddings = torch.tensor(embeddings, dtype=torch.float32)

    # Load the configuration, then the model itself
    self.load_config()
    self.load_model(model=model)
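
# Note: load() above is written as an instance method; the owning class is not
# part of this snippet. A minimal hypothetical host (attribute names mirror the
# ones used above) would provide save_dir, tokenizer, embeddings, load_config()
# and load_model(), e.g.:
#
#   class MetaAnnotation:
#       def __init__(self, save_dir):
#           self.save_dir = save_dir
#           self.tokenizer = None
#           self.embeddings = None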
def load_huggingface_tokenizer(tokenizer_path: str) -> Tuple[Any, bool]:
    if os.path.isdir(tokenizer_path):
        # A local directory produced by the training script above:
        # read config.json to find out which tokenizer type was trained
        with open(os.path.join(tokenizer_path, 'config.json'), 'r') as f:
            config = json.load(f)
        tokenizer_type = config['tokenizer_type']
        tokenizer = {'SP': SentencePieceBPETokenizer,
                     'BPE': SentencePieceBPETokenizer,  # the training script above writes 'BPE'
                     'BBPE': ByteLevelBPETokenizer,
                     'CharBPE': CharBPETokenizer,
                     'BERT': BertWordPieceTokenizer}[tokenizer_type]
        if tokenizer_type in ['SP', 'BPE', 'BBPE', 'CharBPE']:
            # BPE-style tokenizers are restored from a vocab/merges file pair
            vocab_file = [x for x in os.listdir(tokenizer_path) if 'vocab.json' in x][0]
            merges_file = [x for x in os.listdir(tokenizer_path) if 'merges.txt' in x][0]
            tokenizer = tokenizer(vocab_file=os.path.join(tokenizer_path, vocab_file),
                                  merges_file=os.path.join(tokenizer_path, merges_file))
        else:
            # WordPiece (BERT) tokenizers only need a vocab.txt file
            vocab_file = [x for x in os.listdir(tokenizer_path) if 'vocab.txt' in x][0]
            tokenizer = tokenizer(vocab_file=os.path.join(tokenizer_path, vocab_file))
        is_transformers_tokenizer = False
    else:
        # Not a local directory: treat the path as a model name on the
        # Hugging Face hub and load it with transformers
        is_transformers_tokenizer = True
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    return tokenizer, is_transformers_tokenizer
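
# Example usage (hypothetical paths and model name):
#
#   tokenizer, is_hf = load_huggingface_tokenizer("./bbpe_model")        # local dir
#   tokenizer, is_hf = load_huggingface_tokenizer("bert-base-uncased")   # hub model
#
# The boolean flag lets callers branch on the API difference: a transformers
# tokenizer returns token ids directly from encode(), while a tokenizers
# object returns an Encoding with .ids/.tokens attributes.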