import json
import os
from typing import Any, Tuple

from tokenizers import (BertWordPieceTokenizer, ByteLevelBPETokenizer,
                        CharBPETokenizer, SentencePieceBPETokenizer)
from transformers import AutoTokenizer


def load_huggingface_tokenizer(tokenizer_path: str) -> Tuple[Any, bool]:
    # A local directory is treated as a serialized tokenizers-library model;
    # anything else is passed to transformers' AutoTokenizer as a model name.
    if os.path.isdir(tokenizer_path):
        with open(os.path.join(tokenizer_path, 'config.json'), 'r') as f:
            config = json.load(f)
        tokenizer_type = config['tokenizer_type']
        tokenizer = {'SP': SentencePieceBPETokenizer,
                     'BBPE': ByteLevelBPETokenizer,
                     'CharBPE': CharBPETokenizer,
                     'BERT': BertWordPieceTokenizer}[tokenizer_type]
        if tokenizer_type in ['SP', 'BBPE', 'CharBPE']:
            # BPE-style models are serialized as a vocab.json/merges.txt pair
            vocab_file = [x for x in os.listdir(tokenizer_path) if 'vocab.json' in x][0]
            merges_file = [x for x in os.listdir(tokenizer_path) if 'merges.txt' in x][0]
            tokenizer = tokenizer(vocab_file=os.path.join(tokenizer_path, vocab_file),
                                  merges_file=os.path.join(tokenizer_path, merges_file))
        else:
            # WordPiece (BERT) models only need a vocab.txt
            vocab_file = [x for x in os.listdir(tokenizer_path) if 'vocab.txt' in x][0]
            tokenizer = tokenizer(vocab_file=os.path.join(tokenizer_path, vocab_file))
        is_transformers_tokenizer = False
    else:
        is_transformers_tokenizer = True
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    return tokenizer, is_transformers_tokenizer
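A minimal usage sketch for the loader above; the hub name 'bert-base-uncased' and the directory './my_tokenizer' are hypothetical examples of the two supported path styles:

tokenizer, is_transformers = load_huggingface_tokenizer('bert-base-uncased')
if is_transformers:
    ids = tokenizer('Hello world')['input_ids']    # transformers API
else:
    ids = tokenizer.encode('Hello world').ids      # tokenizers API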
import argparse
import json
import os

from tokenizers import (BertWordPieceTokenizer, ByteLevelBPETokenizer,
                        SentencePieceBPETokenizer)
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_file", required=False, help="input text file, use '-' for stdin")
    parser.add_argument("--tokenizer_type", type=str, choices=['BPE', 'BBPE', 'BERT'], help='one of BPE, BBPE, BERT')
    parser.add_argument("--serialization_dir", help='path to output the trained tokenizer')
    parser.add_argument("--vocab_size", help='target vocab size', type=int, default=10000)
    args = parser.parse_args()

    # Initialize a tokenizer
    tokenizer = {
        'BPE': SentencePieceBPETokenizer,
        'BBPE': ByteLevelBPETokenizer,
        'BERT': BertWordPieceTokenizer
    }[args.tokenizer_type]
    tokenizer = tokenizer()

    # Then train it! train() takes a list of input files.
    tokenizer.train([args.input_file], vocab_size=args.vocab_size)

    if not os.path.isdir(args.serialization_dir):
        os.makedirs(args.serialization_dir)
    # save_model() writes the vocab/merges files into the directory under the
    # given prefix (older tokenizers releases called this save(dir, name)).
    tokenizer.save_model(args.serialization_dir, 'tokenizer')

    # Record the CLI arguments (including tokenizer_type) so the model can
    # be reloaded later.
    with open(os.path.join(args.serialization_dir, "config.json"), "w") as f:
        config = vars(args)
        json.dump(config, f)
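A hedged round-trip sketch tying this trainer to the loader above; the script name and paths are made up. Note the type keys don't fully line up between the two snippets (the trainer writes 'BPE' where the loader expects 'SP' for SentencePiece models), though byte-level 'BBPE' round-trips as-is:

# Hypothetical invocation (shell):
#   python train_tokenizer.py --input_file corpus.txt --tokenizer_type BBPE \
#       --serialization_dir ./bbpe_tok --vocab_size 10000
tokenizer, _ = load_huggingface_tokenizer('./bbpe_tok')  # './bbpe_tok' is illustrative
print(tokenizer.encode('Hello world').tokens)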
"""Example recipe that shows how to use model-specific tokenizers like the
BERT word piece tokenizer to preprocess your incoming text for fast and
efficient NER annotation and to make sure that all annotations you collect
always map to tokens and can be used to train and fine-tune your model
(even if the tokenization isn't that intuitive, because word pieces). The
selection automatically snaps to the token boundaries and you can double-click
single tokens to select them.
Setting "honor_token_whitespace": true will ensure that whitespace between
tokens is only shown if whitespace is present in the original text. This
keeps the text readable.
Requires Prodigy v1.10+ and uses the HuggingFace tokenizers library."""
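The whitespace rendering described above comes down to checking the raw text between consecutive token offsets; a minimal sketch of that idea (the helper name is ours, not the recipe's):

def has_trailing_ws(text: str, end: int, next_start: int) -> bool:
    # A token is followed by whitespace iff there is a whitespace-only gap
    # between its end offset and the next token's start offset in the text.
    return next_start > end and text[end:next_start].isspace()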
# Inside the recipe: `source`, `loader`, `tokenizer_vocab`, `lowercase` and
# `hide_special` come from the recipe's CLI arguments.
stream = get_stream(source, loader=loader, input_key="text")
# You can replace this with other tokenizers if needed
tokenizer = BertWordPieceTokenizer(tokenizer_vocab, lowercase=lowercase)
# Read the special markers (e.g. "[SEP]" and "[CLS]" for BERT) and the
# word-piece continuation prefix (e.g. "##") off the tokenizer's parameters.
sep_token = tokenizer._parameters.get("sep_token")
cls_token = tokenizer._parameters.get("cls_token")
special_tokens = (sep_token, cls_token)
wp_prefix = tokenizer._parameters.get("wordpieces_prefix")
def add_tokens(stream):
    for eg in stream:
        tokens = tokenizer.encode(eg["text"])
        eg_tokens = []
        idx = 0
        for (text, (start, end), tid) in zip(
            tokens.tokens, tokens.offsets, tokens.ids
        ):
            # If we don't want to see special tokens, don't add them
            if hide_special and text in special_tokens:
                continue
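For reference, Prodigy expects token entries shaped like the following sketch (values are illustrative; "ws" records whether a token is followed by whitespace, which is what "honor_token_whitespace" keys off):

example_token = {"text": "hello", "start": 0, "end": 5, "id": 0, "ws": True}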