retry=False,
force=False,
):
"""Test the wordpiecer on a large dataset to find misalignments. If both the
    retry and force flags are set (which is the default runtime configuration),
this script should always pass.
* retry: If alignment fails after cleaning and normalizing both sets of
tokens, try again with a more aggressive strategy that strips out all
characters that are not uppercase/lowercase letters.
* force: If alignment still fails, run the word-piece tokenizer on the
individual spaCy tokens, so that alignment is trivial. This should
always work.
"""
cfg = {"retry_alignment": retry, "force_alignment": force}
nlp = get_lang_class(lang)()
nlp.add_pipe(nlp.create_pipe("sentencizer"))
wp = TransformersWordPiecer.from_pretrained(nlp.vocab, trf_name=name, **cfg)
msg.good(f"Loaded WordPiecer for model '{name}'")
with msg.loading("Loading IMDB data..."):
data, _ = thinc.extra.datasets.imdb(limit=n_texts)
texts, _ = zip(*data)
msg.good(f"Using {len(texts)} texts from IMDB data")
msg.info("Processing texts...")
sent_counts = 0
for doc in tqdm.tqdm(nlp.pipe(texts), total=len(texts)):
try:
doc = wp(doc)
sent_counts += len(list(doc.sents))
except AssertionError as e:
if len(e.args) and isinstance(e.args[0], tuple): # Misaligned error
a, b = e.args[0]
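
# Hedged usage sketch for the alignment check above: one short text run through
# get_lang_class, the sentencizer, and TransformersWordPiecer. The model name and
# sample text are placeholders, and this assumes the spacy-transformers package
# that provides TransformersWordPiecer is installed.
from spacy.util import get_lang_class
from spacy_transformers import TransformersWordPiecer

nlp = get_lang_class("en")()
nlp.add_pipe(nlp.create_pipe("sentencizer"))
wp = TransformersWordPiecer.from_pretrained(
    nlp.vocab,
    trf_name="bert-base-uncased",
    retry_alignment=True,
    force_alignment=True,
)
doc = wp(nlp("A short sanity check for word-piece alignment."))
print(len(list(doc.sents)), "sentence(s) aligned")
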
def test_lang_initialize(lang, capfd):
"""Test that languages can be initialized."""
nlp = get_lang_class(lang)()
# Check for stray print statements (see #3342)
doc = nlp("test") # noqa: F841
captured = capfd.readouterr()
assert not captured.out
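
# A minimal sketch of how a check like test_lang_initialize is typically
# parametrized with pytest; the language codes listed here are illustrative,
# not the project's actual fixture.
import pytest
from spacy.util import get_lang_class

@pytest.mark.parametrize("lang", ["en", "de", "fr"])
def test_lang_initialize_smoke(lang, capfd):
    nlp = get_lang_class(lang)()
    nlp("test")
    captured = capfd.readouterr()
    assert not captured.out
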
def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False, **_):
if lang is None:
raise ValueError("No --lang specified, but tokenization required")
json_docs = []
input_examples = [srsly.json_loads(line) for line in input_data.strip().split("\n")]
nlp = get_lang_class(lang)()
sentencizer = nlp.create_pipe("sentencizer")
for i, batch in enumerate(minibatch(input_examples, size=n_sents)):
docs = []
for record in batch:
raw_text = record["text"]
if "entities" in record:
ents = record["entities"]
else:
ents = record["spans"]
ents = [(e["start"], e["end"], e["label"]) for e in ents]
doc = nlp.make_doc(raw_text)
sentencizer(doc)
spans = [doc.char_span(s, e, label=L) for s, e, L in ents]
doc.ents = _cleanup_spans(spans)
docs.append(doc)
        json_docs.append(docs_to_json(docs, id=i))
    return json_docs
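
# Illustrative input for the converter above: one JSON record per line with
# character-offset entities. The sentence, label, and offsets are invented.
import srsly

records = [
    {
        "text": "Apple is looking at buying a U.K. startup.",
        "spans": [{"start": 0, "end": 5, "label": "ORG"}],
    }
]
input_data = "\n".join(srsly.json_dumps(r) for r in records)
json_docs = ner_jsonl2json(input_data, lang="en", n_sents=10)
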
def create_model(lang, lex_attrs, name=None):
lang_class = get_lang_class(lang)
nlp = lang_class()
for lexeme in nlp.vocab:
lexeme.rank = 0
lex_added = 0
for attrs in lex_attrs:
if "settings" in attrs:
continue
lexeme = nlp.vocab[attrs["orth"]]
lexeme.set_attrs(**attrs)
lexeme.is_oov = False
        lex_added += 1
if len(nlp.vocab):
oov_prob = min(lex.prob for lex in nlp.vocab) - 1
else:
oov_prob = DEFAULT_OOV_PROB
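
# Illustrative shape of the lex_attrs argument consumed above (the snippet is
# truncated before any return); the words and probabilities are invented.
lex_attrs = [
    {"settings": {"oov_prob": -20.0}},   # records with "settings" are skipped
    {"orth": "hello", "prob": -8.5},
    {"orth": "world", "prob": -9.1},
]
create_model("en", lex_attrs, name="demo_vocab")
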
def get_model_desc(nlp, model_name):
"""Get human-readable model name, language name and version."""
lang_cls = spacy.util.get_lang_class(nlp.lang)
lang_name = lang_cls.__name__
model_version = nlp.meta["version"]
return "{} - {} (v{})".format(lang_name, model_name, model_version)
def get_tokenizer(lang):
lang_cls = spacy.util.get_lang_class(lang)
return lang_cls().Defaults.create_tokenizer()
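
# Example use of the two helpers above; the language code, model name, and
# sample sentence are placeholders.
import spacy

tokenizer = get_tokenizer("en")
print([t.text for t in tokenizer("Let's check the blank tokenizer.")])
print(get_model_desc(spacy.blank("en"), "en_demo_model"))
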
def create_nlp_from_config(lang, vectors, pipeline):
lang_class = spacy.util.get_lang_class(lang)
nlp = lang_class()
if vectors is not None:
spacy.cli.train._load_vectors(nlp, vectors)
for name, component_cfg in pipeline.items():
factory = component_cfg.pop("factory")
component = nlp.create_pipe(factory, config=component_cfg)
nlp.add_pipe(component, name=name)
return nlp
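
# A hedged example of the pipeline config this factory expects: each entry maps
# a component name to its settings, with a "factory" key naming the built-in
# pipe. The names and settings below are illustrative only.
pipeline_cfg = {
    "sentencizer": {"factory": "sentencizer"},
    "ner": {"factory": "ner"},
}
nlp = create_nlp_from_config("en", vectors=None, pipeline=pipeline_cfg)
print(nlp.pipe_names)
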
def pipeline(merge_patterns=None, terminal_patterns=None):
CYRILLIC_UPPER = r'[\p{Lu}&&\p{Cyrillic}]'
r'(?<=[{au}])\.(?=\w+)'.format(au=CYRILLIC_UPPER)
Language = get_lang_class('ru')
    Language.Defaults.infixes += ('«»',)
Language.Defaults.infixes += ('-',)
Language.Defaults.infixes += ('"\/',)
Language.Defaults.infixes += ('/',)
Language.Defaults.infixes += (r'(?<=[{au}])\.(?=\w+)'.format(au=CYRILLIC_UPPER),)
# Token.set_extension('is_adjective', default=False, force=True)
nlp = Language()
russian_tokenizer = RussianTokenizer(nlp, merge_patterns=merge_patterns, terminal_patterns=terminal_patterns)
nlp.add_pipe(detect_sentence_boundaries, name='detect_sentence_boundaries', first=True)
# nlp.add_pipe(match_adjective, name='match_adjective', after='detect_sentence_boundaries')
nlp.add_pipe(russian_tokenizer, name='russian_tokenizer', after='detect_sentence_boundaries')
for case in SPECIAL_CASES:
nlp.tokenizer.add_special_case(case, [{'ORTH': case}])
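
# A small, self-contained illustration of the add_special_case call above; the
# abbreviation is an invented example, and it assumes pymorphy2 (required by
# spaCy's Russian language class) is installed.
from spacy.util import get_lang_class

nlp = get_lang_class("ru")()
nlp.tokenizer.add_special_case("т.е.", [{"ORTH": "т.е."}])
print([t.text for t in nlp("т.е. это один токен")])
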
def create_model(lang, probs, oov_prob, clusters, vectors_data, vector_keys, prune_vectors):
print("Creating model...")
lang_class = get_lang_class(lang)
nlp = lang_class()
for lexeme in nlp.vocab:
lexeme.rank = 0
lex_added = 0
for i, (word, prob) in enumerate(tqdm(sorted(probs.items(), key=lambda item: item[1], reverse=True))):
lexeme = nlp.vocab[word]
lexeme.rank = i
lexeme.prob = prob
lexeme.is_oov = False
# Decode as a little-endian string, so that we can do & 15 to get
# the first 4 bits. See _parse_features.pyx
if word in clusters:
lexeme.cluster = int(clusters[word][::-1], 2)
else:
lexeme.cluster = 0
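
# A tiny worked example of the cluster decoding above: the Brown cluster
# bit-string is reversed before int() so that `cluster & 15` reads the first
# four bits of the original string. The bit-string is illustrative.
bits = "101100"
cluster = int(bits[::-1], 2)   # "001101" -> 13
print(cluster & 15)            # 13, i.e. the leading bits "1011" read little-endian
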
def load_default_model_sentencizer(lang):
""" Load a generic spaCy model and add the sentencizer for sentence tokenization"""
loading_start = time.time()
lang_class = get_lang_class(lang)
nlp = lang_class()
nlp.add_pipe(nlp.create_pipe('sentencizer'))
loading_end = time.time()
loading_time = loading_end - loading_start
return nlp, loading_time, lang + "_default_" + 'sentencizer'
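
# Example call of the loader above; the language code is a placeholder.
nlp, loading_time, run_name = load_default_model_sentencizer("en")
print("{} ready in {:.2f}s".format(run_name, loading_time))
doc = nlp("First sentence. Second sentence.")
print(len(list(doc.sents)), "sentences")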