from spacy.lang.en import English
from spacymoji import Emoji


def test_custom_attrs():
    attrs = ('contains_emoji', 'equals_emoji', 'emoji_details', 'all_emoji')
    nlp = English()
    emoji = Emoji(nlp, attrs=attrs)
    nlp.add_pipe(emoji, last=True)
    doc = nlp(u"Hello 😀")  # the original emoji character was mangled in extraction; any single emoji works here
    assert doc._.all_emoji
    assert len(doc._.all_emoji) == 1
    assert doc[1]._.has('equals_emoji')
    assert doc[1]._.emoji_details
@pytest.fixture  # presumably a pytest fixture in the original test module
def nlp():
    return English()
import pytest
from spacy.lang.en import English
from spacy.matcher import Matcher, PhraseMatcher


def test_issue3410():
    texts = ["Hello world", "This is a test"]
    nlp = English()
    matcher = Matcher(nlp.vocab)
    phrasematcher = PhraseMatcher(nlp.vocab)
    with pytest.deprecated_call():
        docs = list(nlp.pipe(texts, n_threads=4))
    with pytest.deprecated_call():
        docs = list(nlp.tokenizer.pipe(texts, n_threads=4))
    with pytest.deprecated_call():
        list(matcher.pipe(docs, n_threads=4))
    with pytest.deprecated_call():
        list(phrasematcher.pipe(docs, n_threads=4))
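# The test above only checks that the deprecated n_threads keyword still raises a deprecation
# warning. A minimal sketch (version-dependent, not part of the test) of the supported way to
# parallelise Language.pipe in spaCy 2.2+ is the n_process argument:
texts = ["Hello world", "This is a test"]
nlp = English()
docs = list(nlp.pipe(texts, n_process=2, batch_size=1000))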
from spacy.lang.en import English
from spacy.pipeline import EntityRecognizer, EntityRuler
from spacy.tokens import Doc


def test_issue3345():
    """Test case where preset entity crosses sentence boundary."""
    nlp = English()
    doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"])
    doc[4].is_sent_start = True
    ruler = EntityRuler(nlp, patterns=[{"label": "GPE", "pattern": "New York"}])
    ner = EntityRecognizer(doc.vocab)
    # Add the OUT action. I wouldn't have thought this would be necessary...
    ner.moves.add_action(5, "")
    ner.add_label("GPE")
    doc = ruler(doc)
    # Get into the state just before "New"
    state = ner.moves.init_batch([doc])[0]
    ner.moves.apply_transition(state, "O")
    ner.moves.apply_transition(state, "O")
    ner.moves.apply_transition(state, "O")
    # Check that B-GPE is valid.
    assert ner.moves.is_valid(state, "B-GPE")
    # Tail of a signal-based time limit context manager (its opening lines are not shown in this excerpt)
    try:
        yield
    finally:
        signal.alarm(0)  # always cancel the pending alarm on exit
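# For context, a minimal sketch (an assumption, not the original source) of the kind of
# signal-based time limit context manager the yield/finally fragment above would belong to;
# the names time_limit and TimeoutException are hypothetical.
import signal
from contextlib import contextmanager


class TimeoutException(Exception):
    pass


@contextmanager
def time_limit(seconds):
    def _handler(signum, frame):
        raise TimeoutException("Timed out after %d seconds" % seconds)

    # install the handler and arm the alarm before entering the block
    signal.signal(signal.SIGALRM, _handler)
    signal.alarm(seconds)
    try:
        yield
    finally:
        signal.alarm(0)  # disarm the alarm whether the block succeeded or not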
import re

from spacy.lang.en import English

# URL match regex (Gruber-style liberal URL pattern; the trailing schemeless-domain alternative was cut off by HTML extraction and is restored here)
URL_REGEX = r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!@)))"""

# HTML entity/character replacements; the leading entries of this list were lost in extraction
html_pairs = [
    (">", " > "),
    ("<", " < "),
]
tokenizer = English().Defaults.create_tokenizer()
# tokenizes and removes URLs (kept in separate list)
def pre_word_url_tokenize(stp):
    url_list = list(set(re.findall(URL_REGEX, stp)))
    # stp = st.strip()
    for i, url in enumerate(url_list):
        stp = stp.replace(url, " URL_%d " % (i,))
    for a, b in html_pairs:
        stp = stp.replace(a, b)
    pre_txt = ' '.join([str(x) for x in tokenizer(stp)])
    return (' '.join(pre_txt.split()), url_list)
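# A quick illustration of the expected behaviour (hedged: exact token boundaries depend on the
# spaCy tokenizer version): URLs are swapped for URL_<n> placeholders and returned separately.
sample_text = "See https://example.com/docs for details"
sample_tokens, sample_urls = pre_word_url_tokenize(sample_text)
print(sample_tokens)  # e.g. "See URL_0 for details"
print(sample_urls)    # e.g. ["https://example.com/docs"]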
# wrap inside a timer to catch cases where the spaCy tokenizer hangs on too many dots
def word_url_tokenize(st, max_len=20480, max_cont_len=512):
    stp = ' '.join([w[:max_cont_len] if w[:max_cont_len].count('.') <= 12 else '.' for w in st.split()[:max_len]])
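    # A minimal sketch (an assumption, not the original implementation) of how the truncated body
    # could continue: run pre_word_url_tokenize under a time limit (see the signal-based
    # time_limit sketch earlier) and fall back to a plain whitespace join on timeout.
    try:
        with time_limit(60):  # 60 seconds is an assumed budget
            return pre_word_url_tokenize(stp)
    except TimeoutException:
        return (' '.join(stp.split()), [])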
def main():
    # For simplicity, we start off with only the blank English Language class
    # and no model or pre-defined pipeline loaded.
    nlp = English()
    rest_countries = RESTCountriesComponent(nlp)  # initialise component
    nlp.add_pipe(rest_countries)  # add it to the pipeline
    doc = nlp("Some text about Colombia and the Czech Republic")
    print("Pipeline", nlp.pipe_names)  # pipeline contains component name
    print("Doc has countries", doc._.has_country)  # Doc contains countries
    for token in doc:
        if token._.is_country:
            print(
                token.text,
                token._.country_capital,
                token._.country_latlng,
                token._.country_flag,
            )  # country data
    print("Entities", [(e.text, e.label_) for e in doc.ents])  # entities
"""
Copyright (c) Facebook, Inc. and its affiliates.
"""
from copy import deepcopy
# from pprint import pprint
import csv
import json
from spacy.lang.en import English
tokenizer = English().Defaults.create_tokenizer()
def word_tokenize(st):
    return [(x.text, x.idx) for x in tokenizer(st)]
rephrases = []
for j in range(5):
    with open("rephrase_%d.csv" % (j,)) as csvfile:
        g_reader = csv.reader(csvfile)
        for i, row in enumerate(g_reader):
            if i > 0:
                rephrases += [row[-2:]]
brackets = [("(", ")"), ("{", "}"), ("[", "]"), ("*", "*"), ("$", "$"), ("#", "#")]
def normalize_batch(p_iter, p_batch_size=1000, p_thread_count=5):
    """Normalize and tokenize strings.

    Args:
        p_iter (iter): iterator over strings to normalize and tokenize.
        p_batch_size (int): number of strings processed per batch.
        p_thread_count (int): number of threads to run.

    Returns:
        iter: iterator over the normalized and tokenized strings.
    """
    global NLP
    if not NLP:
        NLP = NlpEnglish(parser=False)
    output_iter = NLP.pipe(p_iter,
                           batch_size=p_batch_size,
                           n_threads=p_thread_count)
    for doc in output_iter:
        tokens = [str(w).strip().lower() for w in doc]
        yield ' '.join(tokens)
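# Example usage (assumes the NLP global and NlpEnglish are set up as in the snippet above):
# normalize_batch is a generator, so iterate over it to get one lower-cased,
# whitespace-joined token string per input.
raw_strings = ["Hello World!", "SpaCy makes   tokenization easy."]
for normalized in normalize_batch(raw_strings, p_batch_size=2, p_thread_count=1):
    print(normalized)  # e.g. "hello world !"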
def __init__(self, resource_file):
    # Load the resource file
    self.entailment_rules = bsddb.btopen(resource_file, 'r')
    # Set threshold to default as recommended
    self.threshold = 0.0
    self.nlp = English()