from spacy.en import English

def EN():
    return English()
def nlp():
    # Build the full pipeline but hand back only its tokenizer.
    nlp = English()
    return nlp.tokenizer

def EN():
    return English().tokenizer
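Factory helpers like these typically back test fixtures. A minimal pytest sketch under that assumption (the fixture scope and test body are hypothetical):

import pytest
from spacy.en import English

@pytest.fixture(scope="session")
def EN():
    # English() is expensive to load, so share one instance per test session.
    return English()

def test_tokenizer(EN):
    tokens = EN.tokenizer(u"Hello, world!")
    assert tokens[0].orth_ == u"Hello"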
import os
import spacy.en
from collections import defaultdict

def preprocess_files_recursively(dir, output, num_train_chunks=100, num_valid_chunks=1, br_token=" <s>"):
    num_chunks = num_train_chunks + num_valid_chunks
    chunk_id = 0
    # Tokenizer-only pipeline (old spacy.en API): disable everything we don't need.
    nlp = spacy.en.English(tagger=False, parser=False, entity=False,
                           matcher=False, serializer=False, load_vectors=False)
    if not os.path.exists(output):
        os.mkdir(output)
    # Validation chunks get indices 0..num_valid_chunks-1; training chunks follow.
    writers = [open(os.path.join(output, "%s_%d.txt" %
               ("train" if num_valid_chunks <= i else "valid",
                i - num_valid_chunks if num_valid_chunks <= i else i)), "w")
               for i in range(num_chunks)]
    context = None
    word_counts = defaultdict(int)
    last_token = br_token.split(" ")[0]
    start_token = br_token.split(" ")[1] if " " in br_token else None
    for sub_dir, _, files in os.walk(dir):
        for fn in files:
            print("Processing %s" % fn)
            fn = os.path.join(sub_dir, fn)
            with open(fn, 'rb') as f:
                for l in f:
                    l = l.decode("utf-8")
                    if l.startswith("<"):  # new document (marker tag assumed; garbled in the excerpt)
                        context = None  # reset context at a document boundary (assumed; body truncated)
def get_tree_from_spacy(argv):
    """Parses the raw text using spaCy.

    Args:
        argv: The command line arguments.

    Returns:
        A list of tree.FullSentence objects, the sentences parsed from the raw text.
    """
    en_nlp = spacy.en.English()
    sentences = []
    file_id = 0
    lst = os.listdir(dirs['raw_input']['path'])
    lst.sort()
    for fn in lst:
        file_id += 1
        if should_skip_file(fn):
            continue
        name = dirs['raw_input']['path'] + fn
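The excerpt stops inside the loop; a sketch of a driver that might call it, assuming the module-level dirs and should_skip_file referenced above:

import sys

if __name__ == '__main__':
    parsed = get_tree_from_spacy(sys.argv)
    print("Parsed %d sentences" % len(parsed))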
def __init__(self, fetcher, dry_run=False):
    self.fetcher = fetcher
    self.logger = logging.getLogger(__name__)
    self.parser = English()
    # A custom stoplist: NLTK's stopwords, a few contraction fragments,
    # and scikit-learn's ENGLISH_STOP_WORDS.
    STOPLIST = set(nltk_stopwords.words('english') + ["n't", "'s", "'m", "ca", "p", "t"] + list(ENGLISH_STOP_WORDS))
    # Words to keep despite the stoplists. Note the trailing comma:
    # set(('non')) would be the character set {'n', 'o'}, not {'non'}.
    ALLOWED_STOPLIST = set(('non',))
    self.STOPLIST = STOPLIST - ALLOWED_STOPLIST
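A sketch of how the resulting stoplist might be applied; filter_stopwords is hypothetical, not part of the original class:

def filter_stopwords(self, text):
    # Tokenize with the spaCy parser and drop stoplisted tokens.
    doc = self.parser(text)
    return [tok.orth_ for tok in doc if tok.lower_ not in self.STOPLIST]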
def __init__(self, tokenizer=None, phrases=None, max_len=6, max_phrases=1000000):
    self.max_phrases = max_phrases or 1000000
    self.max_len = max_len or 6
    self.nlp = English()
    # phrases may be a path to a gazetteer file (a Python 2 basestring)
    # or an already-built iterable of phrase docs.
    if isinstance(phrases, basestring):
        self.phrases = read_gazetteer(self.nlp.tokenizer, phrases, n=self.max_phrases)
    else:
        self.phrases = phrases
    self.matcher = PhraseMatcher(self.nlp.tokenizer.vocab, self.phrases, max_length=self.max_len)
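A minimal usage sketch; match is hypothetical, and the (match_id, start, end) tuple layout is an assumption about the old PhraseMatcher:

def match(self, text):
    # Tokenize only - the matcher runs over the token sequence.
    doc = self.nlp.tokenizer(text)
    # Tuple layout assumed; older spaCy releases differ.
    return [doc[start:end].text for match_id, start, end in self.matcher(doc)]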
import inventoryCount as mainModule
import os
from spacy.en import English

if __name__ == '__main__':
    """
    Main module for this example - loads the main English NLP class
    and keeps it in RAM while waiting for the user to re-run the test.
    This lets the developer re-edit the module under test without
    waiting for the English class to load on every run.
    """
    # Set up the NLP object here with the parameters you want,
    # or just leave it blank to get all the defaults.
    print "Loading English module... this will take a while."
    nlp = English()
    print "Done loading English module."
    while True:
        try:
            reload(mainModule)
            mainModule.runTest(nlp)
            raw_input('================ To reload main module, press Enter ================')
        except Exception, e:
            print "Unexpected error: " + str(e)
            continue
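For completeness, a hedged sketch of the companion inventoryCount module, which the script imports but the excerpt does not show; runTest's body here is invented:

# inventoryCount.py (hypothetical)
def runTest(nlp):
    doc = nlp(u"Three apples and two oranges.")
    # Edit this logic and press Enter in the driver loop to re-run it
    # without reloading the English class.
    print "Number-like tokens:", [t.orth_ for t in doc if t.like_num]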