""" Official evaluation script for v1.1 of the SQuAD dataset. """
from __future__ import print_function
from collections import Counter
import string
import re
import argparse
import json
import sys
import spacy
nlp = spacy.blank("en")
def word_tokenize(sent):
    doc = nlp(sent)
    return [token.text for token in doc]
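# Quick sanity check (illustrative, not part of the original script): a blank
# "en" pipeline contains only the tokenizer, so word_tokenize simply splits
# the sentence on spaCy's rules, e.g.
#   word_tokenize("What is the SQuAD dataset?")
#   -> ['What', 'is', 'the', 'SQuAD', 'dataset', '?']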
def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))
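A rough illustration of what normalize_answer produces (my own example, not part of the evaluation script): casing, punctuation, articles and extra whitespace are all stripped before predicted and gold answers are compared.

# "The" and "!" are removed, the rest is lower-cased and re-joined.
assert normalize_answer("The Eiffel Tower!") == "eiffel tower"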
"""
Testing for textpipe pipeline.py
"""
import tempfile
import pytest
import spacy
from textpipe.pipeline import Pipeline
TEXT = 'Test sentence for testing'
ents_model_nl = spacy.blank('nl')
ents_model_en = spacy.blank('en')
model_path_nl = tempfile.mkdtemp()
model_path_en = tempfile.mkdtemp()
ents_model_nl.to_disk(model_path_nl)
ents_model_en.to_disk(model_path_en)
STEPS = [('Raw',), ('NWords',), ('Complexity',), ('CleanText',),
         ('Entities', {'model_mapping': {'nl': 'ents', 'en': 'other_identifier'}})]
PIPELINE_DEF_KWARGS = dict(models=[('ents', 'nl', model_path_nl),
                                   ('other_identifier', 'en', model_path_en)])
PIPE = Pipeline(STEPS, **PIPELINE_DEF_KWARGS)
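# For orientation (a sketch based on textpipe's documented usage, not part of
# this test module): a Pipeline instance is called directly on raw text and
# returns a dict keyed by step name, e.g.
#   PIPE(TEXT) -> {'Raw': ..., 'NWords': ..., 'Complexity': ...,
#                  'CleanText': ..., 'Entities': ...}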
def test_load_custom_model():
    """The custom spaCy models registered above should load without errors."""
# Excerpt from the __init__ of a spaCy (v2.x) text-classifier trainer.
self.output_path = output_path
self.train_path = train_path
self.dev_path = dev_path
self.model = model
self.n_iter = n_iter
self.init_tok2vec = init_tok2vec
self.exclusive_classes = exclusive_classes
self.architecture = architecture
self.train_split = train_split
self.label_map = None
# Load a pretrained pipeline if one is given, otherwise start from a blank one.
if self.model is not None:
    self.nlp = spacy.load(self.model)
else:
    self.nlp = spacy.blank("en")
# Reuse an existing textcat pipe or create and append a new one.
if "textcat" not in self.nlp.pipe_names:
    self.textcat = self.nlp.create_pipe(
        "textcat",
        config={
            "exclusive_classes": self.exclusive_classes,
            "architecture": self.architecture,
        },
    )
    self.nlp.add_pipe(self.textcat, last=True)
else:
    self.textcat = self.nlp.get_pipe("textcat")
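A minimal sketch of how the component set up above is typically used afterwards (assumed continuation, spaCy 2.x API; the label name is a placeholder): labels are registered on the textcat pipe and the model is updated batch by batch.

# self.textcat.add_label("POSITIVE")                  # register a class
# optimizer = self.nlp.begin_training()
# self.nlp.update(["a good movie"],                   # texts
#                 [{"cats": {"POSITIVE": 1.0}}],      # annotations
#                 sgd=optimizer, drop=0.2, losses={})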
from tqdm import tqdm
import spacy
import ujson as json
from collections import Counter
import numpy as np
from codecs import open
import os
import config
'''
The content of this file is mostly copied from https://github.com/HKUST-KnowComp/R-Net/blob/master/prepro.py
'''
nlp = spacy.blank("en")
def word_tokenize(sent):
    doc = nlp(sent)
    return [token.text for token in doc]

def convert_idx(text, tokens):
    current = 0
    spans = []
    for token in tokens:
        current = text.find(token, current)
        if current < 0:
            print("Token {} cannot be found".format(token))
            raise Exception()
        spans.append((current, current + len(token)))
        # Advance past this token so repeated tokens map to later occurrences.
        current += len(token)
    return spans
import tensorflow as tf
import random
from tqdm import tqdm
import spacy
import ujson as json
from collections import Counter
import numpy as np
import os.path
nlp = spacy.blank("en")
def word_tokenize(sent):
    doc = nlp(sent)
    return [token.text for token in doc]

def convert_idx(text, tokens):
    current = 0
    spans = []
    for token in tokens:
        current = text.find(token, current)
        if current < 0:
            print("Token {} cannot be found".format(token))
            raise Exception()
        spans.append((current, current + len(token)))
        # Advance past this token so repeated tokens map to later occurrences.
        current += len(token)
    return spans
# Method of a BPE tokenizer/encoder wrapper: rebuild derived state after loading;
# ftfy and deserialize_bpe_ranks come from the surrounding module.
def finish_deserializing(self):
    self.bpe_ranks = deserialize_bpe_ranks(self._bpe_ranks)
    self.nlp = spacy.blank("en")
    self.fix_text = ftfy.fix_text
    self.cache = {}
    self.decoder = {v: k for k, v in self.encoder.items()}
import tensorflow as tf
import random
from tqdm import tqdm
import spacy
import json
from collections import Counter
import numpy as np
from nltk.tokenize.moses import MosesDetokenizer
from rouge import Rouge as R
import string
import re
nlp = spacy.blank("en")
def word_tokenize(sent):
    doc = nlp(sent)
    return [token.text for token in doc]

def convert_idx(text, tokens):
    current = 0
    spans = []
    for token in tokens:
        current = text.find(token, current)
        if current < 0:
            print("Token {} cannot be found".format(token))
            raise Exception()
        spans.append((current, current + len(token)))
        current += len(token)
    return spans
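A small illustration (my own example, not from the preprocessing script): convert_idx maps each token back to its character span in the raw text, which the rest of the pipeline uses to locate answer spans.

spans = convert_idx("New York City", ["New", "York", "City"])
# spans == [(0, 3), (4, 8), (9, 13)]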
# Imports this excerpt relies on (added here for completeness); Document,
# tokenize_obj and has_long_summary are project-level helpers defined elsewhere.
import json
from concurrent.futures import ProcessPoolExecutor
from functools import partial
import spacy

def main(args):
    objs = []
    with open(args.path, encoding=args.encoding) as f:
        for linum, line in enumerate(f):
            try:
                objs.append(json.loads(line.strip()))
            except Exception as e:
                message = f'line {linum+1}: {e}'
                raise RuntimeError(message)
    nlp = spacy.blank('id')
    with ProcessPoolExecutor(max_workers=args.max_workers) as exc:
        tok_objs = exc.map(partial(tokenize_obj, nlp), objs, chunksize=args.chunk_size)
        docs = [Document.from_mapping(obj) for obj in tok_objs]
        if args.discard_long_summary:
            docs = [doc for doc in docs if not has_long_summary(doc)]
        print('\n'.join(json.dumps(doc.to_dict(), sort_keys=True) for doc in docs))
# Imports this excerpt relies on (assumed: spaCy attribute IDs and Doc, plus
# layerize from thinc v6/v7, which this hook-style code appears to target).
import spacy
from spacy.attrs import LOWER, SHAPE, PREFIX, SUFFIX
from spacy.tokens import Doc
from thinc.api import layerize

def FeatureExtracter(lang, attrs=[LOWER, SHAPE, PREFIX, SUFFIX], tokenized=True):
    nlp = spacy.blank(lang)
    # Use simple 3-character prefixes/suffixes as lexical attributes.
    nlp.vocab.lex_attr_getters[PREFIX] = lambda string: string[:3]
    nlp.vocab.lex_attr_getters[SUFFIX] = lambda string: string[-3:]

    def forward(texts, drop=0.0):
        if tokenized:
            docs = [Doc(nlp.vocab, words) for words in texts]
        else:
            docs = [nlp(text) for text in texts]
        features = [doc.to_array(attrs) for doc in docs]

        def backward(d_features, sgd=None):
            return d_features

        return features, backward

    return layerize(forward)
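A quick usage sketch (my own example, assuming thinc v6/v7's Model.begin_update interface): the layer returned by FeatureExtracter turns a batch of pre-tokenized texts into one attribute array per document.

feature_layer = FeatureExtracter("en")
features, backprop = feature_layer.begin_update([["Hello", "world"]])
# features[0] has shape (2, 4): one row per token, one column per attribute
# (LOWER, SHAPE, PREFIX, SUFFIX).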