# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False, **_):
    """Convert NER-annotated JSONL input into batched spaCy Doc objects.

    input_data: newline-delimited JSON string; each record must have a
        "text" key and either an "entities" or a "spans" key holding
        dicts with "start", "end" and "label".
    lang: language code used to build the tokenizer; required.
    n_sents: number of records grouped into each minibatch.
    use_morphology: accepted for interface compatibility; unused here.

    Raises ValueError when no language is given.

    NOTE(review): this function is truncated in the visible source — the
    code that collects `docs` into `json_docs` and returns it is not
    shown here.
    """
    if lang is None:
        raise ValueError("No --lang specified, but tokenization required")
    json_docs = []
    input_examples = [srsly.json_loads(line) for line in input_data.strip().split("\n")]
    # Build a blank pipeline for the language; only sentence splitting is
    # applied (no tagger/parser), so entity spans stay character-aligned.
    nlp = get_lang_class(lang)()
    sentencizer = nlp.create_pipe("sentencizer")
    for i, batch in enumerate(minibatch(input_examples, size=n_sents)):
        docs = []
        for record in batch:
            raw_text = record["text"]
            # Accept both "entities" and the older "spans" key.
            if "entities" in record:
                ents = record["entities"]
            else:
                ents = record["spans"]
            ents = [(e["start"], e["end"], e["label"]) for e in ents]
            doc = nlp.make_doc(raw_text)
            sentencizer(doc)
            # char_span returns None for misaligned offsets; presumably
            # _cleanup_spans (defined elsewhere) filters those out.
            spans = [doc.char_span(s, e, label=L) for s, e, L in ents]
            doc.ents = _cleanup_spans(spans)
            docs.append(doc)
"--label is a required argument",
"This is the label that will be assigned to all patterns "
"created from terms collected in this dataset. ",
exits=1,
error=True,
)
# NOTE(review): `connect` is not defined in this view — presumably the
# Prodigy database connector; confirm against the file's imports.
DB = connect()
def get_pattern(term, label):
    """Turn an accepted term into a token-based match pattern.

    Each whitespace-separated token of ``term["text"]`` becomes one
    case-insensitive ``{"lower": ...}`` token matcher under *label*.
    """
    token_patterns = []
    for token in term["text"].split():
        token_patterns.append({"lower": token.lower()})
    return {"label": label, "pattern": token_patterns}
# NOTE(review): this span appears to be the body of a terms.to-patterns
# recipe; `dataset`, `label` and `output_file` are bound outside this view.
log("RECIPE: Starting recipe terms.to-patterns", locals())
if dataset is None:
    # No dataset given: stream terms lazily from stdin, one JSON per line.
    log("RECIPE: Reading input terms from sys.stdin")
    terms = (srsly.json_loads(line) for line in sys.stdin)
else:
    if dataset not in DB:
        prints("Can't find dataset '{}'".format(dataset), exits=1, error=True)
    terms = DB.get_dataset(dataset)
    log(
        "RECIPE: Reading {} input terms from dataset {}".format(len(terms), dataset)
    )
if output_file:
    # Keep only terms the annotator accepted; write one pattern per line.
    patterns = [
        get_pattern(term, label) for term in terms if term["answer"] == "accept"
    ]
    log("RECIPE: Generated {} patterns".format(len(patterns)))
    srsly.write_jsonl(output_file, patterns)
    prints("Exported {} patterns".format(len(patterns)), output_file)
else:
    log("RECIPE: Outputting patterns")
def _read_inputs(loc, msg):
    """Yield the "text" field of every JSON line read from *loc*.

    loc: path to a JSONL file, or "-" to read from sys.stdin.
    msg: printer object with ``info``/``fail`` methods; ``fail`` is
        expected to exit the process (``exits=1``) on a bad path.

    Fix over the original: the file handle from ``input_path.open()`` was
    never closed (resource leak in a generator); it is now managed with a
    ``with`` block so it closes when the generator is exhausted or closed.
    """
    if loc == "-":
        msg.info("Reading input from sys.stdin")
        # Original behavior preserved: stdin lines are re-encoded to UTF-8
        # bytes before being handed to the JSON parser.
        for line in sys.stdin:
            data = srsly.json_loads(line.encode("utf8"))
            yield data["text"]
    else:
        input_path = Path(loc)
        if not input_path.exists() or not input_path.is_file():
            msg.fail("Not a valid input data file", loc, exits=1)
        msg.info("Using data from {}".format(input_path.parts[-1]))
        with input_path.open() as file_:
            for line in file_:
                data = srsly.json_loads(line)
                yield data["text"]
deserializers["meta.json"] = lambda b: self.meta.update(srsly.json_loads(b))
deserializers["vocab"] = lambda b: self.vocab.from_bytes(
def __iter__(self):
    """Yield ``{"text": ...}`` dicts for every valid comment found in the
    bz2-compressed JSONL files returned by ``self.iter_files()``.

    Blank lines are skipped; records failing ``self.is_valid`` are dropped,
    and markup is removed from the "body" field via ``self.strip_tags``.
    """
    for file_path in self.iter_files():
        with bz2.open(str(file_path)) as handle:
            for raw_line in handle:
                stripped = raw_line.strip()
                if not stripped:
                    continue
                record = srsly.json_loads(stripped)
                if not self.is_valid(record):
                    continue
                yield {"text": self.strip_tags(record["body"])}