def test_read_write_delimiters(self, tmpdir):
    expected = [
        ["this is some text", "scandal", 42.0],
        ["here's some more text: boom!", "escándalo", 1.0],
    ]
    for delimiter in (",", "\t", "|", ":"):
        filepath = str(tmpdir.join("test_read_write_csv.csv"))
        io.write_csv(expected, filepath, delimiter=delimiter, make_dirs=True)
        observed = list(io.read_csv(filepath, delimiter=delimiter))
        assert observed == expected
def test_read_write_dict(self, tmpdir):
    expected = [
        {"text": "this is some text", "kind": "scandal", "number": 42.0},
        {"text": "here's some more text: boom!", "kind": "escándalo", "number": 1.0},
    ]
    filepath = str(tmpdir.join("test_read_write_csv_dict.csv"))
    io.write_csv(
        expected,
        filepath,
        dialect="excel",
        make_dirs=True,
        fieldnames=["text", "kind", "number"],
    )
    observed = [
        dict(item)
        for item in io.read_csv(
            filepath, dialect="excel", fieldnames=["text", "kind", "number"]
        )
    ]
    assert observed == expected
def test_read_write_compressed(self, tmpdir):
    expected = [
        ["this is some text", "scandal", 42.0],
        ["here's some more text: boom!", "escándalo", 1.0],
    ]
    for ext in (".csv", ".csv.gz", ".csv.bz2", ".csv.xz"):
        filepath = str(tmpdir.join("test_read_write_csv" + ext))
        if compat.PY2 and ext != ".csv":
            # Python 2 can't open compressed files in text mode,
            # so attempting it should raise a ValueError
            with pytest.raises(ValueError):
                io.open_sesame(filepath, mode="wt", encoding=None, make_dirs=True)
        else:
            io.write_csv(expected, filepath, make_dirs=True)
            observed = list(io.read_csv(filepath))
            assert observed == expected
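
Taken together, these tests exercise a simple round trip: write_csv accepts custom delimiters, dict rows with fieldnames, and compressed file extensions, and read_csv yields the same rows back. A minimal standalone sketch of that round trip, assuming only that textacy is installed (the tests above alias textacy.io as io):

import textacy.io as tio

rows = [
    ["this is some text", "scandal", 42.0],
    ["here's some more text: boom!", "escándalo", 1.0],
]
# write rows to a compressed csv, creating parent dirs as needed;
# compression is inferred from the ".gz" extension
tio.write_csv(rows, "/tmp/example.csv.gz", delimiter=",", make_dirs=True)
# read_csv returns a generator, so materialize it before comparing
assert list(tio.read_csv("/tmp/example.csv.gz", delimiter=",")) == rows
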
rel_fname = "{}.json.gz".format(_split_uri(relation)[1].lower())
rel_fpath = self.data_dir.joinpath(rel_fname)
if rel_fpath.is_file():
    LOGGER.debug("loading data for '%s' relation from %s", relation, rel_fpath)
    return next(
        tio.read_json(rel_fpath, mode="rt", encoding="utf-8", lines=False)
    )
else:
    rel_data = collections.defaultdict(
        lambda: collections.defaultdict(lambda: collections.defaultdict(set))
    )
    LOGGER.info(
        "preparing data for '%s' relation; this may take a while...", relation
    )
    # quoting=1 is csv.QUOTE_ALL
    rows = tio.read_csv(self.filepath, delimiter="\t", quoting=1)
    with tqdm() as pbar:
        for row in rows:
            pbar.update(1)
            _, rel_type, start_uri, end_uri, _ = row
            # rows are sorted by relation type, so skip ahead until we
            # reach the target relation, then stop once we've passed it
            if rel_type < relation:
                continue
            elif rel_type > relation:
                break
            start_lang, start_term, start_sense = _parse_concept_uri(start_uri)
            end_lang, end_term, end_sense = _parse_concept_uri(end_uri)
            if start_lang == end_lang and start_term != end_term:
                rel_data[start_lang][start_term][start_sense].add(end_term)
                if is_symmetric:
                    rel_data[start_lang][end_term][end_sense].add(start_term)
    # make relation data json-able (i.e. cast set => list)
    for terms in rel_data.values():
        for senses in terms.values():
            for sense, rel_terms in senses.items():
                senses[sense] = list(rel_terms)
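
The snippet above depends on two helpers not shown here, _split_uri and _parse_concept_uri. A hypothetical sketch of what they could look like, based only on ConceptNet's documented URI layout ("/r/<relation>" for relations, "/c/<lang>/<term>[/<sense>/...]" for concepts); the actual implementations may differ:

def _split_uri(uri):
    # hypothetical sketch: "/r/IsA" => ["r", "IsA"]
    return uri.lstrip("/").split("/")

def _parse_concept_uri(uri):
    # hypothetical sketch: "/c/en/coffee_cup/n" => ("en", "coffee cup", "n")
    parts = uri.lstrip("/").split("/")
    lang, term = parts[1], parts[2].replace("_", " ")
    sense = parts[3] if len(parts) > 3 else None
    return lang, term, sense
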
def load_iso_639_data(dirpath, exclude=None):
    """
    Load the ISO 639-3 code table from disk, mapping each 3-letter language
    code to its 2-letter (Part1) equivalent, if any.

    Args:
        dirpath (str or :class:`pathlib.Path`)
        exclude (Set[str])

    Returns:
        Dict[str, str]
    """
    dirpath = textacy.utils.to_path(dirpath).resolve()
    rows = textacy.io.read_csv(
        dirpath.joinpath("iso-639-3.tsv").resolve(),
        delimiter="\t",
        fieldnames=[
            "Id", "Part2B", "Part2T", "Part1",
            "Scope", "Language_Type", "Ref_Name", "Comment",
        ],
        quoting=1,
    )
    lang_map = {
        row["Id"]: row["Part1"]
        for row in rows
        if row.get("Part1")
        and (exclude is None or row["Part1"] not in exclude)
    }
    return lang_map
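
A quick usage sketch (assuming the iso-639-3.tsv table has been downloaded into a local directory, here called "iso_dir"):

lang_map = load_iso_639_data("iso_dir")
# 3-letter codes map to their 2-letter Part1 equivalents, e.g.
assert lang_map["eng"] == "en"
assert lang_map["spa"] == "es"
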
def load_tatoeba_data(dirpath, iso_lang_map, min_len=25):
    """
    Load Tatoeba sentences from disk as (text, lang) pairs, keeping only
    languages present in ``iso_lang_map`` and texts of sufficient length.

    Args:
        dirpath (str or :class:`pathlib.Path`)
        iso_lang_map (Dict[str, str])
        min_len (int): minimum text length in *alphanumeric* chars

    Returns:
        List[Tuple[str, str]]
    """
    dirpath = textacy.utils.to_path(dirpath).resolve()
    rows = textacy.io.read_csv(
        dirpath.joinpath("sentences.csv"),
        fieldnames=["sent_id", "iso-639-3", "text"],
        delimiter="\t",
        quoting=1,
    )
    langs = set(iso_lang_map.keys())
    ds = [
        (row["text"], iso_lang_map[row["iso-639-3"]])
        for row in rows
        if row["iso-639-3"] in langs
        # count only alphanumeric chars toward the min_len threshold
        and itertoolz.count(char for char in row["text"] if char.isalnum()) >= min_len
    ]
    return ds
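
These two loaders compose: the ISO map translates Tatoeba's 3-letter language codes into the 2-letter labels attached to each text. A sketch, assuming both files have been downloaded locally (directory names here are placeholders):

iso_lang_map = load_iso_639_data("iso_dir")
ds = load_tatoeba_data("tatoeba_dir", iso_lang_map, min_len=25)
# ds is a list of (sentence text, 2-letter lang code) pairs
texts, labels = zip(*ds)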