Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
Args:
date_range (Tuple[str]): Interval specifying the [start, end) dates
for which comments files will be downloaded. Each item must be
a str formatted as YYYY-MM or YYYY-MM-DD (the latter is converted
to the corresponding YYYY-MM value). Both start and end values
must be specified, but a null value for either is automatically
replaced by the minimum or maximum valid values, respectively.
force (bool): If True, download the dataset, even if it already
exists on disk under ``data_dir``.
"""
date_range = utils.validate_and_clip_range(
date_range, self.full_date_range, val_type=(str, bytes))
filestubs = self._generate_filestubs(date_range)
for filestub in filestubs:
tio.download_file(
urllib.parse.urljoin(DOWNLOAD_ROOT, filestub),
filename=filestub,
dirpath=self.data_dir,
force=force,
)
def download(self, *, force=False):
    """
    Download the data as a zipped archive of language-specific text files,
    then save it to disk and extract its contents under the ``data_dir`` directory.

    Args:
        force (bool): If True, always download the dataset even if
            it already exists.
    """
    # Fetch the archive; a falsy return means nothing new was downloaded.
    archive_path = tio.download_file(
        DOWNLOAD_URL,
        filename="udhr_txt.zip",
        dirpath=self.data_dir,
        force=force,
    )
    if archive_path:
        # Extract into a dedicated "udhr_txt" subdirectory of data_dir.
        tio.unpack_archive(
            archive_path, extract_dir=self.data_dir.joinpath("udhr_txt")
        )
    # Validate that the expected files are now present on disk.
    self._check_data()
def download(self, *, force=False):
    """
    Download the data as a compressed tar archive file, then save it to disk and
    extract its contents under the ``data_dir`` directory.

    Args:
        force (bool): If True, always download the dataset even if
            it already exists.
    """
    # Fetch the archive; a falsy return means nothing new was downloaded.
    archive_path = tio.download_file(
        DOWNLOAD_URL,
        filename="aclImdb.tar.gz",
        dirpath=self.data_dir,
        force=force,
    )
    if archive_path:
        # extract_dir=None lets the archive's own top-level directory be used.
        tio.unpack_archive(archive_path, extract_dir=None)
    # Validate that the expected files are now present on disk.
    self._check_data()
def download(self, *, force=False):
    """
    Download the Wikimedia CirrusSearch db dump corresponding to the given
    ``project``, ``lang``, and ``version`` as a compressed JSON file,
    and save it to disk under the ``data_dir`` directory.

    Args:
        force (bool): If True, download the dataset, even if it already
            exists on disk under ``data_dir``.

    Note:
        Some datasets are quite large (e.g. English Wikipedia is ~28GB)
        and can take hours to fully download.
    """
    # The dump URL is derived from the instance's project/lang/version config.
    tio.download_file(
        self._get_file_url(),
        filename=self._filestub,
        dirpath=self.data_dir,
        force=force,
    )
Download two multilingual collections of short excerpts of journalistic texts,
focused on language groups that are very similar and thus more difficult
to correctly identify.
Args:
dirpath (str or :class:`pathlib.Path`)
force (bool)
References:
http://ttg.uni-saarland.de/resources/DSLCC/
"""
dirpath = textacy.utils.to_path(dirpath).resolve()
for version in [3, 4]:
name = "dslcc{}".format(version)
url = "http://scholar.harvard.edu/files/malmasi/files/{}.zip".format(name)
fpath = textacy.io.download_file(url, dirpath=dirpath, force=force)
if fpath:
textacy.io.unpack_archive(fpath, extract_dir=dirpath.joinpath(name))
def download_iso_639_data(dirpath, force=False):
    """
    Download the official ISO 639 code table as a TSV,
    mapping all language code variations (639-1, 639-2, 639-3)
    to each other.

    Args:
        dirpath (str or :class:`pathlib.Path`)
        force (bool)

    References:
        https://iso639-3.sil.org/code_tables/639/data
    """
    # Source table is published as a .tab file; save it locally as .tsv.
    src_url = "https://iso639-3.sil.org/sites/iso639-3/files/downloads/iso-639-3.tab"
    textacy.io.download_file(
        src_url, filename="iso-639-3.tsv", dirpath=dirpath, force=force
    )
def download(self, *, force=False):
    """
    Download the data as a zip archive file, then save it to disk and
    extract its contents under the :attr:`OxfordTextArchive.data_dir` directory.

    Args:
        force (bool): If True, always download the dataset even if
            it already exists.
    """
    # filename=None lets the downloader derive the file name from the URL.
    archive_path = tio.download_file(
        DOWNLOAD_URL,
        filename=None,
        dirpath=self.data_dir,
        force=force,
    )
    if archive_path:
        # extract_dir=None lets the archive's own top-level directory be used.
        tio.unpack_archive(archive_path, extract_dir=None)
def download(self, *, force=False):
    """
    Download the data as a Python version-specific compressed json file and
    save it to disk under the ``data_dir`` directory.

    Args:
        force (bool): If True, download the dataset, even if it already
            exists on disk under ``data_dir``.
    """
    # The release tag pins which hosted data version gets fetched.
    release_tag = "supreme_court_py3_v{data_version}".format(data_version=1.0)
    full_url = urllib.parse.urljoin(
        DOWNLOAD_ROOT, "/".join((release_tag, self._filename))
    )
    tio.download_file(
        full_url,
        filename=self._filename,
        dirpath=self.data_dir,
        force=force,
    )
def download_wili_data(dirpath, force=False):
    """
    Download the WiLI-2018 benchmark dataset of language-labeled paragraphs
    as a zip archive, then extract its contents under ``dirpath``.

    Args:
        dirpath (str or :class:`pathlib.Path`)
        force (bool)

    References:
        https://zenodo.org/record/841984
    """
    # NOTE: the previous docstring referenced tatoeba.org, but this function
    # actually fetches the WiLI-2018 dataset hosted on Zenodo (record 841984).
    url = "https://zenodo.org/record/841984/files/wili-2018.zip?download=1"
    # A falsy return value means nothing new was downloaded, so skip unpacking.
    fpath = textacy.io.download_file(url, dirpath=dirpath, force=force)
    if fpath:
        textacy.io.unpack_archive(fpath, extract_dir=dirpath)