def _download_dataset(self, dataset_path, url):
    # Download and extract the dataset into DATA_HOME unless it is already
    # present on disk; abort with the downloader's message on failure.
    if not os.path.exists(dataset_path):
        result, tips, dataset_path = default_downloader.download_file_and_uncompress(
            url=url,
            save_path=hub.common.dir.DATA_HOME,
            print_progress=True,
            replace=True)
        if not result:
            print(tips)
            exit()
    return dataset_path
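The downloader call above returns a (success flag, message, extracted path) tuple. A minimal standalone sketch of the same call, assuming the PaddleHub 1.x import paths and a placeholder URL (not a real dataset archive):

from paddlehub.common.dir import DATA_HOME
from paddlehub.common.downloader import default_downloader

# Placeholder URL; substitute a real dataset archive.
ok, msg, data_dir = default_downloader.download_file_and_uncompress(
    url="https://example.org/dataset.tar.gz",
    save_path=DATA_HOME,
    print_progress=True,
    replace=True)
if not ok:
    raise RuntimeError(msg)
print("extracted to", data_dir)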
def __init__(self):
    self.dataset_dir = os.path.join(DATA_HOME, "inews")
    if not os.path.exists(self.dataset_dir):
        ret, tips, self.dataset_dir = default_downloader.download_file_and_uncompress(
            url=_DATA_URL, save_path=DATA_HOME, print_progress=True)
    else:
        logger.info("Dataset {} already cached.".format(self.dataset_dir))
    self._load_train_examples()
    self._load_test_examples()
    self._load_dev_examples()
def __init__(self):
    self.dataset_dir = os.path.join(DATA_HOME, "nlpcc-dbqa")
    if not os.path.exists(self.dataset_dir):
        ret, tips, self.dataset_dir = default_downloader.download_file_and_uncompress(
            url=_DATA_URL, save_path=DATA_HOME, print_progress=True)
    else:
        logger.info("Dataset {} already cached.".format(self.dataset_dir))
    self._load_train_examples()
    self._load_test_examples()
    self._load_dev_examples()
def __init__(self):
    self.dataset_dir = os.path.join(DATA_HOME, "chnsenticorp")
    if not os.path.exists(self.dataset_dir):
        ret, tips, self.dataset_dir = default_downloader.download_file_and_uncompress(
            url=_DATA_URL, save_path=DATA_HOME, print_progress=True)
    else:
        logger.info("Dataset {} already cached.".format(self.dataset_dir))
    self._load_train_examples()
    self._load_test_examples()
    self._load_dev_examples()
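The constructor above is the third repetition of the same download-or-reuse-cache pattern (two more variants follow below). A minimal sketch of how the pattern could be factored into a shared base class; CachedDataset and its parameters are illustrative, not part of PaddleHub:

import os

class CachedDataset(object):
    # Hypothetical helper: download the archive into DATA_HOME on first use,
    # reuse the cached copy afterwards, then load all three splits.
    def __init__(self, name, data_url):
        self.dataset_dir = os.path.join(DATA_HOME, name)
        if not os.path.exists(self.dataset_dir):
            ret, tips, self.dataset_dir = default_downloader.download_file_and_uncompress(
                url=data_url, save_path=DATA_HOME, print_progress=True)
        else:
            logger.info("Dataset {} already cached.".format(self.dataset_dir))
        self._load_train_examples()
        self._load_test_examples()
        self._load_dev_examples()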
def mixed_segmentation(in_str, rm_punc=False):
    # Lazily fetch the NLTK punkt tokenizer into THIRD_PARTY_HOME on first use.
    nltk_path = os.path.join(THIRD_PARTY_HOME, "nltk_data")
    tokenizers_path = os.path.join(nltk_path, "tokenizers")
    punkt_path = os.path.join(tokenizers_path, "punkt")
    if not os.path.exists(punkt_path):
        default_downloader.download_file_and_uncompress(
            url=_PUNKT_URL, save_path=tokenizers_path, print_progress=True)
    nltk.data.path.append(nltk_path)

    in_str = str(in_str).lower().strip()
    segs_out = []
    temp_str = ""
    sp_char = [
        '-', ':', '_', '*', '^', '/', '\\', '~', '`', '+', '=', '，', '。', '：',
        '？', '！', '“', '”', '；', '’', '《', '》', '……', '·', '、', '「', '」', '（',
        '）', '－', '～', '『', '』'
    ]
    for char in in_str:
        if rm_punc and char in sp_char:
            continue
        if re.search(r'[\u4e00-\u9fa5]', char) or char in sp_char:
            # Flush any buffered non-Chinese text through the NLTK word
            # tokenizer, then keep the Chinese character (or punctuation)
            # as a token of its own.
            if temp_str != "":
                ss = nltk.word_tokenize(temp_str)
                segs_out.extend(ss)
                temp_str = ""
            segs_out.append(char)
        else:
            temp_str += char

    # Flush whatever non-Chinese text remains at the end of the string.
    if temp_str != "":
        ss = nltk.word_tokenize(temp_str)
        segs_out.extend(ss)

    return segs_out
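A quick illustration of mixed_segmentation on a mixed Chinese/English string: Chinese characters come out one per token, while runs of Latin script go through the NLTK tokenizer, so the exact English tokens depend on the punkt model:

print(mixed_segmentation("PaddleHub是一个模型库，great！"))
# e.g. ['paddlehub', '是', '一', '个', '模', '型', '库', '，', 'great', '！']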
def __init__(self):
    self.dataset_dir = os.path.join(DATA_HOME, "msra_ner")
    if not os.path.exists(self.dataset_dir):
        ret, tips, self.dataset_dir = default_downloader.download_file_and_uncompress(
            url=_DATA_URL, save_path=DATA_HOME, print_progress=True)
    else:
        logger.info("Dataset {} already cached.".format(self.dataset_dir))
    self._load_train_examples()
    self._load_test_examples()
    self._load_dev_examples()
def __init__(self, version_2_with_negative=False):
    # version_2_with_negative selects SQuAD 2.0, which adds unanswerable questions.
    self.dataset_dir = os.path.join(DATA_HOME, "squad_data")
    if not os.path.exists(self.dataset_dir):
        ret, tips, self.dataset_dir = default_downloader.download_file_and_uncompress(
            url=_DATA_URL, save_path=DATA_HOME, print_progress=True)
    else:
        logger.info("Dataset {} already cached.".format(self.dataset_dir))
    self.version_2_with_negative = version_2_with_negative
    self._load_train_examples(version_2_with_negative, if_has_answer=True)
    self._load_dev_examples(version_2_with_negative, if_has_answer=True)
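Hypothetical usage of the loader above, assuming it belongs to a class named SQuAD as in PaddleHub 1.x (the import path is an assumption); the first instantiation downloads the archive, later ones hit the cache:

from paddlehub.dataset.squad import SQuAD  # assumed import path

squad = SQuAD(version_2_with_negative=True)  # SQuAD 2.0 with unanswerable questions
print(squad.dataset_dir)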