# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def path_from_url(url, save_dir=None, prefix=HANLP_URL, append_location=True):
    """Map a resource ``url`` to the local filesystem path it should be cached at.

    :param url: url of the remote resource.
    :param save_dir: root directory to store resources under; when falsy it
        defaults to ``hanlp_home()``. (The original signature evaluated
        ``hanlp_home()`` at import time as the default, which froze the home
        directory at module load; resolving it lazily here is backward
        compatible because the pre-existing ``if not save_dir`` guard already
        produced the same value.)
    :param prefix: urls starting with this prefix are treated as first-party;
        any other url is filed under ``save_dir/thirdparty/<domain>``.
    :param append_location: when True, mirror the url's relative path below
        ``save_dir``; when False, use only the url's basename.
    :return: the local path the resource maps to.
    """
    if not save_dir:
        save_dir = hanlp_home()
    domain, relative_path = parse_url_path(url)
    if append_location:
        if not url.startswith(prefix):
            # Third-party resources are namespaced by their domain to avoid
            # basename collisions between providers.
            save_dir = os.path.join(save_dir, 'thirdparty', domain)
        else:
            # First-party: drop the path segment that is already part of the
            # prefix so the resource is not nested twice under save_dir.
            middle = prefix.split(domain)[-1].lstrip('/')
            if relative_path.startswith(middle):
                relative_path = relative_path[len(middle):]
        realpath = os.path.join(save_dir, relative_path)
    else:
        realpath = os.path.join(save_dir, os.path.basename(relative_path))
    return realpath
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-31 23:54
# NOTE(review): this section appears to be a concatenated fragment of a
# separate module (pretrained semantic-dependency-parsing model urls).
from hanlp.common.constant import HANLP_URL

# Chinese SemEval-2016 biaffine SDP models (news / text domains, per the names).
SEMEVAL16_NEWS_BIAFFINE_ZH = HANLP_URL + 'sdp/semeval16-news-biaffine_20191231_235407.zip'
SEMEVAL16_TEXT_BIAFFINE_ZH = HANLP_URL + 'sdp/semeval16-text-biaffine_20200101_002257.zip'
# English SemEval-2015 biaffine SDP models (PAS / PSD / DM formalisms, per the names).
SEMEVAL15_PAS_BIAFFINE_EN = HANLP_URL + 'sdp/semeval15_biaffine_pas_20200103_152405.zip'
SEMEVAL15_PSD_BIAFFINE_EN = HANLP_URL + 'sdp/semeval15_biaffine_psd_20200106_123009.zip'
SEMEVAL15_DM_BIAFFINE_EN = HANLP_URL + 'sdp/semeval15_biaffine_dm_20200106_122808.zip'
# Registry of the module's models; empty here — presumably filled elsewhere
# at runtime (TODO confirm against the package __init__).
ALL = {}
def download(url, save_path=None, save_dir=hanlp_home(), prefix=HANLP_URL, append_location=True):
    """Download ``url`` to a local path, reusing any existing local copy.

    :param url: the remote resource to fetch.
    :param save_path: explicit destination; when falsy it is derived from the
        url via ``path_from_url``.
    :param save_dir: root directory passed through to ``path_from_url``.
    :param prefix: first-party url prefix passed through to ``path_from_url``.
    :param append_location: passed through to ``path_from_url``.
    :return: the local path of the resource when it already exists.

    NOTE(review): this fragment is truncated by the surrounding concatenation —
    the ``try`` block and the nested ``reporthook`` are cut off mid-body, so
    the download/progress logic past this point is not visible here.
    """
    if not save_path:
        save_path = path_from_url(url, save_dir, prefix, append_location)
    if os.path.isfile(save_path):
        # Local cache hit: skip the network entirely.
        eprint('Using local {}, ignore {}'.format(save_path, url))
        return save_path
    else:
        makedirs(parent_dir(save_path))
        eprint('Downloading {} to {}'.format(url, save_path))
        # Download to a temp name first so a partial download never
        # masquerades as a finished file.
        tmp_path = '{}.downloading'.format(save_path)
        remove_file(tmp_path)
        try:
            def reporthook(count, block_size, total_size):
                # Progress callback; shares timing state via module globals.
                global start_time, progress_size
                if count == 0:
                    start_time = time.time()
                    progress_size = 0
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 18:44
# NOTE(review): concatenated fragment of a separate module (Chinese Treebank
# dependency-parsing corpus locations).
from hanlp.common.constant import HANLP_URL

# One archive bundles the corpora; the trailing '#' presumably lets a member
# path inside the zip be appended as an anchor — confirm against get_resource,
# which splits urls on '#'.
CTB_HOME = HANLP_URL + 'embeddings/SUDA-LA-CIP_20200109_021624.zip#'
# CTB5 dependency splits (CoNLL format, per the extensions).
CTB5_DEP_HOME = CTB_HOME + 'BPNN/data/ctb5/'
CTB5_DEP_TRAIN = CTB5_DEP_HOME + 'train.conll'
CTB5_DEP_VALID = CTB5_DEP_HOME + 'dev.conll'
CTB5_DEP_TEST = CTB5_DEP_HOME + 'test.conll'
# CTB7 dependency splits.
CTB7_HOME = CTB_HOME + 'BPNN/data/ctb7/'
CTB7_DEP_TRAIN = CTB7_HOME + 'train.conll'
CTB7_DEP_VALID = CTB7_HOME + 'dev.conll'
CTB7_DEP_TEST = CTB7_HOME + 'test.conll'
# Pretrained Chinese embedding file shipped inside the same archive.
CIP_W2V_100_CN = CTB_HOME + 'BPNN/data/embed.txt'
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-29 02:55
# NOTE(review): concatenated fragment of a separate module (pretrained
# dependency-parser model urls).
from hanlp.common.constant import HANLP_URL

# Biaffine dependency parsers: Chinese (CTB5/CTB7) and English (PTB), per the names.
CTB5_BIAFFINE_DEP_ZH = HANLP_URL + 'dep/biaffine_ctb5_20191229_025833.zip'
CTB7_BIAFFINE_DEP_ZH = HANLP_URL + 'dep/biaffine_ctb7_20200109_022431.zip'
PTB_BIAFFINE_DEP_EN = HANLP_URL + 'dep/ptb_dep_biaffine_20200101_174624.zip'
# Model registry; empty here — presumably populated at runtime elsewhere.
ALL = {}
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2020-01-01 03:51
# NOTE(review): concatenated fragment of a separate module (pretrained
# text-classification model urls).
from hanlp.common.constant import HANLP_URL

# Chinese sentiment model (ChnSentiCorp, BERT-base, per the name).
CHNSENTICORP_BERT_BASE_ZH = HANLP_URL + 'classification/chnsenticorp_bert_base_20200104_164655.zip'
# English SST-2 sentiment models.
SST2_BERT_BASE_EN = HANLP_URL + 'classification/sst2_bert_base_uncased_en_20200210_090240.zip'
SST2_ALBERT_BASE_EN = HANLP_URL + 'classification/sst2_albert_base_20200122_205915.zip'
# English situation classifiers trained on EmpatheticDialogues, per the names.
EMPATHETIC_DIALOGUES_SITUATION_ALBERT_BASE_EN = HANLP_URL + 'classification/empathetic_dialogues_situation_albert_base_20200122_212250.zip'
EMPATHETIC_DIALOGUES_SITUATION_ALBERT_LARGE_EN = HANLP_URL + 'classification/empathetic_dialogues_situation_albert_large_20200123_142724.zip'
# Model registry; empty here — presumably populated at runtime elsewhere.
ALL = {}
# NOTE(review): concatenated fragment of a separate module (pretrained
# word/character embedding urls).
from hanlp.common.constant import HANLP_URL

# ConvSeg embedding bundle; '#<member>' anchors address files inside the
# archive — confirm against get_resource, which splits urls on '#'.
CONVSEG_W2V_NEWS_TENSITE = HANLP_URL + 'embeddings/convseg_embeddings.zip'
CONVSEG_W2V_NEWS_TENSITE_WORD_PKU = CONVSEG_W2V_NEWS_TENSITE + '#news_tensite.pku.words.w2v50'
CONVSEG_W2V_NEWS_TENSITE_WORD_MSR = CONVSEG_W2V_NEWS_TENSITE + '#news_tensite.msr.words.w2v50'
CONVSEG_W2V_NEWS_TENSITE_CHAR = CONVSEG_W2V_NEWS_TENSITE + '#news_tensite.w2v200'
# Chinese fastText embeddings used by the SemEval-16 SDP models, per the names.
SEMEVAL16_EMBEDDINGS_CN = HANLP_URL + 'embeddings/semeval16_embeddings.zip'
SEMEVAL16_EMBEDDINGS_300_NEWS_CN = SEMEVAL16_EMBEDDINGS_CN + '#news.fasttext.300.txt'
SEMEVAL16_EMBEDDINGS_300_TEXT_CN = SEMEVAL16_EMBEDDINGS_CN + '#text.fasttext.300.txt'
CTB5_FASTTEXT_300_CN = HANLP_URL + 'embeddings/ctb.fasttext.300.txt.zip'
# Third-party (non-HANLP_URL) embedding hosted by Tencent.
TENCENT_AI_LAB_EMBEDDING = 'https://ai.tencent.com/ailab/nlp/data/Tencent_AILab_ChineseEmbedding.tar.gz#Tencent_AILab_ChineseEmbedding.txt'
RADICAL_CHAR_EMBEDDING_100 = HANLP_URL + 'embeddings/radical_char_vec_20191229_013849.zip#character.vec.txt'
# Registry; empty here — presumably populated at runtime elsewhere.
ALL = {}
def get_resource(path: str, save_dir=None, extract=True, prefix=HANLP_URL, append_location=True):
    """
    Fetch real path for a resource (model, corpus, whatever)
    :param path: the general path (can be a url or a real path)
    :param extract: whether to unzip it if it's a zip file
    :param save_dir: root directory to cache downloaded resources under
    :param prefix: first-party url prefix (affects where downloads are cached)
    :param append_location: whether to mirror the url's relative path locally
    :return: the real path to the resource

    NOTE(review): this fragment is truncated by the surrounding concatenation —
    it ends mid-way through the url branch (just after detecting a '#' anchor),
    so the download/extract logic is not visible here.
    """
    anchor: str = None
    compressed = None
    # Local paths are returned/accepted as-is; only urls need fetching.
    if os.path.isdir(path):
        return path
    elif os.path.isfile(path):
        pass
    elif path.startswith('http:') or path.startswith('https:'):
        url = path
        if '#' in url:
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-30 20:07
# NOTE(review): concatenated fragment of a separate module (pretrained NER model urls).
from hanlp.common.constant import HANLP_URL

# Chinese NER models trained on MSRA (BERT-base / ALBERT-base, per the names).
MSRA_NER_BERT_BASE_ZH = HANLP_URL + 'ner/ner_bert_base_msra_20200104_185735.zip'
MSRA_NER_ALBERT_BASE_ZH = HANLP_URL + 'ner/ner_albert_base_zh_msra_20200111_202919.zip'
# English NER model trained on CoNLL-2003, per the name.
CONLL03_NER_BERT_BASE_UNCASED_EN = HANLP_URL + 'ner/ner_conll03_bert_base_uncased_en_20200104_194352.zip'
# Model registry; empty here — presumably populated at runtime elsewhere.
ALL = {}
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-12-28 21:12
# NOTE(review): concatenated fragment of a separate module (pretrained Chinese
# word-segmentation model urls).
from hanlp.common.constant import HANLP_URL

# ConvSeg segmenters trained on SIGHAN-2005 corpora (PKU / MSR, per the names).
SIGHAN2005_PKU_CONVSEG = HANLP_URL + 'cws/sighan2005-pku-convseg_20200110_153722.zip'
SIGHAN2005_MSR_CONVSEG = HANLP_URL + 'cws/convseg-msr-nocrf-noembed_20200110_153524.zip'
# SIGHAN2005_MSR_BERT_BASE = HANLP_URL + 'cws/cws_bert_base_msra_20191230_194627.zip'
CTB6_CONVSEG = HANLP_URL + 'cws/ctb6_convseg_nowe_nocrf_20200110_004046.zip'
# CTB6_BERT_BASE = HANLP_URL + 'cws/cws_bert_base_ctb6_20191230_185536.zip'
PKU_NAME_MERGED_SIX_MONTHS_CONVSEG = HANLP_URL + 'cws/pku98_6m_conv_ngram_20200110_134736.zip'
# Will be filled up during runtime
ALL = {}