Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# wiki_lm_lstm 0.32
# thwiki_lm.pth?dl=1: 1.05GB [00:25, 41.5MB/s]
# /root/pythainlp-data/thwiki_model_lstm.pth
print(get_corpus_path('wiki_lm_lstm'))
# output: /root/pythainlp-data/thwiki_model_lstm.pth
"""
# check if the corpus is in local catalog, download if not
corpus_db_detail = get_corpus_db_detail(name)
if not corpus_db_detail or not corpus_db_detail.get("file_name"):
download(name)
corpus_db_detail = get_corpus_db_detail(name)
if corpus_db_detail and corpus_db_detail.get("file_name"):
# corpus is in the local catalog, get full path to the file
path = get_full_data_path(corpus_db_detail.get("file_name"))
# check if the corpus file actually exists, download if not
if not os.path.exists(path):
download(name)
if os.path.exists(path):
return path
return None
tar = tarfile.open(get_corpus_path("scb_1m_th-en_spm"), "r:gz")
tar.extractall()
tar.close()
if get_corpus_path("scb_1m_en-th_moses") is None:
download("scb_1m_en-th_moses", force=True, version="1.0")
tar = tarfile.open(get_corpus_path("scb_1m_en-th_moses"), "r:gz")
tar.extractall()
tar.close()
print("Install model...")
if not os.path.exists(get_full_data_path("scb_1m_th-en_newmm")):
os.mkdir(get_full_data_path("scb_1m_th-en_newmm"))
with tarfile.open(get_corpus_path("scb_1m_th-en_newmm")) as tar:
tar.extractall(path=get_full_data_path("scb_1m_th-en_newmm"))
if not os.path.exists(get_full_data_path("scb_1m_th-en_spm")):
os.mkdir(get_full_data_path("scb_1m_th-en_spm"))
with tarfile.open(get_corpus_path("scb_1m_th-en_spm")) as tar:
tar.extractall(path=get_full_data_path("scb_1m_th-en_spm"))
if not os.path.exists(get_full_data_path("scb_1m_en-th_moses")):
os.mkdir(get_full_data_path("scb_1m_en-th_moses"))
with tarfile.open(get_corpus_path("scb_1m_en-th_moses")) as tar:
tar.extractall(path=get_full_data_path("scb_1m_en-th_moses"))
if get_corpus_path("scb_1m_th-en_spm") is None:
download("scb_1m_th-en_spm", force=True, version="1.0")
tar = tarfile.open(get_corpus_path("scb_1m_th-en_spm"), "r:gz")
tar.extractall()
tar.close()
if get_corpus_path("scb_1m_en-th_moses") is None:
download("scb_1m_en-th_moses", force=True, version="1.0")
tar = tarfile.open(get_corpus_path("scb_1m_en-th_moses"), "r:gz")
tar.extractall()
tar.close()
print("Install model...")
if not os.path.exists(get_full_data_path("scb_1m_th-en_newmm")):
os.mkdir(get_full_data_path("scb_1m_th-en_newmm"))
with tarfile.open(get_corpus_path("scb_1m_th-en_newmm")) as tar:
tar.extractall(path=get_full_data_path("scb_1m_th-en_newmm"))
if not os.path.exists(get_full_data_path("scb_1m_th-en_spm")):
os.mkdir(get_full_data_path("scb_1m_th-en_spm"))
with tarfile.open(get_corpus_path("scb_1m_th-en_spm")) as tar:
tar.extractall(path=get_full_data_path("scb_1m_th-en_spm"))
if not os.path.exists(get_full_data_path("scb_1m_en-th_moses")):
os.mkdir(get_full_data_path("scb_1m_en-th_moses"))
with tarfile.open(get_corpus_path("scb_1m_en-th_moses")) as tar:
tar.extractall(path=get_full_data_path("scb_1m_en-th_moses"))
def _check_hash(dst: str, md5: str) -> None:
"""
Check hash helper.
@param: dst place to put the file
@param: md5 place to hash the file (MD5)
"""
if md5 and md5 != "-":
with open(get_full_data_path(dst), "rb") as f:
content = f.read()
file_md5 = hashlib.md5(content).hexdigest()
if md5 != file_md5:
raise Exception("Hash does not match expected.")
def get_path(model, path1, path2, file=None):
    """
    Build a path located two directory levels below a model's data path.

    The base is get_full_data_path(model), followed by ``path1`` and then
    ``path2``. When ``file`` is given it is appended as the final
    component; otherwise an empty component is joined so the result ends
    with a path separator.
    """
    model_root = get_full_data_path(model)
    first_level = os.path.join(model_root, path1)
    target_dir = os.path.join(first_level, path2)
    # Joining with "" yields the directory path with a trailing separator.
    last_part = "" if file is None else file
    return os.path.join(target_dir, last_part)
tar.extractall()
tar.close()
print("Install model...")
if not os.path.exists(get_full_data_path("scb_1m_th-en_newmm")):
os.mkdir(get_full_data_path("scb_1m_th-en_newmm"))
with tarfile.open(get_corpus_path("scb_1m_th-en_newmm")) as tar:
tar.extractall(path=get_full_data_path("scb_1m_th-en_newmm"))
if not os.path.exists(get_full_data_path("scb_1m_th-en_spm")):
os.mkdir(get_full_data_path("scb_1m_th-en_spm"))
with tarfile.open(get_corpus_path("scb_1m_th-en_spm")) as tar:
tar.extractall(path=get_full_data_path("scb_1m_th-en_spm"))
if not os.path.exists(get_full_data_path("scb_1m_en-th_moses")):
os.mkdir(get_full_data_path("scb_1m_en-th_moses"))
with tarfile.open(get_corpus_path("scb_1m_en-th_moses")) as tar:
tar.extractall(path=get_full_data_path("scb_1m_en-th_moses"))
def _download(url: str, dst: str) -> int:
"""
Download helper.

Stream the content at ``url`` into the file located at
get_full_data_path(dst), updating a tqdm progress bar when tqdm can be
imported.

@param: url to download file
@param: dst place to put the file
"""
# NOTE(review): annotated as returning int, but the return statement is
# not visible in this excerpt - presumably file_size; confirm.
_CHUNK_SIZE = 64 * 1024 # 64 KiB
# Size taken from the Content-Length header of a urlopen() response;
# falls back to -1 when the header is missing.
# NOTE(review): this is a separate request from the requests.get() call
# below, and the urlopen() response is not explicitly closed here.
file_size = int(urlopen(url).info().get("Content-Length", -1))
# stream=True lets the body be consumed chunk by chunk via iter_content().
r = requests.get(url, stream=True)
with open(get_full_data_path(dst), "wb") as f:
pbar = None
try:
# tqdm is optional: without it the download runs with no progress bar.
from tqdm import tqdm
# NOTE(review): indexing r.headers["Content-Length"] raises KeyError if
# the header is absent, and only ImportError is handled below - confirm
# the servers used always send it.
pbar = tqdm(total=int(r.headers["Content-Length"]))
except ImportError:
pbar = None
for chunk in r.iter_content(chunk_size=_CHUNK_SIZE):
# Empty chunks are skipped rather than written.
if chunk:
f.write(chunk)
if pbar:
pbar.update(len(chunk))
if pbar:
pbar.close()
else:
# NOTE(review): the body of this branch is not visible in this excerpt.
def get_path(model, path1, path2, file=None):
    """
    Return a path nested under the data directory of ``model``.

    Components are joined in the order get_full_data_path(model),
    ``path1``, ``path2``. If ``file`` is not None it becomes the last
    component; if it is None the returned directory path ends with a
    path separator.
    """
    data_root = get_full_data_path(model)
    nested_dir = os.path.join(os.path.join(data_root, path1), path2)
    if file is None:
        # An empty final component makes join() append a trailing separator.
        return os.path.join(nested_dir, "")
    return os.path.join(nested_dir, file)