import os
import shutil
import sys

import nltk
import requests


def init_bidaf(bidaf_model_dir: str, download_ntlk_punkt: bool = False) -> bool:
    if os.path.isdir(bidaf_model_dir):
        print("bidaf model directory already present..", file=sys.stderr)
    else:
        print("Creating bidaf model directory..", file=sys.stderr)
        os.makedirs(bidaf_model_dir, exist_ok=True)

    # Download Punkt Sentence Tokenizer
    if download_ntlk_punkt:
        nltk.download("punkt", download_dir=bidaf_model_dir)
        nltk.download("punkt")

    # Download bidaf onnx model if it is not already on disk
    onnx_model_file = os.path.abspath(os.path.join(bidaf_model_dir, "bidaf.onnx"))
    print(f"Checking file {onnx_model_file}..", file=sys.stderr)
    if os.path.isfile(onnx_model_file):
        print("bidaf.onnx downloaded already!", file=sys.stderr)
    else:
        print("Downloading bidaf.onnx...", file=sys.stderr)
        response = requests.get(
            "https://onnxzoo.blob.core.windows.net/models/opset_9/bidaf/bidaf.onnx",
            stream=True,
        )
        with open(onnx_model_file, "wb") as f:
            response.raw.decode_content = True
            shutil.copyfileobj(response.raw, f)
    return True
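A hedged follow-up sketch: once init_bidaf has run, the downloaded file can be sanity-checked by opening it with onnxruntime. The onnxruntime dependency and the model directory name are assumptions, not part of the snippet above.

import os

import onnxruntime as ort  # assumed extra dependency, not used by init_bidaf itself

model_dir = "bidaf_model"  # placeholder directory
init_bidaf(model_dir, download_ntlk_punkt=True)

# Opening the file confirms the download is a readable ONNX graph
session = ort.InferenceSession(os.path.join(model_dir, "bidaf.onnx"))
print([inp.name for inp in session.get_inputs()])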
""" Interface to Alex/Steven's Kunwinjku data. """
from pathlib import Path
import subprocess
from typing import List, Set
import nltk # type: ignore
# TODO This download should be conditional, since a complaint is raised if
# there is no net connection (see the conditional-download sketch after the
# phoneme sets below).
nltk.download("punkt")  # type: ignore
from pympi.Elan import Eaf
from .. import corpus
from .. import config
from ..preprocess.labels import segment_into_tokens
from ..utterance import Utterance
from ..preprocess.labels import LabelSegmenter
from ..corpus import Corpus
BASIC_PHONEMES = set(["a", "b", "d", "dj", "rd", "e", "h", "i", "k", "l",
"rl", "m", "n", "ng", "nj", "rn", "o", "r", "rr", "u",
"w", "y",])
DOUBLE_STOPS = set(["bb", "dd", "djdj", "rdd", "kk"])
DIPHTHONGS = set(["ay", "aw", "ey", "ew", "iw", "oy", "ow", "uy"])
PHONEMES = BASIC_PHONEMES | DOUBLE_STOPS | DIPHTHONGS
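The TODO above asks for the punkt download to run only when the tokenizer is actually missing. A minimal sketch, using the standard NLTK data lookup; the probe path "tokenizers/punkt" is assumed to match the installed layout:

import nltk  # type: ignore

try:
    # Probe the standard install location instead of downloading unconditionally
    nltk.data.find("tokenizers/punkt")  # type: ignore
except LookupError:
    # Only touch the network when punkt is genuinely absent
    nltk.download("punkt")  # type: ignore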
# Module-level imports assumed by this method
import nltk
from nltk.tag import StanfordNERTagger
from bllipparser import RerankingParser


def __init__(self, tagger, model):
    """
    Performs all necessary preprocessing.

    :param tagger: Path to the Stanford NER Tagger
    :param model: Path to the model for the NER Tagger
    """
    # Check whether the Punkt tokenizer model is already installed
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt')
    # Init NER parser
    self.nerParser = StanfordNERTagger(tagger, model)
    # Init Charniak parser
    self.rerankingParser = RerankingParser.fetch_and_load('WSJ+Gigaword-v2')
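A hedged usage sketch for this constructor, assuming it belongs to a class named Preprocessor; the class name and both paths are placeholders, not from the original. Note that nltk's StanfordNERTagger takes the classifier model file as its first argument and the .jar as its second, so it is worth checking which path each parameter here actually receives.

# Hypothetical class name and paths, for illustration only
preprocessor = Preprocessor(
    tagger="stanford-ner/stanford-ner.jar",
    model="stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz",
)
# The Stanford tagger expects pre-tokenized input
print(preprocessor.nerParser.tag("Barack Obama visited Berlin".split()))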
import json
import math
import numpy as np
import nltk
import os
import torch
from nltk.tokenize import sent_tokenize
from pytorch_pretrained_bert import BertForNextSentencePrediction
from pytorch_pretrained_bert import BertTokenizer
from torch.nn import Softmax
from configuration import CONFIG_DIR
from experiments_output import OUTPUT_DIR
nltk.download('punkt')
CONFIG_PATH = os.path.join(CONFIG_DIR, 'config.json')
def run_bert_ns(data, year, predictions_dict):
    """
    Run the BERT LM experiments for next-sentence prediction.

    :param data: The actual data of the year, stored in a dictionary
    :param year: The corresponding year of the data; used when we save the predictions
    :param predictions_dict: A dict where we save the predictions from our experiments
    :return:
    """
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    vocab_size = len(tokenizer.vocab)
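The snippet stops after building the tokenizer; below is a minimal sketch of how the already-imported BertForNextSentencePrediction, Softmax, and torch can score one sentence pair. The example sentences are made up, and this is not the rest of the original function.

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
model.eval()

sent_a = "He went to the store."
sent_b = "He bought a gallon of milk."
tokens_a = tokenizer.tokenize(sent_a)
tokens_b = tokenizer.tokenize(sent_b)
tokens = ['[CLS]'] + tokens_a + ['[SEP]'] + tokens_b + ['[SEP]']
segment_ids = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)

input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])
token_type_ids = torch.tensor([segment_ids])

with torch.no_grad():
    # Without labels the model returns the raw "is next sentence" logits
    logits = model(input_ids, token_type_ids=token_type_ids)
prob_is_next = Softmax(dim=1)(logits)[0, 0].item()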
import os  # assumed module-level import for os.makedirs / open


def run(self):
    import nltk
    nltk.download('stopwords')
    nltk.download('punkt')
    nltk.download('wordnet')
    nltk.download('averaged_perceptron_tagger')

    # Write a marker file so the download only has to run once
    path = 'data/external/nltk_download_SUCCESS'
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'w') as f:
        f.write('Downloaded nltk: stopwords, punkt, wordnet, averaged_perceptron_tagger')
def load_movie_reviews():
    # movie_reviews is a sizeable corpus to import, so only load it if we have to
    from nltk.corpus import movie_reviews
    try:
        movie_reviews.categories()
    except LookupError:
        import nltk
        print('This appears to be your first time using the NLTK Movie Reviews corpus. '
              'We will first download the necessary corpus (this is a one-time download '
              'that might take a little while).')
        nltk.download('movie_reviews')
        from nltk.corpus import movie_reviews

    raw_data = []

    # NLTK's corpus is structured in an interesting way:
    # first iterate through the two categories (pos and neg)
    for category in movie_reviews.categories():
        if category == 'pos':
            pretty_category_name = 'positive'
        elif category == 'neg':
            pretty_category_name = 'negative'

        # each of these categories is just fileids, so grab those
        for fileid in movie_reviews.fileids(category):
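            # The original snippet is truncated here. A hedged sketch of a typical
            # loop body, using standard NLTK corpus-reader calls (not the original
            # code): read the raw review text and keep it with its readable label.
            review_text = movie_reviews.raw(fileid)
            raw_data.append((review_text, pretty_category_name))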
import re
import ssl

import nltk
import numpy as np
from nltk import FreqDist
from nltk.corpus import brown


def getWords(min_len):
    # Allow nltk.download to work behind broken or self-signed SSL setups
    try:
        _create_unverified_https_context = ssl._create_unverified_context
    except AttributeError:
        pass
    else:
        ssl._create_default_https_context = _create_unverified_https_context

    nltk.download('brown')
    nltk.download('punkt')

    source = FreqDist(i.lower() for i in brown.words())
    source = np.array(source.most_common())[:, :1]

    # the Brown corpus contains duplicates and contains
    # words with weird punctuation and digits
    word_list = np.unique(np.char.lower(source))
    p = np.random.permutation(word_list.shape[0])
    word_list = word_list[p]

    words = [word for word in word_list
             if len(word) == len(set(word)) and re.search(r"[^A-Za-z ]", word) is None]
    output = [word for word in words
              if min_len <= len(word) <= 26 and word[-1:] != 's']
    return output
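A brief usage sketch (the minimum length of 5 is an arbitrary choice): getWords returns a shuffled list of lowercase Brown-corpus words with no repeated letters, no digits or punctuation, and no trailing 's'.

candidates = getWords(5)
print(len(candidates), candidates[:10])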
def nltk_download():
    """Download the required nltk corpora."""
    # required_nltk_corpora, red and abort are assumed to be defined elsewhere
    # in this deployment script.
    try:
        import nltk
        if not nltk.download(required_nltk_corpora):
            abort(red('Unable to download nltk corpora: %s' % required_nltk_corpora))
    except ImportError:
        abort(red("Failed to import nltk"))
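A minimal sketch of the pieces this task assumes but does not define; the corpus list is made up, and red/abort are plain stand-ins for the helpers the real deployment script (likely Fabric-style) would provide:

import sys

# Hypothetical values and helpers, for illustration only
required_nltk_corpora = ['punkt', 'stopwords', 'wordnet']

def red(message):
    # Wrap the message in ANSI red, roughly what a Fabric-style red() does
    return '\033[31m%s\033[0m' % message

def abort(message):
    # Report the error and stop, roughly what a Fabric-style abort() does
    print(message, file=sys.stderr)
    raise SystemExit(1)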
    # The snippet starts inside an if/elif chain on data_name; this branch downloads
    # the skip-thoughts files,
    # following https://github.com/ryankiros/skip-thoughts#getting-started
    skipthoughts_files = [
        'dictionary.txt', 'utable.npy', 'btable.npy', 'uni_skip.npz', 'uni_skip.npz.pkl',
        'bi_skip.npz', 'bi_skip.npz.pkl',
    ]
    for filename in skipthoughts_files:
        src_url = SKIPTHOUGHTS_BASE_URL + filename
        print('Downloading ' + src_url)
        urlretrieve(src_url, os.path.join(SKIPTHOUGHTS_DIR, filename),
                    reporthook=dl_progress_hook)
elif data_name == 'nltk_punkt':
    import nltk
    print('== NLTK pre-trained Punkt tokenizer for English ==')
    nltk.download('punkt')
elif data_name == 'pretrained_model':
    print('== Pretrained model ==')
    MODEL_DIR = os.path.join(DATA_DIR, 'Models')
    pretrained_model_filename = 'latest_model_flowers_temp.ckpt'
    src_url = 'https://bitbucket.org/paarth_neekhara/texttomimagemodel/raw/74a4bbaeee26fe31e148a54c4f495694680e2c31/' + pretrained_model_filename
    print('Downloading ' + src_url)
    urlretrieve(
        src_url,
        os.path.join(MODEL_DIR, pretrained_model_filename),
        reporthook=dl_progress_hook,
    )
else:
    raise ValueError('Unknown dataset name: ' + data_name)
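The branches above hand urlretrieve a dl_progress_hook that the snippet never defines. A minimal sketch of such a hook, using urlretrieve's standard (block_num, block_size, total_size) reporthook signature; the output formatting is an assumption:

import sys

def dl_progress_hook(block_num, block_size, total_size):
    # urlretrieve calls this after every block; total_size is -1 when unknown
    downloaded = block_num * block_size
    if total_size > 0:
        percent = min(100.0, downloaded * 100.0 / total_size)
        sys.stderr.write('\r%5.1f%% of %d bytes' % (percent, total_size))
    else:
        sys.stderr.write('\r%d bytes downloaded' % downloaded)
    sys.stderr.flush()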