# Imports for the snippet below; the allennlp/fasttext module paths are assumptions
# based on a pre-1.0 AllenNLP layout and may differ in newer releases.
import os
import json
import numpy as np
from fasttext import load_model  # fastText Python bindings; package casing varies by release
from allennlp.common.file_utils import cached_path
from allennlp.models.archival import load_archive
from allennlp.predictors import Predictor
from allennlp.modules.token_embedders.embedding import EmbeddingsTextFile
from sklearn.metrics import f1_score, precision_recall_fscore_support
from sklearn.decomposition import TruncatedSVD
EMBEDDING_DIM = 200
MEDLINE_WORD_PATH = 'https://s3-us-west-2.amazonaws.com/pubmed-rct/medline_word_prob.json'
DISCOURSE_MODEL_PATH = 'https://s3-us-west-2.amazonaws.com/pubmed-rct/model.tar.gz'
PUBMED_PRETRAINED_PATH = 'https://s3-us-west-2.amazonaws.com/pubmed-rct/wikipedia-pubmed-and-PMC-w2v.txt.gz'
TRAIN_PATH = 'https://s3-us-west-2.amazonaws.com/pubmed-rct/train_labels.json'
VALIDATION_PATH = 'https://s3-us-west-2.amazonaws.com/pubmed-rct/validation_labels.json'
TEST_PATH = 'https://s3-us-west-2.amazonaws.com/pubmed-rct/test_labels.json'
archive = load_archive(DISCOURSE_MODEL_PATH) # discourse model
predictor = Predictor.from_archive(archive, 'discourse_predictor')
assert os.path.exists('wiki.en.bin')
ft_model = load_model('wiki.en.bin') # fastText word vector
p_dict = json.load(open(cached_path(MEDLINE_WORD_PATH), 'r'))

def read_embedding(pretrained_path=PUBMED_PRETRAINED_PATH):
    """
    Read the PubMed pretrained embeddings from Amazon S3 and
    return a dictionary mapping tokens to vectors.
    """
    embeddings = {}
    with EmbeddingsTextFile(pretrained_path) as embeddings_file:
        for line in embeddings_file:
            token = line.split(' ', 1)[0]
            if token in p_dict:
                fields = line.rstrip().split(' ')
                vector = np.asarray(fields[1:], dtype='float32')
                embeddings[token] = vector
    return embeddings
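
For context, here is a minimal sketch of how the returned dictionary might be turned into a dense embedding matrix for a fixed vocabulary. The `build_embedding_matrix` helper, its `vocab` argument, and the random fallback for out-of-vocabulary tokens are illustrative assumptions, not part of the original code.

# Hypothetical usage sketch: build an EMBEDDING_DIM-sized matrix for a small
# vocabulary, falling back to a random vector for tokens missing from the file.
import numpy as np

def build_embedding_matrix(vocab, embeddings, dim=EMBEDDING_DIM, seed=0):
    rng = np.random.RandomState(seed)
    matrix = np.zeros((len(vocab), dim), dtype='float32')
    for i, token in enumerate(vocab):
        vector = embeddings.get(token)
        matrix[i] = vector if vector is not None else rng.normal(scale=0.1, size=dim)
    return matrix

# embeddings = read_embedding()
# matrix = build_embedding_matrix(['methods', 'results'], embeddings)
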
def _read(self, file_path):
    with open(cached_path(file_path), "r") as data_file:
        logger.info("Reading instances from lines in file at: %s", file_path)
        columns = data_file.readline().strip('\n').split('\t')
        for line in data_file:
            if not line:
                continue
            items = line.strip("\n").split("\t")
            tokens = items[columns.index("tokens")]
            category = items[columns.index("category")]
            instance = self.text_to_instance(tokens=tokens,
                                             category=category)
            if instance is not None:
                yield instance
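
As a rough illustration of the tab-separated layout this `_read` expects (a header row naming at least `tokens` and `category` columns), the standalone sketch below parses the same format without the AllenNLP reader. The sample rows and the `parse_rows` helper are invented for illustration.

import io

sample_tsv = "tokens\tcategory\nthe study enrolled 50 patients\tMETHODS\npain scores decreased\tRESULTS\n"

def parse_rows(data_file):
    # Mirror the column-index lookup used by _read above.
    columns = data_file.readline().strip('\n').split('\t')
    for line in data_file:
        if not line.strip():
            continue
        items = line.strip('\n').split('\t')
        yield items[columns.index('tokens')], items[columns.index('category')]

for tokens, category in parse_rows(io.StringIO(sample_tsv)):
    print(category, tokens)
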
def _read(self, file_path: str):
    # supporting multi-dataset training:
    datasets = []
    for ind, single_file_path in enumerate(file_path.split(',')):
        single_file_path_cached = cached_path(single_file_path)
        zip_handle = gzip.open(single_file_path_cached, 'rb')
        datasets.append({'single_file_path': single_file_path,
                         'file_handle': zip_handle,
                         'num_of_questions': 0, 'inst_remainder': [],
                         'dataset_weight': 1 if self._dataset_weight is None else self._dataset_weight[ind]})
        datasets[ind]['header'] = json.loads(datasets[ind]['file_handle'].readline())['header']

    is_done = [False for _ in datasets]
    while not all(is_done):
        for ind, dataset in enumerate(datasets):
            if is_done[ind]:
                continue
            for example in dataset['file_handle']:
                example = self.combine_context(json.loads(example))

def multiqa_to_squad(dataset_paths, dataset_weights=None, sample_size=-1):
    # Take one or more multiqa files and convert them to a SQuAD-format file.
    # supporting multi-dataset training:
    datasets = []
    for ind, single_file_path in enumerate(dataset_paths):
        single_file_path_cached = cached_path(single_file_path)
        zip_handle = gzip.open(single_file_path_cached, 'rb')
        datasets.append({'single_file_path': single_file_path,
                         'file_handle': zip_handle,
                         'num_of_questions': 0, 'inst_remainder': [],
                         'dataset_weight': 1 if dataset_weights is None else dataset_weights[ind]})
        datasets[ind]['header'] = json.loads(datasets[ind]['file_handle'].readline())['header']

    # We will have only one topic here.
    squad_data = {'data': [{'title': '', 'paragraphs': []}]}

    is_done = [False for _ in datasets]
    while not all(is_done):
        for ind, dataset in enumerate(datasets):
            if is_done[ind]:
                continue
            for example in dataset['file_handle']:
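
Both MultiQA snippets above read gzipped JSON-lines files whose first line is a JSON header and whose remaining lines are one example each. A self-contained sketch of that file layout, using an invented filename and fields:

import gzip
import json

path = 'toy_multiqa.jsonl.gz'  # invented filename for illustration

# Write a header line followed by one example per line.
with gzip.open(path, 'wt', encoding='utf-8') as f:
    f.write(json.dumps({'header': {'dataset_name': 'toy'}}) + '\n')
    f.write(json.dumps({'id': 'q1', 'question': 'Who?', 'answers': ['x']}) + '\n')

# Read it back the same way the readers above do: header first, then examples.
with gzip.open(path, 'rb') as f:
    header = json.loads(f.readline())['header']
    examples = [json.loads(line) for line in f]
print(header, len(examples))
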
self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
self._use_all_sql = use_all_sql
self._remove_unneeded_aliases = remove_unneeded_aliases
self._use_prelinked_entities = use_prelinked_entities
self._keep_if_unparsable = keep_if_unparseable

if not self._use_prelinked_entities:
    raise ConfigurationError(
        "The grammar based text2sql dataset reader "
        "currently requires the use of entity pre-linking."
    )

self._cross_validation_split_to_exclude = str(cross_validation_split_to_exclude)

if database_file is not None:
    database_file = cached_path(database_file)
    connection = sqlite3.connect(database_file)
    self._cursor = connection.cursor()
else:
    self._cursor = None

self._schema_path = schema_path
self._world = Text2SqlWorld(
    schema_path,
    self._cursor,
    use_prelinked_entities=use_prelinked_entities,
    use_untyped_entities=use_untyped_entities,
)
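
The database handling above boils down to caching a (possibly remote) SQLite file and opening a cursor on it. Here is a minimal standalone sketch of that pattern, using an in-memory database and an invented table so it runs without any download:

import sqlite3

# Stand-in for `sqlite3.connect(cached_path(database_file))` in the reader above.
connection = sqlite3.connect(':memory:')
cursor = connection.cursor()
cursor.execute('CREATE TABLE restaurant (id INTEGER PRIMARY KEY, name TEXT)')
cursor.execute("INSERT INTO restaurant (name) VALUES ('example')")
print(cursor.execute('SELECT name FROM restaurant').fetchall())
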
def _load_highway(self):
    # the highway layers have the same dimensionality as the number of cnn filters
    cnn_options = self._options["char_cnn"]
    filters = cnn_options["filters"]
    n_filters = sum(f[1] for f in filters)
    n_highway = cnn_options["n_highway"]

    # create the layers, and load the weights
    self._highways = Highway(n_filters, n_highway, activation=torch.nn.functional.relu)
    for k in range(n_highway):
        # The AllenNLP highway is one matrix multiplication with concatenation of
        # transform and carry weights.
        with h5py.File(cached_path(self._weight_file), "r") as fin:
            # The weights are transposed due to multiplication order assumptions in tf
            # vs pytorch (tf.matmul(X, W) vs pytorch.matmul(W, X))
            w_transform = numpy.transpose(fin["CNN_high_{}".format(k)]["W_transform"][...])
            # -1.0 since AllenNLP is g * x + (1 - g) * f(x) but tf is (1 - g) * x + g * f(x)
            w_carry = -1.0 * numpy.transpose(fin["CNN_high_{}".format(k)]["W_carry"][...])
            weight = numpy.concatenate([w_transform, w_carry], axis=0)
            self._highways._layers[k].weight.data.copy_(torch.FloatTensor(weight))
            self._highways._layers[k].weight.requires_grad = self.requires_grad

            b_transform = fin["CNN_high_{}".format(k)]["b_transform"][...]
            b_carry = -1.0 * fin["CNN_high_{}".format(k)]["b_carry"][...]
            bias = numpy.concatenate([b_transform, b_carry], axis=0)
            self._highways._layers[k].bias.data.copy_(torch.FloatTensor(bias))
            self._highways._layers[k].bias.requires_grad = self.requires_grad
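
The sign flip on the carry parameters works because sigmoid(-z) = 1 - sigmoid(z): negating the TF carry weights turns its (1 - g) * x + g * f(x) convention into AllenNLP's g * x + (1 - g) * f(x), as the inline comment above states. A small numerical sketch of that identity (plain NumPy, not the ELMo loading code):

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

rng = np.random.RandomState(0)
x, fx = rng.randn(4), rng.randn(4)      # input and transformed input
z = rng.randn(4)                        # pre-activation of the TF carry gate

tf_style = (1 - sigmoid(z)) * x + sigmoid(z) * fx          # TF convention
allennlp_style = sigmoid(-z) * x + (1 - sigmoid(-z)) * fx  # gate from negated weights
assert np.allclose(tf_style, allennlp_style)
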
def _read(self, file_path: str):
    file_path = cached_path(file_path)  # if `file_path` is a URL, redirect to the cache
    ace_reader = ACE()
    logger.info("Reading ACE Mention instances from dataset files at: %s", file_path)
    for sentence in self._sentence_iterate(ace_reader, file_path):
        tokens = [Token(t) for t in sentence.words]
        if not sentence.mention_tags:
            tags = ["O" for _ in tokens]
        else:
            tags = sentence.mention_tags
        yield self.text_to_instance(tokens, tags)

def _open_inside_zip(self, archive_path: str, member_path: Optional[str] = None) -> None:
    cached_archive_path = cached_path(archive_path, cache_dir=self._cache_dir)
    archive = zipfile.ZipFile(cached_archive_path, "r")
    if member_path is None:
        members_list = archive.namelist()
        member_path = self._get_the_only_file_in_the_archive(members_list, archive_path)
    member_path = cast(str, member_path)
    member_file = archive.open(member_path, "r")
    self._handle = io.TextIOWrapper(member_file, encoding=self._encoding)
    self._archive_handle = archive
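
A minimal, self-contained sketch of the same zip-member-to-text-stream pattern using only the standard library; the in-memory archive and its member name are invented for illustration.

import io
import zipfile

# Build a tiny zip archive in memory with a single text member.
buffer = io.BytesIO()
with zipfile.ZipFile(buffer, 'w') as zf:
    zf.writestr('vectors.txt', 'hello 0.1 0.2 0.3\nworld 0.4 0.5 0.6\n')
buffer.seek(0)

# Open the member as a text stream, mirroring _open_inside_zip above.
archive = zipfile.ZipFile(buffer, 'r')
member_path = archive.namelist()[0]
handle = io.TextIOWrapper(archive.open(member_path, 'r'), encoding='utf-8')
print(handle.readline().strip())
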
@classmethod  # assumed: the `cls` first parameter implies this is a classmethod
def from_params(cls, params: Params, instances: Iterable['adi.Instance'] = None):
    vampire_vocab_file = params.pop('vampire_vocab_file')
    vocab = cls()
    vocab = vocab.from_instances(instances=instances,
                                 tokens_to_add={"classifier": ["@@UNKNOWN@@"]})
    vampire_vocab_file = cached_path(vampire_vocab_file)
    vocab.set_from_file(filename=vampire_vocab_file,
                        namespace="vampire",
                        oov_token="@@UNKNOWN@@",
                        is_padded=False)
    return vocab
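
Common to all of the snippets above is `cached_path`, which downloads a remote file once, stores it in a local cache directory, and returns the local filesystem path (plain local paths are returned unchanged). A minimal sketch, assuming the pre-1.0 AllenNLP import location and reusing one of the URLs defined at the top:

import os
from allennlp.common.file_utils import cached_path  # location may differ in newer releases

local_file = cached_path('https://s3-us-west-2.amazonaws.com/pubmed-rct/medline_word_prob.json')
assert os.path.exists(local_file)  # later calls reuse the cached copy instead of re-downloading
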