    which will be used to collect vectors
embedding_name : str
    Embedding name picked from the list of pretrained aliases
    mentioned above
"""
super(IntersectedVocab, self).__init__(vocab_file, *args, **kwargs)
self.type = "intersected"

parts = embedding_name.split(".")
name = parts[0]
middle = parts[1]
dim = parts[2][:-1]  # e.g. "300d" -> "300"
class_name = EMBEDDING_NAME_CLASS_MAPPING[name]

if not hasattr(vocab, class_name):
    from pythia.common.registry import registry

    writer = registry.get("writer")
    error = "Unknown embedding type: %s" % name
    if writer is not None:
        writer.write(error, "error")
    raise RuntimeError(error)

params = [middle]
if name == "glove":
    params.append(int(dim))

vector_cache = os.path.join(get_pythia_root(), ".vector_cache")
embedding = getattr(vocab, class_name)(*params, cache=vector_cache)
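For context, here is a minimal standalone sketch of the alias-parsing convention above, assuming torchtext-style names such as "glove.6B.300d"; the mapping dict and cache path are illustrative, not pythia's actual constants.

import os
import torchtext.vocab as vocab

# "glove.6B.300d" -> GloVe class, subset "6B", dimension 300
embedding_name = "glove.6B.300d"
name, middle, dim = embedding_name.split(".")
class_name = {"glove": "GloVe", "fasttext": "FastText"}[name]
embedding = getattr(vocab, class_name)(
    middle, int(dim[:-1]), cache=os.path.join(".", ".vector_cache")
)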
if not isinstance(vec, vocab.Vectors):
    vec_name = vec
    vec_data = cls._cached_vec_data.get(vec_name)
    if vec_data is None:
        parts = vec_name.split('.')
        if parts[0] == 'fasttext':
            if parts[2] == 'bin':
                vec_data = FastTextBinary(language=parts[1], cache=cache)
            elif parts[2] == 'vec' and parts[1] == 'wiki':
                vec_data = FastText(
                    suffix='wiki-news-300d-1M.vec.zip', cache=cache)
            elif parts[2] == 'vec' and parts[1] == 'crawl':
                vec_data = FastText(
                    suffix='crawl-300d-2M.vec.zip', cache=cache)
        if vec_data is None:
            vec_data = vocab.pretrained_aliases[vec_name](cache=cache)
        cls._cached_vec_data[vec_name] = vec_data
    vec_datas.append(vec_data)
else:
    vec_datas.append(vec)
return vec_datas
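The class-level cache above avoids re-reading vector files from disk. A minimal sketch of the same memoization pattern, assuming torchtext's pretrained_aliases table (the function name and cache directory here are illustrative):

import torchtext.vocab as vocab

_cached_vec_data = {}

def load_vectors(alias, cache=".vector_cache"):
    # load each alias once, then reuse the in-memory Vectors object
    if alias not in _cached_vec_data:
        _cached_vec_data[alias] = vocab.pretrained_aliases[alias](cache=cache)
    return _cached_vec_data[alias]

glove = load_vectors("glove.6B.50d")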
if isinstance(arg, textdata.Dataset):
    sources += [
        getattr(arg, name)
        for name, field in arg.fields.items()
        if field is self
    ]
else:
    sources.append(arg)

counter = Counter()
for data in sources:
    for x in data:
        if len(x) > 0:
            counter.update(x[0])

specials = [self.unk_token, self.pad_token]
self.vocab = vocab.Vocab(counter, specials=specials, **kwargs)
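build_vocab ultimately hands a Counter to torchtext's Vocab constructor. A minimal standalone sketch of that step (the tokens are made up):

from collections import Counter
import torchtext.vocab as vocab

counter = Counter("the cat sat on the mat".split())
v = vocab.Vocab(counter, specials=["<unk>", "<pad>"])
print(v.stoi["the"])  # frequent words get low indices after the specials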
if opt['embedding_type'].startswith('glove'):
    if 'twitter' in opt['embedding_type']:
        init = 'glove-twitter'
        name = 'twitter.27B'
        pretrained_dim = 200
    else:
        init = 'glove'
        name = '840B'
        pretrained_dim = 300  # GloVe 840B vectors are 300-dimensional
    embs = vocab.GloVe(
        name=name,
        dim=pretrained_dim,
        cache=modelzoo_path(
            self.opt.get('datapath'), 'models:glove_vectors'
        ),
    )
elif opt['embedding_type'].startswith('fasttext'):
    init = 'fasttext'
    embs = vocab.FastText(
        language='en',
        cache=modelzoo_path(
            self.opt.get('datapath'), 'models:fasttext_vectors'
        ),
    )
else:
    raise RuntimeError('embedding type not implemented')

if opt['embeddingsize'] != pretrained_dim:
    # project pretrained vectors to the model's embedding size
    rp = torch.Tensor(pretrained_dim, opt['embeddingsize']).normal_()
    t = lambda x: torch.mm(x.unsqueeze(0), rp)
else:
    t = lambda x: x

cnt = 0
for w, i in self.dict.tok2ind.items():
    if w in embs.stoi:
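When the pretrained dimension differs from the model's embedding size, the code above pushes each vector through a fixed random matrix. A minimal sketch of that projection, with made-up sizes:

import torch

pretrained_dim, embedding_size = 300, 128
rp = torch.Tensor(pretrained_dim, embedding_size).normal_()  # fixed random projection

vec = torch.randn(pretrained_dim)           # stands in for embs.vectors[idx]
projected = torch.mm(vec.unsqueeze(0), rp)  # shape (1, embedding_size)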
def __init__(self, name):
    self.name = name
    self.word2index = {}
    self.word2count = {}
    self.index2word = []
    # three special tokens; the exact names were lost from the original
    # page, <pad>/<sos>/<eos> is assumed here as the conventional choice
    for idx, word in enumerate(['<pad>', '<sos>', '<eos>']):
        self.word2index[word] = idx
        self.index2word.append(word)
        self.word2count[word] = 1
    self.n_words = 3
    self.glovec = torchtext.vocab.GloVe(cache=osp.join(this_dir, '..', 'data', 'caches'))
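Given the cosine_similarity import that accompanies this snippet, a plausible way to use the cached GloVe vectors is a word-to-word similarity check; the probe words here are illustrative.

import torchtext
from torch.nn.functional import cosine_similarity

glove = torchtext.vocab.GloVe(name="840B", dim=300)
v_red = glove.vectors[glove.stoi["red"]]
v_blue = glove.vectors[glove.stoi["blue"]]
print(float(cosine_similarity(v_red.unsqueeze(0), v_blue.unsqueeze(0))))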
"charngram.100d":
    lambda **kwargs: vocab.CharNGram(**kwargs),
"fasttext.en.300d":
    lambda **kwargs: vocab.FastText(language="en", **kwargs),
from graphviz import Digraph
import webcolors
import pprint
import math
from scipy.stats import norm
from color_histogram.core.hist_3d import Hist3D
# import pcl  # build python-pcl first: cd python-pcl && python setup.py build_ext -i && python setup.py install
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import torchtext  # requires torchtext==0.2.3 (pip install torchtext==0.2.3)
from torch.nn.functional import cosine_similarity
from collections import Counter
import pcl
import os.path as osp
import os
fasttext = torchtext.vocab.FastText()
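A quick sanity check against the FastText vectors loaded above (the probe word is arbitrary):

vec = fasttext.vectors[fasttext.stoi["chair"]]  # vector for an in-vocabulary word
print(vec.shape)                                # torch.Size([300])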
_GRAY = (218, 227, 218)
_GREEN = (18, 127, 15)
_WHITE = (255, 255, 255)
class same_node_detection(object):
    def __init__(self):
        self.compare_all = False
        # relative weights of the class / pose / color cues (sum to 1.0)
        self.class_weight = 10.0 / 20.0
        self.pose_weight = 8.0 / 20.0
        self.color_weight = 2.0 / 20.0

    def compare_class(self, curr_cls, prev_cls, cls_score):
        similar_cls = False
        same_cls = 0
        score = 0
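The weights in __init__ suggest the three cues are combined as a weighted sum; the compare_* methods are truncated here, so this is only a sketch of that combination with made-up scores.

class_score, pose_score, color_score = 0.9, 0.4, 0.7  # hypothetical cue scores
weights = (10.0 / 20.0, 8.0 / 20.0, 2.0 / 20.0)       # class, pose, color
total = (weights[0] * class_score
         + weights[1] * pose_score
         + weights[2] * color_score)
print(total)  # 0.68, presumably compared against a match threshold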
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import torchtext.vocab
import pickle
from collections import Counter, defaultdict
DEFAULT_SPECIAL_TOKENS = ["<pad>", "<s>", "</s>", "UNK"]


class Vocab(torchtext.vocab.Vocab):
    def __init__(self, path=None, unk_token="UNK", picklable=False):
        self._unk_token = unk_token
        self.itos = []
        if picklable:
            self.stoi = {}
        else:
            # unknown words fall back to index 3, the UNK slot above
            self.stoi = defaultdict(lambda: 3)
        if path:
            self.load(path)

    def size(self):
        return len(self.itos)

    def initialize(self, special_tokens=None):
        if special_tokens is None:
            special_tokens = DEFAULT_SPECIAL_TOKENS
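With the non-picklable defaultdict stoi, any out-of-vocabulary lookup silently resolves to index 3, the UNK slot in DEFAULT_SPECIAL_TOKENS. A small illustrative usage (populating itos/stoi by hand, since initialize is truncated above):

v = Vocab()  # picklable=False -> defaultdict stoi
v.itos = list(DEFAULT_SPECIAL_TOKENS)
for i, tok in enumerate(v.itos):
    v.stoi[tok] = i
print(v.stoi["never-seen-word"])  # -> 3 (UNK)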
self.vectors = torch.empty(
    (self.get_size(), len(embedding.vectors[0])), dtype=torch.float
)
self.embedding_dim = len(embedding.vectors[0])

# the first four rows are special tokens; give each a distinct constant vector
for i in range(0, 4):
    self.vectors[i] = torch.ones_like(self.vectors[i]) * 0.1 * i

for i in range(4, self.get_size()):
    word = self.itos[i]
    embedding_index = embedding.stoi.get(word, None)
    if embedding_index is None:
        self.vectors[i] = self.vectors[self.UNK_INDEX].clone()
    else:
        self.vectors[i] = embedding.vectors[embedding_index]
def main(options):
    # first pass: collect the vocabulary
    conll_reader = utils.io.CoNLLReader(open(options.train_conll_file))
    tokens = []
    postags = []
    for sent in conll_reader:
        for row in sent:
            tokens.append(row["FORM"])
            postags.append(row["UPOSTAG"])
    conll_reader.close()

    vocab = torchtext.vocab.Vocab(collections.Counter(tokens),
                                  specials=["<unk>", "<pad>"],
                                  max_size=options.vocab_size)

    postags = list(set(postags))
    postags.append("<unk>")
    postags.append("<pad>")
    postag2idx = dict((pair[1], pair[0]) for pair in enumerate(postags))

    oracle_reader = utils.io.OracleReader(open(options.train_oracle_file))
    actions = []
    for sent in oracle_reader:
        for row in sent:
            actions.append(make_action_str(row))

    actions = list(set(actions))
    actions.append("<unk>")
    actions.append("<pad>")
    action2idx = dict((pair[1], pair[0]) for pair in enumerate(actions))
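The pair-swapping dict(...) pattern above is equivalent to a dict comprehension, which may read more clearly; the action names here are made up.

actions = ["SHIFT", "LEFT-ARC", "RIGHT-ARC"]
action2idx = {action: idx for idx, action in enumerate(actions)}
# {'SHIFT': 0, 'LEFT-ARC': 1, 'RIGHT-ARC': 2}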