import os
from collections import OrderedDict

import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from nlpia.data.loaders import get_data, BIGDATA_PATH

wordvector_path = os.path.join(BIGDATA_PATH, 'GoogleNews-vectors-negative300.bin.gz')  # not in book; the reader is expected to compose this path
if 'word_vectors' not in globals():  # not in book
    WV = word_vectors = get_data('word2vec')  # downloads the GoogleNews vectors and returns a KeyedVectors instance
    # Loading the downloaded file directly with gensim is equivalent:
    # word_vectors = KeyedVectors.load_word2vec_format(wordvector_path, binary=True)

###################################################
# Still need to create a class derived from gensim's Word2Vec model instead of relying on the word_vectors global
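# A quick sanity check (not in the original listing): once the GoogleNews vectors
# are loaded, KeyedVectors supports analogy-style similarity queries.
print(word_vectors.most_similar(positive=['king', 'woman'], negative=['man'], topn=3))
# 'queen' is expected near the top of the result list.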
COMPONENT_WORDS = OrderedDict([
    ('placeness', ('geography Geography geographic geographical geographical_location location ' +
                   'locale locations proximity').split()),
    ('peopleness', 'human Humans homo_sapiens peole people individuals humankind people men women'.split()),
    ('animalness', 'animal mammal carnivore animals Animal animal_welfare dog pet cats ani_mal'.split()),
    ('conceptness', 'concept concepts idea'.split()),
    ('femaleness', 'female Female females femal woman girl lady'.split()),
])
def component_vector(words):
    # The body was cut off in the source listing; a minimal reconstruction,
    # assuming the component vector is the sum of the member word vectors:
    vector = np.zeros(300, dtype='float32')  # GoogleNews vectors are 300-d
    for word in words:
        if word in word_vectors:  # skip tokens missing from the vocabulary
            vector += word_vectors[word]
    return vector
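# Usage sketch (not in the original listing): build one vector per component and
# score an arbitrary word against it with a dot product.
component_vectors = {name: component_vector(words)
                     for name, words in COMPONENT_WORDS.items()}
print(np.dot(component_vectors['placeness'], word_vectors['Berlin']))  # higher for place-like words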
import json
import zipfile

from gensim import models

def load_embeddings(embeddings_file):
    # Detect the model format by its extension.
    # Binary word2vec format:
    if embeddings_file.endswith('.bin.gz') or embeddings_file.endswith('.bin'):
        emb_model = models.KeyedVectors.load_word2vec_format(
            embeddings_file, binary=True, unicode_errors='replace')
    # Text word2vec format:
    elif embeddings_file.endswith(('.txt.gz', '.txt', '.vec.gz', '.vec')):
        emb_model = models.KeyedVectors.load_word2vec_format(
            embeddings_file, binary=False, unicode_errors='replace')
    # ZIP archive from the NLPL vector repository:
    elif embeddings_file.endswith('.zip'):
        with zipfile.ZipFile(embeddings_file, "r") as archive:
            # Load and show the metadata of the model:
            metafile = archive.open('meta.json')
            metadata = json.loads(metafile.read())
            for key in metadata:
                print(key, metadata[key])
            print('============')
            # Load the model itself:
            stream = archive.open("model.bin")  # or model.txt, if you want to look at the model
            emb_model = models.KeyedVectors.load_word2vec_format(
                stream, binary=True, unicode_errors='replace')
    else:
        # Native gensim format; this branch was left empty in the source,
        # so the KeyedVectors.load call here is an assumed completion:
        emb_model = models.KeyedVectors.load(embeddings_file)
    return emb_model
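# Usage sketch (not in the original listing); the filename is a stand-in for any
# local embedding file in one of the supported formats:
emb = load_embeddings('GoogleNews-vectors-negative300.bin.gz')
print(emb.most_similar('computer', topn=5))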
def dump_word_embeddings(word2id, emb_size, word2vec_path, embeddings_path):
    """Build an embedding matrix for a fixed vocabulary and save it as .npy."""
    vocab_size = len(word2id)
    word2vec = models.KeyedVectors.load_word2vec_format(
        word2vec_path, binary=False)
    embeddings = np.random.randn(vocab_size, emb_size)
    for word, idx in word2id.items():
        if word in word2vec:
            embeddings[idx, :] = word2vec[word]
        else:
            # Out-of-vocabulary words keep a random normal vector
            embeddings[idx, :] = np.random.randn(emb_size)
    np.save(embeddings_path, embeddings)
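# Usage sketch (not in the original listing); the vocabulary and file names are
# hypothetical stand-ins:
word2id = {'<pad>': 0, 'hello': 1, 'world': 2}
dump_word_embeddings(word2id, emb_size=300, word2vec_path='vectors.txt',
                     embeddings_path='embeddings.npy')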
import time

def load_from_model_w2v(self, words, maxwords=None, verbose=False):
    try:
        from gensim.models import KeyedVectors
    except ImportError:
        raise ValueError("No gensim installation found. Please install "
                         "`gensim` to load pretrained w2v embeddings.")
    start = time.time()
    model = KeyedVectors.load_word2vec_format(self.fname, binary=True)
    if verbose:
        print("Loaded model in {:.3f} secs".format(time.time() - start))
    if words is not None:
        # Keep only the requested words that exist in the model's vocabulary
        vectors, outwords = [], []
        for word in words:
            try:
                vectors.append(model[word])
                outwords.append(word)
            except KeyError:
                pass
    else:
        # No word list given: take (up to maxwords) words from the full vocabulary
        outwords = list(model.vocab.keys())  # model.key_to_index in gensim >= 4
        if maxwords is not None:
            outwords = outwords[:min(maxwords, len(model.vocab) - 1)]
        vectors = [model[w] for w in outwords]
    return outwords, vectors  # assumed return; the snippet ends without one in the source
from keras import backend as K
from keras.layers import Embedding, Input
from keras.models import Model

def PretrainedEmbedding(self):
    inputs = Input(shape=(None,), dtype='int32')
    embeddings = KeyedVectors.load_word2vec_format(self.word_embedding_path, binary=False)
    # Prepend an all-zeros row so that index 0 can serve as the padding token
    word_embeddings_weights = K.cast_to_floatx(
        np.concatenate((np.zeros((1, embeddings.syn0.shape[-1]), dtype=np.float32),
                        embeddings.syn0), axis=0))  # embeddings.syn0 is embeddings.vectors in gensim >= 4
    embeds = Embedding(len(word_embeddings_weights), word_embeddings_weights.shape[-1],
                       weights=[word_embeddings_weights], trainable=False)(inputs)
    return Model(inputs=inputs, outputs=embeds, name='embedding')
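# Usage sketch (not in the original listing): `self` can be any object exposing a
# word_embedding_path attribute; SimpleNamespace is a hypothetical stand-in.
from types import SimpleNamespace
holder = SimpleNamespace(word_embedding_path='vectors.txt')  # hypothetical path
embedding_model = PretrainedEmbedding(holder)
print(embedding_model.predict(np.array([[1, 2, 3]])).shape)  # (1, 3, embedding_dim)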
import logging

from gensim.models import FastText

log = logging.getLogger(__name__)

def init_from_scratch(self, tok2int_vocab):
    if self.embeddings == "fasttext":
        # gensim < 4; use gensim.models.fasttext.load_facebook_model in newer versions
        self.embeddings_model = FastText.load_fasttext_format(str(self.emb_model_file))
    elif self.embeddings == "word2vec":
        self.embeddings_model = KeyedVectors.load_word2vec_format(str(self.emb_model_file),
                                                                  binary=True)
    log.info("[initializing new `{}`]".format(self.__class__.__name__))
    self.build_int2emb_vocab(tok2int_vocab)
    self.build_emb_matrix(tok2int_vocab)
import gensim

def get_word_vectors(self, vocab):
    """
    Load the pretrained word vectors and build the corresponding embedding matrix.
    :param vocab: the vocabulary, as a list of tokens
    :return: the embedding matrix
    """
    # Initialize all rows with small uniform noise; rows for known words are overwritten below
    word_vectors = (1 / np.sqrt(len(vocab)) *
                    (2 * np.random.rand(len(vocab), self._embedding_size) - 1))
    if os.path.splitext(self._word_vectors_path)[-1] == ".bin":
        word_vec = gensim.models.KeyedVectors.load_word2vec_format(self._word_vectors_path, binary=True)
    else:
        word_vec = gensim.models.KeyedVectors.load_word2vec_format(self._word_vectors_path)
    for i in range(len(vocab)):
        try:
            word_vectors[i, :] = word_vec[vocab[i]]
        except KeyError:
            print(vocab[i] + " not found in the pretrained word vectors")
    return word_vectors
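# Usage sketch (not in the original listing): `self` is any object exposing
# _embedding_size and _word_vectors_path; SimpleNamespace is a stand-in again.
from types import SimpleNamespace
cfg = SimpleNamespace(_embedding_size=300, _word_vectors_path='vectors.bin')  # hypothetical
emb_matrix = get_word_vectors(cfg, ['the', 'cat', 'sat'])
print(emb_matrix.shape)  # (3, 300)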
# Here we create instances of our custom paragraph vector model.
# Set values for the various parameters:
num_features = 4000    # Word vector dimensionality
min_word_count = 30    # Minimum word count
num_workers = 4        # Number of threads to run in parallel
context = 30           # Context window size
downsampling = 1e-3    # Downsampling setting for frequent words

# Using the same parameters, we create one instance with normalized bag-of-words
# scaling and another with tf-idf scaling.
par_vec_nbow = CustomParVec(words_by_line, num_workers, num_features, min_word_count, context, downsampling, False)
par_vec_tfidf = CustomParVec(words_by_line, num_workers, num_features, min_word_count, context, downsampling, True)
# We also experiment with Google's pretrained word2vec model, which has 300 dimensions.
if USE_GOOGLE_NEWS:
    model_google = models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
    par_vec_google = CustomParVec(words_by_line, num_workers, 300, min_word_count, context, downsampling, True, model_google)

# Place our models in a single list:
if USE_GOOGLE_NEWS:
    par_vecs = [par_vec_google, par_vec_nbow, par_vec_tfidf]
else:
    par_vecs = [par_vec_nbow, par_vec_tfidf]
# Data for any prior RIA we would like to test:
policy_documents_liberia = ['Liberia Agenda for Transformation.txt', 'Liberia Eco stabilization and recovery plan-april_2015.txt']
policy_documents_bhutan = ['Eleventh-Five-Year-Plan_Vol-1.txt', '11th-Plan-Vol-2.txt']
policy_documents_namibia = ['na-nbsap-v2-en.txt', 'Agri Book with cover1.txt', 'execution strategy for industrialisation.txt', 'INDC of Namibia Final pdf.txt', 'Namibia_Financial_Sector_Strategy.txt', 'Tourism Policy.txt', 'namibia_national_health_policy_framework_2010-2020.txt', 'nampower booklet_V4.txt', '826_Ministry of Education Strategic Plan 2012-17.txt', 'Namibia_NDP4_Main_Document.txt']
policy_documents_cambodia = ['National Strategic Development Plan 2014-2018 EN Final.txt', 'Cambodia_EducationStrategicPlan_2014_2018.txt', 'Cambodia Climate Change Strategic Plan 2014_2023.txt', 'Cambodia Industrial Development Policy 2015_2025.txt', 'Cambodian Gender Strategic Plan - Neary Rattanak 4_Eng.txt', 'Draft_HealthStrategicPlan2016-2020.txt', 'Cambodia_national-disability-strategic-plan-2014-2018.txt', 'National_Policy_on_Green_Growth_2013_EN.txt', 'tourism_development_stategic_plan_2012_2020_english.txt', 'Labour Migration Policy for Cambodia 2015-2018.txt', 'kh-nbsap-v2-en.txt', 'financial-sector-development-strategy-2011-2020.txt', 'National_Social_Protection_Strategy_for_the_Poor_and_Vulnerable_Eng.txt']
policy_documents_mauritius = ['Agro-forestry Strategy 2016-2020.txt', 'VISION_14June2016Vision 2030DraftVersion4.txt', 'Updated Action Plan of the Energy Strategy 2011 -2025.txt', 'National Water Policy 2014.txt', 'National CC Adaptioin Policy Framework report.txt', 'MauritiusEnergy Strategy 2009-2025.txt', 'Mauritius Govertment programme 2015-2019.txt', 'CBD Strategy and Action Plan.txt']
from typing import Union

def __init__(self, model: Union[k.Word2VecModels, str], **kwargs):
    # `k` is presumably a project-local constants module mapping model names to paths
    self.model = model
    self.model_path = k.get_model_path(model)
    self.keyed_vector: KeyedVectors = KeyedVectors.load_word2vec_format(self.model_path, **kwargs)
    self.embedding_size = self.keyed_vector.vector_size

    logging.debug('------------------------------------------------')
    logging.debug('Loaded gensim word2vec model')
    logging.debug('model : {}'.format(self.model_path))
    logging.debug('word count : {}'.format(len(self.keyed_vector.index2entity)))  # index_to_key in gensim >= 4
    logging.debug('Top 50 words: {}'.format(self.keyed_vector.index2entity[:50]))
    logging.debug('------------------------------------------------')
# (This snippet is truncated at the start: earlier branches of the if/elif chain,
# which resolve named embeddings to downloadable files, are not shown.)
    embeddings = cached_path(
        f"{embeddings_path_v4}{embeddings[:2]}-crawl-fasttext-300d-1M",
        cache_dir=cache_dir,
    )
elif not Path(embeddings).exists():
    raise ValueError(
        f'The given embeddings "{embeddings}" is not available or is not a valid path.'
    )

self.name: str = str(embeddings)
self.static_embeddings = True

if str(embeddings).endswith(".bin"):
    # Plain word2vec binary format:
    self.precomputed_word_embeddings = gensim.models.KeyedVectors.load_word2vec_format(
        str(embeddings), binary=True
    )
else:
    # Native gensim format (e.g. a saved KeyedVectors object):
    self.precomputed_word_embeddings = gensim.models.KeyedVectors.load(
        str(embeddings)
    )

self.field = field
self.__embedding_length: int = self.precomputed_word_embeddings.vector_size
super().__init__()
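# Quick check (not in the original listing): vector_size reports the embedding
# dimensionality of any loaded KeyedVectors, which is exactly what
# __embedding_length captures above; the path is a hypothetical stand-in.
kv = gensim.models.KeyedVectors.load_word2vec_format('vectors.txt')
print(kv.vector_size)  # e.g. 300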