    embeds = []
    for sentence in sentence_list:
        emb = []
        count = 0
        for word in sentence:
            if word not in self.w2v.vocab:
                continue
            emb.append(self.w2v[word])
            count += 1
        tensor_x = np.array(emb).sum(axis=0)  # sum the word vectors element-wise (along axis 0)
        avg_tensor_x = np.divide(tensor_x, count)
        embeds.append(avg_tensor_x)
    embeds = np.array(embeds)
    if debug:
        logger.debug(f'sentence tensor shape: {embeds.shape}')
    return embeds
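# A minimal standalone sketch of the same averaging idea, independent of this class.
# The model path 'w2v.txt' and the example sentence are illustrative; assumes
# gensim < 4.0, where KeyedVectors exposes ``.vocab`` as in the code above.
import numpy as np
from gensim.models import KeyedVectors

w2v = KeyedVectors.load_word2vec_format('w2v.txt')
sentence = ['hello', 'world']
vectors = [w2v[word] for word in sentence if word in w2v.vocab]
# Average the in-vocabulary word vectors into one fixed-size sentence vector.
sentence_vector = np.mean(vectors, axis=0) if vectors else np.zeros(w2v.vector_size)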
def _build_model(self, **kwargs):
    if self.embed_model is None:
        from tensorflow import keras
        if self.token_count == 0:
            logger.debug('need to build after build_word2idx')
        else:
            input_tensor = keras.layers.Input(shape=(self.sequence_length,),
                                              name='input')
            layer_embedding = keras.layers.Embedding(self.token_count,
                                                     self.embedding_size,
                                                     weights=[self.w2v_vector_matrix],
                                                     trainable=False,
                                                     name='layer_embedding')
            embedded_tensor = layer_embedding(input_tensor)
            self.embed_model = keras.Model(input_tensor, embedded_tensor)
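# Illustrative sketch of what the model built above does: a frozen Embedding layer
# maps integer token ids to their pretrained word2vec rows. All shapes and values
# here are made up for the example (5 tokens, 3-dimensional vectors, sequence length 4).
import numpy as np
from tensorflow import keras

pretrained = np.random.rand(5, 3)                      # stand-in for w2v_vector_matrix
inputs = keras.layers.Input(shape=(4,), name='input')
outputs = keras.layers.Embedding(5, 3,
                                 weights=[pretrained],
                                 trainable=False,
                                 name='layer_embedding')(inputs)
embed_model = keras.Model(inputs, outputs)

token_ids = np.array([[1, 2, 3, 0]])                   # one padded sequence of ids
print(embed_model.predict(token_ids).shape)            # (1, 4, 3)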
    Returns:
        vectorized sentence list

        print(token, predicts[i].tolist()[:4])
        [CLS] [0.24250675737857819, 0.04605229198932648, ...]
        from [0.2858668565750122, 0.12927496433258057, ...]
        that [-0.7514970302581787, 0.14548861980438232, ...]
        day [0.32245880365371704, -0.043174318969249725, ...]
        ...
    """
    if self.embed_model is None:
        raise ValueError('need to build model for embed sentence')
    tensor_x = self.process_x_dataset(sentence_list)
    if debug:
        logger.debug(f'sentence tensor: {tensor_x}')
    embed_results = self.embed_model.predict(tensor_x)
    return embed_results
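# Hypothetical usage of embed(), assuming ``embedding`` is an already-built instance
# of this class; the sentences and variable names are illustrative, not from the original code.
sentences = [['hello', 'world'], ['good', 'morning']]
predicts = embedding.embed(sentences, debug=True)
print(predicts.shape)  # (batch, sequence_length, embedding_size)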
        token2idx[token] = len(token2idx)
    vector_matrix = np.zeros((len(token2idx), w2v.vector_size))
    vector_matrix[1] = np.random.rand(w2v.vector_size)
    vector_matrix[4:] = w2v.vectors
    self.embedding_size = w2v.vector_size
    self.w2v_vector_matrix = vector_matrix
    self.w2v_token2idx = token2idx
    self.w2v_top_words = w2v.index2entity[:50]
    self.w2v_model_loaded = True
    self.w2v = w2v
    self.processor.token2idx = self.w2v_token2idx
    self.processor.idx2token = {value: key for key, value in self.w2v_token2idx.items()}
    logger.debug('word count : {}'.format(len(self.w2v_vector_matrix)))
    logger.debug('emb size : {}'.format(self.embedding_size))
    logger.debug('Top 50 word : {}'.format(self.w2v_top_words))
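# Self-contained sketch of the index layout built above: indices 0-3 are reserved for
# special tokens (the names used here are illustrative), index 1 gets a random vector
# for the unknown token, and the pretrained vectors fill rows 4 onward. Assumes
# gensim < 4.0, where KeyedVectors exposes ``.index2word`` and ``.vectors``.
import numpy as np
from gensim.models import KeyedVectors

w2v = KeyedVectors.load_word2vec_format('w2v.txt')      # hypothetical path
token2idx = {'<PAD>': 0, '<UNK>': 1, '<BOS>': 2, '<EOS>': 3}
for token in w2v.index2word:
    token2idx[token] = len(token2idx)

vector_matrix = np.zeros((len(token2idx), w2v.vector_size))
vector_matrix[1] = np.random.rand(w2v.vector_size)      # random vector for the unknown token
vector_matrix[4:] = w2v.vectors                          # rows 4.. follow the w2v word order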
    self.tokenizer = Tokenizer()
        w2v_path: word2vec file path
        w2v_kwargs: parameters passed to the ``load_word2vec_format()`` function of ``gensim.models.KeyedVectors`` -
            https://radimrehurek.com/gensim/models/keyedvectors.html#module-gensim.models.keyedvectors
        sequence_length: ``'auto'``, ``'variable'`` or an integer. With ``'auto'``, the 95th percentile of the
            corpus lengths is used as the sequence length. With ``'variable'``, the model input shape is set to
            None, so it can handle inputs of various lengths, using the longest sequence in each batch as the
            sequence length. With an integer, say ``50``, the input and output sequence length is set to 50.
        processor:
    """
    if w2v_kwargs is None:
        w2v_kwargs = {}
    self.w2v_path = w2v_path
    self.w2v_kwargs = w2v_kwargs
    self.w2v = None
    self.w2v_model_loaded = False
    logger.debug('load w2v embedding ...')
    super(WordEmbedding, self).__init__(sequence_length=sequence_length,
                                        embedding_size=0,
                                        processor=processor)
    self._build_token2idx_from_w2v()
    if trainable:
        self._build_model()
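# Hypothetical construction of a WordEmbedding, following the docstring above. The file
# path and the w2v_kwargs values are illustrative; ``binary`` and ``limit`` are standard
# keyword arguments of gensim's load_word2vec_format().
embedding = WordEmbedding(w2v_path='path/to/word2vec.txt',
                          w2v_kwargs={'binary': False, 'limit': 10000},
                          sequence_length='auto')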