Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def _build_token2idx_from_bert(self):
    """Load the BERT ``vocab.txt`` and wire token/index mappings into the processor.

    When the vocab file is missing under ``self.model_folder``, the pretrained
    model archive is downloaded and extracted first, and ``self.model_folder``
    is repointed at the extracted directory.
    """
    vocab_file = os.path.join(self.model_folder, 'vocab.txt')
    if not os.path.exists(vocab_file):
        # Resolve a model key to its download URL; default to the Chinese base model.
        model_name = self.model_key_map.get(self.model_folder, 'chinese_L-12_H-768_A-12')
        url = self.pre_trained_models.get(model_name)
        get_file(
            model_name + ".zip", url, extract=True,
            cache_dir=text2vec.USER_DIR,
            cache_subdir=text2vec.USER_DATA_DIR,
            verbose=1
        )
        self.model_folder = os.path.join(text2vec.USER_DATA_DIR, model_name)
        vocab_file = os.path.join(self.model_folder, 'vocab.txt')
    logger.debug(f'load vocab.txt from {vocab_file}')
    # One token per line; its index is the insertion order.
    vocab = {}
    with codecs.open(vocab_file, 'r', encoding='utf-8') as reader:
        for raw_line in reader:
            vocab[raw_line.strip()] = len(vocab)
    self.bert_token2idx = vocab
    self.tokenizer = keras_bert.Tokenizer(vocab)
    self.processor.token2idx = self.bert_token2idx
    # Inverse mapping for decoding indices back to tokens.
    self.processor.idx2token = {idx: tok for tok, idx in vocab.items()}
# -*- coding: utf-8 -*-
"""
@author:XuMing(xuming624@qq.com)
@description:
"""
import os
import tensorflow as tf
import text2vec
from text2vec.bert.model import BertSimilarity
if __name__ == '__main__':
    # Fine-tune BERT on the similarity data, then evaluate on the held-out split.
    model_dir = os.path.join(text2vec.USER_DATA_DIR, 'chinese_L-12_H-768_A-12')
    output_dir = os.path.join(text2vec.USER_DATA_DIR, 'fine_tuned_bert_similarity')
    sim = BertSimilarity(data_dir='../data/', model_dir=model_dir, output_dir=output_dir)
    sim.set_mode(tf.estimator.ModeKeys.TRAIN)
    sim.train()
    sim.set_mode(tf.estimator.ModeKeys.EVAL)
    sim.eval()
# NOTE(review): orphaned fragment of a generator body — the enclosing `def`
# (and the definitions of `line`, `unique_id`, `re`, and `InputExample`) is
# not visible in this chunk; presumably it iterates lines of an input file.
line = line.strip()
text_a = None
text_b = None
# A sentence pair is written on one line as "<text_a> ||| <text_b>"; a line
# without the " ||| " separator is a single sentence (text_b stays None).
m = re.match(r"^(.*) \|\|\| (.*)$", line)
if m is None:
    text_a = line
else:
    text_a = m.group(1)
    text_b = m.group(2)
yield InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b)
unique_id += 1
if __name__ == "__main__":
    # Demo: encode two near-synonymous Chinese greetings and show the embeddings.
    model_dir = os.path.join(text2vec.USER_DATA_DIR, 'chinese_L-12_H-768_A-12')
    out_dir = os.path.join(text2vec.USER_DATA_DIR, 'bert_vector')
    vector = BertVector(model_dir=model_dir, output_dir=out_dir)
    emb = vector.encode(['你好吗朋友', '您好呀小盆友'])
    print(str(emb))
    print(emb.shape)
# NOTE(review): orphaned fragment — the `continue` below belongs to a loop
# whose header is not visible in this chunk (likely "skip blank lines").
continue
line = line.strip()
text_a = None
text_b = None
# Same pair format as above: "<text_a> ||| <text_b>", or a single sentence.
m = re.match(r"^(.*) \|\|\| (.*)$", line)
if m is None:
    text_a = line
else:
    text_a = m.group(1)
    text_b = m.group(2)
yield InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b)
unique_id += 1
if __name__ == "__main__":
    # Embed two near-synonymous Chinese greetings with a pretrained BERT encoder.
    data_root = text2vec.USER_DATA_DIR
    vector = BertVector(model_dir=os.path.join(data_root, 'chinese_L-12_H-768_A-12'),
                        output_dir=os.path.join(data_root, 'bert_vector'))
    emb = vector.encode(['你好吗朋友', '您好呀小盆友'])
    print(str(emb))
    print(emb.shape)
def _build_token2idx_from_w2v(self):
    """Resolve/download the word2vec model, load it, and build the token vocab.

    The first four indices are reserved for the processor's special tokens;
    every word in the loaded model is appended after them.
    """
    if not self.w2v_path or not os.path.exists(self.w2v_path):
        # NOTE(review): this replaces a model *key* by its mapped value, which
        # is then used as a .get() key again below — confirm model_key_map layout.
        if self.w2v_path in self.model_key_map:
            self.w2v_path = self.model_key_map[self.w2v_path]
        model_dict = self.model_key_map.get(self.w2v_path, self.model_key_map['w2v-light-tencent-chinese'])
        tar_filename = model_dict.get('tar_filename')
        self.w2v_kwargs = {'binary': model_dict.get('binary')}
        url = model_dict.get('url')
        untar_filename = model_dict.get('untar_filename')
        self.w2v_path = os.path.join(text2vec.USER_DATA_DIR, untar_filename)
        if not os.path.exists(self.w2v_path):
            get_file(
                tar_filename, url, extract=True,
                cache_dir=text2vec.USER_DIR,
                cache_subdir=text2vec.USER_DATA_DIR,
                verbose=1
            )
    start = time.time()
    w2v = KeyedVectors.load_word2vec_format(self.w2v_path, **self.w2v_kwargs)
    # L2-normalize vectors in place (gensim<4 API; deprecated in gensim 4).
    w2v.init_sims(replace=True)
    logger.debug('load w2v from %s, spend %s s' % (self.w2v_path, time.time() - start))
    # Reserve the first four slots for the special tokens.
    token2idx = {
        self.processor.token_pad: 0,
        self.processor.token_unk: 1,
        self.processor.token_bos: 2,
        self.processor.token_eos: 3
    }
    for word in w2v.index2word:
        token2idx[word] = len(token2idx)
# -*- coding: utf-8 -*-
"""
@author:XuMing(xuming624@qq.com)
@description:
"""
import os
import tensorflow as tf
import text2vec
from text2vec.bert.model import BertSimilarity
if __name__ == '__main__':
    # Fine-tune, then evaluate, the BERT similarity model.
    base = text2vec.USER_DATA_DIR
    sim = BertSimilarity(
        data_dir='../data/',
        model_dir=os.path.join(base, 'chinese_L-12_H-768_A-12'),
        output_dir=os.path.join(base, 'fine_tuned_bert_similarity'),
    )
    sim.set_mode(tf.estimator.ModeKeys.TRAIN)
    sim.train()
    sim.set_mode(tf.estimator.ModeKeys.EVAL)
    sim.eval()
def _build_token2idx_from_bert(self):
    """Build the token->index mapping from the BERT ``vocab.txt``.

    Downloads and extracts the pretrained model archive first when the vocab
    file is not found under ``self.model_folder``.
    """
    dict_path = os.path.join(self.model_folder, 'vocab.txt')
    if not os.path.exists(dict_path):
        # Resolve a model key to its download URL; default to the Chinese base model.
        model_name = self.model_key_map.get(self.model_folder, 'chinese_L-12_H-768_A-12')
        url = self.pre_trained_models.get(model_name)
        get_file(
            model_name + ".zip", url, extract=True,
            cache_dir=text2vec.USER_DIR,
            cache_subdir=text2vec.USER_DATA_DIR,
            verbose=1
        )
        self.model_folder = os.path.join(text2vec.USER_DATA_DIR, model_name)
        dict_path = os.path.join(self.model_folder, 'vocab.txt')
    logger.debug(f'load vocab.txt from {dict_path}')
    token2idx = {}
    with codecs.open(dict_path, 'r', encoding='utf-8') as f:
        # One token per line; index = insertion order.
        for line in f:
            token = line.strip()
            token2idx[token] = len(token2idx)
    self.bert_token2idx = token2idx
    self.tokenizer = keras_bert.Tokenizer(token2idx)
    self.processor.token2idx = self.bert_token2idx
    # Inverse mapping for decoding indices back to tokens.
    self.processor.idx2token = dict([(value, key) for key, value in token2idx.items()])
def _build_token2idx_from_w2v(self):
    """Load the word2vec model (downloading on demand) and start building token2idx.

    NOTE(review): this copy of the method is truncated — the dict literal at
    the end is never closed in this chunk; the remainder lies outside our view.
    """
    if not self.w2v_path or not os.path.exists(self.w2v_path):
        # NOTE(review): this replaces a model *key* by its mapped value, which
        # is then used as a .get() key again below — confirm model_key_map layout.
        if self.w2v_path in self.model_key_map:
            self.w2v_path = self.model_key_map[self.w2v_path]
        model_dict = self.model_key_map.get(self.w2v_path, self.model_key_map['w2v-light-tencent-chinese'])
        tar_filename = model_dict.get('tar_filename')
        self.w2v_kwargs = {'binary': model_dict.get('binary')}
        url = model_dict.get('url')
        untar_filename = model_dict.get('untar_filename')
        self.w2v_path = os.path.join(text2vec.USER_DATA_DIR, untar_filename)
        if not os.path.exists(self.w2v_path):
            get_file(
                tar_filename, url, extract=True,
                cache_dir=text2vec.USER_DIR,
                cache_subdir=text2vec.USER_DATA_DIR,
                verbose=1
            )
    t0 = time.time()
    w2v = KeyedVectors.load_word2vec_format(self.w2v_path, **self.w2v_kwargs)
    # L2-normalize vectors in place (gensim<4 API; deprecated in gensim 4).
    w2v.init_sims(replace=True)
    logger.debug('load w2v from %s, spend %s s' % (self.w2v_path, time.time() - t0))
    # Reserve the first four indices for the special tokens.
    token2idx = {
        self.processor.token_pad: 0,
        self.processor.token_unk: 1,
        self.processor.token_bos: 2,
        self.processor.token_eos: 3
# -*- coding: utf-8 -*-
"""
@author:XuMing(xuming624@qq.com)
@description:
"""
import os
import tensorflow as tf
import text2vec
from text2vec.bert.model import BertSimilarity
if __name__ == '__main__':
    # Interactive demo: read sentence pairs and print their similarity score.
    sim = BertSimilarity(
        data_dir='../data/',
        model_dir=os.path.join(text2vec.USER_DATA_DIR, 'chinese_L-12_H-768_A-12'),
        output_dir=os.path.join(text2vec.USER_DATA_DIR, 'fine_tuned_bert_similarity'))
    sim.set_mode(tf.estimator.ModeKeys.PREDICT)
    while True:
        print('input start:')
        sentence1 = input('sentence1: ')
        sentence2 = input('sentence2: ')
        predict = sim.predict(sentence1, sentence2)
        print(f'similarity:{predict[0][1]}')
# -*- coding: utf-8 -*-
"""
@author:XuMing(xuming624@qq.com)
@description:
"""
import os
import tensorflow as tf
import text2vec
from text2vec.bert.model import BertSimilarity
if __name__ == '__main__':
    # Interactive demo: fine-tuned BERT similarity model in PREDICT mode.
    sim = BertSimilarity(data_dir='../data/', model_dir=os.path.join(text2vec.USER_DATA_DIR, 'chinese_L-12_H-768_A-12'),
                         output_dir=os.path.join(text2vec.USER_DATA_DIR, 'fine_tuned_bert_similarity'))
    sim.set_mode(tf.estimator.ModeKeys.PREDICT)
    while True:
        print('input start:')
        sentence1 = input('sentence1: ')
        sentence2 = input('sentence2: ')
        predict = sim.predict(sentence1, sentence2)
        # predict[0][1] presumably is the probability of the "similar" class — confirm.
        print(f'similarity:{predict[0][1]}')