query = "windy London"
tokenized_query = query.split(" ")
doc_scores = bm25.get_scores(tokenized_query)
print(doc_scores)
a = bm25.get_top_n(tokenized_query, corpus, n=2)
print(a)
print("*" * 45)
# Chinese BM25 example; `segment` is a Chinese word segmenter (see the jieba-based sketch below).
corpus = ['女网红能火的只是一小部分',  # "Only a small share of female influencers ever get popular"
          '当下最火的男明星为鹿晗',  # "The hottest male star right now is Lu Han"
          "How is the weather today?"]
tokenized_corpus = [segment(doc) for doc in corpus]
bm25 = BM25Okapi(tokenized_corpus)

query = '当下最火的女网红是谁?'  # "Who is the hottest female influencer right now?"
tokenized_query = segment(query)
doc_scores = bm25.get_scores(tokenized_query)
print(doc_scores)
a = bm25.get_top_n(tokenized_query, corpus, n=2)
print(a)
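# `segment` is not defined in the snippet above. A minimal sketch, assuming a
# jieba-based segmenter (function name and implementation are assumptions, not the original code):
import jieba

def segment(text):
    """Split Chinese (or mixed) text into a list of words."""
    return jieba.lcut(text)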
@classmethod
def setUpClass(cls):
    # Build one shared BERT embedding for the whole test class.
    from text2vec.embeddings.bert_embedding import BERTEmbedding
    cls.embedding = BERTEmbedding(sequence_length=SEQUENCE_LENGTH)

@classmethod
def setUpClass(cls):
    # Build one shared word-vector embedding for the whole test class.
    from text2vec.embeddings.word_embedding import WordEmbedding
    cls.embedding = WordEmbedding(sequence_length=SEQUENCE_LENGTH)
def get_sentence_examples(self, questions, prefix):
    """Turn tab-separated "text_a<TAB>text_b<TAB>label" lines into InputExample objects."""
    data_list = []
    for index, data in enumerate(questions):
        data = data.strip().split('\t')
        guid = '%s-%d' % (prefix, index)  # unique example id, e.g. "train-0"
        text_a = tokenization.convert_to_unicode(str(data[0]))
        text_b = tokenization.convert_to_unicode(str(data[1]))
        label = str(data[2])
        data_list.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return data_list
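# Usage sketch for get_sentence_examples. Each element of `questions` is one
# tab-separated "text_a<TAB>text_b<TAB>label" line; the `processor` object and
# sample data below are illustrative assumptions:
questions = [
    "How old are you?\tWhat is your age?\t1",
    "How old are you?\tHow is the weather?\t0",
]
examples = processor.get_sentence_examples(questions, prefix="train")
print(examples[0].guid)  # "train-0"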
def ngrams(words, ngram, join_string=" "):
    """Wrapper for n-gram extraction.

    ngram=1..4 returns plain n-grams; the composite codes 12 and 123 return
    unigrams+bigrams and unigrams+bigrams+trigrams respectively.
    """
    if ngram == 1:
        return NgramUtil.unigrams(words)
    elif ngram == 2:
        return NgramUtil.bigrams(words, join_string)
    elif ngram == 3:
        return NgramUtil.trigrams(words, join_string)
    elif ngram == 4:
        return NgramUtil.fourgrams(words, join_string)
    elif ngram == 12:
        unigram = NgramUtil.unigrams(words)
        bigram = [x for x in NgramUtil.bigrams(words, join_string) if len(x.split(join_string)) == 2]
        return unigram + bigram
    elif ngram == 123:
        unigram = NgramUtil.unigrams(words)
        bigram = [x for x in NgramUtil.bigrams(words, join_string) if len(x.split(join_string)) == 2]
        trigram = [x for x in NgramUtil.trigrams(words, join_string) if len(x.split(join_string)) == 3]
        return unigram + bigram + trigram
    else:
        raise ValueError("ngram must be one of 1, 2, 3, 4, 12, 123")
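# Usage sketch, assuming NgramUtil.unigrams/bigrams behave as their names suggest;
# ngram=12 concatenates unigrams and bigrams:
print(NgramUtil.ngrams(["new", "york", "city"], 12))
# expected: ['new', 'york', 'city', 'new york', 'york city']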
def init(self):
    # Lazily build the BM25 index the first time it is needed.
    if not self.bm25_instance:
        if not self.corpus:
            logger.error('corpus is None; set corpus with a list of documents.')
            raise ValueError("corpus must be set to a list of document strings")
        if isinstance(self.corpus, str):
            self.corpus = [self.corpus]
        # Map each document to its token list, then index the tokenized corpus.
        self.corpus_seg = {k: self.tokenizer.tokenize(k) for k in self.corpus}
        self.bm25_instance = BM25Okapi(corpus=list(self.corpus_seg.values()))
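# The method above builds the BM25 index lazily, on first use. A self-contained
# sketch of the same pattern (class and method names here are illustrative, not the original API):
from rank_bm25 import BM25Okapi

class LazyBM25:
    def __init__(self, corpus, tokenize):
        self.corpus = corpus      # list of document strings
        self.tokenize = tokenize  # callable: str -> list[str]
        self.bm25_instance = None

    def init(self):
        # Build the index only once, the first time it is needed.
        if self.bm25_instance is None:
            self.bm25_instance = BM25Okapi([self.tokenize(doc) for doc in self.corpus])

    def get_scores(self, query):
        self.init()
        return self.bm25_instance.get_scores(self.tokenize(query))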
@sequence_length.setter
def sequence_length(self, val: Union[int, str]):
    if isinstance(val, str):
        if val == 'auto':
            # 'auto' is resolved later by the processor from corpus statistics.
            logger.debug("Sequence length will be set automatically, at the 95th percentile of corpus sequence lengths")
        elif val == 'variable':
            val = None  # no fixed length; pad per batch
        else:
            raise ValueError("sequence_length must be an int, 'auto' or 'variable'")
    self.processor.sequence_length = val
embeds = []
for sentence in sentence_list:
    emb = []
    count = 0
    for word in sentence:
        if word not in self.w2v.vocab:
            continue  # skip out-of-vocabulary words
        emb.append(self.w2v[word])
        count += 1
    if count > 0:
        tensor_x = np.array(emb).sum(axis=0)  # element-wise sum over the word vectors
        avg_tensor_x = np.divide(tensor_x, count)
    else:
        avg_tensor_x = np.zeros(self.w2v.vector_size)  # every word was out of vocabulary
    embeds.append(avg_tensor_x)
embeds = np.array(embeds)
if debug:
    logger.debug(f'sentence tensor shape: {embeds.shape}')
return embeds
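# The loop above averages word vectors per sentence. A self-contained sketch of the same
# logic as a standalone function (the pre-4.0 gensim `vocab` attribute is assumed from the snippet):
import numpy as np

def average_word_vectors(sentence_list, w2v):
    """Average the word vectors of each tokenized sentence, skipping OOV words."""
    embeds = []
    for sentence in sentence_list:
        vectors = [w2v[word] for word in sentence if word in w2v.vocab]
        if vectors:
            embeds.append(np.mean(vectors, axis=0))
        else:
            embeds.append(np.zeros(w2v.vector_size))  # every word was out of vocabulary
    return np.array(embeds)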