How to use text2vec - 8 common examples

To help you get started, we’ve selected a few text2vec examples, based on popular ways it is used in public projects.


shibing624/text2vec/tests/rankbm25_demo.py (view on GitHub)
query = "windy London"
tokenized_query = query.split(" ")

doc_scores = bm25.get_scores(tokenized_query)
print(doc_scores)

a = bm25.get_top_n(tokenized_query, corpus, n=2)
print(a)

print("*" * 45)

corpus = ['女网红能火的只是一小部分',  # "only a small share of female internet celebrities become popular"
          '当下最火的男明星为鹿晗',  # "the hottest male star right now is Lu Han"
          "How is the weather today?"]
tokenized_corpus = [segment(doc) for doc in corpus]

bm25 = BM25Okapi(tokenized_corpus)

query = '当下最火的女网红是谁?'  # "who is the hottest female internet celebrity right now?"
tokenized_query = segment(query)

doc_scores = bm25.get_scores(tokenized_query)
print(doc_scores)

a = bm25.get_top_n(tokenized_query, corpus, n=2)
print(a)
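For reference, rank_bm25's get_top_n is just a descending argsort over get_scores followed by a top-n slice; a minimal check against the Chinese corpus built above:

import numpy as np

top_idx = np.argsort(bm25.get_scores(tokenized_query))[::-1][:2]
assert [corpus[i] for i in top_idx] == bm25.get_top_n(tokenized_query, corpus, n=2)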
shibing624/text2vec/tests/test_embedding.py (view on GitHub)
@classmethod
def setUpClass(cls):
        # SEQUENCE_LENGTH is a module-level constant in the test file
        from text2vec.embeddings.bert_embedding import BERTEmbedding
        cls.embedding = BERTEmbedding(sequence_length=SEQUENCE_LENGTH)
shibing624/text2vec/tests/test_embedding.py (view on GitHub)
@classmethod
def setUpClass(cls):
        # SEQUENCE_LENGTH is a module-level constant in the test file
        from text2vec.embeddings.word_embedding import WordEmbedding
        cls.embedding = WordEmbedding(sequence_length=SEQUENCE_LENGTH)
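Both fixtures build one embedding per test class. A hypothetical standalone use, assuming the embed() entry point that the rest of test_embedding.py exercises:

from text2vec.embeddings.word_embedding import WordEmbedding

SEQUENCE_LENGTH = 128  # stand-in for the test module's constant
embedding = WordEmbedding(sequence_length=SEQUENCE_LENGTH)

# embed() is assumed to accept pre-tokenized sentences and return one
# fixed-size vector per sentence.
vectors = embedding.embed([["windy", "London"], ["sunny", "Paris"]])
print(vectors.shape)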
shibing624/text2vec/text2vec/bert/model.py (view on GitHub)
def get_sentence_examples(self, questions, prefix):
        """Turn tab-separated `text_a<TAB>text_b<TAB>label` lines into InputExample objects."""
        data_list = []
        for index, data in enumerate(questions):
            data = data.strip().split('\t')
            guid = '%s-%d' % (prefix, index)
            text_a = tokenization.convert_to_unicode(str(data[0]))
            text_b = tokenization.convert_to_unicode(str(data[1]))
            label = str(data[2])
            data_list.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
        return data_list
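Each element of questions is expected to be one tab-separated record: text_a, text_b, and a label. A hypothetical call (model stands in for an instance of the enclosing class):

questions = [
    "How old are you?\tWhat is your age?\t1",
    "How old are you?\tWhere do you live?\t0",
]
examples = model.get_sentence_examples(questions, prefix="train")
print(examples[0].guid)  # train-0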
shibing624/text2vec/text2vec/utils/ngram.py (view on GitHub)
def ngrams(words, ngram, join_string=" "):
        """
        Generate n-grams from a token list.
        ngram codes: 1/2/3/4 for plain n-grams; 12 = unigrams + bigrams;
        123 = unigrams + bigrams + trigrams.
        """
        if ngram == 1:
            return NgramUtil.unigrams(words)
        elif ngram == 2:
            return NgramUtil.bigrams(words, join_string)
        elif ngram == 3:
            return NgramUtil.trigrams(words, join_string)
        elif ngram == 4:
            return NgramUtil.fourgrams(words, join_string)
        elif ngram == 12:
            unigram = NgramUtil.unigrams(words)
            bigram = [x for x in NgramUtil.bigrams(words, join_string) if len(x.split(join_string)) == 2]
            return unigram + bigram
        elif ngram == 123:
            unigram = NgramUtil.unigrams(words)
            bigram = [x for x in NgramUtil.bigrams(words, join_string) if len(x.split(join_string)) == 2]
            trigram = [x for x in NgramUtil.trigrams(words, join_string) if len(x.split(join_string)) == 3]
            return unigram + bigram + trigram
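A quick illustration of the composite codes, assuming unigrams/bigrams/trigrams behave as their names suggest with the default single-space join_string:

words = ["it", "is", "windy"]
print(NgramUtil.ngrams(words, 2))    # ['it is', 'is windy']
print(NgramUtil.ngrams(words, 12))   # ['it', 'is', 'windy', 'it is', 'is windy']
print(NgramUtil.ngrams(words, 123))  # adds ['it is windy'] on top of the line above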
shibing624/text2vec/text2vec/similarity.py (view on GitHub)
def init(self):
        if not self.bm25_instance:
            if not self.corpus:
                logger.error('corpus is empty; set corpus to a list of documents.')
                raise ValueError("corpus must be set: a list of document strings")

            if isinstance(self.corpus, str):
                self.corpus = [self.corpus]

            self.corpus_seg = {k: self.tokenizer.tokenize(k) for k in self.corpus}
            self.bm25_instance = BM25Okapi(corpus=list(self.corpus_seg.values()))
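init() builds the BM25 index lazily, so construction stays cheap and the segmentation cost is paid once, on first use. A hypothetical usage; the class name and the scoring call are assumptions, since only init() appears in this excerpt:

sim = SearchSimilarity(corpus=["the first document", "the second document"])  # class name assumed
sim.init()  # segments the corpus and builds BM25Okapi
tokens = sim.tokenizer.tokenize("first document")
print(sim.bm25_instance.get_scores(tokens))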
shibing624/text2vec/text2vec/embeddings/embedding.py (view on GitHub)
def sequence_length(self, val: Union[int, str]):
        if isinstance(val, str):
            if val == 'auto':
                logger.debug("Sequence length will auto set at 95% of sequence length")
            elif val == 'variable':
                val = None
            else:
                raise ValueError("sequence_length must be an int or 'auto' or 'variable'")
        self.processor.sequence_length = val
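The setter therefore accepts three kinds of values; a sketch of what each one means (embedding stands in for any instance of this class):

embedding.sequence_length = 128         # fixed padding/truncation length
embedding.sequence_length = 'variable'  # stored as None: no fixed length
embedding.sequence_length = 'auto'      # derived from corpus statistics (95% coverage)
try:
    embedding.sequence_length = 'longest'
except ValueError as e:
    print(e)  # sequence_length must be an int or 'auto' or 'variable'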
shibing624/text2vec/text2vec/embeddings/word_embedding.py (view on GitHub)
# np is numpy, imported at the top of word_embedding.py
embeds = []
for sentence in sentence_list:
    emb = []
    count = 0
    for word in sentence:
        if word not in self.w2v.vocab:
            continue
        emb.append(self.w2v[word])
        count += 1
    if count == 0:
        # no in-vocabulary word: fall back to a zero vector to avoid division by zero
        embeds.append(np.zeros(self.w2v.vector_size))
        continue
    tensor_x = np.array(emb).sum(axis=0)  # sum word vectors along axis 0
    avg_tensor_x = np.divide(tensor_x, count)
    embeds.append(avg_tensor_x)
embeds = np.array(embeds)
if debug:
    logger.debug(f'sentence tensor shape: {embeds.shape}')
return embeds
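Because the loop mean-pools word vectors into one fixed-size vector per sentence, sentence similarity reduces to cosine similarity between the returned rows. A minimal sketch (embedding and its method name are stand-ins for the surrounding class):

import numpy as np

def cosine(a, b):
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

v1, v2 = embedding.embed([["windy", "London"], ["sunny", "London"]])  # entry point assumed
print(cosine(v1, v2))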