How to use yake - common examples

To help you get started, we've selected a few yake examples based on popular ways the library is used in public projects.
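Before diving into the project excerpts, here is a minimal end-to-end sketch (assuming yake is installed, e.g. via pip install yake; the sample sentence and parameter values are illustrative). Note that recent yake releases return (keyword, score) pairs, while older releases, such as the REST API example below, used (score, keyword):

import yake

text = ("Google is acquiring Kaggle, a platform that hosts "
        "data science and machine learning competitions.")

# Parameters mirror those used in the excerpts below: language, max ngram
# size, deduplication threshold, co-occurrence window, and top-N keywords.
kw_extractor = yake.KeywordExtractor(lan="en", n=3, dedupLim=0.9,
                                     windowsSize=2, top=10)

for keyword, score in kw_extractor.extract_keywords(text):
    print(keyword, score)  # lower scores indicate more relevant keywords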

From LIAAD/yake: tests/test_yake.py (view on GitHub)

import yake

# Illustrative wrapper, added so this excerpt parses; the sample string
# below is truncated at the start in this listing.
def test_n3_en():
    text_content = """mindshare in this community, too    (though it already has plenty of that thanks to Tensorflow
    and other projects).    Kaggle has a bit of a history with Google, too, but that's pretty recent.
    Earlier this month,    Google and Kaggle teamed up to host a $100,000 machine learning competition
    around classifying    YouTube videos. That competition had some deep integrations with the
    Google Cloud Platform, too.    Our understanding is that Google will keep the service running -
    likely under its current name.    While the acquisition is probably more about Kaggle's community
    than technology, Kaggle did build    some interesting tools for hosting its competition and 'kernels',
    too. On Kaggle, kernels are    basically the source code for analyzing data sets and developers can
    share this code on the    platform (the company previously called them 'scripts').  Like similar
    competition-centric sites,    Kaggle also runs a job board, too. It's unclear what Google will do
    with that part of the service.    According to Crunchbase, Kaggle raised $12.5 million (though PitchBook
    says it's $12.75) since its    launch in 2010. Investors in Kaggle include Index Ventures, SV Angel,
    Max Levchin, Naval Ravikant,    Google chief economist Hal Varian, Khosla Ventures and Yuri Milner
    """

    pyake = yake.KeywordExtractor(lan="en", n=3)

    result = pyake.extract_keywords(text_content)

    print(result)

    keywords = [kw[0] for kw in result]

    print(keywords)
    assert "google" in keywords
    assert "kaggle" in keywords
    assert "san francisco" in keywords
    assert "machine learning" in keywords
From LIAAD/yake: yake/cli.py (view on GitHub)

# language, ngram_size, dedup_lim, dedup_func, window_size, top and verbose
# are bound by the CLI's option parsing elsewhere in cli.py.
def run_yake(text_content):
		myake = yake.KeywordExtractor(lan=language, n=ngram_size, dedupLim=dedup_lim, dedupFunc=dedup_func,
									  windowsSize=window_size, top=top)
		results = myake.extract_keywords(text_content)

		table = []
		for kw in results:
			if verbose:
				table.append({"keyword":kw[0], "score":kw[1]})
			else:
				table.append({"keyword":kw[0]})

		print(tabulate(table, headers="keys"))
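The free variables above are supplied by the command-line interface that wraps run_yake. A minimal sketch of such a wrapper, written here with click; the option names and defaults are assumptions for illustration, not yake's actual CLI flags:

import click
import yake
from tabulate import tabulate

@click.command()
@click.option("--text-input", "text_content", required=True)
@click.option("--language", default="en")
@click.option("--ngram-size", "ngram_size", default=3, type=int)
@click.option("--top", default=10, type=int)
def cli(text_content, language, ngram_size, top):
    # Hypothetical wrapper: builds the extractor and prints a table,
    # mirroring run_yake above with a reduced option set.
    extractor = yake.KeywordExtractor(lan=language, n=ngram_size, top=top)
    table = [{"keyword": kw[0], "score": kw[1]}
             for kw in extractor.extract_keywords(text_content)]
    print(tabulate(table, headers="keys"))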
From LIAAD/yake: docker/Dockerfiles/yake-server/yake-rest-api.py (view on GitHub)

The excerpt opens at the tail of the endpoint's Swagger docstring:

              score:
                type: number
    """

    try:
        assert request.json["text"], "Invalid text"
        assert len(request.json["language"]) == 2, "Invalid language code"
        assert int(request.json["max_ngram_size"]), "Invalid max_ngram_size"
        assert int(request.json["number_of_keywords"]), "Invalid number_of_keywords"

        text = request.json["text"]
        language = request.json["language"]
        max_ngram_size = int(request.json["max_ngram_size"])
        number_of_keywords = int(request.json["number_of_keywords"])

        my_yake = yake.KeywordExtractor(lan=language,
                                        n=max_ngram_size,
                                        top=number_of_keywords,
                                        dedupLim=0.8,
                                        windowsSize=2
                                        )

        keywords = my_yake.extract_keywords(text)
        # Note: this handler targets an older yake release in which
        # extract_keywords returned (score, ngram) pairs.
        result = [{"ngram": x[1], "score": x[0]} for x in keywords]

        return jsonify(result), HTTPStatus.OK
    except IOError:
        return jsonify("Language not supported"), HTTPStatus.BAD_REQUEST
    except Exception as e:
        return jsonify(str(e)), HTTPStatus.BAD_REQUEST
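A matching client call would post the four JSON fields the handler validates. The host, port and route below are assumptions; the route decorator is not shown in this excerpt:

import requests

payload = {
    "text": "Google is acquiring data science community Kaggle.",
    "language": "en",          # must be a two-letter code, per the assertion
    "max_ngram_size": 3,
    "number_of_keywords": 10,
}
# URL is illustrative; the actual route is defined elsewhere in the file.
resp = requests.post("http://localhost:5000/yake/", json=payload)
print(resp.status_code, resp.json())  # e.g. [{"ngram": ..., "score": ...}, ...]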
From LIAAD/yake: yake/datarepresentation.py (view on GitHub)

                    # excerpt from the corpus-building loop in DataCore
                    block_of_word_obj = []
                else:
                    tag = self.getTag(word, pos_sent)
                    term_obj = self.getTerm(word)
                    term_obj.addOccur(tag, sentence_id, pos_sent, pos_text)
                    pos_text += 1

                    #Create co-occurrence matrix
                    if tag not in self.tagsToDiscard:
                        word_windows = list(range( max(0, len(block_of_word_obj)-windowsSize), len(block_of_word_obj) ))
                        for w in word_windows:
                            if block_of_word_obj[w][0] not in self.tagsToDiscard: 
                                self.addCooccur(block_of_word_obj[w][2], term_obj)
                    #Generate candidate keyphrase list
                    candidate = [ (tag, word, term_obj) ]
                    cand = composed_word(candidate)
                    self.addOrUpdateComposedWord(cand)
                    word_windows = list(range( max(0, len(block_of_word_obj)-(n-1)), len(block_of_word_obj) ))[::-1]
                    for w in word_windows:
                        candidate.append(block_of_word_obj[w])
                        self.freq_ns[len(candidate)] += 1.
                        cand = composed_word(candidate[::-1])
                        self.addOrUpdateComposedWord(cand)

                    # Add term to the block of words' buffer
                    block_of_word_obj.append( (tag, word, term_obj) )

            if len(block_of_word_obj) > 0:
                sentence_obj_aux.append( block_of_word_obj )

            if len(sentence_obj_aux) > 0:
                self.sentences_obj.append(sentence_obj_aux)

        if len(block_of_word_obj) > 0:
            sentence_obj_aux.append( block_of_word_obj )

        if len(sentence_obj_aux) > 0:
            self.sentences_obj.append(sentence_obj_aux)
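In the loop above, block_of_word_obj buffers the terms of the current block, and each incoming term is paired with at most windowsSize preceding terms. The same sliding-window idea in isolation, with plain strings instead of term objects:

from collections import defaultdict

window_size = 2          # plays the role of windowsSize above
cooccur = defaultdict(int)
buffer = []              # plays the role of block_of_word_obj

for word in "google is acquiring data science community kaggle".split():
    for prev in buffer[max(0, len(buffer) - window_size):]:
        cooccur[(prev, word)] += 1   # count left-neighbour co-occurrences
    buffer.append(word)

print(dict(cooccur))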
From LIAAD/yake: yake/datarepresentation.py (view on GitHub)
def build_candidate(self, candidate_string):
        sentences_str = [w for w in split_contractions(web_tokenizer(candidate_string.lower())) if not (w.startswith("'") and len(w) > 1) and len(w) > 0]
        candidate_terms = []
        for (i, word) in enumerate(sentences_str):
            tag = self.getTag(word, i)
            term_obj = self.getTerm(word, save_non_seen=False)
            if term_obj.tf == 0:
                term_obj = None
            candidate_terms.append( (tag, word, term_obj) )
        if len([cand for cand in candidate_terms if cand[2] is not None]) == 0:
            invalid_virtual_cand = composed_word(None)
            return invalid_virtual_cand
        virtual_cand = composed_word(candidate_terms)
        return virtual_cand
From LIAAD/yake: yake/datarepresentation.py (view on GitHub)

        # excerpt from getTerm; unique_term is the lowercased input word
        simples_sto = unique_term in self.stopword_set
        if unique_term.endswith('s') and len(unique_term) > 3:
            unique_term = unique_term[:-1]

        if unique_term in self.terms:
            return self.terms[unique_term]
                
        # Strip punctuation to build a simplified form of the term
        simples_unique_term = unique_term
        for pontuation in self.exclude:
            simples_unique_term = simples_unique_term.replace(pontuation, '')
        isstopword = simples_sto or unique_term in self.stopword_set or len(simples_unique_term) < 3
        
        term_id = len(self.terms)
        term_obj = single_word(unique_term, term_id, self.G)
        term_obj.stopword = isstopword

        if save_non_seen:
            self.G.add_node(term_id)
            self.terms[unique_term] = term_obj

        return term_obj
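In isolation, the normalization above amounts to the following sketch (string.punctuation stands in for yake's self.exclude set):

import string

term = "networks"
if term.endswith("s") and len(term) > 3:
    term = term[:-1]  # crude plural stripping -> "network"

simplified = "".join(ch for ch in term if ch not in string.punctuation)
# A term whose simplified form is shorter than 3 characters is flagged
# as a stopword, in addition to the stopword-list membership checks.
print(simplified, len(simplified) < 3)  # network False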
From LIAAD/yake: yake/yake.py (view on GitHub)
def seqm(self, cand1, cand2):
        return Levenshtein.ratio(cand1, cand2)
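seqm is one of yake's deduplication functions; it returns a normalized similarity in [0, 1]. A quick illustration using the python-Levenshtein package (an assumption made here so the snippet runs standalone; yake may bind Levenshtein to its own bundled module):

import Levenshtein  # pip install python-Levenshtein

print(Levenshtein.ratio("machine learning", "machine learning model"))  # ~0.84
print(Levenshtein.ratio("google", "kaggle"))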
From LIAAD/yake: yake/yake.py (view on GitHub)
def extract_keywords(self, text):
        text = text.replace('\n\t',' ')
        dc = DataCore(text=text, stopword_set=self.stopword_set, windowsSize=self.windowsSize, n=self.n)
        dc.build_single_terms_features(features=self.features)
        dc.build_mult_terms_features(features=self.features)
        resultSet = []
        todedup = sorted([cc for cc in dc.candidates.values() if cc.isValid()], key=lambda c: c.H)

        if self.dedupLim >= 1.:
            return ([ (cand.H, cand.unique_kw) for cand in todedup])[:self.top]

        for cand in todedup:
            toadd = True
            for (h, candResult) in resultSet:
                dist = self.dedu_function(cand.unique_kw, candResult.unique_kw)
                if dist > self.dedupLim:
                    toadd = False
                    break
            if toadd:
                # reconstructed from context: the listing truncates here
                resultSet.append((cand.H, cand))
        # (the full method goes on to cap resultSet at self.top and
        # return the kept keywords)
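As the early-return branch above shows, a dedupLim of 1.0 or more skips deduplication entirely; lower values drop any candidate whose similarity to an already accepted keyword exceeds the threshold. For example:

import yake

# dedupLim is compared against dedu_function's similarity above:
# 0.9 lets near-duplicates through; 0.3 prunes far more aggressively.
loose = yake.KeywordExtractor(lan="en", n=3, dedupLim=0.9)
strict = yake.KeywordExtractor(lan="en", n=3, dedupLim=0.3)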
yake: keyword-extraction Python package, licensed under LGPL-3.0.