mindshare in this community, too (though it already has plenty of that thanks to Tensorflow
and other projects). Kaggle has a bit of a history with Google, too, but that's pretty recent.
Earlier this month, Google and Kaggle teamed up to host a $100,000 machine learning competition
around classifying YouTube videos. That competition had some deep integrations with the
Google Cloud Platform, too. Our understanding is that Google will keep the service running -
likely under its current name. While the acquisition is probably more about Kaggle's community
than technology, Kaggle did build some interesting tools for hosting its competition and 'kernels',
too. On Kaggle, kernels are basically the source code for analyzing data sets and developers can
share this code on the platform (the company previously called them 'scripts'). Like similar
competition-centric sites, Kaggle also runs a job board, too. It's unclear what Google will do
with that part of the service. According to Crunchbase, Kaggle raised $12.5 million (though PitchBook
says it's $12.75) since its launch in 2010. Investors in Kaggle include Index Ventures, SV Angel,
Max Levchin, Naval Ravikant, Google chief economist Hal Varian, Khosla Ventures and Yuri Milner
"""
import yake

pyake = yake.KeywordExtractor(lan="en", n=3)
result = pyake.extract_keywords(text_content)
print(result)
keywords = [kw[0] for kw in result]
print(keywords)
assert "google" in keywords
assert "kaggle" in keywords
assert "san francisco" in keywords
assert "machine learning" in keywords
def run_yake(text_content):
    # language, ngram_size, dedup_lim, dedup_func, window_size, top and verbose
    # are options captured from the enclosing CLI command (not shown in this snippet).
    myake = yake.KeywordExtractor(lan=language, n=ngram_size, dedupLim=dedup_lim, dedupFunc=dedup_func,
                                  windowsSize=window_size, top=top)
    results = myake.extract_keywords(text_content)
    table = []
    for kw in results:
        if verbose:
            table.append({"keyword": kw[0], "score": kw[1]})
        else:
            table.append({"keyword": kw[0]})
    print(tabulate(table, headers="keys"))
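The option variables used inside run_yake are not defined in the snippet itself; they appear to come from the click command that wraps this helper in yake's CLI. A minimal sketch of wiring a similar helper into a standalone click command; the flag names and defaults below are illustrative assumptions, not necessarily yake's exact CLI options:

import click
import yake
from tabulate import tabulate

@click.command()
@click.option("--language", default="en")            # illustrative flags, not yake's exact CLI
@click.option("--ngram-size", "ngram_size", default=3, type=int)
@click.option("--top", default=10, type=int)
@click.option("--verbose", is_flag=True)
def keywords(language, ngram_size, top, verbose):
    # Read the text to analyse from stdin and print the keywords as a table.
    text_content = click.get_text_stream("stdin").read()
    kw_extractor = yake.KeywordExtractor(lan=language, n=ngram_size, top=top)
    rows = [{"keyword": kw[0], "score": kw[1]} if verbose else {"keyword": kw[0]}
            for kw in kw_extractor.extract_keywords(text_content)]
    print(tabulate(rows, headers="keys"))

if __name__ == "__main__":
    keywords()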
score:
type: number
"""
try:
    # Basic request validation
    assert request.json["text"], "Invalid text"
    assert len(request.json["language"]) == 2, "Invalid language code"
    assert int(request.json["max_ngram_size"]), "Invalid max_ngram_size"
    assert int(request.json["number_of_keywords"]), "Invalid number_of_keywords"

    text = request.json["text"]
    language = request.json["language"]
    max_ngram_size = int(request.json["max_ngram_size"])
    number_of_keywords = int(request.json["number_of_keywords"])

    my_yake = yake.KeywordExtractor(lan=language,
                                    n=max_ngram_size,
                                    top=number_of_keywords,
                                    dedupLim=0.8,
                                    windowsSize=2)
    keywords = my_yake.extract_keywords(text)
    result = [{"ngram": x[1], "score": x[0]} for x in keywords]
    return jsonify(result), HTTPStatus.OK
except IOError as e:
    return jsonify("Language not supported"), HTTPStatus.BAD_REQUEST
except Exception as e:
    return jsonify(str(e)), HTTPStatus.BAD_REQUEST
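This handler expects a JSON body with text, language, max_ngram_size and number_of_keywords, and returns the extracted ngrams with their scores. A minimal sketch of calling it with the requests library; the host and the /yake/ route are assumptions for illustration, since the snippet does not show how the handler is registered:

import requests

# URL and route are placeholders; adjust to wherever the Flask app is actually mounted.
payload = {
    "text": "Google is acquiring data science community Kaggle.",
    "language": "en",
    "max_ngram_size": 3,
    "number_of_keywords": 10,
}
resp = requests.post("http://localhost:5000/yake/", json=payload)
resp.raise_for_status()
for item in resp.json():
    print(item["ngram"], item["score"])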
        # Walk each tokenized sentence and each word within it
        for (sentence_id, sentence) in enumerate(self.sentences_str):
            sentence_obj_aux = []
            block_of_word_obj = []
            for (pos_sent, word) in enumerate(sentence):
                if len([c for c in word if c in self.exclude]) == len(word):
                    # Punctuation-only token: close the current block of words
                    if len(block_of_word_obj) > 0:
                        sentence_obj_aux.append(block_of_word_obj)
                    block_of_word_obj = []
                else:
                    tag = self.getTag(word, pos_sent)
                    term_obj = self.getTerm(word)
                    term_obj.addOccur(tag, sentence_id, pos_sent, pos_text)
                    pos_text += 1
                    # Create co-occurrence matrix
                    if tag not in self.tagsToDiscard:
                        word_windows = list(range(max(0, len(block_of_word_obj) - windowsSize), len(block_of_word_obj)))
                        for w in word_windows:
                            if block_of_word_obj[w][0] not in self.tagsToDiscard:
                                self.addCooccur(block_of_word_obj[w][2], term_obj)
                    # Generate candidate keyphrase list
                    candidate = [(tag, word, term_obj)]
                    cand = composed_word(candidate)
                    self.addOrUpdateComposedWord(cand)
                    word_windows = list(range(max(0, len(block_of_word_obj) - (n - 1)), len(block_of_word_obj)))[::-1]
                    for w in word_windows:
                        candidate.append(block_of_word_obj[w])
                        self.freq_ns[len(candidate)] += 1.
                        cand = composed_word(candidate[::-1])
                        self.addOrUpdateComposedWord(cand)
                    # Add term to the block of words' buffer
                    block_of_word_obj.append((tag, word, term_obj))
            if len(block_of_word_obj) > 0:
                sentence_obj_aux.append(block_of_word_obj)
            if len(sentence_obj_aux) > 0:
                self.sentences_obj.append(sentence_obj_aux)
        # Flush any block still open after the last sentence
        if len(block_of_word_obj) > 0:
            sentence_obj_aux.append(block_of_word_obj)
        if len(sentence_obj_aux) > 0:
            self.sentences_obj.append(sentence_obj_aux)
    def build_candidate(self, candidate_string):
        sentences_str = [w for w in split_contractions(web_tokenizer(candidate_string.lower()))
                         if not (w.startswith("'") and len(w) > 1) and len(w) > 0]
        candidate_terms = []
        for (i, word) in enumerate(sentences_str):
            tag = self.getTag(word, i)
            term_obj = self.getTerm(word, save_non_seen=False)
            if term_obj.tf == 0:
                term_obj = None
            candidate_terms.append((tag, word, term_obj))
        if len([cand for cand in candidate_terms if cand[2] is not None]) == 0:
            invalid_virtual_cand = composed_word(None)
            return invalid_virtual_cand
        virtual_cand = composed_word(candidate_terms)
        return virtual_cand
    def getTerm(self, str_word, save_non_seen=True):
        unique_term = str_word.lower()
        simples_sto = unique_term in self.stopword_set
        # Naive singularisation: drop a trailing 's' from longer terms
        if unique_term.endswith('s') and len(unique_term) > 3:
            unique_term = unique_term[:-1]

        if unique_term in self.terms:
            return self.terms[unique_term]

        # Strip punctuation before the stopword / minimum-length check
        simples_unique_term = unique_term
        for pontuation in self.exclude:
            simples_unique_term = simples_unique_term.replace(pontuation, '')
        isstopword = simples_sto or unique_term in self.stopword_set or len(simples_unique_term) < 3

        term_id = len(self.terms)
        term_obj = single_word(unique_term, term_id, self.G)
        term_obj.stopword = isstopword
        if save_non_seen:
            self.G.add_node(term_id)
            self.terms[unique_term] = term_obj
        return term_obj
    def seqm(self, cand1, cand2):
        # Similarity ratio (python-Levenshtein) used for deduplication
        return Levenshtein.ratio(cand1, cand2)

    def extract_keywords(self, text):
        text = text.replace('\n\t', ' ')
        dc = DataCore(text=text, stopword_set=self.stopword_set, windowsSize=self.windowsSize, n=self.n)
        dc.build_single_terms_features(features=self.features)
        dc.build_mult_terms_features(features=self.features)
        resultSet = []
        # Rank valid candidates by their score H (lower is better)
        todedup = sorted([cc for cc in dc.candidates.values() if cc.isValid()], key=lambda c: c.H)

        if self.dedupLim >= 1.:
            # Deduplication disabled: return the top candidates directly
            return ([(cand.H, cand.unique_kw) for cand in todedup])[:self.top]

        for cand in todedup:
            toadd = True
            for (h, candResult) in resultSet:
                dist = self.dedu_function(cand.unique_kw, candResult.unique_kw)
                if dist > self.dedupLim:
                    toadd = False
                    break
            if toadd: