# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_real_example():
    """Source: http://www.prevko.cz/dite/skutecne-pribehy-deti"""
    document = PlaintextParser.from_string(
        load_resource("snippets/prevko.txt"),
        Tokenizer("czech"),
    ).document

    summarizer = LsaSummarizer(Stemmer("czech"))
    summarizer.stop_words = get_stop_words("czech")

    # Ask for a two-sentence summary and check we get exactly that many.
    summary = summarizer(document, 2)
    assert len(summary) == 2
def test_parse_plaintext_long(self):
    """A longer multi-paragraph Slovak text should parse without errors."""
    text = """
Ako sa máš? Ja dobre! A ty? No
mohlo to byť aj lepšie!!! Ale pohodička.
TOTO JE AKOŽE NADPIS
A toto je text pod ním, ktorý je textový.
A tak ďalej...
VEĽKOLEPÉ PREKVAPENIE
Tretí odstavec v tomto texte je úplne o ničom. Ale má
vety a to je hlavné. Takže sa majte na pozore ;-)
A tak ďalej...
A tak este dalej!
"""
    parser = PlaintextParser.from_string(text, Tokenizer("czech"))
def test_article_example():
    """Source: http://www.prevko.cz/dite/skutecne-pribehy-deti"""
    article = load_resource("articles/prevko_cz_1.txt")
    parser = PlaintextParser.from_string(article, Tokenizer("czech"))

    summarizer = LexRankSummarizer(stem_word)
    summarizer.stop_words = get_stop_words("czech")

    # A 20-sentence summary of the full article is requested and expected.
    summary = summarizer(parser.document, 20)
    assert len(summary) == 20
# NOTE(review): fragment of a larger chat-bot method — `self`, `messages` and
# `groupName` are defined outside this view.  Joins message bodies into one
# text blob, summarizes it with sumy's LSA summarizer, and sends the summary
# to a WhatsApp group.  Uses `unicode(...)`, so this is Python 2 code.
inputLine = ''
for message in messages:
    # Skip messages containing an escaped slash — presumably link/media
    # placeholders.  As written this tests membership in `message` itself
    # (dict keys if `message` is a dict); TODO confirm intended semantics.
    if '\\/' not in message:
        inputLine = inputLine + message['message'] + '. '
# Earlier word-frequency (TextBlob) approach, kept commented for reference:
# blob = TextBlob(inputLine)
# wordCounts = blob.word_counts
# sortedWordCounts = sorted(wordCounts, key=wordCounts.get, reverse=True)
# outputLine = " ".join(sortedWordCounts[:5])
# outputLine = groupName.capitalize() + " summarized as " + outputLine
# self.send_to_whatsapp_id("WACAO!",outputLine)
LANGUAGE = "english"
# Percentage string — presumably handled by sumy's ItemsCount; verify.
SENTENCES_COUNT = '20%'
outputLine = groupName.capitalize() + " summarized as: \n"
parser = PlaintextParser.from_string(inputLine, Tokenizer(LANGUAGE))
stemmer = Stemmer(LANGUAGE)
summarizer = LsaSummarizer(stemmer)
summarizer.stop_words = get_stop_words(LANGUAGE)
for sentence in summarizer(parser.document, SENTENCES_COUNT):
    # Re-decode each sentence as UTF-8 text before appending (Python 2).
    outputLine = outputLine + unicode(str(sentence), "utf-8") + "\n"
self.send_to_whatsapp_id("WACAO!",outputLine)
# print "sum_basic:"
def get_summary_lex_rank(self, num_sentence):
    """Summarize this document with LexRank into `num_sentence` sentences.

    Tries to parse `self.url` as HTML first; on any failure falls back to
    parsing `self.body` as plain text.

    Args:
        num_sentence: number of sentences to keep in the summary.

    Returns:
        str: the selected sentences concatenated with no separator
        (matches the original output format).
    """
    from sumy.parsers.plaintext import PlaintextParser  # other parsers available for HTML etc.
    from sumy.nlp.tokenizers import Tokenizer
    from sumy.summarizers.lex_rank import LexRankSummarizer  # We're choosing Lexrank, other algorithms are also built in

    try:
        parser = HtmlParser.from_url(self.url, Tokenizer("english"))
    except Exception:
        # Bug fix: the original bare `except:` also swallowed SystemExit and
        # KeyboardInterrupt; catch only ordinary exceptions.  The original
        # inner `try/except ... raise(e)` was a no-op wrapper and is removed:
        # a parse failure of the fallback still propagates to the caller.
        parser = PlaintextParser.from_string(self.body, Tokenizer("english"))

    summarizer = LexRankSummarizer()
    summary = summarizer(parser.document, num_sentence)
    # Join at C speed instead of quadratic `out += str(sentence)`.
    return "".join(str(sentence) for sentence in summary)
def summarize(srt_file, n_sentences, language="english"):
    """Generate a segmented summary of a subtitle file.

    Args:
        srt_file: parsed SRT subtitles (indexable; converted to plain text
            via `srt_to_txt`, which embeds each item's index as "(NNN)").
        n_sentences (int): number of sentences to keep.
        language (str): language of the subtitles (defaults to English).

    Returns:
        list: time ranges (via `srt_segment_to_range`) of the subtitle
        items whose sentences were selected by the LSA summarizer.

    Raises:
        IndexError: if a summarized sentence carries no "(NNN)" marker.
    """
    parser = PlaintextParser.from_string(srt_to_txt(srt_file), Tokenizer(language))
    stemmer = Stemmer(language)
    summarizer = LsaSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(language)

    # Bug fix: use a raw string for the regex — "\(" in a plain string is an
    # invalid escape sequence (DeprecationWarning today, a SyntaxError in
    # future CPython).  Compile once, outside the loop.
    index_re = re.compile(r"\(([0-9]+)\)")

    segment = []
    for sentence in summarizer(parser.document, n_sentences):
        # The first "(NNN)" marker identifies the source subtitle item.
        index = int(index_re.findall(str(sentence))[0])
        segment.append(srt_segment_to_range(srt_file[index]))
    return segment