def test_parse_plaintext(self):
    parser = PlaintextParser.from_string("""
        Ako sa máš? Ja dobre! A ty? No
        mohlo to byť aj lepšie!!! Ale pohodička.

        TOTO JE AKOŽE NADPIS
        A toto je text pod ním, ktorý je textový.

        A tak ďalej...
    """, Tokenizer("czech"))
    document = parser.document

    self.assertEqual(len(document.paragraphs), 2)
    self.assertEqual(len(document.paragraphs[0].headings), 0)
    self.assertEqual(len(document.paragraphs[0].sentences), 5)
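The test above leans on sumy's document model: a parsed document exposes paragraphs, and each paragraph exposes its headings and sentences. A minimal sketch of walking that structure (the sample text is my own placeholder, not from the test):

    from sumy.nlp.tokenizers import Tokenizer
    from sumy.parsers.plaintext import PlaintextParser

    parser = PlaintextParser.from_string(
        "First sentence. Second one.\n\nSecond paragraph here.",
        Tokenizer("english"))
    for paragraph in parser.document.paragraphs:
        print(len(paragraph.headings), "headings,",
              len(paragraph.sentences), "sentences")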
inputLine = ''
for message in messages:
    if '\\/' not in message:  # skip messages carrying the "\/" command marker
        inputLine = inputLine + message['message'] + '. '

# blob = TextBlob(inputLine)
# wordCounts = blob.word_counts
# sortedWordCounts = sorted(wordCounts, key=wordCounts.get, reverse=True)
# outputLine = " ".join(sortedWordCounts[:5])
# outputLine = groupName.capitalize() + " summarized as " + outputLine
# self.send_to_whatsapp_id("WACAO!", outputLine)

LANGUAGE = "english"
SENTENCES_COUNT = '20%'

outputLine = groupName.capitalize() + " summarized as: \n"
parser = PlaintextParser.from_string(inputLine, Tokenizer(LANGUAGE))
stemmer = Stemmer(LANGUAGE)
summarizer = LsaSummarizer(stemmer)
summarizer.stop_words = get_stop_words(LANGUAGE)
for sentence in summarizer(parser.document, SENTENCES_COUNT):
    outputLine = outputLine + str(sentence) + "\n"
self.send_to_whatsapp_id("WACAO!", outputLine)
def main(args=None):
    args = docopt(to_string(__doc__), args, version=__version__)
    summarizer, document, items_count, reference_summary = handle_arguments(args)

    evaluated_sentences = summarizer(document, items_count)
    reference_document = PlaintextParser.from_string(reference_summary,
        Tokenizer(args["--language"]))
    reference_sentences = reference_document.document.sentences

    for name, evaluate_document, evaluate in AVAILABLE_EVALUATIONS:
        if evaluate_document:
            result = evaluate(evaluated_sentences, document.sentences)
        else:
            result = evaluate(evaluated_sentences, reference_sentences)
        print("%s: %f" % (name, result))

    return 0
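Each entry in AVAILABLE_EVALUATIONS is read as (name, whether the metric compares against the original document, metric function). If I recall sumy's layout correctly, the co-selection metrics live in sumy.evaluation and can be called directly; a hedged sketch reusing the names from the loop above:

    from sumy.evaluation import precision, recall, f_score  # co-selection metrics

    # evaluated_sentences: what the summarizer picked
    # reference_sentences: sentences of the gold reference summary
    p = precision(evaluated_sentences, reference_sentences)
    r = recall(evaluated_sentences, reference_sentences)
    print("precision: %f, recall: %f, f-score: %f"
          % (p, r, f_score(evaluated_sentences, reference_sentences)))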
def summarize(text):
    if isvalid(text):
        # An all-caps input yields empty output, so lowercase it here
        # and restore the upper case after summarizing.
        all_capital = False
        if text.upper() == text:
            text = text.lower()
            all_capital = True

        if sys.version_info > (3, 0):
            parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
        else:
            parser = PlaintextParser.from_string(text.decode('ascii', errors='ignore'), Tokenizer(LANGUAGE))

        stemmer = Stemmer(LANGUAGE)
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)

        sentences = [str(s) for s in summarizer(
            parser.document, sentences_count=n_sentences)]

        if all_capital:
            output_sentences = ' '.join(sentences).upper()
        else:
            output_sentences = ' '.join(sentences)

        return output_sentences
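A usage sketch, assuming the sumy imports from the earlier snippets are in scope; LANGUAGE, n_sentences, and the isvalid validator are module-level here, and the stand-in validator is my own assumption:

    LANGUAGE = "english"
    n_sentences = 2

    def isvalid(text):
        # Hypothetical stand-in for the project's real validator.
        return bool(text and text.strip())

    print(summarize("THIS IS AN ALL-CAPS NOTE. IT WOULD OTHERWISE SUMMARIZE TO NOTHING. SO IT IS LOWERCASED FIRST."))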
from ._compat import to_string, to_unicode, to_bytes, PY3
from .nlp.tokenizers import Tokenizer
from .parsers.html import HtmlParser
from .parsers.plaintext import PlaintextParser
from .summarizers.luhn import LuhnSummarizer
from .summarizers.edmundson import EdmundsonSummarizer
from .summarizers.lsa import LsaSummarizer
from .summarizers.text_rank import TextRankSummarizer
from .summarizers.lex_rank import LexRankSummarizer
from .summarizers.sum_basic import SumBasicSummarizer
from .summarizers.kl import KLSummarizer
from .nlp.stemmers import Stemmer
PARSERS = {
    "html": HtmlParser,
    "plaintext": PlaintextParser,
}

AVAILABLE_METHODS = {
    "luhn": LuhnSummarizer,
    "edmundson": EdmundsonSummarizer,
    "lsa": LsaSummarizer,
    "text-rank": TextRankSummarizer,
    "lex-rank": LexRankSummarizer,
    "sum-basic": SumBasicSummarizer,
    "kl": KLSummarizer,
}
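With these registries, picking a parser and summarization method from command-line strings is a dictionary lookup. A minimal sketch reusing the names imported above (error handling omitted; the input text is a placeholder):

    method = "lex-rank"
    parser_name = "plaintext"

    parser = PARSERS[parser_name].from_string(
        "Some input text. Another sentence. And one more.",
        Tokenizer("english"))
    summarizer = AVAILABLE_METHODS[method](Stemmer("english"))
    for sentence in summarizer(parser.document, 1):
        print(sentence)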
def main(args=None):
    args = docopt(to_string(__doc__), args, version=__version__)
    summarizer, parser, items_count = handle_arguments(args)
def __init__(self, text, tokenizer):
    super(PlaintextParser, self).__init__(tokenizer)
    self._text = to_unicode(text).strip()
def summary(text):
    stemmer = Stemmer(LANGUAGE)
    parser = PlaintextParser(text, Tokenizer(LANGUAGE))
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    short = ""
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        # each sentence becomes a Markdown-quoted bullet
        short = short + ">" + "* " + str(sentence) + "\n\n"
        # print(sentence)
    return short
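A usage sketch; LANGUAGE and SENTENCES_COUNT are assumed module-level constants, and the sumy imports from the earlier snippets are assumed in scope:

    LANGUAGE = "english"
    SENTENCES_COUNT = 2

    print(summary("Long article text goes here. It has several sentences. Only a few of them survive the cut."))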
def summarize_from_file(self, file_name):
    parser = PlaintextParser.from_file(file_name, Tokenizer(self.LANGUAGE))
    stemmer = Stemmer(self.LANGUAGE)
    summarizer = Summarizer(stemmer)

    # file_1 gets one sentence per line; file_2 gets the summary as a single line
    with open("summarizer_output.txt", "w+") as file_1, \
            open("summarizer_output2.txt", "w+") as file_2:
        for sentence in summarizer(parser.document, self.SENTENCES_COUNT):
            file_2.write(str(sentence))
            file_1.write(str(sentence))
            file_1.write("\n")
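The method expects a host object carrying LANGUAGE and SENTENCES_COUNT. A hedged sketch of a minimal host class; the class name and attribute values are assumptions, not from the original:

    class FileSummarizer:
        LANGUAGE = "english"
        SENTENCES_COUNT = 5

    FileSummarizer.summarize_from_file = summarize_from_file  # attach the function above as a method
    FileSummarizer().summarize_from_file("article.txt")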
def main(url, max_sent, language='english'):
    tokenizer = Tokenizer(language)
    article = alt_extract(url)
    parser = PlaintextParser.from_string(article, tokenizer)
    return click.echo(get_summarizer(parser, max_sent, language))
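Here alt_extract and get_summarizer are project-specific helpers that pull article text before handing it to PlaintextParser. When the page markup is clean enough, sumy can fetch and parse the URL itself via HtmlParser.from_url; a sketch under that assumption (the summarizer choice is mine):

    from sumy.nlp.tokenizers import Tokenizer
    from sumy.parsers.html import HtmlParser
    from sumy.summarizers.lex_rank import LexRankSummarizer

    def summarize_url(url, max_sent, language="english"):
        parser = HtmlParser.from_url(url, Tokenizer(language))
        summarizer = LexRankSummarizer()
        return [str(s) for s in summarizer(parser.document, max_sent)]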
def get_summary(long_text, sentences=SENTENCES_COUNT):
    parser = PlaintextParser.from_string(chinese_normalnize(long_text), Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    return [str(sentence) for sentence in summarizer(parser.document, sentences)]
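chinese_normalnize is a project-specific helper (the spelling is the project's own), and LANGUAGE here would presumably be "chinese", which, as far as I know, recent sumy versions support through an optional tokenizer dependency. A hypothetical stand-in for the normalizer, purely as an assumption about what it does:

    import unicodedata

    def chinese_normalnize(text):
        # Hypothetical stand-in: fold full-width characters to half-width
        # and trim whitespace via Unicode NFKC normalization.
        return unicodedata.normalize("NFKC", text).strip()

    print(get_summary("这是第一句。这是第二句。这是第三句。", sentences=1))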