# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_empty_document(self):
    """An empty document must yield an empty summary, whatever the count."""
    summarizer = TextRankSummarizer(Stemmer("english"))
    sentences = summarizer(build_document(), 10)
    self.assertEqual(len(sentences), 0)
def test_three_sentences_but_second_winner(self):
    """The top-ranked sentence is returned, not simply the first one."""
    summarizer = TextRankSummarizer()
    summarizer.stop_words = ["I", "am", "and", "that"]
    document = build_document([
        "I am that 1. sentence",
        "And I am 2. sentence - winning sentence",
        "And I am 3. sentence - winner is my 2nd name",
    ])
    summary = summarizer(document, 1)
    self.assertEqual(len(summary), 1)
    self.assertEqual(to_unicode(summary[0]), "And I am 2. sentence - winning sentence")
def test_single_sentence(self):
    """A one-sentence document comes back whole even for a larger count."""
    summarizer = TextRankSummarizer()
    summarizer.stop_words = ("I", "am",)
    summary = summarizer(build_document(("I am one sentence",)), 10)
    self.assertEqual(len(summary), 1)
def test_sentences_rating(self):
    """rate_sentences scores every sentence; the best-connected one wins."""
    summarizer = TextRankSummarizer()
    summarizer.stop_words = ["I", "am", "and", "that"]
    document = build_document([
        "a c e g",
        "a b c d e f g",
        "b d f",
    ])
    scores = summarizer.rate_sentences(document)
    first, second, third = document.sentences
    self.assertEqual(len(scores), 3)
    self.assertTrue(scores[second] > scores[first])
    self.assertTrue(scores[first] > scores[third])
def build_text_rank(parser, language):
    """Return a TextRank summarizer configured for *language*.

    *parser* is accepted for interface parity with sibling builders but is
    not used here.
    """
    text_rank = TextRankSummarizer(Stemmer(language))
    text_rank.stop_words = get_stop_words(language)
    return text_rank
# NOTE(review): this import sits mid-file in the original; kept in place
# because only part of the file is visible here.
from sumy.utils import get_stop_words

LANGUAGE = "english"
SENTENCES_COUNT = 4

# Parse the sample text once and share the stemmer across all summarizers.
parser = PlaintextParser.from_file("sampleText.txt", Tokenizer(LANGUAGE))
stemmer = Stemmer(LANGUAGE)

print("\n====== Luhn ======")
summarizerLuhn = LuhnSummarizer(stemmer)
summarizerLuhn.stop_words = get_stop_words(LANGUAGE)
for sentenceLuhn in summarizerLuhn(parser.document, SENTENCES_COUNT):
    print(sentenceLuhn, "\n")

print("====== TextRank ======")
summarizerTR = TextRankSummarizer(stemmer)
summarizerTR.stop_words = get_stop_words(LANGUAGE)
for sentenceTR in summarizerTR(parser.document, SENTENCES_COUNT):
    print(sentenceTR, "\n")

print("====== LSA ======")
summarizerLSA = LsaSummarizer(stemmer)
summarizerLSA.stop_words = get_stop_words(LANGUAGE)
for sentenceLSA in summarizerLSA(parser.document, SENTENCES_COUNT):
    print(sentenceLSA, "\n")

print("====== Edmonson ======")
summarizerEd = EdmundsonSummarizer(stemmer)
summarizerEd.bonus_words = ('focus', 'proposed', 'method', 'describes')
# BUG FIX: ('example') is just the string 'example' — a one-element tuple
# needs a trailing comma. Now consistent with bonus_words/null_words.
summarizerEd.stigma_words = ('example',)
summarizerEd.null_words = ('literature', 'however')
for sentenceEd in summarizerEd(parser.document, SENTENCES_COUNT):
    # Loop body was missing in the original snippet; printing each sentence
    # matches the three sibling sections above — confirm against full file.
    print(sentenceEd, "\n")
def summarize(text):
    """Summarize *text* into n_sentences sentences joined by spaces.

    Returns '' when isvalid(text) rejects the input. All-capital input is
    lowered before parsing (an all-caps sentence would otherwise give empty
    output, per the original comment) and the summary is re-uppercased
    afterwards, so the caller sees the original casing style.
    """
    # Guard clause instead of wrapping the whole body in `if isvalid(...)`.
    if not isvalid(text):
        return ''
    all_capital = text.upper() == text
    if all_capital:
        text = text.lower()
    if sys.version_info > (3, 0):
        parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    else:
        # Python 2 path: drop non-ASCII bytes before tokenizing.
        parser = PlaintextParser.from_string(
            text.decode('ascii', errors='ignore'), Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    sentences = [str(s) for s in summarizer(
        parser.document, sentences_count=n_sentences)]
    output_sentences = ' '.join(sentences)
    # The original also reset all_capital to False here — redundant for a
    # local variable that is about to go out of scope; removed.
    return output_sentences.upper() if all_capital else output_sentences
from .summarizers.text_rank import TextRankSummarizer
from .summarizers.lex_rank import LexRankSummarizer
from .summarizers.sum_basic import SumBasicSummarizer
from .summarizers.kl import KLSummarizer
from .nlp.stemmers import Stemmer
# Input-format name -> parser class. Presumably selected from a CLI
# argument handled elsewhere (see handle_arguments) — confirm in full file.
PARSERS = {
    "html": HtmlParser,
    "plaintext": PlaintextParser,
}
# Method name -> summarizer class for every algorithm this tool exposes.
AVAILABLE_METHODS = {
    "luhn": LuhnSummarizer,
    "edmundson": EdmundsonSummarizer,
    "lsa": LsaSummarizer,
    "text-rank": TextRankSummarizer,
    "lex-rank": LexRankSummarizer,
    "sum-basic": SumBasicSummarizer,
    "kl": KLSummarizer,
}
def main(args=None):
    """CLI entry point: parse arguments, build the summarizer, print sentences.

    Each summary sentence is printed as unicode on Python 3 and as bytes on
    Python 2.
    """
    parsed = docopt(to_string(__doc__), args, version=__version__)
    summarizer, parser, items_count = handle_arguments(parsed)
    # Pick the encoder once instead of branching on PY3 inside the loop.
    encode = to_unicode if PY3 else to_bytes
    for sentence in summarizer(parser.document, items_count):
        print(encode(sentence))