from sumy._compat import to_unicode
from sumy.summarizers.luhn import LuhnSummarizer
from ..utils import build_document  # helper from sumy's test suite (tests/utils.py)


def test_empty_document():
    document = build_document()
    summarizer = LuhnSummarizer()

    returned = summarizer(document, 10)
    assert len(returned) == 0
def test_three_sentences():
    document = build_document((
        "wa s s s wa s s s wa",
        "wb s wb s wb s s s s s s s s s wb",
        "wc s s wc s s wc",
    ))
    summarizer = LuhnSummarizer()
    summarizer.stop_words = ("s",)

    returned = summarizer(document, 1)
    assert list(map(to_unicode, returned)) == [
        "wb s wb s wb s s s s s s s s s wb",
    ]

    returned = summarizer(document, 2)
    assert list(map(to_unicode, returned)) == [
        "wb s wb s wb s s s s s s s s s wb",
        "wc s s wc s s wc",
    ]

    returned = summarizer(document, 3)
    assert list(map(to_unicode, returned)) == [
        "wa s s s wa s s s wa",
        "wb s wb s wb s s s s s s s s s wb",
        "wc s s wc s s wc",
    ]
def test_two_sentences():
    # Czech test sentences: "I am the 1st sentence" / "And I'm the 2nd winning prize"
    document = build_document(("Já jsem 1. věta", "A já ta 2. vítězná výhra"))
    summarizer = LuhnSummarizer()
    summarizer.stop_words = ("já", "jsem", "a", "ta",)

    returned = summarizer(document, 10)
    assert list(map(to_unicode, returned)) == [
        "Já jsem 1. věta",
        "A já ta 2. vítězná výhra",
    ]
def test_various_words_with_significant_percentage():
    document = build_document((
        "1 a",
        "2 b b",
        "3 c c c",
        "4 d d d",
        "5 z z z z",
        "6 e e e e e",
    ))
    summarizer = LuhnSummarizer()
    summarizer.stop_words = ("1", "2", "3", "4", "5", "6")

    returned = summarizer(document, 1)
    assert list(map(to_unicode, returned)) == [
        "6 e e e e e",
    ]

    returned = summarizer(document, 2)
    assert list(map(to_unicode, returned)) == [
        "5 z z z z",
        "6 e e e e e",
    ]

    returned = summarizer(document, 3)
    assert list(map(to_unicode, returned)) == [
        "3 c c c",
        "5 z z z z",
        "6 e e e e e",
    ]
def test_two_sentences_but_one_winner():
    document = build_document((
        "Já jsem 1. vítězná ta věta",  # Czech: "I am the 1st winning sentence"
        "A já ta 2. vítězná věta"      # Czech: "And I'm the 2nd winning sentence"
    ))
    summarizer = LuhnSummarizer()
    summarizer.stop_words = ("já", "jsem", "a", "ta",)

    returned = summarizer(document, 1)
    assert list(map(to_unicode, returned)) == [
        "A já ta 2. vítězná věta",
    ]
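The five tests above come from sumy's own test suite and lean on its build_document test helper. Outside the test suite, a minimal stand-alone equivalent can be built with sumy's public API alone; the sample text and stop words below are made up for illustration:

from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.luhn import LuhnSummarizer

# Parse two throwaway sentences into a document, then keep the single
# highest-scoring one under Luhn's significant-word heuristic.
parser = PlaintextParser.from_string(
    "Ranking sentences by word frequency is simple. Simple frequency ranking works.",
    Tokenizer("english"),
)
summarizer = LuhnSummarizer()
summarizer.stop_words = ("is", "by")  # illustrative stop words
for sentence in summarizer(parser.document, 1):
    print(sentence)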
from sumy.parsers.plaintext import PlaintextParser  # needed by the parser line below
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.edmundson import EdmundsonSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

LANGUAGE = "english"
SENTENCES_COUNT = 4

parser = PlaintextParser.from_file("sampleText.txt", Tokenizer(LANGUAGE))
stemmer = Stemmer(LANGUAGE)

print("\n====== Luhn ======")
summarizerLuhn = LuhnSummarizer(stemmer)
summarizerLuhn.stop_words = get_stop_words(LANGUAGE)
for sentenceLuhn in summarizerLuhn(parser.document, SENTENCES_COUNT):
    print(sentenceLuhn, "\n")

print("====== TextRank ======")
summarizerTR = TextRankSummarizer(stemmer)
summarizerTR.stop_words = get_stop_words(LANGUAGE)
for sentenceTR in summarizerTR(parser.document, SENTENCES_COUNT):
    print(sentenceTR, "\n")

print("====== LSA ======")
summarizerLSA = LsaSummarizer(stemmer)
summarizerLSA.stop_words = get_stop_words(LANGUAGE)
for sentenceLSA in summarizerLSA(parser.document, SENTENCES_COUNT):
    print(sentenceLSA, "\n")
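EdmundsonSummarizer is imported in this script but never used. Unlike the other three summarizers, it needs its cue-word lists set before it will summarize; a sketch of a fourth section, with made-up bonus and stigma words, could look like:

print("====== Edmundson ======")
summarizerEd = EdmundsonSummarizer(stemmer)
summarizerEd.bonus_words = ("significant", "important")  # illustrative cue words
summarizerEd.stigma_words = ("hardly", "perhaps")        # illustrative stigma words
summarizerEd.null_words = get_stop_words(LANGUAGE)
for sentenceEd in summarizerEd(parser.document, SENTENCES_COUNT):
    print(sentenceEd, "\n")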
# Excerpt from sumy's command-line module (__main__.py).
from docopt import docopt

from .parsers.html import HtmlParser
from .parsers.plaintext import PlaintextParser
from .summarizers.luhn import LuhnSummarizer
from .summarizers.edmundson import EdmundsonSummarizer
from .summarizers.lsa import LsaSummarizer
from .summarizers.text_rank import TextRankSummarizer
from .summarizers.lex_rank import LexRankSummarizer
from .summarizers.sum_basic import SumBasicSummarizer
from .summarizers.kl import KLSummarizer
from .nlp.stemmers import Stemmer
from ._compat import to_string, to_unicode, to_bytes, PY3
from . import __version__

PARSERS = {
    "html": HtmlParser,
    "plaintext": PlaintextParser,
}

AVAILABLE_METHODS = {
    "luhn": LuhnSummarizer,
    "edmundson": EdmundsonSummarizer,
    "lsa": LsaSummarizer,
    "text-rank": TextRankSummarizer,
    "lex-rank": LexRankSummarizer,
    "sum-basic": SumBasicSummarizer,
    "kl": KLSummarizer,
}
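With the registries above, choosing a backend is a dictionary lookup. A minimal sketch of such a lookup follows; build_summarizer is a hypothetical helper, not part of sumy, and EdmundsonSummarizer would additionally need its bonus/stigma/null word lists set:

from sumy.utils import get_stop_words  # absolute import, for use outside the package

def build_summarizer(method, language="english"):
    # Hypothetical helper: instantiate a summarizer by its registry name.
    summarizer = AVAILABLE_METHODS[method](Stemmer(language))
    summarizer.stop_words = get_stop_words(language)
    return summarizer

summarizer = build_summarizer("text-rank")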
def main(args=None):
    args = docopt(to_string(__doc__), args, version=__version__)
    summarizer, parser, items_count = handle_arguments(args)  # defined elsewhere in this module

    for sentence in summarizer(parser.document, items_count):
        if PY3:
            print(to_unicode(sentence))
        else:
            print(to_bytes(sentence))
# Excerpt from inside a Google-search scraping loop; the enclosing try block, the `if`
# that pairs with the `else:` below, and the imports (nltk stopwords, sumy, time)
# appear earlier in the original file.
        num_page += 1
        linkno = 0
    else:
        LANGUAGE = "english"
        SENTENCES_COUNT = 10
        parser = HtmlParser.from_url(searchlink, Tokenizer(LANGUAGE))
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Summarisation using the Luhn summarizer
        stopwords1 = set(stopwords.words('english'))
        datastring = ''
        summarizer = LuhnSummarizer()
        summarizer.stop_words = stopwords1
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            # print(sentence)
            datastring += str(sentence)
        return datastring
except:
    linkno += 1
    if linkno > 9:
        # if the number of links on one page has been exceeded, go to the next page of Google results
        num_page += 1
        linkno = 0
    time.sleep(1)  # sleep for 1 second so that Google doesn't throw a 503 error
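The fragment above buries the summarisation inside scraping control flow. The same fetch-and-summarise step as a self-contained function might look like the sketch below; summarize_url is a hypothetical name, and the bare except from the fragment is deliberately not reproduced:

from sumy.parsers.html import HtmlParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.luhn import LuhnSummarizer
from nltk.corpus import stopwords

def summarize_url(url, sentences_count=10):
    # Fetch a page, score its sentences with Luhn, and join the summary into one string.
    parser = HtmlParser.from_url(url, Tokenizer("english"))
    summarizer = LuhnSummarizer()
    summarizer.stop_words = set(stopwords.words("english"))
    return "".join(str(s) for s in summarizer(parser.document, sentences_count))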
def build_luhn(parser, language):
    summarizer = LuhnSummarizer(Stemmer(language))
    summarizer.stop_words = get_stop_words(language)
    return summarizer
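build_luhn accepts a parser argument it never touches, presumably so that all build_* factories in the source file share one signature. Example use, with an assumed input file:

from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.utils import get_stop_words

parser = PlaintextParser.from_file("article.txt", Tokenizer("english"))  # file name assumed
summarizer = build_luhn(parser, "english")
for sentence in summarizer(parser.document, 3):
    print(sentence)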