import pytest

from sumy.models.dom import Sentence
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.kl import KLSummarizer
# build_document comes from the shared test utilities in sumy's test suite.
from ..utils import build_document

EMPTY_STOP_WORDS = []


def _build_summarizer(stop_words):
    # Helper mirroring the one used across sumy's summarizer tests:
    # a KLSummarizer configured with the given stop words.
    summarizer = KLSummarizer()
    summarizer.stop_words = stop_words
    return summarizer


@pytest.fixture
def summarizer():
    return _build_summarizer(EMPTY_STOP_WORDS)


def test_single_sentence(summarizer):
    s = Sentence("I am one slightly longer sentence.", Tokenizer("english"))
    document = build_document([s])

    returned = summarizer(document, 10)

    assert len(returned) == 1
def test_get_all_content_words_in_doc():
    summarizer = _build_summarizer(EMPTY_STOP_WORDS)
    s0 = Sentence("One two three.", Tokenizer("english"))
    s1 = Sentence("One two three.", Tokenizer("english"))
    document = build_document([s0, s1])

    content_words = summarizer._get_all_content_words_in_doc(document.sentences)
    content_words_freq = {}
    for w in content_words:
        content_words_freq[w] = content_words_freq.get(w, 0) + 1
    content_words_correct = {"one": 2, "two": 2, "three": 2}

    assert content_words_freq == content_words_correct
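
# A minimal sketch of the behaviour the test above exercises (illustrative
# only, not sumy's implementation; assumes Sentence.words yields the tokens):
# every word from every sentence, lowercased, minus stop words, duplicates kept.
def _content_words_sketch(sentences, stop_words=frozenset()):
    return [w.lower() for s in sentences for w in s.words
            if w.lower() not in stop_words]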
def test_compute_ratings():
    summarizer = _build_summarizer(EMPTY_STOP_WORDS)

    s0 = Sentence("Dog cat fish.", Tokenizer("english"))
    s1 = Sentence("Dog cat camel.", Tokenizer("english"))
    s2 = Sentence("Fish frog horse.", Tokenizer("english"))
    document = build_document([s0, s1, s2])

    ratings = summarizer._compute_ratings(document.sentences)
    assert ratings[s0] == 0
    assert ratings[s1] == -2
    assert ratings[s2] == -1

    # Due to the frequency discounting, after finding sentence s0,
    # s2 should come before s1: only one of s2's words gets discounted,
    # compared with two of s1's.
    s0 = Sentence("one two three", Tokenizer("english"))
    s1 = Sentence("one two four", Tokenizer("english"))
    s2 = Sentence("three five six", Tokenizer("english"))
    document = build_document([s0, s1, s2])

    ratings = summarizer._compute_ratings(document.sentences)
    assert ratings[s0] == 0
    assert ratings[s1] == -2
    assert ratings[s2] == -1
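
# How the ratings above arise (a sketch of the scheme's shape, not sumy's
# exact code): sentences are picked greedily to keep the summary's word
# distribution closest, in KL divergence, to the document's; each pick gets
# the next rating 0, -1, -2, ... and its words' frequencies are discounted.
# That is why s2 (one word overlapping s0) beats s1 (two overlapping words)
# for the second pick in both blocks above.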
def test_compute_tf():
    summarizer = _build_summarizer(EMPTY_STOP_WORDS)
    s0 = Sentence("kicking soccer balls.", Tokenizer("english"))
    s1 = Sentence("eating chicken dumplings.", Tokenizer("english"))
    document = build_document([s0, s1])

    freq = summarizer._compute_tf(document.sentences)
    assert freq["kicking"] == 1/6
    assert freq["soccer"] == 1/6
    assert freq["balls"] == 1/6
    assert freq["eating"] == 1/6
    assert freq["chicken"] == 1/6
    assert freq["dumplings"] == 1/6

    document = build_document([s0, s0, s1])
    freq = summarizer._compute_tf(document.sentences)
    assert freq["kicking"] == 2/9
    assert freq["soccer"] == 2/9
    assert freq["balls"] == 2/9
    assert freq["eating"] == 1/9
    assert freq["chicken"] == 1/9
    assert freq["dumplings"] == 1/9
s0 = Sentence("Dog cat fish.", Tokenizer("english"))
s1 = Sentence("Dog cat camel.", Tokenizer("english"))
s2 = Sentence("Fish frog horse.", Tokenizer("english"))
document = build_document([s0, s1, s2])
ratings = summarizer._compute_ratings(document.sentences)
assert ratings[s0] == 0
assert ratings[s1] == -2
assert ratings[s2] == -1
# Due to the frequency discounting, after finding sentence s0,
# s2 should come before s1 since only two of its words get discounted
# rather than all 3 of s1's
s0 = Sentence("one two three", Tokenizer("english"))
s1 = Sentence("one two four", Tokenizer("english"))
s2 = Sentence("three five six", Tokenizer("english"))
document = build_document([s0, s1, s2])
ratings = summarizer._compute_ratings(document.sentences)
assert ratings[s0] == 0
assert ratings[s1] == -2
assert ratings[s2] == -1
def test_tf_idf_metric_should_be_real_number():
    """https://github.com/miso-belica/sumy/issues/41"""
    summarizer = KLSummarizer()
    frequencies = summarizer.compute_tf([Sentence("There are five words, jop.", Tokenizer("english"))])

    assert frequencies == {
        "there": 0.2,
        "are": 0.2,
        "five": 0.2,
        "words": 0.2,
        "jop": 0.2,
    }
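
# For context, the standard end-to-end use of the summarizer these tests
# exercise, via sumy's public API (the input text here is just an example):
def _demo_kl_summary():
    from sumy.parsers.plaintext import PlaintextParser

    parser = PlaintextParser.from_string(
        "Dog cat fish. Dog cat camel. Fish frog horse.", Tokenizer("english")
    )
    # A KLSummarizer instance is callable: (document, sentence_count).
    for sentence in KLSummarizer()(parser.document, 2):
        print(sentence)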
# Excerpt from sumy's Sentence model (sumy/models/dom/_sentence.py): the
# ratings dicts above are keyed by Sentence, so lookups rely on this
# value-based equality (together with the matching __hash__ the class defines).
def __eq__(self, sentence):
    assert isinstance(sentence, Sentence)
    return self._is_heading is sentence._is_heading and self._text == sentence._text
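
# Consequence of the equality above for this module: two independently built
# Sentence objects with the same text compare equal, so they address the same
# entry in the ratings dicts returned by _compute_ratings.
def test_sentence_equality():
    a = Sentence("Dog cat fish.", Tokenizer("english"))
    b = Sentence("Dog cat fish.", Tokenizer("english"))
    assert a == b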