# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals
from ._summarizer import AbstractSummarizer
class SumBasicSummarizer(AbstractSummarizer):
"""
SumBasic: a frequency-based summarization system that adjusts word frequencies as
sentences are extracted.
Source: http://www.cis.upenn.edu/~nenkova/papers/ipm.pdf
"""
_stop_words = frozenset()
@property
def stop_words(self):
return self._stop_words
@stop_words.setter
def stop_words(self, words):
self._stop_words = frozenset(map(self.normalize_word, words))
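# Illustrative usage sketch, not part of the original module: how SumBasicSummarizer
# is typically driven through sumy's plaintext pipeline. The parser, tokenizer,
# stemmer and stop-word helpers referenced below are assumed to be the standard
# sumy ones; adjust the language and sentence count as needed.
def _example_sum_basic_usage(text, sentences_count=3):
    from sumy.parsers.plaintext import PlaintextParser
    from sumy.nlp.tokenizers import Tokenizer
    from sumy.nlp.stemmers import Stemmer
    from sumy.utils import get_stop_words

    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    summarizer = SumBasicSummarizer(Stemmer("english"))
    summarizer.stop_words = get_stop_words("english")
    # SumBasic lowers the probability of already-used words after each pick,
    # which is what keeps the extract from repeating itself
    return summarizer(parser.document, sentences_count)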
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals
from collections import defaultdict
from ..nlp.stemmers import null_stemmer
from ._summarizer import AbstractSummarizer
from .edmundson_cue import EdmundsonCueMethod
from .edmundson_key import EdmundsonKeyMethod
from .edmundson_title import EdmundsonTitleMethod
from .edmundson_location import EdmundsonLocationMethod
_EMPTY_SET = frozenset()
class EdmundsonSummarizer(AbstractSummarizer):
_bonus_words = _EMPTY_SET
_stigma_words = _EMPTY_SET
_null_words = _EMPTY_SET
def __init__(self, stemmer=null_stemmer, cue_weight=1.0, key_weight=0.0,
title_weight=1.0, location_weight=1.0):
super(EdmundsonSummarizer, self).__init__(stemmer)
self._ensure_correct_weights(cue_weight, key_weight, title_weight,
location_weight)
self._cue_weight = float(cue_weight)
self._key_weight = float(key_weight)
self._title_weight = float(title_weight)
self._location_weight = float(location_weight)
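# Illustrative configuration sketch, not part of the original module: the four
# constructor weights blend the cue, key, title and location sub-methods into a
# single sentence rating. The word lists below are placeholders, and the
# bonus_words / stigma_words / null_words setters are assumed to be provided by
# the full class (they are not shown in this excerpt).
def _example_edmundson_usage(document, sentences_count=2):
    summarizer = EdmundsonSummarizer(cue_weight=1.0, key_weight=1.0,
                                     title_weight=1.0, location_weight=1.0)
    summarizer.bonus_words = ("significant", "important", "crucial")   # raise a sentence's cue score
    summarizer.stigma_words = ("hardly", "impossible", "unclear")      # lower a sentence's cue score
    summarizer.null_words = ("the", "a", "an", "and", "or")            # treated as insignificant
    return summarizer(document, sentences_count)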
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals
import math
from ._summarizer import AbstractSummarizer
class KLSummarizer(AbstractSummarizer):
"""
Method that greedily adds sentences to a summary so long as it decreases the
KL Divergence.
Source: http://www.aclweb.org/anthology/N09-1041
"""
stop_words = frozenset()
def __call__(self, document, sentences_count):
ratings = self._get_ratings(document)
return self._get_best_sentences(document.sentences, sentences_count, ratings)
    def _get_ratings(self, document):
        sentences = document.sentences
        ratings = self._compute_ratings(sentences)
        return ratings
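# Illustrative sketch, not part of the original module, of the quantity the
# summarizer greedily minimises: the KL divergence between the document's word
# distribution and a candidate summary's word distribution. The helper name and
# the skip-missing-words behaviour are simplifications, not the class's own
# implementation.
def _sketch_kl_divergence(summary_freq, doc_freq):
    divergence = 0.0
    for word, p in doc_freq.items():
        q = summary_freq.get(word)
        if q:
            # contribution of one word: p(w) * log(p(w) / q(w))
            divergence += p * math.log(p / q)
    return divergence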
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals
import random
from ._summarizer import AbstractSummarizer
class RandomSummarizer(AbstractSummarizer):
"""Summarizer that picks sentences randomly."""
def __call__(self, document, sentences_count):
sentences = document.sentences
ratings = self._get_random_ratings(sentences)
return self._get_best_sentences(sentences, sentences_count, ratings)
def _get_random_ratings(self, sentences):
ratings = list(range(len(sentences)))
random.shuffle(ratings)
return dict((s, r) for s, r in zip(sentences, ratings))
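# Illustrative note, not part of the original module: the ratings above are just
# a random permutation of 0..N-1 keyed by sentence, so the base class ends up
# returning a random selection of sentences. A quick demonstration with plain
# strings standing in for sentence objects:
def _demo_random_ratings():
    sentences = ("first", "second", "third", "fourth")
    ratings = list(range(len(sentences)))
    random.shuffle(ratings)
    return dict(zip(sentences, ratings))   # e.g. {"first": 2, "second": 0, ...}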
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals
from .._compat import Counter
from ._summarizer import AbstractSummarizer
class EdmundsonKeyMethod(AbstractSummarizer):
def __init__(self, stemmer, bonus_words):
super(EdmundsonKeyMethod, self).__init__(stemmer)
self._bonus_words = bonus_words
def __call__(self, document, sentences_count, weight):
significant_words = self._compute_significant_words(document, weight)
return self._get_best_sentences(document.sentences,
sentences_count, self._rate_sentence, significant_words)
def _compute_significant_words(self, document, weight):
# keep only stems contained in bonus words
words = map(self.stem_word, document.words)
words = filter(self._is_bonus_word, words)
# compute frequencies of bonus words in document
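# Illustrative sketch, not part of the original module, of the step the comment
# above introduces: count how often each bonus-word stem occurs and keep the
# most frequent ones as "significant". The cut-off rule here is a deliberate
# simplification of the full method, not a copy of it.
def _sketch_significant_words(stemmed_bonus_words, weight):
    word_counts = Counter(stemmed_bonus_words)
    if not word_counts:
        return ()
    # keep a weight-proportional share of the distinct bonus words
    keep = max(1, int(weight * len(word_counts)))
    return tuple(word for word, _ in word_counts.most_common(keep))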
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals
from operator import attrgetter
from itertools import chain
from .._compat import ffilter
from ._summarizer import AbstractSummarizer
class EdmundsonTitleMethod(AbstractSummarizer):
def __init__(self, stemmer, null_words):
super(EdmundsonTitleMethod, self).__init__(stemmer)
self._null_words = null_words
def __call__(self, document, sentences_count):
sentences = document.sentences
significant_words = self._compute_significant_words(document)
return self._get_best_sentences(sentences, sentences_count,
self._rate_sentence, significant_words)
def _compute_significant_words(self, document):
heading_words = map(attrgetter("words"), document.headings)
significant_words = chain(*heading_words)
significant_words = map(self.stem_word, significant_words)
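# Illustrative sketch, not part of the original module, of where this method is
# heading: heading words are stemmed, null words are dropped, and a sentence is
# then rated by how many of the remaining title words it contains. This is a
# simplified stand-in for the class's own _rate_sentence, not a copy of it.
def _sketch_title_rating(sentence_words, heading_words, null_words):
    significant = frozenset(heading_words) - frozenset(null_words)
    return sum(1 for word in sentence_words if word in significant)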
# -*- coding: utf-8 -*-
from warnings import warn
try:
import numpy
except ImportError:
numpy = None
try:
from numpy.linalg import svd as singular_value_decomposition
except ImportError:
singular_value_decomposition = None
from ._summarizer import AbstractSummarizer
class LsaSummarizer(AbstractSummarizer):
MIN_DIMENSIONS = 3
REDUCTION_RATIO = 1/1
_stop_words = frozenset()
@property
def stop_words(self):
return self._stop_words
@stop_words.setter
def stop_words(self, words):
self._stop_words = frozenset(map(self.normalize_word, words))
def __call__(self, document, sentences_count):
self._ensure_dependecies_installed()
dictionary = self._create_dictionary(document)
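# Illustrative sketch, not part of the original module, of the core LSA idea:
# decompose a term-by-sentence matrix with SVD and score each sentence (each
# column) from the right singular vectors weighted by the singular values.
# The real class builds its matrix from the dictionary created above and applies
# MIN_DIMENSIONS / REDUCTION_RATIO; this helper assumes numpy is installed and a
# ready-made matrix is passed in.
def _sketch_lsa_sentence_ranks(term_sentence_matrix):
    u, sigma, v = singular_value_decomposition(term_sentence_matrix, full_matrices=False)
    squared_sigma = tuple(s ** 2 for s in sigma)
    return tuple(
        sum(s * value ** 2 for s, value in zip(squared_sigma, column)) ** 0.5
        for column in v.T
    )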
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals
from ..models import TfDocumentModel
from ._summarizer import AbstractSummarizer
class LuhnSummarizer(AbstractSummarizer):
max_gap_size = 4
# TODO: better recognition of significant words (automatic)
significant_percentage = 1
_stop_words = frozenset()
@property
def stop_words(self):
return self._stop_words
@stop_words.setter
def stop_words(self, words):
self._stop_words = frozenset(map(self.normalize_word, words))
def __call__(self, document, sentences_count):
words = self._get_significant_words(document.words)
        return self._get_best_sentences(document.sentences,
            sentences_count, self.rate_sentence, words)
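# Illustrative sketch, not part of the original module, of Luhn's scoring rule:
# inside a sentence, look at runs of words that start and end with a significant
# word and never contain more than max_gap_size insignificant words in a row,
# and rate the sentence by the best run's (significant count)^2 / (run length).
# The helper takes True/False significance flags and is a simplified stand-in
# for the class's own chunk rating.
def _sketch_luhn_rating(significance_flags, max_gap_size=4):
    best = 0.0
    run = []   # flags of the current run; it always starts with a significant word
    gap = 0    # insignificant words seen since the last significant one
    for flag in significance_flags:
        if flag:
            run.extend([False] * gap)   # the gap words stay inside the run
            run.append(True)
            gap = 0
        elif run:
            gap += 1
            if gap > max_gap_size:      # the run is broken; score it and start over
                best = max(best, run.count(True) ** 2 / len(run))
                run, gap = [], 0
    if run:
        best = max(best, run.count(True) ** 2 / len(run))
    return best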
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals
from ._summarizer import AbstractSummarizer
class EdmundsonCueMethod(AbstractSummarizer):
def __init__(self, stemmer, bonus_words, stigma_words):
super(EdmundsonCueMethod, self).__init__(stemmer)
self._bonus_words = bonus_words
self._stigma_words = stigma_words
    def __call__(self, document, sentences_count, bonus_word_weight, stigma_word_weight):
        return self._get_best_sentences(document.sentences,
            sentences_count, self._rate_sentence, bonus_word_weight,
            stigma_word_weight)
    def _rate_sentence(self, sentence, bonus_word_weight, stigma_word_weight):
        # count the number of bonus/stigma words in the sentence
words = map(self.stem_word, sentence.words)
bonus_words_count, stigma_words_count = self._count_words(words)
# compute positive & negative rating
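# Illustrative sketch, not part of the original module, of the rating the
# comments above describe: reward each bonus (cue) word, penalise each stigma
# word, and scale both counts by the caller-supplied weights. Simplified
# stand-in for the class's own counting and rating.
def _sketch_cue_rating(stemmed_words, bonus_words, stigma_words,
                       bonus_word_weight=1.0, stigma_word_weight=1.0):
    bonus_count = sum(1 for word in stemmed_words if word in bonus_words)
    stigma_count = sum(1 for word in stemmed_words if word in stigma_words)
    return bonus_word_weight * bonus_count - stigma_word_weight * stigma_count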
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals
import math
try:
import numpy
except ImportError:
numpy = None
from ._summarizer import AbstractSummarizer
class TextRankSummarizer(AbstractSummarizer):
"""An implementation of TextRank algorithm for summarization.
Source: https://web.eecs.umich.edu/~mihalcea/papers/mihalcea.emnlp04.pdf
"""
epsilon = 1e-4
damping = 0.85
# small number to prevent zero-division error, see https://github.com/miso-belica/sumy/issues/112
_delta = 1e-7
_stop_words = frozenset()
@property
def stop_words(self):
return self._stop_words
@stop_words.setter
    def stop_words(self, words):
        self._stop_words = frozenset(map(self.normalize_word, words))
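# Illustrative sketch, not part of the original module, of the damped power
# iteration behind TextRank's sentence ranking: start from a uniform vector and
# repeat p <- (1 - damping) / N + damping * M^T p until the update is smaller
# than epsilon. `matrix` is assumed to be a row-normalised sentence-similarity
# matrix and numpy is assumed to be installed; the class itself may fold the
# damping factor into the matrix instead.
def _sketch_power_iteration(matrix, epsilon=1e-4, damping=0.85):
    sentences_count = len(matrix)
    p_vector = numpy.full(sentences_count, 1.0 / sentences_count)
    transposed = matrix.T
    change = epsilon + 1.0
    while change > epsilon:
        next_p = (1.0 - damping) / sentences_count + damping * transposed.dot(p_vector)
        change = numpy.abs(next_p - p_vector).sum()
        p_vector = next_p
    return p_vector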