How to use the sumy.models.TfDocumentModel class in sumy

To help you get started, we've selected a few sumy examples based on popular ways TfDocumentModel is used in public projects.
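All of the snippets below exercise the same small API: a TfDocumentModel is built either from a raw string plus a Tokenizer or from an already tokenized sequence of words, and is then queried with term_frequency, normalized_term_frequency and most_frequent_terms. As a quick orientation, here is a minimal sketch of both construction styles. The import paths are assumptions based on the quoted files (the test excerpts themselves also rely on import pytest and the same two imports, which they omit).

from sumy.models import TfDocumentModel       # path assumed from the examples below
from sumy.nlp.tokenizers import Tokenizer     # assumed location of the tokenizer

# 1) Raw text plus a tokenizer: the model tokenizes and lower-cases the words itself.
model = TfDocumentModel("The cat sat on the mat. The cat slept.", Tokenizer("english"))
print(model.term_frequency("cat"))            # raw occurrence count of the term
print(model.most_frequent_terms(2))           # the two most frequent terms, most frequent first

# 2) Pre-tokenized words: pass a sequence instead of a string (no tokenizer needed).
pretokenized = TfDocumentModel(("Cat", "cat", "mat"))
print(pretokenized.term_frequency("cat"))               # terms are lower-cased, so this counts both
print(pretokenized.normalized_term_frequency("mat"))    # frequency divided by the highest frequency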

From miso-belica/sumy: tests/test_models/test_tf.py (view on GitHub)
def test_normalized_words_frequencies():
    words = "a b c d e c b d c e e d e d e".split()
    model = TfDocumentModel(tuple(words))

    assert model.normalized_term_frequency("a") == pytest.approx(1/5)
    assert model.normalized_term_frequency("b") == pytest.approx(2/5)
    assert model.normalized_term_frequency("c") == pytest.approx(3/5)
    assert model.normalized_term_frequency("d") == pytest.approx(4/5)
    assert model.normalized_term_frequency("e") == pytest.approx(5/5)
    assert model.normalized_term_frequency("z") == pytest.approx(0.0)

    assert model.most_frequent_terms() == ("e", "d", "c", "b", "a")

From miso-belica/sumy: tests/test_models/test_tf.py (view on GitHub)
def test_pretokenized_words_frequencies():
    model = TfDocumentModel(("wC", "wC", "WC", "wA", "WB", "wB"))

    assert model.term_frequency("wa") == 1
    assert model.term_frequency("wb") == 2
    assert model.term_frequency("wc") == 3
    assert model.term_frequency("wd") == 0

    assert model.most_frequent_terms() == ("wc", "wb", "wa")

From miso-belica/sumy: tests/test_models/test_tf.py (view on GitHub)
def test_no_tokenizer_with_string():
    with pytest.raises(ValueError):
        TfDocumentModel("text without tokenizer")

From miso-belica/sumy: tests/test_models/test_tf.py (view on GitHub)
def test_pretokenized_words():
    model = TfDocumentModel(("wA", "WB", "wB", "WA"))

    terms = tuple(sorted(model.terms))
    assert terms == ("wa", "wb")

From miso-belica/sumy: tests/test_models/test_tf.py (view on GitHub)
def test_most_frequent_terms():
    tokenizer = Tokenizer("english")
    text = "wE wD wC wB wA wE WD wC wB wE wD WE wC wD wE"
    model = TfDocumentModel(text, tokenizer)

    assert model.most_frequent_terms(1) == ("we",)
    assert model.most_frequent_terms(2) == ("we", "wd")
    assert model.most_frequent_terms(3) == ("we", "wd", "wc")
    assert model.most_frequent_terms(4) == ("we", "wd", "wc", "wb")
    assert model.most_frequent_terms(5) == ("we", "wd", "wc", "wb", "wa")
    assert model.most_frequent_terms() == ("we", "wd", "wc", "wb", "wa")

From miso-belica/sumy: tests/test_models/test_tf.py (view on GitHub)
def test_term_frequency():
    tokenizer = Tokenizer("english")
    text = "wA wB wC wA wA wC wD wCwB"
    model = TfDocumentModel(text, tokenizer)

    assert model.term_frequency("wa") == 3
    assert model.term_frequency("wb") == 1
    assert model.term_frequency("wc") == 2
    assert model.term_frequency("wd") == 1
    assert model.term_frequency("wcwb") == 1
    assert model.term_frequency("we") == 0
    assert model.term_frequency("missing") == 0

From miso-belica/sumy: sumy/evaluation/content_based.py (view on GitHub)
def cosine_similarity(evaluated_model, reference_model):
    """
    Computes cosine similarity of two text documents. Each document
    has to be represented as TF model of non-empty document.

    :returns float:
        0 <= cos <= 1, where 0 means independence and 1 means
        exactly the same.
    """
    if not (isinstance(evaluated_model, TfModel) and isinstance(reference_model, TfModel)):
        raise ValueError(
            "Arguments has to be instances of 'sumy.models.TfDocumentModel'")

    terms = frozenset(evaluated_model.terms) | frozenset(reference_model.terms)

    numerator = 0.0
    for term in terms:
        numerator += evaluated_model.term_frequency(term) * reference_model.term_frequency(term)

    denominator = evaluated_model.magnitude * reference_model.magnitude
    if denominator == 0.0:
        raise ValueError("Document model can't be empty. Given %r & %r" % (
            evaluated_model, reference_model))

    return numerator / denominator
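
A minimal usage sketch for the function above. The import path is an assumption taken from the file name shown (sumy/evaluation/content_based.py); the models are built from pre-tokenized words as in the earlier tests.

from sumy.models import TfDocumentModel
from sumy.evaluation.content_based import cosine_similarity   # import path assumed from the file above

evaluated = TfDocumentModel(("summary", "of", "the", "document"))
reference = TfDocumentModel(("reference", "summary", "of", "document"))

# 1.0 means identical term-frequency vectors; values near 0 mean little shared vocabulary.
print(cosine_similarity(evaluated, reference))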

From miso-belica/sumy: sumy/evaluation/__main__.py (view on GitHub)
def evaluate_cosine_similarity(evaluated_sentences, reference_sentences):
    evaluated_words = tuple(chain(*(s.words for s in evaluated_sentences)))
    reference_words = tuple(chain(*(s.words for s in reference_sentences)))
    evaluated_model = TfDocumentModel(evaluated_words)
    reference_model = TfDocumentModel(reference_words)

    return cosine_similarity(evaluated_model, reference_model)

From miso-belica/sumy: sumy/summarizers/luhn.py (view on GitHub)
def _get_significant_words(self, words):
    words = map(self.normalize_word, words)
    words = tuple(self.stem_word(w) for w in words if w not in self._stop_words)

    model = TfDocumentModel(words)

    # take only the best `significant_percentage` % of words
    best_words_count = int(len(words) * self.significant_percentage)
    words = model.most_frequent_terms(best_words_count)

    # take only words that occur multiple times in the document
    return tuple(t for t in words if model.term_frequency(t) > 1)
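
The same selection can be reproduced outside the summarizer class with nothing but the TfDocumentModel API shown above. This is only an illustrative sketch: significant_words and the 0.2 ratio are hypothetical stand-ins for the summarizer's normalized, stemmed words and its significant_percentage setting.

from sumy.models import TfDocumentModel

def significant_words(words, ratio=0.2):   # hypothetical helper; `ratio` stands in for significant_percentage
    model = TfDocumentModel(tuple(words))

    # keep only the top `ratio` share of words by frequency ...
    best_words_count = int(len(words) * ratio)
    best = model.most_frequent_terms(best_words_count)

    # ... and of those, only the terms that occur more than once in the document
    return tuple(t for t in best if model.term_frequency(t) > 1)

print(significant_words("a b c d e c b d c e e d e d e".split()))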

From miso-belica/sumy: sumy/evaluation/content_based.py (view on GitHub)
def unit_overlap(evaluated_model, reference_model):
    """
    Computes unit overlap of two text documents. Documents
    has to be represented as TF models of non-empty document.

    :returns float:
        0 <= overlap <= 1, where 0 means no match and 1 means
        exactly the same.
    """
    if not (isinstance(evaluated_model, TfModel) and isinstance(reference_model, TfModel)):
        raise ValueError(
            "Arguments has to be instances of 'sumy.models.TfDocumentModel'")

    terms1 = frozenset(evaluated_model.terms)
    terms2 = frozenset(reference_model.terms)

    if not terms1 and not terms2:
        raise ValueError(
            "Documents can't be empty. Please pass valid documents.")

    common_terms_count = len(terms1 & terms2)
    return common_terms_count / (len(terms1) + len(terms2) - common_terms_count)
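
The returned value is the number of shared terms divided by the size of the union of the two term sets (a Jaccard-style ratio). A minimal usage sketch, again assuming the import path from the file name above:

from sumy.models import TfDocumentModel
from sumy.evaluation.content_based import unit_overlap   # import path assumed

evaluated = TfDocumentModel(("a", "b", "c"))
reference = TfDocumentModel(("b", "c", "d"))

# shared terms {"b", "c"} out of the union {"a", "b", "c", "d"} -> 2 / 4 = 0.5
print(unit_overlap(evaluated, reference))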