# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_most_frequent_terms_empty():
    """An empty document has no frequent terms, with or without a count limit."""
    model = TfDocumentModel("", Tokenizer("english"))

    assert model.most_frequent_terms() == ()
    assert model.most_frequent_terms(10) == ()
def test_slovak_alias_into_czech_tokenizer():
    """Slovak is accepted as a language name and text is split into sentences."""
    tokenizer = Tokenizer("slovak")
    assert tokenizer.language == "slovak"

    sentences = tokenizer.to_sentences("""
Je to veľmi fajn. Bodaj by nie.
Ale na druhej strane čo je to oproti inému?
To nechám na čitateľa.
""")

    assert sentences == (
        "Je to veľmi fajn.",
        "Bodaj by nie.",
        "Ale na druhej strane čo je to oproti inému?",
        "To nechám na čitateľa.",
    )
def test_terms():
    """Terms extracted from the document are lower-cased and de-duplicated."""
    model = TfDocumentModel("wA wB wC wD wB wD wE", Tokenizer("english"))

    extracted = tuple(sorted(model.terms))
    assert extracted == ("wa", "wb", "wc", "wd", "we")
            # NOTE(review): this chunk begins mid-method — the try/except whose
            # handler appears below starts outside this view. Indentation here
            # is reconstructed; confirm against the full file.
            results.add_error({'url':url,'lib':last_lib,'message':str(e)})
        except Exception as e:
            # Record the failure (url, extraction library, message) and continue.
            results.add_error({'url':url,'lib':last_lib,'message':str(e)})
        #detect lang of the text
        try:
            lang_detect=detect(text)
        except Exception as e:
            # Language detection failed; log it and fall back to "" so the
            # summarization step below is skipped.
            results.add_error({'url':url,'lib':last_lib,'message':str(e)})
            lang_detect=""
        #generate summary
        sumy_summary=""
        sum_title=""
        if lang_detect!="":
            # Build a sumy pipeline for the detected language. self.LANGUAGES
            # presumably maps detected language codes to sumy language names —
            # verify against the class definition.
            parser = PlaintextParser.from_string(text, Tokenizer(self.LANGUAGES[lang_detect]))
            stemmer = Stemmer(self.LANGUAGES[lang_detect])
            summarizer = Summarizer(stemmer)
            summarizer.stop_words = get_stop_words(self.LANGUAGES[lang_detect])
            #build title from summary
            try:
                # Title = the single best sentence.
                for sentence in summarizer(parser.document, 1):
                    sum_title+=sentence.__unicode__()
                # build summary
                for sentence in summarizer(parser.document, self.SENTENCES_COUNT):
                    sumy_summary+=sentence.__unicode__()+u"\n"
            except:
                # Best-effort: any summarization failure leaves the summary empty.
                sumy_summary=""
        # Document record for downstream storage/indexing.
        doc={"link":url,"content":[{"base":url,"language":lang_detect}]}
def summarize(srt_file, n_sentences, language="english"):
    """Generate segmented summary of a subtitle file.

    Args:
        srt_file: Parsed SRT file object (indexable by caption number).
        n_sentences (int): Number of sentences to include in the summary.
        language (str): Language of the subtitles (defaults to English).

    Returns:
        list: Time ranges of the subtitle segments chosen for the summary.
    """
    parser = PlaintextParser.from_string(
        srt_to_txt(srt_file), Tokenizer(language))
    stemmer = Stemmer(language)
    summarizer = LsaSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(language)

    # Each sentence produced by srt_to_txt carries its caption index as "(N)";
    # use it to map the chosen sentence back to the original subtitle item.
    # Raw string fixes the invalid "\(" escape (SyntaxWarning on Python 3.12+);
    # compiling once hoists the pattern out of the loop.
    index_pattern = re.compile(r"\(([0-9]+)\)")
    segment = []
    for sentence in summarizer(parser.document, n_sentences):
        index = int(index_pattern.findall(str(sentence))[0])
        item = srt_file[index]
        segment.append(srt_segment_to_range(item))
    return segment
    # NOTE(review): this chunk begins mid-function — the opening `if` branch of
    # this conditional is outside this view. Indentation is reconstructed.
    elif args["--text"] is not None:
        # Text was passed directly on the command line.
        parser = PARSERS[document_format or "plaintext"]
        document_content = args["--text"]
    else:
        # No URL/file/text given: read the document from the default stream
        # (presumably stdin — confirm against the caller).
        parser = PARSERS[document_format or "plaintext"]
        document_content = default_input_stream.read()

    items_count = ItemsCount(args["--length"])
    language = args["--language"]
    # Custom stop-word file overrides the built-in list for the language.
    if args['--stopwords']:
        stop_words = read_stop_words(args['--stopwords'])
    else:
        stop_words = get_stop_words(language)

    parser = parser(document_content, Tokenizer(language))
    stemmer = Stemmer(language)

    # Pick the first summarization method whose CLI flag is set.
    summarizer_class = next(cls for name, cls in AVAILABLE_METHODS.items() if args[name])
    summarizer = build_summarizer(summarizer_class, stop_words, stemmer, parser)

    return summarizer, parser, items_count
@author: megan squire
"""
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.edmundson import EdmundsonSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
# Demo script: summarize sampleText.txt with several sumy algorithms and print
# each 4-sentence summary for comparison.
LANGUAGE = "english"
SENTENCES_COUNT = 4

# Parse the input file once; every summarizer below reuses the same document.
parser = PlaintextParser.from_file("sampleText.txt", Tokenizer(LANGUAGE))
stemmer = Stemmer(LANGUAGE)

print("\n====== Luhn ======")
summarizerLuhn = LuhnSummarizer(stemmer)
summarizerLuhn.stop_words = get_stop_words(LANGUAGE)
for sentenceLuhn in summarizerLuhn(parser.document, SENTENCES_COUNT):
    print(sentenceLuhn, "\n")

print("====== TextRank ======")
summarizerTR = TextRankSummarizer(stemmer)
summarizerTR.stop_words = get_stop_words(LANGUAGE)
for sentenceTR in summarizerTR(parser.document, SENTENCES_COUNT):
    print(sentenceTR, "\n")

print("====== LSA ======")
# NOTE(review): the loop that consumes summarizerLSA continues past this chunk.
summarizerLSA = LsaSummarizer(stemmer)