def test_annotated_text(self):
    path = expand_resource_path("snippets/paragraphs.html")
    url = "http://www.snippet.org/paragraphs.html"
    parser = HtmlParser.from_file(path, url, Tokenizer("czech"))
    document = parser.document
    self.assertEqual(len(document.paragraphs), 2)
    self.assertEqual(len(document.paragraphs[0].headings), 1)
    self.assertEqual(len(document.paragraphs[0].sentences), 1)
    self.assertEqual(to_unicode(document.paragraphs[0].headings[0]),
        "Toto je nadpis prvej úrovne")
    self.assertEqual(to_unicode(document.paragraphs[0].sentences[0]),
        "Toto je prvý odstavec a to je fajn.")
    self.assertEqual(len(document.paragraphs[1].headings), 0)
    self.assertEqual(len(document.paragraphs[1].sentences), 2)
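# Not part of the test above: the fragment assumes sumy's parser, tokenizer and
# compat helpers are already imported. A likely set, hedged as an assumption
# (expand_resource_path is a helper from sumy's own test utilities):
from sumy.parsers.html import HtmlParser
from sumy.nlp.tokenizers import Tokenizer
from sumy._compat import to_unicode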
elif summary_method == 'kl':
    summary_fn = KLSummarizer
elif summary_method == 'sumbasic':
    summary_fn = SumBasicSummarizer
else:
    raise Exception('Could not find summary method ' + summary_method)
if not os.path.exists(os.path.join(out_dir, summary_method, reference_folder)):
os.makedirs(os.path.join(out_dir, summary_method, reference_folder))
if not os.path.exists(os.path.join(out_dir, summary_method, decoded_folder)):
os.makedirs(os.path.join(out_dir, summary_method, decoded_folder))
print(os.path.join(out_dir, summary_method))
article_names = sorted(os.listdir(articles_dir))
for art_idx, article_name in enumerate(tqdm(article_names)):
    file = os.path.join(articles_dir, article_name)
    parser = HtmlParser.from_file(file, "", Tokenizer("english"))
    summarizer = summary_fn()
    summary = summarizer(parser.document, 5)  # summarize the document with 5 sentences
    summary = [str(sentence) for sentence in summary]
    # open in text mode ('w'); the joined summary is a str, not bytes
    with open(os.path.join(out_dir, summary_method, decoded_folder, article_name), 'w') as f:
        f.write('\n'.join(summary))
    summary_tokenized = []
    for sent in summary:
        summary_tokenized.append(' '.join(nltk.tokenize.word_tokenize(sent.lower())))
    with open(os.path.join(abstract_dir, article_name)) as f:
        abstracts_text = f.read()
    abstracts = abstracts_text.split('\n\n')
    abstracts_sentences = []
    for abs_idx, abstract in enumerate(abstracts):
        abstract_sents = abstract.split('\n')
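        # Hedged continuation (the original fragment is truncated at this point):
        # presumably each abstract sentence is lower-cased and word-tokenized, like
        # the decoded summary above, and collected for the reference side.
        abstracts_sentences.append(
            [' '.join(nltk.tokenize.word_tokenize(s.lower())) for s in abstract_sents])
    # Hedged sketch: write the reference sentences next to the decoded output so a
    # ROUGE-style tool can compare the two folders. The file name is an assumption.
    with open(os.path.join(out_dir, summary_method, reference_folder, article_name), 'w') as f:
        f.write('\n'.join('\n'.join(sents) for sents in abstracts_sentences))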
def main(url, num_sentences=10, language='english'):
    parser = HtmlParser.from_url(url, Tokenizer(language))
    stemmer = Stemmer(language)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(language)
    for sentence in summarizer(parser.document, num_sentences):
        print(sentence)
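# Example call, not part of the original snippet; the URL is only an illustration.
if __name__ == '__main__':
    main('https://en.wikipedia.org/wiki/Automatic_summarization', num_sentences=5)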
searchlink = clean_links[i]
print("Search Link --> " + str(searchlink))
if searchlink[-4:] == '.pdf' or searchlink[-4:] == '.ppt':
    # go to the next link if the current link is a ppt or pdf
    print("Can't include ppts or pdfs, trying next link on Google")
    linkno += 1
    if linkno > 9:
        # if the number of links on one page has been exceeded, go to the next Google results page
        num_page += 1
        linkno = 0
else:
    LANGUAGE = "english"
    SENTENCES_COUNT = 10
    parser = HtmlParser.from_url(searchlink, Tokenizer(LANGUAGE))
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Summarisation using the Luhn summarizer
    stopwords1 = set(stopwords.words('english'))
    datastring = ''
    # using the LuhnSummarizer
    summarizer = LuhnSummarizer()
    summarizer.stop_words = stopwords1
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        # print(sentence)
        datastring += str(sentence)
    return datastring
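# Not part of the fragment above: it assumes nltk's stop-word list and sumy's
# Luhn summarizer are already imported, most plausibly via:
from nltk.corpus import stopwords
from sumy.parsers.html import HtmlParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.luhn import LuhnSummarizer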
except:
    # fall back to parsing the file as plain text
    parser = PlaintextParser.from_file(textfile, Tokenizer(LANGUAGE))
stemmer = Stemmer(LANGUAGE)
summarizer = Summarizer(stemmer)
summarizer.stop_words = get_stop_words(LANGUAGE)
# now summarize: output as [txtfile]_summary.txt
g = open(textfile[0:-4] + '_summary.txt', 'w')
for sentence in summarizer(parser.document, SENTENCES_COUNT):
    print(sentence)
    g.write(str(sentence))
g.close()
os.system('open %s' % (textfile[0:-4] + '_summary.txt'))
elif ftype in ['w']:
    # for URLs
    url = input('what link would you like to summarize on Wikipedia? \n')
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # for plaintext
    # parser = PlaintextParser.from_file("poetry.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    # now summarize: output as web_summary.txt
    g = open('web_summary.txt', 'w')
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
        g.write(str(sentence))
    g.close()
    os.system('open web_summary.txt')
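# Portability note, not in the original script: 'open' is a macOS command. A
# cross-platform way to display the generated summary file would use the
# standard-library webbrowser module instead:
import os
import webbrowser
webbrowser.open('file://' + os.path.abspath('web_summary.txt'))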
def scrape(self, url):
    complete_url = url
    try:
        # get summary
        print("Retrieving page summary of %s... " % url)
        parser = HtmlParser.from_url(complete_url, Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)
        url_summary = ''.join(str(sentence) for sentence in summarizer(parser.document, SENTENCES_COUNT))
    except Exception as e:
        url_summary = "Could not scrape summary. Reason: %s" % e
    print("Done: %s = %s" % (url, url_summary))
    # create scraping result
    scraping_result = ScrapingResult()
    scraping_result.summary = url_summary
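# Not part of the method above: it relies on module-level settings and sumy
# imports. A plausible minimal set, hedged as an assumption (the concrete class
# aliased as Summarizer is unknown; LSA is simply sumy's documented example):
from sumy.parsers.html import HtmlParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.utils import get_stop_words

LANGUAGE = "english"
SENTENCES_COUNT = 10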
stemmer = Stemmer("english")
summarizer = Summarizer(stemmer)
comment_ids = request.POST.getlist('d_ids[]')
sent_list = []
for comment_id in comment_ids:
    comment = Comment.objects.get(id=comment_id)
    text = comment.text
    text = re.sub('<br>', ' ', text)
    parser = HtmlParser.from_string(text, '', Tokenizer("english"))
    num_sents = request.GET.get('num_sents', None)
    if not num_sents:
        all_sents = parser.tokenize_sentences(text)
        num_sents = floor(float(len(all_sents)) / 3.0)
    sents = summarizer(parser.document, num_sents)
    for sent in sents:
        if 'https://en.wikipedia.org/wiki/' in comment.article.url:
            text = parser.parse(sent._text)
            sent = ''
            in_tag = False
            for c in text:
                if c == '<':
def summarize_from_url(self, url):
    parser = HtmlParser.from_url(url, Tokenizer(self.LANGUAGE))
    stemmer = Stemmer(self.LANGUAGE)
    summarizer = Summarizer(stemmer)
    file_1 = open("summarizer_output.txt", "w+")
    file_2 = open("summarizer_output2.txt", "w+")
    for sentence in summarizer(parser.document, self.SENTENCES_COUNT):
        file_2.write(str(sentence))
        file_1.write(str(sentence))
        file_1.write("\n")
    file_1.close()
    file_2.close()
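# Alternative, not in the original: the same writes with context managers, so the
# files are closed even if summarization raises. The method name is hypothetical;
# everything else mirrors the fragment above.
def summarize_from_url_cm(self, url):
    parser = HtmlParser.from_url(url, Tokenizer(self.LANGUAGE))
    summarizer = Summarizer(Stemmer(self.LANGUAGE))
    with open("summarizer_output.txt", "w+") as file_1, \
         open("summarizer_output2.txt", "w+") as file_2:
        for sentence in summarizer(parser.document, self.SENTENCES_COUNT):
            file_2.write(str(sentence))
            file_1.write(str(sentence) + "\n")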
from ..parsers.plaintext import PlaintextParser
from ..summarizers.random import RandomSummarizer
from ..summarizers.luhn import LuhnSummarizer
from ..summarizers.edmundson import EdmundsonSummarizer
from ..summarizers.lsa import LsaSummarizer
from ..summarizers.text_rank import TextRankSummarizer
from ..summarizers.lex_rank import LexRankSummarizer
from ..summarizers.sum_basic import SumBasicSummarizer
from ..summarizers.kl import KLSummarizer
from ..nlp.stemmers import Stemmer
from . import precision, recall, f_score, cosine_similarity, unit_overlap
from . import rouge_1, rouge_2, rouge_l_sentence_level, rouge_l_summary_level
PARSERS = {
"html": HtmlParser,
"plaintext": PlaintextParser,
}
def build_random(parser, language):
    return RandomSummarizer()


def build_luhn(parser, language):
    summarizer = LuhnSummarizer(Stemmer(language))
    summarizer.stop_words = get_stop_words(language)
    return summarizer


def build_edmundson(parser, language):
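    # The fragment is truncated here. A hedged guess at the body, based on the
    # attributes sumy's EdmundsonSummarizer and parsers are known to expose
    # (bonus_words / stigma_words / null_words and significant_words / stigma_words);
    # not the original code:
    summarizer = EdmundsonSummarizer(Stemmer(language))
    summarizer.null_words = get_stop_words(language)
    summarizer.bonus_words = parser.significant_words
    summarizer.stigma_words = parser.stigma_words
    return summarizer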
from .utils import ItemsCount, get_stop_words, read_stop_words, fetch_url
from ._compat import to_string, to_unicode, to_bytes, PY3
from .nlp.tokenizers import Tokenizer
from .parsers.html import HtmlParser
from .parsers.plaintext import PlaintextParser
from .summarizers.luhn import LuhnSummarizer
from .summarizers.edmundson import EdmundsonSummarizer
from .summarizers.lsa import LsaSummarizer
from .summarizers.text_rank import TextRankSummarizer
from .summarizers.lex_rank import LexRankSummarizer
from .summarizers.sum_basic import SumBasicSummarizer
from .summarizers.kl import KLSummarizer
from .nlp.stemmers import Stemmer
PARSERS = {
"html": HtmlParser,
"plaintext": PlaintextParser,
}
AVAILABLE_METHODS = {
"luhn": LuhnSummarizer,
"edmundson": EdmundsonSummarizer,
"lsa": LsaSummarizer,
"text-rank": TextRankSummarizer,
"lex-rank": LexRankSummarizer,
"sum-basic": SumBasicSummarizer,
"kl": KLSummarizer,
}
def main(args=None):
    args = docopt(to_string(__doc__), args, version=__version__)
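
# The fragment ends here. A hedged sketch (not the original code) of how the
# PARSERS / AVAILABLE_METHODS tables above are typically tied together; the
# function name, defaults and the plain-text parser choice are assumptions:
def summarize_string(text, method="luhn", language="english", count=10):
    parser = PlaintextParser.from_string(text, Tokenizer(language))
    summarizer = AVAILABLE_METHODS[method](Stemmer(language))
    summarizer.stop_words = get_stop_words(language)
    # note: the "edmundson" entry also needs bonus/stigma/null word lists before it can run
    return [str(sentence) for sentence in summarizer(parser.document, count)]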