Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_links(self):
"""Inline text should be treated as separate paragraph."""
# Fixture mixing inline content (a top-level <a> and trailing bare text)
# with block-level elements (<p>, <code>, <div>).
# NOTE(review): the two empty '' literals below look like wrapper markup
# that was lost when this snippet was extracted - confirm against upstream.
html_string = (
''
'<a>I am <strong>top</strong>-inline\n\n\n\n and I am happy \n</a>'
'<p>normal text</p>'
'<code>\nvar i = -INFINITY;\n</code>'
'<div>after <a>text</a> with variable <var>N</var> </div>'
' I am inline\n\n\n\n and I am happy \n'
''
)
dom = html.fromstring(html_string)
paragraphs = ParagraphMaker.make_paragraphs(dom)
# The fixture is expected to split into exactly five paragraphs.
assert len(paragraphs) == 5
# First paragraph: the top-level link. The expected text shows the run of
# newlines/spaces collapsed to a single newline and the trailing whitespace
# dropped; the link character count is expected to be 31.
self.assert_paragraphs_equal(
paragraphs[0],
words_count=7,
tags_count=2,
text="I am top-inline\nand I am happy",
chars_count_in_links=31
)
# Second paragraph: the <p> element - two words and no counted tags.
self.assert_paragraphs_equal(
paragraphs[1],
words_count=2,
tags_count=0,
text="normal text"
)
# NOTE(review): the excerpt ends in the middle of the next call; the
# assertions for the remaining paragraphs are not visible in this view.
self.assert_paragraphs_equal(
def test_no_paragraphs(self):
    """A document with an empty body should produce no paragraphs."""
    # An empty string is not a parseable document (presumably `html` is
    # lxml.html, whose fromstring() rejects empty input), so the fixture is
    # the smallest document that survives a parse/serialize round trip.
    html_string = '<html><body></body></html>'
    dom = html.fromstring(html_string)

    # Guard against the parser silently adding or dropping markup: the
    # serialized DOM must be identical to the input.
    returned = html.tostring(dom).decode("utf8")
    assert html_string == returned

    paragraphs = ParagraphMaker.make_paragraphs(dom)
    assert len(paragraphs) == 0
def justext(
    html_text,
    stoplist,
    length_low=LENGTH_LOW_DEFAULT,
    length_high=LENGTH_HIGH_DEFAULT,
    stopwords_low=STOPWORDS_LOW_DEFAULT,
    stopwords_high=STOPWORDS_HIGH_DEFAULT,
    max_link_density=MAX_LINK_DENSITY_DEFAULT,
    max_heading_distance=MAX_HEADING_DISTANCE_DEFAULT,
    no_headings=NO_HEADINGS_DEFAULT,
    encoding=None,
    default_encoding=DEFAULT_ENCODING,
    enc_errors=DEFAULT_ENC_ERRORS,
    preprocessor=preprocessor
):
    """
    Turn an HTML page into a list of classified paragraphs.

    The page is parsed into a DOM, passed through ``preprocessor``, split
    into paragraphs, classified using ``stoplist`` and the length, stopword
    and link-density thresholds, and finally revised using
    ``max_heading_distance``. Each paragraph is represented as an instance
    of class ``justext.paragraph.Paragraph``.
    """
    # Parse first, then let the caller-supplied preprocessor transform the DOM.
    cleaned_dom = preprocessor(
        html_to_dom(html_text, default_encoding, encoding, enc_errors)
    )
    result = ParagraphMaker.make_paragraphs(cleaned_dom)

    # The return values of both passes are ignored; the paragraph objects
    # are presumably updated in place (confirm against the helpers).
    classify_paragraphs(
        result,
        stoplist,
        length_low,
        length_high,
        stopwords_low,
        stopwords_high,
        max_link_density,
        no_headings,
    )
    revise_paragraph_classification(result, max_heading_distance)

    return result