How to use the html2text.HTML2Text function in html2text

To help you get started, we’ve selected a few html2text examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github realpython / reader / reader / feed.py View on Github external
articles = _feed(url).entries
    try:
        article = articles[int(article_id)]
    except (IndexError, ValueError):
        max_id = len(articles) - 1
        msg = "Unknown article ID, use ID from 0 to {}".format(max_id)
        raise SystemExit("Error: {}".format(msg))

    # Get article as HTML
    try:
        html = article.content[0].value
    except AttributeError:
        html = article.summary

    # Convert HTML to plain text
    to_text = html2text.HTML2Text()
    to_text.ignore_links = not links
    text = to_text.handle(html)

    return u"# {}\n\n{}".format(article.title, text)
github ovh / cerberus-core / abuse / api / controllers / reports.py View on Github external
def get_dehtmlified(report_id):
    """Return the body of a report converted from HTML to plain text.

    The report is looked up by ``report_id``; CRLF line breaks in its body
    are turned into ``<br>`` tags before the conversion, and runs of two or
    more blank lines in the converted text are collapsed.

    :param report_id: identifier passed to ``Report.get``
    :return: ``{"dehtmlify": <converted body>}``
    :raises NotFound: if the lookup or the conversion raises
        ``ObjectDoesNotExist`` or ``ValueError``
    """
    try:
        report = Report.get(id=report_id)
        converter = html2text.HTML2Text()
        # body_width = 0 turns off html2text's line wrapping.
        converter.body_width = 0
        markup = report.body.replace("\r\n", "<br>")
        plain = converter.handle(markup)
        # Squash consecutive whitespace-only lines into a single newline.
        collapsed = re.sub(r"^(\s*\n){2,}", "\n", plain, flags=re.MULTILINE)
    except (ObjectDoesNotExist, ValueError):
        raise NotFound("Report not found")
    return {"dehtmlify": collapsed}
github zulip / zulip / zerver / data_import / hipchat.py View on Github external
mention_user_ids: Set[int]) -> str:
        for user_id in mention_user_ids:
            user = user_handler.get_user(user_id=user_id)
            hipchat_mention = '@{short_name}'.format(**user)
            zulip_mention = '@**{full_name}**'.format(**user)
            content = content.replace(hipchat_mention, zulip_mention)

        content = content.replace('@here', '@**all**')
        return content

    mention_map = dict()  # type: Dict[int, Set[int]]

    zerver_message = []

    import html2text
    h = html2text.HTML2Text()

    for raw_message in raw_messages:
        # One side effect here:

        message_id = NEXT_ID('message')
        mention_user_ids = {
            user_id_mapper.get(id)
            for id in set(raw_message['mention_user_ids'])
            if user_id_mapper.has(id)
        }
        mention_map[message_id] = mention_user_ids

        content = fix_mentions(
            content=raw_message['content'],
            mention_user_ids=mention_user_ids,
        )
github MikimotoH / DLink_Harvester / linksys_0.py View on Github external
def dom2text(dom, ignore_images=True, ignore_emphasis=True, ignore_tables=True):
    """Serialize an lxml DOM node and convert the markup to text with html2text.

    :param dom: node accepted by ``lxml.etree.tostring``
    :param ignore_images: value assigned to the converter's ``ignore_images``
    :param ignore_emphasis: value assigned to the converter's ``ignore_emphasis``
    :param ignore_tables: value assigned to the converter's ``ignore_tables``
    :return: the text produced by ``HTML2Text.handle``
    """
    from lxml import etree
    import html2text

    converter = html2text.HTML2Text()
    # body_width = 0 turns off html2text's line wrapping.
    options = (
        ("body_width", 0),
        ("ignore_images", ignore_images),
        ("ignore_emphasis", ignore_emphasis),
        ("ignore_tables", ignore_tables),
    )
    for option_name, option_value in options:
        setattr(converter, option_name, option_value)

    serialized = etree.tostring(dom).decode()
    return converter.handle(serialized)
github kensanata / mastodon-backup / mastodon_archive / expire.py View on Github external
#
# You should have received a copy of the GNU General Public License along with
# this program. If not, see <http://www.gnu.org/licenses/>.

import sys
import os.path
import math
from progress.bar import Bar
from datetime import timedelta, datetime
from random import shuffle
import signal
import html2text
import textwrap
from . import core

# Module-level HTML-to-text converter used by text(); ignore_links is
# switched on so link markup is left out of the converted output.
h = html2text.HTML2Text()
h.ignore_links = True

def text(status):
    """Return a short one-line preview of a status.

    The status' HTML ``content`` is converted to plain text with the
    module-level ``h`` converter, flattened onto a single line and cut to
    its first 50 characters (followed by ``...``) when it is longer.

    :param status: mapping with at least ``"content"`` and ``"created_at"``
    :return: ``'<created_at[0:10]> "<preview>"'``; the ten-character prefix
        is presumably the date part of an ISO timestamp (verify with caller)
    """
    # Use a distinct local name so the function name is not shadowed.
    preview = textwrap.fill(h.handle(status["content"])).lstrip()
    preview = preview.replace("\n", " ")
    # The scraped source had the HTML entity "&gt;" here instead of ">".
    if len(preview) > 50:
        preview = preview[0:50] + '...'
    return "%s \"%s\"" % (status["created_at"][0:10], preview)

def delete(mastodon, collection, status):
    """
    Delete toot, unfavour favourite, or dismiss notification and mark
    it as deleted. The "record not found" error is handled elsewhere.
    """
    if collection == 'statuses':
        if status["reblog"]:
github sentinelsat / sentinelsat / sentinelsat / sentinel.py View on Github external
response.encoding = "utf-8"
    try:
        response.raise_for_status()
        if test_json:
            response.json()
    except (requests.HTTPError, ValueError):
        msg = "Invalid API response."
        try:
            msg = response.headers["cause-message"]
        except:
            try:
                msg = response.json()["error"]["message"]["value"]
            except:
                if not response.text.strip().startswith("{"):
                    try:
                        h = html2text.HTML2Text()
                        h.ignore_images = True
                        h.ignore_anchors = True
                        msg = h.handle(response.text).strip()
                    except:
                        pass
        api_error = SentinelAPIError(msg, response)
        # Suppress "During handling of the above exception..." message
        # See PEP 409
        api_error.__cause__ = None
        raise api_error
github DefectDojo / django-DefectDojo / dojo / tools / acunetix / parser_helper.py View on Github external
def get_html2text(html):
    """
        Convert an HTML string to text using html2text.
    :param html: HTML markup to convert
    :return: text returned by ``HTML2Text.handle`` with ``body_width`` set to 0
    """
    converter = html2text.HTML2Text()
    # body_width = 0 turns off html2text's line wrapping.
    converter.body_width = 0
    converted = converter.handle(html)
    return converted
github wso2 / security-tools / external / django-DefectDojo-1.2.1 / dojo / tools / qualyswebapp / parser.py View on Github external
def htmltext(blob):
    """Convert the HTML in ``blob`` to text, with ``ignore_links`` set to False."""
    converter = html2text.HTML2Text()
    converter.ignore_links = False
    rendered = converter.handle(blob)
    return rendered
github freiheit / discord_feedbot / feed2discord.py View on Github external
else:
            logger.error("process_field:%s:no such field", field)
            return ""

    else:
        logger.info("%s:process_field:%s:isPlain", FEED, field)
        # Just asking for plain field:
        if field in item:
            # If field is special field "link",
            # then use urljoin to turn relative URLs into absolute URLs
            if field == "link":
                return urljoin(FEED.get("feed_url"), item[field])
            # Else assume it's a "summary" or "content" or whatever field
            # and turn HTML into markdown and don't add any markup:
            else:
                htmlfixer = HTML2Text()
                logger.info(htmlfixer)
                htmlfixer.ignore_links = True
                htmlfixer.ignore_images = True
                htmlfixer.ignore_emphasis = False
                htmlfixer.body_width = 1000
                htmlfixer.unicode_snob = True
                htmlfixer.ul_item_mark = "-"  # Default of "*" likely
                # to bold things, etc...
                markdownfield = htmlfixer.handle(item[field])

                # Try to strip any remaining HTML out.  Not "safe", but
                # simple and should catch most stuff:
                markdownfield = re.sub("&lt;[^&lt;]+?&gt;", "", markdownfield)
                return markdownfield
        else:
            logger.error("process_field:%s:no such field", field)
github Kit4y / html2markdown_Spider / html_to_markdown.py View on Github external
def write2md(dirpath, title, article):
    """Convert an HTML article to Markdown and save it as ``<title>.md``.

    :param dirpath: directory to write into; created if it does not exist.
        An empty string means the current working directory.
    :param title: file name without extension, also shown in the progress
        message printed at the end
    :param article: HTML source of the article
    """
    # Create the converter.
    h2md = html2text.HTML2Text()
    h2md.ignore_links = False
    # Convert the document.
    markdown = h2md.handle(article)
    # Create the target directory if it is missing. exist_ok avoids the
    # check-then-create race, and the guard avoids os.makedirs("") failing.
    if dirpath:
        os.makedirs(dirpath, exist_ok=True)
    # Build the path with os.path.join so that a dirpath without a trailing
    # separator still places the file inside the directory.
    target = os.path.join(dirpath, title + '.md')
    # Write the Markdown file.
    with open(target, 'w', encoding="utf8") as f:
        for line in markdown.splitlines():
            # Lines ending in '-' are written without a newline, which
            # joins them with the following line.
            if line.endswith('-'):
                f.write(line)
            else:
                f.write(line + "\n")
    print(title+"下载完成....")