# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
articles = _feed(url).entries
try:
article = articles[int(article_id)]
except (IndexError, ValueError):
max_id = len(articles) - 1
msg = "Unknown article ID, use ID from 0 to {}".format(max_id)
raise SystemExit("Error: {}".format(msg))
# Get article as HTML
try:
html = article.content[0].value
except AttributeError:
html = article.summary
# Convert HTML to plain text
to_text = html2text.HTML2Text()
to_text.ignore_links = not links
text = to_text.handle(html)
return u"# {}\n\n{}".format(article.title, text)
def get_dehtmlified(report_id):
    """Return the plain-text (de-HTML-ified) body of a report.

    :param report_id: primary key of the ``Report`` to fetch
    :return: dict with a single ``dehtmlify`` key holding the text body
    :raises NotFound: when no report matches ``report_id``
    """
    try:
        record = Report.get(id=report_id)
        converter = html2text.HTML2Text()
        converter.body_width = 0  # never hard-wrap the output
        plain = converter.handle(record.body.replace("\r\n", "<br>"))
        # Collapse runs of blank lines down to a single newline.
        plain = re.sub(r"^(\s*\n){2,}", "\n", plain, flags=re.MULTILINE)
        return {"dehtmlify": plain}
    except (ObjectDoesNotExist, ValueError):
        raise NotFound("Report not found")
mention_user_ids: Set[int]) -> str:
for user_id in mention_user_ids:
user = user_handler.get_user(user_id=user_id)
hipchat_mention = '@{short_name}'.format(**user)
zulip_mention = '@**{full_name}**'.format(**user)
content = content.replace(hipchat_mention, zulip_mention)
content = content.replace('@here', '@**all**')
return content
mention_map = dict() # type: Dict[int, Set[int]]
zerver_message = []
import html2text
h = html2text.HTML2Text()
for raw_message in raw_messages:
# One side effect here:
message_id = NEXT_ID('message')
mention_user_ids = {
user_id_mapper.get(id)
for id in set(raw_message['mention_user_ids'])
if user_id_mapper.has(id)
}
mention_map[message_id] = mention_user_ids
content = fix_mentions(
content=raw_message['content'],
mention_user_ids=mention_user_ids,
)
def dom2text(dom, ignore_images=True, ignore_emphasis=True, ignore_tables=True):
    """Serialize an lxml DOM node and render it as plain text.

    :param dom: lxml element to render
    :param ignore_images: drop image markup from the output
    :param ignore_emphasis: drop bold/italic markers from the output
    :param ignore_tables: drop table markup from the output
    :return: plain-text rendering of ``dom``
    """
    from lxml import etree
    import html2text

    converter = html2text.HTML2Text()
    converter.body_width = 0  # never hard-wrap the output
    converter.ignore_images = ignore_images
    converter.ignore_emphasis = ignore_emphasis
    converter.ignore_tables = ignore_tables
    return converter.handle(etree.tostring(dom).decode())
#
# You should have received a copy of the GNU General Public License along with
# this program. If not, see <https://www.gnu.org/licenses/>.
import sys
import os.path
import math
from progress.bar import Bar
from datetime import timedelta, datetime
from random import shuffle
import signal
import html2text
import textwrap
from . import core
# Module-level HTML-to-text converter shared by the helpers below.
# Links are stripped from the rendered output.
h = html2text.HTML2Text()
h.ignore_links = True
def text(status):
    """Render a one-line preview of a status: its date plus a quoted, truncated body."""
    body = textwrap.fill(h.handle(status["content"])).lstrip()
    body = body.replace("\n", " ")
    # Keep the preview short: at most 50 characters plus an ellipsis.
    if len(body) > 50:
        body = body[:50] + '...'
    return "%s \"%s\"" % (status["created_at"][:10], body)
def delete(mastodon, collection, status):
"""
Delete toot, unfavour favourite, or dismiss notification and mark
it as deleted. The "record not found" error is handled elsewhere.
"""
if collection == 'statuses':
if status["reblog"]:
response.encoding = "utf-8"
try:
response.raise_for_status()
if test_json:
response.json()
except (requests.HTTPError, ValueError):
msg = "Invalid API response."
try:
msg = response.headers["cause-message"]
except:
try:
msg = response.json()["error"]["message"]["value"]
except:
if not response.text.strip().startswith("{"):
try:
h = html2text.HTML2Text()
h.ignore_images = True
h.ignore_anchors = True
msg = h.handle(response.text).strip()
except:
pass
api_error = SentinelAPIError(msg, response)
# Suppress "During handling of the above exception..." message
# See PEP 409
api_error.__cause__ = None
raise api_error
def get_html2text(html):
    """Convert an HTML string into its plain-text rendering.

    :param html: HTML markup to convert
    :return: plain text, with line wrapping disabled
    """
    converter = html2text.HTML2Text()
    converter.body_width = 0  # never hard-wrap the output
    return converter.handle(html)
def htmltext(blob):
    """Render an HTML blob as plain text, keeping hyperlinks in the output."""
    converter = html2text.HTML2Text()
    converter.ignore_links = False  # keep link targets in the text
    return converter.handle(blob)
else:
logger.error("process_field:%s:no such field", field)
return ""
else:
logger.info("%s:process_field:%s:isPlain", FEED, field)
# Just asking for plain field:
if field in item:
# If field is special field "link",
# then use urljoin to turn relative URLs into absolute URLs
if field == "link":
return urljoin(FEED.get("feed_url"), item[field])
# Else assume it's a "summary" or "content" or whatever field
# and turn HTML into markdown and don't add any markup:
else:
htmlfixer = HTML2Text()
logger.info(htmlfixer)
htmlfixer.ignore_links = True
htmlfixer.ignore_images = True
htmlfixer.ignore_emphasis = False
htmlfixer.body_width = 1000
htmlfixer.unicode_snob = True
htmlfixer.ul_item_mark = "-" # Default of "*" likely
# to bold things, etc...
markdownfield = htmlfixer.handle(item[field])
# Try to strip any remaining HTML out. Not "safe", but
# simple and should catch most stuff:
markdownfield = re.sub("<[^<]+?>", "", markdownfield)
return markdownfield
else:
logger.error("process_field:%s:no such field", field)
def write2md(dirpath, title, article):
    """Convert an HTML article to Markdown and save it as <dirpath>/<title>.md.

    :param dirpath: target directory (created if missing)
    :param title: file name without the ``.md`` extension
    :param article: HTML source of the article
    """
    # Build the HTML -> Markdown converter, keeping hyperlinks.
    h2md = html2text.HTML2Text()
    h2md.ignore_links = False
    markdown = h2md.handle(article)
    # exist_ok avoids the race between the old exists() check and makedirs().
    os.makedirs(dirpath, exist_ok=True)
    # BUG FIX: join path components instead of naive string concatenation,
    # which silently wrote outside `dirpath` whenever it lacked a trailing
    # separator (the file landed next to the directory, not inside it).
    target = os.path.join(dirpath, title + '.md')
    with open(target, 'w', encoding="utf8") as f:
        for line in markdown.splitlines():
            if line.endswith('-'):
                # Re-join hyphenated line breaks without inserting a newline.
                f.write(line)
            else:
                f.write(line + "\n")
    print(title + "下载完成....")