Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def replaceAsciiCharRefs(b_text, encoding):
# &#0147;
# &#x010b;
if log.isDebug():
assert isinstance(b_text, bytes)
b_parts = re.split(b_pat_ascii_char_ref, b_text)
for i_part, b_part in enumerate(b_parts):
if i_part % 2 != 1:
continue
# reference
try:
if b_part[:3].lower() == "&#x":
code = int(b_part[3:-1], 16)
else:
code = int(b_part[2:-1])
if code <= 0:
raise ValueError()
except (ValueError, OverflowError):
code = -1
if code < 128 or code > 255:
def replaceHtmlEntriesInKeys(u_text):
    """
    Rewrite the HTML entities / character references found in a key.

    Each match of ``u_pat_html_entry_key`` in ``u_text`` is passed to
    ``replaceHtmlEntryNoEscapeCB`` and replaced by whatever it returns.

    Examples of the references this targets (from the original notes):
        &ldash;
        &#0147;
        &#x010b;
    """
    if log.isDebug():
        assert isinstance(u_text, str)
    u_replaced = re.sub(
        pattern=u_pat_html_entry_key,
        repl=replaceHtmlEntryNoEscapeCB,
        string=u_text,
    )
    return u_replaced
def normalizeNewlines(u_text):
    """
    Convert line breaks to unix style and squeeze repeated ones.

    Any run of carriage-return / line-feed characters, whatever its
    length or mix, becomes exactly one line feed. As a consequence
    blank lines disappear from the result.
    """
    if log.isDebug():
        assert isinstance(u_text, str)
    # One or more CR/LF characters in a row count as a single break.
    line_break_run = re.compile("[\r\n]+")
    return line_break_run.sub("\n", u_text)
def removeNewlines(u_text):
    """
    Flatten the text onto one line.

    Every run of carriage-return / line-feed characters is replaced by
    a single space.
    """
    if log.isDebug():
        assert isinstance(u_text, str)
    u_flat = re.sub(pattern="[\r\n]+", repl=" ", string=u_text)
    return u_flat
def fixImgLinks(u_text):
    r"""
    Clean up img tag links.

    The value of the src attribute of an image tag is often wrapped in
    the control characters \x1e ... \x1f, for example:
        <img height="8" width="9" src="\x1e6B6C56EC.png\x1f" border="0">
    These markers are not part of the image file name (they may exist
    to make resource names easy to locate).

    Since \x1e and \x1f have no use in html text, all occurrences are
    dropped regardless of where they appear.
    """
    if log.isDebug():
        assert isinstance(u_text, str)
    u_clean = u_text
    for marker in ("\x1e", "\x1f"):
        u_clean = u_clean.replace(marker, "")
    return u_clean
def replaceHtmlEntryNoEscapeCB(u_match):
"""
u_match: instance of _sre.SRE_Match
Replace character entity with the corresponding character
Return the original string if conversion fails.
Use this as a replace function of re.sub.
"""
import html.entities
from pyglossary.html_utils import name2codepoint
u_text = u_match.group(0)
u_name = u_match.group(1)
if log.isDebug():
assert isinstance(u_text, str) and isinstance(u_name, str)
u_res = None
if u_text[:2] == "&#":
# character reference
try:
if u_text[:3].lower() == "&#x":
code = int(u_name, 16)
else:
code = int(u_name)
if code <= 0:
raise ValueError()
u_res = chr(code)
except (ValueError, OverflowError):
u_res = chr(0xFFFD) # replacement character
elif u_text[0] == "&":
def removeControlChars(u_text):
    r"""
    Delete control characters from the text.

    Removed: \x00-\x08, \x0c and \x0e-\x1f.
    Kept (not part of the pattern):
        \x09 - tab
        \x0a - line feed
        \x0b - vertical tab
        \x0d - carriage return
    """
    if log.isDebug():
        assert isinstance(u_text, str)
    unwanted_controls = re.compile("[\x00-\x08\x0c\x0e-\x1f]")
    return unwanted_controls.sub("", u_text)
def stripHtmlTags(u_text):
    """
    Replace html tags with spaces.

    A tag is "<" followed by "/" or an ASCII letter, extended lazily up
    to the next ">" (or to the end of the text if it is never closed).
    A run of directly adjacent tags is replaced by one single space.
    No DOTALL flag is used, so "." does not cross line breaks.
    """
    if log.isDebug():
        assert isinstance(u_text, str)
    tag_run_pattern = "(?:<[/a-zA-Z].*?(?:>|$))+"
    u_stripped = re.sub(tag_run_pattern, " ", u_text)
    return u_stripped
def escapeNewlines(u_text):
    r"""
    Convert text to a c-escaped string.

    Intended mapping:
        \ -> \\
        new line -> \n or \r

    Each carriage return, line feed or backslash found in ``u_text`` is
    handed to ``escapeNewlinesCallback``; the replacement text is
    whatever that callback returns.
    """
    if log.isDebug():
        assert isinstance(u_text, str)
    # Character class matching CR, LF and the backslash itself.
    needs_escape = "[\\r\\n\\\\]"
    return re.sub(needs_escape, escapeNewlinesCallback, u_text)