Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
returns u_word_main, as str instance (utf-8 encoding)
"""
b_word_main, strip_count = stripDollarIndexes(b_word)
if strip_count > 1:
log.debug(
"processKey(%s):\n" % b_word +
"number of dollar indexes = %s" % strip_count,
)
# convert to unicode
if self.strictStringConvertion:
try:
u_word_main = b_word_main.decode(self.sourceEncoding)
except UnicodeError:
log.debug(
"processKey(%s):\n" % b_word +
"conversion error:\n%s" % excMessage()
)
u_word_main = b_word_main.decode(
self.sourceEncoding,
"ignore",
)
else:
u_word_main = b_word_main.decode(self.sourceEncoding, "ignore")
if self.processHtmlInKey:
# u_word_main_orig = u_word_main
u_word_main = stripHtmlTags(u_word_main)
u_word_main = replaceHtmlEntriesInKeys(u_word_main)
# if(re.match(".*[&<>].*", u_word_main_orig)):
# log.debug("original text: " + u_word_main_orig + "\n" \
# + "new text: " + u_word_main + "\n")
u_word_main = removeControlChars(u_word_main)
def processAlternativeKey(self, b_word, b_key):
"""
Decode an alternative key and clean it up.

b_word is a bytes instance (it is decoded with self.sourceEncoding)
b_key is only used in the debug message logged on a decoding error
returns u_word_main, as str instance

NOTE(review): the visible part of this method ends before any return
statement - confirm the remaining steps against the full source.
"""
# stripDollarIndexes gives back the stripped word plus a count;
# the count (strip_count) is not used in the visible code.
b_word_main, strip_count = stripDollarIndexes(b_word)
# convert to unicode
if self.strictStringConvertion:
# Strict mode: try an exact decode first so that a failure is logged.
try:
u_word_main = b_word_main.decode(self.sourceEncoding)
except UnicodeError:
log.debug(
"processAlternativeKey(%s)\n" % b_word +
"key = %s:\n" % b_key +
"conversion error:\n%s" % excMessage()
)
# Fall back to a lossy decode that drops the undecodable bytes.
u_word_main = b_word_main.decode(self.sourceEncoding, "ignore")
else:
# Non-strict mode: drop undecodable bytes without logging.
u_word_main = b_word_main.decode(self.sourceEncoding, "ignore")
# strip "/" before words
# The pattern is expected to expose two groups, which are kept
# (replacement r"\1\2"); the pattern itself is defined elsewhere.
u_word_main = re.sub(
self.stripSlashAltKeyPattern,
r"\1\2",
u_word_main,
)
# Optional HTML clean-up of the key: tags first, then entities.
if self.processHtmlInKey:
# u_word_main_orig = u_word_main
u_word_main = stripHtmlTags(u_word_main)
u_word_main = replaceHtmlEntriesInKeys(u_word_main)
continue
code = int(ref, 16)
utf8_text += chr(code)
else:
self.char_references_statistics(text2, encoding)
if encoding == 'cp1252':
text2 = self.replace_ascii_char_refs(text2, encoding)
if self.strictStringConvertion:
try:
u_text = text2.decode(encoding)
except UnicodeError:
log.debug(
'decode_charset_tags({0})\n'
'fragment({1})\n'
'conversion error:\n{2}'
.format(text, text2, excMessage())
)
u_text = text2.decode(encoding, 'replace')
else:
u_text = text2.decode(encoding, 'replace')
utf8_text += u_text
if encoding != defaultEncoding:
defaultEncodingOnly = False
elif i % 3 == 1: # or
if parts[i].startswith('
if len(encodings) > 0:
del encodings[-1]
else:
log.debug(
'decode_charset_tags({0})\n'
'unbalanced tag\n'
def processEntryKey(self, word):
"""
Decode an entry key and clean it up.

word is decoded with self.sourceEncoding after its dollar indexes
have been stripped, so it is presumably a bytes instance.
The decoded key is also passed to self.decoded_dump_file_write.

Return entry key (a str produced by decoding).

NOTE(review): the visible part of this method ends before any return
statement - confirm the remaining steps against the full source.
"""
# stripDollarIndexes gives back the stripped key plus a count.
main_word, strip_cnt = self.stripDollarIndexes(word)
# A count above one is only reported in the debug log; processing continues.
if strip_cnt > 1:
log.debug('processEntryKey({0}):\nnumber of dollar indexes = {1}'\
.format(word, strip_cnt))
# convert to unicode
if self.strictStringConvertion:
# Strict mode: try an exact decode first so that a failure is logged.
try:
u_main_word = main_word.decode(self.sourceEncoding)
except UnicodeError:
log.debug(
'processEntryKey({0}):\nconversion error:\n{1}'
.format(word, excMessage())
)
# Fall back to a lossy decode that drops the undecodable bytes.
u_main_word = main_word.decode(self.sourceEncoding, 'ignore')
else:
# Non-strict mode: drop undecodable bytes without logging.
u_main_word = main_word.decode(self.sourceEncoding, 'ignore')
# Hand the decoded key to the dump writer, prefixed with "key: ".
self.decoded_dump_file_write('\n\nkey: ' + u_main_word)
# Despite its name, utf8_main_word is the decoded value, not encoded bytes.
utf8_main_word = u_main_word
# Optional HTML clean-up of the key: tags first, then entities.
if self.processHtmlInKey:
#utf8_main_word_orig = utf8_main_word
utf8_main_word = self.strip_html_tags(utf8_main_word)
utf8_main_word = self.replace_html_entries_in_keys(utf8_main_word)
#if(re.match('.*[&<>].*', utf8_main_word_orig)):
#log.debug('original text: ' + utf8_main_word_orig + '\n' \
#+ 'new text: ' + utf8_main_word + '\n')
# NOTE(review): indentation was lost in this listing, so it is unclear
# whether the two calls below belong to the `if` body above - confirm.
utf8_main_word = self.remove_control_chars(utf8_main_word)
utf8_main_word = self.replace_new_lines(utf8_main_word)
def processEntryAlternativeKey(self, raw_word, raw_key):
"""
Decode an alternative key of an entry and clean it up.

raw_word is decoded with self.sourceEncoding after its dollar indexes
have been stripped, so it is presumably a bytes instance.
raw_key is only used in the debug message logged on a decoding error.
The decoded key is also passed to self.decoded_dump_file_write.

NOTE(review): the visible part of this method ends before any return
statement - confirm the remaining steps against the full source.
"""
# stripDollarIndexes gives back the stripped key plus a count;
# the count (strip_cnt) is not used in the visible code.
main_word, strip_cnt = self.stripDollarIndexes(raw_word)
# convert to unicode
if self.strictStringConvertion:
# Strict mode: try an exact decode first so that a failure is logged.
try:
u_main_word = main_word.decode(self.sourceEncoding)
except UnicodeError:
log.debug(
'processEntryAlternativeKey({0})\nkey = {1}:\nconversion error:\n{2}'
.format(raw_word, raw_key, excMessage())
)
# Fall back to a lossy decode that drops the undecodable bytes.
u_main_word = main_word.decode(self.sourceEncoding, 'ignore')
else:
# Non-strict mode: drop undecodable bytes without logging.
u_main_word = main_word.decode(self.sourceEncoding, 'ignore')
# strip '/' before words
# The pattern is expected to expose two groups, which are kept
# (replacement r'\1\2'); the pattern itself is defined elsewhere.
u_main_word = re.sub(self.strip_slash_alt_key_pat, r'\1\2', u_main_word)
# Hand the decoded key to the dump writer, prefixed with "alt: ".
self.decoded_dump_file_write('\nalt: ' + u_main_word)
# Despite its name, utf8_main_word is the decoded value, not encoded bytes.
utf8_main_word = u_main_word
# Optional HTML clean-up of the key: tags first, then entities.
if self.processHtmlInKey:
#utf8_main_word_orig = utf8_main_word
utf8_main_word = self.strip_html_tags(utf8_main_word)
utf8_main_word = self.replace_html_entries_in_keys(utf8_main_word)
#if(re.match('.*[&<>].*', utf8_main_word_orig)):
)
continue
u_text += chr(int(b_ref, 16))
else:
self.charReferencesStat(b_text2, encoding)
if encoding == "cp1252":
b_text2 = replaceAsciiCharRefs(b_text2, encoding)
if self.strictStringConvertion:
    # Strict mode: try an exact decode first so that undecodable input is
    # reported in the debug log before falling back to a lossy decode.
    try:
        u_text2 = b_text2.decode(encoding)
    except UnicodeError:
        log.debug(
            "decoding charset tags" +
            ", b_text=%r" % b_text +
            "\nfragment: %r" % b_text2 +
            "\nconversion error:\n%s" % excMessage()
        )
        # Fix: the fallback must decode b_text2, the fragment that just
        # failed. The previous code referenced `text2`, a name that is not
        # assigned anywhere in the visible code, so this error path would
        # raise NameError instead of recovering.
        u_text2 = b_text2.decode(encoding, "replace")
else:
    # Non-strict mode: substitute the replacement character for bad bytes.
    u_text2 = b_text2.decode(encoding, "replace")
u_text += u_text2
if encoding != defaultEncoding:
defaultEncodingOnly = False
elif i % 3 == 1: # or
if b_part.startswith(b"
if encodings:
encodings.pop()
else:
log.debug(
"decoding charset tags, b_text=%r\n" % b_text +
"unbalanced tag\n"