Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
if not gold_standard:
return [u'', u'']
if not cetr:
content_comments = RE_COMMENTS_DELIM.split(gold_standard, maxsplit=1)
# if no comments delimiter found, append empty comments string
if len(content_comments) == 1:
content_comments = [content_comments[0], u'']
else:
tree = etree.fromstring(gold_standard, parser=etree.HTMLParser())
content_comments = [u' '.join(text_from_subtree(tree)), u'']
# fix text in case of mangled encodings
content_comments = [ftfy.fix_encoding(content_comments[0]).strip(),
ftfy.fix_encoding(content_comments[1]).strip()]
return content_comments
gold_standard = None
if not gold_standard:
return [u'', u'']
if not cetr:
content_comments = RE_COMMENTS_DELIM.split(gold_standard, maxsplit=1)
# if no comments delimiter found, append empty comments string
if len(content_comments) == 1:
content_comments = [content_comments[0], u'']
else:
tree = etree.fromstring(gold_standard, parser=etree.HTMLParser())
content_comments = [u' '.join(text_from_subtree(tree)), u'']
# fix text in case of mangled encodings
content_comments = [ftfy.fix_encoding(content_comments[0]).strip(),
ftfy.fix_encoding(content_comments[1]).strip()]
return content_comments
def fix_component_encodings(self, tags):
    """Return a new dict with the values of ``tags`` encoding-repaired.

    Every value is passed through ``safe_decode`` and the result is then
    handed to ``ftfy.fix_encoding``; keys are carried over unchanged.

    Args:
        tags: mapping to iterate over with ``six.iteritems``.

    Returns:
        dict: same keys as ``tags``, each mapped to its repaired value.
    """
    # NOTE(review): ``safe_decode`` is defined elsewhere; presumably it turns
    # byte strings into text before ftfy sees them -- confirm.
    repaired = {}
    for key, value in six.iteritems(tags):
        repaired[key] = ftfy.fix_encoding(safe_decode(value))
    return repaired
Returns:
str
"""
fname = os.path.join(
data_dir, RAW_HTML_DIRNAME, fileroot + RAW_HTML_EXT)
encodings = (encoding,) if encoding else ('utf-8', 'iso-8859-1') # 'utf-16'
for encoding in encodings:
try:
with io.open(fname, mode='rt', encoding=encoding) as f:
raw_html = f.read()
break
except (UnicodeDecodeError, UnicodeError):
raw_html = None
return ftfy.fix_encoding(raw_html).strip()
def main():
    """Print what ``ftfy.fix_encoding`` returns for a sample garbled string."""
    # Hard-coded sample input; it is passed to ftfy exactly as written.
    sample = 'ì£¼ë¬¸í•˜ë‹¤ - to intent for?'
    repaired = ftfy.fix_encoding(sample)
    print(repaired)