Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_unicode_9():
    # 'bɪg'.upper() produces this string on Python 3.6 or later; its middle
    # letter is the new codepoint U+A7AE LATIN CAPITAL LETTER SMALL CAPITAL I.
    assigned_sample = "BꞮG"
    # Same shape, but the middle character is definitely unassigned, so it
    # has to score as weirder than the sample above.
    unassigned_sample = "B\U00090000G"

    assert sequence_weirdness(assigned_sample) == 0
    assert sequence_weirdness(unassigned_sample) == 2
def test_emoji_variation_selector():
    # Every heart is followed by the variation selector U+FE0F, which
    # explicitly marks it as an emoji. Ten of them in a row is not weird.
    emoji_hearts = '❤\ufe0f' * 10
    assert sequence_weirdness(emoji_hearts) == 0
def test_unicode_9():
    # NOTE(review): a function with this same name and body is defined
    # earlier in this file; this later definition is the one that stays
    # bound to the name.
    #
    # Expected weirdness for each sample:
    #   "BꞮG"           -> 0: 'bɪg'.upper() in Python 3.6 or later, containing
    #                      the new codepoint U+A7AE LATIN CAPITAL LETTER
    #                      SMALL CAPITAL I.
    #   "B\U00090000G"  -> 2: contains a definitely-unassigned character, so
    #                      it should be weirder than the string above.
    expectations = (
        ("BꞮG", 0),
        ("B\U00090000G", 2),
    )
    for sample, expected_weirdness in expectations:
        assert sequence_weirdness(sample) == expected_weirdness
def test_bmp_characters():
    # Unicode general categories left out of the round trip: private use
    # (Co), unassigned (Cn), surrogates (Cs), spacing and nonspacing marks
    # (Mc, Mn), and modifier symbols (Sk).
    skipped_categories = ('Co', 'Cn', 'Cs', 'Mc', 'Mn', 'Sk')

    for codepoint in range(0xa0, 0xfffd):
        char = chr(codepoint)
        if unicodedata.category(char) in skipped_categories:
            continue

        # UTF-8 bytes misread as Latin-1: one layer of mojibake.
        garbled_once = char.encode('utf-8').decode('latin-1')

        # Skip characters whose re-encoding is protected by the
        # 'sequence_weirdness' metric.
        if sequence_weirdness(garbled_once) < 0:
            continue

        # Misread a second time to get two layers of mojibake.
        garbled_twice = garbled_once.encode('utf-8').decode('latin-1')

        for garbled in (garbled_once, garbled_twice):
            fixed, plan = fix_encoding_and_explain(garbled)
            assert fixed == char
            # Replaying the returned plan must give the same result.
            assert apply_plan(garbled, plan) == char
def test_unicode_11():
    # Unicode 11 added the mtavruli form of the Georgian script. Mtavruli
    # letters play a role similar to capitals: they can emphasize text or
    # set a headline.
    #
    # From version 3.7.0 on, Python's .upper() converts Georgian text to
    # this form. The result should be recognized as reasonable text on all
    # versions.
    #
    # The sample is the mtavruli form of "ქართული ენა", meaning "Georgian
    # language".
    headline = 'ᲥᲐᲠᲗᲣᲚᲘ ᲔᲜᲐ'
    assert sequence_weirdness(headline) == 0

    # Misread the UTF-8 bytes as Windows-1252 and check that the repair
    # restores the original text.
    utf8_bytes = headline.encode('utf-8')
    garbled = utf8_bytes.decode('sloppy-windows-1252')
    assert fix_encoding(garbled) == headline
def test_unicode_10():
    # The word "thalīṃ" written in the Zanabazar Square Script, which was
    # added in Unicode 10. Python 3.7 recognizes these characters as
    # assigned, so for consistency ftfy should recognize them on all
    # versions.
    zanabazar_word = "\U00011A1A\U00011A2C\U00011A01\U00011A38"
    weirdness = sequence_weirdness(zanabazar_word)
    assert weirdness == 0
def test_emoji_skintone_selector():
    # Skin-tone selectors following an emoji must no longer be counted as
    # weird by the heuristic: Santa Clauses of all colors are welcome.
    santas = '🎅🏿🎅🏽🎅🏼🎅🏻'
    assert sequence_weirdness(santas) == 0