Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
Parse the syntax of a language tag, without looking up anything in the
registry, yet. Returns a list of (type, value) tuples indicating what
information will need to be looked up.
"""
tag = normalize_characters(tag)
if tag in EXCEPTIONS:
return [('grandfathered', tag)]
else:
# The first subtag is always either the language code, or 'x' to mark
# the entire tag as private-use. Other subtags are distinguished
# by their length and format, but the language code is distinguished
# entirely by the fact that it is required to come first.
subtags = tag.split('-')
if subtags[0] == 'x':
if len(subtags) == 1:
raise LanguageTagError("'x' is not a language tag on its own")
else:
# the entire language tag is private use, but we know that,
# whatever it is, it fills the "language" slot
return [('language', tag)]
elif len(subtags[0]) >= 2:
return [('language', subtags[0])] + parse_subtags(subtags[1:])
else:
subtag_error(subtags[0], 'a language code')
def order_error(subtag, got, expected):
    """
    Raise a LanguageTagError reporting that `subtag` is out of order.

    `got` and `expected` are both used as indices into SUBTAG_TYPES:
    `SUBTAG_TYPES[got]` names the kind of subtag that was found, and every
    entry of `SUBTAG_TYPES[expected:]` is listed in the message as something
    that would have been acceptable at this position.

    This function never returns normally; it always raises.
    """
    acceptable = SUBTAG_TYPES[expected:]
    count = len(acceptable)

    # Spell out the acceptable subtag types as an English list.
    if count == 1:
        expect_str = acceptable[0]
    elif count == 2:
        first, second = acceptable
        expect_str = '%s or %s' % (first, second)
    else:
        # Comma-separated list with ", or" in front of the final item.
        leading = ', '.join(acceptable[:-1])
        expect_str = '%s, or %s' % (leading, acceptable[-1])

    message = (
        "This %s subtag, %r, is out of place. "
        "Expected %s." % (SUBTAG_TYPES[got], subtag, expect_str)
    )
    raise LanguageTagError(message)
def subtag_error(subtag, expected='a valid subtag'):
    """
    Raise a LanguageTagError saying that `subtag` was found where something
    else was expected.

    `expected` is interpolated into the message as-is, so it should be a
    short English description of what would have been valid at this point
    (by default, 'a valid subtag'). The offending subtag is shown using its
    repr.

    This function never returns normally; it always raises.
    """
    message = "Expected %s, got %r" % (expected, subtag)
    raise LanguageTagError(message)
def parse_extension(subtags):
"""
An extension tag consists of a 'singleton' -- a one-character subtag --
followed by other subtags. Extension tags are in the BCP 47 syntax, but
their meaning is outside the scope of the standard.
For example, there's the u- extension, which is used for setting Unicode
properties in some context I'm not aware of.
If the singleton is 'x', it's a private use extension, and consumes the
rest of the tag. Otherwise, it stops at the next singleton.
"""
subtag = subtags[0]
if len(subtags) == 1:
raise LanguageTagError(
"The subtag %r must be followed by something" % subtag
)
if subtag == 'x':
# Private use. Everything after this is arbitrary codes that we
# can't look up.
return [('private', '-'.join(subtags))]
else:
# Look for the next singleton, if there is one.
boundary = 1
while boundary < len(subtags) and len(subtags[boundary]) != 1:
boundary += 1
# We've parsed a complete extension subtag. Return to the main
# parse_subtags function, but expect to find nothing but more