data.update(
    Language.get(norm, normalize).to_dict()
)
else:
data.setdefault('extlangs', []).append(value)
elif typ in {'extlang', 'variant', 'extension'}:
data.setdefault(typ + 's', []).append(value)
elif typ == 'language':
if value == 'und':
pass
elif normalize:
replacement = LANGUAGE_REPLACEMENTS.get(value.lower())
if replacement is not None:
# parse the replacement if necessary -- this helps with
# Serbian and Moldovan
data.update(
Language.get(replacement, normalize).to_dict()
)
else:
data['language'] = value
else:
data['language'] = value
elif typ == 'region':
if normalize:
data['region'] = REGION_REPLACEMENTS.get(value.lower(), value)
else:
data['region'] = value
elif typ == 'grandfathered':
# If we got here, we got a grandfathered tag but we were asked
# not to normalize it, or the CLDR data doesn't know how to
# normalize it. The best we can do is set the entire tag as the
# language.
data['language'] = value
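# A minimal sketch of the effect of the normalization branches above, assuming
# this module is importable as the released 'langcodes' package. 'iw' is a
# deprecated code for Hebrew and 'sh-QU' is the legacy tag discussed in the
# Language.get() docstring; the exact replacements come from the
# LANGUAGE_REPLACEMENTS and REGION_REPLACEMENTS tables, so results are printed
# rather than asserted here.
from langcodes import Language

for raw in ['iw', 'sh-QU']:
    print(raw, '->', Language.get(raw, normalize=True).to_tag())
    print(raw, '->', Language.get(raw, normalize=False).to_tag(), '(unnormalized)')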
>>> standardize_tag('ja-latn-hepburn')
'ja-Latn-hepburn'
>>> standardize_tag('spa-latn-mx')
'es-MX'
If the tag can't be parsed according to BCP 47, this will raise a
LanguageTagError (a subclass of ValueError):
>>> standardize_tag('spa-mx-latn')
Traceback (most recent call last):
...
langcodes.tag_parser.LanguageTagError: This script subtag, 'latn', is out of place. Expected variant, extension, or end of string.
"""
langdata = Language.get(tag, normalize=True)
if macro:
langdata = langdata.prefer_macrolanguage()
return langdata.simplify_script().to_tag()
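# Example usage of standardize_tag, as a sketch. The first two expected values
# are taken from the doctests above; the macro=True result depends on the CLDR
# macrolanguage data, so it is printed rather than asserted.
from langcodes import standardize_tag

assert standardize_tag('ja-latn-hepburn') == 'ja-Latn-hepburn'
assert standardize_tag('spa-latn-mx') == 'es-MX'
print(standardize_tag('cmn-hans-cn', macro=True))  # Mandarin folded into its macrolanguage 'zh'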
>>> str(Language.get('und-Arab').maximize())
'ar-Arab-EG'
>>> str(Language.get('und-CH').maximize())
'de-Latn-CH'
>>> str(Language.make().maximize()) # 'MURICA.
'en-Latn-US'
>>> str(Language.get('und-ibe').maximize())
'en-ibe-Latn-US'
"""
if self._filled is not None:
return self._filled
for broader in self.broaden():
tag = broader.to_tag()
if tag in LIKELY_SUBTAGS:
result = Language.get(LIKELY_SUBTAGS[tag], normalize=False)
result = result.update(self)
self._filled = result
return result
raise RuntimeError(
"Couldn't fill in likely values. This represents a problem with "
"the LIKELY_SUBTAGS data."
def __repr__(self):
    items = []
    for attr in self.ATTRIBUTES:
        if getattr(self, attr):
            items.append('{0}={1!r}'.format(attr, getattr(self, attr)))
    return "Language.make({})".format(', '.join(items))
def __str__(self):
return self.to_tag()
# Make the get(), find(), and find_name() functions available at the top level
get = Language.get
find = Language.find
find_name = Language.find_name
# Make the Language object available under the old name LanguageData
LanguageData = Language
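# The aliases above expose the main entry points at package level. A small
# sketch of using them directly (assuming these names are importable from the
# installed 'langcodes' package); the expected values mirror doctests elsewhere
# in this module.
import langcodes

assert langcodes.get('sh-QU').to_tag() == 'sr-Latn-EU'
assert str(langcodes.get('und-Arab').maximize()) == 'ar-Arab-EG'
assert langcodes.LanguageData is langcodes.Language  # backward-compatible name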
def standardize_tag(tag: {str, Language}, macro: bool=False) -> str:
"""
Standardize a language tag:
- Replace deprecated values with their updated versions (if those exist)
- Remove script tags that are redundant with the language
- If *macro* is True, use a macrolanguage to represent the most common
standardized language within that macrolanguage. For example, 'cmn'
(Mandarin) becomes 'zh' (Chinese), and 'arb' (Modern Standard Arabic)
becomes 'ar' (Arabic).
- Format the result according to the conventions of BCP 47
Macrolanguage replacement is not required by BCP 47, but it is required
by the Unicode CLDR.
def _get_name(self, attribute: str, language, min_score: int):
assert attribute in self.ATTRIBUTES
if isinstance(language, Language):
language = language.to_tag()
attr_value = getattr(self, attribute)
if attr_value is None:
return None
names = code_to_names(attribute, attr_value)
names['und'] = getattr(self, attribute)
return self._best_name(names, language, min_score)
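# _get_name() is the private helper behind the public name-lookup methods
# (language_name(), region_name(), and so on in released langcodes). A sketch
# of the behavior it supports; the exact strings come from the CLDR name data,
# so they are printed rather than asserted.
from langcodes import Language

lang = Language.get('fr-CA')
print(lang.language_name('en'))  # e.g. 'French'
print(lang.region_name('en'))    # e.g. 'Canada'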
def update(self, other: 'Language') -> 'Language':
"""
Update this Language with the fields of another Language.
"""
return Language.make(
language=other.language or self.language,
extlangs=other.extlangs or self.extlangs,
script=other.script or self.script,
region=other.region or self.region,
variants=other.variants or self.variants,
extensions=other.extensions or self.extensions,
private=other.private or self.private
)
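# update() overlays another Language's fields onto this one, preferring the
# other object's values wherever both are set. A minimal sketch:
from langcodes import Language

base = Language.make(language='sr', script='Latn')
combined = base.update(Language.make(region='RS'))
assert combined.to_tag() == 'sr-Latn-RS'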
def __contains__(self, key):
    return key in self.ATTRIBUTES and getattr(self, key)
>>> Language.find('brazilian portuguese')
Language.make(language='pt', region='BR')
>>> Language.find('simplified chinese')
Language.make(language='zh', script='Hans')
Some language names are ambiguous: for example, there is a language
named 'Fala' in English (with code 'fax'), but 'Fala' is also the
Kwasio word for French. In this case, specifying the language that
the name is in is necessary for disambiguation.
>>> Language.find('fala')
Language.make(language='fr')
>>> Language.find('fala', 'en')
Language.make(language='fax')
"""
return Language.find_name('language', name, language)
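# find() resolves a human-readable language name by delegating to find_name()
# restricted to the 'language' attribute. A sketch mirroring the doctests
# above; the 'fala' case shows how the optional language argument
# disambiguates the name.
from langcodes import Language

assert Language.find('brazilian portuguese').to_tag() == 'pt-BR'
assert Language.find('simplified chinese').to_tag() == 'zh-Hans'
assert Language.find('fala', 'en').to_tag() == 'fax'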
The language tag 'sh' (Serbo-Croatian) ended up being politically
problematic, and different standards took different steps to address
this. The IANA made it into a macrolanguage that contains 'sr', 'hr',
and 'bs'. Unicode further decided that it's a legacy tag that should
be interpreted as 'sr-Latn', which the language matching rules say
is mutually intelligible with all those languages.
We complicate the example by adding on the region tag 'QU', an old
provisional tag for the European Union, which is now standardized as
'EU'.
>>> Language.get('sh-QU')
Language.make(language='sr', script='Latn', region='EU')
"""
if isinstance(tag, Language):
if not normalize:
# shortcut: we have the tag already
return tag
# We might need to normalize this tag. Convert it back into a
# string tag, to cover all the edge cases of normalization in a
# way that we've already solved.
tag = tag.to_tag()
if (tag, normalize) in Language._PARSE_CACHE:
return Language._PARSE_CACHE[tag, normalize]
data = {}
# if the complete tag appears as something to normalize, do the
# normalization right away. Smash case when checking, because the
# case normalization that comes from parse_tag() hasn't been applied