def __contains__(self, key):
    # `key in language` tests whether that attribute is set on this object
    return key in self.ATTRIBUTES and getattr(self, key)

def __repr__(self):
    items = []
    for attr in self.ATTRIBUTES:
        if getattr(self, attr):
            items.append('{0}={1!r}'.format(attr, getattr(self, attr)))
    return "Language.make({})".format(', '.join(items))

def __str__(self):
    return self.to_tag()
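
# A quick illustration of these methods (hypothetical usage; it assumes
# 'region' is among Language.ATTRIBUTES in this version):
#
#     >>> lang = Language.make(language='en', region='US')
#     >>> 'region' in lang
#     True
#     >>> 'script' in lang
#     False
#     >>> repr(lang)
#     "Language.make(language='en', region='US')"
#     >>> str(lang)
#     'en-US'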
# Make the get(), find(), and find_name() functions available at the top level
get = Language.get
find = Language.find
find_name = Language.find_name
# Make the Language object available under the old name LanguageData
LanguageData = Language
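
# With the aliases above, the common operations read as plain functions
# (a sketch; outputs assume the bundled registry data):
#
#     >>> get('en-US')
#     Language.make(language='en', region='US')
#     >>> LanguageData.get('en-US')   # the old name keeps working
#     Language.make(language='en', region='US')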
def standardize_tag(tag: {str, Language}, macro: bool=False) -> str:
    """
    Standardize a language tag:

    - Replace deprecated values with their updated versions (if those exist)
    - Remove script tags that are redundant with the language
    - If *macro* is True, use a macrolanguage to represent the most common
      standardized language within that macrolanguage. For example, 'cmn'
      (Mandarin) becomes 'zh' (Chinese), and 'arb' (Modern Standard Arabic)
      becomes 'ar' (Arabic).
    """
    langdata = Language.get(tag, normalize=True)
    if macro:
        langdata = langdata.prefer_macrolanguage()
    return langdata.simplify_script().to_tag()
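
# Illustrative calls (a sketch; the expected outputs assume the bundled
# IANA registry data):
#
#     >>> standardize_tag('en-Latn')          # script is redundant for English
#     'en'
#     >>> standardize_tag('arb', macro=True)  # prefer the macrolanguage
#     'ar'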
def tag_match_score(desired: {str, Language}, supported: {str, Language}) -> int:
    """
    Return a number from 0 to 100 indicating how well the language tag
    *supported* matches the tag *desired*, based on CLDR's language
    matching data.

    Comparing Swiss German ('gsw') to standardized German ('de') shows how
    these scores can be asymmetrical. Swiss German speakers will understand
    German, so the score in that direction is 92. Most German speakers find
    Swiss German unintelligible, and CLDR in fact assigns this a score of 16.
    This seems a little bit extreme, but the asymmetry is certainly there. And
    if your text is tagged as 'gsw', it must be that way for a reason.

    >>> tag_match_score('gsw', 'de')
    92
    >>> tag_match_score('de', 'gsw')
    16
    """
    desired_ld = Language.get(desired)
    supported_ld = Language.get(supported)
    return desired_ld.match_score(supported_ld)
"""
# No matter what form of language we got, normalize it to a single
# language subtag
if isinstance(language, Language):
language = language.language
elif isinstance(language, str):
language = get(language).language
if language is None:
language = 'und'
code = name_to_code(tagtype, name, language)
if code is None:
raise LookupError("Can't find any %s named %r" % (tagtype, name))
if '-' in code:
return Language.get(code)
else:
data = {tagtype: code}
return Language.make(**data)
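
# Hypothetical lookups (a sketch; the exact results depend on the CLDR
# name data shipped with the library):
#
#     >>> find_name('language', 'French', 'en')
#     Language.make(language='fr')
#     >>> find_name('language', 'francés', 'es')
#     Language.make(language='fr')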
# Parsing logic from Language.get(): build up `data`, a dictionary of
# subtag values, from the components of the tag.
data = {}

# If the complete tag appears as something to normalize, do the
# normalization right away. Smash case when checking, because the
# case normalization that comes from parse_tag() hasn't been applied
# yet.
tag_lower = tag.lower()
if normalize and tag_lower in LANGUAGE_REPLACEMENTS:
    tag = LANGUAGE_REPLACEMENTS[tag_lower]

components = parse_tag(tag)

for typ, value in components:
    if typ == 'extlang' and normalize and 'language' in data:
        # smash extlangs when possible
        minitag = '%s-%s' % (data['language'], value)
        norm = LANGUAGE_REPLACEMENTS.get(minitag.lower())
        if norm is not None:
            data.update(
                Language.get(norm, normalize).to_dict()
            )
        else:
            data.setdefault('extlangs', []).append(value)
    elif typ in {'extlang', 'variant', 'extension'}:
        data.setdefault(typ + 's', []).append(value)
    elif typ == 'language':
        if value == 'und':
            pass
        elif normalize:
            replacement = LANGUAGE_REPLACEMENTS.get(value.lower())
            if replacement is not None:
                # parse the replacement if necessary -- this helps with
                # Serbian and Moldovan
                data.update(
                    Language.get(replacement, normalize).to_dict()
                )
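
# The normalization above is what turns deprecated or compound tags into
# their modern equivalents (a sketch; the specific replacements are an
# assumption about the registry data):
#
#     >>> Language.get('mo')        # Moldovan's replacement is a full tag
#     Language.make(language='ro', region='MD')
#     >>> Language.get('zh-cmn')    # the extlang is smashed
#     Language.make(language='cmn')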