Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def get_unit(item, text, lang="en_US"):
"""
Extract unit from regex hit.
"""
group_units = ["prefix", "unit1", "unit2", "unit3", "unit4"]
group_operators = ["operator1", "operator2", "operator3", "operator4"]
# How much of the end is removed because of an "incorrect" regex match
unit_shortening = 0
item_units = [item.group(i) for i in group_units if item.group(i)]
if len(item_units) == 0:
unit = load.units(lang).names["dimensionless"]
else:
derived, slash = [], False
multiplication_operator = False
for index in range(0, 5):
unit = item.group(group_units[index])
operator_index = None if index < 1 else group_operators[index - 1]
operator = None if index < 1 else item.group(operator_index)
# disallow spaces as operators in units expressed in their symbols
# Enforce consistency among multiplication and division operators
# Single exceptions are colloquial number abbreviations (5k miles)
if operator in reg.multiplication_operators(lang) or (
operator is None
and unit
and not (index == 1 and unit in reg.suffixes(lang))
):
def disambiguate_unit(unit, text, lang="en_US"):
"""
Resolve ambiguity between units with same names, symbols or abbreviations.
"""
new_unit = (
load.units(lang).symbols.get(unit)
or load.units(lang).surfaces.get(unit)
or load.units(lang).surfaces_lower.get(unit.lower())
or load.units(lang).symbols_lower.get(unit.lower())
)
if not new_unit:
raise KeyError('Could not find unit "%s" from "%s"' % (unit, text))
if len(new_unit) > 1:
transformed = classifier(lang).tfidf_model.transform([clean_text(text, lang)])
scores = classifier(lang).classifier.predict_proba(transformed).tolist()[0]
scores = zip(scores, classifier(lang).target_names)
# Filter for possible names
names = [i.name for i in new_unit]
scores = [i for i in scores if i[1] in names]
# Sort by rank
1: $
2: 20
3: /
4: h
5: None
6: None
7: None
8: None
9: None
10: None
"""
op_keys = sorted(list(operators(lang)), key=len, reverse=True)
unit_keys = sorted(
list(load.units(lang).surfaces.keys()) + list(load.units(lang).symbols.keys()),
key=len,
reverse=True,
)
symbol_keys = sorted(
list(load.units(lang).prefix_symbols.keys()), key=len, reverse=True
)
exponent = exponents_regex(lang).format(superscripts=unicode_superscript_regex())
all_ops = "|".join([r"{}".format(re.escape(i)) for i in op_keys])
all_units = "|".join([r"{}".format(re.escape(i)) for i in unit_keys])
all_symbols = "|".join([r"{}".format(re.escape(i)) for i in symbol_keys])
pattern = r"""
(?(?:%s)(?![a-zA-Z]))? # Currencies, mainly
and unit.original_dimensions[1]["surface"] in reg.suffixes(lang).keys()
):
suffix = unit.original_dimensions[1]["surface"]
# Only apply if at least last value is suffixed by k, M, etc
if re.search(r"\d{}\b".format(suffix), text):
values = [value * reg.suffixes(lang)[suffix] for value in values]
unit.original_dimensions = [
unit.original_dimensions[0]
] + unit.original_dimensions[2:]
dimension_change = True
elif unit.original_dimensions[0]["surface"] in reg.suffixes(lang).keys():
# k/M etc is only applied if non-symbolic surfaces of other units
# (because colloquial) or currency units
symbolic = all(
dim["surface"] in load.units(lang).names[dim["base"]].symbols
for dim in unit.original_dimensions[1:]
)
if not symbolic:
suffix = unit.original_dimensions[0]["surface"]
values = [value * reg.suffixes(lang)[suffix] for value in values]
unit.original_dimensions = unit.original_dimensions[1:]
dimension_change = True
# Usually "1990s" stands for the decade, not the amount of seconds
elif re.match(r"[1-2]\d\d0s", surface):
unit.original_dimensions = []
dimension_change = True
surface = surface[:-1]
span = (span[0], span[1] - 1)
_LOGGER.debug('\tCorrect for "1990s" pattern')
def build_common_words():
    """
    Group the entries of common-words.txt by word length.

    Every line of ``TOPDIR/common-words.txt`` that does not start with "#" is
    stripped of trailing whitespace. The stripped word and its
    ``load.pluralize`` form are each kept only when they appear neither in
    ``load.units(lang).surfaces_all`` nor in ``load.units(lang).symbols``.

    ``lang`` is not a parameter; it is looked up in the enclosing scope.

    :return: ``defaultdict(list)`` mapping a length to the kept words of that
        length, in the order they were encountered.
    """
    by_length = defaultdict(list)

    def keep_unless_unit(candidate):
        # Anything that doubles as a unit surface or symbol is left out
        clashes = (
            candidate in load.units(lang).surfaces_all
            or candidate in load.units(lang).symbols
        )
        if not clashes:
            by_length[len(candidate)].append(candidate)

    source = os.path.join(TOPDIR, "common-words.txt")
    with open(source, "r", encoding="utf-8") as handle:
        for raw in handle:
            # Lines starting with "#" are comments in the word list
            if raw.startswith("#"):
                continue
            word = raw.rstrip()
            # The word is filtered first, then its plural, each on its own
            keep_unless_unit(word)
            keep_unless_unit(load.pluralize(word))
    return by_length