Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
unit = l.NAMES[item['unit']]
except KeyError:
try:
entity = item['entity']
except KeyError:
print(('Could not find %s, provide "derived" and'
' "entity"' % item['unit']))
return
if entity == 'unknown':
derived = [{
'base': l.NAMES[i['base']].entity.name,
'power': i['power']
} for i in item['dimensions']]
entity = c.Entity(name='unknown', dimensions=derived)
elif entity in l.ENTITIES:
entity = l.ENTITIES[entity]
else:
print(('Could not find %s, provide "derived" and'
' "entity"' % item['unit']))
return
unit = c.Unit(
name=item['unit'],
dimensions=item['dimensions'],
entity=entity)
try:
span = next(
re.finditer(re.escape(item['surface']),
test['req'])).span()
except StopIteration:
print('Surface mismatch for "%s"' % test['req'])
return
uncert = None
print(('Could not find %s, provide "derived" and'
' "entity"' % item['unit']))
return
if entity == 'unknown':
derived = [{
'base': l.NAMES[i['base']].entity.name,
'power': i['power']
} for i in item['dimensions']]
entity = c.Entity(name='unknown', dimensions=derived)
elif entity in l.ENTITIES:
entity = l.ENTITIES[entity]
else:
print(('Could not find %s, provide "derived" and'
' "entity"' % item['unit']))
return
unit = c.Unit(
name=item['unit'],
dimensions=item['dimensions'],
entity=entity)
try:
span = next(
re.finditer(re.escape(item['surface']),
test['req'])).span()
except StopIteration:
print('Surface mismatch for "%s"' % test['req'])
return
uncert = None
if 'uncertainty' in item:
uncert = item['uncertainty']
res.append(
c.Quantity(
value=item['value'],
for item in test['res']:
try:
unit = l.NAMES[item['unit']]
except KeyError:
try:
entity = item['entity']
except KeyError:
print(('Could not find %s, provide "derived" and'
' "entity"' % item['unit']))
return
if entity == 'unknown':
derived = [{
'base': l.NAMES[i['base']].entity.name,
'power': i['power']
} for i in item['dimensions']]
entity = c.Entity(name='unknown', dimensions=derived)
elif entity in l.ENTITIES:
entity = l.ENTITIES[entity]
else:
print(('Could not find %s, provide "derived" and'
' "entity"' % item['unit']))
return
unit = c.Unit(
name=item['unit'],
dimensions=item['dimensions'],
entity=entity)
try:
span = next(
re.finditer(re.escape(item['surface']),
test['req'])).span()
except StopIteration:
print('Surface mismatch for "%s"' % test['req'])
unit = c.Unit(
name=item['unit'],
dimensions=item['dimensions'],
entity=entity)
try:
span = next(
re.finditer(re.escape(item['surface']),
test['req'])).span()
except StopIteration:
print('Surface mismatch for "%s"' % test['req'])
return
uncert = None
if 'uncertainty' in item:
uncert = item['uncertainty']
res.append(
c.Quantity(
value=item['value'],
unit=unit,
surface=item['surface'],
span=span,
uncertainty=uncert))
test['res'] = [i for i in res]
return tests
def test_parse_classifier(self):
    """
    Parse every loaded test request with the classifier switched on and
    compare the parsed quantities with the expected ones.
    """
    all_tests = load_tests(False) + load_tests(True)
    # forcibly activate the classifier
    clf.USE_CLF = True
    # shortest requests are checked first
    ordered_cases = sorted(all_tests, key=lambda case: len(case['req']))
    for case in ordered_cases:
        expected = case['res']
        parsed = p.parse(case['req'])
        # Failure message: attribute dicts of both sides
        failure_msg = "{} \n {}".format(
            [quant.__dict__ for quant in parsed],
            [quant.__dict__ for quant in expected])
        self.assertEqual(parsed, expected, failure_msg)
def test_training(self):
    """
    Run classifier training twice: first with ``False``, then with ``True``.
    """
    # TODO - update test to not overwrite existing clf.pickle and wiki.json files.
    for flag in (False, True):
        clf.train_classifier(flag)
def get_unit(item, text, lang="en_US"):
"""
Extract unit from regex hit.

:param item: regex hit; read through ``item.group(<name>)`` for the named
groups "prefix", "unit1".."unit4" and "operator1".."operator4"
:param text: not used in the visible part of this function
:param lang: language code passed to ``load.units`` and the ``reg`` helpers

NOTE(review): this excerpt is truncated inside the loop below; the return
value is not visible here.
"""
# Names of the regex groups that may hold a unit part / an operator
group_units = ["prefix", "unit1", "unit2", "unit3", "unit4"]
group_operators = ["operator1", "operator2", "operator3", "operator4"]
# How much of the end is removed because of an "incorrect" regex match
unit_shortening = 0
# Keep only the unit groups that actually matched something
item_units = [item.group(i) for i in group_units if item.group(i)]
if len(item_units) == 0:
# No unit group matched: fall back to the "dimensionless" unit
unit = load.units(lang).names["dimensionless"]
else:
derived, slash = [], False
multiplication_operator = False
# Walk the five unit groups in order
for index in range(0, 5):
unit = item.group(group_units[index])
# The first group (index 0) has no preceding operator; group N >= 1 is
# paired with operator group N - 1
operator_index = None if index < 1 else group_operators[index - 1]
operator = None if index < 1 else item.group(operator_index)
# disallow spaces as operators in units expressed in their symbols
# Enforce consistency among multiplication and division operators
# Single exceptions are colloquial number abbreviations (5k miles)
# True when the operator is a multiplication operator, or when there is no
# operator but a unit is present - except for index 1 with a unit listed in
# reg.suffixes(lang)
# NOTE(review): the body of this branch is cut off in this excerpt
if operator in reg.multiplication_operators(lang) or (
operator is None
and unit
and not (index == 1 and unit in reg.suffixes(lang))
):
def disambiguate_unit(unit, text, lang="en_US"):
"""
Resolve ambiguity between units with same names, symbols or abbreviations.

:param unit: unit string to look up
:param text: text handed to ``clean_text`` for scoring; also quoted in the
error message
:param lang: language code passed to ``load.units``, ``classifier`` and
``clean_text``
:raises KeyError: if none of the lookup tables yields candidates for ``unit``

NOTE(review): this excerpt is truncated after the filtering step; the
ranking and the return value are not visible here.
"""
# Try the tables in order; the first lookup with a truthy result wins.
# Exact-case tables are tried before the lower-cased ones.
new_unit = (
load.units(lang).symbols.get(unit)
or load.units(lang).surfaces.get(unit)
or load.units(lang).surfaces_lower.get(unit.lower())
or load.units(lang).symbols_lower.get(unit.lower())
)
if not new_unit:
raise KeyError('Could not find unit "%s" from "%s"' % (unit, text))
# More than one candidate: score the cleaned text with the classifier
if len(new_unit) > 1:
transformed = classifier(lang).tfidf_model.transform([clean_text(text, lang)])
# Probabilities for the single transformed text (first row)
scores = classifier(lang).classifier.predict_proba(transformed).tolist()[0]
# Pair every probability with its target name: (score, name)
scores = zip(scores, classifier(lang).target_names)
# Filter for possible names
names = [i.name for i in new_unit]
scores = [i for i in scores if i[1] in names]
# Sort by rank
def disambiguate_unit(unit_surface, text, lang="en_US"):
    """
    Resolve ambiguity between units with same names, symbols or abbreviations.

    :param unit_surface: (str) unit string to look up
    :param text: text forwarded to the disambiguation routines
    :param lang: language code forwarded to ``load.units`` and the
        disambiguation routines
    :returns (str) unit name of the resolved unit, "unk" if nothing matches
    """
    if clf.USE_CLF:
        return clf.disambiguate_unit(unit_surface, text, lang).name

    units = load.units(lang)
    lowered = unit_surface.lower()
    # Read the tables with .get() instead of subscripting: a miss must fall
    # through to the next table (and finally to "unk") rather than raise
    # KeyError on a plain dict or insert an empty entry into a defaultdict.
    # Exact-case tables are tried before the lower-cased ones.
    candidates = (
        units.symbols.get(unit_surface)
        or units.surfaces.get(unit_surface)
        or units.surfaces_lower.get(lowered)
        or units.symbols_lower.get(lowered)
        or ()
    )

    if len(candidates) > 1:
        # Several units share this surface: pick one without the classifier
        base = no_clf.disambiguate_no_classifier(candidates, text, lang)
    elif len(candidates) == 1:
        base = next(iter(candidates))
    else:
        base = None

    # The disambiguation may come back empty as well, hence the final check
    return base.name if base else "unk"
def disambiguate_no_classifier(entities, text, lang="en_US"):
"""
Disambiguate the entity or unit without a classifier
:param entities:
:param text:
:param lang:
:return: a single entity or unit that has been chosen for
"""
word_sets = load.training_set(lang)
max_entity, max_count, max_relative = None, 0, 0
for entity in entities:
count = 0
total = 0
for word_set in word_sets:
if word_set["unit"] == entity.name:
total += len(word_set["text"])
for word in word_set["text"].split(" "):
count += 1 if word in text else 0
try:
relative = count / total
except ZeroDivisionError:
relative = 0
if relative > max_relative or (relative == max_relative and count > max_count):
max_entity, max_count, max_relative = entity, count, relative