# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_weighted_spans_word_stopwords():
    """Word analyzer with English stop words: stop-word tokens produce no
    highlighted spans; features not found in the text ('bias') are moved to
    ``other`` together with a summary "highlighted in text" pseudo-feature.

    NOTE(review): the whole module had its indentation stripped; this block
    restores the structure.  The second assertion expects a span for 'see'
    even though ``stop_words='english'`` is in effect — verify against the
    original test suite (it may be a mis-pasted copy of the no-stop-words
    test).
    """
    doc = 'I see: a leaning lemon tree'
    vec = CountVectorizer(analyzer='word', stop_words='english')
    vec.fit([doc])
    w_spans = get_weighted_spans(
        doc, vec,
        FeatureWeights(
            pos=[FW('see', 2), FW('lemon', 5), FW('bias', 8)],
            neg=[FW('tree', -6)]))
    assert w_spans == WeightedSpans(
        analyzer='word',
        document='i see: a leaning lemon tree',
        weighted_spans=[
            ('lemon', [(17, 22)], 5),
            ('tree', [(23, 27)], -6)],
        other=FeatureWeights(
            pos=[FW('bias', 8), FW('see', 2)],
            neg=[FW(hl_in_text, -1)],
        ))
    # Second pass: same fitted vectorizer, different weights plus a
    # neg_remaining count that must be carried through to ``other``.
    vec.fit([doc])
    w_spans = get_weighted_spans(
        doc, vec,
        FeatureWeights(
            pos=[FW('see', 2), FW('lemon', 4), FW('bias', 8)],
            neg=[FW('tree', -6)],
            neg_remaining=10
        ))
    assert w_spans == WeightedSpans(
        analyzer='word',
        document='i see: a leaning lemon tree',
        weighted_spans=[
            ('see', [(2, 5)], 2),
            ('lemon', [(17, 22)], 4),
            ('tree', [(23, 27)], -6)],
        other=FeatureWeights(
            pos=[FW('bias', 8), FW(hl_in_text, 0)],
            neg=[],
            neg_remaining=10,
        ))
def test_no_weighted_spans():
    """Empty feature weights yield a WeightedSpans with no highlighted spans.

    NOTE(review): indentation of this block was stripped in the source;
    restored here so the module parses.
    """
    doc = 'I see: a leaning lemon tree'
    vec = CountVectorizer(analyzer='char', ngram_range=(3, 4))
    vec.fit([doc])
    w_spans = get_weighted_spans(doc, vec, FeatureWeights(pos=[], neg=[]))
    assert w_spans == WeightedSpans(
        analyzer='char',
        document='i see: a leaning lemon tree',
        weighted_spans=[],
        other=FeatureWeights(pos=[], neg=[]))
def test_weighted_spans_word():
    """Word analyzer without stop words: every feature present in the text
    gets a (start, end) span; 'bias' is absent from the text and therefore
    reported in ``other`` along with the zero-sum highlight pseudo-feature
    and the untouched ``neg_remaining`` count.

    NOTE(review): indentation of this block was stripped in the source;
    restored here so the module parses.
    """
    doc = 'I see: a leaning lemon tree'
    vec = CountVectorizer(analyzer='word')
    vec.fit([doc])
    w_spans = get_weighted_spans(
        doc, vec,
        FeatureWeights(
            pos=[FW('see', 2), FW('lemon', 4), FW('bias', 8)],
            neg=[FW('tree', -6)],
            neg_remaining=10
        ))
    assert w_spans == WeightedSpans(
        analyzer='word',
        document='i see: a leaning lemon tree',
        weighted_spans=[
            ('see', [(2, 5)], 2),
            ('lemon', [(17, 22)], 4),
            ('tree', [(23, 27)], -6)],
        other=FeatureWeights(
            pos=[FW('bias', 8), FW(hl_in_text, 0)],
            neg=[],
            neg_remaining=10,
        ))
# NOTE(review): truncated fragment — the opening of this test (its ``def``
# line, the doc/vectorizer setup, and the start of the
# ``FeatureWeights(pos=[`` call) is missing from this chunk.  It appears to
# exercise char-ngram highlighting with dict-valued (multi-name, signed)
# features, but that cannot be confirmed from what is visible.  Do not edit
# in isolation; recover the missing head from the original file first.
FW([{'name': 'foo', 'sign': 1}, {'name': 'see', 'sign': -1}], 2),
FW([{'name': 'zoo', 'sign': 1}, {'name': 'bar', 'sign': 1}], 3),
],
neg=[
FW([{'name': 'ree', 'sign': 1}, {'name': 'tre', 'sign': 1}], -4),
],
))
assert w_spans == WeightedSpans(
analyzer='char',
document='i see: a leaning lemon tree',
weighted_spans=[
('see', [(2, 5)], 2),
('tre', [(23, 26)], -4),
('ree', [(24, 27)], -4),
],
other=FeatureWeights(
pos=[
FW([{'name': 'zoo', 'sign': 1}, {'name': 'bar', 'sign': 1}], 3),
],
neg=[FW(hl_in_text, -2)],
))
def test_targets_with_value():
    """format_as_dataframe on a multi-target Explanation produces a frame
    with ``weight`` and ``value`` columns indexed by a (target, feature)
    MultiIndex, one row per feature weight.

    NOTE(review): indentation of this block was stripped in the source;
    restored here so the module parses.
    """
    expl = Explanation(
        estimator='some estimator',
        targets=[
            TargetExplanation(
                'y', feature_weights=FeatureWeights(
                    pos=[FeatureWeight('a', 13, value=1),
                         FeatureWeight('b', 5, value=2)],
                    neg=[FeatureWeight('neg1', -10, value=3),
                         FeatureWeight('neg2', -1, value=4)],
                )),
            TargetExplanation(
                'y2', feature_weights=FeatureWeights(
                    pos=[FeatureWeight('f', 1, value=5)],
                    neg=[],
                )),
        ],
    )
    df = format_as_dataframe(expl)
    expected_df = pd.DataFrame(
        {'weight': [13, 5, -1, -10, 1],
         'value': [1, 2, 4, 3, 5]},
        columns=['weight', 'value'],
        index=pd.MultiIndex.from_tuples(
            [('y', 'a'), ('y', 'b'), ('y', 'neg2'), ('y', 'neg1'),
             ('y2', 'f')], names=['target', 'feature']))
    print(df, expected_df, sep='\n')
    assert expected_df.equals(df)
# NOTE(review): truncated fragment — this is the tail of a helper whose
# ``def`` line, enclosing loop over groups, and the definitions of
# ``all_found_features`` / ``accounted_keys`` / ``other_items`` /
# ``named_found_features`` are missing from this chunk.  From what is
# visible it collects feature weights that were NOT highlighted in the
# text, appends per-vectorizer "Highlighted in text (sum)" summary rows,
# and returns them as a FeatureWeights sorted by |weight|.  Do not edit in
# isolation; recover the missing head first.
for idx, fw in enumerate(getattr(feature_weights, group)):
key = (group, idx)
if key not in all_found_features and key not in accounted_keys:
other_items.append(fw)
accounted_keys.add(key)
for vec_name, found_features in named_found_features:
if found_features:
other_items.append(FeatureWeight(
feature=FormattedFeatureName(
'{}Highlighted in text (sum)'.format(
'{}: '.format(vec_name) if vec_name else '')),
weight=sum(found_features.values())))
other_items.sort(key=lambda x: abs(x.weight), reverse=True)
return FeatureWeights(
pos=[fw for fw in other_items if fw.weight >= 0],
neg=[fw for fw in other_items if fw.weight < 0],
pos_remaining=feature_weights.pos_remaining,
neg_remaining=feature_weights.neg_remaining,
)
# NOTE(review): truncated fragment — starts mid-call (a ``dictionary=...``
# keyword argument of an EnrichedFeatureWeight constructor whose opening,
# along with the enclosing ``if`` branch and the loop producing
# ``new_feature_weight`` / ``new_features``, is missing from this chunk.
# The visible tail rebuilds a FeatureWeights from the enriched features
# (pos sorted by |weight| descending, neg ascending) and installs it on
# ``explanation.targets[0]``.  Do not edit in isolation; recover the
# missing head first.
dictionary=new_feature_weight['dictionary']
)
else:
obj = EnrichedFeatureWeight(
feature=new_feature_weight['feature'],
weight=new_feature_weight['weight'],
score=new_feature_weight['score'],
std=new_feature_weight['std'],
value=new_feature_weight['value'],
formatted_value=new_feature_weight['formatted_value'],
dictionary=new_feature_weight['dictionary']
)
new_features.append(obj)
# To build a FeatureWeights object, we then need to sort features by weight and separate positives and negatives:
feature_weights = FeatureWeights(
pos=sorted([f for f in new_features if f.weight >= 0], key=(lambda o: abs(o.weight)), reverse=True),
neg=sorted([f for f in new_features if f.weight < 0], key=(lambda o: abs(o.weight)), reverse=False)
)
explanation.targets[0].feature_weights = feature_weights
return explanation
# NOTE(review): truncated at the end of this chunk — the
# ``FeatureWeights(...)`` call below continues past the last visible line
# (its closing paren and any further keyword arguments are not shown).
# Indentation was also stripped from this block; restore it from the
# original file before editing.
def get_top_features(feature_names, coef, top, x=None):
pos, neg = _get_top_features(feature_names, coef, top, x)
pos_coef = coef > 0
neg_coef = coef < 0
# pos_sum = sum(w for name, w in pos or [['', 0]])
# neg_sum = sum(w for name, w in neg or [['', 0]])
return FeatureWeights(
pos=pos,
neg=neg,
# remaining = total count of positive/negative coefficients minus the
# number actually returned in the top lists
pos_remaining=pos_coef.sum() - len(pos),
neg_remaining=neg_coef.sum() - len(neg),
# pos_remaining_sum=coef[pos_coef].sum() - pos_sum,