# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
vec.fit([doc])
w_spans = get_weighted_spans(
doc, vec,
FeatureWeights(
pos=[FW('see', 2), FW('leaning lemon', 5), FW('lemon tree', 8)],
neg=[FW('tree', -6)]))
assert w_spans == WeightedSpans(
analyzer='word',
document='i see: a leaning lemon tree',
weighted_spans=[
('see', [(2, 5)], 2),
('tree', [(23, 27)], -6),
('leaning lemon', [(9, 16), (17, 22)], 5),
('lemon tree', [(17, 22), (23, 27)], 8)],
other=FeatureWeights(
pos=[FW(hl_in_text, 9)],
neg=[],
))
def test_weighted_spans_word():
    """Word analyzer: matched features get character spans; the unmatched
    'bias' feature and the ``neg_remaining`` mass stay in ``other``.
    """
    text = 'I see: a leaning lemon tree'
    vectorizer = CountVectorizer(analyzer='word')
    vectorizer.fit([text])
    input_weights = FeatureWeights(
        pos=[FW('see', 2), FW('lemon', 4), FW('bias', 8)],
        neg=[FW('tree', -6)],
        neg_remaining=10,
    )
    result = get_weighted_spans(text, vectorizer, input_weights)
    expected = WeightedSpans(
        analyzer='word',
        document='i see: a leaning lemon tree',
        weighted_spans=[
            ('see', [(2, 5)], 2),
            ('lemon', [(17, 22)], 4),
            ('tree', [(23, 27)], -6)],
        other=FeatureWeights(
            pos=[FW('bias', 8), FW(hl_in_text, 0)],
            neg=[],
            neg_remaining=10,
        ))
    assert result == expected
def test_unhashed_features_other():
""" Check that when there are several candidates, they do not appear in "other"
if at least one is found. If none are found, they should appear in "other"
together.
"""
doc = 'I see: a leaning lemon tree'
vec = CountVectorizer(analyzer='char', ngram_range=(3, 3))
vec.fit([doc])
w_spans = get_weighted_spans(
doc, vec,
FeatureWeights(
pos=[
FW([{'name': 'foo', 'sign': 1}, {'name': 'see', 'sign': -1}], 2),
FW([{'name': 'zoo', 'sign': 1}, {'name': 'bar', 'sign': 1}], 3),
],
neg=[
FW([{'name': 'ree', 'sign': 1}, {'name': 'tre', 'sign': 1}], -4),
],
))
assert w_spans == WeightedSpans(
analyzer='char',
document='i see: a leaning lemon tree',
weighted_spans=[
('see', [(2, 5)], 2),
('tre', [(23, 26)], -4),
('ree', [(24, 27)], -4),
],
other=FeatureWeights(
pos=[
doc = 'I see: a leaning lemon tree'
vec = CountVectorizer(analyzer='word', stop_words='english')
vec.fit([doc])
w_spans = get_weighted_spans(
doc, vec,
FeatureWeights(
pos=[FW('see', 2), FW('lemon', 5), FW('bias', 8)],
neg=[FW('tree', -6)]))
assert w_spans == WeightedSpans(
analyzer='word',
document='i see: a leaning lemon tree',
weighted_spans=[
('lemon', [(17, 22)], 5),
('tree', [(23, 27)], -6)],
other=FeatureWeights(
pos=[FW('bias', 8), FW('see', 2)],
neg=[FW(hl_in_text, -1)],
))
def test_explain_linear_dense():
    """DictVectorizer + dense matrix: explaining the raw dict and the
    pre-vectorized row must produce identical explanations.
    """
    data = [{'day': 'mon', 'moon': 'full'},
            {'day': 'tue', 'moon': 'rising'},
            {'day': 'tue', 'moon': 'rising'},
            {'day': 'mon', 'moon': 'rising'}]
    vec = DictVectorizer(sparse=False)
    X = vec.fit_transform(data)
    clf = LogisticRegression(random_state=42)
    clf.fit(X, [0, 1, 1, 0])
    test_day = {'day': 'tue', 'moon': 'full'}
    target_names = ['sunny', 'shady']
    res1 = explain_prediction(clf, test_day, vec=vec, target_names=target_names)
    # Both text and HTML renderings must mention the active feature.
    for formatted in format_as_all(res1, clf):
        assert 'day=tue' in formatted
    # Same explanation when passing the already-vectorized row.
    [test_day_vec] = vec.transform(test_day)
    res2 = explain_prediction(
        clf, test_day_vec, target_names=target_names,
        vectorized=True, feature_names=vec.get_feature_names())
    assert res1 == res2
def test_explain_prediction_clf_multitarget(newsgroups_train):
    """LightGBM multiclass prediction explanation: per-target feature
    weights are populated and ``top_targets`` keeps the most probable ones.
    """
    docs, ys, target_names = newsgroups_train
    vec = CountVectorizer(stop_words='english', dtype=np.float64)
    xs = vec.fit_transform(docs)
    clf = LGBMClassifier(n_estimators=100, max_depth=2,
                         min_child_samples=1, min_child_weight=1)
    clf.fit(xs, ys)
    doc = 'computer graphics in space: a new religion'
    res = explain_prediction(clf, doc, vec=vec, target_names=target_names)
    format_as_all(res, clf)
    check_targets_scores(res)
    # Each expected token should show up among the positive weights of the
    # corresponding target.
    for target_idx, token in [(1, 'computer'), (3, 'religion')]:
        target_weights = res.targets[target_idx].feature_weights
        assert token in get_all_features(target_weights.pos)
    # top_targets=2 must retain exactly the two highest-probability targets.
    top_target_res = explain_prediction(clf, doc, vec=vec, top_targets=2)
    assert len(top_target_res.targets) == 2
    assert sorted(t.proba for t in top_target_res.targets) == sorted(
        t.proba for t in res.targets)[-2:]
def test_explain_regression_hashing_vectorizer(newsgroups_train_binary):
    """Explain a LinearRegression prediction on HashingVectorizer features.

    NOTE(review): this block looks truncated in this view — ``count_vec``
    and ``count_clf`` are set up for the CountVectorizer comparison
    announced below but never used in the visible lines; confirm against
    the full file.
    """
    docs, y, target_names = newsgroups_train_binary
    vec = HashingVectorizer(norm=None)
    clf = LinearRegression()
    clf.fit(vec.fit_transform(docs), y)
    # Setting large "top" in order to compare it with CountVectorizer below
    # (due to small differences in the coefficients they might have cutoffs
    # at different points).
    res = explain_prediction(
        clf, docs[0], vec=vec, target_names=[target_names[1]], top=1000)
    expl, _ = format_as_all(res, clf)
    # Regression explanations carry a single target.
    assert len(res.targets) == 1
    e = res.targets[0]
    assert e.target == 'comp.graphics'
    # Known negatively-weighted tokens must be present in the explanation.
    neg = get_all_features(e.feature_weights.neg)
    assert 'objective' in neg
    assert 'that' in neg
    assert 'comp.graphics' in expl
    assert 'objective' in expl
    assert 'that' in expl
    # HashingVectorizer with norm=None is "the same" as CountVectorizer,
    # so we can compare it and check that explanation is almost the same.
    count_vec = CountVectorizer()
    count_clf = LinearRegression()
def test_explain_hashing_vectorizer(newsgroups_train_binary):
    """An InvertableHashingVectorizer can be passed explicitly as ``vec``
    and gives stable, reproducible explanations.
    """
    docs, y, target_names = newsgroups_train_binary
    vec = HashingVectorizer(n_features=1000)
    ivec = InvertableHashingVectorizer(vec)
    ivec.fit([docs[0]])
    clf = LogisticRegression(random_state=42)
    clf.fit(vec.fit_transform(docs), y)

    def get_res(**kwargs):
        return explain_prediction(
            clf, docs[0], vec=ivec, target_names=target_names, top=20,
            **kwargs)

    res = get_res()
    check_explain_linear_binary(res, clf)
    # Explanations must be deterministic across calls.
    assert res == get_res()
    res_vectorized = explain_prediction(
        clf, vec.transform([docs[0]])[0], vec=ivec, target_names=target_names,
        top=20, vectorized=True)
    pprint(res_vectorized)
    # Vectorized input cannot produce weighted spans; otherwise identical.
    assert res_vectorized == _without_weighted_spans(res)
    # Passing explicit feature names should not change the explanation.
    assert res == get_res(
        feature_names=ivec.get_feature_names(always_signed=False))
def test_explain_tree_regressor_multitarget(reg):
    """Multi-target tree regressor: every target and some feature appear in
    each rendering, and ``top_targets`` limits the output.
    """
    X, y = make_regression(n_samples=100, n_targets=3, n_features=10,
                           random_state=42)
    reg.fit(X, y)
    res = explain_prediction(reg, X[0])
    for expl in format_as_all(res, reg):
        assert 'BIAS' in expl
        for target in ['y0', 'y1', 'y2']:
            assert target in expl
        # At least one of the ten input features must be mentioned.
        assert any('x%d' % i in expl for i in range(10))
    check_targets_scores(res)
    top_targets_res = explain_prediction(reg, X[0], top_targets=1)
    assert len(top_targets_res.targets) == 1
def test_explain_linear_tuple_top(newsgroups_train):
    """``top=(pos, neg)`` tuples limit positive and negative feature counts
    independently in ``explain_weights``.
    """
    docs, y, target_names = newsgroups_train
    vec = TfidfVectorizer()
    clf = LogisticRegression(random_state=42)
    clf.fit(vec.fit_transform(docs), y)

    # No positive features requested: only negatives appear.
    res_neg = explain_weights(clf, vec=vec, target_names=target_names, top=(0, 10))
    expl_neg, _ = format_as_all(res_neg, clf)
    for target in res_neg.targets:
        assert not target.feature_weights.pos
        assert len(target.feature_weights.neg) == 10
    assert "+0." not in expl_neg

    # Asymmetric limits are honored exactly.
    res_pos = explain_weights(clf, vec=vec, target_names=target_names, top=(10, 2))
    format_as_all(res_pos, clf)
    for target in res_pos.targets:
        assert len(target.feature_weights.pos) == 10
        assert len(target.feature_weights.neg) == 2