# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_weighted_spans_char():
    """Char ngram features are matched to their character spans in the
    lowercased document; the highlighted-in-text remainder goes to "other".
    """
    text = 'I see: a leaning lemon tree'
    vectorizer = CountVectorizer(analyzer='char', ngram_range=(3, 4))
    vectorizer.fit([text])
    weights = FeatureWeights(
        pos=[FW('see', 2), FW('a le', 5), FW('on ', 8)],
        neg=[FW('lem', -6)])
    result = get_weighted_spans(text, vectorizer, weights)
    expected = WeightedSpans(
        analyzer='char',
        document='i see: a leaning lemon tree',
        weighted_spans=[
            ('see', [(2, 5)], 2),
            ('lem', [(17, 20)], -6),
            ('on ', [(20, 23)], 8),
            ('a le', [(7, 11)], 5)],
        other=FeatureWeights(
            pos=[FW(hl_in_text, 9)],
            neg=[],
        ))
    assert result == expected
def test_no_weighted_spans():
    """Empty feature weights yield an empty WeightedSpans result."""
    text = 'I see: a leaning lemon tree'
    vectorizer = CountVectorizer(analyzer='char', ngram_range=(3, 4))
    vectorizer.fit([text])
    empty_weights = FeatureWeights(pos=[], neg=[])
    result = get_weighted_spans(text, vectorizer, empty_weights)
    assert result == WeightedSpans(
        analyzer='char',
        document='i see: a leaning lemon tree',
        weighted_spans=[],
        other=FeatureWeights(pos=[], neg=[]))
def test_weighted_spans_word_stopwords():
    """With english stop words, 'see' is not in the vocabulary, so its
    weight lands in "other" (alongside 'bias', which never has a span).
    """
    text = 'I see: a leaning lemon tree'
    vectorizer = CountVectorizer(analyzer='word', stop_words='english')
    vectorizer.fit([text])
    weights = FeatureWeights(
        pos=[FW('see', 2), FW('lemon', 5), FW('bias', 8)],
        neg=[FW('tree', -6)])
    result = get_weighted_spans(text, vectorizer, weights)
    expected = WeightedSpans(
        analyzer='word',
        document='i see: a leaning lemon tree',
        weighted_spans=[
            ('lemon', [(17, 22)], 5),
            ('tree', [(23, 27)], -6)],
        other=FeatureWeights(
            pos=[FW('bias', 8), FW('see', 2)],
            neg=[FW(hl_in_text, -1)],
        ))
    assert result == expected
"""
doc = 'I see: a leaning lemon tree'
vec = CountVectorizer(analyzer='char', ngram_range=(3, 3))
vec.fit([doc])
w_spans = get_weighted_spans(
doc, vec,
FeatureWeights(
pos=[
FW([{'name': 'foo', 'sign': 1}, {'name': 'see', 'sign': -1}], 2),
FW([{'name': 'zoo', 'sign': 1}, {'name': 'bar', 'sign': 1}], 3),
],
neg=[
FW([{'name': 'ree', 'sign': 1}, {'name': 'tre', 'sign': 1}], -4),
],
))
assert w_spans == WeightedSpans(
analyzer='char',
document='i see: a leaning lemon tree',
weighted_spans=[
('see', [(2, 5)], 2),
('tre', [(23, 26)], -4),
('ree', [(24, 27)], -4),
],
other=FeatureWeights(
pos=[
FW([{'name': 'zoo', 'sign': 1}, {'name': 'bar', 'sign': 1}], 3),
],
neg=[FW(hl_in_text, -2)],
))
def test_weighted_spans_char_wb():
    """char_wb ngrams (word-boundary padded) are located in the document;
    'a le' crosses a word boundary, so it cannot match and stays in "other".
    """
    text = 'I see: a leaning lemon tree'
    vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(3, 4))
    vectorizer.fit([text])
    weights = FeatureWeights(
        pos=[FW('see', 2), FW('a le', 5), FW('on ', 8)],
        neg=[FW('lem', -6), FW(' lem', -4)])
    result = get_weighted_spans(text, vectorizer, weights)
    expected = WeightedSpans(
        analyzer='char_wb',
        document='i see: a leaning lemon tree',
        weighted_spans=[
            ('see', [(2, 5)], 2),
            ('lem', [(17, 20)], -6),
            ('on ', [(20, 23)], 8),
            (' lem', [(16, 20)], -4)],
        other=FeatureWeights(
            pos=[FW('a le', 5), FW(hl_in_text, 0)],
            neg=[],
        ))
    assert result == expected
def test_weighted_spans_word():
    """Word features are matched to spans; the span-less 'bias' feature is
    moved to "other", and neg_remaining is carried through unchanged.
    """
    text = 'I see: a leaning lemon tree'
    vectorizer = CountVectorizer(analyzer='word')
    vectorizer.fit([text])
    weights = FeatureWeights(
        pos=[FW('see', 2), FW('lemon', 4), FW('bias', 8)],
        neg=[FW('tree', -6)],
        neg_remaining=10,
    )
    result = get_weighted_spans(text, vectorizer, weights)
    expected = WeightedSpans(
        analyzer='word',
        document='i see: a leaning lemon tree',
        weighted_spans=[
            ('see', [(2, 5)], 2),
            ('lemon', [(17, 22)], 4),
            ('tree', [(23, 27)], -6)],
        other=FeatureWeights(
            pos=[FW('bias', 8), FW(hl_in_text, 0)],
            neg=[],
            neg_remaining=10,
        ))
    assert result == expected
# NOTE(review): continuation of a multi-target explanation literal; the
# opening of the enclosing list/call (and the test function header) lies
# outside the visible chunk.
spans=[
('a', [(0, 1)], 1.5),
('b', [(1, 2)], 2.5),
],
),
DocWeightedSpans(
document='xy',
spans=[
('xy', [(0, 2)], -4.5),
],
)]
)),
TargetExplanation(
target='two',
feature_weights=FeatureWeights(pos=[], neg=[]),
weighted_spans=WeightedSpans(
docs_weighted_spans=[
DocWeightedSpans(
document='abc',
spans=[
('a', [(0, 1)], 0.5),
('c', [(2, 3)], 3.5),
],
),
DocWeightedSpans(
document='xz',
spans=[
# char_wb at the start of the document
(' xz', [(-1, 2)], 1.5),
],
)],
)),
def get_weighted_spans(doc, vec, feature_weights):
    # type: (Any, Any, FeatureWeights) -> Optional[WeightedSpans]
    """ Return a WeightedSpans object with the preprocessed document and
    a list of weighted spans matching features in the document, or None
    when the vectorizer is not supported.
    """
    # FeatureUnion vectorizers are handled per sub-vectorizer.
    if isinstance(vec, FeatureUnion):
        return _get_weighted_spans_from_union(doc, vec, feature_weights)
    result = _get_doc_weighted_spans(doc, vec, feature_weights)
    if result is None:
        return None
    found_features, doc_weighted_spans = result
    return WeightedSpans(
        [doc_weighted_spans],
        other=_get_other(feature_weights, [('', found_features)]),
    )
# NOTE(review): the lines below appear to be the interior of a FeatureUnion
# handler (presumably _get_weighted_spans_from_union, iterated per
# sub-vectorizer); `doc`, `vec`, `vec_name`, `vec_prefix`, `feature_weights`,
# `named_found_features` and `docs_weighted_spans` are bound in an enclosing
# scope that is not visible in this chunk — confirm against the full file.
def feature_fn(name):
# Formatted (synthetic) feature names carry no vectorizer prefix to strip.
if isinstance(name, FormattedFeatureName):
return
if not name.startswith(vec_prefix):
return # drop feature
return name[len(vec_prefix):] # remove prefix
# Collect spans found by this sub-vectorizer, tagging them with its name.
result = _get_doc_weighted_spans(doc, vec, feature_weights, feature_fn)
if result:
found_features, doc_weighted_spans = result
doc_weighted_spans.vec_name = vec_name
named_found_features.append((vec_name, found_features))
docs_weighted_spans.append(doc_weighted_spans)
# Only return a WeightedSpans when at least one sub-vectorizer matched.
if docs_weighted_spans:
return WeightedSpans(
docs_weighted_spans,
other=_get_other(feature_weights, named_found_features),
)
else:
return None