# Vectorize the candidate features and score every integer token;
# the highest-scoring candidate is taken as the predicted sample size.
features = ss_pipeline.get_features(filter=SampleSizePipeline.integer_filter, flatten=True)
X = self.vectorizer.transform(features)
preds = self.clf.decision_function(X)
sl_words = ss_pipeline.get_words(filter=SampleSizePipeline.integer_filter, flatten=True)
predicted_i = preds.argmax()
predicted = sl_words[predicted_i]
print("predicted sample size: %s" % predicted)
'''
This is kind of hacky. We need the character spans for the
predicted sample size, so we rely on the span_tokenizer
(below) and then match the predicted token (the sample
size) back up with those spans.
'''
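# Assuming word_tokenizer is an NLTK-style tokenizer (not shown here),
# span_tokenize() yields (start, end) character offsets into the original
# string rather than the tokens themselves, e.g.
#   WhitespaceTokenizer().span_tokenize("n = 120")  ->  (0, 1), (2, 3), (4, 7)
# (illustrative only; the actual tokenizer in use may differ).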
word_tok = word_tokenizer.span_tokenize(full_text)
for span in word_tok:
    start, end = span
    cur_word = swap_num(full_text[start:end])
    if predicted == cur_word:
        logger.debug("sample size predictor -- matched %s for prediction %s" % (
            cur_word, predicted))
        matched_span = span
        break
else:
    # we exhausted the loop without a break, i.e. we failed to
    # match the prediction token
    # @TODO handle better?
    logger.warning("ahhhh failed to match sample size prediction")
    matched_span = []  # sentinel: no span found
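# Reminder on the for/else above: the else clause runs only when the loop
# finishes without hitting `break`, e.g.:
#
#   for x in [1, 2, 3]:
#       if x == 99:
#           break
#   else:
#       pass  # reached, since no break fired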
ss_row = {
def __call__(self, org_doc):
    doc = org_doc
    tokens = word_tokenizer(doc)
    # First pass: identify which phrases from the lookup table self.P
    # appear in the document
    iden_abbr = {}
    replacements = {}
    for n in range(self.min_n, self.max_n + 1):
        for phrase, substring in self.ngram_tokens(tokens, n):
            if phrase in self.P:
                abbr = self.P[phrase]
                iden_abbr[phrase] = abbr
                replacements[substring] = self.phrase_sub(phrase)
    # Replace each matched substring with a single phrase token
    for substring, newstring in replacements.items():
        doc = doc.replace(substring, newstring)
    # Second pass: find any abbreviations used in the document and
    # replace them with the same phrase token
    tokens = word_tokenizer(doc)
    for phrase, abbr in iden_abbr.items():
        tokens = [self.phrase_sub(phrase) if x == abbr else x
                  for x in tokens]
    # Return the document as a whitespace-joined token string
    doc = ' '.join(tokens)
    return doc
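# A minimal usage sketch (hypothetical: the enclosing class name and its
# constructor arguments are not shown here). With self.P mapping, say,
# 'magnetic resonance imaging' -> 'MRI', calling the instance on a document
# normalizes both the spelled-out phrase and the bare abbreviation to the
# same token produced by self.phrase_sub(...):
#
#   expander = PhraseExpander(...)   # hypothetical name
#   expander("magnetic resonance imaging (MRI) was used; MRI showed ...")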