"""
Runs a document through the metamap_annotator pipeline component. This overlays rich medical features by utilizing
MetaMap output and aligning it with the passed spaCy Doc object. By medaCy conventions, each overlaid feature
is available as a token extension starting with 'feature_'. This component overlays 'feature_cui' and a
separate boolean feature for each semantic type it is configured to detect, available under 'feature_is_{type}'.
This component was originally designed to increase recall on Drug entities, hence 'feature_is_orch' and
'feature_is_phsu' are overlaid by default, where orch and phsu are the semantic types corresponding to organic
chemicals and pharmacological substances respectively.
:param doc: spaCy Doc object to run through pipeline
:return: the same Doc object
"""
logging.debug("Called MetaMapAllTypesOverlayer")
# register all extensions
if self.cuis:
Token.set_extension('feature_cui', default="-1", force=True) # cui feature
if not hasattr(doc._, 'file_name'):
metamap_json = self.metamap.map_text(str(doc))
elif doc._.file_name is None or doc._.file_name == 'STRING_INPUT':
metamap_json = self.metamap.map_text(str(doc))
elif os.path.isfile(doc._.file_name):
# Check if pre-metamapped file exists at expected location
txt_file_path = doc._.file_name
metamapped_path = _get_metamapped_path(txt_file_path)
if not os.path.isfile(metamapped_path):
warnings.warn(
f"No metamapped file was found for '{txt_file_path}'; attempting to run MetaMap over document (results in slower runtime); ensure MetaMap is running")
metamap_json = self.metamap.map_text(str(doc))
else:
# This branch is reached when the file has already been metamapped
metamap_json = self.metamap.load(metamapped_path)
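# Usage sketch added for illustration (not part of the original component): assuming
# `nlp` is a spaCy pipeline that includes this metamap_annotator component, the
# overlaid features can be read back through the token extensions registered above.
from spacy.tokens import Token

doc = nlp("Patient was given 50 mg of acetaminophen.")
for token in doc:
    if Token.has_extension('feature_cui') and token._.feature_cui != "-1":
        print(token.text, token._.feature_cui)          # MetaMap concept identifier
    if Token.has_extension('feature_is_phsu') and token._.feature_is_phsu:
        print(token.text, "flagged as a pharmacologic substance")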
def __init__(self, spacy_pipeline):
self.nlp = spacy_pipeline
Token.set_extension('feature_is_volume_unit', default=False)
self.nlp.entity.add_label('volume_unit')
self.volume_matcher = Matcher(self.nlp.vocab)
self.volume_matcher.add('UNIT_OF_VOLUME', None,
[{'LOWER': 'ml'}],
[{'ORTH': 'dL'}],
[{'LOWER': 'cc'}],
[{'ORTH': 'L'}])
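# Illustrative usage sketch (not from the original class): calling the matcher on a
# processed doc yields (match_id, start, end) tuples for each unit of volume, which
# can then be marked via the 'feature_is_volume_unit' extension registered above.
# `component` is an assumed instance of the class defined above.
doc = component.nlp("The patient received 20 mL of saline.")
for match_id, start, end in component.volume_matcher(doc):
    for token in doc[start:end]:
        token._.feature_is_volume_unit = True
print([token.text for token in doc if token._.feature_is_volume_unit])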
def __init__(self, sentiws_path):
self.sentiws = SentiWSWrapper(sentiws_path=sentiws_path)
Token.set_extension('sentiws', getter=self.get_sentiment, force=True)
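# Minimal usage sketch, assuming the component above has been added to a German
# spaCy pipeline via nlp.add_pipe; the 'sentiws' extension is resolved through the
# component's get_sentiment getter for every token.
doc = nlp("Das Essen war hervorragend.")
for token in doc:
    print(token.text, token._.sentiws)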
"""
Recursively searches the dependency tree:
looks for triple values in each of the children and calls itself on the child nodes.
"""
question_word = None
for word in token.children:
if word.text.lower() in QuestionWord.question_words:
question_word = QuestionWord(word)
word = QuestionWord(word)
if not triple.get_object():
triple.set_object(question_word)
elif word.dep_ in OBJECT_SET:
triple.set_object(word)
if word.dep_ in SUBJECT_SET:
triple.set_subject(word)
if isinstance(word, Token) and word.dep_ not in RECURSION_BLACKLIST:
triple = triple_search(triple, word)
if not triple.get_subject() and question_word:
triple.set_subject(question_word)
return triple
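# Hypothetical usage sketch: Triple and QuestionWord are assumed to be defined in the
# surrounding module, and the exact Triple constructor is not shown here, so Triple()
# merely stands in for an empty triple container.
doc = nlp("Who wrote the annual report?")
root = next(doc.sents).root              # start the recursion at the sentence root
triple = triple_search(Triple(), root)
print(triple.get_subject(), triple.get_object())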
def __init__(self, nlp, path=HUNSPELL_PROFILE):
if path in DEFAULT_DICTIONARY_PATHS:
default_path = DEFAULT_DICTIONARY_PATHS[path]
dic_path, aff_path = (
os.path.join(default_path, 'en_US.dic'),
os.path.join(default_path, 'en_US.aff'),
)
else:
assert len(path) == 2, 'Include two paths: dic_path and aff_path'
dic_path, aff_path = path
self.hobj = HunSpell(dic_path, aff_path)
Token.set_extension('hunspell_spell', default=None)
Token.set_extension('hunspell_suggest', getter=self.get_suggestion)
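# Usage sketch under the assumption that the component above has been added to an
# English pipeline: 'hunspell_spell' is filled in by the component, while
# 'hunspell_suggest' goes through the get_suggestion getter defined above.
doc = nlp("I have a probleme with this sentence.")
for token in doc:
    if token._.hunspell_spell is False:
        print(token.text, "->", token._.hunspell_suggest)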
def install_extensions():
tok2vec_attrs = [
ATTRS.last_hidden_state,
ATTRS.pooler_output,
ATTRS.all_hidden_states,
ATTRS.all_attentions,
ATTRS.d_last_hidden_state,
ATTRS.d_pooler_output,
ATTRS.d_all_hidden_states,
ATTRS.d_all_attentions,
]
for attr in tok2vec_attrs:
Doc.set_extension(attr, default=None)
Span.set_extension(attr, getter=get_span_tok2vec_getter(attr))
Token.set_extension(attr, getter=get_token_tok2vec_getter(attr))
wp_attrs = [ATTRS.alignment, ATTRS.word_pieces, ATTRS.word_pieces_]
for attr in wp_attrs:
Doc.set_extension(attr, default=None)
Span.set_extension(attr, getter=get_span_wp_getter(attr))
Token.set_extension(attr, getter=get_token_wp_getter(attr))
Doc.set_extension(ATTRS.separator, default=None)
Span.set_extension(
ATTRS.separator, getter=lambda span: span.doc._.get(ATTRS.separator)
)
Token.set_extension(
ATTRS.separator, getter=lambda token: token.doc._.get(ATTRS.separator)
)
Doc.set_extension(ATTRS.segments, getter=get_segments)
Span.set_extension(ATTRS.segments, getter=get_segments)
for cls in [Token, Span, Doc]:
cls.set_extension(ATTRS.start, getter=get_wp_start)
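# Stripped-down, self-contained sketch of the pattern used above: store a value once
# on the Doc and mirror it on Span and Token through getters. 'shared_value' is an
# invented attribute name used purely for illustration.
import spacy
from spacy.tokens import Doc, Span, Token

Doc.set_extension('shared_value', default=None, force=True)
Span.set_extension('shared_value', getter=lambda span: span.doc._.shared_value, force=True)
Token.set_extension('shared_value', getter=lambda token: token.doc._.shared_value, force=True)

nlp = spacy.blank('en')
doc = nlp("Values stored on the Doc are visible from spans and tokens.")
doc._.shared_value = 42
assert doc[0]._.shared_value == 42 and doc[0:3]._.shared_value == 42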
def add_spell_checker(self, spell_checker):
spacy_spell_checker = SpacySpellChecker(spell_checker=spell_checker)
self.nlp.add_pipe(spacy_spell_checker, name='spell_checker', last=True)
# Add custom fields needed for this use case
Token.set_extension('verified', default=False, force=True)
Token.set_extension('norm', default=None, force=True)
Token.set_extension('lower', default=None, force=True)
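# Illustrative usage sketch: `pipeline` and `my_spell_checker` are placeholder names;
# the custom fields registered above keep their defaults (False/None) until the
# spell-checker component overwrites them.
pipeline.add_spell_checker(my_spell_checker)
doc = pipeline.nlp("Thiss sentence has a typo.")
print([(token.text, token._.verified, token._.norm) for token in doc])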
def __init__(self, spacy_pipeline, labels):
"""
:param spacy_pipeline: An existing spaCy Language processing pipeline
:param labels: The subset of labels from the gold annotations to restrict labeling to.
"""
self.nlp = spacy_pipeline
self.labels = labels
self.failed_overlay_count = 0
self.failed_identifying_span_count = 0
Token.set_extension('gold_label', default="O", force=True)
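# Illustrative sketch (the class name is assumed from context): every token starts
# with the default 'gold_label' of "O" until the component overlays labels drawn from
# the gold annotations for the requested subset of entity classes.
component = GoldAnnotatorComponent(nlp, labels=['Drug', 'Dose'])
doc = nlp("Aspirin 81 mg daily.")
print([(token.text, token._.gold_label) for token in doc])  # all "O" before overlay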
def __init__(self, spacy_pipeline):
self.nlp = spacy_pipeline
Token.set_extension('feature_is_measurement_unit', default=False)
self.nlp.entity.add_label('measurement_unit')
self.unit_of_measurement_matcher = Matcher(self.nlp.vocab)
self.unit_of_measurement_matcher.add('UNIT_OF_MEASUREMENT', None,
[{'ENT_TYPE': 'mass_unit'}, {'ORTH': '/'}, {'ENT_TYPE': 'volume_unit'}],
[{'ENT_TYPE': 'volume_unit'}, {'ORTH': '/'}, {'ENT_TYPE': 'time_unit'}],
[{'ENT_TYPE': 'form_unit'}, {'ORTH': '/'}, {'ENT_TYPE': 'volume_unit'}]
)
# This could also be extended using the alternative and foreign language
# names provided by the API
self.countries = {c["name"]: c for c in countries}
self.label = nlp.vocab.strings[label] # get entity label ID
# Set up the PhraseMatcher with Doc patterns for each country name
patterns = [nlp(c) for c in self.countries.keys()]
self.matcher = PhraseMatcher(nlp.vocab)
self.matcher.add("COUNTRIES", None, *patterns)
# Register attribute on the Token. We'll be overwriting this based on
# the matches, so we're only setting a default value, not a getter.
# If no default value is set, it defaults to None.
Token.set_extension("is_country", default=False)
Token.set_extension("country_capital", default=False)
Token.set_extension("country_latlng", default=False)
Token.set_extension("country_flag", default=False)
# Register attributes on Doc and Span via a getter that checks if one of
# the contained tokens is set to is_country == True.
Doc.set_extension("has_country", getter=self.has_country)
Span.set_extension("has_country", getter=self.has_country)