Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def _get_seq_motif(self, refseq_id, residue, pos_str):
    """Return the sequence motif surrounding a reported site.

    The sequence for `refseq_id` is taken from `self.seq_dict` and the
    reported residue is checked against the reported 1-indexed position.
    If it is not found there, the site is recorded in
    `self.invalid_site_pos` and the next position (+1) is tried; a hit
    there is additionally recorded in `self.off_by_one`.

    Parameters
    ----------
    refseq_id : str
        Key used to look up the sequence in `self.seq_dict`.
    residue : str
        Residue expected at the reported position.
    pos_str : str
        The 1-indexed position; converted with `int()`.

    Returns
    -------
    dict
        `{'site_motif': {'motif': ..., 'respos': ..., 'off_by_one': bool}}`
        when the residue is found at the reported position or one
        position later, otherwise an empty dict.
    """
    seq = self.seq_dict[refseq_id]
    pos_1ix = int(pos_str)
    pos_0ix = pos_1ix - 1
    seq_len = len(seq)
    site = (refseq_id, residue, pos_str)

    # Bounds are checked explicitly: a negative index would silently wrap
    # around to the end of the sequence, and an index past the end would
    # raise IndexError instead of being reported as an invalid site.
    if 0 <= pos_0ix < seq_len and seq[pos_0ix] == residue:
        motif, respos = ProtMapper.motif_from_position_seq(
            seq, pos_1ix, self.motif_window)
        return {'site_motif': {'motif': motif, 'respos': respos,
                               'off_by_one': False}}

    self.invalid_site_pos.append(site)

    # Try the following position; it may not exist if the reported site
    # is at (or beyond) the end of the sequence.
    next_0ix = pos_0ix + 1
    if 0 <= next_0ix < seq_len and seq[next_0ix] == residue:
        self.off_by_one.append(site)
        motif, respos = ProtMapper.motif_from_position_seq(
            seq, pos_1ix + 1, self.motif_window)
        return {'site_motif': {'motif': motif, 'respos': respos,
                               'off_by_one': True}}
    return {}
pos_1ix = int(pos_str)
pos_0ix = pos_1ix - 1
if seq[pos_0ix] != residue:
self.invalid_site_pos.append((refseq_id, residue, pos_str))
if seq[pos_0ix + 1] == residue:
self.off_by_one.append((refseq_id, residue, pos_str))
motif, respos = \
ProtMapper.motif_from_position_seq(seq, pos_1ix + 1,
self.motif_window)
return {'site_motif': {'motif': motif, 'respos': respos,
'off_by_one': True}}
else:
return {}
else:
# The index of the residue at the start of the window
motif, respos = ProtMapper.motif_from_position_seq(seq, pos_1ix,
self.motif_window)
return {'site_motif': {'motif': motif, 'respos': respos,
'off_by_one': False}}
assert hgnc_name is not None
# See if we can get a Uniprot ID from the HGNC symbol--if there is
# a RefSeq ID we will also try to use it to get an isoform specific
# UP ID, but we will have this one to fall back on. But if we can't
# get one here, then we skip the Statement
up_id_from_hgnc = hgnc_client.get_uniprot_id(hgnc_id)
if not up_id_from_hgnc:
self.no_up_for_hgnc.append((egid, hgnc_name, hgnc_id))
return None
# If we have provided the RefSeq ID, it's because we need to make
# sure that we are getting the right isoform-specific ID (for sequence
# positions of PTMs). Here we try to get the Uniprot ID from the
# Refseq->UP mappings in the protmapper.uniprot_client.
if refseq_id is not None:
# Get the Uniprot IDs from the uniprot client
up_ids = uniprot_client.get_ids_from_refseq(refseq_id,
reviewed_only=True)
# Nothing for this RefSeq ID (quite likely because the RefSeq ID
# is obsolete); take the UP ID from HGNC
if len(up_ids) == 0:
self.no_up_for_refseq.append(refseq_id)
up_id = up_id_from_hgnc
# More than one reviewed entry--no thanks, we'll take the one from
# HGNC instead
elif len(up_ids) > 1:
self.many_ups_for_refseq.append(refseq_id)
up_id = up_id_from_hgnc
# We got a unique, reviewed UP entry for the RefSeq ID
else:
up_id = up_ids[0]
# If it's the canonical isoform, strip off the '-1'
if up_id.endswith('-1'):
concept[NAMESPACE] = _mapped[NAMESPACE]
concept[IDENTIFIER] = _mapped[IDENTIFIER]
concept[NAME] = _mapped[NAME]
return True
elif prefix == 'bel':
logger.warning('could not figure out how to map bel ! %s', name)
return False
if prefix == 'uniprot':
# assume identifier given as name
identifier = get_id_from_mnemonic(name)
if identifier is not None:
concept[IDENTIFIER] = identifier
return True
mnemomic = get_mnemonic(name, web_fallback=True)
if mnemomic is not None:
concept[IDENTIFIER] = name
concept[NAME] = mnemomic
return True
logger.warning('could not interpret uniprot name: %s', name)
return False
try:
id_name_mapping = get_name_id_mapping(prefix)
except (NoOboFoundry, MissingOboBuild) as e:
logger.warning('could not get namespace %s - %s', prefix, e)
return False
if id_name_mapping is None:
logger.warning('unhandled namespace in %s ! %s', prefix, name)
def _handle_identifier_not_name(*, concept, prefix, identifier) -> bool:
    """Fill in the name of a concept given its prefix and identifier.

    `concept` is modified in place: on success its ``NAME`` entry is set.

    :param concept: mapping holding the concept's entries
    :param prefix: namespace prefix used to select the lookup strategy
    :param identifier: identifier to look up within the prefix
    :return: True if a name was assigned, False if the prefix is skipped,
        its id-to-name mapping is unavailable, or the identifier is not in
        the mapping
    """
    # Some namespaces are just too much of a problem at the moment to look up
    if prefix in SKIP:
        return False

    # For these prefixes the identifier doubles as the name
    if prefix in NO_NAMES:
        concept[NAME] = concept[IDENTIFIER]
        return True

    if prefix == 'uniprot':
        # NOTE(review): the result of get_mnemonic is stored without a
        # None check -- confirm it cannot return None for this call.
        concept[NAME] = get_mnemonic(identifier)
        return True

    try:
        id_name_mapping = get_id_name_mapping(prefix)
    except (NoOboFoundry, MissingOboBuild):
        return False

    if id_name_mapping is None:
        logger.warning('could not get names for prefix %s', prefix)
        return False

    name = id_name_mapping.get(identifier)
    if name is None:
        logger.warning('could not get name for %s:%s', prefix, identifier)
        return False

    concept[NAME] = name
    # Report success explicitly so every exit path returns a bool
    return True
concept[IDENTIFIER] = _mapped[IDENTIFIER]
concept[NAME] = _mapped[NAME]
return True
elif prefix == 'bel' and name in compartment_mapping:
_mapped = compartment_mapping[name]
concept[NAMESPACE] = _mapped[NAMESPACE]
concept[IDENTIFIER] = _mapped[IDENTIFIER]
concept[NAME] = _mapped[NAME]
return True
elif prefix == 'bel':
logger.warning('could not figure out how to map bel ! %s', name)
return False
if prefix == 'uniprot':
# assume identifier given as name
identifier = get_id_from_mnemonic(name)
if identifier is not None:
concept[IDENTIFIER] = identifier
return True
mnemomic = get_mnemonic(name, web_fallback=True)
if mnemomic is not None:
concept[IDENTIFIER] = name
concept[NAME] = mnemomic
return True
logger.warning('could not interpret uniprot name: %s', name)
return False
try:
id_name_mapping = get_name_id_mapping(prefix)
except (NoOboFoundry, MissingOboBuild) as e:
mm_ws = '\n' + (' ' * 17)
mm_str = mm_ws.join([str(mm) for mm in self.mapped_mods])
summary = textwrap.dedent("""
MappedStatement:
original_stmt: {0}
mapped_mods: {1}
mapped_stmt: {2}
""")
return summary.format(self.original_stmt, mm_str, self.mapped_stmt)
def __repr__(self):
    """Return the same text as ``str(self)``."""
    return str(self)
class SiteMapper(ProtMapper):
"""
Use site information to fix modification sites in Statements.
This is a wrapper around the protmapper package's ProtMapper class and adds
all the additional functionality to handle INDRA Statements and Agents.
Parameters
----------
site_map : dict (as returned by :py:func:`load_site_map`)
A dict mapping tuples of the form `(gene, orig_res, orig_pos)` to a
tuple of the form `(correct_res, correct_pos, comment)`, where `gene`
is the string name of the gene (canonicalized to HGNC); `orig_res` and
`orig_pos` are the residue and position to be mapped; `correct_res` and
`correct_pos` are the corrected residue and position, and `comment` is
a string describing the reason for the mapping (species error, isoform
error, wrong residue name, etc.).
return None
# If no site information for this residue, skip
if mod_condition.position is None or mod_condition.residue is None:
return None
# Otherwise, try to map it and return the mapped site
mapped_site = \
self.map_to_human_ref(up_id, 'uniprot',
mod_condition.residue,
mod_condition.position,
do_methionine_offset=self.do_methionine_offset,
do_orthology_mapping=self.do_orthology_mapping,
do_isoform_mapping=self.do_isoform_mapping)
return mapped_site
# Module-level SiteMapper instance constructed from default_site_map.
default_mapper = SiteMapper(default_site_map)
# TODO: determine if this should be done in the protmapper or if this is the
# preferred place
@lru_cache(maxsize=10000)
def _get_uniprot_id(agent):
"""Return the UniProt ID for an agent, looking up in HGNC if necessary.
If the UniProt ID is a list then return the first ID by default.
"""
up_id = agent.db_refs.get('UP')
hgnc_id = agent.db_refs.get('HGNC')
if up_id is None:
if hgnc_id is None:
# If both UniProt and HGNC refs are missing we can't
# sequence check and so don't report a failure.
'ppi_file must be given.')
if ptm_file and not seq_file:
raise ValueError('If ptm_file is given, seq_file must also be given.')
# Load complexes into dataframe
cplx_df = None
if complexes_file:
cplx_df = pd.read_csv(complexes_file, delimiter='\t', names=_cplx_cols,
dtype='str', na_values=['-', 'None'])
# Load ptm data into dataframe
ptm_df = None
seq_dict = None
if ptm_file:
ptm_df = pd.read_csv(ptm_file, delimiter='\t', names=_ptm_cols,
dtype='str', na_values='-')
# Load protein sequences as a dict keyed by RefSeq ID
seq_dict = load_fasta_sequences(seq_file, id_index=2)
# Load the PPI data into dataframe
ppi_df = None
if ppi_file:
ppi_df = pd.read_csv(ppi_file, delimiter='\t', names=_ppi_cols,
dtype='str')
# Create the processor
return HprdProcessor(id_df, cplx_df, ptm_df, ppi_df, seq_dict, motif_window)