# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# NOTE(review): fragment — the enclosing function's `def` line is outside this view;
# `fin_gpad`, `REPO`, and the helper names are presumably defined/imported above.
# Download the GPAD annotation file (if absent), load the GO DAG, and read annotations.
dnld_annofile(fin_gpad, 'gpad')
godag = get_godag(os.path.join(REPO, "go-basic.obo"), loading_bar=None)
annoobj = GpadReader(fin_gpad, godag=godag)
# GO IDs whose pairwise Lin similarities are computed below.
goids = [
'GO:0042581',
'GO:0101002',
'GO:0042582',
'GO:0070820',
'GO:0008021',
'GO:0005766',
'GO:0016591']
# Associations restricted to the cellular_component ('CC') namespace.
associations = annoobj.get_id2gos('CC')
termcounts = TermCounts(godag, associations)
# Calculate Lin values
# frozenset keys make the pair (a, b) and (b, a) map to one entry; combinations
# with replacement also include the self-pair (a, a).
p2v = {frozenset([a, b]): lin_sim(a, b, godag, termcounts) for a, b in combo_w_rplc(goids, 2)}
_prt_values(goids, p2v, prt=sys.stdout)
def _test_path_bp_mf(branch_dist, godag, prt):
    """Test distances between BP branch and MF branch."""
    # Two terms from different namespaces — no path connects them within one DAG.
    go_mf = 'GO:0003676'  # level-03 depth-03 nucleic acid binding [molecular_function]
    go_bp = 'GO:0007516'  # level-04 depth-05 hemocyte development [biological_process]
    fmt = '({GO1}, {GO2}) {TYPE:6} score = {VAL}\n'
    # With no branch distance given, cross-namespace measures come back undefined.
    dist_plain = semantic_distance(go_mf, go_bp, godag)
    sim_plain = semantic_similarity(go_mf, go_bp, godag)
    assc = dnld_assc("tair.gaf", godag)
    termcounts = TermCounts(godag, assc)
    sim_resnik = resnik_sim(go_mf, go_bp, godag, termcounts)
    sim_lin = lin_sim(go_mf, go_bp, godag, termcounts)
    if prt is not None:
        scores = (('semantic distance', dist_plain),
                  ('semantic similarity', sim_plain),
                  ('Resnik similarity', sim_resnik),
                  ('Lin similarity', sim_lin))
        for type_txt, score in scores:
            prt.write(fmt.format(TYPE=type_txt, GO1=go_mf, GO2=go_bp, VAL=score))
    assert dist_plain is None
    assert sim_plain is None
    assert sim_resnik is None
    assert sim_lin is None
    # Providing branch_dist bridges the two namespaces through their roots:
    # expected distance is depth(mf) + depth(bp) + branch_dist.
    dist_bridged = semantic_distance(go_mf, go_bp, godag, branch_dist)
    if prt is not None:
        prt.write(fmt.format(TYPE='semantic distance', GO1=go_mf, GO2=go_bp, VAL=dist_bridged))
    assert dist_bridged == godag[go_mf].depth + godag[go_bp].depth + branch_dist
def test_semantic_similarity():
    """Test initializing TermCounts with annotations made to alternate GO ID"""
    # NOTE(review): this name is redefined later in the file, so pytest will only
    # collect the last definition — consider giving each test a unique name.
    fin_obo = os.path.join(REPO, '../goatools/tests/data/yangRWC/fig2a.obo')
    fin_anno = os.path.join(REPO, '../goatools/tests/data/yangRWC/fig2a.anno')
    godag = GODag(fin_obo)
    name2go = {rec.name: rec.item_id for rec in godag.values()}
    assoc = _get_id2gos(fin_anno, godag, name2go, NAME2NUM)
    tcntobj = TermCounts(godag, assoc)
    # N_v: Test accuracy of Python equivalent to Java: getNumberOfAnnotations
    # Expected: unique genes annotated to a GO term PLUS genes annotated to a descendant.
    expected = (('A', 100), ('B', 40), ('C', 50), ('D', 10),
                ('E', 10), ('F', 10), ('G', 30))
    for name, exp_cnt in expected:
        assert tcntobj.gocnts[name2go[name]] == exp_cnt, tcntobj.gocnts
# NOTE(review): fragment — the ']' below closes a list (presumably `goids = [...]`)
# whose opening lines, like the enclosing function's `def`, are outside this view.
]
# Get all the annotations from arabidopsis.
associations = [
('human', 'goa_human.gaf'),
('yeast', 'sgd.gaf'),
]
godag = get_godag(os.path.join(REPO, "go-basic.obo"), loading_bar=None)
for species, assc_name in associations: # Limit test numbers for speed
print()
# Get all the annotations for the current species
fin_assc = os.path.join(REPO, assc_name)
assc_gene2gos = dnld_assc(fin_assc, godag, namespace='MF', prt=None)
# Calculate the information content of the single term, GO:0048364
termcounts = TermCounts(godag, assc_gene2gos)
# Print information values for each GO term
for goid in sorted(goids):
infocontent = get_info_content(goid, termcounts)
term = godag[goid]
print('{SPECIES} Information content {INFO:8.6f} {NS} {GO} {NAME}'.format(
SPECIES=species, GO=goid, INFO=infocontent, NS=term.namespace, NAME=term.name))
# Print semantic similarities between each pair of GO terms
print("GO #1 GO #2 Resnik Lin")
print("---------- ---------- ------ -------")
for go_a, go_b in itertools.combinations(sorted(goids), 2):
# Resnik's similarity measure is defined as the information content of the most
# informative common ancestor. That is, the most specific common parent-term in the GO.
sim_r = resnik_sim(go_a, go_b, godag, termcounts)
# Lin similarity score (GO:0048364, GO:0044707) = -0.607721957763
def get_termcounts(fin_anno, godag, namespace='all', **kws):
    """Get termcounts object"""
    # Read the annotation file, extract id-to-GO associations, then count terms.
    annoobj = get_objanno(fin_anno, godag, namespace)
    assoc = annoobj.get_id2gos(namespace=namespace, **kws)
    return TermCounts(godag, assoc)
# NOTE(review): fragment — these statements belong to an enclosing function whose
# `def` line is outside this view; `godag` and `associations` come from earlier code.
# Now we can calculate the semantic distance and semantic similarity, as so:
# "The semantic similarity between terms GO:0048364 and GO:0044707 is 0.25.
go_id3 = 'GO:0048364' # BP level-03 depth-04 root development
go_id4 = 'GO:0044707' # BP level-02 depth-02 single-multicellular organism process
sim = semantic_similarity(go_id3, go_id4, godag)
print('\nThe semantic similarity between terms {GO1} and {GO2} is {VAL}.'.format(
GO1=go_id3, GO2=go_id4, VAL=sim))
print(godag[go_id3])
print(godag[go_id4])
# Then we can calculate the information content of the single term, <code>GO:0048364</code>.
# "Information content (GO:0048364) = 7.75481392334
# First get the counts of each GO term.
termcounts = TermCounts(godag, associations)
# Calculate the information content
go_id = "GO:0048364"
infocontent = get_info_content(go_id, termcounts)
print('\nInformation content ({GO}) = {INFO}\n'.format(GO=go_id, INFO=infocontent))
assert infocontent, "FATAL INFORMATION CONTENT"
# Resnik's similarity measure is defined as the information content of the most
# informative common ancestor. That is, the most specific common parent-term in
# the GO. Then we can calculate this as follows:
# Resnik similarity score (GO:0048364, GO:0044707) = 0.0 because DCA is BP top
sim_r = resnik_sim(go_id3, go_id4, godag, termcounts)
dca = deepest_common_ancestor([go_id3, go_id4], godag)
# Both terms are BP, so the deepest common ancestor is the BP root (IC == 0).
assert dca == NS2GO['BP']
assert sim_r == get_info_content(dca, termcounts)
assert sim_r == 0.0
def test_semantic_similarity():
    """Test faster version of semantic similarity (yangRWC fig1b counts)."""
    # NOTE(review): this name is also defined elsewhere in the file; pytest will
    # only collect one of them — consider unique test names.
    godag = GODag(os.path.join(REPO, 'tests/data/yangRWC/fig1b.obo'))
    name2go = {rec.name: rec.item_id for rec in godag.values()}
    fin_anno = os.path.join(REPO, 'tests/data/yangRWC/fig1b.anno')
    tcntobj = TermCounts(godag, _get_id2gos(fin_anno, godag, name2go))
    # Expected annotation counts per named GO term.
    for name, exp_cnt in (('I', 20), ('L', 21), ('M', 20), ('N', 20)):
        assert tcntobj.gocnts[name2go[name]] == exp_cnt
def test_semantic_similarity():
    """Test faster version of semantic similarity (yangRWC fig1a counts)."""
    godag = GODag(os.path.join(REPO, 'tests/data/yangRWC/fig1a.obo'))
    name2go = {rec.name: rec.item_id for rec in godag.values()}
    fin_anno = os.path.join(REPO, 'tests/data/yangRWC/fig1a.anno')
    tcntobj = TermCounts(godag, _get_id2gos(fin_anno, godag, name2go))
    # Every checked term in fig1a is expected to have exactly 50 annotations.
    for name in ('I', 'L', 'M', 'N'):
        assert tcntobj.gocnts[name2go[name]] == 50
def __init__(self, objcli, godag_version):
# Build grouping/reporting helpers from a finished GOEA run (`objcli`).
# NOTE(review): enclosing class header is outside this view; `objcli` is
# presumably the command-line GOEA results object — confirm against callers.
# _goids = set(o.id for o in godag.values() if not o.children)
# GO IDs taken from every result row, not only significant ones.
_goids = set(r.GO for r in objcli.results_all)
# Term counts over the run's associations, used for annotation info in the subdag.
_tobj = TermCounts(objcli.godag, objcli.objgoeans.get_assoc())
# pylint: disable=line-too-long
self.gosubdag = GoSubDag(_goids, objcli.godag, relationships=True, tcntobj=_tobj, prt=sys.stdout)
# Default grouping built from the user-specified GO slim (objcli.args.goslim).
self.grprdflt = GrouperDflts(self.gosubdag, objcli.args.goslim)
self.hdrobj = HdrgosSections(self.grprdflt.gosubdag, self.grprdflt.hdrgos_dflt, objcli.sections)
self.pval_fld = objcli.get_pval_field() # primary pvalue of interest
# Version strings reported in output: GO DAG, GO slims, and section config.
self.ver_list = [godag_version,
self.grprdflt.ver_goslims,
"Sections: {S}".format(S=objcli.args.sections)]
# self.objaartall = self._init_objaartall()