#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""Load Gene Ontology OBO and GOA annotation files into a Graph-library graph."""

import Graph as gr  # Graph library from part 1 of the project


def _target_id(text):
    """Return the first whitespace-delimited token of *text*, or None.

    Anything after a '!' (the trailing human-readable comment of an OBO
    tag) is ignored, so 'GO:0022618 ! some name' yields 'GO:0022618'.
    A line without any '!' comment is handled as well.
    """
    tokens = text.split('!', 1)[0].split()
    return tokens[0] if tokens else None


def _parse_term(g, lines):
    """Create the node and edges described by one buffered [Term] stanza.

    g     -- graph the node and edges are added to
    lines -- right-stripped lines of the stanza (without the '[Term]' line)

    Obsolete terms ('is_obsolete: true') and stanzas without an 'id:' line
    (e.g. an empty stanza) are skipped without touching the graph.
    Only 'is_a' and 'part_of' relationships are turned into edges.
    """
    if any(line.startswith('is_obsolete: true') for line in lines):
        return

    # Locate the identifier instead of assuming it is the first buffered line.
    term_id = None
    for line in lines:
        if line.startswith('id:'):
            term_id = line[len('id:'):].strip()
            break
    if not term_id:
        return

    term = gr.add_node(g, term_id)
    term['id'] = term_id
    term['type'] = 'GOTerm'

    for line in lines:
        # attributes (name, namespace, def)
        if line.startswith('name:'):
            term['name'] = line[len('name:'):].strip()
        elif line.startswith('namespace:'):
            term['namespace'] = line[len('namespace:'):].strip()
        elif line.startswith('def:'):
            term['def'] = line[len('def:'):].strip()
        # relationships
        elif line.startswith('is_a:'):
            edge_type = 'is_a'
            target = _target_id(line[len('is_a:'):])
        elif line.startswith('relationship: part_of '):
            edge_type = 'part_of'
            target = _target_id(line[len('relationship: part_of '):])
        else:
            continue

        if line.startswith(('is_a:', 'relationship: part_of ')):
            if target is None:
                # relationship tag without a target: nothing to link to
                continue
            # NOTE(review): 'add_egde' is the spelling this file calls on the
            # Graph library; confirm against the library before renaming.
            e = gr.add_egde(g, term_id, target)
            e['type'] = edge_type


def loadOBO(filename):
    """Parse an OBO file and return it as a directed, unweighted graph.

    Obsolete terms are discarded and only is_a and part_of relationships
    are loaded. Everything before the first '[Term]' line is treated as
    header and ignored. Reading stops at the first '[Typedef]' line found
    after the first term, or at end of file; in both cases the term being
    buffered is still parsed. A file without any '[Term]' yields an empty
    graph.

    Example stanza:

        [Term]
        id: GO:0000028
        name: ribosomal small subunit assembly
        namespace: biological_process
        def: "The aggregation, arrangement and bonding together of ..." [GOC:jl]
        subset: gosubset_prok
        synonym: "30S ribosomal subunit assembly" NARROW [GOC:mah]
        is_a: GO:0022618 ! ribonucleoprotein complex assembly
        relationship: part_of GO:0042255 ! ribosome assembly
        relationship: part_of GO:0042274 ! ribosomal small subunit biogenesis
    """
    g = gr.createGraph(directed=True, weighted=False)
    with open(filename) as f:
        stanza = None  # None while still inside the file header
        for raw in f:
            line = raw.rstrip()
            if line.startswith('[Term]'):
                # next Term found: flush the previous one and start a new buffer
                if stanza is not None:
                    _parse_term(g, stanza)
                stanza = []
            elif stanza is None:
                # header line before the first Term
                continue
            elif line.startswith('[Typedef]'):
                # terms are over; the pending one is flushed below
                break
            else:
                stanza.append(line)
        # last Term (reached either through [Typedef] or end of file)
        if stanza is not None:
            _parse_term(g, stanza)
    return g


def loadGOA(go, filename):
    """Add the gene products and annotations of a GOA (GAF) file to *go*.

    go       -- graph to extend in place; its 'nodes' entry is used to look
                up nodes by identifier, and a 'names' entry mapping gene
                product name -> identifier is (re)created
    filename -- path of the tab-separated annotation file

    Lines starting with '!' and blank lines are ignored. For every other
    line a 'GeneProduct' node is created if missing, its name, desc and
    aliases are set, the GO term node is created if missing, and an
    'annotation' edge carrying the evidence code is added between them.
    Nothing is returned. A data line with fewer than 11 columns raises
    IndexError.

    Example content (columns are tab-separated in the file):

        !gaf-version: 2.1
        !GO-version: http://purl.obolibrary.org/obo/go/releases/2016-10-29/go.owl
        UniProtKB A5A605 ykfM GO:0006974 PMID:20128927 IMP P Uncharacterized protein YkfM YKFM_ECOLI|ykfM|b4586 protein taxon:83333 20100901 EcoCyc
        UniProtKB P00393 ndh NOT GO:0005737 PMID:6784762 IDA C NADH dehydrogenase DHNA_ECOLI|ndh|JW1095|b1109 protein taxon:83333 20100621 EcoliWiki

    Columns used (0-based): 1 id, 2 name, 4 go_id, 6 evidence code,
    9 desc, 10 aliases.
    """
    names = {}
    go['names'] = names
    with open(filename) as f:
        for raw in f:
            if raw.startswith('!') or not raw.strip():
                continue
            # strip only the line terminator so empty trailing columns survive
            cols = raw.rstrip('\r\n').split('\t')

            gp_id = cols[1]
            if gp_id not in go['nodes']:
                node = gr.add_node(go, gp_id)
                node['id'] = gp_id
                node['type'] = 'GeneProduct'
                names[cols[2]] = gp_id
            gp = go['nodes'][gp_id]
            gp['name'] = cols[2]
            gp['desc'] = cols[9]
            gp['aliases'] = cols[10]

            go_id = cols[4]
            if go_id not in go['nodes']:
                gr.add_node(go, go_id)

            # NOTE(review): 'add_egde' is the spelling this file calls on the
            # Graph library; confirm against the library before renaming.
            e = gr.add_egde(go, gp_id, go_id)
            e['type'] = 'annotation'
            if 'evidence-codes' not in e:
                e['evidence-codes'] = []
            e['evidence-codes'].append(cols[6])


##### lib tests #####
if __name__ == "__main__":
    print('GeneOntology lib tests')