#!/usr/bin/python3 # -*- coding: utf-8 -*- import Graph as gr # Graph library from part 1 of the project def load_OBO(filename): # parse OBO file to create a DAG # obsolete terms are discarded # only is_a and part_of relationships are loaded # [Term] # id: GO:0000028 # name: ribosomal small subunit assembly # namespace: biological_process # def: "The aggregation, arrangement and bonding together of constituent RNAs and proteins to form the small ribosomal subunit." [GOC:jl] # subset: gosubset_prok # synonym: "30S ribosomal subunit assembly" NARROW [GOC:mah] # synonym: "40S ribosomal subunit assembly" NARROW [GOC:mah] # is_a: GO:0022618 ! ribonucleoprotein complex assembly # relationship: part_of GO:0042255 ! ribosome assembly # relationship: part_of GO:0042274 ! ribosomal small subunit biogenesis def parseTerm(lines): # search for obsolete for l in lines: if l.startswith('is_obsolete: true'): return # otherwise create node id = lines.pop(0)[4:].rstrip() term = gr.add_node(g,id) term['id'] = id term['type'] = 'GOTerm' for line in lines: # attributes (name, namespace, def) if line.startswith('name: '): term['name'] = line[6:] elif line.startswith('namespace: '): term['namespace'] = line[11:] elif line.startswith('def: '): term['def'] = line[5:] elif line.startswith('alt_id: '): g['alt_id'][ line[8:] ] = id # alternate ids # relationships elif line.startswith('is_a:'): # is_a parent = line[6:line.index('!')].rstrip() e = gr.add_edge(g,id, parent) e['type'] = 'is_a' elif line.startswith('relationship: part_of '): # part_of line = line[line.index('GO:'):] dest = line[:line.index(' ')] e = gr.add_edge(g,id, dest) e['type'] = 'part_of' # g=gr.create_graph(directed=True, weighted=False) g['alt_id'] = {} # alternate GO ids with open(filename) as f: line = f.readline().rstrip() # skip header to reach 1st Term while not line.startswith('[Term]'): line = f.readline().rstrip() buff = [] line = f.readline() stop = False while line and not stop: # buffer lines until the next Term is found line = line.rstrip() # new Term if line.startswith('[Term]'): # next Term found: create corresponding node and edges in parseTerm and empty buffer parseTerm(buff) buff=[] # last Term elif line.startswith('[Typedef]'): parseTerm(buff) stop=True # or append to buffer else: buff.append(line) line = f.readline() return g def load_GOA(go, filename): # !gaf-version: 2.1 # !GO-version: http://purl.obolibrary.org/obo/go/releases/2016-10-29/go.owl # UniProtKB A5A605 ykfM GO:0006974 PMID:20128927 IMP P Uncharacterized protein YkfM YKFM_ECOLI|ykfM|b4586 protein taxon:83333 20100901 EcoCyc # UniProtKB A5A605 ykfM GO:0016020 GO_REF:0000037 IEA C Uncharacterized protein YkfM YKFM_ECOLI|ykfM|b4586 protein taxon:83333 20161029 UniProt # UniProtKB P00448 sodA GO:0004784 GO_REF:0000003 IEA EC:1.15.1.1 F Superoxide dismutase [Mn] SODM_ECOLI|sodA|JW3879|b3908 protein taxon:83333 20161029 UniProt # UniProtKB P00393 ndh NOT GO:0005737 PMID:6784762 IDA C NADH dehydrogenase DHNA_ECOLI|ndh|JW1095|b1109 protein taxon:83333 20100621 EcoliWiki # 0 1 2 3 4 5 6 7 8 9 10 # id name go_id evidence-codes desc aliases names = {} go['names'] = names with open(filename) as f: line = f.readline() while line: if not line.startswith('!'): cols = line.rstrip().split('\t') id = cols[1] go_id = cols[4] if go_id not in go['nodes']: # GOTerm not found search alternate ids if go_id in go['alt_id']: # success go_id = go['alt_id'][go_id] # replace term else: # warn user print('Warning: could not attach a gene product (%s) to a non existing GO Term (%s)' % (id, go_id)) if go_id in go['nodes']: # create node for gene product if not already present if id not in go['nodes']: g = gr.add_node(go,id) g['id'] = id g['type'] = 'GeneProduct' names[cols[2]] = id # create or update gene product attributes gp = go['nodes'][id] gp['name'] = cols[2] gp['desc'] = cols[9] gp['aliases'] = cols[10] # attach gene product to GOTerm go_term = go['nodes'][go_id] e = gr.add_edge(go, id, go_id) e['type'] = 'annotation' if 'evidence-codes' not in e: e['evidence-codes'] = [] e['evidence-codes'].append( cols[6] ) else: # go_id or alt_id not found in GOTerms print('Error: could not attach a gene product (%s) to non existing GO Term (%s)' % (id, go_id)) line = f.readline() ##### lib tests ##### if __name__ == "__main__": print('GeneOntology lib tests')