#!/usr/bin/python3 # -*- coding: utf-8 -*- import Graph as gr # Graph library from part 1 of the project def load_OBO(filename): """ parse the OBO file and returns the graph obsolete terms are discarded only is_a and part_of relationships are loaded Extract of a file to be parsed: [Term] id: GO:0000028 name: ribosomal small subunit assembly namespace: biological_process def: "The aggregation, arrangement and bonding together of constituent RNAs and proteins to form the small ribosomal subunit." [GOC:jl] subset: gosubset_prok synonym: "30S ribosomal subunit assembly" NARROW [GOC:mah] synonym: "40S ribosomal subunit assembly" NARROW [GOC:mah] is_a: GO:0022618 ! ribonucleoprotein complex assembly relationship: part_of GO:0042255 ! ribosome assembly relationship: part_of GO:0042274 ! ribosomal small subunit biogenesis """ def parseTerm(lines): # search for obsolete for l in lines: if l.startswith('is_obsolete: true'): return # otherwise create node id = lines.pop(0)[4:].rstrip() term = gr.add_node(g,id) term['id'] = id term['type'] = 'GOTerm' for line in lines: # attributes (name, namespace, def) if line.startswith('name: '): term['name'] = line[6:] elif line.startswith('namespace: '): term['namespace'] = line[11:] elif line.startswith('def: '): term['def'] = line[5:] elif line.startswith('alt_id: '): g['alt_id'][ line[8:] ] = id # alternate ids # relationships elif line.startswith('is_a:'): # is_a parent = line[6:line.index('!')].rstrip() e = gr.add_edge(g,id, parent) e['type'] = 'is_a' elif line.startswith('relationship: part_of '): # part_of line = line[line.index('GO:'):] dest = line[:line.index(' ')] e = gr.add_edge(g,id, dest) e['type'] = 'part_of' # g=gr.create_graph(directed=True, weighted=False) g['alt_id'] = {} # alternate GO ids with open(filename) as f: line = f.readline().rstrip() # skip header to reach 1st Term while not line.startswith('[Term]'): line = f.readline().rstrip() buff = [] line = f.readline() stop = False while line and not stop: # buffer lines until the next Term is found line = line.rstrip() # new Term if line.startswith('[Term]'): # next Term found: create corresponding node and edges in parseTerm and empty buffer parseTerm(buff) buff=[] # last Term elif line.startswith('[Typedef]'): parseTerm(buff) stop=True # or append to buffer else: buff.append(line) line = f.readline() return g def load_GOA(go, filename): """ parse GOA file and add annotated gene products to previsouly loaded graph go Extract of a file to be parsed: !gaf-version: 2.1 !GO-version: http://purl.obolibrary.org/obo/go/releases/2016-10-29/go.owl UniProtKB A5A605 ykfM GO:0006974 PMID:20128927 IMP P Uncharacterized protein YkfM YKFM_ECOLI|ykfM|b4586 protein taxon:83333 20100901 EcoCyc UniProtKB A5A605 ykfM GO:0016020 GO_REF:0000037 IEA C Uncharacterized protein YkfM YKFM_ECOLI|ykfM|b4586 protein taxon:83333 20161029 UniProt UniProtKB P00448 sodA GO:0004784 GO_REF:0000003 IEA EC:1.15.1.1 F Superoxide dismutase [Mn] SODM_ECOLI|sodA|JW3879|b3908 protein taxon:83333 20161029 UniProt UniProtKB P00393 ndh NOT GO:0005737 PMID:6784762 IDA C NADH dehydrogenase DHNA_ECOLI|ndh|JW1095|b1109 protein taxon:83333 20100621 EcoliWiki 0 1 2 3 4 5 6 7 8 9 10 id name go_id evidence-codes desc aliases """ names = {} go['names'] = names # gene names or gene product names (column 3) with open(filename) as f: line = f.readline() while line: if not line.startswith('!'): cols = line.rstrip().split('\t') id = cols[1] go_id = cols[4] if go_id not in go['nodes']: # GOTerm not found search alternate ids if go_id in go['alt_id']: # success go_id = go['alt_id'][go_id] # replace term else: # warn user print('Warning: could not attach a gene product (%s) to a non existing GO Term (%s)' % (id, go_id)) if go_id in go['nodes']: # create node for gene product if not already present if id not in go['nodes']: g = gr.add_node(go,id) g['id'] = id g['type'] = 'GeneProduct' names[cols[2]] = id # create or update gene product attributes gp = go['nodes'][id] gp['name'] = cols[2] gp['desc'] = cols[9] gp['aliases'] = cols[10] # attach gene product to GOTerm go_term = go['nodes'][go_id] e = gr.add_edge(go, id, go_id) e['type'] = 'annotation' if 'evidence-codes' not in e: e['evidence-codes'] = [] e['evidence-codes'].append( cols[6] ) else: # go_id or alt_id not found in GOTerms print('Error: could not attach a gene product (%s) to non existing GO Term (%s)' % (id, go_id)) line = f.readline() def max_depth(go, node): """ go graph traversal to find the longest path length from node (GOTerm) to a leaf (node with no successor) Returns the length of the longest path """ return ['unimplemented yet'] def GOTerms(go, gp, all=True, evidence_code=None): """ return the GOTerms associated to the provided gene product (gp) go: Gene Ontology graph gp: gene product all: if True, all the GOTerms and their ancestors will be return, otherwise only the GOTerms directly associated to the gene product will be returned. evidence_code: ignored for the moment Returns a list of GOTerms identifiers, e.g. ['GO:0005215','GO:0005515','GO:0006810','GO:0006974','GO:0008643'] """ return ['unimplemented yet'] def GeneProducts(go, term, all=True, evidence_code=None): """ return the gene products anotated by the provided GOTerm go: Gene Ontology graph term: GOTerm id all: if True, all the gene products directly and undirectly annotated (linked to a descendant of GOTerm) will be return, otherwise only the gene products directly associated to the GOTerm will be returned. evidence_code: ignored for the moment Returns a list of gene products identifiers, e.g. ['P0AAG5', 'P0AFY6', 'P10907', 'P16676', 'P23886'] """ return ['unimplemented yet'] ##### lib tests ##### if __name__ == "__main__": print('GeneOntology lib tests')