#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Load the Gene Ontology from an OBO file into a dict-based graph.

Command-line usage::

    script.py go-basic.obo nodes
    script.py go-basic.obo edges [is_a|part_of]
"""

import re
import sys
from pprint import pprint

import numpy as np

#import Graph as gr # Graph library from part 1 of the project


# Patterns for the lines of a [Term] stanza. Lines are right-stripped before
# matching, so the relationship patterns accept either whitespace or the end
# of the line after the GO id (the trailing "! comment" is optional).
_RE_GO_ID = re.compile(r'^id:\s+(GO:\d+)\s*$')
_RE_GO_NAME = re.compile(r'^name:\s+(.+)\s*$')
_RE_GO_NAMESPACE = re.compile(r'^namespace:\s+(.+)\s*$')
_RE_GO_DEF = re.compile(r'^def:\s+"(.+)"\s.*$')
_RE_GO_IS_A = re.compile(r'^is_a:\s+(GO:\d+)(?:\s|$)')
_RE_GO_PART_OF = re.compile(r'^relationship:\s+part_of\s+(GO:\d+)(?:\s|$)')

# (node attribute key, pattern) pairs copied onto the term node.
_NODE_PATTERNS = (
    ('name', _RE_GO_NAME),
    ('namespace', _RE_GO_NAMESPACE),
    ('def', _RE_GO_DEF),
)

# (edge 'type' value, pattern) pairs turned into term -> parent edges.
_EDGE_PATTERNS = (
    ('is_a', _RE_GO_IS_A),
    ('part_of', _RE_GO_PART_OF),
)


def create_graph(directed = True, weighted = False, weight_attribute = None): # TP1
    """
    Create a dictionary representing an empty graph and return it.

    The graph holds the keys 'nodes' (node id -> attribute dict), 'edges'
    (source id -> {target id -> attribute dict}), 'nb_edges', 'directed',
    'weighted' and 'weight_attribute'.
    """
    return {
        'nodes': {},
        'edges': {},
        'nb_edges': 0,
        'directed': directed,
        'weighted': weighted,
        'weight_attribute': weight_attribute,
    }


def add_node(g, n, attributes = None): # TP1
    """
    Add node n (id given as a string or int) to graph g.

    Attributes may be supplied as a dict; they are ignored when the node
    already exists. Returns the attribute dict stored for n.
    """
    if n not in g['nodes']:
        g['nodes'][n] = {} if attributes is None else attributes
        g['edges'][n] = {}  # outgoing edges of the new node
    return g['nodes'][n]


def add_edge(g, n1, n2, attributes = None, n1_attributes = None, n2_attributes = None): # TP1
    """
    Add the edge n1 -> n2 to graph g, creating missing end nodes.

    Nothing is changed when the edge already exists. For an undirected graph
    the reverse entry n2 -> n1 shares the same attribute dict and the edge is
    counted once. Returns the attribute dict of the edge n1 -> n2.
    """
    # add_node is a no-op on nodes that already exist
    add_node(g, n1, n1_attributes)
    add_node(g, n2, n2_attributes)
    outgoing = g['edges'][n1]
    if n2 not in outgoing:
        outgoing[n2] = {} if attributes is None else attributes
        if not g['directed']:
            g['edges'][n2][n1] = outgoing[n2]  # same dict object as n1 -> n2
        g['nb_edges'] += 1
    return outgoing[n2]


def _parse_term(g, lines):
    """
    Add one [Term] stanza (its lines, without the '[Term]' header) to g.

    Blank stanzas and obsolete terms are ignored. Raises ValueError when the
    first line of a non-blank stanza is not an 'id: GO:...' line.
    """
    if not any(lines):
        return  # nothing but blank lines
    if any(line.startswith('is_obsolete: true') for line in lines):
        return
    id_match = _RE_GO_ID.match(lines[0])
    if id_match is None:
        raise ValueError(f"[Term] stanza does not start with a GO id line: {lines[0]!r}")
    go_id = id_match.group(1)

    go_node = add_node(g, go_id)
    go_node['id'] = go_id
    go_node['type'] = 'GOTerm'

    for line in lines[1:]:
        for key, pattern in _NODE_PATTERNS:
            found = pattern.match(line)
            if found:
                go_node[key] = found.group(1)
                break
        else:
            for rel_type, pattern in _EDGE_PATTERNS:
                found = pattern.match(line)
                if found:
                    # the parent node is created on the fly when not seen yet
                    add_edge(g, go_id, found.group(1))['type'] = rel_type
                    break


def load_OBO(filename):
    """
    Parse the OBO file and return the graph.

    Obsolete terms are discarded and only is_a and part_of relationships are
    loaded, as directed edges term -> parent carrying a 'type' attribute.
    Each loaded term node gets 'id' and 'type' ('GOTerm') plus 'name',
    'namespace' and 'def' when the stanza provides them. The graph also has
    an 'alt_id' dict, which this loader leaves empty.

    Reading stops at the first [Typedef] stanza or at the end of the file;
    a file without any [Term] stanza yields an empty graph.

    Extract of a file to be parsed:
    [Term]
    id: GO:0000028
    name: ribosomal small subunit assembly
    namespace: biological_process
    def: "The aggregation, arrangement and bonding together of constituent RNAs and proteins to form the small ribosomal subunit." [GOC:jl]
    subset: gosubset_prok
    synonym: "30S ribosomal subunit assembly" NARROW [GOC:mah]
    synonym: "40S ribosomal subunit assembly" NARROW [GOC:mah]
    is_a: GO:0022618 ! ribonucleoprotein complex assembly
    relationship: part_of GO:0042255 ! ribosome assembly
    relationship: part_of GO:0042274 ! ribosomal small subunit biogenesis
    """
    # directed graph plus additional graph attributes
    g = create_graph(directed=True, weighted=False)
    g['alt_id'] = {}  # alternate GO ids

    with open(filename, encoding='utf-8') as f:
        # skip the header, i.e. everything up to the first [Term] line
        for line in f:
            if line.startswith('[Term]'):
                break
        else:
            return g  # end of file reached without any term

        buff = []
        for line in f:
            line = line.rstrip()
            if line.startswith('[Term]'):
                # a new term starts: flush the previous one
                _parse_term(g, buff)
                buff = []
            elif line.startswith('[Typedef]'):
                break
            else:
                buff.append(line)
        # flush the last term, whether stopped by [Typedef] or by end of file
        _parse_term(g, buff)
    return g


def main(argv):
    """
    Print the ontology nodes or edges as tab-separated text.

    argv[1] is the OBO file, argv[2] is 'nodes' or 'edges'; with 'edges' an
    optional argv[3] ('is_a' or 'part_of') restricts the output to that
    relationship. Returns the process exit status.
    """
    if len(argv) < 3 or argv[2] not in ('nodes', 'edges'):
        print(f"usage: {argv[0] if argv else 'script'} <file.obo> nodes|edges [is_a|part_of]",
              file=sys.stderr)
        return 2

    go = load_OBO(argv[1])
    #pprint(go)
    if argv[2] == 'nodes':
        print("id\tdesc\tdef\tnamespace")
        # nodes that only appear as edge targets carry no attributes at all,
        # hence the dict key as id and empty strings for missing fields
        for node_id, n in go['nodes'].items():
            print(f"{node_id}\t{n.get('name', '')}\t{n.get('def', '')}\t{n.get('namespace', '')}")
    else:
        wanted = argv[3] if len(argv) > 3 else None  # None: no filtering
        print("term1\tterm2\trelationship")
        for t1, targets in go['edges'].items():
            for t2, attributes in targets.items():
                rel = attributes['type']
                if wanted is None or wanted == rel:
                    print(f"{t1}\t{t2}\t{rel}")
    return 0


if __name__ == "__main__":
    # argv[1] → go-basic.obo, argv[2] → nodes|edges→is_a/part_of
    sys.exit(main(sys.argv))