M1 BBS Graphes TP Dessin et Introduction iGraph


Using the iGraph library in R

For this, we will work with the graph shown opposite, in which the vertices correspond to genes from different prokaryotic organisms and the edges correspond to an isorthology relationship inferred between the genes.

Graph of the isorthologs of the binding proteins of ABC systems of subfamily 1 (sugar import), drawn with the Fruchterman-Reingold algorithm

Files in edgelist format:

First steps

The iGraph library provides a whole set of functions for processing and visualizing graphs. Today we will use its R interface. To load it:

library(igraph)

To load a graph (several formats are supported: edgelist, pajek, ...):

g = read.graph("Cleandb_Luca_1_S_1_1_65_Iso_Tr_1-CC1.tgr", directed=FALSE)

Check the function's help (?read.graph) to see the other supported formats.


To display it, you first have to compute its drawing (layout):

# either in one line, by passing the layout function:
plot(g, layout=layout.fruchterman.reingold)
 
# or by storing the layout in a variable:
g.FR = layout.fruchterman.reingold(g)
plot(g, layout=g.FR, vertex.size=3, vertex.label=NA)

Check the help of the plot.igraph and layout.fruchterman.reingold functions to see the options as well as the other available layout algorithms.

You will find the library documentation on its dedicated website. The R interface documentation is available online: http://igraph.sourceforge.net/doc/R/00Index.html

  • To get the list of vertices: V(g)
  • the list of edges: E(g)


  • What is the order of the graph? How many edges does it have? (see the example just below)
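
For example, a minimal check with igraph (vcount and ecount are standard igraph functions; the printed values depend on the graph that was loaded):

vcount(g) # graph order = number of vertices, same as length(V(g))
ecount(g) # number of edges, same as length(E(g))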


Labels can be assigned to the vertices: V(g)$name = vector_of_labels

Note: since the library is written in C and then interfaced with R, this creates at least one small inconvenience: indices in C run from 0 to n-1 whereas in R they run from 1 to n.

Load the vertex labels:

V(g)$name=as.character(read.table("Cleandb_Luca_1_S_1_1_65_Iso_Tr_1-CC1.cod")[,2])
plot(g, layout=g.FR, vertex.size=3, vertex.label=V(g)$name)

The parameters of igraph's plot function are described in the documentation.

Information can be stored on the graph, on the vertices and/or on the edges with the dedicated functions:

get.vertex.attribute(g, name="name")
get.vertex.attribute(g, name="name", index=10)
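
Attributes can be set in the same way with the corresponding setters (a minimal sketch; the attribute names "organism" and "description" used here are purely illustrative, not part of the exercise):

g = set.vertex.attribute(g, name="organism", value=substr(V(g)$name, 1, 4)) # illustrative vertex attribute
g = set.graph.attribute(g, name="description", value="isortholog graph")    # illustrative graph attribute
list.vertex.attributes(g)
list.edge.attributes(g)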


  • What is the diameter of the graph? (a sketch of the corresponding igraph calls follows this list)
  • List the articulation points (articulation.points)
  • Average shortest path length (unweighted) (average.path.length)
  • Display its canonical representation (canonical.permutation) and the corresponding adjacency matrix
  • List the maximal cliques (maximal.cliques)
  • List the connected components (clusters) before and after removing the articulation point
  • Breadth-first and depth-first traversals (graph.bfs and graph.dfs)
  • Get the line graph (line.graph)
  • Minimum spanning tree (minimum.spanning.tree)
  • Shortest paths: use the shortest.paths and get.shortest.paths functions to get the length of the shortest paths between all vertices, as well as the shortest path between the vertices BantA01.AAP27658.1 and HmarA01.YUFN.
  • The betweenness of an edge is the number of shortest paths going through that edge. Use the edge.betweenness function to compute this value and add it to the drawing.
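
A possible starting point for these questions, reusing the igraph functions named in the list above (only a sketch: root=1 for the traversals is an arbitrary choice, and the vertex names are assumed to have been loaded as shown earlier):

diameter(g)
articulation.points(g)
average.path.length(g)
# canonical representation and the corresponding adjacency matrix
cp = canonical.permutation(g)
gc = permute.vertices(g, cp$labeling)
get.adjacency(gc)
maximal.cliques(g)
clusters(g)
# traversals starting from the first vertex
graph.bfs(g, root=1)
graph.dfs(g, root=1)
line.graph(g)
minimum.spanning.tree(g)
# shortest path lengths between all pairs, and one specific shortest path
shortest.paths(g)
get.shortest.paths(g, from="BantA01.AAP27658.1", to="HmarA01.YUFN")
# edge betweenness, added to the drawing as edge labels
eb = edge.betweenness(g)
plot(g, layout=g.FR, vertex.size=3, vertex.label=NA, edge.label=round(eb))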

Removing the articulation point, extracting the induced subgraph, and displaying it before the connected-component analysis:

vids = which( V(g) != V(g)[ articulation.points(g) ])
g1 = induced.subgraph(g, vids )
g1.layout = g.FR[V(g) != V(g)[ articulation.points(g) ],] # to keep the same layout as g
plot(g1, vertex.size=3, layout=g1.layout, vertex.label=V(g1)$name, vertex.label.cex=.5)
clusters(g1)

Partitioning

Several methods are available to partition the graph into communities. You will use edge betweenness to partition the graph. This method successively removes the edges with the highest betweenness in order to split the graph into communities (clusters).

This method does not directly give the best partition: it only provides the hierarchical clustering. You therefore need to decide where to cut the tree to form the clusters. To do so, you have to find at which step of the hierarchical clustering the partition quality measure (here, the modularity) reaches its maximum.

# community detection with edge-betweenness
com = edge.betweenness.community(g)
par(cex=.5)
plot(as.dendrogram(com))
 
 
# find best partition
com$best_step=0
com$best_modularity = -Inf
for (i in 1:nrow(com$merges)) {
  ctm = cut_at(com, steps=i)
  mod = modularity(g, ctm)
  print(paste("step: ",i,", modularity:",mod))
  if (mod>com$best_modularity) {
    com$best_step = i
    com$best_modularity = mod
    com$membership=ctm
  }
}
 
# plot best partition
plot(g, layout=g.FR, vertex.color=com$membership, vertex.size=3, vertex.label=NA, main=paste("step",com$best_step,", modularity:",com$best_modularity))

High Dimensional Embedding

Now you will have to implement the HDE layout algorithm seen in the lecture, since it is not available in iGraph (a sketch of a possible layout.hde function is given after the pivot selection code below).

As a reminder, its principle is the following:

  • compute the shortest paths between every pair of vertices (use the shortest.paths(g) function)
  • determine the pivot vertices
    • the first one is chosen at random
    • the next ones are selected one after the other, each time maximizing their distance to the already selected pivots
  • perform a PCA on the coordinates of the graph's vertices along each of the pivot axes
  • project onto the first 2 or 3 principal components

Pivot selection:

pivot_selection = function(g, n.pivots) {
	g.sp = shortest.paths(g)
	g.pivots = c( 1 )
	g.others = 2:length(V(g))
	while ( length(g.pivots) < n.pivots ) {
		p.new.i = 1 # index in g.others
		p.new.best.i = g.others[1] # initialize with the index of the first non-pivot
		p.new.best.dist = 0
		# iterate over non pivots (g.others)
		while ( p.new.i <= length(g.others) ) {
			p.new = g.others[ p.new.i ]
			p.new.dist = 0
			for (p.old in g.pivots) { # add the shortest-path distance to each already selected pivot
				p.new.dist = p.new.dist + g.sp[ p.old, p.new ]
			}
			# keep it if it maximizes distance
			if (p.new.dist > p.new.best.dist) {
				p.new.best.dist = p.new.dist
				p.new.best.i = p.new.i
			}
			p.new.i = p.new.i + 1
		}
		# add best pivot to selected pivots
		g.pivots = c(g.pivots, g.others[p.new.best.i])
		# remove best pivot from non pivots
		g.others = g.others[ -p.new.best.i ]
	}
	g.pivots
}
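
Putting the remaining steps together, one possible sketch of a layout.hde function follows (assumptions made here, not imposed by the exercise: the graph is connected, prcomp performs the PCA, and 50 pivots are used by default):

layout.hde = function(g, ndim=2, n.pivots=min(50, length(V(g)))) {
	g.sp = shortest.paths(g)                # all pairwise shortest path lengths
	g.pivots = pivot_selection(g, n.pivots) # indices of the pivot vertices
	X = g.sp[ , g.pivots]                   # coordinates of each vertex on the pivot axes
	pca = prcomp(X, center=TRUE)            # PCA on these coordinates
	pca$x[ , 1:ndim]                        # projection onto the first ndim principal components
}

It is then used like the built-in layouts, e.g. plot(g, layout=layout.hde(g), vertex.size=3, vertex.label=NA).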


Compare the result obtained with the one from Fruchterman-Reingold, on the previous graph as well as on a regular 10x10 lattice generated with the following function:

lat=graph.lattice(length=10,dim=2)
# layout
lat.FR=layout.fruchterman.reingold(lat)
lat.HDE=layout.hde(lat)
# plot
plot(lat, layout=lat.FR, vertex.size=3, vertex.label=NA)
plot(lat, layout=lat.HDE, vertex.size=3, vertex.label=NA)

Increase the size of the lattice (for example to 30), and note the time taken to compute the drawing with the FR algorithm and with HDE.
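
For example, the computation times can be compared with system.time (base R); the 30x30 lattice below is just one possible size:

lat = graph.lattice(length=30, dim=2)
system.time( lat.FR <- layout.fruchterman.reingold(lat) )
system.time( lat.HDE <- layout.hde(lat) )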

Since a distance matrix is available, it is also possible to do multidimensional scaling (cf. the Traitement de données biologiques practical on multivariate analyses):

# MDS Layout
layout.mds = function(g, ndim=2) {
	g.sp = shortest.paths(g)
	fit <- cmdscale(g.sp,eig=TRUE, k=ndim) # k is the number of dimensions
	fit$points[,1:ndim]
}
lat.MDS=layout.mds(lat)
plot(lat, layout=lat.MDS, vertex.size=3, vertex.label=NA)

With HDE and MDS, the graph can also be drawn in 3 dimensions:

# lattice
lat=graph.lattice(length=10,dim=2)
lat.HDE=layout.hde(lat,ndim=3)
rglplot(lat, layout=lat.HDE, vertex.size=3, vertex.label=NA)
# cube
lat=graph.lattice(length=6,dim=3)
lat.HDE=layout.hde(lat,ndim=3)
rglplot(lat, layout=lat.HDE, vertex.size=3, vertex.label=NA)

Graph drawing

A phylogenetic tree retrieved from the PATRIC website. Download it in Newick format.
Toy example. Download it in SVG or Newick format.

In this part, we will see how to read/write a tree in Newick format. Then, how to display the tree with different algorithms:

  • as text, for example like a file-system tree listing
  • graphically


For this, we will use the trees shown opposite.


Loading a tree in Newick format

The first step is therefore to be able to load a Newick-format tree in a Python script.

To do this, you can complete the library below, which uses the Graph library you developed during the previous sessions to manipulate the trees:

#!/usr/bin/python3
# -*- coding: utf-8 -*-
 
from pprint import pprint
import Graph as gr
 
def loadNewick(filename):
	def parseNewick(g, text):
		i=0
		stack=[]
		idx=0
		current = { 'parent': None, 'index': 0, 'id': 'root', 'length': 1.0 }
		nodes = [ current ]
		while i < len(text):
			if text[i]==' ': # skip spaces
				i+=1
			if text[i]==';': # finished add root
				gr.addNode(g, current['id'], current)
				g['root'] = current['id']
				i+=1
			elif text[i]=='(': # new child
				idx += 1
				N = { 'parent': current['index'], 'index': idx, 'id': idx, 'length': 1.0 }
				nodes.append(N)
				stack.insert(0, current['index'])
				current = N
				i+=1
			elif text[i]==',': # end of label or branch length... next child incoming
				# create previous node
				gr.addNode(g, current['id'], current)
				parent = nodes[ stack[0] ] # parent is at the top of the stack
				idx += 1
				N = { 'parent': parent['index'], 'index': idx, 'id': idx, 'length': 1.0 }
				nodes.append(N)
				current = N
				i+=1
			elif text[i]==')': # new child complete
				gr.addNode(g, current['id'], current)
				current=nodes[ stack.pop(0) ]
				i+=1
			elif text[i]==':': # incoming branch length
				# parse number
				nbText = ''
				i+=1
				while text[i] in ['0','1','2','3','4','5','6','7','8','9','0','.','E','e','-']:
					nbText+=text[i]
					i+=1
				current['length'] = float(nbText)
			else: # maybe incoming node label
				label=''
				while not text[i] in [' ',')',':',',',';']:
					label+=text[i]
					i+=1
				current['id'] = label
		# all nodes created with correct label if provided, create edges
		for n in nodes:
			if n['parent'] is not None:
				gr.addEdge(g, nodes[ n['parent'] ]['id'], n['id'], { 'length': n['length']} )
				n['parent'] = nodes[ n['parent'] ]['id'] # update parent id
			# clean up node attributes
			del n['index']
			del n['length']
	#
	g = gr.createGraph(directed=True, weighted=True)
	g['weight_attribute'] = 'length'
	g['rooted'] = True
	with open(filename) as f: 
		text = f.readline().rstrip()
		parseNewick(g,text)
	return g
 
 
##### main #####
if __name__ == "__main__":
	print('Tree lib tests')
	print("\n####")
	print(" # loadNewick('aTree.nwk')")
	t = loadNewick('aTree.nwk')

Display in text format

Add a function to display a tree like a file-system tree listing. The result for toyExample should look like the following:

|- A
|  |-0.4- F
|  |  |-0.2- B
|  |  |-0.4- J
|  |  |-0.2- E
|  |  |  |-0.2- C
|  |  |  |-0.4- D
|  |  |  |-0.2- H
|  |  |  |-0.4- I
|  |-0.2- G
|  |  |-0.2- K
|  |  |-0.4- L

Write a procedure to export the tree in Newick format.

Fruchterman-Reingold

#!/usr/bin/python3
# -*- coding: utf-8 -*-
 
from pprint import pprint
import numpy as np # for Floyd-Warshall matrices
from math import sqrt
from random import random
 
 
# TP1 functions
###############
 
def createGraph(directed = True, weighted = False): # TP1
	g = { 'nb_nodes': 0, 'nb_edges': 0, 'weight_attribute': None , 'nodes': {}, 'edges': {}, 'directed': directed, 'weighted': weighted }
	return g
 
def addNode(g, n, attributes = None): # TP1
	if n not in g['nodes']: # ensure node does not already exist
		if attributes is None: # create empty attributes if not provided
			attributes = {}
		g['nodes'][n] = attributes
		g['edges'][n] = {}
		g['nb_nodes'] += 1
	return g['nodes'][n]
 
def addEdge(g, n1, n2, attributes = None): # TP1
	# create nodes if they do not exist
	if n1 not in g['nodes']: addNode(g, n1) # ensure n1 exists
	if n2 not in g['nodes']: addNode(g, n2) # ensure n2 exists
	# add edges if they do not exist
	if n2 not in g['edges'][n1]:
		if attributes is None:
			attributes = {}
		g['edges'][n1][n2] = attributes
		if not g['directed']:
			g['edges'][n2][n1] = {}
			g['edges'][n2][n1] = g['edges'][n1][n2]
		g['nb_edges'] += 1
	return g['edges'][n1][n2]
 
def loadSIF(filename, directed=True): # TP1
	# line syntax: nodeD <relationship type> nodeE nodeF nodeB
	g = createGraph(directed)
	with open(filename) as f: # OPEN FILE
		# PROCESS THE REMAINING LINES
		row = f.readline().rstrip()
		while row:
			vals = row.split('\t')
			att = { 'type': vals[1] } # set edge type
			for i in range(2, len(vals)):
				addEdge(g, vals[0], vals[i], att)
			row = f.readline().rstrip()
	return g
 
#~ def printGraph(g): # TP1
	#~ print(' directed: ', g['directed'])
	#~ print(' weighted: ', g['weighted'])
	#~ for a in g['attributes']:
		#~ print( ' ', a, ': ', g['attributes'][a] , sep='')
	#~ print(' Nodes:')
	#~ for n in g['nodes']:
		#~ print('  ', n, ':')
		#~ for a in g['nodes'][n]['attributes']:
			#~ print('    ', a, ': ', a, sep='')
		#~ print('     neighbors:', list(g['edges'][n].keys()))
	#~ print(' Edges')
	#~ for n in g['nodes']:
		#~ for v in g['edges'][n]:
			#~ a = g['edges'][n][v]
			#~ print (' ', n, '->',v)
			#~ for a in g['edges'][n][v]:
				#~ print('   ',a, ':', g['edges'][n][v][a])
 
def dfs(g): # TP1
	a = { 'color': {}, 'predecessor': {}, 'time': 0, 'in': {}, 'out': {} }
	g['dfs'] = a
	for n in g['nodes']:
		a['color'][n] = 'white'
		a['predecessor'][n] = None
	for n in g['nodes']:
		if a['color'][n] == 'white':
			dfsVisit(g,n)
 
def dfsVisit(g,n): # TP1
	a = g['dfs']
	a['color'][n] = 'gray'
	a['time'] += 1
	a['in'][n] = a['time']
	for v in g['edges'][n]:
		# retrieve edge for classification (tree, forward, cross, back edge)
		e = g['edges'][n][v]
		if a['color'][v] == 'white':
			a['predecessor'][v] = n
			dfsVisit(g,v)
			e['dfs_type'] = 'tree'
		elif a['color'][v] == 'gray':
			e['dfs_type'] = 'back'
		else:  # black (either forward or cross, make use of dfs_in)
			if a['in'][n] > a['in'][v]:
				e['dfs_type'] = 'cross'
			else:
				e['dfs_type'] = 'forward'
	a['color'][n] = 'black'
	a['time'] += 1
	a['out'][n] = a['time']
 
def isAcyclic(g): # TP1
	dfs(g)
	for n in g['edges']:
		for v in g['edges'][n]:
			if g['edges'][n][v]['dfs_type'] == 'back':
				return False
	return True
 
def topologicalSort(g): # TP1
	dfs(g)
	a = g['dfs']['out']
	return [(k, a[k]) for k in sorted(a, key=a.get, reverse=True)]
 
 
############# TP 1 tests #############
######################################
def TP1():
	g = createGraph()
	pprint (g)
	addNode(g, 'A')
	addNode(g, 'B')
	addEdge(g, 'A', 'B')
	pprint (g)
 
	g = loadSIF('dressing.sif')
	pprint(g)
	#printGraph(g)
 
 
	dfs(g)
	pprint(g)
 
	print('Is it acyclic?', isAcyclic(g))
 
	for (k,v) in topologicalSort(g): print(k)
 
	g2 = loadSIF('directed.graph.with.cycle.slide29.sif')
	pprint(g2)
	print('Is g2 acyclic?', isAcyclic(g2))
 
	def dfs_stack(g, source):
		stack = [ source ]
		marked = {}
		i=0
		while len(stack) > 0:
			n = stack.pop()
			if n not in marked:
				i += 1
				marked[n] = i
				for v in g['edges'][n]:
					stack.append(v)
		print(marked)
	dfs_stack(g,'sous-vetements')
 
	print('nodes connected to WBBJ')
	g = loadSIF('String_EcolA_coexpression.sif', directed=False)
	a = { 'color': {}, 'predecessor': {}, 'time': 0, 'in': {}, 'out': {} }
	g['dfs'] = a
	for n in g['nodes']:
		a['color'][n] = 'white'
		a['predecessor'][n] = None
	dfsVisit(g,'WBBJ')
	for n in g['nodes']:
		if g['dfs']['color'][n] != 'white':
			pprint(n)
 
# TP2 #
#######
def bfs(g, source): # TP2
	a = { 'color': {}, 'distance': {}, 'predecessor': {} }
	g['bfs'] = a
	for n in g['nodes']:
		a['color'][n] = 'white'
		a['distance'][n] = float("inf")
		a['predecessor'][n] = None
	a['color'][source] = 'gray'
	a['distance'][source] = 0
	queue = [ source ]
	visited = []
	while len(queue) > 0:
		u = queue.pop(0) # dequeue from the front (FIFO) so the traversal is breadth-first
		visited.append(u)
		for v in g['edges'][u]:
			if a['color'][v] == 'white':
				a['color'][v] = 'gray'
				a['distance'][v] = a['distance'][u] + 1
				a['predecessor'][v] = u
				queue.append(v)
		a['color'][u] = 'black'
	return visited
 
def test_BFS():
	print ('#### BFS ####')
	g = loadSIF('dressing.sif')
	pprint(g)
	bfs(g, 'sous-vetements')
	pprint(g['bfs'])
	a = g['bfs']['distance']
	print( [ (k, a[k]) for k in sorted(a, key=a.get)] )
	# note: 'veste' appears right after 'chaussures' although it requires 'chemise', which has not been traversed
 
### Bellman-Ford ###
def loadTAB(filename): # TP2
	g = createGraph()
	with open(filename) as f: 
		# GET COLUMN NAMES
		tmp = f.readline().rstrip()
		attNames= tmp.split('\t')
		# REMOVE THE FIRST TWO COLUMNS, WHICH CORRESPOND TO THE LABELS OF THE CONNECTED VERTICES
		attNames.pop(0)
		attNames.pop(0)
		# PROCESS THE REMAINING LINES
		row = f.readline().rstrip()
		while row:
			vals = row.split('\t')
			v1 = vals.pop(0)
			v2 = vals.pop(0)
			att = {}
			for i in range(len(attNames)):
				att[ attNames[i] ] = vals[i]
			addEdge(g, v1, v2, att)
			row = f.readline().rstrip() # NEXT LINE
		return g
 
def BellmanFord_initializeSingleSource(g, source):
	a = { 'distance' : {}, 'predecessor': {} }
	g['Bellman-Ford'] = a
	for n in g['nodes']:
		a['distance'][n] = float("inf")
		a['predecessor'][n] = None
	a['distance'][source] = 0
 
def BellmanFord_relax(g, u, v, weight_attribute = None):
	w=1
	a = g['Bellman-Ford']
	if weight_attribute is not None: w = float(g['edges'][u][v][weight_attribute])
	if a['distance'][v] > a['distance'][u] + w:
		a['distance'][v] = a['distance'][u] + w
		a['predecessor'][v] = u # record the predecessor on the shortest path found so far
 
def BellmanFord(g, source, weight_attribute = None):
	if g['weighted'] and weight_attribute is None: weight_attribute = g['weight_attribute']
	BellmanFord_initializeSingleSource(g,source)
	for i in range( g['nb_nodes'] - 1):
		for n in g['nodes']:
			for v in g['edges'][n]:
				e = g['edges'][n][v]
				BellmanFord_relax(g, n, v, weight_attribute)
 
def test_BellmanFord():
	print('#### Bellman-Ford ####')
	g = loadTAB('Graphe.Bellman-Ford.tab')
	g['weighted'] = True
	g['weight_attribute'] = 'weight'
	pprint(g)
	BellmanFord(g, 'A')
	pprint(g['Bellman-Ford']['distance'])
 
# Floyd-Warshall
def adjacency_matrix(g):
	ids =  sorted( g['nodes'].keys() )
	#~ pprint( ids.index('A') )
	nb_ids = len(ids)
	M = np.full( ( nb_ids, nb_ids), np.inf)
	for i in ids:
		M[ ids.index(i), ids.index(i) ] = 0
		for j in ids:
			if i!=j:
				if j in g['edges'][i]:
					if g['weighted']: # weighted graph
						if g['weight_attribute'] is None:
							M[ ids.index(i), ids.index(j) ] = 1
						else:
							M[ ids.index(i), ids.index(j) ] = float(g['edges'][i][j][ g['weight_attribute'] ])
					else: # unweighted graph
						M[ ids.index(i), ids.index(j) ] = 1
	return M
 
def FloydWarshall(g):
	D = adjacency_matrix(g) # shortest distance matrix (initially the adjacency matrix)
	ids = sorted( g['nodes'] )
	n = len(ids) # graph order
	N = np.full( (n, n), -1, dtype = np.int_ ) # path matrix (next node to go to reach one node)
	# initialize direct neighbors
	for i in ids:
		for j in ids:
			if j in g['edges'][i] or i==j:
				N[ ids.index(i), ids.index(j) ] = ids.index(j)
	# compute shortest paths
	for k in range(n):
		for i in range(n):
			for j in range(n):
				if D[i,k] + D[k,j] < D[i,j]:
					D[i,j] = D[i,k] + D[k,j]
					N[i,j] = N[i,k]
	a = { 'D': D, 'N': N }
	g['Floyd-Warshall'] = a
 
def FloydWarshallPath(g, source, destination): 
	if 'Floyd-Warshall' not in g:
		FloydWarshall(g)
	ids = sorted(g['nodes'])
	if source not in ids:
		pprint(source + ' is not in the graph')
		return []
	if destination not in ids:
		pprint(destination + ' is not in the graph')
		return []
	if g['Floyd-Warshall']['D'][ ids.index(source), ids.index(destination) ] == np.inf:
		pprint('No path from '+source+' to '+destination)
		return []
	N = g['Floyd-Warshall']['N']
	path = [ source ]
	k = N[ ids.index(source), ids.index(destination) ]
	while k != ids.index(destination):
		path.append(ids[k])
		k = N[ k, ids.index(destination) ]
	path.append(destination)
	return path
 
def diameter(g):
	if not 'Floyd-Warshall' in  g: FloydWarshall(g)
	g['diameter']=np.max(g['Floyd-Warshall']['D'])
	return g['diameter']
 
def test_FloydWarshall():
	print('#### Floyd-Warshall ####')
	g = loadTAB('Graphe.Floyd-Warshall.tab')
	g['weighted'] = True
	g['weight_attribute'] = 'weight'
	FloydWarshall(g)
	pprint(g)
	path = FloydWarshallPath(g, 'A', 'B')
	pprint(path)
	pprint(FloydWarshallPath(g, 'A', 'C') )
	pprint(FloydWarshallPath(g, 'E', 'C') )
	pprint(FloydWarshallPath(g, 'E', 'A') )
	pprint(g['Floyd-Warshall']['D'])
	print('diameter:', diameter(g))
 
## TP2 tests #####
def TP2():
	print('########### TP2 ##############')
	#~ test_BFS()
	#~ test_BellmanFord()
	test_FloydWarshall()
 
#TP3
 
def layoutFruchtermanReingold(g, nb_iterations=500, cool=1.0, max_displacement=10.0, width = 800, height=600):
	"""
	The force directed layout algorithm adapted from Fruchterman-Reingold, 1991.
	"""
	layoutFruchtermanReingoldInit(g, cool, max_displacement, width, height)
	for i in range(nb_iterations):
		layoutFruchtermanReingoldIteration(g)
	# remove temp attributes
	for n in g['nodes']:
		del g['nodes'][n]['force_x']
		del g['nodes'][n]['force_y']
 
def layoutFruchtermanReingoldInit(g, cool=1.0, max_displacement=10.0, width = 800, height=600):
	"""
	Utility function (initialization) for the force directed layout algorithm adapted from Fruchterman-Reingold, 1991.
	"""
	k = sqrt(float(width)*height/g['nb_nodes'])/2 # optimal distance between 2 nodes
	fr = {}
	g['fr'] = fr
	fr['k'] = k
	fr['cool'] = cool
	fr['canvas_width'] = width
	fr['canvas_height'] = height
	fr['max_displacement'] = max_displacement
	# initialize node positions
	for nid in g['nodes']:
		n = g['nodes'][nid]
		n['x'] = random()*float(width)
		n['y'] = random()*float(height)
		n['force_x'] = float(0)
		n['force_y'] = float(0)
 
def layoutFruchtermanReingoldIteration(g):
	"""
	Utility function (an iteration for nodes displacements) for the force directed layout algorithm adapted from Fruchterman-Reingold, 1991.
	"""
	# calculate repulsive forces
	fr = g['fr']
	for v in g['nodes']:
		m = g['nodes'][v]
		m['force_x'] = float(0)
		m['force_y'] = float(0)
		for u in g['nodes']:
			n = g['nodes'][u]
			if u!=v:
				dx = m['x']-n['x']
				dy = m['y']-n['y']
				d = sqrt( dx**2 + dy**2 )
				# avoid too small distances
				if d < 0.1:
					dx = 0.1 * random() + 0.1
					dy = 0.1 * random() + 0.1
					d = sqrt( dx**2 + dy**2 )
				force = fr['k']**2/d
				m['force_x'] += force * dx / d
				m['force_y'] += force * dy / d
	# calculate attractive forces
	for u in g['nodes']:
		for v in g['edges'][u]:
			if u!=v: # do not process loops (division by 0)
				dx = g['nodes'][v]['x'] - g['nodes'][u]['x']
				dy = g['nodes'][v]['y'] - g['nodes'][u]['y']
				d = sqrt( dx**2 + dy**2 )
				force = d**2 / fr['k']
				g['nodes'][u]['force_x'] += force * dx / d
				g['nodes'][u]['force_y'] += force * dy / d
				g['nodes'][v]['force_x'] -= force * dx / d
				g['nodes'][v]['force_y'] -= force * dy / d
	# move nodes
	for n in g['nodes']:
		disp = sqrt( g['nodes'][n]['force_x']**2 + g['nodes'][n]['force_y']**2 )
		if disp > fr['max_displacement']:
			scale = sqrt( fr['max_displacement']**2 / disp**2 )
			g['nodes'][n]['force_x'] *= scale
			g['nodes'][n]['force_y'] *= scale
		g['nodes'][n]['x'] +=  g['nodes'][n]['force_x']
		g['nodes'][n]['y'] +=  g['nodes'][n]['force_y']
		g['nodes'][n]['x'] = min( fr['canvas_width'], max( 0, g['nodes'][n]['x'] ) ) # keep the node inside the canvas
		g['nodes'][n]['y'] = min( fr['canvas_height'], max( 0, g['nodes'][n]['y'] ) )
	# cooling
	fr['max_displacement'] *= fr['cool']
 
 
##### main #####
if __name__ == "__main__":
	g=loadSIF('dressing.sif')
	layoutFruchtermanReingold(g)
	pprint(g)


Appendices

HDE
grid30x50 Fruchterman-Reingold
grid30x50 HDE