Source code for pyamg.graph

"""Algorithms related to graphs."""

from warnings import warn
import numpy as np
from scipy import sparse

from . import amg_core


[docs] def asgraph(G): """Return (square) array as sparse. Parameters ---------- G : sparray Sparse matrix. Returns ------- csr_array or csc_array Converted array. """ if not sparse.issparse(G) or G.format not in ('csc', 'csr'): G = sparse.csr_array(G) if G.shape[0] != G.shape[1]: raise ValueError('expected square matrix') return G
[docs] def maximal_independent_set(G, algo='serial', k=None): """Compute a maximal independent vertex set for a graph. Parameters ---------- G : sparray Symmetric matrix, preferably in sparse CSR or CSC format. The nonzeros of G represent the edges of an undirected graph. algo : {'serial', 'parallel'} Algorithm used to compute the MIS * serial : greedy serial algorithm * parallel : variant of Luby's parallel MIS algorithm k : int Minimum separation between MIS vertices. Returns ------- array - ``S[i] = 1`` if vertex i is in the MIS. - ``S[i] = 0`` otherwise. Notes ----- Diagonal entries in the G (self loops) will be ignored. Luby's algorithm is significantly more expensive than the greedy serial algorithm. """ G = asgraph(G) N = G.shape[0] mis = np.empty(N, dtype='intc') mis[:] = -1 if k is None: if algo == 'serial': fn = amg_core.maximal_independent_set_serial fn(N, G.indptr, G.indices, -1, 1, 0, mis) elif algo == 'parallel': fn = amg_core.maximal_independent_set_parallel fn(N, G.indptr, G.indices, -1, 1, 0, mis, np.random.rand(N), -1) else: raise ValueError(f'Unknown algorithm ({algo})') else: fn = amg_core.maximal_independent_set_k_parallel fn(N, G.indptr, G.indices, k, mis, np.random.rand(N), -1) return mis
[docs] def vertex_coloring(G, method='MIS'): """Compute a vertex coloring of a graph. Parameters ---------- G : sparray Symmetric matrix, preferably in sparse CSR or CSC format The nonzeros of G represent the edges of an undirected graph. method : str Algorithm used to compute the vertex coloring: * 'MIS' - Maximal Independent Set * 'JP' - Jones-Plassmann (parallel) * 'LDF' - Largest-Degree-First (parallel) Returns ------- array An array of vertex colors (integers beginning at 0). Notes ----- Diagonal entries in the G (self loops) will be ignored. """ G = asgraph(G) N = G.shape[0] coloring = np.empty(N, dtype='intc') if method == 'MIS': fn = amg_core.vertex_coloring_mis fn(N, G.indptr, G.indices, coloring) elif method == 'JP': fn = amg_core.vertex_coloring_jones_plassmann fn(N, G.indptr, G.indices, coloring, np.random.rand(N)) elif method == 'LDF': fn = amg_core.vertex_coloring_LDF fn(N, G.indptr, G.indices, coloring, np.random.rand(N)) else: raise ValueError(f'Unknown method ({method})') return coloring
[docs] def bellman_ford(G, centers, method='standard', tiebreaking=True): """Bellman-Ford iteration. Parameters ---------- G : sparray Directed graph with positive weights. centers : list Starting centers or source nodes. method : string - 'standard': base implementation of Bellman-Ford. - 'balanced': a balanced version of Bellman-Ford. tiebreaking : bool Tie break flag if ``method='balanced'``. Returns ------- array Distance of each point to the nearest center. array Index of the nearest center. array Predecessors in the array. See Also -------- pyamg.amg_core.bellman_ford scipy.sparse.csgraph.bellman_ford Notes ----- This should be viewed as the transpose of Bellman-Ford in scipy.sparse.csgraph. Here, bellman_ford is used to find the shortest path from any point *to* the seeds. In csgraph, bellman_ford is used to find "the shortest distance from point i to point j". So csgraph.bellman_ford could be run `for seed in seeds`. Also note that ``test_graph.py`` tests against ``csgraph.bellman_ford(G.T)``. """ G = asgraph(G) n = G.shape[0] if G.nnz > 0: if G.data.min() < 0: raise ValueError('Bellman-Ford is defined only for positive weights.') if G.dtype == complex: raise ValueError('Bellman-Ford is defined only for real weights.') centers = np.asarray(centers, dtype=np.int32) num_clusters = len(centers) # allocate space for returns and working arrays distances = np.full(n, np.inf, dtype=G.dtype) # distance to cluster center (inf) nearest = np.full(n, -1, dtype=np.int32) # nearest center, cluster membership (-1) predecessors = np.full(n, -1, dtype=np.int32) # predecessor on the shortest path (-1) distances[centers] = 0 # distance = 0 at centers nearest[centers] = np.arange(num_clusters) # number the membership if method == 'standard': amg_core.bellman_ford(n, G.indptr, G.indices, G.data, centers, # IN distances, nearest, predecessors) # OUT elif method == 'balanced': predecessors_count = np.full(n, 0, dtype=np.int32) cluster_size = np.full(num_clusters, 1, dtype=np.int32) amg_core.bellman_ford_balanced(n, G.indptr, G.indices, G.data, centers, # IN distances, nearest, predecessors, # OUT predecessors_count, cluster_size, # OUT tiebreaking) else: raise ValueError(f'Method {method} is not supported in Bellman-Ford') return distances, nearest, predecessors
[docs] def lloyd_cluster(G, centers, maxiter=5): """Perform Lloyd clustering on graph with weighted edges. Parameters ---------- G : csr_array, csc_array A sparse matrix of size (n,n) where each nonzero entry G[i,j] is the distance between nodes i and j. centers : int array If centers is an integer, then its value determines the number of clusters. Otherwise, centers is an array of unique integers between 0 and n-1 that will be used as the initial centers for clustering. maxiter : int Maximum number of iterations. Returns ------- int array Id of each cluster of points. int array Index of each seed. Notes ----- If G has complex values, abs(G) is used instead. Only positive edge weights may be used """ G = asgraph(G) n = G.shape[0] # complex dtype if G.dtype.kind == 'c': G = np.abs(G) if G.nnz > 0: if G.data.min() < 0: raise ValueError('Lloyd Clustering is defined only for positive weights.') if np.isscalar(centers): centers = np.random.permutation(n)[:centers] centers = np.int32(centers) else: centers = np.asarray(centers, dtype=np.int32) num_clusters = len(centers) if num_clusters < 1: raise ValueError('at least one center is required') if centers.min() < 0: raise ValueError(f'invalid center index {centers.min()}') if centers.max() >= n: raise ValueError(f'invalid center index {centers.max()}') distances = np.full(n, np.inf, dtype=G.dtype) clusters = np.full(n, -1, dtype=np.int32) predecessors = np.full(n, -1, dtype=np.int32) distances[centers] = 0 clusters[centers] = np.arange(num_clusters) changed = True it = 0 _dist, _near, _pred = bellman_ford(G, centers, method='standard') while changed and it < maxiter: if it > 0: distances.fill(np.inf) clusters.fill(-1) predecessors.fill(-1) distances[centers] = 0 clusters[centers] = np.arange(num_clusters) amg_core.bellman_ford(n, G.indptr, G.indices, G.data, centers, # IN distances, clusters, predecessors) # OUT changed = amg_core.most_interior_nodes(n, G.indptr, G.indices, G.data, centers, distances, clusters, predecessors) it += 1 return clusters, centers
[docs] def balanced_lloyd_cluster(G, centers, maxiter=5, rebalance_iters=5, tiebreaking=True): """Perform Lloyd clustering on graph with weighted edges. Parameters ---------- G : csr_array, csc_array A sparse nxn matrix where each nonzero entry G[i,j] is the distance between nodes i and j. centers : int array If centers is an integer, then its value determines the number of clusters. Otherwise, centers is an array of unique integers between 0 and n-1 that will be used as the initial centers for clustering. maxiter : int Number of bellman_ford_balanced->center_nodes iterations to run within the clustering. rebalance_iters : int Number of post-Lloyd rebalancing iterations to run. tiebreaking : bool, default True Flag for triggering tiebreaking. Returns ------- clusters : int array id of each cluster of points centers : int array index of each center Notes ----- - If G has complex values, abs(G) is used instead. - Only positive edge weights may be used - This version computes improved cluster centers with Floyd-Warshall and also uses a balanced version of Bellman-Ford to try and find nearly-equal-sized clusters. - Repeated calls to bellman_ford_balanced() in the rebalance loop can result in different centers. This is due to the tie-breaker based on aggregate size in bellman_ford_balanced(). Alternatively, the graph can be seeded with a small random number to make the edge lengths (and distances) unique. """ G = asgraph(G) n = G.shape[0] # complex dtype if G.dtype.kind == 'c': G = np.abs(G) if G.nnz > 0: if G.data.min() < 0: raise ValueError('Lloyd Clustering is defined only for positive weights.') if np.isscalar(centers): centers = np.random.permutation(n)[:centers] # same as np.random.choice() centers = np.int32(centers) else: centers = np.asarray(centers, dtype=np.int32) num_clusters = len(centers) if num_clusters < 1: raise ValueError('at least one center is required') if centers.min() < 0: raise ValueError(f'invalid center index {centers.min()}') if centers.max() >= n: raise ValueError(f'invalid center index {centers.max()}') # create work arrays for C++ # empty() values are initialized in the kernel maxsize = int(12*np.ceil(n / num_clusters)) d = np.full(n, np.inf, dtype=G.dtype) # distance to cluster center (inf) m = np.full(n, -1, dtype=np.int32) # cluster membership or index (-1) p = np.full(n, -1, dtype=np.int32) # predecessor on the shortest path (-1) pc = np.full(n, 0, dtype=np.int32) # predecessor count (0) s = np.full(num_clusters, 1, dtype=np.int32) # cluster size (1) d[centers] = 0 # distance = 0 at centers m[centers] = np.arange(num_clusters) # number the membership Cptr = np.empty(num_clusters, dtype=np.int32) # ptr to start in C for each cluster CC = np.empty(n, dtype=np.int32) # FW global index for current cluster D = np.empty(maxsize*maxsize, dtype=G.dtype) # FW distance array P = np.empty(maxsize*maxsize, dtype=np.int32) # FW predecessor array L = np.empty(n, dtype=np.int32) # FW local index for current cluster q = np.empty(maxsize, dtype=G.dtype) # FW work array for d**2 # global work array for distances dist_all = np.empty((num_clusters, maxsize*maxsize), dtype=G.dtype, order='C') for riter in range(rebalance_iters+1): # lloyd cluster balanced it = 0 changed1 = True changed2 = True # reinitialize d.fill(np.inf) m.fill(-1) p.fill(-1) p[centers] = centers pc.fill(0) pc[centers] = 1 s.fill(1) d[centers] = 0 m[centers] = np.arange(num_clusters) while (changed1 or changed2) and (it < maxiter): changed1 = amg_core.bellman_ford_balanced(n, G.indptr, G.indices, G.data, centers, d, m, p, pc, s, tiebreaking) if s.max() > maxsize: raise ValueError('maxsize (maximum cluster size) is too small') if m.min() < 0 or d.min() < 0: raise ValueError('Encountered a disconnected nodes from m or d: ' f'{m.min()=} {d.min()=})') changed2 = amg_core.center_nodes(n, G.indptr, G.indices, G.data, Cptr, D, P, CC, L, q, centers, d, m, p, pc, s) it += 1 # slow list version: [len(np.where(p==i)[0]) for i in range(len(p))] truepc = np.bincount(p[p > -1], minlength=len(p)) if np.count_nonzero(truepc - pc): raise ValueError('Predecessor count is incorrect.') # don't rebalance on last pass if riter == rebalance_iters: break # don't rebalance a single cluster if num_clusters < 2: break # calculate distances dist_all.fill(np.inf) for a in range(num_clusters): N = s[a] # cluster size Nloc = Cptr[a]+N if Nloc >= G.shape[0]: Nloc = None P.fill(-1) amg_core.floyd_warshall(G.shape[0], G.indptr, G.indices, G.data, dist_all[a, :].ravel(), P, CC[Cptr[a]:Nloc], L, m, a, N) # rebalance centers, rebalance_change = _rebalance(G, centers, m, d, dist_all, num_clusters) # rebalance did nothing if not rebalance_change: break return m, centers
[docs] def _rebalance(G, c, m, d, dist_all, num_clusters): """Rebalance clusters. Parameters ---------- G : sparray Sparse graph. c : array List of centers. m : array Cluster membership. d : array Distance to cluster center. dist_all : array Node-to-node distance for every node in each cluster. num_clusters : int Number of clusters (= number centers). Return ------ array List of new centers. bool Indicate whether centers has changed. """ newc = c.copy() # aggregate-to-aggregate neighbors AggOp = sparse.coo_array((np.ones(len(m)), (np.arange(len(m)), m))).tocsr() Agg2Agg = AggOp.T @ G @ AggOp Agg2Agg = Agg2Agg.tocsr() # calculate elimination and split measures E = _elimination_penalty(G, m, d, dist_all, num_clusters) S, c1, c2 = _split_improvement(m, d, dist_all, num_clusters) # sort both ascending M = np.ones(num_clusters, dtype=bool) Esortidx = np.argsort(E) Ssortidx = np.argsort(S) i_e = 0 # elimination index i_s = num_clusters-1 # splitting index rebalance_change = False # 0, 1, ..., num_clusters-1 while i_e <= (num_clusters-1) and i_s >= 0: a_e = Esortidx[i_e] a_s = Ssortidx[i_s] if not M[a_e] or a_e == a_s: # is cluster a_e modifiable and distinct from a_s? i_e += 1 continue if not M[a_s]: # is cluster a_s modifiable? i_s -= 1 continue if E[a_e] > S[a_s]: break M[Agg2Agg[[a_e], :].indices] = False # neighbors of a_e M[Agg2Agg[[a_s], :].indices] = False # neighbors of a_s newc[a_e] = c1[a_s] # redefine centers newc[a_s] = c2[a_s] # redefine centers rebalance_change = True return newc, rebalance_change
[docs] def _elimination_penalty(A, m, d, dist_all, num_clusters): """Calculate elimination penalty. see _rebalance() """ Acol = A.tocsc() # pylint: disable=too-many-nested-blocks E = np.inf * np.ones(num_clusters) for a in range(num_clusters): E[a] = 0 Va = np.int32(np.where(m == a)[0]) N = len(Va) for iloc, _ in enumerate(Va): dmin = np.inf for jloc, j in enumerate(Va): for k in Acol[:, [j]].indices: if m[k] != m[j]: jiloc = jloc * N + iloc dmin = min(d[k] + A[k, j] + dist_all[a, jiloc], dmin) E[a] += dmin**2 E[a] -= np.sum(d[Va]**2) return E
[docs] def _split_improvement(m, d, dist_all, num_clusters): """Calculate split improvement. see _rebalance() """ S = np.inf * np.ones(num_clusters) I = -1 * np.ones(num_clusters, dtype=np.int32) # better cluster centers if split J = -1 * np.ones(num_clusters, dtype=np.int32) # better cluster centers if split for a in range(num_clusters): S[a] = np.inf Va = np.int32(np.where(m == a)[0]) N = len(Va) for iloc, i in enumerate(Va): for jloc, j in enumerate(Va): Snew = 0 for kloc, _ in enumerate(Va): ikloc = iloc * N + kloc jkloc = jloc * N + kloc if dist_all[a, ikloc] < dist_all[a, jkloc]: Snew = Snew + dist_all[a, ikloc]**2 else: Snew = Snew + dist_all[a, jkloc]**2 if Snew < S[a]: S[a] = Snew I[a] = i J[a] = j S[a] = np.sum(d[Va]**2) - S[a] return S, I, J
[docs] def _choice(p): """Random selection based on a distribution. Parameters ---------- p : array Probabilities [0,1], with sum(p) == 1. Return ------ int Index to a selected integer based on the distribution of p. Notes ----- For efficiency, there are no checks. """ a = p / np.max(p) i = -1 while True: i = np.random.randint(len(a)) if np.random.rand() < a[i]: break return i
[docs] def kmeanspp_seed(G, nseeds): """K-means++ seed. Parameters ---------- G : sparray Sparse graph on which to seed. nseeds : int Number of seeds. Return ------ array List of seeds. Notes ----- This is a reference algorithms, at O(n^3). TODO - needs testing """ warn('kmeanspp_seed is O(n^3) -- use only for testing') n = G.shape[0] C = np.random.choice(n, 1, replace=False) for _ in range(nseeds-1): d = sparse.csgraph.bellman_ford(G, directed=False, indices=C) d = d.min(axis=0) # shortest path from a seed d = d**2 # distance squared p = d / np.sum(d) # probability # newC = np.random.choice(n, 1, p=p, replace=False) # <- does not work properly newC = _choice(p) C = np.append(C, newC) return C
[docs] def connected_components(G): """Compute the connected components of a graph. The connected components of a graph G, which is represented by a symmetric sparse matrix, are labeled with the integers 0,1,..(K-1) where K is the number of components. Parameters ---------- G : symmetric matrix, preferably in sparse CSR or CSC format The nonzeros of G represent the edges of an undirected graph. Returns ------- ndarray An array of component labels for each vertex of the graph. Notes ----- If the nonzero structure of G is not symmetric, then the result is undefined. Examples -------- >>> from pyamg.graph import connected_components >>> print(connected_components( [[0,1,0],[1,0,1],[0,1,0]] )) [0 0 0] >>> print(connected_components( [[0,1,0],[1,0,0],[0,0,0]] )) [0 0 1] >>> print(connected_components( [[0,0,0],[0,0,0],[0,0,0]] )) [0 1 2] >>> print(connected_components( [[0,1,0,0],[1,0,0,0],[0,0,0,1],[0,0,1,0]] )) [0 0 1 1] """ G = asgraph(G) N = G.shape[0] components = np.empty(N, G.indptr.dtype) fn = amg_core.connected_components fn(N, G.indptr, G.indices, components) return components
[docs] def symmetric_rcm(A): """Symmetric Reverse Cutthill-McKee. Parameters ---------- A : sparray Sparse matrix. Returns ------- sparray Permuted matrix with reordering. See Also -------- pseudo_peripheral_node Notes ----- Get a pseudo-peripheral node, then call BFS. Examples -------- >>> from pyamg import gallery >>> from pyamg.graph import symmetric_rcm >>> n = 200 >>> density = 1.0/n >>> A = gallery.sprand(n, n, density, format='csr') >>> S = A + A.T >>> # try the visualizations >>> # import matplotlib.pyplot as plt >>> # plt.figure() >>> # plt.subplot(121) >>> # plt.spy(S,marker='.') >>> # plt.subplot(122) >>> # plt.spy(symmetric_rcm(S),marker='.') """ _dummy_root, order, _dummy_level = pseudo_peripheral_node(A) p = order[::-1] return A[p, :][:, p]
[docs] def pseudo_peripheral_node(A): """Find a pseudo peripheral node. Parameters ---------- A : sparray Sparse matrix. Returns ------- x : int Location of the node. order : array BFS ordering. level : array BFS levels. Notes ----- Algorithm in Saad. """ n = A.shape[0] valence = np.diff(A.indptr) # select an initial node x, set delta = 0 x = int(np.random.rand() * n) delta = 0 while True: # do a level-set traversal from x order, level = breadth_first_search(A, x) # select a node y in the last level with min degree maxlevel = level.max() lastnodes = np.where(level == maxlevel)[0] lastnodesvalence = valence[lastnodes] minlastnodesvalence = lastnodesvalence.min() y = np.where(lastnodesvalence == minlastnodesvalence)[0][0] y = lastnodes[y] # if d(x,y)>delta, set, and go to bfs above if level[y] > delta: x = y delta = level[y] else: return x, order, level
[docs] def metis_partition(G, nparts=5, seed=None): """Perform partitioning of graph with weighted edges using METIS. Parameters ---------- G : sparray A sparse n x n matrix where each nonzero entry G[i,j] is the distance between nodes i and j. G[i,j] is required to be integer. nparts : int Number of parts in the resulting partition. seed : int Random seed for METIS. Returns ------- array Array of n x 1 indices from 0 ... nparts-1. """ G = sparse.csr_array(G) if G.dtype.kind != 'i': raise ValueError('METIS partitioning requires integer weights') if G.nnz > 0: if G.data.min() < 0: raise ValueError('METIS partitioning requires positive integer weights.') if not isinstance(nparts, int) or nparts < 1: raise ValueError('nparts should be a positive integer') try: import pymetis # noqa: PLC0415 except ImportError as expt: raise ImportError('pymetis required for METIS partitioning') from expt # set diagonal to zero and force reallocation G = G.tocoo() G.setdiag(0) G = G.tocsr() # metis options opt = pymetis.Options() opt.contig = 1 if seed: opt.seed = seed _, parts = pymetis.part_graph(nparts, xadj=G.indptr, adjncy=G.indices, eweights=G.data, options=opt) return np.array(parts)