I'm trying to implement the depth-first search (DFS) algorithm for directed graphs as described in Cormen et al., Introduction to Algorithms (3rd ed.). Here is my implementation so far:

import pytest
from collections import OrderedDict
import copy


class Node(object):
    def __init__(self, color='white', parent=None, d=None, f=None):
        self.color = color
        self.parent = parent
        self.d = d          # Discovery time
        self.f = f          # Finishing time


class Graph(object):
    def __init__(self, edges, node_indices=None):
        self.edges = edges
        self.nodes = self.initialize_nodes(node_indices)
        self.adj = self.initialize_adjacency_list()

    def initialize_nodes(self, node_indices=None):
        if node_indices is None:
            node_indices = sorted(set(node for edge in self.edges for node in edge))
        return OrderedDict([(node_index, Node()) for node_index in node_indices])

    def initialize_adjacency_list(self):
        A = {node: [] for node in self.nodes}
        for edge in self.edges:
            u, v = edge
            A[u].append(v)
        return A

    def dfs(self):
        self.time = 0
        for u, node in self.nodes.items():
            if node.color == 'white':
                self.dfs_visit(u)

    def dfs_visit(self, u):
        self.time += 1
        self.nodes[u].d = self.time
        self.nodes[u].color = 'gray'
        for v in self.adj[u]:
            if self.nodes[v].color == 'white':
                self.nodes[v].parent = u
                self.dfs_visit(v)
        self.nodes[u].color = 'black'
        self.time += 1
        self.nodes[u].f = self.time

    @staticmethod
    def transpose(edges):
        return [(v,u) for (u,v) in edges]

    def strongly_connected_components(self):
        self.dfs()
        finishing_times = {u: node.f for u, node in self.nodes.items()}
        self.__init__(self.transpose(self.edges))
        node_indices = sorted(finishing_times, key=finishing_times.get, reverse=True)
        self.nodes = self.initialize_nodes(node_indices)
        self.dfs()
        return self.trees()

    def trees(self):
        _trees = []
        nodes = copy.deepcopy(self.nodes)
        while nodes:
            for u, node in nodes.items():
                if node.parent is None:
                    _trees.append([u])
                    nodes.pop(u)
                else:
                    for tree in _trees:
                        if node.parent in tree:
                            tree.append(u)
                            nodes.pop(u)
        return _trees

To test that it works, I've taken an example from Figure 22.9 of the book:

[Figure 22.9: a directed graph whose strongly connected components are shaded gray]

After renaming the nodes a through h to 1 through 8, respectively, I ran the following test:

def test_strongly_connected_components():
    edges = [(1,2), (5,1), (2,5), (5,6), (2,6), (6,7), (7,6), (2,3), (3,7), (3,4), (4,3), (4,8), (7,8), (8,8)]
    graph = Graph(edges)
    assert graph.strongly_connected_components() == [[1, 5, 2], [3, 4], [6, 7], [8]]


if __name__ == "__main__":
    pytest.main([__file__+"::test_strongly_connected_components", "-s"])

This test passes, confirming the gray-shaded SCCs in the figure.

For the 'real' exercise, however, I need to use an input file, SCC.txt, which contains 875,714 lines, each representing an edge as a head-tail pair of integers, and output the sizes of the five largest SCCs. To this end I tried the following test:

@pytest.fixture
def edges():
    with open('SCC.txt') as f:
        return [tuple(map(int, line.split())) for line in f.read().splitlines()]

def test_SCC_on_full_graph(edges):
    graph = Graph(edges)
    SCCs = graph.strongly_connected_components()
    print(sorted(map(len, SCCs), reverse=True))     # Read off the sizes of the largest SCCs


if __name__ == "__main__":
    pytest.main([__file__+"::test_SCC_on_full_graph", "-s"])

However, I run into a RuntimeError: maximum recursion depth exceeded in cmp:

_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

self = <scc.Graph object at 0x103253690>, u = 209099

    def dfs_visit(self, u):
            self.time += 1
            self.nodes[u].d = self.time
            self.nodes[u].color = 'gray'
            for v in self.adj[u]:
>                   if self.nodes[v].color == 'white':
E                   RuntimeError: maximum recursion depth exceeded in cmp

scc.py:53: RuntimeError
========================== 1 failed in 21.79 seconds ===========================

I've read about raising the limit with sys.setrecursionlimit, but this doesn't seem to be recommended practice. Other than that, I'm not sure how I could improve the code, as it implements the pseudocode from the book fairly literally. Any ideas on how I can overcome this error?

2 Answers

I managed to solve the problem using the threading library with an increased stack_size and recursion limit. Here is the full solution:

import sys
from collections import OrderedDict
import copy
import threading


class Node(object):
    def __init__(self, color='white', parent=None, d=None, f=None):
        self.color = color
        self.parent = parent
        self.d = d          # Discovery time
        self.f = f          # Finishing time


class Graph(object):
    def __init__(self, edges, node_indices=None):
        self.edges = edges
        self.nodes = self.initialize_nodes(node_indices)
        self.adj = self.initialize_adjacency_list()
        self.trees = dict()

    def initialize_nodes(self, node_indices=None):
        if node_indices is None:
            node_indices = sorted(set(node for edge in self.edges for node in edge))
        return OrderedDict([(node_index, Node()) for node_index in node_indices])

    def initialize_adjacency_list(self):
        A = {node: [] for node in self.nodes}
        for edge in self.edges:
            u, v = edge
            A[u].append(v)
        return A

    def dfs(self):
        self.time = 0
        for u, node in self.nodes.items():
            if node.color == 'white':
                self.dfs_visit(u, root=u)

    def dfs_visit(self, u, root=None):
        if u == root:
            self.trees[root] = set()
        self.time += 1
        self.nodes[u].d = self.time
        self.nodes[u].color = 'gray'
        for v in self.adj[u]:
            if self.nodes[v].color == 'white':
                self.nodes[v].parent = u
                self.trees[root].add(v)
                self.dfs_visit(v, root=root)
        self.nodes[u].color = 'black'
        self.time += 1
        self.nodes[u].f = self.time

    @staticmethod
    def transpose(edges):
        return [(v,u) for (u,v) in edges]

    def strongly_connected_components(self):
        self.dfs()
        finishing_times = {u: node.f for u, node in self.nodes.items()}
        self.__init__(self.transpose(self.edges))
        node_indices = sorted(finishing_times, key=finishing_times.get, reverse=True)
        self.nodes = self.initialize_nodes(node_indices)
        self.dfs()

        trees = copy.deepcopy(self.trees)
        for k, v in trees.items():
            v.add(k)
        return trees.values()


def edges():
    with open('SCC.txt') as f:
        return [tuple(map(int, line.split())) for line in f.read().splitlines()]

def SCC_on_full_graph():
    E = edges()
    graph = Graph(E)
    SCCs = graph.strongly_connected_components()

    SCC_sizes = sorted(list(map(len, SCCs)), reverse=True)
    print(SCC_sizes[:5])                            # Read off the size of the 5 largest SCCs


if __name__ == "__main__":
    threading.stack_size(67108864)      # 64 MiB stack for threads started after this call
    sys.setrecursionlimit(2**20)
    thread = threading.Thread(target=SCC_on_full_graph)
    thread.start()
    thread.join()                       # Wait for the DFS thread to finish
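
This works because threading.stack_size() only applies to threads started after the call, and the main thread's stack size is fixed by the operating system: the deeply recursive DFS therefore has to run in a freshly created worker thread with the enlarged 64 MiB stack, while sys.setrecursionlimit(2**20) lifts the interpreter's own recursion guard.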

The algorithm has to remain a DFS logically, but programmatically you can work around the recursion limit:

  1. Write the DFS in such a way that you can restart it from one of the top-level calls if it gets near the recursion limit (see the sketch below).

  2. Try using multiprocessing.

PS: Is it possible that infinite recursion occurs on the larger dataset, i.e. a logical error that only surfaces on larger input? If you have datasets of incremental sizes, you could also identify the limits of your implementation of the algorithm in Python.
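
To make suggestion 1 concrete, the usual way to sidestep the limit entirely is to make dfs_visit iterative. Below is a minimal sketch (hypothetical, written against the Graph class from the question) that keeps an explicit stack of (vertex, neighbor-iterator) pairs in place of the call stack and produces the same discovery and finishing times as the recursive version:

def dfs_visit(self, u):
    # Hypothetical drop-in replacement for Graph.dfs_visit: an explicit
    # stack of (vertex, iterator over its remaining neighbors) plays the
    # role of the call stack, so Python's recursion limit never applies.
    self.time += 1
    self.nodes[u].d = self.time
    self.nodes[u].color = 'gray'
    stack = [(u, iter(self.adj[u]))]
    while stack:
        w, neighbors = stack[-1]
        advanced = False
        for v in neighbors:
            if self.nodes[v].color == 'white':
                # Discover v and descend into it, as the recursive call would.
                self.nodes[v].parent = w
                self.time += 1
                self.nodes[v].d = self.time
                self.nodes[v].color = 'gray'
                stack.append((v, iter(self.adj[v])))
                advanced = True
                break
        if not advanced:
            # All of w's neighbors are explored: finish w and backtrack.
            stack.pop()
            self.nodes[w].color = 'black'
            self.time += 1
            self.nodes[w].f = self.time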
