import random
def skip():pass
class State:
states = []
def __init__(self, accepting=False):
self.n = len(State.states)
State.states.append(self)
self.accepting = accepting
self.transitions = {}
def __repr__(self):
return f'q_{self.n}'
def add(self, c, s):
self.transitions.setdefault(c, set()).add(s)
def __eq__(self, o):
return self.n == o.n
def __hash__(self):
return hash((State, self.n))
def epsilon_helper(self):
z = {self}
if '' in self.transitions:
for s in self.transitions.pop(''):
z.update(s.epsilon_helper())
self.epsilon_helper = lambda:z
return z
def remove_epsilon(self):
self.remove_epsilon = skip
for a in self.epsilon_helper():
a.remove_epsilon()
for x, S in a.transitions.items():
for b in list(S):
b.remove_epsilon()
for c in b.epsilon_helper():
self.add(x, c)
c.remove_epsilon()
def freeze(self):
self.freeze = lambda:None
for c, s in self.transitions.items():
for x in s:
x.freeze()
self.transitions[c] = frozenset(s)
def count_accepting(self):
x = self.accepting + sum(s.count_accepting() for S in self.transitions.values() for s in S)
self.count_accepting = lambda:x
return x
def accept(self, w):
if w == '': return self.accepting
return any(s.accept(w[1:]) for s in self.transitions.get(w[0],()))
def pick(self, i):
if self.accepting:
if i == 0: return ''
i -= 1
for j, s in self.transitions.items():
s = min(s)
x = s.count_accepting()
if i < x:
return j + s.pick(i)
i -= x
raise IndexError
def levenshtein(w, n):
A = set(w)
W = len(w)
states = [[State(j <= i) for j in range(len(w),-1,-1)] for i in range(n,-1,-1)]
for i, c in enumerate(w):
for j in range(n):
s = states[j][i]
for x in A:
s.add(x, states[j+1][i])
if x != c:
s.add(x, states[j+1][i+1])
s.add('', states[j+1][i+1])
s.add(c, states[j][i+1])
states[n][i].add(c, states[n][i+1])
for j in range(n):
s = states[j][W]
for x in A:
s.add(x, states[j+1][W])
s = states[0][0]
s.remove_epsilon()
s.freeze()
return s
def union(states):
d = {}
accepting = False
for s in states:
for c, S in s.transitions.items():
d[c] = d.get(c, frozenset()) | S
if s.accepting:
accepting = True
return d, accepting
def difference(q0, q1, D = None):
if D is None:
return difference(frozenset([q0]), frozenset([q1]), {})
if (q0, q1) in D:
return D[q0, q1]
d0, a0 = union(q0)
d1, a1 = union(q1)
Q0 = State(a0 and not a1)
D[q0, q1] = Q0
for i in set(d0) | set(d1):
Q0.transitions[i] = frozenset([difference(d0.get(i, frozenset()), d1.get(i, frozenset()), D)])
return Q0
def exact(w, n):
if n == 0: raise ValueError
return difference(levenshtein(w, n), levenshtein(w, n-1))
def f(w, n):
s = exact(w, n)
return s.pick(random.randrange(s.count_accepting()))
Attempt This Online!
My attempt at implementing user1502040's idea of sampling the DFA.
The algorithm runs in basically three stages:
- First it generates NFAs for
s,k and s,k-1 (where each accepts words with at most k or k-1 errors respectively)
- The NFAs have \$O(|s|\cdot k)\$ states, and building them takes time proportional to that (also proportional to the size of the alphabet)
- Then it uses the algorithm outlined in this cs.se answer to find a DFA for the difference of two NFAs
- This takes the majority of the time, but I'm not sure of the actual complexity. Given that the algorithm is basically the powerset construction, it's worst case exponential, but it appears to do better than that. (I found an algorithm that can find a DFA for a given word with
<=k errors that takes time linear with respect to the length of the word (for fixed k). See Schulz and Mihov (2002)).
- I tried graphing the number of states in the DFA for
s="1111011012" and k from 1 to 30 and it appeared linear (especially for k>10), but I also tried s="the quick brown fox jumps over the lazy dog" and k from 1 to 7 and it appeared exponential. **
- It counts the number of states, and picks a random index and chooses the path with that index in a preorder traversal of the DFA. (am I using my jargon right?)
- This takes time linear in the size of the DFA (using dynamic programming).
In practice, the program appears to run quite quickly; it takes about 5 seconds for s="the quick brown fox jumps over the lazy dog" and k=5.
** For some reason the image feature isn't working for me, so here is the data: [35, 101, 228, 436, 722, 1080, 1464, 1833, 2184, 2519, 2844, 3162, 3480, 3798, 4116, 4434, 4752, 5070, 5388, 5706, 6024, 6342, 6660, 6978, 7296, 7614, 7932, 8250, 8568] and [172, 669, 2579, 10226, 40094, 155722, 606044]