I have taken Generic Human’s answer, slightly modified it to solve your problem.
You need to copy these 125k words, sorted by frequency into a text file, name the file words-by-frequency.txt.
from math import log
# Build a cost dictionary, assuming Zipf's law and cost = -math.log(probability).
with open("words-by-frequency.txt") as f:
words = [line.strip() for line in f.readlines()]
wordcost = dict((k, log((i+1)*log(len(words)))) for i,k in enumerate(words))
maxword = max(len(x) for x in words)
def infer_spaces(s):
"""Uses dynamic programming to infer the location of spaces in a string
without spaces."""
# Find the best match for the i first characters, assuming cost has
# been built for the i-1 first characters.
# Returns a pair (match_cost, match_length).
def best_match(i):
candidates = enumerate(reversed(cost[max(0, i-maxword):i]))
return min((c + wordcost.get(s[i-k-1:i], 9e999), k+1) for k,c in candidates)
# Build the cost array.
cost = [0]
for i in range(1,len(s)+1):
c,k = best_match(i)
cost.append(c)
# Backtrack to recover the minimal-cost string.
out = []
i = len(s)
while i>0:
c,k = best_match(i)
assert c == cost[i]
out.append(s[i-k:i])
i -= k
return " ".join(reversed(out))
Running the function with the input:
messy_txt = "T he administrati on is doing bad things, and not fulfilling what it prom ised"
print(infer_spaces(messy_txt.lower().replace(' ', '').replace(',', '')).capitalize())
The administration is doing bad things and not fulfilling what it promised
>>>
Edit: The code below doesn't require the text file and works just for your input i.e., "T he administrati on is doing bad things, and not fulfilling what it prom ised"
from math import log
# Build a cost dictionary, assuming Zipf's law and cost = -math.log(probability).
words = ["the", "administration", "is", "doing", "bad",
"things", "and", "not", "fulfilling", "what",
"it", "promised"]
wordcost = dict((k, log((i+1)*log(len(words)))) for i,k in enumerate(words))
maxword = max(len(x) for x in words)
def infer_spaces(s):
"""Uses dynamic programming to infer the location of spaces in a string
without spaces."""
# Find the best match for the i first characters, assuming cost has
# been built for the i-1 first characters.
# Returns a pair (match_cost, match_length).
def best_match(i):
candidates = enumerate(reversed(cost[max(0, i-maxword):i]))
return min((c + wordcost.get(s[i-k-1:i], 9e999), k+1) for k,c in candidates)
# Build the cost array.
cost = [0]
for i in range(1,len(s)+1):
c,k = best_match(i)
cost.append(c)
# Backtrack to recover the minimal-cost string.
out = []
i = len(s)
while i>0:
c,k = best_match(i)
assert c == cost[i]
out.append(s[i-k:i])
i -= k
return " ".join(reversed(out))
messy_txt = "T he administrati on is doing bad things, and not fulfilling what it prom ised"
print(infer_spaces(messy_txt.lower().replace(' ', '').replace(',', '')).capitalize())
The administration is doing bad things and not fulfilling what it promised
>>>
I have just tried the above edit at repl.it and it printed the output as shown.