Could you please help with speeding-up this code?
Input: UTF-8 text that was re-encoded 1–3 times: on each pass it was encoded and then decoded with a random encoding from a known pool. The original text was KOI8-R.
Output: UTF-8 text decoded
You can use only built-in libs or own code in one script
import sys
import timeit
from string import punctuation, digits, whitespace
# check if every input text is decoded from 1 to 3 times with encodings from encodingnames
def chain_decode(text):
    """Undo 1-3 rounds of encode/decode mangling and return the KOI8-R original.

    ``text`` is a str that was produced from a KOI8-R source by 1-3 rounds of
    ``s.encode(encA).decode(encB)`` with encodings drawn from a known pool.
    Returns the recovered text, or the string "ERROR" when no chain of up to
    three encoding pairs reproduces a plausible result.
    """
    # Known first/last tokens of a correctly decoded text.
    firstWord = "ПРОЦ"
    lastWord = "КНЦ;"
    # Only 4-character probes from both ends are used while searching, for speed;
    # the full text is converted once a candidate chain passes the probes.
    startInput = text[0:4]
    endInput = text[-4:]
    # Characters a correctly decoded probe may contain (uppercase Cyrillic only,
    # plus ASCII punctuation/digits/whitespace).
    allowchars = frozenset("АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ" + punctuation + digits + whitespace)
    if startInput == firstWord and endInput == lastWord:
        # Input is already decoded; nothing to do.
        return text
    # koi8_r is the known original encoding.
    firstWord = firstWord.encode("koi8_r")
    lastWord = lastWord.encode("koi8_r")
    encodingNames = "cp037 cp1006 cp1250 cp1251 cp1253 cp1254 cp1255 cp1256 cp1257 cp1258 cp437 cp720 cp737 cp775 cp850 cp852 cp855 cp864 cp866 cp869 cp874 cp875 hp_roman8 iso8859_10 iso8859_16 iso8859_4 iso8859_5 koi8_r latin_1 mac_croatian mac_greek mac_iceland mac_latin2".split()
    # Probe caches keyed by "encA encB"; used to assemble 2- and 3-pair chains.
    # (The original code also created an unused `midpairs` dict — removed.)
    startpairs, endpairs = dict(), dict()

    def isDecoded(word):
        # True when every character of the probe is plausible decoded text.
        return all(ch in allowchars for ch in word)

    def isFinal(word):
        # Tokens known to appear (or to never appear) in a fully decoded text.
        return "ОСОВЕННО" not in word and "ВЫВОД: " in word

    def decode():
        # try/except is used throughout to skip encoding combinations that
        # cannot round-trip the current probe (UnicodeEncode/DecodeError).
        # --- 1-pair case: text.encode(enc2).decode("koi8-r") ---
        for enc1 in encodingNames:
            try:
                # Same-codec round-trip verifies the probes are encodable in enc1.
                tmpfirst, tmplast = startInput.encode(enc1).decode(enc1), endInput.encode(enc1).decode(enc1)
                for enc2 in encodingNames:
                    if enc1 != enc2:
                        try:
                            subtmpfirst, subtmplast = tmpfirst.encode(enc2), tmplast.encode(enc2)
                            if isDecoded(subtmpfirst.decode("koi8-r")) and isDecoded(subtmplast.decode("koi8-r")):
                                res = text.encode(enc1).decode(enc1).encode(enc2).decode("koi8-r")
                                if isFinal(res):
                                    return res
                                # Cache the probe bytes for the multi-pair search.
                                endpairs[enc1 + ' ' + enc2] = (subtmpfirst, subtmplast)
                        except Exception:
                            pass
            except Exception:
                pass
        # --- candidate leading pairs, derived from the known start/end tokens ---
        for enc1 in encodingNames:
            try:
                tmpfirst, tmplast = firstWord.decode(enc1), lastWord.decode(enc1)
                for enc2 in encodingNames:
                    if enc1 != enc2:
                        try:
                            subtmpfirst, subtmplast = tmpfirst.encode(enc2), tmplast.encode(enc2)
                            # NB: key is deliberately "enc2 enc1" — the chain is
                            # applied in reverse when reconstructing the text.
                            startpairs[enc2 + ' ' + enc1] = (subtmpfirst, subtmplast)
                        except Exception:
                            pass
            except Exception:
                pass
        # --- 2-pairs case ---
        for firstpair in startpairs.keys():
            for lastpair in endpairs.keys():
                # Substring check: the start-token probe must appear inside the
                # corresponding end-probe bytes for the pair to be compatible.
                if startpairs[firstpair][0] in endpairs[lastpair][0] and startpairs[firstpair][1] in endpairs[lastpair][1]:
                    try:
                        # Variables for readability.
                        firstenc, secondenc = firstpair.split()
                        threeenc, fourenc = lastpair.split()
                        res = text.encode(firstenc).decode(firstenc).encode(secondenc).decode(threeenc).encode(
                            fourenc).decode("koi8-r")
                        if isFinal(res):
                            return res
                    except Exception:
                        pass
        # --- 3-pairs case: bridge a start pair and an end pair with one more ---
        for firstpair in startpairs.keys():
            try:
                tmpfirst, tmplast = startpairs[firstpair][0], startpairs[firstpair][1]
                for lastpair in endpairs.keys():
                    try:
                        subtmpfirst, subtmplast = endpairs[lastpair][0], endpairs[lastpair][1]
                        for enc1 in encodingNames:
                            try:
                                finaltmpfirst, finaltmplast = tmpfirst.decode(enc1), tmplast.decode(enc1)
                                for enc2 in encodingNames:
                                    if enc1 != enc2:
                                        try:
                                            finalsubtmpfirst, finalsubtmplast = finaltmpfirst.encode(
                                                enc2), finaltmplast.encode(enc2)
                                            if finalsubtmpfirst in subtmpfirst and finalsubtmplast in subtmplast:
                                                firstenc, secondenc = lastpair.split()
                                                threeenc, fourenc = enc2, enc1
                                                fiveenc, sixenc = firstpair.split()
                                                res = text.encode(firstenc).decode(firstenc).encode(secondenc).decode(
                                                    threeenc).encode(fourenc).decode(fiveenc).encode(sixenc).decode(
                                                    "koi8-r")
                                                if isFinal(res):
                                                    return res
                                        except Exception:
                                            pass
                            except Exception:
                                pass
                    except Exception:
                        pass
            except Exception:
                pass
        return "ERROR"
    return decode()
if __name__ == "__main__":
    # Read the whole mangled input from stdin as UTF-8.
    text = sys.stdin.buffer.read().decode("utf-8").strip()
    # Bug fix: the decoded result was computed but never emitted, while the
    # stated contract is "Output: UTF-8 text decoded".
    print(chain_decode(text))
    # print(timeit.timeit("chain_decode(text)", number=10, globals=globals()))
Is there a way to speed this up by 30–50%? I have a 2 s time limit for a small text in the worst case on a slow test machine. Simplification of the code structure is also appreciated. Example input:
ЪЫЩж ИЩЫЖРНЩ();
ЭвЭЩН: "ЦЩкОЮ ЫЖРЭУЮУн ЩЗЫЖРЩЦ ТЬХЩЭУн ХУмЧЩЬЮУ."
НЩ ЩЮ ЭЬО 0-8-6+4/ЭЬОЦ;
ЪЫЩж ОЬХУ 6()6=7+6()3::
ЭвЭЩН: "ЪЫУмОЦ 7."
ЭвЭЩН: "ЭОЬд НЫТИ ЪЫЩОФЮЖ ЭЬОЦ ОЦТ ЪЫУСЩНУЮЬн?"
ФЧж;
Example output:
ПРОЦ ГОРАЗДО();
ВЫВОД: "МОЖЕТ РАЗВИТИЯ ОБРАЗОМ УСЛОВИЯ ЛИЧНОСТИ."
ДО ОТ ВСЕ 0-8-6+4/ВСЕМ;
ПРОЦ ЕСЛИ 6()6=7+6()3::
ВЫВОД: "ПРИЧЕМ 7."
ВЫВОД: "ВЕСЬ ДРУГ ПРОЕКТА ВСЕМ ЕМУ ПРИХОДИТСЯ?"
КНЦ;
Example encoding chain:
text.encode('cp855').decode('cp855').encode('cp1251').decode('iso8859_5').encode('cp1251').decode('cp869').encode('mac_greek').decode('koi8-r')
Does `.encode(enc1).decode(enc1)` change the input in the same way? I could not find anything on this — it would not produce the same result as the input. \$\endgroup\$