Could you please help with speeding-up this code?
Input: UTF-8 text that was re-encoded 1–3 times: on each pass it was encoded and then decoded with a random encoding from a known pool. The original text was KOI8-R.
Output: UTF-8 text decoded
You can use only built-in libs or own code in one script
import sys
import timeit
from string import punctuation, digits, whitespace
# check if every input text is decoded from 1 to 3 times with encodings from encodingnames
def chain_decode(text):
    """Undo 1-3 rounds of encode/decode mangling and return the KOI8-R original.

    ``text`` is a str that was produced from a KOI8-R source by 1-3 rounds of
    ``s.encode(encA).decode(encB)`` with encodings drawn from a known pool.
    Returns the recovered text, or the string "ERROR" when no chain of up to
    three encoding pairs reproduces a plausible result.
    """
    # Known first/last tokens of a correctly decoded text.
    firstWord = "ПРОЦ"
    lastWord = "КНЦ;"
    # Only 4-character probes from both ends are used while searching, for speed;
    # the full text is converted once a candidate chain passes the probes.
    startInput = text[0:4]
    endInput = text[-4:]
    # Characters a correctly decoded probe may contain (uppercase Cyrillic only,
    # plus ASCII punctuation/digits/whitespace).
    allowchars = frozenset("АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ" + punctuation + digits + whitespace)
    if startInput == firstWord and endInput == lastWord:
        # Input is already decoded; nothing to do.
        return text
    # koi8_r is the known original encoding.
    firstWord = firstWord.encode("koi8_r")
    lastWord = lastWord.encode("koi8_r")
    encodingNames = "cp037 cp1006 cp1250 cp1251 cp1253 cp1254 cp1255 cp1256 cp1257 cp1258 cp437 cp720 cp737 cp775 cp850 cp852 cp855 cp864 cp866 cp869 cp874 cp875 hp_roman8 iso8859_10 iso8859_16 iso8859_4 iso8859_5 koi8_r latin_1 mac_croatian mac_greek mac_iceland mac_latin2".split()
    # Probe caches keyed by "encA encB"; used to assemble 2- and 3-pair chains.
    # (The original code also created an unused `midpairs` dict — removed.)
    startpairs, endpairs = dict(), dict()

    def isDecoded(word):
        # True when every character of the probe is plausible decoded text.
        return all(ch in allowchars for ch in word)

    def isFinal(word):
        # Tokens known to appear (or to never appear) in a fully decoded text.
        return "ОСОВЕННО" not in word and "ВЫВОД: " in word

    def decode():
        # try/except is used throughout to skip encoding combinations that
        # cannot round-trip the current probe (UnicodeEncode/DecodeError).
        # --- 1-pair case: text.encode(enc2).decode("koi8-r") ---
        for enc1 in encodingNames:
            try:
                # Same-codec round-trip verifies the probes are encodable in enc1.
                tmpfirst, tmplast = startInput.encode(enc1).decode(enc1), endInput.encode(enc1).decode(enc1)
                for enc2 in encodingNames:
                    if enc1 != enc2:
                        try:
                            subtmpfirst, subtmplast = tmpfirst.encode(enc2), tmplast.encode(enc2)
                            if isDecoded(subtmpfirst.decode("koi8-r")) and isDecoded(subtmplast.decode("koi8-r")):
                                res = text.encode(enc1).decode(enc1).encode(enc2).decode("koi8-r")
                                if isFinal(res):
                                    return res
                                # Cache the probe bytes for the multi-pair search.
                                endpairs[enc1 + ' ' + enc2] = (subtmpfirst, subtmplast)
                        except Exception:
                            pass
            except Exception:
                pass
        # --- candidate leading pairs, derived from the known start/end tokens ---
        for enc1 in encodingNames:
            try:
                tmpfirst, tmplast = firstWord.decode(enc1), lastWord.decode(enc1)
                for enc2 in encodingNames:
                    if enc1 != enc2:
                        try:
                            subtmpfirst, subtmplast = tmpfirst.encode(enc2), tmplast.encode(enc2)
                            # NB: key is deliberately "enc2 enc1" — the chain is
                            # applied in reverse when reconstructing the text.
                            startpairs[enc2 + ' ' + enc1] = (subtmpfirst, subtmplast)
                        except Exception:
                            pass
            except Exception:
                pass
        # --- 2-pairs case ---
        for firstpair in startpairs.keys():
            for lastpair in endpairs.keys():
                # Substring check: the start-token probe must appear inside the
                # corresponding end-probe bytes for the pair to be compatible.
                if startpairs[firstpair][0] in endpairs[lastpair][0] and startpairs[firstpair][1] in endpairs[lastpair][1]:
                    try:
                        # Variables for readability.
                        firstenc, secondenc = firstpair.split()
                        threeenc, fourenc = lastpair.split()
                        res = text.encode(firstenc).decode(firstenc).encode(secondenc).decode(threeenc).encode(
                            fourenc).decode("koi8-r")
                        if isFinal(res):
                            return res
                    except Exception:
                        pass
        # --- 3-pairs case: bridge a start pair and an end pair with one more ---
        for firstpair in startpairs.keys():
            try:
                tmpfirst, tmplast = startpairs[firstpair][0], startpairs[firstpair][1]
                for lastpair in endpairs.keys():
                    try:
                        subtmpfirst, subtmplast = endpairs[lastpair][0], endpairs[lastpair][1]
                        for enc1 in encodingNames:
                            try:
                                finaltmpfirst, finaltmplast = tmpfirst.decode(enc1), tmplast.decode(enc1)
                                for enc2 in encodingNames:
                                    if enc1 != enc2:
                                        try:
                                            finalsubtmpfirst, finalsubtmplast = finaltmpfirst.encode(
                                                enc2), finaltmplast.encode(enc2)
                                            if finalsubtmpfirst in subtmpfirst and finalsubtmplast in subtmplast:
                                                firstenc, secondenc = lastpair.split()
                                                threeenc, fourenc = enc2, enc1
                                                fiveenc, sixenc = firstpair.split()
                                                res = text.encode(firstenc).decode(firstenc).encode(secondenc).decode(
                                                    threeenc).encode(fourenc).decode(fiveenc).encode(sixenc).decode(
                                                    "koi8-r")
                                                if isFinal(res):
                                                    return res
                                        except Exception:
                                            pass
                            except Exception:
                                pass
                    except Exception:
                        pass
            except Exception:
                pass
        return "ERROR"
    return decode()
if __name__ == "__main__":
    # Read the whole mangled input from stdin as UTF-8.
    text = sys.stdin.buffer.read().decode("utf-8").strip()
    # Bug fix: the decoded result was computed but never emitted, while the
    # stated contract is "Output: UTF-8 text decoded".
    print(chain_decode(text))
    # print(timeit.timeit("chain_decode(text)", number=10, globals=globals()))
Is there a way to speed this up by 30–50%? I have a 2 s time limit for a small text in the worst case on a slow test machine. Simplification of the code structure is also appreciated. Example input:
ЪЫЩж ИЩЫЖРНЩ();
ЭвЭЩН: "ЦЩкОЮ ЫЖРЭУЮУн ЩЗЫЖРЩЦ ТЬХЩЭУн ХУмЧЩЬЮУ."
НЩ ЩЮ ЭЬО 0-8-6+4/ЭЬОЦ;
ЪЫЩж ОЬХУ 6()6=7+6()3::
ЭвЭЩН: "ЪЫУмОЦ 7."
ЭвЭЩН: "ЭОЬд НЫТИ ЪЫЩОФЮЖ ЭЬОЦ ОЦТ ЪЫУСЩНУЮЬн?"
ФЧж;
Example output:
ПРОЦ ГОРАЗДО();
ВЫВОД: "МОЖЕТ РАЗВИТИЯ ОБРАЗОМ УСЛОВИЯ ЛИЧНОСТИ."
ДО ОТ ВСЕ 0-8-6+4/ВСЕМ;
ПРОЦ ЕСЛИ 6()6=7+6()3::
ВЫВОД: "ПРИЧЕМ 7."
ВЫВОД: "ВЕСЬ ДРУГ ПРОЕКТА ВСЕМ ЕМУ ПРИХОДИТСЯ?"
КНЦ;
Example encoding chain:
text.encode('cp855').decode('cp855').encode('cp1251').decode('iso8859_5').encode('cp1251').decode('cp869').encode('mac_greek').decode('koi8-r')
Does `.encode(enc1).decode(enc1)` change the input in the same way? I could not find anything on this — it would not produce the same result as the input. \$\endgroup\$