From a6a6090ae6424537942deb297604349d746e95fa Mon Sep 17 00:00:00 2001 From: dhondta Date: Mon, 17 Jan 2022 22:39:22 +0100 Subject: [PATCH 01/97] Added codec: rick --- README.md | 1 + codext/stegano/__init__.py | 1 + codext/stegano/rick.py | 30 ++++++++++++++++++++++++++++++ docs/enc/stegano.md | 17 +++++++++++++++++ 4 files changed, 49 insertions(+) create mode 100755 codext/stegano/rick.py diff --git a/README.md b/README.md index b290247..4dfcd76 100644 --- a/README.md +++ b/README.md @@ -301,6 +301,7 @@ o - [X] `klopf`: aka Klopf code ; Polybius square with trivial alphabetical distribution - [X] `resistor`: aka resistor color codes +- [X] `rick`: aka Rick cipher (in reference to Rick Astley's song "*Never gonna give you up*") - [X] `sms`: also called _T9 code_ ; uses "`-`" as a separator for encoding, "`-`" or "`_`" or whitespace for decoding - [X] `whitespace`: replaces bits with whitespaces and tabs - [X] `whitespace_after_before`: variant of `whitespace` ; encodes characters as new characters with whitespaces before and after according to an equation described in the codec name (e.g. "`whitespace+2*after-3*before`") diff --git a/codext/stegano/__init__.py b/codext/stegano/__init__.py index febe509..03541a7 100755 --- a/codext/stegano/__init__.py +++ b/codext/stegano/__init__.py @@ -1,6 +1,7 @@ # -*- coding: UTF-8 -*- from .klopf import * from .resistor import * +from .rick import * from .sms import * from .whitespace import * diff --git a/codext/stegano/rick.py b/codext/stegano/rick.py new file mode 100755 index 0000000..f15881d --- /dev/null +++ b/codext/stegano/rick.py @@ -0,0 +1,30 @@ +# -*- coding: UTF-8 -*- +"""Rick Astley Codec - Rick Astley's song content encoding. 
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__examples__ = { + 'enc(rick|rick-astley)': {'this is a test': "TELL LET You gonna + You gonna + NEVER + TELL UP gonna TELL"}, +} + + +# inspired from: https://github.com/moongazer07/rick-cipher +ENCMAP = { + 'A': "NEVER", 'B': "GONNA", 'C': "GIVE", 'D': "YOU", 'E': "UP", 'F': "Never", 'G': "Gonna", 'H': "LET", 'I': "You", + 'J': "DOWN", 'K': "NEver", 'L': "GOnna", 'M': "TURN", 'N': "AROUND", 'O': "AND", 'P': ["DESERT", "DESSERT"], + 'Q': "YOu", 'R': "NEVer", 'S': "gonna", 'T': "TELL", 'U': "A", 'V': "LIE", 'W': "and", 'X': "HURT", 'Y': "you", + 'Z': "rick astley", ' ': "+", '.': ".", '\n': "\n", + '0': "0", '1': "1", '2': "2", '3': "3", '4': "4", '5': "5", '6': "6", '7': "7", '8': "8", '9': "9", +} + + +add_map("rick", ENCMAP, "?", " ", ignore_case="encode", pattern=r"^rick(?:[-_]astley)?(?:[-_]cipher)?$", + printables_rate=1.) + diff --git a/docs/enc/stegano.md b/docs/enc/stegano.md index ecd2732..d2fb212 100644 --- a/docs/enc/stegano.md +++ b/docs/enc/stegano.md @@ -38,6 +38,23 @@ This uses the [electronic color code](https://en.wikipedia.org/wiki/Electronic_c ----- +### Rick Cipher + +This converts letters to words from Rick Astley's famous song "*Never gonna give you up*". 
+ +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`rick` | text <-> words from Risk's song | `rick-astley`, `rick_cipher`, `rick-astley-cipher` | case-insensitive while encoding + +```python +>>> codext.encode("Test String", "rick") +'TELL UP gonna TELL + gonna TELL NEVer You AROUND Gonna' +>>> codext.decode("TELL UP gonna TELL + gonna TELL NEVer You AROUND Gonna", "rick") +'TEST STRING' +``` + +----- + ### SMS (T9) This codec implements the SMS encoding, also caled T9, that is the conversion from characters to their corresponding phone keystrokes. From 69e6009450de95d3870bb1b4b607858404fda4d5 Mon Sep 17 00:00:00 2001 From: dhondta Date: Wed, 19 Jan 2022 09:25:22 +0100 Subject: [PATCH 02/97] Improved list_encodings function --- .coveragerc | 1 + codext/__common__.py | 29 ++++++++++++++++------------- tests/test_common.py | 2 ++ 3 files changed, 19 insertions(+), 13 deletions(-) diff --git a/.coveragerc b/.coveragerc index e6f78f7..0baf7fa 100644 --- a/.coveragerc +++ b/.coveragerc @@ -10,6 +10,7 @@ exclude_lines = if.*?__name__.*?==.*?.__main__.: def main\(\)\: def __stdin_pipe\(\)\: + for line in __stdin_pipe\(\)\: def __literal_eval\(o\)\: def __print_tabular\(lst, space\=4\)\: except ImportError: diff --git a/codext/__common__.py b/codext/__common__.py index 2cb6eb0..3e9bab8 100644 --- a/codext/__common__.py +++ b/codext/__common__.py @@ -6,7 +6,7 @@ import random import re import sys -from encodings.aliases import aliases +from encodings.aliases import aliases as ALIASES from functools import reduce, wraps from importlib import import_module from inspect import currentframe @@ -40,10 +40,7 @@ "DARWIN", "LANG", "LINUX", "MASKS", "PY3", "UNIX", "WINDOWS"] CODECS_REGISTRY = None CODECS_CATEGORIES = ["native", "custom"] -try: - LANG = getlocale()[0][:2].lower() -except TypeError: - LANG = None +LANG = getlocale()[0][:2].lower() if getlocale() else None MASKS = { 'a': printable, 'b': "".join(chr(i) for i in range(256)), @@ -601,7 
+598,7 @@ def examples(encoding, number=10): except LookupError: pass i += 1 - for alias, codec in aliases.items(): + for alias, codec in ALIASES.items(): if name == codec: if codec not in e: e.append(codec) @@ -634,7 +631,7 @@ def list_encodings(*categories): # first, determine the list of valid categories valid_categories = list_categories() # then, if "non-native" is in the input list, extend the list with the whole categories but "native" - categories = list(categories) + categories, exclude = list(categories), [] for c in categories[:]: if c == "non-native": for c in valid_categories: @@ -642,11 +639,17 @@ def list_encodings(*categories): continue categories.append(c) categories.remove("non-native") - break + if c.startswith("~"): + exclude.append(c[1:]) + categories.remove(c) + try: + categories.remove(c[1:]) + except ValueError: + pass # now, filter codecs according to the input list of categories enc = [] - if len(categories) == 0 or "native" in categories: - for a in set(aliases.values()): + if (len(categories) == 0 or "native" in categories) and "native" not in exclude: + for a in set(ALIASES.values()): try: __orig_lookup(a) except LookupError: @@ -660,7 +663,7 @@ def list_encodings(*categories): else: ci = search_function(generate_string_from_regex(p)) c = "other" if ci is None else ci.parameters['category'] - if len(categories) == 0 or c in categories: + if (len(categories) == 0 or c in categories) and c not in exclude: enc.append(name) for category in categories: if category not in valid_categories: @@ -873,7 +876,7 @@ def lookup(encoding, macro=True): try: # finally, get a CodecInfo with the original lookup function and refine it with a dictionary of parameters ci = __orig_lookup(encoding) - ci.parameters = {'category': "native", 'module': "codecs", 'name': aliases.get(ci.name, ci.name)} + ci.parameters = {'category': "native", 'module': "codecs", 'name': ALIASES.get(ci.name, ci.name)} return ci except LookupError: if not macro: @@ -932,7 +935,7 @@ 
def search(encoding_regex): if c >= 3: matches.append(n) break - for s, n in aliases.items(): + for s, n in ALIASES.items(): if re.search(encoding_regex, s) or re.search(encoding_regex, n): matches.append(n) break diff --git a/tests/test_common.py b/tests/test_common.py index 39a0a68..62b493f 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -90,6 +90,8 @@ def test_list_codecs(self): self.assertTrue(len(codext.list("non-native")) > 0) self.assertTrue(len(codext.list("native", "non-native", "crypto", "base")) > 0) self.assertTrue(len(codext.list("native", "language", "crypto")) > 0) + self.assertTrue(len(codext.list("~crypto")) > 0) + self.assertEqual(set(codext.list("~native")), set(codext.list("non-native"))) self.assertEqual(set(codext.list()), set(codext.list("native") + codext.list("non-native"))) self.assertRaises(ValueError, codext.list, "BAD_CATEGORY") self.assertTrue(codext.is_native("base64_codec")) From 8e99915078f3ea927a9314f8ac72c63f14143eb1 Mon Sep 17 00:00:00 2001 From: dhondta Date: Wed, 19 Jan 2022 09:26:25 +0100 Subject: [PATCH 03/97] New release --- codext/VERSION.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/codext/VERSION.txt b/codext/VERSION.txt index 0a5af26..3d0e623 100644 --- a/codext/VERSION.txt +++ b/codext/VERSION.txt @@ -1 +1 @@ -1.11.3 +1.11.4 From ed14c33d5c191919f6e302f9b4c5f5a11baa9c6e Mon Sep 17 00:00:00 2001 From: dhondta Date: Wed, 19 Jan 2022 15:16:12 +0100 Subject: [PATCH 04/97] Fixed codec: rick --- codext/stegano/rick.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/codext/stegano/rick.py b/codext/stegano/rick.py index f15881d..30986e4 100755 --- a/codext/stegano/rick.py +++ b/codext/stegano/rick.py @@ -20,8 +20,9 @@ 'A': "NEVER", 'B': "GONNA", 'C': "GIVE", 'D': "YOU", 'E': "UP", 'F': "Never", 'G': "Gonna", 'H': "LET", 'I': "You", 'J': "DOWN", 'K': "NEver", 'L': "GOnna", 'M': "TURN", 'N': "AROUND", 'O': "AND", 'P': ["DESERT", "DESSERT"], 'Q': "YOu", 'R': "NEVer", 'S': 
"gonna", 'T': "TELL", 'U': "A", 'V': "LIE", 'W': "and", 'X': "HURT", 'Y': "you", - 'Z': "rick astley", ' ': "+", '.': ".", '\n': "\n", + 'Z': "rick", ' ': "+", '.': ".", '\n': "\n", '0': "0", '1': "1", '2': "2", '3': "3", '4': "4", '5': "5", '6': "6", '7': "7", '8': "8", '9': "9", + '': "astley", # silent this token for decoding ("rick astley" causes an issue with the separator " ") } From d7a744be6b19da917dfba0b0a1ddd07d8eeb491e Mon Sep 17 00:00:00 2001 From: smarbal Date: Wed, 19 Jan 2022 16:23:32 +0100 Subject: [PATCH 05/97] Implement tap code --- codext/languages/__init__.py | 1 + codext/languages/tap.py | 50 ++++++++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+) create mode 100644 codext/languages/tap.py diff --git a/codext/languages/__init__.py b/codext/languages/__init__.py index 8222254..940fa10 100755 --- a/codext/languages/__init__.py +++ b/codext/languages/__init__.py @@ -7,4 +7,5 @@ from .radio import * from .southpark import * from .tomtom import * +from .tap import * diff --git a/codext/languages/tap.py b/codext/languages/tap.py new file mode 100644 index 0000000..efcac11 --- /dev/null +++ b/codext/languages/tap.py @@ -0,0 +1,50 @@ +# -*- coding: UTF-8 -*- +"""Tap code - Tap/knock code encoding. + +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__examples__ = { + 'enc(tap)': {'this is a test': ".... .... .. ... .. .... .... ... .. .... .... ... . . .... .... . ..... .... ... .... ...."} +} + + +def build_encmap(map) : + dict = {} + i = 0 + for col in range(1,6) : + for row in range(1,6) : + dict[map[i]] = "" + col * "." + " " + row * "." 
+ i += 1 + dict['k'] = dict['c'] + dict[' '] = '' + return dict + +def encode_tap(text, errors = 'strict') : + map = 'abcdefghijlmnopqrstuvwxyz' + ENCMAP = build_encmap(map) + encoded = "" + for i, letter in enumerate(text) : + encoded += ENCMAP[letter.lower()] + if i != len(text) - 1 and letter != ' ': + encoded += ' ' + return encoded, len(text) + + +def decode_tap(text, errors = 'strict') : + map = 'abcdefghijlmnopqrstuvwxyz' + ENCMAP = build_encmap(map) + decoded = "" + for elem in text.split(" ") : + decoded += next(key for key, value in ENCMAP.items() if value == elem) + return decoded, len(text) + + +add("tap", encode_tap, decode_tap, ignore_case="encode") + From 37cfdbbc09eb32b13bb738ff5009cd7b3325b57b Mon Sep 17 00:00:00 2001 From: smarbal Date: Wed, 19 Jan 2022 16:41:04 +0100 Subject: [PATCH 06/97] Fix error handling --- codext/languages/tap.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/codext/languages/tap.py b/codext/languages/tap.py index efcac11..624b490 100644 --- a/codext/languages/tap.py +++ b/codext/languages/tap.py @@ -23,7 +23,6 @@ def build_encmap(map) : dict[map[i]] = "" + col * "." + " " + row * "." 
i += 1 dict['k'] = dict['c'] - dict[' '] = '' return dict def encode_tap(text, errors = 'strict') : @@ -31,20 +30,26 @@ def encode_tap(text, errors = 'strict') : ENCMAP = build_encmap(map) encoded = "" for i, letter in enumerate(text) : - encoded += ENCMAP[letter.lower()] + try : + encoded += ENCMAP[letter.lower()] + except KeyError : + pass if i != len(text) - 1 and letter != ' ': encoded += ' ' return encoded, len(text) -def decode_tap(text, errors = 'strict') : +def decode_tap(text, errors = 'ignore') : map = 'abcdefghijlmnopqrstuvwxyz' ENCMAP = build_encmap(map) decoded = "" for elem in text.split(" ") : - decoded += next(key for key, value in ENCMAP.items() if value == elem) + try : + decoded += next(key for key, value in ENCMAP.items() if value == elem) + except StopIteration : + print("Invalid character(s) in the input. This is what could be decoded :") return decoded, len(text) -add("tap", encode_tap, decode_tap, ignore_case="encode") +add("tap", encode_tap, decode_tap, ignore_case="both") From 5e1e294e54d044025a198b818784f6cd91219e97 Mon Sep 17 00:00:00 2001 From: smarbal Date: Wed, 19 Jan 2022 17:05:17 +0100 Subject: [PATCH 07/97] Add tap code to documentation --- README.md | 1 + codext/languages/tap.py | 3 ++- docs/enc/languages.md | 16 ++++++++++++++++ 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 4dfcd76..6776436 100644 --- a/README.md +++ b/README.md @@ -288,6 +288,7 @@ o - [X] `southpark`: converts letters to Kenny's language from Southpark (whitespace is also handled) - [X] `southpark-icase`: case insensitive variant of `southpark` - [X] `tomtom`: similar to `morse`, using slashes and backslashes +- [X] `tap` : converts tap/knock code commonly used by prisoners #### Others diff --git a/codext/languages/tap.py b/codext/languages/tap.py index 624b490..8cc516f 100644 --- a/codext/languages/tap.py +++ b/codext/languages/tap.py @@ -11,7 +11,8 @@ __examples__ = { - 'enc(tap)': {'this is a test': ".... .... .. 
... .. .... .... ... .. .... .... ... . . .... .... . ..... .... ... .... ...."} + 'enc(tap)': {'this is a test' : '.... .... .. ... .. .... .... ... .. .... .... ... . . .... .... . ..... .... ... .... ....'}, + 'dec(tap)': {'.... .... .. ... .. .... .... ... .. .... .... ... . . .... .... . ..... .... ... .... ....' : 'thisisatest'} } diff --git a/docs/enc/languages.md b/docs/enc/languages.md index 2665d19..f83ecd1 100644 --- a/docs/enc/languages.md +++ b/docs/enc/languages.md @@ -164,3 +164,19 @@ This codec is similar to morse. It converts text into slashes and backslashes. 'THIS IS A TEST' ``` +----- + +### Tap + +Converts tap/knock code [commonly used by prisoners](https://en.wikipedia.org/wiki/Tap_code). Uses 25 letters, 'k' codes as 'c'. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`tap` | text <-> tap/knock encoded text | `tap` | uses "` `" (double space) as a separator for letters. No spaces between words after decoding. + +```python +>>> codext.encode("this is a test", "tap") +'.... .... .. ... .. .... .... ... .. .... .... ... . . .... .... . ..... .... ... .... ....' +>>> codext.decode(".... .... .. ... .. .... .... ... .. .... .... ... . . .... .... . ..... .... ... .... 
....", "tap") +'thisisatest' +``` From ba991a62440d07706002a7137c46e4bd37c775f3 Mon Sep 17 00:00:00 2001 From: smarbal <35641452+smarbal@users.noreply.github.com> Date: Wed, 19 Jan 2022 17:23:31 +0100 Subject: [PATCH 08/97] Update languages.md --- docs/enc/languages.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/enc/languages.md b/docs/enc/languages.md index f83ecd1..19a52e3 100644 --- a/docs/enc/languages.md +++ b/docs/enc/languages.md @@ -172,7 +172,7 @@ Converts tap/knock code [commonly used by prisoners](https://en.wikipedia.org/wi **Codec** | **Conversions** | **Aliases** | **Comment** :---: | :---: | --- | --- -`tap` | text <-> tap/knock encoded text | `tap` | uses "` `" (double space) as a separator for letters. No spaces between words after decoding. +`tap` | text <-> tap/knock encoded text | `tap` | uses '   ' (double space) as a separator for letters. No spaces between words after decoding. ```python >>> codext.encode("this is a test", "tap") From 49c83a7bebf7f353cd3c980e7525b36e002c8990 Mon Sep 17 00:00:00 2001 From: dhondta Date: Wed, 19 Jan 2022 17:41:37 +0100 Subject: [PATCH 09/97] Applied minor improvements to codebase --- codext/__common__.py | 67 +++++++++++++++++++++++++++++++------------- codext/__init__.py | 7 +++++ tests/test_common.py | 15 +++++----- 3 files changed, 63 insertions(+), 26 deletions(-) diff --git a/codext/__common__.py b/codext/__common__.py index 3e9bab8..5a5b827 100644 --- a/codext/__common__.py +++ b/codext/__common__.py @@ -7,7 +7,7 @@ import re import sys from encodings.aliases import aliases as ALIASES -from functools import reduce, wraps +from functools import reduce, update_wrapper, wraps from importlib import import_module from inspect import currentframe from itertools import chain, product @@ -39,8 +39,11 @@ "remove", "reset", "s2i", "search", "stopfunc", "BytesIO", "_input", "_stripl", "CodecMacro", "DARWIN", "LANG", "LINUX", "MASKS", "PY3", "UNIX", "WINDOWS"] CODECS_REGISTRY = None 
+CODECS_OVERWRITTEN = [] CODECS_CATEGORIES = ["native", "custom"] -LANG = getlocale()[0][:2].lower() if getlocale() else None +LANG = getlocale() +if LANG: + LANG = (LANG[0] or "")[:2].lower() MASKS = { 'a': printable, 'b': "".join(chr(i) for i in range(256)), @@ -142,6 +145,20 @@ def __repr__(self): return "" % (self.name, id(self)) +# inspired from: https://stackoverflow.com/questions/10875442/possible-to-change-a-functions-repr-in-python +class Repr(object): + def __init__(self, name, func): + self.__name = name + self.__func = func + update_wrapper(self, func) + + def __call__(self, *args, **kwargs): + return self.__func(*args, **kwargs) + + def __repr__(self): + return "" % (self.__name, id(self)) + + def __stdin_pipe(): """ Stdin pipe read function. """ try: @@ -173,6 +190,12 @@ def _stripl(s, st_lines, st_crlf): return s +def _with_repr(name): + def _wrapper(f): + return Repr(name, f) + return _wrapper + + def add(ename, encode=None, decode=None, pattern=None, text=True, add_to_codecs=False, **kwargs): """ This adds a new codec to the codecs module setting its encode and/or decode functions, eventually dynamically naming the encoding with a pattern and with file handling. 
@@ -195,6 +218,7 @@ def add(ename, encode=None, decode=None, pattern=None, text=True, add_to_codecs= raise ValueError("At least one en/decoding function must be defined") glob = currentframe().f_back.f_globals # search function for the new encoding + @_with_repr(ename) def getregentry(encoding): if encoding != ename and not (pattern and re.match(pattern, encoding)): return @@ -304,6 +328,7 @@ class StreamReader(Codec, codecs.StreamReader): getregentry.__aliases__ = list(map(lambda n: re.sub(r"[\s\-]", "_", n), kwargs['aliases'])) getregentry.__pattern__ = pattern register(getregentry, add_to_codecs) + return getregentry def add_macro(mname, *encodings): @@ -500,7 +525,7 @@ def __get_value(token, position, case_changed=False): return __get_value(token_inv_case, position, True) return error_func(token, position) if isinstance(result, list): - result = random.choice(result) + result = result[0] return result + lsep # if a separator is defined, rely on it by splitting the input text @@ -567,7 +592,7 @@ def __get_value(token, position, case_changed=False): kwargs['repl_minlen_b'] = max(1, min(map(len, map(b, set(smapdict.values()) - {''})))) except: pass - add(ename, __generic_code(), __generic_code(True), **kwargs) + return add(ename, __generic_code(), __generic_code(True), **kwargs) codecs.add_map = add_map @@ -651,17 +676,15 @@ def list_encodings(*categories): if (len(categories) == 0 or "native" in categories) and "native" not in exclude: for a in set(ALIASES.values()): try: - __orig_lookup(a) + ci = __orig_lookup(a) except LookupError: continue - enc.append(a) - for search_function in __codecs_registry: + if lookup(a) is ci: + enc.append(ci.name) + for search_function in CODECS_OVERWRITTEN + __codecs_registry: name = search_function.__name__.replace("_", "-") p = search_function.__pattern__ - if p is None: - ci = search_function(name) - else: - ci = search_function(generate_string_from_regex(p)) + ci = search_function(name) if p is None else 
search_function(generate_string_from_regex(p)) c = "other" if ci is None else ci.parameters['category'] if (len(categories) == 0 or c in categories) and c not in exclude: enc.append(name) @@ -834,8 +857,9 @@ def _handle_error(token, position, output=""): __orig_register = _codecs.register -def __add(ename, encode=None, decode=None, pattern=None, text=True, add_to_codecs=True): - add(ename, encode, decode, pattern, text, add_to_codecs) +def __add(ename, encode=None, decode=None, pattern=None, text=True, **kwargs): + kwargs.pop('add_to_codecs', None) + return add(ename, encode, decode, pattern, text, True, **kwargs) __add.__doc__ = add.__doc__ codecs.add = __add @@ -862,19 +886,19 @@ def encode(obj, encoding='utf-8', errors='strict'): def lookup(encoding, macro=True): """ Hooked lookup function for searching first for codecs in the local registry of this module. """ # first, try to match the given encoding with codecs' search functions - for search_function in __codecs_registry: + for search_function in CODECS_OVERWRITTEN + __codecs_registry: codecinfo = search_function(encoding) if codecinfo is not None: return codecinfo # then, if a codec name was given, generate an encoding name from its pattern and get the CodecInfo - for search_function in __codecs_registry: + for search_function in CODECS_OVERWRITTEN + __codecs_registry: if search_function.__name__.replace("_", "-") == encoding or \ encoding in getattr(search_function, "__aliases__", []): codecinfo = search_function(generate_string_from_regex(search_function.__pattern__)) if codecinfo is not None: return codecinfo + # finally, get a CodecInfo with the original lookup function and refine it with a dictionary of parameters try: - # finally, get a CodecInfo with the original lookup function and refine it with a dictionary of parameters ci = __orig_lookup(encoding) ci.parameters = {'category': "native", 'module': "codecs", 'name': ALIASES.get(ci.name, ci.name)} return ci @@ -898,14 +922,19 @@ def 
register(search_function, add_to_codecs=False): to remove the codec later """ if search_function not in __codecs_registry: - __codecs_registry.append(search_function) + try: + __orig_lookup(search_function.__name__) + l = CODECS_OVERWRITTEN + except LookupError: + l = __codecs_registry + l.append(search_function) if add_to_codecs: __orig_register(search_function) -def __register(search_function, add_to_codecs=True): +def __register(search_function): """ Same as register(...), but with add_to_codecs set by default to True. """ - register(search_function, add_to_codecs) + register(search_function, True) codecs.register = __register diff --git a/codext/__init__.py b/codext/__init__.py index 80f1d00..a37a98a 100644 --- a/codext/__init__.py +++ b/codext/__init__.py @@ -3,6 +3,7 @@ """ from __future__ import print_function +from _codecs import lookup as orig_lookup from ast import literal_eval from six import binary_type, text_type @@ -26,6 +27,12 @@ reset() +# overwritten native codec +add("uu", lambda i, e="strict": orig_lookup("uu").encode(b(i), e), + lambda i, e="strict": orig_lookup("uu").decode(b(i), e), + pattern=r"^uu(?:[-_]encode|codec)?$", add_to_codecs=True, category="native") + + def __literal_eval(o): """ Non-failing ast.literal_eval alias function. 
""" try: diff --git a/tests/test_common.py b/tests/test_common.py index 62b493f..0d9b381 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -8,7 +8,7 @@ import json import random import sys -from codext.__common__ import PERS_MACROS, PERS_MACROS_FILE +from codext.__common__ import CODECS_OVERWRITTEN, PERS_MACROS, PERS_MACROS_FILE from six import b, binary_type, text_type from unittest import TestCase @@ -56,17 +56,17 @@ def test_add_codec(self): self.assertRaises(ValueError, codext.add, "test") self.assertRaises(ValueError, codext.add, "test", "BAD") self.assertRaises(ValueError, codext.add, "test", lambda: None, "BAD") - self.assertIsNone(codext.add("dummy", dummy_encode, dummy_decode)) + self.assertIsNotNone(codext.add("dummy", dummy_encode, dummy_decode)) self.assertEqual(codext.encode("test", "dummy"), "test") ci = codext.lookup("dummy") for k in ["add_to_codecs", "category", "examples", "name", "pattern", "text"]: self.assertIn(k, ci.parameters.keys()) - self.assertIsNone(codext.add("dummy_errored", None, dummy_errored_decode, r"dummy_errored(\d+)$")) + self.assertIsNotNone(codext.add("dummy_errored", None, dummy_errored_decode, r"dummy_errored(\d+)$")) self.assertRaises(AttributeError, codext.lookup, "dummy_errored1") def test_add_map_codec(self): ENCMAP = [{'a': "A", 'b': "B", 'c': "C"}, {'d': "D", 'e': "E", 'f': "F"}, {'g': "G", 'h': "H", 'i': "I"}] - self.assertIsNone(codext.add_map("dummy2", ENCMAP, pattern=r"^dummy2(?:[-_]?(\d))?$")) + self.assertIsNotNone(codext.add_map("dummy2", ENCMAP, pattern=r"^dummy2(?:[-_]?(\d))?$")) self.assertRaises(ValueError, codext.add_map, "dummy2", "BAD_ENCMAP") self.assertEqual(codext.encode("abc", "dummy2"), "ABC") self.assertEqual(codext.encode("abc", "dummy2-1"), "ABC") @@ -74,7 +74,7 @@ def test_add_map_codec(self): self.assertEqual(codext.encode("ghi", "dummy2-3"), "GHI") self.assertRaises(LookupError, codext.encode, "test", "dummy2-4") ENCMAP = {'': {'a': "A", 'b': "B"}, r'bad': {'a': "B", 'b': "A"}} - 
self.assertIsNone(codext.add_map("dummy3", ENCMAP, pattern=r"^dummy3([-_]inverted)?$")) + self.assertIsNotNone(codext.add_map("dummy3", ENCMAP, pattern=r"^dummy3([-_]inverted)?$")) self.assertRaises(LookupError, codext.encode, "test", "dummy3_inverted") self.assertRaises(ValueError, codext.add_map, "dummy2", ENCMAP, ignore_case="BAD") self.assertRaises(ValueError, codext.add_map, "dummy2", ENCMAP, intype="BAD") @@ -98,13 +98,13 @@ def test_list_codecs(self): self.assertFalse(codext.is_native("base64")) def test_remove_codec(self): - self.assertIsNone(codext.add("dummy", dummy_encode, dummy_decode)) + self.assertIsNotNone(codext.add("dummy", dummy_encode, dummy_decode)) self.assertEqual(codext.encode("test", "dummy"), "test") self.assertIsNone(codext.remove("dummy")) self.assertRaises(LookupError, codext.encode, "test", "dummy") # special case, when adding a new codec also to the native codecs registry, then it won't be possible to remove # it afterwards - self.assertIsNone(codecs.add("dummy2", dummy_encode, dummy_decode)) + self.assertIsNotNone(codecs.add("dummy2", dummy_encode, dummy_decode)) self.assertEqual(codecs.encode("test", "dummy2"), "test") self.assertIsNone(codecs.remove("dummy2")) self.assertEqual(codecs.encode("test", "dummy2"), "test") @@ -122,6 +122,7 @@ def test_reset_codecs(self): self.assertIsNone(codext.reset()) self.assertIsNotNone(codext.encode("test", "morse")) self.assertRaises(LookupError, codext.encode, "test", "dummy") + self.assertTrue(len(CODECS_OVERWRITTEN) > 0) def test_search_codecs(self): self.assertIsNotNone(codext.search("morse")) From d8380ee4e891538b4c2b593d64c74eb940c0395b Mon Sep 17 00:00:00 2001 From: dhondta Date: Wed, 19 Jan 2022 17:41:48 +0100 Subject: [PATCH 10/97] Added codec: galactic --- README.md | 3 ++- codext/languages/__init__.py | 1 + codext/languages/galactic.py | 35 +++++++++++++++++++++++++++++++++++ docs/enc/languages.md | 17 +++++++++++++++++ 4 files changed, 55 insertions(+), 1 deletion(-) create mode 100644 
codext/languages/galactic.py diff --git a/README.md b/README.md index 4dfcd76..303035d 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ [![DOI](https://zenodo.org/badge/236679865.svg)](https://zenodo.org/badge/latestdoi/236679865) [![License](https://img.shields.io/pypi/l/codext.svg)](https://pypi.python.org/pypi/codext/) -This library extends the native [`codecs`](https://docs.python.org/3/library/codecs.html) library (namely for adding new custom encodings and character mappings) and provides a myriad of new encodings (static or parametrized, like `rot` or `xor`), hence its named combining *CODecs EXTension*. +[**CodExt**](https://github.com/dhondta/python-codext) is a (Python2-3 compatible) library that extends the native [`codecs`](https://docs.python.org/3/library/codecs.html) library (namely for adding new custom encodings and character mappings) and provides **120+ new codecs**, hence its name combining *CODecs EXTension*. It also features a **guess mode** for decoding multiple layers of encoding and **CLI tools** for convenience. 
```sh $ pip install codext @@ -281,6 +281,7 @@ o - [X] `braille`: well-known braille language (Python 3 only) - [X] `ipsum`: aka lorem ipsum +- [X] `galactic`: aka galactic alphabet or Minecraft enchantment language (Python 3 only) - [X] `leetspeak`: based on minimalistic elite speaking rules - [X] `morse`: uses whitespace as a separator - [X] `navajo`: only handles letters (not full words from the Navajo dictionary) diff --git a/codext/languages/__init__.py b/codext/languages/__init__.py index 8222254..54114ea 100755 --- a/codext/languages/__init__.py +++ b/codext/languages/__init__.py @@ -1,5 +1,6 @@ # -*- coding: UTF-8 -*- from .braille import * +from .galactic import * from .ipsum import * from .leetspeak import * from .morse import * diff --git a/codext/languages/galactic.py b/codext/languages/galactic.py new file mode 100644 index 0000000..e77cb3a --- /dev/null +++ b/codext/languages/galactic.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -*- +"""Galactic Alphabet Codec - Minecraft enchantment language content encoding. 
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__examples__ = { + 'enc-dec(galactic|minecraft_enchanting_language)': ["test " + MASKS['l']], + 'enc(galactic-alphabet|minecraft)': {'Bad test#': None}, +} + + +# source: https://shapecatcher.com +ENCMAP = { + 'a': ["ᒋ", "ᔑ"], 'b': ["⦣", "ゝ", "ʖ"], 'c': ["ì", "ᓵ"], 'd': "↸", 'e': ["ᒷ", "Ŀ"], 'f': ["𝌁", "⎓"], + 'g': ["𐌝", "┤", "⫞", "⊣"], 'h': ["₸", "⍑", "╤"], 'i': "╎", 'j': ["⫶", "⁝", "ⵗ", "⋮"], 'k': "ꖌ", 'l': "ꖎ", + 'm': ["ᒲ", "⟓"], 'n': ["ソ", "リ"], 'o': ["⁊", "フ", "ㇷ", "𝙹"], 'p': ["ⅱ", "ij", "‼", "!"], + 'q': ["ᑑ", "⊐", "コ"], 'r': ["⸬", "∷", "⛚"], 's': ["߆", "𝈿", "ꝇ", "ᓭ"], 't': ["ℸ", "ヿ", "⅂", "Ꞁ"], + 'u': ["⚍", "⍨"], 'v': ["𝍦", "⍊", "╧"], 'w': ["∴", "⸫", "⛬"], 'x': ["ꜘ", "╱", " ̷", "⟋"], + 'y': ["║", "‖", "∥", "ǁ", "𝄁", "|"], 'z': ["ᑎ", "⋂", "∩", "⨅", "⛫"], + ' ': [" ", "⠀"], +} + + +if PY3: + add_map("galactic", ENCMAP, ignore_case="encode", printables_rate=0., + pattern=r"^(?:galactic(?:[-_]alphabet)?|minecraft(?:[-_](?:enchantment|enchanting[-_]language))?)$") + diff --git a/docs/enc/languages.md b/docs/enc/languages.md index 2665d19..212326d 100644 --- a/docs/enc/languages.md +++ b/docs/enc/languages.md @@ -23,6 +23,23 @@ It supports letters, digits and some special characters. ----- +### Galactic + +This implements the [Minecraft's enchanting table](https://www.thegamer.com/minecraft-enchantment-table-language-guide/) using resembling Unicode characters. 
+ +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`galactic` | text <-> Minecraft enchantment symbols | `galactic-alphabet`, `minecraft_enchantment`, `minecraft-enchanting-language` | Python 3 only + +```python +>>> codext.encode("this is a test", "galactic") +'ℸ₸╎߆ ╎߆ ᒋ ℸᒷ߆ℸ' +>>> codext.decode("ℸ₸╎߆ ╎߆ ᒋ ℸᒷ߆ℸ", "galactic") +'this is a test' +``` + +----- + ### Ipsum This implements a codec that uses lorem ipsum words. It selects random words per letter and keeps the following punctuations: "`.,:;+=-*/\\`". From 73b1534d88f3f2fc83203b7737fe7070592fe0ca Mon Sep 17 00:00:00 2001 From: dhondta Date: Wed, 19 Jan 2022 18:26:15 +0100 Subject: [PATCH 11/97] Improved codec: tap --- README.md | 2 +- codext/languages/__init__.py | 2 +- codext/languages/tap.py | 59 +++++++++++++----------------------- docs/enc/languages.md | 28 ++++++++--------- 4 files changed, 37 insertions(+), 54 deletions(-) diff --git a/README.md b/README.md index ff81c6d..e208594 100644 --- a/README.md +++ b/README.md @@ -288,8 +288,8 @@ o - [X] `radio`: aka NATO or radio phonetic alphabet - [X] `southpark`: converts letters to Kenny's language from Southpark (whitespace is also handled) - [X] `southpark-icase`: case insensitive variant of `southpark` +- [X] `tap`: converts text to tap/knock code, commonly used by prisoners - [X] `tomtom`: similar to `morse`, using slashes and backslashes -- [X] `tap` : converts tap/knock code commonly used by prisoners #### Others diff --git a/codext/languages/__init__.py b/codext/languages/__init__.py index 9333ede..196b8d3 100755 --- a/codext/languages/__init__.py +++ b/codext/languages/__init__.py @@ -7,6 +7,6 @@ from .navajo import * from .radio import * from .southpark import * -from .tomtom import * from .tap import * +from .tomtom import * diff --git a/codext/languages/tap.py b/codext/languages/tap.py index 8cc516f..efd551d 100644 --- a/codext/languages/tap.py +++ b/codext/languages/tap.py @@ -11,46 +11,29 @@ __examples__ = 
{ - 'enc(tap)': {'this is a test' : '.... .... .. ... .. .... .... ... .. .... .... ... . . .... .... . ..... .... ... .... ....'}, - 'dec(tap)': {'.... .... .. ... .. .... .... ... .. .... .... ... . . .... .... . ..... .... ... .... ....' : 'thisisatest'} + 'enc(tap|knock-code|tap_code)': {'this is a test' : ".... ....⠀.. ...⠀.. ....⠀.... ...⠀ ⠀.. ....⠀.... ...⠀ ⠀. ." + "⠀ ⠀.... ....⠀. .....⠀.... ...⠀.... ...."}, } +__guess__ = ["tap", "tap-inv"] -def build_encmap(map) : - dict = {} - i = 0 - for col in range(1,6) : - for row in range(1,6) : - dict[map[i]] = "" + col * "." + " " + row * "." +def __build_encmap(a): + d, i = {}, 0 + for x in range(1,6): + for y in range(1,6): + d[a[i]] = x * "." + " " + y * "." i += 1 - dict['k'] = dict['c'] - return dict - -def encode_tap(text, errors = 'strict') : - map = 'abcdefghijlmnopqrstuvwxyz' - ENCMAP = build_encmap(map) - encoded = "" - for i, letter in enumerate(text) : - try : - encoded += ENCMAP[letter.lower()] - except KeyError : - pass - if i != len(text) - 1 and letter != ' ': - encoded += ' ' - return encoded, len(text) - - -def decode_tap(text, errors = 'ignore') : - map = 'abcdefghijlmnopqrstuvwxyz' - ENCMAP = build_encmap(map) - decoded = "" - for elem in text.split(" ") : - try : - decoded += next(key for key, value in ENCMAP.items() if value == elem) - except StopIteration : - print("Invalid character(s) in the input. 
This is what could be decoded :") - return decoded, len(text) - - -add("tap", encode_tap, decode_tap, ignore_case="both") + d['k'], d[' '] = d['c'], " " + return d + + + +ENCMAP = { + '': __build_encmap("abcdefghijlmnopqrstuvwxyz"), + 'inv': __build_encmap("abcdefghijlmnopqrstuvwxyz"[::-1]), +} + + +if PY3: + add_map("tap", ENCMAP, ignore_case="both", sep="⠀", pattern=r"^(?:tap|knock)(?:[-_]code)?(|inv)$") diff --git a/docs/enc/languages.md b/docs/enc/languages.md index bae7de8..3735d15 100644 --- a/docs/enc/languages.md +++ b/docs/enc/languages.md @@ -166,34 +166,34 @@ This encodes text according to Kenny's language in Southpark. ----- -### Tom-Tom +### Tap -This codec is similar to morse. It converts text into slashes and backslashes. +This codec implements the [tap/knock code](https://en.wikipedia.org/wiki/Tap_code) commonly used by prisoners. It uses 25 letters, "*k*" is encoded to the same token than "*c*". **Codec** | **Conversions** | **Aliases** | **Comment** :---: | :---: | --- | --- -`tomtom` | text <-> tom-tom encoded text | `tom-tom` | uses "`|`" as a separator +`tap` | text <-> tap/knock encoded text | `knock`, `tap-code` | uses a large Unicode whitespace as a token separator ; Python 3 only ```python ->>> codext.encode("this is a test", "tom-tom") -'\\\\/\\ /\\\\ /\\\\\\ \\/\\ | /\\\\\\ \\/\\ | / | \\\\/\\ /\\ \\/\\ \\\\/\\' ->>> codext.decode("\\\\/\\ /\\\\ /\\\\\\ \\/\\ | /\\\\\\ \\/\\ | / | \\\\/\\ /\\ \\/\\ \\\\/\\", "tomtom") -'THIS IS A TEST' +>>> codext.encode("this is a test", "tap") +'.... ....⠀.. ...⠀.. ....⠀.... ...⠀ ⠀.. ....⠀.... ...⠀ ⠀. .⠀ ⠀.... ....⠀. .....⠀.... ...⠀.... ....' +>>> codext.decode(".... ....⠀.. ...⠀.. ....⠀.... ...⠀ ⠀.. ....⠀.... ...⠀ ⠀. .⠀ ⠀.... ....⠀. .....⠀.... ...⠀.... ....", "knock") +'this is a test' ``` ----- -### Tap +### Tom-Tom -Converts tap/knock code [commonly used by prisoners](https://en.wikipedia.org/wiki/Tap_code). Uses 25 letters, 'k' codes as 'c'. +This codec is similar to morse. 
It converts text into slashes and backslashes. **Codec** | **Conversions** | **Aliases** | **Comment** :---: | :---: | --- | --- -`tap` | text <-> tap/knock encoded text | `tap` | uses '   ' (double space) as a separator for letters. No spaces between words after decoding. +`tomtom` | text <-> tom-tom encoded text | `tom-tom` | uses "`|`" as a separator ```python ->>> codext.encode("this is a test", "tap") -'.... .... .. ... .. .... .... ... .. .... .... ... . . .... .... . ..... .... ... .... ....' ->>> codext.decode(".... .... .. ... .. .... .... ... .. .... .... ... . . .... .... . ..... .... ... .... ....", "tap") -'thisisatest' +>>> codext.encode("this is a test", "tom-tom") +'\\\\/\\ /\\\\ /\\\\\\ \\/\\ | /\\\\\\ \\/\\ | / | \\\\/\\ /\\ \\/\\ \\\\/\\' +>>> codext.decode("\\\\/\\ /\\\\ /\\\\\\ \\/\\ | /\\\\\\ \\/\\ | / | \\\\/\\ /\\ \\/\\ \\\\/\\", "tomtom") +'THIS IS A TEST' ``` From 9e4d71edbb198fdee052d7615ceffbdcf3d4b0d0 Mon Sep 17 00:00:00 2001 From: dhondta Date: Wed, 19 Jan 2022 18:27:30 +0100 Subject: [PATCH 12/97] New release --- codext/VERSION.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/codext/VERSION.txt b/codext/VERSION.txt index 3d0e623..e6dbb7c 100644 --- a/codext/VERSION.txt +++ b/codext/VERSION.txt @@ -1 +1 @@ -1.11.4 +1.11.5 From b519bc260702051bded924448a4f4bcdc121581d Mon Sep 17 00:00:00 2001 From: smarbal Date: Sun, 23 Jan 2022 11:45:20 +0100 Subject: [PATCH 13/97] Implement working railfence encryption --- codext/crypto/__init__.py | 1 + codext/crypto/railfence.py | 74 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+) create mode 100644 codext/crypto/railfence.py diff --git a/codext/crypto/__init__.py b/codext/crypto/__init__.py index 29673c3..6928637 100755 --- a/codext/crypto/__init__.py +++ b/codext/crypto/__init__.py @@ -4,6 +4,7 @@ from .bacon import * from .barbie import * from .citrix import * +from .railfence import * from .rot import * from .scytale import * from .shift import * 
diff --git a/codext/crypto/railfence.py b/codext/crypto/railfence.py new file mode 100644 index 0000000..db66eac --- /dev/null +++ b/codext/crypto/railfence.py @@ -0,0 +1,74 @@ +# -*- coding: UTF-8 -*- +"""Rail Fence Cipher Codec - rail fence encoding. + +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" + + + +from ..__common__ import * + + +__examples__ = { + 'enc(rail-5-3)': "it sss etiath " +} + + + +def __buildf(text, rails, offset = 0) : + l, rail, dr = len(text), offset, 1 + f = [["#"] * l for i in range(rails)] + for x in range(l) : + f[rail][x] = text[x] + if rail >= rails - 1: + dr = -1 + elif rail <= 0: + dr = 1 + rail += dr + for elem in f : + print(elem) + return f + +def railfence_encode(rails, offset = 0) : + def encode(text, errors="strict") : + print(len(text)) + + c,l = '', len(text) + f = __buildf(text,rails,offset) + for r in range(rails) : + for x in range(l) : + if f[r][x] != '#' : + c += f[r][x] + return c, l + return encode + +def railfence_decode(rails, offset = 0) : + def decode(text, errors = 'strict') : + f = __buildf("x" * len(text), rails, offset) + plain, i = '', 0 + ra, l = range(rails), range(len(text)) + + #Put the characters in the right place + for r in ra: + for x in l : + if f[r][x] == "x" : + f[r][x] = text[i] + i += 1 + #Read the characters in the right order + for x in l : + for r in ra: + if f[r][x] != '#' : + plain += f[r][x] + + return plain, len(plain) + + return decode + +add("rail", railfence_encode, railfence_decode, r"rail-(\d+)\-(\d+)$") + +#rail-(\d+)\-(\d+) +#rail-(\d+)(\-*(\d+)) \ No newline at end of file From 33a10be9f351882599559d4d5a8f0f051175ebf2 Mon Sep 17 00:00:00 2001 From: smarbal Date: Sun, 23 Jan 2022 11:54:13 +0100 Subject: [PATCH 14/97] Add exemples --- codext/crypto/railfence.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git 
a/codext/crypto/railfence.py b/codext/crypto/railfence.py index db66eac..f2a42f9 100644 --- a/codext/crypto/railfence.py +++ b/codext/crypto/railfence.py @@ -14,7 +14,8 @@ __examples__ = { - 'enc(rail-5-3)': "it sss etiath " + 'enc(rail-5-3)': {'this is a test' : 'it sss etiath '}, + 'dec(rail-7-4)': {'a stiet shsti': 'this is a test'} } From c5b1239e9a5ec97edab1edc683c4f8b29d1ae82e Mon Sep 17 00:00:00 2001 From: smarbal Date: Sun, 23 Jan 2022 12:21:45 +0100 Subject: [PATCH 15/97] Add up/down parameter for encoding and exemples --- codext/crypto/railfence.py | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/codext/crypto/railfence.py b/codext/crypto/railfence.py index f2a42f9..6e3fdb2 100644 --- a/codext/crypto/railfence.py +++ b/codext/crypto/railfence.py @@ -14,14 +14,20 @@ __examples__ = { - 'enc(rail-5-3)': {'this is a test' : 'it sss etiath '}, - 'dec(rail-7-4)': {'a stiet shsti': 'this is a test'} + 'enc(rail-5-3|rail_5_3)': {'this is a test' : 'it sss etiath '}, + 'enc(railup-5-3|railup_5_3)' :{'this is a test': 'h tiats e ssit'}, + 'dec(rail-7-4|rail_7_4)': {'a stiet shsti': 'this is a test'} } -def __buildf(text, rails, offset = 0) : - l, rail, dr = len(text), offset, 1 +def __buildf(text, rails, offset = 0, up = 0) : + l, rail = len(text), offset + if up != '' : + dr = -1 + rail = rails - offset - 1 + else : + dr = 1 f = [["#"] * l for i in range(rails)] for x in range(l) : f[rail][x] = text[x] @@ -30,16 +36,12 @@ def __buildf(text, rails, offset = 0) : elif rail <= 0: dr = 1 rail += dr - for elem in f : - print(elem) return f -def railfence_encode(rails, offset = 0) : +def railfence_encode(up = 0, rails = 3, offset = 0) : def encode(text, errors="strict") : - print(len(text)) - c,l = '', len(text) - f = __buildf(text,rails,offset) + f = __buildf(text,rails,offset, up) for r in range(rails) : for x in range(l) : if f[r][x] != '#' : @@ -47,9 +49,9 @@ def encode(text, errors="strict") : return c, l return 
encode -def railfence_decode(rails, offset = 0) : +def railfence_decode(up = 0,rails = 3, offset = 0) : def decode(text, errors = 'strict') : - f = __buildf("x" * len(text), rails, offset) + f = __buildf("x" * len(text), rails, offset, up) plain, i = '', 0 ra, l = range(rails), range(len(text)) @@ -69,7 +71,4 @@ def decode(text, errors = 'strict') : return decode -add("rail", railfence_encode, railfence_decode, r"rail-(\d+)\-(\d+)$") - -#rail-(\d+)\-(\d+) -#rail-(\d+)(\-*(\d+)) \ No newline at end of file +add("rail", railfence_encode, railfence_decode, r"rail(up)?[-_](\d+)[-_](\d+)$") From dc8a122ccd92e5cfa725d504df12506bc0e1e458 Mon Sep 17 00:00:00 2001 From: smarbal Date: Sun, 23 Jan 2022 12:42:50 +0100 Subject: [PATCH 16/97] Add rail fence to docs --- README.md | 3 ++- docs/enc/crypto.md | 17 +++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index e208594..a52a8b0 100644 --- a/README.md +++ b/README.md @@ -258,7 +258,8 @@ o - [X] `atbash`: aka Atbash Cipher - [X] `bacon`: aka Baconian Cipher - [X] `barbie-N`: aka Barbie Typewriter (*N* belongs to [1, 4]) -- [X] `citrix`: aka Citrix CTX1 passord encoding +- [X] `citrix`: aka Citrix CTX1 password encoding +- [X] `rail`: aka Rail Fence Cipher - [X] `rotN`: aka Caesar cipher (*N* belongs to [1,25]) - [X] `scytaleN`: encrypts using the number of letters on the rod (*N* belongs to [1,[) - [X] `shiftN`: shift ordinals (*N* belongs to [1,255]) diff --git a/docs/enc/crypto.md b/docs/enc/crypto.md index 00b4f51..93b1221 100644 --- a/docs/enc/crypto.md +++ b/docs/enc/crypto.md @@ -128,6 +128,23 @@ This implements the Citrix CTX1 password encoding algorithm. ----- +### Rail Fence Cipher + +This implements the Rail Fence encoding algorithm. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`rail` | text <-> rail fence ciphertext, X rails and Y offset | `rail-X-Y`, `rail_X_Y`| The encoding fence is built from the top. 
Careful to trailing whitespaces. +`railup` | text <-> rail fence ciphertext, X rails and Y offset | `railup-X-Y`, `railup_X_Y`| The encoding fence is built from the bottom. Inverted compaired to the `rail` codec. + +```python +>>> codext.encode("this is a test", "rail-5-3") +'it sss etiath ' +>>> codext.decode("it sss etiath ", "rail-5-3") +'this is a test' +``` + +----- ### ROT N This is a dynamic encoding, that is, it can be called with an integer to define the ROT offset. Encoding will apply a positive offset, decoding will apply a negative one. From 45df3f07ac2ba621ab96208204bdc35f83926821 Mon Sep 17 00:00:00 2001 From: smarbal Date: Sun, 23 Jan 2022 13:26:03 +0100 Subject: [PATCH 17/97] Fix up flag for rail fence --- codext/crypto/railfence.py | 8 ++++---- docs/enc/crypto.md | 4 +--- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/codext/crypto/railfence.py b/codext/crypto/railfence.py index 6e3fdb2..25ff28e 100644 --- a/codext/crypto/railfence.py +++ b/codext/crypto/railfence.py @@ -15,7 +15,7 @@ __examples__ = { 'enc(rail-5-3|rail_5_3)': {'this is a test' : 'it sss etiath '}, - 'enc(railup-5-3|railup_5_3)' :{'this is a test': 'h tiats e ssit'}, + 'enc(rail-5-3-up|rail_5_3-up)' :{'this is a test': 'h tiats e ssit'}, 'dec(rail-7-4|rail_7_4)': {'a stiet shsti': 'this is a test'} } @@ -38,7 +38,7 @@ def __buildf(text, rails, offset = 0, up = 0) : rail += dr return f -def railfence_encode(up = 0, rails = 3, offset = 0) : +def railfence_encode(rails = 3, offset = 0, up = 0) : def encode(text, errors="strict") : c,l = '', len(text) f = __buildf(text,rails,offset, up) @@ -49,7 +49,7 @@ def encode(text, errors="strict") : return c, l return encode -def railfence_decode(up = 0,rails = 3, offset = 0) : +def railfence_decode(rails = 3, offset = 0, up = 0) : def decode(text, errors = 'strict') : f = __buildf("x" * len(text), rails, offset, up) plain, i = '', 0 @@ -71,4 +71,4 @@ def decode(text, errors = 'strict') : return decode -add("rail", 
railfence_encode, railfence_decode, r"rail(up)?[-_](\d+)[-_](\d+)$") +add("rail", railfence_encode, railfence_decode, r"rail[-_](\d+)[-_](\d+)[-_]?(up)?$") diff --git a/docs/enc/crypto.md b/docs/enc/crypto.md index 93b1221..a9c13f7 100644 --- a/docs/enc/crypto.md +++ b/docs/enc/crypto.md @@ -134,9 +134,7 @@ This implements the Rail Fence encoding algorithm. **Codec** | **Conversions** | **Aliases** | **Comment** :---: | :---: | --- | --- -`rail` | text <-> rail fence ciphertext, X rails and Y offset | `rail-X-Y`, `rail_X_Y`| The encoding fence is built from the top. Careful to trailing whitespaces. -`railup` | text <-> rail fence ciphertext, X rails and Y offset | `railup-X-Y`, `railup_X_Y`| The encoding fence is built from the bottom. Inverted compaired to the `rail` codec. - +`rail` | text <-> rail fence ciphertext, X rails and Y offset | `rail-X-Y`, `rail_X_Y`, `rail-X-Y-up`| The encoding fence is built from the top. Careful to trailing whitespaces. The `up` flag is used to build the fence from the bottom to the top. 
```python >>> codext.encode("this is a test", "rail-5-3") 'it sss etiath ' From 9efecc54464f2588070410b4d218943d7c9aa282 Mon Sep 17 00:00:00 2001 From: smarbal Date: Sun, 23 Jan 2022 13:27:54 +0100 Subject: [PATCH 18/97] Fix spaces --- codext/crypto/railfence.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/codext/crypto/railfence.py b/codext/crypto/railfence.py index 25ff28e..3c231a1 100644 --- a/codext/crypto/railfence.py +++ b/codext/crypto/railfence.py @@ -28,7 +28,9 @@ def __buildf(text, rails, offset = 0, up = 0) : rail = rails - offset - 1 else : dr = 1 + f = [["#"] * l for i in range(rails)] + for x in range(l) : f[rail][x] = text[x] if rail >= rails - 1: @@ -41,7 +43,7 @@ def __buildf(text, rails, offset = 0, up = 0) : def railfence_encode(rails = 3, offset = 0, up = 0) : def encode(text, errors="strict") : c,l = '', len(text) - f = __buildf(text,rails,offset, up) + f = __buildf(text, rails, offset, up) for r in range(rails) : for x in range(l) : if f[r][x] != '#' : From 30313578fc757ce7c85b0f5b1c059223f72ea6b1 Mon Sep 17 00:00:00 2001 From: dhondta Date: Wed, 26 Jan 2022 22:59:30 +0100 Subject: [PATCH 19/97] Improved codec: railfence --- README.md | 2 +- codext/crypto/railfence.py | 104 ++++++++++++++++++++++--------------- docs/enc/crypto.md | 9 ++-- 3 files changed, 69 insertions(+), 46 deletions(-) diff --git a/README.md b/README.md index a52a8b0..370f3fd 100644 --- a/README.md +++ b/README.md @@ -259,7 +259,7 @@ o - [X] `bacon`: aka Baconian Cipher - [X] `barbie-N`: aka Barbie Typewriter (*N* belongs to [1, 4]) - [X] `citrix`: aka Citrix CTX1 password encoding -- [X] `rail`: aka Rail Fence Cipher +- [X] `railfence`: aka Rail Fence Cipher - [X] `rotN`: aka Caesar cipher (*N* belongs to [1,25]) - [X] `scytaleN`: encrypts using the number of letters on the rod (*N* belongs to [1,[) - [X] `shiftN`: shift ordinals (*N* belongs to [1,255]) diff --git a/codext/crypto/railfence.py b/codext/crypto/railfence.py index 3c231a1..3d150c0 
100644 --- a/codext/crypto/railfence.py +++ b/codext/crypto/railfence.py @@ -1,5 +1,5 @@ # -*- coding: UTF-8 -*- -"""Rail Fence Cipher Codec - rail fence encoding. +"""Rail Fence Cipher Codec - rail fence content encoding. This codec: - en/decodes strings from str to str @@ -7,31 +7,33 @@ - decodes file content to str (read) - encodes file content from str to bytes (write) """ - - - from ..__common__ import * __examples__ = { - 'enc(rail-5-3|rail_5_3)': {'this is a test' : 'it sss etiath '}, - 'enc(rail-5-3-up|rail_5_3-up)' :{'this is a test': 'h tiats e ssit'}, - 'dec(rail-7-4|rail_7_4)': {'a stiet shsti': 'this is a test'} + 'enc(rail_123|rail-2-123)': {'this is a test': None}, + 'enc(railfence|zigzag)': {'this is a test': "t ashsi etist"}, + 'enc(rail-5|zigzag_5)': {'this is a test': "tah istsiet s"}, + 'enc(rail_5-3|rail_5_3)': {'this is a test': "it sss etiath "}, + 'enc(rail-5-3-up|rail_5_3-up)': {'this is a test': "h tiats e ssit"}, + 'enc(rail-7-4|rail_7_4)': {'this is a test': "a stiet shsti"}, + 'dec(zigzag)': {'': ""}, } +__guess__ = ["railfence-%d" % i for i in range(1, 11)] + ["railfence-%d-up" % i for i in range(1, 11)] - -def __buildf(text, rails, offset = 0, up = 0) : +def __build(text, rails, offset, up): l, rail = len(text), offset - if up != '' : + # set the starting rail and direction + if up: dr = -1 rail = rails - offset - 1 - else : + else: dr = 1 - - f = [["#"] * l for i in range(rails)] - - for x in range(l) : + # create rails + f = [[None] * l for i in range(rails)] + # now zig-zag between rails + for x in range(l): f[rail][x] = text[x] if rail >= rails - 1: dr = -1 @@ -40,37 +42,55 @@ def __buildf(text, rails, offset = 0, up = 0) : rail += dr return f -def railfence_encode(rails = 3, offset = 0, up = 0) : - def encode(text, errors="strict") : - c,l = '', len(text) - f = __buildf(text, rails, offset, up) - for r in range(rails) : - for x in range(l) : - if f[r][x] != '#' : - c += f[r][x] - return c, l + +def __check(length, rails, offset): 
+ if rails > length: + raise ParameterError("Bad parameter for encoding 'railfence': rails=%d (should be >%d)" % (rails, length)) + if offset > rails: + raise ParameterError("Bad parameter for encoding 'railfence': offset=%d (should be >%d)" % (offset, rails)) + + +def railfence_encode(rails, offset, up): + rails, offset, up = int(rails or 3), int(offset or 0), up is not None and up != "" + def encode(text, errors="strict"): + r, l = "", len(text) + __check(l, rails, offset) + f = __build(text, rails, offset, up) + for rail in range(rails): + for x in range(l): + if f[rail][x] is not None: + r += f[rail][x] + return r, l return encode -def railfence_decode(rails = 3, offset = 0, up = 0) : - def decode(text, errors = 'strict') : - f = __buildf("x" * len(text), rails, offset, up) - plain, i = '', 0 - ra, l = range(rails), range(len(text)) - #Put the characters in the right place - for r in ra: - for x in l : - if f[r][x] == "x" : - f[r][x] = text[i] +def railfence_decode(rails, offset, up): + rails, offset, up = int(rails or 3), int(offset or 0), up is not None and up != "" + def decode(text, errors="strict"): + # this if block is particularly useful with Python2 ; see codecs.py at line 492 in comparison with codecs.py + # from Python3 at line 501: in Python2, a last block can be read while empty while in Python3 not + # as a consequence, in Python2, an error is triggered as an empty text cannot be decoded with Rail Fence with + # a rails parameter > 0 (see the __check(length, rails, offset)) function + if text == "": + return "", 0 + r, i, l = "", 0, len(text) + __check(l, rails, offset) + f = __build("." 
* len(text), rails, offset, up) + # put the characters in the right place + for rail in range(rails): + for x in range(l): + if f[rail][x] == ".": + f[rail][x] = text[i] i += 1 - #Read the characters in the right order - for x in l : - for r in ra: - if f[r][x] != '#' : - plain += f[r][x] + # read the characters in the right order + for x in range(l): + for rail in range(rails): + if f[rail][x] is not None: + r += f[rail][x] + return r, len(text) + return decode - return plain, len(plain) - return decode +add("railfence", railfence_encode, railfence_decode, + r"^(?:rail(?:[-_]?fence)?|zigzag)(?:[-_]([1-9]|[1-9]\d+)(?:[-_]([0-9]|[1-9]\d+))?(?:[-_](up))?)?$") -add("rail", railfence_encode, railfence_decode, r"rail[-_](\d+)[-_](\d+)[-_]?(up)?$") diff --git a/docs/enc/crypto.md b/docs/enc/crypto.md index a9c13f7..974f49d 100644 --- a/docs/enc/crypto.md +++ b/docs/enc/crypto.md @@ -130,15 +130,18 @@ This implements the Citrix CTX1 password encoding algorithm. ### Rail Fence Cipher -This implements the Rail Fence encoding algorithm. +This implements the Rail Fence encoding algorithm, using 3 rails and offset 0 as the default parameters. The encoding fence is built from the top ; the `up` flag can be used to build the fence from the bottom. Note that trying parameters that do not fit the input length will trigger a `ValueError` mentioning the bad value. **Codec** | **Conversions** | **Aliases** | **Comment** :---: | :---: | --- | --- -`rail` | text <-> rail fence ciphertext, X rails and Y offset | `rail-X-Y`, `rail_X_Y`, `rail-X-Y-up`| The encoding fence is built from the top. Careful to trailing whitespaces. The `up` flag is used to build the fence from the bottom to the top. +`rail` | text <-> rail fence ciphertext, X rails and Y offset | `rail-X-Y`, `rail_X_Y`, `rail-X-Y-up`, `zigzag`, ... 
| + ```python +>>> codext.encode("this is a test", "zigzag") +'t ashsi etist' >>> codext.encode("this is a test", "rail-5-3") 'it sss etiath ' ->>> codext.decode("it sss etiath ", "rail-5-3") +>>> codext.decode("it sss etiath ", "zigzag_5-3") 'this is a test' ``` From 1ebbdf8c3e74989010c64fa1ceb1f6fca60863be Mon Sep 17 00:00:00 2001 From: dhondta Date: Wed, 26 Jan 2022 22:59:50 +0100 Subject: [PATCH 20/97] New release --- codext/VERSION.txt | 2 +- codext/__common__.py | 9 +++++++-- codext/crypto/affine.py | 2 +- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/codext/VERSION.txt b/codext/VERSION.txt index e6dbb7c..6b37cb7 100644 --- a/codext/VERSION.txt +++ b/codext/VERSION.txt @@ -1 +1 @@ -1.11.5 +1.11.6 diff --git a/codext/__common__.py b/codext/__common__.py index 5a5b827..2af9387 100644 --- a/codext/__common__.py +++ b/codext/__common__.py @@ -37,7 +37,7 @@ "isb", "generate_strings_from_regex", "get_alphabet_from_mask", "handle_error", "is_native", "list_categories", "list_encodings", "list_macros", "lookup", "maketrans", "os", "rank", "re", "register", "remove", "reset", "s2i", "search", "stopfunc", "BytesIO", "_input", "_stripl", "CodecMacro", - "DARWIN", "LANG", "LINUX", "MASKS", "PY3", "UNIX", "WINDOWS"] + "ParameterError", "DARWIN", "LANG", "LINUX", "MASKS", "PY3", "UNIX", "WINDOWS"] CODECS_REGISTRY = None CODECS_OVERWRITTEN = [] CODECS_CATEGORIES = ["native", "custom"] @@ -145,6 +145,9 @@ def __repr__(self): return "" % (self.name, id(self)) +class ParameterError(ValueError): + __module__ = Exception.__module__ + # inspired from: https://stackoverflow.com/questions/10875442/possible-to-change-a-functions-repr-in-python class Repr(object): def __init__(self, name, func): @@ -908,7 +911,9 @@ def lookup(encoding, macro=True): try: return CodecMacro(encoding) except LookupError: - raise LookupError("unknown encoding: %s" % encoding) + e = LookupError("unknown encoding: %s" % encoding) + e.__cause__ = e # stop exception chaining + raise e 
codecs.lookup = lookup diff --git a/codext/crypto/affine.py b/codext/crypto/affine.py index f016868..cc18818 100755 --- a/codext/crypto/affine.py +++ b/codext/crypto/affine.py @@ -28,5 +28,5 @@ def encmap_factory(mask=None): return encmap -add_map("affine", encmap_factory, pattern=r"affine(?:[-_]cipher)?(?:[-_](.+?\-\d+\,\d+))?$") +add_map("affine", encmap_factory, pattern=r"^affine(?:[-_]cipher)?(?:[-_](.+?\-\d+\,\d+))?$") From c0cd162f0e62246fcadf26bf5385bf6a0ff7560d Mon Sep 17 00:00:00 2001 From: dhondta Date: Thu, 3 Feb 2022 23:29:52 +0100 Subject: [PATCH 21/97] Added codec: hexagram --- README.md | 6 ++++-- codext/stegano/__init__.py | 1 + codext/stegano/hexagram.py | 37 +++++++++++++++++++++++++++++++++++++ docs/enc/stegano.md | 17 +++++++++++++++++ 4 files changed, 59 insertions(+), 2 deletions(-) create mode 100755 codext/stegano/hexagram.py diff --git a/README.md b/README.md index 370f3fd..0683960 100644 --- a/README.md +++ b/README.md @@ -213,10 +213,11 @@ o #### BaseXX -- [X] `ascii85`: classical ASCII85 (Python3 only) -- [X] `baseN`: see [base encodings](https://python-codext.readthedocs.io/en/latest/enc/base.html) (incl base32, 36, 45, 58, 62, 63, 64, 91, 100, 122) +- [X] `baseN`: see [base encodings](https://python-codext.readthedocs.io/en/latest/enc/base.html) (incl [z]base32, 36, 45, 58, 62, 63, 64, [z]85, 91, 100, 122) - [X] `base-genericN`: see [base encodings](https://python-codext.readthedocs.io/en/latest/enc/base.html) ; supports any possible base +This category also contains `ascii85`, `adobe`, `[x]btoa`, `zeromq` with the `base85` codec. + #### Binary - [X] `baudot`: supports CCITT-1, CCITT-2, EU/FR, ITA1, ITA2, MTK-2 (Python3 only), UK, ... 
@@ -302,6 +303,7 @@ o #### Steganography +- [X] `hexagram`: uses Base64 and encodes the result to a charset of [I Ching hexagrams](https://en.wikipedia.org/wiki/Hexagram_%28I_Ching%29) (as implemented [here](https://github.com/qntm/hexagram-encode)) - [X] `klopf`: aka Klopf code ; Polybius square with trivial alphabetical distribution - [X] `resistor`: aka resistor color codes - [X] `rick`: aka Rick cipher (in reference to Rick Astley's song "*Never gonna give you up*") diff --git a/codext/stegano/__init__.py b/codext/stegano/__init__.py index 03541a7..0f5d06b 100755 --- a/codext/stegano/__init__.py +++ b/codext/stegano/__init__.py @@ -1,4 +1,5 @@ # -*- coding: UTF-8 -*- +from .hexagram import * from .klopf import * from .resistor import * from .rick import * diff --git a/codext/stegano/hexagram.py b/codext/stegano/hexagram.py new file mode 100755 index 0000000..4c32095 --- /dev/null +++ b/codext/stegano/hexagram.py @@ -0,0 +1,37 @@ +# -*- coding: UTF-8 -*- +"""Hexagram Codec - hexagram content encoding. 
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +if PY3: + __examples__ = { + 'enc(hexagram|iching|i-ching-hexagrams)': {'this is a test': "䷰䷭䷚䷔䷞䷺䷗䷔䷞䷺䷗䷚䷏䷊䷂䷕䷞䷈䷇☯"}, + } + + ENCMAP = {c1: c2 for c1, c2 in zip("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=", + "䷁䷗䷆䷒䷎䷣䷭䷊䷏䷲䷧䷵䷽䷶䷟䷡䷇䷂䷜䷻䷦䷾䷯䷄䷬䷐䷮䷹䷞䷰䷛䷪䷖䷚䷃䷨䷳䷕" + "䷑䷙䷢䷔䷿䷥䷷䷝䷱䷍䷓䷩䷺䷼䷴䷤䷸䷈䷋䷘䷅䷉䷠䷌䷫䷀☯")} + DECMAP = {c2: c1 for c1, c2 in ENCMAP.items()} + + def hexagram_encode(input, errors="strict"): + return "".join(ENCMAP[c] for c in codecs.encode(input, "base64")), len(input) + + def hexagram_decode(input, errors="strict"): + r, ehandler = "", handle_error("hexagram", errors, decode=True) + for i, c in enumerate(input): + try: + r += DECMAP[c] + except KeyError: + r += ehandler(c, i, r) + return codecs.decode(r, "base64"), len(input) + + add("hexagram", hexagram_encode, hexagram_decode, printables_rate=0., + pattern=r"^(?:(?:i-ching-)?hexagrams?|i-?ching)$") + diff --git a/docs/enc/stegano.md b/docs/enc/stegano.md index d2fb212..57dfb18 100644 --- a/docs/enc/stegano.md +++ b/docs/enc/stegano.md @@ -4,6 +4,23 @@ ----- +### Hexagrams (I Ching) + +This uses Base64 and then encodes output characters to [I Ching Hexagrams](https://en.wikipedia.org/wiki/Hexagram_%28I_Ching%29) such that implemented [here](https://github.com/qntm/hexagram-encode). + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`hexagram` | text <-> hexagrams-encoded Base64 | `hexagrams`, `i-ching-hexagrams`, `iching` | Python3 only + +```python +>>> codext.encode("this is a test", "hexagram") +'䷰䷭䷚䷔䷞䷺䷗䷔䷞䷺䷗䷚䷏䷊䷂䷕䷞䷈䷇☯' +>>> codext.decode("䷰䷭䷚䷔䷞䷺䷗䷔䷞䷺䷗䷚䷏䷊䷂䷕䷞䷈䷇☯", "iching") +'this is a test' +``` + +----- + ### Klopf Code This is a Polybius code with the trivial alphabetical distribution ("A" -> (1,1), "B" -> (2,1), ...). 
This can be tested [here](https://gc.de/gc/klopfcode/). From 31406e13f81b55be24c5b6f79b8a661cc011a8c0 Mon Sep 17 00:00:00 2001 From: dhondta Date: Thu, 3 Feb 2022 23:31:52 +0100 Subject: [PATCH 22/97] Fixed minor issues in base --- codext/base/_base.py | 3 ++- codext/base/base45.py | 6 +----- codext/base/base91.py | 6 +----- 3 files changed, 4 insertions(+), 11 deletions(-) diff --git a/codext/base/_base.py b/codext/base/_base.py index 9bd4d23..3190155 100755 --- a/codext/base/_base.py +++ b/codext/base/_base.py @@ -251,7 +251,8 @@ def _main(): c, f = _input(args.file), [encode, decode][args.decode] c = c.rstrip("\r\n") if isinstance(c, str) else c.rstrip(b"\r\n") try: - c = f(c, "base" + base + ["", "-inv"][args.invert], ["strict", "ignore"][args.ignore_garbage]) + c = f(c, "base" + base + ["", "-inv"][getattr(args, "invert", False)], + ["strict", "ignore"][args.ignore_garbage]) except Exception as err: print("%sbase%s: invalid input" % (getattr(err, "output", ""), base)) return 1 diff --git a/codext/base/base45.py b/codext/base/base45.py index 12a3912..e3d2fea 100755 --- a/codext/base/base45.py +++ b/codext/base/base45.py @@ -28,10 +28,6 @@ __ord = lambda c: ord(c) if not isinstance(c, int) else c -class Base45DecodeError(ValueError): - pass - - def base45_encode(mode): mode = mode.replace("inverted", "inv").replace("_", "-").lstrip("-") b45 = B45[['inv', ''][mode == ""]] @@ -57,7 +53,7 @@ def base45_decode(mode): b45 = {c: i for i, c in enumerate(B45[['inv', ''][mode == ""]])} def decode(text, errors="strict"): t, s, err = b(text), "", "'base45' codec can't decode character '%s' in position %d" - ehandler = handle_error("base45", errors, Base45DecodeError, decode=True) + ehandler = handle_error("base45", errors, decode=True) for i in range(0, len(text), 3): try: n = b45[__chr(t[i])] diff --git a/codext/base/base91.py b/codext/base/base91.py index 9e12f07..d7ca416 100755 --- a/codext/base/base91.py +++ b/codext/base/base91.py @@ -26,10 +26,6 @@ __ord = lambda c: 
ord(c) if not isinstance(c, int) else c -class Base91DecodeError(ValueError): - pass - - def base91_encode(mode): mode = mode.replace("alternate", "alt").replace("inverted", "inv").replace("_", "-").lstrip("-") b91 = B91[mode if mode in B91.keys() else ""] @@ -79,7 +75,7 @@ def base91_decode(mode): b91 = {c: i for i, c in enumerate(B91[mode if mode in B91.keys() else ""])} def decode(text, errors="strict"): t, s, bits, alt = b(text), "", "", mode.startswith("alt") - ehandler = handle_error("base91", errors, Base91DecodeError, decode=True) + ehandler = handle_error("base91", errors, decode=True) for i in range(0, len(t), 2): try: n = b91[__chr(t[i])] * [1, 91][alt] From 0f6e6355359ce5a342b084af3bdb0687b331e62b Mon Sep 17 00:00:00 2001 From: dhondta Date: Fri, 4 Feb 2022 17:50:24 +0100 Subject: [PATCH 23/97] Applied minor improvements --- codext/__common__.py | 67 +++++++++++++++++++++++++----------------- codext/common/cases.py | 13 ++++---- codext/common/dummy.py | 6 ++-- docs/manipulations.md | 2 +- 4 files changed, 50 insertions(+), 38 deletions(-) diff --git a/codext/__common__.py b/codext/__common__.py index 2af9387..79625a2 100644 --- a/codext/__common__.py +++ b/codext/__common__.py @@ -19,17 +19,14 @@ from six import binary_type, string_types, text_type, BytesIO from string import * from types import FunctionType, ModuleType -try: # Python3 - from importlib import reload -except ImportError: - pass -try: # Python3 - from inspect import getfullargspec -except ImportError: - from inspect import getargspec as getfullargspec try: # Python 2 + import __builtin__ as builtins + from inspect import getargspec as getfullargspec from string import maketrans except ImportError: # Python 3 + import builtins + from importlib import reload + from inspect import getfullargspec maketrans = str.maketrans @@ -37,7 +34,7 @@ "isb", "generate_strings_from_regex", "get_alphabet_from_mask", "handle_error", "is_native", "list_categories", "list_encodings", "list_macros", "lookup", 
"maketrans", "os", "rank", "re", "register", "remove", "reset", "s2i", "search", "stopfunc", "BytesIO", "_input", "_stripl", "CodecMacro", - "ParameterError", "DARWIN", "LANG", "LINUX", "MASKS", "PY3", "UNIX", "WINDOWS"] + "DARWIN", "LANG", "LINUX", "MASKS", "PY3", "UNIX", "WINDOWS"] CODECS_REGISTRY = None CODECS_OVERWRITTEN = [] CODECS_CATEGORIES = ["native", "custom"] @@ -75,6 +72,7 @@ fix = lambda x, ref: b(x) if isb(ref) else ensure_str(x) if iss(ref) else x s2i = lambda s: int(codecs.encode(s, "base16"), 16) +exc_name = lambda e: "".join(t.capitalize() for t in re.split(r"[-_+]", e)) class CodecMacro(tuple): @@ -145,9 +143,6 @@ def __repr__(self): return "" % (self.name, id(self)) -class ParameterError(ValueError): - __module__ = Exception.__module__ - # inspired from: https://stackoverflow.com/questions/10875442/possible-to-change-a-functions-repr-in-python class Repr(object): def __init__(self, name, func): @@ -185,6 +180,14 @@ def _input(infile): return c +def _set_exc(name, etype="ValueError"): + if not hasattr(builtins, name): + exec("class %s(%s): __module__ = 'builtins'" % (name, etype)) + setattr(builtins, name, locals()[name]) +_set_exc("InputSizeLimitError") +_set_exc("ParameterError") + + def _stripl(s, st_lines, st_crlf): if st_crlf: s = s.replace(b"\r\n", b"") if isb(s) else s.replace("\r\n", "") @@ -213,12 +216,18 @@ def add(ename, encode=None, decode=None, pattern=None, text=True, add_to_codecs= to remove the codec later """ remove(ename) - if encode and not isinstance(encode, FunctionType): - raise ValueError("Bad 'encode' function") - if decode and not isinstance(decode, FunctionType): - raise ValueError("Bad 'decode' function") + if encode: + if not isinstance(encode, FunctionType): + raise ValueError("Bad 'encode' function") + _set_exc("%sEncodeError" % exc_name(ename)) # create the custom encode exception as a builtin + if decode: + if not isinstance(decode, FunctionType): + raise ValueError("Bad 'decode' function") + 
_set_exc("%sDecodeError" % exc_name(ename)) # create the custom decode exception as a builtin if not encode and not decode: raise ValueError("At least one en/decoding function must be defined") + for exc in kwargs.get('extra_exceptions', []): + _set_exc(exc) # create additional custom exceptions as builtins glob = currentframe().f_back.f_globals # search function for the new encoding @_with_repr(ename) @@ -516,7 +525,8 @@ def code(text, errors="strict"): text = "".join(str(ord(c)).zfill(3) for c in text) r = "" lsep = "" if decode else sep if len(sep) <= 1 else sep[0] - error_func = handle_error(ename, errors, lsep, repl_char, rminlen, decode) + kind = ["character", "token"][tmaxlen > 1] + error_func = handle_error(ename, errors, lsep, repl_char, rminlen, decode, kind) # get the value from the mapping dictionary, trying the token with its inverted case if relevant def __get_value(token, position, case_changed=False): @@ -722,6 +732,11 @@ def remove(name): json.dump(PERS_MACROS, f, indent=2) except KeyError: pass + for s in ["En", "De"]: + try: + delattr(builtins, "%s%scodeError" % (name.capitalize(), s)) + except AttributeError: + pass codecs.remove = remove @@ -815,7 +830,7 @@ def get_alphabet_from_mask(mask): # generic error handling function -def handle_error(ename, errors, sep="", repl_char="?", repl_minlen=1, decode=False, item="position"): +def handle_error(ename, errors, sep="", repl_char="?", repl_minlen=1, decode=False, kind="character", item="position"): """ This shortcut function allows to handle error modes given some tuning parameters. :param ename: encoding name @@ -826,13 +841,9 @@ def handle_error(ename, errors, sep="", repl_char="?", repl_minlen=1, decode=Fal :param decode: whether we are encoding or decoding :param item: position item description (for describing the error ; e.g. 
"group" or "token") """ - name = "".join(t.capitalize() for t in re.split(r"[-_+]", ename)) - # dynamically make dedicated exception classes bound to the related codec module - exc = "%s%scodeError" % (name, ["En", "De"][decode]) - glob = {'__name__': "__main__"} - exec("class %s(ValueError): pass" % exc, glob) - - def _handle_error(token, position, output=""): + exc = "%s%scodeError" % (exc_name(ename), ["En", "De"][decode]) + + def _handle_error(token, position, output="", eename=None): """ This handles an encoding/decoding error according to the selected handling mode. :param token: input token to be encoded/decoded @@ -840,9 +851,11 @@ def _handle_error(token, position, output=""): :param output: output, as decoded up to the position of the error """ if errors == "strict": - msg = "'{}' codec can't {}code character '{}' in {} {}" - err = glob[exc](msg.format(ename, ["en", "de"][decode], token, item, position)) + msg = "'%s' codec can't %scode %s '%s' in %s %d" + token = token[:7] + "..." 
if len(token) > 10 else token + err = getattr(builtins, exc)(msg % (eename or ename, ["en", "de"][decode], kind, token, item, position)) err.output = output + err.__cause__ = err raise err elif errors == "leave": return token + sep diff --git a/codext/common/cases.py b/codext/common/cases.py index fa7d09c..16b7812 100644 --- a/codext/common/cases.py +++ b/codext/common/cases.py @@ -15,24 +15,23 @@ pascal = lambda i, e="strict": ("".join(x.capitalize() for x in re.findall(r"[0-9a-z]+", i.lower())), len(i)) -add("camel", lambda i, e: uncapitalize(pascal(i, e)[0]), None, r"^camel(?:[-_]?case)?$") -add("pascal", pascal, None, r"^pascal(?:[-_]?case)?$") +add("camelcase", lambda i, e="strict": uncapitalize(pascal(i, e)[0]), None, r"^camel(?:[-_]?case)?$") +add("pascalcase", pascal, None, r"^pascal(?:[-_]?case)?$") capitalize = lambda i, e="strict": (i.capitalize(), len(i)) uncapitalize = lambda i, e="strict": (i[0].lower() + i[1:] if len(i) > 0 else "", len(i)) add("capitalize", capitalize, uncapitalize) -lowercase = lambda i, e="strict": (i.lower(), len(i)) -uppercase = lambda i, e="strict": (i.upper(), len(i)) +lowercase, uppercase = lambda i, e="strict": (i.lower(), len(i)), lambda i, e="strict": (i.upper(), len(i)) add("uppercase", uppercase, lowercase, r"^upper(?:case)?$") add("lowercase", lowercase, uppercase, r"^lower(?:case)?$") slugify = lambda i, e="strict", d="-": (re.sub(r"[^0-9a-z]+", d, i.lower()).strip(d), len(i)) -add("slugify", lambda i, e: slugify(i, e), None, r"^(?:slug(?:ify)?|kebab(?:[-_]?case)?)$") -add("snake", lambda i, e: slugify(i, e, "_"), None, r"^snake(?:[-_]?case)$") +add("slugify", lambda i, e="strict": slugify(i, e), None, r"^(?:slug(?:ify)?|kebab(?:[-_]?case)?)$") +add("snakecase", lambda i, e="strict": slugify(i, e, "_"), None, r"^snake(?:[-_]?case)$") swapcase = lambda i, e="strict": (i.swapcase(), len(i)) -add("swapcase", swapcase, swapcase, r"^swap(?:[-_]?case)?$") +add("swapcase", swapcase, swapcase, 
r"^(?:swap(?:[-_]?case)?|invert(?:case)?)$") title = lambda i, e="strict": (i.title(), len(i)) untitle = lambda i, e="strict": (" ".join(w[0].lower() + w[1:] if len(w) > 0 else "" for w in i.split()), len(i)) diff --git a/codext/common/dummy.py b/codext/common/dummy.py index cd98e4f..f2dd2fb 100755 --- a/codext/common/dummy.py +++ b/codext/common/dummy.py @@ -11,11 +11,11 @@ """ import re -from ..__common__ import add +from ..__common__ import * def replace(pair, *args): - def code(input, error="strict"): + def code(input, errors="strict"): return input.replace(pair[0], pair[1]), len(input) return code add("replace", replace, replace, r"^replace[-_]?((?!.*(.).*\2)..)$", guess=None) @@ -26,7 +26,7 @@ def code(input, error="strict"): def substitute(token, replacement): - def code(input, error="strict"): + def code(input, errors="strict"): return input.replace(token, replacement), len(input) return code add("substitute", substitute, substitute, r"^substitute[-_]?(.*?)/(.*?)$", guess=None) diff --git a/docs/manipulations.md b/docs/manipulations.md index 641cab6..7962278 100644 --- a/docs/manipulations.md +++ b/docs/manipulations.md @@ -16,7 +16,7 @@ These transformation functions are simple string transformations, including `str `pascalcase` | text --> pascal-case text | `pascal` | no decoding `slugify` | text --> slug | `slug`, `kebab`, `kebabcase` | no decoding `snakecase` | text --> snake-case text | `snake` | no decoding -`swapcase` | text <-> case-swapped text | `swap` | +`swapcase` | text <-> case-swapped text | `swap`, `invert`, `invertcase` | `title` | text <-> titled text | | decoding "untitles" the text `uppercase` | text <-> uppercase text | `upper` | decoding is `lowercase` From c24ab651489384bd3f8a8b7fcf67e4efbcf592ac Mon Sep 17 00:00:00 2001 From: dhondta Date: Sat, 5 Feb 2022 10:38:17 +0100 Subject: [PATCH 24/97] Fixed minor issues --- codext/__common__.py | 26 +++++++++++++++++++------- codext/common/cases.py | 2 +- codext/others/html.py | 3 ++- 
codext/others/letters.py | 4 ++-- tests/test_common.py | 1 + tests/test_generated.py | 4 +++- 6 files changed, 28 insertions(+), 12 deletions(-) diff --git a/codext/__common__.py b/codext/__common__.py index 79625a2..3db87a9 100644 --- a/codext/__common__.py +++ b/codext/__common__.py @@ -19,19 +19,26 @@ from six import binary_type, string_types, text_type, BytesIO from string import * from types import FunctionType, ModuleType -try: # Python 2 +try: # Python2 import __builtin__ as builtins - from inspect import getargspec as getfullargspec - from string import maketrans -except ImportError: # Python 3 +except ImportError: import builtins - from importlib import reload +try: # Python2 from inspect import getfullargspec +except ImportError: + from inspect import getargspec as getfullargspec +try: # Python2 + from string import maketrans +except ImportError: maketrans = str.maketrans +try: # Python3 + from importlib import reload +except ImportError: + pass __all__ = ["add", "add_macro", "add_map", "b", "clear", "codecs", "decode", "encode", "ensure_str", "examples", "guess", - "isb", "generate_strings_from_regex", "get_alphabet_from_mask", "handle_error", "is_native", + "isb", "generate_strings_from_regex", "get_alphabet_from_mask", "handle_error", "i2s", "is_native", "list_categories", "list_encodings", "list_macros", "lookup", "maketrans", "os", "rank", "re", "register", "remove", "reset", "s2i", "search", "stopfunc", "BytesIO", "_input", "_stripl", "CodecMacro", "DARWIN", "LANG", "LINUX", "MASKS", "PY3", "UNIX", "WINDOWS"] @@ -75,6 +82,11 @@ exc_name = lambda e: "".join(t.capitalize() for t in re.split(r"[-_+]", e)) +def i2s(input): + h = hex(input)[2:].rstrip("eL") + return codecs.decode(h.zfill(len(h) + len(h) % 2), "hex") + + class CodecMacro(tuple): """Macro details when looking up the codec registry. 
""" def __new__(cls, name): @@ -170,11 +182,11 @@ def __stdin_pipe(): def _input(infile): # handle input file or stdin + c = b("") if infile: with open(infile, 'rb') as f: c = f.read() else: - c = b("") for line in __stdin_pipe(): c += line return c diff --git a/codext/common/cases.py b/codext/common/cases.py index 16b7812..65fbdf2 100644 --- a/codext/common/cases.py +++ b/codext/common/cases.py @@ -28,7 +28,7 @@ slugify = lambda i, e="strict", d="-": (re.sub(r"[^0-9a-z]+", d, i.lower()).strip(d), len(i)) add("slugify", lambda i, e="strict": slugify(i, e), None, r"^(?:slug(?:ify)?|kebab(?:[-_]?case)?)$") -add("snakecase", lambda i, e="strict": slugify(i, e, "_"), None, r"^snake(?:[-_]?case)$") +add("snakecase", lambda i, e="strict": slugify(i, e, "_"), None, r"^snake(?:[-_]?case)?$") swapcase = lambda i, e="strict": (i.swapcase(), len(i)) add("swapcase", swapcase, swapcase, r"^(?:swap(?:[-_]?case)?|invert(?:case)?)$") diff --git a/codext/others/html.py b/codext/others/html.py index 2f23cb2..0a128b3 100755 --- a/codext/others/html.py +++ b/codext/others/html.py @@ -287,5 +287,6 @@ def htmlentity_decode(text, errors="strict"): return s, len(text) -add("html", htmlentity_encode, htmlentity_decode, r"^html(?:[-_]?entit(?:y|ies))?$") +add("html", htmlentity_encode, htmlentity_decode, r"^html(?:[-_]?entit(?:y|ies))?$", + extra_exceptions=["HtmlEntityDecodeError"]) diff --git a/codext/others/letters.py b/codext/others/letters.py index 6fcd6e9..e27ae96 100755 --- a/codext/others/letters.py +++ b/codext/others/letters.py @@ -60,7 +60,7 @@ def encode(text, errors="strict"): try: s += encmap[c] except KeyError: - s += handle_error(letters + "_indices", errors)(c, i) + s += handle_error("letter-indices", errors)(c, i) return "".join(encmap.get(c.upper(), c) for c in text), len(text) return encode @@ -81,7 +81,7 @@ def decode(text, errors="strict"): except (IndexError, KeyError): pass if err: - s += handle_error(letters + "_indices", errors, decode=True)(text[i], i) + s += 
handle_error("letter-indices", errors, decode=True)(text[i], i) return s, len(text) return decode diff --git a/tests/test_common.py b/tests/test_common.py index 0d9b381..ec57aaa 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -123,6 +123,7 @@ def test_reset_codecs(self): self.assertIsNotNone(codext.encode("test", "morse")) self.assertRaises(LookupError, codext.encode, "test", "dummy") self.assertTrue(len(CODECS_OVERWRITTEN) > 0) + self.assertIsNotNone(str(CODECS_OVERWRITTEN[0])) def test_search_codecs(self): self.assertIsNotNone(codext.search("morse")) diff --git a/tests/test_generated.py b/tests/test_generated.py index 54a3f60..6b89129 100644 --- a/tests/test_generated.py +++ b/tests/test_generated.py @@ -108,7 +108,9 @@ def _template(self): with codecs.open(tfile, 'wb', encoding=ename) as f: f.write(b(s1)) with codecs.open(tfile, 'rb', encoding=ename) as f: - s = f.read() if PY3 else f.read().rstrip("\x00") + s = f.read() + if not PY3 and re.search("[^\x00]\x00$", s): + s = s[:-1] self.assertEqual(b(icdec(f2(s2, ename))), b(icdec(s))) os.remove(tfile) return _template From 5667d2b77c72509e2ee836d8e19544c75188c810 Mon Sep 17 00:00:00 2001 From: dhondta Date: Sat, 5 Feb 2022 10:38:28 +0100 Subject: [PATCH 25/97] Improved base codecs --- README.md | 43 ++++++--- codext/base/__init__.py | 1 - codext/base/_base.py | 80 ++++++++++------ codext/base/_base2n.py | 25 ++--- codext/base/ascii85.py | 28 ------ codext/base/base100.py | 6 +- codext/base/base45.py | 17 ++-- codext/base/base85.py | 202 +++++++++++++++++++++++++++++++++------- codext/base/base91.py | 20 ++-- codext/base/baseN.py | 67 +++++++------ docs/enc/base.md | 10 +- tests/test_base.py | 86 +++++++---------- 12 files changed, 359 insertions(+), 226 deletions(-) delete mode 100755 codext/base/ascii85.py diff --git a/README.md b/README.md index 0683960..fb1e95f 100644 --- a/README.md +++ b/README.md @@ -211,14 +211,33 @@ o ## :page_with_curl: List of codecs -#### BaseXX - -- [X] `baseN`: see 
[base encodings](https://python-codext.readthedocs.io/en/latest/enc/base.html) (incl [z]base32, 36, 45, 58, 62, 63, 64, [z]85, 91, 100, 122) +#### [BaseXX](https://python-codext.readthedocs.io/en/latest/enc/base.html) + +- [X] `base1`: useless, but for the sake of completeness +- [X] `base2`: simple conversion to binary (with a variant with a reversed alphabet) +- [X] `base3`: conversion to ternary (with a variant with a reversed alphabet) +- [X] `base4`: conversion to quarternary (with a variant with a reversed alphabet) +- [X] `base8`: simple conversion to octal (with a variant with a reversed alphabet) +- [X] `base10`: simple conversion to decimal +- [X] `base16`: simple conversion to hexadecimal (with a variant holding an alphabet with digits and letters inverted) +- [X] `base26`: conversion to alphabet letters +- [X] `base32`: classical conversion according to the RFC4648 with all its variants ([zbase32](https://philzimmermann.com/docs/human-oriented-base-32-encoding.txt), extended hexadecimal, [geohash](https://en.wikipedia.org/wiki/Geohash), [Crockford](https://www.crockford.com/base32.html)) +- [X] `base36`: [Base36](https://en.wikipedia.org/wiki/Base36) conversion to letters and digits (with a variant inverting both groups) +- [X] `base45`: [Base45](https://datatracker.ietf.org/doc/html/draft-faltstrom-base45-04.txt) DRAFT algorithm (with a variant inverting letters and digits) +- [X] `base58`: multiple versions of [Base58](https://en.bitcoinwiki.org/wiki/Base58) (bitcoin, flickr, ripple) +- [X] `base62`: [Base62](https://en.wikipedia.org/wiki/Base62) conversion to lower- and uppercase letters and digits (with a variant with letters and digits inverted) +- [X] `base63`: similar to `base62` with the "`_`" added +- [X] `base64`: classical conversion according to RFC4648 with its variant URL (or *file*) (it also holds a variant with letters and digits inverted) +- [X] `base67`: custom conversion using some more special characters (also with a variant with 
letters and digits inverted) +- [X] `base85`: all variants of Base85 ([Ascii85](https://fr.wikipedia.org/wiki/Ascii85), [z85](https://rfc.zeromq.org/spec/32), [Adobe](https://dencode.com/string/ascii85), [(x)btoa](https://dencode.com/string/ascii85), [RFC1924](https://datatracker.ietf.org/doc/html/rfc1924), [XML](https://datatracker.ietf.org/doc/html/draft-kwiatkowski-base85-for-xml-00)) +- [X] `base91`: [Base91](http://base91.sourceforge.net) custom conversion +- [X] `base100` (or *emoji*): [Base100](https://github.com/AdamNiederer/base100) custom conversion +- [X] `base122`: [Base100](http://blog.kevinalbs.com/base122) custom conversion - [X] `base-genericN`: see [base encodings](https://python-codext.readthedocs.io/en/latest/enc/base.html) ; supports any possible base This category also contains `ascii85`, `adobe`, `[x]btoa`, `zeromq` with the `base85` codec. -#### Binary +#### [Binary](https://python-codext.readthedocs.io/en/latest/enc/binary.html) - [X] `baudot`: supports CCITT-1, CCITT-2, EU/FR, ITA1, ITA2, MTK-2 (Python3 only), UK, ... 
- [X] `baudot-spaced`: variant of `baudot` ; groups of 5 bits are whitespace-separated @@ -232,17 +251,17 @@ This category also contains `ascii85`, `adobe`, `[x]btoa`, `zeromq` with the `ba - [X] `manchester-inverted`: variant of `manchester` ; XORes each bit of the input with `10` - [X] `rotateN`: rotates characters by the specified number of bits (*N* belongs to [1, 7] ; Python 3 only) -#### Common +#### [Common](https://python-codext.readthedocs.io/en/latest/enc/common.html) - [X] `a1z26`: keeps words whitespace-separated and uses a custom character separator - [X] `cases`: set of case-related encodings (including camel-, kebab-, lower-, pascal-, upper-, snake- and swap-case, slugify, capitalize, title) -- [X] `dummy`: set of simple encodings (including replace, reverse, word-reverse, substite and strip-spaces) +- [X] `dummy`: set of simple encodings (including integer, replace, reverse, word-reverse, substite and strip-spaces) - [X] `octal`: dummy octal conversion (converts to 3-digits groups) - [X] `octal-spaced`: variant of `octal` ; dummy octal conversion, handling whitespace separators - [X] `ordinal`: dummy character ordinals conversion (converts to 3-digits groups) - [X] `ordinal-spaced`: variant of `ordinal` ; dummy character ordinals conversion, handling whitespace separators -#### Compression +#### [Compression](https://python-codext.readthedocs.io/en/latest/enc/compressions.html) - [X] `gzip`: standard Gzip compression/decompression - [X] `lz77`: compresses the given data with the algorithm of Lempel and Ziv of 1977 @@ -253,7 +272,7 @@ This category also contains `ascii85`, `adobe`, `[x]btoa`, `zeromq` with the `ba > :warning: Compression functions are of course definitely **NOT** encoding functions ; they are implemented for leveraging the `.encode(...)` API from `codecs`. 
-#### Cryptography +#### [Cryptography](https://python-codext.readthedocs.io/en/latest/enc/crypto.html) - [X] `affine`: aka Affine Cipher - [X] `atbash`: aka Atbash Cipher @@ -268,7 +287,7 @@ This category also contains `ascii85`, `adobe`, `[x]btoa`, `zeromq` with the `ba > :warning: Crypto functions are of course definitely **NOT** encoding functions ; they are implemented for leveraging the `.encode(...)` API from `codecs`. -#### Hashing +#### [Hashing](https://python-codext.readthedocs.io/en/latest/enc/hashing.html) - [X] `blake`: includes BLAKE2b and BLAKE2s (Python 3 only ; relies on `hashlib`) - [X] `checksums`: includes Adler32 and CRC32 (relies on `zlib`) @@ -279,7 +298,7 @@ This category also contains `ascii85`, `adobe`, `[x]btoa`, `zeromq` with the `ba > :warning: Hash functions are of course definitely **NOT** encoding functions ; they are implemented for convenience with the `.encode(...)` API from `codecs` and useful for chaning codecs. -#### Languages +#### [Languages](https://python-codext.readthedocs.io/en/latest/enc/languages.html) - [X] `braille`: well-known braille language (Python 3 only) - [X] `ipsum`: aka lorem ipsum @@ -293,7 +312,7 @@ This category also contains `ascii85`, `adobe`, `[x]btoa`, `zeromq` with the `ba - [X] `tap`: converts text to tap/knock code, commonly used by prisoners - [X] `tomtom`: similar to `morse`, using slashes and backslashes -#### Others +#### [Others](https://python-codext.readthedocs.io/en/latest/enc/others.html) - [X] `dna`: implements the 8 rules of DNA sequences (N belongs to [1,8]) - [X] `html`: implements entities according to [this reference](https://dev.w3.org/html5/html-author/charref) @@ -301,7 +320,7 @@ This category also contains `ascii85`, `adobe`, `[x]btoa`, `zeromq` with the `ba - [X] `markdown`: unidirectional encoding from Markdown to HTML - [X] `url`: aka URL encoding -#### Steganography +#### [Steganography](https://python-codext.readthedocs.io/en/latest/enc/stegano.html) - [X] `hexagram`: uses 
Base64 and encodes the result to a charset of [I Ching hexagrams](https://en.wikipedia.org/wiki/Hexagram_%28I_Ching%29) (as implemented [here](https://github.com/qntm/hexagram-encode)) - [X] `klopf`: aka Klopf code ; Polybius square with trivial alphabetical distribution diff --git a/codext/base/__init__.py b/codext/base/__init__.py index dd167cb..de8e9bd 100755 --- a/codext/base/__init__.py +++ b/codext/base/__init__.py @@ -2,7 +2,6 @@ from argparse import ArgumentParser, RawTextHelpFormatter from types import MethodType -from .ascii85 import * from .base45 import * from .base85 import * from .base91 import * diff --git a/codext/base/_base.py b/codext/base/_base.py index 3190155..b5abf10 100755 --- a/codext/base/_base.py +++ b/codext/base/_base.py @@ -10,9 +10,13 @@ from types import FunctionType, MethodType from ..__common__ import * +from ..__common__ import _set_exc from ..__info__ import __version__ +_set_exc("BaseError") +_set_exc("BaseEncodeError") +_set_exc("BaseDecodeError") """ Curve fitting: @@ -44,18 +48,7 @@ [ 0.02827357 0.00510124 -0.99999984 0.01536941] """ EXPANSION_FACTOR = lambda base: 0.02827357 / (base**0.00510124-0.99999984) + 0.01536941 - - -class BaseError(ValueError): - pass - - -class BaseDecodeError(BaseError): - pass - - -class BaseEncodeError(BaseError): - pass +SIZE_LIMIT = 1024 * 1024 * 1024 def _generate_charset(n): @@ -95,14 +88,19 @@ def _get_charset(charset, p=""): except KeyError: pass # or handle [p]arameter as a pattern - default, n = None, None + default, n, best = None, None, None for pattern, cset in charset.items(): n = len(cset) - if pattern == "": + if re.match(pattern, ""): default = cset continue - if re.match(pattern, p): - return cset + m = re.match(pattern, p) + if m: # find the longest match from the patterns + s, e = m.span() + if e - s > len(best or ""): + best = pattern + if best: + return charset[best] # special case: the given [p]arameter can be the charset itself if it has the right length p = re.sub(r"^[-_]+", 
"", p) if len(p) == n: @@ -110,7 +108,7 @@ def _get_charset(charset, p=""): # or simply rely on key '' if default is not None: return default - raise ValueError("Bad charset descriptor") + raise ValueError("Bad charset descriptor ('%s')" % p) # generic base en/decoding functions @@ -123,6 +121,12 @@ def base_encode(input, charset, errors="strict", exc=BaseEncodeError): :param exc: exception to be raised in case of error """ i, n, r = input if isinstance(input, integer_types) else s2i(input), len(charset), "" + if n == 1: + if i > SIZE_LIMIT: + raise InputSizeLimitError("Input exceeded size limit") + return i * charset[0] + if n == 10: + return str(i) if charset == digits else "".join(charset[int(x)] for x in str(i)) while i > 0: i, c = divmod(i, n) r = charset[c] + r @@ -138,11 +142,15 @@ def base_decode(input, charset, errors="strict", exc=BaseDecodeError): :param exc: exception to be raised in case of error """ i, n, dec = 0, len(charset), lambda n: base_encode(n, [chr(x) for x in range(256)], errors, exc) + if n == 1: + return i2s(len(input)) + if n == 10: + return i2s(int(input)) if charset == digits else "".join(str(charset.index(c)) for c in input) for k, c in enumerate(input): try: i = i * n + charset.index(c) except ValueError: - handle_error("base", errors, exc, decode=True)(c, k, dec(i)) + handle_error("base", errors, exc, decode=True)(c, k, dec(i), "base%d" % n) return dec(i) @@ -162,15 +170,19 @@ def base(charset, pattern, pow2=False, encode_template=base_encode, decode_templ raise BaseError("Bad charset ; {} is not a power of 2".format(n)) def encode(param="", *args): - a = _get_charset(charset, param) + a = _get_charset(charset, args[0] if len(args) > 0 and args[0] else param) def _encode(input, errors="strict"): + if len(input) == 0: + return "", 0 return encode_template(input, a, errors), len(input) return _encode def decode(param="", *args): - a = _get_charset(charset, param) + a = _get_charset(charset, args[0] if len(args) > 0 and args[0] else 
param) sl, sc = "\n" not in a, "\n" not in a and not "\r" in a def _decode(input, errors="strict"): + if len(input) == 0: + return "", 0 input = _stripl(input, sc, sl) return decode_template(input, a, errors), len(input) return _decode @@ -205,10 +217,14 @@ def _decode(input, errors="strict"): expansion_factor=lambda f, n: (EXPANSION_FACTOR(int(n.split("-")[0][4:])), .05)) -def main(n, ref=None, alt=None, inv=True): +def main(n, ref=None, alt=None, inv=True, swap=True): base = str(n) + ("-" + alt.lstrip("-") if alt else "") src = "The data are encoded as described for the base%(base)s alphabet in %(reference)s.\n" % \ {'base': base, 'reference': "\n" + ref if len(ref) > 20 else ref} if ref else "" + text = "%(source)sWhen decoding, the input may contain newlines in addition to the bytes of the formal base" \ + "%(base)s alphabet. Use --ignore-garbage to attempt to recover from any other non-alphabet bytes in the" \ + " encoded stream." % {'base': base, 'source': src} + text = "\n".join(x for x in wrap(text, 74)) descr = """Usage: base%(base)s [OPTION]... [FILE] Base%(base)s encode or decode FILE, or standard input, to standard output. @@ -217,20 +233,19 @@ def main(n, ref=None, alt=None, inv=True): Mandatory arguments to long options are mandatory for short options too. -d, --decode decode data -i, --ignore-garbage when decoding, ignore non-alphabet characters -%(inv)s -w, --wrap=COLS wrap encoded lines after COLS character (default 76). +%(inv)s%(swap)s -w, --wrap=COLS wrap encoded lines after COLS character (default 76). Use 0 to disable line wrapping --help display this help and exit --version output version information and exit -%(source)sWhen decoding, the input may contain newlines in addition to the bytes of -the formal base%(base)s alphabet. Use --ignore-garbage to attempt to recover -from any other non-alphabet bytes in the encoded stream. 
+%(text)s Report base%(base)s translation bugs to Full documentation at: -""" % {'base': base, 'source': src, - 'inv': ["", " -I, --invert invert charsets from the base alphabet (e.g. lower- and uppercase)\n"][inv]} +""" % {'base': base, 'text': text, + 'inv': ["", " -I, --invert invert charsets from the base alphabet (e.g. digits and letters)\n"][inv], + 'swap': ["", " -s, --swapcase swap the case\n"][swap]} def _main(): p = ArgumentParser(description=descr, formatter_class=RawTextHelpFormatter, add_help=False) @@ -240,6 +255,8 @@ def _main(): p.add_argument("-i", "--ignore-garbage", action="store_true") if inv: p.add_argument("-I", "--invert", action="store_true") + if swap: + p.add_argument("-s", "--swapcase", action="store_true") p.add_argument("-w", "--wrap", type=int, default=76) p.add_argument("--help", action="help") p.add_argument("--version", action="version") @@ -249,14 +266,19 @@ def _main(): args.wrap = 0 args.invert = getattr(args, "invert", False) c, f = _input(args.file), [encode, decode][args.decode] - c = c.rstrip("\r\n") if isinstance(c, str) else c.rstrip(b"\r\n") + if swap and args.decode: + c = codecs.decode(c, "swapcase") + c = b(c).rstrip(b"\r\n") try: c = f(c, "base" + base + ["", "-inv"][getattr(args, "invert", False)], ["strict", "ignore"][args.ignore_garbage]) except Exception as err: print("%sbase%s: invalid input" % (getattr(err, "output", ""), base)) return 1 - for l in (wrap(ensure_str(c), args.wrap) if args.wrap > 0 else [ensure_str(c)]): + c = ensure_str(c) + if swap and not args.decode: + c = codecs.encode(c, "swapcase") + for l in (wrap(c, args.wrap) if args.wrap > 0 else [c]): print(l) return 0 return _main diff --git a/codext/base/_base2n.py b/codext/base/_base2n.py index 3fd24ca..d34072d 100755 --- a/codext/base/_base2n.py +++ b/codext/base/_base2n.py @@ -5,23 +5,16 @@ from math import ceil, log from ..__common__ import * -from ._base import base, _get_charset, BaseError +from ..__common__ import _set_exc +from ._base import 
base, _get_charset _bin = lambda x: bin(x if isinstance(x, int) else ord(x)) # base en/decoding functions for N a power of 2 -class Base2NError(BaseError): - pass - - -class Base2NDecodeError(BaseError): - pass - - -class Base2NEncodeError(BaseError): - pass +_set_exc("Base2NDecodeError") +_set_exc("Base2NEncodeError") def base2n(charset, pattern=None, name=None, **kwargs): @@ -35,13 +28,12 @@ def base2n(charset, pattern=None, name=None, **kwargs): base(charset, pattern, True, base2n_encode, base2n_decode, name, **kwargs) -def base2n_encode(string, charset, errors="strict", exc=Base2NEncodeError): +def base2n_encode(string, charset, errors="strict"): """ 8-bits characters to base-N encoding for N a power of 2. :param string: string to be decoded :param charset: base-N characters set :param errors: errors handling marker - :param exc: exception to be raised in case of error """ bs, r, n = "", "", len(charset) # find the number of bits for the given character set and the quantum @@ -66,13 +58,12 @@ def base2n_encode(string, charset, errors="strict", exc=Base2NEncodeError): return r + int(l / nb_out - len(r)) * "=" -def base2n_decode(string, charset, errors="strict", exc=Base2NDecodeError): +def base2n_decode(string, charset, errors="strict"): """ Base-N to 8-bits characters decoding for N a power of 2. 
:param string: string to be decoded :param charset: base-N characters set :param errors: errors handling marker - :param exc: exception to be raised in case of error """ bs, r, n = "", "", len(charset) # particular case: for hex, ensure the right case in the charset ; not that this way, if mixed cases are used, it @@ -95,7 +86,9 @@ def base2n_decode(string, charset, errors="strict", exc=Base2NDecodeError): bs += ("{:0>%d}" % nb_in).format(_bin(charset.index(c))[2:]) except ValueError: if errors == "strict": - raise exc("'base' codec can't decode character '{}' in position {}".format(c, i)) + e = Base2NDecodeError("'base%d' codec can't decode character '%s' in position %d" % (n, c, i)) + e.__cause__ = e # block exceptions chaining + raise e elif errors == "replace": bs += "0" * nb_in elif errors == "ignore": diff --git a/codext/base/ascii85.py b/codext/base/ascii85.py deleted file mode 100755 index 9cd4d77..0000000 --- a/codext/base/ascii85.py +++ /dev/null @@ -1,28 +0,0 @@ -# -*- coding: UTF-8 -*- -"""ASCII85 Codec - ascii85 content encoding. - -This is a simple wrapper for adding base64.a85**code to the codecs. 
- -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -import base64 - -from ..__common__ import * - - -__examples__ = {'enc(ascii85|ascii-85|ascii_85)': {'this is a test': "FD,B0+DGm>@3BZ'F*%"}} - - -if PY3: - def ascii85_encode(input, errors='strict'): - return base64.a85encode(b(input)), len(input) - - def ascii85_decode(input, errors='strict'): - return base64.a85decode(b(input)), len(input) - - add("ascii85", ascii85_encode, ascii85_decode, r"^ascii[-_]?85$", entropy=6.36, expansion_factor=1.25) - diff --git a/codext/base/base100.py b/codext/base/base100.py index 7d4b993..2a6e596 100755 --- a/codext/base/base100.py +++ b/codext/base/base100.py @@ -16,17 +16,17 @@ # no __examples__ ; handled manually in tests/test_base.py -def base100_encode(input, errors='strict'): +def base100_encode(input, errors="strict"): raise NotImplementedError -def base100_decode(input, errors='strict'): +def base100_decode(input, errors="strict"): raise NotImplementedError if PY3: class Base100DecodeError(ValueError): - pass + __module__ = "builtins" def base100_encode(input, errors="strict"): input = b(input) diff --git a/codext/base/base45.py b/codext/base/base45.py index e3d2fea..37e0157 100755 --- a/codext/base/base45.py +++ b/codext/base/base45.py @@ -7,30 +7,31 @@ - decodes file content to str (read) - encodes file content from str to bytes (write) """ -from ._base import digits, lower, main, upper +from ._base import _get_charset, digits, lower, main, upper from ..__common__ import * __examples__ = { 'enc(base45|base-45|base_45)': {'this is a test!': "AWE+EDH44.OEOCC7WE QEX0"}, + 'enc(base45-inv|base_45_inv)': {'this is a test!': "K6O+ONREE.YOYMMH6O 0O7A"}, 'dec(base45)': {'BAD STRING\00': None, 'AWE+EDH44.OEOCC7WE QEX000': None}, } __guess__ = ["base45", "base45-inv"] B45 = { - '': digits + upper + " $%*+-./:", - 'inv': upper + digits + " 
$%*+-./:", + '': digits + upper + " $%*+-./:", + '[-_]inv(?:erted)?$': upper + digits + " $%*+-./:", } -__chr = lambda c: chr(c) if isinstance(c, int) else c +__chr = lambda c: chr(c >> 8) + chr(c & 0xff) if isinstance(c, int) and 256 <= c <= 65535 else \ + chr(c) if isinstance(c, int) else c __ord = lambda c: ord(c) if not isinstance(c, int) else c def base45_encode(mode): - mode = mode.replace("inverted", "inv").replace("_", "-").lstrip("-") - b45 = B45[['inv', ''][mode == ""]] + b45 = _get_charset(B45, mode) def encode(text, errors="strict"): t, s = b(text), "" for i in range(0, len(text), 2): @@ -50,9 +51,9 @@ def encode(text, errors="strict"): def base45_decode(mode): mode = mode.replace("inverted", "inv").replace("_", "-").lstrip("-") - b45 = {c: i for i, c in enumerate(B45[['inv', ''][mode == ""]])} + b45 = {c: i for i, c in enumerate(_get_charset(B45, mode))} def decode(text, errors="strict"): - t, s, err = b(text), "", "'base45' codec can't decode character '%s' in position %d" + t, s = b(text), "" ehandler = handle_error("base45", errors, decode=True) for i in range(0, len(text), 3): try: diff --git a/codext/base/base85.py b/codext/base/base85.py index afdf575..41fed20 100755 --- a/codext/base/base85.py +++ b/codext/base/base85.py @@ -10,41 +10,177 @@ - encodes file content from str to bytes (write) """ import base64 +from six import integer_types -from ._base import main +from ._base import _get_charset, digits, lower, main, upper from ..__common__ import * -__examples__ = {'enc(base85|base-85|base_85)': {'this is a test': "bZBXFAZc?TVIXv6b94"}} if PY3 else \ - {'enc(base85': None} - - -#FIXME: implement Z85 (ZeroMQ) in base85.py ; cfr spec https://rfc.zeromq.org/spec/32/ -#FIXME: implement base85-rfc1924 in base85.py -#B85 = { -# r'': "!\"#$%&'()*+,-./" + digits + ":;<=>?@" + upper + "[\\]^_`" + lower[:21], -# r'[-_]z(eromq)?$': digits + upper + lower + ".-:+=^!/*?&<>()[]{}@%$#", -# r'[-_]rfc1924$': digits + upper + lower + "!#$%&()*+-;<=>?@^_`{|}~", 
-#} -#base(B85, r"^base[-_]?85(|[-_](?:z(?:eromq)?|rfc1924))$") - - -def base85_encode(input, errors='strict'): - raise NotImplementedError - - -def base85_decode(input, errors='strict'): - raise NotImplementedError - - -if PY3: - def base85_encode(input, errors='strict'): - return base64.b85encode(b(input)), len(input) - - def base85_decode(input, errors='strict'): - return base64.b85decode(b(input)), len(input) - - -add("base85", base85_encode, base85_decode, r"^base[-_]?85$", entropy=7.05, expansion_factor=1.25) -main = main(85, "RFC 1924") +__examples__ = { + 'enc-dec(base85|z85|base85-ipv6)': ["@random{512,1024,2048}"], + 'enc-dec(base85-btoa|base85-xbtoa)': ["@random{512,1024,2048}"], + 'enc(base85|ascii85)': {'this is a test': "FD,B0+DGm>@3BZ'F*%"}, + 'enc(base85-adobe)': {'this is a test': "<~FD,B0+DGm>@3BZ'F*%~>", + 'this is a test\0\0\0\0\0\0': "<~FD,B0+DGm>@3BZ'F*%B^z~>"}, + 'enc(z85|base85-z)': {'this is a test': "BzbxfazC)tvixV6B94"}, + 'enc(base85-ipv6|base85_rfc1924)': {'this is a test': "bZBXFAZc?TVIXv6b94"}, + 'enc(base85_btoa)': {'this is a test': "FD,B0+DGm>@3BZ'F*%B^"}, + 'enc(base85_btoa)': {'this\0\0\0\0test': "FD,B0+DGm>@3BZ'F*%B^"}, + 'enc(base85_btoa)': {'this is a test\0\0\0\0': "FD,B0+DGm>y@3BZ'F*%B^z"}, + 'enc(base85-xbtoa)': {'this is a test': "xbtoa Begin\nFD,B0+DGm>@3BZ'F*%B^\nxbtoa End N 14 e E 4b" \ + " S 523 R 1b132e"}, + 'dec(base85-xbtoa)': {'xbtoa Begin\nFD,B0+DGm>@3BZ\'F*%B^\nxbtoa End': None, + 'xbtoa Begin\nFD,B0+DGm>@3BZ\'F*%B^\nxbtoa End N 14 e E 4b S 523 R 000bad': + None}, + 'enc(base85-xml)': {'this is a test': "bZBXFAZc@TVIXv6b94"}, + 'enc(base85|ascii85)': {'this\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0test': "FD,B0zzz!!!\"@ATMq"}, +} +__guess__ = ["ascii85", "z85", "base85-ipv6", "base85-xml", "base85-adobe", "base85-xbtoa"] + + +B85 = { + r'(base[-_]?85([-_]ascii)?|ascii85)$': "!\"#$%&'()*+,-./" + digits + ":;<=>?@" + upper + "[\\]^_`" + lower[:21], + r'(z85|base[-_]?85[-_]z(eromq)?)$': digits + lower + upper + 
".-:+=^!/*?&<>()[]{}@%$#", + r'base[-_]?85[-_](rfc1924|ipv6)$': digits + upper + lower + "!#$%&()*+-;<=>?@^_`{|}~", + r'base[-_]?85[-_]xml$': digits + upper + lower[:-1] + "!#$()*+,-./:;=?@^`{|}~z_", +} +B85[r'(base[-_]?85[-_]adobe)$'] = B85[r'(base[-_]?85[-_]x?btoa)$'] = B85[r'(base[-_]?85([-_]ascii)?|ascii85)$'] +POW85 = [85 ** i for i in range(5)] + + +def __format(text, mode, decode=False, **kwargs): + if "adobe" in mode: + if decode: + if text.startswith("<~") and text.endswith("~>"): + text = text[2:-2] + else: + text = "<~" + text + "~>" + elif "xbtoa" in mode: + sp, ep = "xbtoa [bB]egin\n", "xbtoa [eE]nd" + if decode: + if re.match(r"^xbtoa\s+[bB]egin\n", text) and \ + re.search(r"\nxbtoa\s+[eE]nd N \d+{h} E{h} S{h} R{h}\s*$".format(h=" [0-9a-fA-F]+"), text): + text = "".join(text.split("\n")[1:-1]).replace(" ", "") + elif not decode: + l, t = kwargs['length'], "\n".join(text[i:i+78] for i in range(0, len(text), 78)) + text = "xbtoa Begin\n%s\nxbtoa End N %d %x E %x S %x R %x" % \ + (t, l, l, kwargs['c_xor'], kwargs['c_sum'], kwargs['c_rot']) + return text + + +def __xbtoa_values(text): + try: + hr = "[0-9a-fA-F]+" + return re.search(r"\nxbtoa\s+[eE]nd N (\d+) ({h}) E ({h}) S ({h}) R ({h})\s*$".format(h=hr), text).groups() + except: + raise Base85DecodeError("Bad or missing xbtoa parameters") + + +def base85_encode(mode): + b85 = _get_charset(B85, mode) + def encode(input, errors="strict"): + r, l, kw = "", len(input), {} + if l == 0: + return input, 0 + if "xbtoa" in mode: + kw['length'] = l + kw['c_xor'], kw['c_sum'], kw['c_rot'] = 0, 0, 0 + n_pad = (4 - l % 4) % 4 + for i in range(0, l, 4): + block = input[i:i+4] + if block == "\0\0\0\0" and b85[-3:] == "stu": + r += "z" + if block == "\x20\x20\x20\x20" and "btoa" in mode: + r += "y" + if "xbtoa" in mode: + for c in block: + k = ord(c) + kw['c_xor'] ^= k + kw['c_sum'] += k + 1 + kw['c_rot'] <<= 1 + if kw['c_rot'] & 0x80000000: + kw['c_rot'] += 1 + kw['c_rot'] += k + if block == "\0\0\0\0" and b85[-3:] == 
"stu" or block == "\x20\x20\x20\x20" and "btoa" in mode: + continue + if len(block) < 4: + block += n_pad * "\0" + n, bl = s2i(block), "" + for _ in range(5): + n, k = divmod(n, 85) + bl = b85[k] + bl + r += bl + if "btoa" not in mode and n_pad: + r = r[:-n_pad] + if b85[-3:] == "stu" and r[-5:] == "!!!!!": + r = r[:-5] + "z" + return __format(r, mode, **kw), l + return encode + + +def base85_decode(mode): + b85 = _get_charset(B85, mode) + def decode(input, errors="strict"): + r, l, i, n_pad = "", len(input), 0, 0 + if l == 0: + return input, 0 + if "xbtoa" in mode: + v = __xbtoa_values(input) + n_last = int(v[0]) % 4 + c_xor, c_sum, c_rot = 0, 0, 0 + input = __format(input, mode, True) + ehandler = handle_error("base85", errors, decode=True) + if b85[-3:] == "stu" and input[-1] == "z": + input = input[:-1] + "!!!!!" + l = len(input) + while i < l: + n, incr = 0, 5 + if input[i] == "z" and b85[-3:] == "stu": + bl, incr = "\0\0\0\0", 1 + elif input[i] == "y" and "btoa" in mode: + bl, incr = "\x20\x20\x20\x20", 1 + else: + block = input[i:i+5] + if len(block) < 5: + n_pad = 5 - len(block) % 5 + block += n_pad * "\0" + for k, c in enumerate(block[::-1]): + try: + n += (b85.index(c) if c != "\0" else 255) * POW85[k] + except ValueError: + r += ehandler(c, i + k, r) + bl = codecs.decode("{:0>8}".format(hex(n & 0xffffffff)[2:]), "hex") + if "xbtoa" in mode: + if i + 5 == l and n_last > 0: + bl = bl[:n_last] + for c in bl: + k = ord(c) + c_xor ^= k + c_sum += k + 1 + c_rot <<= 1 + if c_rot & 0x80000000: + c_rot += 1 + c_rot += k + r += bl + i += incr + if n_pad > 0: + r = r[:-n_pad] + if "xbtoa" in mode: + chkv = ["%d" % len(r), "%x" % len(r), "%x" % c_xor, "%x" % c_sum, "%x" % c_rot] + if any(v1 != v2 for v1, v2 in zip(v, chkv)) and errors == "strict": + raise Base85ValueError("A check value does not match (%s != %s)" % (str(list(v)).replace("'", ""), + str(chkv).replace("'", ""))) + return r, l + return decode + + +add("base85", base85_encode, base85_decode, 
expansion_factor=lambda f, ename: f if "xbtoa" in ename else 1.25, + pattern=r"^(base[-_]?85(?:|[-_](?:adobe|x?btoa|ipv6|rfc1924|xml|z(?:eromq)?))|z85|ascii85)$", + extra_exceptions=["Base85ValueError"]) +main85 = main(85, None, "ascii") +main85adobe = main(85, None, "adobe") +main85btoa = main(85, None, "btoa") +main85rfc1924 = main(85, "RFC 1924", "ipv6") +main85xml = main(85, "", "xml") +main85zeromq = main(85, "", "zeromq") diff --git a/codext/base/base91.py b/codext/base/base91.py index d7ca416..de6373d 100755 --- a/codext/base/base91.py +++ b/codext/base/base91.py @@ -7,7 +7,7 @@ - decodes file content to str (read) - encodes file content from str to bytes (write) """ -from ._base import digits, lower, main, upper +from ._base import _get_charset, digits, lower, main, upper from ..__common__ import * # no __examples__ ; handled manually in tests/test_base.py @@ -15,10 +15,10 @@ B91 = { - '': upper + lower + digits + "!#$%&()*+,./:;<=>?@[]^_`{|}~\"", - 'inv': lower + upper + digits + "!#$%&()*+,./:;<=>?@[]^_`{|}~\"", - 'alt': "!#$%&'()*+,-./" + digits + ":;<=>?@" + upper + "[\\]^_" + lower + "{|}", - 'alt-inv': "!#$%&'()*+,-./" + digits + ":;<=>?@" + lower + "[\\]^_" + upper + "{|}", + r'': upper + lower + digits + "!#$%&()*+,./:;<=>?@[]^_`{|}~\"", + r'[-_]inv(erted)?$': digits + upper + lower + "!#$%&()*+,./:;<=>?@[]^_`{|}~\"", + r'[-_]alt(ernate)?$': "!#$%&'()*+,-./" + digits + ":;<=>?@" + upper + "[\\]^_" + lower + "{|}", + r'[-_]alt(ernate)?[-_]inv(erted)?$': "!#$%&'()*+,-./" + upper + ":;<=>?@" + lower + "[\\]^_" + digits + "{|}", } @@ -27,11 +27,10 @@ def base91_encode(mode): - mode = mode.replace("alternate", "alt").replace("inverted", "inv").replace("_", "-").lstrip("-") - b91 = B91[mode if mode in B91.keys() else ""] + b91 = _get_charset(B91, mode) def encode(text, errors="strict"): t, s, bits = b(text), "", "" - if mode.startswith("alt"): + if re.search(r'[-_]alt(ernate)?$', mode): while len(bits) < 13 and t: bits += "{:08b}".format(__ord(t[0])) t = 
t[1:] @@ -71,10 +70,9 @@ def encode(text, errors="strict"): def base91_decode(mode): - mode = mode.replace("alternate", "alt").replace("inverted", "inv").replace("_", "-").lstrip("-") - b91 = {c: i for i, c in enumerate(B91[mode if mode in B91.keys() else ""])} + b91 = {c: i for i, c in enumerate(_get_charset(B91, mode))} def decode(text, errors="strict"): - t, s, bits, alt = b(text), "", "", mode.startswith("alt") + t, s, bits, alt = b(text), "", "", re.search(r'[-_]alt(ernate)?$', mode) is not None ehandler = handle_error("base91", errors, decode=True) for i in range(0, len(t), 2): try: diff --git a/codext/base/baseN.py b/codext/base/baseN.py index 02681fe..de9a44c 100755 --- a/codext/base/baseN.py +++ b/codext/base/baseN.py @@ -12,63 +12,74 @@ from ._base2n import base2n -B2 = {r'': "01", r'[-_]inv(erted)?': "10"} -base2n(B2, r"^(?:base[-_]?2|bin)(|[-_]inv(?:erted)?|[-_](?!.*(.).*\2)[a-zA-Z0-9]{2})$", expansion_factor=8.) +B1 = {chr(i): chr(i) for i in range(2**8)} +B1[''] = "A" +base(B1, r"^(?:base[-_]?1(|[-_].)|unary)$") +main1 = main(1) + + +B2 = {r'': "01", r'[-_]inv(erted)?$': "10"} +base2n(B2, r"^(?:base[-_]?2|bin(?:ary)?)(|[-_]inv(?:erted)?|[-_](?!.*(.).*\2)[a-zA-Z0-9]{2})$", expansion_factor=8.) main2 = main(2) -B3 = {r'': "123", r'[-_]inv(erted)?': "321"} +B3 = {r'': "123", r'[-_]inv(erted)?$': "321"} base(B3, r"^base[-_]?3(|[-_]inv(?:erted)?|[-_](?!.*(.).*\2)[a-zA-Z0-9]{3})$", expansion_factor=5.) main3 = main(3) -B4 = {r'': "1234", r'[-_]inv(erted)?': "4321"} +B4 = {r'': "1234", r'[-_]inv(erted)?$': "4321"} base2n(B4, r"^base[-_]?4(|[-_]inv(?:erted)?|[-_](?!.*(.).*\2)[a-zA-Z0-9]{4})$", expansion_factor=4.) 
main4 = main(4) -B8 = {r'': "abcdefgh", r'[-_]inv(erted)?': "hgfedcba"} +B8 = {r'': "abcdefgh", r'[-_]inv(erted)?$': "hgfedcba"} base2n(B8, r"^base[-_]?8(|[-_]inv(?:erted)?|[-_](?!.*(.).*\2)[a-zA-Z0-9]{8})$") main8 = main(8) -B16 = {'': digits + "ABCDEF", 'inv': "ABCDEF" + digits} +B10 = {r'': "0123456789"} +base(B10, r"^(?:base[-_]?10|int(?:eger)?)$") +main10 = main(10) + + +B16 = {'': digits + "ABCDEF", '[-_]inv(erted)?$': "ABCDEF" + digits} base2n(B16, r"^(?:base[-_]?16|hex)(|[-_]inv(?:erted)?)$", expansion_factor=2.) main16 = main(16, "RFC 4648") -B26 = {'': upper, 'inv': lower} -base(B26, r"^base[-_]?26(|[-_]inv(?:erted)?)$") -main26 = main(26) +B26 = {'': upper} +base(B26, r"^base[-_]?26$") +main26 = main(26, inv=False) B32 = { - r'': upper + "234567", - r'[-_]inv(erted)?$': "234567" + upper, - r'(?:[-_]ext(?:ended)?)?[-_]hex$': digits + upper[:22], - r'[-_]geohash': digits + "bcdefghjkmnpqrstuvwxyz", + r'': upper + "234567", + r'[-_]?z(?:base32)$': "ybndrfg8ejkmcpqxot1uwisza345h769", + r'[-_]inv(erted)?$': "234567" + upper, + r'(?:[-_](ext(ended)?)?)?[-_]hex$': digits + upper[:22], + r'[-_]?crockford': digits + "ABCDEFGHJKMNPQRSTVWXYZ", + r'[-_]?geohash': digits + "bcdefghjkmnpqrstuvwxyz", } -base2n(B32, r"^base[-_]?32(|[-_]inv(?:erted)?|(?:[-_]ext(?:ended)?)?[-_]hex|[-_]geohash)$", padding_char="=", - guess=["base32", "base32-inv", "base32-hex", "base32-geohash"]) +base2n(B32, r"^(?:base[-_]?32(|[-_]inv(?:erted)?|(?:[-_]ext(?:ended)?)?[-_]hex|[-_](?:z|geohash|crockford))|" + r"(zbase32|geohash|crockford))$", padding_char="=", + guess=["base32", "base32-inv", "base32-hex", "base32-geohash", "base32-crockford"]) main32 = main(32, "RFC 4648") main32hex = main(32, "RFC 4648", "hex", False) -main32geo = main(32, "RFC 4648", "geohash", False) - - -ZB32 = {'': "ybndrfg8ejkmcpqxot1uwisza345h769"} -base2n(ZB32, r"^z[-_]?base[-_]?32$", name="zbase32", padding_char="=") +main32geo = main(32, "", "geohash", False) +main32geo = main(32, "", "crockford", False) mainz32 = 
main(32, "", "z", False) -B36 = {'': digits + upper, 'inv': upper + digits} +B36 = {'': digits + upper, '[-_]inv(erted)?$': upper + digits} base(B36, r"^base[-_]?36(|[-_]inv(?:erted)?)$") main36 = main(36, "") B58 = { - r'(|[-_](bc|bitcoin))$': "123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz", - r'[-_](rp|ripple)$': "rpshnaf39wBUDNEGHJKLM4PQRST7VWXYZ2bcdeCg65jkm8oFqi1tuvAxyz", - r'[-_](fl|flickr|short[-]?url|url)$': "123456789abcdefghijkmnopqrstuvwxyzABCDEFGHJKLMNPQRSTUVWXYZ", + r'(|[-_]?(bc|bitcoin))$': "123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz", + r'[-_]?(rp|ripple)$': "rpshnaf39wBUDNEGHJKLM4PQRST7VWXYZ2bcdeCg65jkm8oFqi1tuvAxyz", + r'[-_]?(fl|flickr|short[-]?url|url)$': "123456789abcdefghijkmnopqrstuvwxyzABCDEFGHJKLMNPQRSTUVWXYZ", } base(B58, r"^base[-_]?58(|[-_](bc|bitcoin|rp|ripple|fl|flickr|short[-]?url|url))$", guess=["base58-bitcoin", "base58-ripple", "base58-flickr"]) @@ -77,25 +88,25 @@ main58fl = main(58, "", "flickr") -B62 = {'': digits + upper + lower, 'inv': digits + lower + upper} +B62 = {'': digits + upper + lower, '[-_]inv(erted)?$': upper + lower + digits} base(B62, r"^base[-_]?62(|[-_]inv(?:erted)?)$") main62 = main(62, "") -B63 = {'': upper + lower + digits + "$", 'inv': lower + upper + digits + "$"} +B63 = {'': digits + upper + lower + "_", 'inv': upper + lower + digits + "_"} base(B63, r"^base[-_]?63(|[-_]inv(?:erted)?)$") main63 = main(63) B64 = { r'': upper + lower + digits + "+/", - r'[-_]inv(erted)?$': lower + upper + digits + "+/", + r'[-_]inv(erted)?$': digits + upper + lower + "+/", r'[-_]?(file|url)(safe)?$': upper + lower + digits + "-_", } base2n(B64, r"^base[-_]?64(|[-_]inv(?:erted)?|[-_]?(?:file|url)(?:safe)?)$", padding_char="=", guess=["base64", "base64-inv", "base64-url"]) main64 = main(64, "RFC 4648") -main64url = main(64, "RFC 4648 / Base64URL", "url", False) +main64url = main(64, "RFC 4648 / Base64URL", "url") B67 = { diff --git a/docs/enc/base.md b/docs/enc/base.md index c4700d1..73b78ff 100644 
--- a/docs/enc/base.md +++ b/docs/enc/base.md @@ -125,13 +125,17 @@ Note that for `base64`, it overwrites the native `base64_codec` to also support ----- -### Ascii85 +### Base85 -This encoding relies on the `base64` library and is only supported in Python 3. +This encoding implements various different versions of Base85. **Codec** | **Conversions** | **Aliases** | **Comment** :---: | :---: | --- | --- -`ascii85` | text <-> ascii85 | none | Python 3 only +`base85` | text <-> ascii85 | `ascii85` | +`base85` | text <-> z85 | `z85`, `base85-zeromq` | +`base85` | text <-> base85-ipv6 | `base85-ipv6`, `base85-rfc1924` | +`base85` | text <-> base85-adobe | `base85-adobe` | +`base85` | text <-> base85-btoa | `base85-btoa`, `base85-xbtoa` | ```python >>> codext.encode("this is a test", "ascii85") diff --git a/tests/test_base.py b/tests/test_base.py index c9dfb9c..f92e38f 100644 --- a/tests/test_base.py +++ b/tests/test_base.py @@ -37,6 +37,13 @@ def test_new_base_codec(self): self.assertIsNone(base({'': "01234"}, r"^base5(test)?$")) self.assertIsNotNone(codecs.encode(STR, "base5test")) self.assertRaises(ValueError, base, {'': "01234"}, "base5-test", pow2=True) + self.assertEqual("", codecs.decode("", "base5test")) + + def test_codec_base1(self): + C = "A" + for i in range(3): + self.assertIsNotNone(codecs.encode(i * C, "base1")) + self.assertRaises(ValueError, codecs.encode, 4 * C, "unary") def test_codec_base2(self): STR = "test" @@ -127,18 +134,15 @@ def test_codec_base16(self): self.assertRaises(ValueError, codecs.decode, B16_3, "hex") def test_codec_base32(self): - B32 = "ORUGS4ZANFZSAYJAORSXG5A=" - self.assertEqual(codecs.encode(STR, "base32"), B32) - self.assertEqual(codecs.encode(b(STR), "base32"), b(B32)) - self.assertEqual(codecs.decode(B32, "base32"), STR) - self.assertEqual(codecs.decode(b(B32), "base32"), b(STR)) - B32 = "qtwg1h3ypf31yajyqt1zg7y=" - self.assertEqual(codecs.encode(STR, "zbase32"), B32) - self.assertEqual(codecs.encode(b(STR), "z-base-32"), 
b(B32)) - self.assertEqual(codecs.decode(B32, "z_base_32"), STR) - self.assertEqual(codecs.decode(b(B32), "zbase32"), b(STR)) - self.assertRaises(ValueError, codecs.decode, B32.rstrip("="), "zbase32") - self.assertRaises(ValueError, codecs.decode, B32.rstrip("="), "zbase32", "BAD") + for b32, enc in zip(["ORUGS4ZANFZSAYJAORSXG5A=", "qtwg1h3ypf31yajyqt1zg7y=", "EHK6ISP0D5PI0O90EHIN6T0=", + "fjn6kwt0e5tk0s90fjkr6x0=", "EHM6JWS0D5SJ0R90EHJQ6X0="], + ["base32", "zbase32", "base32-hex", "geohash", "crockford"]): + self.assertEqual(codecs.encode(STR, enc), b32) + self.assertEqual(codecs.encode(b(STR), enc), b(b32)) + self.assertEqual(codecs.decode(b32, enc), STR) + self.assertEqual(codecs.decode(b(b32), enc), b(STR)) + self.assertRaises(ValueError, codecs.decode, b32.rstrip("="), enc) + self.assertRaises(ValueError, codecs.decode, b32.rstrip("="), enc, "BAD") def test_codec_base36(self): B36 = "4WMHTK6UZL044O91NKCEB8" @@ -170,58 +174,32 @@ def test_codec_base58(self): self.assertEqual(codecs.encode(STR, "base58-url"), B58) def test_codec_base62(self): - B62 = "CsoB4HQ5gmgMyCenF7E" - self.assertEqual(codecs.encode(STR, "base62"), B62) - self.assertEqual(codecs.encode(b(STR), "base62"), b(B62)) - self.assertEqual(codecs.decode(B62, "base62"), STR) - self.assertEqual(codecs.decode(b(B62), "base62"), b(STR)) - B62 = "cSOb4hq5GMGmYcENf7e" - self.assertEqual(codecs.encode(STR, "base62-inv"), B62) - self.assertEqual(codecs.decode(B62, "base62-inv"), STR) + for b62, enc in zip(["CsoB4HQ5gmgMyCenF7E", "M2yLERaFqwqW8MoxPHO"], ["base62", "base62-inv"]): + self.assertEqual(codecs.encode(STR, enc), b62) + self.assertEqual(codecs.encode(b(STR), enc), b(b62)) + self.assertEqual(codecs.decode(b62, enc), STR) + self.assertEqual(codecs.decode(b(b62), enc), b(STR)) def test_codec_base64(self): - B64 = "dGhpcyBpcyBhIHRlc3Q=" - self.assertEqual(codecs.encode(STR, "base64"), B64) - self.assertEqual(codecs.encode(b(STR), "base64"), b(B64)) - self.assertEqual(codecs.decode(B64, "base64"), STR) 
- self.assertEqual(codecs.decode(b(B64), "base64"), b(STR)) - B64 = "DgHPCYbPCYbHihrLC3q=" - self.assertEqual(codecs.encode(STR, "base64-inv"), B64) - self.assertEqual(codecs.decode(B64, "base64-inv"), STR) - - def test_codec_base85(self): - if PY3: - B85 = "bZBXFAZc?TVIXv6b94" - self.assertEqual(codecs.encode(STR, "base85"), B85) - self.assertEqual(codecs.encode(b(STR), "base85"), b(B85)) - self.assertEqual(codecs.decode(B85, "base85"), STR) - self.assertEqual(codecs.decode(b(B85), "base85"), b(STR)) + for b64, enc in zip(["dGhpcyBpcyBhIHRlc3Q=", "T6XfSo1fSo1X87HbStG="], ["base64", "base64-inv"]): + self.assertEqual(codecs.encode(STR, enc), b64) + self.assertEqual(codecs.encode(b(STR), enc), b(b64)) + self.assertEqual(codecs.decode(b64, enc), STR) + self.assertEqual(codecs.decode(b(b64), enc), b(STR)) def test_codec_base91(self): - B91 = ",X,<:WRT%yxth90oZB" - self.assertEqual(codecs.encode(STR, "base91"), B91) - self.assertEqual(codecs.encode(b(STR), "base91"), b(B91)) - self.assertEqual(codecs.decode(B91, "base91"), STR) - self.assertEqual(codecs.decode(b(B91), "base91"), b(STR)) - B91 = ",x,<:wrt%YXTH90Ozb" - self.assertEqual(codecs.encode(STR, "base91-inv"), B91) - self.assertEqual(codecs.decode(B91, "base91-inv"), STR) + for b91, enc in zip([",X,<:WRT%yxth90oZB", ",N,<:MHJ%onjXzqeP1", "Jx&[jv4S3Wg>,71@Jk", "yJy^\\IDFsdc?Tof:L#"], + ["base91", "base91-inv", "base91-alt", "base91-alt-inv"]): + self.assertEqual(codecs.encode(STR, enc), b91) + self.assertEqual(codecs.encode(b(STR), enc), b(b91)) + self.assertEqual(codecs.decode(b91, enc), STR) + self.assertEqual(codecs.decode(b(b91), enc), b(STR)) self.assertIsNotNone(codecs.encode("\x00\x00", "base91")) self.assertIsNotNone(codecs.decode("abc", "base91")) self.assertIsNotNone(codecs.decode("AD", "base91")) self.assertRaises(ValueError, codecs.decode, "\xff", "base91") self.assertRaises(ValueError, codecs.decode, "a\xff", "base91") - B91A = "Jx&[jv4S3Wg>,71@Jk" - self.assertEqual(codecs.encode(STR, "base91-alt"), 
B91A) - self.assertEqual(codecs.encode(b(STR), "base91-alt"), b(B91A)) - self.assertEqual(codecs.decode(B91A, "base91_alt"), STR) - self.assertEqual(codecs.decode(b(B91A), "base91_alt"), b(STR)) - B91A = "jX&[JV4s3wG>,71@jK" - self.assertEqual(codecs.encode(STR, "base91-alt-inv"), B91A) - self.assertEqual(codecs.decode(B91A, "base91_alt_inv"), STR) self.assertIsNotNone(codecs.encode("\x00\x00", "base91-alt")) - self.assertIsNotNone(codecs.decode("abc", "base91-alt")) - self.assertIsNotNone(codecs.decode("AD", "base91_alt")) def test_codec_base100(self): if PY3: From b061d74af4670f998d3c65cc1f264fc63fef61c2 Mon Sep 17 00:00:00 2001 From: dhondta Date: Sat, 5 Feb 2022 10:38:50 +0100 Subject: [PATCH 26/97] New release --- codext/VERSION.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/codext/VERSION.txt b/codext/VERSION.txt index 6b37cb7..0eed1a2 100644 --- a/codext/VERSION.txt +++ b/codext/VERSION.txt @@ -1 +1 @@ -1.11.6 +1.12.0 From 4765b3ea33a8576916400922a936f34b6e4fe412 Mon Sep 17 00:00:00 2001 From: dhondta Date: Sat, 5 Feb 2022 14:52:29 +0100 Subject: [PATCH 27/97] Fixed minor issues + New release --- codext/VERSION.txt | 2 +- codext/base/_base.py | 22 ++++++++++++---------- codext/base/base100.py | 2 +- codext/base/base122.py | 2 +- codext/base/base85.py | 4 ++-- codext/base/base91.py | 2 +- codext/base/baseN.py | 6 +++--- setup.cfg | 16 ++++++++++++---- tests/test_base.py | 2 +- 9 files changed, 34 insertions(+), 24 deletions(-) diff --git a/codext/VERSION.txt b/codext/VERSION.txt index 0eed1a2..f8f4f03 100644 --- a/codext/VERSION.txt +++ b/codext/VERSION.txt @@ -1 +1 @@ -1.12.0 +1.12.1 diff --git a/codext/base/_base.py b/codext/base/_base.py index b5abf10..05aaed0 100755 --- a/codext/base/_base.py +++ b/codext/base/_base.py @@ -6,7 +6,7 @@ from math import log from six import integer_types, string_types from string import ascii_lowercase as lower, ascii_uppercase as upper, digits, printable -from textwrap import wrap +from textwrap 
import wrap as wraptext from types import FunctionType, MethodType from ..__common__ import * @@ -217,14 +217,14 @@ def _decode(input, errors="strict"): expansion_factor=lambda f, n: (EXPANSION_FACTOR(int(n.split("-")[0][4:])), .05)) -def main(n, ref=None, alt=None, inv=True, swap=True): +def main(n, ref=None, alt=None, inv=True, swap=True, wrap=True): base = str(n) + ("-" + alt.lstrip("-") if alt else "") src = "The data are encoded as described for the base%(base)s alphabet in %(reference)s.\n" % \ {'base': base, 'reference': "\n" + ref if len(ref) > 20 else ref} if ref else "" text = "%(source)sWhen decoding, the input may contain newlines in addition to the bytes of the formal base" \ "%(base)s alphabet. Use --ignore-garbage to attempt to recover from any other non-alphabet bytes in the" \ " encoded stream." % {'base': base, 'source': src} - text = "\n".join(x for x in wrap(text, 74)) + text = "\n".join(x for x in wraptext(text, 74)) descr = """Usage: base%(base)s [OPTION]... [FILE] Base%(base)s encode or decode FILE, or standard input, to standard output. @@ -233,8 +233,7 @@ def main(n, ref=None, alt=None, inv=True, swap=True): Mandatory arguments to long options are mandatory for short options too. -d, --decode decode data -i, --ignore-garbage when decoding, ignore non-alphabet characters -%(inv)s%(swap)s -w, --wrap=COLS wrap encoded lines after COLS character (default 76). - Use 0 to disable line wrapping +%(inv)s%(swap)s%(wrap)s --help display this help and exit --version output version information and exit @@ -245,7 +244,9 @@ def main(n, ref=None, alt=None, inv=True, swap=True): Full documentation at: """ % {'base': base, 'text': text, 'inv': ["", " -I, --invert invert charsets from the base alphabet (e.g. 
digits and letters)\n"][inv], - 'swap': ["", " -s, --swapcase swap the case\n"][swap]} + 'swap': ["", " -s, --swapcase swap the case\n"][swap], + 'wrap': ["", " -w, --wrap=COLS wrap encoded lines after COLS character (default 76).\n"+ 26 * " " + \ + "Use 0 to disable line wrapping"][wrap]} def _main(): p = ArgumentParser(description=descr, formatter_class=RawTextHelpFormatter, add_help=False) @@ -257,7 +258,8 @@ def _main(): p.add_argument("-I", "--invert", action="store_true") if swap: p.add_argument("-s", "--swapcase", action="store_true") - p.add_argument("-w", "--wrap", type=int, default=76) + if wrap: + p.add_argument("-w", "--wrap", type=int, default=76) p.add_argument("--help", action="help") p.add_argument("--version", action="version") p.version = "CodExt " + __version__ @@ -266,7 +268,7 @@ def _main(): args.wrap = 0 args.invert = getattr(args, "invert", False) c, f = _input(args.file), [encode, decode][args.decode] - if swap and args.decode: + if swap and args.swapcase and args.decode: c = codecs.decode(c, "swapcase") c = b(c).rstrip(b"\r\n") try: @@ -276,9 +278,9 @@ def _main(): print("%sbase%s: invalid input" % (getattr(err, "output", ""), base)) return 1 c = ensure_str(c) - if swap and not args.decode: + if swap and args.swapcase and not args.decode: c = codecs.encode(c, "swapcase") - for l in (wrap(c, args.wrap) if args.wrap > 0 else [c]): + for l in (wraptext(c, args.wrap) if args.wrap > 0 else [c]) if wrap else c.split("\n"): print(l) return 0 return _main diff --git a/codext/base/base100.py b/codext/base/base100.py index 2a6e596..db0b3c9 100755 --- a/codext/base/base100.py +++ b/codext/base/base100.py @@ -52,5 +52,5 @@ def base100_decode(input, errors="strict"): add("base100", base100_encode, base100_decode, r"^(?:base[-_]?100|emoji)$", expansion_factor=1.) 
-main = main(100, "") +main100 = main(100, "") diff --git a/codext/base/base122.py b/codext/base/base122.py index 989dbec..33a42ad 100755 --- a/codext/base/base122.py +++ b/codext/base/base122.py @@ -102,5 +102,5 @@ def _get_7bits(currB, bob, B, decoded): add("base122", base122_encode, base122_decode, r"^base[-_]?122$", expansion_factor=1.085) -main = main(122, "") +main122 = main(122, "") diff --git a/codext/base/base85.py b/codext/base/base85.py index 41fed20..bc6d8b2 100755 --- a/codext/base/base85.py +++ b/codext/base/base85.py @@ -177,9 +177,9 @@ def decode(input, errors="strict"): add("base85", base85_encode, base85_decode, expansion_factor=lambda f, ename: f if "xbtoa" in ename else 1.25, pattern=r"^(base[-_]?85(?:|[-_](?:adobe|x?btoa|ipv6|rfc1924|xml|z(?:eromq)?))|z85|ascii85)$", extra_exceptions=["Base85ValueError"]) -main85 = main(85, None, "ascii") +main85 = main(85, None) main85adobe = main(85, None, "adobe") -main85btoa = main(85, None, "btoa") +main85xbtoa = main(85, None, "xbtoa", wrap=False) main85rfc1924 = main(85, "RFC 1924", "ipv6") main85xml = main(85, "", "xml") main85zeromq = main(85, "", "zeromq") diff --git a/codext/base/base91.py b/codext/base/base91.py index de6373d..6f0d6ec 100755 --- a/codext/base/base91.py +++ b/codext/base/base91.py @@ -109,5 +109,5 @@ def decode(text, errors="strict"): add("base91", base91_encode, base91_decode, r"^base[-_]?91((?:|[-_]alt(?:ernate)?)(?:|[-_]inv(?:erted)?)?)$", entropy=6.5, expansion_factor=1.231) -main = main(91, "") +main91 = main(91, "") diff --git a/codext/base/baseN.py b/codext/base/baseN.py index de9a44c..93fe5d9 100755 --- a/codext/base/baseN.py +++ b/codext/base/baseN.py @@ -55,7 +55,7 @@ B32 = { r'': upper + "234567", - r'[-_]?z(?:base32)$': "ybndrfg8ejkmcpqxot1uwisza345h769", + r'[-_]?z(?:base32)?$': "ybndrfg8ejkmcpqxot1uwisza345h769", r'[-_]inv(erted)?$': "234567" + upper, r'(?:[-_](ext(ended)?)?)?[-_]hex$': digits + upper[:22], r'[-_]?crockford': digits + "ABCDEFGHJKMNPQRSTVWXYZ", @@ -67,7 
+67,7 @@ main32 = main(32, "RFC 4648") main32hex = main(32, "RFC 4648", "hex", False) main32geo = main(32, "", "geohash", False) -main32geo = main(32, "", "crockford", False) +main32crk = main(32, "", "crockford", False) mainz32 = main(32, "", "z", False) @@ -106,7 +106,7 @@ base2n(B64, r"^base[-_]?64(|[-_]inv(?:erted)?|[-_]?(?:file|url)(?:safe)?)$", padding_char="=", guess=["base64", "base64-inv", "base64-url"]) main64 = main(64, "RFC 4648") -main64url = main(64, "RFC 4648 / Base64URL", "url") +main64url = main(64, "RFC 4648 / Base64URL", "url", False) B67 = { diff --git a/setup.cfg b/setup.cfg index 3061de4..6d8eb28 100644 --- a/setup.cfg +++ b/setup.cfg @@ -44,15 +44,18 @@ python-requires = >=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,<4 [options.entry_points] console_scripts = + base1 = codext.base.baseN:main1 base2 = codext.base.baseN:main2 base3 = codext.base.baseN:main3 base4 = codext.base.baseN:main4 base8 = codext.base.baseN:main8 + base10 = codext.base.baseN:main10 base16 = codext.base.baseN:main16 base26 = codext.base.baseN:main26 base32 = codext.base.baseN:main32 base32-hex = codext.base.baseN:main32hex base32-geohash = codext.base.baseN:main32geo + base32-crockford = codext.base.baseN:main32crk base32-z = codext.base.baseN:mainz32 base36 = codext.base.baseN:main36 base45 = codext.base.base45:main @@ -64,9 +67,14 @@ console_scripts = base64 = codext.base.baseN:main64 base64-url = codext.base.baseN:main64url base67 = codext.base.baseN:main67 - base85 = codext.base.base85:main - base91 = codext.base.base91:main - base100 = codext.base.base100:main - base122 = codext.base.base122:main + base85 = codext.base.base85:main85 + base85-adobe = codext.base.base85:main85adobe + base85-xbtoa = codext.base.base85:main85xbtoa + base85-ipv6 = codext.base.base85:main85rfc1924 + base85-xml = codext.base.base85:main85xml + base85-zeromq = codext.base.base85:main85zeromq + base91 = codext.base.base91:main91 + base100 = codext.base.base100:main100 + base122 = 
codext.base.base122:main122 codext = codext.__init__:main debase = codext.base.__init__:main diff --git a/tests/test_base.py b/tests/test_base.py index f92e38f..33eff65 100644 --- a/tests/test_base.py +++ b/tests/test_base.py @@ -181,7 +181,7 @@ def test_codec_base62(self): self.assertEqual(codecs.decode(b(b62), enc), b(STR)) def test_codec_base64(self): - for b64, enc in zip(["dGhpcyBpcyBhIHRlc3Q=", "T6XfSo1fSo1X87HbStG="], ["base64", "base64-inv"]): + for b64, enc in zip(["dGhpcyBpcyBhIHRlc3QK", "T6XfSo1fSo1X87HbStG="], ["base64", "base64-inv"]): self.assertEqual(codecs.encode(STR, enc), b64) self.assertEqual(codecs.encode(b(STR), enc), b(b64)) self.assertEqual(codecs.decode(b64, enc), STR) From 2944192589b758be84238ab5b6aa97f77b1eba83 Mon Sep 17 00:00:00 2001 From: dhondta Date: Wed, 9 Feb 2022 22:55:49 +0100 Subject: [PATCH 28/97] Fixed codec: base45 --- codext/base/base45.py | 1 - 1 file changed, 1 deletion(-) diff --git a/codext/base/base45.py b/codext/base/base45.py index 37e0157..6f15150 100755 --- a/codext/base/base45.py +++ b/codext/base/base45.py @@ -50,7 +50,6 @@ def encode(text, errors="strict"): def base45_decode(mode): - mode = mode.replace("inverted", "inv").replace("_", "-").lstrip("-") b45 = {c: i for i, c in enumerate(_get_charset(B45, mode))} def decode(text, errors="strict"): t, s = b(text), "" From c06ee585bb334d5cafa0adb6c5ebd38e73e78be3 Mon Sep 17 00:00:00 2001 From: dhondta Date: Wed, 9 Feb 2022 22:57:48 +0100 Subject: [PATCH 29/97] Renamed debase to unbase --- README.md | 8 ++++---- codext/base/__init__.py | 44 ++++++++++++++++++++++++++--------------- setup.cfg | 2 +- 3 files changed, 33 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index fb1e95f..d4c51e3 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ Check the [documentation](https://python-codext.readthedocs.io/en/latest/howto.h

Using CodExt from the command line

Using base tools from the command line

-

Using the debase command line tool

+

Using the unbase command line tool

## :computer: Usage (main CLI tool) Tweet on codext @@ -80,7 +80,7 @@ $ codext list macros example-macro ``` -## :computer: Usage (base CLI tool) Tweet on debase +## :computer: Usage (base CLI tool) Tweet on unbase ```session $ echo "Test string !" | base122 @@ -100,10 +100,10 @@ Test string ! ``` ```session -$ echo "Test string !" | base91 | base85 | base36 | base58-flickr | debase -m 3 +$ echo "Test string !" | base91 | base85 | base36 | base58-flickr | unbase -m 3 Test string ! -$ echo "Test string !" | base91 | base85 | base36 | base58-flickr | debase -f Test +$ echo "Test string !" | base91 | base85 | base36 | base58-flickr | unbase -f Test Test string ! ``` diff --git a/codext/base/__init__.py b/codext/base/__init__.py index de8e9bd..5859f6b 100755 --- a/codext/base/__init__.py +++ b/codext/base/__init__.py @@ -13,45 +13,57 @@ def main(): - descr = """Usage: debase [OPTION]... [FILE] -Base decode multi-layer FILE, or standard input, to standard output. + descr = """Usage: unbase [OPTION]... [FILE] +Decode multi-layer base encoded FILE, or standard input, to standard output. With no FILE, or when FILE is -, read standard input. 
Optional arguments: + -e, --extended also consider generic base codecs while guess-decoding -f, --stop-function set the result chceking function (default: text) - format: printables|text|flag|lang_[bigram]|[regex] - -i, --ignore-generic ignore generic base codecs while guess-decoding + format: printables|text|flag|lang_[bigram] -M, --max-depth maximum codec search depth (default: 5) -m, --min-depth minimum codec search depth (default: 0) - -s, --do-not-stop do not stop if a valid output is found + -p, --pattern pattern to be matched while searching + -s, --show show the decoding chain --help display this help and exit --verbose show guessing information and steps --version output version information and exit -Report debase bugs to +Report unbase bugs to Full documentation at: """ parser = ArgumentParser(description=descr, formatter_class=RawTextHelpFormatter, add_help=False) parser.format_help = MethodType(lambda s: s.description, parser) parser.add_argument("file", nargs="?") + parser.add_argument("-e", "--extended", action="store_true") parser.add_argument("-f", "--stop-function", default="text") - parser.add_argument("-i", "--ignore-generic", action="store_true") - parser.add_argument("-M", "--max-depth", default=5, type=int) - parser.add_argument("-m", "--min-depth", default=0, type=int) - parser.add_argument("-s", "--do-not-stop", action="store_true") + parser.add_argument("-M", "--max-depth", type=int, default=10) + parser.add_argument("-m", "--min-depth", type=int, default=0) + parser.add_argument("-p", "--pattern") + parser.add_argument("-s", "--show", action="store_true") parser.add_argument("--help", action="help") parser.add_argument("--version", action="version") parser.add_argument("--verbose", action="store_true") parser.version = "CodExt " + __version__ args = parser.parse_args() - excl = [[], ["base%d-generic" % i for i in range(2, 255)]][args.ignore_generic] - sfunc = getattr(stopfunc, args.stop_function, args.stop_function) + excl, s = 
[["base%d-generic" % i for i in range(2, 256)], []][args.extended], args.stop_function + if re.match(r"lang_[a-z]{2}$", s) and all(re.match(r"lang_[a-z]{2}$", x) is None for x in dir(stopfunc)): + stopfunc._reload_lang(stopfunc.LANG_BACKEND) + #TODO: validate args.stop_function + #TODO: make --stop-function and --pattern mutually exclusive + sfunc = getattr(stopfunc, s, s) c = _input(args.file) c = c.rstrip("\r\n") if isinstance(c, str) else c.rstrip(b"\r\n") - r = codecs.guess(c, sfunc, args.min_depth, args.max_depth, exclude=excl, codec_categories="base", - stop=not args.do_not_stop, show=True, scoring_heuristic=False, debug=args.verbose) - if not args.do_not_stop: - print("Could not decode :-(" if len(r) == 0 else ensure_str(list(r.items())[0][1])) + r = codecs.guess(c, sfunc, 0, args.max_depth, exclude=tuple(excl), codec_categories="base", + stop=False, show=args.verbose, scoring_heuristic=False, debug=args.verbose) + if len(r) == 0: + print("Could not decode :-(") + return 0 + ans = max(r.items(), key=lambda x: len(x[0])) + if args.show: + print(" - ".join(ans[0])) + print(ensure_str(ans[1])) + return 0 diff --git a/setup.cfg b/setup.cfg index 6d8eb28..958a404 100644 --- a/setup.cfg +++ b/setup.cfg @@ -77,4 +77,4 @@ console_scripts = base100 = codext.base.base100:main100 base122 = codext.base.base122:main122 codext = codext.__init__:main - debase = codext.base.__init__:main + unbase = codext.base.__init__:main From 00f2cbad60c7ed3ee43756d3e9d9bfe5e920c547 Mon Sep 17 00:00:00 2001 From: dhondta Date: Wed, 9 Feb 2022 22:58:27 +0100 Subject: [PATCH 30/97] Applied minor improvements --- codext/__common__.py | 19 ++++++++++++------- codext/__init__.py | 8 ++++++-- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/codext/__common__.py b/codext/__common__.py index 3db87a9..89522e8 100644 --- a/codext/__common__.py +++ b/codext/__common__.py @@ -1154,7 +1154,7 @@ def generate_strings_from_regex(regex, star_plus_max=STAR_PLUS_MAX, repeat_max=R 
stopfunc.default = stopfunc.text stopfunc.LANG_BACKEND = None -stopfunc.LANG_BACKENDS = [n for n in ["langid", "langdetect", "pycld2", "cld3", "textblob"] if __module_exists(n)] +stopfunc.LANG_BACKENDS = [n for n in ["pycld2", "langdetect", "langid", "cld3", "textblob"] if __module_exists(n)] if len(stopfunc.LANG_BACKENDS) > 0: stopfunc.LANG_BACKEND = stopfunc.LANG_BACKENDS[0] if "cld3" in stopfunc.LANG_BACKENDS: @@ -1223,7 +1223,6 @@ def _load_lang_backend(backend=None): flng = "lang_%s" % LANG if getattr(stopfunc, flng, None): stopfunc.default = getattr(stopfunc, flng) -_load_lang_backend(stopfunc.LANG_BACKEND) stopfunc._reload_lang = _load_lang_backend @@ -1244,10 +1243,12 @@ def __develop(encodings): def __guess(prev_input, input, stop_func, depth, max_depth, min_depth, codec_categories, exclude, result, found=(), - stop=True, show=False, scoring_heuristic=False, extended=False, debug=False): + stop=True, show=False, scoring_heuristic=False, extended=False, debug=False, regex=False): """ Perform a breadth-first tree search using a ranking logic to select and prune the list of codecs. 
""" if depth > min_depth and stop_func(input): - if not stop and show and found not in result: + if regex: + stop = True + if not stop and (show or debug) and found not in result: s = repr(input) s = s[2:-1] if s.startswith("b'") and s.endswith("'") else s s = "[+] %s: %s" % (", ".join(found), s) @@ -1287,7 +1288,7 @@ def expand(items, descr=None, transform=None): if debug: print("[*] Depth %0{}d/%d: %s".format(len(str(max_depth))) % (depth+1, max_depth, encoding)) __guess(input, new_input, stop_func, depth+1, max_depth, min_depth, codec_categories, exclude, result, - found + (encoding, ), stop, show, scoring_heuristic, extended, debug) + found + (encoding, ), stop, show, scoring_heuristic, extended, debug, regex) def __rank(prev_input, input, prev_encoding, codecs, heuristic=False, extended=False, yield_score=False): @@ -1374,7 +1375,7 @@ def __score(prev_input, input, prev_encoding, codec, heuristic=False, extended=F epxf = f - .1 <= expf <= f + .1 elif isinstance(expf, (tuple, list)) and len(expf) == 2: expf = f - expf[1] <= expf[0] <= expf[1] + .1 - s += .1 + s += [-1., .1][expf] # afterwards, if the input text has an entropy close to the expected one, give a bonus weighted on the # number of input characters to take bad entropies of shorter strings into account entr = sc.get('entropy', {}) @@ -1408,18 +1409,22 @@ def guess(input, stop_func=stopfunc.default, min_depth=0, max_depth=5, codec_cat """ Try decoding without the knowledge of the encoding(s). 
""" if max_depth <= 0: raise ValueError("Depth must be a non-null positive integer") + if min_depth > max_depth: + raise ValueError("Min depth shall be less than or equal to the max depth") if len(found) > 0: for encoding in found: input = decode(input, encoding) + regex = False if isinstance(stop_func, string_types): stop_func = stopfunc.regex(stop_func) + regex = True result = {} if len(input) > 0: try: # breadth-first search for d in range(max_depth): __guess("", input, stop_func, 0, d+1, min_depth, codec_categories, exclude, result, tuple(found), stop, - show, scoring_heuristic, extended, debug) + show, scoring_heuristic, extended, debug, regex) if stop and len(result) > 0: return result except KeyboardInterrupt: diff --git a/codext/__init__.py b/codext/__init__.py index a37a98a..3b98af4 100644 --- a/codext/__init__.py +++ b/codext/__init__.py @@ -121,7 +121,7 @@ def main(): help="while using the regex stop function, set it as case-insensitive (default: False)") guess.add_argument("-H", "--no-heuristic", action="store_true", help="DO NOT use the scoring heuristic ; slows down" " the search but may be more accurate (default: False)") - if len(stopfunc.LANG_BACKENDS) == 0: + if len(stopfunc.LANG_BACKENDS) > 0: _lb = stopfunc.LANG_BACKEND guess.add_argument("-l", "--lang-backend", default=_lb, choices=stopfunc.LANG_BACKENDS + ["none"], help="natural language detection backend (default: %s)" % _lb) @@ -206,8 +206,12 @@ def main(): else: print(ensure_str(c or "Could not %scode :-(" % ["en", "de"][args.command == "decode"]), end="") elif args.command == "guess": + s, lb = args.stop_function, args.lang_backend + if re.match(r"lang_[a-z]{2}$", s) and lb != "none" and \ + all(re.match(r"lang_[a-z]{2}$", x) is None for x in dir(stopfunc)): + stopfunc._reload_lang(lb) r = codecs.guess(c, - getattr(stopfunc, args.stop_function, ["", "(?i)"][args.icase] + args.stop_function), + getattr(stopfunc, s, ["", "(?i)"][args.icase] + s), args.min_depth, args.max_depth, 
args.codec_categories, From ab4f8107aea19d15d68d5b3a0c6efc42a28566a1 Mon Sep 17 00:00:00 2001 From: dhondta Date: Fri, 11 Feb 2022 19:22:16 +0100 Subject: [PATCH 31/97] Added category: web --- README.md | 7 ++++-- codext/others/__init__.py | 2 -- codext/web/__init__.py | 4 ++++ codext/{others => web}/html.py | 0 codext/{others => web}/url.py | 0 docs/enc/others.md | 36 ------------------------------ docs/enc/web.md | 40 ++++++++++++++++++++++++++++++++++ 7 files changed, 49 insertions(+), 40 deletions(-) create mode 100755 codext/web/__init__.py rename codext/{others => web}/html.py (100%) rename codext/{others => web}/url.py (100%) create mode 100644 docs/enc/web.md diff --git a/README.md b/README.md index d4c51e3..0b69bfb 100644 --- a/README.md +++ b/README.md @@ -315,10 +315,8 @@ This category also contains `ascii85`, `adobe`, `[x]btoa`, `zeromq` with the `ba #### [Others](https://python-codext.readthedocs.io/en/latest/enc/others.html) - [X] `dna`: implements the 8 rules of DNA sequences (N belongs to [1,8]) -- [X] `html`: implements entities according to [this reference](https://dev.w3.org/html5/html-author/charref) - [X] `letter-indices`: encodes consonants and/or vowels with their corresponding indices - [X] `markdown`: unidirectional encoding from Markdown to HTML -- [X] `url`: aka URL encoding #### [Steganography](https://python-codext.readthedocs.io/en/latest/enc/stegano.html) @@ -330,6 +328,11 @@ This category also contains `ascii85`, `adobe`, `[x]btoa`, `zeromq` with the `ba - [X] `whitespace`: replaces bits with whitespaces and tabs - [X] `whitespace_after_before`: variant of `whitespace` ; encodes characters as new characters with whitespaces before and after according to an equation described in the codec name (e.g. 
"`whitespace+2*after-3*before`") +#### [Web](https://python-codext.readthedocs.io/en/latest/enc/web.html) + +- [X] `html`: implements entities according to [this reference](https://dev.w3.org/html5/html-author/charref) +- [X] `url`: aka URL encoding + ## :clap: Supporters diff --git a/codext/others/__init__.py b/codext/others/__init__.py index d16459e..22d6830 100755 --- a/codext/others/__init__.py +++ b/codext/others/__init__.py @@ -1,7 +1,5 @@ # -*- coding: UTF-8 -*- from .dna import * -from .html import * from .letters import * from .markdown import * -from .url import * diff --git a/codext/web/__init__.py b/codext/web/__init__.py new file mode 100755 index 0000000..b29367a --- /dev/null +++ b/codext/web/__init__.py @@ -0,0 +1,4 @@ +# -*- coding: UTF-8 -*- +from .html import * +from .url import * + diff --git a/codext/others/html.py b/codext/web/html.py similarity index 100% rename from codext/others/html.py rename to codext/web/html.py diff --git a/codext/others/url.py b/codext/web/url.py similarity index 100% rename from codext/others/url.py rename to codext/web/url.py diff --git a/docs/enc/others.md b/docs/enc/others.md index 199ba52..3470611 100644 --- a/docs/enc/others.md +++ b/docs/enc/others.md @@ -31,23 +31,6 @@ CACTCGGTCGGCCATATGTTCGGCCATATGTTCGTCTGTTCACTCGCCCATACACT ----- -### HTML Entities - -This implements the full list of characters available at [this reference](https://dev.w3.org/html5/html-author/charref). - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`html` | text <-> HTML entities | `html-entity`, `html_entities` | implements entities according to [this reference](https://dev.w3.org/html5/html-author/charref) - -```python ->>> codext.encode("Тħĩş Їś ą Ţêšŧ", "html") -'Тħĩş Їś ą Ţêšŧ' ->>> codext.decode("Тħĩş Їś ą Ţêšŧ", "html-entities") -'Тħĩş Їś ą Ţêšŧ' -``` - ------ - ### Letter indices This encodes consonants and/or vowels with their respective indices. 
This codec is case insensitive, strips white spaces and only applies to letters. @@ -94,22 +77,3 @@ This is only for "encoding" (converting) Markdown to HTML. '

Test

\n\n

paragraph

\n' ``` ------ - -### URL - -This handles URL encoding, regardless of the case when decoding and with no error. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`url` | text <-> URL encoded text | `url`, `urlencode` | - -```python ->>> codecs.encode("?=this/is-a_test/../", "url") -'%3F%3Dthis%2Fis-a_test%2F%2E%2E%2F' ->>> codext.decode("%3F%3Dthis%2Fis-a_test%2F%2E%2E%2F", "urlencode") -'?=this/is-a_test/../' ->>> codext.decode("%3f%3dthis%2fis-a_test%2f%2e%2e%2f", "urlencode") -'?=this/is-a_test/../' -``` - diff --git a/docs/enc/web.md b/docs/enc/web.md new file mode 100644 index 0000000..80c6a20 --- /dev/null +++ b/docs/enc/web.md @@ -0,0 +1,40 @@ +## Web + +`codext` implements some common Web-related encodings. + +----- + +### HTML Entities + +This implements the full list of characters available at [this reference](https://dev.w3.org/html5/html-author/charref). + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`html` | text <-> HTML entities | `html-entity`, `html_entities` | implements entities according to [this reference](https://dev.w3.org/html5/html-author/charref) + +```python +>>> codext.encode("Тħĩş Їś ą Ţêšŧ", "html") +'Тħĩş Їś ą Ţêšŧ' +>>> codext.decode("Тħĩş Їś ą Ţêšŧ", "html-entities") +'Тħĩş Їś ą Ţêšŧ' +``` + +----- + +### URL + +This handles URL encoding, regardless of the case when decoding and with no error. 
+ +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`url` | text <-> URL encoded text | `url`, `urlencode` | + +```python +>>> codecs.encode("?=this/is-a_test/../", "url") +'%3F%3Dthis%2Fis-a_test%2F%2E%2E%2F' +>>> codext.decode("%3F%3Dthis%2Fis-a_test%2F%2E%2E%2F", "urlencode") +'?=this/is-a_test/../' +>>> codext.decode("%3f%3dthis%2fis-a_test%2f%2e%2e%2f", "urlencode") +'?=this/is-a_test/../' +``` + From 6816da0e62596d69661827eb39cbffda7f080ecf Mon Sep 17 00:00:00 2001 From: dhondta Date: Mon, 21 Feb 2022 12:40:16 +0100 Subject: [PATCH 32/97] Improved base --- codext/base/baseN.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/codext/base/baseN.py b/codext/base/baseN.py index 93fe5d9..f935bf9 100755 --- a/codext/base/baseN.py +++ b/codext/base/baseN.py @@ -14,7 +14,7 @@ B1 = {chr(i): chr(i) for i in range(2**8)} B1[''] = "A" -base(B1, r"^(?:base[-_]?1(|[-_].)|unary)$") +base(B1, r"^(?:base[-_]?1(|[-_].)|unary)$", guess=[]) main1 = main(1) @@ -82,7 +82,7 @@ r'[-_]?(fl|flickr|short[-]?url|url)$': "123456789abcdefghijkmnopqrstuvwxyzABCDEFGHJKLMNPQRSTUVWXYZ", } base(B58, r"^base[-_]?58(|[-_](bc|bitcoin|rp|ripple|fl|flickr|short[-]?url|url))$", - guess=["base58-bitcoin", "base58-ripple", "base58-flickr"]) + guess=["base58", "base58-ripple", "base58-flickr"]) main58bc = main(58, "", "bitcoin") main58rp = main(58, "", "ripple") main58fl = main(58, "", "flickr") @@ -117,6 +117,11 @@ main67 = main(67) +B128 = {r'': "".join(chr(i) for i in range(128))} +base(B128, r"^base[-_]?128$", padding_char="=") +main128 = main(128, None, False) + + # generic base encodings, to be added after all others as they have the precedence base_generic() From a1155a690682a2f3e6fe92fa1e331aef31ced055 Mon Sep 17 00:00:00 2001 From: dhondta Date: Mon, 21 Feb 2022 12:40:57 +0100 Subject: [PATCH 33/97] New release --- codext/VERSION.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/codext/VERSION.txt 
b/codext/VERSION.txt index f8f4f03..6b89d58 100644 --- a/codext/VERSION.txt +++ b/codext/VERSION.txt @@ -1 +1 @@ -1.12.1 +1.12.2 From a9ff116250e4e8e0c3b779a441e2165d45dd9ddf Mon Sep 17 00:00:00 2001 From: dhondta Date: Tue, 22 Feb 2022 18:49:01 +0100 Subject: [PATCH 34/97] Improved codec: rot --- codext/crypto/rot.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/codext/crypto/rot.py b/codext/crypto/rot.py index 12bfdaf..3f696f4 100755 --- a/codext/crypto/rot.py +++ b/codext/crypto/rot.py @@ -30,7 +30,7 @@ } __guess1__ = ["rot-%d" % i for i in range(1, 26)] + ["rot-47"] __guess2__ = ["progressive-rot-%d" % i for i in range(1, 26)] + ["progressive-rot-n%d" % i for i in range(1, 26)] -__guess3__ = ["alternative-rot-%d" % i for i in range(1, 26)] +__guess3__ = ["alternative-rot-%d" % i for i in range(1, 26) if i != 13] ROT47 = "!\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~" @@ -90,7 +90,8 @@ def decode(text, errors="strict"): return decode -add("alternative-rot", arot_encode, arot_decode, r"a(?:lt(?:ernative)?-)?(?:caesar|rot)[-_]?([1-9]|1[0-9]|2[0-5])$", +# note: alternative-rot-13 is equivalent to rot-13, therefore excluded from the regex +add("alternative-rot", arot_encode, arot_decode, r"a(?:lt(?:ernative)?-)?(?:caesar|rot)[-_]?([1-9]|1[0-24-9]|2[0-5])$", penalty=.2, entropy=lambda e: e, printables_rate=lambda pr: pr, transitive=True, examples=__examples3__, guess=__guess3__) add("rot", rot_encode, rot_decode, r"(?:caesar|rot)[-_]?([1-9]|1[0-9]|2[0-5]|47)$", aliases=["caesar"], penalty=.2, From b186a968dc7e0ac52f3303d59758f65f5a2a3d98 Mon Sep 17 00:00:00 2001 From: dhondta Date: Tue, 22 Feb 2022 18:49:10 +0100 Subject: [PATCH 35/97] Fixed bug in guess mode --- codext/__common__.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/codext/__common__.py b/codext/__common__.py index 89522e8..9b7936f 100644 --- a/codext/__common__.py +++ b/codext/__common__.py @@ 
-1243,11 +1243,9 @@ def __develop(encodings): def __guess(prev_input, input, stop_func, depth, max_depth, min_depth, codec_categories, exclude, result, found=(), - stop=True, show=False, scoring_heuristic=False, extended=False, debug=False, regex=False): + stop=True, show=False, scoring_heuristic=False, extended=False, debug=False): """ Perform a breadth-first tree search using a ranking logic to select and prune the list of codecs. """ if depth > min_depth and stop_func(input): - if regex: - stop = True if not stop and (show or debug) and found not in result: s = repr(input) s = s[2:-1] if s.startswith("b'") and s.endswith("'") else s @@ -1288,7 +1286,7 @@ def expand(items, descr=None, transform=None): if debug: print("[*] Depth %0{}d/%d: %s".format(len(str(max_depth))) % (depth+1, max_depth, encoding)) __guess(input, new_input, stop_func, depth+1, max_depth, min_depth, codec_categories, exclude, result, - found + (encoding, ), stop, show, scoring_heuristic, extended, debug, regex) + found + (encoding, ), stop, show, scoring_heuristic, extended, debug) def __rank(prev_input, input, prev_encoding, codecs, heuristic=False, extended=False, yield_score=False): @@ -1371,8 +1369,8 @@ def __score(prev_input, input, prev_encoding, codec, heuristic=False, extended=F expf = expf(f, encoding) except TypeError: expf = expf(f) - elif isinstance(expf, (int, float)): - epxf = f - .1 <= expf <= f + .1 + if isinstance(expf, (int, float)): + expf = (f - .1 <= expf <= f + .1) elif isinstance(expf, (tuple, list)) and len(expf) == 2: expf = f - expf[1] <= expf[0] <= expf[1] + .1 s += [-1., .1][expf] @@ -1414,17 +1412,15 @@ def guess(input, stop_func=stopfunc.default, min_depth=0, max_depth=5, codec_cat if len(found) > 0: for encoding in found: input = decode(input, encoding) - regex = False if isinstance(stop_func, string_types): stop_func = stopfunc.regex(stop_func) - regex = True result = {} if len(input) > 0: try: # breadth-first search for d in range(max_depth): __guess("", input, 
stop_func, 0, d+1, min_depth, codec_categories, exclude, result, tuple(found), stop, - show, scoring_heuristic, extended, debug, regex) + show, scoring_heuristic, extended, debug) if stop and len(result) > 0: return result except KeyboardInterrupt: From b3253142fa93bc5d089788c26e58512940fbf367 Mon Sep 17 00:00:00 2001 From: dhondta Date: Wed, 23 Feb 2022 00:03:22 +0100 Subject: [PATCH 36/97] Fixed minor issues --- codext/__common__.py | 12 ++++++++---- codext/__init__.py | 4 ++-- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/codext/__common__.py b/codext/__common__.py index 9b7936f..35d1fc5 100644 --- a/codext/__common__.py +++ b/codext/__common__.py @@ -1303,9 +1303,13 @@ class _Text(object): __slots__ = ["entropy", "lcharset", "len", "padding", "printables"] def __init__(self, text, pad_char=None): + c = text[-1] + last_char = c if isinstance(c, int) else ord(c) + self.padding = pad_char is not None and last_char == ord(pad_char) + if self.padding: + text = text.rstrip(pad_char) self.len = len(text) self.lcharset = len(set(text)) - self.padding = pad_char is not None and text[-1] in [pad_char, b(pad_char)] self.printables = float(len([c for c in text if (chr(c) if isinstance(c, int) else c) in printable])) / self.len self.entropy = entropy(text) @@ -1363,16 +1367,16 @@ def __score(prev_input, input, prev_encoding, codec, heuristic=False, extended=F s += .1 expf = sc.get('expansion_factor', 1.) 
if expf: - f = float(len(new_input)) / obj.len + f = obj.len / float(len(new_input)) # expansion while encoding => at decoding: 1/f if isinstance(expf, type(lambda: None)): try: # this case allows to consider the current encoding name from the current codec expf = expf(f, encoding) except TypeError: expf = expf(f) if isinstance(expf, (int, float)): - expf = (f - .1 <= expf <= f + .1) + expf = (1/f - .1 <= 1/expf <= 1/f + .1) elif isinstance(expf, (tuple, list)) and len(expf) == 2: - expf = f - expf[1] <= expf[0] <= expf[1] + .1 + expf = 1/f - expf[1] <= 1/expf[0] <= 1/f + expf[1] s += [-1., .1][expf] # afterwards, if the input text has an entropy close to the expected one, give a bonus weighted on the # number of input characters to take bad entropies of shorter strings into account diff --git a/codext/__init__.py b/codext/__init__.py index 3b98af4..0fa49d5 100644 --- a/codext/__init__.py +++ b/codext/__init__.py @@ -88,7 +88,7 @@ def main(): "echo -en \"test\" | codext encode base64 gzip | codext guess gzip -c base", ]) parser = argparse.ArgumentParser(description=descr, epilog=examples, formatter_class=argparse.RawTextHelpFormatter) - sparsers = parser.add_subparsers(dest="command", help="command to be executed") + sparsers = parser.add_subparsers(dest="command", required=True, help="command to be executed") parser.add_argument("-i", "--input-file", dest="infile", help="input file (if none, take stdin as input)") parser.add_argument("-o", "--output-file", dest="outfile", help="output file (if none, display result to stdout)") parser.add_argument("-s", "--strip-newlines", action="store_true", dest="strip", @@ -140,7 +140,7 @@ def main(): search = sparsers.add_parser("search", help="search for codecs") search.add_argument("pattern", nargs="+", help="encoding pattern to search") listi = sparsers.add_parser("list", help="list items") - lsparsers = listi.add_subparsers(dest="type", help="type of item to be listed") + lsparsers = listi.add_subparsers(dest="type", 
required=True, help="type of item to be listed") liste = lsparsers.add_parser("encodings", help="list encodings") liste.add_argument("category", nargs="*", help="selected categories") listm = lsparsers.add_parser("macros", help="list macros") From 3cf1f5dbd67b78a771b55885d5e2c20eaf31fd81 Mon Sep 17 00:00:00 2001 From: dhondta Date: Wed, 23 Feb 2022 00:03:29 +0100 Subject: [PATCH 37/97] Refined tests --- tests/test_base.py | 14 ++++++++------ tests/test_common.py | 4 ++-- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/tests/test_base.py b/tests/test_base.py index 33eff65..7b3dae0 100644 --- a/tests/test_base.py +++ b/tests/test_base.py @@ -44,6 +44,7 @@ def test_codec_base1(self): for i in range(3): self.assertIsNotNone(codecs.encode(i * C, "base1")) self.assertRaises(ValueError, codecs.encode, 4 * C, "unary") + self.assertEqual(codecs.decode("AAAAA", "base1"), "\x05") def test_codec_base2(self): STR = "test" @@ -181,7 +182,7 @@ def test_codec_base62(self): self.assertEqual(codecs.decode(b(b62), enc), b(STR)) def test_codec_base64(self): - for b64, enc in zip(["dGhpcyBpcyBhIHRlc3QK", "T6XfSo1fSo1X87HbStG="], ["base64", "base64-inv"]): + for b64, enc in zip(["dGhpcyBpcyBhIHRlc3Q=", "T6XfSo1fSo1X87HbStG="], ["base64", "base64-inv"]): self.assertEqual(codecs.encode(STR, enc), b64) self.assertEqual(codecs.encode(b(STR), enc), b(b64)) self.assertEqual(codecs.decode(b64, enc), STR) @@ -224,11 +225,12 @@ def test_base_main(self): tfile = "test-base-main.txt" with open(tfile, 'w') as f: f.write("This is a long test string for the sake of causing line wrapping based on default parameters.") - sys.argv = [tmp[0], tfile] - for m in main32, main64url: - self.assertEqual(m(), 0) - sys.argv = [tmp[0], tfile, "-d"] - self.assertEqual(main2(), 1) + for swap_arg in [[], ["-s"]]: + sys.argv = [tmp[0], tfile] + swap_arg + for m in main32, main64url: + self.assertEqual(m(), 0) + sys.argv = [tmp[0], tfile, "-d"] + swap_arg + self.assertEqual(main2(), 1) os.remove(tfile) 
sys.argv[:] = tmp diff --git a/tests/test_common.py b/tests/test_common.py index ec57aaa..a35abfd 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -150,7 +150,7 @@ def test_guess_decode(self): self.assertIsNone(codext.stopfunc._reload_lang()) _l = lambda d: list(d.items())[0][1] if len(d) > 0 else None codext.add("test_codec", lambda x, e="strict": (x + "=", len(x)), lambda x, e="strict": (x[:-1], len(x)-1), - "test", no_error=True, bonus_func=lambda *a: True, penalty=-.5) + "test", padding_char="=", no_error=True, bonus_func=lambda *a: True, penalty=-.5) self.assertIn("test-codec", codext.list_encodings("test")) self.assertEqual(codext.decode("TEST=", "test"), "TEST") self.assertEqual(list(codext.guess("TEST=", codext.stopfunc.text, codec_categories="test", max_depth=2, @@ -204,7 +204,7 @@ def test_guess_decode(self): def test_rank_input(self): codext.add("test_codec", lambda x, e="strict": (x + "=", len(x)), lambda x, e="strict": (x[:-1], len(x)-1), - "test", no_error=True, bonus_func=lambda *a: True, penalty=-.5) + "test", padding_char="=", no_error=True, bonus_func=lambda *a: True, penalty=-.5) STR = "This is a test string !" 
ENC = codext.encode(STR, "base64") self.assertTrue(len(codext.rank(ENC)) > 20) From cb6656845940db1386a563e08ea13ac918f51bb5 Mon Sep 17 00:00:00 2001 From: dhondta Date: Wed, 23 Feb 2022 00:03:46 +0100 Subject: [PATCH 38/97] New release --- codext/VERSION.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/codext/VERSION.txt b/codext/VERSION.txt index 6b89d58..81f3632 100644 --- a/codext/VERSION.txt +++ b/codext/VERSION.txt @@ -1 +1 @@ -1.12.2 +1.12.3 From 544e1cc39a7b4e793864087d3d2cf4cc77d73038 Mon Sep 17 00:00:00 2001 From: dhondta Date: Sat, 26 Feb 2022 16:01:45 +0100 Subject: [PATCH 39/97] Fixed multiple base codecs --- codext/base/base100.py | 2 +- codext/base/base122.py | 4 ++-- codext/base/base91.py | 4 ++-- codext/base/baseN.py | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/codext/base/base100.py b/codext/base/base100.py index db0b3c9..f5faa1d 100755 --- a/codext/base/base100.py +++ b/codext/base/base100.py @@ -37,7 +37,7 @@ def base100_encode(input, errors="strict"): return bytes(r), len(input) def base100_decode(input, errors="strict"): - input = b(input) + input = b(_stripl(input, True, True)) if errors == "ignore": input = input.replace(b"\n", "") if len(input) % 4 != 0: diff --git a/codext/base/base122.py b/codext/base/base122.py index 33a42ad..f580ff8 100755 --- a/codext/base/base122.py +++ b/codext/base/base122.py @@ -98,9 +98,9 @@ def _get_7bits(currB, bob, B, decoded): currB, bob = _get_7bits(currB, bob, input[i] & 127, r) else: currB, bob = _get_7bits(currB, bob, input[i], r) - return "".join(map(chr, r)), len(input) + return "".join(map(chr, r)).rstrip("\0"), len(input) add("base122", base122_encode, base122_decode, r"^base[-_]?122$", expansion_factor=1.085) -main122 = main(122, "") +main122 = main(122, "", wrap=False) diff --git a/codext/base/base91.py b/codext/base/base91.py index 6f0d6ec..21a21d5 100755 --- a/codext/base/base91.py +++ b/codext/base/base91.py @@ -72,7 +72,7 @@ def encode(text, 
errors="strict"): def base91_decode(mode): b91 = {c: i for i, c in enumerate(_get_charset(B91, mode))} def decode(text, errors="strict"): - t, s, bits, alt = b(text), "", "", re.search(r'[-_]alt(ernate)?$', mode) is not None + t, s, bits, alt = b(_stripl(text, True, True)), "", "", re.search(r'[-_]alt(ernate)?$', mode) is not None ehandler = handle_error("base91", errors, decode=True) for i in range(0, len(t), 2): try: @@ -103,7 +103,7 @@ def decode(text, errors="strict"): bits = bits[8:] elif not alt and len(bits) > 0 and not set(bits) == {"0"}: s += chr(int(bits, 2)) - return s, len(t) + return s.rstrip("\0"), len(t) return decode diff --git a/codext/base/baseN.py b/codext/base/baseN.py index f935bf9..3c63453 100755 --- a/codext/base/baseN.py +++ b/codext/base/baseN.py @@ -82,7 +82,7 @@ r'[-_]?(fl|flickr|short[-]?url|url)$': "123456789abcdefghijkmnopqrstuvwxyzABCDEFGHJKLMNPQRSTUVWXYZ", } base(B58, r"^base[-_]?58(|[-_](bc|bitcoin|rp|ripple|fl|flickr|short[-]?url|url))$", - guess=["base58", "base58-ripple", "base58-flickr"]) + guess=["base58-bitcoin", "base58-ripple", "base58-flickr"]) main58bc = main(58, "", "bitcoin") main58rp = main(58, "", "ripple") main58fl = main(58, "", "flickr") @@ -119,7 +119,7 @@ B128 = {r'': "".join(chr(i) for i in range(128))} base(B128, r"^base[-_]?128$", padding_char="=") -main128 = main(128, None, False) +main128 = main(128, None, False, wrap=False) # generic base encodings, to be added after all others as they have the precedence From 95f4b80825e101e3e44cc7b5d961a993c5736cce Mon Sep 17 00:00:00 2001 From: dhondta Date: Sat, 26 Feb 2022 16:02:39 +0100 Subject: [PATCH 40/97] New release --- codext/VERSION.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/codext/VERSION.txt b/codext/VERSION.txt index 81f3632..89c881b 100644 --- a/codext/VERSION.txt +++ b/codext/VERSION.txt @@ -1 +1 @@ -1.12.3 +1.12.4 From f8bd7b741c181b2789f4cf4785e10ac0e67c237c Mon Sep 17 00:00:00 2001 From: dhondta Date: Sun, 27 Feb 2022 20:24:29 
+0100 Subject: [PATCH 41/97] Fixed codec: shift --- codext/crypto/shift.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/codext/crypto/shift.py b/codext/crypto/shift.py index 89ca992..599e60d 100755 --- a/codext/crypto/shift.py +++ b/codext/crypto/shift.py @@ -19,12 +19,12 @@ def ord_shift_decode(i): - return ord_shift_encode(-i) + return ord_shift_encode(-int(i)) def ord_shift_encode(i): def encode(text, errors="strict"): - r = "".join(chr((ord(c) + i) % 256) for c in text) + r = "".join(chr((ord(c) + int(i)) % 256) for c in text) return r, len(r) return encode From d9cc79ae047cf9d384fd0068006d43b52a73771a Mon Sep 17 00:00:00 2001 From: dhondta Date: Sun, 27 Feb 2022 20:24:42 +0100 Subject: [PATCH 42/97] Fixed codec: scytale --- codext/crypto/scytale.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/codext/crypto/scytale.py b/codext/crypto/scytale.py index 32e6e96..7490241 100755 --- a/codext/crypto/scytale.py +++ b/codext/crypto/scytale.py @@ -17,7 +17,7 @@ 'enc(scytale2|scytale-2|scytale_2)': {'this is a test': "ti satshsi et"}, 'enc(scytale5|scytale-5|scytale_5)': {'this is a test': "tithsei ssat "}, } -__guess__ = ["scytale-%d" % i for i in range(10)] +__guess__ = ["scytale-%d" % i for i in range(1, 10)] PADDING_CHAR = "" From 1e31eab38491fc5768383a21332a06f974c5f1e6 Mon Sep 17 00:00:00 2001 From: dhondta Date: Sun, 27 Feb 2022 20:25:27 +0100 Subject: [PATCH 43/97] Fixed bug in base --- codext/base/_base.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/codext/base/_base.py b/codext/base/_base.py index 05aaed0..fce8b9a 100755 --- a/codext/base/_base.py +++ b/codext/base/_base.py @@ -191,7 +191,11 @@ def _decode(input, errors="strict"): kwargs['printables_rate'] = float(len([c for c in cs if c in printable])) / len(cs) kwargs['expansion_factor'] = kwargs.pop('expansion_factor', (EXPANSION_FACTOR(n), .05)) n = "base{}".format(n) if name is None else name - kwargs['guess'] = kwargs.get('guess', 
[n]) + try: + g = [n, n + "-inv"] if "[-_]inv(erted)?$" in charset.keys() else [n] + except AttributeError: + g = [n] + kwargs['guess'] = kwargs.get('guess', g) add(n, encode, decode, pattern, entropy=nb, **kwargs) From d09dd0be29ffb3ce1a42c5e42eb1c58e0e3e5faf Mon Sep 17 00:00:00 2001 From: dhondta Date: Sun, 27 Feb 2022 20:25:41 +0100 Subject: [PATCH 44/97] Improved unbase tool --- codext/base/__init__.py | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/codext/base/__init__.py b/codext/base/__init__.py index 5859f6b..8c0d220 100755 --- a/codext/base/__init__.py +++ b/codext/base/__init__.py @@ -19,7 +19,7 @@ def main(): With no FILE, or when FILE is -, read standard input. Optional arguments: - -e, --extended also consider generic base codecs while guess-decoding + -E, --extended also consider generic base codecs while guess-decoding -f, --stop-function set the result chceking function (default: text) format: printables|text|flag|lang_[bigram] -M, --max-depth maximum codec search depth (default: 5) @@ -36,28 +36,23 @@ def main(): """ parser = ArgumentParser(description=descr, formatter_class=RawTextHelpFormatter, add_help=False) parser.format_help = MethodType(lambda s: s.description, parser) + group = parser.add_mutually_exclusive_group() parser.add_argument("file", nargs="?") - parser.add_argument("-e", "--extended", action="store_true") - parser.add_argument("-f", "--stop-function", default="text") + parser.add_argument("-E", "--extended", action="store_true") + group.add_argument("-f", "--stop-function", default="text") parser.add_argument("-M", "--max-depth", type=int, default=10) parser.add_argument("-m", "--min-depth", type=int, default=0) - parser.add_argument("-p", "--pattern") + group.add_argument("-p", "--pattern") parser.add_argument("-s", "--show", action="store_true") parser.add_argument("--help", action="help") parser.add_argument("--version", action="version") parser.add_argument("--verbose", 
action="store_true") parser.version = "CodExt " + __version__ args = parser.parse_args() - excl, s = [["base%d-generic" % i for i in range(2, 256)], []][args.extended], args.stop_function - if re.match(r"lang_[a-z]{2}$", s) and all(re.match(r"lang_[a-z]{2}$", x) is None for x in dir(stopfunc)): - stopfunc._reload_lang(stopfunc.LANG_BACKEND) - #TODO: validate args.stop_function - #TODO: make --stop-function and --pattern mutually exclusive - sfunc = getattr(stopfunc, s, s) - c = _input(args.file) + c, e = _input(args.file), [["base%d-generic" % i for i in range(2, 256)], []][args.extended] c = c.rstrip("\r\n") if isinstance(c, str) else c.rstrip(b"\r\n") - r = codecs.guess(c, sfunc, 0, args.max_depth, exclude=tuple(excl), codec_categories="base", - stop=False, show=args.verbose, scoring_heuristic=False, debug=args.verbose) + r = codecs.guess(c, stopfunc._validate(args.stop_function), 0, args.max_depth, "base", tuple(e), stop=False, + show=args.verbose, debug=args.verbose) if len(r) == 0: print("Could not decode :-(") return 0 From 0d132317fc4d214311d1f81071bbf916437d3532 Mon Sep 17 00:00:00 2001 From: dhondta Date: Sun, 27 Feb 2022 20:26:07 +0100 Subject: [PATCH 45/97] Improved codext tool --- codext/__init__.py | 115 +++++++++++++++++++++++++-------------------- 1 file changed, 65 insertions(+), 50 deletions(-) diff --git a/codext/__init__.py b/codext/__init__.py index 0fa49d5..692ab48 100644 --- a/codext/__init__.py +++ b/codext/__init__.py @@ -33,12 +33,23 @@ pattern=r"^uu(?:[-_]encode|codec)?$", add_to_codecs=True, category="native") -def __literal_eval(o): - """ Non-failing ast.literal_eval alias function. 
""" - try: - return literal_eval(str(o)) - except ValueError: - return literal_eval("'" + str(o) + "'") +def __format_list(items, include=True): + if items is None: + return + d = {-1: list_encodings() if include else []} + for n, i in enumerate(items): + try: + depth, i = i.split(":") + depth = int(depth.strip().replace("~", "-")) + if depth < 0: + depth = -1 + except ValueError: + if n == 0: + d[-1] = [] + depth = -1 + d.setdefault(depth, []) + d[depth].append(i.strip()) + return d def __print_tabular(lst, space=4): @@ -70,6 +81,19 @@ def __print_tabular(lst, space=4): def main(): import argparse, os + + class _CustomFormatter(argparse.RawTextHelpFormatter): + def __init__(self, prog, **kwargs): + kwargs['max_help_position'] = 32 + super(_CustomFormatter, self).__init__(prog, **kwargs) + + def _format_action_invocation(self, action): + if not action.option_strings: + metavar, = self._metavar_formatter(action, action.dest)(1) + return metavar + else: + return ", ".join(action.option_strings) + descr = "Codecs Extension (CodExt) {}\n\nAuthor : {} ({})\nCopyright: {}\nLicense : {}\nSource : {}\n" \ "\nThis tool allows to encode/decode input strings/files with an extended set of codecs.\n\n" \ .format(__version__, __author__, __email__, __copyright__, __license__, __source__) @@ -87,62 +111,68 @@ def main(): "echo -en \"test\" | codext encode base64 gzip | codext guess", "echo -en \"test\" | codext encode base64 gzip | codext guess gzip -c base", ]) - parser = argparse.ArgumentParser(description=descr, epilog=examples, formatter_class=argparse.RawTextHelpFormatter) - sparsers = parser.add_subparsers(dest="command", required=True, help="command to be executed") + kw = {'formatter_class': _CustomFormatter} + parser = argparse.ArgumentParser(description=descr, epilog=examples, **kw) + kw2 = {'required': True} if PY3 else {} + sparsers = parser.add_subparsers(dest="command", help="command to be executed", **kw2) parser.add_argument("-i", "--input-file", dest="infile", 
help="input file (if none, take stdin as input)") parser.add_argument("-o", "--output-file", dest="outfile", help="output file (if none, display result to stdout)") parser.add_argument("-s", "--strip-newlines", action="store_true", dest="strip", help="strip newlines from input (default: False)") - encode = sparsers.add_parser("encode", help="encode input using the specified codecs") + encode = sparsers.add_parser("encode", help="encode input using the specified codecs", **kw) encode.add_argument("encoding", nargs="+", help="list of encodings to apply") encode.add_argument("-e", "--errors", default="strict", choices=["ignore", "leave", "replace", "strict"], help="error handling (default: strict)") - decode = sparsers.add_parser("decode", help="decode input using the specified codecs") + decode = sparsers.add_parser("decode", help="decode input using the specified codecs", **kw) decode.add_argument("encoding", nargs="+", help="list of encodings to apply") decode.add_argument("-e", "--errors", default="strict", choices=["ignore", "leave", "replace", "strict"], help="error handling (default: strict)") - guess = sparsers.add_parser("guess", help="try guessing the decoding codecs") + guess = sparsers.add_parser("guess", help="try guessing the decoding codecs", **kw) guess.add_argument("encoding", nargs="*", help="list of known encodings to apply (default: none)") - guess.add_argument("-c", "--codec-categories", nargs="*", help="codec categories to be included in the search ; " - "format: string|tuple") - guess.add_argument("-d", "--min-depth", default=0, type=int, help="minimum codec search depth before triggering " - "results (default: 0)") - guess.add_argument("-D", "--max-depth", default=5, type=int, help="maximum codec search depth (default: 5)") - guess.add_argument("-e", "--exclude-codecs", nargs="*", help="codecs to be explicitely not used ; " - "format: string|tuple") + guess.add_argument("-e", "--exclude", nargs="*", action="extend", metavar="CAT|COD|ENC", + 
help="categories, codecs and encodings to be explicitely not used ;\n " + "format: [category|codec|encoding] OR depth:[category|codec|encoding]") guess.add_argument("-E", "--extended", action="store_true", help="while using the scoring heuristic, also consider null scores (default: False)") lng = "lang_%s" % LANG def_func = lng if getattr(stopfunc, lng, None) else "text" - guess.add_argument("-f", "--stop-function", default=def_func, help="result checking function (default: %s) ; " - "format: printables|text|flag|lang_[bigram]|[regex]\nNB: [regex] is case-sensitive ; add -i to " - "force it as case-insensitive or add '(?i)' in front of the expression" % def_func) - guess.add_argument("-i", "--case-insensitive", dest="icase", action="store_true", - help="while using the regex stop function, set it as case-insensitive (default: False)") + guess.add_argument("-f", "--stop-function", default=def_func, metavar="FUNC", help="result checking function " + "(default: %s) ; format: printables|text|flag|lang_[bigram]|[regex]\nNB: [regex] is case-" + "sensitive ; add -i to force it as case-insensitive or add '(?i)' in front of the expression" + % def_func) guess.add_argument("-H", "--no-heuristic", action="store_true", help="DO NOT use the scoring heuristic ; slows down" " the search but may be more accurate (default: False)") + guess.add_argument("-i", "--include", nargs="*", action="extend", metavar="CAT|COD|ENC", + help="categories, codecs and encodings to be explicitely used ;\n " + "format: [category|codec|encoding] OR depth:[category|codec|encoding]") + guess.add_argument("-I", "--case-insensitive", dest="icase", action="store_true", + help="while using the regex stop function, set it as case-insensitive (default: False)") if len(stopfunc.LANG_BACKENDS) > 0: _lb = stopfunc.LANG_BACKEND guess.add_argument("-l", "--lang-backend", default=_lb, choices=stopfunc.LANG_BACKENDS + ["none"], help="natural language detection backend (default: %s)" % _lb) + guess.add_argument("-m", 
"--min-depth", default=0, type=int, metavar="INT", + help="minimum codec search depth before triggering results (default: 0)") + guess.add_argument("-M", "--max-depth", default=5, type=int, metavar="INT", + help="maximum codec search depth (default: 5)") guess.add_argument("-s", "--do-not-stop", action="store_true", help="do not stop if a valid output is found (default: False)") guess.add_argument("-v", "--verbose", action="store_true", help="show guessing information and steps (default: False)") - rank = sparsers.add_parser("rank", help="rank the most probable encodings based on the given input") - rank.add_argument("-c", "--codec-categories", help="codec categories to be included in the search ; " - "format: string|tuple|list(strings|tuples)") - rank.add_argument("-e", "--exclude-codecs", help="codecs to be explicitely not used ; " - "format: string|tuple|list(strings|tuples)") + rank = sparsers.add_parser("rank", help="rank the most probable encodings based on the given input", **kw) + rank.add_argument("-c", "--codec-categories", nargs="*", action="extend", metavar="CATEGORY", + help="codec categories to be included in the search ; format: string|tuple|list(strings|tuples)") + rank.add_argument("-e", "--exclude-codecs", nargs="*", action="extend", metavar="CODEC", + help="codecs to be explicitely not used ; format: string|tuple|list(strings|tuples)") rank.add_argument("-E", "--extended", action="store_true", help="while using the scoring heuristic, also consider null scores (default: False)") rank.add_argument("-l", "--limit", type=int, default=10, help="limit the number of displayed results") search = sparsers.add_parser("search", help="search for codecs") search.add_argument("pattern", nargs="+", help="encoding pattern to search") listi = sparsers.add_parser("list", help="list items") - lsparsers = listi.add_subparsers(dest="type", required=True, help="type of item to be listed") + lsparsers = listi.add_subparsers(dest="type", help="type of item to be 
listed", **kw2) liste = lsparsers.add_parser("encodings", help="list encodings") - liste.add_argument("category", nargs="*", help="selected categories") + liste.add_argument("category", nargs="+", help="selected categories") listm = lsparsers.add_parser("macros", help="list macros") addm = sparsers.add_parser("add-macro", help="add a macro to the registry") addm.add_argument("name", help="macro's name") @@ -150,15 +180,7 @@ def main(): remm = sparsers.add_parser("remove-macro", help="remove a macro from the registry") remm.add_argument("name", help="macro's name") args = parser.parse_args() - try: - args.codec_categories = _lst(map(__literal_eval, args.codec_categories)) - except (AttributeError, TypeError): - pass - try: - args.exclude_codecs = _lst(map(__literal_eval, args.exclude_codecs)) - except (AttributeError, TypeError): - pass - #print(args.codec_categories, args.exclude_codecs) + args.include, args.exclude = __format_list(args.include), __format_list(args.exclude, False) try: # if a search pattern is given, only handle it if args.command == "search": @@ -211,17 +233,9 @@ def main(): all(re.match(r"lang_[a-z]{2}$", x) is None for x in dir(stopfunc)): stopfunc._reload_lang(lb) r = codecs.guess(c, - getattr(stopfunc, s, ["", "(?i)"][args.icase] + s), - args.min_depth, - args.max_depth, - args.codec_categories, - args.exclude_codecs, - args.encoding, - not args.do_not_stop, - True, # show - not args.no_heuristic, - args.extended, - args.verbose) + getattr(stopfunc, s, ["", "(?i)"][args.icase] + s), args.min_depth, args.max_depth, + args.include, args.exclude, args.encoding, not args.do_not_stop, True, # show + not args.no_heuristic, args.extended, args.verbose) for i, o in enumerate(r.items()): e, out = o if len(e) > 0: @@ -238,6 +252,7 @@ def main(): s = "[+] %.5f: %s" % (i[0], e) print(s if len(s) <= 80 else s[:77] + "...") except Exception as e: + raise e m = str(e) print("codext: " + m[0].lower() + m[1:]) From ad01045ad33093658cc19fe48b7deb6bddfc4c47 Mon 
Sep 17 00:00:00 2001 From: dhondta Date: Sun, 27 Feb 2022 20:26:18 +0100 Subject: [PATCH 46/97] Improved guess performance --- .coveragerc | 7 +- codext/__common__.py | 382 +++++++++++++++++++++++++------------------ tests/test_common.py | 22 +-- 3 files changed, 237 insertions(+), 174 deletions(-) diff --git a/.coveragerc b/.coveragerc index 0baf7fa..4ccc970 100644 --- a/.coveragerc +++ b/.coveragerc @@ -11,7 +11,7 @@ exclude_lines = def main\(\)\: def __stdin_pipe\(\)\: for line in __stdin_pipe\(\)\: - def __literal_eval\(o\)\: + def __format_list\(items, include\=True\)\: def __print_tabular\(lst, space\=4\)\: except ImportError: except NameError: @@ -20,3 +20,8 @@ exclude_lines = if PY3 def encode\(self, input, final\=False\)\: def decode\(self, input, final\=False\)\: + def _detect\(text\)\: + def _lang\(lang\)\: + if stopfunc\.LANG_BACKEND\: + def _validate\(stop_function, lang_backend\=\"none\"\)\: + except KeyboardInterrupt\: diff --git a/codext/__common__.py b/codext/__common__.py index 35d1fc5..e45fb1e 100644 --- a/codext/__common__.py +++ b/codext/__common__.py @@ -45,6 +45,7 @@ CODECS_REGISTRY = None CODECS_OVERWRITTEN = [] CODECS_CATEGORIES = ["native", "custom"] +CODECS_CACHE = {} LANG = getlocale() if LANG: LANG = (LANG[0] or "")[:2].lower() @@ -674,17 +675,16 @@ def list_categories(): # particular category, hardcoded from base/_base.py c += ["base-generic"] return c +list_categories() def list_encodings(*categories): """ Get a list of all codecs. 
""" - # first, determine the list of valid categories - valid_categories = list_categories() - # then, if "non-native" is in the input list, extend the list with the whole categories but "native" + # if "non-native" is in the input list, extend the list with the whole categories but "native" categories, exclude = list(categories), [] for c in categories[:]: if c == "non-native": - for c in valid_categories: + for c in CODECS_CATEGORIES: if c == "native" or c in categories: continue categories.append(c) @@ -714,7 +714,7 @@ def list_encodings(*categories): if (len(categories) == 0 or c in categories) and c not in exclude: enc.append(name) for category in categories: - if category not in valid_categories: + if category not in CODECS_CATEGORIES: raise ValueError("Category '%s' does not exist" % category) return sorted(list(set(enc)), key=_human_keys) @@ -1226,23 +1226,22 @@ def _load_lang_backend(backend=None): stopfunc._reload_lang = _load_lang_backend -def __develop(encodings): - """ Private method for developing the input list of encodings, trying to extend it with every encoding name. """ - enc = [] - for e in (encodings or []): - try: - ci = lookup(e, False) - g = ci.parameters['guess'] - except: - g = [e] - if e in g: # e.g. "rot-1" => ["rot-1", "rot-2", ...] ; only "rot-1" is to be selected - enc.append(e) - else: # e.g. "rot" => ["rot-1", "rot-2", ...] 
; all the "rot-N" shall be selected - enc.extend(g) - return enc +def _validate(stop_function, lang_backend="none"): + s, lb = stop_function, lang_backend + if isinstance(s, string_types): + if re.match(r"lang_[a-z]{2}$", s) and lb != "none" and \ + all(re.match(r"lang_[a-z]{2}$", x) is None for x in dir(stopfunc)): + stopfunc._reload_lang(lb) + f = getattr(stopfunc, s, None) + if f: + return f + elif not isinstance(s, FunctionType): + raise ValueError("Bad stop function") + return s +stopfunc._validate = _validate -def __guess(prev_input, input, stop_func, depth, max_depth, min_depth, codec_categories, exclude, result, found=(), +def __guess(prev_input, input, stop_func, depth, max_depth, min_depth, encodings, codecs, result, found=(), stop=True, show=False, scoring_heuristic=False, extended=False, debug=False): """ Perform a breadth-first tree search using a ranking logic to select and prune the list of codecs. """ if depth > min_depth and stop_func(input): @@ -1254,47 +1253,60 @@ def __guess(prev_input, input, stop_func, depth, max_depth, min_depth, codec_cat result[found] = input if depth >= max_depth or len(result) > 0 and stop: return - # compute included and excluded codecs for this depth - def expand(items, descr=None, transform=None): - items = items or [] - # format 1: when string, take it as the only items at any depth - if isinstance(items, string_types): - r = (items, ) - # format 2: when tuple, consider it as a list of items at any depth - elif isinstance(items, tuple): - r = items - # format 3: when list, consider it as the list of tuples of items with the order number corresponding to the - # applicable depth - elif isinstance(items, list): - try: - r = items[depth] or () - if isinstance(r, string_types): - r = (r, ) - except IndexError: - r = () - else: - raise ValueError("Bad %sformat %s" % (["%s " % descr, ""][descr is None], items)) - return r if transform is None else transform(*r) - # parse valid encodings, expanding included/excluded codecs - 
c, e = expand(codec_categories, "codec_categories", list_encodings), __develop(expand(exclude, "exclude")) prev_enc = found[-1] if len(found) > 0 else "" - for new_input, encoding in __rank(prev_input, input, prev_enc, c, scoring_heuristic, extended): + e = encodings.get(depth, encodings.get(-1, [])) + for new_input, encoding in __rank(prev_input, input, prev_enc, e, codecs, scoring_heuristic, extended): if len(result) > 0 and stop: return - if encoding in e: - continue if debug: print("[*] Depth %0{}d/%d: %s".format(len(str(max_depth))) % (depth+1, max_depth, encoding)) - __guess(input, new_input, stop_func, depth+1, max_depth, min_depth, codec_categories, exclude, result, + __guess(input, new_input, stop_func, depth+1, max_depth, min_depth, encodings, codecs, result, found + (encoding, ), stop, show, scoring_heuristic, extended, debug) -def __rank(prev_input, input, prev_encoding, codecs, heuristic=False, extended=False, yield_score=False): +def __make_encodings_dict(include, exclude): + """ Process encodings inclusion and exclusion lists, listing categories and developping codecs' lists of possible + encoding names. It also creates a cache with the CodecInfo objects for improving performance. """ + codecs = {} + def _develop(d, keep=True): + d = d or {} + for k, v in d.items(): + l, cc = [], [e for e in v if e in CODECS_CATEGORIES] + for enc in (list_encodings(*cc) if len(cc) > 0 or keep else [] + \ + [e for e in v if e not in CODECS_CATEGORIES]): + try: + g = lookup(enc, False).parameters['guess'] + except: + g = [enc] + if enc in g and not keep: # e.g. "rot-1" => ["rot-1", "rot-2", ...] ; only "rot-1" is to be selected + l.append(enc) + else: # e.g. "rot" => ["rot-1", "rot-2", ...] 
; all the "rot-N" shall be selected + l.extend(g) + d[k] = l + if keep: + for e in l: + # cache newly loaded CodecInfo objects + ci = lookup(e, False) + n = ci.name + if n in CODECS_CACHE: + ci = CODECS_CACHE[n] # keep the cached object + else: + CODECS_CACHE[n] = ci # cache the new object + codecs[e] = ci + return d + exclude = _develop(exclude, False) + return {k: [x for x in v if x not in exclude.get(k, [])] for k, v in _develop(include).items()}, codecs + + +def __rank(prev_input, input, prev_encoding, encodings, codecs, heuristic=False, extended=False, yield_score=False): """ Filter valid encodings and rank them by relevance. """ ranking = {} - for codec in codecs: - for score, new_input, encoding in __score(prev_input, input, prev_encoding, codec, heuristic, extended): - ranking[encoding] = (score, new_input) + for encoding in encodings: + try: + score, new = __score(prev_input, input, prev_encoding, encoding, codecs.get(encoding), heuristic, extended) + except TypeError: + continue + ranking[encoding] = (score, new) for encoding, result in sorted(ranking.items(), key=lambda x: -x[1][0]): yield result if yield_score else result[1], encoding @@ -1304,7 +1316,7 @@ class _Text(object): def __init__(self, text, pad_char=None): c = text[-1] - last_char = c if isinstance(c, int) else ord(c) + pad_char, last_char = (b(pad_char), c) if isinstance(c, int) else (pad_char, ord(c)) self.padding = pad_char is not None and last_char == ord(pad_char) if self.padding: text = text.rstrip(pad_char) @@ -1314,136 +1326,182 @@ def __init__(self, text, pad_char=None): self.entropy = entropy(text) -def __score(prev_input, input, prev_encoding, codec, heuristic=False, extended=False): +def __score(prev_input, input, prev_encoding, encoding, codec, heuristic=False, extended=False): """ Score relevant encodings given an input. """ - obj, ci = None, lookup(codec, False) # NB: lookup(...) won't fail as the codec value comes from list_encodings(...) 
- sc = ci.parameters.get('scoring', {}) - no_error, transitive = ci.parameters.get('no_error', False), sc.get('transitive', False) - for encoding in ci.parameters.get('guess', [codec]): - # ignore encodings that fail to decode with their default errors handling value - try: - new_input = decode(input, encoding) - except: - continue - # ignore encodings that give an output identical to the input (identity transformation) or to the previous input - if len(new_input) == 0 or prev_input is not None and b(input) == b(new_input) or b(prev_input) == b(new_input): - continue - # ignore encodings that transitively give the same output (identity transformation by chaining twice a same - # codec (e.g. rot-15 is equivalent to rot-3 and rot-12 or rot-6 and rot-9) - if transitive and prev_encoding: - ci_prev = lookup(prev_encoding, False) - if ci_prev.parameters['name'] == ci.parameters['name']: - continue - # compute input's characteristics only once and only if the control flow reaches this point - pad = sc.get('padding_char') - if obj is None: - obj = _Text(input, pad) - if heuristic: - # from here, the goal (e.g. if the input is Base32) is to rank candidate encodings (e.g. 
multiple base - # codecs) so that we can put the right one as early as possible and eventually exclude bad candidates - s = -sc.get('penalty', .0) - # first, apply a bonus if the length of input text's charset is exactly the same as encoding's charset ; - # on the contrary, if the length of input text's charset is strictly greater, give a penalty - lcs = sc.get('len_charset', 256) - if isinstance(lcs, type(lambda: None)): - lcs = int(lcs(encoding)) - if (pad and obj.padding and lcs + 1 >= obj.lcharset) or lcs >= obj.lcharset: - s += max(.0, round(.6 * (.99 ** (lcs - obj.lcharset)), 5) - .1) - elif (pad and obj.padding and lcs + 1 < obj.lcharset) or lcs < obj.lcharset: - s -= .2 # this can occur for encodings with no_error set to True - # then, take padding into account, giving a bonus if padding is to be encountered and effectively present, - # or a penalty when it should not be encountered but it is present - if pad and obj.padding: - s += .2 # when padding is encountered while it is legitimate, it could be a good indication => bonus - elif not pad and obj.padding: - s -= .1 # it could arise a padding character is encountered while not being padding => small penalty - # give a bonus when the rate of printable characters is greater or equal than expected and a penalty when - # lower only for codecs that DO NOT tolerate errors (otherwise, the printables rate can be biased) - if not no_error: - pr = sc.get('printables_rate', 0) - if isinstance(pr, type(lambda: None)): - pr = float(pr(obj.printables)) - if obj.printables - pr <= .05: - s += .1 - expf = sc.get('expansion_factor', 1.) 
- if expf: - f = obj.len / float(len(new_input)) # expansion while encoding => at decoding: 1/f - if isinstance(expf, type(lambda: None)): - try: # this case allows to consider the current encoding name from the current codec - expf = expf(f, encoding) - except TypeError: - expf = expf(f) - if isinstance(expf, (int, float)): - expf = (1/f - .1 <= 1/expf <= 1/f + .1) - elif isinstance(expf, (tuple, list)) and len(expf) == 2: - expf = 1/f - expf[1] <= 1/expf[0] <= 1/f + expf[1] - s += [-1., .1][expf] - # afterwards, if the input text has an entropy close to the expected one, give a bonus weighted on the - # number of input characters to take bad entropies of shorter strings into account - entr = sc.get('entropy', {}) - entr = entr.get(encoding, entr.get('default')) if isinstance(entr, dict) else entr - if isinstance(entr, type(lambda: None)): + obj = None + sc = codec.parameters.get('scoring', {}) + no_error, transitive = codec.parameters.get('no_error', False), sc.get('transitive', False) + # ignore encodings that fail to decode with their default errors handling value + try: + new_input = decode(input, encoding) + except: + return + # ignore encodings that give an output identical to the input (identity transformation) or to the previous input + if len(new_input) == 0 or prev_input is not None and b(input) == b(new_input) or b(prev_input) == b(new_input): + return + # ignore encodings that transitively give the same output (identity transformation by chaining twice a same + # codec (e.g. rot-15 is equivalent to rot-3 and rot-12 or rot-6 and rot-9) + if transitive and prev_encoding: + ci_prev = lookup(prev_encoding, False) + if ci_prev.parameters['name'] == codec.parameters['name']: + return + # compute input's characteristics only once and only if the control flow reaches this point + pad = sc.get('padding_char') + if obj is None: + obj = _Text(input, pad) + if heuristic: + # from here, the goal (e.g. if the input is Base32) is to rank candidate encodings (e.g. 
multiple base + # codecs) so that we can put the right one as early as possible and eventually exclude bad candidates + s = -sc.get('penalty', .0) + # first, apply a bonus if the length of input text's charset is exactly the same as encoding's charset ; + # on the contrary, if the length of input text's charset is strictly greater, give a penalty + lcs = sc.get('len_charset', 256) + if isinstance(lcs, type(lambda: None)): + lcs = int(lcs(encoding)) + if (pad and obj.padding and lcs + 1 >= obj.lcharset) or lcs >= obj.lcharset: + s += max(.0, round(.6 * (.99 ** (lcs - obj.lcharset)), 5) - .1) + elif (pad and obj.padding and lcs + 1 < obj.lcharset) or lcs < obj.lcharset: + s -= .2 # this can occur for encodings with no_error set to True + # then, take padding into account, giving a bonus if padding is to be encountered and effectively present, + # or a penalty when it should not be encountered but it is present + if pad and obj.padding: + s += .2 # when padding is encountered while it is legitimate, it could be a good indication => bonus + elif not pad and obj.padding: + s -= .1 # it could arise a padding character is encountered while not being padding => small penalty + # give a bonus when the rate of printable characters is greater or equal than expected and a penalty when + # lower only for codecs that DO NOT tolerate errors (otherwise, the printables rate can be biased) + if not no_error: + pr = sc.get('printables_rate', 0) + if isinstance(pr, type(lambda: None)): + pr = float(pr(obj.printables)) + if obj.printables - pr <= .05: + s += .1 + expf = sc.get('expansion_factor', 1.) 
+ if expf: + f = obj.len / float(len(new_input)) # expansion while encoding => at decoding: 1/f + if isinstance(expf, type(lambda: None)): try: # this case allows to consider the current encoding name from the current codec - entr = entr(obj.entropy, encoding) + expf = expf(f, encoding) except TypeError: - entr = entr(obj.entropy) - if entr is not None: - # use a quadratic heuristic to compute a weight for the entropy delta, aligned on (100w,.1) and (200w,1) - d_entr = min(4e-05 * obj.len**2 - .003 * obj.len, 1) * abs(entr - entropy(new_input)) - if d_entr <= .5: - s += .5 - d_entr - # finally, if relevant, apply a custom bonus (e.g. when a regex pattern is matched) - bonus = sc.get('bonus_func') - if bonus is not None: - if isinstance(bonus, type(lambda: None)): - bonus = bonus(obj, ci, encoding) - if bonus: - s += .2 - else: - s = 1. - # exclude negative (and eventually null) scores as they are (hopefully) not relevant - if extended and s >= .0 or not extended and s > .0: - yield s, new_input, encoding + expf = expf(f) + if isinstance(expf, (int, float)): + expf = (1/f - .1 <= 1/expf <= 1/f + .1) + elif isinstance(expf, (tuple, list)) and len(expf) == 2: + expf = 1/f - expf[1] <= 1/expf[0] <= 1/f + expf[1] + s += [-1., .1][expf] + # afterwards, if the input text has an entropy close to the expected one, give a bonus weighted on the + # number of input characters to take bad entropies of shorter strings into account + entr = sc.get('entropy', {}) + entr = entr.get(encoding, entr.get('default')) if isinstance(entr, dict) else entr + if isinstance(entr, type(lambda: None)): + try: # this case allows to consider the current encoding name from the current codec + entr = entr(obj.entropy, encoding) + except TypeError: + entr = entr(obj.entropy) + if entr is not None: + # use a quadratic heuristic to compute a weight for the entropy delta, aligned on (100w,.1) and (200w,1) + d_entr = min(4e-05 * obj.len**2 - .003 * obj.len, 1) * abs(entr - entropy(new_input)) + if 
d_entr <= .5: + s += .5 - d_entr + # finally, if relevant, apply a custom bonus (e.g. when a regex pattern is matched) + bonus = sc.get('bonus_func') + if bonus is not None: + if isinstance(bonus, type(lambda: None)): + bonus = bonus(obj, codec, encoding) + if bonus: + s += .2 + else: + s = 1. + # exclude negative (and eventually null) scores as they are (hopefully) not relevant + if extended and s >= .0 or not extended and s > .0: + return s, new_input -def guess(input, stop_func=stopfunc.default, min_depth=0, max_depth=5, codec_categories=None, exclude=None, found=(), +def guess(input, stop_func=stopfunc.default, min_depth=0, max_depth=5, include=None, exclude=None, found=(), stop=True, show=False, scoring_heuristic=True, extended=False, debug=False): - """ Try decoding without the knowledge of the encoding(s). """ + """ Try decoding without the knowledge of the encoding(s). + + :param input: input text to be guessed + :param stop_func: function defining the stop condition + :param min_depth: minimum search depth + :param max_depth: maximum search depth + ;param include: inclusion item OR list with category, codec or encoding names OR dictionary with lists per + depth (nothing means include every encoding) + :param exclude: exclusion item OR list with category, codec or encoding names OR dictionary with lists per + depth (nothing means exclude no encoding) + :param found: tuple of already found encodings + :param stop: whether to stop or not when a valid solution is found + :param show: whether to immediately show once a solution is found + :param scoring_heuristic: whether to apply the scoring heuristic during the search (if disabled, all scores are 1., + meaning that every non-failing encoding will be considered with no order of precedence) + :param extended: whether to also consider null scores with the heuristic + :param debug: whether to show each attempt at each depth during computation + """ + if len(input) == 0: + return "" + # check for min and max 
depths if max_depth <= 0: raise ValueError("Depth must be a non-null positive integer") if min_depth > max_depth: raise ValueError("Min depth shall be less than or equal to the max depth") + # take the tuple of found encodings into account if len(found) > 0: for encoding in found: input = decode(input, encoding) + # handle the stop function as a regex if a string was given if isinstance(stop_func, string_types): stop_func = stopfunc.regex(stop_func) + # reformat include and exclude arguments ; supported formats: + for n, l in zip(["inc", "exc"], [include, exclude]): + if l is None: + if n == "inc": + include = l = {-1: CODECS_CATEGORIES} + else: + exclude = l = {} + # "category" OR "enc_name" OR whatever => means a single item for all depths + if isinstance(l, string_types): + if n == "inc": + include = l = {-1: [l]} + else: + exclude = l = {-1: [l]} + # ["enc_name1", "enc_name2", ...] => means for all depths + if isinstance(l, (list, tuple)): + if n == "inc": + include = l = {-1: l} + else: + exclude = l = {-1: l} + # {-1: [...], 2: [...], ...} => means prefedined depths with their lists of in-/excluded encodings + if not isinstance(l, dict) or not all(isinstance(k, int) for k in l.keys()): + raise ValueError("Include argument shall be a list or a dictionary with integer keys") + # precompute encodings lists per depth and cache the related CodecInfo objects + encodings, codecs = __make_encodings_dict(include, exclude) result = {} - if len(input) > 0: - try: - # breadth-first search - for d in range(max_depth): - __guess("", input, stop_func, 0, d+1, min_depth, codec_categories, exclude, result, tuple(found), stop, - show, scoring_heuristic, extended, debug) - if stop and len(result) > 0: - return result - except KeyboardInterrupt: - pass + try: + # breadth-first search + for d in range(max_depth): + __guess("", input, stop_func, 0, d+1, min_depth, encodings, codecs, result, tuple(found), stop, show, + scoring_heuristic, extended, debug) + if stop and len(result) > 
0: + break + except KeyboardInterrupt: + pass + CODECS_CACHE = {} return result codecs.guess = guess -def rank(input, extended=False, limit=-1, codec_categories=None, exclude=None): - """ Rank the most probable encodings based on the given input. """ - if isinstance(codec_categories, string_types): - codec_categories = (codec_categories, ) - codecs = list_encodings(*(codec_categories or ())) - for e in __develop(exclude): - try: - codecs.remove(e) - except ValueError: - pass - r = list(__rank(None, input, "", codecs, True, extended, True)) +def rank(input, extended=False, limit=-1, include=None, exclude=None): + """ Rank the most probable encodings based on the given input. + + :param input: input text to be evaluated + :param extended: whether to consider null scores too (NB: negative scores are not output !) + :param limit: number of encodings to be returned (-1 means all of them) + :param include: inclusion list with category, codec or encoding names (nothing means include every encoding) + :param exclude: exclusion list with category, codec or encoding names (nothing means exclude no encoding) + """ + encodings, codecs = __make_encodings_dict({0: include or CODECS_CATEGORIES}, {0: exclude or []}) + r = list(__rank(None, input, "", encodings[0], codecs, True, extended, True)) + CODECS_CACHE = {} return r[:limit] if len(r) > 1 else r codecs.rank = rank diff --git a/tests/test_common.py b/tests/test_common.py index a35abfd..6eddd7e 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -148,14 +148,15 @@ def test_encode_multiple_rounds(self): def test_guess_decode(self): self.assertIsNone(codext.stopfunc._reload_lang()) + self.assertIsNotNone(codext.stopfunc._validate("flag")) _l = lambda d: list(d.items())[0][1] if len(d) > 0 else None codext.add("test_codec", lambda x, e="strict": (x + "=", len(x)), lambda x, e="strict": (x[:-1], len(x)-1), "test", padding_char="=", no_error=True, bonus_func=lambda *a: True, penalty=-.5) self.assertIn("test-codec", 
codext.list_encodings("test")) self.assertEqual(codext.decode("TEST=", "test"), "TEST") - self.assertEqual(list(codext.guess("TEST=", codext.stopfunc.text, codec_categories="test", max_depth=2, + self.assertEqual(list(codext.guess("TEST=", codext.stopfunc.text, include="test", max_depth=2, scoring_heuristic=False).items())[0][1], "TEST") - self.assertEqual(list(codext.guess("TEST=", codext.stopfunc.text, codec_categories=["test", "base"], + self.assertEqual(list(codext.guess("TEST=", codext.stopfunc.text, include=["test", "base"], max_depth=2).items())[0][1], "TEST") STR = "This is a test" self.assertEqual(STR, _l(codext.guess("VGhpcyBpcyBhIHRlc3Q=", "a test", max_depth=1))) @@ -163,12 +164,12 @@ def test_guess_decode(self): self.assertEqual(STR, _l(codext.guess("VGhpcyBpcyBhIHRlc3Q=", "a test", 0, 1, "base", scoring_heuristic=True, exclude=["base100"]))) self.assertEqual(STR, _l(codext.guess("VGhpcyBpcyBhIHRlc3Q=", "a test", 0, 1, ["base", "crypto"]))) - self.assertEqual(len(codext.guess("NOT THE ENCODED TEST STRING", "a test", max_depth=1, exclude=[None])), 0) + self.assertEqual(len(codext.guess("NOT THE ENCODED TEST STRING", "a test", max_depth=1, exclude=None)), 0) self.assertIn("F1@9", _l(codext.guess("VGVzdCBGMUA5ICE=", codext.stopfunc.flag, max_depth=1, stop=False, show=True))) - self.assertEqual(len(codext.guess("VGhpcyBpcyBhIHRlc3Q=", " a test", max_depth=1, codec_categories="base", + self.assertEqual(len(codext.guess("VGhpcyBpcyBhIHRlc3Q=", " a test", max_depth=1, include="base", exclude=("base64", "base64-url"))), 0) - self.assertEqual(len(codext.guess("VGhpcyBpcyBhIHRlc3Q=", " a test", max_depth=1, codec_categories="base", + self.assertEqual(len(codext.guess("VGhpcyBpcyBhIHRlc3Q=", " a test", max_depth=1, include="base", scoring_heuristic=True, exclude=("base64", "base64-url", "atbash"))), 0) self.assertRaises(ValueError, codext.guess, STR, max_depth=0) self.assertRaises(ValueError, codext.guess, STR, exclude=42) @@ -198,8 +199,7 @@ def 
test_guess_decode(self): self.assertEqual(encoding, found_encodings[0]) txt = "".join(chr(i) for i in range(256)) b64 = codext.encode(txt, "base64") - self.assertEqual(txt, _l(codext.guess(b64, "0123456789", max_depth=1, scoring_heuristic=True, - codec_categories="base"))) + self.assertEqual(txt, _l(codext.guess(b64, "0123456789", max_depth=1, scoring_heuristic=True, include="base"))) self.assertRaises(ValueError, codext.stopfunc._reload_lang, "DOES_NOT_EXIST") def test_rank_input(self): @@ -210,10 +210,10 @@ def test_rank_input(self): self.assertTrue(len(codext.rank(ENC)) > 20) self.assertEqual(len(codext.rank(ENC, limit=20)), 20) self.assertEqual(codext.rank(ENC, exclude=["rot"])[0][1], "base64") - self.assertEqual(codext.rank(ENC, codec_categories="base")[0][0][1], STR) - self.assertEqual(codext.rank(ENC, codec_categories=["base"])[0][0][1], STR) - self.assertIsNotNone(codext.rank(ENC, codec_categories=["base"], exclude=["does_not_exist"])[0][0][1], STR) - self.assertIsNotNone(codext.rank("TEST=", codec_categories=["test", "base"])[0][0][1], "TEST") + self.assertEqual(codext.rank(ENC, include="base")[0][0][1], STR) + self.assertEqual(codext.rank(ENC, include=["base"])[0][0][1], STR) + self.assertIsNotNone(codext.rank(ENC, include=["base"], exclude=["does_not_exist"])[0][0][1], STR) + self.assertIsNotNone(codext.rank("TEST=", include=["test", "base"])[0][0][1], "TEST") def test_handle_macros(self): MACRO = "test-macro-f2ca1bb6c7e907d06dafe4687e579fce76b37e4e93b7605022da52e6ccc26fd2" From c79e2bdf6633df8e1c643afeea728fc6033315b9 Mon Sep 17 00:00:00 2001 From: dhondta Date: Sun, 27 Feb 2022 20:32:16 +0100 Subject: [PATCH 47/97] Fixed codec: baudot --- codext/binary/baudot.py | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/codext/binary/baudot.py b/codext/binary/baudot.py index ae5fc32..a57e1ea 100755 --- a/codext/binary/baudot.py +++ b/codext/binary/baudot.py @@ -10,9 +10,9 @@ from ..__common__ import * -__CODES 
= ["ccitt1", "ccitt2", "eu", "ita1", "ita2", "ita2_us", "murray", "uk"] +__CODES = ["ccitt1", "ccitt2", "eu", "ita1", "ita2", "ita2_us"] if PY3: - __CODES.extend(["ita2_meteo", "mtk2"]) + __CODES.extend(["ita2_meteo", "mtk2", "murray", "uk"]) __guess__ = ["baudot%s-{}-{}".format(x, y) for x in __CODES for y in ["lsb", "msb"]] __examples1__ = { 'enc(baudot-BAD_ALPHABET)': None, @@ -51,7 +51,7 @@ PATTERN = r"^baudot%s([-_](?:ccitt1|ccitt2|eu|fr|ita1|ita2|ita2[-_](?:us" + (r"|meteo" if PY3 else r"") + r")" + \ - (r"|mtk2" if PY3 else r"") + r"|murray|uk|us_tty)(?:[-_](?:lsb|msb))?)?$" + (r"|mtk2|murray|uk" if PY3 else r"") + r"|us_tty)(?:[-_](?:lsb|msb))?)?$" # reserved character RES_CHR = "\xff" @@ -116,20 +116,22 @@ "\x003\n- '87\r\xff4Ю,Э:(5+)2Щ6019?Ш\xff./=\xff", ] # Murray code ; NB: not all fractions are supported (source: https://en.wikipedia.org/wiki/Baudot_code) -MURRAY = [ - "00100", "11011", - " E\xffA\xffSIU\nDRJNFCKTZLWHYPQOBF\xffMXV\x7f", - "\x003\xff\xff\xff'87\n²4\xff-\u215f(\xff5./2\xff6019?\xff\xff,£)*" if PY3 else \ - "\x003\xff\xff\xff'87\n²4\xff-\u215f(\xff5./2\xff6019?\xff\xff,$)*", -] +if PY3: + MURRAY = [ + "00100", "11011", + " E\xffA\xffSIU\nDRJNFCKTZLWHYPQOBF\xffMXV\x7f", + "\x003\xff\xff\xff'87\n²4\xff-\u215f(\xff5./2\xff6019?\xff\xff,£)*" if PY3 else \ + "\x003\xff\xff\xff'87\n²4\xff-\u215f(\xff5./2\xff6019?\xff\xff,$)*", + ] # English Baudot ; NB: not all fractions are supported (sources: https://fr.qwe.wiki/wiki/Baudot_code # https://en.wikipedia.org/wiki/Baudot_code) -UK = [ - "10000", "01000", - "\x00AE/YUIO\xffJGHBCFD -XZSTWV\x7fKMLRQNP", - "\x0012\u215f34\xff5 67\xb989\xff0\xff.\xff:\xff²?'\x7f()=-/£+" if PY3 else \ - "\x0012\xff34\xff5 67\xb989\xff0\xff.\xff:\xff²?'\x7f()=-/$+", -] +if PY3: + UK = [ + "10000", "01000", + "\x00AE/YUIO\xffJGHBCFD -XZSTWV\x7fKMLRQNP", + "\x0012\u215f34\xff5 67\xb989\xff0\xff.\xff:\xff²?'\x7f()=-/£+" if PY3 else \ + "\x0012\xff34\xff5 67\xb989\xff0\xff.\xff:\xff²?'\x7f()=-/$+", + ] def 
_bits_from_tape(tape, trans={'*': "1", ' ': "0"}): From 3076c9ff0a1182caf5a361e54e9d99887b7f5348 Mon Sep 17 00:00:00 2001 From: dhondta Date: Sun, 27 Feb 2022 21:06:40 +0100 Subject: [PATCH 48/97] New release --- codext/VERSION.txt | 2 +- codext/__init__.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/codext/VERSION.txt b/codext/VERSION.txt index 89c881b..feaae22 100644 --- a/codext/VERSION.txt +++ b/codext/VERSION.txt @@ -1 +1 @@ -1.12.4 +1.13.0 diff --git a/codext/__init__.py b/codext/__init__.py index 692ab48..486dd2f 100644 --- a/codext/__init__.py +++ b/codext/__init__.py @@ -180,7 +180,8 @@ def _format_action_invocation(self, action): remm = sparsers.add_parser("remove-macro", help="remove a macro from the registry") remm.add_argument("name", help="macro's name") args = parser.parse_args() - args.include, args.exclude = __format_list(args.include), __format_list(args.exclude, False) + if args.command in ["guess", "rank"]: + args.include, args.exclude = __format_list(args.include), __format_list(args.exclude, False) try: # if a search pattern is given, only handle it if args.command == "search": From 803e211c1922dd8e98a0d77eb466ec4d8eeb7e75 Mon Sep 17 00:00:00 2001 From: dhondta Date: Mon, 28 Feb 2022 09:22:40 +0100 Subject: [PATCH 49/97] Added codec: base11 --- codext/base/baseN.py | 7 ++++++- docs/enc/base.md | 31 +++++++++++-------------------- 2 files changed, 17 insertions(+), 21 deletions(-) diff --git a/codext/base/baseN.py b/codext/base/baseN.py index 3c63453..cf4abe4 100755 --- a/codext/base/baseN.py +++ b/codext/base/baseN.py @@ -39,10 +39,15 @@ B10 = {r'': "0123456789"} -base(B10, r"^(?:base[-_]?10|int(?:eger)?)$") +base(B10, r"^(?:base[-_]?10|int(?:eger)?|dec(?:imal)?)$") main10 = main(10) +B11 = {r'': "0123456789a", r'[-_]inv(erted)?$': "a0123456789"} +base(B11, r"^base[-_]?11(|[-_]inv(?:erted)?)$") +main11 = main(11) + + B16 = {'': digits + "ABCDEF", '[-_]inv(erted)?$': "ABCDEF" + digits} base2n(B16, 
r"^(?:base[-_]?16|hex)(|[-_]inv(?:erted)?)$", expansion_factor=2.) main16 = main(16, "RFC 4648") diff --git a/docs/enc/base.md b/docs/enc/base.md index 73b78ff..757965e 100644 --- a/docs/enc/base.md +++ b/docs/enc/base.md @@ -12,12 +12,12 @@ Common base encodings with N a power of 2: **Codec** | **Conversions** | **Aliases** | **Comment** :---: | :---: | --- | --- -`base2` | text <-> Base2 encoded text | `(base[-_]?2|bin)-inv(erted)?` | -`base4` | text <-> Base4 encoded text | `base[-_]?4-inv(erted)` | charset: `1234` -`base8` | text <-> Base8 encoded text | `base[-_]?8-inv(erted)` | charset: `abcdefgh` +`base2` | text <-> Base2 encoded text | `(base[-_]?2|bin)-inv(erted)?` | Dynamic charset parameter `[-_]...`, amongst letters and digits (e.g. `_AB`) +`base4` | text <-> Base4 encoded text | `base[-_]?4-inv(erted)` | Dynamic charset parameter `[-_]...`, amongst letters and digits (e.g. `_6VC9`) +`base8` | text <-> Base8 encoded text | `base[-_]?8-inv(erted)` | Charset: `abcdefgh` ; Dynamic charset parameter `[-_]...`, amongst letters and digits (e.g. `_A5c96T7x`) `base16` | text <-> Base16 encoded text | `base[-_]?16-inv(erted)` | -`base32` | text <-> Base32 encoded text | `base[-_]?32-inv(erted)` | -`zbase32` | text <-> ZBase32 encoded text | `z[-_]?base[-_]?32` | human-oriented Base32 +`base32` | text <-> Base32 encoded text | `base[-_]?32-inv(erted)`, `base32-crockford`, `base32_geohash`, ... | Also supports Base32 Crockford, Geohash and Hex +`zbase32` | text <-> ZBase32 encoded text | `z[-_]?base[-_]?32` | Human-oriented Base32 `base64` | text <-> Base64 encoded text | `base[-_]?64-inv(erted)` | !!! 
note "Aliases" @@ -62,10 +62,12 @@ Note that for `base64`, it overwrites the native `base64_codec` to also support **Codec** | **Conversions** | **Aliases** | **Comment** :---: | :---: | --- | --- -`base3` | text <-> Base3 encoded text | `base[-_]?36(|[-_]inv(erted)?)` | +`base3` | text <-> Base3 encoded text | `base[-_]?3(|[-_]inv(erted)?)` | Dynamic charset parameter `[-_]...`, amongst letters and digits (e.g. `_C2Z`) +`base10` | text <-> Base10 encoded text | `base[-_]?10|int(?:eger)?|dec(?:imal)?` | +`base11` | text <-> Base11 encoded text | `base[-_]?11(|[-_]inv(erted)?)` | `base36` | text <-> Base36 encoded text | `base[-_]?36(|[-_]inv(erted)?)` | `base45` | text <-> Base45 encoded text | `base[-_]?45(|[-_]inv(erted)?)` | -`base58` | text <-> Base58 encoded text | `base[-_]?58(|[-_](bc|bitcoin|rp|ripple|fl|flickr|short[-]?url|url))` | supports Bitcoin, Ripple and short URL +`base58` | text <-> Base58 encoded text | `base[-_]?58(|[-_](bc|bitcoin|rp|ripple|fl|flickr|short[-]?url|url))` | Supports Bitcoin, Ripple and short URL `base62` | text <-> Base62 encoded text | `base[-_]?62(|[-_]inv(erted)?)` | `base63` | text <-> Base63 encoded text | `base[-_]?63(|[-_]inv(erted)?)` | `base91` | text <-> Base91 encoded text | `base[-_]?91(|[-_]inv(erted)?)` | @@ -131,11 +133,7 @@ This encoding implements various different versions of Base85. **Codec** | **Conversions** | **Aliases** | **Comment** :---: | :---: | --- | --- -`base85` | text <-> ascii85 | `ascii85` | -`base85` | text <-> z85 | `z85`, `base85-zeromq` | -`base85` | text <-> base85-ipv6 | `base85-ipv6`, `base85-rfc1924` | -`base85` | text <-> base85-adobe | `base85-adobe` | -`base85` | text <-> base85-btoa | `base85-btoa`, `base85-xbtoa` | +`base85` | text <-> ascii85 | `(base[-_]?85(?:|[-_](?:adobe|x?btoa|ipv6|rfc1924|xml|z(?:eromq)?))|z85|ascii85)` | ```python >>> codext.encode("this is a test", "ascii85") @@ -156,16 +154,9 @@ This encoding implements various different versions of Base85. 
**Codec** | **Conversions** | **Aliases** | **Comment** :---: | :---: | --- | --- -`base85` | text <-> Base85 encoded text | `base[-_]?85` | Python 3 only (relies on `base64` module) `base100` | text <-> Base100 encoded text | `base[-_]?100|emoji` | Python 3 only `base122` | text <-> Base122 encoded text | `base[-_]?122` | Python 3 only - -```python ->>> codecs.encode("this is a test", "base85") -'bZBXFAZc?TVIXv6b94' ->>> codecs.decode("bZBXFAZc?TVIXv6b94", "base85") -'this is a test' -``` +`base128` | text <-> Base128 encoded text | `base[-_]?128` | Relies on the ASCII charset ```python >>> codecs.encode("this is a test", "base100") From b37e8a15b9b1f309a5229433e65318f16bd47b3c Mon Sep 17 00:00:00 2001 From: dhondta Date: Mon, 28 Feb 2022 09:22:54 +0100 Subject: [PATCH 50/97] Improved docs about crypto --- README.md | 1 + docs/enc/crypto.md | 13 ++++--------- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 0b69bfb..2ce70be 100644 --- a/README.md +++ b/README.md @@ -219,6 +219,7 @@ o - [X] `base4`: conversion to quarternary (with a variant with a reversed alphabet) - [X] `base8`: simple conversion to octal (with a variant with a reversed alphabet) - [X] `base10`: simple conversion to decimal +- [X] `base11`: conversion to digits with a "*a*" - [X] `base16`: simple conversion to hexadecimal (with a variant holding an alphabet with digits and letters inverted) - [X] `base26`: conversion to alphabet letters - [X] `base32`: classical conversion according to the RFC4648 with all its variants ([zbase32](https://philzimmermann.com/docs/human-oriented-base-32-encoding.txt), extended hexadecimal, [geohash](https://en.wikipedia.org/wiki/Geohash), [Crockford](https://www.crockford.com/base32.html)) diff --git a/docs/enc/crypto.md b/docs/enc/crypto.md index 974f49d..e59ab0f 100644 --- a/docs/enc/crypto.md +++ b/docs/enc/crypto.md @@ -152,9 +152,8 @@ This is a dynamic encoding, that is, it can be called with an integer to define 
**Codec** | **Conversions** | **Aliases** | **Comment** :---: | :---: | --- | --- -`rot` | text <-> rot(1) ciphertext | `rot1`, `rot-1`, `rot_1`, `caesar1` | -`rot` | text <-> rot(X) ciphertext | ... | -`rot` | text <-> rot(25) ciphertext | `rot25`, `rot-25`, `rot_25`, `caesar25` | +`rot` | text <-> rot(1) ciphertext | `rot1`, `rot-1`, `rot_25`, `caesar13` | Dynamic ROT parameter ; belongs to [1, 26[ +`rot47` | text <-> rot47 ciphertext | | ```python >>> codext.encode("this is a test", "rot-15") @@ -173,9 +172,7 @@ This is a dynamic encoding, that is, it can be called with an integer to define **Codec** | **Conversions** | **Aliases** | **Comment** :---: | :---: | --- | --- -`shift` | text <-> shift(1) ciphertext | `shift1`, `shift-1`, `shift_1` | -`shift` | text <-> shift(X) ciphertext | ... | -`shift` | text <-> shift(255) ciphertext | `shift255`, `shift-255`, `shift_255` | +`shift` | text <-> shift(1) ciphertext | `shift1`, `shift-158`, `shift_255` | Dynamic shift parameter ; belongs to [1, 256[ ```python >>> codext.encode("this is a test", "shift-3") @@ -194,9 +191,7 @@ This is a dynamic encoding, that is, it can be called with an integer to define **Codec** | **Conversions** | **Aliases** | **Comment** :---: | :---: | --- | --- -`xor` | text <-> XOR(1) ciphertext | `XOR1`, `xor1`, `xor-1`, `xor_1` | -`xor` | text <-> XOR(X) ciphertext | ... 
| -`xor` | text <-> XOR(255) ciphertext | `XOR255`, `xor255`, `xor-255`, `xor_255` | +`xor` | text <-> XOR(1) ciphertext | `XOR1`, `xor22`, `xor-158`, `xor_255` | Dynamic XOR parameter ; belongs to [1, 256[ ```python >>> codext.encode("this is a test", "xor-10") From d64c4f34b4c96b59976105680cbd35232be33e17 Mon Sep 17 00:00:00 2001 From: dhondta Date: Mon, 28 Feb 2022 09:23:29 +0100 Subject: [PATCH 51/97] Applied minor improvement --- codext/__common__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/codext/__common__.py b/codext/__common__.py index e45fb1e..94ad19f 100644 --- a/codext/__common__.py +++ b/codext/__common__.py @@ -1333,7 +1333,7 @@ def __score(prev_input, input, prev_encoding, encoding, codec, heuristic=False, no_error, transitive = codec.parameters.get('no_error', False), sc.get('transitive', False) # ignore encodings that fail to decode with their default errors handling value try: - new_input = decode(input, encoding) + new_input = codec.decode(input)[0] except: return # ignore encodings that give an output identical to the input (identity transformation) or to the previous input From e3092151c35da08d048a92c8dc03269ee95aed5c Mon Sep 17 00:00:00 2001 From: dhondta Date: Mon, 28 Feb 2022 18:40:33 +0100 Subject: [PATCH 52/97] New release --- codext/VERSION.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/codext/VERSION.txt b/codext/VERSION.txt index feaae22..b50dd27 100644 --- a/codext/VERSION.txt +++ b/codext/VERSION.txt @@ -1 +1 @@ -1.13.0 +1.13.1 From df8ff0fe870315c15eee63428516b47a1a74628b Mon Sep 17 00:00:00 2001 From: dhondta Date: Sat, 12 Mar 2022 23:20:40 +0100 Subject: [PATCH 53/97] Fixed codec: uu --- codext/__common__.py | 3 ++- codext/__init__.py | 6 ----- codext/others/__init__.py | 1 + codext/others/uuencode.py | 47 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 50 insertions(+), 7 deletions(-) create mode 100644 codext/others/uuencode.py diff --git a/codext/__common__.py 
b/codext/__common__.py index 94ad19f..0c7ec17 100644 --- a/codext/__common__.py +++ b/codext/__common__.py @@ -1312,9 +1312,10 @@ def __rank(prev_input, input, prev_encoding, encodings, codecs, heuristic=False, class _Text(object): - __slots__ = ["entropy", "lcharset", "len", "padding", "printables"] + __slots__ = ["entropy", "lcharset", "len", "padding", "printables", "text"] def __init__(self, text, pad_char=None): + self.text = text c = text[-1] pad_char, last_char = (b(pad_char), c) if isinstance(c, int) else (pad_char, ord(c)) self.padding = pad_char is not None and last_char == ord(pad_char) diff --git a/codext/__init__.py b/codext/__init__.py index 486dd2f..661357a 100644 --- a/codext/__init__.py +++ b/codext/__init__.py @@ -27,12 +27,6 @@ reset() -# overwritten native codec -add("uu", lambda i, e="strict": orig_lookup("uu").encode(b(i), e), - lambda i, e="strict": orig_lookup("uu").decode(b(i), e), - pattern=r"^uu(?:[-_]encode|codec)?$", add_to_codecs=True, category="native") - - def __format_list(items, include=True): if items is None: return diff --git a/codext/others/__init__.py b/codext/others/__init__.py index 22d6830..aa7ffa2 100755 --- a/codext/others/__init__.py +++ b/codext/others/__init__.py @@ -2,4 +2,5 @@ from .dna import * from .letters import * from .markdown import * +from .uuencode import * diff --git a/codext/others/uuencode.py b/codext/others/uuencode.py new file mode 100644 index 0000000..5377493 --- /dev/null +++ b/codext/others/uuencode.py @@ -0,0 +1,47 @@ +# -*- coding: UTF-8 -*- +"""UU Codec - UU content encoding, relying on the native uu package. 
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from io import BytesIO +from uu import decode as _dec, encode as _enc + +from ..__common__ import * + + +__examples__ = { + 'enc(uu|uu_codec)': {'this is a test': "begin 666 -\n.=&AI Date: Sat, 12 Mar 2022 23:20:52 +0100 Subject: [PATCH 54/97] New release --- codext/VERSION.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/codext/VERSION.txt b/codext/VERSION.txt index b50dd27..61ce01b 100644 --- a/codext/VERSION.txt +++ b/codext/VERSION.txt @@ -1 +1 @@ -1.13.1 +1.13.2 From 837e91a4eb427b926089550509c7cea91f6accab Mon Sep 17 00:00:00 2001 From: dhondta Date: Mon, 28 Mar 2022 08:03:38 +0200 Subject: [PATCH 55/97] New release --- codext/VERSION.txt | 2 +- codext/common/dummy.py | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/codext/VERSION.txt b/codext/VERSION.txt index 61ce01b..01b7568 100644 --- a/codext/VERSION.txt +++ b/codext/VERSION.txt @@ -1 +1 @@ -1.13.2 +1.13.3 diff --git a/codext/common/dummy.py b/codext/common/dummy.py index f2dd2fb..7f4be19 100755 --- a/codext/common/dummy.py +++ b/codext/common/dummy.py @@ -35,7 +35,11 @@ def code(input, errors="strict"): reverse = lambda i, e="strict": (i[::-1], len(i)) add("reverse", reverse, reverse) -word_reverse = lambda i, e="strict": (" ".join(w[::-1] for w in i.split()), len(i)) +_revl = lambda i, wd=False: "".join((" ".join(w[::-1] for w in l.split()) if wd else l[::-1]) \ + if not re.match(r"(\r?\n)", l) else l for l in re.split(r"(\r?\n)", i)) +line_reverse = lambda i, e="strict": (_revl(i), len(i)) +add("reverse-lines", line_reverse, line_reverse, r"^reverse[-_]lines$") +word_reverse = lambda i, e="strict": (_revl(i, True), len(i)) add("reverse-words", word_reverse, word_reverse, r"^reverse[-_]words$") strip_spaces = lambda i, e="strict": (i.replace(" ", ""), len(i)) From 
564aa5a3743da63ab8b2de8564e9efe41288296b Mon Sep 17 00:00:00 2001 From: dhondta Date: Tue, 29 Mar 2022 01:04:03 +0200 Subject: [PATCH 56/97] Refactored codec: uu --- codext/others/uuencode.py | 49 +++++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 20 deletions(-) diff --git a/codext/others/uuencode.py b/codext/others/uuencode.py index 5377493..a2f2fb6 100644 --- a/codext/others/uuencode.py +++ b/codext/others/uuencode.py @@ -7,39 +7,48 @@ - decodes file content to str (read) - encodes file content from str to bytes (write) """ -from io import BytesIO -from uu import decode as _dec, encode as _enc +from binascii import a2b_uu as _dec, b2a_uu as _enc from ..__common__ import * __examples__ = { 'enc(uu|uu_codec)': {'this is a test': "begin 666 -\n.=&AI 0 and lines[-1].strip(b" \t\r\n\f") in [b"", b"`"]: + lines = lines[:-1] + r = b"" + for l in lines: + r += _dec(l.strip(b" \t\r\n\f")) + return r, len(text) add("uu", uu_encode, uu_decode, pattern=r"^uu(?:[-_]?encode|[-_]codec)?$", From fb292e998df7ef2eb0193cf52b0c1b2cd84ab614 Mon Sep 17 00:00:00 2001 From: dhondta Date: Tue, 29 Mar 2022 01:05:03 +0200 Subject: [PATCH 57/97] Improved guessing and ranking --- codext/__common__.py | 130 +++++++++++++++++++++---------------------- codext/__init__.py | 12 ++-- 2 files changed, 72 insertions(+), 70 deletions(-) diff --git a/codext/__common__.py b/codext/__common__.py index 0c7ec17..ea1281a 100644 --- a/codext/__common__.py +++ b/codext/__common__.py @@ -744,6 +744,10 @@ def remove(name): json.dump(PERS_MACROS, f, indent=2) except KeyError: pass + try: + del CODECS_CACHE[name] + except KeyError: + pass for s in ["En", "De"]: try: delattr(builtins, "%s%scodeError" % (name.capitalize(), s)) @@ -864,6 +868,7 @@ def _handle_error(token, position, output="", eename=None): """ if errors == "strict": msg = "'%s' codec can't %scode %s '%s' in %s %d" + token = ensure_str(token) token = token[:7] + "..." 
if len(token) > 10 else token err = getattr(builtins, exc)(msg % (eename or ename, ["en", "de"][decode], kind, token, item, position)) err.output = output @@ -968,36 +973,37 @@ def __register(search_function): codecs.register = __register -def search(encoding_regex): +def search(encoding_regex, extended=True): """ Function similar to lookup but allows to search for an encoding based on a regex instead. It searches this way into the local registry but also tries a simple lookup with the original lookup function. """ matches = [] - for search_function in __codecs_registry: + for search_function in CODECS_OVERWRITTEN + __codecs_registry: n = search_function.__name__ for name in [n, n.replace("_", "-")]: if re.search(encoding_regex, name): - matches.append(name) + matches.append(n.replace("_", "-")) continue - # in some cases, encoding_regex can match a generated string that uses a particular portion of its generating - # pattern ; e.g. we expect encoding_regex="uu_" to find "uu" and "uu_codec" while it can also find "morse" or - # "atbash" very rarely because of their dynamic patterns and the limited number of randomly generated strings - # so, we can use a qualified majority voting to ensure we do not get a "junk" encoding in the list of matches ; - # executing 5 times the string generation for a given codec but adding the codec to the list of matches only - # if we get at least 3 matches ensures that we consider up to 2 failures that could be stochastic, therefore - # drastically decreasing the probability to get a "junk" encoding in the matches list - c = 0 - for i in range(5): - for s in generate_strings_from_regex(search_function.__pattern__): - if re.search(encoding_regex, s): - c += 1 + if extended: + # in some cases, encoding_regex can match a generated string that uses a particular portion of its + # generating pattern ; e.g. 
we expect encoding_regex="uu_" to find "uu" and "uu_codec" while it can also + # find "morse" or "atbash" very rarely because of their dynamic patterns and the limited number of randomly + # generated strings + # so, we can use a qualified majority voting to ensure we do not get a "junk" encoding in the list of + # matches ; executing 5 times the string generation for a given codec but adding the codec to the list of + # matches only if we get at least 3 matches ensures that we consider up to 2 failures that could be + # stochastic, therefore drastically decreasing the probability to get a "junk" encoding in the matches list + c = 0 + for i in range(5): + for s in generate_strings_from_regex(search_function.__pattern__): + if re.search(encoding_regex, s): + c += 1 + break + if c >= 3: + matches.append(n) break - if c >= 3: - matches.append(n) - break for s, n in ALIASES.items(): if re.search(encoding_regex, s) or re.search(encoding_regex, n): matches.append(n) - break return sorted(list(set(matches)), key=_human_keys) codecs.search = search @@ -1241,7 +1247,7 @@ def _validate(stop_function, lang_backend="none"): stopfunc._validate = _validate -def __guess(prev_input, input, stop_func, depth, max_depth, min_depth, encodings, codecs, result, found=(), +def __guess(prev_input, input, stop_func, depth, max_depth, min_depth, encodings, result, found=(), stop=True, show=False, scoring_heuristic=False, extended=False, debug=False): """ Perform a breadth-first tree search using a ranking logic to select and prune the list of codecs. 
""" if depth > min_depth and stop_func(input): @@ -1255,58 +1261,53 @@ def __guess(prev_input, input, stop_func, depth, max_depth, min_depth, encodings return prev_enc = found[-1] if len(found) > 0 else "" e = encodings.get(depth, encodings.get(-1, [])) - for new_input, encoding in __rank(prev_input, input, prev_enc, e, codecs, scoring_heuristic, extended): + for new_input, encoding in __rank(prev_input, input, prev_enc, e, scoring_heuristic, extended): if len(result) > 0 and stop: return if debug: print("[*] Depth %0{}d/%d: %s".format(len(str(max_depth))) % (depth+1, max_depth, encoding)) - __guess(input, new_input, stop_func, depth+1, max_depth, min_depth, encodings, codecs, result, - found + (encoding, ), stop, show, scoring_heuristic, extended, debug) + __guess(input, new_input, stop_func, depth+1, max_depth, min_depth, encodings, result, found + (encoding, ), + stop, show, scoring_heuristic, extended, debug) def __make_encodings_dict(include, exclude): """ Process encodings inclusion and exclusion lists, listing categories and developping codecs' lists of possible encoding names. It also creates a cache with the CodecInfo objects for improving performance. """ - codecs = {} def _develop(d, keep=True): d = d or {} for k, v in d.items(): l, cc = [], [e for e in v if e in CODECS_CATEGORIES] - for enc in (list_encodings(*cc) if len(cc) > 0 or keep else [] + \ + # list from in-scope categories and then everything that is not a category + for enc in ((list_encodings(*cc) if len(cc) > 0 or keep else []) + \ [e for e in v if e not in CODECS_CATEGORIES]): - try: - g = lookup(enc, False).parameters['guess'] - except: - g = [enc] - if enc in g and not keep: # e.g. "rot-1" => ["rot-1", "rot-2", ...] ; only "rot-1" is to be selected + g = [] + for e in (search(enc, False) or [enc]): + try: + ci = lookup(e, False) + g.extend(ci.parameters['guess']) + except: + pass + if enc in g: # e.g. "rot-1" => ["rot-1", "rot-2", ...] 
; only "rot-1" is to be selected l.append(enc) - else: # e.g. "rot" => ["rot-1", "rot-2", ...] ; all the "rot-N" shall be selected + else: # e.g. "rot" => ["rot-1", "rot-2", ...] ; all the "rot-N" shall be selected l.extend(g) - d[k] = l - if keep: - for e in l: - # cache newly loaded CodecInfo objects - ci = lookup(e, False) - n = ci.name - if n in CODECS_CACHE: - ci = CODECS_CACHE[n] # keep the cached object - else: - CODECS_CACHE[n] = ci # cache the new object - codecs[e] = ci + d[k] = list(set(l)) return d exclude = _develop(exclude, False) - return {k: [x for x in v if x not in exclude.get(k, [])] for k, v in _develop(include).items()}, codecs + return {k: [x for x in v if x not in exclude.get(k, [])] for k, v in _develop(include).items()} -def __rank(prev_input, input, prev_encoding, encodings, codecs, heuristic=False, extended=False, yield_score=False): +def __rank(prev_input, input, prev_encoding, encodings, heuristic=False, extended=False, yield_score=False): """ Filter valid encodings and rank them by relevance. 
""" ranking = {} - for encoding in encodings: + for e in encodings: try: - score, new = __score(prev_input, input, prev_encoding, encoding, codecs.get(encoding), heuristic, extended) - except TypeError: - continue - ranking[encoding] = (score, new) + codec = CODECS_CACHE[e] + except KeyError: + CODECS_CACHE[e] = codec = lookup(e, False) + t = __score(prev_input, input, prev_encoding, e, codec, heuristic, extended) + if t: + ranking[e] = t for encoding, result in sorted(ranking.items(), key=lambda x: -x[1][0]): yield result if yield_score else result[1], encoding @@ -1315,16 +1316,16 @@ class _Text(object): __slots__ = ["entropy", "lcharset", "len", "padding", "printables", "text"] def __init__(self, text, pad_char=None): - self.text = text - c = text[-1] - pad_char, last_char = (b(pad_char), c) if isinstance(c, int) else (pad_char, ord(c)) - self.padding = pad_char is not None and last_char == ord(pad_char) + self.text = ensure_str(text) + c = self.text[-1] + pad_char, last_char = (chr(pad_char), chr(c)) if isinstance(c, int) else (pad_char, c) + self.padding = pad_char is not None and last_char == pad_char if self.padding: text = text.rstrip(pad_char) - self.len = len(text) - self.lcharset = len(set(text)) - self.printables = float(len([c for c in text if (chr(c) if isinstance(c, int) else c) in printable])) / self.len - self.entropy = entropy(text) + self.len = len(self.text) + self.lcharset = len(set(self.text)) + self.printables = float(len([c for c in self.text if c in printable])) / self.len + self.entropy = entropy(self.text) def __score(prev_input, input, prev_encoding, encoding, codec, heuristic=False, extended=False): @@ -1386,13 +1387,14 @@ def __score(prev_input, input, prev_encoding, encoding, codec, heuristic=False, except TypeError: expf = expf(f) if isinstance(expf, (int, float)): + tmp = expf expf = (1/f - .1 <= 1/expf <= 1/f + .1) elif isinstance(expf, (tuple, list)) and len(expf) == 2: expf = 1/f - expf[1] <= 1/expf[0] <= 1/f + expf[1] s += [-1., 
.1][expf] # afterwards, if the input text has an entropy close to the expected one, give a bonus weighted on the # number of input characters to take bad entropies of shorter strings into account - entr = sc.get('entropy', {}) + entr = sc.get('entropy', lambda e: e) entr = entr.get(encoding, entr.get('default')) if isinstance(entr, dict) else entr if isinstance(entr, type(lambda: None)): try: # this case allows to consider the current encoding name from the current codec @@ -1401,7 +1403,7 @@ def __score(prev_input, input, prev_encoding, encoding, codec, heuristic=False, entr = entr(obj.entropy) if entr is not None: # use a quadratic heuristic to compute a weight for the entropy delta, aligned on (100w,.1) and (200w,1) - d_entr = min(4e-05 * obj.len**2 - .003 * obj.len, 1) * abs(entr - entropy(new_input)) + d_entr = min(5.958194e-06 * obj.len**2 - .002381 * obj.len, 1) * abs(entr - entropy(new_input)) if d_entr <= .5: s += .5 - d_entr # finally, if relevant, apply a custom bonus (e.g. when a regex pattern is matched) @@ -1475,12 +1477,11 @@ def guess(input, stop_func=stopfunc.default, min_depth=0, max_depth=5, include=N if not isinstance(l, dict) or not all(isinstance(k, int) for k in l.keys()): raise ValueError("Include argument shall be a list or a dictionary with integer keys") # precompute encodings lists per depth and cache the related CodecInfo objects - encodings, codecs = __make_encodings_dict(include, exclude) - result = {} + encodings, result = __make_encodings_dict(include, exclude), {} try: # breadth-first search for d in range(max_depth): - __guess("", input, stop_func, 0, d+1, min_depth, encodings, codecs, result, tuple(found), stop, show, + __guess("", input, stop_func, 0, d+1, min_depth, encodings, result, tuple(found), stop, show, scoring_heuristic, extended, debug) if stop and len(result) > 0: break @@ -1500,9 +1501,8 @@ def rank(input, extended=False, limit=-1, include=None, exclude=None): :param include: inclusion list with category, codec or 
encoding names (nothing means include every encoding) :param exclude: exclusion list with category, codec or encoding names (nothing means exclude no encoding) """ - encodings, codecs = __make_encodings_dict({0: include or CODECS_CATEGORIES}, {0: exclude or []}) - r = list(__rank(None, input, "", encodings[0], codecs, True, extended, True)) - CODECS_CACHE = {} + encodings = __make_encodings_dict({-1: include or CODECS_CATEGORIES}, {-1: exclude or []}) + r = list(__rank(None, input, "", encodings[-1], True, extended, True)) return r[:limit] if len(r) > 1 else r codecs.rank = rank diff --git a/codext/__init__.py b/codext/__init__.py index 661357a..f95abb8 100644 --- a/codext/__init__.py +++ b/codext/__init__.py @@ -154,12 +154,14 @@ def _format_action_invocation(self, action): guess.add_argument("-v", "--verbose", action="store_true", help="show guessing information and steps (default: False)") rank = sparsers.add_parser("rank", help="rank the most probable encodings based on the given input", **kw) - rank.add_argument("-c", "--codec-categories", nargs="*", action="extend", metavar="CATEGORY", - help="codec categories to be included in the search ; format: string|tuple|list(strings|tuples)") - rank.add_argument("-e", "--exclude-codecs", nargs="*", action="extend", metavar="CODEC", - help="codecs to be explicitely not used ; format: string|tuple|list(strings|tuples)") + rank.add_argument("-e", "--exclude", nargs="*", action="extend", metavar="CAT|COD|ENC", + help="categories, codecs and encodings to be explicitely not used ;\n " + "format: [category|codec|encoding] OR depth:[category|codec|encoding]") rank.add_argument("-E", "--extended", action="store_true", help="while using the scoring heuristic, also consider null scores (default: False)") + rank.add_argument("-i", "--include", nargs="*", action="extend", metavar="CAT|COD|ENC", + help="categories, codecs and encodings to be explicitely used ;\n " + "format: [category|codec|encoding] OR 
depth:[category|codec|encoding]") rank.add_argument("-l", "--limit", type=int, default=10, help="limit the number of displayed results") search = sparsers.add_parser("search", help="search for codecs") search.add_argument("pattern", nargs="+", help="encoding pattern to search") @@ -243,7 +245,7 @@ def _format_action_invocation(self, action): if len(r) == 0: print("Could not decode :-(") elif args.command == "rank": - for i, e in codecs.rank(c, args.extended, args.limit, args.codec_categories, args.exclude_codecs): + for i, e in codecs.rank(c, args.extended, args.limit, args.include, args.exclude): s = "[+] %.5f: %s" % (i[0], e) print(s if len(s) <= 80 else s[:77] + "...") except Exception as e: From b4a29503392cfdbb89f6e58d48441c9f60e8e6e4 Mon Sep 17 00:00:00 2001 From: dhondta Date: Tue, 29 Mar 2022 01:05:19 +0200 Subject: [PATCH 58/97] Refined case codecs --- codext/common/cases.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/codext/common/cases.py b/codext/common/cases.py index 65fbdf2..8aa87e4 100644 --- a/codext/common/cases.py +++ b/codext/common/cases.py @@ -20,20 +20,20 @@ capitalize = lambda i, e="strict": (i.capitalize(), len(i)) uncapitalize = lambda i, e="strict": (i[0].lower() + i[1:] if len(i) > 0 else "", len(i)) -add("capitalize", capitalize, uncapitalize) +add("capitalize", capitalize, uncapitalize, penalty=.2) lowercase, uppercase = lambda i, e="strict": (i.lower(), len(i)), lambda i, e="strict": (i.upper(), len(i)) -add("uppercase", uppercase, lowercase, r"^upper(?:case)?$") -add("lowercase", lowercase, uppercase, r"^lower(?:case)?$") +add("uppercase", uppercase, lowercase, r"^upper(?:case)?$", penalty=.2) +add("lowercase", lowercase, uppercase, r"^lower(?:case)?$", penalty=.2) slugify = lambda i, e="strict", d="-": (re.sub(r"[^0-9a-z]+", d, i.lower()).strip(d), len(i)) add("slugify", lambda i, e="strict": slugify(i, e), None, r"^(?:slug(?:ify)?|kebab(?:[-_]?case)?)$") add("snakecase", lambda i, e="strict": 
slugify(i, e, "_"), None, r"^snake(?:[-_]?case)?$") swapcase = lambda i, e="strict": (i.swapcase(), len(i)) -add("swapcase", swapcase, swapcase, r"^(?:swap(?:[-_]?case)?|invert(?:case)?)$") +add("swapcase", swapcase, swapcase, r"^(?:swap(?:[-_]?case)?|invert(?:case)?)$", penalty=.2) title = lambda i, e="strict": (i.title(), len(i)) untitle = lambda i, e="strict": (" ".join(w[0].lower() + w[1:] if len(w) > 0 else "" for w in i.split()), len(i)) -add("title", title, untitle) +add("title", title, untitle, penalty=.2) From 224b2d005eb3020b9470ab8b6cf182bcd0a49b89 Mon Sep 17 00:00:00 2001 From: dhondta Date: Tue, 29 Mar 2022 01:05:44 +0200 Subject: [PATCH 59/97] Refined tests/test_common --- tests/test_common.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/test_common.py b/tests/test_common.py index 6eddd7e..934155f 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -151,7 +151,7 @@ def test_guess_decode(self): self.assertIsNotNone(codext.stopfunc._validate("flag")) _l = lambda d: list(d.items())[0][1] if len(d) > 0 else None codext.add("test_codec", lambda x, e="strict": (x + "=", len(x)), lambda x, e="strict": (x[:-1], len(x)-1), - "test", padding_char="=", no_error=True, bonus_func=lambda *a: True, penalty=-.5) + "^test(?:_codec)?$", padding_char="=", no_error=True, bonus_func=lambda *a: True, penalty=-.5) self.assertIn("test-codec", codext.list_encodings("test")) self.assertEqual(codext.decode("TEST=", "test"), "TEST") self.assertEqual(list(codext.guess("TEST=", codext.stopfunc.text, include="test", max_depth=2, @@ -203,13 +203,15 @@ def test_guess_decode(self): self.assertRaises(ValueError, codext.stopfunc._reload_lang, "DOES_NOT_EXIST") def test_rank_input(self): + codext.remove("test_codec") + self.assertRaises(LookupError, codext.encode, "TEST", "test") codext.add("test_codec", lambda x, e="strict": (x + "=", len(x)), lambda x, e="strict": (x[:-1], len(x)-1), - "test", padding_char="=", no_error=True, 
bonus_func=lambda *a: True, penalty=-.5) + "^test(?:_codec)?$", padding_char="=", no_error=True, penalty=1.) STR = "This is a test string !" ENC = codext.encode(STR, "base64") self.assertTrue(len(codext.rank(ENC)) > 20) self.assertEqual(len(codext.rank(ENC, limit=20)), 20) - self.assertEqual(codext.rank(ENC, exclude=["rot"])[0][1], "base64") + self.assertIn(codext.rank(ENC, exclude=["rot"])[0][1], ["base64", "base64-url"]) self.assertEqual(codext.rank(ENC, include="base")[0][0][1], STR) self.assertEqual(codext.rank(ENC, include=["base"])[0][0][1], STR) self.assertIsNotNone(codext.rank(ENC, include=["base"], exclude=["does_not_exist"])[0][0][1], STR) From dcbeba184f89ac7fd31bf7b4dfbca2fc1809ee70 Mon Sep 17 00:00:00 2001 From: dhondta Date: Tue, 29 Mar 2022 01:31:45 +0200 Subject: [PATCH 60/97] Fixed scoring for compression codecs --- codext/__common__.py | 4 ++-- codext/compressions/__init__.py | 6 ++++++ codext/compressions/gzipp.py | 2 +- codext/compressions/lz77.py | 2 +- codext/compressions/pkzip.py | 6 +++--- 5 files changed, 13 insertions(+), 7 deletions(-) diff --git a/codext/__common__.py b/codext/__common__.py index ea1281a..f65d210 100644 --- a/codext/__common__.py +++ b/codext/__common__.py @@ -1402,8 +1402,8 @@ def __score(prev_input, input, prev_encoding, encoding, codec, heuristic=False, except TypeError: entr = entr(obj.entropy) if entr is not None: - # use a quadratic heuristic to compute a weight for the entropy delta, aligned on (100w,.1) and (200w,1) - d_entr = min(5.958194e-06 * obj.len**2 - .002381 * obj.len, 1) * abs(entr - entropy(new_input)) + # use a quadratic heuristic to compute a weight for the entropy delta, aligned on (256,.2) and (512,1) + d_entr = min(3.04575e-06 * obj.len**2 + .000394 * obj.len, 1) * abs(entr - entropy(new_input)) if d_entr <= .5: s += .5 - d_entr # finally, if relevant, apply a custom bonus (e.g. 
when a regex pattern is matched) diff --git a/codext/compressions/__init__.py b/codext/compressions/__init__.py index 37f1fa5..606a1dc 100755 --- a/codext/compressions/__init__.py +++ b/codext/compressions/__init__.py @@ -4,3 +4,9 @@ from .lz78 import * from .pkzip import * + +for e in list_encodings("compression"): + ci = lookup(e, False) + ci.parameters['scoring']['entropy'] = 7.9 + ci.parameters['scoring']['expansion_factor'] = lambda f: f + diff --git a/codext/compressions/gzipp.py b/codext/compressions/gzipp.py index da52b5a..14e65bc 100755 --- a/codext/compressions/gzipp.py +++ b/codext/compressions/gzipp.py @@ -40,5 +40,5 @@ def gzip_decompress(data, errors="strict"): return r, len(r) -add("gzip", gzip_compress, gzip_decompress, entropy=7.9) +add("gzip", gzip_compress, gzip_decompress) diff --git a/codext/compressions/lz77.py b/codext/compressions/lz77.py index 662f02c..bdfcf13 100644 --- a/codext/compressions/lz77.py +++ b/codext/compressions/lz77.py @@ -70,5 +70,5 @@ def lz77_decompress(input, errors="strict"): return out, len(out) -add("lz77", lz77_compress, lz77_decompress, entropy=7.9) +add("lz77", lz77_compress, lz77_decompress) diff --git a/codext/compressions/pkzip.py b/codext/compressions/pkzip.py index ebbcbce..47d9cd5 100755 --- a/codext/compressions/pkzip.py +++ b/codext/compressions/pkzip.py @@ -46,11 +46,11 @@ def _decode(data, errors="strict"): add("pkzip_deflate", pkzip_encode(8), pkzip_decode(8, "deflate"), r"(?:(?:pk)?zip[-_])?deflate", - entropy=7.9, examples=__examples1__, guess=["deflate"]) + examples=__examples1__, guess=["deflate"]) add("pkzip_bzip2", pkzip_encode(12), pkzip_decode(12, "bzip2"), r"(?:(?:pk)?zip[-_])?bz(?:ip)?2", - entropy=7.9, examples=__examples2__, guess=["bz2"]) + examples=__examples2__, guess=["bz2"]) add("pkzip_lzma", pkzip_encode(14), pkzip_decode(14, "lzma"), r"(?:(?:pk)?zip[-_])?lzma", - entropy=7.9, examples=__examples3__, guess=["lzma"]) + examples=__examples3__, guess=["lzma"]) From 
a1b41fab747e36bf2eaf7f3037715f9fc7a28ddf Mon Sep 17 00:00:00 2001 From: dhondta Date: Tue, 29 Mar 2022 08:31:49 +0200 Subject: [PATCH 61/97] Fixed minor issues --- codext/__common__.py | 4 ++-- tests/test_common.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/codext/__common__.py b/codext/__common__.py index f65d210..41cb5b2 100644 --- a/codext/__common__.py +++ b/codext/__common__.py @@ -1308,7 +1308,7 @@ def __rank(prev_input, input, prev_encoding, encodings, heuristic=False, extende t = __score(prev_input, input, prev_encoding, e, codec, heuristic, extended) if t: ranking[e] = t - for encoding, result in sorted(ranking.items(), key=lambda x: -x[1][0]): + for encoding, result in sorted(ranking.items(), key=lambda x: (-x[1][0], x[0])): yield result if yield_score else result[1], encoding @@ -1403,7 +1403,7 @@ def __score(prev_input, input, prev_encoding, encoding, codec, heuristic=False, entr = entr(obj.entropy) if entr is not None: # use a quadratic heuristic to compute a weight for the entropy delta, aligned on (256,.2) and (512,1) - d_entr = min(3.04575e-06 * obj.len**2 + .000394 * obj.len, 1) * abs(entr - entropy(new_input)) + d_entr = min(3.04575e-06 * obj.len**2 + .000394 * obj.len, 1) * abs(entr - obj.entropy) if d_entr <= .5: s += .5 - d_entr # finally, if relevant, apply a custom bonus (e.g. 
when a regex pattern is matched) diff --git a/tests/test_common.py b/tests/test_common.py index 934155f..8bbf410 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -211,7 +211,7 @@ def test_rank_input(self): ENC = codext.encode(STR, "base64") self.assertTrue(len(codext.rank(ENC)) > 20) self.assertEqual(len(codext.rank(ENC, limit=20)), 20) - self.assertIn(codext.rank(ENC, exclude=["rot"])[0][1], ["base64", "base64-url"]) + self.assertIn(codext.rank(ENC, exclude=["rot"])[0][1], ["base64", "base64-url", "base64-inv"]) self.assertEqual(codext.rank(ENC, include="base")[0][0][1], STR) self.assertEqual(codext.rank(ENC, include=["base"])[0][0][1], STR) self.assertIsNotNone(codext.rank(ENC, include=["base"], exclude=["does_not_exist"])[0][0][1], STR) From 3d7f43dea12c40e330b6be8b8fb5011d3c8ee13b Mon Sep 17 00:00:00 2001 From: dhondta Date: Tue, 29 Mar 2022 08:33:03 +0200 Subject: [PATCH 62/97] New release --- codext/VERSION.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/codext/VERSION.txt b/codext/VERSION.txt index 01b7568..80138e7 100644 --- a/codext/VERSION.txt +++ b/codext/VERSION.txt @@ -1 +1 @@ -1.13.3 +1.13.4 From 281ca1bacbbc0891f7e2987bbbd161f507823bd3 Mon Sep 17 00:00:00 2001 From: dhondta Date: Wed, 7 Sep 2022 20:36:19 +0200 Subject: [PATCH 63/97] Added codec: tokenize --- codext/common/dummy.py | 12 +++++++++++- docs/manipulations.md | 10 +++++++++- tests/test_manual.py | 2 ++ 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/codext/common/dummy.py b/codext/common/dummy.py index 7f4be19..b45c023 100755 --- a/codext/common/dummy.py +++ b/codext/common/dummy.py @@ -22,7 +22,7 @@ def code(input, errors="strict"): # important note: ^ # using "{2}" here instead will break the codec # this is due to the fact the codext.__common__.generate_string_from_regex DOES NOT handle ASSERT_NOT (?!) and will -# faill to generate a valid instance in lookup(...) 
when an encoding name is to be generated to get the CodecInfo +# fail to generate a valid instance in lookup(...) when an encoding name is to be generated to get the CodecInfo def substitute(token, replacement): @@ -45,3 +45,13 @@ def code(input, errors="strict"): strip_spaces = lambda i, e="strict": (i.replace(" ", ""), len(i)) add("strip-spaces", strip_spaces, strip_spaces, guess=None) +def tokenize(n): + tlen = int(n[8:].lstrip("-_")) + def code(input, errors="strict"): + l = len(input) + if tlen > l: + raise LookupError("unknown encoding: %s" % n) + return " ".join(input[i:i+tlen] for i in range(0, l, tlen)), l + return code +add("tokenize", tokenize, tokenize, r"^(tokenize[-_]?[1-9][0-9]*)$", guess=None) + diff --git a/docs/manipulations.md b/docs/manipulations.md index 7962278..8857ca7 100644 --- a/docs/manipulations.md +++ b/docs/manipulations.md @@ -43,11 +43,12 @@ These transformation functions are simple string transformations. **Codec** | **Conversions** | **Aliases** | **Comment** :---: | :---: | --- | --- -`replace` | text <-> text with single-char replaced | | +`replace` | text <-> text with multi-chars replaced | | parametrized with a _string_ and its _replacement_ `reverse` | text <-> reversed text | | `reverse-words` | text <-> reversed words | | same as `reverse` but not on the whole text, only on the words (text split by whitespace) `strip-spaces` | text <-> all whitespaces stripped | | `substitute` | text <-> text with token substituted | | +`tokenize` | text <-> text split in tokens of length N | | parametrized with _N_ As in the previous section, these transformations have no interest while using them in Python but well while using `codext` from the terminal (see [*CLI tool*](cli.html)). 
@@ -58,6 +59,13 @@ $ echo -en "test string" | codext encode reverse-words | codext encode reverse r string_test ``` +Another example: + +```sh +$ echo -en "3132333435" | codext encode tokenize-2 +31 32 33 34 35 +``` + Or using encodings chaining: ```sh diff --git a/tests/test_manual.py b/tests/test_manual.py index 4211df7..64b1843 100644 --- a/tests/test_manual.py +++ b/tests/test_manual.py @@ -100,6 +100,8 @@ def test_codec_dummy_str_manips(self): self.assertEqual(codecs.decode(STR.replace("i", "1"), "replace-1i"), STR) self.assertEqual(codecs.encode(STR, "substitute-this/that"), STR.replace("this", "that")) self.assertEqual(codecs.decode(STR.replace("this", "that"), "substitute-that/this"), STR) + self.assertEqual(codecs.encode(STR, "tokenize-2"), "th is i s a te st") + self.assertRaises(LookupError, codecs.encode, STR, "tokenize-200") def test_codec_hash_functions(self): STR = b"This is a test string!" From 4792a99b3a3780765b80c68f0bbcb46da27a2f7b Mon Sep 17 00:00:00 2001 From: dhondta Date: Sun, 11 Sep 2022 19:13:41 +0200 Subject: [PATCH 64/97] Fixed minor bugs --- codext/__common__.py | 26 +++++++++++++++----------- tests/test_generated.py | 12 +++++++++--- 2 files changed, 24 insertions(+), 14 deletions(-) diff --git a/codext/__common__.py b/codext/__common__.py index 41cb5b2..9d9400c 100644 --- a/codext/__common__.py +++ b/codext/__common__.py @@ -109,10 +109,11 @@ def __new__(cls, name): for action, examples in (self.codecs[0].parameters.get('examples', {}) or {'enc-dec(': ["T3st str!"]}).items(): if re.match(r"enc(-dec)?\(", action): for e in (examples.keys() if action.startswith("enc(") else examples or []): - rd = re.match(r"\@random(?:\{(\d+(?:,(\d+))*?)\})?$", e) + rd = re.match(r"\@(i?)random(?:\{(\d+(?:,(\d+))*?)\})?$", e) if rd: - for n in (rd.group(1) or "512").split(","): - self.encode("".join(chr(randint(0, 255)) for i in range(int(n)))) + for n in (rd.group(2) or "512").split(","): + s = "".join(chr(randint(0, 255)) for i in range(int(n))) + 
self.encode(s.lower() if rd.group(1) else s) continue self.encode(e) @@ -1276,10 +1277,9 @@ def __make_encodings_dict(include, exclude): def _develop(d, keep=True): d = d or {} for k, v in d.items(): - l, cc = [], [e for e in v if e in CODECS_CATEGORIES] + l, cc, sc = [], [e for e in v if e in CODECS_CATEGORIES], [e for e in v if e not in CODECS_CATEGORIES] # list from in-scope categories and then everything that is not a category - for enc in ((list_encodings(*cc) if len(cc) > 0 or keep else []) + \ - [e for e in v if e not in CODECS_CATEGORIES]): + for enc in ((list_encodings(*cc) if (len(cc) > 0 or keep) and len(sc) == 0 else []) + sc): g = [] for e in (search(enc, False) or [enc]): try: @@ -1293,8 +1293,8 @@ def _develop(d, keep=True): l.extend(g) d[k] = list(set(l)) return d - exclude = _develop(exclude, False) - return {k: [x for x in v if x not in exclude.get(k, [])] for k, v in _develop(include).items()} + _excl, _incl = _develop(exclude, False), _develop(include) + return {k: [x for x in v if x not in _excl.get(k, [])] for k, v in _incl.items()} def __rank(prev_input, input, prev_encoding, encodings, heuristic=False, extended=False, yield_score=False): @@ -1304,7 +1304,10 @@ def __rank(prev_input, input, prev_encoding, encodings, heuristic=False, extende try: codec = CODECS_CACHE[e] except KeyError: - CODECS_CACHE[e] = codec = lookup(e, False) + try: + CODECS_CACHE[e] = codec = lookup(e, False) + except LookupError: + continue t = __score(prev_input, input, prev_encoding, e, codec, heuristic, extended) if t: ranking[e] = t @@ -1321,7 +1324,7 @@ def __init__(self, text, pad_char=None): pad_char, last_char = (chr(pad_char), chr(c)) if isinstance(c, int) else (pad_char, c) self.padding = pad_char is not None and last_char == pad_char if self.padding: - text = text.rstrip(pad_char) + text = text.rstrip(b(pad_char) if isinstance(text, bytes) else pad_char) self.len = len(self.text) self.lcharset = len(set(self.text)) self.printables = float(len([c for c in 
self.text if c in printable])) / self.len @@ -1501,7 +1504,8 @@ def rank(input, extended=False, limit=-1, include=None, exclude=None): :param include: inclusion list with category, codec or encoding names (nothing means include every encoding) :param exclude: exclusion list with category, codec or encoding names (nothing means exclude no encoding) """ - encodings = __make_encodings_dict({-1: include or CODECS_CATEGORIES}, {-1: exclude or []}) + encodings = __make_encodings_dict(include if isinstance(include, dict) else {-1: include or CODECS_CATEGORIES}, + exclude if isinstance(exclude, dict) else {-1: exclude or []}) r = list(__rank(None, input, "", encodings[-1], True, extended, True)) return r[:limit] if len(r) > 1 else r codecs.rank = rank diff --git a/tests/test_generated.py b/tests/test_generated.py index 6b89129..614562f 100644 --- a/tests/test_generated.py +++ b/tests/test_generated.py @@ -36,6 +36,11 @@ def _template(self): for ename in m.groups(): if ename is None: continue + # buggy generated encoding names + try: + lookup(ename) + except LookupError: + continue # erroneous encoding name test if examples is None: self.assertRaises(LookupError, f1, "test", ename) @@ -72,11 +77,12 @@ def _template(self): # examples validation tests if k.startswith("enc-dec") and isinstance(examples, list): for e in examples[:]: - rd = re.match(r"\@random(?:\{(\d+(?:,(\d+))*?)\})?$", e) + rd = re.match(r"\@(i?)random(?:\{(\d+(?:,(\d+))*?)\})?$", e) if rd: examples.remove(e) - for n in (rd.group(1) or "512").split(","): - examples.append("".join(chr(randint(0, 255)) for i in range(int(n)))) + for n in (rd.group(2) or "512").split(","): + s = "".join(chr(randint(0, 255)) for i in range(int(n))) + examples.append(s.lower() if rd.group(1) else s) for s in [""] + examples: self.assertEqual(icdec(f2(icenc(f1(s, ename)), ename)), icdec(s)) self.assertEqual(icdec(f2(icenc(f1(b(s), ename)), ename)), b(icdec(s))) From b4e1eb66fb8764df992cc6434f0e69a6eedbd9b5 Mon Sep 17 00:00:00 2001 
From: dhondta Date: Sun, 11 Sep 2022 19:13:57 +0200 Subject: [PATCH 65/97] Added codec: kbshift --- codext/others/__init__.py | 1 + codext/others/kbshift.py | 66 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) create mode 100755 codext/others/kbshift.py diff --git a/codext/others/__init__.py b/codext/others/__init__.py index aa7ffa2..3bbf102 100755 --- a/codext/others/__init__.py +++ b/codext/others/__init__.py @@ -1,5 +1,6 @@ # -*- coding: UTF-8 -*- from .dna import * +from .kbshift import * from .letters import * from .markdown import * from .uuencode import * diff --git a/codext/others/kbshift.py b/codext/others/kbshift.py new file mode 100755 index 0000000..2bd0991 --- /dev/null +++ b/codext/others/kbshift.py @@ -0,0 +1,66 @@ +# -*- coding: UTF-8 -*- +"""Keyboard-Shift Codec - keyboard line shifting content encoding. + +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +LAYOUTS = { + 'ansi': "~!@#$%^&*()_+\n`1234567890-=\nqwertyuiop{}|\n[]\\\nasdfghjkl:\"\n;'\nzxcvbnm<>\n,./", + 'azerty': "azertyuiop\nqsdfghjklm\nwxcvbn", + 'azerty-be': "³1234567890°_\n²&é\"'(§è!çà)-\n|@#^{}\nazertyuiop$\n€[]\n¨*\nqsdfghjklm%£\nùµ\n´`\n>wxcvbn?./+\n<,;:=\n\\~", + 'azerty-fr': "1234567890°+\n²&é\"'(-è_çà)=\n~#{[|`\\^@]}\nazertyuiop¨£\nqsdfghjklm%µ\nù*\n>wxcvbn?./§\n<,;:!", + 'dvorak': "~!@#$%^&*(){}\n`1234567890[]\n\"<>pyfgcrl?+|\n',./=\\\naoeuidhtns_\n-\n:qjkxbmwvz\n;", + 'qwerty': "qwertyuiop\nasdfghjkl\nzxcvbnm", + 'qwerty-us': "~!@#$%^&*()_+\n`1234567890-=\nqwertyuiop{}|\n[]\\\nasdfghjkl:\"\n;,\nzxcvbnm<>?\n./", +} +__per_len = {} +for k, s in LAYOUTS.items(): + i = max(map(len, s.split("\n"))) + __per_len.setdefault(i, []) + __per_len[i].append(k) + + +__examples__ = {"enc-dec(kbshift_%s_%d)" % (kb, n): ["@irandom{256,512}"] for n in range(10) for kb in LAYOUTS.keys()} +__guess__ = [] 
+for mlen, kbs in __per_len.items(): + for k in kbs: + __guess__.extend(["kbshift-%s-%d" % (k, i+1) for i in range(mlen)]) + + +def _kbshift(text, keyboard="azerty", n=1, decode=False): + r = "" + for c in text: + nc = None + for l in LAYOUTS[keyboard].splitlines(): + if c.lower() in l: + nc = l[(l.index(c.lower()) + [-1, 1][decode] * n) % len(l)] + break + r += c if nc is None else nc + return r + + +def kbshift_encode(scheme): + kb, shift = re.match(r"^(.*?)[-_]?(\d+)$", scheme or "azerty-1").groups() + def encode(text, errors="strict"): + r = _kbshift(ensure_str(text), kb, int(shift)) + return r, len(r) + return encode + + +def kbshift_decode(scheme): + kb, shift = re.match(r"^(.*?)[-_]?(\d+)$", scheme or "azerty-1").groups() + def decode(text, errors="strict"): + r = _kbshift(ensure_str(text), kb, int(shift), True) + return r, len(r) + return decode + + +add("kbshift", kbshift_encode, kbshift_decode, entropy=lambda e: e,printables_rate=lambda pr: pr, transitive=True, + pattern=r"^kbshift(?:|[-_]((?:az|qw)erty[-_]?[1-9]|(?:ansi|azerty-(?:be|fr)|dvorak|qwerty-us)[-_]?(?:[1-9]|1[0-2])))$") + From cd234d5d97867f1470b45499694f3776aa74569b Mon Sep 17 00:00:00 2001 From: dhondta Date: Mon, 12 Sep 2022 21:53:13 +0200 Subject: [PATCH 66/97] New release --- codext/VERSION.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/codext/VERSION.txt b/codext/VERSION.txt index 80138e7..850e742 100644 --- a/codext/VERSION.txt +++ b/codext/VERSION.txt @@ -1 +1 @@ -1.13.4 +1.14.0 From 13960f1dbf2b322047bdc4285ba075bf023dd176 Mon Sep 17 00:00:00 2001 From: dhondta Date: Sun, 12 Feb 2023 13:52:24 +0100 Subject: [PATCH 67/97] Moved to pyproject.toml --- pyproject.toml | 87 +++++++++++++++++++ setup.cfg | 80 ----------------- setup.py | 4 - {codext => src/codext}/VERSION.txt | 0 {codext => src/codext}/__common__.py | 0 {codext => src/codext}/__info__.py | 0 {codext => src/codext}/__init__.py | 0 {codext => src/codext}/base/__init__.py | 0 {codext => 
src/codext}/base/_base.py | 0 {codext => src/codext}/base/_base2n.py | 0 {codext => src/codext}/base/base100.py | 0 {codext => src/codext}/base/base122.py | 0 {codext => src/codext}/base/base45.py | 0 {codext => src/codext}/base/base85.py | 0 {codext => src/codext}/base/base91.py | 0 {codext => src/codext}/base/baseN.py | 0 {codext => src/codext}/binary/__init__.py | 0 {codext => src/codext}/binary/baudot.py | 0 {codext => src/codext}/binary/bcd.py | 0 {codext => src/codext}/binary/excess3.py | 0 {codext => src/codext}/binary/gray.py | 0 {codext => src/codext}/binary/manchester.py | 0 {codext => src/codext}/binary/rotate.py | 0 {codext => src/codext}/common/__init__.py | 0 {codext => src/codext}/common/a1z26.py | 0 {codext => src/codext}/common/cases.py | 0 {codext => src/codext}/common/dummy.py | 0 {codext => src/codext}/common/octal.py | 0 {codext => src/codext}/common/ordinal.py | 0 .../codext}/compressions/__init__.py | 0 {codext => src/codext}/compressions/gzipp.py | 0 {codext => src/codext}/compressions/lz77.py | 0 {codext => src/codext}/compressions/lz78.py | 0 {codext => src/codext}/compressions/pkzip.py | 0 {codext => src/codext}/crypto/__init__.py | 0 {codext => src/codext}/crypto/affine.py | 0 {codext => src/codext}/crypto/atbash.py | 0 {codext => src/codext}/crypto/bacon.py | 0 {codext => src/codext}/crypto/barbie.py | 0 {codext => src/codext}/crypto/citrix.py | 0 {codext => src/codext}/crypto/railfence.py | 0 {codext => src/codext}/crypto/rot.py | 0 {codext => src/codext}/crypto/scytale.py | 0 {codext => src/codext}/crypto/shift.py | 0 {codext => src/codext}/crypto/xor.py | 0 {codext => src/codext}/hashing/__init__.py | 0 {codext => src/codext}/hashing/blake.py | 0 {codext => src/codext}/hashing/checksums.py | 0 {codext => src/codext}/hashing/crypt.py | 0 {codext => src/codext}/hashing/md.py | 0 {codext => src/codext}/hashing/sha.py | 0 {codext => src/codext}/hashing/shake.py | 0 {codext => src/codext}/languages/__init__.py | 0 {codext => 
src/codext}/languages/braille.py | 0 {codext => src/codext}/languages/galactic.py | 0 {codext => src/codext}/languages/ipsum.py | 0 {codext => src/codext}/languages/leetspeak.py | 0 {codext => src/codext}/languages/morse.py | 0 {codext => src/codext}/languages/navajo.py | 0 {codext => src/codext}/languages/radio.py | 0 {codext => src/codext}/languages/southpark.py | 0 {codext => src/codext}/languages/tap.py | 0 {codext => src/codext}/languages/tomtom.py | 0 {codext => src/codext}/macros.json | 0 {codext => src/codext}/others/__init__.py | 0 {codext => src/codext}/others/dna.py | 0 {codext => src/codext}/others/kbshift.py | 0 {codext => src/codext}/others/letters.py | 0 {codext => src/codext}/others/markdown.py | 0 {codext => src/codext}/others/uuencode.py | 0 {codext => src/codext}/stegano/__init__.py | 0 {codext => src/codext}/stegano/hexagram.py | 0 {codext => src/codext}/stegano/klopf.py | 0 {codext => src/codext}/stegano/resistor.py | 0 {codext => src/codext}/stegano/rick.py | 0 {codext => src/codext}/stegano/sms.py | 0 {codext => src/codext}/stegano/whitespace.py | 0 {codext => src/codext}/web/__init__.py | 0 {codext => src/codext}/web/html.py | 0 {codext => src/codext}/web/url.py | 0 80 files changed, 87 insertions(+), 84 deletions(-) create mode 100644 pyproject.toml delete mode 100644 setup.cfg delete mode 100644 setup.py rename {codext => src/codext}/VERSION.txt (100%) rename {codext => src/codext}/__common__.py (100%) rename {codext => src/codext}/__info__.py (100%) rename {codext => src/codext}/__init__.py (100%) rename {codext => src/codext}/base/__init__.py (100%) rename {codext => src/codext}/base/_base.py (100%) rename {codext => src/codext}/base/_base2n.py (100%) rename {codext => src/codext}/base/base100.py (100%) rename {codext => src/codext}/base/base122.py (100%) rename {codext => src/codext}/base/base45.py (100%) rename {codext => src/codext}/base/base85.py (100%) rename {codext => src/codext}/base/base91.py (100%) rename {codext => 
src/codext}/base/baseN.py (100%) rename {codext => src/codext}/binary/__init__.py (100%) rename {codext => src/codext}/binary/baudot.py (100%) rename {codext => src/codext}/binary/bcd.py (100%) rename {codext => src/codext}/binary/excess3.py (100%) rename {codext => src/codext}/binary/gray.py (100%) rename {codext => src/codext}/binary/manchester.py (100%) rename {codext => src/codext}/binary/rotate.py (100%) rename {codext => src/codext}/common/__init__.py (100%) rename {codext => src/codext}/common/a1z26.py (100%) rename {codext => src/codext}/common/cases.py (100%) rename {codext => src/codext}/common/dummy.py (100%) rename {codext => src/codext}/common/octal.py (100%) rename {codext => src/codext}/common/ordinal.py (100%) rename {codext => src/codext}/compressions/__init__.py (100%) rename {codext => src/codext}/compressions/gzipp.py (100%) rename {codext => src/codext}/compressions/lz77.py (100%) rename {codext => src/codext}/compressions/lz78.py (100%) rename {codext => src/codext}/compressions/pkzip.py (100%) rename {codext => src/codext}/crypto/__init__.py (100%) rename {codext => src/codext}/crypto/affine.py (100%) rename {codext => src/codext}/crypto/atbash.py (100%) rename {codext => src/codext}/crypto/bacon.py (100%) rename {codext => src/codext}/crypto/barbie.py (100%) rename {codext => src/codext}/crypto/citrix.py (100%) rename {codext => src/codext}/crypto/railfence.py (100%) rename {codext => src/codext}/crypto/rot.py (100%) rename {codext => src/codext}/crypto/scytale.py (100%) rename {codext => src/codext}/crypto/shift.py (100%) rename {codext => src/codext}/crypto/xor.py (100%) rename {codext => src/codext}/hashing/__init__.py (100%) rename {codext => src/codext}/hashing/blake.py (100%) rename {codext => src/codext}/hashing/checksums.py (100%) rename {codext => src/codext}/hashing/crypt.py (100%) rename {codext => src/codext}/hashing/md.py (100%) rename {codext => src/codext}/hashing/sha.py (100%) rename {codext => src/codext}/hashing/shake.py 
(100%) rename {codext => src/codext}/languages/__init__.py (100%) rename {codext => src/codext}/languages/braille.py (100%) rename {codext => src/codext}/languages/galactic.py (100%) rename {codext => src/codext}/languages/ipsum.py (100%) rename {codext => src/codext}/languages/leetspeak.py (100%) rename {codext => src/codext}/languages/morse.py (100%) rename {codext => src/codext}/languages/navajo.py (100%) rename {codext => src/codext}/languages/radio.py (100%) rename {codext => src/codext}/languages/southpark.py (100%) rename {codext => src/codext}/languages/tap.py (100%) rename {codext => src/codext}/languages/tomtom.py (100%) rename {codext => src/codext}/macros.json (100%) rename {codext => src/codext}/others/__init__.py (100%) rename {codext => src/codext}/others/dna.py (100%) rename {codext => src/codext}/others/kbshift.py (100%) rename {codext => src/codext}/others/letters.py (100%) rename {codext => src/codext}/others/markdown.py (100%) rename {codext => src/codext}/others/uuencode.py (100%) rename {codext => src/codext}/stegano/__init__.py (100%) rename {codext => src/codext}/stegano/hexagram.py (100%) rename {codext => src/codext}/stegano/klopf.py (100%) rename {codext => src/codext}/stegano/resistor.py (100%) rename {codext => src/codext}/stegano/rick.py (100%) rename {codext => src/codext}/stegano/sms.py (100%) rename {codext => src/codext}/stegano/whitespace.py (100%) rename {codext => src/codext}/web/__init__.py (100%) rename {codext => src/codext}/web/html.py (100%) rename {codext => src/codext}/web/url.py (100%) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..ce377f3 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,87 @@ +[build-system] +requires = ["setuptools>=61.0", "setuptools-scm"] +build-backend = "setuptools.build_meta" + +[tool.setuptools.dynamic] +version = {attr = "codext.__info__.__version__"} + +[tool.setuptools.packages.find] +where = ["src"] + +[project] +name = "codext" +authors = [ + {name="Alexandre 
D'Hondt", email="alexandre.dhondt@gmail.com"}, +] +description = "Native codecs extension" +license = {file = "LICENSE"} +keywords = ["python", "development", "programming", "codecs", "encodings"] +requires-python = ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,<4" +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Environment :: Console", + "Intended Audience :: Developers", + "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)", + "Programming Language :: Python :: 2", + "Programming Language :: Python :: 2.7", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Topic :: Software Development :: Libraries :: Python Modules", +] +dependencies = [ + "markdown2==2.3.10; python_version=='2.7'", + "markdown2>=2.4.0; python_version>='3.6'", + "six", +] +dynamic = ["version"] + +[project.readme] +file = "README.md" +content-type = "text/markdown" + +[project.urls] +documentation = "https://python-codext.readthedocs.io/en/latest/?badge=latest" +homepage = "https://github.com/dhondta/python-codext" +issues = "https://github.com/dhondta/python-codext/issues" +repository = "https://github.com/dhondta/python-codext" + +[project.scripts] +base1 = "codext.base.baseN:main1" +base2 = "codext.base.baseN:main2" +base3 = "codext.base.baseN:main3" +base4 = "codext.base.baseN:main4" +base8 = "codext.base.baseN:main8" +base10 = "codext.base.baseN:main10" +base16 = "codext.base.baseN:main16" +base26 = "codext.base.baseN:main26" +base32 = "codext.base.baseN:main32" +base32-hex = "codext.base.baseN:main32hex" +base32-geohash = "codext.base.baseN:main32geo" +base32-crockford = "codext.base.baseN:main32crk" +base32-z = 
"codext.base.baseN:mainz32" +base36 = "codext.base.baseN:main36" +base45 = "codext.base.base45:main" +base58-bitcoin = "codext.base.baseN:main58bc" +base58-ripple = "codext.base.baseN:main58rp" +base58-flickr = "codext.base.baseN:main58fl" +base62 = "codext.base.baseN:main62" +base63 = "codext.base.baseN:main63" +base64 = "codext.base.baseN:main64" +base64-url = "codext.base.baseN:main64url" +base67 = "codext.base.baseN:main67" +base85 = "codext.base.base85:main85" +base85-adobe = "codext.base.base85:main85adobe" +base85-xbtoa = "codext.base.base85:main85xbtoa" +base85-ipv6 = "codext.base.base85:main85rfc1924" +base85-xml = "codext.base.base85:main85xml" +base85-zeromq = "codext.base.base85:main85zeromq" +base91 = "codext.base.base91:main91" +base100 = "codext.base.base100:main100" +base122 = "codext.base.base122:main122" +codext = "codext.__init__:main" +unbase = "codext.base.__init__:main" diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 958a404..0000000 --- a/setup.cfg +++ /dev/null @@ -1,80 +0,0 @@ -[metadata] -name = codext -version = file: codext/VERSION.txt -author = Alexandre D'Hondt -author-email = alexandre.dhondt@gmail.com -home-page = https://github.com/dhondta/python-codext -description = Native codecs extension -long_description = file: README.md -long_description_content_type = text/markdown -keywords = - python - development - programming - codecs - encodings -license = GPLv3 -license-file = LICENSE -classifier = - Development Status :: 5 - Production/Stable - Environment :: Console - Intended Audience :: Developers - License :: OSI Approved :: GNU General Public License v3 (GPLv3) - Programming Language :: Python :: 2 - Programming Language :: Python :: 2.7 - Programming Language :: Python :: 3 - Programming Language :: Python :: 3.6 - Programming Language :: Python :: 3.7 - Programming Language :: Python :: 3.8 - Programming Language :: Python :: 3.9 - Topic :: Software Development :: Libraries :: Python Modules - -[options] 
-packages = find: -include_package_data = False -install_requires = - markdown2==2.3.10; python_version=='2.7' # rq.filter: >=2.4.0 - markdown2>=2.4.0; python_version>='3.6' - six -setup-requires = setuptools -python-requires = >=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,<4 - -[options.package_data] -* = *.txt,*.json - -[options.entry_points] -console_scripts = - base1 = codext.base.baseN:main1 - base2 = codext.base.baseN:main2 - base3 = codext.base.baseN:main3 - base4 = codext.base.baseN:main4 - base8 = codext.base.baseN:main8 - base10 = codext.base.baseN:main10 - base16 = codext.base.baseN:main16 - base26 = codext.base.baseN:main26 - base32 = codext.base.baseN:main32 - base32-hex = codext.base.baseN:main32hex - base32-geohash = codext.base.baseN:main32geo - base32-crockford = codext.base.baseN:main32crk - base32-z = codext.base.baseN:mainz32 - base36 = codext.base.baseN:main36 - base45 = codext.base.base45:main - base58-bitcoin = codext.base.baseN:main58bc - base58-ripple = codext.base.baseN:main58rp - base58-flickr = codext.base.baseN:main58fl - base62 = codext.base.baseN:main62 - base63 = codext.base.baseN:main63 - base64 = codext.base.baseN:main64 - base64-url = codext.base.baseN:main64url - base67 = codext.base.baseN:main67 - base85 = codext.base.base85:main85 - base85-adobe = codext.base.base85:main85adobe - base85-xbtoa = codext.base.base85:main85xbtoa - base85-ipv6 = codext.base.base85:main85rfc1924 - base85-xml = codext.base.base85:main85xml - base85-zeromq = codext.base.base85:main85zeromq - base91 = codext.base.base91:main91 - base100 = codext.base.base100:main100 - base122 = codext.base.base122:main122 - codext = codext.__init__:main - unbase = codext.base.__init__:main diff --git a/setup.py b/setup.py deleted file mode 100644 index c823345..0000000 --- a/setup.py +++ /dev/null @@ -1,4 +0,0 @@ -#!/usr/bin/env python -from setuptools import setup - -setup() diff --git a/codext/VERSION.txt b/src/codext/VERSION.txt similarity index 100% rename 
from codext/VERSION.txt rename to src/codext/VERSION.txt diff --git a/codext/__common__.py b/src/codext/__common__.py similarity index 100% rename from codext/__common__.py rename to src/codext/__common__.py diff --git a/codext/__info__.py b/src/codext/__info__.py similarity index 100% rename from codext/__info__.py rename to src/codext/__info__.py diff --git a/codext/__init__.py b/src/codext/__init__.py similarity index 100% rename from codext/__init__.py rename to src/codext/__init__.py diff --git a/codext/base/__init__.py b/src/codext/base/__init__.py similarity index 100% rename from codext/base/__init__.py rename to src/codext/base/__init__.py diff --git a/codext/base/_base.py b/src/codext/base/_base.py similarity index 100% rename from codext/base/_base.py rename to src/codext/base/_base.py diff --git a/codext/base/_base2n.py b/src/codext/base/_base2n.py similarity index 100% rename from codext/base/_base2n.py rename to src/codext/base/_base2n.py diff --git a/codext/base/base100.py b/src/codext/base/base100.py similarity index 100% rename from codext/base/base100.py rename to src/codext/base/base100.py diff --git a/codext/base/base122.py b/src/codext/base/base122.py similarity index 100% rename from codext/base/base122.py rename to src/codext/base/base122.py diff --git a/codext/base/base45.py b/src/codext/base/base45.py similarity index 100% rename from codext/base/base45.py rename to src/codext/base/base45.py diff --git a/codext/base/base85.py b/src/codext/base/base85.py similarity index 100% rename from codext/base/base85.py rename to src/codext/base/base85.py diff --git a/codext/base/base91.py b/src/codext/base/base91.py similarity index 100% rename from codext/base/base91.py rename to src/codext/base/base91.py diff --git a/codext/base/baseN.py b/src/codext/base/baseN.py similarity index 100% rename from codext/base/baseN.py rename to src/codext/base/baseN.py diff --git a/codext/binary/__init__.py b/src/codext/binary/__init__.py similarity index 100% 
rename from codext/binary/__init__.py rename to src/codext/binary/__init__.py diff --git a/codext/binary/baudot.py b/src/codext/binary/baudot.py similarity index 100% rename from codext/binary/baudot.py rename to src/codext/binary/baudot.py diff --git a/codext/binary/bcd.py b/src/codext/binary/bcd.py similarity index 100% rename from codext/binary/bcd.py rename to src/codext/binary/bcd.py diff --git a/codext/binary/excess3.py b/src/codext/binary/excess3.py similarity index 100% rename from codext/binary/excess3.py rename to src/codext/binary/excess3.py diff --git a/codext/binary/gray.py b/src/codext/binary/gray.py similarity index 100% rename from codext/binary/gray.py rename to src/codext/binary/gray.py diff --git a/codext/binary/manchester.py b/src/codext/binary/manchester.py similarity index 100% rename from codext/binary/manchester.py rename to src/codext/binary/manchester.py diff --git a/codext/binary/rotate.py b/src/codext/binary/rotate.py similarity index 100% rename from codext/binary/rotate.py rename to src/codext/binary/rotate.py diff --git a/codext/common/__init__.py b/src/codext/common/__init__.py similarity index 100% rename from codext/common/__init__.py rename to src/codext/common/__init__.py diff --git a/codext/common/a1z26.py b/src/codext/common/a1z26.py similarity index 100% rename from codext/common/a1z26.py rename to src/codext/common/a1z26.py diff --git a/codext/common/cases.py b/src/codext/common/cases.py similarity index 100% rename from codext/common/cases.py rename to src/codext/common/cases.py diff --git a/codext/common/dummy.py b/src/codext/common/dummy.py similarity index 100% rename from codext/common/dummy.py rename to src/codext/common/dummy.py diff --git a/codext/common/octal.py b/src/codext/common/octal.py similarity index 100% rename from codext/common/octal.py rename to src/codext/common/octal.py diff --git a/codext/common/ordinal.py b/src/codext/common/ordinal.py similarity index 100% rename from codext/common/ordinal.py rename 
to src/codext/common/ordinal.py diff --git a/codext/compressions/__init__.py b/src/codext/compressions/__init__.py similarity index 100% rename from codext/compressions/__init__.py rename to src/codext/compressions/__init__.py diff --git a/codext/compressions/gzipp.py b/src/codext/compressions/gzipp.py similarity index 100% rename from codext/compressions/gzipp.py rename to src/codext/compressions/gzipp.py diff --git a/codext/compressions/lz77.py b/src/codext/compressions/lz77.py similarity index 100% rename from codext/compressions/lz77.py rename to src/codext/compressions/lz77.py diff --git a/codext/compressions/lz78.py b/src/codext/compressions/lz78.py similarity index 100% rename from codext/compressions/lz78.py rename to src/codext/compressions/lz78.py diff --git a/codext/compressions/pkzip.py b/src/codext/compressions/pkzip.py similarity index 100% rename from codext/compressions/pkzip.py rename to src/codext/compressions/pkzip.py diff --git a/codext/crypto/__init__.py b/src/codext/crypto/__init__.py similarity index 100% rename from codext/crypto/__init__.py rename to src/codext/crypto/__init__.py diff --git a/codext/crypto/affine.py b/src/codext/crypto/affine.py similarity index 100% rename from codext/crypto/affine.py rename to src/codext/crypto/affine.py diff --git a/codext/crypto/atbash.py b/src/codext/crypto/atbash.py similarity index 100% rename from codext/crypto/atbash.py rename to src/codext/crypto/atbash.py diff --git a/codext/crypto/bacon.py b/src/codext/crypto/bacon.py similarity index 100% rename from codext/crypto/bacon.py rename to src/codext/crypto/bacon.py diff --git a/codext/crypto/barbie.py b/src/codext/crypto/barbie.py similarity index 100% rename from codext/crypto/barbie.py rename to src/codext/crypto/barbie.py diff --git a/codext/crypto/citrix.py b/src/codext/crypto/citrix.py similarity index 100% rename from codext/crypto/citrix.py rename to src/codext/crypto/citrix.py diff --git a/codext/crypto/railfence.py 
b/src/codext/crypto/railfence.py similarity index 100% rename from codext/crypto/railfence.py rename to src/codext/crypto/railfence.py diff --git a/codext/crypto/rot.py b/src/codext/crypto/rot.py similarity index 100% rename from codext/crypto/rot.py rename to src/codext/crypto/rot.py diff --git a/codext/crypto/scytale.py b/src/codext/crypto/scytale.py similarity index 100% rename from codext/crypto/scytale.py rename to src/codext/crypto/scytale.py diff --git a/codext/crypto/shift.py b/src/codext/crypto/shift.py similarity index 100% rename from codext/crypto/shift.py rename to src/codext/crypto/shift.py diff --git a/codext/crypto/xor.py b/src/codext/crypto/xor.py similarity index 100% rename from codext/crypto/xor.py rename to src/codext/crypto/xor.py diff --git a/codext/hashing/__init__.py b/src/codext/hashing/__init__.py similarity index 100% rename from codext/hashing/__init__.py rename to src/codext/hashing/__init__.py diff --git a/codext/hashing/blake.py b/src/codext/hashing/blake.py similarity index 100% rename from codext/hashing/blake.py rename to src/codext/hashing/blake.py diff --git a/codext/hashing/checksums.py b/src/codext/hashing/checksums.py similarity index 100% rename from codext/hashing/checksums.py rename to src/codext/hashing/checksums.py diff --git a/codext/hashing/crypt.py b/src/codext/hashing/crypt.py similarity index 100% rename from codext/hashing/crypt.py rename to src/codext/hashing/crypt.py diff --git a/codext/hashing/md.py b/src/codext/hashing/md.py similarity index 100% rename from codext/hashing/md.py rename to src/codext/hashing/md.py diff --git a/codext/hashing/sha.py b/src/codext/hashing/sha.py similarity index 100% rename from codext/hashing/sha.py rename to src/codext/hashing/sha.py diff --git a/codext/hashing/shake.py b/src/codext/hashing/shake.py similarity index 100% rename from codext/hashing/shake.py rename to src/codext/hashing/shake.py diff --git a/codext/languages/__init__.py b/src/codext/languages/__init__.py similarity 
index 100% rename from codext/languages/__init__.py rename to src/codext/languages/__init__.py diff --git a/codext/languages/braille.py b/src/codext/languages/braille.py similarity index 100% rename from codext/languages/braille.py rename to src/codext/languages/braille.py diff --git a/codext/languages/galactic.py b/src/codext/languages/galactic.py similarity index 100% rename from codext/languages/galactic.py rename to src/codext/languages/galactic.py diff --git a/codext/languages/ipsum.py b/src/codext/languages/ipsum.py similarity index 100% rename from codext/languages/ipsum.py rename to src/codext/languages/ipsum.py diff --git a/codext/languages/leetspeak.py b/src/codext/languages/leetspeak.py similarity index 100% rename from codext/languages/leetspeak.py rename to src/codext/languages/leetspeak.py diff --git a/codext/languages/morse.py b/src/codext/languages/morse.py similarity index 100% rename from codext/languages/morse.py rename to src/codext/languages/morse.py diff --git a/codext/languages/navajo.py b/src/codext/languages/navajo.py similarity index 100% rename from codext/languages/navajo.py rename to src/codext/languages/navajo.py diff --git a/codext/languages/radio.py b/src/codext/languages/radio.py similarity index 100% rename from codext/languages/radio.py rename to src/codext/languages/radio.py diff --git a/codext/languages/southpark.py b/src/codext/languages/southpark.py similarity index 100% rename from codext/languages/southpark.py rename to src/codext/languages/southpark.py diff --git a/codext/languages/tap.py b/src/codext/languages/tap.py similarity index 100% rename from codext/languages/tap.py rename to src/codext/languages/tap.py diff --git a/codext/languages/tomtom.py b/src/codext/languages/tomtom.py similarity index 100% rename from codext/languages/tomtom.py rename to src/codext/languages/tomtom.py diff --git a/codext/macros.json b/src/codext/macros.json similarity index 100% rename from codext/macros.json rename to src/codext/macros.json 
diff --git a/codext/others/__init__.py b/src/codext/others/__init__.py similarity index 100% rename from codext/others/__init__.py rename to src/codext/others/__init__.py diff --git a/codext/others/dna.py b/src/codext/others/dna.py similarity index 100% rename from codext/others/dna.py rename to src/codext/others/dna.py diff --git a/codext/others/kbshift.py b/src/codext/others/kbshift.py similarity index 100% rename from codext/others/kbshift.py rename to src/codext/others/kbshift.py diff --git a/codext/others/letters.py b/src/codext/others/letters.py similarity index 100% rename from codext/others/letters.py rename to src/codext/others/letters.py diff --git a/codext/others/markdown.py b/src/codext/others/markdown.py similarity index 100% rename from codext/others/markdown.py rename to src/codext/others/markdown.py diff --git a/codext/others/uuencode.py b/src/codext/others/uuencode.py similarity index 100% rename from codext/others/uuencode.py rename to src/codext/others/uuencode.py diff --git a/codext/stegano/__init__.py b/src/codext/stegano/__init__.py similarity index 100% rename from codext/stegano/__init__.py rename to src/codext/stegano/__init__.py diff --git a/codext/stegano/hexagram.py b/src/codext/stegano/hexagram.py similarity index 100% rename from codext/stegano/hexagram.py rename to src/codext/stegano/hexagram.py diff --git a/codext/stegano/klopf.py b/src/codext/stegano/klopf.py similarity index 100% rename from codext/stegano/klopf.py rename to src/codext/stegano/klopf.py diff --git a/codext/stegano/resistor.py b/src/codext/stegano/resistor.py similarity index 100% rename from codext/stegano/resistor.py rename to src/codext/stegano/resistor.py diff --git a/codext/stegano/rick.py b/src/codext/stegano/rick.py similarity index 100% rename from codext/stegano/rick.py rename to src/codext/stegano/rick.py diff --git a/codext/stegano/sms.py b/src/codext/stegano/sms.py similarity index 100% rename from codext/stegano/sms.py rename to src/codext/stegano/sms.py 
diff --git a/codext/stegano/whitespace.py b/src/codext/stegano/whitespace.py similarity index 100% rename from codext/stegano/whitespace.py rename to src/codext/stegano/whitespace.py diff --git a/codext/web/__init__.py b/src/codext/web/__init__.py similarity index 100% rename from codext/web/__init__.py rename to src/codext/web/__init__.py diff --git a/codext/web/html.py b/src/codext/web/html.py similarity index 100% rename from codext/web/html.py rename to src/codext/web/html.py diff --git a/codext/web/url.py b/src/codext/web/url.py similarity index 100% rename from codext/web/url.py rename to src/codext/web/url.py From 7c79988bf8aa267bbcb177b78391d41ac5f7db3c Mon Sep 17 00:00:00 2001 From: dhondta Date: Sun, 12 Feb 2023 13:53:02 +0100 Subject: [PATCH 68/97] Updated the documentation --- .readthedocs.yml | 8 ++-- README.md | 15 +++---- docs/coverage.svg | 1 + docs/js/collapsible-navbar.js | 54 ----------------------- docs/mkdocs.yml | 55 ++++++++++++++++++++++++ docs/{ => pages}/cli.md | 0 docs/{ => pages}/demos/using-bases.gif | Bin docs/{ => pages}/demos/using-codext.gif | Bin docs/{ => pages}/demos/using-debase.gif | Bin docs/{ => pages}/enc/base.md | 0 docs/{ => pages}/enc/binary.md | 0 docs/{ => pages}/enc/common.md | 0 docs/{ => pages}/enc/compressions.md | 0 docs/{ => pages}/enc/crypto.md | 0 docs/{ => pages}/enc/hashing.md | 0 docs/{ => pages}/enc/languages.md | 0 docs/{ => pages}/enc/others.md | 0 docs/{ => pages}/enc/stegano.md | 0 docs/{ => pages}/enc/web.md | 0 docs/{ => pages}/features.md | 0 docs/{ => pages}/guessing.md | 0 docs/{ => pages}/howto.md | 0 docs/{imgs => pages/img}/banner.png | Bin docs/pages/img/icon.png | Bin 0 -> 23561 bytes docs/{imgs => pages/img}/logo.png | Bin docs/{ => pages}/index.md | 0 docs/{ => pages}/manipulations.md | 0 docs/requirements.txt | 6 +++ mkdocs.yml | 31 ------------- 29 files changed, 74 insertions(+), 96 deletions(-) create mode 100644 docs/coverage.svg delete mode 100644 docs/js/collapsible-navbar.js create 
mode 100644 docs/mkdocs.yml rename docs/{ => pages}/cli.md (100%) rename docs/{ => pages}/demos/using-bases.gif (100%) rename docs/{ => pages}/demos/using-codext.gif (100%) rename docs/{ => pages}/demos/using-debase.gif (100%) rename docs/{ => pages}/enc/base.md (100%) rename docs/{ => pages}/enc/binary.md (100%) rename docs/{ => pages}/enc/common.md (100%) rename docs/{ => pages}/enc/compressions.md (100%) rename docs/{ => pages}/enc/crypto.md (100%) rename docs/{ => pages}/enc/hashing.md (100%) rename docs/{ => pages}/enc/languages.md (100%) rename docs/{ => pages}/enc/others.md (100%) rename docs/{ => pages}/enc/stegano.md (100%) rename docs/{ => pages}/enc/web.md (100%) rename docs/{ => pages}/features.md (100%) rename docs/{ => pages}/guessing.md (100%) rename docs/{ => pages}/howto.md (100%) rename docs/{imgs => pages/img}/banner.png (100%) create mode 100644 docs/pages/img/icon.png rename docs/{imgs => pages/img}/logo.png (100%) rename docs/{ => pages}/index.md (100%) rename docs/{ => pages}/manipulations.md (100%) create mode 100644 docs/requirements.txt delete mode 100644 mkdocs.yml diff --git a/.readthedocs.yml b/.readthedocs.yml index e8f4e71..0e991f8 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -1,6 +1,8 @@ version: 2 + mkdocs: - configuration: mkdocs.yml -formats: all + configuration: docs/mkdocs.yml + python: - version: 3.6 + install: + - requirements: docs/requirements.txt diff --git a/README.md b/README.md index 2ce70be..35aa6c2 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,12 @@ -

+

CodExt Tweet

Encode/decode anything.

[![PyPi](https://img.shields.io/pypi/v/codext.svg)](https://pypi.python.org/pypi/codext/) [![Read The Docs](https://readthedocs.org/projects/python-codext/badge/?version=latest)](https://python-codext.readthedocs.io/en/latest/?badge=latest) -[![Build Status](https://travis-ci.com/dhondta/python-codext.svg?branch=master)](https://travis-ci.com/dhondta/python-codext) -[![Coverage Status](https://coveralls.io/repos/github/dhondta/python-codext/badge.svg?branch=master)](https://coveralls.io/github/dhondta/python-codext?branch=master) +[![Build Status](https://github.com/dhondta/python-codext/actions/workflows/python-package.yml/badge.svg)](https://github.com/dhondta/python-codext/actions/workflows/python-package.yml) +[![Coverage Status](https://raw.githubusercontent.com/dhondta/python-codext/main/docs/coverage.svg)](#) [![Python Versions](https://img.shields.io/pypi/pyversions/codext.svg)](https://pypi.python.org/pypi/codext/) -[![Requirements Status](https://requires.io/github/dhondta/python-codext/requirements.svg?branch=master)](https://requires.io/github/dhondta/python-codext/requirements/?branch=master) [![Known Vulnerabilities](https://snyk.io/test/github/dhondta/python-codext/badge.svg?targetFile=requirements.txt)](https://snyk.io/test/github/dhondta/python-codext?targetFile=requirements.txt) [![DOI](https://zenodo.org/badge/236679865.svg)](https://zenodo.org/badge/latestdoi/236679865) [![License](https://img.shields.io/pypi/l/codext.svg)](https://pypi.python.org/pypi/codext/) @@ -20,13 +19,13 @@ $ pip install codext Want to contribute a new codec ? | Want to contribute a new macro ? :----------------------------------:|:------------------------------------: -Check the [documentation](https://python-codext.readthedocs.io/en/latest/howto.html) first
Then [PR](https://github.com/dhondta/python-codext/pulls) your new codec | [PR](https://github.com/dhondta/python-codext/pulls) your updated version of [`macros.json`](https://github.com/dhondta/python-codext/blob/master/codext/macros.json) +Check the [documentation](https://python-codext.readthedocs.io/en/latest/howto.html) first
Then [PR](https://github.com/dhondta/python-codext/pulls) your new codec | [PR](https://github.com/dhondta/python-codext/pulls) your updated version of [`macros.json`](https://github.com/dhondta/python-codext/blob/main/codext/macros.json) ## :mag: Demonstrations -

Using CodExt from the command line

-

Using base tools from the command line

-

Using the unbase command line tool

+

Using CodExt from the command line

+

Using base tools from the command line

+

Using the unbase command line tool

## :computer: Usage (main CLI tool) Tweet on codext diff --git a/docs/coverage.svg b/docs/coverage.svg new file mode 100644 index 0000000..bde433b --- /dev/null +++ b/docs/coverage.svg @@ -0,0 +1 @@ +coverage: 99.53%coverage99.53% \ No newline at end of file diff --git a/docs/js/collapsible-navbar.js b/docs/js/collapsible-navbar.js deleted file mode 100644 index b1e1593..0000000 --- a/docs/js/collapsible-navbar.js +++ /dev/null @@ -1,54 +0,0 @@ -String.prototype.format = function() { - a = this; - for (k in arguments) { - a = a.replace("{" + k + "}", arguments[k]) - } - return a -} - -$(document).ready(function () { - $('li.toctree-l1').each(function () { - var parent = $(this); - var span = parent.find('span:first'); - var sibling = null; - var remove = true; - $('li.toctree-l1').each(function() { - var a = $(this).find('a:first'); - if (a.text() != '' && a.text() == span.text()) { - parent.prepend(a); - span.remove(); - span = a; - if ($(this).hasClass('current')) parent.addClass('current'); - sibling = $(this); - return false - } - }); - if (sibling === null && parent.find('ul.subnav:not(li.toctree-l2)').children('li').length) { - sibling = parent; - remove = false; - } - if (sibling !== null) { - var ul = parent.find('ul.subnav:not(li.toctree-l2)'); - var new_a = ''; - if (!ul.children('li.current').length && !parent.hasClass('current')) { - ul.hide(); - $(new_a.format("left")).insertBefore(span); - } else { - $(new_a.format("down")).insertBefore(span); - } - if (remove) sibling.remove(); - } - }); - $('a.collapse-navbar').click(function () { - var parent = $(this).closest('li.toctree-l1'); - var subnav = parent.find('ul.subnav:not(li.toctree-l2)'); - if ($(this).hasClass('fa-caret-left')) { - subnav.show(); - $(this).removeClass('fa-caret-left'); - $(this).addClass('fa-caret-down'); - } else { - subnav.hide(); - $(this).addClass('fa-caret-left'); - $(this).removeClass('fa-caret-down'); - } -});}); diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml new file mode 
100644 index 0000000..a39ccb0 --- /dev/null +++ b/docs/mkdocs.yml @@ -0,0 +1,55 @@ +site_author: dhondta +site_name: "Codext - Extension of native codecs for Python" +repo_url: https://github.com/dhondta/python-codext +copyright: Copyright © 2021-2023 Alexandre D'Hondt +docs_dir: pages +nav: + - Introduction: index.md + - Features: features.md + - 'Guess mode': guessing.md + - Encodings: + - Base: enc/base.md + - Binary: enc/binary.md + - Common: enc/common.md + - Compressions: enc/compressions.md + - Cryptography: enc/crypto.md + - Hashing: enc/hashing.md + - Languages: enc/languages.md + - Others: enc/others.md + - Steganography: enc/stegano.md + - 'String manipulations': manipulations.md + - 'CLI tool': cli.md + - 'Create your codec': howto.md +extra: + generator: false + social: + - icon: fontawesome/solid/paper-plane + link: mailto:alexandre.dhondt@gmail.com + name: Contact Alex + - icon: fontawesome/brands/github + link: https://github.com/dhondta + name: Alex on GitHub + - icon: fontawesome/brands/linkedin + link: https://www.linkedin.com/in/alexandre-d-2ab2aa14/ + name: Alex on LinkedIn + - icon: fontawesome/brands/twitter + link: https://twitter.com/alex_dhondt + name: Alex on Twitter +theme: + name: material + palette: + - scheme: default + toggle: + icon: material/brightness-7 + name: Switch to dark mode + - scheme: slate + toggle: + icon: material/brightness-4 + name: Switch to light mode + logo: img/logo.png + favicon: img/icon.png +use_directory_urls: false +markdown_extensions: + - toc: + permalink: true + - admonition diff --git a/docs/cli.md b/docs/pages/cli.md similarity index 100% rename from docs/cli.md rename to docs/pages/cli.md diff --git a/docs/demos/using-bases.gif b/docs/pages/demos/using-bases.gif similarity index 100% rename from docs/demos/using-bases.gif rename to docs/pages/demos/using-bases.gif diff --git a/docs/demos/using-codext.gif b/docs/pages/demos/using-codext.gif similarity index 100% rename from docs/demos/using-codext.gif 
rename to docs/pages/demos/using-codext.gif diff --git a/docs/demos/using-debase.gif b/docs/pages/demos/using-debase.gif similarity index 100% rename from docs/demos/using-debase.gif rename to docs/pages/demos/using-debase.gif diff --git a/docs/enc/base.md b/docs/pages/enc/base.md similarity index 100% rename from docs/enc/base.md rename to docs/pages/enc/base.md diff --git a/docs/enc/binary.md b/docs/pages/enc/binary.md similarity index 100% rename from docs/enc/binary.md rename to docs/pages/enc/binary.md diff --git a/docs/enc/common.md b/docs/pages/enc/common.md similarity index 100% rename from docs/enc/common.md rename to docs/pages/enc/common.md diff --git a/docs/enc/compressions.md b/docs/pages/enc/compressions.md similarity index 100% rename from docs/enc/compressions.md rename to docs/pages/enc/compressions.md diff --git a/docs/enc/crypto.md b/docs/pages/enc/crypto.md similarity index 100% rename from docs/enc/crypto.md rename to docs/pages/enc/crypto.md diff --git a/docs/enc/hashing.md b/docs/pages/enc/hashing.md similarity index 100% rename from docs/enc/hashing.md rename to docs/pages/enc/hashing.md diff --git a/docs/enc/languages.md b/docs/pages/enc/languages.md similarity index 100% rename from docs/enc/languages.md rename to docs/pages/enc/languages.md diff --git a/docs/enc/others.md b/docs/pages/enc/others.md similarity index 100% rename from docs/enc/others.md rename to docs/pages/enc/others.md diff --git a/docs/enc/stegano.md b/docs/pages/enc/stegano.md similarity index 100% rename from docs/enc/stegano.md rename to docs/pages/enc/stegano.md diff --git a/docs/enc/web.md b/docs/pages/enc/web.md similarity index 100% rename from docs/enc/web.md rename to docs/pages/enc/web.md diff --git a/docs/features.md b/docs/pages/features.md similarity index 100% rename from docs/features.md rename to docs/pages/features.md diff --git a/docs/guessing.md b/docs/pages/guessing.md similarity index 100% rename from docs/guessing.md rename to docs/pages/guessing.md 
diff --git a/docs/howto.md b/docs/pages/howto.md similarity index 100% rename from docs/howto.md rename to docs/pages/howto.md diff --git a/docs/imgs/banner.png b/docs/pages/img/banner.png similarity index 100% rename from docs/imgs/banner.png rename to docs/pages/img/banner.png diff --git a/docs/pages/img/icon.png b/docs/pages/img/icon.png new file mode 100644 index 0000000000000000000000000000000000000000..da3cb311f8063334794b119e1e4e76e86f73407f GIT binary patch literal 23561 zcmYhh2|QG9_%@EBgchk#DWbGsHZ!9Ivu|dfF-t`=X2$F@W}it#l=h^Bw9rN>(V~S= zX^&`Ava}Z|^i7+R_xSzZ_w)bzjB}oIpXa%s{l2dIKF>x5jo>xLe~goplNX7IWdZlS z!~dh*fa~IAd6iC16H84vzDZ+~Db!M@5Gdw|cG8uGQAPLAd zI*CH2kV^jd9Rvh{hJheqUucF9s5TGEG4+e)o z(LgFrp*7opyA26Xgu~_nDU@EXmhz-xioz5E!AC+7;ZXQ+5ekt*jSGQbfV5hnk^(n^ zG)bi!E+R3AtlgoEM3FaE!t;K9&vc=&LC z*dm$8p!omQHY}8yM9Kf#LA`=$Ad`$ng*Xx~4-?`n7zNOY;XD?p!KeU6<$uQL|J6PG z!erM=|5qxJS`{H=4Uk3ge}_-XAxE*qa;=y{Wg_Ti5nmK#{$K4{ z5zvGGQOy8GFbw>^N)YjCp*RGBW>AtWFq4#n!l1FqgeXA5W2tcpiUP!9>Eo2TNI@LJ z0%*d+qj*{hNN0;;Fp^Yep(c)m!>f1@8xJos!_5e|S)k|3KzK+bh71*xsY!4Q$p}TG zQg}2d2PiIApg?E=!=e-e^M%GSP<)aU!G@6-AcY8{1?t74$#OD+OJzcVxxwo!$y}NY zufhst3Vx&{UW5aLG72G*A1`C@kvxUjB2O{#iAp5egpP`WD(F%1lw>d2BPJ{hQScnUemC>#X?GejD7e2!RV zl$d#X6c(5mGu|fFnn8AgiGsvJ;lKpgK}ZaTOe7E>be$$rqDewaVaY&6W&%|XXo6yB zQCuX;Y-fYOJWG z)FQ2bO9c#(l!QoeB$3QQu0tSY0S!e9@hJ!hmy48OC|F4fpKGKMBB=~|6pTYj29f0y zv=xJw+5}W2kXe97!H{Yi)hu;j!D>Drfy?b?HlK+yS=cn4L}|i-ttbiu$%#i;wOE5l zL=~xYT$D7*NR1*1;PG<1j_FX~QzGN}2&EXI74aMzBf^N2z(kmMoty)Kh?o>>GMdID z3t4PB2g-w4Z8$oW$Rq$$L&V!G8o=n{RYDETu8^Z7Oc;scpxccS3@8~2AsSV*WG;!O zfJm7#l#s*_86k4B5sgV9(=BwYQYe!|CD}+|nhh#JQ-OUFhquDHc%B4}vXFqzU=5L6 zI!VIUz_0?kjSI3H;zaQxom4;p>a<8$27*|LP2q#&GOP)THv%O9If0Ojb65ygiJigV z%C$1ETq!_{MGPH5s)&p-%eZiC3K@aWB&X1A1|by#vS~H8;gN?*Bup$($pn%~S}`<< z;6RB|Yz!+&Du7CqNk+a=p~cu7EHVTy784{k64C;~JJig0ff=Vu(NgsYV?2UMSLjqq zg;}mvDEKBV0%Xwe3=Aesj*Pd-#1JY=C6-ck1QH4h!7>pzf>t6o8j=Khp41@IBPddV zAzsW9s;Pn~7#HV&;7J;)G?GTK&?B*Qc9g`X!<0s?MhXjnEIQ!7=9 
zG<+qKiUq~7p+vEkqR<+UmUxE&os8ngA(a@72Iv{ypi>ivJE)|w1Oy6Am?Wp+Ax1Ub zfsw#8DH?hth=&*QmE0(Zf`#J)TNKUU+F)@C42WV?66j2x@k#_vLeV0XXtYJ4!%7W+SOAqK ztJox-lx|Lz((Td|hzdJnp1B)V5;mB|p4kE(PL=-ljjnv}}@eCnG zh_<23Bt4gBAaLkF&tTSJm|rvKa_HB_)olVG7h-1qgzb z(FKZljRKF01WRQE3|%1N#qus0%;;6kT8N+4x)n*uq3Jwq*23gBn3eM z*T?H6nn*Ps%e08&EC5{4*bplg55*EpXee1^MJD4B9GOMK2apbJLxOdxWID%Alo*U$ zlU);!M>x=6Br*k>qOj?d60ie>Mga|s0>SKTHVF!*lF$SqTSNwHnRcU{gcIn|Tnd>c z)+8asXpWuAv^GqNNlS`LCP%;}1q%>m{u5nKibMKL4}yEp{|l7qnzCIb^6CruKm!3I230CW^<(ST`ukk(?;k&IM_CPl6g zXmMZ-7-WhY##WvJOhWJsdY;;BHM58cyNZD$5%i=KD_;_4Vv)3FseuBC!jZH%tq?G6 zQAuJR7N{Iefx)7578%c?q#7{EMy$Zjr11bAiYCZK8eY6ikxbwb`2c~!qlHXUio=Y8 zve*C};tP3f8Jg%28sexJjGb)2MLCQtg#)G)06VsyJk@iH;MPtoBG1LTOebsW_y`U>D)Uk;FK@ ziKUVnqbwkJGQp&PQB-`1Qiu~%a5g4CQqIv5M0^;`1lLH|ktCHJLUk}fTrnb2&67y= z4nBj9!t2=@u13feV+C5F%EUy;^f*2a#l$Pj#7H?^7$<-MBo@sAX#`}Q3M7o;u;SHb zsVI`pAth;si87f=#Zf>+t|A_*fRhDOwMK}*YV{Z)A&whwOk(S!kUX{k02{PQK1}IU za0G=T#WGBkNL7?bB1KT3Xf*;@iEulHK-7}08l?g&r$H=gv6==-(pdx=i9-Q3+8_!( zooQfNxlsZk!UESTR2qvz1>>kz<&j3VHYMI6QSsv0 zd^L&@9|y1>$S`|CBZy|Z0kAh{sGLp_D^qmDcq5XGRV(#+f`o5mOLdrdKAH|T!B|35 z9Nvxw$!Tx`mTEC!S#*U}PSs+Be7YS_z-L;`WU>yUV&PQU6b1mPL^u?w zwxFym(54()LOm(Z&j1+4lu_7w{hdO_Bf+M5y=8UEHEe}iX{*cU~mOM zc9am4K!A2oF)Y|Hb4^wv2oek5D#xdoq>&su&z=;CP2$Ru04B$G7$^vu%@hTVmuMs! 
zGv8w2fYDNr1j;~D2yuiIrUV&p2k|TfhD3*F7?{b)OhP17Wkc`+^mCj}~VaO`3gRaM@@n$GdNrXex++m|HQ^G_# zHj~LjS|xD=CM(&jHE{4eFm_Kg+Q6clFm?h_W7liw5T=$$Ls3L(IbI}EVEA!R zBQOU5U?k%8nDz$siFvPEE!q zE2vNe6h(${2uFw);P^&{M4}hWNTTFJwx$b5&0sy7-{!yx4l zV~CLw6FX9k5nzyd15Sa7jFf{(w0J_K#EPKXD0H113wU}dWT9CJXYjCM2n@`p090BS zhfyO`4zh(S1eiHGN*d3{BX~#(hX6%jK~w@yz#}HxpxPuWoM}%=f#_hEIKY!aN3j)3 zehLyK7OQm-z?fiFR*aF)2AYT>CXxAahdRX+8Arpj*f6V7CKHJD94N|0j6|V_5gdiZ z0}LLYLgeVkNdi6uM20)KCIK;pM>1f*N_~=%7a6CQDd;o=MPr8W4P3lh!;BPD1;SxJ z(w3CMAR$C*DM<~+66_cYLPG`iInk~~4%0=ENry6H=`^;?1m!}>#Q3Cm2;zTUuA@0SoE%Hnf=z(6z&j|kq$r{T%B64^b{tBeK?;Tm7Me*TTI~2FundTt$SiUs zQcDH6Gf;q%gpwoVj>sq&FcxSEhRKSuAtD`okQjw9iseWul!OAQfQ(>^nK4ZFOn{ds z19alCQXGW}3y< z23$b^Q#63P022>;e`dMGmTVq&{V`ZDA52HH^k|+K1Xltp1FbY+9L#vC87UFO%K&J` zV?Y?FOeIB099RxA1x}BYi;*c7rCo_5S)+u*;H|~U)DkLdc%-=?tIC3u;IzaP31FDe zDw$1dOX667xCx+2nnXuYWlSha3Zq#ifMD3wmKf<$9)kphbL`NlNUK=QATfdI!owX% zs$DKd>6kLB#b$`J+3{jr91M%3Ab|eiv05~TkHph-0D|Gs9GO)CtPp^u;W;|o@LG`} z06L5(f|SD#Sc?tM6b#QD4iPoXa8%Oas$q60kD;Q2hB*x&wAeKgfM25HnE)IO3+OT? 
z;7Q{xRNx}UD4_`JaKvZ$H@pe{U$p3dLPx;!|BcK+afq0aWll~rok&;|$CmKrz$hF0 z{lL|iJCBln9#o`o&v@K; zKPjd~YKm<=y0Tc25BnX{`)q&z{+&k13sYb1&y@$G$%b3jns4|2ta`akZ1B#52Q?=dq5QYko{hT_5B%kSeGYLV z-fS$N0Zo1ow~HA3;KG&M5_WNPNbdc&dAD!fx^-ggG$bhbXHR>TlT%syaeRwm536Lq z7UWxVGYO%2VG5o*w`+`(r>j$ZvHEiJ&^G%Dg464pwMU%Fp9OTP*Tx-U_*y~x zFjtbCY$TMl*}k2c%PzXHZvFZn@5m*q{`^>#d+^|c0WNJXG+qZ7oG$vq?<+Euc zHYh2f>cok8y8C;=UZ#ZHJ=Yk$sJ*>?$(@ITL?UrR_p`E>JKLPcf68L!ln-oSwKjjf z`e*+ltG4${((MhV!}}O9zqVDReAeU9>0isY#yIa%o^)DK5mw&f;#X6yJ8|N~4b!Mf zQc$#DC*gtlB}$ ztPP5IP|>ihvA({4&s~Xafro9{cx}w*sL8$ecYSIe5FKj#v1Js)CvsMB@W|UwpJWbP zvz}VFTGI7>;A{QQvy!_x;4^2={23Rpc=X)4bM z$mv64!MG8l#vYA%apa*~rceJhL+%xm9q*#y6N#h$-HB^BxEkbqYp+LoT`jwq=6)vg zr+-B7)6AKeufOLcHE;v`{C@x4IC0Luo3p0dLK=r+Roo_9$gw4bbA86_ANvthg1&n} zd~-~D%eiF1RPbA{zzO8}GjN&rx|vlsoIVFQz0!i7UVjoa(x0|HzIegsT^U6WW>d5F z6wx*vcC5?HEO#s4N6-Y#W zGvP;?tk*#}#OOmQdi_f;$DSjI-y-)-*KH35|Lgm8tL|WA{;b&65l*sw(^G5m^73BJ zn?HYwz7P#c2F1K3Lzb0O48rDgnP|bCZ(ae5e^qbUvSmipRLHV|!2L(Tfpup7Y^PUiL?0g2v@UnX=FYv#*tBj$?C-lf=j;U5R^FU!V+^U}xiKWM z?veR2G4$i1*Wr&3gj|IjdD1zvSH8!wQir{1vJ?bXh& z#)otF(v=|Ja8<<%(e#C;Q0eN9_-6T!<2yc&+)97w_3ZTW)+ptZ1^3#FF*Di zbPb4xnY}OM-yc+_=bt6f&0e>xmiMmI1(Rn?+K5dE+A+FsyB4xny?fujR?)yqWyQj? 
zMMbibYb#dxUBU%1f#wo&_zU8l(*m57Dav`Fp}%Nl&#Yiy&$h(Emec2=5F^68UhJC7 z?g5O(f)~GU&Dl!sl+(O`)2E)GDvHLgbvb@VJAKI>GIrNKJ~kbTPLG@C zwl^;4^_w?)-hKRXIOp!CQXOFTwYzukzTfXyB5<0eUU*<^zvErgvg*L{ZP$~&yf7c8 zGQ)ElD^O=&oa|WfMZT_CE)VQ{SsSxzC1ie(-v_U;jLVmib2efhZMm;FrvJqW z@9b92`eXSnBr~t(uxvCLt>MH=nooYP@ZvV@jts}2{OL-Ti>mCRZ4DjjE4_l8IT1$Dt1j|e^S=z7k}9#lMO@ZYR@>d2(`w5K@N z{4z~?@!Z1KmsfYtwDhs#gtN_ugBYLIjjUW-aM9%L3T%!S*CO$r6~U6zw4hLS_h>oK2YV;& z;zU>a`CPPl$F13O-xHfUzj`x$M!H^#m1i$qpPM_Ux(KslC8>mS`qfv->jMjaCqPz> zbKdx2^vO+oA$jipFOUVRD_0K=)s2g3!`*4(z8%qD{=C9t`b=tr=lKbV%l=)pr6xUx z9C-XF`}Nk8!NJ$MpRcfPm9@@9oYULuM`L$CI~zOIzxOBOBk`H_S3voB^h(>G^u=w# zJ7nEY?(UYish)KO*1dc-za+7-a3Ik+?a-x5zmC0lA$+pmxNo@@y{yT<QojAXsDPa0E(Bhzazftw4 zBB#jv^TXkfc;7$O^RqYJWF#aUJpNF={@K%~+ov9>2$k>gIkPOJnhCB+2G&YKr~i$$ zp39Y!#<==lcbz`A6m>yv8s+QL<*IOfs>cYtXyP%Bi^eQQgx@mWiJ9Q21Fiy6#n{wk z!TNr!->zq!D`GFr8EGfJ{;0ZH{(*m55jW>Ye^%|doQXubl$$18N{Icji}Y$R@!Ts@ zyz7i{S4a5;{3>{CFaIJ=@7kX^(|u%f=L=8yCHn?#uQTPt_w@Yy6WG%DMaxc>zPz${ zbm5@~n$OSA+-9@c?FelfG1#wk7N++WZjUAK@z2ReA_Izf=b9s*Pl~5*m_7c=n!;uIVO{Kx~S_>Cz|5q?egVI4Ef^w_)q^H z{KRK0Io33%e){RTCtoMbogeF-Fx8t}^pI!mNN;sjPLzaxO?YX~HAn4|&DiFib)aNj zc{I+Qe%tlx=k&n7u7W6ipR5j+HD~L#@2=Jo-_xebai^^wyVe8+a&z;Z&`HPTn2`A*{vxK`tdcx zxAkeG=UE+XuISG(baPnB@;!f9JmktU?~axNchaoZow$GXr59)Jn{`+J^TgAs^_IgI zL?NlWswTP)bp}Hp|F~F$Z7pBm{B-Iu!1;LQ8g!;FeOlndUso2`s@I04!=?9pJm1$v z{uL{Kd~Uh-z5Cgyw=SBFEA}h@gfRJ@nM)VKT?Q(Cg3@oKuA5)5CG!%)ns@F`c-tLS zUicdN{B^sxL{2!Ear^bxn1hX3fVruA5}gh@w+3_yx!)^GduIRiJHp6iys;NrK`&$;Gmk+;`4odMaj1_qKC)J8;*s3%AEA_ z^z3w5*|-CYMDK|cos-P&l5Zay5rytffpsyaeUru*DK#bevnBDuzkHYgP0)r`G{AH@16(p0Tdx-1hy*tb6nCZpCJUb%$*_rSRrY&w9Ppy5{NKyInK8cIwq7 zh1mNI4a>1iADZaYtpf|EJNs*DQJnxi`xbi0DZq3r%f0ewtefcZO zC(M2#eH~s-W{*0Mv20_Hhj~RuxHsR_7qGuRG#M42z$P4>eDJ4jO5)jn*O!O%gm=M9 zcuw{H<6VyP>ef{z=+X{Nuikk0dqcIKo=cj4Z~6%TDUd6lUeJeHIyY|)Ih)f-YhT=2 z(HrWnnIg?e`;pdsu_~rxhz59+i@#iL@@>j&+ceF~-^-6FYEnVQO0ESDwPn)VCLfZl>#K_kCa`Gf z(Dg;Xca%NWqEidOGocfmG!xDPj?NG4Id6(z_oq(-Pur`kqxhS~59!i9`x*aT)ZYz> 
z7u0!){%hO6Etm0cW1ByqrpByJh3D_SgvA!&|>C_!igy{OTot z+qXOp`PTB{#*>m~-O2i$U4?VpEl)flrWNPkje-2hJ)EIDAQC<9cr`+QVFKjyTrVL7 zg~w-kmxL;+!o)+xtBX+k7R{2CB$QcCmfjri60hxEME0QkeFL;>QpR{%4)*pOd+1mP>H{Wj&mag6qwV^HX#u^XFE@yIk z*`%5~uA6{aSbf_Id1(7n`IGE@2gY7*u?8X*9bq;xS-+dB7uCL4D?8HG);2F)k`^{5 z->YN}$YpFRXx{AEczbtq=?YVANA=2!0({JeoBx*1H13XFzCzYwjN9$v#Z8J$auM*e zJ#r%gvicYCzP}K9wQ8QNy%PYFH=S_xnEV-iF6UTm+n!zWJ+V9QzYi`Rldis+=Ir() zxheYJ9zz1)5`8ye&$Bku*H}8n?>-ZOBR1?hy3NI_EW^9!b64H5+{PI@ojaG8nkQsr zC9eEZ(KQRd<@v_j3&)>duoeIKesh1x1&0pMHS+AQU$!vWgSR7dg73XIc-F%rk6M0e z+8c@%{7v**ko9%RUs3ag)}rx|s*L-C|9;n?Ko?F%oEhI1QJP-cdnu_m+Wn2!&wC45)u3Po#^%1orPnI26XnygxtGFPVzDDj2TPgx2+&?`{e_`3a^=iPu zICuQ&)vGGN`&@qcAvyIm=WTPMpxI{FmgV)ODW6o}d3UGk#gx*0$$n-)EK7s+bc85uKaiJ z$n!OmCCBHm=k4Ctd2d$zsG|{*agw){zpqVf{~OyAlJWPR@cHU9X|}|^(!1BMNc+UK zGv5$C%$qc^_r=S{(2e0zcE8Jq^Em#cK3d`A^dfFWaafubAqn6kVejd&WmL-J=)hU zKcw0)b^m3abMN#!E@0e*anG|B+HwkgGUAUL3?5sO73uW|Xho}NHLz_6PD$lD| zof@2SAK#l+)4kaE-^~4s1_?)I1TI*hZml$T6Ca^>lbld2H@J=lIo=61LWkmk(X$ZxoCTYi3_h+YxcFYc%59iPv8eA%#K; zE96>}TxUz>to(1NzsFnmub>1x{!`dG*O=t9hp#1%8iKeF-F#%}cj`RqG6IXQbg@^^RYjYaNtlcftB<%iz}J@cA6Bm2kq;=u07 zZ`|@ay0$MpR{j@MCEK}q=V(jqlieeKiJy31*t0gylhNa-$UZ&>G`}%($F+jZH^)EU z&;GcFS`bSDpZvRvTJAcq^6I9G0eAIx|G-F`eMyE1&6i@lOTMpklsIEQc$dO^%c~3% zoT=WhfWlU{i}&t*k}fuBd9r^&ytZ933pBhJI`)g|MGrtkng#F%FXBxAFzH7&I#|bcBb^z z)-R+}=$MS^J?ZD?XJr*E+Uke-qS`+D#?^Nz)v*!=!XM_cbm*IQ_xh_+Am}z>@nytk z7a$(|`_;*^nuRZZ1=qzn&H^#q+`)nFG8yhdUSW$%%ZuV8&d&r#nwOVYL&dO##jas@ z6FR40C;x4J@wU4xXY|)SIg;bof2}0!{q*i;PVmy$)=TrVxB*4rLa*-9)xAv-iMN+d z_2~IYD%+OU**DN2--()fLVxm9vpeoYtEWHI>{D>-rFhfrcl+i=sAM6+9BPN@@vhg| z0eO4Jxz;`D@XFajL#3@A>KxLQ0{*uvh= z`;wA3dkJ#}Nu#E^G#r^je6ziB_ntlBkCYV$>{{hSxhy+Hle_1?>N&+FbOD7On~{0> z>bBx%A6<@2^_m%Wci_A+Yx`~7_1CS^zOOTW3T}%J@5!ApW_DFW;^|2{dxidxE6J3# z@w`iet{6>6uqp;V@MoLhdyUn**z}YgzJLp;-02mOxiI_EWFMbkckGF!u3Oz(rX;4O zWN+HG?ZM@hKeeHRw~g~37Oh${NI8@_zdys;m$Lr*&P8a;Ubhb+#XBz%)yExt_=VLA z>g@4-y$&Y^>el+RO^=0Bb_Nj`QDpFJbiOti$OK%Tp=1X?I>8*}T 
zDgRNk(Dn~PNT2QQ@V~JHaaj3w5Bb;t$?5zO!{xRGs2SDg$t2R(H&1^pf;J;+H?h%fq@afxo@UGgkU-e@v-so93L)-+AU~r0(YAme8DD>b~8(&G^ue zwG6ZG$S?1JAm(iP!MLJTdv3j)uy^v_5gEVdAKd7XHn!$L@TZlHyMwN7i8=n}l|z*I zXW`D}$z@HBtufa(Zv1i~VF3uY#6%l0w5#+-{>lzrx<=|w2CHKgS8)8}!Uo88{04|H_CZH(LA zk@G*;M%r5Wh$pXi8Ekv~c3sllXFi?W@O4u=3Z1=OytW(Xb?gm3xe#qyv!_V#_d$s1 z^E`yLF(!1=3;GP*D8P!1oxTwJUtv>v@b^_(Nzu(wV}eu%q$6|AyUB~U%}Db|-ah)r ze`gMz>8pS1HTT}7DiR?Le;L2_oX)yn@LcxcQ1%bhR1_3d#F>}ehn_t$KWy*h((2k# zuUE#gUzsjndvp&bGWxdGh-#C+mC-O@LnK(|5>MKg0HD^AyfPzrpK#(eBD(Nvr2cx<0Ta7 z4-om#3A62B571c`|K`$SHizkR;Mnu$igVY0UXTgT|9BH{v{%u{KRvrP1T`@qa>V7N zr50h`pPS$cekU;kA_EYJ%G8Z5a)`=?Wt>fNV&r~euvV(_u`tI)B zQRFgV*`3to2VUL$HFJZm*E#yM_wiFD8JnYamv^b73tQpmUjOX$esXd7;2GZ$X>%W* zNti8rW-(#boaHUNTi;w9<|W)p2LCN?@T#h}PMM89nixg0+AcY?j|-u?5l_eehLy#D8|3ajowH%M0Rn+IFH%l+)+ z5!6raXAea&`21goX~iqo|G|715jJn$r<48t{VlTWhds~G1LLkd_o(s!?K+4My5ArF zFR6V0(ecM0ej;u3uV39Ts^#?FQO49e$0}>?r~&f|N5I7$8W|JIl!xFQ9;K@UwpT<$pp}g2v_$|CZtNBB>6iHSO_=&l zIYxCPDdKnV-tnGa0Mgv>gqF=cmEN*QyKnk>`yVjh*Z7OR$zx_oD>#4Jh_vY~mt-M? 
zwDp5YA3zi59jY6Ze|klCe}Q$?xsofdXooNKPb>4j5QX~oc?jdSVM)IaZaRpTZnDst z%b~&f?vP5iC|wb*W^>DtEfrHBFn!q6siggr&Mcl;uP}-PzqJ18)XNAtB@D8S*4{qRfNPjZq^WdUC z=U$y%=lyW&4n10Sta+r@V|$P!-#E)=UEgnnkCF$KCCp219NW3$Yq)Xoy4WP2-V<5% z!XjJ|RyCHiUUmL52182cY^lD{e)xs2S?iwN6R7R$`W1R7>~Z!OLtFm)t?8TR-p#Bh zrxw0f(sr=96(_drf7IrfKEmk|dHJgAspu?rEa~##+!OPnm!Fz9yJq=n?dl!>dRCv} zoM`JdZ|iqEesUz{so5|$<|l0&Ddgae)J8!`{QlFALgQ8^rQRlEgEo9q+@6ZR&DAlq z2b`9#0YWm#>oYS`?_ViiE(@XTovdj344&T-bXPz9XUCB*&YPbfsqwr|Zgue;%rj-%n*l+E?zWliJH?tkG!Xs@|>`f>XrAGc9hfFc+&uXj9LCw;$!l%tUHkF@w$5_xg*jyAS>kH{ih4_1_r6k zyEZs3&q(NuJJCpOvt^`p9=|opFJWKRw0+{u4}-|ZcZJPfKH}OSw-DJ+%^2M zG1z+}$uMy%e#zpHx3O<7Putm0S#nXg<6E)U-#LHxUYOF8`{ZMrIrq*AtbRe}LP`T_ zGe39q>ZJ9LbGGypj+Ymt{&)A{==_wuPghjkXqO6wLKq#l33uqwq0L`22{mzEle$U< zh)d^FM?(^M?&{Ux$qaAN!7m%8hoqK5IGBFIW`V-F;h6KW&Pg*#t@_O|n z|GcWIz_jPW0`*|RyhG#*aXE=6PCov&$ZgYc&vA&4!Dsl`*jDrFKUS^JGRV?Pt&ZAe zsNNnfBo?ZRUcKHPzpJF|8dhSwNvWAqPc?_W!#;t-od#=ZR6H#Q^ipTQ;nN%2K;+c(1Bo@`dhGF`@Cjv{{E#rcne$gs=HqPyS5^h7L4 zJ*f_7Tou+u4M;CGf0rD7@o4#u?Rob)_#?jL*1wGCDEkJ-As!B1a6PN9^|f8O{q0rv zWa5e|yi|b9bXSX(99T8;$sF&$?HQ;e?7Eu!F;~6byIvD@`8=68`{zp6n#BXoKHR5s zp6rY6qc(LaS9G^TP7bV#V4q{Z%DrD?*f+i_z-d;-LWyIuv15-#xFvJO(NJ=7#^9ro z0ve4L0hp9xkC3^0?!T{!ez~*j+>rL6t@>pBg0Fbhwb#f0I7k)d7;J6dG^p`VNYewK zj=9zRqQ2K1Fa0G;TD>w;mL#MZ%09d2 z!R%d~H2Q)OX@fqv9b@75PTj~=Fy)_2DJfB@e-3J&-cDRTd-}tXmn|oayRqwj->F<@ zr_@ir>VM!+zc8Ueq?w)vD>{QZn{{PBZo$@yfI@g!qVdvUeB!-n6Xt-4RqK%BayQPM zu_iG&d*Q~bg_RyJDk~ptd6QFTc2sv&igytGyLuP3k%HkXyV~9^KgKE;ThnpH|95gw zkNc;smvlcF(d~ zvc*#^i7%XPa=*_TYb{x{c-r^=bv zs}wjj)aZ0NF=mDwve)xTr@fn| z1Oz93egDRLQ{jh29V{)kM)i#Z3%tYl@5LO}^cU>-_ zyM3T|_TF(tY?bjPk_$W)WPdFGKnm4Dv2U3$PsF}ZJu8S2MG{7gbOxHkAp-YrwfMr3+U zEt)#Ta9$PO6b~E-iO;isd-*Wh&+sAR#lGcDI||)VA3nbuql+H!Yr98TH~nKR%sgev z+ou)to~Nvy{`>QxI}<=>??bU$(>{&-ukH}G?|PaY+rT(hlGZr3niNf#`|C%N>h<=0 zi(Z@Xn4Y2Vz*Q^61+~{+%<7F*m?uR2cpup7L@qCQutqw&^UVt{*q^x05zC;E;*Y+y z9VI(Q{j1I$4>%$l1f-dd4+~0sim*n5;rcZ9kq~v5`-}e&Z$))HiY#ou-{b0%7T4*n 
z`Qs%>*P1`Sa`hn}eg3N3Dh}m~dNxcdM|@v;DSxHgQE$KB7f)QsiAy&R3}mi(&N#cZ z_q%u7+)HCxADoU{6`g$Jc6QMfBHY6?HDHl5&>l$1yz#eOkMMT74&AN#PVpdAbOd4! z@$S@}s$6Q*2oHk;lP)K=vaW7+qXv7gn!khi>d{v5aj5e1(lM~i&+Qiz6KcC=@IFQb zRTa*PU9>a-x?lHt``dLnp62-4sp+x1UCGDIYij7~tR>e2*3Nk=yZU-xRba>=R~h> zI~&(rMVNGZ;*#PWxHZ*|5m)cu9M|DnIN?%^Hy`_{y%#s&Kfb#8di`h+Ii#=AC5aL zfXsj0(@TD!*xBM%d_HAKBG8VB8^e|2UO^MCV@h*?{n^pXW4FI`MV||VAHAq}xV<2z zO)<^03+WaelQVMCC)U|M!hBTkSa-=l829OX!`H=AN)|1(b@buqyLUxjD)jSQ51urMJByyk0!7ez9`7&I{QYmlZei&dQ_7?HH3h4--1t!e z2floNGh7>wsvuqVoBrIZ=vFPQIT6Pt?+nJY$IBj4AFmwqpXV%(pPycke|>IR`p$TM z%-^urOD>gMAvT*#6NUY`+$q-{c*tE1>LbOJ58SAIIMkh{gP``%H?||p@v!QIm=R`$ zTZYKV;av7IRcgcL88@fxDX`AS6a6u*-`DI$y<^$92-_z542&Eg;m&4mYaJ`YO|1vRf|!q616GY2v=$ZXR2-=WTXy|(sw@7PJ2k`?=4^{-WD zXWcwda%A6j<5k|os+vuAcUF&##e5reVD9?eiXEOts^73l#6z4!f@7d>AdOSdH~nyfFw%4Aha`1#^bgE3=M>mI@0E=lzFD%iR$`rf@at&a-byYFTu zM7Tzb%_9UipZ1tKGpskKVOsz8Y0gI~7mxUO_FrbkvkNV$2WO4L-szb7^Sk&E`I^js zsAkol6O9Eiy^GiU`#%9y8LH-dm_L8M#Y~IbjygFxRbnjOj5p6@)YjEI7&aGZbw%5z z88rE3?lhwxj?q3p;Lbmpsl!`(jb=Y&K-Ow#AV(uMLU!WLub?c)`cmdz7&xW2^NEI` zF;4&*)^QiPH+2)4N?W7Rkm0@gKoc}886m^-Yz7SN&Ri)^`g&WsDbBAsK&w-J z`K3W$8^#?}%xS-T;TW?V^(_fnV<5x+p;n%D&%ren+9pv~R#j6DH99r8IJp8^2-^8X zk^kaO^GI4U*)X(blZu=_UikRuZ_|C?g2sf5_><9@78#J)1sd8h(Q{(;cehQoAssEz zwrtju7nG4qT<~AeA^nt*!>DYpXy} zc8|COUC%vd^w?-k!~l8(4b1Bq@Ng`1X~hJMV^%6NWm@DwW;1BC9TQDVckiiJ(V1%; z3EE$W_m*8vJ?nCrY!@at);4uViJeaTI7|DF z>Z#N}k5$l^u0MDJGeuHUK`Y41h8>$%TOCA@Wl&wy;CR;*iDqxPf8-!(b;`RE%^mJ=1 zX!aI3o;6C5dBts#4rNG7k|FbiclI9 z{TTF*cm&oQc^9;G4TTx-8g$SvsyGlRCS;-B5iotwTjW}3T0-3$nw2p7&NnAMGDRFY z5;j!EzT9{5J-(1hhD4(y&WP=jKo%7hwLU#P{qM}q*elTr3JQJ_i^Wf?HFa?PHY=*F zQJVo<>v7Y4mO0faf1G{%Z?@*BcNA!r^klUEKDz5*1Q5FGygF+0&vvghpB zv7_|Bfdgg%y4D6GBA_yP#pDYc@wnyy4bzRQ)#}Kanwq5sgTW2&b1wTZoXu{C9Afry zVAP|4K_1Hwz5%s54VgX@M=UpaV7N*>$Xe|88$00A{Sy`+k57`}S`F817)tlkg|CP^ zP;)mXWOy$=&{&YsnHHHEe*w^V(-v6Tby5^H4FgWl#>s3lnQW=MyZf*7XQid3NqW7W zrDww82)Vhrc8`~{RImKuZ^!;dpC|;HwBnDCq@}xA&Q`Xp(PaJVVN&ayBWnyE1hjLx 
zC*iDu9}rxBVHEeNjNjo@`XNZ@^f>ftKbTB!=sSiIr(rHy*V20E@vdX zWtIbl2^tL<%g2AtoH?2W3l^}l3)%^qP<2-!S8mqvXpWG{WC&W%pMU-t_V3@{=hHJTj(W6N~F9zWHxd*|x9@B`Q8tyjtq<#x`j8RUF9!#DSQBT$CY_!d>P+hAt+79vET%!b5XO4h)PS$Fu~vS@`5uG?I!^LOE;=EuSGy>j9F=X zub08kJrKS*zLey=!1zp@HZj>9PJb9xek<=1?7zC%^gLY121Jd55wR2D*K1ozz6)l< zSWa~5#@I}Z(R5_vX#CAI5_m6q8D<6<>dY*_!{^!|v>9-SL}P-Mnwkn9urkd~K9qRLO+{S7YM`^%Ahk!uNs#Bo5QwFScj zAv1y|804$bXofM}ZymTsV<~uoiAz#+UQV zlI@b)QOvxkH`KwFGYz%H@fge(=I!DShi?1?m}sR}`ysIY*kUNGY>2VslVr^6qZ{{Q z-Jd04bf~_-oM?2eGOGh-#{pJC;^>Vc`YB?YK^x}Z|WU;R-zsKsfyUY{D_Iwc{{bQk`rVNhU+-;VG*cxQ?1?jLkj7rC>K8|kI zLgLU!-oAZ%l}@J{L?9b5V1VJ&sZ+FBi8s??TLMigmG(Sz=n!n#!s5iHaYV>sA_E}M zt?ZXxzQ1nn;9F^m^=}GjwnAo8CoLSrg$(n^q9Ghp8zf))59>@!^b+jZ9zHtoio-+a zIJQgT*U!}kj2SbgDm69Loe#45^$SmdrnUl_mzUQ$gTWBJVa*~qdBHpxustu*T~?Rj z=~|oV;ar>V>#i+`?>{UdtbKTI=2{M~$EUDmO0?F34Et7}Gqyc!Jh_78nZ<^iidTny zNNQ1`;oQ-UtMO;fH1xeDPMioRydonb$C;)}u31R{G?VFv({Z~t0|Ha0&_;u`M$>Aw zqg5)^B5mIO{%bZJhfCM(w{O}r=$)ei$>9|hWjWO<74cu%F!LnI@N27C$)5Z19c#&- z4E@V68eON?kzFK)(P2J+PZuAvN+V6}2Yo)D#-9rs4cQMr{BYN_@aGAMGm~h1#4*+)C+Ho>oAqELQAWO6DLl?>C$Gi5_fuQ zP0&0&J?FW(xfLR0*q96Uy@#&B@5gR6yHxw}!^U~7Nf)+1c%(h)xGAp$e(4;GT?JwCGott48t`U`3D-v=_Gj8D#g*i3P(${+S=N=))c%_sT9+@tHd{F zBr!Aj@Tnlu(hJ~dN?PTWo4J}AE#>d1Uo%36jox+sw?T%vFyg~T1#`xe3mS~LL}rD4IWJ}>rKh@Q;cYy(WtOi^%D zj~+cXUAb~)8m1|ztgKwfyttj9r2+t7ksrHuoz_63A;Ypaw$?tvXW|}vE+ZiDmeM`Q z;QjD)%E`Uc{oGQhR%_wDqSWe|jtLoF`_G(Hr_Ep1B%|C`Ng2QRaP*9NrW4b>_j-hJ+VAK;jM`FP+bnkAChRWFIyS=@db}|)^Xc(9I z0nU)r{Q2_@STB0^?Adt>7cOL_1F|I1xG-3VYQoDn3^Y26kfj1%mLwV$31yZTrYF71_0lpG&5ABfw z7zXnG`_P*vzkWL3D^@YVNTiJ?mzy;Ccpf?91#R4V-z-Hr8y4P|0l6Es^O z!&ejJ849r>eMqqfEcJZkW-??I-(q=6V5u5RfQLr(xg(dGik#9v228(>ux>uHns{iS z?Gh{f(q(lp4LcUFU(RsJcJ!3V&9uk|T4G`%?AWmbFld~@X>$Gg^+BB4N^Kgnf`WoK z#A5Mk&{cuHv>_C&wLr5CG9Y!uF}euXN*(+(mX^M!*Xut7LmlYLP5}bfT2rT`0$$7)jiqQX zwfa{_7ZIN^jJf>kuf-;S(CcISKl}llxce*o$tXfko|6)TOyW!= zBC}zboy^Gv3ha}h@Bjb@|4BqaR4cK}Li04COp8p==#V({m_Zc=Rv{q;7Z5Upjcx+I z+m`&sSQ5})aO^$4xVU&80Mu72HR!=mc@+#AWs9%TEEVwLzGhs|<9ZAm`~SUQ&WL5i 
zt993fuT7_6Aqfn1Tm8qvHjytDJ7z7@Vopv@%8(&LV)0lyB+gW#1fBke3>nhkvn)^r ztSw12EUO|Gi#KB0D$|cclLu7AxeOQOa@J1L5xY*lR8)>o#?opFTx(%)3Ek+8z`iR# zSWRQlJaWtYs@zPA+!-${EbO_QoSYxS!ouL}*|S8FnIP-mzyAlx$;l`k%rY%<-L2ag zXe4Ti&iF_ViwF#dJ&M<^(%oMRU`)qU9sJg_b+Sk-*}`gk>p48YmR+YKAZsZ=9C05T zqLql-K3(1tA;TP)bV-c8nVFf>1(a+;xnnW1l+VYvMW;^g?w8i5iA18q-rnA;5VE?w zouJG77&Pvqhu6CFGAlaPSkU&?4IX~0uASNS9PvWi;x%lDRzkl!%vQcwp-_a=nNZM2 z2BjDN*T;gA&1#Zj&Y-j^XxJAtK{+`&?e+8X!-bBB?w_ivWOz+wej?Q5fi9%pZILkR(Mo8>NH%tT7{*`HkkLkR{0?`Nsg|_4SS5}Q@}S4T zR`+as7x(raZ^;xUm&>u(R{#J+;aI6us*xIvhAn{%I@~xM_NcqE3RHIhYVQMRiU8^= zpxk3pqF(GXoT4NE_$QJAC3aD{N-E#jc;ok}#j27KZIla#cFR(<(v&Gv?jAdKEZTTO z!WRu0bILP$W3&~}tay9uk;~<*3^TUcTyX%f)U{r(-+=2m{4=fR=xLVodie6GiA17| zqT1X~x1Ifbt)XE|9r|M1(#3?Lm9n$5VdTh>P*Fj4&lvDYvMth(G1p}noe@DlR4ajI z2971r?8!c4@3C^Zd=@51H2#tCQYw|}DT;cIlj13L`?b$jRBFne@+_YHNf9v&W>aO{|n(f&Vp z90G=*nVO{+Y)=Rs7aqDE507sWZ)#CTfyUQod!|_&7$MMTb0AR8~`1Z|OHSZ|8vEaPQdZ6eVJnU{=*uq~0#R$Z~TQI+pL+ zP9kUL&H=rrOGHwx!yyz_=j_QTzl-i#rWWQE9T(slW|m4F`!RXo-&ximA0MA1rKP2W znMso-!KzjDIXrNjnULX@lUa=fr%YC0V{?IKFF5udF9c6ZfM$b)=gqq-wBg~C@b)B+ zws-ks!7*aBTrQ_KO-um=XiDnZwJR@V_@U|Gz`($37cX9XfxC-*5I_R}<^Yg~ZD!Dz zyK|Z=d(=I=pxGl?TLU!Xj}(_#w8{d_k{8m_(j>;2={vbxKAoL0xL$!~@^Cf!n9Y^T z<*bZ_4+3cRNGW@cwR5f5)K z@Eq2H(;5%}U>1ufwCm(b65|U)={b$tKRYD}EzNDn2rQd%0Ez^*Ui#)T!llCqoZ3PW`Ds-%! 
zpxGlG>^aun-LFF>3G+=h7ic_^5`XkK>x47!@9OIMRdsdsKllWepd@1)j2c^%V6;~T z?l$ozNUfozmE5P-nIz*h?j3f2h$ zW(#C^3~mOv3q*tsKf`H=G`8JL1z4zW5-MxGF*TrHn%jYgNHTB~qOO_mD9^0{FNB2dG7Oi%6c1Am?sj1cK zyNeeuPP}vHPQ5Fyv8`LTZedVR5LQk$HAX*J$;a%O?u}C?zAILYq>w`@j`iKUcSo;W zxiWpU;IY$g^U%gST?$Z%ibR|;&DKeb8>LR2I(4XAE@vqn6$(Y_ z)~#F3)^vUP^qDD>$=WmTS65f3yLj>9X{}b(`IyYjSgQFK^qn%}nai$45m)1t%mVOixQolNqJn8J$kY zl6v&95{U$V!9D3;a8_2J9S{%@il3F1mgc9WrQxyMQ2aKFP+VMG z)wOHaE?XOqPMtcrhlGS^_U+rZDLp-%J08sG&m;|-^3eYW+pA(H(KMam00000NkvXX Hu0mjf-7_Wm literal 0 HcmV?d00001 diff --git a/docs/imgs/logo.png b/docs/pages/img/logo.png similarity index 100% rename from docs/imgs/logo.png rename to docs/pages/img/logo.png diff --git a/docs/index.md b/docs/pages/index.md similarity index 100% rename from docs/index.md rename to docs/pages/index.md diff --git a/docs/manipulations.md b/docs/pages/manipulations.md similarity index 100% rename from docs/manipulations.md rename to docs/pages/manipulations.md diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 0000000..b27f8dd --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,6 @@ +jinja2<3.1.0 +mkdocs==1.2.3 +mkdocs-bootswatch +mkdocs-material +mkdocs-rtd-dropdown +pymdown-extensions diff --git a/mkdocs.yml b/mkdocs.yml deleted file mode 100644 index e9fa675..0000000 --- a/mkdocs.yml +++ /dev/null @@ -1,31 +0,0 @@ -site_name: "Codext - Extension of native codecs for Python" -repo_url: https://github.com/dhondta/python-codext -site_author: dhondta -docs_dir: docs -nav: - - Introduction: index.md - - Features: features.md - - 'Guess mode': guessing.md - - Encodings: - - Base: enc/base.md - - Binary: enc/binary.md - - Common: enc/common.md - - Compressions: enc/compressions.md - - Cryptography: enc/crypto.md - - Hashing: enc/hashing.md - - Languages: enc/languages.md - - Others: enc/others.md - - Steganography: enc/stegano.md - - 'String manipulations': manipulations.md - - 'CLI tool': cli.md - - 'Create your codec': howto.md -extra: - 
mailto: alexandre.dhondt@gmail.com -theme: readthedocs -extra_javascript: - - js/collapsible-navbar.js -use_directory_urls: false -markdown_extensions: - - toc: - permalink: true - - admonition From f28816667d78e2dfaa76996730ffe6ff74843561 Mon Sep 17 00:00:00 2001 From: dhondta Date: Sun, 12 Feb 2023 13:53:28 +0100 Subject: [PATCH 69/97] Removed Travis CI config --- .travis.yml | 86 ----------------------------------------------------- 1 file changed, 86 deletions(-) delete mode 100644 .travis.yml diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 39ff698..0000000 --- a/.travis.yml +++ /dev/null @@ -1,86 +0,0 @@ -language: python -jobs: - allow_failures: - - arch: arm64 - - os: osx - - python: nightly - fast_finish: true - include: - - python: 2.7 - - python: 3.6 - - python: 3.7 - - python: 3.8 - - python: 3.9 - - python: nightly - - os: windows - language: shell - before_install: choco install python2 --version 2.7.18 - env: PATH=/c/Python27:/c/Python27/Scripts:$PATH - - os: windows - language: shell - before_install: choco install python --version 3.6.8 - env: PATH=/c/Python36:/c/Python36/Scripts:$PATH - - os: windows - language: shell - before_install: choco install python --version 3.7.6 - env: PATH=/c/Python37:/c/Python37/Scripts:$PATH - - os: windows - language: shell - before_install: choco install python --version 3.8.1 - env: PATH=/c/Python38:/c/Python38/Scripts:$PATH - - os: windows - language: shell - before_install: choco install python --version 3.9.0 - env: PATH=/c/Python39:/c/Python39/Scripts:$PATH - - python: 2.7 - arch: arm64 - - python: 3.6 - arch: arm64 - - python: 3.7 - arch: arm64 - dist: focal - - python: 3.8 - arch: arm64 - - python: 3.9 - arch: arm64 - - python: nightly - arch: arm64 - - os: osx - language: shell - env: - - PATH=/Users/travis/.pyenv/shims:$PATH - - PYENV_VERSION=2.7.18 - before_install: travis_wait brew upgrade pyenv && pyenv install $PYENV_VERSION - - os: osx - language: shell - env: - - 
PATH=/Users/travis/.pyenv/shims:$PATH - - PYENV_VERSION=3.6.8 - before_install: travis_wait brew upgrade pyenv && pyenv install $PYENV_VERSION - - os: osx - osx_image: xcode11.3 - language: shell - env: - - PATH=/Users/travis/.pyenv/shims:$PATH - - PYENV_VERSION=3.7.6 - before_install: travis_wait brew upgrade pyenv && pyenv install $PYENV_VERSION - - os: osx - osx_image: xcode11.3 - language: shell - env: - - PATH=/Users/travis/.pyenv/shims:$PATH - - PYENV_VERSION=3.8.1 - before_install: travis_wait brew upgrade pyenv && pyenv install $PYENV_VERSION - - os: osx - osx_image: xcode11.3 - language: shell - env: - - PATH=/Users/travis/.pyenv/shims:$PATH - - PYENV_VERSION=3.9.0 - before_install: travis_wait brew upgrade pyenv && pyenv install $PYENV_VERSION -cache: pip -install: - - python -m pip install --upgrade pip - - pip install pytest pytest-cov coveralls markdown2 six . -script: pytest --cov=codext --cov-report=term-missing tests -after_success: coveralls From 8508e2882ab6ce90d69ff4342511e5a7a32917df Mon Sep 17 00:00:00 2001 From: dhondta Date: Sun, 12 Feb 2023 13:53:40 +0100 Subject: [PATCH 70/97] Added GitHub Actions --- .github/workflows/pypi-publish.yml | 37 +++++++++++++ .github/workflows/python-package.yml | 79 ++++++++++++++++++++++++++++ 2 files changed, 116 insertions(+) create mode 100644 .github/workflows/pypi-publish.yml create mode 100644 .github/workflows/python-package.yml diff --git a/.github/workflows/pypi-publish.yml b/.github/workflows/pypi-publish.yml new file mode 100644 index 0000000..392e026 --- /dev/null +++ b/.github/workflows/pypi-publish.yml @@ -0,0 +1,37 @@ +# This workflow will deploy the Python package to PyPi.org + +name: deploy + +env: + package: codext + +on: + push: + branches: + - main + paths: + - '**/VERSION.txt' + workflow_run: + workflows: ["build"] + types: [completed] + +jobs: + deploy: + runs-on: ubuntu-latest + if: ${{ github.event.workflow_run.conclusion == 'success' }} + steps: + - uses: actions/checkout@v3 + with: + 
fetch-depth: 0 + - name: Cleanup README + run: | + sed -ri 's/^(##*)\s*:.*:\s*/\1 /g' README.md + awk '{if (match($0,"## Supporters")) exit; print}' README.md > README + mv -f README README.md + - run: python3 -m pip install --upgrade build && python3 -m build + - name: Upload ${{ env.package }} to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + password: ${{ secrets.PYPI_API_TOKEN }} + verbose: true + verify_metadata: false diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml new file mode 100644 index 0000000..9010fab --- /dev/null +++ b/.github/workflows/python-package.yml @@ -0,0 +1,79 @@ +# This workflow will install Python dependencies, run tests and lint with a variety of Python versions +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python + +name: build + +env: + package: codext + +on: + push: + branches: [ "main" ] + pull_request: + branches: [ "main" ] + +jobs: + build: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest] + python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"] + steps: + - uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - name: Install ${{ env.package }} + run: | + python -m pip install --upgrade pip + python -m pip install flake8 pytest pytest-cov coverage + pip install -r requirements.txt + pip install . + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Test ${{ env.package }} with pytest + run: | + pytest --cov=$package + coverage: + needs: build + runs-on: ubuntu-latest + env: + cov_badge_path: docs/coverage.svg + steps: + - uses: actions/checkout@v3 + - name: Install ${{ env.package }} + run: | + python -m pip install --upgrade pip + python -m pip install pytest pytest-cov + pip install -r requirements.txt + pip install . + - name: Make coverage badge for ${{ env.package }} + run: | + pip install genbadge[coverage] + pytest --cov=$package --cov-report=xml + genbadge coverage -i coverage.xml -o $cov_badge_path + - name: Verify Changed files + uses: tj-actions/verify-changed-files@v12 + id: changed_files + with: + files: ${{ env.cov_badge_path }} + - name: Commit files + if: steps.changed_files.outputs.files_changed == 'true' + run: | + git config --local user.email "github-actions[bot]@users.noreply.github.com" + git config --local user.name "github-actions[bot]" + git add $cov_badge_path + git commit -m "Updated coverage.svg" + - name: Push changes + if: steps.changed_files.outputs.files_changed == 'true' + uses: ad-m/github-push-action@master + with: + github_token: ${{ secrets.github_token }} + branch: ${{ github.ref }} From fb303c91a17b91d13b0d6449026c7661ad593807 Mon Sep 17 00:00:00 2001 From: dhondta Date: Sun, 12 Feb 2023 14:15:23 +0100 Subject: [PATCH 71/97] Fixed issue with md4 --- src/codext/__common__.py | 5 +++++ src/codext/hashing/md.py | 3 ++- tests/test_manual.py | 3 ++- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/codext/__common__.py b/src/codext/__common__.py index 9d9400c..d88dcbe 100644 --- a/src/codext/__common__.py +++ b/src/codext/__common__.py @@ -35,6 +35,11 @@ from importlib import reload except ImportError: pass +try: # from Python 3.11, it seems that 'sre_parse' is not bound to 're' anymore + re.sre_parse +except AttributeError: + import sre_parse as __sre_parse + re.sre_parse = 
__sre_parse __all__ = ["add", "add_macro", "add_map", "b", "clear", "codecs", "decode", "encode", "ensure_str", "examples", "guess", diff --git a/src/codext/hashing/md.py b/src/codext/hashing/md.py index 181d85c..6463722 100644 --- a/src/codext/hashing/md.py +++ b/src/codext/hashing/md.py @@ -55,6 +55,7 @@ def md2(data): add("md2", lambda s, error="strict": (md2(s), len(s)), guess=None) -add("md4", lambda s, error="strict": (hashlib.new("md4", b(s)).hexdigest(), len(s)), guess=None) add("md5", lambda s, error="strict": (hashlib.new("md5", b(s)).hexdigest(), len(s)), guess=None) +if "md4" in hashlib.algorithms_available: + add("md4", lambda s, error="strict": (hashlib.new("md4", b(s)).hexdigest(), len(s)), guess=None) diff --git a/tests/test_manual.py b/tests/test_manual.py index 64b1843..6a1d09f 100644 --- a/tests/test_manual.py +++ b/tests/test_manual.py @@ -3,6 +3,7 @@ """Manual codec tests. """ +import hashlib import os import random from six import binary_type, string_types @@ -105,7 +106,7 @@ def test_codec_dummy_str_manips(self): def test_codec_hash_functions(self): STR = b"This is a test string!" 
- for h in ["adler32", "md2", "md4", "md5", "sha1", "sha224", "sha256", "sha384", "sha512"]: + for h in ["adler32", "md2", "md5", "sha1", "sha224", "sha256", "sha384", "sha512"]: self.assertIsNotNone(codecs.encode(STR, h)) self.assertRaises(NotImplementedError, codecs.decode, STR, h) if PY3: From c46912fcfd22ebcdde268dbdf7713052b9dec74b Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sun, 12 Feb 2023 13:23:12 +0000 Subject: [PATCH 72/97] Updated coverage.svg --- docs/coverage.svg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/coverage.svg b/docs/coverage.svg index bde433b..78f9f98 100644 --- a/docs/coverage.svg +++ b/docs/coverage.svg @@ -1 +1 @@ -coverage: 99.53%coverage99.53% \ No newline at end of file +coverage: 99.03%coverage99.03% \ No newline at end of file From 5438e5363500b5665d1ce646fc54e1879410300e Mon Sep 17 00:00:00 2001 From: dhondta Date: Sun, 12 Feb 2023 14:29:05 +0100 Subject: [PATCH 73/97] New release --- pyproject.toml | 4 ++-- src/codext/VERSION.txt | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index ce377f3..099d04b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,9 +13,9 @@ name = "codext" authors = [ {name="Alexandre D'Hondt", email="alexandre.dhondt@gmail.com"}, ] -description = "Library for producing ASCII arts from a text or an image" +description = "Native codecs extension" license = {file = "LICENSE"} -keywords = ["python", "development", "programming", "ascii-art", "banner-generator", "quote-generator", "cowsay"] +keywords = ["python", "development", "programming", "codecs", "encodings"] requires-python = ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,<4" classifiers = [ "Development Status :: 5 - Production/Stable", diff --git a/src/codext/VERSION.txt b/src/codext/VERSION.txt index 850e742..a4cc557 100644 --- a/src/codext/VERSION.txt +++ b/src/codext/VERSION.txt @@ -1 +1 @@ -1.14.0 +1.14.2 From 
8a902fa6aa1768e33c649d51633703087e486e39 Mon Sep 17 00:00:00 2001 From: dhondta Date: Wed, 15 Feb 2023 15:30:17 +0100 Subject: [PATCH 74/97] Updated docs requirements --- docs/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index b27f8dd..a4427bc 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,5 +1,5 @@ jinja2<3.1.0 -mkdocs==1.2.3 +mkdocs>=1.3.0 mkdocs-bootswatch mkdocs-material mkdocs-rtd-dropdown From 3190f8ed91d16d8ba6582477cf8c2e8b930fe9ba Mon Sep 17 00:00:00 2001 From: dhondta Date: Fri, 28 Apr 2023 00:19:21 +0200 Subject: [PATCH 75/97] Dropped support for Python2 + Applied minor changes --- .coveragerc | 51 +- .github/workflows/python-package.yml | 158 +- docs/pages/cli.md | 366 ++-- docs/pages/enc/base.md | 346 ++- docs/pages/enc/binary.md | 334 ++- docs/pages/enc/common.md | 140 +- docs/pages/enc/compressions.md | 2 - docs/pages/enc/crypto.md | 410 ++-- docs/pages/enc/hashing.md | 2 - docs/pages/enc/languages.md | 396 ++-- docs/pages/enc/stegano.md | 244 +-- docs/pages/enc/web.md | 78 +- docs/pages/features.md | 674 +++--- docs/pages/guessing.md | 342 ++- docs/pages/howto.md | 482 ++-- docs/pages/index.md | 20 +- docs/pages/manipulations.md | 149 +- pyproject.toml | 10 +- pytest.ini | 2 + src/codext/VERSION.txt | 2 +- src/codext/__common__.py | 3037 +++++++++++++------------- src/codext/__init__.py | 512 ++--- src/codext/base/_base.py | 581 +++-- src/codext/base/base100.py | 103 +- src/codext/base/base122.py | 204 +- src/codext/base/base85.py | 371 ++-- src/codext/binary/baudot.py | 576 +++-- src/codext/binary/rotate.py | 103 +- src/codext/common/cases.py | 5 +- src/codext/compressions/pkzip.py | 111 +- src/codext/crypto/railfence.py | 192 +- src/codext/hashing/blake.py | 22 +- src/codext/hashing/crypt.py | 4 +- src/codext/hashing/md.py | 4 +- src/codext/hashing/sha.py | 23 +- src/codext/hashing/shake.py | 22 +- src/codext/languages/braille.py | 67 +- 
src/codext/languages/galactic.py | 5 +- src/codext/languages/tap.py | 77 +- src/codext/others/uuencode.py | 2 +- src/codext/stegano/hexagram.py | 76 +- src/codext/web/html.py | 580 +++-- tests/test_base.py | 471 ++-- tests/test_common.py | 493 ++--- tests/test_generated.py | 297 +-- tests/test_manual.py | 340 ++- 46 files changed, 6200 insertions(+), 6286 deletions(-) create mode 100644 pytest.ini diff --git a/.coveragerc b/.coveragerc index 4ccc970..b677975 100644 --- a/.coveragerc +++ b/.coveragerc @@ -1,27 +1,24 @@ -[run] -source = codext -omit = - codext/__info__.py - codext/**/__init__.py - -[report] -exclude_lines = - pragma: no cover - if.*?__name__.*?==.*?.__main__.: - def main\(\)\: - def __stdin_pipe\(\)\: - for line in __stdin_pipe\(\)\: - def __format_list\(items, include\=True\)\: - def __print_tabular\(lst, space\=4\)\: - except ImportError: - except NameError: - raise NotImplementedError - if not PY3 - if PY3 - def encode\(self, input, final\=False\)\: - def decode\(self, input, final\=False\)\: - def _detect\(text\)\: - def _lang\(lang\)\: - if stopfunc\.LANG_BACKEND\: - def _validate\(stop_function, lang_backend\=\"none\"\)\: - except KeyboardInterrupt\: +[run] +source = codext +omit = + src/codext/__info__.py + src/codext/**/__init__.py + +[report] +exclude_lines = + pragma: no cover + if.*?__name__.*?==.*?.__main__.: + def main\(\)\: + def __stdin_pipe\(\)\: + for line in __stdin_pipe\(\)\: + def __format_list\(items, include\=True\)\: + def __print_tabular\(lst, space\=4\)\: + except ImportError: + except NameError: + raise NotImplementedError + def _detect\(text\)\: + def _lang\(lang\)\: + if stopfunc\.LANG_BACKEND\: + def _validate\(stop_function, lang_backend\=\"none\"\)\: + except KeyboardInterrupt\: + if alt and len\(t\) \% 2 \=\= 1\: diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 9010fab..62476a7 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ 
-1,79 +1,79 @@ -# This workflow will install Python dependencies, run tests and lint with a variety of Python versions -# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python - -name: build - -env: - package: codext - -on: - push: - branches: [ "main" ] - pull_request: - branches: [ "main" ] - -jobs: - build: - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: [ubuntu-latest] - python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"] - steps: - - uses: actions/checkout@v3 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} - - name: Install ${{ env.package }} - run: | - python -m pip install --upgrade pip - python -m pip install flake8 pytest pytest-cov coverage - pip install -r requirements.txt - pip install . - - name: Lint with flake8 - run: | - # stop the build if there are Python syntax errors or undefined names - flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics - # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide - flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - - name: Test ${{ env.package }} with pytest - run: | - pytest --cov=$package - coverage: - needs: build - runs-on: ubuntu-latest - env: - cov_badge_path: docs/coverage.svg - steps: - - uses: actions/checkout@v3 - - name: Install ${{ env.package }} - run: | - python -m pip install --upgrade pip - python -m pip install pytest pytest-cov - pip install -r requirements.txt - pip install . 
- - name: Make coverage badge for ${{ env.package }} - run: | - pip install genbadge[coverage] - pytest --cov=$package --cov-report=xml - genbadge coverage -i coverage.xml -o $cov_badge_path - - name: Verify Changed files - uses: tj-actions/verify-changed-files@v12 - id: changed_files - with: - files: ${{ env.cov_badge_path }} - - name: Commit files - if: steps.changed_files.outputs.files_changed == 'true' - run: | - git config --local user.email "github-actions[bot]@users.noreply.github.com" - git config --local user.name "github-actions[bot]" - git add $cov_badge_path - git commit -m "Updated coverage.svg" - - name: Push changes - if: steps.changed_files.outputs.files_changed == 'true' - uses: ad-m/github-push-action@master - with: - github_token: ${{ secrets.github_token }} - branch: ${{ github.ref }} +# This workflow will install Python dependencies, run tests and lint with a variety of Python versions +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python + +name: build + +env: + package: codext + +on: + push: + branches: [ "main" ] + pull_request: + branches: [ "main" ] + +jobs: + build: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest] + python-version: ["3.8", "3.9", "3.10", "3.11"] + steps: + - uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - name: Install ${{ env.package }} + run: | + python -m pip install --upgrade pip + python -m pip install flake8 pytest pytest-cov pytest-pythonpath coverage + pip install -r requirements.txt + pip install . + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Test ${{ env.package }} with pytest + run: | + pytest --cov=$package + coverage: + needs: build + runs-on: ubuntu-latest + env: + cov_badge_path: docs/coverage.svg + steps: + - uses: actions/checkout@v3 + - name: Install ${{ env.package }} + run: | + python -m pip install --upgrade pip + python -m pip install pytest pytest-cov pytest-pythonpath + pip install -r requirements.txt + pip install . + - name: Make coverage badge for ${{ env.package }} + run: | + pip install genbadge[coverage] + pytest --cov=$package --cov-report=xml + genbadge coverage -i coverage.xml -o $cov_badge_path + - name: Verify Changed files + uses: tj-actions/verify-changed-files@v12 + id: changed_files + with: + files: ${{ env.cov_badge_path }} + - name: Commit files + if: steps.changed_files.outputs.files_changed == 'true' + run: | + git config --local user.email "github-actions[bot]@users.noreply.github.com" + git config --local user.name "github-actions[bot]" + git add $cov_badge_path + git commit -m "Updated coverage.svg" + - name: Push changes + if: steps.changed_files.outputs.files_changed == 'true' + uses: ad-m/github-push-action@master + with: + github_token: ${{ secrets.github_token }} + branch: ${{ github.ref }} diff --git a/docs/pages/cli.md b/docs/pages/cli.md index 111913c..4b22cd4 100644 --- a/docs/pages/cli.md +++ b/docs/pages/cli.md @@ -1,184 +1,182 @@ -## CLI Tool - -`codext` has a Command-Line Interface tool. - ------ - -### Using Codext from the terminal - -The help message describes everything to know: - -```sh -usage: codext [-h] [-i INFILE] [-o OUTFILE] [-s] {encode,decode,guess,search} ... - -Codecs Extension (CodExt) 1.8.1 - -Author : Alexandre D'Hondt (alexandre.dhondt@gmail.com) -Copyright: © 2019-2021 A. 
D'Hondt -License : GPLv3 (https://www.gnu.org/licenses/gpl-3.0.fr.html) -Source : https://github.com/dhondta/python-codext - -This tool allows to encode/decode input strings/files with an extended set of codecs. - -positional arguments: - {encode,decode,guess,search} - command to be executed - encode encode input using the specified codecs - decode decode input using the specified codecs - guess try guessing the decoding codecs - search search for codecs - -optional arguments: - -h, --help show this help message and exit - -i INFILE, --input-file INFILE - input file (if none, take stdin as input) - -o OUTFILE, --output-file OUTFILE - output file (if none, display result to stdout) - -s, --strip-newlines strip newlines from input - -usage examples: -- codext search bitcoin -- codext decode base32 -i file.b32 -- codext encode morse < to_be_encoded.txt -- echo "test" | codext encode base100 -- echo -en "test" | codext encode braille -o test.braille -- codext encode base64 < to_be_encoded.txt > text.b64 -- echo -en "test" | codext encode base64 | codext encode base32 -- echo -en "mrdvm6teie6t2cq=" | codext encode upper | codext decode base32 | codext decode base64 -- echo -en "test" | codext encode upper reverse base32 | codext decode base32 reverse lower -- echo -en "test" | codext encode upper reverse base32 base64 morse -- echo -en "test" | codext encode base64 gzip | codext guess -- echo -en "test" | codext encode base64 gzip | codext guess gzip -c base -``` - -!!! note "Input/output" - - STDIN can be used as shown in an example from the help message, like when using the common Linux tool `base64`. - - Unless an output file is specified, the result is displayed in STDOUT. - -!!! note "Encodings chaining" - - Encodings can be chained as shown in the last examples of the help message. This can be practical for quickly manipulating data. 
- -### Execution examples - -**Scenario 1**: 2-stages encoded flag - -Creating the payload: - -```session -$ echo "A somewhat weird F1@9 !" | codext encode rotate-3 base58 -pwTDSWRUbXTuMQs5EDgKpjgW8MiJVw1 -``` - -From this point, the only thing we know is that we are searching for "*flag*" (with eventually other characters, i.e. leetspeak). - -```session -$ echo "A somewhat weird F1@9 !" | codext encode rotate-3 base58 | codext guess -f flag -Codecs: base58, rotate-3 -A somewhat weird F1@9 ! -``` - -Executing the previous command will take a few tens of seconds. With few stages to be guessed, using the scoring heuristic can be far quicker to get to the right output. The following takes less than a second. - -```session -$ echo "A somewhat weird F1@9 !" | codext encode rotate-3 base58 | codext guess -f flag --heuristic -Codecs: base58, rotate-3 -A somewhat weird F1@9 ! -``` - -**Scenario 2**: Multi-stage-encoded flag - -Creating the payload: - -```session -$ echo "A somewhat weird F1@9 !" | codext encode barbie-1 base32 morse -.... -.-- --.- --. -- ....- - -.- -- . ..... -..- --. ..--- .-.. .. . .- ..... .-- -.-. ..... -.. --- -. --.- --.- . --. -- .-. --... ..-. ..- --.- -.-. -- -...- -...- -...- -``` - -When looking at the string, it is easy to figure out it is morse. The problem, at this point, is that this codec is case-insensitive and always returns lowercase characters, as shown hereafter. - -```session -$ echo "A somewhat weird F1@9 !" | codext encode barbie-1 base32 morse | codext decode morse -hyqgm4tkme5xg2liea5wc5donqqegmr7fuqcm=== -``` - -In order to get it guessed as Base32, it is necessary to put it back to uppercase (in other words, decode from lowercase). - -```session -$ echo "A somewhat weird F1@9 !" 
| codext encode barbie-1 base32 morse | codext decode morse lowercase -HYQGM4TKME5XG2LIEA5WC5DONQQEGMR7FUQCM=== -``` - -Now that we know we are searching for something with "*flag*" (with eventually other characters), we can use the predefined "`flag`" stop function. - -```session -$ echo "A somewhat weird F1@9 !" | codext encode barbie-1 base32 morse | codext decode morse lowercase | codext guess -f flag -Codecs: base32, barbie -A somewhat weird F1@9 ! -``` - -**Scenario 3**: Base-encoded rotated shifted secret (English) message - -Creating the payload: - -```session -$ echo "My super secret string" | codext encode shift-1 rotate-2 base58 base64 -NDNxaFdieXh0Z29XOVZpWWpjRGNpRWgyZE44Z2FNU0g= -``` - -First, we shall simplify as much as possible ; we can easily guess that Base64 was used as the first encoding scheme: - -```session -$ echo "aDRuUnFGaWZTblJqRmZReFJIdVZweGp4cFA4Y0NS" | codext rank -[+] 1.00002: base62 -[+] 0.99401: base64 -[+] 0.70806: rotate-1 -[+] 0.70806: rotate-2 -[+] 0.70806: rotate-3 -[+] 0.70806: rotate-4 -[+] 0.70806: rotate-5 -[+] 0.70806: rotate-6 -[+] 0.70806: rotate-7 -[+] 0.70806: rotate-left-1 - -$ echo "aDRuUnFGaWZTblJqRmZReFJIdVZweGp4cFA4Y0NS" | codext decode base62 -%¤q ´!.[æ&[fÿhbð^ - -$ echo "aDRuUnFGaWZTblJqRmZReFJIdVZweGp4cFA4Y0NS" | codext decode base64 -h4nRqFifSnRjFfQxRHuVpxjxpP8cCR -``` - -Afterwards, we can still try to simplify ; - -```session -$ echo "aDRuUnFGaWZTblJqRmZReFJIdVZweGp4cFA4Y0NS" | codext decode base64 | codext rank -[+] 1.00185: base58 -[+] 0.99091: base62 -[+] 0.67001: rotate-1 -[+] 0.67001: rotate-2 -[+] 0.67001: rotate-3 -[+] 0.67001: rotate-4 -[+] 0.67001: rotate-5 -[+] 0.67001: rotate-6 -[+] 0.67001: rotate-7 -[+] 0.67001: rotate-left-1 -``` - -From here, let us assume that `base58` is effectively the right second-stage encoding. Guessing the two remaining encodings with no more information will now take a few seconds. 
As multiple outputs can be recognized as normal text, we will use the "`-s`" option not to stop on the first output successfully decoded as text. Moreover, if we have the intuition that the output shall be English text, we can use a more refined stop function like "`lang_en`" with the "`-f`" option. - -```session -$ echo "aDRuUnFGaWZTblJqRmZReFJIdVZweGp4cFA4Y0NS" | codext decode base64 | codext decode base58 | codext guess -s -f lang_en -[...] -[+] rotate-2, rot-1: My!super!secret!string -[+] rotate-2, rot-23: Qc!wytiv!wigvix!wxvmrk -[+] rotate-2, shift-1: My super secret string -[+] rotate-2, shift-20: :f\r`b]R_\r`RP_Ra\r`a_V[T -[...] -[+] rotate-left-6, shift-1: My super secret string -^C^C^C -``` - -We can then stop the research with Ctrl+C. The right output has been found ! - +`codext` has a Command-Line Interface tool. + +----- + +### Using Codext from the terminal + +The help message describes everything to know: + +```sh +usage: codext [-h] [-i INFILE] [-o OUTFILE] [-s] {encode,decode,guess,search} ... + +Codecs Extension (CodExt) 1.8.1 + +Author : Alexandre D'Hondt (alexandre.dhondt@gmail.com) +Copyright: © 2019-2021 A. D'Hondt +License : GPLv3 (https://www.gnu.org/licenses/gpl-3.0.fr.html) +Source : https://github.com/dhondta/python-codext + +This tool allows to encode/decode input strings/files with an extended set of codecs. 
+ +positional arguments: + {encode,decode,guess,search} + command to be executed + encode encode input using the specified codecs + decode decode input using the specified codecs + guess try guessing the decoding codecs + search search for codecs + +optional arguments: + -h, --help show this help message and exit + -i INFILE, --input-file INFILE + input file (if none, take stdin as input) + -o OUTFILE, --output-file OUTFILE + output file (if none, display result to stdout) + -s, --strip-newlines strip newlines from input + +usage examples: +- codext search bitcoin +- codext decode base32 -i file.b32 +- codext encode morse < to_be_encoded.txt +- echo "test" | codext encode base100 +- echo -en "test" | codext encode braille -o test.braille +- codext encode base64 < to_be_encoded.txt > text.b64 +- echo -en "test" | codext encode base64 | codext encode base32 +- echo -en "mrdvm6teie6t2cq=" | codext encode upper | codext decode base32 | codext decode base64 +- echo -en "test" | codext encode upper reverse base32 | codext decode base32 reverse lower +- echo -en "test" | codext encode upper reverse base32 base64 morse +- echo -en "test" | codext encode base64 gzip | codext guess +- echo -en "test" | codext encode base64 gzip | codext guess gzip -c base +``` + +!!! note "Input/output" + + STDIN can be used as shown in an example from the help message, like when using the common Linux tool `base64`. + + Unless an output file is specified, the result is displayed in STDOUT. + +!!! note "Encodings chaining" + + Encodings can be chained as shown in the last examples of the help message. This can be practical for quickly manipulating data. + +### Execution examples + +**Scenario 1**: 2-stages encoded flag + +Creating the payload: + +```session +$ echo "A somewhat weird F1@9 !" | codext encode rotate-3 base58 +pwTDSWRUbXTuMQs5EDgKpjgW8MiJVw1 +``` + +From this point, the only thing we know is that we are searching for "*flag*" (with eventually other characters, i.e. leetspeak). 
+ +```session +$ echo "A somewhat weird F1@9 !" | codext encode rotate-3 base58 | codext guess -f flag +Codecs: base58, rotate-3 +A somewhat weird F1@9 ! +``` + +Executing the previous command will take a few tens of seconds. With few stages to be guessed, using the scoring heuristic can be far quicker to get to the right output. The following takes less than a second. + +```session +$ echo "A somewhat weird F1@9 !" | codext encode rotate-3 base58 | codext guess -f flag --heuristic +Codecs: base58, rotate-3 +A somewhat weird F1@9 ! +``` + +**Scenario 2**: Multi-stage-encoded flag + +Creating the payload: + +```session +$ echo "A somewhat weird F1@9 !" | codext encode barbie-1 base32 morse +.... -.-- --.- --. -- ....- - -.- -- . ..... -..- --. ..--- .-.. .. . .- ..... .-- -.-. ..... -.. --- -. --.- --.- . --. -- .-. --... ..-. ..- --.- -.-. -- -...- -...- -...- +``` + +When looking at the string, it is easy to figure out it is morse. The problem, at this point, is that this codec is case-insensitive and always returns lowercase characters, as shown hereafter. + +```session +$ echo "A somewhat weird F1@9 !" | codext encode barbie-1 base32 morse | codext decode morse +hyqgm4tkme5xg2liea5wc5donqqegmr7fuqcm=== +``` + +In order to get it guessed as Base32, it is necessary to put it back to uppercase (in other words, decode from lowercase). + +```session +$ echo "A somewhat weird F1@9 !" | codext encode barbie-1 base32 morse | codext decode morse lowercase +HYQGM4TKME5XG2LIEA5WC5DONQQEGMR7FUQCM=== +``` + +Now that we know we are searching for something with "*flag*" (with eventually other characters), we can use the predefined "`flag`" stop function. + +```session +$ echo "A somewhat weird F1@9 !" | codext encode barbie-1 base32 morse | codext decode morse lowercase | codext guess -f flag +Codecs: base32, barbie +A somewhat weird F1@9 ! 
+``` + +**Scenario 3**: Base-encoded rotated shifted secret (English) message + +Creating the payload: + +```session +$ echo "My super secret string" | codext encode shift-1 rotate-2 base58 base64 +NDNxaFdieXh0Z29XOVZpWWpjRGNpRWgyZE44Z2FNU0g= +``` + +First, we shall simplify as much as possible ; we can easily guess that Base64 was used as the first encoding scheme: + +```session +$ echo "aDRuUnFGaWZTblJqRmZReFJIdVZweGp4cFA4Y0NS" | codext rank +[+] 1.00002: base62 +[+] 0.99401: base64 +[+] 0.70806: rotate-1 +[+] 0.70806: rotate-2 +[+] 0.70806: rotate-3 +[+] 0.70806: rotate-4 +[+] 0.70806: rotate-5 +[+] 0.70806: rotate-6 +[+] 0.70806: rotate-7 +[+] 0.70806: rotate-left-1 + +$ echo "aDRuUnFGaWZTblJqRmZReFJIdVZweGp4cFA4Y0NS" | codext decode base62 +%¤q ´!.[æ&[fÿhbð^ + +$ echo "aDRuUnFGaWZTblJqRmZReFJIdVZweGp4cFA4Y0NS" | codext decode base64 +h4nRqFifSnRjFfQxRHuVpxjxpP8cCR +``` + +Afterwards, we can still try to simplify ; + +```session +$ echo "aDRuUnFGaWZTblJqRmZReFJIdVZweGp4cFA4Y0NS" | codext decode base64 | codext rank +[+] 1.00185: base58 +[+] 0.99091: base62 +[+] 0.67001: rotate-1 +[+] 0.67001: rotate-2 +[+] 0.67001: rotate-3 +[+] 0.67001: rotate-4 +[+] 0.67001: rotate-5 +[+] 0.67001: rotate-6 +[+] 0.67001: rotate-7 +[+] 0.67001: rotate-left-1 +``` + +From here, let us assume that `base58` is effectively the right second-stage encoding. Guessing the two remaining encodings with no more information will now take a few seconds. As multiple outputs can be recognized as normal text, we will use the "`-s`" option not to stop on the first output successfully decoded as text. Moreover, if we have the intuition that the output shall be English text, we can use a more refined stop function like "`lang_en`" with the "`-f`" option. + +```session +$ echo "aDRuUnFGaWZTblJqRmZReFJIdVZweGp4cFA4Y0NS" | codext decode base64 | codext decode base58 | codext guess -s -f lang_en +[...] 
+[+] rotate-2, rot-1: My!super!secret!string +[+] rotate-2, rot-23: Qc!wytiv!wigvix!wxvmrk +[+] rotate-2, shift-1: My super secret string +[+] rotate-2, shift-20: :f\r`b]R_\r`RP_Ra\r`a_V[T +[...] +[+] rotate-left-6, shift-1: My super secret string +^C^C^C +``` + +We can then stop the research with Ctrl+C. The right output has been found ! + diff --git a/docs/pages/enc/base.md b/docs/pages/enc/base.md index 757965e..dc7b26c 100644 --- a/docs/pages/enc/base.md +++ b/docs/pages/enc/base.md @@ -1,174 +1,172 @@ -## Base - -`codext` defines a far broader set of Base-encodings than in the original library. - ------ - -### Classical base 2^N encodings - -This namely adds the classical BaseXX encodings like 16 (hexadecimal) and 32 (RFC 3548), which are not available in the native codecs. - -Common base encodings with N a power of 2: - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`base2` | text <-> Base2 encoded text | `(base[-_]?2|bin)-inv(erted)?` | Dynamic charset parameter `[-_]...`, amongst letters and digits (e.g. `_AB`) -`base4` | text <-> Base4 encoded text | `base[-_]?4-inv(erted)` | Dynamic charset parameter `[-_]...`, amongst letters and digits (e.g. `_6VC9`) -`base8` | text <-> Base8 encoded text | `base[-_]?8-inv(erted)` | Charset: `abcdefgh` ; Dynamic charset parameter `[-_]...`, amongst letters and digits (e.g. `_A5c96T7x`) -`base16` | text <-> Base16 encoded text | `base[-_]?16-inv(erted)` | -`base32` | text <-> Base32 encoded text | `base[-_]?32-inv(erted)`, `base32-crockford`, `base32_geohash`, ... | Also supports Base32 Crockford, Geohash and Hex -`zbase32` | text <-> ZBase32 encoded text | `z[-_]?base[-_]?32` | Human-oriented Base32 -`base64` | text <-> Base64 encoded text | `base[-_]?64-inv(erted)` | - -!!! note "Aliases" - - All the aliases are case insensitive for base encodings. 
- -```python ->>> codext.encode("test", "base2") -'01110100011001010111001101110100' ->>> codext.encode("test", "base2-inv") -'10001011100110101000110010001011' -``` - -```python ->>> codecs.encode("this is a test", "base16") -'7468697320697320612074657374' ->>> codecs.decode("7468697320697320612074657374", "base16") -'this is a test' ->>> codecs.encode("this is a test", "base16-inv") -'1E02031DCA031DCA0BCA1E0F1D1E' -``` - -```python ->>> codext.encode("this is a test", "base32") -'ORUGS4ZANFZSAYJAORSXG5A=' ->>> codext.decode("ORUGS4ZANFZSAYJAORSXG5A=", "base32") -'this is a test' -``` - -Note that for `base64`, it overwrites the native `base64_codec` to also support en/decoding from str. - -```python ->>> codecs.encode("this is a test", "base64") -'dGhpcyBpcyBhIHRlc3Q=' ->>> codecs.decode("dGhpcyBpcyBhIHRlc3Q=", "base64") -'this is a test' -``` - ------ - -### Generic base encodings - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`base3` | text <-> Base3 encoded text | `base[-_]?3(|[-_]inv(erted)?)` | Dynamic charset parameter `[-_]...`, amongst letters and digits (e.g. 
`_C2Z`) -`base10` | text <-> Base10 encoded text | `base[-_]?10|int(?:eger)?|dec(?:imal)?` | -`base11` | text <-> Base11 encoded text | `base[-_]?11(|[-_]inv(erted)?)` | -`base36` | text <-> Base36 encoded text | `base[-_]?36(|[-_]inv(erted)?)` | -`base45` | text <-> Base45 encoded text | `base[-_]?45(|[-_]inv(erted)?)` | -`base58` | text <-> Base58 encoded text | `base[-_]?58(|[-_](bc|bitcoin|rp|ripple|fl|flickr|short[-]?url|url))` | Supports Bitcoin, Ripple and short URL -`base62` | text <-> Base62 encoded text | `base[-_]?62(|[-_]inv(erted)?)` | -`base63` | text <-> Base63 encoded text | `base[-_]?63(|[-_]inv(erted)?)` | -`base91` | text <-> Base91 encoded text | `base[-_]?91(|[-_]inv(erted)?)` | -`base91-alt` | text <-> Alternate Base91 encoded text | `base[-_]?91[-_]alt(?:ernate)?(|[-_]inv(erted)?)` | Another version of Base91 - -```python ->>> codext.encode("test", "base3") -'23112113223321323322' -``` - -```python ->>> codecs.encode("test", "base36") -'WANEK4' ->>> codecs.decode("4WMHTK6UZL044O91NKCEB8", "base36") -'this is a test' -``` - -```python ->>> codext.encode("this is a test!", "base45") -'AWE+EDH44.OEOCC7WE QEX0' ->>> codext.decode('AWE+EDH44.OEOCC7WE QEX0', "base45") -'this is a test!' -``` - -```python ->>> codext.encode("this is a test", "base58") -'jo91waLQA1NNeBmZKUF' ->>> codext.encode("this is a test", "base58-ripple") -'jo9rA2LQwr44eBmZK7E' ->>> codext.encode("this is a test", "base58-url") -'JN91Wzkpa1nnDbLyjtf' -``` - -```python ->>> codecs.encode("test", "base62") -'289lyu' ->>> codecs.encode("this is a test", "base62") -'CsoB4HQ5gmgMyCenF7E' -``` - -```python ->>> codecs.encode("This is a test !", "base91") -'nX,<:WRT%yxth90oZB^C' ->>> codext.encode("This is a test !", "base91-alt") -'?a&[jv4S3Wg>,71@Jo#K' -``` - -!!! note "Generic encodings" - - Base encodings are available for any N other than the ones explicitely specified using the "`-generic`" suffix. 
Their charsets consist of printable characters from the `string` module for N up to 100 and for characters composed from the 256 possible ordinals for a greater N. - - :::python - >>> codext.encode("test", "base3-generic") - '12001002112210212211' - >>> codext.encode("test", "base17-generic") - '4cf60456' - ------ - -### Base85 - -This encoding implements various different versions of Base85. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`base85` | text <-> ascii85 | `(base[-_]?85(?:|[-_](?:adobe|x?btoa|ipv6|rfc1924|xml|z(?:eromq)?))|z85|ascii85)` | - -```python ->>> codext.encode("this is a test", "ascii85") -"FD,B0+DGm>@3BZ'F*%" ->>> codext.decode("FD,B0+DGm>@3BZ'F*%", "ascii85") -'this is a test' ->>> with open("ascii85.txt", 'w', encoding="ascii85") as f: - f.write("this is a test") -14 ->>> with open("ascii85.txt", encoding="ascii85") as f: - f.read() -'this is a test' -``` - ------ - -### Other base encodings - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`base100` | text <-> Base100 encoded text | `base[-_]?100|emoji` | Python 3 only -`base122` | text <-> Base122 encoded text | `base[-_]?122` | Python 3 only -`base128` | text <-> Base128 encoded text | `base[-_]?128` | Relies on the ASCII charset - -```python ->>> codecs.encode("this is a test", "base100") -'👫👟👠👪🐗👠👪🐗👘🐗👫👜👪👫' ->>> codecs.decode("👫👟👠👪🐗👠👪🐗👘🐗👫👜👪👫", "base100") -'this is a test' -``` - -```python ->>> codecs.encode("this is a test", "base122") -':\x1aʗ\x19\x01Rs\x10\x18$\x07#\x15ft' ->>> codecs.decode(":\x1aʗ\x19\x01Rs\x10\x18$\x07#\x15ft", "base122") -'this is a test' -``` - +`codext` defines a far broader set of Base-encodings than in the original library. + +----- + +### Classical base 2^N encodings + +This namely adds the classical BaseXX encodings like 16 (hexadecimal) and 32 (RFC 3548), which are not available in the native codecs. 
+ +Common base encodings with N a power of 2: + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`base2` | text <-> Base2 encoded text | `(base[-_]?2|bin)-inv(erted)?` | Dynamic charset parameter `[-_]...`, amongst letters and digits (e.g. `_AB`) +`base4` | text <-> Base4 encoded text | `base[-_]?4-inv(erted)` | Dynamic charset parameter `[-_]...`, amongst letters and digits (e.g. `_6VC9`) +`base8` | text <-> Base8 encoded text | `base[-_]?8-inv(erted)` | Charset: `abcdefgh` ; Dynamic charset parameter `[-_]...`, amongst letters and digits (e.g. `_A5c96T7x`) +`base16` | text <-> Base16 encoded text | `base[-_]?16-inv(erted)` | +`base32` | text <-> Base32 encoded text | `base[-_]?32-inv(erted)`, `base32-crockford`, `base32_geohash`, ... | Also supports Base32 Crockford, Geohash and Hex +`zbase32` | text <-> ZBase32 encoded text | `z[-_]?base[-_]?32` | Human-oriented Base32 +`base64` | text <-> Base64 encoded text | `base[-_]?64-inv(erted)` | + +!!! note "Aliases" + + All the aliases are case insensitive for base encodings. + +```python +>>> codext.encode("test", "base2") +'01110100011001010111001101110100' +>>> codext.encode("test", "base2-inv") +'10001011100110101000110010001011' +``` + +```python +>>> codecs.encode("this is a test", "base16") +'7468697320697320612074657374' +>>> codecs.decode("7468697320697320612074657374", "base16") +'this is a test' +>>> codecs.encode("this is a test", "base16-inv") +'1E02031DCA031DCA0BCA1E0F1D1E' +``` + +```python +>>> codext.encode("this is a test", "base32") +'ORUGS4ZANFZSAYJAORSXG5A=' +>>> codext.decode("ORUGS4ZANFZSAYJAORSXG5A=", "base32") +'this is a test' +``` + +Note that for `base64`, it overwrites the native `base64_codec` to also support en/decoding from str. 
+ +```python +>>> codecs.encode("this is a test", "base64") +'dGhpcyBpcyBhIHRlc3Q=' +>>> codecs.decode("dGhpcyBpcyBhIHRlc3Q=", "base64") +'this is a test' +``` + +----- + +### Generic base encodings + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`base3` | text <-> Base3 encoded text | `base[-_]?3(|[-_]inv(erted)?)` | Dynamic charset parameter `[-_]...`, amongst letters and digits (e.g. `_C2Z`) +`base10` | text <-> Base10 encoded text | `base[-_]?10|int(?:eger)?|dec(?:imal)?` | +`base11` | text <-> Base11 encoded text | `base[-_]?11(|[-_]inv(erted)?)` | +`base36` | text <-> Base36 encoded text | `base[-_]?36(|[-_]inv(erted)?)` | +`base45` | text <-> Base45 encoded text | `base[-_]?45(|[-_]inv(erted)?)` | +`base58` | text <-> Base58 encoded text | `base[-_]?58(|[-_](bc|bitcoin|rp|ripple|fl|flickr|short[-]?url|url))` | Supports Bitcoin, Ripple and short URL +`base62` | text <-> Base62 encoded text | `base[-_]?62(|[-_]inv(erted)?)` | +`base63` | text <-> Base63 encoded text | `base[-_]?63(|[-_]inv(erted)?)` | +`base91` | text <-> Base91 encoded text | `base[-_]?91(|[-_]inv(erted)?)` | +`base91-alt` | text <-> Alternate Base91 encoded text | `base[-_]?91[-_]alt(?:ernate)?(|[-_]inv(erted)?)` | Another version of Base91 + +```python +>>> codext.encode("test", "base3") +'23112113223321323322' +``` + +```python +>>> codecs.encode("test", "base36") +'WANEK4' +>>> codecs.decode("4WMHTK6UZL044O91NKCEB8", "base36") +'this is a test' +``` + +```python +>>> codext.encode("this is a test!", "base45") +'AWE+EDH44.OEOCC7WE QEX0' +>>> codext.decode('AWE+EDH44.OEOCC7WE QEX0', "base45") +'this is a test!' 
+``` + +```python +>>> codext.encode("this is a test", "base58") +'jo91waLQA1NNeBmZKUF' +>>> codext.encode("this is a test", "base58-ripple") +'jo9rA2LQwr44eBmZK7E' +>>> codext.encode("this is a test", "base58-url") +'JN91Wzkpa1nnDbLyjtf' +``` + +```python +>>> codecs.encode("test", "base62") +'289lyu' +>>> codecs.encode("this is a test", "base62") +'CsoB4HQ5gmgMyCenF7E' +``` + +```python +>>> codecs.encode("This is a test !", "base91") +'nX,<:WRT%yxth90oZB^C' +>>> codext.encode("This is a test !", "base91-alt") +'?a&[jv4S3Wg>,71@Jo#K' +``` + +!!! note "Generic encodings" + + Base encodings are available for any N other than the ones explicitely specified using the "`-generic`" suffix. Their charsets consist of printable characters from the `string` module for N up to 100 and for characters composed from the 256 possible ordinals for a greater N. + + :::python + >>> codext.encode("test", "base3-generic") + '12001002112210212211' + >>> codext.encode("test", "base17-generic") + '4cf60456' + +----- + +### Base85 + +This encoding implements various different versions of Base85. 
+ +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`base85` | text <-> ascii85 | `(base[-_]?85(?:|[-_](?:adobe|x?btoa|ipv6|rfc1924|xml|z(?:eromq)?))|z85|ascii85)` | + +```python +>>> codext.encode("this is a test", "ascii85") +"FD,B0+DGm>@3BZ'F*%" +>>> codext.decode("FD,B0+DGm>@3BZ'F*%", "ascii85") +'this is a test' +>>> with open("ascii85.txt", 'w', encoding="ascii85") as f: + f.write("this is a test") +14 +>>> with open("ascii85.txt", encoding="ascii85") as f: + f.read() +'this is a test' +``` + +----- + +### Other base encodings + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`base100` | text <-> Base100 encoded text | `base[-_]?100|emoji` | Python 3 only +`base122` | text <-> Base122 encoded text | `base[-_]?122` | Python 3 only +`base128` | text <-> Base128 encoded text | `base[-_]?128` | Relies on the ASCII charset + +```python +>>> codecs.encode("this is a test", "base100") +'👫👟👠👪🐗👠👪🐗👘🐗👫👜👪👫' +>>> codecs.decode("👫👟👠👪🐗👠👪🐗👘🐗👫👜👪👫", "base100") +'this is a test' +``` + +```python +>>> codecs.encode("this is a test", "base122") +':\x1aʗ\x19\x01Rs\x10\x18$\x07#\x15ft' +>>> codecs.decode(":\x1aʗ\x19\x01Rs\x10\x18$\x07#\x15ft", "base122") +'this is a test' +``` + diff --git a/docs/pages/enc/binary.md b/docs/pages/enc/binary.md index 745ef82..0ed7fb0 100644 --- a/docs/pages/enc/binary.md +++ b/docs/pages/enc/binary.md @@ -1,168 +1,166 @@ -## Binary - -`codext` also adds common binary encodings. For instance, the Manchester code, that encodes digits, is applied to the ordinals of the input text and the resulting binary stream is converted back to characters. - ------ - -### Baudot - -It supports various formats such as CCITT-1 and CCITT-2, ITA1 and ITA2, and some others. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`baudot` | text <-> text | Baudot code bits | `baudot-ccitt1`, `baudot_ccitt2_lsb`, ... 
| supports CCITT-1, CCITT-2, EU/FR, ITA1, ITA2, MTK-2 (Python3 only), UK, ... -`baudot-spaced` | text <-> Baudot code groups of bits | `baudot-spaced-ita1_lsb`, `baudot_spaced_ita2_msb`, ... | groups of 5 bits are whitespace-separated -`baudot-tape` | text <-> Baudot code tape | `baudot-tape-mtk2`, `baudot_tape_murray`, ... | outputs a string that looks like a perforated tape - -!!! note "LSB / MSB" - - "`_lsb`" or "`_msb`" can be specified in the codec name to set the bits order. If not specified, it defaults to MSB. - - -```python ->>> codext.encode("12345", "baudot-fr") -'010000000100010001000010100111' ->>> codext.decode("010000000100010001000010100111", "baudot-fr") -'12345' -``` - -```python ->>> codext.encode("TEST", "baudot-spaced_uk") -'10101 00010 10100 10101' ->>> codext.decode("10101 00010 10100 10101", "baudot-spaced_uk") -'TEST' -``` - -```python ->>> s = codext.encode("HELLO WORLD!", "baudot-tape_ita2") ->>> print(s) -***.** -* *. - . * -* .* -* .* -** . - *. -* .** -** . - * .* -* .* - * . * -** .** - **. * ->>> codext.decode(s, "baudot-tape_ita2") -'HELLO WORLD!' -``` - ------ - -### Binary Coded Decimal (BCD) - -It converts characters to their odrinals, left-pads with zeros, converts digits to 4-bits groups and then make characters with the assembled groups. It can also use a 4-bits prefix for making new characters. It then allows to define extended versions of BCD. 
- -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`bcd` | text <-> BCD encoded text | `binary_coded_decimals` | -`bcd-extended0` | text <-> BCD encoded text using prefix `0000` | `bcd_ext0`, `bcd-extended-zeros`, `binary_coded_decimals_extended_0` | -`bcd-extended1` | text <-> BCD encoded text using prefix `1111` | `bcd_ext1`, `bcd-extended-ones`, `binary_coded_decimals_extended_1` | - -```python ->>> codext.encode("Test", "bcd") -'\x08A\x01\x11Q\x16' ->>> codext.decode("\x08A\x01\x11Q\x16", "binary_coded_decimal") -'Test' ->>> codext.encode("Test", "bcd_ext_zero") -'\x00\x08\x04\x01\x00\x01\x01\x01\x05\x01\x01\x06\x00' ->>> codext.decode("\x00\x08\x04\x01\x00\x01\x01\x01\x05\x01\x01\x06\x00", "bcd-ext0") -'Test' ->>> codext.encode("Test", "bcd_extended_ones") -'\xf0\xf8\xf4\xf1\xf0\xf1\xf1\xf1\xf5\xf1\xf1\xf6\xf0' ->>> codext.decode("\xf0\xf8\xf4\xf1\xf0\xf1\xf1\xf1\xf5\xf1\xf1\xf6\xf0", "bcd_ext1") -'Test' -``` - ------ - -### Excess-3 - -Also called *Stibitz code*, it converts characters to ordinals, left-pads with zeros and then applies Excess-3 (Stibitz) code to get groups of 4 bits that are finally reassembled into bytes. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`excess3` | text <-> XS3 encoded text | `excess-3`, `xs3`, `stibitz` | - -```python ->>> codext.encode("This is a test!", "excess-3") -';t7C\x84H6T8D\x83e<£eD\x944D\x84I6`' ->>> codext.decode(";t7C\x84H6T8D\x83e<£eD\x944D\x84I6`", "stibitz") -'This is a test!' -``` - ------ - -### Gray - -Also called *reflected binary code*, it implements the Gray code applied to characters while converted to bytes. 
- -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`gray` | text <-> gray encoded text | `reflected-bin`, `reflected_binary` | - -```python ->>> codext.encode("this is a test", "gray") -'N\\]J0]J0Q0NWJN' ->>> codext.decode("N\\]J0]J0Q0NWJN", "gray") -'this is a test' ->>> codext.encode("THIS IS A TEST", "gray") -'~lmz0mz0a0~gz~' ->>> codext.decode("~lmz0mz0a0~gz~", "gray") -'THIS IS A TEST' -``` - ------ - -### Manchester - -This codec XORes each group of 4 bits of the input text with a 1-byte clock signal, e.g. `0x55` giving in binary `01010101`. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`manchester` | text <-> manchester encoded text | | clock signal is `0x55` (`01010101`) -`manchester-inverted` | text <-> manchester encoded text | `ethernet`, `ieee802.4` | clock signal is `0xaa` (`10101010`) - -```python ->>> codext.encode("This is a test!", "manchester") -'fei\x95i\x96jZYUi\x96jZYUiVYUjeifjZjeYV' ->>> codext.decode("fei\x95i\x96jZYUi\x96jZYUiVYUjeifjZjeYV", "manchester") -'This is a test!' ->>> codext.encode("This is a test!", "manchester-inverted") -'\x99\x9a\x96j\x96i\x95¥¦ª\x96i\x95¥¦ª\x96©¦ª\x95\x9a\x96\x99\x95¥\x95\x9a¦©' ->>> codext.decode("\x99\x9a\x96j\x96i\x95¥¦ª\x96i\x95¥¦ª\x96©¦ª\x95\x9a\x96\x99\x95¥\x95\x9a¦©", "ethernet") -'This is a test!' -``` - ------ - -### Rotate N bits - -This codec rotates of N bits each byte of an input string. - -!!! note "Lossless" - - This codec does not use the "`<<`" and "`>>`" operators as it is lossy in some cases. Instead, it rotates per group of 8 bits. 
- -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`rotate` | text <-> N-bits-rotated text | `rotate-N`, `rotate_bits-N`, `rotate-right-N`, `rotate_left_N` | N belongs to [1,7] ; when nothing specified, it rotates to the right - -```python ->>> codext.encode("test", "rotate-1") -':29:' ->>> codext.encode("test", "rotatebits-1") -':29:' ->>> codext.encode("test", "rotate_right-1") -':29:' ->>> codext.encode("test", "rotate_left_1") -'èÊæè' -``` - +`codext` also adds common binary encodings. For instance, the Manchester code, that encodes digits, is applied to the ordinals of the input text and the resulting binary stream is converted back to characters. + +----- + +### Baudot + +It supports various formats such as CCITT-1 and CCITT-2, ITA1 and ITA2, and some others. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`baudot` | text <-> text | Baudot code bits | `baudot-ccitt1`, `baudot_ccitt2_lsb`, ... | supports CCITT-1, CCITT-2, EU/FR, ITA1, ITA2, MTK-2 (Python3 only), UK, ... +`baudot-spaced` | text <-> Baudot code groups of bits | `baudot-spaced-ita1_lsb`, `baudot_spaced_ita2_msb`, ... | groups of 5 bits are whitespace-separated +`baudot-tape` | text <-> Baudot code tape | `baudot-tape-mtk2`, `baudot_tape_murray`, ... | outputs a string that looks like a perforated tape + +!!! note "LSB / MSB" + + "`_lsb`" or "`_msb`" can be specified in the codec name to set the bits order. If not specified, it defaults to MSB. + + +```python +>>> codext.encode("12345", "baudot-fr") +'010000000100010001000010100111' +>>> codext.decode("010000000100010001000010100111", "baudot-fr") +'12345' +``` + +```python +>>> codext.encode("TEST", "baudot-spaced_uk") +'10101 00010 10100 10101' +>>> codext.decode("10101 00010 10100 10101", "baudot-spaced_uk") +'TEST' +``` + +```python +>>> s = codext.encode("HELLO WORLD!", "baudot-tape_ita2") +>>> print(s) +***.** +* *. + . * +* .* +* .* +** . + *. +* .** +** . 
+ * .* +* .* + * . * +** .** + **. * +>>> codext.decode(s, "baudot-tape_ita2") +'HELLO WORLD!' +``` + +----- + +### Binary Coded Decimal (BCD) + +It converts characters to their odrinals, left-pads with zeros, converts digits to 4-bits groups and then make characters with the assembled groups. It can also use a 4-bits prefix for making new characters. It then allows to define extended versions of BCD. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`bcd` | text <-> BCD encoded text | `binary_coded_decimals` | +`bcd-extended0` | text <-> BCD encoded text using prefix `0000` | `bcd_ext0`, `bcd-extended-zeros`, `binary_coded_decimals_extended_0` | +`bcd-extended1` | text <-> BCD encoded text using prefix `1111` | `bcd_ext1`, `bcd-extended-ones`, `binary_coded_decimals_extended_1` | + +```python +>>> codext.encode("Test", "bcd") +'\x08A\x01\x11Q\x16' +>>> codext.decode("\x08A\x01\x11Q\x16", "binary_coded_decimal") +'Test' +>>> codext.encode("Test", "bcd_ext_zero") +'\x00\x08\x04\x01\x00\x01\x01\x01\x05\x01\x01\x06\x00' +>>> codext.decode("\x00\x08\x04\x01\x00\x01\x01\x01\x05\x01\x01\x06\x00", "bcd-ext0") +'Test' +>>> codext.encode("Test", "bcd_extended_ones") +'\xf0\xf8\xf4\xf1\xf0\xf1\xf1\xf1\xf5\xf1\xf1\xf6\xf0' +>>> codext.decode("\xf0\xf8\xf4\xf1\xf0\xf1\xf1\xf1\xf5\xf1\xf1\xf6\xf0", "bcd_ext1") +'Test' +``` + +----- + +### Excess-3 + +Also called *Stibitz code*, it converts characters to ordinals, left-pads with zeros and then applies Excess-3 (Stibitz) code to get groups of 4 bits that are finally reassembled into bytes. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`excess3` | text <-> XS3 encoded text | `excess-3`, `xs3`, `stibitz` | + +```python +>>> codext.encode("This is a test!", "excess-3") +';t7C\x84H6T8D\x83e<£eD\x944D\x84I6`' +>>> codext.decode(";t7C\x84H6T8D\x83e<£eD\x944D\x84I6`", "stibitz") +'This is a test!' 
+``` + +----- + +### Gray + +Also called *reflected binary code*, it implements the Gray code applied to characters while converted to bytes. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`gray` | text <-> gray encoded text | `reflected-bin`, `reflected_binary` | + +```python +>>> codext.encode("this is a test", "gray") +'N\\]J0]J0Q0NWJN' +>>> codext.decode("N\\]J0]J0Q0NWJN", "gray") +'this is a test' +>>> codext.encode("THIS IS A TEST", "gray") +'~lmz0mz0a0~gz~' +>>> codext.decode("~lmz0mz0a0~gz~", "gray") +'THIS IS A TEST' +``` + +----- + +### Manchester + +This codec XORes each group of 4 bits of the input text with a 1-byte clock signal, e.g. `0x55` giving in binary `01010101`. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`manchester` | text <-> manchester encoded text | | clock signal is `0x55` (`01010101`) +`manchester-inverted` | text <-> manchester encoded text | `ethernet`, `ieee802.4` | clock signal is `0xaa` (`10101010`) + +```python +>>> codext.encode("This is a test!", "manchester") +'fei\x95i\x96jZYUi\x96jZYUiVYUjeifjZjeYV' +>>> codext.decode("fei\x95i\x96jZYUi\x96jZYUiVYUjeifjZjeYV", "manchester") +'This is a test!' +>>> codext.encode("This is a test!", "manchester-inverted") +'\x99\x9a\x96j\x96i\x95¥¦ª\x96i\x95¥¦ª\x96©¦ª\x95\x9a\x96\x99\x95¥\x95\x9a¦©' +>>> codext.decode("\x99\x9a\x96j\x96i\x95¥¦ª\x96i\x95¥¦ª\x96©¦ª\x95\x9a\x96\x99\x95¥\x95\x9a¦©", "ethernet") +'This is a test!' +``` + +----- + +### Rotate N bits + +This codec rotates of N bits each byte of an input string. + +!!! note "Lossless" + + This codec does not use the "`<<`" and "`>>`" operators as it is lossy in some cases. Instead, it rotates per group of 8 bits. 
+ +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`rotate` | text <-> N-bits-rotated text | `rotate-N`, `rotate_bits-N`, `rotate-right-N`, `rotate_left_N` | N belongs to [1,7] ; when nothing specified, it rotates to the right + +```python +>>> codext.encode("test", "rotate-1") +':29:' +>>> codext.encode("test", "rotatebits-1") +':29:' +>>> codext.encode("test", "rotate_right-1") +':29:' +>>> codext.encode("test", "rotate_left_1") +'èÊæè' +``` + diff --git a/docs/pages/enc/common.md b/docs/pages/enc/common.md index 34a566c..1739ca8 100644 --- a/docs/pages/enc/common.md +++ b/docs/pages/enc/common.md @@ -1,71 +1,69 @@ -## Common - -`codext` also provides some very common encodings, for the sake of simplicity (e.g. while chaining codecs with [the CLI tool](../cli.html)). - ------ - -### A1Z26 - -This simple codec converts letters to their order number in the alphabet using a separator between characters and keeping words separated by a whitespace. It is similar to the [`consonant-vowel-indices`](others.html#letter-indices) encoding. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`a1z26` | text <-> alphabet order numbers | `a1z26`, `a1z26-/`, `a1z26-,`, ... | this codec does not preserve the case and is dynamic (separator of characters in each word can be customized among these: "`-_/|,;:*`") - -```python ->>> codext.encode("This is a test", "a1z26") -'20-8-9-19 9-19 1 20-5-19-20' ->>> codext.decode("20-8-9-19 9-19 1 20-5-19-20", "a1z26") -'this is a test' -``` - ------ - -### Octal - -This simple codec converts characters into their octal values. 
- -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`octal` | text <-> octal digits | `octals` | groups of 3-chars octal values when encoded -`octal-spaced` | text <-> spaced octal digits | `octals-spaced` | whitespace-separated suite of variable-length groups of octal digits when encoded - -```python ->>> codext.encode("this is a test", "octal") -'164150151163040151163040141040164145163164' ->>> codext.decode("164150151163040151163040141040164145163164", "octals") -'this is a test' -``` - -```python ->>> codext.encode("this is a test", "octal-spaced") -'164 150 151 163 40 151 163 40 141 40 164 145 163 164' ->>> codext.decode("164 150 151 163 40 151 163 40 141 40 164 145 163 164", "octals-spaced") -'this is a test' -``` - ------ - -### Ordinal - -This simple codec converts characters into their ordinals. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`ordinal` | text <-> ordinal digits | `ordinals` | groups of 3-chars ordinal values when encoded -`ordinal-spaced` | text <-> spaced ordinal digits | `ordinals-spaced` | whitespace-separated suite of variable-length groups of ordinal digits when encoded - -```python ->>> codext.encode("this is a test", "ordinal") -'116104105115032105115032097032116101115116' ->>> codext.decode("116104105115032105115032097032116101115116", "ordinals") -'this is a test' -``` - -```python ->>> codext.encode("this is a test", "ordinal-spaced") -'116 104 105 115 32 105 115 32 97 32 116 101 115 116' ->>> codext.decode("116 104 105 115 32 105 115 32 97 32 116 101 115 116", "ordinals-spaced") -'this is a test' -``` - +`codext` also provides some very common encodings, for the sake of simplicity (e.g. while chaining codecs with [the CLI tool](../cli.html)). + +----- + +### A1Z26 + +This simple codec converts letters to their order number in the alphabet using a separator between characters and keeping words separated by a whitespace. 
It is similar to the [`consonant-vowel-indices`](others.html#letter-indices) encoding. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`a1z26` | text <-> alphabet order numbers | `a1z26`, `a1z26-/`, `a1z26-,`, ... | this codec does not preserve the case and is dynamic (separator of characters in each word can be customized among these: "`-_/|,;:*`") + +```python +>>> codext.encode("This is a test", "a1z26") +'20-8-9-19 9-19 1 20-5-19-20' +>>> codext.decode("20-8-9-19 9-19 1 20-5-19-20", "a1z26") +'this is a test' +``` + +----- + +### Octal + +This simple codec converts characters into their octal values. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`octal` | text <-> octal digits | `octals` | groups of 3-chars octal values when encoded +`octal-spaced` | text <-> spaced octal digits | `octals-spaced` | whitespace-separated suite of variable-length groups of octal digits when encoded + +```python +>>> codext.encode("this is a test", "octal") +'164150151163040151163040141040164145163164' +>>> codext.decode("164150151163040151163040141040164145163164", "octals") +'this is a test' +``` + +```python +>>> codext.encode("this is a test", "octal-spaced") +'164 150 151 163 40 151 163 40 141 40 164 145 163 164' +>>> codext.decode("164 150 151 163 40 151 163 40 141 40 164 145 163 164", "octals-spaced") +'this is a test' +``` + +----- + +### Ordinal + +This simple codec converts characters into their ordinals. 
+ +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`ordinal` | text <-> ordinal digits | `ordinals` | groups of 3-chars ordinal values when encoded +`ordinal-spaced` | text <-> spaced ordinal digits | `ordinals-spaced` | whitespace-separated suite of variable-length groups of ordinal digits when encoded + +```python +>>> codext.encode("this is a test", "ordinal") +'116104105115032105115032097032116101115116' +>>> codext.decode("116104105115032105115032097032116101115116", "ordinals") +'this is a test' +``` + +```python +>>> codext.encode("this is a test", "ordinal-spaced") +'116 104 105 115 32 105 115 32 97 32 116 101 115 116' +>>> codext.decode("116 104 105 115 32 105 115 32 97 32 116 101 115 116", "ordinals-spaced") +'this is a test' +``` + diff --git a/docs/pages/enc/compressions.md b/docs/pages/enc/compressions.md index a5437cf..5c4fd2e 100644 --- a/docs/pages/enc/compressions.md +++ b/docs/pages/enc/compressions.md @@ -1,5 +1,3 @@ -## Compressions - `codext` provides a few common compression codecs. ----- diff --git a/docs/pages/enc/crypto.md b/docs/pages/enc/crypto.md index e59ab0f..b189c0e 100644 --- a/docs/pages/enc/crypto.md +++ b/docs/pages/enc/crypto.md @@ -1,206 +1,204 @@ -## Cryptography - -`codext` also implements several simple cryptographic ciphers. But how does it relate to encoding while a key is required ? `codext` focuses on ciphers that have a weak key. With dynamically named encodings, it is then possible to define a bunch of encodings, one for each value of the key. For instance, Barbie Typewriter has a key with only 4 possible values. The `barbie` codec can then be `barbie-1`, ..., `barbie-4`. - -!!! note "Available masks" - - Some cipher codecs use character masks to generate their alphabets. Groups of characters are indicated using a headin "`?`". 
- - `a`: printable characters - `b`: all 8-bits chars - `d`: digits - `h`: lowercase hexadecimal - `H`: uppercase hexadecimal - `l`: lowercase letters - `p`: punctuation characters - `s`: whitespace - `u`: uppercase letters - - When combining masks, only one occurrence of each character is taken in the final alphabet. - - So, for instance, the following masks yield the following alphabets: - - - `?l?u?d?s`: "`abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 `" - - `?s.,?!?u?d`: "` .,?!ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789`" - ------ - -### Affine Cipher - -This codec implements the Affine monoalphabetic substitution cipher. It is parametrizable with a mask for generating the alphabet and the parameters `a` and `b`. By default, it uses mask "`lus`" and parameters `a=1` and `b=2` but it can be set as in the examples hereafter. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`affine` | text <-> affine ciphertext | `affine`, `affine_cipher-?l?u?d?s-5,8`, `affine-?s.,?!?u?d-23,6`, ... | Mask-generated alphabet ; uses default mask "`?l?u?s`" with `a=1` and `b=2` - -```python ->>> codext.encode("this is a test", "affine") -'vjkubkubcbvguv' ->>> codext.decode("vjkubkubcbvguv", "affine") -'this is a test' ->>> codext.encode("this is a test", "affine-?l?u?d?s-5,8") -'ORWJdWJdidOCJO' ->>> codext.decode("ORWJdWJdidOCJO", "affine-?l?u?d?s-5,8") -'this is a test' ->>> codext.encode("THIS IS A TEST", "affine-?s.,?!?u?d-5,8") -'AW1 D1 D2DAH A' ->>> codext.decode("AW1 D1 D2DAH A", "affine-?s.,?!?u?d-5,8") -'THIS IS A TEST' -``` - -!!! warning "Parameters `a` and `b`" - - Not all values are suitable for `a` and `b`. If a generated encoding map has mapping collisions, an exception is raised telling that `a` and `b` are bad. - ------ - -### Atbash Cipher - -It implements the monoalphabetic substitution cipher used for the Hebrew alphabet. 
By default, it considers the lowercase and uppercase letters, inverted per group, as the alphabet. It can also use a mask to extend it. Note that it does not generate any error for characters that are not part of the alphabet. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`atbash` | text <-> Atbash ciphertext | `atbash`, `atbash_cipher-?l?d?s`, ... | Mask-generated alphabet ; uses default mask "`?u?l`" - -```python ->>> codext.encode("this is a test", "atbash") -'gsrh rh z gvhg' ->>> codext.encode("this is a test", "atbash-[?l?u?p?s]") -'.^]/a]/a a.{/.' ->>> codext.decode(".^]/a]/a a.{/.", "atbash_cipher_[?l?u?p?s]") -'this is a test' -``` - ------ - -### Baconian Cipher - -It support only letters. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`bacon` | text <-> Bacon ciphertext | `bacon-cipher`, `baconian_cipher`, `bacon-01`, `bacon-10` | Dynamic tokens mapping ; we can define a mapping of encoding's tokens (original tokens: `ab`) - -```python ->>> codext.encode("this is a test", "bacon") -'baaba aabbb abaaa baaab abaaa baaab aaaaa baaba aabaa baaab baaba' ->>> codext.encode("this is a test", "bacon_01") -'10010 00111 01000 10001 01000 10001 00000 10010 00100 10001 10010' ->>> codext.decode("-..-. ..--- .-... -...- .-... -...- ..... -..-. ..-.. -...- -..-.", "bacon_.-") -'THIS IS A TEST' -``` - ------ - -### Barbie Typewriter - -It implements the cipher for its 4 different keys. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`barbie` | text <-> Barbie ciphertext | `barbie-1`, `barbie-2`, `barbie-3`, `barbie-4` - -```python ->>> codext.encode("this is a test", "barbie-1") -'hstf tf i hafh' ->>> codext.encode("this is a test", "barbie_3") -'fpsu su h ftuf' ->>> codext.decode("fpsu su h ftuf", "barbie-3") -'this is a test' -``` - ------ - -### Citrix CTX1 - -This implements the Citrix CTX1 password encoding algorithm. 
- -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`citrix` | text <-> Citrix CTX1 ciphertext | `citrix`, `citrix-1`, `citrix_ctx1` | - -```python ->>> codext.encode("this is a test", "citrix-ctx1") -'NBBMNAAGIDEPJJBMNIFNIMEMJKEL' ->>> codext.decode("NBBMNAAGIDEPJJBMNIFNIMEMJKEL", "citrix-ctx1") -'this is a test' -``` - ------ - -### Rail Fence Cipher - -This implements the Rail Fence encoding algorithm, using 3 rails and offset 0 as the default parameters. The encoding fence is built from the top ; the `up` flag can be used to build the fence from the bottom. Note that trying parameters that do not fit the input length will trigger a `ValueError` mentioning the bad value. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`rail` | text <-> rail fence ciphertext, X rails and Y offset | `rail-X-Y`, `rail_X_Y`, `rail-X-Y-up`, `zigzag`, ... | - -```python ->>> codext.encode("this is a test", "zigzag") -'t ashsi etist' ->>> codext.encode("this is a test", "rail-5-3") -'it sss etiath ' ->>> codext.decode("it sss etiath ", "zigzag_5-3") -'this is a test' -``` - ------ -### ROT N - -This is a dynamic encoding, that is, it can be called with an integer to define the ROT offset. Encoding will apply a positive offset, decoding will apply a negative one. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`rot` | text <-> rot(1) ciphertext | `rot1`, `rot-1`, `rot_25`, `caesar13` | Dynamic ROT parameter ; belongs to [1, 26[ -`rot47` | text <-> rot47 ciphertext | | - -```python ->>> codext.encode("this is a test", "rot-15") -'iwxh xh p ithi' ->>> codext.encode("iwxh xh p ithi", "rot20") -'cqrb rb j cnbc' ->>> codext.decode("cqrb rb j cnbc", "rot_9") -'this is a test' -``` - ------ - -### Shift - -This is a dynamic encoding, that is, it can be called with an integer to define the shift offset. Encoding will apply a positive offset, decoding will apply a negative one. 
-
-**Codec** | **Conversions** | **Aliases** | **Comment**
-:---: | :---: | --- | ---
-`shift` | text <-> shift(1) ciphertext | `shift1`, `shift-158`, `shift_255` | Dynamic shift parameter ; belongs to [1, 256[
-
-```python
->>> codext.encode("this is a test", "shift-3")
-'wklv#lv#d#whvw'
->>> codext.decode("wklv#lv#d#whvw", "shift10")
-'mabl\x19bl\x19Z\x19m^lm'
->>> codext.encode("mabl\x19bl\x19Z\x19m^lm", "ordshift_7")
-'this is a test'
-```
-
------
-
-### XOR with 1 byte
-
-This is a dynamic encoding, that is, it can be called with an integer to define the ordinal of the byte to XOR with the input text.
-
-**Codec** | **Conversions** | **Aliases** | **Comment**
-:---: | :---: | --- | ---
-`xor` | text <-> XOR(1) ciphertext | `XOR1`, `xor22`, `xor-158`, `xor_255` | Dynamic XOR parameter ; belongs to [1, 256[
-
-```python
->>> codext.encode("this is a test", "xor-10")
-'~bcy*cy*k*~oy~'
->>> codext.encode("this is a test", "xor-30")
-'jvwm>wm>\x7f>j{mj'
->>> codext.decode("this is a test", "xor-30")
-'jvwm>wm>\x7f>j{mj'
->>> codext.encode("~bcy*cy*k*~oy~", "xor-10")
-'this is a test'
-```
-
+`codext` also implements several simple cryptographic ciphers. But how does it relate to encoding while a key is required ? `codext` focuses on ciphers that have a weak key. With dynamically named encodings, it is then possible to define a bunch of encodings, one for each value of the key. For instance, Barbie Typewriter has a key with only 4 possible values. The `barbie` codec can then be `barbie-1`, ..., `barbie-4`.
+
+!!! note "Available masks"
+
+    Some cipher codecs use character masks to generate their alphabets. Groups of characters are indicated using a heading "`?`". 
+ + `a`: printable characters + `b`: all 8-bits chars + `d`: digits + `h`: lowercase hexadecimal + `H`: uppercase hexadecimal + `l`: lowercase letters + `p`: punctuation characters + `s`: whitespace + `u`: uppercase letters + + When combining masks, only one occurrence of each character is taken in the final alphabet. + + So, for instance, the following masks yield the following alphabets: + + - `?l?u?d?s`: "`abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 `" + - `?s.,?!?u?d`: "` .,?!ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789`" + +----- + +### Affine Cipher + +This codec implements the Affine monoalphabetic substitution cipher. It is parametrizable with a mask for generating the alphabet and the parameters `a` and `b`. By default, it uses mask "`lus`" and parameters `a=1` and `b=2` but it can be set as in the examples hereafter. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`affine` | text <-> affine ciphertext | `affine`, `affine_cipher-?l?u?d?s-5,8`, `affine-?s.,?!?u?d-23,6`, ... | Mask-generated alphabet ; uses default mask "`?l?u?s`" with `a=1` and `b=2` + +```python +>>> codext.encode("this is a test", "affine") +'vjkubkubcbvguv' +>>> codext.decode("vjkubkubcbvguv", "affine") +'this is a test' +>>> codext.encode("this is a test", "affine-?l?u?d?s-5,8") +'ORWJdWJdidOCJO' +>>> codext.decode("ORWJdWJdidOCJO", "affine-?l?u?d?s-5,8") +'this is a test' +>>> codext.encode("THIS IS A TEST", "affine-?s.,?!?u?d-5,8") +'AW1 D1 D2DAH A' +>>> codext.decode("AW1 D1 D2DAH A", "affine-?s.,?!?u?d-5,8") +'THIS IS A TEST' +``` + +!!! warning "Parameters `a` and `b`" + + Not all values are suitable for `a` and `b`. If a generated encoding map has mapping collisions, an exception is raised telling that `a` and `b` are bad. + +----- + +### Atbash Cipher + +It implements the monoalphabetic substitution cipher used for the Hebrew alphabet. 
By default, it considers the lowercase and uppercase letters, inverted per group, as the alphabet. It can also use a mask to extend it. Note that it does not generate any error for characters that are not part of the alphabet.
+
+**Codec** | **Conversions** | **Aliases** | **Comment**
+:---: | :---: | --- | ---
+`atbash` | text <-> Atbash ciphertext | `atbash`, `atbash_cipher-?l?d?s`, ... | Mask-generated alphabet ; uses default mask "`?u?l`"
+
+```python
+>>> codext.encode("this is a test", "atbash")
+'gsrh rh z gvhg'
+>>> codext.encode("this is a test", "atbash-[?l?u?p?s]")
+'.^]/a]/a a.{/.'
+>>> codext.decode(".^]/a]/a a.{/.", "atbash_cipher_[?l?u?p?s]")
+'this is a test'
+```
+
+-----
+
+### Baconian Cipher
+
+It supports only letters.
+
+**Codec** | **Conversions** | **Aliases** | **Comment**
+:---: | :---: | --- | ---
+`bacon` | text <-> Bacon ciphertext | `bacon-cipher`, `baconian_cipher`, `bacon-01`, `bacon-10` | Dynamic tokens mapping ; we can define a mapping of encoding's tokens (original tokens: `ab`)
+
+```python
+>>> codext.encode("this is a test", "bacon")
+'baaba aabbb abaaa baaab abaaa baaab aaaaa baaba aabaa baaab baaba'
+>>> codext.encode("this is a test", "bacon_01")
+'10010 00111 01000 10001 01000 10001 00000 10010 00100 10001 10010'
+>>> codext.decode("-..-. ..--- .-... -...- .-... -...- ..... -..-. ..-.. -...- -..-.", "bacon_.-")
+'THIS IS A TEST'
+```
+
+-----
+
+### Barbie Typewriter
+
+It implements the cipher for its 4 different keys.
+
+**Codec** | **Conversions** | **Aliases** | **Comment**
+:---: | :---: | --- | ---
+`barbie` | text <-> Barbie ciphertext | `barbie-1`, `barbie-2`, `barbie-3`, `barbie-4`
+
+```python
+>>> codext.encode("this is a test", "barbie-1")
+'hstf tf i hafh'
+>>> codext.encode("this is a test", "barbie_3")
+'fpsu su h ftuf'
+>>> codext.decode("fpsu su h ftuf", "barbie-3")
+'this is a test'
+```
+
+-----
+
+### Citrix CTX1
+
+This implements the Citrix CTX1 password encoding algorithm. 
+ +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`citrix` | text <-> Citrix CTX1 ciphertext | `citrix`, `citrix-1`, `citrix_ctx1` | + +```python +>>> codext.encode("this is a test", "citrix-ctx1") +'NBBMNAAGIDEPJJBMNIFNIMEMJKEL' +>>> codext.decode("NBBMNAAGIDEPJJBMNIFNIMEMJKEL", "citrix-ctx1") +'this is a test' +``` + +----- + +### Rail Fence Cipher + +This implements the Rail Fence encoding algorithm, using 3 rails and offset 0 as the default parameters. The encoding fence is built from the top ; the `up` flag can be used to build the fence from the bottom. Note that trying parameters that do not fit the input length will trigger a `ValueError` mentioning the bad value. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`rail` | text <-> rail fence ciphertext, X rails and Y offset | `rail-X-Y`, `rail_X_Y`, `rail-X-Y-up`, `zigzag`, ... | + +```python +>>> codext.encode("this is a test", "zigzag") +'t ashsi etist' +>>> codext.encode("this is a test", "rail-5-3") +'it sss etiath ' +>>> codext.decode("it sss etiath ", "zigzag_5-3") +'this is a test' +``` + +----- +### ROT N + +This is a dynamic encoding, that is, it can be called with an integer to define the ROT offset. Encoding will apply a positive offset, decoding will apply a negative one. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`rot` | text <-> rot(1) ciphertext | `rot1`, `rot-1`, `rot_25`, `caesar13` | Dynamic ROT parameter ; belongs to [1, 26[ +`rot47` | text <-> rot47 ciphertext | | + +```python +>>> codext.encode("this is a test", "rot-15") +'iwxh xh p ithi' +>>> codext.encode("iwxh xh p ithi", "rot20") +'cqrb rb j cnbc' +>>> codext.decode("cqrb rb j cnbc", "rot_9") +'this is a test' +``` + +----- + +### Shift + +This is a dynamic encoding, that is, it can be called with an integer to define the shift offset. Encoding will apply a positive offset, decoding will apply a negative one. 
+ +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`shift` | text <-> shift(1) ciphertext | `shift1`, `shift-158`, `shift_255` | Dynamic shift parameter ; belongs to [1, 256[ + +```python +>>> codext.encode("this is a test", "shift-3") +'wklv#lv#d#whvw' +>>> codext.decode("wklv#lv#d#whvw", "shift10") +'mabl\x19bl\x19Z\x19m^lm' +>>> codext.encode("mabl\x19bl\x19Z\x19m^lm", "ordshift_7") +'this is a test' +``` + +----- + +### XOR with 1 byte + +This is a dynamic encoding, that is, it can be called with an integer to define the ordinal of the byte to XOR with the input text. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`xor` | text <-> XOR(1) ciphertext | `XOR1`, `xor22`, `xor-158`, `xor_255` | Dynamic XOR parameter ; belongs to [1, 256[ + +```python +>>> codext.encode("this is a test", "xor-10") +'~bcy*cy*k*~oy~' +>>> codext.encode("this is a test", "xor-30") +'jvwm>wm>\x7f>j{mj' +>>> codext.decode("this is a test", "xor-30") +'jvwm>wm>\x7f>j{mj' +>>> codext.encode("~bcy*cy*k*~oy~", "xor-10") +'this is a test' +``` + diff --git a/docs/pages/enc/hashing.md b/docs/pages/enc/hashing.md index d1b0298..0f6f151 100644 --- a/docs/pages/enc/hashing.md +++ b/docs/pages/enc/hashing.md @@ -1,5 +1,3 @@ -## Hashing - `codext` provides hash functions through the `.encode(...)` API for convenience (e.g. while chaining codecs with [the CLI tool](../cli.html)). ----- diff --git a/docs/pages/enc/languages.md b/docs/pages/enc/languages.md index 3735d15..9aa805c 100644 --- a/docs/pages/enc/languages.md +++ b/docs/pages/enc/languages.md @@ -1,199 +1,197 @@ -## Languages - -`codext` also adds some common languages for encoding. - ------ - -### Braille - -It supports letters, digits and some special characters. 
- -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`braille` | text <-> braille symbols | | Python 3 only - -```python ->>> codext.encode("this is a test", "braille") -'⠞⠓⠊⠎⠀⠊⠎⠀⠁⠀⠞⠑⠎⠞' ->>> codext.encode("THIS IS A TEST", "braille") -'⠞⠓⠊⠎⠀⠊⠎⠀⠁⠀⠞⠑⠎⠞' ->>> codext.decode("⠞⠓⠊⠎⠀⠊⠎⠀⠁⠀⠞⠑⠎⠞", "braille") -'this is a test' -``` - ------ - -### Galactic - -This implements the [Minecraft's enchanting table](https://www.thegamer.com/minecraft-enchantment-table-language-guide/) using resembling Unicode characters. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`galactic` | text <-> Minecraft enchantment symbols | `galactic-alphabet`, `minecraft_enchantment`, `minecraft-enchanting-language` | Python 3 only - -```python ->>> codext.encode("this is a test", "galactic") -'ℸ₸╎߆ ╎߆ ᒋ ℸᒷ߆ℸ' ->>> codext.decode("ℸ₸╎߆ ╎߆ ᒋ ℸᒷ߆ℸ", "galactic") -'this is a test' -``` - ------ - -### Ipsum - -This implements a codec that uses lorem ipsum words. It selects random words per letter and keeps the following punctuations: "`.,:;+=-*/\\`". - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`ipsum` | text <-> latin words | `loremipsum`, `lorem-ipsum` | words from the classical lorem ipsum - -```python ->>> codext.encode("This is a test.", "ipsum") -'Torquent hystericus id sit interdum sit aliquam tempor erat scelerisque taciti.' ->>> codext.decode("Torquent hystericus id sit interdum sit aliquam tempor erat scelerisque taciti.", "lorem-ipsum") -'This is a test.' -``` - ------ - -### Leetspeak - -This implements a very basic ruleset of elite speaking. 
- -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`leetspeak` | text <-> leetspeak encoded text | `leet`, `1337`, `leetspeak` | based on minimalistic elite speaking rules - -```python ->>> codext.encode("this is a test", "leetspeak") -'7h15 15 4 7357' ->>> codext.decode("7h15 15 4 7357", "leetspeak") -'ThIS IS A TEST' -``` - ------ - -### Morse - -It supports of course letters and digits, but also a few special characters: `.,;:?!/\\@&=-_'" $()`. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`morse` | text <-> morse encoded text | none | uses whitespace as a separator, dynamic tokens mapping ; we can define a mapping of encoding's tokens (original tokens: `/-.`) - -```python ->>> codext.encode("this is a test", "morse") -'- .... .. ... / .. ... / .- / - . ... -' ->>> codext.encode("this is a test", "morse/-.") -'- .... .. ... / .. ... / .- / - . ... -' ->>> codext.encode("this is a test", "morse_ABC") -'B CCCC CC CCC A CC CCC A CB A B C CCC B' ->>> codext.decode("- .... .. ... / .. ... / .- / - . ... -", "morse") -'this is a test' ->>> with codext.open("morse.txt", 'w', encoding="morse") as f: - f.write("this is a test") -14 ->>> with codext.open("morse.txt", encoding="morse") as f: - f.read() -'this is a test' -``` - ------ - -### Navajo - -It implements the letters from the [Navajo Code Talkers' Dictionary](https://www.history.navy.mil/research/library/online-reading-room/title-list-alphabetically/n/navajo-code-talker-dictionary.html). It conserves digits and newlines. 
- -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`navajo` | text <-> Navajo | | - -```python ->>> import codext ->>> codext.encode("this is a test 123", "navajo") -'a-woh cha tkin klesh - a-chi klesh - be-la-sana - a-woh dzeh klesh a-woh - 1 2 3' ->>> codext.decode("a-woh cha tkin klesh - a-chi klesh - be-la-sana - a-woh dzeh klesh a-woh - 1 2 3", "navajo") -'this is a test 123' -``` - ------ - -### Radio Alphabet - -This is also known as the [NATO phonetic alphabet](https://en.wikipedia.org/wiki/NATO_phonetic_alphabet). - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`radio` | text <-> radio alphabet words | `military_alphabet`, `nato-phonetic-alphabet`, `radio-alphabet` | - -```python ->>> codext.encode("foobar", "nato_phonetic_alphabet") -'Foxtrot Oscar Oscar Bravo Alpha Romeo' ->>> codext.decode("Foxtrot Oscar Oscar Bravo Alpha Romeo", "radio-alphabet") -'FOOBAR' -``` - ------ - -### Southpark - -This encodes text according to Kenny's language in Southpark. 
- -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`southpark` | text <-> Kenny's language | `kenny` | Dynamic tokens mapping ; we can define a mapping of encoding's tokens (original tokens: `fFMmpP`) -`southpark-icase` | text <-> Kenny's language | `kenny_icase` | Dynamic tokens mapping ; we can define a mapping of encoding's tokens (original tokens: `FMP`) - -```python ->>> codext.encode("This is a Test", "southpark") -'FmpmfpmfffmmfffmfffmmfffmmmfffFmpmppfmmfmp' ->>> codext.decode('FmpmfpmfffmmfffmfffmmfffmmmfffFmpmppfmmfmp', "kenny") -'This is a Test' ->>> codext.encode("This is a test", "kenny_123456") -'245415411144111411144211444111145455144145' ->>> codext.decode("245415411144111411144211444111145455144145", "kenny-123456") -'This is a test' ->>> codext.encode("this is a test", "kenny_icase") -'FMPMFPMFFFMMFFFMFFFMMFFFMMMFFFFMPMPPFMMFMP' ->>> codext.decode("FMPMFPMFFFMMFFFMFFFMMFFFMMMFFFFMPMPPFMMFMP", "southpark-icase") -'this is a test' ->>> codext.encode("this is a test", "southpark-icase_123") -'123213211122111211122111222111123233122123' ->>> codext.decode('123213211122111211122111222111123233122123', "kenny_icase-123") -'this is a test' -``` - ------ - -### Tap - -This codec implements the [tap/knock code](https://en.wikipedia.org/wiki/Tap_code) commonly used by prisoners. It uses 25 letters, "*k*" is encoded to the same token than "*c*". - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`tap` | text <-> tap/knock encoded text | `knock`, `tap-code` | uses a large Unicode whitespace as a token separator ; Python 3 only - -```python ->>> codext.encode("this is a test", "tap") -'.... ....⠀.. ...⠀.. ....⠀.... ...⠀ ⠀.. ....⠀.... ...⠀ ⠀. .⠀ ⠀.... ....⠀. .....⠀.... ...⠀.... ....' ->>> codext.decode(".... ....⠀.. ...⠀.. ....⠀.... ...⠀ ⠀.. ....⠀.... ...⠀ ⠀. .⠀ ⠀.... ....⠀. .....⠀.... ...⠀.... ....", "knock") -'this is a test' -``` - ------ - -### Tom-Tom - -This codec is similar to morse. 
It converts text into slashes and backslashes. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`tomtom` | text <-> tom-tom encoded text | `tom-tom` | uses "`|`" as a separator - -```python ->>> codext.encode("this is a test", "tom-tom") -'\\\\/\\ /\\\\ /\\\\\\ \\/\\ | /\\\\\\ \\/\\ | / | \\\\/\\ /\\ \\/\\ \\\\/\\' ->>> codext.decode("\\\\/\\ /\\\\ /\\\\\\ \\/\\ | /\\\\\\ \\/\\ | / | \\\\/\\ /\\ \\/\\ \\\\/\\", "tomtom") -'THIS IS A TEST' -``` +`codext` also adds some common languages for encoding. + +----- + +### Braille + +It supports letters, digits and some special characters. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`braille` | text <-> braille symbols | | Python 3 only + +```python +>>> codext.encode("this is a test", "braille") +'⠞⠓⠊⠎⠀⠊⠎⠀⠁⠀⠞⠑⠎⠞' +>>> codext.encode("THIS IS A TEST", "braille") +'⠞⠓⠊⠎⠀⠊⠎⠀⠁⠀⠞⠑⠎⠞' +>>> codext.decode("⠞⠓⠊⠎⠀⠊⠎⠀⠁⠀⠞⠑⠎⠞", "braille") +'this is a test' +``` + +----- + +### Galactic + +This implements the [Minecraft's enchanting table](https://www.thegamer.com/minecraft-enchantment-table-language-guide/) using resembling Unicode characters. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`galactic` | text <-> Minecraft enchantment symbols | `galactic-alphabet`, `minecraft_enchantment`, `minecraft-enchanting-language` | Python 3 only + +```python +>>> codext.encode("this is a test", "galactic") +'ℸ₸╎߆ ╎߆ ᒋ ℸᒷ߆ℸ' +>>> codext.decode("ℸ₸╎߆ ╎߆ ᒋ ℸᒷ߆ℸ", "galactic") +'this is a test' +``` + +----- + +### Ipsum + +This implements a codec that uses lorem ipsum words. It selects random words per letter and keeps the following punctuations: "`.,:;+=-*/\\`". 
+ +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`ipsum` | text <-> latin words | `loremipsum`, `lorem-ipsum` | words from the classical lorem ipsum + +```python +>>> codext.encode("This is a test.", "ipsum") +'Torquent hystericus id sit interdum sit aliquam tempor erat scelerisque taciti.' +>>> codext.decode("Torquent hystericus id sit interdum sit aliquam tempor erat scelerisque taciti.", "lorem-ipsum") +'This is a test.' +``` + +----- + +### Leetspeak + +This implements a very basic ruleset of elite speaking. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`leetspeak` | text <-> leetspeak encoded text | `leet`, `1337`, `leetspeak` | based on minimalistic elite speaking rules + +```python +>>> codext.encode("this is a test", "leetspeak") +'7h15 15 4 7357' +>>> codext.decode("7h15 15 4 7357", "leetspeak") +'ThIS IS A TEST' +``` + +----- + +### Morse + +It supports of course letters and digits, but also a few special characters: `.,;:?!/\\@&=-_'" $()`. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`morse` | text <-> morse encoded text | none | uses whitespace as a separator, dynamic tokens mapping ; we can define a mapping of encoding's tokens (original tokens: `/-.`) + +```python +>>> codext.encode("this is a test", "morse") +'- .... .. ... / .. ... / .- / - . ... -' +>>> codext.encode("this is a test", "morse/-.") +'- .... .. ... / .. ... / .- / - . ... -' +>>> codext.encode("this is a test", "morse_ABC") +'B CCCC CC CCC A CC CCC A CB A B C CCC B' +>>> codext.decode("- .... .. ... / .. ... / .- / - . ... 
-", "morse") +'this is a test' +>>> with codext.open("morse.txt", 'w', encoding="morse") as f: + f.write("this is a test") +14 +>>> with codext.open("morse.txt", encoding="morse") as f: + f.read() +'this is a test' +``` + +----- + +### Navajo + +It implements the letters from the [Navajo Code Talkers' Dictionary](https://www.history.navy.mil/research/library/online-reading-room/title-list-alphabetically/n/navajo-code-talker-dictionary.html). It conserves digits and newlines. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`navajo` | text <-> Navajo | | + +```python +>>> import codext +>>> codext.encode("this is a test 123", "navajo") +'a-woh cha tkin klesh - a-chi klesh - be-la-sana - a-woh dzeh klesh a-woh - 1 2 3' +>>> codext.decode("a-woh cha tkin klesh - a-chi klesh - be-la-sana - a-woh dzeh klesh a-woh - 1 2 3", "navajo") +'this is a test 123' +``` + +----- + +### Radio Alphabet + +This is also known as the [NATO phonetic alphabet](https://en.wikipedia.org/wiki/NATO_phonetic_alphabet). + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`radio` | text <-> radio alphabet words | `military_alphabet`, `nato-phonetic-alphabet`, `radio-alphabet` | + +```python +>>> codext.encode("foobar", "nato_phonetic_alphabet") +'Foxtrot Oscar Oscar Bravo Alpha Romeo' +>>> codext.decode("Foxtrot Oscar Oscar Bravo Alpha Romeo", "radio-alphabet") +'FOOBAR' +``` + +----- + +### Southpark + +This encodes text according to Kenny's language in Southpark. 
+
+**Codec** | **Conversions** | **Aliases** | **Comment**
+:---: | :---: | --- | ---
+`southpark` | text <-> Kenny's language | `kenny` | Dynamic tokens mapping ; we can define a mapping of encoding's tokens (original tokens: `fFMmpP`)
+`southpark-icase` | text <-> Kenny's language | `kenny_icase` | Dynamic tokens mapping ; we can define a mapping of encoding's tokens (original tokens: `FMP`)
+
+```python
+>>> codext.encode("This is a Test", "southpark")
+'FmpmfpmfffmmfffmfffmmfffmmmfffFmpmppfmmfmp'
+>>> codext.decode('FmpmfpmfffmmfffmfffmmfffmmmfffFmpmppfmmfmp', "kenny")
+'This is a Test'
+>>> codext.encode("This is a test", "kenny_123456")
+'245415411144111411144211444111145455144145'
+>>> codext.decode("245415411144111411144211444111145455144145", "kenny-123456")
+'This is a test'
+>>> codext.encode("this is a test", "kenny_icase")
+'FMPMFPMFFFMMFFFMFFFMMFFFMMMFFFFMPMPPFMMFMP'
+>>> codext.decode("FMPMFPMFFFMMFFFMFFFMMFFFMMMFFFFMPMPPFMMFMP", "southpark-icase")
+'this is a test'
+>>> codext.encode("this is a test", "southpark-icase_123")
+'123213211122111211122111222111123233122123'
+>>> codext.decode('123213211122111211122111222111123233122123', "kenny_icase-123")
+'this is a test'
+```
+
+-----
+
+### Tap
+
+This codec implements the [tap/knock code](https://en.wikipedia.org/wiki/Tap_code) commonly used by prisoners. It uses 25 letters, "*k*" is encoded to the same token as "*c*".
+
+**Codec** | **Conversions** | **Aliases** | **Comment**
+:---: | :---: | --- | ---
+`tap` | text <-> tap/knock encoded text | `knock`, `tap-code` | uses a large Unicode whitespace as a token separator ; Python 3 only
+
+```python
+>>> codext.encode("this is a test", "tap")
+'.... ....⠀.. ...⠀.. ....⠀.... ...⠀ ⠀.. ....⠀.... ...⠀ ⠀. .⠀ ⠀.... ....⠀. .....⠀.... ...⠀.... ....'
+>>> codext.decode(".... ....⠀.. ...⠀.. ....⠀.... ...⠀ ⠀.. ....⠀.... ...⠀ ⠀. .⠀ ⠀.... ....⠀. .....⠀.... ...⠀.... ....", "knock")
+'this is a test'
+```
+
+-----
+
+### Tom-Tom
+
+This codec is similar to morse. 
It converts text into slashes and backslashes. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`tomtom` | text <-> tom-tom encoded text | `tom-tom` | uses "`|`" as a separator + +```python +>>> codext.encode("this is a test", "tom-tom") +'\\\\/\\ /\\\\ /\\\\\\ \\/\\ | /\\\\\\ \\/\\ | / | \\\\/\\ /\\ \\/\\ \\\\/\\' +>>> codext.decode("\\\\/\\ /\\\\ /\\\\\\ \\/\\ | /\\\\\\ \\/\\ | / | \\\\/\\ /\\ \\/\\ \\\\/\\", "tomtom") +'THIS IS A TEST' +``` diff --git a/docs/pages/enc/stegano.md b/docs/pages/enc/stegano.md index 57dfb18..1a3a5fa 100644 --- a/docs/pages/enc/stegano.md +++ b/docs/pages/enc/stegano.md @@ -1,123 +1,121 @@ -## Steganography - -`codext` defines a few steganography-related encodings. While encoding is not really steganography (that is, concealing data within data), the following codecs are worth creating this category as they relate to converting data into something that could mislead the unaware reader. - ------ - -### Hexagrams (I Ching) - -This uses Base64 and then encodes output characters to [I Ching Hexagrams](https://en.wikipedia.org/wiki/Hexagram_%28I_Ching%29) such that implemented [here](https://github.com/qntm/hexagram-encode). - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`hexagram` | text <-> hexagrams-encoded Base64 | `hexagrams`, `i-ching-hexagrams`, `iching` | Python3 only - -```python ->>> codext.encode("this is a test", "hexagram") -'䷰䷭䷚䷔䷞䷺䷗䷔䷞䷺䷗䷚䷏䷊䷂䷕䷞䷈䷇☯' ->>> codext.decode("䷰䷭䷚䷔䷞䷺䷗䷔䷞䷺䷗䷚䷏䷊䷂䷕䷞䷈䷇☯", "iching") -'this is a test' -``` - ------ - -### Klopf Code - -This is a Polybius code with the trivial alphabetical distribution ("A" -> (1,1), "B" -> (2,1), ...). This can be tested [here](https://gc.de/gc/klopfcode/). 
- -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`klopf` | text <-> klopf encoded text | `klopfcode` | - -```python ->>> codext.encode("this is a test", "klopf") -'44324234 4234 11 44513444' ->>> codext.decode("44324234 4234 11 44513444", "klopf") -'THIS IS A TEST' -``` - ------ - -### Resistor Color Codes - -This uses the [electronic color code](https://en.wikipedia.org/wiki/Electronic_color_code#Resistor_color-coding) to encode digits, displaying colors in the terminal with ANSI color codes. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`resistor` | text <-> resistor colors | `condensator`, `resistors-color`, `resistor_color_code` | visually, it only works in a terminal supporting ANSI color codes - -```python ->>> codext.encode("1234", "resistor") -'\x1b[48;5;130m \x1b[0;00m\x1b[48;5;1m \x1b[0;00m\x1b[48;5;214m \x1b[0;00m\x1b[48;5;11m \x1b[0;00m' ->>> codext.decode("\x1b[48;5;130m \x1b[0;00m\x1b[48;5;1m \x1b[0;00m\x1b[48;5;214m \x1b[0;00m\x1b[48;5;11m \x1b[0;00m", "condensators_color") -'1234' -``` - ------ - -### Rick Cipher - -This converts letters to words from Rick Astley's famous song "*Never gonna give you up*". - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`rick` | text <-> words from Risk's song | `rick-astley`, `rick_cipher`, `rick-astley-cipher` | case-insensitive while encoding - -```python ->>> codext.encode("Test String", "rick") -'TELL UP gonna TELL + gonna TELL NEVer You AROUND Gonna' ->>> codext.decode("TELL UP gonna TELL + gonna TELL NEVer You AROUND Gonna", "rick") -'TEST STRING' -``` - ------ - -### SMS (T9) - -This codec implements the SMS encoding, also caled T9, that is the conversion from characters to their corresponding phone keystrokes. 
- -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`sms` | text <-> phone keystrokes | `nokia`, `nokia_3310`, `t9` | uses "`-`" as a separator for encoding, "`-`" or "`_`" or whitespace for decoding - -```python ->>> codext.encode("this is a test", "sms") -'8-44-444-7777-0-444-7777-0-2-0-8-33-7777-8' ->>> codext.decode("8_44_444_7777_0_444_7777_0_2_0_8_33_7777_8", "nokia") -'this is a test' ->>> codext.decode("8_44_444_7777_0-444-7777_0-2_0_8_33-7777-8", "t9") -'this is a test' -``` - ------ - -### Whitespaces - -This simple encoding replaces zeros and ones of the binary version of the input text with spaces and tabs. It is supported either with its original mapping or with the inverted mapping. - -!!! warning "Encoding, not programming !" - - This should not be confused with the [whitespace esoteric language](https://en.wikipedia.org/wiki/Whitespace_(programming_language)). - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`whitespace` | text <-> whitespaces and tabs | `whitespaces?-inv(erted)?` | The default encoding uses tabs for zeros and spaces for ones -`whitespace_after_before` | text <-> whitespaces[letter]whitespaces | | This codec encodes characters as new characters with whitespaces before and after according to an equation described in the codec name (e.g. 
"`whitespace+2*after-3*before`") - -```python ->>> codext.encode("test", "whitespace") -'\t \t \t\t\t \t\t \t \t \t\t \t \t \t\t' ->>> codext.encode("test", "whitespaces") -'\t \t \t\t\t \t\t \t \t \t\t \t \t \t\t' ->>> codext.encode("test", "whitespaces-inv") -' \t\t\t \t \t\t \t \t \t\t\t \t\t \t\t\t \t ' ->>> codext.decode(" \t\t\t \t \t\t \t \t \t\t\t \t\t \t\t\t \t ", "whitespaces_inverted") -'test' -``` - -```python ->>> codext.encode("test", "whitespace+after-before") -' m \n l \n u \n m ' ->>> codext.decode(" m \n l \n u \n m ", "whitespace+after-before") -'test' -``` +`codext` defines a few steganography-related encodings. While encoding is not really steganography (that is, concealing data within data), the following codecs are worth creating this category as they relate to converting data into something that could mislead the unaware reader. + +----- + +### Hexagrams (I Ching) + +This uses Base64 and then encodes output characters to [I Ching Hexagrams](https://en.wikipedia.org/wiki/Hexagram_%28I_Ching%29) such that implemented [here](https://github.com/qntm/hexagram-encode). + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`hexagram` | text <-> hexagrams-encoded Base64 | `hexagrams`, `i-ching-hexagrams`, `iching` | Python3 only + +```python +>>> codext.encode("this is a test", "hexagram") +'䷰䷭䷚䷔䷞䷺䷗䷔䷞䷺䷗䷚䷏䷊䷂䷕䷞䷈䷇☯' +>>> codext.decode("䷰䷭䷚䷔䷞䷺䷗䷔䷞䷺䷗䷚䷏䷊䷂䷕䷞䷈䷇☯", "iching") +'this is a test' +``` + +----- + +### Klopf Code + +This is a Polybius code with the trivial alphabetical distribution ("A" -> (1,1), "B" -> (2,1), ...). This can be tested [here](https://gc.de/gc/klopfcode/). 
+ +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`klopf` | text <-> klopf encoded text | `klopfcode` | + +```python +>>> codext.encode("this is a test", "klopf") +'44324234 4234 11 44513444' +>>> codext.decode("44324234 4234 11 44513444", "klopf") +'THIS IS A TEST' +``` + +----- + +### Resistor Color Codes + +This uses the [electronic color code](https://en.wikipedia.org/wiki/Electronic_color_code#Resistor_color-coding) to encode digits, displaying colors in the terminal with ANSI color codes. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`resistor` | text <-> resistor colors | `condensator`, `resistors-color`, `resistor_color_code` | visually, it only works in a terminal supporting ANSI color codes + +```python +>>> codext.encode("1234", "resistor") +'\x1b[48;5;130m \x1b[0;00m\x1b[48;5;1m \x1b[0;00m\x1b[48;5;214m \x1b[0;00m\x1b[48;5;11m \x1b[0;00m' +>>> codext.decode("\x1b[48;5;130m \x1b[0;00m\x1b[48;5;1m \x1b[0;00m\x1b[48;5;214m \x1b[0;00m\x1b[48;5;11m \x1b[0;00m", "condensators_color") +'1234' +``` + +----- + +### Rick Cipher + +This converts letters to words from Rick Astley's famous song "*Never gonna give you up*". + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`rick` | text <-> words from Risk's song | `rick-astley`, `rick_cipher`, `rick-astley-cipher` | case-insensitive while encoding + +```python +>>> codext.encode("Test String", "rick") +'TELL UP gonna TELL + gonna TELL NEVer You AROUND Gonna' +>>> codext.decode("TELL UP gonna TELL + gonna TELL NEVer You AROUND Gonna", "rick") +'TEST STRING' +``` + +----- + +### SMS (T9) + +This codec implements the SMS encoding, also caled T9, that is the conversion from characters to their corresponding phone keystrokes. 
+ +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`sms` | text <-> phone keystrokes | `nokia`, `nokia_3310`, `t9` | uses "`-`" as a separator for encoding, "`-`" or "`_`" or whitespace for decoding + +```python +>>> codext.encode("this is a test", "sms") +'8-44-444-7777-0-444-7777-0-2-0-8-33-7777-8' +>>> codext.decode("8_44_444_7777_0_444_7777_0_2_0_8_33_7777_8", "nokia") +'this is a test' +>>> codext.decode("8_44_444_7777_0-444-7777_0-2_0_8_33-7777-8", "t9") +'this is a test' +``` + +----- + +### Whitespaces + +This simple encoding replaces zeros and ones of the binary version of the input text with spaces and tabs. It is supported either with its original mapping or with the inverted mapping. + +!!! warning "Encoding, not programming !" + + This should not be confused with the [whitespace esoteric language](https://en.wikipedia.org/wiki/Whitespace_(programming_language)). + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`whitespace` | text <-> whitespaces and tabs | `whitespaces?-inv(erted)?` | The default encoding uses tabs for zeros and spaces for ones +`whitespace_after_before` | text <-> whitespaces[letter]whitespaces | | This codec encodes characters as new characters with whitespaces before and after according to an equation described in the codec name (e.g. 
"`whitespace+2*after-3*before`") + +```python +>>> codext.encode("test", "whitespace") +'\t \t \t\t\t \t\t \t \t \t\t \t \t \t\t' +>>> codext.encode("test", "whitespaces") +'\t \t \t\t\t \t\t \t \t \t\t \t \t \t\t' +>>> codext.encode("test", "whitespaces-inv") +' \t\t\t \t \t\t \t \t \t\t\t \t\t \t\t\t \t ' +>>> codext.decode(" \t\t\t \t \t\t \t \t \t\t\t \t\t \t\t\t \t ", "whitespaces_inverted") +'test' +``` + +```python +>>> codext.encode("test", "whitespace+after-before") +' m \n l \n u \n m ' +>>> codext.decode(" m \n l \n u \n m ", "whitespace+after-before") +'test' +``` diff --git a/docs/pages/enc/web.md b/docs/pages/enc/web.md index 80c6a20..4477a1f 100644 --- a/docs/pages/enc/web.md +++ b/docs/pages/enc/web.md @@ -1,40 +1,38 @@ -## Web - -`codext` implements some common Web-related encodings. - ------ - -### HTML Entities - -This implements the full list of characters available at [this reference](https://dev.w3.org/html5/html-author/charref). - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`html` | text <-> HTML entities | `html-entity`, `html_entities` | implements entities according to [this reference](https://dev.w3.org/html5/html-author/charref) - -```python ->>> codext.encode("Тħĩş Їś ą Ţêšŧ", "html") -'Тħĩş Їś ą Ţêšŧ' ->>> codext.decode("Тħĩş Їś ą Ţêšŧ", "html-entities") -'Тħĩş Їś ą Ţêšŧ' -``` - ------ - -### URL - -This handles URL encoding, regardless of the case when decoding and with no error. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`url` | text <-> URL encoded text | `url`, `urlencode` | - -```python ->>> codecs.encode("?=this/is-a_test/../", "url") -'%3F%3Dthis%2Fis-a_test%2F%2E%2E%2F' ->>> codext.decode("%3F%3Dthis%2Fis-a_test%2F%2E%2E%2F", "urlencode") -'?=this/is-a_test/../' ->>> codext.decode("%3f%3dthis%2fis-a_test%2f%2e%2e%2f", "urlencode") -'?=this/is-a_test/../' -``` - +`codext` implements some common Web-related encodings. 
+ +----- + +### HTML Entities + +This implements the full list of characters available at [this reference](https://dev.w3.org/html5/html-author/charref). + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`html` | text <-> HTML entities | `html-entity`, `html_entities` | implements entities according to [this reference](https://dev.w3.org/html5/html-author/charref) + +```python +>>> codext.encode("Тħĩş Їś ą Ţêšŧ", "html") +'Тħĩş Їś ą Ţêšŧ' +>>> codext.decode("Тħĩş Їś ą Ţêšŧ", "html-entities") +'Тħĩş Їś ą Ţêšŧ' +``` + +----- + +### URL + +This handles URL encoding, regardless of the case when decoding and with no error. + +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`url` | text <-> URL encoded text | `url`, `urlencode` | + +```python +>>> codecs.encode("?=this/is-a_test/../", "url") +'%3F%3Dthis%2Fis-a_test%2F%2E%2E%2F' +>>> codext.decode("%3F%3Dthis%2Fis-a_test%2F%2E%2E%2F", "urlencode") +'?=this/is-a_test/../' +>>> codext.decode("%3f%3dthis%2fis-a_test%2f%2e%2e%2f", "urlencode") +'?=this/is-a_test/../' +``` + diff --git a/docs/pages/features.md b/docs/pages/features.md index 11316f0..02b375b 100644 --- a/docs/pages/features.md +++ b/docs/pages/features.md @@ -1,338 +1,336 @@ -## Features - -Basically, the `codecs` library provides a series of functions from the built-in `_codecs` library which maintains a registry of search functions (a simple list) that maps ancodings to the right de/encode functions by returning a `CodecInfo` object once first matched. - -`codext` hooks `codecs`'s functions to insert its own proxy registry between the function calls and the native registry so that new encodings can be added or replace existing ones while using `code[cs|xt].open`. Indeed, as the proxy registry is called first, the first possible match occurs in a custom codec, while if not existing, the native registry is used. - -!!! 
note "The `open` built-in function" - - Two behaviors are to be considered when using `codext`: - - 1. Encodings added from `codext` are only added to the proxy codecs registry of `codext` and are NOT available using `open(...)` (but well using `code[cs|xt].open(...)`. - 2. Encodings added from `codecs` are added to the proxy registry AND ALSO to the native registry and are therefore available using `open(...)`. - - This difference allows to keep encodings added from `codext` removable while these added from `codecs` are not. This is the consequence from the fact that there is no unregister function in the native `_codecs` library. - -!!! warning "Lossy conversion" - - Some encodings are lossy, meaning that it is not always possible to decode back to the exact start string. This should be considered especially when chaining codecs. - ------ - -### Add a custom encoding - -New codecs can be added easily using the new function `add`. - -```python ->>> import codext ->>> help(codext.add) -Help on function add in module codext.__common__: - -add(ename, encode=None, decode=None, pattern=None, text=True, add_to_codecs=False) - This adds a new codec to the codecs module setting its encode and/or decode - functions, eventually dynamically naming the encoding with a pattern and - with file handling (if text is True). - - :param ename: encoding name - :param encode: encoding function or None - :param decode: decoding function or None - :param pattern: pattern for dynamically naming the encoding - :param text: specify whether the codec is a text encoding - :param add_to_codecs: also add the search function to the native registry - NB: this will make the codec available in the - built-in open(...) 
but will make it impossible - to remove the codec later - -``` - -Here is a simple example of how to add a basic codec: - -```python -import codext - -def mycodec_encode(text, errors="strict"): - # do some encoding stuff - return encoded, len(text) - -def mycodec_decode(text, errors="strict"): - # do some decoding stuff - return decoded, len(text) - -codext.add("mycodec", mycodec_encode, mycodec_decode) -``` - -In this first example, we can see that: - -- The `decode`/`encode` functions have a signature holding a keyword-argument "`errors`" for error handling. This comes from the syntax for making a codec for the `codecs` native library. This argument can have multiple values, namely "`strict`" for raising an exception when an de/encoding error occurs, while "`replace`" allows to replace the character at the position of the error with a generic character and also "`ignore`" that simply ignores the error and continues without adding anything to the resulting string. -- These functions always return a pair with the resulting string and the length of consumed input text. - -Another example for a more complex and dynamic codec: - -```python -import codext - -def mydyncodec_encode(i): - def encode(text, error="strict"): - # do somthing depending on i - return result, len(text) - return encode - -codext.add("mydyncodec", mydyncodec_encode, pattern=r"mydyn-(\d+)$") -``` - -In this second example, we can see that: - -- Only the encoding function is defined. -- A pattern is defined to match the prefix "`mydyn-`" and then an integer which is captured and used with `mydyncodec_encode(i)`. - -!!! warning "Pattern capture group" - - A capture group means that the parameter will be used with a dynamic (decorated) encoding function. In order to avoid this, i.e. for matching multiple names leading to the same encoding while calling a static encoding function, we can simply define a non-capturing group, e.g. "`(?:my|special_)codec`". 
- ------ - -### Add a custom map encoding - -New codecs using encoding maps can be added easily using the new function `add_map`. - -```python ->>> import codext ->>> help(codext.add) -Help on function add_map in module codext.__common__: - -add_map(ename, encmap, repl_char='?', sep='', ignore_case=None, no_error=False, intype=None, outype=None, **kwargs) - This adds a new mapping codec (that is, declarable with a simple character mapping dictionary) to the codecs module - dynamically setting its encode and/or decode functions, eventually dynamically naming the encoding with a pattern - and with file handling (if text is True). - - :param ename: encoding name - :param encmap: characters encoding map ; can be a dictionary of encoding maps (for use with the first capture - group of the regex pattern) or a function building the encoding map - :param repl_char: replacement char (used when errors handling is set to "replace") - :param sep: string of possible character separators (hence, only single-char separators are considered) ; - - while encoding, the first separator is used - - while decoding, separators can be mixed in the input text - :param ignore_case: ignore text case while encoding and/or decoding - :param no_error: this encoding triggers no error (hence, always in "leave" errors handling) - :param intype: specify the input type for pre-transforming the input text - :param outype: specify the output type for post-transforming the output text - :param pattern: pattern for dynamically naming the encoding - :param text: specify whether the codec is a text encoding - :param add_to_codecs: also add the search function to the native registry - NB: this will make the codec available in the built-in open(...) but will make it impossible - to remove the codec later - -``` - -This relies on the [`add`](#add-a-custom-encoding) function and simplifies creating new encodings when they can be described as a mapping dictionary. 
- -Here is a simple example of how to add a map codec: - -```python -import codext - -ENCMAP = {'a': "A", 'b': "B", 'c': "C"} - -codext.add_map("mycodec", ENCMAP) -``` - -In this first example, we can see that: - -- The `decode`/`encode` functions do not have to be declared anymore. -- `ENCMAP` is the mapping between characters, it is also used to compute the decoding function. - -Another example for a more complex and dynamic codec: - -```python -import codext - -ENCMAP = [ - {'00': "A", '01': "B", '10': "C", '11': "D"}, - {'00': "D", '01': "C", '10': "B", '11': "A"}, -] - -codext.add("mydyncodec", ENCMAP, "#", ignore_case=True, intype="bin", pattern=r"mydyn-(\d+)$") -``` - -In this second example, we can see that: - -- `ENCMAP` is now a list of mappings. The capture group in the pattern is used to select the right encoding map. Consequently, using encoding "`mydyn-8`" will fail with a `LookupError` as the only possibility are "`mydyn-1`" and "`mydyn-2`". Note that the index begins at 1 in the encoding name. -- Instead of using the default character "`?`" for replacements, we use "`#`". -- The case is ignored ; decoding either "`abcd`" or "`ABCD`" will succeed. -- The binary mode is enabled, meaning that the input text is converted to a binary string for encoding, while it is converted from binary to text when decoding. - -!!! warning "Input/Output types" - - By default, when `intype` is defined, `outype` takes the same value. So, if the new encoding uses a pre-conversion to bits (`intype="bin"`) but maps bits to characters (therefore binary conversion to text is not needed), `outype` shall then be set to "`str`" (or if it maps bits to ordinals, use `outype="ord"`). - ------ - -### Add a macro - -**Macros** are chains of encodings. It is possible to define own macros with this feature. It works by giving the precedence to user's macros saved in `~/.codext-macros.json` then using embedded macros from the `codext` package. 
- -Here is an example of adding a macro (and verifying it was indeed added): - -```python ->>> codext.list_macros() -['example-macro'] ->>> codext.add_macro("test-macro", "gzip", "base64") ->>> codext.list_macros() -['example-macro', 'test-macro'] -``` - -!!! note "Removing a macro" - - As macros are resolved like codecs (with the precedence for codecs), they can be removed the same way as a codec. - - :::python - >>> codext.remove("test-macro") - - If this is a built-in macro, it will removed from the runtime list within the `codext` package. Next time this will be loaded, it will reset the builtin list of macros. Otherwise, if this is a custom macro, it will removed from the list of custom macros AND removed from `~/.codext-macros.json`. - ------ - -### List codecs - -Codecs can be listed with the `list` function, either the whole codecs or only some categories. - -```python ->>> codext.list() -['affine', 'ascii', 'ascii85', 'atbash', 'bacon', ..., 'base36', 'base58', 'base62', 'base64', 'base64_codec', ..., 'baudot-tape', 'bcd', 'bcd-extended0', 'bcd-extended1', 'big5', 'big5hkscs', 'braille', 'bz2_codec', 'capitalize', 'cp037', ...] -``` - -!!! note "Codecs categories" - - - `native`: the built-in codecs from the original `codecs` package - - `non-native`: this special category regroups all the categories mentioned hereafter - - `base`: baseX codecs (e.g. `base`, `base100`) - - `binary`: codecs working on strings but applying their algorithms on their binary forms (e.g. `baudot`, `manchester`) - - `common`: common codecs not included in the native ones or simly added for the purpose of standardization (e.g. `octal`, `ordinal`) - - `crypto`: codecs related to cryptography algorithms (e.g. `barbie`, `rot`, `xor`) - - `language`: language-related codecs (e.g. `morse`, `navajo`) - - `other`: uncategorized codecs (e.g. `letters`, `url`) - - `stegano`: steganography-related codecs (e.g. 
`sms`, `resistor`) - - Except the `native` and `non-native` categories, the other ones are simply the name of the subdirectories (with "`s`" right-stripped) of the `codext` package. - -```python ->>> codext.list("binary") -['baudot', 'baudot-spaced', 'baudot-tape', 'bcd', 'bcd-extended0', 'bcd-extended1', 'excess3', 'gray', 'manchester', 'manchester-inverted'] ->>> codext.list("language") -['braille', 'leet', 'morse', 'navajo', 'radio', 'southpark', 'southpark-icase', 'tom-tom'] ->>> codext.list("native") -['ascii', 'base64_codec', 'big5', 'big5hkscs', 'bz2_codec', 'cp037', 'cp273', 'cp424', 'cp437', 'cp500', 'cp775', 'cp850', 'cp852', 'cp855', 'cp857', 'cp858', 'cp860', 'cp861', 'cp862', 'cp863', ...] -``` - -!!! warning "Codecs listed, not encodings" - - Beware that this function only lists the codecs, not the encodings. This means that, for instance, it only lists `base` (codecs' name) instead of `base17`, `base61`, `base97`, ... (the valid encoding names related to the `base` codec). - ------ - -### Search for encodings - -Natively, `codecs` provides a `lookup` function that allows to get the `CodecInfo` object for the desired encoding. This performs a lookup in the registry based on an exact match. Sometimes, it can be useful to search for available encodings based on a regular expression. Therefore, a `search` function is added by `codext` to allow to get a list of encoding names matching the input regex. - -```python ->>> codext.search("baudot") -['baudot', 'baudot_spaced', 'baudot_tape'] ->>> codext.search("al") -['capitalize', 'octal', 'octal_spaced', 'ordinal', 'ordinal_spaced', 'radio'] ->>> codext.search("white") -['whitespace', 'whitespace_after_before'] -``` - -Also, `codext` provides an `examples` function to get some examples of valid encoding names. This is especially useful when it concerns dynamicly named encodings (e.g. `rot`, `shift` or `dna`). 
- -```python ->>> codext.examples("rot") -['rot-14', 'rot-24', 'rot-7', 'rot18', 'rot3', 'rot4', 'rot6', 'rot_1', 'rot_12', 'rot_2'] ->>> codext.examples("dna") -['dna-1', 'dna-2', 'dna-5', 'dna1', 'dna4', 'dna5', 'dna6', 'dna8', 'dna_3', 'dna_5'] ->>> codext.examples("barbie", 5) -['barbie-1', 'barbie1', 'barbie4', 'barbie_2', 'barbie_4'] -``` - ------ - -### Remove a custom encoding or macro - -New codecs can be removed easily using the new function `remove`, which will only remove every codec matching the given encoding name in the proxy codecs registry and NOT in the native one. - -```python ->>> codext.encode("test", "bin") -'01110100011001010111001101110100' ->>> codext.remove("bin") ->>> codext.encode("test", "bin") - -Traceback (most recent call last): - [...] -LookupError: unknown encoding: bin -``` - -Trying to remove a codec that is in the native registry won't raise a `LookupError`. - -```python ->>> codext.remove("utf-8") ->>> codext.encode("test", "utf-8") -b'test' -``` - -Removing a macro works exactly the same way as for a codec. - -```python ->>> codext.remove("test-macro") -``` - ------ - -### Remove or restore `codext` encodings and macros - -It can be useful while playing with encodings and/or macros e.g. from Idle to be able to remove or restore `codext`'s encodings and macros. This can be achieved using respectively the new `clear` and `reset` functions. - -```python ->>> codext.clear() ->>> codext.encode("test", "bin") - -Traceback (most recent call last): - [...] -LookupError: unknown encoding: bin -``` - -```python ->>> codext.reset() ->>> codext.encode("test", "bin") -'01110100011001010111001101110100' -``` - ------ - -### Multi-rounds encoding - -It is possible to use multiple times the same encoding through the following convention: `encoding[X]` - -A simple example for a 1-round and a 2-rounds morse-encoded string: - -```python ->>> codext.encode("This is a test", "morse") -'- .... .. ... / .. ... / .- / - . ... 
-' ->>> codext.encode("This is a test", "morse[2]") -'-....- / .-.-.- .-.-.- .-.-.- .-.-.- / .-.-.- .-.-.- / .-.-.- .-.-.- .-.-.- / -..-. / .-.-.- .-.-.- / .-.-.- .-.-.- .-.-.- / -..-. / .-.-.- -....- / -..-. / -....- / .-.-.- / .-.-.- .-.-.- .-.-.- / -....-' -``` - -Another example using 5-rounds base58: - -```python ->>> codext.encode("Sup3rS3cr3t", "base58[5]") -'3YrjaeeJE1qfUVkpUbMymEMLJenvRrtcZ4vaDQ3httdiqWV8wGYFpqw' -``` - ------ - -### Hooked `codecs` functions - -In order to select the right de/encoding function and avoid any conflict, the native `codecs` library registers search functions (using the `register(search_function)` function), called in order of registration while searching for a codec. - -While being imported, `codext` hooks the following base functions of `codecs` dealing with the codecs registry: `encode`, `decode`, `lookup` and `register`. This way, `codext` holds a private registry that is called before reaching out to the native one, causing the codecs defined in `codext` to override native codecs with a matching registry search function. - +Basically, the `codecs` library provides a series of functions from the built-in `_codecs` library which maintains a registry of search functions (a simple list) that maps ancodings to the right de/encode functions by returning a `CodecInfo` object once first matched. + +`codext` hooks `codecs`'s functions to insert its own proxy registry between the function calls and the native registry so that new encodings can be added or replace existing ones while using `code[cs|xt].open`. Indeed, as the proxy registry is called first, the first possible match occurs in a custom codec, while if not existing, the native registry is used. + +!!! note "The `open` built-in function" + + Two behaviors are to be considered when using `codext`: + + 1. Encodings added from `codext` are only added to the proxy codecs registry of `codext` and are NOT available using `open(...)` (but well using `code[cs|xt].open(...)`. + 2. 
Encodings added from `codecs` are added to the proxy registry AND ALSO to the native registry and are therefore available using `open(...)`. + + This difference allows to keep encodings added from `codext` removable while these added from `codecs` are not. This is the consequence from the fact that there is no unregister function in the native `_codecs` library. + +!!! warning "Lossy conversion" + + Some encodings are lossy, meaning that it is not always possible to decode back to the exact start string. This should be considered especially when chaining codecs. + +----- + +### Add a custom encoding + +New codecs can be added easily using the new function `add`. + +```python +>>> import codext +>>> help(codext.add) +Help on function add in module codext.__common__: + +add(ename, encode=None, decode=None, pattern=None, text=True, add_to_codecs=False) + This adds a new codec to the codecs module setting its encode and/or decode + functions, eventually dynamically naming the encoding with a pattern and + with file handling (if text is True). + + :param ename: encoding name + :param encode: encoding function or None + :param decode: decoding function or None + :param pattern: pattern for dynamically naming the encoding + :param text: specify whether the codec is a text encoding + :param add_to_codecs: also add the search function to the native registry + NB: this will make the codec available in the + built-in open(...) 
but will make it impossible + to remove the codec later + +``` + +Here is a simple example of how to add a basic codec: + +```python +import codext + +def mycodec_encode(text, errors="strict"): + # do some encoding stuff + return encoded, len(text) + +def mycodec_decode(text, errors="strict"): + # do some decoding stuff + return decoded, len(text) + +codext.add("mycodec", mycodec_encode, mycodec_decode) +``` + +In this first example, we can see that: + +- The `decode`/`encode` functions have a signature holding a keyword-argument "`errors`" for error handling. This comes from the syntax for making a codec for the `codecs` native library. This argument can have multiple values, namely "`strict`" for raising an exception when an de/encoding error occurs, while "`replace`" allows to replace the character at the position of the error with a generic character and also "`ignore`" that simply ignores the error and continues without adding anything to the resulting string. +- These functions always return a pair with the resulting string and the length of consumed input text. + +Another example for a more complex and dynamic codec: + +```python +import codext + +def mydyncodec_encode(i): + def encode(text, error="strict"): + # do somthing depending on i + return result, len(text) + return encode + +codext.add("mydyncodec", mydyncodec_encode, pattern=r"mydyn-(\d+)$") +``` + +In this second example, we can see that: + +- Only the encoding function is defined. +- A pattern is defined to match the prefix "`mydyn-`" and then an integer which is captured and used with `mydyncodec_encode(i)`. + +!!! warning "Pattern capture group" + + A capture group means that the parameter will be used with a dynamic (decorated) encoding function. In order to avoid this, i.e. for matching multiple names leading to the same encoding while calling a static encoding function, we can simply define a non-capturing group, e.g. "`(?:my|special_)codec`". 
+ +----- + +### Add a custom map encoding + +New codecs using encoding maps can be added easily using the new function `add_map`. + +```python +>>> import codext +>>> help(codext.add) +Help on function add_map in module codext.__common__: + +add_map(ename, encmap, repl_char='?', sep='', ignore_case=None, no_error=False, intype=None, outype=None, **kwargs) + This adds a new mapping codec (that is, declarable with a simple character mapping dictionary) to the codecs module + dynamically setting its encode and/or decode functions, eventually dynamically naming the encoding with a pattern + and with file handling (if text is True). + + :param ename: encoding name + :param encmap: characters encoding map ; can be a dictionary of encoding maps (for use with the first capture + group of the regex pattern) or a function building the encoding map + :param repl_char: replacement char (used when errors handling is set to "replace") + :param sep: string of possible character separators (hence, only single-char separators are considered) ; + - while encoding, the first separator is used + - while decoding, separators can be mixed in the input text + :param ignore_case: ignore text case while encoding and/or decoding + :param no_error: this encoding triggers no error (hence, always in "leave" errors handling) + :param intype: specify the input type for pre-transforming the input text + :param outype: specify the output type for post-transforming the output text + :param pattern: pattern for dynamically naming the encoding + :param text: specify whether the codec is a text encoding + :param add_to_codecs: also add the search function to the native registry + NB: this will make the codec available in the built-in open(...) but will make it impossible + to remove the codec later + +``` + +This relies on the [`add`](#add-a-custom-encoding) function and simplifies creating new encodings when they can be described as a mapping dictionary. 
+ +Here is a simple example of how to add a map codec: + +```python +import codext + +ENCMAP = {'a': "A", 'b': "B", 'c': "C"} + +codext.add_map("mycodec", ENCMAP) +``` + +In this first example, we can see that: + +- The `decode`/`encode` functions do not have to be declared anymore. +- `ENCMAP` is the mapping between characters, it is also used to compute the decoding function. + +Another example for a more complex and dynamic codec: + +```python +import codext + +ENCMAP = [ + {'00': "A", '01': "B", '10': "C", '11': "D"}, + {'00': "D", '01': "C", '10': "B", '11': "A"}, +] + +codext.add("mydyncodec", ENCMAP, "#", ignore_case=True, intype="bin", pattern=r"mydyn-(\d+)$") +``` + +In this second example, we can see that: + +- `ENCMAP` is now a list of mappings. The capture group in the pattern is used to select the right encoding map. Consequently, using encoding "`mydyn-8`" will fail with a `LookupError` as the only possibility are "`mydyn-1`" and "`mydyn-2`". Note that the index begins at 1 in the encoding name. +- Instead of using the default character "`?`" for replacements, we use "`#`". +- The case is ignored ; decoding either "`abcd`" or "`ABCD`" will succeed. +- The binary mode is enabled, meaning that the input text is converted to a binary string for encoding, while it is converted from binary to text when decoding. + +!!! warning "Input/Output types" + + By default, when `intype` is defined, `outype` takes the same value. So, if the new encoding uses a pre-conversion to bits (`intype="bin"`) but maps bits to characters (therefore binary conversion to text is not needed), `outype` shall then be set to "`str`" (or if it maps bits to ordinals, use `outype="ord"`). + +----- + +### Add a macro + +**Macros** are chains of encodings. It is possible to define own macros with this feature. It works by giving the precedence to user's macros saved in `~/.codext-macros.json` then using embedded macros from the `codext` package. 
+ +Here is an example of adding a macro (and verifying it was indeed added): + +```python +>>> codext.list_macros() +['example-macro'] +>>> codext.add_macro("test-macro", "gzip", "base64") +>>> codext.list_macros() +['example-macro', 'test-macro'] +``` + +!!! note "Removing a macro" + + As macros are resolved like codecs (with the precedence for codecs), they can be removed the same way as a codec. + + :::python + >>> codext.remove("test-macro") + + If this is a built-in macro, it will removed from the runtime list within the `codext` package. Next time this will be loaded, it will reset the builtin list of macros. Otherwise, if this is a custom macro, it will removed from the list of custom macros AND removed from `~/.codext-macros.json`. + +----- + +### List codecs + +Codecs can be listed with the `list` function, either the whole codecs or only some categories. + +```python +>>> codext.list() +['affine', 'ascii', 'ascii85', 'atbash', 'bacon', ..., 'base36', 'base58', 'base62', 'base64', 'base64_codec', ..., 'baudot-tape', 'bcd', 'bcd-extended0', 'bcd-extended1', 'big5', 'big5hkscs', 'braille', 'bz2_codec', 'capitalize', 'cp037', ...] +``` + +!!! note "Codecs categories" + + - `native`: the built-in codecs from the original `codecs` package + - `non-native`: this special category regroups all the categories mentioned hereafter + - `base`: baseX codecs (e.g. `base`, `base100`) + - `binary`: codecs working on strings but applying their algorithms on their binary forms (e.g. `baudot`, `manchester`) + - `common`: common codecs not included in the native ones or simly added for the purpose of standardization (e.g. `octal`, `ordinal`) + - `crypto`: codecs related to cryptography algorithms (e.g. `barbie`, `rot`, `xor`) + - `language`: language-related codecs (e.g. `morse`, `navajo`) + - `other`: uncategorized codecs (e.g. `letters`, `url`) + - `stegano`: steganography-related codecs (e.g. 
`sms`, `resistor`) + + Except the `native` and `non-native` categories, the other ones are simply the name of the subdirectories (with "`s`" right-stripped) of the `codext` package. + +```python +>>> codext.list("binary") +['baudot', 'baudot-spaced', 'baudot-tape', 'bcd', 'bcd-extended0', 'bcd-extended1', 'excess3', 'gray', 'manchester', 'manchester-inverted'] +>>> codext.list("language") +['braille', 'leet', 'morse', 'navajo', 'radio', 'southpark', 'southpark-icase', 'tom-tom'] +>>> codext.list("native") +['ascii', 'base64_codec', 'big5', 'big5hkscs', 'bz2_codec', 'cp037', 'cp273', 'cp424', 'cp437', 'cp500', 'cp775', 'cp850', 'cp852', 'cp855', 'cp857', 'cp858', 'cp860', 'cp861', 'cp862', 'cp863', ...] +``` + +!!! warning "Codecs listed, not encodings" + + Beware that this function only lists the codecs, not the encodings. This means that, for instance, it only lists `base` (codecs' name) instead of `base17`, `base61`, `base97`, ... (the valid encoding names related to the `base` codec). + +----- + +### Search for encodings + +Natively, `codecs` provides a `lookup` function that allows to get the `CodecInfo` object for the desired encoding. This performs a lookup in the registry based on an exact match. Sometimes, it can be useful to search for available encodings based on a regular expression. Therefore, a `search` function is added by `codext` to allow to get a list of encoding names matching the input regex. + +```python +>>> codext.search("baudot") +['baudot', 'baudot_spaced', 'baudot_tape'] +>>> codext.search("al") +['capitalize', 'octal', 'octal_spaced', 'ordinal', 'ordinal_spaced', 'radio'] +>>> codext.search("white") +['whitespace', 'whitespace_after_before'] +``` + +Also, `codext` provides an `examples` function to get some examples of valid encoding names. This is especially useful when it concerns dynamicly named encodings (e.g. `rot`, `shift` or `dna`). 
+ +```python +>>> codext.examples("rot") +['rot-14', 'rot-24', 'rot-7', 'rot18', 'rot3', 'rot4', 'rot6', 'rot_1', 'rot_12', 'rot_2'] +>>> codext.examples("dna") +['dna-1', 'dna-2', 'dna-5', 'dna1', 'dna4', 'dna5', 'dna6', 'dna8', 'dna_3', 'dna_5'] +>>> codext.examples("barbie", 5) +['barbie-1', 'barbie1', 'barbie4', 'barbie_2', 'barbie_4'] +``` + +----- + +### Remove a custom encoding or macro + +New codecs can be removed easily using the new function `remove`, which will only remove every codec matching the given encoding name in the proxy codecs registry and NOT in the native one. + +```python +>>> codext.encode("test", "bin") +'01110100011001010111001101110100' +>>> codext.remove("bin") +>>> codext.encode("test", "bin") + +Traceback (most recent call last): + [...] +LookupError: unknown encoding: bin +``` + +Trying to remove a codec that is in the native registry won't raise a `LookupError`. + +```python +>>> codext.remove("utf-8") +>>> codext.encode("test", "utf-8") +b'test' +``` + +Removing a macro works exactly the same way as for a codec. + +```python +>>> codext.remove("test-macro") +``` + +----- + +### Remove or restore `codext` encodings and macros + +It can be useful while playing with encodings and/or macros e.g. from Idle to be able to remove or restore `codext`'s encodings and macros. This can be achieved using respectively the new `clear` and `reset` functions. + +```python +>>> codext.clear() +>>> codext.encode("test", "bin") + +Traceback (most recent call last): + [...] +LookupError: unknown encoding: bin +``` + +```python +>>> codext.reset() +>>> codext.encode("test", "bin") +'01110100011001010111001101110100' +``` + +----- + +### Multi-rounds encoding + +It is possible to use multiple times the same encoding through the following convention: `encoding[X]` + +A simple example for a 1-round and a 2-rounds morse-encoded string: + +```python +>>> codext.encode("This is a test", "morse") +'- .... .. ... / .. ... / .- / - . ... 
-' +>>> codext.encode("This is a test", "morse[2]") +'-....- / .-.-.- .-.-.- .-.-.- .-.-.- / .-.-.- .-.-.- / .-.-.- .-.-.- .-.-.- / -..-. / .-.-.- .-.-.- / .-.-.- .-.-.- .-.-.- / -..-. / .-.-.- -....- / -..-. / -....- / .-.-.- / .-.-.- .-.-.- .-.-.- / -....-' +``` + +Another example using 5-rounds base58: + +```python +>>> codext.encode("Sup3rS3cr3t", "base58[5]") +'3YrjaeeJE1qfUVkpUbMymEMLJenvRrtcZ4vaDQ3httdiqWV8wGYFpqw' +``` + +----- + +### Hooked `codecs` functions + +In order to select the right de/encoding function and avoid any conflict, the native `codecs` library registers search functions (using the `register(search_function)` function), called in order of registration while searching for a codec. + +While being imported, `codext` hooks the following base functions of `codecs` dealing with the codecs registry: `encode`, `decode`, `lookup` and `register`. This way, `codext` holds a private registry that is called before reaching out to the native one, causing the codecs defined in `codext` to override native codecs with a matching registry search function. + diff --git a/docs/pages/guessing.md b/docs/pages/guessing.md index 9bac11c..5745918 100644 --- a/docs/pages/guessing.md +++ b/docs/pages/guessing.md @@ -1,172 +1,170 @@ -## Guess Mode - -For decoding multiple layers of codecs, `codext` features a guess mode relying on an Artificial Intelligence algorithm, the Breadth-First tree Search (BFS). For many cases, the default parameters are sufficient for guess-decoding things. But it may require parameters tuning. - ------ - -### Parameters - -BFS stops when a given condition, in the form of a function applied to the decoded string at the current depth, is met. It returns two results: the decoded string and a tuple with the related encoding names in order of application. 
- -The following parameters are tunable: - -- `stop_func`: can be a function or a regular expression to be matched (automatically converted to a function that uses the `re` module) ; by default, checks if all input characters are printable. -- `min_depth`: the minimum depth for the tree search (allows to avoid a bit of overhead while checking the current decoded output at a depth with the stop function when we are sure it should not be the right result) ; by default 0. -- `max_depth`: the maximum depth for the tree search ; by default 5. -- `codec_categories`: a string indicating a codec [category](#list-codecs) or a list of [category](#list-codecs) strings ; by default, `None`, meaning the whole [categories](#list-codecs) (very slow). -- `found`: a list or tuple of currently found encodings that can be used to save time if the first decoding steps are known ; by default, an empty tuple. - -A simple example for a 1-stage base64-encoded string: - -```python ->>> codext.guess("VGhpcyBpcyBhIHRlc3Q=") -{('base64',): 'This is a test'} -``` - -An example of a 2-stages base64- then base62-encoded string: - -```python ->>> codext.guess("CJG3Ix8bVcSRMLOqwDUg28aDsT7") -{('base62',): 'VGhpcyBpcyBhIHRlc3Q='} -``` - -In the second example, we can see that the given encoded string is not decoded as expected. This is the case because the (default) stop condition is too broad and stops if all the characters of the output are printable. If we have a prior knowledge on what we should expect, we can input a simple string or a regex: - -!!! note "Default stop function" - - :::python - >>> codext.stopfunc.default.__name__ - '...' - - The output depends on whether you have a language detection backend library installed ; see section [*Natural Language Detection*](#natural-language-detection). If no such library is installed, the default function is "`text`". 
- -```python ->>> codext.guess("CJG3Ix8bVcSRMLOqwDUg28aDsT7", "test") -{('base62', 'base64'): 'This is a test'} -``` - -In this example, the string "*test*" is converted to a function that uses this string as regular expression. Instead of a string, we can also pass a function. For this purpose, standard [stop functions](#available-stop-functions) are predefined. So, we can for instance use `stopfunc.lang_en` to stop when we find something that is English. Note that working this way gives lots of false positives if the text is very short like in the example case. That's why the `codec_categories` argument is used to only consider baseX codecs. This is also demonstrated in the next examples. - -```python ->>> codext.stopfunc._reload_lang("langdetect") ->>> codext.guess("CJG3Ix8bVcSRMLOqwDUg28aDsT7", codext.stopfunc.lang_en, codec_categories="base") -('This is a test', ('base62', 'base64')) -``` - -If we know the first encoding, we can set this in the `found` parameter to save time: - -```python ->>> codext.guess("CJG3Ix8bVcSRMLOqwDUg28aDsT7", "test", found=["base62"]) -('This is a test', ('base62', 'base64')) -``` - -If we are sure that only `base` (which is a valid [category](#list-codecs)) encodings are used, we can restrict the tree search using the `codec_categories` parameter to save time: - -```python ->>> codext.guess("CJG3Ix8bVcSRMLOqwDUg28aDsT7", "test", codec_categories="base") -('This is a test', ('base62', 'base64')) -``` - -Another example of 2-stages encoded string: - -```python ->>> codext.guess("LSAuLi4uIC4uIC4uLiAvIC4uIC4uLiAvIC4tIC8gLSAuIC4uLiAt", "test") -('this is a test', ('base64', 'morse')) ->>> codext.guess("LSAuLi4uIC4uIC4uLiAvIC4uIC4uLiAvIC4tIC8gLSAuIC4uLiAt", "test", codec_categories=["base", "language"]) -('this is a test', ('base64', 'morse')) -``` - -When multiple results are expected, `stop` and `show` arguments can be used respectively to avoid stopping while finding a result and to display the intermediate result. - -!!! 
warning "Computation time" - - Note that, in the very last examples, the first call takes much longer than the second one but requires no knowledge about the possible [categories](#list-codecs) of encodings. - ------ - -### Available Stop Functions - -A few stop functions are predefined in the `stopfunc` submodule. - -```python ->>> import codext ->>> dir(codext.stopfunc) -['LANG_BACKEND', 'LANG_BACKENDS', ..., '_reload_lang', 'default', 'flag', ..., 'printables', 'regex', 'text'] -``` - -Currently, the following stop functions are provided: - -- `flag`: searches for the pattern "`[Ff][Ll1][Aa4@][Gg9]`" (either UTF-8 or UTF-16) -- `lang_**`: checks if the given lang is detected (note that it first checks if all characters are text ; see `text` hereafter) -- `printables`: checks that every output character is in the set of printables -- `regex(pattern)`: takes one argument, the regular expression, for checking a string against the given pattern -- `text`: checks for printables and an entropy less than 4.6 (empirically determined) - -A stop function can be used as the second argument of the `guess` function or as a keyword-argument, as shown in the following examples: - -```python ->>> codext.guess("...", codext.stopfunc.text) -[...] ->>> codext.guess("...", [...], stop_func=codext.stopfunc.text) -[...] -``` - -When a string is given, it is automatically converted to a `regex` stop function. - -```python ->>> s = codext.encode("pattern testing", "leetspeak") ->>> s -'p4773rn 73571n9' ->>> stop_func = codext.stopfunc.regex("p[a4@][t7]{2}[e3]rn") ->>> stop_func(s) -True ->>> codext.guess(s, stop_func) -[...] -``` - -Additionally, a simple stop function is predefined for CTF players, matching various declinations of the word *flag*. Alternatively, a pattern can always be used when flags have a particular format. 
- -```python ->>> codext.stopfunc.flag("test string") -False ->>> codext.stopfunc.flag("test f1@9") -True ->>> codext.stopfunc.regex(r"^CTF\{.*?\}$")("CTF{098f6bcd4621d373cade4e832627b4f6}") -True -``` - -The particular type of stop function `lang_**` is explained in the [next section](#natural-language-detection). - ------ - -### Natural Language Detection - -As in many cases, we are trying to decode inputs to readable text, it is necessary to narrow the scope while searching for valid decoded outputs. As matching printables and even text (as defined here before as printables with an entropy of less than 4.6) is too broad for many cases, it may be very useful to apply natural language detection. In `codext`, this is done by relying on Natural Language Processing (NLP) backend libraries, loaded only if they were separately installed. - -Currently, the following backends are supported, in order of precedence (this order was empirically determined by testing): - -- [`langid`](https://github.com/saffsd/langid.py): *Standalone Language Identification (LangID) tool.* -- [`langdetect`](https://github.com/Mimino666/langdetect): *Port of Nakatani Shuyo's language-detection library (version from 03/03/2014) to Python.* -- [`pycld2`](https://github.com/aboSamoor/pycld2): *Python bindings for the Compact Langauge Detect 2 (CLD2).* -- [`cld3`](https://github.com/bsolomon1124/pycld3): *Python bindings to the Compact Language Detector v3 (CLD3).* -- [`textblob`](https://github.com/sloria/TextBlob): *Python (2 and 3) library for processing textual data.* - -The way NLP is used is to check that these libraries exist and to take the first one by default. This sets up the `stopfunc.default` for the guess mode. This behavior aims to keep language detection as optional and to avoid multiple specific requirements having the same purpose. 
- -While loaded, the default backend can be switched to another one by using the `_reload_lang` function: - -```python ->>> codext.stopfunc._reload_lang("pycld2") # this loads pycld2 and attaches lang_** functions to the stopfunc submodule ->>> codext.stopfunc._reload_lang() # this unloads any loaded backend -``` - -Each time a backend is loaded, it gets `lang_**` stop functions attached to the `stopfunc` submodule for each supported language. - ------ - -### Ranking Heuristic - -!!! warning "Work in progress" - - This part is still in progress and shall be improved with better features and/or using machine learning. - +For decoding multiple layers of codecs, `codext` features a guess mode relying on an Artificial Intelligence algorithm, the Breadth-First tree Search (BFS). For many cases, the default parameters are sufficient for guess-decoding things. But it may require parameters tuning. + +----- + +### Parameters + +BFS stops when a given condition, in the form of a function applied to the decoded string at the current depth, is met. It returns two results: the decoded string and a tuple with the related encoding names in order of application. + +The following parameters are tunable: + +- `stop_func`: can be a function or a regular expression to be matched (automatically converted to a function that uses the `re` module) ; by default, checks if all input characters are printable. +- `min_depth`: the minimum depth for the tree search (allows to avoid a bit of overhead while checking the current decoded output at a depth with the stop function when we are sure it should not be the right result) ; by default 0. +- `max_depth`: the maximum depth for the tree search ; by default 5. +- `codec_categories`: a string indicating a codec [category](#list-codecs) or a list of [category](#list-codecs) strings ; by default, `None`, meaning the whole [categories](#list-codecs) (very slow). 
+- `found`: a list or tuple of currently found encodings that can be used to save time if the first decoding steps are known ; by default, an empty tuple. + +A simple example for a 1-stage base64-encoded string: + +```python +>>> codext.guess("VGhpcyBpcyBhIHRlc3Q=") +{('base64',): 'This is a test'} +``` + +An example of a 2-stages base64- then base62-encoded string: + +```python +>>> codext.guess("CJG3Ix8bVcSRMLOqwDUg28aDsT7") +{('base62',): 'VGhpcyBpcyBhIHRlc3Q='} +``` + +In the second example, we can see that the given encoded string is not decoded as expected. This is the case because the (default) stop condition is too broad and stops if all the characters of the output are printable. If we have a prior knowledge on what we should expect, we can input a simple string or a regex: + +!!! note "Default stop function" + + :::python + >>> codext.stopfunc.default.__name__ + '...' + + The output depends on whether you have a language detection backend library installed ; see section [*Natural Language Detection*](#natural-language-detection). If no such library is installed, the default function is "`text`". + +```python +>>> codext.guess("CJG3Ix8bVcSRMLOqwDUg28aDsT7", "test") +{('base62', 'base64'): 'This is a test'} +``` + +In this example, the string "*test*" is converted to a function that uses this string as regular expression. Instead of a string, we can also pass a function. For this purpose, standard [stop functions](#available-stop-functions) are predefined. So, we can for instance use `stopfunc.lang_en` to stop when we find something that is English. Note that working this way gives lots of false positives if the text is very short like in the example case. That's why the `codec_categories` argument is used to only consider baseX codecs. This is also demonstrated in the next examples. 
+ +```python +>>> codext.stopfunc._reload_lang("langdetect") +>>> codext.guess("CJG3Ix8bVcSRMLOqwDUg28aDsT7", codext.stopfunc.lang_en, codec_categories="base") +('This is a test', ('base62', 'base64')) +``` + +If we know the first encoding, we can set this in the `found` parameter to save time: + +```python +>>> codext.guess("CJG3Ix8bVcSRMLOqwDUg28aDsT7", "test", found=["base62"]) +('This is a test', ('base62', 'base64')) +``` + +If we are sure that only `base` (which is a valid [category](#list-codecs)) encodings are used, we can restrict the tree search using the `codec_categories` parameter to save time: + +```python +>>> codext.guess("CJG3Ix8bVcSRMLOqwDUg28aDsT7", "test", codec_categories="base") +('This is a test', ('base62', 'base64')) +``` + +Another example of 2-stages encoded string: + +```python +>>> codext.guess("LSAuLi4uIC4uIC4uLiAvIC4uIC4uLiAvIC4tIC8gLSAuIC4uLiAt", "test") +('this is a test', ('base64', 'morse')) +>>> codext.guess("LSAuLi4uIC4uIC4uLiAvIC4uIC4uLiAvIC4tIC8gLSAuIC4uLiAt", "test", codec_categories=["base", "language"]) +('this is a test', ('base64', 'morse')) +``` + +When multiple results are expected, `stop` and `show` arguments can be used respectively to avoid stopping while finding a result and to display the intermediate result. + +!!! warning "Computation time" + + Note that, in the very last examples, the first call takes much longer than the second one but requires no knowledge about the possible [categories](#list-codecs) of encodings. + +----- + +### Available Stop Functions + +A few stop functions are predefined in the `stopfunc` submodule. 
+ +```python +>>> import codext +>>> dir(codext.stopfunc) +['LANG_BACKEND', 'LANG_BACKENDS', ..., '_reload_lang', 'default', 'flag', ..., 'printables', 'regex', 'text'] +``` + +Currently, the following stop functions are provided: + +- `flag`: searches for the pattern "`[Ff][Ll1][Aa4@][Gg9]`" (either UTF-8 or UTF-16) +- `lang_**`: checks if the given lang is detected (note that it first checks if all characters are text ; see `text` hereafter) +- `printables`: checks that every output character is in the set of printables +- `regex(pattern)`: takes one argument, the regular expression, for checking a string against the given pattern +- `text`: checks for printables and an entropy less than 4.6 (empirically determined) + +A stop function can be used as the second argument of the `guess` function or as a keyword-argument, as shown in the following examples: + +```python +>>> codext.guess("...", codext.stopfunc.text) +[...] +>>> codext.guess("...", [...], stop_func=codext.stopfunc.text) +[...] +``` + +When a string is given, it is automatically converted to a `regex` stop function. + +```python +>>> s = codext.encode("pattern testing", "leetspeak") +>>> s +'p4773rn 73571n9' +>>> stop_func = codext.stopfunc.regex("p[a4@][t7]{2}[e3]rn") +>>> stop_func(s) +True +>>> codext.guess(s, stop_func) +[...] +``` + +Additionally, a simple stop function is predefined for CTF players, matching various declinations of the word *flag*. Alternatively, a pattern can always be used when flags have a particular format. + +```python +>>> codext.stopfunc.flag("test string") +False +>>> codext.stopfunc.flag("test f1@9") +True +>>> codext.stopfunc.regex(r"^CTF\{.*?\}$")("CTF{098f6bcd4621d373cade4e832627b4f6}") +True +``` + +The particular type of stop function `lang_**` is explained in the [next section](#natural-language-detection). 
+
+-----
+
+### Natural Language Detection
+
+As in many cases, we are trying to decode inputs to readable text, it is necessary to narrow the scope while searching for valid decoded outputs. As matching printables and even text (as defined here before as printables with an entropy of less than 4.6) is too broad for many cases, it may be very useful to apply natural language detection. In `codext`, this is done by relying on Natural Language Processing (NLP) backend libraries, loaded only if they were separately installed.
+
+Currently, the following backends are supported, in order of precedence (this order was empirically determined by testing):
+
+- [`langid`](https://github.com/saffsd/langid.py): *Standalone Language Identification (LangID) tool.*
+- [`langdetect`](https://github.com/Mimino666/langdetect): *Port of Nakatani Shuyo's language-detection library (version from 03/03/2014) to Python.*
+- [`pycld2`](https://github.com/aboSamoor/pycld2): *Python bindings for the Compact Language Detect 2 (CLD2).*
+- [`cld3`](https://github.com/bsolomon1124/pycld3): *Python bindings to the Compact Language Detector v3 (CLD3).*
+- [`textblob`](https://github.com/sloria/TextBlob): *Python (2 and 3) library for processing textual data.*
+
+The way NLP is used is to check that these libraries exist and to take the first one by default. This sets up the `stopfunc.default` for the guess mode. This behavior aims to keep language detection as optional and to avoid multiple specific requirements having the same purpose.
+
+While loaded, the default backend can be switched to another one by using the `_reload_lang` function:
+
+```python
+>>> codext.stopfunc._reload_lang("pycld2") # this loads pycld2 and attaches lang_** functions to the stopfunc submodule
+>>> codext.stopfunc._reload_lang() # this unloads any loaded backend
+```
+
+Each time a backend is loaded, it gets `lang_**` stop functions attached to the `stopfunc` submodule for each supported language. 
+ +----- + +### Ranking Heuristic + +!!! warning "Work in progress" + + This part is still in progress and shall be improved with better features and/or using machine learning. + diff --git a/docs/pages/howto.md b/docs/pages/howto.md index 6163ef6..9e59805 100644 --- a/docs/pages/howto.md +++ b/docs/pages/howto.md @@ -1,242 +1,240 @@ -## How To Create Your Codec - -The purpose of this section is to provide a tutorial for creating new codecs accordingly. - -As explained in [this section](./features.html), `codext` provides the possibility to add new codecs in two ways: - -1. [`add`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L56): using this function, the *encode* and *decode* functions must be given as arguments. -2. [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160): using this function, an *encoding map* must be given but can be formatted in different ways to handle various use cases. - -In both cases, a *pattern* is given in argument and aims to define the set of all strings that aim to select this codec. - -!!! important "Codec precedence" - - `codext` uses a local registry that is queried first before attempting native `codecs` lookups. This means that a native codec can be overridden with a *pattern* that matches the same strings. - -The remainder of this section explains how to successfully create a new codec and/or how to make so that it can be added to the library. - -!!! reminder "Contributions welcome !" - - Remember that you can always [submit a request for a new codec](https://github.com/dhondta/python-codext/issues/new) or submit your own with a PR for improving `codext` ! - ------ - -### Generic arguments - -Whatever solution is chosen, the following arguments shall be considered: - -- `ename` (first positional argument): Choose the shortest possible encoding name. 
If it clashes with another codec, always remember that `codext` resolves codecs in order of registry, that is from the first added. Also, it resolves codecs based on the given pattern. So, a codec with a clashing name could still be selected if the pattern does not match for the codec with the precedence but matches for this codec. -- `pattern` (keyword-argument): If not defined, it defaults to the encoding name. It can be a regular expression ; in this case, it should not be too broad. A codec decode or encode function can be parametrized through the pattern using the **first capture group**. It is important to note that the first capture group is used and not any other. This means that any other group definition shall use the do-not-capture specifier, that is "`(?:...)`". - -!!! danger "Too broad pattern" - - Let us consider the following ; we add a codec that handles every character in any number of occurrence. It will then capture anything in the given encoding name and will then always resolve to this codec, preventing any other codec added afterwards to resolve. - - >>> import codext - >>> identity = lambda text, errors="strict": (text, len(text)) - >>> codext.add("everything", identity, identity, pattern=r".*") - >>> codext.encode("test string", "test-encoding-name") # r".*" matches anything, thus including "test-encoding-name" - 'test string' - >>> codext.decode("test string", "test-encoding-name") - 'test string' - >>> codext.encode("test string", "morse") # "morse" has the precedence on codec "everything" we just added - '- . ... - / ... - .-. .. -. --.' 
- >>> test = lambda text, errors="strict": ("TEST", len(t)) - >>> codext.add("test", test) # no pattern given ; should then be matched by encoding name "test" - >>> codext.encode("test string", "test") # should give "TEST" if codec "test" was selected - 'test string' # gives the output of codec "test-encoding-name", - # which has precedence on "test" and a too broad pattern - ------ - -### Which `add` function ? - -At this point, it is necessary to determine what kind of codec you want. If it is a simple map of characters, you should definitely use [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160). If it is more complex and cannot be handled using [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160)'s options, then you should use [`add`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L56) and define the encode/decode functions yourself. - -A few examples: - -- `morse` is a simple map that does not handle case ; it then uses [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) with `ignore_case` set to "`encode`" (not "`both`" for encoding and decoding as it does not matter anyway for decoding) -- `whitespace` has 2 codecs defined ; the simple one is a simple bit encoding map, therefore using [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) with `intype` set to "`bin`" (for pre-converting characters to bits before applying the encoding map), and the complex one uses [`add`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L56) with its specific endocde/decode functions -- `atbash` defines a dynamic map with a "factory" function, that creates the encoding map according to the parameters supplied in the codec name - -So, before going further, determine the following: - -- What does the new codec map from and to ? E.g. 
if binary input and ordinal output, you can use [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) with `intype="bin"` and `outype="ord"`. -- Is this codec ignoring case ? If so, you can use [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) and specify which operation(s) should ignore case, e.g. `ignore_case="both"` or `ignore_case="decode"`. -- Should this codec handle no error ? If so, you can use [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) do not forget to specify `no_error=True`. -- Does the codec yields variable-length encoded tokens ? If so, you can still use [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) but you should define `sep` (separator) as `codext` will not be able to handle ambiguities. - -If you find aspects that are not covered in these questions, you shall use [`add`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L56), then refering to [Case 1](#case-1-generic-encoding-definition). Otherwise, you can use [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) and refer -to [Case 2](#case-2-encoding-map). - ------ - -### Case 1: Generic encoding definition - -This uses: [`codext.add`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L56) - -The following shall be considered: - -- `encode` (keyword-argument ; defaults to `None`): when left `None`, it means that the codec cannot encode. -- `decode` (keyword-argument ; defaults to `None`): when left `None`, it means that the codec cannot decode. - -Both functions must take 2 arguments and return 2 values (in order to stick to `codec`'s encode/decode function format): - -- Inputs: `text`, `errors="strict"` ; respectively the text to encode/decode and the error handling mode. -- Outputs: encoded text and length of consumed input text. - -!!! 
note "Error handling mode" - - - `strict`: this is the default ; it means that any error shall raise an exception. - - `ignore`: any error is ignored, adding nothing to the output. - - `replace`: any error yields the given replacement character(s). - - `leave`: any error yields the erroneous input token in the output. - - This last mode is an addition to the native ones. It can be useful for some encodings that must cause no error while encoding and can therefore have their original characters in the output. - -Also, while defining the `encode` and/or `decode` functions, `codext.handle_error` can be used as a shortcut to handle the different modes. It returns a wrapped function that takes `token` and `position` as arguments (see [`excess3`](https://github.com/dhondta/python-codext/blob/master/codext/binary/excess3.py) for an example). - -```python ->>> help(codext.handle_error) -Help on function handle_error in module codext.__common__: - -handle_error(ename, errors, sep='', repl_char='?', repl_minlen=1, decode=False, item='position') - This shortcut function allows to handle error modes given some tuning parameters. - - :param ename: encoding name - :param errors: error handling mode - :param sep: token separator - :param repl_char: replacement character (for use when errors="replace") - :param repl_minlen: repeat number for the replacement character - :param decode: whether we are encoding or decoding - :param item: position item description (for describing the error ; e.g. "group" or "token") - ->>> err = codext.handle_error("test", "strict") ->>> help(err) -Help on function _handle_error in module codext.__common__: - -_handle_error(token, position) - This handles an encoding/decoding error according to the selected handling mode. 
- - :param token: input token to be encoded/decoded - :param position: token position index - -``` - ------ - -### Case 2: Encoding map - -This uses: [`codext.add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) - -The following options shall be considered: - -- `encmap` (second positional argument): This defines the encoding map and is the core of the codec ; 4 subcases are handled and explained hereafter. -- `repl_char` (keyword-argument ; default: "`?`"): The replacement character can be tuned, especially if the default one clashes with a character from the encoding. -- `sep` (keyword-argument ; default: ""): The separator between encoded tokens can be useful to tune, especially when the encoded tokens have a variable length. -- `ignore_case` (keyword-argument ; default: `None`): This defines where the case shall be ignored ; it can be one of the followings: "`encode`", "`decode`" or "`both`". -- `no_error` (keyword-argument ; default: `False`): This sets if errors should be handled as normal or if no error should be considered, simply leaving the input token as is in the output. -- `intype` (keyword-argument ; default: `None`): This specifies the type the input text should be converted to before applying the encoding map (pre-conversion before really encoding) ; this can be one of the followings: `str`, `bin` or `ord`. -- `outype` (keyword-argument ; default: `None`): This specifies the type the output text of the encoding map should be converted from (post-conversion after really encoding) ; this can be one of the followings: `str`, `bin` or `ord`. - -!!! warning "Input/Output types" - - By default, when `intype` is defined, `outype` takes the same value if left `None`. So, if the new encoding uses a pre-conversion to bits (`intype="bin"`) but maps bits to characters (therefore binary conversion to text is not needed), `outype` shall then be explicitely set to "`str`" (or if it maps bits to ordinals, use `outype="ord"`). 
- -`encmap` can be defined as follows: - -1. **Simple map**: In this case, the encoding map is a dictionary mapping each input character to an output one (see [`radio`](https://github.com/dhondta/python-codext/blob/master/codext/languages/radio.py) for an example). -2. **List of maps**: In this case, encoding maps are put in a list and referenced by their order number starting from 1, meaning that the `pattern` shall define a capture group with values from 1 to the length of this list (see [`dna`](https://github.com/dhondta/python-codext/blob/master/codext/others/dna.py) for an example). -3. **Parametrized map**: This variant defines a dictionary of regex-selected encoding maps, that is, a dictionary of dictionaries with keys matching the captured groups from codec's pattern. -4. **Map factory function**: This one is implemented by a function that returns the composed encoding map. This function takes a single argument according to the capture group from the `pattern` (see [`affine`](https://github.com/dhondta/python-codext/blob/master/codext/crypto/affine.py) for an example). - -!!! note "Mapping one input character to multiple output characters" - - In some particular cases (e.g. the `navajo` codec), a single input character can be mapped to multiple output ones. It is possible to define them in a map by simply putting them into a list (e.g. a map with `{'A': ["B", "C", "D"]}`). In this case, while encoding, the output character is randomly chosen (e.g. "`A`" will map to "`D`", another time to "`B`", ...). - ------ - -### Self-generated tests - -In order to facilitate testing, a test suite can be automatically generated from a set of *examples*. This is defined in the `__examples__` dunder inside codec's source file (see [`sms`](https://github.com/dhondta/python-codext/blob/master/codext/stegano/sms.py) for an example). 
By default, the `add`/`add_map` function will get `__examples__` from the global scope but this behavior can be overridden by specifying the keyword-argument `examples` (e.g. `add(..., examples=__examples1__)` ; see [`ordinal`](https://github.com/dhondta/python-codext/blob/master/codext/common/ordinal.py) for an example). - -A set of examples is a dictionary specifying the test cases to be considered. The keys are the descriptions of the test cases and the values can be either dictionaries of input texts and their output encoded texts or lists of input texts. Each key has the format "`operation(encodings)`". Operations can be: - -- `enc`: This is for testing the encoding of the nested values (that is, a dictionary of input/outputs). -- `dec`: This is for testing the decoding of the nested values (that is, a dictionary of input/outputs). If this is not specified, the test suite automatically tries to decode from what is defined in `enc`. -- `enc-dec`: This is for testing the encoding AND decoding of the nested values (that is, a list of inputs) ; this one does not enforce what should be the output of the encoding but checks that encoding AND decoding leads to the same input text. This is particularly useful when encoding can yield randomly chosen tokens in the encoded output. - -The `encodings` are a `|`-separated list of encoding names, compliant or not with tested codec's pattern. Faulty names can also be tested as of the examples hereafter. - -Examples of `__examples__` test suites: - -```python -__my_examples__ = { - 'enc(BAD)': None -} -``` - -!!! note "Observations" - - - `__my__examples__` is not the standard dunder, therefore requiring to be specified as the `examples` keyword-argument of `add`/`add_map`. - - `BAD` is assumed to be a bad encoding name, therefore having a dictionary value of `None`, meaning that the test should raise a `LookupError`. - -```python -__examples__ = { - 'enc(codec)': {'string': None} -} -``` - -!!! 
note "Observations" - - - `__examples__` is the standard dunder, therefore NOT requiring to be specified as the `examples` keyword-argument of `add`/`add_map`. - - `codec` is assumed to be a valid encoding name, therefore having a dictionary as its value, but in this special case "`string`" is assumed not to be encoded, its corresponding value is then `None`, meaning that the test should raise a `ValueError`. - -```python -__examples__ = { - 'enc-dec(codec)': ["test string", "TEST STRING", "@random", "@random{1024}"] -} -``` - -!!! note "Observations" - - - `__examples__` is the standard dunder, thus not specified in `add`/`add_map`. - - `enc-dec` is used, meaning that a list of inputs is defined. - - So, whatever its encoded output, the input string shall give the same while applying encoding then decoding. - - The special values `@random` and `@random{1024}`, meaning that test strings are generated from any possible byte-character with a specified length (512 when not specified, otherwise specified with `{...}`). - -```python -__examples__ = { - 'enc(codec)': {"test string": "..."} -} -``` - -!!! note "Observations" - - - `__examples__` is the standard dunder, thus not specified in `add`/`add_map`. - - `enc` only is used, meaning that a dictionary of inputs/outputs is given and `dec` is automatically handled while requiring the exact encoded text but recovering the exact same input while decoding. - -```python -__examples__ = { - 'enc(codec)': {"Test String": "..."}, - 'dec(codec)': {"...": "test string"}, -} -``` - -!!! note "Observations" - - - `__examples__` is the standard dunder, thus not specified in `add`/`add_map`. - - `enc` and `dec` are used, meaning that dictionaries of inputs/outputs are given and the input texts are not necessarily the same (i.e. if text case is not handled by the codec). - ------ - -### Adding a new codec to `codext` - -As a checklist when making a codec for addition in `codext`, please follow these steps: - -1. 
Create your codec file (i.e. starting with a copy of an existing similar one) -2. Place it into the right category folder -3. Add it to the list in [`README.md`](https://github.com/dhondta/python-codext/blob/master/README.md#list-of-codecs) -4. Add its documentation in the [right Markdown file](https://github.com/dhondta/python-codext/tree/master/docs/enc) -5. If self-generated tests are not enough, add manual tests in [the related file](https://github.com/dhondta/python-codext/blob/master/tests/test_manual.py) - +The purpose of this section is to provide a tutorial for creating new codecs accordingly. + +As explained in [this section](./features.html), `codext` provides the possibility to add new codecs in two ways: + +1. [`add`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L56): using this function, the *encode* and *decode* functions must be given as arguments. +2. [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160): using this function, an *encoding map* must be given but can be formatted in different ways to handle various use cases. + +In both cases, a *pattern* is given in argument and aims to define the set of all strings that aim to select this codec. + +!!! important "Codec precedence" + + `codext` uses a local registry that is queried first before attempting native `codecs` lookups. This means that a native codec can be overridden with a *pattern* that matches the same strings. + +The remainder of this section explains how to successfully create a new codec and/or how to make so that it can be added to the library. + +!!! reminder "Contributions welcome !" + + Remember that you can always [submit a request for a new codec](https://github.com/dhondta/python-codext/issues/new) or submit your own with a PR for improving `codext` ! 
+ +----- + +### Generic arguments + +Whatever solution is chosen, the following arguments shall be considered: + +- `ename` (first positional argument): Choose the shortest possible encoding name. If it clashes with another codec, always remember that `codext` resolves codecs in order of registry, that is from the first added. Also, it resolves codecs based on the given pattern. So, a codec with a clashing name could still be selected if the pattern does not match for the codec with the precedence but matches for this codec. +- `pattern` (keyword-argument): If not defined, it defaults to the encoding name. It can be a regular expression ; in this case, it should not be too broad. A codec decode or encode function can be parametrized through the pattern using the **first capture group**. It is important to note that the first capture group is used and not any other. This means that any other group definition shall use the do-not-capture specifier, that is "`(?:...)`". + +!!! danger "Too broad pattern" + + Let us consider the following ; we add a codec that handles every character in any number of occurrence. It will then capture anything in the given encoding name and will then always resolve to this codec, preventing any other codec added afterwards to resolve. + + >>> import codext + >>> identity = lambda text, errors="strict": (text, len(text)) + >>> codext.add("everything", identity, identity, pattern=r".*") + >>> codext.encode("test string", "test-encoding-name") # r".*" matches anything, thus including "test-encoding-name" + 'test string' + >>> codext.decode("test string", "test-encoding-name") + 'test string' + >>> codext.encode("test string", "morse") # "morse" has the precedence on codec "everything" we just added + '- . ... - / ... - .-. .. -. --.' 
+ >>> test = lambda text, errors="strict": ("TEST", len(t)) + >>> codext.add("test", test) # no pattern given ; should then be matched by encoding name "test" + >>> codext.encode("test string", "test") # should give "TEST" if codec "test" was selected + 'test string' # gives the output of codec "test-encoding-name", + # which has precedence on "test" and a too broad pattern + +----- + +### Which `add` function ? + +At this point, it is necessary to determine what kind of codec you want. If it is a simple map of characters, you should definitely use [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160). If it is more complex and cannot be handled using [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160)'s options, then you should use [`add`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L56) and define the encode/decode functions yourself. + +A few examples: + +- `morse` is a simple map that does not handle case ; it then uses [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) with `ignore_case` set to "`encode`" (not "`both`" for encoding and decoding as it does not matter anyway for decoding) +- `whitespace` has 2 codecs defined ; the simple one is a simple bit encoding map, therefore using [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) with `intype` set to "`bin`" (for pre-converting characters to bits before applying the encoding map), and the complex one uses [`add`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L56) with its specific endocde/decode functions +- `atbash` defines a dynamic map with a "factory" function, that creates the encoding map according to the parameters supplied in the codec name + +So, before going further, determine the following: + +- What does the new codec map from and to ? E.g. 
if binary input and ordinal output, you can use [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) with `intype="bin"` and `outype="ord"`.
+- Is this codec ignoring case ? If so, you can use [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) and specify which operation(s) should ignore case, e.g. `ignore_case="both"` or `ignore_case="decode"`.
+- Should this codec handle no error ? If so, you can use [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) but do not forget to specify `no_error=True`.
+- Does the codec yield variable-length encoded tokens ? If so, you can still use [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) but you should define `sep` (separator) as `codext` will not be able to handle ambiguities.
+
+If you find aspects that are not covered in these questions, you shall use [`add`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L56), then referring to [Case 1](#case-1-generic-encoding-definition). Otherwise, you can use [`add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) and refer
+to [Case 2](#case-2-encoding-map).
+
+-----
+
+### Case 1: Generic encoding definition
+
+This uses: [`codext.add`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L56)
+
+The following shall be considered:
+
+- `encode` (keyword-argument ; defaults to `None`): when left `None`, it means that the codec cannot encode.
+- `decode` (keyword-argument ; defaults to `None`): when left `None`, it means that the codec cannot decode.
+
+Both functions must take 2 arguments and return 2 values (in order to stick to `codec`'s encode/decode function format):
+
+- Inputs: `text`, `errors="strict"` ; respectively the text to encode/decode and the error handling mode.
+- Outputs: encoded text and length of consumed input text.
+
+!!! 
note "Error handling mode" + + - `strict`: this is the default ; it means that any error shall raise an exception. + - `ignore`: any error is ignored, adding nothing to the output. + - `replace`: any error yields the given replacement character(s). + - `leave`: any error yields the erroneous input token in the output. + + This last mode is an addition to the native ones. It can be useful for some encodings that must cause no error while encoding and can therefore have their original characters in the output. + +Also, while defining the `encode` and/or `decode` functions, `codext.handle_error` can be used as a shortcut to handle the different modes. It returns a wrapped function that takes `token` and `position` as arguments (see [`excess3`](https://github.com/dhondta/python-codext/blob/master/codext/binary/excess3.py) for an example). + +```python +>>> help(codext.handle_error) +Help on function handle_error in module codext.__common__: + +handle_error(ename, errors, sep='', repl_char='?', repl_minlen=1, decode=False, item='position') + This shortcut function allows to handle error modes given some tuning parameters. + + :param ename: encoding name + :param errors: error handling mode + :param sep: token separator + :param repl_char: replacement character (for use when errors="replace") + :param repl_minlen: repeat number for the replacement character + :param decode: whether we are encoding or decoding + :param item: position item description (for describing the error ; e.g. "group" or "token") + +>>> err = codext.handle_error("test", "strict") +>>> help(err) +Help on function _handle_error in module codext.__common__: + +_handle_error(token, position) + This handles an encoding/decoding error according to the selected handling mode. 
+ + :param token: input token to be encoded/decoded + :param position: token position index + +``` + +----- + +### Case 2: Encoding map + +This uses: [`codext.add_map`](https://github.com/dhondta/python-codext/blob/master/codext/__common__.py#L160) + +The following options shall be considered: + +- `encmap` (second positional argument): This defines the encoding map and is the core of the codec ; 4 subcases are handled and explained hereafter. +- `repl_char` (keyword-argument ; default: "`?`"): The replacement character can be tuned, especially if the default one clashes with a character from the encoding. +- `sep` (keyword-argument ; default: ""): The separator between encoded tokens can be useful to tune, especially when the encoded tokens have a variable length. +- `ignore_case` (keyword-argument ; default: `None`): This defines where the case shall be ignored ; it can be one of the followings: "`encode`", "`decode`" or "`both`". +- `no_error` (keyword-argument ; default: `False`): This sets if errors should be handled as normal or if no error should be considered, simply leaving the input token as is in the output. +- `intype` (keyword-argument ; default: `None`): This specifies the type the input text should be converted to before applying the encoding map (pre-conversion before really encoding) ; this can be one of the followings: `str`, `bin` or `ord`. +- `outype` (keyword-argument ; default: `None`): This specifies the type the output text of the encoding map should be converted from (post-conversion after really encoding) ; this can be one of the followings: `str`, `bin` or `ord`. + +!!! warning "Input/Output types" + + By default, when `intype` is defined, `outype` takes the same value if left `None`. So, if the new encoding uses a pre-conversion to bits (`intype="bin"`) but maps bits to characters (therefore binary conversion to text is not needed), `outype` shall then be explicitely set to "`str`" (or if it maps bits to ordinals, use `outype="ord"`). 
+ +`encmap` can be defined as follows: + +1. **Simple map**: In this case, the encoding map is a dictionary mapping each input character to an output one (see [`radio`](https://github.com/dhondta/python-codext/blob/master/codext/languages/radio.py) for an example). +2. **List of maps**: In this case, encoding maps are put in a list and referenced by their order number starting from 1, meaning that the `pattern` shall define a capture group with values from 1 to the length of this list (see [`dna`](https://github.com/dhondta/python-codext/blob/master/codext/others/dna.py) for an example). +3. **Parametrized map**: This variant defines a dictionary of regex-selected encoding maps, that is, a dictionary of dictionaries with keys matching the captured groups from codec's pattern. +4. **Map factory function**: This one is implemented by a function that returns the composed encoding map. This function takes a single argument according to the capture group from the `pattern` (see [`affine`](https://github.com/dhondta/python-codext/blob/master/codext/crypto/affine.py) for an example). + +!!! note "Mapping one input character to multiple output characters" + + In some particular cases (e.g. the `navajo` codec), a single input character can be mapped to multiple output ones. It is possible to define them in a map by simply putting them into a list (e.g. a map with `{'A': ["B", "C", "D"]}`). In this case, while encoding, the output character is randomly chosen (e.g. "`A`" will map to "`D`", another time to "`B`", ...). + +----- + +### Self-generated tests + +In order to facilitate testing, a test suite can be automatically generated from a set of *examples*. This is defined in the `__examples__` dunder inside codec's source file (see [`sms`](https://github.com/dhondta/python-codext/blob/master/codext/stegano/sms.py) for an example). 
By default, the `add`/`add_map` function will get `__examples__` from the global scope but this behavior can be overridden by specifying the keyword-argument `examples` (e.g. `add(..., examples=__examples1__)` ; see [`ordinal`](https://github.com/dhondta/python-codext/blob/master/codext/common/ordinal.py) for an example). + +A set of examples is a dictionary specifying the test cases to be considered. The keys are the descriptions of the test cases and the values can be either dictionaries of input texts and their output encoded texts or lists of input texts. Each key has the format "`operation(encodings)`". Operations can be: + +- `enc`: This is for testing the encoding of the nested values (that is, a dictionary of input/outputs). +- `dec`: This is for testing the decoding of the nested values (that is, a dictionary of input/outputs). If this is not specified, the test suite automatically tries to decode from what is defined in `enc`. +- `enc-dec`: This is for testing the encoding AND decoding of the nested values (that is, a list of inputs) ; this one does not enforce what should be the output of the encoding but checks that encoding AND decoding leads to the same input text. This is particularly useful when encoding can yield randomly chosen tokens in the encoded output. + +The `encodings` are a `|`-separated list of encoding names, compliant or not with tested codec's pattern. Faulty names can also be tested as of the examples hereafter. + +Examples of `__examples__` test suites: + +```python +__my_examples__ = { + 'enc(BAD)': None +} +``` + +!!! note "Observations" + + - `__my__examples__` is not the standard dunder, therefore requiring to be specified as the `examples` keyword-argument of `add`/`add_map`. + - `BAD` is assumed to be a bad encoding name, therefore having a dictionary value of `None`, meaning that the test should raise a `LookupError`. + +```python +__examples__ = { + 'enc(codec)': {'string': None} +} +``` + +!!! 
note "Observations" + + - `__examples__` is the standard dunder, therefore NOT requiring to be specified as the `examples` keyword-argument of `add`/`add_map`. + - `codec` is assumed to be a valid encoding name, therefore having a dictionary as its value, but in this special case "`string`" is assumed not to be encoded, its corresponding value is then `None`, meaning that the test should raise a `ValueError`. + +```python +__examples__ = { + 'enc-dec(codec)': ["test string", "TEST STRING", "@random", "@random{1024}"] +} +``` + +!!! note "Observations" + + - `__examples__` is the standard dunder, thus not specified in `add`/`add_map`. + - `enc-dec` is used, meaning that a list of inputs is defined. + - So, whatever its encoded output, the input string shall give the same while applying encoding then decoding. + - The special values `@random` and `@random{1024}`, meaning that test strings are generated from any possible byte-character with a specified length (512 when not specified, otherwise specified with `{...}`). + +```python +__examples__ = { + 'enc(codec)': {"test string": "..."} +} +``` + +!!! note "Observations" + + - `__examples__` is the standard dunder, thus not specified in `add`/`add_map`. + - `enc` only is used, meaning that a dictionary of inputs/outputs is given and `dec` is automatically handled while requiring the exact encoded text but recovering the exact same input while decoding. + +```python +__examples__ = { + 'enc(codec)': {"Test String": "..."}, + 'dec(codec)': {"...": "test string"}, +} +``` + +!!! note "Observations" + + - `__examples__` is the standard dunder, thus not specified in `add`/`add_map`. + - `enc` and `dec` are used, meaning that dictionaries of inputs/outputs are given and the input texts are not necessarily the same (i.e. if text case is not handled by the codec). + +----- + +### Adding a new codec to `codext` + +As a checklist when making a codec for addition in `codext`, please follow these steps: + +1. 
Create your codec file (i.e. starting with a copy of an existing similar one) +2. Place it into the right category folder +3. Add it to the list in [`README.md`](https://github.com/dhondta/python-codext/blob/master/README.md#list-of-codecs) +4. Add its documentation in the [right Markdown file](https://github.com/dhondta/python-codext/tree/master/docs/enc) +5. If self-generated tests are not enough, add manual tests in [the related file](https://github.com/dhondta/python-codext/blob/master/tests/test_manual.py) + diff --git a/docs/pages/index.md b/docs/pages/index.md index 185dd25..2579b17 100644 --- a/docs/pages/index.md +++ b/docs/pages/index.md @@ -1,11 +1,9 @@ -## Introduction - -Codext, contraction of "*codecs*" and "*extension*", is a library that gathers many additional encodings for use with [`codecs`](https://docs.python.org/3/library/codecs.html). When imported, it registers new encodings to an extended codecs registry for making the encodings available from the `codecs.(decode|encode|open)` API. It also features [CLI tools](./cli.html) and a [guess mode](./features.html#guess-decode-an-arbitrary-input) for decoding multiple layers of codecs. - -### Setup - -This library is available on [PyPi](https://pypi.python.org/pypi/codext/) and can be simply installed using Pip: - -```sh -pip install codext -``` +Codext, contraction of "*codecs*" and "*extension*", is a library that gathers many additional encodings for use with [`codecs`](https://docs.python.org/3/library/codecs.html). When imported, it registers new encodings to an extended codecs registry for making the encodings available from the `codecs.(decode|encode|open)` API. It also features [CLI tools](./cli.html) and a [guess mode](./features.html#guess-decode-an-arbitrary-input) for decoding multiple layers of codecs. 
+ +### Setup + +This library is available on [PyPi](https://pypi.python.org/pypi/codext/) and can be simply installed using Pip: + +```sh +pip install codext +``` diff --git a/docs/pages/manipulations.md b/docs/pages/manipulations.md index 8857ca7..340f89c 100644 --- a/docs/pages/manipulations.md +++ b/docs/pages/manipulations.md @@ -1,75 +1,74 @@ -## String tranformations - -`codext` also defines multiple dummy string manipulation/transformation codecs, essentially for use with the CLI tool and for the sake of simplicity. - ------ - -### Case-related operations - -These transformation functions are simple string transformations, including `str`'s methods. - -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`camelcase` | text --> camel-case text | `camel` | no decoding -`capitalize` | text <-> capitalized text | | decoding "uncapitalizes" the text -`lowercase` | text <-> lowercase text | `lower` | decoding is `uppercase` -`pascalcase` | text --> pascal-case text | `pascal` | no decoding -`slugify` | text --> slug | `slug`, `kebab`, `kebabcase` | no decoding -`snakecase` | text --> snake-case text | `snake` | no decoding -`swapcase` | text <-> case-swapped text | `swap`, `invert`, `invertcase` | -`title` | text <-> titled text | | decoding "untitles" the text -`uppercase` | text <-> uppercase text | `upper` | decoding is `lowercase` - -Of course, these transformations have no interest while using them in Python as the `str` methods can be called. It can be useful while using `codext` from the terminal (see [*CLI tool*](cli.html)). - -Some simple examples: - -```sh -$ echo -en "test string" | codext encode swap-case -TEST STRING - -$ echo -en "test string" | codext encode camel_case -testString - -$ echo -en "test string" | codext encode kebab_case -test-string -``` - ------ - -### Dummy string operations - -These transformation functions are simple string transformations. 
- -**Codec** | **Conversions** | **Aliases** | **Comment** -:---: | :---: | --- | --- -`replace` | text <-> text with multi-chars replaced | | parametrized with a _string_ and its _replacement_ -`reverse` | text <-> reversed text | | -`reverse-words` | text <-> reversed words | | same as `reverse` but not on the whole text, only on the words (text split by whitespace) -`strip-spaces` | text <-> all whitespaces stripped | | -`substitute` | text <-> text with token substituted | | -`tokenize` | text <-> text split in tokens of length N | | parametrized with _N_ - -As in the previous section, these transformations have no interest while using them in Python but well while using `codext` from the terminal (see [*CLI tool*](cli.html)). - -A simple example: - -```sh -$ echo -en "test string" | codext encode reverse-words | codext encode reverse replace-\ _ -string_test -``` - -Another example: - -```sh -$ echo -en "3132333435" | codext encode tokenize-2 -31 32 33 34 35 -``` - -Or using encodings chaining: - -```sh -$ echo -en "test string" | codext encode reverse-words reverse substitute-string/phrase -phrase test -``` - +`codext` also defines multiple dummy string manipulation/transformation codecs, essentially for use with the CLI tool and for the sake of simplicity. + +----- + +### Case-related operations + +These transformation functions are simple string transformations, including `str`'s methods. 
+ +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`camelcase` | text --> camel-case text | `camel` | no decoding +`capitalize` | text <-> capitalized text | | decoding "uncapitalizes" the text +`lowercase` | text <-> lowercase text | `lower` | decoding is `uppercase` +`pascalcase` | text --> pascal-case text | `pascal` | no decoding +`screamingsnakecase` | text --> screaming-snake-case text | `screaming-snake`, `screaming_snake_case` | no decoding +`slugify` | text --> slug | `slug`, `kebab`, `kebabcase` | no decoding +`snakecase` | text --> snake-case text | `snake` | no decoding +`swapcase` | text <-> case-swapped text | `swap`, `invert`, `invertcase` | +`title` | text <-> titled text | | decoding "untitles" the text +`uppercase` | text <-> uppercase text | `upper` | decoding is `lowercase` + +Of course, these transformations have no interest while using them in Python as the `str` methods can be called. It can be useful while using `codext` from the terminal (see [*CLI tool*](cli.html)). + +Some simple examples: + +```sh +$ echo -en "test string" | codext encode swap-case +TEST STRING + +$ echo -en "test string" | codext encode camel_case +testString + +$ echo -en "test string" | codext encode kebab_case +test-string +``` + +----- + +### Dummy string operations + +These transformation functions are simple string transformations. 
+ +**Codec** | **Conversions** | **Aliases** | **Comment** +:---: | :---: | --- | --- +`replace` | text <-> text with multi-chars replaced | | parametrized with a _string_ and its _replacement_ +`reverse` | text <-> reversed text | | +`reverse-words` | text <-> reversed words | | same as `reverse` but not on the whole text, only on the words (text split by whitespace) +`strip-spaces` | text <-> all whitespaces stripped | | +`substitute` | text <-> text with token substituted | | +`tokenize` | text <-> text split in tokens of length N | | parametrized with _N_ + +As in the previous section, these transformations have no interest while using them in Python but well while using `codext` from the terminal (see [*CLI tool*](cli.html)). + +A simple example: + +```sh +$ echo -en "test string" | codext encode reverse-words | codext encode reverse replace-\ _ +string_test +``` + +Another example: + +```sh +$ echo -en "3132333435" | codext encode tokenize-2 +31 32 33 34 35 +``` + +Or using encodings chaining: + +```sh +$ echo -en "test string" | codext encode reverse-words reverse substitute-string/phrase +phrase test +``` + diff --git a/pyproject.toml b/pyproject.toml index 099d04b..b204596 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,17 +16,13 @@ authors = [ description = "Native codecs extension" license = {file = "LICENSE"} keywords = ["python", "development", "programming", "codecs", "encodings"] -requires-python = ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,<4" +requires-python = ">=3.8,<4" classifiers = [ "Development Status :: 5 - Production/Stable", "Environment :: Console", "Intended Audience :: Developers", "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)", - "Programming Language :: Python :: 2", - "Programming Language :: Python :: 2.7", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.6", - "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming 
Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", @@ -34,9 +30,7 @@ classifiers = [ "Topic :: Software Development :: Libraries :: Python Modules", ] dependencies = [ - "markdown2==2.3.10; python_version=='2.7'", - "markdown2>=2.4.0; python_version>='3.6'", - "six", + "markdown2>=2.4.0", ] dynamic = ["version"] diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..fcccae1 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,2 @@ +[pytest] +pythonpath = src diff --git a/src/codext/VERSION.txt b/src/codext/VERSION.txt index a4cc557..d3fbbb2 100644 --- a/src/codext/VERSION.txt +++ b/src/codext/VERSION.txt @@ -1 +1 @@ -1.14.2 +1.15.0 diff --git a/src/codext/__common__.py b/src/codext/__common__.py index d88dcbe..a2ff0ef 100644 --- a/src/codext/__common__.py +++ b/src/codext/__common__.py @@ -1,1517 +1,1520 @@ -# -*- coding: UTF-8 -*- -import _codecs -import codecs -import json -import os -import random -import re -import sys -from encodings.aliases import aliases as ALIASES -from functools import reduce, update_wrapper, wraps -from importlib import import_module -from inspect import currentframe -from itertools import chain, product -from locale import getlocale -from math import log -from pkgutil import iter_modules -from platform import system -from random import randint -from six import binary_type, string_types, text_type, BytesIO -from string import * -from types import FunctionType, ModuleType -try: # Python2 - import __builtin__ as builtins -except ImportError: - import builtins -try: # Python2 - from inspect import getfullargspec -except ImportError: - from inspect import getargspec as getfullargspec -try: # Python2 - from string import maketrans -except ImportError: - maketrans = str.maketrans -try: # Python3 - from importlib import reload -except ImportError: - pass -try: # from Python 3.11, it seems that 'sre_parse' is not bound to 're' anymore - re.sre_parse -except AttributeError: - import sre_parse as __sre_parse - re.sre_parse 
= __sre_parse - - -__all__ = ["add", "add_macro", "add_map", "b", "clear", "codecs", "decode", "encode", "ensure_str", "examples", "guess", - "isb", "generate_strings_from_regex", "get_alphabet_from_mask", "handle_error", "i2s", "is_native", - "list_categories", "list_encodings", "list_macros", "lookup", "maketrans", "os", "rank", "re", "register", - "remove", "reset", "s2i", "search", "stopfunc", "BytesIO", "_input", "_stripl", "CodecMacro", - "DARWIN", "LANG", "LINUX", "MASKS", "PY3", "UNIX", "WINDOWS"] -CODECS_REGISTRY = None -CODECS_OVERWRITTEN = [] -CODECS_CATEGORIES = ["native", "custom"] -CODECS_CACHE = {} -LANG = getlocale() -if LANG: - LANG = (LANG[0] or "")[:2].lower() -MASKS = { - 'a': printable, - 'b': "".join(chr(i) for i in range(256)), - 'd': digits, - 'h': digits + "abcdef", - 'H': digits + "ABCDEF", - 'l': ascii_lowercase, - 'p': punctuation, - 's': " ", - 'u': ascii_uppercase, -} - -__codecs_registry = [] - -MACROS = {} -PERS_MACROS = {} -PERS_MACROS_FILE = os.path.expanduser("~/.codext-macros.json") - -DARWIN = system() == "Darwin" -LINUX = system() == "Linux" -PY3 = sys.version[0] == "3" -UNIX = DARWIN or LINUX -WINDOWS = system() == "Windows" - -entropy = lambda s: -sum([p * log(p, 2) for p in [float(s.count(c)) / len(s) for c in set(s)]]) - -isb = lambda s: isinstance(s, binary_type) -iss = lambda s: isinstance(s, string_types) -fix = lambda x, ref: b(x) if isb(ref) else ensure_str(x) if iss(ref) else x - -s2i = lambda s: int(codecs.encode(s, "base16"), 16) -exc_name = lambda e: "".join(t.capitalize() for t in re.split(r"[-_+]", e)) - - -def i2s(input): - h = hex(input)[2:].rstrip("eL") - return codecs.decode(h.zfill(len(h) + len(h) % 2), "hex") - - -class CodecMacro(tuple): - """Macro details when looking up the codec registry. 
""" - def __new__(cls, name): - self = tuple.__new__(cls) - self.name = name - # get from personal macros first - try: - self.codecs = PERS_MACROS[name] - except KeyError: - try: - self.codecs = MACROS[name] - except KeyError: - raise LookupError("unknown macro: %s" % name) - if not isinstance(self.codecs, (tuple, list)): - raise ValueError("bad macro list: %s" % str(self.codecs)) - self.codecs = [lookup(e, False) for e in self.codecs] # lookup(e, False) - self.parameters = {'name': name, 'category': "macro"} # ^ means that macros won't be nestable - # test examples to check that the chain of encodings works - for action, examples in (self.codecs[0].parameters.get('examples', {}) or {'enc-dec(': ["T3st str!"]}).items(): - if re.match(r"enc(-dec)?\(", action): - for e in (examples.keys() if action.startswith("enc(") else examples or []): - rd = re.match(r"\@(i?)random(?:\{(\d+(?:,(\d+))*?)\})?$", e) - if rd: - for n in (rd.group(2) or "512").split(","): - s = "".join(chr(randint(0, 255)) for i in range(int(n))) - self.encode(s.lower() if rd.group(1) else s) - continue - self.encode(e) - - class Codec: - decode = self.decode - encode = self.encode - - class IncrementalEncoder(codecs.IncrementalEncoder): - def encode(self, input, final=False): - return b(self.encode(input, self.errors)[0]) - self.incrementalencoder = IncrementalEncoder - - class IncrementalDecoder(codecs.IncrementalDecoder): - def decode(self, input, final=False): - return ensure_str(self.decode(input, self.errors)[0]) - self.incrementaldecoder = IncrementalDecoder - - class StreamWriter(Codec, codecs.StreamWriter): - charbuffertype = bytes - self.streamwriter = StreamWriter - - class StreamReader(Codec, codecs.StreamReader): - charbuffertype = bytes - self.streamreader = StreamReader - - return self - - def decode(self, input, error="strict"): - """ Decode with each codec in reverse order. 
""" - for ci in self.codecs[::-1]: - input, l = ci.decode(input, error) - return input, l - - def encode(self, input, error="strict"): - """ Encode with each codec. """ - for ci in self.codecs: - input, l = ci.encode(input, error) - return input, l - - def __repr__(self): - return "" % (self.name, id(self)) - - -# inspired from: https://stackoverflow.com/questions/10875442/possible-to-change-a-functions-repr-in-python -class Repr(object): - def __init__(self, name, func): - self.__name = name - self.__func = func - update_wrapper(self, func) - - def __call__(self, *args, **kwargs): - return self.__func(*args, **kwargs) - - def __repr__(self): - return "" % (self.__name, id(self)) - - -def __stdin_pipe(): - """ Stdin pipe read function. """ - try: - with open(0, 'rb') as f: - for l in f: - yield l - except TypeError: - for l in sys.stdin: - yield l - - -def _input(infile): - # handle input file or stdin - c = b("") - if infile: - with open(infile, 'rb') as f: - c = f.read() - else: - for line in __stdin_pipe(): - c += line - return c - - -def _set_exc(name, etype="ValueError"): - if not hasattr(builtins, name): - exec("class %s(%s): __module__ = 'builtins'" % (name, etype)) - setattr(builtins, name, locals()[name]) -_set_exc("InputSizeLimitError") -_set_exc("ParameterError") - - -def _stripl(s, st_lines, st_crlf): - if st_crlf: - s = s.replace(b"\r\n", b"") if isb(s) else s.replace("\r\n", "") - if st_lines: - s = s.replace(b"\n", b"") if isb(s) else s.replace("\n", "") - return s - - -def _with_repr(name): - def _wrapper(f): - return Repr(name, f) - return _wrapper - - -def add(ename, encode=None, decode=None, pattern=None, text=True, add_to_codecs=False, **kwargs): - """ This adds a new codec to the codecs module setting its encode and/or decode functions, eventually dynamically - naming the encoding with a pattern and with file handling. 
- - :param ename: encoding name - :param encode: encoding function or None - :param decode: decoding function or None - :param pattern: pattern for dynamically naming the encoding - :param text: specify whether the codec is a text encoding - :param add_to_codecs: also add the search function to the native registry - NB: this will make the codec available in the built-in open(...) but will make it impossible - to remove the codec later - """ - remove(ename) - if encode: - if not isinstance(encode, FunctionType): - raise ValueError("Bad 'encode' function") - _set_exc("%sEncodeError" % exc_name(ename)) # create the custom encode exception as a builtin - if decode: - if not isinstance(decode, FunctionType): - raise ValueError("Bad 'decode' function") - _set_exc("%sDecodeError" % exc_name(ename)) # create the custom decode exception as a builtin - if not encode and not decode: - raise ValueError("At least one en/decoding function must be defined") - for exc in kwargs.get('extra_exceptions', []): - _set_exc(exc) # create additional custom exceptions as builtins - glob = currentframe().f_back.f_globals - # search function for the new encoding - @_with_repr(ename) - def getregentry(encoding): - if encoding != ename and not (pattern and re.match(pattern, encoding)): - return - fenc, fdec, name = encode, decode, encoding - # prepare CodecInfo input arguments - if pattern: - m, args, i = re.match(pattern, encoding), [], 1 - try: - while True: - try: - g = m.group(i) or "" - if g.isdigit() and not g.startswith("0") and "".join(set(g)) != "01": - g = int(g) - args += [g] - i += 1 - except AttributeError: - # this occurs when m is None or there is an error in fenc(g) or fdec(g), meaning no match - if m is not None: - raise - return - except IndexError: - # this occurs while m is not None, but possibly no capture group that gives at least 1 group index ; - # in this case, if fenc/fdec is a decorated function, execute it with no arg - if len(args) == 0: - if fenc and 
len(getfullargspec(fenc).args) == 1: - fenc = fenc() - if fdec and len(getfullargspec(fdec).args) == 1: - fdec = fdec() - else: - fenc = fenc(*args) if fenc else fenc - fdec = fdec(*args) if fdec else fdec - if fenc: - fenc = fix_inout_formats(fenc) - if fdec: - fdec = fix_inout_formats(fdec) - sl, sc = kwargs.pop('strip_lines', False), kwargs.pop('strip_crlf', False) - if sl or sc: - def _striplines(f): - def __wrapper(input, *a, **kw): - return f(_stripl(input, sc, sl), *a, **kw) - return __wrapper - # this fixes issues with wrapped encoded inputs - fdec = _striplines(fdec) - - class Codec(codecs.Codec): - def encode(self, input, errors="strict"): - if fenc is None: - raise NotImplementedError - return fenc(input, errors) - - def decode(self, input, errors="strict"): - if fdec is None: - raise NotImplementedError - return fdec(input, errors) - - class IncrementalEncoder(codecs.IncrementalEncoder): - def encode(self, input, final=False): - if fenc is None: - raise NotImplementedError - return b(fenc(input, self.errors)[0]) - - class IncrementalDecoder(codecs.IncrementalDecoder): - def decode(self, input, final=False): - if fdec is None: - raise NotImplementedError - return ensure_str(fdec(input, self.errors)[0]) - - class StreamWriter(Codec, codecs.StreamWriter): - charbuffertype = bytes - - class StreamReader(Codec, codecs.StreamReader): - charbuffertype = bytes - - ci = codecs.CodecInfo( - name=name, - encode=Codec().encode, - decode=Codec().decode, - incrementalencoder=IncrementalEncoder, - incrementaldecoder=IncrementalDecoder, - streamwriter=StreamWriter, - streamreader=StreamReader, - _is_text_encoding=text, - ) - ci.parameters = kwargs - ci.parameters['name'] = ename - ci.parameters['add_to_codecs'] = add_to_codecs - ci.parameters['pattern'] = pattern - ci.parameters['text'] = text - f = glob.get('__file__', os.path.join("custom", "_")) - cat = f.split(os.path.sep)[-2].rstrip("s") - if cat not in CODECS_CATEGORIES: - CODECS_CATEGORIES.append(cat) - 
ci.parameters['category'] = kwargs.get('category', cat) - ci.parameters['examples'] = kwargs.get('examples', glob.get('__examples__')) - ci.parameters['guess'] = kwargs.get('guess', glob.get('__guess__', [ename])) or [] - ci.parameters['module'] = kwargs.get('module', glob.get('__name__')) - ci.parameters.setdefault("scoring", {}) - for attr in ["bonus_func", "entropy", "expansion_factor", "len_charset", "penalty", "printables_rate", - "padding_char", "transitive"]: - a = kwargs.pop(attr, None) - if a is not None: - ci.parameters['scoring'][attr] = a - return ci - - getregentry.__name__ = re.sub(r"[\s\-]", "_", ename) - if kwargs.get('aliases'): - getregentry.__aliases__ = list(map(lambda n: re.sub(r"[\s\-]", "_", n), kwargs['aliases'])) - getregentry.__pattern__ = pattern - register(getregentry, add_to_codecs) - return getregentry - - -def add_macro(mname, *encodings): - """ This allows to define a macro, chaining multiple codecs one after the other. This relies on a default set of - macros from a YAML file embedded in the package and a local YAML file from the home folder that takes - precedence for defining personal macros. 
- - :param mname: macro name - :param encodings: encoding names of the encodings to be chained with the macro - """ - global PERS_MACROS - # check for name clash with alreday existing macros and codecs - if mname in MACROS or mname in PERS_MACROS: - raise ValueError("Macro name already exists") - try: - ci = lookup(mname, False) - raise ValueError("Macro name clashes with codec '%s'" % ci.name) - except LookupError: - pass - try: - PERS_MACROS[mname] = encodings - CodecMacro(mname) - with open(PERS_MACROS_FILE, 'w') as f: - json.dump(PERS_MACROS, f, indent=2) - except ValueError: - del PERS_MACROS[mname] - raise -codecs.add_macro = add_macro - - -def add_map(ename, encmap, repl_char="?", sep="", ignore_case=None, no_error=False, intype=None, outype=None, **kwargs): - """ This adds a new mapping codec (that is, declarable with a simple character mapping dictionary) to the codecs - module dynamically setting its encode and/or decode functions, eventually dynamically naming the encoding with - a pattern and with file handling (if text is True). 
- - :param ename: encoding name - :param encmap: characters encoding map ; can be a dictionary of encoding maps (for use with the first capture - group of the regex pattern) or a function building the encoding map - :param repl_char: replacement char (used when errors handling is set to "replace") - :param sep: string of possible character separators (hence, only single-char separators are considered) ; - - while encoding, the first separator is used - - while decoding, separators can be mixed in the input text - :param ignore_case: ignore text case while encoding and/or decoding - :param no_error: this encoding triggers no error (hence, always in "leave" errors handling) - :param intype: specify the input type for pre-transforming the input text - :param outype: specify the output type for post-transforming the output text - :param pattern: pattern for dynamically naming the encoding - :param text: specify whether the codec is a text encoding - :param add_to_codecs: also add the search function to the native registry - NB: this will make the codec available in the built-in open(...) but will make it impossible - to remove the codec later - """ - outype = outype or intype - if ignore_case not in [None, "encode", "decode", "both"]: - raise ValueError("Bad ignore_case parameter while creating encoding map") - if intype not in [None, "str", "bin", "ord"]: - raise ValueError("Bad input type parameter while creating encoding map") - if outype not in [None, "str", "bin", "ord"]: - raise ValueError("Bad output type parameter while creating encoding map") - - def __generic_code(decode=False): - def _wrapper(param): - """ The parameter for wrapping comes from the encoding regex pattern ; e.g. - [no pattern] => param will be None everytime - r"barbie[-_]?([1-4])$" => param could be int 1, 2, 3 or 4 - r"^morse(|[-_]?.{3})$" => param could be None, "-ABC" (for mapping to ".-/") - - In order of precedence: - 1. 
when param is a key in mapdict or mapdict is a list of encoding maps (hence in the case of "barbie...", - param MUST be an int, otherwise for the first case it could clash with a character of the encoding map) - 2. otherwise handle it as a new encoding character map "ABC" translates to ".-/" for morse - """ - p = param - if isinstance(encmap, FunctionType): - mapdict = encmap(p) - p = None - else: - mapdict = encmap - if isinstance(mapdict, dict): - smapdict = {k: v for k, v in mapdict.items()} - elif isinstance(mapdict, list) and isinstance(mapdict[0], dict): - smapdict = {k: v for k, v in mapdict[0].items()} - else: - raise ValueError("Bad mapping dictionary or list of mapping dictionaries") - if p is not None: - # case 1: param is empty string - if p == "": - if isinstance(mapdict, list): - smapdict = {k: v for k, v in mapdict[0].items()} - elif isinstance(mapdict, dict): - if '' in mapdict.keys() and isinstance(mapdict[''], dict): - smapdict = {k: v for k, v in mapdict[''].items()} - else: - smapdict = {k: v for k, v in mapdict.items()} - # no 'else' handling a LookupError here ; this case is covered by the first if/elif/else block - # case 2: list or dictionary or dictionary of numbered encodings - elif isinstance(p, int): - # if mapdict is a list, we shall align the parameter (starting from 1) as an index (starting from 0) - if isinstance(mapdict, list): - p -= 1 - if isinstance(mapdict, list) and 0 <= p < len(mapdict) or \ - isinstance(mapdict, dict) and p in mapdict.keys(): - smapdict = {k: v for k, v in mapdict[p].items()} - else: - raise LookupError("Bad parameter for encoding '{}': '{}'".format(ename, p)) - # case 3: dictionary of regex-selected encoding mappings - elif isinstance(mapdict, dict) and isinstance(list(mapdict.values())[0], dict): - tmp = None - for r, d in mapdict.items(): - if r == '': # this is already handled in case 1 ; anyway, an empty regex always matches, hence - continue # it must be excluded - if re.match(r, p): - tmp = d - break - 
if tmp is None: - raise LookupError("Bad parameter for encoding '{}': '{}'".format(ename, p)) - smapdict = tmp - # case 4: encoding characters translation - else: - # collect base tokens in order of appearance in the mapping dictionary - base_tokens = "" - for _, c in sorted(mapdict.items()): - for t in c: - for st in t: - if st not in base_tokens: - base_tokens += st - if " " not in sep: - base_tokens = base_tokens.replace(" ", "") - if len(p) > 0 and p[0] in "-_" and len(p[1:]) == len(set(p[1:])) == len(base_tokens): - p = p[1:] - if len(p) == len(set(p)) == len(base_tokens): - t = maketrans(base_tokens, p) - for k, v in smapdict.items(): - smapdict[k] = [x.translate(t) for x in v] if isinstance(v, list) else v.translate(t) - else: - raise LookupError("Bad parameter for encoding '{}': '{}'".format(ename, p)) - if ignore_case is not None: - cases = ["upper", "lower"] - case_d = cases[any(c in str(list(smapdict.values())) for c in "abcdefghijklmnopqrstuvwxyz")] - case_e = cases[any(c in str(list(smapdict.keys())) for c in "abcdefghijklmnopqrstuvwxyz")] - i = ignore_case - smapdict = {getattr(k, case_e)() if i in ["both", "encode"] else k: \ - ([getattr(x, case_d)() for x in v] if isinstance(v, list) else getattr(v, case_d)()) \ - if i in ["both", "decode"] else v for k, v in smapdict.items()} - if decode: - tmp = {} - # this has a meaning for encoding maps that could have clashes in encoded chars (e.g. 
Bacon's cipher ; - # I => abaaa but also J => abaaa, with the following, we keep I instead of letting J overwrite it) - for k, v in sorted(smapdict.items()): - if not isinstance(v, list): - v = [v] - for x in v: - if x not in tmp.keys(): - tmp[x] = k - smapdict, cs = tmp, reduce(lambda acc, x: acc + x, tmp.keys()) - kwargs['strip_lines'], kwargs['strip_crlf'] = "\n" not in set(cs), "\r\n" not in cs - # this allows to avoid an error with Python2 in the "for i, c in enumerate(parts)" loop - if '' not in smapdict.keys(): - smapdict[''] = "" - # determine token and result lengths - tmaxlen = max(map(len, smapdict.keys())) - tminlen = max(1, min(map(len, set(smapdict.keys()) - {''}))) - l = [] - for x in smapdict.values(): - getattr(l, ["append", "extend"][isinstance(x, list)])(x) - rminlen = max(1, min(map(len, set(l) - {''}))) - - # generic encoding/decoding function for map encodings - def code(text, errors="strict"): - icase = ignore_case == "both" or \ - decode and ignore_case == "decode" or \ - not decode and ignore_case == "encode" - if icase: - case = case_d if decode else case_e - if no_error: - errors = "leave" - text = ensure_str(text) - if not decode: - if intype == "bin": - text = "".join("{:0>8}".format(bin(ord(c))[2:]) for c in text) - elif intype == "ord": - text = "".join(str(ord(c)).zfill(3) for c in text) - r = "" - lsep = "" if decode else sep if len(sep) <= 1 else sep[0] - kind = ["character", "token"][tmaxlen > 1] - error_func = handle_error(ename, errors, lsep, repl_char, rminlen, decode, kind) - - # get the value from the mapping dictionary, trying the token with its inverted case if relevant - def __get_value(token, position, case_changed=False): - try: - result = smapdict[token] - except KeyError: - if icase and not case_changed: - token_inv_case = getattr(token, case)() - return __get_value(token_inv_case, position, True) - return error_func(token, position) - if isinstance(result, list): - result = result[0] - return result + lsep - - # if a 
separator is defined, rely on it by splitting the input text - if decode and len(sep) > 0: - for i, c in enumerate(re.split("[" + sep + "]", text)): - r += __get_value(c, i) - # otherwise, move through the text using a cursor for tokenizing it ; this allows defining more complex - # encodings with variable token lengths - else: - cursor, bad = 0, "" - while cursor < len(text): - token = text[cursor:cursor+1] - for l in range(tminlen, tmaxlen + 1): - token = text[cursor:cursor+l] - if token in smapdict.keys() or icase and getattr(token, case)() in smapdict.keys(): - r += __get_value(token, cursor) - cursor += l - break - else: - # collect bad chars and only move the cursor one char to the right - bad += text[cursor] - cursor += 1 - # if the number of bad chars is the minimum token length, consume it and start a new buffer - if len(bad) == tminlen or errors == "leave": - posn = cursor - len(bad) - r += error_func(bad, posn) - bad = "" - if decode: - if outype in ["bin", "ord"]: - tmp, r = "", r.replace(lsep, "") - step = [3, 8][outype == "bin"] - for i in range(0, len(r), step): - s = r[i:i+step] - try: - tmp += chr(int(s, 2) if outype == "bin" else int(s)) - except ValueError: - if len(s) > 0: - tmp += "[" + s + "]" - r = tmp + lsep - return r[:len(r)-len(lsep)], len(b(text)) - return code - if re.search(r"\([^(?:)]", kwargs.get('pattern', "")) is None: - # in this case, there is no capturing group for parametrization - return _wrapper(None) - return _wrapper - - glob = currentframe().f_back.f_globals - kwargs['category'] = glob['__file__'].split(os.path.sep)[-2].rstrip("s") - kwargs['examples'] = kwargs.get('examples', glob.get('__examples__')) - kwargs['encmap'] = encmap - kwargs['repl_char'] = repl_char - kwargs['sep'] = sep - kwargs['ignore_case'] = ignore_case - kwargs['no_error'] = no_error - kwargs['intype'] = intype - kwargs['outype'] = outype - kwargs['module'] = glob.get('__name__') - try: - if isinstance(encmap, dict): - smapdict = {k: v for k, v in 
encmap.items()} - elif isinstance(encmap, list) and isinstance(encmap[0], dict): - smapdict = {k: v for k, v in encmap[0].items()} - kwargs['repl_minlen'] = i = max(1, min(map(len, set(smapdict.values()) - {''}))) - kwargs['repl_minlen_b'] = max(1, min(map(len, map(b, set(smapdict.values()) - {''})))) - except: - pass - return add(ename, __generic_code(), __generic_code(True), **kwargs) -codecs.add_map = add_map - - -def clear(): - """ Clear codext's local registry of search functions. """ - global __codecs_registry, MACROS, PERS_MACROS - __codecs_registry, MACROS, PERS_MACROS = [], {}, {} -codecs.clear = clear - - -def examples(encoding, number=10): - """ Use the search function to get the matching encodings and provide examples of valid encoding names. """ - e = [] - for name in search(encoding): - for search_function in __codecs_registry: - n = search_function.__name__ - if name in [n, n.replace("_", "-")]: - temp = [] - for s in generate_strings_from_regex(search_function.__pattern__, yield_max=16*number): - temp.append(s) - random.shuffle(temp) - i = 0 - while i < min(number, len(temp)): - if not temp[i].isdigit(): - try: - lookup(temp[i], False) - e.append(temp[i]) - except LookupError: - pass - i += 1 - for alias, codec in ALIASES.items(): - if name == codec: - if codec not in e: - e.append(codec) - if not alias.isdigit(): - e.append(alias) - random.shuffle(e) - return sorted([e[i] for i in range(min(number, len(e)))], key=_human_keys) -codecs.examples = examples - - -def is_native(encoding): - """ Determine if a given encoding is native or not. """ - return lookup(encoding, False).parameters['category'] == "native" - - -def list_categories(): - """ Get a list of all codec categories. 
""" - c = CODECS_CATEGORIES - root = os.path.dirname(__file__) - for d in os.listdir(root): - if os.path.isdir(os.path.join(root, d)) and not d.startswith("__"): - c.append(d.rstrip("s")) - # particular category, hardcoded from base/_base.py - c += ["base-generic"] - return c -list_categories() - - -def list_encodings(*categories): - """ Get a list of all codecs. """ - # if "non-native" is in the input list, extend the list with the whole categories but "native" - categories, exclude = list(categories), [] - for c in categories[:]: - if c == "non-native": - for c in CODECS_CATEGORIES: - if c == "native" or c in categories: - continue - categories.append(c) - categories.remove("non-native") - if c.startswith("~"): - exclude.append(c[1:]) - categories.remove(c) - try: - categories.remove(c[1:]) - except ValueError: - pass - # now, filter codecs according to the input list of categories - enc = [] - if (len(categories) == 0 or "native" in categories) and "native" not in exclude: - for a in set(ALIASES.values()): - try: - ci = __orig_lookup(a) - except LookupError: - continue - if lookup(a) is ci: - enc.append(ci.name) - for search_function in CODECS_OVERWRITTEN + __codecs_registry: - name = search_function.__name__.replace("_", "-") - p = search_function.__pattern__ - ci = search_function(name) if p is None else search_function(generate_string_from_regex(p)) - c = "other" if ci is None else ci.parameters['category'] - if (len(categories) == 0 or c in categories) and c not in exclude: - enc.append(name) - for category in categories: - if category not in CODECS_CATEGORIES: - raise ValueError("Category '%s' does not exist" % category) - return sorted(list(set(enc)), key=_human_keys) - - -def list_macros(): - """ Get a list of all macros, with the precedence on personal ones. 
""" - return sorted(list(set(list(MACROS.keys()) + list(PERS_MACROS.keys())))) - - -def remove(name): - """ Remove all search functions matching the input encoding name from codext's local registry or any macro with the - given name. """ - global __codecs_registry, MACROS, PERS_MACROS - tbr = [] - for search_function in __codecs_registry: - if search_function(name) is not None: - tbr.append(search_function) - for search_function in tbr: - __codecs_registry.remove(search_function) - try: - del MACROS[name] - except KeyError: - pass - try: - del PERS_MACROS[name] - with open(PERS_MACROS_FILE, 'w') as f: - json.dump(PERS_MACROS, f, indent=2) - except KeyError: - pass - try: - del CODECS_CACHE[name] - except KeyError: - pass - for s in ["En", "De"]: - try: - delattr(builtins, "%s%scodeError" % (name.capitalize(), s)) - except AttributeError: - pass -codecs.remove = remove - - -def reset(): - """ Reset codext's local registry of search functions and macros. """ - global __codecs_registry, CODECS_REGISTRY, MACROS, PERS_MACROS - clear() - d = os.path.dirname(__file__) - for pkg in sorted(os.listdir(d)): - if pkg.startswith("_") or not os.path.isdir(os.path.join(d, pkg)): - continue - reload(import_module("codext." + pkg)) - # backup codext's registry - if CODECS_REGISTRY is None: - CODECS_REGISTRY = __codecs_registry[:] - # restore codext's registry - else: - __codecs_registry = CODECS_REGISTRY[:] - # restore codext's embedded set of macros - with open(os.path.join(os.path.dirname(__file__), "macros.json")) as f: - MACROS = json.load(f) - # reload personal set of macros - PERS_MACROS = {} - if os.path.exists(PERS_MACROS_FILE): - with open(PERS_MACROS_FILE) as f: - PERS_MACROS = json.load(f) -codecs.reset = reset - - -# conversion functions -def b(s): - """ Non-crashing bytes conversion function. 
""" - if PY3: - try: - return s.encode("latin-1") - except: - pass - try: - return s.encode("utf-8") - except: - pass - return s - - -def ensure_str(s, encoding='utf-8', errors='strict'): - """ Similar to six.ensure_str. Adapted here to avoid messing up with six version errors. """ - if not PY3 and isinstance(s, text_type): - return s.encode(encoding, errors) - elif PY3 and isinstance(s, binary_type): - try: - return s.decode(encoding, errors) - except: - return s.decode("latin-1") - return s - - -# make conversion functions compatible with input/output strings/bytes -def fix_inout_formats(f): - """ This decorator ensures that the first output of f will have the same text format as the first input (str or - bytes). """ - @wraps(f) - def _wrapper(*args, **kwargs): - a0 = args[0] - a0_isb = isb(a0) - a0 = ensure_str(a0) if iss(a0) or a0_isb else a0 - r = f(a0, *args[1:], **kwargs) - # special case: input is in bytes ; ensure that the returned length is this of the bytes, not this processed by - # the decode/encode function - if isinstance(r, (tuple, list)) and isinstance(r[1], int) and a0_isb: - r = tuple([list(r)[0]] + [len(args[0])] + list(r)[2:]) - return (fix(r[0], args[0]), ) + r[1:] if isinstance(r, (tuple, list)) else fix(r, args[0]) - return _wrapper - - -# alphabet generation function from a given mask -def get_alphabet_from_mask(mask): - """ This function generates an alphabet from the given mask. The style used is similar to Hashcat ; group keys are - marked with a heading "?". """ - i, alphabet = 0, "" - while i < len(mask): - c = mask[i] - if c == "?" 
and i < len(mask) - 1 and mask[i+1] in MASKS.keys(): - for c in MASKS[mask[i+1]]: - if c not in alphabet: - alphabet += c - i += 1 - elif c not in alphabet: - alphabet += c - i += 1 - return alphabet - - -# generic error handling function -def handle_error(ename, errors, sep="", repl_char="?", repl_minlen=1, decode=False, kind="character", item="position"): - """ This shortcut function allows to handle error modes given some tuning parameters. - - :param ename: encoding name - :param errors: error handling mode - :param sep: token separator - :param repl_char: replacement character (for use when errors="replace") - :param repl_minlen: repeat number for the replacement character - :param decode: whether we are encoding or decoding - :param item: position item description (for describing the error ; e.g. "group" or "token") - """ - exc = "%s%scodeError" % (exc_name(ename), ["En", "De"][decode]) - - def _handle_error(token, position, output="", eename=None): - """ This handles an encoding/decoding error according to the selected handling mode. - - :param token: input token to be encoded/decoded - :param position: token position index - :param output: output, as decoded up to the position of the error - """ - if errors == "strict": - msg = "'%s' codec can't %scode %s '%s' in %s %d" - token = ensure_str(token) - token = token[:7] + "..." 
if len(token) > 10 else token - err = getattr(builtins, exc)(msg % (eename or ename, ["en", "de"][decode], kind, token, item, position)) - err.output = output - err.__cause__ = err - raise err - elif errors == "leave": - return token + sep - elif errors == "replace": - return repl_char * repl_minlen + sep - elif errors == "ignore": - return "" - else: - raise ValueError("Unsupported error handling '{}'".format(errors)) - return _handle_error - - -# codecs module hooks -__orig_lookup = _codecs.lookup -__orig_register = _codecs.register - - -def __add(ename, encode=None, decode=None, pattern=None, text=True, **kwargs): - kwargs.pop('add_to_codecs', None) - return add(ename, encode, decode, pattern, text, True, **kwargs) -__add.__doc__ = add.__doc__ -codecs.add = __add - - -def decode(obj, encoding='utf-8', errors='strict'): - """ Custom decode function relying on the hooked lookup function. """ - return lookup(encoding).decode(obj, errors)[0] -codecs.decode = decode - - -def encode(obj, encoding='utf-8', errors='strict'): - """ Custom encode function relying on the hooked lookup function. """ - n, m = 1, re.search(r"\[(\d+)\]$", encoding) - if m: - n = int(m.group(1)) - encoding = re.sub(r"\[(\d+)\]$", "", encoding) - ci = lookup(encoding) - for i in range(n): - obj = ci.encode(obj, errors)[0] - return obj -codecs.encode = encode - - -def lookup(encoding, macro=True): - """ Hooked lookup function for searching first for codecs in the local registry of this module. 
""" - # first, try to match the given encoding with codecs' search functions - for search_function in CODECS_OVERWRITTEN + __codecs_registry: - codecinfo = search_function(encoding) - if codecinfo is not None: - return codecinfo - # then, if a codec name was given, generate an encoding name from its pattern and get the CodecInfo - for search_function in CODECS_OVERWRITTEN + __codecs_registry: - if search_function.__name__.replace("_", "-") == encoding or \ - encoding in getattr(search_function, "__aliases__", []): - codecinfo = search_function(generate_string_from_regex(search_function.__pattern__)) - if codecinfo is not None: - return codecinfo - # finally, get a CodecInfo with the original lookup function and refine it with a dictionary of parameters - try: - ci = __orig_lookup(encoding) - ci.parameters = {'category': "native", 'module': "codecs", 'name': ALIASES.get(ci.name, ci.name)} - return ci - except LookupError: - if not macro: - raise - try: - return CodecMacro(encoding) - except LookupError: - e = LookupError("unknown encoding: %s" % encoding) - e.__cause__ = e # stop exception chaining - raise e -codecs.lookup = lookup - - -def register(search_function, add_to_codecs=False): - """ Register function for registering new codecs in the local registry of this module and, if required, in the - native codecs registry (for use with the built-in 'open' function). - - :param search_function: search function for the codecs registry - :param add_to_codecs: also add the search function to the native registry - NB: this will make the codec available in the built-in open(...) 
but will make it impossible - to remove the codec later - """ - if search_function not in __codecs_registry: - try: - __orig_lookup(search_function.__name__) - l = CODECS_OVERWRITTEN - except LookupError: - l = __codecs_registry - l.append(search_function) - if add_to_codecs: - __orig_register(search_function) - - -def __register(search_function): - """ Same as register(...), but with add_to_codecs set by default to True. """ - register(search_function, True) -codecs.register = __register - - -def search(encoding_regex, extended=True): - """ Function similar to lookup but allows to search for an encoding based on a regex instead. It searches this way - into the local registry but also tries a simple lookup with the original lookup function. """ - matches = [] - for search_function in CODECS_OVERWRITTEN + __codecs_registry: - n = search_function.__name__ - for name in [n, n.replace("_", "-")]: - if re.search(encoding_regex, name): - matches.append(n.replace("_", "-")) - continue - if extended: - # in some cases, encoding_regex can match a generated string that uses a particular portion of its - # generating pattern ; e.g. 
we expect encoding_regex="uu_" to find "uu" and "uu_codec" while it can also - # find "morse" or "atbash" very rarely because of their dynamic patterns and the limited number of randomly - # generated strings - # so, we can use a qualified majority voting to ensure we do not get a "junk" encoding in the list of - # matches ; executing 5 times the string generation for a given codec but adding the codec to the list of - # matches only if we get at least 3 matches ensures that we consider up to 2 failures that could be - # stochastic, therefore drastically decreasing the probability to get a "junk" encoding in the matches list - c = 0 - for i in range(5): - for s in generate_strings_from_regex(search_function.__pattern__): - if re.search(encoding_regex, s): - c += 1 - break - if c >= 3: - matches.append(n) - break - for s, n in ALIASES.items(): - if re.search(encoding_regex, s) or re.search(encoding_regex, n): - matches.append(n) - return sorted(list(set(matches)), key=_human_keys) -codecs.search = search - - -# utility function for the search feature -CATEGORIES = { - 'digit': digits, - 'not_digit': reduce(lambda x, c: x.replace(c, ""), digits, printable), - 'space': whitespace, - 'not_space': reduce(lambda x, c: x.replace(c, ""), whitespace, printable), - 'word': ascii_letters + digits + '_', - 'not_word': reduce(lambda x, c: x.replace(c, ""), ascii_letters + digits + '_', printable), -} -REPEAT_MAX = 10 -STAR_PLUS_MAX = 10 -YIELD_MAX = 100 - - -def __gen_str_from_re(regex, star_plus_max, repeat_max, yield_max, parsed=False): - """ Recursive function to generate strings from a regex pattern. 
""" - if regex is None: - return - __groups = {} - tokens = [] - negate, last_rand = False, None - for state in (regex if parsed else re.sre_parse.parse(b(getattr(regex, "pattern", regex)))): - code = getattr(state[0], "name", state[0]).lower() - value = getattr(state[1], "name", state[1]) - value = value.lower() if isinstance(value, str) else value - if code in ["assert_not", "at"]: - continue - elif code == "any": - charset = list(printable.replace("\n", "")) - while charset[0] == last_rand and len(charset) > 1: - random.shuffle(charset) - last_rand = charset[0] - tokens.append(charset) # should be ord(x) with x belongs to [0, 256[ - elif code == "assert": - tokens.append(list(__gen_str_from_re(value[1], star_plus_max, repeat_max, yield_max, True))) - elif code == "branch": - result = [] - for r in value[1]: - result += list(__gen_str_from_re(r, star_plus_max, repeat_max, yield_max, True)) or [""] - tokens.append(result) - elif code == "category": - charset = list(CATEGORIES[value[9:]]) - if negate: - negate = False - charset = list(set(printable).difference(charset)) - while charset[0] == last_rand and len(charset) > 1: - random.shuffle(charset) - last_rand = charset[0] - tokens.append(charset) - elif code == "groupref": - tokens.extend(__groups[value]) - elif code == "in": - subtokens = list(__gen_str_from_re(value, star_plus_max, repeat_max, yield_max, True)) - subtokens = [x for l in subtokens for x in l] - tokens.append(subtokens) - elif code == "literal": - tokens.append(chr(value)) - elif code in ["max_repeat", "min_repeat"]: - start, end = value[:2] - end = min(end, star_plus_max) - start = min(start, end) - charset = list(__gen_str_from_re(value[-1], star_plus_max, repeat_max, yield_max, True)) - subtokens = [] - if start == 0 and end == 1: - subtokens.append("") - subtokens.extend(charset) - elif len(charset) ** end > repeat_max: - for i in range(min(repeat_max, 10 * len(charset))): - n = random.randint(start, end + 1) - token = "" if n == 0 else 
"".join(random.choice(charset) for i in range(n)) - if token not in subtokens: - subtokens.append(token) - else: - i -= 1 - else: - for n in range(start, end + 1): - for c in product(charset, repeat=n): - subtokens.append("".join(c)) - tokens.append(subtokens) - elif code == "negate": - negate = True - elif code == "not_literal": - charset = list(printable.replace(chr(value), "")) - while charset[0] == last_rand and len(charset) > 1: - random.shuffle(charset) - last_rand = charset[0] - tokens.append(charset) - elif code == "range": - tokens.append("".join(chr(i) for i in range(value[0], value[1] + 1))) - elif code == "subpattern": - result = list(__gen_str_from_re(value[-1], star_plus_max, repeat_max, yield_max, True)) - if value[0]: - __groups[value[0]] = result - tokens.append(result) - else: - raise NotImplementedError("Unhandled code '{}'".format(code)) - if len(tokens) == 0: - tokens = [""] - i = 0 - for result in product(*tokens): - yield "".join(result) - i += 1 - if i >= yield_max: - break - - -def _human_keys(text): - """ Sorting function for considering strings with numbers (e.g. base2, base10, base100) """ - tokens = [] - for s in re.split(r"(\d+|\D+)", text): - tokens.append(int(s) if s.isdigit() else s) - return tokens - - -def generate_string_from_regex(regex): - """ Utility function to generate a single string from a regex pattern. """ - if regex: - return list(generate_strings_from_regex(regex, yield_max=1))[0] - - -def generate_strings_from_regex(regex, star_plus_max=STAR_PLUS_MAX, repeat_max=REPEAT_MAX, yield_max=YIELD_MAX): - """ Utility function to generate strings from a regex pattern. 
""" - i = 0 - for result in __gen_str_from_re(regex, star_plus_max, repeat_max, yield_max): - yield result - - -# guess feature objects -__module_exists = lambda n: n in [x[1] for x in iter_modules()] -stopfunc = ModuleType("stopfunc", """ - Predefined stop functions - ~~~~~~~~~~~~~~~~~~~~~~~~~ - - This submodule contains stop functions for the guess feature of codext. - - - `flag`: searches for the pattern "[Ff][Ll1][Aa4@][Gg9]" (either UTF-8 or UTF-16) - - `lang_**`: checks if the given lang (any from the PROFILES_DIRECTORY of the langdetect module) is detected - - `printables`: checks that every output character is in the set of printables - - `regex`: takes one argument, the regular expression, for checking a string against the given pattern - - `text`: checks for printables and an entropy less than 4.6 (empirically determined) -""") -stopfunc.printables = lambda s: all(c in printable for c in ensure_str(s)) -stopfunc.printables.__name__ = stopfunc.printables.__qualname__ = "printables" -stopfunc.regex = lambda p: lambda s: re.search(p, ensure_str(s)) is not None -stopfunc.regex.__name__ = stopfunc.regex.__qualname__ = "regex" -stopfunc.text = lambda s: stopfunc.printables(s) and entropy(s) < 4.6 -stopfunc.text.__name__ = stopfunc.text.__qualname__ = "text" -stopfunc.flag = lambda x: re.search(r"[Ff][Ll1][Aa4@][Gg96]", ensure_str(x)) is not None -stopfunc.flag.__name__ = stopfunc.flag.__qualname__ = "flag" -stopfunc.default = stopfunc.text - -stopfunc.LANG_BACKEND = None -stopfunc.LANG_BACKENDS = [n for n in ["pycld2", "langdetect", "langid", "cld3", "textblob"] if __module_exists(n)] -if len(stopfunc.LANG_BACKENDS) > 0: - stopfunc.LANG_BACKEND = stopfunc.LANG_BACKENDS[0] -if "cld3" in stopfunc.LANG_BACKENDS: - stopfunc.CLD3_LANGUAGES = "af|am|ar|bg|bn|bs|ca|ce|co|cs|cy|da|de|el|en|eo|es|et|eu|fa|fi|fr|fy|ga|gd|gl|gu|ha|" \ - "hi|hm|hr|ht|hu|hy|id|ig|is|it|iw|ja|jv|ka|kk|km|kn|ko|ku|ky|la|lb|lo|lt|lv|mg|mi|mk|" \ - 
"ml|mn|mr|ms|mt|my|ne|nl|no|ny|pa|pl|ps|pt|ro|ru|sd|si|sk|sl|sm|sn|so|sq|sr|st|su|sv|" \ - "sw|ta|te|tg|th|tr|uk|ur|uz|vi|xh|yi|yo|zh|zu".split("|") -if "textblob" in stopfunc.LANG_BACKENDS: - stopfunc.TEXTBLOB_LANGUAGES = "af|ar|az|be|bg|bn|ca|cs|cy|da|de|el|en|eo|es|et|eu|fa|fi|fr|ga|gl|gu|hi|hr|ht|hu|" \ - "id|is|it|iw|ja|ka|kn|ko|la|lt|lv|mk|ms|mt|nl|no|pl|pt|ro|ru|sk|sl|sq|sr|sv|sw|ta|" \ - "te|th|tl|tr|uk|ur|vi|yi|zh".split("|") - - -def _detect(text): - _lb, t = stopfunc.LANG_BACKEND, ensure_str(text) - if _lb is None: - raise ValueError("No language backend %s" % ["selected", "installed"][len(stopfunc.LANG_BACKENDS) == 0]) - return langid.classify(t)[0] if _lb == "langid" else \ - langdetect.detect(t) if _lb == "langdetect" else \ - pycld2.detect(t)[2][0][1] if _lb == "pycld2" else \ - cld3.get_language(t).language[:2] if _lb == "cld3" else \ - textblob.TextBlob(t).detect_language()[:2] - - -def _lang(lang): - def _test(s): - if not stopfunc.text(s): - return False - try: - return _detect(ensure_str(s))[:2] == lang - except: - return False - return _test - - -def _load_lang_backend(backend=None): - # import the requested backend library if not imported yet - if backend is None or backend in stopfunc.LANG_BACKENDS: - stopfunc.LANG_BACKEND = backend - if backend: - globals()[backend] = __import__(backend) - else: - raise ValueError("Unsupported language detection backend") - # remove language-related stop functions - for attr in dir(stopfunc): - if attr.startswith("_") or not isinstance(getattr(stopfunc, attr), FunctionType): - continue - if re.match(r"lang_[a-z]{2}$", attr): - delattr(stopfunc, attr) - # rebind applicable language-related stop functions - if stopfunc.LANG_BACKEND: - _lb = stopfunc.LANG_BACKEND - if _lb == "langid": - langid.langid.load_model() - for lang in ( - langid.langid.identifier.nb_classes if _lb == "langid" else \ - list(set(p[:2] for p in os.listdir(langdetect.PROFILES_DIRECTORY))) if _lb == "langdetect" else \ - list(set(x[1][:2] 
for x in pycld2.LANGUAGES if x[0] in pycld2.DETECTED_LANGUAGES)) if _lb == "pycld2" else \ - stopfunc.CLD3_LANGUAGES if _lb == "cld3" else \ - stopfunc.TEXTBLOB_LANGUAGES if _lb == "textblob" else \ - []): - n = "lang_%s" % lang - setattr(stopfunc, n, _lang(lang)) - getattr(stopfunc, n).__name__ = getattr(stopfunc, n).__qualname__ = n - if LANG: - flng = "lang_%s" % LANG - if getattr(stopfunc, flng, None): - stopfunc.default = getattr(stopfunc, flng) -stopfunc._reload_lang = _load_lang_backend - - -def _validate(stop_function, lang_backend="none"): - s, lb = stop_function, lang_backend - if isinstance(s, string_types): - if re.match(r"lang_[a-z]{2}$", s) and lb != "none" and \ - all(re.match(r"lang_[a-z]{2}$", x) is None for x in dir(stopfunc)): - stopfunc._reload_lang(lb) - f = getattr(stopfunc, s, None) - if f: - return f - elif not isinstance(s, FunctionType): - raise ValueError("Bad stop function") - return s -stopfunc._validate = _validate - - -def __guess(prev_input, input, stop_func, depth, max_depth, min_depth, encodings, result, found=(), - stop=True, show=False, scoring_heuristic=False, extended=False, debug=False): - """ Perform a breadth-first tree search using a ranking logic to select and prune the list of codecs. 
""" - if depth > min_depth and stop_func(input): - if not stop and (show or debug) and found not in result: - s = repr(input) - s = s[2:-1] if s.startswith("b'") and s.endswith("'") else s - s = "[+] %s: %s" % (", ".join(found), s) - print(s if len(s) <= 80 else s[:77] + "...") - result[found] = input - if depth >= max_depth or len(result) > 0 and stop: - return - prev_enc = found[-1] if len(found) > 0 else "" - e = encodings.get(depth, encodings.get(-1, [])) - for new_input, encoding in __rank(prev_input, input, prev_enc, e, scoring_heuristic, extended): - if len(result) > 0 and stop: - return - if debug: - print("[*] Depth %0{}d/%d: %s".format(len(str(max_depth))) % (depth+1, max_depth, encoding)) - __guess(input, new_input, stop_func, depth+1, max_depth, min_depth, encodings, result, found + (encoding, ), - stop, show, scoring_heuristic, extended, debug) - - -def __make_encodings_dict(include, exclude): - """ Process encodings inclusion and exclusion lists, listing categories and developping codecs' lists of possible - encoding names. It also creates a cache with the CodecInfo objects for improving performance. """ - def _develop(d, keep=True): - d = d or {} - for k, v in d.items(): - l, cc, sc = [], [e for e in v if e in CODECS_CATEGORIES], [e for e in v if e not in CODECS_CATEGORIES] - # list from in-scope categories and then everything that is not a category - for enc in ((list_encodings(*cc) if (len(cc) > 0 or keep) and len(sc) == 0 else []) + sc): - g = [] - for e in (search(enc, False) or [enc]): - try: - ci = lookup(e, False) - g.extend(ci.parameters['guess']) - except: - pass - if enc in g: # e.g. "rot-1" => ["rot-1", "rot-2", ...] ; only "rot-1" is to be selected - l.append(enc) - else: # e.g. "rot" => ["rot-1", "rot-2", ...] 
; all the "rot-N" shall be selected - l.extend(g) - d[k] = list(set(l)) - return d - _excl, _incl = _develop(exclude, False), _develop(include) - return {k: [x for x in v if x not in _excl.get(k, [])] for k, v in _incl.items()} - - -def __rank(prev_input, input, prev_encoding, encodings, heuristic=False, extended=False, yield_score=False): - """ Filter valid encodings and rank them by relevance. """ - ranking = {} - for e in encodings: - try: - codec = CODECS_CACHE[e] - except KeyError: - try: - CODECS_CACHE[e] = codec = lookup(e, False) - except LookupError: - continue - t = __score(prev_input, input, prev_encoding, e, codec, heuristic, extended) - if t: - ranking[e] = t - for encoding, result in sorted(ranking.items(), key=lambda x: (-x[1][0], x[0])): - yield result if yield_score else result[1], encoding - - -class _Text(object): - __slots__ = ["entropy", "lcharset", "len", "padding", "printables", "text"] - - def __init__(self, text, pad_char=None): - self.text = ensure_str(text) - c = self.text[-1] - pad_char, last_char = (chr(pad_char), chr(c)) if isinstance(c, int) else (pad_char, c) - self.padding = pad_char is not None and last_char == pad_char - if self.padding: - text = text.rstrip(b(pad_char) if isinstance(text, bytes) else pad_char) - self.len = len(self.text) - self.lcharset = len(set(self.text)) - self.printables = float(len([c for c in self.text if c in printable])) / self.len - self.entropy = entropy(self.text) - - -def __score(prev_input, input, prev_encoding, encoding, codec, heuristic=False, extended=False): - """ Score relevant encodings given an input. 
""" - obj = None - sc = codec.parameters.get('scoring', {}) - no_error, transitive = codec.parameters.get('no_error', False), sc.get('transitive', False) - # ignore encodings that fail to decode with their default errors handling value - try: - new_input = codec.decode(input)[0] - except: - return - # ignore encodings that give an output identical to the input (identity transformation) or to the previous input - if len(new_input) == 0 or prev_input is not None and b(input) == b(new_input) or b(prev_input) == b(new_input): - return - # ignore encodings that transitively give the same output (identity transformation by chaining twice a same - # codec (e.g. rot-15 is equivalent to rot-3 and rot-12 or rot-6 and rot-9) - if transitive and prev_encoding: - ci_prev = lookup(prev_encoding, False) - if ci_prev.parameters['name'] == codec.parameters['name']: - return - # compute input's characteristics only once and only if the control flow reaches this point - pad = sc.get('padding_char') - if obj is None: - obj = _Text(input, pad) - if heuristic: - # from here, the goal (e.g. if the input is Base32) is to rank candidate encodings (e.g. 
multiple base - # codecs) so that we can put the right one as early as possible and eventually exclude bad candidates - s = -sc.get('penalty', .0) - # first, apply a bonus if the length of input text's charset is exactly the same as encoding's charset ; - # on the contrary, if the length of input text's charset is strictly greater, give a penalty - lcs = sc.get('len_charset', 256) - if isinstance(lcs, type(lambda: None)): - lcs = int(lcs(encoding)) - if (pad and obj.padding and lcs + 1 >= obj.lcharset) or lcs >= obj.lcharset: - s += max(.0, round(.6 * (.99 ** (lcs - obj.lcharset)), 5) - .1) - elif (pad and obj.padding and lcs + 1 < obj.lcharset) or lcs < obj.lcharset: - s -= .2 # this can occur for encodings with no_error set to True - # then, take padding into account, giving a bonus if padding is to be encountered and effectively present, - # or a penalty when it should not be encountered but it is present - if pad and obj.padding: - s += .2 # when padding is encountered while it is legitimate, it could be a good indication => bonus - elif not pad and obj.padding: - s -= .1 # it could arise a padding character is encountered while not being padding => small penalty - # give a bonus when the rate of printable characters is greater or equal than expected and a penalty when - # lower only for codecs that DO NOT tolerate errors (otherwise, the printables rate can be biased) - if not no_error: - pr = sc.get('printables_rate', 0) - if isinstance(pr, type(lambda: None)): - pr = float(pr(obj.printables)) - if obj.printables - pr <= .05: - s += .1 - expf = sc.get('expansion_factor', 1.) 
- if expf: - f = obj.len / float(len(new_input)) # expansion while encoding => at decoding: 1/f - if isinstance(expf, type(lambda: None)): - try: # this case allows to consider the current encoding name from the current codec - expf = expf(f, encoding) - except TypeError: - expf = expf(f) - if isinstance(expf, (int, float)): - tmp = expf - expf = (1/f - .1 <= 1/expf <= 1/f + .1) - elif isinstance(expf, (tuple, list)) and len(expf) == 2: - expf = 1/f - expf[1] <= 1/expf[0] <= 1/f + expf[1] - s += [-1., .1][expf] - # afterwards, if the input text has an entropy close to the expected one, give a bonus weighted on the - # number of input characters to take bad entropies of shorter strings into account - entr = sc.get('entropy', lambda e: e) - entr = entr.get(encoding, entr.get('default')) if isinstance(entr, dict) else entr - if isinstance(entr, type(lambda: None)): - try: # this case allows to consider the current encoding name from the current codec - entr = entr(obj.entropy, encoding) - except TypeError: - entr = entr(obj.entropy) - if entr is not None: - # use a quadratic heuristic to compute a weight for the entropy delta, aligned on (256,.2) and (512,1) - d_entr = min(3.04575e-06 * obj.len**2 + .000394 * obj.len, 1) * abs(entr - obj.entropy) - if d_entr <= .5: - s += .5 - d_entr - # finally, if relevant, apply a custom bonus (e.g. when a regex pattern is matched) - bonus = sc.get('bonus_func') - if bonus is not None: - if isinstance(bonus, type(lambda: None)): - bonus = bonus(obj, codec, encoding) - if bonus: - s += .2 - else: - s = 1. - # exclude negative (and eventually null) scores as they are (hopefully) not relevant - if extended and s >= .0 or not extended and s > .0: - return s, new_input - - -def guess(input, stop_func=stopfunc.default, min_depth=0, max_depth=5, include=None, exclude=None, found=(), - stop=True, show=False, scoring_heuristic=True, extended=False, debug=False): - """ Try decoding without the knowledge of the encoding(s). 
- - :param input: input text to be guessed - :param stop_func: function defining the stop condition - :param min_depth: minimum search depth - :param max_depth: maximum search depth - ;param include: inclusion item OR list with category, codec or encoding names OR dictionary with lists per - depth (nothing means include every encoding) - :param exclude: exclusion item OR list with category, codec or encoding names OR dictionary with lists per - depth (nothing means exclude no encoding) - :param found: tuple of already found encodings - :param stop: whether to stop or not when a valid solution is found - :param show: whether to immediately show once a solution is found - :param scoring_heuristic: whether to apply the scoring heuristic during the search (if disabled, all scores are 1., - meaning that every non-failing encoding will be considered with no order of precedence) - :param extended: whether to also consider null scores with the heuristic - :param debug: whether to show each attempt at each depth during computation - """ - if len(input) == 0: - return "" - # check for min and max depths - if max_depth <= 0: - raise ValueError("Depth must be a non-null positive integer") - if min_depth > max_depth: - raise ValueError("Min depth shall be less than or equal to the max depth") - # take the tuple of found encodings into account - if len(found) > 0: - for encoding in found: - input = decode(input, encoding) - # handle the stop function as a regex if a string was given - if isinstance(stop_func, string_types): - stop_func = stopfunc.regex(stop_func) - # reformat include and exclude arguments ; supported formats: - for n, l in zip(["inc", "exc"], [include, exclude]): - if l is None: - if n == "inc": - include = l = {-1: CODECS_CATEGORIES} - else: - exclude = l = {} - # "category" OR "enc_name" OR whatever => means a single item for all depths - if isinstance(l, string_types): - if n == "inc": - include = l = {-1: [l]} - else: - exclude = l = {-1: [l]} - # 
["enc_name1", "enc_name2", ...] => means for all depths - if isinstance(l, (list, tuple)): - if n == "inc": - include = l = {-1: l} - else: - exclude = l = {-1: l} - # {-1: [...], 2: [...], ...} => means prefedined depths with their lists of in-/excluded encodings - if not isinstance(l, dict) or not all(isinstance(k, int) for k in l.keys()): - raise ValueError("Include argument shall be a list or a dictionary with integer keys") - # precompute encodings lists per depth and cache the related CodecInfo objects - encodings, result = __make_encodings_dict(include, exclude), {} - try: - # breadth-first search - for d in range(max_depth): - __guess("", input, stop_func, 0, d+1, min_depth, encodings, result, tuple(found), stop, show, - scoring_heuristic, extended, debug) - if stop and len(result) > 0: - break - except KeyboardInterrupt: - pass - CODECS_CACHE = {} - return result -codecs.guess = guess - - -def rank(input, extended=False, limit=-1, include=None, exclude=None): - """ Rank the most probable encodings based on the given input. - - :param input: input text to be evaluated - :param extended: whether to consider null scores too (NB: negative scores are not output !) 
- :param limit: number of encodings to be returned (-1 means all of them) - :param include: inclusion list with category, codec or encoding names (nothing means include every encoding) - :param exclude: exclusion list with category, codec or encoding names (nothing means exclude no encoding) - """ - encodings = __make_encodings_dict(include if isinstance(include, dict) else {-1: include or CODECS_CATEGORIES}, - exclude if isinstance(exclude, dict) else {-1: exclude or []}) - r = list(__rank(None, input, "", encodings[-1], True, extended, True)) - return r[:limit] if len(r) > 1 else r -codecs.rank = rank - +# -*- coding: UTF-8 -*- +import _codecs +import codecs +import hashlib +import json +import os +import random +import re +import sre_parse +import sys +from encodings.aliases import aliases as ALIASES +from functools import reduce, update_wrapper, wraps +from importlib import import_module +from inspect import currentframe +from io import BytesIO +from itertools import chain, product +from locale import getlocale +from math import log +from pkgutil import iter_modules +from platform import system +from random import randint +from string import * +from types import FunctionType, ModuleType +try: # Python2 + import __builtin__ as builtins +except ImportError: + import builtins +try: # Python2 + from inspect import getfullargspec +except ImportError: + from inspect import getargspec as getfullargspec +try: # Python2 + from string import maketrans +except ImportError: + maketrans = str.maketrans +try: # Python3 + from importlib import reload +except ImportError: + pass + +# from Python 3.11, it seems that 'sre_parse' is not bound to 're' anymore +re.sre_parse = sre_parse + + +__all__ = ["add", "add_macro", "add_map", "b", "clear", "codecs", "decode", "encode", "ensure_str", "examples", "guess", + "isb", "generate_strings_from_regex", "get_alphabet_from_mask", "handle_error", "hashlib", "i2s", + "is_native", "list_categories", "list_encodings", "list_macros", 
"lookup", "maketrans", "os", "rank", "re", + "register", "remove", "reset", "s2i", "search", "stopfunc", "BytesIO", "_input", "_stripl", "CodecMacro", + "DARWIN", "LANG", "LINUX", "MASKS", "UNIX", "WINDOWS"] +CODECS_REGISTRY = None +CODECS_OVERWRITTEN = [] +CODECS_CATEGORIES = ["native", "custom"] +CODECS_CACHE = {} +LANG = getlocale() +if LANG: + LANG = (LANG[0] or "")[:2].lower() +MASKS = { + 'a': printable, + 'b': "".join(chr(i) for i in range(256)), + 'd': digits, + 'h': digits + "abcdef", + 'H': digits + "ABCDEF", + 'l': ascii_lowercase, + 'p': punctuation, + 's': " ", + 'u': ascii_uppercase, +} + +__codecs_registry = [] + +MACROS = {} +PERS_MACROS = {} +PERS_MACROS_FILE = os.path.expanduser("~/.codext-macros.json") + +DARWIN = system() == "Darwin" +LINUX = system() == "Linux" +UNIX = DARWIN or LINUX +WINDOWS = system() == "Windows" + +entropy = lambda s: -sum([p * log(p, 2) for p in [float(s.count(c)) / len(s) for c in set(s)]]) + +isb = lambda s: isinstance(s, bytes) +iss = lambda s: isinstance(s, str) +fix = lambda x, ref: b(x) if isb(ref) else ensure_str(x) if iss(ref) else x + +s2i = lambda s: int(codecs.encode(s, "base16"), 16) +exc_name = lambda e: "".join(t.capitalize() for t in re.split(r"[-_+]", e)) + + +def i2s(input): + h = hex(input)[2:].rstrip("eL") + return codecs.decode(h.zfill(len(h) + len(h) % 2), "hex") + + +class CodecMacro(tuple): + """Macro details when looking up the codec registry. 
""" + def __new__(cls, name): + self = tuple.__new__(cls) + self.name = name + # get from personal macros first + try: + self.codecs = PERS_MACROS[name] + except KeyError: + try: + self.codecs = MACROS[name] + except KeyError: + raise LookupError("unknown macro: %s" % name) + if not isinstance(self.codecs, (tuple, list)): + raise ValueError("bad macro list: %s" % str(self.codecs)) + self.codecs = [lookup(e, False) for e in self.codecs] # lookup(e, False) + self.parameters = {'name': name, 'category': "macro"} # ^ means that macros won't be nestable + # test examples to check that the chain of encodings works + for action, examples in (self.codecs[0].parameters.get('examples', {}) or {'enc-dec(': ["T3st str!"]}).items(): + if re.match(r"enc(-dec)?\(", action): + for e in (examples.keys() if action.startswith("enc(") else examples or []): + rd = re.match(r"\@(i?)random(?:\{(\d+(?:,(\d+))*?)\})?$", e) + if rd: + for n in (rd.group(2) or "512").split(","): + s = "".join(chr(randint(0, 255)) for i in range(int(n))) + self.encode(s.lower() if rd.group(1) else s) + continue + self.encode(e) + + class Codec: + decode = self.decode + encode = self.encode + + class IncrementalEncoder(codecs.IncrementalEncoder): + def encode(self, input, final=False): + return b(self.encode(input, self.errors)[0]) + self.incrementalencoder = IncrementalEncoder + + class IncrementalDecoder(codecs.IncrementalDecoder): + def decode(self, input, final=False): + return ensure_str(self.decode(input, self.errors)[0]) + self.incrementaldecoder = IncrementalDecoder + + class StreamWriter(Codec, codecs.StreamWriter): + charbuffertype = bytes + self.streamwriter = StreamWriter + + class StreamReader(Codec, codecs.StreamReader): + charbuffertype = bytes + self.streamreader = StreamReader + + return self + + def decode(self, input, error="strict"): + """ Decode with each codec in reverse order. 
""" + for ci in self.codecs[::-1]: + input, l = ci.decode(input, error) + return input, l + + def encode(self, input, error="strict"): + """ Encode with each codec. """ + for ci in self.codecs: + input, l = ci.encode(input, error) + return input, l + + def __repr__(self): + return "" % (self.name, id(self)) + + +# inspired from: https://stackoverflow.com/questions/10875442/possible-to-change-a-functions-repr-in-python +class Repr(object): + def __init__(self, name, func): + self.__name = name + self.__func = func + update_wrapper(self, func) + + def __call__(self, *args, **kwargs): + return self.__func(*args, **kwargs) + + def __repr__(self): + return "" % (self.__name, id(self)) + + +def __stdin_pipe(): + """ Stdin pipe read function. """ + try: + with open(0, 'rb') as f: + for l in f: + yield l + except TypeError: + for l in sys.stdin: + yield l + + +def _input(infile): + # handle input file or stdin + c = b("") + if infile: + with open(infile, 'rb') as f: + c = f.read() + else: + for line in __stdin_pipe(): + c += line + return c + + +def _set_exc(name, etype="ValueError"): + if not hasattr(builtins, name): + exec("class %s(%s): __module__ = 'builtins'" % (name, etype)) + setattr(builtins, name, locals()[name]) +_set_exc("InputSizeLimitError") +_set_exc("ParameterError") + + +def _stripl(s, st_lines, st_crlf): + if st_crlf: + s = s.replace(b"\r\n", b"") if isb(s) else s.replace("\r\n", "") + if st_lines: + s = s.replace(b"\n", b"") if isb(s) else s.replace("\n", "") + return s + + +def _with_repr(name): + def _wrapper(f): + return Repr(name, f) + return _wrapper + + +def add(ename, encode=None, decode=None, pattern=None, text=True, add_to_codecs=False, **kwargs): + """ This adds a new codec to the codecs module setting its encode and/or decode functions, eventually dynamically + naming the encoding with a pattern and with file handling. 
+ + :param ename: encoding name + :param encode: encoding function or None + :param decode: decoding function or None + :param pattern: pattern for dynamically naming the encoding + :param text: specify whether the codec is a text encoding + :param add_to_codecs: also add the search function to the native registry + NB: this will make the codec available in the built-in open(...) but will make it impossible + to remove the codec later + """ + remove(ename) + if encode: + if not isinstance(encode, FunctionType): + raise ValueError("Bad 'encode' function") + _set_exc("%sEncodeError" % exc_name(ename)) # create the custom encode exception as a builtin + if decode: + if not isinstance(decode, FunctionType): + raise ValueError("Bad 'decode' function") + _set_exc("%sDecodeError" % exc_name(ename)) # create the custom decode exception as a builtin + if not encode and not decode: + raise ValueError("At least one en/decoding function must be defined") + for exc in kwargs.get('extra_exceptions', []): + _set_exc(exc) # create additional custom exceptions as builtins + glob = currentframe().f_back.f_globals + # search function for the new encoding + @_with_repr(ename) + def getregentry(encoding): + if encoding != ename and not (pattern and re.match(pattern, encoding)): + return + fenc, fdec, name = encode, decode, encoding + # prepare CodecInfo input arguments + if pattern: + m, args, i = re.match(pattern, encoding), [], 1 + try: + while True: + try: + g = m.group(i) or "" + if g.isdigit() and not g.startswith("0") and "".join(set(g)) != "01": + g = int(g) + args += [g] + i += 1 + except AttributeError: + # this occurs when m is None or there is an error in fenc(g) or fdec(g), meaning no match + if m is not None: + raise + return + except IndexError: + # this occurs while m is not None, but possibly no capture group that gives at least 1 group index ; + # in this case, if fenc/fdec is a decorated function, execute it with no arg + if len(args) == 0: + if fenc and 
len(getfullargspec(fenc).args) == 1: + fenc = fenc() + if fdec and len(getfullargspec(fdec).args) == 1: + fdec = fdec() + else: + fenc = fenc(*args) if fenc else fenc + fdec = fdec(*args) if fdec else fdec + if fenc: + fenc = fix_inout_formats(fenc) + if fdec: + fdec = fix_inout_formats(fdec) + sl, sc = kwargs.pop('strip_lines', False), kwargs.pop('strip_crlf', False) + if sl or sc: + def _striplines(f): + def __wrapper(input, *a, **kw): + return f(_stripl(input, sc, sl), *a, **kw) + return __wrapper + # this fixes issues with wrapped encoded inputs + fdec = _striplines(fdec) + + class Codec(codecs.Codec): + def encode(self, input, errors="strict"): + if fenc is None: + raise NotImplementedError + return fenc(input, errors) + + def decode(self, input, errors="strict"): + if fdec is None: + raise NotImplementedError + return fdec(input, errors) + + class IncrementalEncoder(codecs.IncrementalEncoder): + def encode(self, input, final=False): + if fenc is None: + raise NotImplementedError + return b(fenc(input, self.errors)[0]) + + class IncrementalDecoder(codecs.IncrementalDecoder): + def decode(self, input, final=False): + if fdec is None: + raise NotImplementedError + return ensure_str(fdec(input, self.errors)[0]) + + class StreamWriter(Codec, codecs.StreamWriter): + charbuffertype = bytes + + class StreamReader(Codec, codecs.StreamReader): + charbuffertype = bytes + + ci = codecs.CodecInfo( + name=name, + encode=Codec().encode, + decode=Codec().decode, + incrementalencoder=IncrementalEncoder, + incrementaldecoder=IncrementalDecoder, + streamwriter=StreamWriter, + streamreader=StreamReader, + _is_text_encoding=text, + ) + ci.parameters = kwargs + ci.parameters['name'] = ename + ci.parameters['add_to_codecs'] = add_to_codecs + ci.parameters['pattern'] = pattern + ci.parameters['text'] = text + f = glob.get('__file__', os.path.join("custom", "_")) + cat = f.split(os.path.sep)[-2].rstrip("s") + if cat not in CODECS_CATEGORIES: + CODECS_CATEGORIES.append(cat) + 
ci.parameters['category'] = kwargs.get('category', cat) + ci.parameters['examples'] = kwargs.get('examples', glob.get('__examples__')) + ci.parameters['guess'] = kwargs.get('guess', glob.get('__guess__', [ename])) or [] + ci.parameters['module'] = kwargs.get('module', glob.get('__name__')) + ci.parameters.setdefault("scoring", {}) + for attr in ["bonus_func", "entropy", "expansion_factor", "len_charset", "penalty", "printables_rate", + "padding_char", "transitive"]: + a = kwargs.pop(attr, None) + if a is not None: + ci.parameters['scoring'][attr] = a + return ci + + getregentry.__name__ = re.sub(r"[\s\-]", "_", ename) + if kwargs.get('aliases'): + getregentry.__aliases__ = list(map(lambda n: re.sub(r"[\s\-]", "_", n), kwargs['aliases'])) + getregentry.__pattern__ = pattern + register(getregentry, add_to_codecs) + return getregentry + + +def add_macro(mname, *encodings): + """ This allows to define a macro, chaining multiple codecs one after the other. This relies on a default set of + macros from a YAML file embedded in the package and a local YAML file from the home folder that takes + precedence for defining personal macros. 
+ + :param mname: macro name + :param encodings: encoding names of the encodings to be chained with the macro + """ + global PERS_MACROS + # check for name clash with alreday existing macros and codecs + if mname in MACROS or mname in PERS_MACROS: + raise ValueError("Macro name already exists") + try: + ci = lookup(mname, False) + raise ValueError("Macro name clashes with codec '%s'" % ci.name) + except LookupError: + pass + try: + PERS_MACROS[mname] = encodings + CodecMacro(mname) + with open(PERS_MACROS_FILE, 'w') as f: + json.dump(PERS_MACROS, f, indent=2) + except ValueError: + del PERS_MACROS[mname] + raise +codecs.add_macro = add_macro + + +def add_map(ename, encmap, repl_char="?", sep="", ignore_case=None, no_error=False, intype=None, outype=None, **kwargs): + """ This adds a new mapping codec (that is, declarable with a simple character mapping dictionary) to the codecs + module dynamically setting its encode and/or decode functions, eventually dynamically naming the encoding with + a pattern and with file handling (if text is True). 
+ + :param ename: encoding name + :param encmap: characters encoding map ; can be a dictionary of encoding maps (for use with the first capture + group of the regex pattern) or a function building the encoding map + :param repl_char: replacement char (used when errors handling is set to "replace") + :param sep: string of possible character separators (hence, only single-char separators are considered) ; + - while encoding, the first separator is used + - while decoding, separators can be mixed in the input text + :param ignore_case: ignore text case while encoding and/or decoding + :param no_error: this encoding triggers no error (hence, always in "leave" errors handling) + :param intype: specify the input type for pre-transforming the input text + :param outype: specify the output type for post-transforming the output text + :param pattern: pattern for dynamically naming the encoding + :param text: specify whether the codec is a text encoding + :param add_to_codecs: also add the search function to the native registry + NB: this will make the codec available in the built-in open(...) but will make it impossible + to remove the codec later + """ + outype = outype or intype + if ignore_case not in [None, "encode", "decode", "both"]: + raise ValueError("Bad ignore_case parameter while creating encoding map") + if intype not in [None, "str", "bin", "ord"]: + raise ValueError("Bad input type parameter while creating encoding map") + if outype not in [None, "str", "bin", "ord"]: + raise ValueError("Bad output type parameter while creating encoding map") + + def __generic_code(decode=False): + def _wrapper(param): + """ The parameter for wrapping comes from the encoding regex pattern ; e.g. + [no pattern] => param will be None everytime + r"barbie[-_]?([1-4])$" => param could be int 1, 2, 3 or 4 + r"^morse(|[-_]?.{3})$" => param could be None, "-ABC" (for mapping to ".-/") + + In order of precedence: + 1. 
when param is a key in mapdict or mapdict is a list of encoding maps (hence in the case of "barbie...", + param MUST be an int, otherwise for the first case it could clash with a character of the encoding map) + 2. otherwise handle it as a new encoding character map "ABC" translates to ".-/" for morse + """ + p = param + if isinstance(encmap, FunctionType): + mapdict = encmap(p) + p = None + else: + mapdict = encmap + if isinstance(mapdict, dict): + smapdict = {k: v for k, v in mapdict.items()} + elif isinstance(mapdict, list) and isinstance(mapdict[0], dict): + smapdict = {k: v for k, v in mapdict[0].items()} + else: + raise ValueError("Bad mapping dictionary or list of mapping dictionaries") + if p is not None: + # case 1: param is empty string + if p == "": + if isinstance(mapdict, list): + smapdict = {k: v for k, v in mapdict[0].items()} + elif isinstance(mapdict, dict): + if '' in mapdict.keys() and isinstance(mapdict[''], dict): + smapdict = {k: v for k, v in mapdict[''].items()} + else: + smapdict = {k: v for k, v in mapdict.items()} + # no 'else' handling a LookupError here ; this case is covered by the first if/elif/else block + # case 2: list or dictionary or dictionary of numbered encodings + elif isinstance(p, int): + # if mapdict is a list, we shall align the parameter (starting from 1) as an index (starting from 0) + if isinstance(mapdict, list): + p -= 1 + if isinstance(mapdict, list) and 0 <= p < len(mapdict) or \ + isinstance(mapdict, dict) and p in mapdict.keys(): + smapdict = {k: v for k, v in mapdict[p].items()} + else: + raise LookupError("Bad parameter for encoding '{}': '{}'".format(ename, p)) + # case 3: dictionary of regex-selected encoding mappings + elif isinstance(mapdict, dict) and isinstance(list(mapdict.values())[0], dict): + tmp = None + for r, d in mapdict.items(): + if r == '': # this is already handled in case 1 ; anyway, an empty regex always matches, hence + continue # it must be excluded + if re.match(r, p): + tmp = d + break + 
if tmp is None: + raise LookupError("Bad parameter for encoding '{}': '{}'".format(ename, p)) + smapdict = tmp + # case 4: encoding characters translation + else: + # collect base tokens in order of appearance in the mapping dictionary + base_tokens = "" + for _, c in sorted(mapdict.items()): + for t in c: + for st in t: + if st not in base_tokens: + base_tokens += st + if " " not in sep: + base_tokens = base_tokens.replace(" ", "") + if len(p) > 0 and p[0] in "-_" and len(p[1:]) == len(set(p[1:])) == len(base_tokens): + p = p[1:] + if len(p) == len(set(p)) == len(base_tokens): + t = maketrans(base_tokens, p) + for k, v in smapdict.items(): + smapdict[k] = [x.translate(t) for x in v] if isinstance(v, list) else v.translate(t) + else: + raise LookupError("Bad parameter for encoding '{}': '{}'".format(ename, p)) + if ignore_case is not None: + cases = ["upper", "lower"] + case_d = cases[any(c in str(list(smapdict.values())) for c in "abcdefghijklmnopqrstuvwxyz")] + case_e = cases[any(c in str(list(smapdict.keys())) for c in "abcdefghijklmnopqrstuvwxyz")] + i = ignore_case + smapdict = {getattr(k, case_e)() if i in ["both", "encode"] else k: \ + ([getattr(x, case_d)() for x in v] if isinstance(v, list) else getattr(v, case_d)()) \ + if i in ["both", "decode"] else v for k, v in smapdict.items()} + if decode: + tmp = {} + # this has a meaning for encoding maps that could have clashes in encoded chars (e.g. 
Bacon's cipher ; + # I => abaaa but also J => abaaa, with the following, we keep I instead of letting J overwrite it) + for k, v in sorted(smapdict.items()): + if not isinstance(v, list): + v = [v] + for x in v: + if x not in tmp.keys(): + tmp[x] = k + smapdict, cs = tmp, reduce(lambda acc, x: acc + x, tmp.keys()) + kwargs['strip_lines'], kwargs['strip_crlf'] = "\n" not in set(cs), "\r\n" not in cs + # this allows to avoid an error with Python2 in the "for i, c in enumerate(parts)" loop + if '' not in smapdict.keys(): + smapdict[''] = "" + # determine token and result lengths + tmaxlen = max(map(len, smapdict.keys())) + tminlen = max(1, min(map(len, set(smapdict.keys()) - {''}))) + l = [] + for x in smapdict.values(): + getattr(l, ["append", "extend"][isinstance(x, list)])(x) + rminlen = max(1, min(map(len, set(l) - {''}))) + + # generic encoding/decoding function for map encodings + def code(text, errors="strict"): + icase = ignore_case == "both" or \ + decode and ignore_case == "decode" or \ + not decode and ignore_case == "encode" + if icase: + case = case_d if decode else case_e + if no_error: + errors = "leave" + text = ensure_str(text) + if not decode: + if intype == "bin": + text = "".join("{:0>8}".format(bin(ord(c))[2:]) for c in text) + elif intype == "ord": + text = "".join(str(ord(c)).zfill(3) for c in text) + r = "" + lsep = "" if decode else sep if len(sep) <= 1 else sep[0] + kind = ["character", "token"][tmaxlen > 1] + error_func = handle_error(ename, errors, lsep, repl_char, rminlen, decode, kind) + + # get the value from the mapping dictionary, trying the token with its inverted case if relevant + def __get_value(token, position, case_changed=False): + try: + result = smapdict[token] + except KeyError: + if icase and not case_changed: + token_inv_case = getattr(token, case)() + return __get_value(token_inv_case, position, True) + return error_func(token, position) + if isinstance(result, list): + result = result[0] + return result + lsep + + # if a 
separator is defined, rely on it by splitting the input text + if decode and len(sep) > 0: + for i, c in enumerate(re.split("[" + sep + "]", text)): + r += __get_value(c, i) + # otherwise, move through the text using a cursor for tokenizing it ; this allows defining more complex + # encodings with variable token lengths + else: + cursor, bad = 0, "" + while cursor < len(text): + token = text[cursor:cursor+1] + for l in range(tminlen, tmaxlen + 1): + token = text[cursor:cursor+l] + if token in smapdict.keys() or icase and getattr(token, case)() in smapdict.keys(): + r += __get_value(token, cursor) + cursor += l + break + else: + # collect bad chars and only move the cursor one char to the right + bad += text[cursor] + cursor += 1 + # if the number of bad chars is the minimum token length, consume it and start a new buffer + if len(bad) == tminlen or errors == "leave": + posn = cursor - len(bad) + r += error_func(bad, posn) + bad = "" + if decode: + if outype in ["bin", "ord"]: + tmp, r = "", r.replace(lsep, "") + step = [3, 8][outype == "bin"] + for i in range(0, len(r), step): + s = r[i:i+step] + try: + tmp += chr(int(s, 2) if outype == "bin" else int(s)) + except ValueError: + if len(s) > 0: + tmp += "[" + s + "]" + r = tmp + lsep + return r[:len(r)-len(lsep)], len(b(text)) + return code + if re.search(r"\([^(?:)]", kwargs.get('pattern', "")) is None: + # in this case, there is no capturing group for parametrization + return _wrapper(None) + return _wrapper + + glob = currentframe().f_back.f_globals + kwargs['category'] = glob['__file__'].split(os.path.sep)[-2].rstrip("s") + kwargs['examples'] = kwargs.get('examples', glob.get('__examples__')) + kwargs['encmap'] = encmap + kwargs['repl_char'] = repl_char + kwargs['sep'] = sep + kwargs['ignore_case'] = ignore_case + kwargs['no_error'] = no_error + kwargs['intype'] = intype + kwargs['outype'] = outype + kwargs['module'] = glob.get('__name__') + try: + if isinstance(encmap, dict): + smapdict = {k: v for k, v in 
encmap.items()} + elif isinstance(encmap, list) and isinstance(encmap[0], dict): + smapdict = {k: v for k, v in encmap[0].items()} + kwargs['repl_minlen'] = i = max(1, min(map(len, set(smapdict.values()) - {''}))) + kwargs['repl_minlen_b'] = max(1, min(map(len, map(b, set(smapdict.values()) - {''})))) + except: + pass + return add(ename, __generic_code(), __generic_code(True), **kwargs) +codecs.add_map = add_map + + +def clear(): + """ Clear codext's local registry of search functions. """ + global __codecs_registry, MACROS, PERS_MACROS + __codecs_registry, MACROS, PERS_MACROS = [], {}, {} +codecs.clear = clear + + +def examples(encoding, number=10): + """ Use the search function to get the matching encodings and provide examples of valid encoding names. """ + e = [] + for name in search(encoding): + for search_function in __codecs_registry: + n = search_function.__name__ + if name in [n, n.replace("_", "-")]: + temp = [] + for s in generate_strings_from_regex(search_function.__pattern__, yield_max=16*number): + temp.append(s) + random.shuffle(temp) + i = 0 + while i < min(number, len(temp)): + if not temp[i].isdigit(): + try: + lookup(temp[i], False) + e.append(temp[i]) + except LookupError: + pass + i += 1 + for alias, codec in ALIASES.items(): + if name == codec: + if codec not in e: + e.append(codec) + if not alias.isdigit(): + e.append(alias) + random.shuffle(e) + return sorted([e[i] for i in range(min(number, len(e)))], key=_human_keys) +codecs.examples = examples + + +def is_native(encoding): + """ Determine if a given encoding is native or not. """ + return lookup(encoding, False).parameters['category'] == "native" + + +def list_categories(): + """ Get a list of all codec categories. 
""" + c = CODECS_CATEGORIES + root = os.path.dirname(__file__) + for d in os.listdir(root): + if os.path.isdir(os.path.join(root, d)) and not d.startswith("__"): + c.append(d.rstrip("s")) + # particular category, hardcoded from base/_base.py + c += ["base-generic"] + return c +list_categories() + + +def list_encodings(*categories): + """ Get a list of all codecs. """ + # if "non-native" is in the input list, extend the list with the whole categories but "native" + categories, exclude = list(categories), [] + for c in categories[:]: + if c == "non-native": + for c in CODECS_CATEGORIES: + if c == "native" or c in categories: + continue + categories.append(c) + categories.remove("non-native") + if c.startswith("~"): + exclude.append(c[1:]) + categories.remove(c) + try: + categories.remove(c[1:]) + except ValueError: + pass + # now, filter codecs according to the input list of categories + enc = [] + if (len(categories) == 0 or "native" in categories) and "native" not in exclude: + for a in set(ALIASES.values()): + try: + ci = __orig_lookup(a) + except LookupError: + continue + if lookup(a) is ci: + enc.append(ci.name) + for search_function in CODECS_OVERWRITTEN + __codecs_registry: + name = search_function.__name__.replace("_", "-") + p = search_function.__pattern__ + ci = search_function(name) if p is None else search_function(generate_string_from_regex(p)) + c = "other" if ci is None else ci.parameters['category'] + if (len(categories) == 0 or c in categories) and c not in exclude: + enc.append(name) + for category in categories: + if category not in CODECS_CATEGORIES: + raise ValueError("Category '%s' does not exist" % category) + return sorted(list(set(enc)), key=_human_keys) + + +def list_macros(): + """ Get a list of all macros, with the precedence on personal ones. 
""" + return sorted(list(set(list(MACROS.keys()) + list(PERS_MACROS.keys())))) + + +def remove(name): + """ Remove all search functions matching the input encoding name from codext's local registry or any macro with the + given name. """ + global __codecs_registry, MACROS, PERS_MACROS + tbr = [] + for search_function in __codecs_registry: + if search_function(name) is not None: + tbr.append(search_function) + for search_function in tbr: + __codecs_registry.remove(search_function) + try: + del MACROS[name] + except KeyError: + pass + try: + del PERS_MACROS[name] + with open(PERS_MACROS_FILE, 'w') as f: + json.dump(PERS_MACROS, f, indent=2) + except KeyError: + pass + try: + del CODECS_CACHE[name] + except KeyError: + pass + for s in ["En", "De"]: + try: + delattr(builtins, "%s%scodeError" % (name.capitalize(), s)) + except AttributeError: + pass +codecs.remove = remove + + +def reset(): + """ Reset codext's local registry of search functions and macros. """ + global __codecs_registry, CODECS_REGISTRY, MACROS, PERS_MACROS + clear() + d = os.path.dirname(__file__) + for pkg in sorted(os.listdir(d)): + if pkg.startswith("_") or not os.path.isdir(os.path.join(d, pkg)): + continue + reload(import_module("codext." + pkg)) + # backup codext's registry + if CODECS_REGISTRY is None: + CODECS_REGISTRY = __codecs_registry[:] + # restore codext's registry + else: + __codecs_registry = CODECS_REGISTRY[:] + # restore codext's embedded set of macros + with open(os.path.join(os.path.dirname(__file__), "macros.json")) as f: + MACROS = json.load(f) + # reload personal set of macros + PERS_MACROS = {} + if os.path.exists(PERS_MACROS_FILE): + with open(PERS_MACROS_FILE) as f: + PERS_MACROS = json.load(f) +codecs.reset = reset + + +# conversion functions +def b(s): + """ Non-crashing bytes conversion function. 
""" + try: + return s.encode("latin-1") + except: + pass + try: + return s.encode("utf-8") + except: + pass + return s + + +def ensure_str(s, encoding='utf-8', errors='strict'): + """ Dummy str conversion function. """ + if isinstance(s, bytes): + try: + return s.decode(encoding, errors) + except: + return s.decode("latin-1") + return s + + +# make conversion functions compatible with input/output strings/bytes +def fix_inout_formats(f): + """ This decorator ensures that the first output of f will have the same text format as the first input (str or + bytes). """ + @wraps(f) + def _wrapper(*args, **kwargs): + a0 = args[0] + a0_isb = isb(a0) + a0 = ensure_str(a0) if iss(a0) or a0_isb else a0 + r = f(a0, *args[1:], **kwargs) + # special case: input is in bytes ; ensure that the returned length is this of the bytes, not this processed by + # the decode/encode function + if isinstance(r, (tuple, list)) and isinstance(r[1], int) and a0_isb: + r = tuple([list(r)[0]] + [len(args[0])] + list(r)[2:]) + return (fix(r[0], args[0]), ) + r[1:] if isinstance(r, (tuple, list)) else fix(r, args[0]) + return _wrapper + + +# alphabet generation function from a given mask +def get_alphabet_from_mask(mask): + """ This function generates an alphabet from the given mask. The style used is similar to Hashcat ; group keys are + marked with a heading "?". """ + i, alphabet = 0, "" + while i < len(mask): + c = mask[i] + if c == "?" and i < len(mask) - 1 and mask[i+1] in MASKS.keys(): + for c in MASKS[mask[i+1]]: + if c not in alphabet: + alphabet += c + i += 1 + elif c not in alphabet: + alphabet += c + i += 1 + return alphabet + + +# generic error handling function +def handle_error(ename, errors, sep="", repl_char="?", repl_minlen=1, decode=False, kind="character", item="position"): + """ This shortcut function allows to handle error modes given some tuning parameters. 
+ + :param ename: encoding name + :param errors: error handling mode + :param sep: token separator + :param repl_char: replacement character (for use when errors="replace") + :param repl_minlen: repeat number for the replacement character + :param decode: whether we are encoding or decoding + :param item: position item description (for describing the error ; e.g. "group" or "token") + """ + exc = "%s%scodeError" % (exc_name(ename), ["En", "De"][decode]) + + def _handle_error(token, position, output="", eename=None): + """ This handles an encoding/decoding error according to the selected handling mode. + + :param token: input token to be encoded/decoded + :param position: token position index + :param output: output, as decoded up to the position of the error + """ + if errors == "strict": + msg = "'%s' codec can't %scode %s '%s' in %s %d" + token = ensure_str(token) + token = token[:7] + "..." if len(token) > 10 else token + err = getattr(builtins, exc)(msg % (eename or ename, ["en", "de"][decode], kind, token, item, position)) + err.output = output + err.__cause__ = err + raise err + elif errors == "leave": + return token + sep + elif errors == "replace": + return repl_char * repl_minlen + sep + elif errors == "ignore": + return "" + else: + raise ValueError("Unsupported error handling '{}'".format(errors)) + return _handle_error + + +# codecs module hooks +__orig_lookup = _codecs.lookup +__orig_register = _codecs.register + + +def __add(ename, encode=None, decode=None, pattern=None, text=True, **kwargs): + kwargs.pop('add_to_codecs', None) + return add(ename, encode, decode, pattern, text, True, **kwargs) +__add.__doc__ = add.__doc__ +codecs.add = __add + + +def decode(obj, encoding='utf-8', errors='strict'): + """ Custom decode function relying on the hooked lookup function. 
""" + return lookup(encoding).decode(obj, errors)[0] +codecs.decode = decode + + +def encode(obj, encoding='utf-8', errors='strict'): + """ Custom encode function relying on the hooked lookup function. """ + n, m = 1, re.search(r"\[(\d+)\]$", encoding) + if m: + n = int(m.group(1)) + encoding = re.sub(r"\[(\d+)\]$", "", encoding) + ci = lookup(encoding) + for i in range(n): + try: + obj = ci.encode(obj, errors)[0] + except (AttributeError, TypeError) as e: # occurs for encodings that require str as input while 'obj' is bytes + if str(e) not in ["'bytes' object has no attribute 'encode'", + "ord() expected string of length 1, but int found"] or \ + encoding in ["latin-1", "utf-8"]: # encodings considered when using b(...) + raise + obj = ci.encode(ensure_str(obj), errors)[0] + return obj +codecs.encode = encode + + +def lookup(encoding, macro=True): + """ Hooked lookup function for searching first for codecs in the local registry of this module. """ + # first, try to match the given encoding with codecs' search functions + for search_function in CODECS_OVERWRITTEN + __codecs_registry: + codecinfo = search_function(encoding) + if codecinfo is not None: + return codecinfo + # then, if a codec name was given, generate an encoding name from its pattern and get the CodecInfo + for search_function in CODECS_OVERWRITTEN + __codecs_registry: + if search_function.__name__.replace("_", "-") == encoding or \ + encoding in getattr(search_function, "__aliases__", []): + codecinfo = search_function(generate_string_from_regex(search_function.__pattern__)) + if codecinfo is not None: + return codecinfo + # finally, get a CodecInfo with the original lookup function and refine it with a dictionary of parameters + try: + ci = __orig_lookup(encoding) + ci.parameters = {'category': "native", 'module': "codecs", 'name': ALIASES.get(ci.name, ci.name)} + return ci + except LookupError: + if not macro: + raise + try: + return CodecMacro(encoding) + except LookupError: + e = 
LookupError("unknown encoding: %s" % encoding) + e.__cause__ = e # stop exception chaining + raise e +codecs.lookup = lookup + + +def register(search_function, add_to_codecs=False): + """ Register function for registering new codecs in the local registry of this module and, if required, in the + native codecs registry (for use with the built-in 'open' function). + + :param search_function: search function for the codecs registry + :param add_to_codecs: also add the search function to the native registry + NB: this will make the codec available in the built-in open(...) but will make it impossible + to remove the codec later + """ + if search_function not in __codecs_registry: + try: + __orig_lookup(search_function.__name__) + l = CODECS_OVERWRITTEN + except LookupError: + l = __codecs_registry + l.append(search_function) + if add_to_codecs: + __orig_register(search_function) + + +def __register(search_function): + """ Same as register(...), but with add_to_codecs set by default to True. """ + register(search_function, True) +codecs.register = __register + + +def search(encoding_regex, extended=True): + """ Function similar to lookup but allows to search for an encoding based on a regex instead. It searches this way + into the local registry but also tries a simple lookup with the original lookup function. """ + matches = [] + for search_function in CODECS_OVERWRITTEN + __codecs_registry: + n = search_function.__name__ + for name in [n, n.replace("_", "-")]: + if re.search(encoding_regex, name): + matches.append(n.replace("_", "-")) + continue + if extended: + # in some cases, encoding_regex can match a generated string that uses a particular portion of its + # generating pattern ; e.g. 
we expect encoding_regex="uu_" to find "uu" and "uu_codec" while it can also + # find "morse" or "atbash" very rarely because of their dynamic patterns and the limited number of randomly + # generated strings + # so, we can use a qualified majority voting to ensure we do not get a "junk" encoding in the list of + # matches ; executing 5 times the string generation for a given codec but adding the codec to the list of + # matches only if we get at least 3 matches ensures that we consider up to 2 failures that could be + # stochastic, therefore drastically decreasing the probability to get a "junk" encoding in the matches list + c = 0 + for i in range(5): + for s in generate_strings_from_regex(search_function.__pattern__): + if re.search(encoding_regex, s): + c += 1 + break + if c >= 3: + matches.append(n) + break + for s, n in ALIASES.items(): + if re.search(encoding_regex, s) or re.search(encoding_regex, n): + matches.append(n) + return sorted(list(set(matches)), key=_human_keys) +codecs.search = search + + +# utility function for the search feature +CATEGORIES = { + 'digit': digits, + 'not_digit': reduce(lambda x, c: x.replace(c, ""), digits, printable), + 'space': whitespace, + 'not_space': reduce(lambda x, c: x.replace(c, ""), whitespace, printable), + 'word': ascii_letters + digits + '_', + 'not_word': reduce(lambda x, c: x.replace(c, ""), ascii_letters + digits + '_', printable), +} +REPEAT_MAX = 10 +STAR_PLUS_MAX = 10 +YIELD_MAX = 100 + + +def __gen_str_from_re(regex, star_plus_max, repeat_max, yield_max, parsed=False): + """ Recursive function to generate strings from a regex pattern. 
""" + if regex is None: + return + __groups = {} + tokens = [] + negate, last_rand = False, None + for state in (regex if parsed else re.sre_parse.parse(b(getattr(regex, "pattern", regex)))): + code = getattr(state[0], "name", state[0]).lower() + value = getattr(state[1], "name", state[1]) + value = value.lower() if isinstance(value, str) else value + if code in ["assert_not", "at"]: + continue + elif code == "any": + charset = list(printable.replace("\n", "")) + while charset[0] == last_rand and len(charset) > 1: + random.shuffle(charset) + last_rand = charset[0] + tokens.append(charset) # should be ord(x) with x belongs to [0, 256[ + elif code == "assert": + tokens.append(list(__gen_str_from_re(value[1], star_plus_max, repeat_max, yield_max, True))) + elif code == "branch": + result = [] + for r in value[1]: + result += list(__gen_str_from_re(r, star_plus_max, repeat_max, yield_max, True)) or [""] + tokens.append(result) + elif code == "category": + charset = list(CATEGORIES[value[9:]]) + if negate: + negate = False + charset = list(set(printable).difference(charset)) + while charset[0] == last_rand and len(charset) > 1: + random.shuffle(charset) + last_rand = charset[0] + tokens.append(charset) + elif code == "groupref": + tokens.extend(__groups[value]) + elif code == "in": + subtokens = list(__gen_str_from_re(value, star_plus_max, repeat_max, yield_max, True)) + subtokens = [x for l in subtokens for x in l] + tokens.append(subtokens) + elif code == "literal": + tokens.append(chr(value)) + elif code in ["max_repeat", "min_repeat"]: + start, end = value[:2] + end = min(end, star_plus_max) + start = min(start, end) + charset = list(__gen_str_from_re(value[-1], star_plus_max, repeat_max, yield_max, True)) + subtokens = [] + if start == 0 and end == 1: + subtokens.append("") + subtokens.extend(charset) + elif len(charset) ** end > repeat_max: + for i in range(min(repeat_max, 10 * len(charset))): + n = random.randint(start, end + 1) + token = "" if n == 0 else 
"".join(random.choice(charset) for i in range(n)) + if token not in subtokens: + subtokens.append(token) + else: + i -= 1 + else: + for n in range(start, end + 1): + for c in product(charset, repeat=n): + subtokens.append("".join(c)) + tokens.append(subtokens) + elif code == "negate": + negate = True + elif code == "not_literal": + charset = list(printable.replace(chr(value), "")) + while charset[0] == last_rand and len(charset) > 1: + random.shuffle(charset) + last_rand = charset[0] + tokens.append(charset) + elif code == "range": + tokens.append("".join(chr(i) for i in range(value[0], value[1] + 1))) + elif code == "subpattern": + result = list(__gen_str_from_re(value[-1], star_plus_max, repeat_max, yield_max, True)) + if value[0]: + __groups[value[0]] = result + tokens.append(result) + else: + raise NotImplementedError("Unhandled code '{}'".format(code)) + if len(tokens) == 0: + tokens = [""] + i = 0 + for result in product(*tokens): + yield "".join(result) + i += 1 + if i >= yield_max: + break + + +def _human_keys(text): + """ Sorting function for considering strings with numbers (e.g. base2, base10, base100) """ + tokens = [] + for s in re.split(r"(\d+|\D+)", text): + tokens.append(int(s) if s.isdigit() else s) + return tokens + + +def generate_string_from_regex(regex): + """ Utility function to generate a single string from a regex pattern. """ + if regex: + return list(generate_strings_from_regex(regex, yield_max=1))[0] + + +def generate_strings_from_regex(regex, star_plus_max=STAR_PLUS_MAX, repeat_max=REPEAT_MAX, yield_max=YIELD_MAX): + """ Utility function to generate strings from a regex pattern. 
""" + i = 0 + for result in __gen_str_from_re(regex, star_plus_max, repeat_max, yield_max): + yield result + + +# guess feature objects +__module_exists = lambda n: n in [x[1] for x in iter_modules()] +stopfunc = ModuleType("stopfunc", """ + Predefined stop functions + ~~~~~~~~~~~~~~~~~~~~~~~~~ + + This submodule contains stop functions for the guess feature of codext. + + - `flag`: searches for the pattern "[Ff][Ll1][Aa4@][Gg9]" (either UTF-8 or UTF-16) + - `lang_**`: checks if the given lang (any from the PROFILES_DIRECTORY of the langdetect module) is detected + - `printables`: checks that every output character is in the set of printables + - `regex`: takes one argument, the regular expression, for checking a string against the given pattern + - `text`: checks for printables and an entropy less than 4.6 (empirically determined) +""") +stopfunc.printables = lambda s: all(c in printable for c in ensure_str(s)) +stopfunc.printables.__name__ = stopfunc.printables.__qualname__ = "printables" +stopfunc.regex = lambda p: lambda s: re.search(p, ensure_str(s)) is not None +stopfunc.regex.__name__ = stopfunc.regex.__qualname__ = "regex" +stopfunc.text = lambda s: stopfunc.printables(s) and entropy(s) < 4.6 +stopfunc.text.__name__ = stopfunc.text.__qualname__ = "text" +stopfunc.flag = lambda x: re.search(r"[Ff][Ll1][Aa4@][Gg96]", ensure_str(x)) is not None +stopfunc.flag.__name__ = stopfunc.flag.__qualname__ = "flag" +stopfunc.default = stopfunc.text + +stopfunc.LANG_BACKEND = None +stopfunc.LANG_BACKENDS = [n for n in ["pycld2", "langdetect", "langid", "cld3", "textblob"] if __module_exists(n)] +if len(stopfunc.LANG_BACKENDS) > 0: + stopfunc.LANG_BACKEND = stopfunc.LANG_BACKENDS[0] +if "cld3" in stopfunc.LANG_BACKENDS: + stopfunc.CLD3_LANGUAGES = "af|am|ar|bg|bn|bs|ca|ce|co|cs|cy|da|de|el|en|eo|es|et|eu|fa|fi|fr|fy|ga|gd|gl|gu|ha|" \ + "hi|hm|hr|ht|hu|hy|id|ig|is|it|iw|ja|jv|ka|kk|km|kn|ko|ku|ky|la|lb|lo|lt|lv|mg|mi|mk|" \ + 
"ml|mn|mr|ms|mt|my|ne|nl|no|ny|pa|pl|ps|pt|ro|ru|sd|si|sk|sl|sm|sn|so|sq|sr|st|su|sv|" \ + "sw|ta|te|tg|th|tr|uk|ur|uz|vi|xh|yi|yo|zh|zu".split("|") +if "textblob" in stopfunc.LANG_BACKENDS: + stopfunc.TEXTBLOB_LANGUAGES = "af|ar|az|be|bg|bn|ca|cs|cy|da|de|el|en|eo|es|et|eu|fa|fi|fr|ga|gl|gu|hi|hr|ht|hu|" \ + "id|is|it|iw|ja|ka|kn|ko|la|lt|lv|mk|ms|mt|nl|no|pl|pt|ro|ru|sk|sl|sq|sr|sv|sw|ta|" \ + "te|th|tl|tr|uk|ur|vi|yi|zh".split("|") + + +def _detect(text): + _lb, t = stopfunc.LANG_BACKEND, ensure_str(text) + if _lb is None: + raise ValueError("No language backend %s" % ["selected", "installed"][len(stopfunc.LANG_BACKENDS) == 0]) + return langid.classify(t)[0] if _lb == "langid" else \ + langdetect.detect(t) if _lb == "langdetect" else \ + pycld2.detect(t)[2][0][1] if _lb == "pycld2" else \ + cld3.get_language(t).language[:2] if _lb == "cld3" else \ + textblob.TextBlob(t).detect_language()[:2] + + +def _lang(lang): + def _test(s): + if not stopfunc.text(s): + return False + try: + return _detect(ensure_str(s))[:2] == lang + except: + return False + return _test + + +def _load_lang_backend(backend=None): + # import the requested backend library if not imported yet + if backend is None or backend in stopfunc.LANG_BACKENDS: + stopfunc.LANG_BACKEND = backend + if backend: + globals()[backend] = __import__(backend) + else: + raise ValueError("Unsupported language detection backend") + # remove language-related stop functions + for attr in dir(stopfunc): + if attr.startswith("_") or not isinstance(getattr(stopfunc, attr), FunctionType): + continue + if re.match(r"lang_[a-z]{2}$", attr): + delattr(stopfunc, attr) + # rebind applicable language-related stop functions + if stopfunc.LANG_BACKEND: + _lb = stopfunc.LANG_BACKEND + if _lb == "langid": + langid.langid.load_model() + for lang in ( + langid.langid.identifier.nb_classes if _lb == "langid" else \ + list(set(p[:2] for p in os.listdir(langdetect.PROFILES_DIRECTORY))) if _lb == "langdetect" else \ + list(set(x[1][:2] 
for x in pycld2.LANGUAGES if x[0] in pycld2.DETECTED_LANGUAGES)) if _lb == "pycld2" else \ + stopfunc.CLD3_LANGUAGES if _lb == "cld3" else \ + stopfunc.TEXTBLOB_LANGUAGES if _lb == "textblob" else \ + []): + n = "lang_%s" % lang + setattr(stopfunc, n, _lang(lang)) + getattr(stopfunc, n).__name__ = getattr(stopfunc, n).__qualname__ = n + if LANG: + flng = "lang_%s" % LANG + if getattr(stopfunc, flng, None): + stopfunc.default = getattr(stopfunc, flng) +stopfunc._reload_lang = _load_lang_backend + + +def _validate(stop_function, lang_backend="none"): + s, lb = stop_function, lang_backend + if isinstance(s, str): + if re.match(r"lang_[a-z]{2}$", s) and lb != "none" and \ + all(re.match(r"lang_[a-z]{2}$", x) is None for x in dir(stopfunc)): + stopfunc._reload_lang(lb) + f = getattr(stopfunc, s, None) + if f: + return f + elif not isinstance(s, FunctionType): + raise ValueError("Bad stop function") + return s +stopfunc._validate = _validate + + +def __guess(prev_input, input, stop_func, depth, max_depth, min_depth, encodings, result, found=(), + stop=True, show=False, scoring_heuristic=False, extended=False, debug=False): + """ Perform a breadth-first tree search using a ranking logic to select and prune the list of codecs. 
""" + if depth > min_depth and stop_func(input): + if not stop and (show or debug) and found not in result: + s = repr(input) + s = s[2:-1] if s.startswith("b'") and s.endswith("'") else s + s = "[+] %s: %s" % (", ".join(found), s) + print(s if len(s) <= 80 else s[:77] + "...") + result[found] = input + if depth >= max_depth or len(result) > 0 and stop: + return + prev_enc = found[-1] if len(found) > 0 else "" + e = encodings.get(depth, encodings.get(-1, [])) + for new_input, encoding in __rank(prev_input, input, prev_enc, e, scoring_heuristic, extended): + if len(result) > 0 and stop: + return + if debug: + print("[*] Depth %0{}d/%d: %s".format(len(str(max_depth))) % (depth+1, max_depth, encoding)) + __guess(input, new_input, stop_func, depth+1, max_depth, min_depth, encodings, result, found + (encoding, ), + stop, show, scoring_heuristic, extended, debug) + + +def __make_encodings_dict(include, exclude): + """ Process encodings inclusion and exclusion lists, listing categories and developping codecs' lists of possible + encoding names. It also creates a cache with the CodecInfo objects for improving performance. """ + def _develop(d, keep=True): + d = d or {} + for k, v in d.items(): + l, cc, sc = [], [e for e in v if e in CODECS_CATEGORIES], [e for e in v if e not in CODECS_CATEGORIES] + # list from in-scope categories and then everything that is not a category + for enc in ((list_encodings(*cc) if (len(cc) > 0 or keep) and len(sc) == 0 else []) + sc): + g = [] + for e in (search(enc, False) or [enc]): + try: + ci = lookup(e, False) + g.extend(ci.parameters['guess']) + except: + pass + if enc in g: # e.g. "rot-1" => ["rot-1", "rot-2", ...] ; only "rot-1" is to be selected + l.append(enc) + else: # e.g. "rot" => ["rot-1", "rot-2", ...] 
; all the "rot-N" shall be selected + l.extend(g) + d[k] = list(set(l)) + return d + _excl, _incl = _develop(exclude, False), _develop(include) + return {k: [x for x in v if x not in _excl.get(k, [])] for k, v in _incl.items()} + + +def __rank(prev_input, input, prev_encoding, encodings, heuristic=False, extended=False, yield_score=False): + """ Filter valid encodings and rank them by relevance. """ + ranking = {} + for e in encodings: + try: + codec = CODECS_CACHE[e] + except KeyError: + try: + CODECS_CACHE[e] = codec = lookup(e, False) + except LookupError: + continue + t = __score(prev_input, input, prev_encoding, e, codec, heuristic, extended) + if t: + ranking[e] = t + for encoding, result in sorted(ranking.items(), key=lambda x: (-x[1][0], x[0])): + yield result if yield_score else result[1], encoding + + +class _Text(object): + __slots__ = ["entropy", "lcharset", "len", "padding", "printables", "text"] + + def __init__(self, text, pad_char=None): + self.text = ensure_str(text) + c = self.text[-1] + pad_char, last_char = (chr(pad_char), chr(c)) if isinstance(c, int) else (pad_char, c) + self.padding = pad_char is not None and last_char == pad_char + if self.padding: + text = text.rstrip(b(pad_char) if isinstance(text, bytes) else pad_char) + self.len = len(self.text) + self.lcharset = len(set(self.text)) + self.printables = float(len([c for c in self.text if c in printable])) / self.len + self.entropy = entropy(self.text) + + +def __score(prev_input, input, prev_encoding, encoding, codec, heuristic=False, extended=False): + """ Score relevant encodings given an input. 
""" + obj = None + sc = codec.parameters.get('scoring', {}) + no_error, transitive = codec.parameters.get('no_error', False), sc.get('transitive', False) + # ignore encodings that fail to decode with their default errors handling value + try: + new_input = codec.decode(input)[0] + except: + return + # ignore encodings that give an output identical to the input (identity transformation) or to the previous input + if len(new_input) == 0 or prev_input is not None and b(input) == b(new_input) or b(prev_input) == b(new_input): + return + # ignore encodings that transitively give the same output (identity transformation by chaining twice a same + # codec (e.g. rot-15 is equivalent to rot-3 and rot-12 or rot-6 and rot-9) + if transitive and prev_encoding: + ci_prev = lookup(prev_encoding, False) + if ci_prev.parameters['name'] == codec.parameters['name']: + return + # compute input's characteristics only once and only if the control flow reaches this point + pad = sc.get('padding_char') + if obj is None: + obj = _Text(input, pad) + if heuristic: + # from here, the goal (e.g. if the input is Base32) is to rank candidate encodings (e.g. 
multiple base + # codecs) so that we can put the right one as early as possible and eventually exclude bad candidates + s = -sc.get('penalty', .0) + # first, apply a bonus if the length of input text's charset is exactly the same as encoding's charset ; + # on the contrary, if the length of input text's charset is strictly greater, give a penalty + lcs = sc.get('len_charset', 256) + if isinstance(lcs, type(lambda: None)): + lcs = int(lcs(encoding)) + if (pad and obj.padding and lcs + 1 >= obj.lcharset) or lcs >= obj.lcharset: + s += max(.0, round(.6 * (.99 ** (lcs - obj.lcharset)), 5) - .1) + elif (pad and obj.padding and lcs + 1 < obj.lcharset) or lcs < obj.lcharset: + s -= .2 # this can occur for encodings with no_error set to True + # then, take padding into account, giving a bonus if padding is to be encountered and effectively present, + # or a penalty when it should not be encountered but it is present + if pad and obj.padding: + s += .2 # when padding is encountered while it is legitimate, it could be a good indication => bonus + elif not pad and obj.padding: + s -= .1 # it could arise a padding character is encountered while not being padding => small penalty + # give a bonus when the rate of printable characters is greater or equal than expected and a penalty when + # lower only for codecs that DO NOT tolerate errors (otherwise, the printables rate can be biased) + if not no_error: + pr = sc.get('printables_rate', 0) + if isinstance(pr, type(lambda: None)): + pr = float(pr(obj.printables)) + if obj.printables - pr <= .05: + s += .1 + expf = sc.get('expansion_factor', 1.) 
+ if expf: + f = obj.len / float(len(new_input)) # expansion while encoding => at decoding: 1/f + if isinstance(expf, type(lambda: None)): + try: # this case allows to consider the current encoding name from the current codec + expf = expf(f, encoding) + except TypeError: + expf = expf(f) + if isinstance(expf, (int, float)): + tmp = expf + expf = (1/f - .1 <= 1/expf <= 1/f + .1) + elif isinstance(expf, (tuple, list)) and len(expf) == 2: + expf = 1/f - expf[1] <= 1/expf[0] <= 1/f + expf[1] + s += [-1., .1][expf] + # afterwards, if the input text has an entropy close to the expected one, give a bonus weighted on the + # number of input characters to take bad entropies of shorter strings into account + entr = sc.get('entropy', lambda e: e) + entr = entr.get(encoding, entr.get('default')) if isinstance(entr, dict) else entr + if isinstance(entr, type(lambda: None)): + try: # this case allows to consider the current encoding name from the current codec + entr = entr(obj.entropy, encoding) + except TypeError: + entr = entr(obj.entropy) + if entr is not None: + # use a quadratic heuristic to compute a weight for the entropy delta, aligned on (256,.2) and (512,1) + d_entr = min(3.04575e-06 * obj.len**2 + .000394 * obj.len, 1) * abs(entr - obj.entropy) + if d_entr <= .5: + s += .5 - d_entr + # finally, if relevant, apply a custom bonus (e.g. when a regex pattern is matched) + bonus = sc.get('bonus_func') + if bonus is not None: + if isinstance(bonus, type(lambda: None)): + bonus = bonus(obj, codec, encoding) + if bonus: + s += .2 + else: + s = 1. + # exclude negative (and eventually null) scores as they are (hopefully) not relevant + if extended and s >= .0 or not extended and s > .0: + return s, new_input + + +def guess(input, stop_func=stopfunc.default, min_depth=0, max_depth=5, include=None, exclude=None, found=(), + stop=True, show=False, scoring_heuristic=True, extended=False, debug=False): + """ Try decoding without the knowledge of the encoding(s). 
+ + :param input: input text to be guessed + :param stop_func: function defining the stop condition + :param min_depth: minimum search depth + :param max_depth: maximum search depth + ;param include: inclusion item OR list with category, codec or encoding names OR dictionary with lists per + depth (nothing means include every encoding) + :param exclude: exclusion item OR list with category, codec or encoding names OR dictionary with lists per + depth (nothing means exclude no encoding) + :param found: tuple of already found encodings + :param stop: whether to stop or not when a valid solution is found + :param show: whether to immediately show once a solution is found + :param scoring_heuristic: whether to apply the scoring heuristic during the search (if disabled, all scores are 1., + meaning that every non-failing encoding will be considered with no order of precedence) + :param extended: whether to also consider null scores with the heuristic + :param debug: whether to show each attempt at each depth during computation + """ + if len(input) == 0: + return "" + # check for min and max depths + if max_depth <= 0: + raise ValueError("Depth must be a non-null positive integer") + if min_depth > max_depth: + raise ValueError("Min depth shall be less than or equal to the max depth") + # take the tuple of found encodings into account + if len(found) > 0: + for encoding in found: + input = decode(input, encoding) + # handle the stop function as a regex if a string was given + if isinstance(stop_func, str): + stop_func = stopfunc.regex(stop_func) + # reformat include and exclude arguments ; supported formats: + for n, l in zip(["inc", "exc"], [include, exclude]): + if l is None: + if n == "inc": + include = l = {-1: CODECS_CATEGORIES} + else: + exclude = l = {} + # "category" OR "enc_name" OR whatever => means a single item for all depths + if isinstance(l, str): + if n == "inc": + include = l = {-1: [l]} + else: + exclude = l = {-1: [l]} + # ["enc_name1", "enc_name2", 
...] => means for all depths + if isinstance(l, (list, tuple)): + if n == "inc": + include = l = {-1: l} + else: + exclude = l = {-1: l} + # {-1: [...], 2: [...], ...} => means prefedined depths with their lists of in-/excluded encodings + if not isinstance(l, dict) or not all(isinstance(k, int) for k in l.keys()): + raise ValueError("Include argument shall be a list or a dictionary with integer keys") + # precompute encodings lists per depth and cache the related CodecInfo objects + encodings, result = __make_encodings_dict(include, exclude), {} + try: + # breadth-first search + for d in range(max_depth): + __guess("", input, stop_func, 0, d+1, min_depth, encodings, result, tuple(found), stop, show, + scoring_heuristic, extended, debug) + if stop and len(result) > 0: + break + except KeyboardInterrupt: + pass + CODECS_CACHE = {} + return result +codecs.guess = guess + + +def rank(input, extended=False, limit=-1, include=None, exclude=None): + """ Rank the most probable encodings based on the given input. + + :param input: input text to be evaluated + :param extended: whether to consider null scores too (NB: negative scores are not output !) + :param limit: number of encodings to be returned (-1 means all of them) + :param include: inclusion list with category, codec or encoding names (nothing means include every encoding) + :param exclude: exclusion list with category, codec or encoding names (nothing means exclude no encoding) + """ + encodings = __make_encodings_dict(include if isinstance(include, dict) else {-1: include or CODECS_CATEGORIES}, + exclude if isinstance(exclude, dict) else {-1: exclude or []}) + r = list(__rank(None, input, "", encodings[-1], True, extended, True)) + return r[:limit] if len(r) > 1 else r +codecs.rank = rank + diff --git a/src/codext/__init__.py b/src/codext/__init__.py index f95abb8..67d6b5a 100644 --- a/src/codext/__init__.py +++ b/src/codext/__init__.py @@ -1,255 +1,257 @@ -# -*- coding: UTF-8 -*- -"""Codecs extension module. 
- -""" -from __future__ import print_function -from _codecs import lookup as orig_lookup -from ast import literal_eval -from six import binary_type, text_type - -from .__common__ import * -from .__info__ import __author__, __copyright__, __email__, __license__, __source__, __version__ - - -__all__ = ["add", "add_map", "clear", "decode", "encode", "guess", "lookup", "open", "rank", "register", "remove", - "reset"] - -decode = codecs.decode -encode = codecs.encode -guess = codecs.guess -lookup = codecs.lookup -open = codecs.open - -_lst = list -list = list_encodings # not included in __all__ because of shadow name - - -reset() - - -def __format_list(items, include=True): - if items is None: - return - d = {-1: list_encodings() if include else []} - for n, i in enumerate(items): - try: - depth, i = i.split(":") - depth = int(depth.strip().replace("~", "-")) - if depth < 0: - depth = -1 - except ValueError: - if n == 0: - d[-1] = [] - depth = -1 - d.setdefault(depth, []) - d[depth].append(i.strip()) - return d - - -def __print_tabular(lst, space=4): - try: - cols, _ = os.get_terminal_size() - # first, convert the list to a table that fits into the terminal - i, line, w = 0, "", [] - while i < len(lst): - x = lst[i] - l = len(x) - col = "%-{}s".format(l + space) % x - i += 1 - w.append(l) - if len(line) + len(col) > cols: - break - line += col - while True: - t = [lst[j:j+i] for j in range(0, len(lst), i)] - w = [max(0 if j+k*i >= len(lst) else len(lst[j+k*i]) for k in range(len(t))) for j, _ in enumerate(w)] - if sum(w) + space * len(w) >= cols: - i -= 1 - w.pop() - else: - break - print("\n".join("".join("%-{}s".format(w[n] + space) % x for n, x in enumerate(r)) for r in t) + "\n") - except (AttributeError, OSError): - print(", ".join(lst) + "\n") - - -def main(): - import argparse, os - - class _CustomFormatter(argparse.RawTextHelpFormatter): - def __init__(self, prog, **kwargs): - kwargs['max_help_position'] = 32 - super(_CustomFormatter, self).__init__(prog, 
**kwargs) - - def _format_action_invocation(self, action): - if not action.option_strings: - metavar, = self._metavar_formatter(action, action.dest)(1) - return metavar - else: - return ", ".join(action.option_strings) - - descr = "Codecs Extension (CodExt) {}\n\nAuthor : {} ({})\nCopyright: {}\nLicense : {}\nSource : {}\n" \ - "\nThis tool allows to encode/decode input strings/files with an extended set of codecs.\n\n" \ - .format(__version__, __author__, __email__, __copyright__, __license__, __source__) - examples = "usage examples:\n- " + "\n- ".join([ - "codext search bitcoin", - "codext decode base32 -i file.b32", - "codext encode morse < to_be_encoded.txt", - "echo \"test\" | codext encode base100", - "echo -en \"test\" | codext encode braille -o test.braille", - "codext encode base64 < to_be_encoded.txt > text.b64", - "echo -en \"test\" | codext encode base64 | codext encode base32", - "echo -en \"mrdvm6teie6t2cq=\" | codext encode upper | codext decode base32 | codext decode base64", - "echo -en \"test\" | codext encode upper reverse base32 | codext decode base32 reverse lower", - "echo -en \"test\" | codext encode upper reverse base32 base64 morse", - "echo -en \"test\" | codext encode base64 gzip | codext guess", - "echo -en \"test\" | codext encode base64 gzip | codext guess gzip -c base", - ]) - kw = {'formatter_class': _CustomFormatter} - parser = argparse.ArgumentParser(description=descr, epilog=examples, **kw) - kw2 = {'required': True} if PY3 else {} - sparsers = parser.add_subparsers(dest="command", help="command to be executed", **kw2) - parser.add_argument("-i", "--input-file", dest="infile", help="input file (if none, take stdin as input)") - parser.add_argument("-o", "--output-file", dest="outfile", help="output file (if none, display result to stdout)") - parser.add_argument("-s", "--strip-newlines", action="store_true", dest="strip", - help="strip newlines from input (default: False)") - encode = sparsers.add_parser("encode", help="encode 
input using the specified codecs", **kw) - encode.add_argument("encoding", nargs="+", help="list of encodings to apply") - encode.add_argument("-e", "--errors", default="strict", choices=["ignore", "leave", "replace", "strict"], - help="error handling (default: strict)") - decode = sparsers.add_parser("decode", help="decode input using the specified codecs", **kw) - decode.add_argument("encoding", nargs="+", help="list of encodings to apply") - decode.add_argument("-e", "--errors", default="strict", choices=["ignore", "leave", "replace", "strict"], - help="error handling (default: strict)") - guess = sparsers.add_parser("guess", help="try guessing the decoding codecs", **kw) - guess.add_argument("encoding", nargs="*", help="list of known encodings to apply (default: none)") - guess.add_argument("-e", "--exclude", nargs="*", action="extend", metavar="CAT|COD|ENC", - help="categories, codecs and encodings to be explicitely not used ;\n " - "format: [category|codec|encoding] OR depth:[category|codec|encoding]") - guess.add_argument("-E", "--extended", action="store_true", - help="while using the scoring heuristic, also consider null scores (default: False)") - lng = "lang_%s" % LANG - def_func = lng if getattr(stopfunc, lng, None) else "text" - guess.add_argument("-f", "--stop-function", default=def_func, metavar="FUNC", help="result checking function " - "(default: %s) ; format: printables|text|flag|lang_[bigram]|[regex]\nNB: [regex] is case-" - "sensitive ; add -i to force it as case-insensitive or add '(?i)' in front of the expression" - % def_func) - guess.add_argument("-H", "--no-heuristic", action="store_true", help="DO NOT use the scoring heuristic ; slows down" - " the search but may be more accurate (default: False)") - guess.add_argument("-i", "--include", nargs="*", action="extend", metavar="CAT|COD|ENC", - help="categories, codecs and encodings to be explicitely used ;\n " - "format: [category|codec|encoding] OR depth:[category|codec|encoding]") - 
guess.add_argument("-I", "--case-insensitive", dest="icase", action="store_true", - help="while using the regex stop function, set it as case-insensitive (default: False)") - if len(stopfunc.LANG_BACKENDS) > 0: - _lb = stopfunc.LANG_BACKEND - guess.add_argument("-l", "--lang-backend", default=_lb, choices=stopfunc.LANG_BACKENDS + ["none"], - help="natural language detection backend (default: %s)" % _lb) - guess.add_argument("-m", "--min-depth", default=0, type=int, metavar="INT", - help="minimum codec search depth before triggering results (default: 0)") - guess.add_argument("-M", "--max-depth", default=5, type=int, metavar="INT", - help="maximum codec search depth (default: 5)") - guess.add_argument("-s", "--do-not-stop", action="store_true", - help="do not stop if a valid output is found (default: False)") - guess.add_argument("-v", "--verbose", action="store_true", - help="show guessing information and steps (default: False)") - rank = sparsers.add_parser("rank", help="rank the most probable encodings based on the given input", **kw) - rank.add_argument("-e", "--exclude", nargs="*", action="extend", metavar="CAT|COD|ENC", - help="categories, codecs and encodings to be explicitely not used ;\n " - "format: [category|codec|encoding] OR depth:[category|codec|encoding]") - rank.add_argument("-E", "--extended", action="store_true", - help="while using the scoring heuristic, also consider null scores (default: False)") - rank.add_argument("-i", "--include", nargs="*", action="extend", metavar="CAT|COD|ENC", - help="categories, codecs and encodings to be explicitely used ;\n " - "format: [category|codec|encoding] OR depth:[category|codec|encoding]") - rank.add_argument("-l", "--limit", type=int, default=10, help="limit the number of displayed results") - search = sparsers.add_parser("search", help="search for codecs") - search.add_argument("pattern", nargs="+", help="encoding pattern to search") - listi = sparsers.add_parser("list", help="list items") - lsparsers = 
listi.add_subparsers(dest="type", help="type of item to be listed", **kw2) - liste = lsparsers.add_parser("encodings", help="list encodings") - liste.add_argument("category", nargs="+", help="selected categories") - listm = lsparsers.add_parser("macros", help="list macros") - addm = sparsers.add_parser("add-macro", help="add a macro to the registry") - addm.add_argument("name", help="macro's name") - addm.add_argument("encoding", nargs="+", help="list of encodings to chain") - remm = sparsers.add_parser("remove-macro", help="remove a macro from the registry") - remm.add_argument("name", help="macro's name") - args = parser.parse_args() - if args.command in ["guess", "rank"]: - args.include, args.exclude = __format_list(args.include), __format_list(args.exclude, False) - try: - # if a search pattern is given, only handle it - if args.command == "search": - results = [] - for enc in args.pattern: - results.extend(codecs.search(enc)) - print(", ".join(results) or "No encoding found") - return 0 - # add/remove macros (not requiring to input a file or text) - elif args.command == "add-macro": - add_macro(args.name, *args.encoding) - return 0 - elif args.command == "remove-macro": - remove_macro(args.name) - return 0 - # list encodings or macros - elif args.command == "list": - if args.type == "encodings": - cats = args.category or list_categories() - for c in sorted(cats): - l = list_encodings(c) - if len(l) > 0: - if len(cats) > 0: - print(c.upper() + ":") - __print_tabular(l) - elif args.type == "macros": - l = list_macros() - if len(l) > 0: - __print_tabular(l) - return 0 - # handle input file or stdin - c =_input(args.infile) - c = c.rstrip("\r\n") if isinstance(c, str) else c.rstrip(b"\r\n") - # strip any other (CR)LF - if args.strip: - c = re.sub(r"\r?\n", "", c) if isinstance(c, str) else c.replace(b"\r\n", b"").replace(b"\n", b"") - if args.command in ["decode", "encode"]: - # encode or decode - for encoding in args.encoding: - c = getattr(codecs, ["encode", 
"decode"][args.command == "decode"])(c, encoding, args.errors) - # handle output file or stdout - if args.outfile: - with open(args.outfile, 'wb') as f: - f.write(c) - else: - print(ensure_str(c or "Could not %scode :-(" % ["en", "de"][args.command == "decode"]), end="") - elif args.command == "guess": - s, lb = args.stop_function, args.lang_backend - if re.match(r"lang_[a-z]{2}$", s) and lb != "none" and \ - all(re.match(r"lang_[a-z]{2}$", x) is None for x in dir(stopfunc)): - stopfunc._reload_lang(lb) - r = codecs.guess(c, - getattr(stopfunc, s, ["", "(?i)"][args.icase] + s), args.min_depth, args.max_depth, - args.include, args.exclude, args.encoding, not args.do_not_stop, True, # show - not args.no_heuristic, args.extended, args.verbose) - for i, o in enumerate(r.items()): - e, out = o - if len(e) > 0: - if args.outfile: - n, ext = os.path.splitext(args.outfile) - fn = args.outfile if len(r) == 1 else "%s-%d%s" % (n, i+1, ext) - else: - print("Codecs: %s" % ", ".join(e)) - print(ensure_str(out)) - if len(r) == 0: - print("Could not decode :-(") - elif args.command == "rank": - for i, e in codecs.rank(c, args.extended, args.limit, args.include, args.exclude): - s = "[+] %.5f: %s" % (i[0], e) - print(s if len(s) <= 80 else s[:77] + "...") - except Exception as e: - raise e - m = str(e) - print("codext: " + m[0].lower() + m[1:]) - +# -*- coding: UTF-8 -*- +"""Codecs extension module. 
+ +""" +from .__common__ import * +from .__info__ import __author__, __copyright__, __email__, __license__, __source__, __version__ + + +__all__ = ["add", "add_map", "clear", "decode", "encode", "guess", "lookup", "open", "rank", "register", "remove", + "reset"] + +decode = codecs.decode +encode = codecs.encode +guess = codecs.guess +lookup = codecs.lookup +open = codecs.open + +_lst = list +list = list_encodings # not included in __all__ because of shadow name + + +reset() + + +# populate codext with attributes from codecs that were not modified +for attr in codecs.__all__: + if attr in __all__: + continue + locals()[attr] = getattr(codecs, attr) + __all__.append(attr) + + +def __format_list(items, include=True): + if items is None: + return + d = {-1: list_encodings() if include else []} + for n, i in enumerate(items): + try: + depth, i = i.split(":") + depth = int(depth.strip().replace("~", "-")) + if depth < 0: + depth = -1 + except ValueError: + if n == 0: + d[-1] = [] + depth = -1 + d.setdefault(depth, []) + d[depth].append(i.strip()) + return d + + +def __print_tabular(lst, space=4): + try: + cols, _ = os.get_terminal_size() + # first, convert the list to a table that fits into the terminal + i, line, w = 0, "", [] + while i < len(lst): + x = lst[i] + l = len(x) + col = "%-{}s".format(l + space) % x + i += 1 + w.append(l) + if len(line) + len(col) > cols: + break + line += col + while True: + t = [lst[j:j+i] for j in range(0, len(lst), i)] + w = [max(0 if j+k*i >= len(lst) else len(lst[j+k*i]) for k in range(len(t))) for j, _ in enumerate(w)] + if sum(w) + space * len(w) >= cols: + i -= 1 + w.pop() + else: + break + print("\n".join("".join("%-{}s".format(w[n] + space) % x for n, x in enumerate(r)) for r in t) + "\n") + except (AttributeError, OSError): + print(", ".join(lst) + "\n") + + +def main(): + import argparse, os + + class _CustomFormatter(argparse.RawTextHelpFormatter): + def __init__(self, prog, **kwargs): + kwargs['max_help_position'] = 32 + 
super(_CustomFormatter, self).__init__(prog, **kwargs) + + def _format_action_invocation(self, action): + if not action.option_strings: + metavar, = self._metavar_formatter(action, action.dest)(1) + return metavar + else: + return ", ".join(action.option_strings) + + descr = "Codecs Extension (CodExt) {}\n\nAuthor : {} ({})\nCopyright: {}\nLicense : {}\nSource : {}\n" \ + "\nThis tool allows to encode/decode input strings/files with an extended set of codecs.\n\n" \ + .format(__version__, __author__, __email__, __copyright__, __license__, __source__) + examples = "usage examples:\n- " + "\n- ".join([ + "codext search bitcoin", + "codext decode base32 -i file.b32", + "codext encode morse < to_be_encoded.txt", + "echo \"test\" | codext encode base100", + "echo -en \"test\" | codext encode braille -o test.braille", + "codext encode base64 < to_be_encoded.txt > text.b64", + "echo -en \"test\" | codext encode base64 | codext encode base32", + "echo -en \"mrdvm6teie6t2cq=\" | codext encode upper | codext decode base32 | codext decode base64", + "echo -en \"test\" | codext encode upper reverse base32 | codext decode base32 reverse lower", + "echo -en \"test\" | codext encode upper reverse base32 base64 morse", + "echo -en \"test\" | codext encode base64 gzip | codext guess", + "echo -en \"test\" | codext encode base64 gzip | codext guess gzip -c base", + ]) + kw = {'formatter_class': _CustomFormatter} + parser = argparse.ArgumentParser(description=descr, epilog=examples, **kw) + sparsers = parser.add_subparsers(dest="command", help="command to be executed", required=True) + parser.add_argument("-i", "--input-file", dest="infile", help="input file (if none, take stdin as input)") + parser.add_argument("-o", "--output-file", dest="outfile", help="output file (if none, display result to stdout)") + parser.add_argument("-s", "--strip-newlines", action="store_true", dest="strip", + help="strip newlines from input (default: False)") + encode = sparsers.add_parser("encode", 
help="encode input using the specified codecs", **kw) + encode.add_argument("encoding", nargs="+", help="list of encodings to apply") + encode.add_argument("-e", "--errors", default="strict", choices=["ignore", "leave", "replace", "strict"], + help="error handling (default: strict)") + decode = sparsers.add_parser("decode", help="decode input using the specified codecs", **kw) + decode.add_argument("encoding", nargs="+", help="list of encodings to apply") + decode.add_argument("-e", "--errors", default="strict", choices=["ignore", "leave", "replace", "strict"], + help="error handling (default: strict)") + guess = sparsers.add_parser("guess", help="try guessing the decoding codecs", **kw) + guess.add_argument("encoding", nargs="*", help="list of known encodings to apply (default: none)") + guess.add_argument("-e", "--exclude", nargs="*", action="extend", metavar="CAT|COD|ENC", + help="categories, codecs and encodings to be explicitely not used ;\n " + "format: [category|codec|encoding] OR depth:[category|codec|encoding]") + guess.add_argument("-E", "--extended", action="store_true", + help="while using the scoring heuristic, also consider null scores (default: False)") + lng = "lang_%s" % LANG + def_func = lng if getattr(stopfunc, lng, None) else "text" + guess.add_argument("-f", "--stop-function", default=def_func, metavar="FUNC", help="result checking function " + "(default: %s) ; format: printables|text|flag|lang_[bigram]|[regex]\nNB: [regex] is case-" + "sensitive ; add -i to force it as case-insensitive or add '(?i)' in front of the expression" + % def_func) + guess.add_argument("-H", "--no-heuristic", action="store_true", help="DO NOT use the scoring heuristic ; slows down" + " the search but may be more accurate (default: False)") + guess.add_argument("-i", "--include", nargs="*", action="extend", metavar="CAT|COD|ENC", + help="categories, codecs and encodings to be explicitely used ;\n " + "format: [category|codec|encoding] OR 
depth:[category|codec|encoding]") + guess.add_argument("-I", "--case-insensitive", dest="icase", action="store_true", + help="while using the regex stop function, set it as case-insensitive (default: False)") + if len(stopfunc.LANG_BACKENDS) > 0: + _lb = stopfunc.LANG_BACKEND + guess.add_argument("-l", "--lang-backend", default=_lb, choices=stopfunc.LANG_BACKENDS + ["none"], + help="natural language detection backend (default: %s)" % _lb) + guess.add_argument("-m", "--min-depth", default=0, type=int, metavar="INT", + help="minimum codec search depth before triggering results (default: 0)") + guess.add_argument("-M", "--max-depth", default=5, type=int, metavar="INT", + help="maximum codec search depth (default: 5)") + guess.add_argument("-s", "--do-not-stop", action="store_true", + help="do not stop if a valid output is found (default: False)") + guess.add_argument("-v", "--verbose", action="store_true", + help="show guessing information and steps (default: False)") + rank = sparsers.add_parser("rank", help="rank the most probable encodings based on the given input", **kw) + rank.add_argument("-e", "--exclude", nargs="*", action="extend", metavar="CAT|COD|ENC", + help="categories, codecs and encodings to be explicitely not used ;\n " + "format: [category|codec|encoding] OR depth:[category|codec|encoding]") + rank.add_argument("-E", "--extended", action="store_true", + help="while using the scoring heuristic, also consider null scores (default: False)") + rank.add_argument("-i", "--include", nargs="*", action="extend", metavar="CAT|COD|ENC", + help="categories, codecs and encodings to be explicitely used ;\n " + "format: [category|codec|encoding] OR depth:[category|codec|encoding]") + rank.add_argument("-l", "--limit", type=int, default=10, help="limit the number of displayed results") + search = sparsers.add_parser("search", help="search for codecs") + search.add_argument("pattern", nargs="+", help="encoding pattern to search") + listi = sparsers.add_parser("list", 
help="list items") + lsparsers = listi.add_subparsers(dest="type", help="type of item to be listed", required=True) + liste = lsparsers.add_parser("encodings", help="list encodings") + liste.add_argument("category", nargs="+", help="selected categories") + listm = lsparsers.add_parser("macros", help="list macros") + addm = sparsers.add_parser("add-macro", help="add a macro to the registry") + addm.add_argument("name", help="macro's name") + addm.add_argument("encoding", nargs="+", help="list of encodings to chain") + remm = sparsers.add_parser("remove-macro", help="remove a macro from the registry") + remm.add_argument("name", help="macro's name") + args = parser.parse_args() + if args.command in ["guess", "rank"]: + args.include, args.exclude = __format_list(args.include), __format_list(args.exclude, False) + try: + # if a search pattern is given, only handle it + if args.command == "search": + results = [] + for enc in args.pattern: + results.extend(codecs.search(enc)) + print(", ".join(results) or "No encoding found") + return 0 + # add/remove macros (not requiring to input a file or text) + elif args.command == "add-macro": + add_macro(args.name, *args.encoding) + return 0 + elif args.command == "remove-macro": + remove_macro(args.name) + return 0 + # list encodings or macros + elif args.command == "list": + if args.type == "encodings": + cats = args.category or list_categories() + for c in sorted(cats): + l = list_encodings(c) + if len(l) > 0: + if len(cats) > 0: + print(c.upper() + ":") + __print_tabular(l) + elif args.type == "macros": + l = list_macros() + if len(l) > 0: + __print_tabular(l) + return 0 + # handle input file or stdin + c =_input(args.infile) + c = c.rstrip("\r\n") if isinstance(c, str) else c.rstrip(b"\r\n") + # strip any other (CR)LF + if args.strip: + c = re.sub(r"\r?\n", "", c) if isinstance(c, str) else c.replace(b"\r\n", b"").replace(b"\n", b"") + if args.command in ["decode", "encode"]: + # encode or decode + for encoding in 
args.encoding: + c = getattr(codecs, ["encode", "decode"][args.command == "decode"])(c, encoding, args.errors) + # handle output file or stdout + if args.outfile: + with open(args.outfile, 'wb') as f: + f.write(c) + else: + print(ensure_str(c or "Could not %scode :-(" % ["en", "de"][args.command == "decode"]), end="") + elif args.command == "guess": + s, lb = args.stop_function, args.lang_backend + if re.match(r"lang_[a-z]{2}$", s) and lb != "none" and \ + all(re.match(r"lang_[a-z]{2}$", x) is None for x in dir(stopfunc)): + stopfunc._reload_lang(lb) + r = codecs.guess(c, + getattr(stopfunc, s, ["", "(?i)"][args.icase] + s), args.min_depth, args.max_depth, + args.include, args.exclude, args.encoding, not args.do_not_stop, True, # show + not args.no_heuristic, args.extended, args.verbose) + for i, o in enumerate(r.items()): + e, out = o + if len(e) > 0: + if args.outfile: + n, ext = os.path.splitext(args.outfile) + fn = args.outfile if len(r) == 1 else "%s-%d%s" % (n, i+1, ext) + else: + print("Codecs: %s" % ", ".join(e)) + print(ensure_str(out)) + if len(r) == 0: + print("Could not decode :-(") + elif args.command == "rank": + for i, e in codecs.rank(c, args.extended, args.limit, args.include, args.exclude): + s = "[+] %.5f: %s" % (i[0], e) + print(s if len(s) <= 80 else s[:77] + "...") + except Exception as e: + raise e + m = str(e) + print("codext: " + m[0].lower() + m[1:]) + diff --git a/src/codext/base/_base.py b/src/codext/base/_base.py index fce8b9a..27a31e3 100755 --- a/src/codext/base/_base.py +++ b/src/codext/base/_base.py @@ -1,291 +1,290 @@ -# -*- coding: UTF-8 -*- -"""Generic baseN functions. 
- -""" -from argparse import ArgumentParser, RawTextHelpFormatter -from math import log -from six import integer_types, string_types -from string import ascii_lowercase as lower, ascii_uppercase as upper, digits, printable -from textwrap import wrap as wraptext -from types import FunctionType, MethodType - -from ..__common__ import * -from ..__common__ import _set_exc -from ..__info__ import __version__ - - -_set_exc("BaseError") -_set_exc("BaseEncodeError") -_set_exc("BaseDecodeError") -""" -Curve fitting: - ->>> import matplotlib.pyplot as plt ->>> import pandas as pd ->>> import scipy.optimize ->>> from statistics import mean ->>> from tinyscript import random ->>> x, y = [], [] ->>> for i in range(2, 256): - v = [] - for j in range(16, 2048, 16): - s = random.randstr(j) - v.append(float(len(codext.encode(s, "base%d-generic" % i))) / len(s)) - x.append(i) - y.append(mean(v)) ->>> data = pd.DataFrame({'base': x, 'expf': y}) ->>> def fit(x, y, func, params): - params, cv = scipy.optimize.curve_fit(func, x, y, params) - print(params) - y2 = func(x, *params) - plt.clf() - plt.plot(x, y, ".", color="blue", alpha=.3) - plt.plot(x, y2, color="red", linewidth=3.0) - plt.show() ->>> fit(data['base'], data['expf'], lambda x, a, b, c, d: a / (x**b + c) + d, (1, 1, 1, 1)) -[ 0.02841434 0.00512664 -0.99999984 0.01543879] ->>> fit(data['base'], data['expf'], lambda x, a, b, c, d: a / (x**b + c) + d, (.028, .005, -1, .015)) -[ 0.02827357 0.00510124 -0.99999984 0.01536941] -""" -EXPANSION_FACTOR = lambda base: 0.02827357 / (base**0.00510124-0.99999984) + 0.01536941 -SIZE_LIMIT = 1024 * 1024 * 1024 - - -def _generate_charset(n): - """ Generate a characters set. - - :param n: size of charset - """ - if 1 < n <= len(printable): - return printable[:n] - elif len(printable) < n < 256: - return "".join(chr(i) for i in range(n)) - raise ValueError("Bad size of character set") - - -def _get_charset(charset, p=""): - """ Characters set selection function. 
It allows to define charsets in many different ways. - - :param charset: charset object, can be a string (the charset itself), a function (that chooses the right charset - depending on the input parameter) or a dictionary (either by exact key or by pattern matching) - :param p: the parameter for choosing the charset - """ - # case 1: charset is a function, so return its result - if isinstance(charset, FunctionType): - return charset(p) - # case 2: charset is a string, so return it - elif isinstance(charset, string_types): - return charset - # case 3: charset is a dict with keys '' and 'inv', typically for a charset using lowercase and uppercase characters - # that can be inverted - elif isinstance(charset, dict) and list(charset.keys()) == ["", "inv"]: - return charset["inv" if re.match(r"[-_]inv(erted)?$", p) else ""] - # case 4: charset is a dict, but not with the specific keys '' and 'inv', so consider it as pattern-charset pairs - elif isinstance(charset, dict): - # try to handle [p]arameter as a simple key - try: - return charset[p] - except KeyError: - pass - # or handle [p]arameter as a pattern - default, n, best = None, None, None - for pattern, cset in charset.items(): - n = len(cset) - if re.match(pattern, ""): - default = cset - continue - m = re.match(pattern, p) - if m: # find the longest match from the patterns - s, e = m.span() - if e - s > len(best or ""): - best = pattern - if best: - return charset[best] - # special case: the given [p]arameter can be the charset itself if it has the right length - p = re.sub(r"^[-_]+", "", p) - if len(p) == n: - return p - # or simply rely on key '' - if default is not None: - return default - raise ValueError("Bad charset descriptor ('%s')" % p) - - -# generic base en/decoding functions -def base_encode(input, charset, errors="strict", exc=BaseEncodeError): - """ Base-10 to base-N encoding. 
- - :param input: input (str or int) to be decoded - :param charset: base-N characters set - :param errors: errors handling marker - :param exc: exception to be raised in case of error - """ - i, n, r = input if isinstance(input, integer_types) else s2i(input), len(charset), "" - if n == 1: - if i > SIZE_LIMIT: - raise InputSizeLimitError("Input exceeded size limit") - return i * charset[0] - if n == 10: - return str(i) if charset == digits else "".join(charset[int(x)] for x in str(i)) - while i > 0: - i, c = divmod(i, n) - r = charset[c] + r - return r - - -def base_decode(input, charset, errors="strict", exc=BaseDecodeError): - """ Base-N to base-10 decoding. - - :param input: input to be decoded - :param charset: base-N characters set - :param errors: errors handling marker - :param exc: exception to be raised in case of error - """ - i, n, dec = 0, len(charset), lambda n: base_encode(n, [chr(x) for x in range(256)], errors, exc) - if n == 1: - return i2s(len(input)) - if n == 10: - return i2s(int(input)) if charset == digits else "".join(str(charset.index(c)) for c in input) - for k, c in enumerate(input): - try: - i = i * n + charset.index(c) - except ValueError: - handle_error("base", errors, exc, decode=True)(c, k, dec(i), "base%d" % n) - return dec(i) - - -# base codec factory functions -def base(charset, pattern, pow2=False, encode_template=base_encode, decode_template=base_decode, name=None, **kwargs): - """ Base-N codec factory. 
- - :param charset: charset selection function - :param pattern: matching pattern for the codec name (first capturing group is used as the parameter for selecting - the charset) - :param pow2: whether the base codec's N is a power of 2 - """ - cs = _get_charset(charset) - n = len(cs) - nb = log(n, 2) - if pow2 and nb != int(nb): - raise BaseError("Bad charset ; {} is not a power of 2".format(n)) - - def encode(param="", *args): - a = _get_charset(charset, args[0] if len(args) > 0 and args[0] else param) - def _encode(input, errors="strict"): - if len(input) == 0: - return "", 0 - return encode_template(input, a, errors), len(input) - return _encode - - def decode(param="", *args): - a = _get_charset(charset, args[0] if len(args) > 0 and args[0] else param) - sl, sc = "\n" not in a, "\n" not in a and not "\r" in a - def _decode(input, errors="strict"): - if len(input) == 0: - return "", 0 - input = _stripl(input, sc, sl) - return decode_template(input, a, errors), len(input) - return _decode - - kwargs['len_charset'] = n - kwargs['printables_rate'] = float(len([c for c in cs if c in printable])) / len(cs) - kwargs['expansion_factor'] = kwargs.pop('expansion_factor', (EXPANSION_FACTOR(n), .05)) - n = "base{}".format(n) if name is None else name - try: - g = [n, n + "-inv"] if "[-_]inv(erted)?$" in charset.keys() else [n] - except AttributeError: - g = [n] - kwargs['guess'] = kwargs.get('guess', g) - add(n, encode, decode, pattern, entropy=nb, **kwargs) - - -def base_generic(): - """ Base-N generic codec. 
""" - def encode(n): - a = _generate_charset(int(n)) - def _encode(input, errors="strict"): - return base_encode(input, a, errors), len(input) - return _encode - - def decode(n): - a = _generate_charset(int(n)) - sl, sc = "\n" not in a, "\n" not in a and not "\r" in a - def _decode(input, errors="strict"): - input = _stripl(input, sc, sl) - return base_decode(input, a, errors), len(input) - return _decode - - add("base", encode, decode, r"^base[-_]?([2-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])(?:[-_]generic)?$", - guess=["base%d-generic" % i for i in range(2, 255)], entropy=lambda e, n: log(int(n.split("-")[0][4:]), 2), - len_charset=lambda n: int(n.split("-")[0][4:]), printables_rate=1., category="base-generic", penalty=.4, - expansion_factor=lambda f, n: (EXPANSION_FACTOR(int(n.split("-")[0][4:])), .05)) - - -def main(n, ref=None, alt=None, inv=True, swap=True, wrap=True): - base = str(n) + ("-" + alt.lstrip("-") if alt else "") - src = "The data are encoded as described for the base%(base)s alphabet in %(reference)s.\n" % \ - {'base': base, 'reference': "\n" + ref if len(ref) > 20 else ref} if ref else "" - text = "%(source)sWhen decoding, the input may contain newlines in addition to the bytes of the formal base" \ - "%(base)s alphabet. Use --ignore-garbage to attempt to recover from any other non-alphabet bytes in the" \ - " encoded stream." % {'base': base, 'source': src} - text = "\n".join(x for x in wraptext(text, 74)) - descr = """Usage: base%(base)s [OPTION]... [FILE] -Base%(base)s encode or decode FILE, or standard input, to standard output. - -With no FILE, or when FILE is -, read standard input. - -Mandatory arguments to long options are mandatory for short options too. 
- -d, --decode decode data - -i, --ignore-garbage when decoding, ignore non-alphabet characters -%(inv)s%(swap)s%(wrap)s - - --help display this help and exit - --version output version information and exit - -%(text)s - -Report base%(base)s translation bugs to -Full documentation at: -""" % {'base': base, 'text': text, - 'inv': ["", " -I, --invert invert charsets from the base alphabet (e.g. digits and letters)\n"][inv], - 'swap': ["", " -s, --swapcase swap the case\n"][swap], - 'wrap': ["", " -w, --wrap=COLS wrap encoded lines after COLS character (default 76).\n"+ 26 * " " + \ - "Use 0 to disable line wrapping"][wrap]} - - def _main(): - p = ArgumentParser(description=descr, formatter_class=RawTextHelpFormatter, add_help=False) - p.format_help = MethodType(lambda s: s.description, p) - p.add_argument("file", nargs="?") - p.add_argument("-d", "--decode", action="store_true") - p.add_argument("-i", "--ignore-garbage", action="store_true") - if inv: - p.add_argument("-I", "--invert", action="store_true") - if swap: - p.add_argument("-s", "--swapcase", action="store_true") - if wrap: - p.add_argument("-w", "--wrap", type=int, default=76) - p.add_argument("--help", action="help") - p.add_argument("--version", action="version") - p.version = "CodExt " + __version__ - args = p.parse_args() - if args.decode: - args.wrap = 0 - args.invert = getattr(args, "invert", False) - c, f = _input(args.file), [encode, decode][args.decode] - if swap and args.swapcase and args.decode: - c = codecs.decode(c, "swapcase") - c = b(c).rstrip(b"\r\n") - try: - c = f(c, "base" + base + ["", "-inv"][getattr(args, "invert", False)], - ["strict", "ignore"][args.ignore_garbage]) - except Exception as err: - print("%sbase%s: invalid input" % (getattr(err, "output", ""), base)) - return 1 - c = ensure_str(c) - if swap and args.swapcase and not args.decode: - c = codecs.encode(c, "swapcase") - for l in (wraptext(c, args.wrap) if args.wrap > 0 else [c]) if wrap else c.split("\n"): - print(l) - 
return 0 - return _main - +# -*- coding: UTF-8 -*- +"""Generic baseN functions. + +""" +from argparse import ArgumentParser, RawTextHelpFormatter +from math import log +from string import ascii_lowercase as lower, ascii_uppercase as upper, digits, printable +from textwrap import wrap as wraptext +from types import FunctionType, MethodType + +from ..__common__ import * +from ..__common__ import _set_exc +from ..__info__ import __version__ + + +_set_exc("BaseError") +_set_exc("BaseEncodeError") +_set_exc("BaseDecodeError") +""" +Curve fitting: + +>>> import matplotlib.pyplot as plt +>>> import pandas as pd +>>> import scipy.optimize +>>> from statistics import mean +>>> from tinyscript import random +>>> x, y = [], [] +>>> for i in range(2, 256): + v = [] + for j in range(16, 2048, 16): + s = random.randstr(j) + v.append(float(len(codext.encode(s, "base%d-generic" % i))) / len(s)) + x.append(i) + y.append(mean(v)) +>>> data = pd.DataFrame({'base': x, 'expf': y}) +>>> def fit(x, y, func, params): + params, cv = scipy.optimize.curve_fit(func, x, y, params) + print(params) + y2 = func(x, *params) + plt.clf() + plt.plot(x, y, ".", color="blue", alpha=.3) + plt.plot(x, y2, color="red", linewidth=3.0) + plt.show() +>>> fit(data['base'], data['expf'], lambda x, a, b, c, d: a / (x**b + c) + d, (1, 1, 1, 1)) +[ 0.02841434 0.00512664 -0.99999984 0.01543879] +>>> fit(data['base'], data['expf'], lambda x, a, b, c, d: a / (x**b + c) + d, (.028, .005, -1, .015)) +[ 0.02827357 0.00510124 -0.99999984 0.01536941] +""" +EXPANSION_FACTOR = lambda base: 0.02827357 / (base**0.00510124-0.99999984) + 0.01536941 +SIZE_LIMIT = 1024 * 1024 * 1024 + + +def _generate_charset(n): + """ Generate a characters set. 
+ + :param n: size of charset + """ + if 1 < n <= len(printable): + return printable[:n] + elif len(printable) < n < 256: + return "".join(chr(i) for i in range(n)) + raise ValueError("Bad size of character set") + + +def _get_charset(charset, p=""): + """ Characters set selection function. It allows to define charsets in many different ways. + + :param charset: charset object, can be a string (the charset itself), a function (that chooses the right charset + depending on the input parameter) or a dictionary (either by exact key or by pattern matching) + :param p: the parameter for choosing the charset + """ + # case 1: charset is a function, so return its result + if isinstance(charset, FunctionType): + return charset(p) + # case 2: charset is a string, so return it + elif isinstance(charset, str): + return charset + # case 3: charset is a dict with keys '' and 'inv', typically for a charset using lowercase and uppercase characters + # that can be inverted + elif isinstance(charset, dict) and list(charset.keys()) == ["", "inv"]: + return charset["inv" if re.match(r"[-_]inv(erted)?$", p) else ""] + # case 4: charset is a dict, but not with the specific keys '' and 'inv', so consider it as pattern-charset pairs + elif isinstance(charset, dict): + # try to handle [p]arameter as a simple key + try: + return charset[p] + except KeyError: + pass + # or handle [p]arameter as a pattern + default, n, best = None, None, None + for pattern, cset in charset.items(): + n = len(cset) + if re.match(pattern, ""): + default = cset + continue + m = re.match(pattern, p) + if m: # find the longest match from the patterns + s, e = m.span() + if e - s > len(best or ""): + best = pattern + if best: + return charset[best] + # special case: the given [p]arameter can be the charset itself if it has the right length + p = re.sub(r"^[-_]+", "", p) + if len(p) == n: + return p + # or simply rely on key '' + if default is not None: + return default + raise ValueError("Bad charset descriptor 
('%s')" % p) + + +# generic base en/decoding functions +def base_encode(input, charset, errors="strict", exc=BaseEncodeError): + """ Base-10 to base-N encoding. + + :param input: input (str or int) to be decoded + :param charset: base-N characters set + :param errors: errors handling marker + :param exc: exception to be raised in case of error + """ + i, n, r = input if isinstance(input, int) else s2i(input), len(charset), "" + if n == 1: + if i > SIZE_LIMIT: + raise InputSizeLimitError("Input exceeded size limit") + return i * charset[0] + if n == 10: + return str(i) if charset == digits else "".join(charset[int(x)] for x in str(i)) + while i > 0: + i, c = divmod(i, n) + r = charset[c] + r + return r + + +def base_decode(input, charset, errors="strict", exc=BaseDecodeError): + """ Base-N to base-10 decoding. + + :param input: input to be decoded + :param charset: base-N characters set + :param errors: errors handling marker + :param exc: exception to be raised in case of error + """ + i, n, dec = 0, len(charset), lambda n: base_encode(n, [chr(x) for x in range(256)], errors, exc) + if n == 1: + return i2s(len(input)) + if n == 10: + return i2s(int(input)) if charset == digits else "".join(str(charset.index(c)) for c in input) + for k, c in enumerate(input): + try: + i = i * n + charset.index(c) + except ValueError: + handle_error("base", errors, exc, decode=True)(c, k, dec(i), "base%d" % n) + return dec(i) + + +# base codec factory functions +def base(charset, pattern, pow2=False, encode_template=base_encode, decode_template=base_decode, name=None, **kwargs): + """ Base-N codec factory. 
+ + :param charset: charset selection function + :param pattern: matching pattern for the codec name (first capturing group is used as the parameter for selecting + the charset) + :param pow2: whether the base codec's N is a power of 2 + """ + cs = _get_charset(charset) + n = len(cs) + nb = log(n, 2) + if pow2 and nb != int(nb): + raise BaseError("Bad charset ; {} is not a power of 2".format(n)) + + def encode(param="", *args): + a = _get_charset(charset, args[0] if len(args) > 0 and args[0] else param) + def _encode(input, errors="strict"): + if len(input) == 0: + return "", 0 + return encode_template(input, a, errors), len(input) + return _encode + + def decode(param="", *args): + a = _get_charset(charset, args[0] if len(args) > 0 and args[0] else param) + sl, sc = "\n" not in a, "\n" not in a and not "\r" in a + def _decode(input, errors="strict"): + if len(input) == 0: + return "", 0 + input = _stripl(input, sc, sl) + return decode_template(input, a, errors), len(input) + return _decode + + kwargs['len_charset'] = n + kwargs['printables_rate'] = float(len([c for c in cs if c in printable])) / len(cs) + kwargs['expansion_factor'] = kwargs.pop('expansion_factor', (EXPANSION_FACTOR(n), .05)) + n = "base{}".format(n) if name is None else name + try: + g = [n, n + "-inv"] if "[-_]inv(erted)?$" in charset.keys() else [n] + except AttributeError: + g = [n] + kwargs['guess'] = kwargs.get('guess', g) + add(n, encode, decode, pattern, entropy=nb, **kwargs) + + +def base_generic(): + """ Base-N generic codec. 
""" + def encode(n): + a = _generate_charset(int(n)) + def _encode(input, errors="strict"): + return base_encode(input, a, errors), len(input) + return _encode + + def decode(n): + a = _generate_charset(int(n)) + sl, sc = "\n" not in a, "\n" not in a and not "\r" in a + def _decode(input, errors="strict"): + input = _stripl(input, sc, sl) + return base_decode(input, a, errors), len(input) + return _decode + + add("base", encode, decode, r"^base[-_]?([2-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])(?:[-_]generic)?$", + guess=["base%d-generic" % i for i in range(2, 255)], entropy=lambda e, n: log(int(n.split("-")[0][4:]), 2), + len_charset=lambda n: int(n.split("-")[0][4:]), printables_rate=1., category="base-generic", penalty=.4, + expansion_factor=lambda f, n: (EXPANSION_FACTOR(int(n.split("-")[0][4:])), .05)) + + +def main(n, ref=None, alt=None, inv=True, swap=True, wrap=True): + base = str(n) + ("-" + alt.lstrip("-") if alt else "") + src = "The data are encoded as described for the base%(base)s alphabet in %(reference)s.\n" % \ + {'base': base, 'reference': "\n" + ref if len(ref) > 20 else ref} if ref else "" + text = "%(source)sWhen decoding, the input may contain newlines in addition to the bytes of the formal base" \ + "%(base)s alphabet. Use --ignore-garbage to attempt to recover from any other non-alphabet bytes in the" \ + " encoded stream." % {'base': base, 'source': src} + text = "\n".join(x for x in wraptext(text, 74)) + descr = """Usage: base%(base)s [OPTION]... [FILE] +Base%(base)s encode or decode FILE, or standard input, to standard output. + +With no FILE, or when FILE is -, read standard input. + +Mandatory arguments to long options are mandatory for short options too. 
+ -d, --decode decode data + -i, --ignore-garbage when decoding, ignore non-alphabet characters +%(inv)s%(swap)s%(wrap)s + + --help display this help and exit + --version output version information and exit + +%(text)s + +Report base%(base)s translation bugs to +Full documentation at: +""" % {'base': base, 'text': text, + 'inv': ["", " -I, --invert invert charsets from the base alphabet (e.g. digits and letters)\n"][inv], + 'swap': ["", " -s, --swapcase swap the case\n"][swap], + 'wrap': ["", " -w, --wrap=COLS wrap encoded lines after COLS character (default 76).\n"+ 26 * " " + \ + "Use 0 to disable line wrapping"][wrap]} + + def _main(): + p = ArgumentParser(description=descr, formatter_class=RawTextHelpFormatter, add_help=False) + p.format_help = MethodType(lambda s: s.description, p) + p.add_argument("file", nargs="?") + p.add_argument("-d", "--decode", action="store_true") + p.add_argument("-i", "--ignore-garbage", action="store_true") + if inv: + p.add_argument("-I", "--invert", action="store_true") + if swap: + p.add_argument("-s", "--swapcase", action="store_true") + if wrap: + p.add_argument("-w", "--wrap", type=int, default=76) + p.add_argument("--help", action="help") + p.add_argument("--version", action="version") + p.version = "CodExt " + __version__ + args = p.parse_args() + if args.decode: + args.wrap = 0 + args.invert = getattr(args, "invert", False) + c, f = _input(args.file), [encode, decode][args.decode] + if swap and args.swapcase and args.decode: + c = codecs.decode(c, "swapcase") + c = b(c).rstrip(b"\r\n") + try: + c = f(c, "base" + base + ["", "-inv"][getattr(args, "invert", False)], + ["strict", "ignore"][args.ignore_garbage]) + except Exception as err: + print("%sbase%s: invalid input" % (getattr(err, "output", ""), base)) + return 1 + c = ensure_str(c) + if swap and args.swapcase and not args.decode: + c = codecs.encode(c, "swapcase") + for l in (wraptext(c, args.wrap) if args.wrap > 0 else [c]) if wrap else c.split("\n"): + print(l) + 
return 0 + return _main + diff --git a/src/codext/base/base100.py b/src/codext/base/base100.py index f5faa1d..2287463 100755 --- a/src/codext/base/base100.py +++ b/src/codext/base/base100.py @@ -1,56 +1,47 @@ -# -*- coding: UTF-8 -*- -"""Base100 Codec - base100 content encoding. - -Note: only works in Python3 ; strongly inspired from https://github.com/MasterGroosha/pybase100 - -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ._base import main -from ..__common__ import * - - -# no __examples__ ; handled manually in tests/test_base.py - - -def base100_encode(input, errors="strict"): - raise NotImplementedError - - -def base100_decode(input, errors="strict"): - raise NotImplementedError - - -if PY3: - class Base100DecodeError(ValueError): - __module__ = "builtins" - - def base100_encode(input, errors="strict"): - input = b(input) - r = [240, 159, 0, 0] * len(input) - for i, c in enumerate(input): - r[4*i+2] = (c + 55) // 64 + 143 - r[4*i+3] = (c + 55) % 64 + 128 - return bytes(r), len(input) - - def base100_decode(input, errors="strict"): - input = b(_stripl(input, True, True)) - if errors == "ignore": - input = input.replace(b"\n", "") - if len(input) % 4 != 0: - raise Base100DecodeError("Bad input (length should be multiple of 4)") - r = [None] * (len(input) // 4) - for i, c in enumerate(input): - if i % 4 == 2: - tmp = ((c - 143) * 64) % 256 - elif i % 4 == 3: - r[i//4] = (c - 128 + tmp - 55) & 0xff - return bytes(r), len(input) - - -add("base100", base100_encode, base100_decode, r"^(?:base[-_]?100|emoji)$", expansion_factor=1.) -main100 = main(100, "") - +# -*- coding: UTF-8 -*- +"""Base100 Codec - base100 content encoding. 
+ +Note: only works in Python3 ; strongly inspired from https://github.com/MasterGroosha/pybase100 + +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ._base import main +from ..__common__ import * + +# no __examples__ ; handled manually in tests/test_base.py + +class Base100DecodeError(ValueError): + __module__ = "builtins" + + +def base100_encode(input, errors="strict"): + input = b(input) + r = [240, 159, 0, 0] * len(input) + for i, c in enumerate(input): + r[4*i+2] = (c + 55) // 64 + 143 + r[4*i+3] = (c + 55) % 64 + 128 + return bytes(r), len(input) + + +def base100_decode(input, errors="strict"): + input = b(_stripl(input, True, True)) + if errors == "ignore": + input = input.replace(b"\n", b"") + if len(input) % 4 != 0: + raise Base100DecodeError("Bad input (length should be multiple of 4)") + r = [None] * (len(input) // 4) + for i, c in enumerate(input): + if i % 4 == 2: + tmp = ((c - 143) * 64) % 256 + elif i % 4 == 3: + r[i//4] = (c - 128 + tmp - 55) & 0xff + return bytes(r), len(input) + + +add("base100", base100_encode, base100_decode, r"^(?:base[-_]?100|emoji)$", expansion_factor=1.) +main100 = main(100, "") + diff --git a/src/codext/base/base122.py b/src/codext/base/base122.py index f580ff8..b326341 100755 --- a/src/codext/base/base122.py +++ b/src/codext/base/base122.py @@ -1,106 +1,98 @@ -# -*- coding: UTF-8 -*- -"""Base122 Codec - base122 content encoding. 
- -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ._base import main -from ..__common__ import * - - -__examples__ = { - 'enc(base122|base-122)': { - 'this is a test': ":\x1aʗ\x19\x01Rs\x10\x18$\x07#\x15ft", - b'This is another longer test string with d1g1t5 and sp3c141 characters !\n': \ - b"*\x1a\xca\x97\x19\x01Rs\x10\x18-f{QPe9\x08\xcb\x86{9Ne9\x08\x0eF+Mh 9]\x0e\xd3\x8b" - b"9N ;Z.FA\x01H13L.C)\x01Bn2\x08\x0e7\x01MF1\x1a\x0c$\x06\x1b!Br0XnF+If \x10B@" - }, - 'enc-dec(base_122)': ["@random"], -} if PY3 else {'enc(base122': None} - - -_BAD = [0, 10, 13, 34, 38, 92] -_i = lambda c: c if isinstance(c, int) else ord(c) - - -def base122_encode(input, errors='strict'): - raise NotImplementedError - - -def base122_decode(input, errors='strict'): - raise NotImplementedError - - -if PY3: - # inspired from: https://github.com/kevinAlbs/Base122/blob/master/base122.js - def base122_encode(input, errors="strict"): - idx, bit, r, l = 0, 0, [], len(input) - - def _get_7bits(idx, bit): - if idx >= l: - return idx, bit, False - B1 = _i(input[idx]) - p1 = (((254 >> bit) & B1) << bit) >> 1 - bit += 7 - if bit < 8: - return idx, bit, p1 - bit -= 8 - idx += 1 - if idx >= l: - return idx, bit, p1 - B2 = _i(input[idx]) - p2 = (((65280 >> bit) & B2) & 255) >> (8 - bit) - return idx, bit, (p1 | p2) - - while True: - if idx >= l: - break - # get seven bits of input data - idx, bit, B = _get_7bits(idx, bit) - # check for illegal chars - try: - bad_idx = _BAD.index(B) - except ValueError: - r.append(B) - continue - idx, bit, nB = _get_7bits(idx, bit) - if nB is False: - nB, bad_idx = B, 7 - B1, B2 = 194, 128 - B1 |= (7 & bad_idx) << 2 - B1 |= int((nB & 64) > 0) - B2 |= nB & 63 - r.extend([B1, B2]) - return "".join(map(chr, r)).encode("latin-1"), len(input) - - # inspired from: https://github.com/kevinAlbs/Base122/blob/master/base122.js - def 
base122_decode(input, errors="strict"): - currB, bob, r, input = 0, 0, [], list(map(ord, input)) - - def _get_7bits(currB, bob, B, decoded): - B <<= 1 - currB |= (B % 0x100000000) >> bob - bob += 7 - if bob >= 8: - decoded += [currB] - bob -= 8 - return (B << (7 - bob)) & 255, bob - - for i in range(len(input)): - if input[i] >= 128: - try: - currB, bob = _get_7bits(currB, bob, _BAD[(input[i] >> 8) & 7], r) - except IndexError: - pass - currB, bob = _get_7bits(currB, bob, input[i] & 127, r) - else: - currB, bob = _get_7bits(currB, bob, input[i], r) - return "".join(map(chr, r)).rstrip("\0"), len(input) - - -add("base122", base122_encode, base122_decode, r"^base[-_]?122$", expansion_factor=1.085) -main122 = main(122, "", wrap=False) - +# -*- coding: UTF-8 -*- +"""Base122 Codec - base122 content encoding. + +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ._base import main +from ..__common__ import * + + +__examples__ = { + 'enc(base122|base-122)': { + 'this is a test': ":\x1aʗ\x19\x01Rs\x10\x18$\x07#\x15ft", + b'This is another longer test string with d1g1t5 and sp3c141 characters !\n': \ + b"*\x1a\xca\x97\x19\x01Rs\x10\x18-f{QPe9\x08\xcb\x86{9Ne9\x08\x0eF+Mh 9]\x0e\xd3\x8b" + b"9N ;Z.FA\x01H13L.C)\x01Bn2\x08\x0e7\x01MF1\x1a\x0c$\x06\x1b!Br0XnF+If \x10B@" + }, + 'enc-dec(base_122)': ["@random"], +} + + +_BAD = [0, 10, 13, 34, 38, 92] +_i = lambda c: c if isinstance(c, int) else ord(c) + + +# inspired from: https://github.com/kevinAlbs/Base122/blob/master/base122.js +def base122_encode(input, errors="strict"): + idx, bit, r, l = 0, 0, [], len(input) + + def _get_7bits(idx, bit): + if idx >= l: + return idx, bit, False + B1 = _i(input[idx]) + p1 = (((254 >> bit) & B1) << bit) >> 1 + bit += 7 + if bit < 8: + return idx, bit, p1 + bit -= 8 + idx += 1 + if idx >= l: + return idx, bit, p1 + B2 = _i(input[idx]) + p2 = (((65280 >> 
bit) & B2) & 255) >> (8 - bit) + return idx, bit, (p1 | p2) + + while True: + if idx >= l: + break + # get seven bits of input data + idx, bit, B = _get_7bits(idx, bit) + # check for illegal chars + try: + bad_idx = _BAD.index(B) + except ValueError: + r.append(B) + continue + idx, bit, nB = _get_7bits(idx, bit) + if nB is False: + nB, bad_idx = B, 7 + B1, B2 = 194, 128 + B1 |= (7 & bad_idx) << 2 + B1 |= int((nB & 64) > 0) + B2 |= nB & 63 + r.extend([B1, B2]) + return "".join(map(chr, r)).encode("latin-1"), len(input) + + +# inspired from: https://github.com/kevinAlbs/Base122/blob/master/base122.js +def base122_decode(input, errors="strict"): + currB, bob, r, input = 0, 0, [], list(map(ord, input)) + + def _get_7bits(currB, bob, B, decoded): + B <<= 1 + currB |= (B % 0x100000000) >> bob + bob += 7 + if bob >= 8: + decoded += [currB] + bob -= 8 + return (B << (7 - bob)) & 255, bob + + for i in range(len(input)): + if input[i] >= 128: + try: + currB, bob = _get_7bits(currB, bob, _BAD[(input[i] >> 8) & 7], r) + except IndexError: + pass + currB, bob = _get_7bits(currB, bob, input[i] & 127, r) + else: + currB, bob = _get_7bits(currB, bob, input[i], r) + return "".join(map(chr, r)).rstrip("\0"), len(input) + + +add("base122", base122_encode, base122_decode, r"^base[-_]?122$", expansion_factor=1.085) +main122 = main(122, "", wrap=False) + diff --git a/src/codext/base/base85.py b/src/codext/base/base85.py index bc6d8b2..22aad28 100755 --- a/src/codext/base/base85.py +++ b/src/codext/base/base85.py @@ -1,186 +1,185 @@ -# -*- coding: UTF-8 -*- -"""Base85 Codec - base85 content encoding. - -This is a simple wrapper for adding base64.b85**code to the codecs. 
- -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -import base64 -from six import integer_types - -from ._base import _get_charset, digits, lower, main, upper -from ..__common__ import * - - -__examples__ = { - 'enc-dec(base85|z85|base85-ipv6)': ["@random{512,1024,2048}"], - 'enc-dec(base85-btoa|base85-xbtoa)': ["@random{512,1024,2048}"], - 'enc(base85|ascii85)': {'this is a test': "FD,B0+DGm>@3BZ'F*%"}, - 'enc(base85-adobe)': {'this is a test': "<~FD,B0+DGm>@3BZ'F*%~>", - 'this is a test\0\0\0\0\0\0': "<~FD,B0+DGm>@3BZ'F*%B^z~>"}, - 'enc(z85|base85-z)': {'this is a test': "BzbxfazC)tvixV6B94"}, - 'enc(base85-ipv6|base85_rfc1924)': {'this is a test': "bZBXFAZc?TVIXv6b94"}, - 'enc(base85_btoa)': {'this is a test': "FD,B0+DGm>@3BZ'F*%B^"}, - 'enc(base85_btoa)': {'this\0\0\0\0test': "FD,B0+DGm>@3BZ'F*%B^"}, - 'enc(base85_btoa)': {'this is a test\0\0\0\0': "FD,B0+DGm>y@3BZ'F*%B^z"}, - 'enc(base85-xbtoa)': {'this is a test': "xbtoa Begin\nFD,B0+DGm>@3BZ'F*%B^\nxbtoa End N 14 e E 4b" \ - " S 523 R 1b132e"}, - 'dec(base85-xbtoa)': {'xbtoa Begin\nFD,B0+DGm>@3BZ\'F*%B^\nxbtoa End': None, - 'xbtoa Begin\nFD,B0+DGm>@3BZ\'F*%B^\nxbtoa End N 14 e E 4b S 523 R 000bad': - None}, - 'enc(base85-xml)': {'this is a test': "bZBXFAZc@TVIXv6b94"}, - 'enc(base85|ascii85)': {'this\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0test': "FD,B0zzz!!!\"@ATMq"}, -} -__guess__ = ["ascii85", "z85", "base85-ipv6", "base85-xml", "base85-adobe", "base85-xbtoa"] - - -B85 = { - r'(base[-_]?85([-_]ascii)?|ascii85)$': "!\"#$%&'()*+,-./" + digits + ":;<=>?@" + upper + "[\\]^_`" + lower[:21], - r'(z85|base[-_]?85[-_]z(eromq)?)$': digits + lower + upper + ".-:+=^!/*?&<>()[]{}@%$#", - r'base[-_]?85[-_](rfc1924|ipv6)$': digits + upper + lower + "!#$%&()*+-;<=>?@^_`{|}~", - r'base[-_]?85[-_]xml$': digits + upper + lower[:-1] + "!#$()*+,-./:;=?@^`{|}~z_", -} -B85[r'(base[-_]?85[-_]adobe)$'] = 
B85[r'(base[-_]?85[-_]x?btoa)$'] = B85[r'(base[-_]?85([-_]ascii)?|ascii85)$'] -POW85 = [85 ** i for i in range(5)] - - -def __format(text, mode, decode=False, **kwargs): - if "adobe" in mode: - if decode: - if text.startswith("<~") and text.endswith("~>"): - text = text[2:-2] - else: - text = "<~" + text + "~>" - elif "xbtoa" in mode: - sp, ep = "xbtoa [bB]egin\n", "xbtoa [eE]nd" - if decode: - if re.match(r"^xbtoa\s+[bB]egin\n", text) and \ - re.search(r"\nxbtoa\s+[eE]nd N \d+{h} E{h} S{h} R{h}\s*$".format(h=" [0-9a-fA-F]+"), text): - text = "".join(text.split("\n")[1:-1]).replace(" ", "") - elif not decode: - l, t = kwargs['length'], "\n".join(text[i:i+78] for i in range(0, len(text), 78)) - text = "xbtoa Begin\n%s\nxbtoa End N %d %x E %x S %x R %x" % \ - (t, l, l, kwargs['c_xor'], kwargs['c_sum'], kwargs['c_rot']) - return text - - -def __xbtoa_values(text): - try: - hr = "[0-9a-fA-F]+" - return re.search(r"\nxbtoa\s+[eE]nd N (\d+) ({h}) E ({h}) S ({h}) R ({h})\s*$".format(h=hr), text).groups() - except: - raise Base85DecodeError("Bad or missing xbtoa parameters") - - -def base85_encode(mode): - b85 = _get_charset(B85, mode) - def encode(input, errors="strict"): - r, l, kw = "", len(input), {} - if l == 0: - return input, 0 - if "xbtoa" in mode: - kw['length'] = l - kw['c_xor'], kw['c_sum'], kw['c_rot'] = 0, 0, 0 - n_pad = (4 - l % 4) % 4 - for i in range(0, l, 4): - block = input[i:i+4] - if block == "\0\0\0\0" and b85[-3:] == "stu": - r += "z" - if block == "\x20\x20\x20\x20" and "btoa" in mode: - r += "y" - if "xbtoa" in mode: - for c in block: - k = ord(c) - kw['c_xor'] ^= k - kw['c_sum'] += k + 1 - kw['c_rot'] <<= 1 - if kw['c_rot'] & 0x80000000: - kw['c_rot'] += 1 - kw['c_rot'] += k - if block == "\0\0\0\0" and b85[-3:] == "stu" or block == "\x20\x20\x20\x20" and "btoa" in mode: - continue - if len(block) < 4: - block += n_pad * "\0" - n, bl = s2i(block), "" - for _ in range(5): - n, k = divmod(n, 85) - bl = b85[k] + bl - r += bl - if "btoa" not in mode 
and n_pad: - r = r[:-n_pad] - if b85[-3:] == "stu" and r[-5:] == "!!!!!": - r = r[:-5] + "z" - return __format(r, mode, **kw), l - return encode - - -def base85_decode(mode): - b85 = _get_charset(B85, mode) - def decode(input, errors="strict"): - r, l, i, n_pad = "", len(input), 0, 0 - if l == 0: - return input, 0 - if "xbtoa" in mode: - v = __xbtoa_values(input) - n_last = int(v[0]) % 4 - c_xor, c_sum, c_rot = 0, 0, 0 - input = __format(input, mode, True) - ehandler = handle_error("base85", errors, decode=True) - if b85[-3:] == "stu" and input[-1] == "z": - input = input[:-1] + "!!!!!" - l = len(input) - while i < l: - n, incr = 0, 5 - if input[i] == "z" and b85[-3:] == "stu": - bl, incr = "\0\0\0\0", 1 - elif input[i] == "y" and "btoa" in mode: - bl, incr = "\x20\x20\x20\x20", 1 - else: - block = input[i:i+5] - if len(block) < 5: - n_pad = 5 - len(block) % 5 - block += n_pad * "\0" - for k, c in enumerate(block[::-1]): - try: - n += (b85.index(c) if c != "\0" else 255) * POW85[k] - except ValueError: - r += ehandler(c, i + k, r) - bl = codecs.decode("{:0>8}".format(hex(n & 0xffffffff)[2:]), "hex") - if "xbtoa" in mode: - if i + 5 == l and n_last > 0: - bl = bl[:n_last] - for c in bl: - k = ord(c) - c_xor ^= k - c_sum += k + 1 - c_rot <<= 1 - if c_rot & 0x80000000: - c_rot += 1 - c_rot += k - r += bl - i += incr - if n_pad > 0: - r = r[:-n_pad] - if "xbtoa" in mode: - chkv = ["%d" % len(r), "%x" % len(r), "%x" % c_xor, "%x" % c_sum, "%x" % c_rot] - if any(v1 != v2 for v1, v2 in zip(v, chkv)) and errors == "strict": - raise Base85ValueError("A check value does not match (%s != %s)" % (str(list(v)).replace("'", ""), - str(chkv).replace("'", ""))) - return r, l - return decode - - -add("base85", base85_encode, base85_decode, expansion_factor=lambda f, ename: f if "xbtoa" in ename else 1.25, - pattern=r"^(base[-_]?85(?:|[-_](?:adobe|x?btoa|ipv6|rfc1924|xml|z(?:eromq)?))|z85|ascii85)$", - extra_exceptions=["Base85ValueError"]) -main85 = main(85, None) -main85adobe = 
main(85, None, "adobe") -main85xbtoa = main(85, None, "xbtoa", wrap=False) -main85rfc1924 = main(85, "RFC 1924", "ipv6") -main85xml = main(85, "", "xml") -main85zeromq = main(85, "", "zeromq") - +# -*- coding: UTF-8 -*- +"""Base85 Codec - base85 content encoding. + +This is a simple wrapper for adding base64.b85**code to the codecs. + +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +import base64 + +from ._base import _get_charset, digits, lower, main, upper +from ..__common__ import * + + +__examples__ = { + 'enc-dec(base85|z85|base85-ipv6)': ["@random{512,1024,2048}"], + 'enc-dec(base85-btoa|base85-xbtoa)': ["@random{512,1024,2048}"], + 'enc(base85|ascii85)': {'this is a test': "FD,B0+DGm>@3BZ'F*%"}, + 'enc(base85-adobe)': {'this is a test': "<~FD,B0+DGm>@3BZ'F*%~>", + 'this is a test\0\0\0\0\0\0': "<~FD,B0+DGm>@3BZ'F*%B^z~>"}, + 'enc(z85|base85-z)': {'this is a test': "BzbxfazC)tvixV6B94"}, + 'enc(base85-ipv6|base85_rfc1924)': {'this is a test': "bZBXFAZc?TVIXv6b94"}, + 'enc(base85_btoa)': {'this is a test': "FD,B0+DGm>@3BZ'F*%B^"}, + 'enc(base85_btoa)': {'this\0\0\0\0test': "FD,B0+DGm>@3BZ'F*%B^"}, + 'enc(base85_btoa)': {'this is a test\0\0\0\0': "FD,B0+DGm>y@3BZ'F*%B^z"}, + 'enc(base85-xbtoa)': {'this is a test': "xbtoa Begin\nFD,B0+DGm>@3BZ'F*%B^\nxbtoa End N 14 e E 4b" \ + " S 523 R 1b132e"}, + 'dec(base85-xbtoa)': {'xbtoa Begin\nFD,B0+DGm>@3BZ\'F*%B^\nxbtoa End': None, + 'xbtoa Begin\nFD,B0+DGm>@3BZ\'F*%B^\nxbtoa End N 14 e E 4b S 523 R 000bad': + None}, + 'enc(base85-xml)': {'this is a test': "bZBXFAZc@TVIXv6b94"}, + 'enc(base85|ascii85)': {'this\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0test': "FD,B0zzz!!!\"@ATMq"}, +} +__guess__ = ["ascii85", "z85", "base85-ipv6", "base85-xml", "base85-adobe", "base85-xbtoa"] + + +B85 = { + r'(base[-_]?85([-_]ascii)?|ascii85)$': "!\"#$%&'()*+,-./" + digits + ":;<=>?@" + upper + "[\\]^_`" + 
lower[:21], + r'(z85|base[-_]?85[-_]z(eromq)?)$': digits + lower + upper + ".-:+=^!/*?&<>()[]{}@%$#", + r'base[-_]?85[-_](rfc1924|ipv6)$': digits + upper + lower + "!#$%&()*+-;<=>?@^_`{|}~", + r'base[-_]?85[-_]xml$': digits + upper + lower[:-1] + "!#$()*+,-./:;=?@^`{|}~z_", +} +B85[r'(base[-_]?85[-_]adobe)$'] = B85[r'(base[-_]?85[-_]x?btoa)$'] = B85[r'(base[-_]?85([-_]ascii)?|ascii85)$'] +POW85 = [85 ** i for i in range(5)] + + +def __format(text, mode, decode=False, **kwargs): + if "adobe" in mode: + if decode: + if text.startswith("<~") and text.endswith("~>"): + text = text[2:-2] + else: + text = "<~" + text + "~>" + elif "xbtoa" in mode: + sp, ep = "xbtoa [bB]egin\n", "xbtoa [eE]nd" + if decode: + if re.match(r"^xbtoa\s+[bB]egin\n", text) and \ + re.search(r"\nxbtoa\s+[eE]nd N \d+{h} E{h} S{h} R{h}\s*$".format(h=" [0-9a-fA-F]+"), text): + text = "".join(text.split("\n")[1:-1]).replace(" ", "") + elif not decode: + l, t = kwargs['length'], "\n".join(text[i:i+78] for i in range(0, len(text), 78)) + text = "xbtoa Begin\n%s\nxbtoa End N %d %x E %x S %x R %x" % \ + (t, l, l, kwargs['c_xor'], kwargs['c_sum'], kwargs['c_rot']) + return text + + +def __xbtoa_values(text): + try: + hr = "[0-9a-fA-F]+" + return re.search(r"\nxbtoa\s+[eE]nd N (\d+) ({h}) E ({h}) S ({h}) R ({h})\s*$".format(h=hr), text).groups() + except: + raise Base85DecodeError("Bad or missing xbtoa parameters") + + +def base85_encode(mode): + b85 = _get_charset(B85, mode) + def encode(input, errors="strict"): + r, l, kw = "", len(input), {} + if l == 0: + return input, 0 + if "xbtoa" in mode: + kw['length'] = l + kw['c_xor'], kw['c_sum'], kw['c_rot'] = 0, 0, 0 + n_pad = (4 - l % 4) % 4 + for i in range(0, l, 4): + block = input[i:i+4] + if block == "\0\0\0\0" and b85[-3:] == "stu": + r += "z" + if block == "\x20\x20\x20\x20" and "btoa" in mode: + r += "y" + if "xbtoa" in mode: + for c in block: + k = ord(c) + kw['c_xor'] ^= k + kw['c_sum'] += k + 1 + kw['c_rot'] <<= 1 + if kw['c_rot'] & 0x80000000: + 
kw['c_rot'] += 1 + kw['c_rot'] += k + if block == "\0\0\0\0" and b85[-3:] == "stu" or block == "\x20\x20\x20\x20" and "btoa" in mode: + continue + if len(block) < 4: + block += n_pad * "\0" + n, bl = s2i(block), "" + for _ in range(5): + n, k = divmod(n, 85) + bl = b85[k] + bl + r += bl + if "btoa" not in mode and n_pad: + r = r[:-n_pad] + if b85[-3:] == "stu" and r[-5:] == "!!!!!": + r = r[:-5] + "z" + return __format(r, mode, **kw), l + return encode + + +def base85_decode(mode): + b85 = _get_charset(B85, mode) + def decode(input, errors="strict"): + r, l, i, n_pad = "", len(input), 0, 0 + if l == 0: + return input, 0 + if "xbtoa" in mode: + v = __xbtoa_values(input) + n_last = int(v[0]) % 4 + c_xor, c_sum, c_rot = 0, 0, 0 + input = __format(input, mode, True) + ehandler = handle_error("base85", errors, decode=True) + if b85[-3:] == "stu" and input[-1] == "z": + input = input[:-1] + "!!!!!" + l = len(input) + while i < l: + n, incr = 0, 5 + if input[i] == "z" and b85[-3:] == "stu": + bl, incr = "\0\0\0\0", 1 + elif input[i] == "y" and "btoa" in mode: + bl, incr = "\x20\x20\x20\x20", 1 + else: + block = input[i:i+5] + if len(block) < 5: + n_pad = 5 - len(block) % 5 + block += n_pad * "\0" + for k, c in enumerate(block[::-1]): + try: + n += (b85.index(c) if c != "\0" else 255) * POW85[k] + except ValueError: + r += ehandler(c, i + k, r) + bl = codecs.decode("{:0>8}".format(hex(n & 0xffffffff)[2:]), "hex") + if "xbtoa" in mode: + if i + 5 == l and n_last > 0: + bl = bl[:n_last] + for c in bl: + k = ord(c) + c_xor ^= k + c_sum += k + 1 + c_rot <<= 1 + if c_rot & 0x80000000: + c_rot += 1 + c_rot += k + r += bl + i += incr + if n_pad > 0: + r = r[:-n_pad] + if "xbtoa" in mode: + chkv = ["%d" % len(r), "%x" % len(r), "%x" % c_xor, "%x" % c_sum, "%x" % c_rot] + if any(v1 != v2 for v1, v2 in zip(v, chkv)) and errors == "strict": + raise Base85ValueError("A check value does not match (%s != %s)" % (str(list(v)).replace("'", ""), + str(chkv).replace("'", ""))) + return r, l 
+ return decode + + +add("base85", base85_encode, base85_decode, expansion_factor=lambda f, ename: f if "xbtoa" in ename else 1.25, + pattern=r"^(base[-_]?85(?:|[-_](?:adobe|x?btoa|ipv6|rfc1924|xml|z(?:eromq)?))|z85|ascii85)$", + extra_exceptions=["Base85ValueError"]) +main85 = main(85, None) +main85adobe = main(85, None, "adobe") +main85xbtoa = main(85, None, "xbtoa", wrap=False) +main85rfc1924 = main(85, "RFC 1924", "ipv6") +main85xml = main(85, "", "xml") +main85zeromq = main(85, "", "zeromq") + diff --git a/src/codext/binary/baudot.py b/src/codext/binary/baudot.py index a57e1ea..1cdd111 100755 --- a/src/codext/binary/baudot.py +++ b/src/codext/binary/baudot.py @@ -1,295 +1,281 @@ -# -*- coding: UTF-8 -*- -"""Baudot Codec - baudot content conversion to HTML. - -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ..__common__ import * - - -__CODES = ["ccitt1", "ccitt2", "eu", "ita1", "ita2", "ita2_us"] -if PY3: - __CODES.extend(["ita2_meteo", "mtk2", "murray", "uk"]) -__guess__ = ["baudot%s-{}-{}".format(x, y) for x in __CODES for y in ["lsb", "msb"]] -__examples1__ = { - 'enc(baudot-BAD_ALPHABET)': None, - 'enc(baudot_ccitt2_lsb)': {'TEST 1234': "00001100001010000001001001101111101110011000001010"}, - 'enc(baudot-ita1)': {'TEST 1234': "10101000101010010101100000100000001000100010000101"}, - 'enc(baudot_ita2_msb)': {'TEST 1234': "10000000010010110000001001101110111100110000101010"}, - 'enc(baudot-ita2-us)': {'TEST 1234': "10000000010010110000001001101110111100110000101010"}, - 'enc(baudot)': {'\x01\x02': None}, - 'enc(baudot_ccitt1-lsb)': {'TEST ': None}, -} -__examples2__ = { - 'enc(baudot_spaced-BAD_ALPHABET)': None, - 'enc(baudot-spaced_ccitt2_lsb)': {'TEST 1234': "00001 10000 10100 00001 00100 11011 11101 11001 10000 01010"}, - 'enc(baudot_spaced-ita1)': {'TEST 1234': "10101 00010 10100 10101 10000 01000 00001 00010 00100 
00101"}, - 'enc(baudot-spaced_ita2_msb)': {'TEST 1234': "10000 00001 00101 10000 00100 11011 10111 10011 00001 01010"}, - 'enc(baudot_spaced-ita2-us)': {'TEST 1234': "10000 00001 00101 10000 00100 11011 10111 10011 00001 01010"}, -} -__examples3__ = { - 'enc(baudot_tape-BAD_ALPHABET)': None, - 'enc(baudot_tape-ita1)': { - 'TEST 1234': "***.**\n* *. *\n .* \n* *. \n* *. *\n* . \n * . \n . *\n .* \n *. \n *. *", - }, - 'dec(baudot-tape_ita2)': {'BAD_HEADER\n .* \n': None}, - 'dec(baudot-tape_ita2-us)': {'***.**\nBAD_TAPE\n': None}, - 'dec(baudot_tape-ccitt1_lsb)': {'***.**\n .* \n* . *\n* . \n': None}, -} -if PY3: - __examples1__.update({ - 'enc(baudot_ccitt1_lsb)': {'TEST1234': "101010001010001101010100000100000100000100101"}, - 'enc(baudot-fr)': {'TEST 1234': "10101000101010010101100000100000001000100010000101"}, - }) - __examples2__.update({ - 'enc(baudot-spaced_ccitt1_lsb)': {'TEST1234': "10101 00010 10001 10101 01000 00100 00010 00001 00101"}, - 'enc(baudot_spaced-fr)': {'TEST 1234': "10101 00010 10100 10101 10000 01000 00001 00010 00100 00101"}, - }) - - -PATTERN = r"^baudot%s([-_](?:ccitt1|ccitt2|eu|fr|ita1|ita2|ita2[-_](?:us" + (r"|meteo" if PY3 else r"") + r")" + \ - (r"|mtk2|murray|uk" if PY3 else r"") + r"|us_tty)(?:[-_](?:lsb|msb))?)?$" -# reserved character -RES_CHR = "\xff" - -# sources: -# - http://rabbit.eng.miami.edu/info/baudot.html -# - https://en.wikipedia.org/wiki/Baudot_code -# - https://fr.qwe.wiki/wiki/Baudot_code -# all alphabets consider MSB by default -# CCITT-1 original Baudot code (source: http://rabbit.eng.miami.edu/info/baudot.html) -CCITT1 = [ - "00001", "00010", - "\x00\xff\xff\xffA-JKEXGM/ZHLYSBRUTCQIWFNOVDP", - "\x00\xff\xff\xff1.6(2\xff7)\xff:\xff=3\xff8-4\xff9/\xff?\xff£5'0+" if PY3 else \ - "\x00\xff\xff\xff1.6(2\xff7)\xff:\xff=3\xff8-4\xff9/\xff?\xff$5'0+", -] -# CCITT-2 revised Baudot code (source: http://rabbit.eng.miami.edu/info/baudot.html) -CCITT2 = [ - "11111", "11011", - "\x00E\nA SIU\rDRJNFCKTZLWHYPQOBG\xffMXV\xff", - 
"\x003\n- \x0787\r$4',!:(5\")2#6019?&\xff./;\xff", -] -# Original Baudot (French/European ; sources: https://fr.qwe.wiki/wiki/Baudot_code -# https://en.wikipedia.org/wiki/Baudot_code) -BAUDOT = EU = FR = [ - "10000", "01000", - "\x00AEÉYUIO\xffJGHBCFD \nXZSTWV\x7fKMLRQNP" if PY3 else "\x00AEeYUIO\xffJGHBCFD \nXZSTWV\x7fKMLRQNP", - "\x0012&34°5 67h89f0\xff.,:;!?'\x7f()=-/\u2116%" if PY3 else "\x0012&34o5 67h89f0\xff.,:;!?'\x7f()=-/\xff%", -] -# International Telegraphic Alphabet 1 (sources: https://fr.qwe.wiki/wiki/Baudot_code -# https://en.wikipedia.org/wiki/Baudot_code) -ITA1 = [ - "10000", "01000", - "\x00AE\rYUIO\xffJGHBCFD \xffXZSTWV\x7fKMLRQNP", - "\x0012\r34\xff5 67+89\xff0\xff\n,:.\xff?'\x7f()=-/\xff%", -] -# International Telegraphic Alphabet 2 (sources: https://fr.qwe.wiki/wiki/Baudot_code -# https://en.wikipedia.org/wiki/Baudot_code) -ITA2 = [ - "11111", "11011", - "\x00E\nA SIU\rDRJNFCKTZLWHYPQOBG\xffMXV\xff", - "\x003\n- '87\r\x054\x07,!:(5+)2$6019?&\xff./=\xff", -] -# International Telegraphic Alphabet 2 - US TTY (sources: https://fr.qwe.wiki/wiki/Baudot_code -# https://en.wikipedia.org/wiki/Baudot_code) -ITA2_US = US_TTY = [ - "11111", "11011", - "\x00E\nA SIU\rDRJNFCKTZLWHYPQOBG\xffMXV\xff", - "\x003\n- \x0787\r$4',!:(5\")2#6019?&\xff./;\xff", -] -# International Telegraphic Alphabet 2 - Meteo version (source: https://en.wikipedia.org/wiki/Baudot_code) -if PY3: - ITA2_METEO = [ - "11111", "11011", - "\x00E\nA SIU\rDRJNFCKTZLWHYPQOBG\xffMXV\xff", - "-3\n\u2191 \x0787\r\u21974\u2199\u29b7\u2192\u25ef\u21905+\u21962\u21936019\u2295\u2198\xff./\u29b6\xff", - ] -# Russian MTK-2 alphabet (source: https://fr.qwe.wiki/wiki/Baudot_code) -if PY3: - MTK2 = [ - "11111", "11011", - "\x00Е\n\xff СИУ\r\xffРЙНФЦКТЗЛВХЫПЯОБГ\xffМЬЖ\xff", - "\x003\n- '87\r\xff4Ю,Э:(5+)2Щ6019?Ш\xff./=\xff", - ] -# Murray code ; NB: not all fractions are supported (source: https://en.wikipedia.org/wiki/Baudot_code) -if PY3: - MURRAY = [ - "00100", "11011", - " 
E\xffA\xffSIU\nDRJNFCKTZLWHYPQOBF\xffMXV\x7f", - "\x003\xff\xff\xff'87\n²4\xff-\u215f(\xff5./2\xff6019?\xff\xff,£)*" if PY3 else \ - "\x003\xff\xff\xff'87\n²4\xff-\u215f(\xff5./2\xff6019?\xff\xff,$)*", - ] -# English Baudot ; NB: not all fractions are supported (sources: https://fr.qwe.wiki/wiki/Baudot_code -# https://en.wikipedia.org/wiki/Baudot_code) -if PY3: - UK = [ - "10000", "01000", - "\x00AE/YUIO\xffJGHBCFD -XZSTWV\x7fKMLRQNP", - "\x0012\u215f34\xff5 67\xb989\xff0\xff.\xff:\xff²?'\x7f()=-/£+" if PY3 else \ - "\x0012\xff34\xff5 67\xb989\xff0\xff.\xff:\xff²?'\x7f()=-/$+", - ] - - -def _bits_from_tape(tape, trans={'*': "1", ' ': "0"}): - """ Converts a tape-like string with the given translation for ones and zeros to a series of bits. """ - bits = "" - trans_rev = {v: k for k, v in trans.items()} - for i, line in enumerate(tape.splitlines()): - if i == 0: - if line != trans_rev['1'] * 3 + "." + trans_rev['1'] * 2: - raise ValueError("Bad tape header '{}'".format(line)) - else: - line = line[:3] + line[4:] - if len(line) != 5: - raise ValueError("Bad tape line '{}'".format(line)) - bits += "".join(trans.get(c, "") for c in line) - return bits - - -def _bits_to_tape(bits, trans={'1': "*", '0': " "}): - """ Converts a series of bits to a tape-like string with the given translation for ones and zeros. """ - tape = [trans['1'] * 3 + "." + trans['1'] * 2] - for i in range(0, len(bits), 5): - group = "".join(trans[b] for b in bits[i:i+5]) - tape.append(group[:3] + "." + group[3:]) - return "\n".join(tape) - - -def _check_alphabet(alphabet): - """ Checks the length of letters and figures (must be 32 chars). """ - for chars in alphabet: - l = len(chars) - if l != 32: - raise ValueError("Bad length of alphabet (%d instead of 32)" % l) - - -def _handle_alphabet(alphabet): - """ Gets the given alphabet name and transforms it to its dictionary with letters and figures. 
""" - alphabet = (alphabet or "baudot").lower().replace("-", "_").strip("_") - if "_lsb" in alphabet: - alphabet = alphabet.replace("_lsb", "") - func = lambda x: x[::-1] - else: - alphabet = alphabet.replace("_msb", "") - func = lambda x: x - _ = globals()[alphabet.upper()] - st, a = _[:2], _[2:] - _check_alphabet(a) - alphabet = {n: {ch: bin(i)[2:].zfill(5) for i, ch in enumerate(alph) if ch != RES_CHR} for n, alph in \ - zip(["letters", "figures"], a)} - return alphabet, {'letters': st[0], 'figures': st[1]}, func - - -def baudot_encode(alphabet=None, spaced=False, tape=False): - ename = "baudot" + ("-spaced" if spaced else "-tape" if tape else "") - alphabet, states, func = _handle_alphabet(alphabet) - def encode(text, errors="strict"): - text = text.upper() - s, l, state, seen_states = "", len(b(text)), None, [] - for i, c in enumerate(text): - # if the state is undefined yet, find the relevant alphabet - if state is None: - bits= None - for st in states.keys(): - try: - bits = func(alphabet[st][c]) - state = st - if st not in seen_states: - seen_states.append(st) - break - except KeyError: - pass - if bits is None: - bits = handle_error(ename, errors, "?", 5)(c, i) - s += bits - # otherwise, handle state change (when the current alphabet does not contain the character to encode but the - # other alphabet does - else: - try: - s += func(alphabet[state][c]) - continue - except KeyError: - state = list(set(states.keys()) - {state})[0] - try: - s += func(states[state]) + func(alphabet[state][c]) - if state not in seen_states: - seen_states.append(state) - except KeyError as e: - state = list(set(states.keys()) - {state})[0] # reset the state - s += handle_error(ename, errors, "?", 5)(c, i) - # by default, if no state is specified, the encoded string is handled as letters ; so if figures are used only, - # it is necessary to include the groups of bits for figures at the beginning of the encoded string - s = (states['figures'] if seen_states == ["figures"] else "") 
+ s - if spaced: - s = " ".join(s[i:i+5] for i in range(0, len(s), 5)) - elif tape: - s = _bits_to_tape(s) - return s, l - return encode - - -def baudot_decode(alphabet=None, spaced=False, tape=False): - ename = "baudot" + ("-spaced" if spaced else "-tape" if tape else "") - alphabet, states, func = _handle_alphabet(alphabet) - alphabet = {st: {v: k for k, v in alph.items()} for st, alph in alphabet.items()} - states = {v: k for k, v in states.items()} - def decode(text, errors="strict"): - s, l = "", len(b(text)) - if spaced: - text = text.replace(" ", "") - elif tape: - text = _bits_from_tape(text) - # infer the starting state by searching for the first encountered groups of bits indicating a valid state ; - # by default, we assume letters - state = "letters" - for i in range(0, len(text), 5): - bits = func(text[i:i+5]) - # the following code handles a possible ambiguity ; e.g. when letters have a group of bits matching - # a state change - if bits in states.keys(): - error = False - # so, when we see the bits of a state, we parse previous groups in order to determine if they are valid - # groups in the corresponding state, that is, if no error occurs ; if an error occurs, then it is a - # valid state change and not simply a character, and we can set it as the starting state - for j in range(i-5, 0, -5): - try: - alphabet[states[bits]][text[j:j+5]] - except KeyError: - error = True - break - if error: - state = list(set(states.values()) - {states[bits]})[0] - break - # now parse the input text - for i in range(0, len(text), 5): - bits = func(text[i:i+5]) - try: - s += alphabet[state][bits] - except KeyError: - if bits in states.keys() and states[bits] != state: - state = states[bits] - else: - s += handle_error(ename, errors, decode=True, item="group")(bits, i//5) - return s, l - return decode - - -add("baudot", baudot_encode, baudot_decode, PATTERN % r"", examples=__examples1__, guess=[x % "" for x in __guess__], - entropy=1., printables_rate=1.) 
- - -baudot_spaced_encode = lambda a: baudot_encode(a, spaced=True) -baudot_spaced_decode = lambda a: baudot_decode(a, spaced=True) -add("baudot-spaced", baudot_spaced_encode, baudot_spaced_decode, PATTERN % r"[-_]spaced", examples=__examples2__, - guess=[x % "-spaced" for x in __guess__], entropy=1.48, printables_rate=1.) - - -baudot_tape_encode = lambda a: baudot_encode(a, tape=True) -baudot_tape_decode = lambda a: baudot_decode(a, tape=True) -add("baudot-tape", baudot_tape_encode, baudot_tape_decode, PATTERN % r"[-_]tape", examples=__examples3__, - guess=[x % "-tape" for x in __guess__], entropy=1.86, printables_rate=1.) - +# -*- coding: UTF-8 -*- +"""Baudot Codec - baudot content conversion to HTML. + +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__CODES = ["ccitt1", "ccitt2", "eu", "ita1", "ita2", "ita2_meteo", "ita2_us", "mtk2", "murray", "uk"] +__guess__ = ["baudot%s-{}-{}".format(x, y) for x in __CODES for y in ["lsb", "msb"]] +__examples1__ = { + 'enc(baudot-BAD_ALPHABET)': None, + 'enc(baudot_ccitt2_lsb)': {'TEST 1234': "00001100001010000001001001101111101110011000001010"}, + 'enc(baudot-ita1)': {'TEST 1234': "10101000101010010101100000100000001000100010000101"}, + 'enc(baudot_ita2_msb)': {'TEST 1234': "10000000010010110000001001101110111100110000101010"}, + 'enc(baudot-ita2-us)': {'TEST 1234': "10000000010010110000001001101110111100110000101010"}, + 'enc(baudot)': {'\x01\x02': None}, + 'enc(baudot_ccitt1-lsb)': {'TEST ': None}, + 'enc(baudot_ccitt1_lsb)': {'TEST1234': "101010001010001101010100000100000100000100101"}, + 'enc(baudot-fr)': {'TEST 1234': "10101000101010010101100000100000001000100010000101"}, +} +__examples2__ = { + 'enc(baudot_spaced-BAD_ALPHABET)': None, + 'enc(baudot-spaced_ccitt2_lsb)': {'TEST 1234': "00001 10000 10100 00001 00100 11011 11101 11001 10000 01010"}, + 
'enc(baudot_spaced-ita1)': {'TEST 1234': "10101 00010 10100 10101 10000 01000 00001 00010 00100 00101"}, + 'enc(baudot-spaced_ita2_msb)': {'TEST 1234': "10000 00001 00101 10000 00100 11011 10111 10011 00001 01010"}, + 'enc(baudot_spaced-ita2-us)': {'TEST 1234': "10000 00001 00101 10000 00100 11011 10111 10011 00001 01010"}, + 'enc(baudot-spaced_ccitt1_lsb)': {'TEST1234': "10101 00010 10001 10101 01000 00100 00010 00001 00101"}, + 'enc(baudot_spaced-fr)': {'TEST 1234': "10101 00010 10100 10101 10000 01000 00001 00010 00100 00101"}, +} +__examples3__ = { + 'enc(baudot_tape-BAD_ALPHABET)': None, + 'enc(baudot_tape-ita1)': { + 'TEST 1234': "***.**\n* *. *\n .* \n* *. \n* *. *\n* . \n * . \n . *\n .* \n *. \n *. *", + }, + 'dec(baudot-tape_ita2)': {'BAD_HEADER\n .* \n': None}, + 'dec(baudot-tape_ita2-us)': {'***.**\nBAD_TAPE\n': None}, + 'dec(baudot_tape-ccitt1_lsb)': {'***.**\n .* \n* . *\n* . \n': None}, +} + + +PATTERN = r"^baudot%s([-_](?:ccitt1|ccitt2|eu|fr|ita1|ita2|ita2[-_](?:us|meteo)|mtk2|murray|uk|us_tty)" + \ + r"(?:[-_](?:lsb|msb))?)?$" +# reserved character +RES_CHR = "\xff" + +# sources: +# - http://rabbit.eng.miami.edu/info/baudot.html +# - https://en.wikipedia.org/wiki/Baudot_code +# - https://fr.qwe.wiki/wiki/Baudot_code +# all alphabets consider MSB by default +# CCITT-1 original Baudot code (source: http://rabbit.eng.miami.edu/info/baudot.html) +CCITT1 = [ + "00001", "00010", + "\x00\xff\xff\xffA-JKEXGM/ZHLYSBRUTCQIWFNOVDP", + "\x00\xff\xff\xff1.6(2\xff7)\xff:\xff=3\xff8-4\xff9/\xff?\xff£5'0+", +] +# CCITT-2 revised Baudot code (source: http://rabbit.eng.miami.edu/info/baudot.html) +CCITT2 = [ + "11111", "11011", + "\x00E\nA SIU\rDRJNFCKTZLWHYPQOBG\xffMXV\xff", + "\x003\n- \x0787\r$4',!:(5\")2#6019?&\xff./;\xff", +] +# Original Baudot (French/European ; sources: https://fr.qwe.wiki/wiki/Baudot_code +# https://en.wikipedia.org/wiki/Baudot_code) +BAUDOT = EU = FR = [ + "10000", "01000", + "\x00AEÉYUIO\xffJGHBCFD \nXZSTWV\x7fKMLRQNP", + "\x0012&34°5 
67h89f0\xff.,:;!?'\x7f()=-/\u2116%", +] +# International Telegraphic Alphabet 1 (sources: https://fr.qwe.wiki/wiki/Baudot_code +# https://en.wikipedia.org/wiki/Baudot_code) +ITA1 = [ + "10000", "01000", + "\x00AE\rYUIO\xffJGHBCFD \xffXZSTWV\x7fKMLRQNP", + "\x0012\r34\xff5 67+89\xff0\xff\n,:.\xff?'\x7f()=-/\xff%", +] +# International Telegraphic Alphabet 2 (sources: https://fr.qwe.wiki/wiki/Baudot_code +# https://en.wikipedia.org/wiki/Baudot_code) +ITA2 = [ + "11111", "11011", + "\x00E\nA SIU\rDRJNFCKTZLWHYPQOBG\xffMXV\xff", + "\x003\n- '87\r\x054\x07,!:(5+)2$6019?&\xff./=\xff", +] +# International Telegraphic Alphabet 2 - US TTY (sources: https://fr.qwe.wiki/wiki/Baudot_code +# https://en.wikipedia.org/wiki/Baudot_code) +ITA2_US = US_TTY = [ + "11111", "11011", + "\x00E\nA SIU\rDRJNFCKTZLWHYPQOBG\xffMXV\xff", + "\x003\n- \x0787\r$4',!:(5\")2#6019?&\xff./;\xff", +] +# International Telegraphic Alphabet 2 - Meteo version (source: https://en.wikipedia.org/wiki/Baudot_code) +ITA2_METEO = [ + "11111", "11011", + "\x00E\nA SIU\rDRJNFCKTZLWHYPQOBG\xffMXV\xff", + "-3\n\u2191 \x0787\r\u21974\u2199\u29b7\u2192\u25ef\u21905+\u21962\u21936019\u2295\u2198\xff./\u29b6\xff", +] +# Russian MTK-2 alphabet (source: https://fr.qwe.wiki/wiki/Baudot_code) +MTK2 = [ + "11111", "11011", + "\x00Е\n\xff СИУ\r\xffРЙНФЦКТЗЛВХЫПЯОБГ\xffМЬЖ\xff", + "\x003\n- '87\r\xff4Ю,Э:(5+)2Щ6019?Ш\xff./=\xff", +] +# Murray code ; NB: not all fractions are supported (source: https://en.wikipedia.org/wiki/Baudot_code) +MURRAY = [ + "00100", "11011", + " E\xffA\xffSIU\nDRJNFCKTZLWHYPQOBF\xffMXV\x7f", + "\x003\xff\xff\xff'87\n²4\xff-\u215f(\xff5./2\xff6019?\xff\xff,£)*", +] +# English Baudot ; NB: not all fractions are supported (sources: https://fr.qwe.wiki/wiki/Baudot_code +# https://en.wikipedia.org/wiki/Baudot_code) +UK = [ + "10000", "01000", + "\x00AE/YUIO\xffJGHBCFD -XZSTWV\x7fKMLRQNP", + "\x0012\u215f34\xff5 67\xb989\xff0\xff.\xff:\xff²?'\x7f()=-/£+", +] + + +def _bits_from_tape(tape, trans={'*': "1", 
' ': "0"}): + """ Converts a tape-like string with the given translation for ones and zeros to a series of bits. """ + bits = "" + trans_rev = {v: k for k, v in trans.items()} + for i, line in enumerate(tape.splitlines()): + if i == 0: + if line != trans_rev['1'] * 3 + "." + trans_rev['1'] * 2: + raise ValueError("Bad tape header '{}'".format(line)) + else: + line = line[:3] + line[4:] + if len(line) != 5: + raise ValueError("Bad tape line '{}'".format(line)) + bits += "".join(trans.get(c, "") for c in line) + return bits + + +def _bits_to_tape(bits, trans={'1': "*", '0': " "}): + """ Converts a series of bits to a tape-like string with the given translation for ones and zeros. """ + tape = [trans['1'] * 3 + "." + trans['1'] * 2] + for i in range(0, len(bits), 5): + group = "".join(trans[b] for b in bits[i:i+5]) + tape.append(group[:3] + "." + group[3:]) + return "\n".join(tape) + + +def _check_alphabet(alphabet): + """ Checks the length of letters and figures (must be 32 chars). """ + for chars in alphabet: + l = len(chars) + if l != 32: + raise ValueError("Bad length of alphabet (%d instead of 32)" % l) + + +def _handle_alphabet(alphabet): + """ Gets the given alphabet name and transforms it to its dictionary with letters and figures. 
""" + alphabet = (alphabet or "baudot").lower().replace("-", "_").strip("_") + if "_lsb" in alphabet: + alphabet = alphabet.replace("_lsb", "") + func = lambda x: x[::-1] + else: + alphabet = alphabet.replace("_msb", "") + func = lambda x: x + _ = globals()[alphabet.upper()] + st, a = _[:2], _[2:] + _check_alphabet(a) + alphabet = {n: {ch: bin(i)[2:].zfill(5) for i, ch in enumerate(alph) if ch != RES_CHR} for n, alph in \ + zip(["letters", "figures"], a)} + return alphabet, {'letters': st[0], 'figures': st[1]}, func + + +def baudot_encode(alphabet=None, spaced=False, tape=False): + ename = "baudot" + ("-spaced" if spaced else "-tape" if tape else "") + alphabet, states, func = _handle_alphabet(alphabet) + def encode(text, errors="strict"): + text = text.upper() + s, l, state, seen_states = "", len(b(text)), None, [] + for i, c in enumerate(text): + # if the state is undefined yet, find the relevant alphabet + if state is None: + bits= None + for st in states.keys(): + try: + bits = func(alphabet[st][c]) + state = st + if st not in seen_states: + seen_states.append(st) + break + except KeyError: + pass + if bits is None: + bits = handle_error(ename, errors, "?", 5)(c, i) + s += bits + # otherwise, handle state change (when the current alphabet does not contain the character to encode but the + # other alphabet does + else: + try: + s += func(alphabet[state][c]) + continue + except KeyError: + state = list(set(states.keys()) - {state})[0] + try: + s += func(states[state]) + func(alphabet[state][c]) + if state not in seen_states: + seen_states.append(state) + except KeyError as e: + state = list(set(states.keys()) - {state})[0] # reset the state + s += handle_error(ename, errors, "?", 5)(c, i) + # by default, if no state is specified, the encoded string is handled as letters ; so if figures are used only, + # it is necessary to include the groups of bits for figures at the beginning of the encoded string + s = (states['figures'] if seen_states == ["figures"] else "") 
+ s + if spaced: + s = " ".join(s[i:i+5] for i in range(0, len(s), 5)) + elif tape: + s = _bits_to_tape(s) + return s, l + return encode + + +def baudot_decode(alphabet=None, spaced=False, tape=False): + ename = "baudot" + ("-spaced" if spaced else "-tape" if tape else "") + alphabet, states, func = _handle_alphabet(alphabet) + alphabet = {st: {v: k for k, v in alph.items()} for st, alph in alphabet.items()} + states = {v: k for k, v in states.items()} + def decode(text, errors="strict"): + s, l = "", len(b(text)) + if spaced: + text = text.replace(" ", "") + elif tape: + text = _bits_from_tape(text) + # infer the starting state by searching for the first encountered groups of bits indicating a valid state ; + # by default, we assume letters + state = "letters" + for i in range(0, len(text), 5): + bits = func(text[i:i+5]) + # the following code handles a possible ambiguity ; e.g. when letters have a group of bits matching + # a state change + if bits in states.keys(): + error = False + # so, when we see the bits of a state, we parse previous groups in order to determine if they are valid + # groups in the corresponding state, that is, if no error occurs ; if an error occurs, then it is a + # valid state change and not simply a character, and we can set it as the starting state + for j in range(i-5, 0, -5): + try: + alphabet[states[bits]][text[j:j+5]] + except KeyError: + error = True + break + if error: + state = list(set(states.values()) - {states[bits]})[0] + break + # now parse the input text + for i in range(0, len(text), 5): + bits = func(text[i:i+5]) + try: + s += alphabet[state][bits] + except KeyError: + if bits in states.keys() and states[bits] != state: + state = states[bits] + else: + s += handle_error(ename, errors, decode=True, item="group")(bits, i//5) + return s, l + return decode + + +add("baudot", baudot_encode, baudot_decode, PATTERN % r"", examples=__examples1__, guess=[x % "" for x in __guess__], + entropy=1., printables_rate=1.) 
+ + +baudot_spaced_encode = lambda a: baudot_encode(a, spaced=True) +baudot_spaced_decode = lambda a: baudot_decode(a, spaced=True) +add("baudot-spaced", baudot_spaced_encode, baudot_spaced_decode, PATTERN % r"[-_]spaced", examples=__examples2__, + guess=[x % "-spaced" for x in __guess__], entropy=1.48, printables_rate=1.) + + +baudot_tape_encode = lambda a: baudot_encode(a, tape=True) +baudot_tape_decode = lambda a: baudot_decode(a, tape=True) +add("baudot-tape", baudot_tape_encode, baudot_tape_decode, PATTERN % r"[-_]tape", examples=__examples3__, + guess=[x % "-tape" for x in __guess__], entropy=1.86, printables_rate=1.) + diff --git a/src/codext/binary/rotate.py b/src/codext/binary/rotate.py index 944e2b2..fb0c697 100755 --- a/src/codext/binary/rotate.py +++ b/src/codext/binary/rotate.py @@ -1,52 +1,51 @@ -# -*- coding: UTF-8 -*- -"""Rotate-Bits Codec - rotate-N-bits content encoding. - -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ..__common__ import * - - -__examples__ = { - 'enc(rotate-0|rotate-8|rotate-left-8)': None, - 'enc(rotate1|rotate-right-1|rotate_1)': {'This is a test': "*4\xb4\xb9\x10\xb4\xb9\x10\xb0\x10:\xb2\xb9:"}, - 'enc(rotate-left-1|rotate_left_1)': {'This is a test': "¨ÐÒæ@Òæ@Â@èÊæè"}, -} -__guess__ = ["rotate-%d" % i for i in range(1, 8)] + ["rotate-left-%d" % i for i in range(1, 8)] - - -if PY3: - def _getn(i): - m = 1 - if str(i).startswith("left"): - i = i[4:].lstrip("-_") - m = -1 - return m * int(i) - - - def _rotaten(text, n=1): - r = "" - for c in ensure_str(text): - b = bin(ord(c))[2:].zfill(8) - r += chr(int(b[-n:] + b[:-n], 2)) - return r - - - def rotate_encode(i): - def encode(text, errors="strict"): - return _rotaten(text, _getn(i)), len(text) - return encode - - - def rotate_decode(i): - def decode(text, errors="strict"): - return _rotaten(text, -_getn(i)), len(text) - return decode - 
- - add("rotate", rotate_encode, rotate_decode, r"rotate(?:[-_]?bits)?[-_]?((?:(?:left|right)[-_]?)?[1-7])$", - transitive=True) - +# -*- coding: UTF-8 -*- +"""Rotate-Bits Codec - rotate-N-bits content encoding. + +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__examples__ = { + 'enc(rotate-0|rotate-8|rotate-left-8)': None, + 'enc(rotate1|rotate-right-1|rotate_1)': {'This is a test': "*4\xb4\xb9\x10\xb4\xb9\x10\xb0\x10:\xb2\xb9:"}, + 'enc(rotate-left-1|rotate_left_1)': {'This is a test': "¨ÐÒæ@Òæ@Â@èÊæè"}, +} +__guess__ = ["rotate-%d" % i for i in range(1, 8)] + ["rotate-left-%d" % i for i in range(1, 8)] + + +def _getn(i): + m = 1 + if str(i).startswith("left"): + i = i[4:].lstrip("-_") + m = -1 + return m * int(i) + + +def _rotaten(text, n=1): + r = "" + for c in ensure_str(text): + b = bin(ord(c))[2:].zfill(8) + r += chr(int(b[-n:] + b[:-n], 2)) + return r + + +def rotate_encode(i): + def encode(text, errors="strict"): + return _rotaten(text, _getn(i)), len(text) + return encode + + +def rotate_decode(i): + def decode(text, errors="strict"): + return _rotaten(text, -_getn(i)), len(text) + return decode + + +add("rotate", rotate_encode, rotate_decode, r"rotate(?:[-_]?bits)?[-_]?((?:(?:left|right)[-_]?)?[1-7])$", + transitive=True) + diff --git a/src/codext/common/cases.py b/src/codext/common/cases.py index 8aa87e4..2f91ada 100644 --- a/src/codext/common/cases.py +++ b/src/codext/common/cases.py @@ -27,11 +27,12 @@ add("lowercase", lowercase, uppercase, r"^lower(?:case)?$", penalty=.2) slugify = lambda i, e="strict", d="-": (re.sub(r"[^0-9a-z]+", d, i.lower()).strip(d), len(i)) -add("slugify", lambda i, e="strict": slugify(i, e), None, r"^(?:slug(?:ify)?|kebab(?:[-_]?case)?)$") +add("slugify", lambda i, e="strict": slugify(i, e), None, r"^(?:slug(?:ify)?|(?:dash|kebab)(?:[-_]?case)?)$") 
add("snakecase", lambda i, e="strict": slugify(i, e, "_"), None, r"^snake(?:[-_]?case)?$") +add("screamingsnakecase", lambda i, e="strict": slugify(i, e, "_").upper(), None, r"^screaming[-_]snake(?:[-_]?case)?$") swapcase = lambda i, e="strict": (i.swapcase(), len(i)) -add("swapcase", swapcase, swapcase, r"^(?:swap(?:[-_]?case)?|invert(?:case)?)$", penalty=.2) +add("swapcase", swapcase, swapcase, r"^(?:(?:flip|swap)(?:[-_]?case)?|invert(?:case)?)$", penalty=.2) title = lambda i, e="strict": (i.title(), len(i)) untitle = lambda i, e="strict": (" ".join(w[0].lower() + w[1:] if len(w) > 0 else "" for w in i.split()), len(i)) diff --git a/src/codext/compressions/pkzip.py b/src/codext/compressions/pkzip.py index 47d9cd5..35ec94e 100755 --- a/src/codext/compressions/pkzip.py +++ b/src/codext/compressions/pkzip.py @@ -1,56 +1,55 @@ -# -*- coding: UTF-8 -*- -"""Pkzip Codec - pkzip content compression. - -NB: Not an encoding properly speaking. - -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -import zipfile - -from ..__common__ import * - - -_str = ["test", "This is a test", "@random{512,1024,2048}"] -__examples1__ = {'enc-dec(pkzip-deflate|deflate)': _str} -__examples2__ = {'enc-dec(pkzip_bz2|bzip2)': _str} -__examples3__ = {'enc-dec(pkzip-lzma|lzma)': _str} - - -if PY3: - NULL = { - 8: b"\x03\x00", - 12: b"BZh9\x17rE8P\x90\x00\x00\x00\x00", - 14: b"\t\x04\x05\x00]\x00\x00\x80\x00\x00\x83\xff\xfb\xff\xff\xc0\x00\x00\x00", - } - - - def pkzip_encode(compression_type): - def _encode(text, errors="strict"): - c = zipfile._get_compressor(compression_type) - return c.compress(b(text)) + c.flush(), len(text) - return _encode - - - def pkzip_decode(compression_type, name): - def _decode(data, errors="strict"): - d = zipfile._get_decompressor(compression_type) - r = d.decompress(b(data)) - if len(r) == 0 and b(data) != NULL[compression_type]: - 
return handle_error(name, errors, decode=True)(data[0], 0) if len(data) > 0 else "", len(data) - return r, len(r) - return _decode - - - add("pkzip_deflate", pkzip_encode(8), pkzip_decode(8, "deflate"), r"(?:(?:pk)?zip[-_])?deflate", - examples=__examples1__, guess=["deflate"]) - - add("pkzip_bzip2", pkzip_encode(12), pkzip_decode(12, "bzip2"), r"(?:(?:pk)?zip[-_])?bz(?:ip)?2", - examples=__examples2__, guess=["bz2"]) - - add("pkzip_lzma", pkzip_encode(14), pkzip_decode(14, "lzma"), r"(?:(?:pk)?zip[-_])?lzma", - examples=__examples3__, guess=["lzma"]) - +# -*- coding: UTF-8 -*- +"""Pkzip Codec - pkzip content compression. + +NB: Not an encoding properly speaking. + +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +import zipfile + +from ..__common__ import * + + +_str = ["test", "This is a test", "@random{512,1024,2048}"] +__examples1__ = {'enc-dec(pkzip-deflate|deflate)': _str} +__examples2__ = {'enc-dec(pkzip_bz2|bzip2)': _str} +__examples3__ = {'enc-dec(pkzip-lzma|lzma)': _str} + + +NULL = { + 8: b"\x03\x00", + 12: b"BZh9\x17rE8P\x90\x00\x00\x00\x00", + 14: b"\t\x04\x05\x00]\x00\x00\x80\x00\x00\x83\xff\xfb\xff\xff\xc0\x00\x00\x00", +} + + +def pkzip_encode(compression_type): + def _encode(text, errors="strict"): + c = zipfile._get_compressor(compression_type) + return c.compress(b(text)) + c.flush(), len(text) + return _encode + + +def pkzip_decode(compression_type, name): + def _decode(data, errors="strict"): + d = zipfile._get_decompressor(compression_type) + r = d.decompress(b(data)) + if len(r) == 0 and b(data) != NULL[compression_type]: + return handle_error(name, errors, decode=True)(data[0], 0) if len(data) > 0 else "", len(data) + return r, len(r) + return _decode + + +add("pkzip_deflate", pkzip_encode(8), pkzip_decode(8, "deflate"), r"(?:(?:pk)?zip[-_])?deflate", + examples=__examples1__, guess=["deflate"]) + 
+add("pkzip_bzip2", pkzip_encode(12), pkzip_decode(12, "bzip2"), r"(?:(?:pk)?zip[-_])?bz(?:ip)?2", + examples=__examples2__, guess=["bz2"]) + +add("pkzip_lzma", pkzip_encode(14), pkzip_decode(14, "lzma"), r"(?:(?:pk)?zip[-_])?lzma", + examples=__examples3__, guess=["lzma"]) + diff --git a/src/codext/crypto/railfence.py b/src/codext/crypto/railfence.py index 3d150c0..a25f27a 100644 --- a/src/codext/crypto/railfence.py +++ b/src/codext/crypto/railfence.py @@ -1,96 +1,96 @@ -# -*- coding: UTF-8 -*- -"""Rail Fence Cipher Codec - rail fence content encoding. - -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ..__common__ import * - - -__examples__ = { - 'enc(rail_123|rail-2-123)': {'this is a test': None}, - 'enc(railfence|zigzag)': {'this is a test': "t ashsi etist"}, - 'enc(rail-5|zigzag_5)': {'this is a test': "tah istsiet s"}, - 'enc(rail_5-3|rail_5_3)': {'this is a test': "it sss etiath "}, - 'enc(rail-5-3-up|rail_5_3-up)': {'this is a test': "h tiats e ssit"}, - 'enc(rail-7-4|rail_7_4)': {'this is a test': "a stiet shsti"}, - 'dec(zigzag)': {'': ""}, -} -__guess__ = ["railfence-%d" % i for i in range(1, 11)] + ["railfence-%d-up" % i for i in range(1, 11)] - - -def __build(text, rails, offset, up): - l, rail = len(text), offset - # set the starting rail and direction - if up: - dr = -1 - rail = rails - offset - 1 - else: - dr = 1 - # create rails - f = [[None] * l for i in range(rails)] - # now zig-zag between rails - for x in range(l): - f[rail][x] = text[x] - if rail >= rails - 1: - dr = -1 - elif rail <= 0: - dr = 1 - rail += dr - return f - - -def __check(length, rails, offset): - if rails > length: - raise ParameterError("Bad parameter for encoding 'railfence': rails=%d (should be >%d)" % (rails, length)) - if offset > rails: - raise ParameterError("Bad parameter for encoding 'railfence': offset=%d (should be >%d)" % 
(offset, rails)) - - -def railfence_encode(rails, offset, up): - rails, offset, up = int(rails or 3), int(offset or 0), up is not None and up != "" - def encode(text, errors="strict"): - r, l = "", len(text) - __check(l, rails, offset) - f = __build(text, rails, offset, up) - for rail in range(rails): - for x in range(l): - if f[rail][x] is not None: - r += f[rail][x] - return r, l - return encode - - -def railfence_decode(rails, offset, up): - rails, offset, up = int(rails or 3), int(offset or 0), up is not None and up != "" - def decode(text, errors="strict"): - # this if block is particularly useful with Python2 ; see codecs.py at line 492 in comparison with codecs.py - # from Python3 at line 501: in Python2, a last block can be read while empty while in Python3 not - # as a consequence, in Python2, an error is triggered as an empty text cannot be decoded with Rail Fence with - # a rails parameter > 0 (see the __check(length, rails, offset)) function - if text == "": - return "", 0 - r, i, l = "", 0, len(text) - __check(l, rails, offset) - f = __build("." * len(text), rails, offset, up) - # put the characters in the right place - for rail in range(rails): - for x in range(l): - if f[rail][x] == ".": - f[rail][x] = text[i] - i += 1 - # read the characters in the right order - for x in range(l): - for rail in range(rails): - if f[rail][x] is not None: - r += f[rail][x] - return r, len(text) - return decode - - -add("railfence", railfence_encode, railfence_decode, - r"^(?:rail(?:[-_]?fence)?|zigzag)(?:[-_]([1-9]|[1-9]\d+)(?:[-_]([0-9]|[1-9]\d+))?(?:[-_](up))?)?$") - +# -*- coding: UTF-8 -*- +"""Rail Fence Cipher Codec - rail fence content encoding. 
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__examples__ = { + 'enc(rail_123|rail-2-123)': {'this is a test': None}, + 'enc(railfence|zigzag)': {'this is a test': "t ashsi etist"}, + 'enc(rail-5|zigzag_5)': {'this is a test': "tah istsiet s"}, + 'enc(rail_5-3|rail_5_3)': {'this is a test': "it sss etiath "}, + 'enc(rail-5-3-up|rail_5_3-up)': {'this is a test': "h tiats e ssit"}, + 'enc(rail-7-4|rail_7_4)': {'this is a test': "a stiet shsti"}, + 'dec(zigzag)': {'': ""}, +} +__guess__ = ["railfence-%d" % i for i in range(1, 11)] + ["railfence-%d-up" % i for i in range(1, 11)] + + +def __build(text, rails, offset, up): + l, rail = len(text), offset + # set the starting rail and direction + if up: + dr = -1 + rail = rails - offset - 1 + else: + dr = 1 + # create rails + f = [[None] * l for i in range(rails)] + # now zig-zag between rails + for x in range(l): + f[rail][x] = text[x] + if rail >= rails - 1: + dr = -1 + elif rail <= 0: + dr = 1 + rail += dr + return f + + +def __check(length, rails, offset): + if rails > length: + raise ParameterError("Bad parameter for encoding 'railfence': rails=%d (should be <= %d)" % (rails, length)) + if offset > rails: + raise ParameterError("Bad parameter for encoding 'railfence': offset=%d (should be <= %d)" % (offset, rails)) + + +def railfence_encode(rails, offset, up): + rails, offset, up = int(rails or 3), int(offset or 0), up is not None and up != "" + def encode(text, errors="strict"): + r, l = "", len(text) + __check(l, rails, offset) + f = __build(text, rails, offset, up) + for rail in range(rails): + for x in range(l): + if f[rail][x] is not None: + r += f[rail][x] + return r, l + return encode + + +def railfence_decode(rails, offset, up): + rails, offset, up = int(rails or 3), int(offset or 0), up is not None and up != "" + def decode(text, 
errors="strict"): + # this if block is particularly useful with Python2 ; see codecs.py at line 492 in comparison with codecs.py + # from Python3 at line 501: in Python2, a last block can be read while empty while in Python3 not + # as a consequence, in Python2, an error is triggered as an empty text cannot be decoded with Rail Fence with + # a rails parameter > 0 (see the __check(length, rails, offset)) function + if text == "": + return "", 0 + r, i, l = "", 0, len(text) + __check(l, rails, offset) + f = __build("." * len(text), rails, offset, up) + # put the characters in the right place + for rail in range(rails): + for x in range(l): + if f[rail][x] == ".": + f[rail][x] = text[i] + i += 1 + # read the characters in the right order + for x in range(l): + for rail in range(rails): + if f[rail][x] is not None: + r += f[rail][x] + return r, len(text) + return decode + + +add("railfence", railfence_encode, railfence_decode, + r"^(?:rail(?:[-_]?fence)?|zigzag)(?:[-_]([1-9]|[1-9]\d+)(?:[-_]([0-9]|[1-9]\d+))?(?:[-_](up))?)?$") + diff --git a/src/codext/hashing/blake.py b/src/codext/hashing/blake.py index 2fad090..6656c46 100644 --- a/src/codext/hashing/blake.py +++ b/src/codext/hashing/blake.py @@ -8,20 +8,18 @@ - transform strings from bytes to bytes - transform file content from str to bytes (write) """ -import hashlib +from ..__common__ import * -from ..__common__ import add, b, PY3 +def blake_hash(c): + def _hash_transform(l): + l = (l or "64" if c == "b" else "32").lstrip("_-") + def _encode(data, error="strict"): + return getattr(hashlib, "blake2%s" % c)(b(data), digest_size=int(l)).hexdigest(), len(data) + return _encode + return _hash_transform -if PY3: - def blake_hash(c): - def _hash_transform(l): - l = (l or "64" if c == "b" else "32").lstrip("_-") - def _encode(data, error="strict"): - return getattr(hashlib, "blake2%s" % c)(b(data), digest_size=int(l)).hexdigest(), len(data) - return _encode - return _hash_transform - add("blake2b", blake_hash("b"), 
pattern=r"^blake2b(|[-_](?:[1-9]|[1-5]\d|6[0-4]))$", guess=None) - add("blake2s", blake_hash("s"), pattern=r"^blake2s(|[-_](?:[1-9]|[1-2]\d|3[0-2]))$", guess=None) +add("blake2b", blake_hash("b"), pattern=r"^blake2b(|[-_](?:[1-9]|[1-5]\d|6[0-4]))$", guess=None) +add("blake2s", blake_hash("s"), pattern=r"^blake2s(|[-_](?:[1-9]|[1-2]\d|3[0-2]))$", guess=None) diff --git a/src/codext/hashing/crypt.py b/src/codext/hashing/crypt.py index caf8290..0d44d8e 100644 --- a/src/codext/hashing/crypt.py +++ b/src/codext/hashing/crypt.py @@ -8,10 +8,10 @@ - transform strings from bytes to bytes - transform file content from str to bytes (write) """ -from ..__common__ import add, ensure_str, PY3, UNIX +from ..__common__ import add, ensure_str, UNIX -if PY3 and UNIX: +if UNIX: import crypt METHODS = [x[7:].lower() for x in crypt.__dict__ if x.startswith("METHOD_")] diff --git a/src/codext/hashing/md.py b/src/codext/hashing/md.py index 6463722..521a01c 100644 --- a/src/codext/hashing/md.py +++ b/src/codext/hashing/md.py @@ -8,9 +8,7 @@ - transform strings from bytes to bytes - transform file content from str to bytes (write) """ -import hashlib - -from ..__common__ import add, b +from ..__common__ import * MD2_TABLE = [41, 46, 67, 201, 162, 216, 124, 1, 61, 54, 84, 161, 236, 240, 6, 19, 98, 167, 5, 243, 192, 199, 115, 140, diff --git a/src/codext/hashing/sha.py b/src/codext/hashing/sha.py index dd94002..1351fe8 100644 --- a/src/codext/hashing/sha.py +++ b/src/codext/hashing/sha.py @@ -8,9 +8,7 @@ - transform strings from bytes to bytes - transform file content from str to bytes (write) """ -import hashlib - -from ..__common__ import add, b, PY3 +from ..__common__ import * add("sha1", lambda s, error="strict": (hashlib.sha1(b(s)).hexdigest(), len(s)), guess=None) @@ -18,15 +16,12 @@ add("sha256", lambda s, error="strict": (hashlib.sha256(b(s)).hexdigest(), len(s)), guess=None) add("sha384", lambda s, error="strict": (hashlib.sha384(b(s)).hexdigest(), len(s)), guess=None) 
add("sha512", lambda s, error="strict": (hashlib.sha512(b(s)).hexdigest(), len(s)), guess=None) - - -if PY3: - add("sha3_224", lambda s, error="strict": (hashlib.sha3_224(b(s)).hexdigest(), len(s)), pattern=r"^sha3[-_]224$", - guess=None) - add("sha3_256", lambda s, error="strict": (hashlib.sha3_256(b(s)).hexdigest(), len(s)), pattern=r"^sha3[-_]256$", - guess=None) - add("sha3_384", lambda s, error="strict": (hashlib.sha3_384(b(s)).hexdigest(), len(s)), pattern=r"^sha3[-_]384$", - guess=None) - add("sha3_512", lambda s, error="strict": (hashlib.sha3_512(b(s)).hexdigest(), len(s)), pattern=r"^sha3[-_]512$", - guess=None) +add("sha3_224", lambda s, error="strict": (hashlib.sha3_224(b(s)).hexdigest(), len(s)), pattern=r"^sha3[-_]224$", + guess=None) +add("sha3_256", lambda s, error="strict": (hashlib.sha3_256(b(s)).hexdigest(), len(s)), pattern=r"^sha3[-_]256$", + guess=None) +add("sha3_384", lambda s, error="strict": (hashlib.sha3_384(b(s)).hexdigest(), len(s)), pattern=r"^sha3[-_]384$", + guess=None) +add("sha3_512", lambda s, error="strict": (hashlib.sha3_512(b(s)).hexdigest(), len(s)), pattern=r"^sha3[-_]512$", + guess=None) diff --git a/src/codext/hashing/shake.py b/src/codext/hashing/shake.py index af79dce..22c7b99 100644 --- a/src/codext/hashing/shake.py +++ b/src/codext/hashing/shake.py @@ -8,20 +8,18 @@ - transform strings from bytes to bytes - transform file content from str to bytes (write) """ -import hashlib +from ..__common__ import * -from ..__common__ import add, b, PY3 +def shake_hash(i): + def _hash_transform(l): + l = (l or str(i)).lstrip("_-") + def _encode(data, error="strict"): + return getattr(hashlib, "shake_%d" % i)(b(data)).hexdigest(int(l)), len(data) + return _encode + return _hash_transform -if PY3: - def shake_hash(i): - def _hash_transform(l): - l = (l or str(i)).lstrip("_-") - def _encode(data, error="strict"): - return getattr(hashlib, "shake_%d" % i)(b(data)).hexdigest(int(l)), len(data) - return _encode - return _hash_transform - 
add("shake_128", shake_hash(128), pattern=r"^shake[-_]?128(|[-_][1-9]\d*)$", guess=None) - add("shake_256", shake_hash(256), pattern=r"^shake[-_]?256(|[-_][1-9]\d*)$", guess=None) +add("shake_128", shake_hash(128), pattern=r"^shake[-_]?128(|[-_][1-9]\d*)$", guess=None) +add("shake_256", shake_hash(256), pattern=r"^shake[-_]?256(|[-_][1-9]\d*)$", guess=None) diff --git a/src/codext/languages/braille.py b/src/codext/languages/braille.py index b28c56e..775399c 100755 --- a/src/codext/languages/braille.py +++ b/src/codext/languages/braille.py @@ -1,34 +1,33 @@ -# -*- coding: UTF-8 -*- -"""Braille Codec - braille content encoding. - -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ..__common__ import * - - -__examples__ = { - 'enc(braille)': {'this is a test': "⠞⠓⠊⠎⠀⠊⠎⠀⠁⠀⠞⠑⠎⠞"}, -} - - -ENCMAP = { - # digits - '0': '⠴', '1': '⠂', '2': '⠆', '3': '⠒', '4': '⠲', '5': '⠢', '6': '⠖', '7': '⠶', '8': '⠦', '9': '⠔', - # letters - 'a': '⠁', 'b': '⠃', 'c': '⠉', 'd': '⠙', 'e': '⠑', 'f': '⠋', 'g': '⠛', 'h': '⠓', 'i': '⠊', 'j': '⠚', 'k': '⠅', - 'l': '⠇', 'm': '⠍', 'n': '⠝', 'o': '⠕', 'p': '⠏', 'q': '⠟', 'r': '⠗', 's': '⠎', 't': '⠞', 'u': '⠥', 'v': '⠧', - 'w': '⠺', 'x': '⠭', 'y': '⠽', 'z': '⠵', - # punctuation - ' ': '⠀', '!': '⠮', '"': '⠐', '#': '⠼', '$': '⠫', '%': '⠩', '&': '⠯', ':': '⠱', ';': '⠰', '<': '⠣', '=': '⠿', - '>': '⠜', '?': '⠹', '@': '⠈', "'": '⠄', '(': '⠷', ')': '⠾', '*': '⠡', '+': '⠬', ',': '⠠', '-': '⠤', '.': '⠨', - '/': '⠌', '[': '⠪', '\\': '⠳', ']': '⠻', '^': '⠘', '_': '⠸', -} - - -if PY3: - add_map("braille", ENCMAP, ignore_case="encode") - +# -*- coding: UTF-8 -*- +"""Braille Codec - braille content encoding. 
+ +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__examples__ = { + 'enc(braille)': {'this is a test': "⠞⠓⠊⠎⠀⠊⠎⠀⠁⠀⠞⠑⠎⠞"}, +} + + +ENCMAP = { + # digits + '0': '⠴', '1': '⠂', '2': '⠆', '3': '⠒', '4': '⠲', '5': '⠢', '6': '⠖', '7': '⠶', '8': '⠦', '9': '⠔', + # letters + 'a': '⠁', 'b': '⠃', 'c': '⠉', 'd': '⠙', 'e': '⠑', 'f': '⠋', 'g': '⠛', 'h': '⠓', 'i': '⠊', 'j': '⠚', 'k': '⠅', + 'l': '⠇', 'm': '⠍', 'n': '⠝', 'o': '⠕', 'p': '⠏', 'q': '⠟', 'r': '⠗', 's': '⠎', 't': '⠞', 'u': '⠥', 'v': '⠧', + 'w': '⠺', 'x': '⠭', 'y': '⠽', 'z': '⠵', + # punctuation + ' ': '⠀', '!': '⠮', '"': '⠐', '#': '⠼', '$': '⠫', '%': '⠩', '&': '⠯', ':': '⠱', ';': '⠰', '<': '⠣', '=': '⠿', + '>': '⠜', '?': '⠹', '@': '⠈', "'": '⠄', '(': '⠷', ')': '⠾', '*': '⠡', '+': '⠬', ',': '⠠', '-': '⠤', '.': '⠨', + '/': '⠌', '[': '⠪', '\\': '⠳', ']': '⠻', '^': '⠘', '_': '⠸', +} + + +add_map("braille", ENCMAP, ignore_case="encode") + diff --git a/src/codext/languages/galactic.py b/src/codext/languages/galactic.py index e77cb3a..26544b5 100644 --- a/src/codext/languages/galactic.py +++ b/src/codext/languages/galactic.py @@ -29,7 +29,6 @@ } -if PY3: - add_map("galactic", ENCMAP, ignore_case="encode", printables_rate=0., - pattern=r"^(?:galactic(?:[-_]alphabet)?|minecraft(?:[-_](?:enchantment|enchanting[-_]language))?)$") +add_map("galactic", ENCMAP, ignore_case="encode", printables_rate=0., + pattern=r"^(?:galactic(?:[-_]alphabet)?|minecraft(?:[-_](?:enchantment|enchanting[-_]language))?)$") diff --git a/src/codext/languages/tap.py b/src/codext/languages/tap.py index efd551d..ec7c15b 100644 --- a/src/codext/languages/tap.py +++ b/src/codext/languages/tap.py @@ -1,39 +1,38 @@ -# -*- coding: UTF-8 -*- -"""Tap code - Tap/knock code encoding. 
- -This codec: -- en/decodes strings from str to str -- en/decodes strings from bytes to bytes -- decodes file content to str (read) -- encodes file content from str to bytes (write) -""" -from ..__common__ import * - - -__examples__ = { - 'enc(tap|knock-code|tap_code)': {'this is a test' : ".... ....⠀.. ...⠀.. ....⠀.... ...⠀ ⠀.. ....⠀.... ...⠀ ⠀. ." - "⠀ ⠀.... ....⠀. .....⠀.... ...⠀.... ...."}, -} -__guess__ = ["tap", "tap-inv"] - - -def __build_encmap(a): - d, i = {}, 0 - for x in range(1,6): - for y in range(1,6): - d[a[i]] = x * "." + " " + y * "." - i += 1 - d['k'], d[' '] = d['c'], " " - return d - - - -ENCMAP = { - '': __build_encmap("abcdefghijlmnopqrstuvwxyz"), - 'inv': __build_encmap("abcdefghijlmnopqrstuvwxyz"[::-1]), -} - - -if PY3: - add_map("tap", ENCMAP, ignore_case="both", sep="⠀", pattern=r"^(?:tap|knock)(?:[-_]code)?(|inv)$") - +# -*- coding: UTF-8 -*- +"""Tap code - Tap/knock code encoding. + +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__examples__ = { + 'enc(tap|knock-code|tap_code)': {'this is a test' : ".... ....⠀.. ...⠀.. ....⠀.... ...⠀ ⠀.. ....⠀.... ...⠀ ⠀. ." + "⠀ ⠀.... ....⠀. .....⠀.... ...⠀.... ...."}, +} +__guess__ = ["tap", "tap-inv"] + + +def __build_encmap(a): + d, i = {}, 0 + for x in range(1,6): + for y in range(1,6): + d[a[i]] = x * "." + " " + y * "." 
+ i += 1 + d['k'], d[' '] = d['c'], " " + return d + + + +ENCMAP = { + '': __build_encmap("abcdefghijlmnopqrstuvwxyz"), + 'inv': __build_encmap("abcdefghijlmnopqrstuvwxyz"[::-1]), +} + + +add_map("tap", ENCMAP, ignore_case="both", sep="⠀", pattern=r"^(?:tap|knock)(?:[-_]code)?(|inv)$") + diff --git a/src/codext/others/uuencode.py b/src/codext/others/uuencode.py index a2f2fb6..f1ecfc3 100644 --- a/src/codext/others/uuencode.py +++ b/src/codext/others/uuencode.py @@ -17,7 +17,7 @@ 'dec(uu-encode)': {'.=&AI': "<This is a test>"}, - 'dec(html|html_entity)': {'&DoesNotExist;': None}, - 'dec(html_entities|html-entity)': { - '<This is a test>': "", - '<This is a test>': "", - }, -} -if PY3: - __examples__['enc(html)'] = {'\u1234': "&1234;"} - - -# source: https://dev.w3.org/html5/html-author/charref -ENCMAP = { - '\t': " ", '\n': " ", '!': "!", '"': """, '#': "#", '$': "$", '%': "%", - '&': "&", '\'': "'", '(': "(", ')': ")", '*': "*", '+': "+", ',': ",", - '.': ".", '/': "/", ':': ":", ';': ";", '<': "<", '=': "=", '>': ">", - '?': "?", '@': "@", '[': "[", '\\': "\", ']': "]", '^': "^", '_': "_", - '`': "`", '{': "{", '|': "|", '}': "}", '¡': "¡", '¢': "¢", - '£': "£", '¤': "¤", '¥': "¥", '¦': "¦", '§': "§", '¨': "¨", '©': "©", - 'ª': "ª", '«': "«", '¬': "¬", '­': "­", '®': "®", '¯': "¯", '°': "°", - '±': "±", '²': "²", '³': "³", '´': "´", 'µ': "µ", '¶': "¶", '·': "·", - '¸': "¸", '¹': "¹", 'º': "º", '»': "»", '¼': "¼", '½': "½", '¾': "¾", - '¿': "¿", 'À': "À", 'Á': "Á", 'Â': "Â", 'Ã': "Ã", 'Ä': "Ä", 'Å': "Å", - 'Æ': "Æ", 'Ç': "Ç", 'È': "È", 'É': "É", 'Ê': "Ê", 'Ë': "Ë", 'Ì': "Ì", - 'Í': "Í", 'Î': "Î", 'Ï': "Ï", 'Ð': "Ð", 'Ñ': "Ñ", 'Ò': "Ò", 'Ó': "Ó", - 'Ô': "Ô", 'Õ': "Õ", 'Ö': "Ö", '×': "×", 'Ø': "Ø", 'Ù': "Ù", 'Ú': "Ú", - 'Û': "Û", 'Ü': "Ü", 'Ý': "Ý", 'Þ': "Þ", 'ß': "ß", 'à': "à", 'á': "á", - 'â': "â", 'ã': "ã", 'ä': "ä", 'å': "å", 'æ': "æ", 'ç': "ç", 'è': "è", - 'é': "é", 'ê': "ê", 'ë': "ë", 'ì': "ì", 'í': "í", 'î': "î", 'ï': "ï", - 'ð': "ð", 'ñ': "ñ", 'ò': "ò", 'ó': 
"ó", 'ô': "ô", 'õ': "õ", 'ö': "ö", - '÷': "÷", 'ø': "ø", 'ù': "ù", 'ú': "ú", 'û': "û", 'ü': "ü", 'ý': "ý", - 'þ': "þ", 'ÿ': "ÿ", 'Ā': "Ā", 'ā': "ā", 'Ă': "Ă", 'ă': "ă", 'Ą': "Ą", - 'ą': "ą", 'Ć': "Ć", 'ć': "ć", 'Ĉ': "Ĉ", 'ĉ': "ĉ", 'Ċ': "Ċ", 'ċ': "ċ", - 'Č': "Č", 'č': "č", 'Ď': "Ď", 'ď': "ď", 'Đ': "Đ", 'đ': "đ", - 'Ē': "Ē", 'ē': "ē", 'Ė': "Ė", 'ė': "ė", 'Ę': "Ę", 'ę': "ę", 'Ě': "Ě", - 'ě': "ě", 'Ĝ': "Ĝ", 'ĝ': "ĝ", 'Ğ': "Ğ", 'ğ': "ğ", 'Ġ': "Ġ", 'ġ': "ġ", - 'Ģ': "Ģ", 'Ĥ': "Ĥ", 'ĥ': "ĥ", 'Ħ': "Ħ", 'ħ': "ħ", 'Ĩ': "Ĩ", - 'ĩ': "ĩ", 'Ī': "Ī", 'ī': "ī", 'Į': "Į", 'į': "į", 'İ': "İ", 'ı': "ı", - 'IJ': "IJ", 'ij': "ij", 'Ĵ': "Ĵ", 'ĵ': "ĵ", 'Ķ': "Ķ", 'ķ': "ķ", 'ĸ': "ĸ", - 'Ĺ': "Ĺ", 'ĺ': "ĺ", 'Ļ': "Ļ", 'ļ': "ļ", 'Ľ': "Ľ", 'ľ': "ľ", - 'Ŀ': "Ŀ", 'ŀ': "ŀ", 'Ł': "Ł", 'ł': "ł", 'Ń': "Ń", 'ń': "ń", - 'Ņ': "Ņ", 'ņ': "ņ", 'Ň': "Ň", 'ň': "ň", 'ʼn': "ʼn", 'Ŋ': "Ŋ", 'ŋ': "ŋ", - 'Ō': "Ō", 'ō': "ō", 'Ő': "Ő", 'ő': "ő", 'Œ': "Œ", 'œ': "œ", 'Ŕ': "Ŕ", - 'ŕ': "ŕ", 'Ŗ': "Ŗ", 'ŗ': "ŗ", 'Ř': "Ř", 'ř': "ř", 'Ś': "Ś", - 'ś': "ś", 'Ŝ': "Ŝ", 'ŝ': "ŝ", 'Ş': "Ş", 'ş': "ş", 'Š': "Š", - 'š': "š", 'Ţ': "Ţ", 'ţ': "ţ", 'Ť': "Ť", 'ť': "ť", 'Ŧ': "Ŧ", - 'ŧ': "ŧ", 'Ũ': "Ũ", 'ũ': "ũ", 'Ū': "Ū", 'ū': "ū", 'Ŭ': "Ŭ", - 'ŭ': "ŭ", 'Ů': "Ů", 'ů': "ů", 'Ű': "Ű", 'ű': "ű", 'Ų': "Ų", 'ų': "ų", - 'Ŵ': "Ŵ", 'ŵ': "ŵ", 'Ŷ': "Ŷ", 'ŷ': "ŷ", 'Ÿ': "Ÿ", 'Ź': "Ź", 'ź': "ź", - 'Ż': "Ż", 'ż': "ż", 'Ž': "Ž", 'ž': "ž", 'ƒ': "ƒ", 'Ƶ': "Ƶ", 'ǵ': "ǵ", - 'ȷ': "ȷ", 'ˆ': "ˆ", 'ˇ': "ˇ", '˘': "˘", '˙': "˙", '˚': "˚", '˛': "˛", - '˜': "˜", '˝': "˝", '̑': "̑", '̲': "_", 'Α': "Α", 'Β': "Β", - 'Γ': "Γ", 'Δ': "Δ", 'Ε': "Ε", 'Ζ': "Ζ", 'Η': "Η", 'Θ': "Θ", 'Ι': "Ι", - 'Κ': "Κ", 'Λ': "Λ", 'Μ': "Μ", 'Ν': "Ν", 'Ξ': "Ξ", 'Ο': "Ο", 'Π': "Π", - 'Ρ': "Ρ", 'Σ': "Σ", 'Τ': "Τ", 'Υ': "Υ", 'Φ': "Φ", 'Χ': "Χ", 'Ψ': "Ψ", - 'Ω': "Ω", 'α': "α", 'β': "β", 'γ': "γ", 'δ': "δ", 'ε': "ϵ", 'ζ': "ζ", - 'η': "η", 'θ': "θ", 'ι': "ι", 'κ': "κ", 'λ': "λ", 'μ': "μ", 'ν': "ν", - 'ξ': "ξ", 'ο': "ο", 'π': "π", 'ρ': "ρ", 'ς': "ς", 'σ': "σ", 'τ': "τ", - 'υ': "υ", 'φ': "φ", 'χ': "χ", 
'ψ': "ψ", 'ω': "ω", 'ϑ': "ϑ", 'ϒ': "ϒ", - 'ϕ': "ϕ", 'ϖ': "ϖ", 'Ϝ': "Ϝ", 'ϝ': "ϝ", 'ϰ': "ϰ", 'ϱ': "ϱ", - 'ϵ': "ε", '϶': "϶", 'Ё': "Ё", 'Ђ': "Ђ", 'Ѓ': "Ѓ", 'Є': "Є", 'Ѕ': "Ѕ", - 'І': "І", 'Ї': "Ї", 'Ј': "Ј", 'Љ': "Љ", 'Њ': "Њ", 'Ћ': "Ћ", 'Ќ': "Ќ", - 'Ў': "Ў", 'Џ': "Џ", 'А': "А", 'Б': "Б", 'В': "В", 'Г': "Г", 'Д': "Д", 'Е': "Е", - 'Ж': "Ж", 'З': "З", 'И': "И", 'Й': "Й", 'К': "К", 'Л': "Л", 'М': "М", 'Н': "Н", - 'О': "О", 'П': "П", 'Р': "Р", 'С': "С", 'Т': "Т", 'У': "У", 'Ф': "Ф", 'Х': "Х", - 'Ц': "Ц", 'Ч': "Ч", 'Ш': "Ш", 'Щ': "Щ", 'Ъ': "Ъ", 'Ы': "Ы", 'Ь': "Ь", - 'Э': "Э", 'Ю': "Ю", 'Я': "Я", 'а': "а", 'б': "б", 'в': "в", 'г': "г", 'д': "д", - 'е': "е", 'ж': "ж", 'з': "з", 'и': "и", 'й': "й", 'к': "к", 'л': "л", 'м': "м", - 'н': "н", 'о': "о", 'п': "п", 'р': "р", 'с': "с", 'т': "т", 'у': "у", 'ф': "ф", - 'х': "х", 'ц': "ц", 'ч': "ч", 'ш': "ш", 'щ': "щ", 'ъ': "ъ", 'ы': "ы", - 'ь': "ь", 'э': "э", 'ю': "ю", 'я': "я", 'ё': "ё", 'ђ': "ђ", 'ѓ': "ѓ", - 'є': "є", 'ѕ': "ѕ", 'і': "і", 'ї': "ї", 'ј': "ј", 'љ': "љ", 'њ': "њ", - 'ћ': "ћ", 'ќ': "ќ", 'ў': "ў", 'џ': "џ", '\u2002': " ", '\u2003': " ", - '\u2004': " ", '\u2005': " ", '\u2007': " ", '\u2008': " ", '\u2009': " ", - '\u200a': " ", '​\u200b': "​", '\u200c': "‌", '\u200d': "‍", '\u200e': "‎", - '\u200f': "‏", '‐': "‐", '–': "–", '—': "—", - '―': "―", '‖': "‖", '‘': "‘", '’': "’", '‚': "‚", '“': "“", '”': "”", - '„': "„", '†': "†", '‡': "‡", '•': "•", '‥': "‥", '…': "…", '‰': "‰", - '‱': "‱", '′': "′", '″': "″", '‴': "‴", '‵': "‵", '‹': "‹", - '›': "›", '‾': "‾", '⁁': "⁁", '⁃': "⁃", '⁄': "⁄", '⁏': "⁏", '⁗': "⁗", - '\u205f': " ", '⁠': "⁠", '⁡': "⁡", '⁢': "⁢", '⁣': "⁣", - '€': "€", '⃛': "⃛", '⃜': "⃜", 'ℂ': "ℂ", '℅': "℅", 'ℊ': "ℊ", 'ℋ': "ℋ", - 'ℌ': "ℌ", 'ℍ': "ℍ", 'ℎ': "ℎ", 'ℏ': "ℏ", 'ℐ': "ℐ", 'ℑ': "ℑ", - 'ℒ': "ℒ", 'ℓ': "ℓ", 'ℕ': "ℕ", '№': "№", '℗': "℗", '℘': "℘", 'ℙ': "ℙ", - 'ℚ': "ℚ", 'ℛ': "ℛ", 'ℜ': "ℜ", 'ℝ': "ℝ", '℞': "℞", '™': "™", 'ℤ': "ℤ", - 'Ω': "Ω", '℧': "℧", 'ℨ': "ℨ", '℩': "℩", 'Å': "Å", 'ℬ': "ℬ", 'ℭ': "ℭ", - 'ℯ': "ℯ", 
'ℰ': "ℰ", 'ℱ': "ℱ", 'ℳ': "ℳ", 'ℴ': "ℴ", 'ℵ': "ℵ", 'ℶ': "ℶ", - 'ℷ': "ℷ", 'ℸ': "ℸ", 'ⅅ': "ⅅ", 'ⅆ': "ⅆ", 'ⅇ': "ⅇ", - 'ⅈ': "ⅈ", '⅓': "⅓", '⅔': "⅔", '⅕': "⅕", '⅖': "⅖", '⅗': "⅗", - '⅘': "⅘", '⅙': "⅙", '⅚': "⅚", '⅛': "⅛", '⅜': "⅜", '⅝': "⅝", - '⅞': "⅞", '←': "←", '↑': "↑", '→': "→", '↓': "↓", '↔': "↔", '↕': "↕", - '↖': "↖", '↗': "↗", '↘': "↘", '↙': "↙", '↚': "↚", '↛': "↛", '↝': "↝", - '↞': "↞", '↟': "↟", '↠': "↠", '↡': "↡", '↢': "↢", '↣': "↣", - '↤': "↤", '↥': "↥", '↦': "↦", '↧': "↧", '↩': "↩", '↪': "↪", - '↫': "↫", '↬': "↬", '↭': "↭", '↮': "↮", '↰': "↰", '↱': "↱", '↲': "↲", - '↳': "↳", '↵': "↵", '↶': "↶", '↷': "↷", '↺': "↺", '↻': "↻", '↼': "↼", - '↽': "↽", '↾': "↾", '↿': "↿", '⇀': "⇀", '⇁': "⇁", '⇂': "⇂", '⇃': "⇃", - '⇄': "⇄", '⇅': "⇅", '⇆': "⇆", '⇇': "⇇", '⇈': "⇈", '⇉': "⇉", '⇊': "⇊", - '⇋': "⇋", '⇌': "⇌", '⇍': "⇍", '⇎': "⇎", '⇏': "⇏", '⇐': "⇐", '⇑': "⇑", - '⇒': "⇒", '⇓': "⇓", '⇔': "⇔", '⇕': "⇕", '⇖': "⇖", '⇗': "⇗", '⇘': "⇘", - '⇙': "⇙", '⇚': "⇚", '⇛': "⇛", '⇝': "⇝", '⇤': "⇤", '⇥': "⇥", '⇵': "⇵", - '⇽': "⇽", '⇾': "⇾", '⇿': "⇿", '∀': "∀", '∁': "∁", '∂': "∂", '∃': "∃", - '∄': "∄", '∅': "∅", '∇': "∇", '∈': "∈", '∉': "∉", '∋': "∋", '∌': "∌", - '∏': "∏", '∐': "∐", '∑': "∑", '−': "−", '∓': "∓", '∔': "∔", '∖': "∖", - '∗': "∗", '∘': "∘", '√': "√", '∝': "∝", '∞': "∞", '∟': "∟", '∠': "∠", - '∡': "∡", '∢': "∢", '∣': "∣", '∤': "∤", '∥': "∥", '∦': "∦", '∧': "∧", - '∨': "∨", '∩': "∩", '∪': "∪", '∫': "∫", '∬': "∬", '∭': "∭", '∮': "∮", - '∯': "∯", '∰': "∰", '∱': "∱", '∲': "∲", '∳': "∳", '∴': "∴", - '∵': "∵", '∶': "∶", '∷': "∷", '∸': "∸", '∺': "∺", '∻': "∻", '∼': "∼", - '∽': "∽", '∾': "∾", '∿': "∿", '≀': "≀", '≁': "≁", '≂': "≂", '≃': "≃", - '≄': "≄", '≅': "≅", '≆': "≆", '≇': "≇", '≈': "≈", '≉': "≉", '≊': "≊", - '≋': "≋", '≌': "≌", '≍': "≍", '≎': "≎", '≏': "≏", '≐': "≐", '≑': "≑", - '≒': "≒", '≓': "≓", '≔': "≔", '≕': "≕", '≖': "≖", '≗': "≗", '≙': "≙", - '≚': "≚", '≜': "≜", '≟': "≟", '≠': "≠", '≡': "≡", '≢': "≢", '≤': "≤", - '≥': "≥", '≦': "≦", '≧': "≧", '≨': "≨", '≩': "≩", '≪': "≪", 
'≫': "≫", '≬': "≬", - '≭': "≭", '≮': "≮", '≯': "≯", '≰': "≰", '≱': "≱", '≲': "≲", '≳': "≳", - '≴': "≴", '≵': "≵", '≶': "≶", '≷': "≷", '≸': "≸", '≹': "≹", '≺': "≺", '≻': "≻", - '≼': "≼", '≽': "≽", '≾': "≾", '≿': "≿", '⊀': "⊀", '⊁': "⊁", '⊂': "⊂", - '⊃': "⊃", '⊄': "⊄", '⊅': "⊅", '⊆': "⊆", '⊇': "⊇", '⊈': "⊈", '⊉': "⊉", - '⊊': "⊊", '⊋': "⊋", '⊍': "⊍", '⊎': "⊎", '⊏': "⊏", '⊐': "⊐", '⊑': "⊑", - '⊒': "⊒", '⊓': "⊓", '⊔': "⊔", '⊕': "⊕", '⊖': "⊖", '⊗': "⊗", '⊘': "⊘", - '⊙': "⊙", '⊚': "⊚", '⊛': "⊛", '⊝': "⊝", '⊞': "⊞", '⊟': "⊟", '⊠': "⊠", - '⊡': "⊡", '⊢': "⊢", '⊣': "⊣", '⊤': "⊤", '⊥': "⊥", '⊧': "⊧", '⊨': "⊨", - '⊩': "⊩", '⊪': "⊪", '⊫': "⊫", '⊬': "⊬", '⊭': "⊭", '⊮': "⊮", - '⊯': "⊯", '⊰': "⊰", '⊲': "⊲", '⊳': "⊳", '⊴': "⊴", '⊵': "⊵", '⊶': "⊶", - '⊷': "⊷", '⊸': "⊸", '⊹': "⊹", '⊺': "⊺", '⊻': "⊻", '⊽': "⊽", - '⊾': "⊾", '⊿': "⊿", '⋀': "⋀", '⋁': "⋁", '⋂': "⋂", '⋃': "⋃", '⋄': "⋄", - '⋅': "⋅", '⋆': "⋆", '⋇': "⋇", '⋈': "⋈", '⋉': "⋉", '⋊': "⋊", - '⋋': "⋋", '⋌': "⋌", '⋍': "⋍", '⋎': "⋎", '⋏': "⋏", '⋐': "⋐", '⋑': "⋑", - '⋒': "⋒", '⋓': "⋓", '⋔': "⋔", '⋕': "⋕", '⋖': "⋖", '⋗': "⋗", '⋘': "⋘", '⋙': "⋙", - '⋚': "⋚", '⋛': "⋛", '⋞': "⋞", '⋟': "⋟", '⋠': "⋠", '⋡': "⋡", '⋢': "⋢", - '⋣': "⋣", '⋦': "⋦", '⋧': "⋧", '⋨': "⋨", '⋩': "⋩", '⋪': "⋪", '⋫': "⋫", - '⋬': "⋬", '⋭': "⋭", '⋮': "⋮", '⋯': "⋯", '⋰': "⋰", '⋱': "⋱", '⋲': "⋲", - '⋳': "⋳", '⋴': "⋴", '⋵': "⋵", '⋶': "⋶", '⋷': "⋷", '⋹': "⋹", - '⋺': "⋺", '⋻': "⋻", '⋼': "⋼", '⋽': "⋽", '⋾': "⋾", '⌅': "⌅", '⌆': "⌆", - '⌈': "⌈", '⌉': "⌉", '⌊': "⌊", '⌋': "⌋", '⌌': "⌌", '⌍': "⌍", - '⌎': "⌎", '⌏': "⌏", '⌐': "⌐", '⌒': "⌒", '⌓': "⌓", '⌕': "⌕", - '⌖': "⌖", '⌜': "⌜", '⌝': "⌝", '⌞': "⌞", '⌟': "⌟", '⌢': "⌢", - '⌣': "⌣", '⌭': "⌭", '⌮': "⌮", '⌶': "⌶", '⌽': "⌽", '⌿': "⌿", - '⍼': "⍼", '⎰': "⎰", '⎱': "⎱", '⎴': "⎴", '⎵': "⎵", '⎶': "⎶", - '⏜': "⏜", '⏝': "⏝", '⏞': "⏞", '⏟': "⏟", '⏢': "⏢", - '⏧': "⏧", '␣': "␣", 'Ⓢ': "Ⓢ", '─': "─", '│': "│", '┌': "┌", '┐': "┐", - '└': "└", '┘': "┘", '├': "├", '┤': "┤", '┬': "┬", '┴': "┴", '┼': "┼", - '═': "═", '║': "║", '╒': "╒", '╓': "╓", '╔': "╔", 
'╕': "╕", '╖': "╖", - '╗': "╗", '╘': "╘", '╙': "╙", '╚': "╚", '╛': "╛", '╜': "╜", '╝': "╝", - '╞': "╞", '╟': "╟", '╠': "╠", '╡': "╡", '╢': "╢", '╣': "╣", '╤': "╤", - '╥': "╥", '╦': "╦", '╧': "╧", '╨': "╨", '╩': "╩", '╪': "╪", '╫': "╫", - '╬': "╬", '▀': "▀", '▄': "▄", '█': "█", '░': "░", '▒': "▒", '▓': "▓", - '□': "□", '▪': "▪", '▫': "▫", '▭': "▭", '▮': "▮", '▱': "▱", - '△': "△", '▴': "▴", '▵': "▵", '▸': "▸", '▹': "▹", '▽': "▽", '▾': "▾", - '▿': "▿", '◂': "◂", '◃': "◃", '◊': "◊", '○': "○", '◬': "◬", '◯': "◯", - '◸': "◸", '◹': "◹", '◺': "◺", '◻': "◻", '◼': "◼", - '★': "★", '☆': "☆", '☎': "☎", '♀': "♀", '♂': "♂", '♠': "♠", '♣': "♣", - '♥': "♥", '♦': "♦", '♪': "♪", '♭': "♭", '♮': "♮", '♯': "♯", '✓': "✓", - '✗': "✗", '✠': "✠", '✶': "✶", '❘': "❘", '❲': "❲", '❳': "❳", - '⟦': "⟦", '⟧': "⟧", '⟨': "⟨", '⟩': "⟩", '⟪': "⟪", '⟫': "⟫", '⟬': "⟬", - '⟭': "⟭", '⟵': "⟵", '⟶': "⟶", '⟷': "⟷", '⟸': "⟸", '⟹': "⟹", '⟺': "⟺", - '⟼': "⟼", '⟿': "⟿", '⤂': "⤂", '⤃': "⤃", '⤄': "⤄", '⤅': "⤅", '⤌': "⤌", - '⤍': "⤍", '⤎': "⤎", '⤏': "⤏", '⤐': "⤐", '⤑': "⤑", '⤒': "⤒", - '⤓': "⤓", '⤖': "⤖", '⤙': "⤙", '⤚': "⤚", '⤛': "⤛", '⤜': "⤜", - '⤝': "⤝", '⤞': "⤞", '⤟': "⤟", '⤠': "⤠", '⤣': "⤣", '⤤': "⤤", - '⤥': "⤥", '⤦': "⤦", '⤧': "⤧", '⤨': "⤨", '⤩': "⤩", '⤪': "⤪", - '⤳': "⤳", '⤵': "⤵", '⤶': "⤶", '⤷': "⤷", '⤸': "⤸", '⤹': "⤹", - '⤼': "⤼", '⤽': "⤽", '⥅': "⥅", '⥈': "⥈", '⥉': "⥉", '⥊': "⥊", - '⥋': "⥋", '⥎': "⥎", '⥏': "⥏", '⥐': "⥐", - '⥑': "⥑", '⥒': "⥒", '⥓': "⥓", '⥔': "⥔", - '⥕': "⥕", '⥖': "⥖", '⥗': "⥗", '⥘': "⥘", - '⥙': "⥙", '⥚': "⥚", '⥛': "⥛", '⥜': "⥜", - '⥝': "⥝", '⥞': "⥞", '⥟': "⥟", '⥠': "⥠", - '⥡': "⥡", '⥢': "⥢", '⥣': "⥣", '⥤': "⥤", '⥥': "⥥", '⥦': "⥦", - '⥧': "⥧", '⥨': "⥨", '⥩': "⥩", '⥪': "⥪", '⥫': "⥫", '⥬': "⥬", - '⥭': "⥭", '⥮': "⥮", '⥯': "⥯", '⥰': "⥰", '⥱': "⥱", '⥲': "⥲", - '⥳': "⥳", '⥴': "⥴", '⥵': "⥵", '⥶': "⥶", '⥸': "⥸", '⥹': "⥹", - '⥻': "⥻", '⥼': "⥼", '⥽': "⥽", '⥾': "⥾", '⥿': "⥿", '⦅': "⦅", - '⦆': "⦆", '⦋': "⦋", '⦌': "⦌", '⦍': "⦍", '⦎': "⦎", '⦏': "⦏", - '⦐': "⦐", '⦑': "⦑", '⦒': "⦒", '⦓': "⦓", '⦔': "⦔", '⦕': 
"⦕", - '⦖': "⦖", '⦚': "⦚", '⦜': "⦜", '⦝': "⦝", '⦤': "⦤", '⦥': "⦥", - '⦦': "⦦", '⦧': "⦧", '⦨': "⦨", '⦩': "⦩", '⦪': "⦪", '⦫': "⦫", - '⦬': "⦬", '⦭': "⦭", '⦮': "⦮", '⦯': "⦯", '⦰': "⦰", '⦱': "⦱", - '⦲': "⦲", '⦳': "⦳", '⦴': "⦴", '⦵': "⦵", '⦶': "⦶", '⦷': "⦷", - '⦹': "⦹", '⦻': "⦻", '⦼': "⦼", '⦾': "⦾", '⦿': "⦿", '⧀': "⧀", '⧁': "⧁", - '⧂': "⧂", '⧃': "⧃", '⧄': "⧄", '⧅': "⧅", '⧉': "⧉", '⧍': "⧍", '⧎': "⧎", - '⧏': "⧏", '⧐': "⧐", '⧚': "∽̱", '⧜': "⧜", '⧝': "⧝", - '⧞': "⧞", '⧣': "⧣", '⧤': "⧤", '⧥': "⧥", '⧫': "⧫", '⧴': "⧴", - '⧶': "⧶", '⨀': "⨀", '⨁': "⨁", '⨂': "⨂", '⨄': "⨄", '⨆': "⨆", '⨌': "⨌", - '⨍': "⨍", '⨐': "⨐", '⨑': "⨑", '⨒': "⨒", '⨓': "⨓", '⨔': "⨔", - '⨕': "⨕", '⨖': "⨖", '⨗': "⨗", '⨢': "⨢", '⨣': "⨣", '⨤': "⨤", - '⨥': "⨥", '⨦': "⨦", '⨧': "⨧", '⨩': "⨩", '⨪': "⨪", '⨭': "⨭", - '⨮': "⨮", '⨯': "⨯", '⨰': "⨰", '⨱': "⨱", '⨳': "⨳", '⨴': "⨴", - '⨵': "⨵", '⨶': "⨶", '⨷': "⨷", '⨸': "⨸", '⨹': "⨹", '⨺': "⨺", - '⨻': "⨻", '⨼': "⨼", '⨿': "⨿", '⩀': "⩀", '⩂': "⩂", '⩃': "⩃", '⩄': "⩄", - '⩅': "⩅", '⩆': "⩆", '⩇': "⩇", '⩈': "⩈", '⩉': "⩉", '⩊': "⩊", - '⩋': "⩋", '⩌': "⩌", '⩍': "⩍", '⩐': "⩐", '⩓': "⩓", '⩔': "⩔", '⩕': "⩕", - '⩖': "⩖", '⩗': "⩗", '⩘': "⩘", '⩚': "⩚", '⩛': "⩛", '⩜': "⩜", '⩝': "⩝", - '⩟': "⩟", '⩦': "⩦", '⩪': "⩪", '⩭': "⩭", '⩮': "⩮", '⩯': "⩯", '⩰': "⩰", - '⩱': "⩱", '⩲': "⩲", '⩳': "⩳", '⩴': "⩴", '⩵': "⩵", '⩷': "⩷", '⩸': "⩸", - '⩹': "⩹", '⩺': "⩺", '⩻': "⩻", '⩼': "⩼", '⩽': "⩽", '⩾': "⩾", '⩿': "⩿", - '⪀': "⪀", '⪁': "⪁", '⪂': "⪂", '⪃': "⪃", '⪄': "⪄", '⪅': "⪅", - '⪆': "⪆", '⪇': "⪇", '⪈': "⪈", '⪉': "⪉", '⪊': "⪊", '⪋': "⪋", '⪌': "⪌", '⪍': "⪍", - '⪎': "⪎", '⪏': "⪏", '⪐': "⪐", '⪑': "⪑", '⪒': "⪒", '⪓': "⪓", '⪔': "⪔", - '⪕': "⪕", '⪖': "⪖", '⪗': "⪗", '⪘': "⪘", '⪙': "⪙", '⪚': "⪚", '⪝': "⪝", - '⪞': "⪞", '⪟': "⪟", '⪠': "⪠", '⪡': "⪡", '⪢': "⪢", '⪤': "⪤", - '⪥': "⪥", '⪦': "⪦", '⪧': "⪧", '⪨': "⪨", '⪩': "⪩", '⪪': "⪪", '⪫': "⪫", - '⪬': "⪬", '⪭': "⪭", '⪮': "⪮", '⪯': "⪯", '⪰': "⪰", '⪳': "⪳", '⪴': "⪴", - '⪵': "⪵", '⪶': "⪶", '⪷': "⪷", '⪸': "⪸", '⪹': "⪹", '⪺': "⪺", '⪻': "⪻", - '⪼': "⪼", '⪽': "⪽", '⪾': "⪾", '⪿': "⪿", 
'⫀': "⫀", '⫁': "⫁", - '⫂': "⫂", '⫃': "⫃", '⫄': "⫄", '⫅': "⫅", '⫆': "⫆", '⫇': "⫇", - '⫈': "⫈", '⫋': "⫋", '⫌': "⫌", '⫏': "⫏", '⫐': "⫐", '⫑': "⫑", '⫒': "⫒", - '⫓': "⫓", '⫔': "⫔", '⫕': "⫕", '⫖': "⫖", '⫗': "⫗", '⫘': "⫘", - '⫙': "⫙", '⫚': "⫚", '⫛': "⫛", '⫤': "⫤", '⫦': "⫦", '⫧': "⫧", '⫨': "⫨", - '⫩': "⫩", '⫫': "⫫", '⫬': "⫬", '⫭': "⫭", '⫮': "⫮", '⫯': "⫯", '⫰': "⫰", - '⫱': "⫱", '⫲': "⫲", '⫳': "⫳", '⫽': "⫽", 'ff': "ff", 'fi': "fi", 'fl': "fl", - 'ffi': "ffi", 'ffl': "ffl", '𝒜': "𝒜", '𝒞': "𝒞", '𝒟': "𝒟", '𝒢': "𝒢", '𝒥': "𝒥", - '𝒦': "𝒦", '𝒩': "𝒩", '𝒪': "𝒪", '𝒫': "𝒫", '𝒬': "𝒬", '𝒮': "𝒮", '𝒯': "𝒯", - '𝒰': "𝒰", '𝒱': "𝒱", '𝒲': "𝒲", '𝒳': "𝒳", '𝒴': "𝒴", '𝒵': "𝒵", '𝒶': "𝒶", - '𝒷': "𝒷", '𝒸': "𝒸", '𝒹': "𝒹", '𝒻': "𝒻", '𝒽': "𝒽", '𝒾': "𝒾", '𝒿': "𝒿", - '𝓀': "𝓀", '𝓁': "𝓁", '𝓂': "𝓂", '𝓃': "𝓃", '𝓅': "𝓅", '𝓆': "𝓆", '𝓇': "𝓇", - '𝓈': "𝓈", '𝓉': "𝓉", '𝓊': "𝓊", '𝓋': "𝓋", '𝓌': "𝓌", '𝓍': "𝓍", '𝓎': "𝓎", - '𝓏': "𝓏", '𝔄': "𝔄", '𝔅': "𝔅", '𝔇': "𝔇", '𝔈': "𝔈", '𝔉': "𝔉", '𝔊': "𝔊", '𝔍': "𝔍", - '𝔎': "𝔎", '𝔏': "𝔏", '𝔐': "𝔐", '𝔑': "𝔑", '𝔒': "𝔒", '𝔓': "𝔓", '𝔔': "𝔔", '𝔖': "𝔖", - '𝔗': "𝔗", '𝔘': "𝔘", '𝔙': "𝔙", '𝔚': "𝔚", '𝔛': "𝔛", '𝔜': "𝔜", '𝔞': "𝔞", '𝔟': "𝔟", - '𝔠': "𝔠", '𝔡': "𝔡", '𝔢': "𝔢", '𝔣': "𝔣", '𝔤': "𝔤", '𝔥': "𝔥", '𝔦': "𝔦", '𝔧': "𝔧", - '𝔨': "𝔨", '𝔩': "𝔩", '𝔪': "𝔪", '𝔫': "𝔫", '𝔬': "𝔬", '𝔭': "𝔭", '𝔮': "𝔮", '𝔯': "𝔯", - '𝔰': "𝔰", '𝔱': "𝔱", '𝔲': "𝔲", '𝔳': "𝔳", '𝔴': "𝔴", '𝔵': "𝔵", '𝔶': "𝔶", '𝔷': "𝔷", - '𝔸': "𝔸", '𝔹': "𝔹", '𝔻': "𝔻", '𝔼': "𝔼", '𝔽': "𝔽", '𝔾': "𝔾", '𝕀': "𝕀", - '𝕁': "𝕁", '𝕂': "𝕂", '𝕃': "𝕃", '𝕄': "𝕄", '𝕆': "𝕆", '𝕊': "𝕊", '𝕋': "𝕋", - '𝕌': "𝕌", '𝕍': "𝕍", '𝕎': "𝕎", '𝕏': "𝕏", '𝕐': "𝕐", '𝕒': "𝕒", '𝕓': "𝕓", - '𝕔': "𝕔", '𝕕': "𝕕", '𝕖': "𝕖", '𝕗': "𝕗", '𝕘': "𝕘", '𝕙': "𝕙", '𝕚': "𝕚", - '𝕛': "𝕛", '𝕜': "𝕜", '𝕝': "𝕝", '𝕞': "𝕞", '𝕟': "𝕟", '𝕠': "𝕠", '𝕡': "𝕡", - '𝕢': "𝕢", '𝕣': "𝕣", '𝕤': "𝕤", '𝕥': "𝕥", '𝕦': "𝕦", '𝕧': "𝕧", '𝕨': "𝕨", - '𝕩': "𝕩", '𝕪': "𝕪", '𝕫': "𝕫", -} -DECMAP = {v: k for k, v in ENCMAP.items()} - - -class HtmlEntityDecodeError(ValueError): - pass - - -def htmlentity_encode(text, errors="strict"): - s = "" - for 
c in text: - try: - s += ENCMAP[c] - except KeyError: - i = ord(c) - s += "&" + hex(i)[2:].zfill(0) + ";" if i > 0xff else c - return s, len(text) - - -def htmlentity_decode(text, errors="strict"): - s = "" - i = 0 - while i < len(text): - m = re.match(r"&(?:(?:[A-Za-z][A-Za-z0-9]{1,6}){1,4}|[0-9]{4});", text[i:i+30]) - if m: - entity = m.group() - c = unichr(int(entity[1:5], 16)) if entity[1:5].isdigit() and len(entity) == 6 else \ - " " if entity == " " else None - if c: - s += c - else: - try: - s += DECMAP[entity] - except KeyError: - s += handle_error("html-entity", errors, HtmlEntityDecodeError, decode=True)(text[i], i) - i += len(entity) - else: - s += text[i] - i += 1 - return s, len(text) - - -add("html", htmlentity_encode, htmlentity_decode, r"^html(?:[-_]?entit(?:y|ies))?$", - extra_exceptions=["HtmlEntityDecodeError"]) - +# -*- coding: UTF-8 -*- +"""HTML entity Codec - html entity content encoding. + +This codec: +- en/decodes strings from str to str +- en/decodes strings from bytes to bytes +- decodes file content to str (read) +- encodes file content from str to bytes (write) +""" +from ..__common__ import * + + +__examples__ = { + 'enc(html_entities|html-entity)': {'': "<This is a test>"}, + 'enc(html)': {'\u1234': "&1234;"}, + 'dec(html|html_entity)': {'&DoesNotExist;': None}, + 'dec(html_entities|html-entity)': { + '<This is a test>': "", + '<This is a test>': "", + }, +} + + +# source: https://dev.w3.org/html5/html-author/charref +ENCMAP = { + '\t': " ", '\n': " ", '!': "!", '"': """, '#': "#", '$': "$", '%': "%", + '&': "&", '\'': "'", '(': "(", ')': ")", '*': "*", '+': "+", ',': ",", + '.': ".", '/': "/", ':': ":", ';': ";", '<': "<", '=': "=", '>': ">", + '?': "?", '@': "@", '[': "[", '\\': "\", ']': "]", '^': "^", '_': "_", + '`': "`", '{': "{", '|': "|", '}': "}", '¡': "¡", '¢': "¢", + '£': "£", '¤': "¤", '¥': "¥", '¦': "¦", '§': "§", '¨': "¨", '©': "©", + 'ª': "ª", '«': "«", '¬': "¬", '­': "­", '®': "®", '¯': "¯", '°': "°", + '±': "±", '²': 
"²", '³': "³", '´': "´", 'µ': "µ", '¶': "¶", '·': "·", + '¸': "¸", '¹': "¹", 'º': "º", '»': "»", '¼': "¼", '½': "½", '¾': "¾", + '¿': "¿", 'À': "À", 'Á': "Á", 'Â': "Â", 'Ã': "Ã", 'Ä': "Ä", 'Å': "Å", + 'Æ': "Æ", 'Ç': "Ç", 'È': "È", 'É': "É", 'Ê': "Ê", 'Ë': "Ë", 'Ì': "Ì", + 'Í': "Í", 'Î': "Î", 'Ï': "Ï", 'Ð': "Ð", 'Ñ': "Ñ", 'Ò': "Ò", 'Ó': "Ó", + 'Ô': "Ô", 'Õ': "Õ", 'Ö': "Ö", '×': "×", 'Ø': "Ø", 'Ù': "Ù", 'Ú': "Ú", + 'Û': "Û", 'Ü': "Ü", 'Ý': "Ý", 'Þ': "Þ", 'ß': "ß", 'à': "à", 'á': "á", + 'â': "â", 'ã': "ã", 'ä': "ä", 'å': "å", 'æ': "æ", 'ç': "ç", 'è': "è", + 'é': "é", 'ê': "ê", 'ë': "ë", 'ì': "ì", 'í': "í", 'î': "î", 'ï': "ï", + 'ð': "ð", 'ñ': "ñ", 'ò': "ò", 'ó': "ó", 'ô': "ô", 'õ': "õ", 'ö': "ö", + '÷': "÷", 'ø': "ø", 'ù': "ù", 'ú': "ú", 'û': "û", 'ü': "ü", 'ý': "ý", + 'þ': "þ", 'ÿ': "ÿ", 'Ā': "Ā", 'ā': "ā", 'Ă': "Ă", 'ă': "ă", 'Ą': "Ą", + 'ą': "ą", 'Ć': "Ć", 'ć': "ć", 'Ĉ': "Ĉ", 'ĉ': "ĉ", 'Ċ': "Ċ", 'ċ': "ċ", + 'Č': "Č", 'č': "č", 'Ď': "Ď", 'ď': "ď", 'Đ': "Đ", 'đ': "đ", + 'Ē': "Ē", 'ē': "ē", 'Ė': "Ė", 'ė': "ė", 'Ę': "Ę", 'ę': "ę", 'Ě': "Ě", + 'ě': "ě", 'Ĝ': "Ĝ", 'ĝ': "ĝ", 'Ğ': "Ğ", 'ğ': "ğ", 'Ġ': "Ġ", 'ġ': "ġ", + 'Ģ': "Ģ", 'Ĥ': "Ĥ", 'ĥ': "ĥ", 'Ħ': "Ħ", 'ħ': "ħ", 'Ĩ': "Ĩ", + 'ĩ': "ĩ", 'Ī': "Ī", 'ī': "ī", 'Į': "Į", 'į': "į", 'İ': "İ", 'ı': "ı", + 'IJ': "IJ", 'ij': "ij", 'Ĵ': "Ĵ", 'ĵ': "ĵ", 'Ķ': "Ķ", 'ķ': "ķ", 'ĸ': "ĸ", + 'Ĺ': "Ĺ", 'ĺ': "ĺ", 'Ļ': "Ļ", 'ļ': "ļ", 'Ľ': "Ľ", 'ľ': "ľ", + 'Ŀ': "Ŀ", 'ŀ': "ŀ", 'Ł': "Ł", 'ł': "ł", 'Ń': "Ń", 'ń': "ń", + 'Ņ': "Ņ", 'ņ': "ņ", 'Ň': "Ň", 'ň': "ň", 'ʼn': "ʼn", 'Ŋ': "Ŋ", 'ŋ': "ŋ", + 'Ō': "Ō", 'ō': "ō", 'Ő': "Ő", 'ő': "ő", 'Œ': "Œ", 'œ': "œ", 'Ŕ': "Ŕ", + 'ŕ': "ŕ", 'Ŗ': "Ŗ", 'ŗ': "ŗ", 'Ř': "Ř", 'ř': "ř", 'Ś': "Ś", + 'ś': "ś", 'Ŝ': "Ŝ", 'ŝ': "ŝ", 'Ş': "Ş", 'ş': "ş", 'Š': "Š", + 'š': "š", 'Ţ': "Ţ", 'ţ': "ţ", 'Ť': "Ť", 'ť': "ť", 'Ŧ': "Ŧ", + 'ŧ': "ŧ", 'Ũ': "Ũ", 'ũ': "ũ", 'Ū': "Ū", 'ū': "ū", 'Ŭ': "Ŭ", + 'ŭ': "ŭ", 'Ů': "Ů", 'ů': "ů", 'Ű': "Ű", 'ű': "ű", 'Ų': "Ų", 'ų': "ų", + 'Ŵ': "Ŵ", 'ŵ': "ŵ", 'Ŷ': "Ŷ", 'ŷ': "ŷ", 'Ÿ': "Ÿ", 'Ź': "Ź", 'ź': "ź", + 
'Ż': "Ż", 'ż': "ż", 'Ž': "Ž", 'ž': "ž", 'ƒ': "ƒ", 'Ƶ': "Ƶ", 'ǵ': "ǵ", + 'ȷ': "ȷ", 'ˆ': "ˆ", 'ˇ': "ˇ", '˘': "˘", '˙': "˙", '˚': "˚", '˛': "˛", + '˜': "˜", '˝': "˝", '̑': "̑", '̲': "_", 'Α': "Α", 'Β': "Β", + 'Γ': "Γ", 'Δ': "Δ", 'Ε': "Ε", 'Ζ': "Ζ", 'Η': "Η", 'Θ': "Θ", 'Ι': "Ι", + 'Κ': "Κ", 'Λ': "Λ", 'Μ': "Μ", 'Ν': "Ν", 'Ξ': "Ξ", 'Ο': "Ο", 'Π': "Π", + 'Ρ': "Ρ", 'Σ': "Σ", 'Τ': "Τ", 'Υ': "Υ", 'Φ': "Φ", 'Χ': "Χ", 'Ψ': "Ψ", + 'Ω': "Ω", 'α': "α", 'β': "β", 'γ': "γ", 'δ': "δ", 'ε': "ϵ", 'ζ': "ζ", + 'η': "η", 'θ': "θ", 'ι': "ι", 'κ': "κ", 'λ': "λ", 'μ': "μ", 'ν': "ν", + 'ξ': "ξ", 'ο': "ο", 'π': "π", 'ρ': "ρ", 'ς': "ς", 'σ': "σ", 'τ': "τ", + 'υ': "υ", 'φ': "φ", 'χ': "χ", 'ψ': "ψ", 'ω': "ω", 'ϑ': "ϑ", 'ϒ': "ϒ", + 'ϕ': "ϕ", 'ϖ': "ϖ", 'Ϝ': "Ϝ", 'ϝ': "ϝ", 'ϰ': "ϰ", 'ϱ': "ϱ", + 'ϵ': "ε", '϶': "϶", 'Ё': "Ё", 'Ђ': "Ђ", 'Ѓ': "Ѓ", 'Є': "Є", 'Ѕ': "Ѕ", + 'І': "І", 'Ї': "Ї", 'Ј': "Ј", 'Љ': "Љ", 'Њ': "Њ", 'Ћ': "Ћ", 'Ќ': "Ќ", + 'Ў': "Ў", 'Џ': "Џ", 'А': "А", 'Б': "Б", 'В': "В", 'Г': "Г", 'Д': "Д", 'Е': "Е", + 'Ж': "Ж", 'З': "З", 'И': "И", 'Й': "Й", 'К': "К", 'Л': "Л", 'М': "М", 'Н': "Н", + 'О': "О", 'П': "П", 'Р': "Р", 'С': "С", 'Т': "Т", 'У': "У", 'Ф': "Ф", 'Х': "Х", + 'Ц': "Ц", 'Ч': "Ч", 'Ш': "Ш", 'Щ': "Щ", 'Ъ': "Ъ", 'Ы': "Ы", 'Ь': "Ь", + 'Э': "Э", 'Ю': "Ю", 'Я': "Я", 'а': "а", 'б': "б", 'в': "в", 'г': "г", 'д': "д", + 'е': "е", 'ж': "ж", 'з': "з", 'и': "и", 'й': "й", 'к': "к", 'л': "л", 'м': "м", + 'н': "н", 'о': "о", 'п': "п", 'р': "р", 'с': "с", 'т': "т", 'у': "у", 'ф': "ф", + 'х': "х", 'ц': "ц", 'ч': "ч", 'ш': "ш", 'щ': "щ", 'ъ': "ъ", 'ы': "ы", + 'ь': "ь", 'э': "э", 'ю': "ю", 'я': "я", 'ё': "ё", 'ђ': "ђ", 'ѓ': "ѓ", + 'є': "є", 'ѕ': "ѕ", 'і': "і", 'ї': "ї", 'ј': "ј", 'љ': "љ", 'њ': "њ", + 'ћ': "ћ", 'ќ': "ќ", 'ў': "ў", 'џ': "џ", '\u2002': " ", '\u2003': " ", + '\u2004': " ", '\u2005': " ", '\u2007': " ", '\u2008': " ", '\u2009': " ", + '\u200a': " ", '​\u200b': "​", '\u200c': "‌", '\u200d': "‍", '\u200e': "‎", + '\u200f': "‏", '‐': "‐", '–': "–", '—': "—", + '―': "―", '‖': "‖", '‘': "‘", 
'’': "’", '‚': "‚", '“': "“", '”': "”", + '„': "„", '†': "†", '‡': "‡", '•': "•", '‥': "‥", '…': "…", '‰': "‰", + '‱': "‱", '′': "′", '″': "″", '‴': "‴", '‵': "‵", '‹': "‹", + '›': "›", '‾': "‾", '⁁': "⁁", '⁃': "⁃", '⁄': "⁄", '⁏': "⁏", '⁗': "⁗", + '\u205f': " ", '⁠': "⁠", '⁡': "⁡", '⁢': "⁢", '⁣': "⁣", + '€': "€", '⃛': "⃛", '⃜': "⃜", 'ℂ': "ℂ", '℅': "℅", 'ℊ': "ℊ", 'ℋ': "ℋ", + 'ℌ': "ℌ", 'ℍ': "ℍ", 'ℎ': "ℎ", 'ℏ': "ℏ", 'ℐ': "ℐ", 'ℑ': "ℑ", + 'ℒ': "ℒ", 'ℓ': "ℓ", 'ℕ': "ℕ", '№': "№", '℗': "℗", '℘': "℘", 'ℙ': "ℙ", + 'ℚ': "ℚ", 'ℛ': "ℛ", 'ℜ': "ℜ", 'ℝ': "ℝ", '℞': "℞", '™': "™", 'ℤ': "ℤ", + 'Ω': "Ω", '℧': "℧", 'ℨ': "ℨ", '℩': "℩", 'Å': "Å", 'ℬ': "ℬ", 'ℭ': "ℭ", + 'ℯ': "ℯ", 'ℰ': "ℰ", 'ℱ': "ℱ", 'ℳ': "ℳ", 'ℴ': "ℴ", 'ℵ': "ℵ", 'ℶ': "ℶ", + 'ℷ': "ℷ", 'ℸ': "ℸ", 'ⅅ': "ⅅ", 'ⅆ': "ⅆ", 'ⅇ': "ⅇ", + 'ⅈ': "ⅈ", '⅓': "⅓", '⅔': "⅔", '⅕': "⅕", '⅖': "⅖", '⅗': "⅗", + '⅘': "⅘", '⅙': "⅙", '⅚': "⅚", '⅛': "⅛", '⅜': "⅜", '⅝': "⅝", + '⅞': "⅞", '←': "←", '↑': "↑", '→': "→", '↓': "↓", '↔': "↔", '↕': "↕", + '↖': "↖", '↗': "↗", '↘': "↘", '↙': "↙", '↚': "↚", '↛': "↛", '↝': "↝", + '↞': "↞", '↟': "↟", '↠': "↠", '↡': "↡", '↢': "↢", '↣': "↣", + '↤': "↤", '↥': "↥", '↦': "↦", '↧': "↧", '↩': "↩", '↪': "↪", + '↫': "↫", '↬': "↬", '↭': "↭", '↮': "↮", '↰': "↰", '↱': "↱", '↲': "↲", + '↳': "↳", '↵': "↵", '↶': "↶", '↷': "↷", '↺': "↺", '↻': "↻", '↼': "↼", + '↽': "↽", '↾': "↾", '↿': "↿", '⇀': "⇀", '⇁': "⇁", '⇂': "⇂", '⇃': "⇃", + '⇄': "⇄", '⇅': "⇅", '⇆': "⇆", '⇇': "⇇", '⇈': "⇈", '⇉': "⇉", '⇊': "⇊", + '⇋': "⇋", '⇌': "⇌", '⇍': "⇍", '⇎': "⇎", '⇏': "⇏", '⇐': "⇐", '⇑': "⇑", + '⇒': "⇒", '⇓': "⇓", '⇔': "⇔", '⇕': "⇕", '⇖': "⇖", '⇗': "⇗", '⇘': "⇘", + '⇙': "⇙", '⇚': "⇚", '⇛': "⇛", '⇝': "⇝", '⇤': "⇤", '⇥': "⇥", '⇵': "⇵", + '⇽': "⇽", '⇾': "⇾", '⇿': "⇿", '∀': "∀", '∁': "∁", '∂': "∂", '∃': "∃", + '∄': "∄", '∅': "∅", '∇': "∇", '∈': "∈", '∉': "∉", '∋': "∋", '∌': "∌", + '∏': "∏", '∐': "∐", '∑': "∑", '−': "−", '∓': "∓", '∔': "∔", '∖': "∖", + '∗': "∗", '∘': "∘", '√': "√", '∝': "∝", '∞': "∞", '∟': "∟", '∠': "∠", + '∡': "∡", '∢': "∢", '∣': "∣", '∤': 
"∤", '∥': "∥", '∦': "∦", '∧': "∧", + '∨': "∨", '∩': "∩", '∪': "∪", '∫': "∫", '∬': "∬", '∭': "∭", '∮': "∮", + '∯': "∯", '∰': "∰", '∱': "∱", '∲': "∲", '∳': "∳", '∴': "∴", + '∵': "∵", '∶': "∶", '∷': "∷", '∸': "∸", '∺': "∺", '∻': "∻", '∼': "∼", + '∽': "∽", '∾': "∾", '∿': "∿", '≀': "≀", '≁': "≁", '≂': "≂", '≃': "≃", + '≄': "≄", '≅': "≅", '≆': "≆", '≇': "≇", '≈': "≈", '≉': "≉", '≊': "≊", + '≋': "≋", '≌': "≌", '≍': "≍", '≎': "≎", '≏': "≏", '≐': "≐", '≑': "≑", + '≒': "≒", '≓': "≓", '≔': "≔", '≕': "≕", '≖': "≖", '≗': "≗", '≙': "≙", + '≚': "≚", '≜': "≜", '≟': "≟", '≠': "≠", '≡': "≡", '≢': "≢", '≤': "≤", + '≥': "≥", '≦': "≦", '≧': "≧", '≨': "≨", '≩': "≩", '≪': "≪", '≫': "≫", '≬': "≬", + '≭': "≭", '≮': "≮", '≯': "≯", '≰': "≰", '≱': "≱", '≲': "≲", '≳': "≳", + '≴': "≴", '≵': "≵", '≶': "≶", '≷': "≷", '≸': "≸", '≹': "≹", '≺': "≺", '≻': "≻", + '≼': "≼", '≽': "≽", '≾': "≾", '≿': "≿", '⊀': "⊀", '⊁': "⊁", '⊂': "⊂", + '⊃': "⊃", '⊄': "⊄", '⊅': "⊅", '⊆': "⊆", '⊇': "⊇", '⊈': "⊈", '⊉': "⊉", + '⊊': "⊊", '⊋': "⊋", '⊍': "⊍", '⊎': "⊎", '⊏': "⊏", '⊐': "⊐", '⊑': "⊑", + '⊒': "⊒", '⊓': "⊓", '⊔': "⊔", '⊕': "⊕", '⊖': "⊖", '⊗': "⊗", '⊘': "⊘", + '⊙': "⊙", '⊚': "⊚", '⊛': "⊛", '⊝': "⊝", '⊞': "⊞", '⊟': "⊟", '⊠': "⊠", + '⊡': "⊡", '⊢': "⊢", '⊣': "⊣", '⊤': "⊤", '⊥': "⊥", '⊧': "⊧", '⊨': "⊨", + '⊩': "⊩", '⊪': "⊪", '⊫': "⊫", '⊬': "⊬", '⊭': "⊭", '⊮': "⊮", + '⊯': "⊯", '⊰': "⊰", '⊲': "⊲", '⊳': "⊳", '⊴': "⊴", '⊵': "⊵", '⊶': "⊶", + '⊷': "⊷", '⊸': "⊸", '⊹': "⊹", '⊺': "⊺", '⊻': "⊻", '⊽': "⊽", + '⊾': "⊾", '⊿': "⊿", '⋀': "⋀", '⋁': "⋁", '⋂': "⋂", '⋃': "⋃", '⋄': "⋄", + '⋅': "⋅", '⋆': "⋆", '⋇': "⋇", '⋈': "⋈", '⋉': "⋉", '⋊': "⋊", + '⋋': "⋋", '⋌': "⋌", '⋍': "⋍", '⋎': "⋎", '⋏': "⋏", '⋐': "⋐", '⋑': "⋑", + '⋒': "⋒", '⋓': "⋓", '⋔': "⋔", '⋕': "⋕", '⋖': "⋖", '⋗': "⋗", '⋘': "⋘", '⋙': "⋙", + '⋚': "⋚", '⋛': "⋛", '⋞': "⋞", '⋟': "⋟", '⋠': "⋠", '⋡': "⋡", '⋢': "⋢", + '⋣': "⋣", '⋦': "⋦", '⋧': "⋧", '⋨': "⋨", '⋩': "⋩", '⋪': "⋪", '⋫': "⋫", + '⋬': "⋬", '⋭': "⋭", '⋮': "⋮", '⋯': "⋯", '⋰': "⋰", '⋱': "⋱", '⋲': "⋲", + '⋳': "⋳", '⋴': "⋴", '⋵': 
"⋵", '⋶': "⋶", '⋷': "⋷", '⋹': "⋹", + '⋺': "⋺", '⋻': "⋻", '⋼': "⋼", '⋽': "⋽", '⋾': "⋾", '⌅': "⌅", '⌆': "⌆", + '⌈': "⌈", '⌉': "⌉", '⌊': "⌊", '⌋': "⌋", '⌌': "⌌", '⌍': "⌍", + '⌎': "⌎", '⌏': "⌏", '⌐': "⌐", '⌒': "⌒", '⌓': "⌓", '⌕': "⌕", + '⌖': "⌖", '⌜': "⌜", '⌝': "⌝", '⌞': "⌞", '⌟': "⌟", '⌢': "⌢", + '⌣': "⌣", '⌭': "⌭", '⌮': "⌮", '⌶': "⌶", '⌽': "⌽", '⌿': "⌿", + '⍼': "⍼", '⎰': "⎰", '⎱': "⎱", '⎴': "⎴", '⎵': "⎵", '⎶': "⎶", + '⏜': "⏜", '⏝': "⏝", '⏞': "⏞", '⏟': "⏟", '⏢': "⏢", + '⏧': "⏧", '␣': "␣", 'Ⓢ': "Ⓢ", '─': "─", '│': "│", '┌': "┌", '┐': "┐", + '└': "└", '┘': "┘", '├': "├", '┤': "┤", '┬': "┬", '┴': "┴", '┼': "┼", + '═': "═", '║': "║", '╒': "╒", '╓': "╓", '╔': "╔", '╕': "╕", '╖': "╖", + '╗': "╗", '╘': "╘", '╙': "╙", '╚': "╚", '╛': "╛", '╜': "╜", '╝': "╝", + '╞': "╞", '╟': "╟", '╠': "╠", '╡': "╡", '╢': "╢", '╣': "╣", '╤': "╤", + '╥': "╥", '╦': "╦", '╧': "╧", '╨': "╨", '╩': "╩", '╪': "╪", '╫': "╫", + '╬': "╬", '▀': "▀", '▄': "▄", '█': "█", '░': "░", '▒': "▒", '▓': "▓", + '□': "□", '▪': "▪", '▫': "▫", '▭': "▭", '▮': "▮", '▱': "▱", + '△': "△", '▴': "▴", '▵': "▵", '▸': "▸", '▹': "▹", '▽': "▽", '▾': "▾", + '▿': "▿", '◂': "◂", '◃': "◃", '◊': "◊", '○': "○", '◬': "◬", '◯': "◯", + '◸': "◸", '◹': "◹", '◺': "◺", '◻': "◻", '◼': "◼", + '★': "★", '☆': "☆", '☎': "☎", '♀': "♀", '♂': "♂", '♠': "♠", '♣': "♣", + '♥': "♥", '♦': "♦", '♪': "♪", '♭': "♭", '♮': "♮", '♯': "♯", '✓': "✓", + '✗': "✗", '✠': "✠", '✶': "✶", '❘': "❘", '❲': "❲", '❳': "❳", + '⟦': "⟦", '⟧': "⟧", '⟨': "⟨", '⟩': "⟩", '⟪': "⟪", '⟫': "⟫", '⟬': "⟬", + '⟭': "⟭", '⟵': "⟵", '⟶': "⟶", '⟷': "⟷", '⟸': "⟸", '⟹': "⟹", '⟺': "⟺", + '⟼': "⟼", '⟿': "⟿", '⤂': "⤂", '⤃': "⤃", '⤄': "⤄", '⤅': "⤅", '⤌': "⤌", + '⤍': "⤍", '⤎': "⤎", '⤏': "⤏", '⤐': "⤐", '⤑': "⤑", '⤒': "⤒", + '⤓': "⤓", '⤖': "⤖", '⤙': "⤙", '⤚': "⤚", '⤛': "⤛", '⤜': "⤜", + '⤝': "⤝", '⤞': "⤞", '⤟': "⤟", '⤠': "⤠", '⤣': "⤣", '⤤': "⤤", + '⤥': "⤥", '⤦': "⤦", '⤧': "⤧", '⤨': "⤨", '⤩': "⤩", '⤪': "⤪", + '⤳': "⤳", '⤵': "⤵", '⤶': "⤶", '⤷': "⤷", '⤸': "⤸", '⤹': "⤹", + '⤼': "⤼", '⤽': "⤽", '⥅': "⥅", '⥈': 
"⥈", '⥉': "⥉", '⥊': "⥊", + '⥋': "⥋", '⥎': "⥎", '⥏': "⥏", '⥐': "⥐", + '⥑': "⥑", '⥒': "⥒", '⥓': "⥓", '⥔': "⥔", + '⥕': "⥕", '⥖': "⥖", '⥗': "⥗", '⥘': "⥘", + '⥙': "⥙", '⥚': "⥚", '⥛': "⥛", '⥜': "⥜", + '⥝': "⥝", '⥞': "⥞", '⥟': "⥟", '⥠': "⥠", + '⥡': "⥡", '⥢': "⥢", '⥣': "⥣", '⥤': "⥤", '⥥': "⥥", '⥦': "⥦", + '⥧': "⥧", '⥨': "⥨", '⥩': "⥩", '⥪': "⥪", '⥫': "⥫", '⥬': "⥬", + '⥭': "⥭", '⥮': "⥮", '⥯': "⥯", '⥰': "⥰", '⥱': "⥱", '⥲': "⥲", + '⥳': "⥳", '⥴': "⥴", '⥵': "⥵", '⥶': "⥶", '⥸': "⥸", '⥹': "⥹", + '⥻': "⥻", '⥼': "⥼", '⥽': "⥽", '⥾': "⥾", '⥿': "⥿", '⦅': "⦅", + '⦆': "⦆", '⦋': "⦋", '⦌': "⦌", '⦍': "⦍", '⦎': "⦎", '⦏': "⦏", + '⦐': "⦐", '⦑': "⦑", '⦒': "⦒", '⦓': "⦓", '⦔': "⦔", '⦕': "⦕", + '⦖': "⦖", '⦚': "⦚", '⦜': "⦜", '⦝': "⦝", '⦤': "⦤", '⦥': "⦥", + '⦦': "⦦", '⦧': "⦧", '⦨': "⦨", '⦩': "⦩", '⦪': "⦪", '⦫': "⦫", + '⦬': "⦬", '⦭': "⦭", '⦮': "⦮", '⦯': "⦯", '⦰': "⦰", '⦱': "⦱", + '⦲': "⦲", '⦳': "⦳", '⦴': "⦴", '⦵': "⦵", '⦶': "⦶", '⦷': "⦷", + '⦹': "⦹", '⦻': "⦻", '⦼': "⦼", '⦾': "⦾", '⦿': "⦿", '⧀': "⧀", '⧁': "⧁", + '⧂': "⧂", '⧃': "⧃", '⧄': "⧄", '⧅': "⧅", '⧉': "⧉", '⧍': "⧍", '⧎': "⧎", + '⧏': "⧏", '⧐': "⧐", '⧚': "∽̱", '⧜': "⧜", '⧝': "⧝", + '⧞': "⧞", '⧣': "⧣", '⧤': "⧤", '⧥': "⧥", '⧫': "⧫", '⧴': "⧴", + '⧶': "⧶", '⨀': "⨀", '⨁': "⨁", '⨂': "⨂", '⨄': "⨄", '⨆': "⨆", '⨌': "⨌", + '⨍': "⨍", '⨐': "⨐", '⨑': "⨑", '⨒': "⨒", '⨓': "⨓", '⨔': "⨔", + '⨕': "⨕", '⨖': "⨖", '⨗': "⨗", '⨢': "⨢", '⨣': "⨣", '⨤': "⨤", + '⨥': "⨥", '⨦': "⨦", '⨧': "⨧", '⨩': "⨩", '⨪': "⨪", '⨭': "⨭", + '⨮': "⨮", '⨯': "⨯", '⨰': "⨰", '⨱': "⨱", '⨳': "⨳", '⨴': "⨴", + '⨵': "⨵", '⨶': "⨶", '⨷': "⨷", '⨸': "⨸", '⨹': "⨹", '⨺': "⨺", + '⨻': "⨻", '⨼': "⨼", '⨿': "⨿", '⩀': "⩀", '⩂': "⩂", '⩃': "⩃", '⩄': "⩄", + '⩅': "⩅", '⩆': "⩆", '⩇': "⩇", '⩈': "⩈", '⩉': "⩉", '⩊': "⩊", + '⩋': "⩋", '⩌': "⩌", '⩍': "⩍", '⩐': "⩐", '⩓': "⩓", '⩔': "⩔", '⩕': "⩕", + '⩖': "⩖", '⩗': "⩗", '⩘': "⩘", '⩚': "⩚", '⩛': "⩛", '⩜': "⩜", '⩝': "⩝", + '⩟': "⩟", '⩦': "⩦", '⩪': "⩪", '⩭': "⩭", '⩮': "⩮", '⩯': "⩯", '⩰': "⩰", + '⩱': "⩱", '⩲': "⩲", '⩳': "⩳", '⩴': "⩴", '⩵': "⩵", '⩷': "⩷", '⩸': "⩸", + '⩹': "⩹", '⩺': 
"⩺", '⩻': "⩻", '⩼': "⩼", '⩽': "⩽", '⩾': "⩾", '⩿': "⩿", + '⪀': "⪀", '⪁': "⪁", '⪂': "⪂", '⪃': "⪃", '⪄': "⪄", '⪅': "⪅", + '⪆': "⪆", '⪇': "⪇", '⪈': "⪈", '⪉': "⪉", '⪊': "⪊", '⪋': "⪋", '⪌': "⪌", '⪍': "⪍", + '⪎': "⪎", '⪏': "⪏", '⪐': "⪐", '⪑': "⪑", '⪒': "⪒", '⪓': "⪓", '⪔': "⪔", + '⪕': "⪕", '⪖': "⪖", '⪗': "⪗", '⪘': "⪘", '⪙': "⪙", '⪚': "⪚", '⪝': "⪝", + '⪞': "⪞", '⪟': "⪟", '⪠': "⪠", '⪡': "⪡", '⪢': "⪢", '⪤': "⪤", + '⪥': "⪥", '⪦': "⪦", '⪧': "⪧", '⪨': "⪨", '⪩': "⪩", '⪪': "⪪", '⪫': "⪫", + '⪬': "⪬", '⪭': "⪭", '⪮': "⪮", '⪯': "⪯", '⪰': "⪰", '⪳': "⪳", '⪴': "⪴", + '⪵': "⪵", '⪶': "⪶", '⪷': "⪷", '⪸': "⪸", '⪹': "⪹", '⪺': "⪺", '⪻': "⪻", + '⪼': "⪼", '⪽': "⪽", '⪾': "⪾", '⪿': "⪿", '⫀': "⫀", '⫁': "⫁", + '⫂': "⫂", '⫃': "⫃", '⫄': "⫄", '⫅': "⫅", '⫆': "⫆", '⫇': "⫇", + '⫈': "⫈", '⫋': "⫋", '⫌': "⫌", '⫏': "⫏", '⫐': "⫐", '⫑': "⫑", '⫒': "⫒", + '⫓': "⫓", '⫔': "⫔", '⫕': "⫕", '⫖': "⫖", '⫗': "⫗", '⫘': "⫘", + '⫙': "⫙", '⫚': "⫚", '⫛': "⫛", '⫤': "⫤", '⫦': "⫦", '⫧': "⫧", '⫨': "⫨", + '⫩': "⫩", '⫫': "⫫", '⫬': "⫬", '⫭': "⫭", '⫮': "⫮", '⫯': "⫯", '⫰': "⫰", + '⫱': "⫱", '⫲': "⫲", '⫳': "⫳", '⫽': "⫽", 'ff': "ff", 'fi': "fi", 'fl': "fl", + 'ffi': "ffi", 'ffl': "ffl", '𝒜': "𝒜", '𝒞': "𝒞", '𝒟': "𝒟", '𝒢': "𝒢", '𝒥': "𝒥", + '𝒦': "𝒦", '𝒩': "𝒩", '𝒪': "𝒪", '𝒫': "𝒫", '𝒬': "𝒬", '𝒮': "𝒮", '𝒯': "𝒯", + '𝒰': "𝒰", '𝒱': "𝒱", '𝒲': "𝒲", '𝒳': "𝒳", '𝒴': "𝒴", '𝒵': "𝒵", '𝒶': "𝒶", + '𝒷': "𝒷", '𝒸': "𝒸", '𝒹': "𝒹", '𝒻': "𝒻", '𝒽': "𝒽", '𝒾': "𝒾", '𝒿': "𝒿", + '𝓀': "𝓀", '𝓁': "𝓁", '𝓂': "𝓂", '𝓃': "𝓃", '𝓅': "𝓅", '𝓆': "𝓆", '𝓇': "𝓇", + '𝓈': "𝓈", '𝓉': "𝓉", '𝓊': "𝓊", '𝓋': "𝓋", '𝓌': "𝓌", '𝓍': "𝓍", '𝓎': "𝓎", + '𝓏': "𝓏", '𝔄': "𝔄", '𝔅': "𝔅", '𝔇': "𝔇", '𝔈': "𝔈", '𝔉': "𝔉", '𝔊': "𝔊", '𝔍': "𝔍", + '𝔎': "𝔎", '𝔏': "𝔏", '𝔐': "𝔐", '𝔑': "𝔑", '𝔒': "𝔒", '𝔓': "𝔓", '𝔔': "𝔔", '𝔖': "𝔖", + '𝔗': "𝔗", '𝔘': "𝔘", '𝔙': "𝔙", '𝔚': "𝔚", '𝔛': "𝔛", '𝔜': "𝔜", '𝔞': "𝔞", '𝔟': "𝔟", + '𝔠': "𝔠", '𝔡': "𝔡", '𝔢': "𝔢", '𝔣': "𝔣", '𝔤': "𝔤", '𝔥': "𝔥", '𝔦': "𝔦", '𝔧': "𝔧", + '𝔨': "𝔨", '𝔩': "𝔩", '𝔪': "𝔪", '𝔫': "𝔫", '𝔬': "𝔬", '𝔭': "𝔭", '𝔮': "𝔮", '𝔯': "𝔯", + '𝔰': "𝔰", '𝔱': "𝔱", '𝔲': "𝔲", '𝔳': "𝔳", '𝔴': 
DECMAP = {v: k for k, v in ENCMAP.items()}


class HtmlEntityDecodeError(ValueError):
    """Raised on decoding when an entity maps to no known character."""
    pass


def htmlentity_encode(text, errors="strict"):
    """Encode *text* character by character into HTML entities.

    Characters found in ENCMAP are replaced by their entity string; any
    other character above code point 0xff is emitted as "&XXXX;" with a
    fixed-width 4-hex-digit payload; everything else passes through.

    :param text:   input string to encode
    :param errors: unused here (kept for the codec-function signature)
    :return: (encoded string, number of input characters consumed)
    """
    s = ""
    for c in text:
        try:
            s += ENCMAP[c]
        except KeyError:
            i = ord(c)
            # FIX: was .zfill(0), a no-op; pad to 4 hex digits so the
            # decoder's fixed-width entity pattern can match and round-trip.
            # NOTE(review): code points > 0xffff still yield 5+ digits and
            # will not round-trip — confirm the intended input range.
            s += "&" + hex(i)[2:].zfill(4) + ";" if i > 0xff else c
    return s, len(text)


def htmlentity_decode(text, errors="strict"):
    """Decode HTML entities in *text* back to characters.

    Recognized entities are either named entries from DECMAP or the
    fixed-width 4-hex-digit numeric form produced by htmlentity_encode.
    Unknown entities are dispatched to handle_error per *errors*.

    :param text:   input string to decode
    :param errors: error-handling scheme ("strict", "ignore", "replace", ...)
    :return: (decoded string, number of input characters consumed)
    """
    s = ""
    i = 0
    while i < len(text):
        # Look at most 30 chars ahead (longest recognized entity form).
        # FIX: the numeric alternative was [0-9]{4}, which rejects the hex
        # digits a-f that htmlentity_encode emits; widen it to full hex.
        m = re.match(r"&(?:(?:[A-Za-z][A-Za-z0-9]{1,6}){1,4}|[0-9A-Fa-f]{4});", text[i:i+30])
        if m:
            entity = m.group()
            c = None
            if entity == " ":
                # NOTE(review): this literal appears corrupted in the source
                # (an entity string rendered to its character) — restore the
                # original entity text; as written the branch cannot match.
                c = " "
            elif len(entity) == 6 and entity not in DECMAP and \
                    all(ch in "0123456789abcdefABCDEF" for ch in entity[1:5]):
                # FIX: the old check used .isdigit(), so any entity holding
                # hex letters never decoded numerically. Accept the full hex
                # alphabet, but let a matching named entity take precedence
                # to avoid shadowing 4-letter names that happen to be hex.
                c = chr(int(entity[1:5], 16))
            if c:
                s += c
            else:
                try:
                    s += DECMAP[entity]
                except KeyError:
                    # Delegate unknown entities to the project error handler
                    # ("strict" raises HtmlEntityDecodeError).
                    s += handle_error("html-entity", errors, HtmlEntityDecodeError, decode=True)(text[i], i)
            i += len(entity)
        else:
            s += text[i]
            i += 1
    return s, len(text)


add("html", htmlentity_encode, htmlentity_decode, r"^html(?:[-_]?entit(?:y|ies))?$",
    extra_exceptions=["HtmlEntityDecodeError"])
- -""" -import os -import sys -from unittest import TestCase - -from codext.__common__ import * -from codext.base._base import _generate_charset -from codext.base.baseN import base, main2, main32, main64url - - -class TestCodecsBase(TestCase): - def setUp(self): - global STR - STR = "this is a test" - - def test_new_base_codec(self): - for i in [0, 1, 256]: - self.assertRaises(ValueError, _generate_charset, i) - b10 = lambda *a: "0123456789" - base(b10, "base10") - B10 = "2361031878030638688519054699098996" - self.assertEqual(codecs.encode(STR, "base10"), B10) - self.assertEqual(codecs.encode(b(STR), "base10"), b(B10)) - self.assertEqual(codecs.decode(B10, "base10"), STR) - self.assertEqual(codecs.decode(b(B10), "base10"), b(STR)) - self.assertRaises(ValueError, base, 1, "test") - b11 = "0123456789a" - base(b11, "base11") - B11 = "113342054335735319526632a26972419" - self.assertEqual(codecs.encode(STR, "base11"), B11) - self.assertEqual(codecs.decode(B11, "base11"), STR) - self.assertRaises(ValueError, base, object(), "test") - self.assertIsNone(base({'': "01234"}, r"^base5(test)?$")) - self.assertIsNotNone(codecs.encode(STR, "base5test")) - self.assertRaises(ValueError, base, {'': "01234"}, "base5-test", pow2=True) - self.assertEqual("", codecs.decode("", "base5test")) - - def test_codec_base1(self): - C = "A" - for i in range(3): - self.assertIsNotNone(codecs.encode(i * C, "base1")) - self.assertRaises(ValueError, codecs.encode, 4 * C, "unary") - self.assertEqual(codecs.decode("AAAAA", "base1"), "\x05") - - def test_codec_base2(self): - STR = "test" - B2 = "01110100011001010111001101110100" - self.assertEqual(codecs.encode(STR, "base2"), B2) - self.assertEqual(codecs.encode(b(STR), "base2"), b(B2)) - self.assertEqual(codecs.decode(B2, "base2"), STR) - self.assertEqual(codecs.decode(b(B2), "base2"), b(STR)) - B2 = "10001011100110101000110010001011" - self.assertEqual(codecs.encode(STR, "base2-inv"), B2) - self.assertEqual(codecs.decode(B2, "base2-inv"), STR) - B2 
= "abbbabaaabbaabababbbaabbabbbabaa" - self.assertEqual(codecs.encode(STR, "base2-ab"), B2) - self.assertEqual(codecs.decode(B2, "base2-ab"), STR) - B2 = "CDDDCDCCCDDCCDCDCDDDCCDDCDDDCDCC" - self.assertEqual(codecs.encode(STR, "base2-CD"), B2) - self.assertEqual(codecs.decode(B2, "base2-CD"), STR) - B2 = "34443433344334343444334434443433" - self.assertEqual(codecs.encode(STR, "base2-34"), B2) - self.assertEqual(codecs.decode(B2, "base2-34"), STR) - - def test_codec_base3(self): - STR = "test" - B3 = "23112113223321323322" - self.assertEqual(codecs.encode(STR, "base3"), B3) - self.assertEqual(codecs.encode(b(STR), "base3"), b(B3)) - self.assertEqual(codecs.decode(B3, "base3"), STR) - self.assertEqual(codecs.decode(b(B3), "base3"), b(STR)) - B3 = "21332331221123121122" - self.assertEqual(codecs.encode(STR, "base3-inv"), B3) - self.assertEqual(codecs.decode(B3, "base3-inv"), STR) - B3 = "bcaabaacbbccbacbccbb" - self.assertEqual(codecs.encode(STR, "base3-abc"), B3) - self.assertEqual(codecs.decode(B3, "base3-abc"), STR) - self.assertRaises(LookupError, codecs.encode, "test", "base3-ab") - self.assertRaises(LookupError, codecs.encode, "test", "base3-abcd") - - def test_codec_base4(self): - STR = "test" - B4 = "2421232224142421" - self.assertEqual(codecs.encode(STR, "base4"), B4) - self.assertEqual(codecs.encode(b(STR), "base4"), b(B4)) - self.assertEqual(codecs.decode(B4, "base4"), STR) - self.assertEqual(codecs.decode(b(B4), "base4"), b(STR)) - B4 = "3134323331413134" - self.assertEqual(codecs.encode(STR, "base4-inv"), B4) - self.assertEqual(codecs.decode(B4, "base4-inv"), STR) - B4 = "bdbabcbbbdadbdba" - self.assertEqual(codecs.encode(STR, "base4-abcd"), B4) - self.assertEqual(codecs.decode(B4, "base4-abcd"), STR) - self.assertRaises(LookupError, codecs.encode, "test", "base4-abc") - self.assertRaises(LookupError, codecs.encode, "test", "base4-abcde") - - def test_codec_base8(self): - STR = "test" - B8 = "dfagcfgddfa=====" - self.assertEqual(codecs.encode(STR, 
"base8"), B8) - self.assertEqual(codecs.encode(b(STR), "base8"), b(B8)) - self.assertEqual(codecs.decode(B8, "base8"), STR) - self.assertEqual(codecs.decode(b(B8), "base8"), b(STR)) - B8 = "echbfcbeech=====" - self.assertEqual(codecs.encode(STR, "base8-inv"), B8) - self.assertEqual(codecs.decode(B8, "base8-inv"), STR) - B8 = "35062563350=====" - self.assertEqual(codecs.encode(STR, "base8-01234567"), B8) - self.assertEqual(codecs.decode(B8, "base8-01234567"), STR) - self.assertRaises(LookupError, codecs.encode, "test", "base8-0123456") - self.assertRaises(LookupError, codecs.encode, "test", "base8-012345678") - - def test_codec_base16(self): - B16 = "7468697320697320612074657374" - self.assertEqual(codecs.encode(STR, "base16"), B16) - self.assertEqual(codecs.encode(b(STR), "base16"), b(B16)) - self.assertEqual(codecs.decode(B16, "base16"), STR) - self.assertEqual(codecs.decode(b(B16), "base16"), b(STR)) - B16 += "?" - self.assertRaises(ValueError, codecs.decode, B16, "base16") - self.assertEqual(codecs.decode(B16, "base16", "ignore"), STR) - self.assertEqual(codecs.decode(B16, "base16", "replace"), STR + "\x00") - self.assertRaises(ValueError, codecs.decode, B16, "base16", "BAD") - STR2 = "=:;" - B16_1 = "3d3a3b" - B16_2 = "3D3A3B" - B16_3 = "3D3a3B" # mixed case: should fail - self.assertEqual(codecs.encode(STR2, "hex"), B16_2) - self.assertEqual(codecs.decode(B16_1, "hex"), STR2) - self.assertEqual(codecs.decode(B16_2, "hex"), STR2) - self.assertRaises(ValueError, codecs.decode, B16_3, "hex") - - def test_codec_base32(self): - for b32, enc in zip(["ORUGS4ZANFZSAYJAORSXG5A=", "qtwg1h3ypf31yajyqt1zg7y=", "EHK6ISP0D5PI0O90EHIN6T0=", - "fjn6kwt0e5tk0s90fjkr6x0=", "EHM6JWS0D5SJ0R90EHJQ6X0="], - ["base32", "zbase32", "base32-hex", "geohash", "crockford"]): - self.assertEqual(codecs.encode(STR, enc), b32) - self.assertEqual(codecs.encode(b(STR), enc), b(b32)) - self.assertEqual(codecs.decode(b32, enc), STR) - self.assertEqual(codecs.decode(b(b32), enc), b(STR)) - 
self.assertRaises(ValueError, codecs.decode, b32.rstrip("="), enc) - self.assertRaises(ValueError, codecs.decode, b32.rstrip("="), enc, "BAD") - - def test_codec_base36(self): - B36 = "4WMHTK6UZL044O91NKCEB8" - self.assertEqual(codecs.encode(STR, "base36"), B36) - self.assertEqual(codecs.encode(b(STR), "base36"), b(B36)) - self.assertEqual(codecs.decode(B36, "base36"), STR) - self.assertEqual(codecs.decode(b(B36), "base36"), b(STR)) - B36 = "E6WR3UG49VAEEYJBXUMOLI" - self.assertEqual(codecs.encode(STR, "base36-inv"), B36) - self.assertEqual(codecs.decode(B36, "base36-inv"), STR) - self.assertRaises(ValueError, codecs.decode, B36 + "?", "base36-inv") - self.assertRaises(ValueError, codecs.decode, B36 + "?", "base36", "BAD") - self.assertEqual(codecs.decode(B36 + "?", "base36-inv", "ignore"), STR) - - def test_codec_base58(self): - B58 = "jo91waLQA1NNeBmZKUF" - self.assertEqual(codecs.encode(STR, "base58"), B58) - self.assertEqual(codecs.encode(b(STR), "base58"), b(B58)) - self.assertEqual(codecs.decode(B58, "base58"), STR) - self.assertEqual(codecs.decode(b(B58), "base58"), b(STR)) - B58 = "jo9rA2LQwr44eBmZK7E" - self.assertEqual(codecs.encode(STR, "base58-ripple"), B58) - self.assertEqual(codecs.decode(B58, "base58-rp"), STR) - B58 = "JN91Wzkpa1nnDbLyjtf" - self.assertEqual(codecs.encode(STR, "base58-flickr"), B58) - self.assertEqual(codecs.encode(STR, "base58-shorturl"), B58) - self.assertEqual(codecs.decode(B58, "base58-fl"), STR) - self.assertEqual(codecs.encode(STR, "base58-short-url"), B58) - self.assertEqual(codecs.encode(STR, "base58-url"), B58) - - def test_codec_base62(self): - for b62, enc in zip(["CsoB4HQ5gmgMyCenF7E", "M2yLERaFqwqW8MoxPHO"], ["base62", "base62-inv"]): - self.assertEqual(codecs.encode(STR, enc), b62) - self.assertEqual(codecs.encode(b(STR), enc), b(b62)) - self.assertEqual(codecs.decode(b62, enc), STR) - self.assertEqual(codecs.decode(b(b62), enc), b(STR)) - - def test_codec_base64(self): - for b64, enc in zip(["dGhpcyBpcyBhIHRlc3Q=", 
"T6XfSo1fSo1X87HbStG="], ["base64", "base64-inv"]): - self.assertEqual(codecs.encode(STR, enc), b64) - self.assertEqual(codecs.encode(b(STR), enc), b(b64)) - self.assertEqual(codecs.decode(b64, enc), STR) - self.assertEqual(codecs.decode(b(b64), enc), b(STR)) - - def test_codec_base91(self): - for b91, enc in zip([",X,<:WRT%yxth90oZB", ",N,<:MHJ%onjXzqeP1", "Jx&[jv4S3Wg>,71@Jk", "yJy^\\IDFsdc?Tof:L#"], - ["base91", "base91-inv", "base91-alt", "base91-alt-inv"]): - self.assertEqual(codecs.encode(STR, enc), b91) - self.assertEqual(codecs.encode(b(STR), enc), b(b91)) - self.assertEqual(codecs.decode(b91, enc), STR) - self.assertEqual(codecs.decode(b(b91), enc), b(STR)) - self.assertIsNotNone(codecs.encode("\x00\x00", "base91")) - self.assertIsNotNone(codecs.decode("abc", "base91")) - self.assertIsNotNone(codecs.decode("AD", "base91")) - self.assertRaises(ValueError, codecs.decode, "\xff", "base91") - self.assertRaises(ValueError, codecs.decode, "a\xff", "base91") - self.assertIsNotNone(codecs.encode("\x00\x00", "base91-alt")) - - def test_codec_base100(self): - if PY3: - B100 = "\U0001f46b\U0001f45f\U0001f460\U0001f46a\U0001f417\U0001f460\U0001f46a\U0001f417\U0001f458" \ - "\U0001f417\U0001f46b\U0001f45c\U0001f46a\U0001f46b" - self.assertEqual(codecs.encode(STR, "base100"), B100) - self.assertEqual(codecs.encode(b(STR), "base100"), b(B100)) - self.assertEqual(codecs.decode(B100, "base100"), STR) - self.assertEqual(codecs.decode(b(B100), "base100"), b(STR)) - self.assertRaises(ValueError, codecs.decode, b(B100)[1:], "base100") - - def test_codec_base_generic(self): - for n in range(2, 255): - bn = "base{}_generic".format(n) - self.assertEqual(codecs.decode(codecs.encode(STR, bn), bn), STR) - self.assertRaises(LookupError, codecs.decode, "test", "base0-generic") - self.assertRaises(LookupError, codecs.decode, "test", "base1-generic") - self.assertRaises(LookupError, codecs.decode, "test", "base256-generic") - - def test_base_main(self): - tmp = sys.argv[:] - tfile = 
"test-base-main.txt" - with open(tfile, 'w') as f: - f.write("This is a long test string for the sake of causing line wrapping based on default parameters.") - for swap_arg in [[], ["-s"]]: - sys.argv = [tmp[0], tfile] + swap_arg - for m in main32, main64url: - self.assertEqual(m(), 0) - sys.argv = [tmp[0], tfile, "-d"] + swap_arg - self.assertEqual(main2(), 1) - os.remove(tfile) - sys.argv[:] = tmp - +#!/usr/bin/env python +# -*- coding: UTF-8 -*- +"""Base codecs tests. + +""" +import sys +from unittest import TestCase + +from codext.__common__ import * +from codext.base._base import _generate_charset +from codext.base.baseN import base, main2, main32, main64url + + +class TestCodecsBase(TestCase): + def setUp(self): + global STR + STR = "this is a test" + + def test_new_base_codec(self): + for i in [0, 1, 256]: + self.assertRaises(ValueError, _generate_charset, i) + b10 = lambda *a: "0123456789" + base(b10, "base10") + B10 = "2361031878030638688519054699098996" + self.assertEqual(codecs.encode(STR, "base10"), B10) + self.assertEqual(codecs.encode(b(STR), "base10"), b(B10)) + self.assertEqual(codecs.decode(B10, "base10"), STR) + self.assertEqual(codecs.decode(b(B10), "base10"), b(STR)) + self.assertRaises(ValueError, base, 1, "test") + b11 = "0123456789a" + base(b11, "base11") + B11 = "113342054335735319526632a26972419" + self.assertEqual(codecs.encode(STR, "base11"), B11) + self.assertEqual(codecs.decode(B11, "base11"), STR) + self.assertRaises(ValueError, base, object(), "test") + self.assertIsNone(base({'': "01234"}, r"^base5(test)?$")) + self.assertIsNotNone(codecs.encode(STR, "base5test")) + self.assertRaises(ValueError, base, {'': "01234"}, "base5-test", pow2=True) + self.assertEqual("", codecs.decode("", "base5test")) + + def test_codec_base1(self): + C = "A" + for i in range(3): + self.assertIsNotNone(codecs.encode(i * C, "base1")) + self.assertRaises(ValueError, codecs.encode, 4 * C, "unary") + self.assertEqual(codecs.decode("AAAAA", "base1"), "\x05") + + 
def test_codec_base2(self): + STR = "test" + B2 = "01110100011001010111001101110100" + self.assertEqual(codecs.encode(STR, "base2"), B2) + self.assertEqual(codecs.encode(b(STR), "base2"), b(B2)) + self.assertEqual(codecs.decode(B2, "base2"), STR) + self.assertEqual(codecs.decode(b(B2), "base2"), b(STR)) + B2 = "10001011100110101000110010001011" + self.assertEqual(codecs.encode(STR, "base2-inv"), B2) + self.assertEqual(codecs.decode(B2, "base2-inv"), STR) + B2 = "abbbabaaabbaabababbbaabbabbbabaa" + self.assertEqual(codecs.encode(STR, "base2-ab"), B2) + self.assertEqual(codecs.decode(B2, "base2-ab"), STR) + B2 = "CDDDCDCCCDDCCDCDCDDDCCDDCDDDCDCC" + self.assertEqual(codecs.encode(STR, "base2-CD"), B2) + self.assertEqual(codecs.decode(B2, "base2-CD"), STR) + B2 = "34443433344334343444334434443433" + self.assertEqual(codecs.encode(STR, "base2-34"), B2) + self.assertEqual(codecs.decode(B2, "base2-34"), STR) + + def test_codec_base3(self): + STR = "test" + B3 = "23112113223321323322" + self.assertEqual(codecs.encode(STR, "base3"), B3) + self.assertEqual(codecs.encode(b(STR), "base3"), b(B3)) + self.assertEqual(codecs.decode(B3, "base3"), STR) + self.assertEqual(codecs.decode(b(B3), "base3"), b(STR)) + B3 = "21332331221123121122" + self.assertEqual(codecs.encode(STR, "base3-inv"), B3) + self.assertEqual(codecs.decode(B3, "base3-inv"), STR) + B3 = "bcaabaacbbccbacbccbb" + self.assertEqual(codecs.encode(STR, "base3-abc"), B3) + self.assertEqual(codecs.decode(B3, "base3-abc"), STR) + self.assertRaises(LookupError, codecs.encode, "test", "base3-ab") + self.assertRaises(LookupError, codecs.encode, "test", "base3-abcd") + + def test_codec_base4(self): + STR = "test" + B4 = "2421232224142421" + self.assertEqual(codecs.encode(STR, "base4"), B4) + self.assertEqual(codecs.encode(b(STR), "base4"), b(B4)) + self.assertEqual(codecs.decode(B4, "base4"), STR) + self.assertEqual(codecs.decode(b(B4), "base4"), b(STR)) + B4 = "3134323331413134" + self.assertEqual(codecs.encode(STR, 
"base4-inv"), B4) + self.assertEqual(codecs.decode(B4, "base4-inv"), STR) + B4 = "bdbabcbbbdadbdba" + self.assertEqual(codecs.encode(STR, "base4-abcd"), B4) + self.assertEqual(codecs.decode(B4, "base4-abcd"), STR) + self.assertRaises(LookupError, codecs.encode, "test", "base4-abc") + self.assertRaises(LookupError, codecs.encode, "test", "base4-abcde") + + def test_codec_base8(self): + STR = "test" + B8 = "dfagcfgddfa=====" + self.assertEqual(codecs.encode(STR, "base8"), B8) + self.assertEqual(codecs.encode(b(STR), "base8"), b(B8)) + self.assertEqual(codecs.decode(B8, "base8"), STR) + self.assertEqual(codecs.decode(b(B8), "base8"), b(STR)) + B8 = "echbfcbeech=====" + self.assertEqual(codecs.encode(STR, "base8-inv"), B8) + self.assertEqual(codecs.decode(B8, "base8-inv"), STR) + B8 = "35062563350=====" + self.assertEqual(codecs.encode(STR, "base8-01234567"), B8) + self.assertEqual(codecs.decode(B8, "base8-01234567"), STR) + self.assertRaises(LookupError, codecs.encode, "test", "base8-0123456") + self.assertRaises(LookupError, codecs.encode, "test", "base8-012345678") + + def test_codec_base16(self): + B16 = "7468697320697320612074657374" + self.assertEqual(codecs.encode(STR, "base16"), B16) + self.assertEqual(codecs.encode(b(STR), "base16"), b(B16)) + self.assertEqual(codecs.decode(B16, "base16"), STR) + self.assertEqual(codecs.decode(b(B16), "base16"), b(STR)) + B16 += "?" 
+ self.assertRaises(ValueError, codecs.decode, B16, "base16") + self.assertEqual(codecs.decode(B16, "base16", "ignore"), STR) + self.assertEqual(codecs.decode(B16, "base16", "replace"), STR + "\x00") + self.assertRaises(ValueError, codecs.decode, B16, "base16", "BAD") + STR2 = "=:;" + B16_1 = "3d3a3b" + B16_2 = "3D3A3B" + B16_3 = "3D3a3B" # mixed case: should fail + self.assertEqual(codecs.encode(STR2, "hex"), B16_2) + self.assertEqual(codecs.decode(B16_1, "hex"), STR2) + self.assertEqual(codecs.decode(B16_2, "hex"), STR2) + self.assertRaises(ValueError, codecs.decode, B16_3, "hex") + + def test_codec_base32(self): + for b32, enc in zip(["ORUGS4ZANFZSAYJAORSXG5A=", "qtwg1h3ypf31yajyqt1zg7y=", "EHK6ISP0D5PI0O90EHIN6T0=", + "fjn6kwt0e5tk0s90fjkr6x0=", "EHM6JWS0D5SJ0R90EHJQ6X0="], + ["base32", "zbase32", "base32-hex", "geohash", "crockford"]): + self.assertEqual(codecs.encode(STR, enc), b32) + self.assertEqual(codecs.encode(b(STR), enc), b(b32)) + self.assertEqual(codecs.decode(b32, enc), STR) + self.assertEqual(codecs.decode(b(b32), enc), b(STR)) + self.assertRaises(ValueError, codecs.decode, b32.rstrip("="), enc) + self.assertRaises(ValueError, codecs.decode, b32.rstrip("="), enc, "BAD") + + def test_codec_base36(self): + B36 = "4WMHTK6UZL044O91NKCEB8" + self.assertEqual(codecs.encode(STR, "base36"), B36) + self.assertEqual(codecs.encode(b(STR), "base36"), b(B36)) + self.assertEqual(codecs.decode(B36, "base36"), STR) + self.assertEqual(codecs.decode(b(B36), "base36"), b(STR)) + B36 = "E6WR3UG49VAEEYJBXUMOLI" + self.assertEqual(codecs.encode(STR, "base36-inv"), B36) + self.assertEqual(codecs.decode(B36, "base36-inv"), STR) + self.assertRaises(ValueError, codecs.decode, B36 + "?", "base36-inv") + self.assertRaises(ValueError, codecs.decode, B36 + "?", "base36", "BAD") + self.assertEqual(codecs.decode(B36 + "?", "base36-inv", "ignore"), STR) + + def test_codec_base58(self): + B58 = "jo91waLQA1NNeBmZKUF" + self.assertEqual(codecs.encode(STR, "base58"), B58) + 
self.assertEqual(codecs.encode(b(STR), "base58"), b(B58)) + self.assertEqual(codecs.decode(B58, "base58"), STR) + self.assertEqual(codecs.decode(b(B58), "base58"), b(STR)) + B58 = "jo9rA2LQwr44eBmZK7E" + self.assertEqual(codecs.encode(STR, "base58-ripple"), B58) + self.assertEqual(codecs.decode(B58, "base58-rp"), STR) + B58 = "JN91Wzkpa1nnDbLyjtf" + self.assertEqual(codecs.encode(STR, "base58-flickr"), B58) + self.assertEqual(codecs.encode(STR, "base58-shorturl"), B58) + self.assertEqual(codecs.decode(B58, "base58-fl"), STR) + self.assertEqual(codecs.encode(STR, "base58-short-url"), B58) + self.assertEqual(codecs.encode(STR, "base58-url"), B58) + + def test_codec_base62(self): + for b62, enc in zip(["CsoB4HQ5gmgMyCenF7E", "M2yLERaFqwqW8MoxPHO"], ["base62", "base62-inv"]): + self.assertEqual(codecs.encode(STR, enc), b62) + self.assertEqual(codecs.encode(b(STR), enc), b(b62)) + self.assertEqual(codecs.decode(b62, enc), STR) + self.assertEqual(codecs.decode(b(b62), enc), b(STR)) + + def test_codec_base64(self): + for b64, enc in zip(["dGhpcyBpcyBhIHRlc3Q=", "T6XfSo1fSo1X87HbStG="], ["base64", "base64-inv"]): + self.assertEqual(codecs.encode(STR, enc), b64) + self.assertEqual(codecs.encode(b(STR), enc), b(b64)) + self.assertEqual(codecs.decode(b64, enc), STR) + self.assertEqual(codecs.decode(b(b64), enc), b(STR)) + + def test_codec_base91(self): + for b91, enc in zip([",X,<:WRT%yxth90oZB", ",N,<:MHJ%onjXzqeP1", "Jx&[jv4S3Wg>,71@Jk", "yJy^\\IDFsdc?Tof:L#"], + ["base91", "base91-inv", "base91-alt", "base91-alt-inv"]): + self.assertEqual(codecs.encode(STR, enc), b91) + self.assertEqual(codecs.encode(b(STR), enc), b(b91)) + self.assertEqual(codecs.decode(b91, enc), STR) + self.assertEqual(codecs.decode(b(b91), enc), b(STR)) + self.assertIsNotNone(codecs.encode("\x00\x00", "base91")) + self.assertIsNotNone(codecs.decode("abc", "base91")) + self.assertIsNotNone(codecs.decode("AD", "base91")) + self.assertRaises(ValueError, codecs.decode, "\xff", "base91") + 
self.assertRaises(ValueError, codecs.decode, "a\xff", "base91") + self.assertIsNotNone(codecs.encode("\x00\x00", "base91-alt")) + + def test_codec_base100(self): + B100 = "\U0001f46b\U0001f45f\U0001f460\U0001f46a\U0001f417\U0001f460\U0001f46a\U0001f417\U0001f458\U0001f417" \ + "\U0001f46b\U0001f45c\U0001f46a\U0001f46b" + self.assertEqual(codecs.encode(STR, "base100"), B100) + self.assertEqual(codecs.encode(b(STR), "base100"), b(B100)) + self.assertEqual(codecs.decode(B100, "base100"), STR) + self.assertEqual(codecs.decode(b(B100), "base100"), b(STR)) + self.assertRaises(ValueError, codecs.decode, b(B100)[1:], "base100") + self.assertIsNotNone(codecs.decode(b(B100) + b"\n", "base100", "ignore")) + + def test_codec_base_generic(self): + for n in range(2, 255): + bn = "base{}_generic".format(n) + self.assertEqual(codecs.decode(codecs.encode(STR, bn), bn), STR) + self.assertRaises(LookupError, codecs.decode, "test", "base0-generic") + self.assertRaises(LookupError, codecs.decode, "test", "base1-generic") + self.assertRaises(LookupError, codecs.decode, "test", "base256-generic") + + def test_base_main(self): + tmp = sys.argv[:] + tfile = "test-base-main.txt" + with open(tfile, 'w') as f: + f.write("This is a long test string for the sake of causing line wrapping based on default parameters.") + for swap_arg in [[], ["-s"]]: + sys.argv = [tmp[0], tfile] + swap_arg + for m in main32, main64url: + self.assertEqual(m(), 0) + sys.argv = [tmp[0], tfile, "-d"] + swap_arg + self.assertEqual(main2(), 1) + os.remove(tfile) + sys.argv[:] = tmp + diff --git a/tests/test_common.py b/tests/test_common.py index 8bbf410..407997c 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -1,256 +1,237 @@ -#!/usr/bin/env python -# -*- coding: UTF-8 -*- -"""Codecs added assets' tests. 
- -""" -import codecs -import codext -import json -import random -import sys -from codext.__common__ import CODECS_OVERWRITTEN, PERS_MACROS, PERS_MACROS_FILE -from six import b, binary_type, text_type -from unittest import TestCase - - -PY3 = sys.version[0] == "3" - - -def dummy_encode(input, errors="strict"): - return input, len(input) - - -def dummy_decode(input, errors="strict"): - return input, len(input) - - -def dummy_errored_decode(useless): - raise AttributeError - def decode(input, errors="strict"): - return input, len(input) - return decode - - -def ensure_str(s, encoding='utf-8', errors='strict'): - """ Similar to six.ensure_str. Adapted here to avoid messing up with six version errors. """ - if not PY3 and isinstance(s, text_type): - return s.encode(encoding, errors) - elif PY3 and isinstance(s, binary_type): - try: - return s.decode(encoding, errors) - except: - return s.decode("latin-1") - return s - - -def getregentry(encoding): - if encoding == "dummy3": - return codecs.CodecInfo(name="dummy3", encode=dummy_encode, decode=dummy_decode) - - -class TestCommon(TestCase): - def setUp(self): - codext.reset() - - def test_add_codec(self): - self.assertRaises(ValueError, codext.add, "test") - self.assertRaises(ValueError, codext.add, "test", "BAD") - self.assertRaises(ValueError, codext.add, "test", lambda: None, "BAD") - self.assertIsNotNone(codext.add("dummy", dummy_encode, dummy_decode)) - self.assertEqual(codext.encode("test", "dummy"), "test") - ci = codext.lookup("dummy") - for k in ["add_to_codecs", "category", "examples", "name", "pattern", "text"]: - self.assertIn(k, ci.parameters.keys()) - self.assertIsNotNone(codext.add("dummy_errored", None, dummy_errored_decode, r"dummy_errored(\d+)$")) - self.assertRaises(AttributeError, codext.lookup, "dummy_errored1") - - def test_add_map_codec(self): - ENCMAP = [{'a': "A", 'b': "B", 'c': "C"}, {'d': "D", 'e': "E", 'f': "F"}, {'g': "G", 'h': "H", 'i': "I"}] - self.assertIsNotNone(codext.add_map("dummy2", 
ENCMAP, pattern=r"^dummy2(?:[-_]?(\d))?$")) - self.assertRaises(ValueError, codext.add_map, "dummy2", "BAD_ENCMAP") - self.assertEqual(codext.encode("abc", "dummy2"), "ABC") - self.assertEqual(codext.encode("abc", "dummy2-1"), "ABC") - self.assertEqual(codext.encode("def", "dummy2-2"), "DEF") - self.assertEqual(codext.encode("ghi", "dummy2-3"), "GHI") - self.assertRaises(LookupError, codext.encode, "test", "dummy2-4") - ENCMAP = {'': {'a': "A", 'b': "B"}, r'bad': {'a': "B", 'b': "A"}} - self.assertIsNotNone(codext.add_map("dummy3", ENCMAP, pattern=r"^dummy3([-_]inverted)?$")) - self.assertRaises(LookupError, codext.encode, "test", "dummy3_inverted") - self.assertRaises(ValueError, codext.add_map, "dummy2", ENCMAP, ignore_case="BAD") - self.assertRaises(ValueError, codext.add_map, "dummy2", ENCMAP, intype="BAD") - self.assertRaises(ValueError, codext.add_map, "dummy2", ENCMAP, outype="BAD") - ci = codext.lookup("dummy2") - for k in ["category", "encmap", "ignore_case", "intype", "no_error", "outype", "repl_char", "sep", "text"]: - self.assertIn(k, ci.parameters.keys()) - - def test_list_codecs(self): - self.assertTrue(len(codext.list()) > 0) - self.assertTrue(len(codext.list("other")) > 0) - self.assertTrue(len(codext.list("native")) > 0) - self.assertTrue(len(codext.list("non-native")) > 0) - self.assertTrue(len(codext.list("native", "non-native", "crypto", "base")) > 0) - self.assertTrue(len(codext.list("native", "language", "crypto")) > 0) - self.assertTrue(len(codext.list("~crypto")) > 0) - self.assertEqual(set(codext.list("~native")), set(codext.list("non-native"))) - self.assertEqual(set(codext.list()), set(codext.list("native") + codext.list("non-native"))) - self.assertRaises(ValueError, codext.list, "BAD_CATEGORY") - self.assertTrue(codext.is_native("base64_codec")) - self.assertFalse(codext.is_native("base64")) - - def test_remove_codec(self): - self.assertIsNotNone(codext.add("dummy", dummy_encode, dummy_decode)) - self.assertEqual(codext.encode("test", 
"dummy"), "test") - self.assertIsNone(codext.remove("dummy")) - self.assertRaises(LookupError, codext.encode, "test", "dummy") - # special case, when adding a new codec also to the native codecs registry, then it won't be possible to remove - # it afterwards - self.assertIsNotNone(codecs.add("dummy2", dummy_encode, dummy_decode)) - self.assertEqual(codecs.encode("test", "dummy2"), "test") - self.assertIsNone(codecs.remove("dummy2")) - self.assertEqual(codecs.encode("test", "dummy2"), "test") - self.assertIsNone(codecs.register(getregentry)) - self.assertEqual(codecs.encode("test", "dummy3"), "test") - self.assertIsNone(codecs.remove("dummy3")) - self.assertEqual(codecs.encode("test", "dummy3"), "test") - - def test_clear_codecs(self): - self.assertIsNotNone(codecs.encode("test", "morse")) - self.assertIsNone(codecs.clear()) - self.assertRaises(LookupError, codecs.encode, "test", "morse") - - def test_reset_codecs(self): - self.assertIsNone(codext.reset()) - self.assertIsNotNone(codext.encode("test", "morse")) - self.assertRaises(LookupError, codext.encode, "test", "dummy") - self.assertTrue(len(CODECS_OVERWRITTEN) > 0) - self.assertIsNotNone(str(CODECS_OVERWRITTEN[0])) - - def test_search_codecs(self): - self.assertIsNotNone(codext.search("morse")) - self.assertIsNotNone(codext.search("geohash")) - self.assertIsNotNone(codext.examples("morse")) - self.assertIsNotNone(codext.examples("cp")) - self.assertIsNotNone(list(codext.generate_strings_from_regex(r"[ab]{1,3}"))) - self.assertIsNotNone(list(codext.generate_strings_from_regex(r"(?<=ab)cd"))) - self.assertIsNotNone(list(codext.generate_strings_from_regex(r"(?<=-)\w+"))) - self.assertIsNotNone(list(codext.generate_strings_from_regex(r"([^\s])\1"))) - self.assertIsNotNone(list(codext.generate_strings_from_regex(r"[^\\]"))) - self.assertIsNotNone(list(codext.generate_strings_from_regex(r"[^a]"))) - - def test_encode_multiple_rounds(self): - if PY3: - self.assertRaises(TypeError, codext.encode, b"test", "utf-8[2]") - 
s = "test" - for i in range(3): - s = codext.encode(s, "morse") - self.assertEqual(s, codext.encode("test", "morse[3]")) - self.assertIsNotNone(codext.encode("test", "base64[10]")) - - def test_guess_decode(self): - self.assertIsNone(codext.stopfunc._reload_lang()) - self.assertIsNotNone(codext.stopfunc._validate("flag")) - _l = lambda d: list(d.items())[0][1] if len(d) > 0 else None - codext.add("test_codec", lambda x, e="strict": (x + "=", len(x)), lambda x, e="strict": (x[:-1], len(x)-1), - "^test(?:_codec)?$", padding_char="=", no_error=True, bonus_func=lambda *a: True, penalty=-.5) - self.assertIn("test-codec", codext.list_encodings("test")) - self.assertEqual(codext.decode("TEST=", "test"), "TEST") - self.assertEqual(list(codext.guess("TEST=", codext.stopfunc.text, include="test", max_depth=2, - scoring_heuristic=False).items())[0][1], "TEST") - self.assertEqual(list(codext.guess("TEST=", codext.stopfunc.text, include=["test", "base"], - max_depth=2).items())[0][1], "TEST") - STR = "This is a test" - self.assertEqual(STR, _l(codext.guess("VGhpcyBpcyBhIHRlc3Q=", "a test", max_depth=1))) - self.assertEqual(STR, _l(codext.guess("CJG3Ix8bVcSRMLOqwDUg28aDsT7", "a test", found=["base62"]))) - self.assertEqual(STR, _l(codext.guess("VGhpcyBpcyBhIHRlc3Q=", "a test", 0, 1, "base", scoring_heuristic=True, - exclude=["base100"]))) - self.assertEqual(STR, _l(codext.guess("VGhpcyBpcyBhIHRlc3Q=", "a test", 0, 1, ["base", "crypto"]))) - self.assertEqual(len(codext.guess("NOT THE ENCODED TEST STRING", "a test", max_depth=1, exclude=None)), 0) - self.assertIn("F1@9", _l(codext.guess("VGVzdCBGMUA5ICE=", codext.stopfunc.flag, max_depth=1, stop=False, - show=True))) - self.assertEqual(len(codext.guess("VGhpcyBpcyBhIHRlc3Q=", " a test", max_depth=1, include="base", - exclude=("base64", "base64-url"))), 0) - self.assertEqual(len(codext.guess("VGhpcyBpcyBhIHRlc3Q=", " a test", max_depth=1, include="base", - scoring_heuristic=True, exclude=("base64", "base64-url", "atbash"))), 0) - 
self.assertRaises(ValueError, codext.guess, STR, max_depth=0) - self.assertRaises(ValueError, codext.guess, STR, exclude=42) - for c in ["base", "language", "native", "stegano"]: - e = codext.list(c) - random.shuffle(e) - for ename in e[:10]: - for encoding in codext.lookup(ename).parameters.get('guess', [ename])[:10]: - try: - enc = codext.encode(STR, encoding) - except (NotImplementedError, ValueError): - continue - except TypeError: - enc = codext.encode(b(STR), encoding) - if codext.decode(enc, encoding) == STR: - continue - for found_encodings, found_dec in codext.guess(enc, "a test", 0, 1, [c], - scoring_heuristic=True, debug=True).items(): - self.assertEqual(ensure_str(STR).lower(), ensure_str(found_dec).lower()) - if c != "base": - # do not check for base as the guessed encoding name can be different, e.g.: - # actual: base2 - # guessed: base2-generic - if "-icase" in encoding: - self.assertEqual(encoding.lower(), found_encodings[0].lower()) - else: - self.assertEqual(encoding, found_encodings[0]) - txt = "".join(chr(i) for i in range(256)) - b64 = codext.encode(txt, "base64") - self.assertEqual(txt, _l(codext.guess(b64, "0123456789", max_depth=1, scoring_heuristic=True, include="base"))) - self.assertRaises(ValueError, codext.stopfunc._reload_lang, "DOES_NOT_EXIST") - - def test_rank_input(self): - codext.remove("test_codec") - self.assertRaises(LookupError, codext.encode, "TEST", "test") - codext.add("test_codec", lambda x, e="strict": (x + "=", len(x)), lambda x, e="strict": (x[:-1], len(x)-1), - "^test(?:_codec)?$", padding_char="=", no_error=True, penalty=1.) - STR = "This is a test string !" 
- ENC = codext.encode(STR, "base64") - self.assertTrue(len(codext.rank(ENC)) > 20) - self.assertEqual(len(codext.rank(ENC, limit=20)), 20) - self.assertIn(codext.rank(ENC, exclude=["rot"])[0][1], ["base64", "base64-url", "base64-inv"]) - self.assertEqual(codext.rank(ENC, include="base")[0][0][1], STR) - self.assertEqual(codext.rank(ENC, include=["base"])[0][0][1], STR) - self.assertIsNotNone(codext.rank(ENC, include=["base"], exclude=["does_not_exist"])[0][0][1], STR) - self.assertIsNotNone(codext.rank("TEST=", include=["test", "base"])[0][0][1], "TEST") - - def test_handle_macros(self): - MACRO = "test-macro-f2ca1bb6c7e907d06dafe4687e579fce76b37e4e93b7605022da52e6ccc26fd2" - STR = "this is a test" - ENC = "H4sIAMrbkmEC/0txzyhIrnQC4QxPj6CcZONAWwAMIDOIFAAAAA==" - codext.remove(MACRO) - l = codext.list_macros() - self.assertTrue(len(l) > 0) - cm = codext.lookup("example-macro") - self.assertIsNotNone(cm) - self.assertRaises(LookupError, codext.lookup, "example-macro", False) - self.assertRaises(ValueError, codext.add_macro, "example-macro", "base64") - self.assertRaises(ValueError, codext.add_macro, "base64", "base91") - self.assertIsNotNone(repr(cm)) - self.assertTrue(hasattr(cm, "parameters")) - self.assertRaises(LookupError, codext.lookup, MACRO) - self.assertIsNone(codext.add_macro(MACRO, "base64", "gzip", "base64")) - self.assertIn(MACRO, codext.list_macros()) - self.assertIsNotNone(codext.encode(STR, MACRO)) - self.assertEqual(codext.decode(ENC, MACRO), STR) - # insert a bad entry for the list of encodings in the JSON file - PERS_MACROS[MACRO] = "not a list or tuple..." 
- with open(PERS_MACROS_FILE, 'w') as f: - json.dump(PERS_MACROS, f) - codext.reset() - self.assertRaises(ValueError, codext.lookup, MACRO) - self.assertIsNone(codext.remove(MACRO)) - self.assertRaises(LookupError, codext.lookup, MACRO) - self.assertNotIn(MACRO, codext.list_macros()) - self.assertIsNone(codext.remove("THIS-MACRO-DOES-NOT-EXIST")) - self.assertIsNone(codext.remove("VALID-MACRO")) - self.assertIsNone(codext.add_macro("VALID-MACRO", "gzip", "base64")) - self.assertIsNone(codext.remove("VALID-MACRO")) - if PY3: - self.assertIsNone(codext.add_macro("VALID-MACRO", "lzma", "base64")) - self.assertIsNone(codext.remove("VALID-MACRO")) - self.assertRaises(ValueError, codext.add_macro, "SHALL-FAIL", "base26", "sms", "letter-indices") - +#!/usr/bin/env python +# -*- coding: UTF-8 -*- +"""Codecs added assets' tests. + +""" +import codext +import json +import random +import sys +from codext.__common__ import * +from codext.__common__ import CODECS_OVERWRITTEN, PERS_MACROS, PERS_MACROS_FILE +from unittest import TestCase + + +def dummy_encode(input, errors="strict"): + return input, len(input) + + +def dummy_decode(input, errors="strict"): + return input, len(input) + + +def dummy_errored_decode(useless): + raise AttributeError + def decode(input, errors="strict"): + return input, len(input) + return decode + + +def getregentry(encoding): + if encoding == "dummy3": + return codecs.CodecInfo(name="dummy3", encode=dummy_encode, decode=dummy_decode) + + +class TestCommon(TestCase): + def setUp(self): + codext.reset() + + def test_add_codec(self): + self.assertRaises(ValueError, codext.add, "test") + self.assertRaises(ValueError, codext.add, "test", "BAD") + self.assertRaises(ValueError, codext.add, "test", lambda: None, "BAD") + self.assertIsNotNone(codext.add("dummy", dummy_encode, dummy_decode)) + self.assertEqual(codext.encode("test", "dummy"), "test") + ci = codext.lookup("dummy") + for k in ["add_to_codecs", "category", "examples", "name", "pattern", "text"]: + 
self.assertIn(k, ci.parameters.keys()) + self.assertIsNotNone(codext.add("dummy_errored", None, dummy_errored_decode, r"dummy_errored(\d+)$")) + self.assertRaises(AttributeError, codext.lookup, "dummy_errored1") + + def test_add_map_codec(self): + ENCMAP = [{'a': "A", 'b': "B", 'c': "C"}, {'d': "D", 'e': "E", 'f': "F"}, {'g': "G", 'h': "H", 'i': "I"}] + self.assertIsNotNone(codext.add_map("dummy2", ENCMAP, pattern=r"^dummy2(?:[-_]?(\d))?$")) + self.assertRaises(ValueError, codext.add_map, "dummy2", "BAD_ENCMAP") + self.assertEqual(codext.encode("abc", "dummy2"), "ABC") + self.assertEqual(codext.encode("abc", "dummy2-1"), "ABC") + self.assertEqual(codext.encode("def", "dummy2-2"), "DEF") + self.assertEqual(codext.encode("ghi", "dummy2-3"), "GHI") + self.assertRaises(LookupError, codext.encode, "test", "dummy2-4") + ENCMAP = {'': {'a': "A", 'b': "B"}, r'bad': {'a': "B", 'b': "A"}} + self.assertIsNotNone(codext.add_map("dummy3", ENCMAP, pattern=r"^dummy3([-_]inverted)?$")) + self.assertRaises(LookupError, codext.encode, "test", "dummy3_inverted") + self.assertRaises(ValueError, codext.add_map, "dummy2", ENCMAP, ignore_case="BAD") + self.assertRaises(ValueError, codext.add_map, "dummy2", ENCMAP, intype="BAD") + self.assertRaises(ValueError, codext.add_map, "dummy2", ENCMAP, outype="BAD") + ci = codext.lookup("dummy2") + for k in ["category", "encmap", "ignore_case", "intype", "no_error", "outype", "repl_char", "sep", "text"]: + self.assertIn(k, ci.parameters.keys()) + + def test_list_codecs(self): + self.assertTrue(len(codext.list()) > 0) + self.assertTrue(len(codext.list("other")) > 0) + self.assertTrue(len(codext.list("native")) > 0) + self.assertTrue(len(codext.list("non-native")) > 0) + self.assertTrue(len(codext.list("native", "non-native", "crypto", "base")) > 0) + self.assertTrue(len(codext.list("native", "language", "crypto")) > 0) + self.assertTrue(len(codext.list("~crypto")) > 0) + self.assertEqual(set(codext.list("~native")), set(codext.list("non-native"))) 
+ self.assertEqual(set(codext.list()), set(codext.list("native") + codext.list("non-native"))) + self.assertRaises(ValueError, codext.list, "BAD_CATEGORY") + self.assertTrue(codext.is_native("base64_codec")) + self.assertFalse(codext.is_native("base64")) + + def test_remove_codec(self): + self.assertIsNotNone(codext.add("dummy", dummy_encode, dummy_decode)) + self.assertEqual(codext.encode("test", "dummy"), "test") + self.assertIsNone(codext.remove("dummy")) + self.assertRaises(LookupError, codext.encode, "test", "dummy") + # special case, when adding a new codec also to the native codecs registry, then it won't be possible to remove + # it afterwards + self.assertIsNotNone(codecs.add("dummy2", dummy_encode, dummy_decode)) + self.assertEqual(codecs.encode("test", "dummy2"), "test") + self.assertIsNone(codecs.remove("dummy2")) + self.assertEqual(codecs.encode("test", "dummy2"), "test") + self.assertIsNone(codecs.register(getregentry)) + self.assertEqual(codecs.encode("test", "dummy3"), "test") + self.assertIsNone(codecs.remove("dummy3")) + self.assertEqual(codecs.encode("test", "dummy3"), "test") + + def test_clear_codecs(self): + self.assertIsNotNone(codecs.encode("test", "morse")) + self.assertIsNone(codecs.clear()) + self.assertRaises(LookupError, codecs.encode, "test", "morse") + + def test_reset_codecs(self): + self.assertIsNone(codext.reset()) + self.assertIsNotNone(codext.encode("test", "morse")) + self.assertRaises(LookupError, codext.encode, "test", "dummy") + self.assertTrue(len(CODECS_OVERWRITTEN) > 0) + self.assertIsNotNone(str(CODECS_OVERWRITTEN[0])) + + def test_search_codecs(self): + self.assertIsNotNone(codext.search("morse")) + self.assertIsNotNone(codext.search("geohash")) + self.assertIsNotNone(codext.examples("morse")) + self.assertIsNotNone(codext.examples("cp")) + self.assertIsNotNone(list(codext.generate_strings_from_regex(r"[ab]{1,3}"))) + self.assertIsNotNone(list(codext.generate_strings_from_regex(r"(?<=ab)cd"))) + 
self.assertIsNotNone(list(codext.generate_strings_from_regex(r"(?<=-)\w+"))) + self.assertIsNotNone(list(codext.generate_strings_from_regex(r"([^\s])\1"))) + self.assertIsNotNone(list(codext.generate_strings_from_regex(r"[^\\]"))) + self.assertIsNotNone(list(codext.generate_strings_from_regex(r"[^a]"))) + + def test_encode_multiple_rounds(self): + s = "test" + for i in range(3): + s = codext.encode(s, "morse") + self.assertEqual(s, codext.encode("test", "morse[3]")) + self.assertIsNotNone(codext.encode("test", "base64[10]")) + + def test_guess_decode(self): + self.assertIsNone(codext.stopfunc._reload_lang()) + self.assertIsNotNone(codext.stopfunc._validate("flag")) + _l = lambda d: list(d.items())[0][1] if len(d) > 0 else None + codext.add("test_codec", lambda x, e="strict": (x + "=", len(x)), lambda x, e="strict": (x[:-1], len(x)-1), + "^test(?:_codec)?$", padding_char="=", no_error=True, bonus_func=lambda *a: True, penalty=-.5) + self.assertIn("test-codec", codext.list_encodings("test")) + self.assertEqual(codext.decode("TEST=", "test"), "TEST") + self.assertEqual(list(codext.guess("TEST=", codext.stopfunc.text, include="test", max_depth=2, + scoring_heuristic=False).items())[0][1], "TEST") + self.assertEqual(list(codext.guess("TEST=", codext.stopfunc.text, include=["test", "base"], + max_depth=2).items())[0][1], "TEST") + STR = "This is a test" + self.assertEqual(STR, _l(codext.guess("VGhpcyBpcyBhIHRlc3Q=", "a test", max_depth=1))) + self.assertEqual(STR, _l(codext.guess("CJG3Ix8bVcSRMLOqwDUg28aDsT7", "a test", found=["base62"]))) + self.assertEqual(STR, _l(codext.guess("VGhpcyBpcyBhIHRlc3Q=", "a test", 0, 1, "base", scoring_heuristic=True, + exclude=["base100"]))) + self.assertEqual(STR, _l(codext.guess("VGhpcyBpcyBhIHRlc3Q=", "a test", 0, 1, ["base", "crypto"]))) + self.assertEqual(len(codext.guess("NOT THE ENCODED TEST STRING", "a test", max_depth=1, exclude=None)), 0) + self.assertIn("F1@9", _l(codext.guess("VGVzdCBGMUA5ICE=", codext.stopfunc.flag, 
max_depth=1, stop=False, + show=True))) + self.assertEqual(len(codext.guess("VGhpcyBpcyBhIHRlc3Q=", " a test", max_depth=1, include="base", + exclude=("base64", "base64-url"))), 0) + self.assertEqual(len(codext.guess("VGhpcyBpcyBhIHRlc3Q=", " a test", max_depth=1, include="base", + scoring_heuristic=True, exclude=("base64", "base64-url", "atbash"))), 0) + self.assertRaises(ValueError, codext.guess, STR, max_depth=0) + self.assertRaises(ValueError, codext.guess, STR, exclude=42) + for c in ["base", "language", "native", "stegano"]: + e = codext.list(c) + random.shuffle(e) + for ename in e[:10]: + for encoding in codext.lookup(ename).parameters.get('guess', [ename])[:10]: + try: + enc = codext.encode(STR, encoding) + except (NotImplementedError, ValueError): + continue + except TypeError: + enc = codext.encode(b(STR), encoding) + if codext.decode(enc, encoding) == STR: + continue + for found_encodings, found_dec in codext.guess(enc, "a test", 0, 1, [c], + scoring_heuristic=True, debug=True).items(): + self.assertEqual(ensure_str(STR).lower(), ensure_str(found_dec).lower()) + if c != "base": + # do not check for base as the guessed encoding name can be different, e.g.: + # actual: base2 + # guessed: base2-generic + if "-icase" in encoding: + self.assertEqual(encoding.lower(), found_encodings[0].lower()) + else: + self.assertEqual(encoding, found_encodings[0]) + txt = "".join(chr(i) for i in range(256)) + b64 = codext.encode(txt, "base64") + self.assertEqual(txt, _l(codext.guess(b64, "0123456789", max_depth=1, scoring_heuristic=True, include="base"))) + self.assertRaises(ValueError, codext.stopfunc._reload_lang, "DOES_NOT_EXIST") + + def test_rank_input(self): + codext.remove("test_codec") + self.assertRaises(LookupError, codext.encode, "TEST", "test") + codext.add("test_codec", lambda x, e="strict": (x + "=", len(x)), lambda x, e="strict": (x[:-1], len(x)-1), + "^test(?:_codec)?$", padding_char="=", no_error=True, penalty=1.) + STR = "This is a test string !" 
+ ENC = codext.encode(STR, "base64") + self.assertTrue(len(codext.rank(ENC)) > 20) + self.assertEqual(len(codext.rank(ENC, limit=20)), 20) + self.assertIn(codext.rank(ENC, exclude=["rot"])[0][1], ["base64", "base64-url", "base64-inv"]) + self.assertEqual(codext.rank(ENC, include="base")[0][0][1], STR) + self.assertEqual(codext.rank(ENC, include=["base"])[0][0][1], STR) + self.assertIsNotNone(codext.rank(ENC, include=["base"], exclude=["does_not_exist"])[0][0][1], STR) + self.assertIsNotNone(codext.rank("TEST=", include=["test", "base"])[0][0][1], "TEST") + + def test_handle_macros(self): + MACRO = "test-macro-f2ca1bb6c7e907d06dafe4687e579fce76b37e4e93b7605022da52e6ccc26fd2" + STR = "this is a test" + ENC = "H4sIAMrbkmEC/0txzyhIrnQC4QxPj6CcZONAWwAMIDOIFAAAAA==" + codext.remove(MACRO) + l = codext.list_macros() + self.assertTrue(len(l) > 0) + cm = codext.lookup("example-macro") + self.assertIsNotNone(cm) + self.assertRaises(LookupError, codext.lookup, "example-macro", False) + self.assertRaises(ValueError, codext.add_macro, "example-macro", "base64") + self.assertRaises(ValueError, codext.add_macro, "base64", "base91") + self.assertIsNotNone(repr(cm)) + self.assertTrue(hasattr(cm, "parameters")) + self.assertRaises(LookupError, codext.lookup, MACRO) + self.assertIsNone(codext.add_macro(MACRO, "base64", "gzip", "base64")) + self.assertIn(MACRO, codext.list_macros()) + self.assertIsNotNone(codext.encode(STR, MACRO)) + self.assertEqual(codext.decode(ENC, MACRO), STR) + # insert a bad entry for the list of encodings in the JSON file + PERS_MACROS[MACRO] = "not a list or tuple..." 
+ with open(PERS_MACROS_FILE, 'w') as f: + json.dump(PERS_MACROS, f) + codext.reset() + self.assertRaises(ValueError, codext.lookup, MACRO) + self.assertIsNone(codext.remove(MACRO)) + self.assertRaises(LookupError, codext.lookup, MACRO) + self.assertNotIn(MACRO, codext.list_macros()) + self.assertIsNone(codext.remove("THIS-MACRO-DOES-NOT-EXIST")) + self.assertIsNone(codext.remove("VALID-MACRO")) + self.assertIsNone(codext.add_macro("VALID-MACRO", "gzip", "base64")) + self.assertIsNone(codext.remove("VALID-MACRO")) + self.assertIsNone(codext.add_macro("VALID-MACRO", "lzma", "base64")) + self.assertIsNone(codext.remove("VALID-MACRO")) + self.assertRaises(ValueError, codext.add_macro, "SHALL-FAIL", "base26", "sms", "letter-indices") + diff --git a/tests/test_generated.py b/tests/test_generated.py index 614562f..e8eaf10 100644 --- a/tests/test_generated.py +++ b/tests/test_generated.py @@ -1,139 +1,158 @@ -#!/usr/bin/env python -# -*- coding: UTF-8 -*- -"""Automatically generated codec tests. - -""" -import os -import re -from itertools import chain -from random import randint -from string import printable -from unittest import TestCase - -from codext.__common__ import * - - -def make_test(**params): - """ Test factory function for auto-creating tests for encodings having __examples__ defined. """ - def _template(self): - tfile = "test-codec-%s.txt" % params['name'] - icase = params.get('ignore_case') - icdec = lambda s: s.lower() if icase in ["decode", "both"] else s - icenc = lambda s: s.lower() if icase in ["encode", "both"] else s - # first, define if only encode is used ; if so, decoding must occur right after encode tests, otherwise just - # execute the defined decode tests - dec = True - for k in params['examples'].keys(): - if k.startswith("dec"): - dec = False - # now execute tests relying on the given examples - for k, examples in params['examples'].items(): - # multiple encoding names can be given, e.g. 
'enc(morse|morse-AB|...)' - m = re.match(r"(?:dec|enc|enc-dec)\((.*?)(?:\|(.*?))*\)", k) - if m: - f1 = getattr(codecs, ["decode", "encode"][k.startswith("enc")]) - f2 = getattr(codecs, ["encode", "decode"][k.startswith("enc")]) - for ename in m.groups(): - if ename is None: - continue - # buggy generated encoding names - try: - lookup(ename) - except LookupError: - continue - # erroneous encoding name test - if examples is None: - self.assertRaises(LookupError, f1, "test", ename) - continue - # unhandled character error tests - encmap = params.get('encmap') - if encmap and params['intype'] not in ["bin", "ord"] and not params['no_error']: - if not isinstance(encmap, list): - encmap = [encmap] - for em in encmap: - if k.startswith("dec"): - em = {v: k for k, v in em.items()} - # find one handled character and one unhandled - c1, c2 = None, None - p = list(map(ord, printable)) - for i in chain(p, set(range(256)) - set(p)): - if chr(i) in em.keys(): - c1 = chr(i) - break - for i in chain(set(range(256)) - set(p), p): - if chr(i) not in em.keys(): - c2 = chr(i) - break - # now check that it raises the right error or not given the selected errors handling - if c1 and c2: - sep = params['sep'][0] if len(params['sep']) > 0 else "" - self.assertRaises(ValueError, f1, c2, ename) - self.assertRaises(ValueError, f1, c2, ename, "BAD_ERRORS") - if not k.startswith("enc-dec"): - self.assertEqual(f1(c1 + c2, ename, "ignore"), f1(c1, ename)) - self.assertEqual(f1(c1 + c2, ename, "leave"), f1(c1, ename) + sep + c2) - self.assertEqual(f1(c1 + c2, ename, "replace"), f1(c1, ename) + sep + \ - params.get('repl_minlen', 1) * params['repl_char']) - # examples validation tests - if k.startswith("enc-dec") and isinstance(examples, list): - for e in examples[:]: - rd = re.match(r"\@(i?)random(?:\{(\d+(?:,(\d+))*?)\})?$", e) - if rd: - examples.remove(e) - for n in (rd.group(2) or "512").split(","): - s = "".join(chr(randint(0, 255)) for i in range(int(n))) - examples.append(s.lower() if 
rd.group(1) else s) - for s in [""] + examples: - self.assertEqual(icdec(f2(icenc(f1(s, ename)), ename)), icdec(s)) - self.assertEqual(icdec(f2(icenc(f1(b(s), ename)), ename)), b(icdec(s))) - # file tests - with codecs.open(tfile, 'wb', encoding=ename) as f: - f.write(b(s)) - with codecs.open(tfile, 'rb', encoding=ename) as f: - s2 = f.read() if PY3 else f.read().rstrip("\x00") - self.assertEqual(b(icdec(s2)), b(icdec(s))) - os.remove(tfile) - else: - for s1, s2 in examples.items(): - # willingly erroneous tests - if s2 is None: - self.assertRaises((ValueError, NotImplementedError), f1, s1, ename) - continue - # raw text tests - self.assertEqual(icenc(f1(s1, ename)), icenc(s2)) - self.assertEqual(b(icenc(f1(s1, ename))), b(icenc(s2))) - self.assertIsNotNone(f1(s1, ename, "replace")) - self.assertIsNotNone(f1(s1, ename, "ignore")) - if dec: - self.assertEqual(icdec(f2(s2, ename)), icdec(s1)) - self.assertEqual(b(icdec(f2(s2, ename))), b(icdec(s1))) - self.assertIsNotNone(f2(s2, ename, "replace")) - self.assertIsNotNone(f2(s2, ename, "ignore")) - if k.startswith("enc"): - # file tests - with codecs.open(tfile, 'wb', encoding=ename) as f: - f.write(b(s1)) - with codecs.open(tfile, 'rb', encoding=ename) as f: - s = f.read() - if not PY3 and re.search("[^\x00]\x00$", s): - s = s[:-1] - self.assertEqual(b(icdec(f2(s2, ename))), b(icdec(s))) - os.remove(tfile) - return _template - - -class GeneratedTestCase(TestCase): - pass - - -for encoding in list_encodings(): - try: - ci = lookup(encoding) - except LookupError: - continue - # only consider codecs with __examples__ defined in their globals for dynamic tests generation - if ci.parameters.get('examples') is not None: - f = make_test(**ci.parameters) - f.__name__ = n = "test_" + encoding.replace("-", "_") - setattr(GeneratedTestCase, n, f) - +#!/usr/bin/env python +# -*- coding: UTF-8 -*- +"""Automatically generated codec tests. 
+ +""" +from itertools import chain +from random import randint +from string import printable +from unittest import TestCase + +from codext.__common__ import * + + +def make_test(**params): + """ Test factory function for auto-creating tests for encodings having __examples__ defined. """ + def _template(self): + tfile = "test-codec-%s.txt" % params['name'] + icase = params.get('ignore_case') + icdec = lambda s: s.lower() if icase in ["decode", "both"] else s + icenc = lambda s: s.lower() if icase in ["encode", "both"] else s + # first, define if only encode is used ; if so, decoding must occur right after encode tests, otherwise just + # execute the defined decode tests + dec = True + for k in params['examples'].keys(): + if k.startswith("dec"): + dec = False + # now execute tests relying on the given examples + for k, examples in params['examples'].items(): + # multiple encoding names can be given, e.g. 'enc(morse|morse-AB|...)' + m = re.match(r"(?:dec|enc|enc-dec)\((.*?)(?:\|(.*?))*\)(\*)?", k) + if m: + f1 = getattr(codecs, ["decode", "encode"][k.startswith("enc")]) + f2 = getattr(codecs, ["encode", "decode"][k.startswith("enc")]) + for ename in m.groups(): + #FIXME + if ename == "*": + # ignore mode only + continue + if ename is None: + continue + # buggy generated encoding names + try: + lookup(ename) + except LookupError: + continue + # erroneous encoding name test + if examples is None: + self.assertRaises(LookupError, f1, "test", ename) + continue + # unhandled character error tests + encmap = params.get('encmap') + if encmap and params['intype'] not in ["bin", "ord"] and not params['no_error']: + if not isinstance(encmap, list): + encmap = [encmap] + for em in encmap: + if k.startswith("dec"): + em = {v: k for k, v in em.items()} + # find one handled character and one unhandled + c1, c2 = None, None + p = list(map(ord, printable)) + for i in chain(p, set(range(256)) - set(p)): + if chr(i) in em.keys(): + c1 = chr(i) + break + for i in chain(set(range(256)) 
- set(p), p): + if chr(i) not in em.keys(): + c2 = chr(i) + break + # now check that it raises the right error or not given the selected errors handling + if c1 and c2: + sep = params['sep'][0] if len(params['sep']) > 0 else "" + self.assertRaises(ValueError, f1, c2, ename) + self.assertRaises(ValueError, f1, c2, ename, "BAD_ERRORS") + if not k.startswith("enc-dec"): + self.assertEqual(f1(c1 + c2, ename, "ignore"), f1(c1, ename)) + self.assertEqual(f1(c1 + c2, ename, "leave"), f1(c1, ename) + sep + c2) + self.assertEqual(f1(c1 + c2, ename, "replace"), f1(c1, ename) + sep + \ + params.get('repl_minlen', 1) * params['repl_char']) + # examples validation tests + incr_f1 = codecs.getincrementalencoder(ename)().encode + incr_f2 = codecs.getincrementaldecoder(ename)().decode + # - "enc-dec" tests (uses a list of values that shall remain the same after encoding and decoding, + # no matter what the encoded value is + if k.startswith("enc-dec") and isinstance(examples, list): + for e in examples[:]: + rd = re.match(r"\@(i?)random(?:\{(\d+(?:,(\d+))*?)\})?$", e) + if rd: + examples.remove(e) + for n in (rd.group(2) or "512").split(","): + s = "".join(chr(randint(0, 255)) for i in range(int(n))) + examples.append(s.lower() if rd.group(1) else s) + for s in [""] + examples: + self.assertEqual(icdec(f2(icenc(f1(s, ename)), ename)), icdec(s)) + self.assertEqual(icdec(f2(icenc(f1(b(s), ename)), ename)), b(icdec(s))) + # important note: with respect to the original design, + # IncrementalEncoder(...).encode(...) gives bytes + # IncrementalDecoder(...).encode(...) 
gives str + self.assertEqual(icdec(incr_f2(icenc(incr_f1(s, ename)), ename)), icdec(s)) + self.assertEqual(icdec(incr_f2(icenc(incr_f1(b(s), ename)), ename)), icdec(s)) + # file tests + with codecs.open(tfile, 'wb', encoding=ename) as f: + f.write(b(s)) + with codecs.open(tfile, 'rb', encoding=ename) as f: + s2 = f.read() + self.assertEqual(b(icdec(s2)), b(icdec(s))) + os.remove(tfile) + # - "enc" and "dec" tests (uses a dictionary with the value to be encoded and the expected encoded + # value) + else: + for s1, s2 in examples.items(): + # willingly erroneous tests + if s2 is None: + self.assertRaises((ValueError, NotImplementedError), f1, s1, ename) + continue + # raw text tests + self.assertEqual(icenc(f1(s1, ename)), icenc(s2)) + self.assertEqual(b(icenc(f1(s1, ename))), b(icenc(s2))) + # important note: with respect to the original design, + # IncrementalEncoder(...).encode(...) gives bytes + #self.assertEqual(icenc(incr_f1(s1, ename)), b(icenc(s2))) + #self.assertEqual(icenc(incr_f1(b(s1), ename)), b(icenc(s2))) + self.assertIsNotNone(f1(s1, ename, "replace")) + self.assertIsNotNone(f1(s1, ename, "ignore")) + if dec: + self.assertEqual(icdec(f2(s2, ename)), icdec(s1)) + self.assertEqual(b(icdec(f2(s2, ename))), b(icdec(s1))) + # important note: with respect to the original design, + # IncrementalDecoder(...).encode(...) 
gives str + #self.assertEqual(icdec(incr_f2(s2, ename)), icdec(s1)) + #self.assertEqual(icdec(incr_f2(b(s2), ename)), icdec(s1)) + self.assertIsNotNone(f2(s2, ename, "replace")) + self.assertIsNotNone(f2(s2, ename, "ignore")) + if k.startswith("enc"): + # file tests + with codecs.open(tfile, 'wb', encoding=ename) as f: + f.write(b(s1)) + with codecs.open(tfile, 'rb', encoding=ename) as f: + s = f.read() + self.assertEqual(b(icdec(f2(s2, ename))), b(icdec(s))) + os.remove(tfile) + return _template + + +class GeneratedTestCase(TestCase): + pass + + +for encoding in list_encodings(): + try: + ci = lookup(encoding) + except LookupError: + continue + # only consider codecs with __examples__ defined in their globals for dynamic tests generation + if ci.parameters.get('examples') is not None: + f = make_test(**ci.parameters) + f.__name__ = n = "test_" + encoding.replace("-", "_") + setattr(GeneratedTestCase, n, f) + diff --git a/tests/test_manual.py b/tests/test_manual.py index 6a1d09f..bed4884 100644 --- a/tests/test_manual.py +++ b/tests/test_manual.py @@ -1,172 +1,168 @@ -#!/usr/bin/env python -# -*- coding: UTF-8 -*- -"""Manual codec tests. 
- -""" -import hashlib -import os -import random -from six import binary_type, string_types -from unittest import TestCase - -from codext.__common__ import * -from codext.binary.baudot import _check_alphabet -from codext.hashing.checksums import CRC - - -class ComplementaryTestCase(TestCase): - def test_codec_baudot(self): - self.assertRaises(ValueError, _check_alphabet, ["BAD_ALPHABET"]) - - def test_codec_dna(self): - self.assertEqual(codecs.decode("ABC", "dna-1", errors="ignore"), "\x02") - self.assertEqual(codecs.decode("ABC", "dna-2", errors="replace"), "[00??01]") - - def test_codec_morse(self): - self.assertRaises(LookupError, codecs.encode, "test", "morse-AAB") - - def test_codec_sms(self): - self.assertEqual(codecs.decode("A-B-222-3-4-5", "sms", "leave"), "ABcdgj") - - -class ManualTestCase(TestCase): - def test_codec_affine(self): - STR = "this is a test" - AFF1 = "vjkubkubcbvguv" - self.assertRaises(LookupError, codecs.encode, STR, "affine-BAD") - self.assertRaises(LookupError, codecs.encode, STR, "affine-?l?u-BAD") - # uses by default an alphabet with lowercase, uppercase, whitespace and parameters a=1 and b=2 - self.assertEqual(codecs.encode(STR, "affine"), codecs.encode(STR, "affine-?l?u?s-1,2")) - self.assertEqual(codecs.encode(STR, "affine"), AFF1) - self.assertEqual(codecs.encode(b(STR), "affine"), b(AFF1)) - self.assertEqual(codecs.decode(AFF1, "affine"), STR) - self.assertEqual(codecs.decode(b(AFF1), "affine"), b(STR)) - AFF2 = "ORWJdWJdidOCJO" - self.assertEqual(codecs.encode(STR, "affine-?l?u?d?s-5,8"), AFF2) - self.assertEqual(codecs.encode(b(STR), "affine-?l?u?d?s-5,8"), b(AFF2)) - self.assertEqual(codecs.decode(AFF2, "affine-?l?u?d?s-5,8"), STR) - self.assertEqual(codecs.decode(b(AFF2), "affine-?l?u?d?s-5,8"), b(STR)) - AFF3 = "QsuOcuOcecQmOQ" - self.assertEqual(codecs.encode(STR, "affine-?l?u?d?s-2,4"), AFF3) - self.assertEqual(codecs.encode(b(STR), "affine-?l?u?d?s-2,4"), b(AFF3)) - self.assertEqual(codecs.decode(AFF3, 
"affine-?l?u?d?s-2,4"), STR) - self.assertEqual(codecs.decode(b(AFF3), "affine-?l?u?d?s-2,4"), b(STR)) - self.assertRaises(ValueError, codecs.decode, ".BAD.", "affine-?l?u?d?s-2,4") - self.assertIsNotNone(codecs.encode("TEST", "affine_?u-1,2")) - # example of parameters that cause mapping collisions - self.assertRaises(LookupError, codecs.encode, STR, "affine-?l?u?d?s-6,8") - - def test_codec_atbash(self): - STR = "This is a test" - ATB1 = "Gsrh rh z gvhg" - self.assertIsNotNone(codecs.encode("test", "atbash-whatevers")) - # uses by default an alphabet with lowercase and uppercase - self.assertEqual(codecs.encode(STR, "atbash"), codecs.encode(STR, "atbash-?l?u")) - self.assertNotEqual(codecs.encode(STR, "atbash"), codecs.encode(STR, "atbash-[?l?u]")) - self.assertEqual(codecs.encode(STR, "atbash_cipher"), ATB1) - self.assertEqual(codecs.encode(b(STR), "atbash-cipher"), b(ATB1)) - self.assertEqual(codecs.decode(ATB1, "atbash"), STR) - self.assertEqual(codecs.decode(b(ATB1), "atbash"), b(STR)) - ATB2 = "N^]/a]/a a.{/." 
- self.assertEqual(codecs.encode(STR, "atbash-[?l?u?p?s]"), ATB2) - self.assertEqual(codecs.encode(b(STR), "atbash_cipher-[?l?u?p?s]"), b(ATB2)) - self.assertEqual(codecs.decode(ATB2, "atbash-[?l?u?p?s]"), STR) - self.assertEqual(codecs.decode(b(ATB2), "atbash_cipher-[?l?u?p?s]"), b(STR)) - - def test_codec_case_related_manips(self): - STR = "This is a test" - self.assertEqual(codecs.encode(STR, "lower"), "this is a test") - self.assertEqual(codecs.encode(b(STR), "uppercase"), b("THIS IS A TEST")) - self.assertEqual(codecs.encode(STR, "capitalize"), "This is a test") - self.assertEqual(codecs.decode(b(STR), "capitalize"), b("this is a test")) - self.assertEqual(codecs.encode(STR, "title"), "This Is A Test") - self.assertEqual(codecs.decode(b(STR), "title"), b("this is a test")) - self.assertEqual(codecs.encode(b(STR), "swapcase"), b("tHIS IS A TEST")) - self.assertEqual(codecs.encode(b(STR), "camelcase"), b("thisIsATest")) - self.assertEqual(codecs.encode(b(STR), "kebabcase"), b("this-is-a-test")) - self.assertEqual(codecs.encode(b(STR), "pascalcase"), b("ThisIsATest")) - self.assertEqual(codecs.encode(b(STR), "slugify"), b("this-is-a-test")) - self.assertEqual(codecs.encode(b(STR), "snakecase"), b("this_is_a_test")) - self.assertRaises(NotImplementedError, codecs.decode, STR, "camel") - self.assertRaises(NotImplementedError, codecs.decode, STR, "pascal") - self.assertRaises(NotImplementedError, codecs.decode, STR, "slug") - self.assertRaises(NotImplementedError, codecs.decode, STR, "snake") - - def test_codec_dummy_str_manips(self): - STR = "this is a test" - self.assertEqual(codecs.decode(STR, "reverse"), "tset a si siht") - self.assertEqual(codecs.decode(STR, "reverse_words"), "siht si a tset") - self.assertEqual(codecs.decode(STR.split()[0], "reverse"), codecs.decode(STR.split()[0], "reverse-words")) - self.assertEqual(codecs.encode(STR, "replace-i1"), STR.replace("i", "1")) - self.assertEqual(codecs.decode(STR.replace("i", "1"), "replace-1i"), STR) - 
self.assertEqual(codecs.encode(STR, "substitute-this/that"), STR.replace("this", "that")) - self.assertEqual(codecs.decode(STR.replace("this", "that"), "substitute-that/this"), STR) - self.assertEqual(codecs.encode(STR, "tokenize-2"), "th is i s a te st") - self.assertRaises(LookupError, codecs.encode, STR, "tokenize-200") - - def test_codec_hash_functions(self): - STR = b"This is a test string!" - for h in ["adler32", "md2", "md5", "sha1", "sha224", "sha256", "sha384", "sha512"]: - self.assertIsNotNone(codecs.encode(STR, h)) - self.assertRaises(NotImplementedError, codecs.decode, STR, h) - if PY3: - self.assertEqual(len(codecs.encode(STR, "blake2b_64")), 128) - self.assertRaises(LookupError, codecs.encode, STR, "blake2b_0") - self.assertRaises(LookupError, codecs.encode, STR, "blake2b-65") - self.assertRaises(NotImplementedError, codecs.decode, STR, "blake2b") - self.assertEqual(len(codecs.encode(STR, "blake2s_32")), 64) - self.assertRaises(LookupError, codecs.encode, STR, "blake2s_0") - self.assertRaises(LookupError, codecs.encode, STR, "blake2s-33") - self.assertRaises(NotImplementedError, codecs.decode, STR, "blake2s") - self.assertIsNotNone(codecs.encode(STR, "shake128")) - self.assertRaises(LookupError, codecs.encode, STR, "shake128_0") - self.assertRaises(NotImplementedError, codecs.decode, STR, "shake128") - self.assertIsNotNone(codecs.encode(STR, "shake256")) - self.assertRaises(LookupError, codecs.encode, STR, "shake256-0") - self.assertRaises(NotImplementedError, codecs.decode, STR, "shake256") - for h in ["sha3_224", "sha3_256", "sha3_384", "sha3_512"]: - self.assertIsNotNone(codecs.encode(STR, h)) - self.assertRaises(NotImplementedError, codecs.decode, STR, h) - if UNIX: - import crypt - METHODS = [x[7:].lower() for x in crypt.__dict__ if x.startswith("METHOD_")] - for m in METHODS: - h = "crypt-" + m - self.assertIsNotNone(codecs.encode(STR, h)) - self.assertRaises(NotImplementedError, codecs.decode, STR, h) - # CRC checks - STR = "123456789" - for n, 
variants in CRC.items(): - for name, params in variants.items(): - enc = ("crc%d-%s" % (n, name) if isinstance(n, int) else "crc-%s" % name).rstrip("-") - print(enc) - self.assertEqual(codecs.encode(STR, enc), "%0{}x".format(round((n or 16)/4+.5)) % params[5]) - - def test_codec_markdown(self): - HTM = "

Test title

\n\n

Test paragraph

\n" - MD = "# Test title\n\nTest paragraph" - TFILE = "test-codec-markdown.html" - self.assertTrue(isinstance(codecs.encode(MD, "markdown"), string_types)) - self.assertTrue(not PY3 or isinstance(codecs.encode(b(MD), "markdown"), binary_type)) - self.assertEqual(codecs.encode(MD, "markdown"), HTM) - self.assertRaises(NotImplementedError, codecs.decode, MD, "markdown") - with codecs.open(TFILE, 'w', encoding="markdown") as f: - f.write(b(MD)) - with codecs.open(TFILE) as f: - s = f.read() - self.assertEqual(HTM, ensure_str(s)) - os.remove(TFILE) - - def test_codec_whitespace_after_before(self): - STR = "test" - for i in range(100): - c = "whitespace{}{}*after{}{}*before".format("-+"[random.randint(0, 1)], random.randint(1, 3), - "-+"[random.randint(0, 1)], random.randint(1, 3)) - self.assertEqual(codecs.decode("\n" + codecs.encode(STR, c) + "\n", c), STR) - # in this special case, the whitespaces between words cannot be encoded because: - # - ord(" ") == 32 - # - the next minimal value in the printable characters excluding the latest 6 is ord("!") == 33 - # and therefore ord(" ")-random(0,20)-random(0,20) will never fall into the valid ordinals ! - self.assertRaises(ValueError, codecs.encode, "this is a test", "whitespace-after-before") - self.assertIn("\x00", codecs.encode("this is a test", "whitespace-after-before", "replace")) - +#!/usr/bin/env python +# -*- coding: UTF-8 -*- +"""Manual codec tests. 
+ +""" +import os +import random +from unittest import TestCase + +from codext.__common__ import * +from codext.binary.baudot import _check_alphabet +from codext.hashing.checksums import CRC + + +class ComplementaryTestCase(TestCase): + def test_codec_baudot(self): + self.assertRaises(ValueError, _check_alphabet, ["BAD_ALPHABET"]) + + def test_codec_dna(self): + self.assertEqual(codecs.decode("ABC", "dna-1", errors="ignore"), "\x02") + self.assertEqual(codecs.decode("ABC", "dna-2", errors="replace"), "[00??01]") + + def test_codec_morse(self): + self.assertRaises(LookupError, codecs.encode, "test", "morse-AAB") + + def test_codec_sms(self): + self.assertEqual(codecs.decode("A-B-222-3-4-5", "sms", "leave"), "ABcdgj") + + +class ManualTestCase(TestCase): + def test_codec_affine(self): + STR = "this is a test" + AFF1 = "vjkubkubcbvguv" + self.assertRaises(LookupError, codecs.encode, STR, "affine-BAD") + self.assertRaises(LookupError, codecs.encode, STR, "affine-?l?u-BAD") + # uses by default an alphabet with lowercase, uppercase, whitespace and parameters a=1 and b=2 + self.assertEqual(codecs.encode(STR, "affine"), codecs.encode(STR, "affine-?l?u?s-1,2")) + self.assertEqual(codecs.encode(STR, "affine"), AFF1) + self.assertEqual(codecs.encode(b(STR), "affine"), b(AFF1)) + self.assertEqual(codecs.decode(AFF1, "affine"), STR) + self.assertEqual(codecs.decode(b(AFF1), "affine"), b(STR)) + AFF2 = "ORWJdWJdidOCJO" + self.assertEqual(codecs.encode(STR, "affine-?l?u?d?s-5,8"), AFF2) + self.assertEqual(codecs.encode(b(STR), "affine-?l?u?d?s-5,8"), b(AFF2)) + self.assertEqual(codecs.decode(AFF2, "affine-?l?u?d?s-5,8"), STR) + self.assertEqual(codecs.decode(b(AFF2), "affine-?l?u?d?s-5,8"), b(STR)) + AFF3 = "QsuOcuOcecQmOQ" + self.assertEqual(codecs.encode(STR, "affine-?l?u?d?s-2,4"), AFF3) + self.assertEqual(codecs.encode(b(STR), "affine-?l?u?d?s-2,4"), b(AFF3)) + self.assertEqual(codecs.decode(AFF3, "affine-?l?u?d?s-2,4"), STR) + self.assertEqual(codecs.decode(b(AFF3), 
"affine-?l?u?d?s-2,4"), b(STR)) + self.assertRaises(ValueError, codecs.decode, ".BAD.", "affine-?l?u?d?s-2,4") + self.assertIsNotNone(codecs.encode("TEST", "affine_?u-1,2")) + # example of parameters that cause mapping collisions + self.assertRaises(LookupError, codecs.encode, STR, "affine-?l?u?d?s-6,8") + + def test_codec_atbash(self): + STR = "This is a test" + ATB1 = "Gsrh rh z gvhg" + self.assertIsNotNone(codecs.encode("test", "atbash-whatevers")) + # uses by default an alphabet with lowercase and uppercase + self.assertEqual(codecs.encode(STR, "atbash"), codecs.encode(STR, "atbash-?l?u")) + self.assertNotEqual(codecs.encode(STR, "atbash"), codecs.encode(STR, "atbash-[?l?u]")) + self.assertEqual(codecs.encode(STR, "atbash_cipher"), ATB1) + self.assertEqual(codecs.encode(b(STR), "atbash-cipher"), b(ATB1)) + self.assertEqual(codecs.decode(ATB1, "atbash"), STR) + self.assertEqual(codecs.decode(b(ATB1), "atbash"), b(STR)) + ATB2 = "N^]/a]/a a.{/." + self.assertEqual(codecs.encode(STR, "atbash-[?l?u?p?s]"), ATB2) + self.assertEqual(codecs.encode(b(STR), "atbash_cipher-[?l?u?p?s]"), b(ATB2)) + self.assertEqual(codecs.decode(ATB2, "atbash-[?l?u?p?s]"), STR) + self.assertEqual(codecs.decode(b(ATB2), "atbash_cipher-[?l?u?p?s]"), b(STR)) + + def test_codec_case_related_manips(self): + STR = "This is a test" + self.assertEqual(codecs.encode(STR, "lower"), "this is a test") + self.assertEqual(codecs.encode(b(STR), "uppercase"), b("THIS IS A TEST")) + self.assertEqual(codecs.encode(STR, "capitalize"), "This is a test") + self.assertEqual(codecs.decode(b(STR), "capitalize"), b("this is a test")) + self.assertEqual(codecs.encode(STR, "title"), "This Is A Test") + self.assertEqual(codecs.decode(b(STR), "title"), b("this is a test")) + self.assertEqual(codecs.encode(b(STR), "swapcase"), b("tHIS IS A TEST")) + self.assertEqual(codecs.encode(b(STR), "camelcase"), b("thisIsATest")) + self.assertEqual(codecs.encode(b(STR), "kebabcase"), b("this-is-a-test")) + 
self.assertEqual(codecs.encode(b(STR), "pascalcase"), b("ThisIsATest")) + self.assertEqual(codecs.encode(b(STR), "slugify"), b("this-is-a-test")) + self.assertEqual(codecs.encode(b(STR), "snakecase"), b("this_is_a_test")) + self.assertRaises(NotImplementedError, codecs.decode, STR, "camel") + self.assertRaises(NotImplementedError, codecs.decode, STR, "pascal") + self.assertRaises(NotImplementedError, codecs.decode, STR, "slug") + self.assertRaises(NotImplementedError, codecs.decode, STR, "snake") + + def test_codec_dummy_str_manips(self): + STR = "this is a test" + self.assertEqual(codecs.decode(STR, "reverse"), "tset a si siht") + self.assertEqual(codecs.decode(STR, "reverse_words"), "siht si a tset") + self.assertEqual(codecs.decode(STR.split()[0], "reverse"), codecs.decode(STR.split()[0], "reverse-words")) + self.assertEqual(codecs.encode(STR, "replace-i1"), STR.replace("i", "1")) + self.assertEqual(codecs.decode(STR.replace("i", "1"), "replace-1i"), STR) + self.assertEqual(codecs.encode(STR, "substitute-this/that"), STR.replace("this", "that")) + self.assertEqual(codecs.decode(STR.replace("this", "that"), "substitute-that/this"), STR) + self.assertEqual(codecs.encode(STR, "tokenize-2"), "th is i s a te st") + self.assertRaises(LookupError, codecs.encode, STR, "tokenize-200") + + def test_codec_hash_functions(self): + STR = b"This is a test string!" 
+ for h in ["adler32", "md2", "md5", "sha1", "sha224", "sha256", "sha384", "sha512"]: + self.assertIsNotNone(codecs.encode(STR, h)) + self.assertRaises(NotImplementedError, codecs.decode, STR, h) + self.assertEqual(len(codecs.encode(STR, "blake2b_64")), 128) + self.assertRaises(LookupError, codecs.encode, STR, "blake2b_0") + self.assertRaises(LookupError, codecs.encode, STR, "blake2b-65") + self.assertRaises(NotImplementedError, codecs.decode, STR, "blake2b") + self.assertEqual(len(codecs.encode(STR, "blake2s_32")), 64) + self.assertRaises(LookupError, codecs.encode, STR, "blake2s_0") + self.assertRaises(LookupError, codecs.encode, STR, "blake2s-33") + self.assertRaises(NotImplementedError, codecs.decode, STR, "blake2s") + self.assertIsNotNone(codecs.encode(STR, "shake128")) + self.assertRaises(LookupError, codecs.encode, STR, "shake128_0") + self.assertRaises(NotImplementedError, codecs.decode, STR, "shake128") + self.assertIsNotNone(codecs.encode(STR, "shake256")) + self.assertRaises(LookupError, codecs.encode, STR, "shake256-0") + self.assertRaises(NotImplementedError, codecs.decode, STR, "shake256") + for h in ["sha3_224", "sha3_256", "sha3_384", "sha3_512"]: + self.assertIsNotNone(codecs.encode(STR, h)) + self.assertRaises(NotImplementedError, codecs.decode, STR, h) + if UNIX: + import crypt + METHODS = [x[7:].lower() for x in crypt.__dict__ if x.startswith("METHOD_")] + for m in METHODS: + h = "crypt-" + m + self.assertIsNotNone(codecs.encode(STR, h)) + self.assertRaises(NotImplementedError, codecs.decode, STR, h) + # CRC checks + STR = "123456789" + for n, variants in CRC.items(): + for name, params in variants.items(): + enc = ("crc%d-%s" % (n, name) if isinstance(n, int) else "crc-%s" % name).rstrip("-") + print(enc) + self.assertEqual(codecs.encode(STR, enc), "%0{}x".format(round((n or 16)/4+.5)) % params[5]) + + def test_codec_markdown(self): + HTM = "

Test title

\n\n

Test paragraph

\n" + MD = "# Test title\n\nTest paragraph" + TFILE = "test-codec-markdown.html" + self.assertTrue(isinstance(codecs.encode(MD, "markdown"), str)) + self.assertEqual(codecs.encode(MD, "markdown"), HTM) + self.assertRaises(NotImplementedError, codecs.decode, MD, "markdown") + with codecs.open(TFILE, 'w', encoding="markdown") as f: + f.write(b(MD)) + with codecs.open(TFILE) as f: + s = f.read() + self.assertEqual(HTM, ensure_str(s)) + os.remove(TFILE) + + def test_codec_whitespace_after_before(self): + STR = "test" + for i in range(100): + c = "whitespace{}{}*after{}{}*before".format("-+"[random.randint(0, 1)], random.randint(1, 3), + "-+"[random.randint(0, 1)], random.randint(1, 3)) + self.assertEqual(codecs.decode("\n" + codecs.encode(STR, c) + "\n", c), STR) + # in this special case, the whitespaces between words cannot be encoded because: + # - ord(" ") == 32 + # - the next minimal value in the printable characters excluding the latest 6 is ord("!") == 33 + # and therefore ord(" ")-random(0,20)-random(0,20) will never fall into the valid ordinals ! 
+ self.assertRaises(ValueError, codecs.encode, "this is a test", "whitespace-after-before") + self.assertIn("\x00", codecs.encode("this is a test", "whitespace-after-before", "replace")) + From c1d268d7be7d58cdd315f1f69047652af811a7ce Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 27 Apr 2023 23:46:30 +0000 Subject: [PATCH 76/97] Updated coverage.svg --- docs/coverage.svg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/coverage.svg b/docs/coverage.svg index 78f9f98..3033e1b 100644 --- a/docs/coverage.svg +++ b/docs/coverage.svg @@ -1 +1 @@ -coverage: 99.03%coverage99.03% \ No newline at end of file +coverage: 99.16%coverage99.16% \ No newline at end of file From b643181673d14e41ba41399bdd315a7cacf71692 Mon Sep 17 00:00:00 2001 From: dhondta Date: Sat, 27 May 2023 23:13:33 +0200 Subject: [PATCH 77/97] Refined documentation --- docs/mkdocs.yml | 112 ++++++++++++++++++++------------------- docs/pages/css/extra.css | 26 +++++++++ 2 files changed, 83 insertions(+), 55 deletions(-) create mode 100644 docs/pages/css/extra.css diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index a39ccb0..387710b 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -1,55 +1,57 @@ -site_author: dhondta -site_name: "Codext - Extension of native codecs for Python" -repo_url: https://github.com/dhondta/python-codext -copyright: Copyright © 2021-2023 Alexandre D'Hondt -docs_dir: pages -nav: - - Introduction: index.md - - Features: features.md - - 'Guess mode': guessing.md - - Encodings: - - Base: enc/base.md - - Binary: enc/binary.md - - Common: enc/common.md - - Compressions: enc/compressions.md - - Cryptography: enc/crypto.md - - Hashing: enc/hashing.md - - Languages: enc/languages.md - - Others: enc/others.md - - Steganography: enc/stegano.md - - 'String manipulations': manipulations.md - - 'CLI tool': cli.md - - 'Create your codec': howto.md -extra: - generator: false - social: - - icon: fontawesome/solid/paper-plane - link: 
mailto:alexandre.dhondt@gmail.com - name: Contact Alex - - icon: fontawesome/brands/github - link: https://github.com/dhondta - name: Alex on GitHub - - icon: fontawesome/brands/linkedin - link: https://www.linkedin.com/in/alexandre-d-2ab2aa14/ - name: Alex on LinkedIn - - icon: fontawesome/brands/twitter - link: https://twitter.com/alex_dhondt - name: Alex on Twitter -theme: - name: material - palette: - - scheme: default - toggle: - icon: material/brightness-7 - name: Switch to dark mode - - scheme: slate - toggle: - icon: material/brightness-4 - name: Switch to light mode - logo: img/logo.png - favicon: img/icon.png -use_directory_urls: false -markdown_extensions: - - toc: - permalink: true - - admonition +site_author: dhondta +site_name: "Codext - Extension of native codecs for Python" +repo_url: https://github.com/dhondta/python-codext +copyright: Copyright © 2021-2023 Alexandre D'Hondt +docs_dir: pages +nav: + - Introduction: index.md + - Features: features.md + - 'Guess mode': guessing.md + - Encodings: + - Base: enc/base.md + - Binary: enc/binary.md + - Common: enc/common.md + - Compressions: enc/compressions.md + - Cryptography: enc/crypto.md + - Hashing: enc/hashing.md + - Languages: enc/languages.md + - Others: enc/others.md + - Steganography: enc/stegano.md + - 'String manipulations': manipulations.md + - 'CLI tool': cli.md + - 'Create your codec': howto.md +extra: + generator: false + social: + - icon: fontawesome/solid/paper-plane + link: mailto:alexandre.dhondt@gmail.com + name: Contact Alex + - icon: fontawesome/brands/github + link: https://github.com/dhondta + name: Alex on GitHub + - icon: fontawesome/brands/linkedin + link: https://www.linkedin.com/in/alexandre-d-2ab2aa14/ + name: Alex on LinkedIn + - icon: fontawesome/brands/twitter + link: https://twitter.com/alex_dhondt + name: Alex on Twitter +extra_css: + - css/extra.css +theme: + name: material + palette: + - scheme: default + toggle: + icon: material/brightness-7 + name: Switch to dark 
mode + - scheme: slate + toggle: + icon: material/brightness-4 + name: Switch to light mode + logo: img/logo.png + favicon: img/icon.png +use_directory_urls: false +markdown_extensions: + - toc: + permalink: true + - admonition diff --git a/docs/pages/css/extra.css b/docs/pages/css/extra.css new file mode 100644 index 0000000..c78f454 --- /dev/null +++ b/docs/pages/css/extra.css @@ -0,0 +1,26 @@ +/* Full width (only works for some themes, including 'material') */ +@media only screen and (min-width: 76.25em) { + .md-main__inner { + max-width: none; + } + .md-sidebar--primary { + left: 0; + } + .md-sidebar--secondary { + right: 0; + margin-left: 0; + -webkit-transform: none; + transform: none; + } +} + +/* See https://github.com/mkdocs/mkdocs/wiki/MkDocs-Recipes */ +/* Add Support for Checkbox Lists */ +.task-list-item { + list-style-type: none; +} + +.task-list-item input { + margin: 0 4px 0.25em -20px; + vertical-align: middle; +} From 8eed486279e6f2a531e30fabf8772d06a56fcc50 Mon Sep 17 00:00:00 2001 From: dhondta Date: Fri, 8 Sep 2023 16:06:21 +0200 Subject: [PATCH 78/97] Fixed #7 --- src/codext/VERSION.txt | 2 +- src/codext/__common__.py | 3 +-- src/codext/__init__.py | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/codext/VERSION.txt b/src/codext/VERSION.txt index d3fbbb2..37e98a8 100644 --- a/src/codext/VERSION.txt +++ b/src/codext/VERSION.txt @@ -1 +1 @@ -1.15.0 +1.15.1 diff --git a/src/codext/__common__.py b/src/codext/__common__.py index a2ff0ef..cb32c75 100644 --- a/src/codext/__common__.py +++ b/src/codext/__common__.py @@ -1398,8 +1398,7 @@ def __score(prev_input, input, prev_encoding, encoding, codec, heuristic=False, except TypeError: expf = expf(f) if isinstance(expf, (int, float)): - tmp = expf - expf = (1/f - .1 <= 1/expf <= 1/f + .1) + expf = 1/f - .1 <= 1/expf <= 1/f + .1 elif isinstance(expf, (tuple, list)) and len(expf) == 2: expf = 1/f - expf[1] <= 1/expf[0] <= 1/f + expf[1] s += [-1., .1][expf] diff --git 
a/src/codext/__init__.py b/src/codext/__init__.py index 67d6b5a..2a37ebe 100644 --- a/src/codext/__init__.py +++ b/src/codext/__init__.py @@ -227,7 +227,7 @@ def _format_action_invocation(self, action): else: print(ensure_str(c or "Could not %scode :-(" % ["en", "de"][args.command == "decode"]), end="") elif args.command == "guess": - s, lb = args.stop_function, args.lang_backend + s, lb = args.stop_function, getattr(args, "lang_backend", "none") if re.match(r"lang_[a-z]{2}$", s) and lb != "none" and \ all(re.match(r"lang_[a-z]{2}$", x) is None for x in dir(stopfunc)): stopfunc._reload_lang(lb) From 10487853b55c0434bf52465ca92cee0616965cc9 Mon Sep 17 00:00:00 2001 From: dhondta Date: Thu, 28 Sep 2023 23:59:51 +0200 Subject: [PATCH 79/97] Fixed documentation --- .readthedocs.yml | 5 +++++ docs/pages/img/logo.png | Bin 21838 -> 15408 bytes 2 files changed, 5 insertions(+) diff --git a/.readthedocs.yml b/.readthedocs.yml index 0e991f8..aca74b8 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -1,5 +1,10 @@ version: 2 +build: + os: "ubuntu-22.04" + tools: + python: "3.11" + mkdocs: configuration: docs/mkdocs.yml diff --git a/docs/pages/img/logo.png b/docs/pages/img/logo.png index d14178df4385b6f0b9635f3fb79ce7d2a4361d3d..a1827f847b39fc1358e580c291e151d0f89b3bff 100644 GIT binary patch literal 15408 zcmYkj2VBkX7dYM$GBOh)BO%n?L!aXAz4x`cd+&WIDLXWbP?@1*luf9tj1Yy$&KAjv zME`fc@9+Qm|L^N|pL3sap0l3woOABGRct0{K-j3TfPjDj6f&L*w3~bW`h)_Xkr&2| z4+t1~)JYIKOMaDxBJxge0K7&39f&*Zq&Y%Jsl1gE)^ej=@WuADQ z1(*(o$D`t5Ko?GHx0rxQL^L=a6c2+TA@N{D&%po32@wp9N5uE=!;`6HcHRGPx1LUw zQ>OVJgI=MY!N5T9N{S53=HmTutB&wL#@s5qLkCFZ|H$b7-MeS8(`QxvuUDyZ>l6SX zkV#MiAX}wQrm@RRz!cs86`zVvMRVm^vz*W1AXzS%ScZ1}54%|gaPWUnvjGXt0RImO zQfL&*Cqb}In!ybNM1+N491aEsEx{nzRsxrY$5M$5mDA{B^2pE zx}XlC2CHMrrEZp8YEjwj2&V+jRf7aHFb;_nTU{KJ9;FdsJ#GP!Er)4v7%v`2*E@V{ zDS?9F0_Zq!3a}{-4uBFY0&CU--*jl0m!)**F*3KAqc9;P3P;bdorLFt$sDnsi1OQe zD3!w@I49(R0GqHFq+JSLZjep z2|dho^^Xv`Bwiv=L_xp`)f+iro{Q6euKqg7el 
zG%p>`AgVD!V4e&G7U}`hB&sC}saj4$$jw?fgib~ZL3lNvEtJSPY)lWm8YGo!k|20c zu^me%La-nWO{f?0kwgz#1N92fObs96QwU`uwB1E=&>>JFg3Bj+%t#AJZ37I(1}6(? zcrpq=r4;FDRx?_OL0JGHVo^PPYAuQB;ln^!5`qWPx=3&oU+q+rfDyp-)IH`b;G>y5 z76C65GT9P2f=v@h0AXQGDkw!rg(IOpm|6=HtMOJq8(1c=Km$kckrX2v)gwCxMrmOa zpgy-B!$$R}NsdM1Y*49FidI++Xt$V#XF=r>rCtc9L)BiU2?2qVl|m^6tJAquWVl!? zb?8ZUnh6U9h$504vL5LxMR*#WX&_3_fT*#0j~_4@U=$=a;!Q}vkRg66oWns#y)qjV zE2R;wBr*n$pvh?>g~@^w!C?$wHkPf&BaJeG1P6qb&8A> zC7tX<*(ekiTuNmb$t)Xzj$t!lXs*(#anj`sm5xnxf{09%*g?We7#c6w;86Pc%pQ&i zV244eu^6Bdhgu<^%B4yUUk11;BGYT78E6WJ3a?RNVSX)_NCY!Ma0^P}(h*QxGgk!T zB0WaG55f0feMqWDE%A^M1|9~a1e3fzy56tj(=`eeL1bd;2x1P5Ds{pPE`^3j(xBV| zw@9G@X&?Z9G^1LA(Lp?971+ZfNRU{J0_N*+hDIU}AkgA~u}yregbHUFU~D5@CuX}W z7#BfdaRNq-RV)2o3J9&o!2ug1Qt28fkO+*Trgm1eL_< zkw94D9@VPd2nyg|v1*XP#Xw*+Jg^CC6e7fK4BLls2-!9P2J0cS99SL0MAC}@^n55u zqZbkc5I2|Ya=MUA8c*R@DBV&%O+m!L6)q!HDduqKXbgZBFA;GCE*Jx(rLd(~tqkIm z8xTqxPc0#PWJ;}_fYf6oP=*zw2mBjRq|_qGfQzxJoB(Y^gHnaiX`xm!1dfFW*c_Ty zPa?}8a*$iWVT*}+kzQ+%GiYG1+C}AaAt<>=OxAONg=&qRf)Y7tasLF zqm%|3AORwm2*Io65VW7ba4?y8qgY6_5n*|2p%mR;_4KT*hATV?&)}qvF$OaFKX=5>Qz z-k>JC1Qr35GNQ~2#d(B$ImN(GDUCKcgu^x(Su&wn&XEa;Toxb6 zmkX={ECs0+YaI%qmnKtlWK4;J0}{bG0vgtc;lSxAD%Ys+i;Z>}-OhLD;1&elCUcnJ zd?`RA)PhziSwaMvha&dSqXo_c*6348by|*8f??axLblvu7a(XNGTCB80+)-@is4(? 
z2tUE6QJP(Fv)`;hQkZnFhbYl7!~&0=Ec9~8CacTfkt3O zV899m7;N&}EC!84=alk!P^Zq}QQJ&(BHAIb$c07&O#-qpgf2IMB2p~Jk>EVAP-x*h%oa9;%JP`}207h= zaTx6kHJ}Z5=00joZoXVFhzJH+(K3|!C)iQ zAyfG62$V_9bXzbQyHamgVGUrMSgsNKgiI>Zs)55v0-{4~)-z~m7agjA0K%6*$yT~m zNpgBYSfdnY0BJE|lF-iep%f|!-h{)*JustMqbI8DV57-vws`C)wq9f>sO)O4S*(^~ zr3kB<4uiOqJQvc;P~(s^KZY*w;4yrbU(2(Rh%Ob!B{tg;G^*67kPBTl2t+AD868#@ zQ%F)%G(-+psk0$WJ~J2MLqf=a2?`lh7u`iwBj6AQyC+r>!yyDFOY2mdFc6|uO5mcL z5*yvcra75Tl9`T2DY06J%jISHxe&b526%-Yzssg+fGZ9QrcvQ|3POsCKXohJtgPRA6&A4<)kFuy`4qY!EW7Js||jZSf+cRF{uPHn=%lnZaei zQe_G-6_`MyU|~v;OO2y5p-Kpoit~x!a-&+KA` zMMK9is6G!$52D!FNQhjfv3a~Olh30f*=!D*4P>!X%vunM%Jh1TA`ej`=2*mfmER6* zEkYtOzyT+iSx}^e!+ri~C)Mf`($U)#eW{E?ha`3St zm5oJkOFS}&Q*ALq6<#J%D|DevN|KAmg=no(nFq`qg&F+>kXyj#p>-l5Pi(}pg?7Nu!7?P4;MV!>z8n zGJ=i3rP2_7qsQmP_861Y=K-OlAUVh)@OV6Y23bP{Ni0SXLFk(V7j#|1xV?$_C(xNfrx}L!N7P11T6E3>>>t3L{w667L=KSql%;~ zpN7dJ%1s8bP)DYlts1q;PX!KAPq4za8f*lH97EuF9cGE#4+jBP3eYd^34Ub+7$0s0 zapgXTmWwo);eNZxiuLnxCauUT(it^EAiu+sEd;t2Xi;RQnMMMe=vuzigtz!88oPjO zbSMaPB1{A#6HF+!OD@nlDIT+5r!Zl82tSrX)!QLDuTMmQLkt`_4T2=nlys&8;$vc6 zY!VkGGxVHoAcQ1}Jqj>LB<4Bje1u+SWeUkk4v6NIS-34V?eP)YKNwZMQ zJT0B=SAwN3E5Ra%3AjLljHP?YOryahMgg#2&*dWpDuvIa(NY;85J#-0vW;Au4N8|1 zg%G9?Ok{{aQZ5%rcZeWvPig`}s@%Y3&gJNlT$ljpW@?paBNEvIxs`T^+od8ZH8Q6H z=)hx*2C@{);u|RrtO#SG`CTrI4(YO>P!_OB=VBqHZW7WdqA+w=p${R_Fb!O!k%eM2 zoL~i%>eng(svsYzn~rVu4$kDA(XT{+=X7X~0v&Kqrw;LFx6tMXH1H7;;LFR)AiLx5r;I z-6XX~j@Ci6IuykKvH}B8J|Flr@s)Ie*kMFlSRS%X&jqNL$uKN6mqkLdfpH)y&?}%~ zH{R>=Fx_;nAIWt06eqB7l@kfqLWM}MK!PBOU1$}{tny)vXq%m8)6$=z_NMeRU={2-N^ z&*>>00MoHNH4@|2(y&xBOe2OVq$ZnzZE>miW|YI`_V`Gg9=vP>lR*XmJ;kFD09iZH z(IW>Q8f3yKtxO95w!6T70;+a5(QK@kBD~S)L<1(rSA#3^g@pX4WP2KxLOjdM>t9!7)>K!$Qq6l z5HgV?f#`e=x(Ch0qeM0~jHbq^JU~u})tXEOhr?@gAn4$pEQDZpNYN&P$jU-1+(x}# zN9*x`77~Gt!qO~$6s(6gEQtuBYsn~XkJ7DV5FTi3aw>>M?QxwrKMh6G_CQ=V8YEWR zKu`ciNhnw^tcO@BNPB4C=Sg(-B{3 zaDP1=y}Pc+T~!Emyr~;GaKc`=;r^vpuU>sWar9`G(`xsnwLAH&^x)@(ude34A2@JeYFAg+ z)*DJlZ~X@8!O+^IDUW`7K8_CG!!515e*MfIeq8qV_YIn)THA{LflNzvqu#ae?Af!W 
zVaw!lm1>WNKv=fn@a7#yPM%!Bk6VVHAj{EZsL8>mw#{iF8I_dyDBm=g2InWZ$6#0qp+~>`MY<0 zw$Njb&7YIpFFJ4;cFB<=kPVwQL7q>sMeTEYJhwZ(eti`BoVKoo`SHW@aR6yQUIH&P z@JfFD`ZayP%*4ctH*c=3QPILX`V3MoTefV<)Ty#DfUrcJzkas!*6@ZUYNYEAR*M=g2$cy2xJ+Rd9YW`RIC!oqK;b%_fW z79R`RD=eE6B;LMh)BQswC0C1LKLiE_T3xQ=Ymb>in)(bIK79PdiJ3D(=gyngFC#Ov z@1gfmwNL7!c6A2)89!r&iJX>pKEJ4FOh!hAG44*NuxRkOja#;DTX+3B)$8?M&CJa7 zIHHiHm>y~DTC--&h_)w{Z&`WMVqz|Z<_|dJC#8?0Q7E#!1C;f#TY#OT9Z@F>7w=pi z!YjM?>9zfL_EbPb-?3Q{Gn%Nmvs>2gIWenj&E69>FKFPg3v>oU)iPwrki6E{ui@)j zPB}s(rs`x0nY{TRdrDLR!W6c6@#5yTL#B|BwX@ag*w0VT(S*y7G&MB|qH52vgF(x% zcMwphgG3|}$1&c_Ij&dzVH~H#os0Q#-0-03)vR&jo}Z|y>b7p7yLU=60J}Z6v9Pc! zVIy^t?#UTAZ|&mzOBT(l;|DKWj%v*(yT)9}-3`K6rshp;+HuVK{T!>&{L{1OhQG~>g+e}zT(ps(Q- zC-Y~&N$E}6GDbO2(0w~`op$uB4EoUXEAEU>FBZ?AKXzox`rW&CUmV$EuFb24PX0F~ zAtC>`u3`D6!ti?ug(n!EjnuUnyX?vvH`wC2fI@l~RLY%CZ>`HL$VystF6zZ+{el5w z7a*!?rW5CF;uF)-lH(Vgr#Dkpk4^IbcOjtpeCi)fe0+TOj=z_Gerle>T{yr@GnKr{ zeDi8*uI=zr$|Ofjo;8qB_@d=KySC`PjJxmcts=#rO_wiU-VUukVQO~Yx_R^9JgXJ- zuHrs_bX*>Hq-4*j0)h14)9HIGL)PJVvnGf9Lf^|Vhi58jG+J;Ha3bE0Oj+3Rc?P@e zo;l{+<0nsMydK@hOfr?f`W;sMIDW{yPkqff+eT7~mWQ0VaHC~O<$8Fgkj)OrlSkDK zw{Ktcm4a6)l|#^du3WzS9u0eQ^HARiMm6tlR!+f!;|-RB*P3z5bH<5smbQ%Ob-yGz zJUY29D)O{wzE7o}JNI$u)+Wzqdh)XSx!zOc0kKc6k|?@mF4)|ZY4Zw;S8CtaH12~_ zk2dUMWyW?rzOby{vsGDH3)dd_Sy53DRGDBtakAqnqGa~Quqo4pAO3zmW%Zz%`hGp$ zSZSf-*Vl)i=zHtjzs`@doUKhZxz~$=mT#R1k4*`lc(%N4 zb(HM$Pw#}g!S}~q)qahRXjKc+b8CKd{`mO(skdeA41mTK-NKb)9mQ8dTH6PvP2uui zcF<1u`Bq-LKKJu;{pa5WiayT!57GA@>~D-Qjl6SlhUAFS!RI%IaI+Wp9!^QW`C)K+ zM|iP-VK55dVC1bZ{npUq_v#9Xrxc6WZWJF5igOl!kFUj`g z3IB}&p)c6SA4ODaXXTCkyr8@D!CpW}p<<`=*!+L9-z(ja8BcRF)=%xF zZJBYg<;9_h_q`uXU0yUw`L9jBwsPtX3Y~r+sP*`R`G0l_%7jDv9@@M)($NPLIj!dD z=#i_FM6XcZ87$2oxFBZxmaZxDo70BwYX}?sdeX&~gEMZAxb-Ne>2lTxOiu2Y)~bWj z<6H6RxjXMgoqxkJG2qmU_x;4Q1)uia>6o7jy}Ik1DssddIVbPJqnOB%(vc~E$Lh3r zy}ycqgRu2xUILzQ2G#Up+4&`3JdX6r-NSmRhX3l=275e%pAj&P&);==Uw=sH(wn7- zu8n>3=-|Nk%?)KEql%-hPrE1feIGakvU>Cer+MIzvyD$_`(Vbp{`X^R`jqz@5nq0A 
zSKGx47h;E|e0p>onl-s>P+vsM-F`Dt3(LPYjrlom#K1mSvkiTJTb(kUv+MMR#&s8K zvzN(kJe(d?nVa3sG-P@@ipT#G4&L>*2z;0|?v43eY$S-*-ugPQG99!+F+ST2oWZ=J^-*z@euOwaU z>PxKeul=-a!O$bQaiSdyS0)y=PWshl9iGxW`Q%*o#0NPMh&TP{cM$36VCs7d2)vI*yg(x3R1mdT}()M zcz42@gF{YEepCH$en$Aeg#Mz>8=p7K+8=NuF0Pie!iWZHgT$HBVT^_>;X#W0b^1X}qigk{w%J7JA2x&6xM|jX552*z=S28-Ky}f>2q5vuKkDEP{_L;+m)4et|nj_R|r}4#%NhFi`TFGw=v>l z80LrZu-dwTx`SU@{_^UWm?sBUI|K`c#MNx7N)pvu+sk|PI+?K|KfG&e2;|py%Kgx$ zEl|zu@Q%lh+ot|uVg4hu{P=qVSFBB{7#_4aIr?JLS$Xi@gGryqKGlwSIQEXIvTp`s zX!(_Y$;%r_*4&2iNXyf+Yinw1`h)y+x$1wL)f>jpTM@|6l&z*%9dXBCYsj_}#iv&f ztE}2PrLD1)w_`;?qUZDS9XocE&Z~6y0(e{Oss06?EO>O*s?C;?A*td3u$fO zNmCIiSG~tp?-;jUaP3h*qVH?WzpigA>+R$}t)1$dTNx8i`Mz&#?6J1~ zC%WRA4oxZ=(SM$mZ0H{i-&o2p1h-yVHFVt7QxA`74u9+D*#7jWhIKR$@p19{P4M1F z^PgXK@9npJcKN(Y``8h;w>S13U3#;2-+$SIGiR31i!fAV4o<1fj_!>#+&^Fm3JOwx z|8TEg$QE=SIQ`SMp;;Br=E}|WKlY4E7ztJ7l)k)dx<@IJw6CJ^K^+ z>=(_sPm`}cm3GGV>!Y2KK6q~Jx-r2WjXYLrOIzEl-JZ6mJ4khxw$E!E9zl60%sp|U zZ=i8*;;duso-56jyk4%`Hz=zUNV=oEQXBfTK9SnGVUvG&ZFt*k|Nft;+|{FxD6b6( z-6CE!%c5JId5)AlwpY^QH6x+-5vE7qOOsPO)5n5`1QKd8m&K-b)tm{feDQu;U4dU%Y+$_U5=_=V^Hydi}a6bddkl`-XjoO?Qj? 
z2Dj#t#)2P3ufYzg$4x$Dc^Zc`&)>qV+Rus~wxfJ*$Cg>5Eo06X4Y_up49x z{W=k~3d2zFlnYs=<%_M)Bbq}e)Ls~5A%!;n`kAJE^P>91#-%ab+=OGU2M^6W-_pDC z^bahP{kAEgwH7u7AF( zcJrl+g8e1J zCE&JNLuhd9so>bF-h^)vh}3;!j`U)6klvnnNyOxNe}onvYHiGm>`m(f@}HW@+k4>W zw{HdaPFfb!E3Tdhm{q%L&i%M)0@T9N4-tK+*z12_JJ~rQ6RXwke$}lr!dqF*+gH_G z8jxS1)b@)ywRJ^Ne%XiETHK?8x@iYNVW9D!kb>e1o#Vd!a9=(mBGo*+hN=G*X4-zq zGpI4AQB00})Qwv)aLHKn*H_odQd3hkKR!MP|NEx{`OA}=KR^HQ+nwUuvtzK6B7Mix z{wQPScmIuzS8dv1IW_qo9JRUh>R`@OZ_MMN3EhQj^6bO6W%UawZS-{P9ocJhP4I-f zS(<+{S7t8k7VSH>GqpM8Lz8*JJ$qGTX=2R$Btovd_rR#?n#|rcjIFTuhUn<+`}!UV z3=WNaoRT*m*0HMUSM}Bz$3Q#NIlfOp^R|eVjndCOEn0mF6#n1uc|)#W7kKwq&N_3_ zx*|;fIB(jQ7d3P0{#>X>Z?6AYF!R|Jc-xDoPquy|<;^#2Vnv+OuWCz4Uthj(;#$g| zj<(*ID~v)^yCEUr@IHVDEhwdHe>+jS6VrZr>3*_e=7HOluxpEx@7tLXcZ$24k9U_g zEE}QuF*G&)%C(^C(71tPua3p#t5Vkw06m!~EG$?Fgn0Xj#I3E1JKu<(Bz)YG>9B+|-&lsFEd1FTnDlrv zSP_|mTi!n~sWTQ*lDGQ5g;Q&vZ!SSpJG1@#%~18NLdoQo#H_IdvLmuEOz_S$}bD6#6SIJUknH zaYcCVq|O*_wz7LdL&)NrtCQPCPpFZ4(J%krnE{sl-Sc6;yx??-`*whO?w;>|-!DWt zld{JQ+?jgy=aP8iv{y=fpfLJ&<3k~B4uQM9PdRUL_>0{v(Y?r*%nHo78|UAxTAyAV zDfuy={drS`qW8NO#5Iu>7jE7(eO@_uBFO%VeC`qDZq}wFn*QafI$;bx*d*)GA&zAAnyo!4c3Ssej}>5j z!HW+cX68SKgp?ak2TeWMe@>E+R-Q$ea<}3kWAuZeiJh{Kr;Uq}5obq%9u533_~osj zYwhnVCr_+!isBAw@-Ngdg9yvQ%#g&;rU$i9BI5k*y9 z&{$_}`uvy6Fs~oWYtW&iievi%US}t6McRv(FCi8{jo$BfNG^BA?b*{v5LV3G+|j8W zxAz4Fw0!(w-SBdS&a^+U?8)=mN1Yij#dS~o=cL1#pY9IZtw_zyn{(v?Y_N5}BwFx+ zk}1&(7M~ZtJ^8!qpzX_7>-{Y(YiQzR^4VzFo3PZihpPVm9G&*}hsc^4I-v9My?d_Y zK!pFhw#GGdZpfCMSMH69w`48S@6)nAZ~edPIFp&9*IsNwJ$Gcl?4QUc|%J$w{%&*5s>5iZd~5trwr{n8#L+AzKA}5RVB#n73qH$ z;`@AlHBHi*aC1)dWS2gpDPY9eiiykq{@E3XxtegWI-gXAoqaFgTfOG<=9Rsc!cSi8 z`u*)=^u;}j%Yy6FeRc{|j%~^W;xkRd)ESuJYKFkdc!hv%W-hc9JJo_leJ_ zYf|=4Dax2z(Ryc-alWe?E4?2T_Icq?;ovOV9q`rR_@>{9gkK-T%Xgq7DLsYGz23Gb zCvD&g^OQJl_Tf6PFuM=(XlBe<fX_EiU=;#Ui&kO7E+mWaAR1gDLH1AwM z*?_H0=?Blw?f*MzqRW}kq8rsw=<@X z1PPu}rg!US@z(FY05U7<(C!VmqVCB*qbsjBzWQ#P2j0<ncm1%ErTPe49OFdR2MLxv2~^AddF-_I?x+*ekitOh0cA3v3c^`!aeDaW}43q`jNk5LiGN#Ro~;d8zNKQ 
z1ke`@FE#L&RtvdvS29hRktzER%saEXtNGvDVI?r)?dfsccQMcE47ChnLvE8$7n?{-7iw|gRE`jhai%e1(8Et=Nb zqwmpcg7@C6*BeQr{8h!pgPt9jyIR%#yHDW1WVWVF{cwhkAf3Cj*{HZ0**>l6(d^S? z)$(EcIuh03W2;}^j5~I6#O>(XovE{AFE34voasy{LPsXNuIu~c%*Wc4xU0w3HsqvF zIp!QbYrf&rnYtqae8H0uKl-enI;^)+0*dt}<_41{S{`}Jx4dNIEG-tYarkG;T& z`+AU-{In$fj(EU;mBf;=!xa~spZ^)smYiNhYY~6-4VRR1v)&xqZE$LT{JJTkl^sp} z`A}mOL_PQ+&75@@C|Q(B#~xYUSiV z!raDj;B^Q3j@)>u-(>yL+`^fzxXvqK&j^FKD?|azg`wfEeMaz*{ zn0_M~W?!l8XMj~--M#TZ``TWShsXN*$mR0#6zQ5s`bsHL5scVzGji_P)Uf7ThjzXm zNDRF)?yBS!o7oY3H+Lj{J2^45jc|9w`0j+6Ph;=z%wB%pZu)Wc?cZN(?|iXBi=xkr zf3+nTq<{N+&g3s2Pyfi7o$;bSBmuYe@ZX|8E0W(mKbVYRuL)Z7>`TPQCMj;Wt-Y&n z)FTn{`q%Y}nY$NcTiZjqOY`b_MZ%MkKlMqg`ZA^!HSl7=ae_l*i$DH0`{4T)3g#0y z43xI(S>)a@$f3Ra?kBAp^W=JTLQ;b$WcsCP7YfrW(6;O2ThIOqE*{dC2s31raGR$oQ&Zeqgh?Sw6 zBgexAaI!a*#JU2BSMInmNfloQ^@0V>RgK+TuNrvhS>?)xsf+m+V>UKToLSz_zkI-z zO}uHjyhZu1^Q}Yb-d7HrHZY#pZ~OgsgC38bwf*nj`7-tNltq0QM+2{np1XEH&;irZ zBcGE_6})}E?V#)PiMbi@h;q4A**wxHkb-TFMl3-9k*`}dNpR9d1vsKmMwzK^}cxfYG_6GSC;l=#7fP}h?acE#~kY)7QO9s zAHenrX}WwlyInn`@0Atc2M_*Dic9JGudeaU>MCYy z&4SvF#>lI8G8G_yj>3fA-MPrlLj-1^iJA^h7e{E_%*3Y3^Ja`)KM8N2Fg zO2*A+Y_&I&N2iS072KZv{^etE{HDzfy*5`Z9Q3Gn$Av>#1)vyzkXgN8NztW&^KgF@ z7u&SCb^ux9>ZeV9#m;_e&lrCiH6k zCxCQ*o~1v5fmM5_s_RtZGs#~*eODTrj~rp1usq#f^6?(=@9FsJK9M7~Bp%6v)K4mo zE{vL+vte+yE@=crSIia3vH7Rj!VNF>{W>1m{9ac`OT5D0lgVGcOVw@XgV~cP)===oN{% z7hlr*ev~hN31aty3C92ajeivSV|K@;;9TB@RhJSq^$BuV*mb+@*9Z6GI;LEoGuN>z zXr|)NG7ET${y+1z^)~dxr!naxUvA@7j^9NEJ$_X0zVtG=Jxw<(A3r<{4M<^aDi{1!hPXEG}T^d+(uKtWJ(U^QZh!FNSa|Vzu{pv>< z06w&pPWZ~LtDm-U*`kX#R_!Za{NT+fNl{U2#rMlQzqB7&lYXu@E|F>bos_294ol`D< zNciy7jfuX3xKLBe{4$OBFSg{;Hrewg!!P-)vx;P`>8lDm)lAG3o;t3Q%-gTJfh`Ksv z{<*-?Y3t{9dJiGuQure+BX-jUn~O&rsXTj`j$A*mrQ_M+gS;5ih#6a-JUfshnV8~4 zBHuERZ|rr73pu9ore_aghO9Zfa)DrV-%V;sLRA;!$kC%WgGN>@eQ^KekA2T@6-Qp& zo1a}?)2Aa#y=|`2KF*NL_^`e8e88;BhcUSmqN66)w4SI<)U|Q3d~DmkBYAmw<;I7Qg9=)n z@#XF@^b@S9?FXJsKk~0Df6AfP^LIZa2fbakq`PMFpZOSLiU{Tkmp|P%Ci74Aou{We z3nzBpY8;#J;&#EGktvFqw=NW&9rmVz`g=|%p(2$&FP-u6yZ)Ty*PNM4i8C5WDVzBj 
z?;8fCe*3!Z7|DI;(xaQ19$Vh8^Vq1G$Bt2tp5Kq^tyzeqo9DP1oIj8O0ezALEpgXcdLTGwZw5EWS*Dm?`tD3BM!eWE%H{ImGdsN%zHC_ zM#rd`AwRC%Y`uZX`3~jn8!%~fN@$m~eaM>d*^@&c`n>EixtcgY`pcBQDj;%r zu6xe91Vj|!#98&@f1Rr*H)RCR%hR%Et{S< zuaDj(=7VrKK6h>MnRDq4r7N1c4xCp8(_^ZC$$ySLy?T2`%#LI9 z?$NlbzT8&b`BBZx^T~(4{P0_QyYV-ckBRybxTY}tZ&JpV_2W$$4KdwUk3Va7Zhuhm z`+L~14cE&8C$7{knZ5o`Q?>cj23zRU__U^l?`uO(<{Hyi^^Y7b;!4*=r=c?y8gBn7 z=U#}Mr+fG9RrmB(AnT_%y^WdHU$rdb^rtnSe!=1d8&8S61(#`_pBJ>ZJqs)-S>aHcZc6u{I)Pc?s+Mfh$1=l5roep5UbGS*h^}^+Cp+jc&hVEz%Jtq&YOP4`=rG&z8AEzOz$M(O0lU*CpuYbjL za0JLRIROtdEoj3?Q=4_`Q##j0Rv#KtJitObOC?T8aq0`Ew;vlp5&l_yJ+AUe{pMBL z=Ci%eC$*1nwiKuRG$n<7W)FHNcPxITuzJ2@?ZOH1k7r&R zxMEP@qCVf2FRu@6dIl`Ue|-Nc^XqkcL^YnnS^c)ND$BBNhN6ZwB`kZzdSt&9I{J}7 zb@jWX{&9EH&7;4s9smknelv9Q7tGPX*hL5SJEV{l!}@G&>5;+$uPJ?GK;&ZmwIMf$ z&1twbZVpE|`!M}m&yp(}!~Of>QN52(Xhcv_ZhwV z&91FEF#b+o`|JS%;XRXLQ{MHs8<;EismHHR*s5e!C;xgLya=cTJrAg_SRbywy>0Cz z+Ww3Nsega6N5To&*i}M+cMv&@mIE_yfRJZNiIS-*rJ z{kO8Im@zTB*Ac<1ht^C6W2f}p-;CcPJz-o0JdqO`xrY|;sOx95<8bygNBWUww&+z*hoUj-e(YkIHwmq$H$#Q5k~Uz^Pq8e#fRq~-!^uj8f4Uqo3 zDm?z>%ZrPfQMt?V_e~>9wLw?<9|B6xHD_D{qTe0cbSq3Zc?q$;Jv+zad_8{UueY)d z>;XR(Us*G<>0H6c>T4yPn_^RrZ7p7@UKd_Dtf3eA)Ak3GKm9X*c{d6<;60(e>3vXU z6r=iGYV73$cVhRJeLJzeAT7k(JR;#-$(2XxcdEeaFJC;i3--7Ek-lDxo*z2sa`s9@ zb<^rR$bos45f_&aIJ$P5DP&aClA=V9{7v1KMPq~1B3!zljC$m=<;j_Q`_~>uAn1uf zO8%HxY5yI4WY|xfmwxBAZYugobMxrY2xB=${isd3anI1Xz>hvSvpN3w;-vJEt_bo> z|A}7J-aO?r;eql}TG^#)*xK0RD9h9T=G#rZK$tm z-Lpr(0&2&ksfgzTSpLjFCk$?Pd`SJ@br25x)OJTC3%C|tnq4&5Bx*Z@dc z$hTg1<>TVw!p4su|K;1a%&aWh--7FY=lxChj!m3BXFl-g&DHDIu~ceU{Jb-lSWkbi z*0uO&oPJjSGduYpco^vBOtNcT$m|#0n>&x~Ss@Q056Eka_y!Vg( zJ30VQ&znBJHa1fVjd-q6Ws$}yB{u$0?0-xC5RhDBF#1Dr{tyUWDG?d<8Z}S^>Qx$X zoHR}%{_i&k2m)CQ0xt$bm|&1(sQ)(*JO~5~FrNzs`$I86 zi4vzaS%9}C7_kNcTL6^MTCGyTkqF6gMt=wa4n+n*5u;tm5lkw@AA$wS$~c7tc!f(u z3e9L2v0ji66sG~2gW*BJK`@{SE7xmOKqCSQ90Ur2!H|$3Fk|F;qXKqDe(bUo1m zsX!n1|C?>JQeqUy{<}b)UZ{x|Y7}N2M}#nG)f7z}{J#q`OZ0{~fGhvu(f`do+S!<> zmHf9>EHTH4074-CP&~j}ahyP=7pQ$n!QhD!ig5d}*y 
zVH0pzyo_p!6yk&NQD!uh6sfWX6O|}*gdP${gRud9z{IiS;)nzWmtat-DO89KB?HkI zU;@!1rILYh%)nYP96iUF$l`*;MsqY1ficBl*id)`GCDy7vPxJ9Iy@a{iD4+CnR=i# zmrh`^)M|600ws&4KoU4iIEGA@U?a>rRk#=>0CRQ3L^Tu&HA|I9<|qw%kQ^9*sY|3I z%mlL}3eDDy4rGW%TX9@8L8_9{I6Md!3PDD4A<-r^97dKZED$Kl0@bJxibxF-#01)- z*-E7~N=QUP!7?QTfhJRN6bwWk45*t$57SCbFct?(74ji$tr=(@9A#7)kx@)NSRq6M z^GkG5ql1w7kz9=w9~>!W$`O16P-PUsHJC&|%>l+@tSABs&(l&zS8N6(O#r%LpyW8I z#>!!$FjfXETB8$?El?Pa%!27qqx>_Im~aaW8W$|JXoS*uKA;L1nmG!Iz=23gvvqWi zU~0IAjt3?DXA%OIM2!bZLJSG5p#fDWnALA0I zdN`T|=VJtVsGLtEXvyJdVB=#bLY+XQ;$b;(A`xf=>~?|~jU-6Id0Gn-ql3qxwHy)O z2*_8##-c$=DiS3StK=G!7EQ;}{Jpc%)jOPy?zOuI33Wk+6htnw1Ai6k#Q`L*c={Au)sicD2$v) zC+MR`H;jyAh(mLXAi&~d^bx=W5qzvc3Sx+5TnNf6vfwBxh6y7GqG**G6duo^;AsRCi-48! zqwtZV%7ta|xl}M*0Et6@kVKOpQif3JNhTJA#}kVYbgr7Il*mD5jMM;&l4?b24o|}h z=b&*2Gn)xPkin60O1T9=k{}{P+s09=*>rW+%X(W6wzI3XRNM2^Ku%oYP7GT2Oj0w9N=(#udL zr2=Q<0DOst$14~_3!RDr!S&&ZDzJ`AQ<~tT4uh$U;3?zPN+uW`A7|D?({NygH5{eY z^B@$io|S-s)A1;X9F_8a4pwpkswu3YN0-S)Xk8@6dgD+GLA|ia#RW;76u3< z5*iLm0$S9Ip2L_qq1t=()Vv<;> zID=k{VW6U|aE;2UkBZVIvgjb`sG7z_i0D|r=V4%0 zCJdxiAk^&O$S4g|2UJB$!SQhhoGG3g%~638Wpo2#6r7MK7!DY4fkY17@P|0L2uuh-`#Xks5GfbR@`#qi`YsphCcCS#Tl%7!5ipJtCS%1gtv{)36bo z-~=$)K%uI1NIKrcQz#KS9@h|^sEZEPkpxDm(7@4id1z%YM=X*`Kxh($AZ15E%`yx| zVi42#R+c4!C{Xa(!BiMW#gQWsph$eYF#@Ryk0cnNG%PzJK8~o1h!zVW;z(s&Fcuf3 zjEWY9xsDw_|gQPTs(>lb#iVv3Ss5S)fyBt z0w;?C(0=@=ALL?qoM^?UI%5(*BrFSymW-NDe4>hH62(U(64=2|06Qo^bV{N%-V#iU z0BJc!)|AX-i$if6&-67(b@Hv(@#GsspF z1VqwBQACh9fg&0%LQ!#}JAhAuih&oM0j4Sw((ZCd3Im#+f(6lOiu!yNhAagkaGT?V5APF@Zu-70gMabdu^h^@psN(Pe ziNO-+IFU>ahmLxOc#|nEI*}_E00IY-bQ)w_xSXvQ2t`_)#(<4r2S@R_aFk9GO@b&B zVDJQ#MV<&E^YsuS-jKjU@)-!cF)|_^MU{h;rbLC2YE|=*MzIP%8oP=)7HA|0g=dn) z7`#A48Aa7hgPsN^E91G?I1<+?A;Jv?1rX{H06-`hjkg4HYd8yn*OC;fI4lyb1!-t( z4pOMmnJi?JMjHo-C+lc{U&X|c>5vE^7!IfkS;6Jev1+7&qBnEsW;g@|d2ZBmo^|;7AC1 zT(lL?X0sL>O`@rpDl>#DWMNb)9vDg0@B#QmNYL;j6e2O%jOQR2a=skEvIL7N(hN5y z04NQElbSdnJX?-8sp${|PYO!pYXyK_6BKkM4uS@A1au}1$>iaz0!$*59}LB-f`uwU 
zB1|$G4&aRN=*S4T@;`0_N&(TurbvWYN*$vM{I!YoZAFqEr%s5ebPAi2q;! zEqas-fG%U@5hytnAvMT&5fY|=o&ZV2nwU}@a`Zb0VM1D1Vrv3Qf+rIJLNLKl1d#-i zDfA#V1As4hyf6xiFo4Y@j!_(^!$Y;Bco!n4Ak0DveKcrg>tT2S@CHNC3Gr0ER4Fn_ zqvdD}3{PQ=&Ja%181&d+I%`zy1RY>Y^0fpV z)6CYxc`#TUV5R8y6Q;W=-T9RF%~w;5d0{!bPUvJ$gNW)im`EAX@Viq zjMZ?ZqhUNjOn@ScfDi%35yG)R@D%|h03xukQRcy@67Fb63RW>HX61idaj+t!i7J;O zt;k@J9IGNQWM;Vgqi$55(pv^TnZBc zDI%ch|7D5(lQIH+|1Tp4#c#MGDYLQhu_5BnOv~yohsLj)(%4k<*e~uGOm+FmGlWF( z(#d^(#dyaQ){2#GeG972&YyVabn={Ij+;|LKk!zrz)xyL+x<8znzVCP+8p=H6){Iu zq~D7lhV~CXIzJ^~S8g38@BGj8UpItp_zJz2+j^=0h@p4+EB~Xq#m+X?$A?0GyPRxD zobZR<2fWNqqs9FovGbIPPi z-_jUG%(G$0yoYev`zX%VKEqt3U-N$^tWm0q3!)2AIq35R%3ZsB!xoR zx_R^FozFOB&Epcfn-=FTcfGcHVtaf0^q7n4*LBtRva_>mGG=eEWYSI=t{@)WI{;UV zl)qFiUC>cw`S$K*MMZ^^c}=zM?Hhx^aL&oeN!weV68G%*UuOp_ z*`#Fs`(;}b$=JBdj=N#Q$g$ZQ1_bTjKRjq}sZ_MSEGqri-nwPVynuUm?)>Z|oje_L z@ywNVe=72f2ZD9))-(Q^dtQJ$d#`*we2h^9>rpg6f9}5F@0W|+Rg>T6XLui&JagvE zFWow4FT|otN&iMfm!|l@w%_w!+f!FlQ}ZN!``GIVLj&KA{QCMPlI(wC{I=AxvfG!i z)U=!L*jqjo9H)#;_GnwNYs`z?K=U1or)G7xo4>ZrN_U>FSN`m3gnI6*cP>48GZcjRjJeI zqAyMRZ7cSkI3elQn(R9JibAY|O^36?t*O5ZJIC_SXxlS8ksZIIL|9M`XiBFB)}{GiH#Sr%npZV8 zHZIZmU`<=HpKhDrdFJ9L`$a8@+h>J13KjJE^A$g)^3TnV!)@>SlQbq* zy!QC>KW;m_&ii#g$;WPdDD{e-IGz9ZtH{Ria(UP3b47aR#DAx=que+37c5$|=<0Zz zUvHo7UhFYHbE)d$tMK!H?KwDn>BY_cC96MN^%OGFzJAyz3d|Wl{m3M*M~9bR(Y>$T z`TJbwoWr(T7c-&~j=6fYra#yt_9N0%$3NGI7QbIeJi!e7y}&(_z6oPHup@59wgBqM zhC=8+6(VoSqNxuUna5fWHacyZ6XNG5t_F6nt<88FM5}3ZnJaSMKXk|IV0%t}Y`;Ff z0x$xrTdpT1ogB&-=KxCX`>lw$vl7?M(WL6YP?C=i?jiJKMC| z9DSS8;kTgt6|tOqh|X`P{GEGl>9xx(ww3I8xbR8iiHERzi`~;`MLsJE=d37Trw|J6 zH^rO!V>ZQ6XKc$k9D1^$sN~F<)r1_sYcKFmX1b2c_Je#HpS$@Iwy8HqLp#98$^Ve_ zwq~OV`YZ9g77dMP6dFSDjkmW|qnHG_A6~1)o;8N)0ImJIU_Ak@MPw~9S z2x^0;zdjnX8BFc#MI5k&)TNVpW5`kNmm!w{GfAm@sjLb);U5vV$z?0Pa{eL8$tBF| ziGBs|ocpz#YT}&51FzWv+skJcJbm!o7PpLa5Kx`HXU|X9k#NAqK;I;V)M^cry}Yit+YH*Q zuxZ=e*p)U#=ESJ4+{=7sSnS7J`e!mkn&M$+`!tHbIjc#Mk=~UHKRC;VHhtUOq;I(6 zZ#fdY!0{A0w4i7u_Cf}JzYfw>Mrms{RAK$sgz{%1CI 
zP3_ExT%EC_8a1)9k~SzMyq>&g@|%F{z2erptc}Qd89ymBXEc2|KXK3U^~1j~t!_`L z#h2fcAibLo`H8~`X^+0X2|2(no3AC4$+IuMRLUoPyg)z)$nNb3Te+m}$@?2FvhT@T zuWg5UI{&CzmFyo7u=>rFgtD0I!%w^#7|#nyg)Ot)+}u)-DU$<#^Ie8^@;+aX3J-6e zS$lhMZ~{c?(!ALB=8qre)kmWO^D8PKjq#Nyf)AONIF-N`i1qbze=3ht_k(=hZ>?RK zEbvqUjCA-zbegJE*Ur!El0b4#9C*0*d1(*n6#UA~{^iU!Z(Ja)^NsP-#*n4I?XwC4 z8r2t1zW8!#`9JUM{O4X_8-{ew-RK%mFHeGhP|mr+pB~~!&BEoYL~~2_e~zB)_5{vY z&OL2JD*3aRjcW>jVFL>>`4gGxs0gR|C37oS*^rGPe+w%g^^l;vo5RbPC7Tbn`h$Gm zrq?}a+*A@Yyr8akwnsne&6zz{yr?lSN6gqSMN_l<3)^mgyO=Ew+|y-~-e2zT>EgM4 zwX1l{h9zxsj&Ic8*6omroA2HyzF)S`ZT7n3oSMpeQ^vk}RyOS<+(Rt9!VVILt=m6_ z#rp7I(Nr?SP*x{CMdoY2jcKx}9T-||$z9AYF%@65{P}3!joGvbv10Jr>o)kz4VUPy z`J}2rOLL9+{&ib%syZ|MWy9sqBob-0Kk@t5G1IRly4%%fWG7PZ%SkofJyY!A?0L!% zj)T?S^x??sF|Ve03v`Da*hxj?q8w#Iu{$X5;Rg>756`r>XA2ai*=d@u>W57P6;7c4^}kp;(=Nr#?z=0&>Q zwq0_Tw=IRDc0Txo%ZQE^+)8(O>%d)~-5y%fxOBVM+TJC^bmZ>KCGU2e;J-g-i`~Ea zjmdt)^_AlVPPK>I7C#XoN!N>JCW#HF9@t)sk&PQSE^L;YTX{p?54RjK#?^G@&YNl3 z;E@R7-L2C%9a;CC2%DGL2oQQTNBR7I1D@t|Y^eI?IGdo@fxHP9KHl8f8tBNGwD0hz zVgZKQn%>Zw{j+=emX1qY77L!7_{VEb-SGo^XH3YQP!2%Siy&y)Jj&%dpDV)!4S#It zA7kVzV$2E4daCKJ*H;_K4j->go*nCX7M`SVJy^wendH>ZDu|9%Z9^a$M58?9{9C#{z+d&0c+)v-A6kg*$>Y`<x7%KG+O3r?^b}Uip(g>9u`~9htd2e%!gidWn zie>lWkbCJ;x{Yk>)=1upl{xfZk6nwS^L>kV-whhNL}*CI+_f#egGH!Ys}$3xPrsoI zd7(j}P`}43Hx4_!6y;@|-Mcqt`QqNxS$jNP$Dcl5-W9tl;%U|bRerrVhHBZa_`PHMSgtN2f>_t1%=bh9wVJnuhTU|iTp}Q5@{yV+jbHDgM3S7vVrdafB zX?R?IjM2>#0|y;l?|ASkd+h9y!;xbz{9HNR#r?v#K^*!@--;KyGwz)y&ZpMBiCuPN zD|@2nhJwNs2t|Bzp&zkJiE#dtpjd1R32oRs0dzNbFlN26WWVjhB{`JLe2;wM=gF>N zM;2^c_<>z@Db@SDtHE4q@hHRX z$D5`ji;gwE+qc75*c>l&qMZK`W0bAke?EvjH5QYtYAer_p0B6~d@S}IIhy=vQOfN{ z-+&}K-XDke@`2p9nelKN6_0K7znn=4hz0^SkAQH);x7Z|w(S)&`n|jU3iiX0-G^Lw zvtqfj^H;B41>(2U(5%eN)|o}2DO-_(eJ;7~HG`ECHs)fTe+a#OFRzS0F>T0@^yq|b z$yrp~mZ|PX(+W7nvBO2e!S=b#NLTQRqa<5IPTGzQOOi{wD~}E496h>hK>1D7N?U3A z)Bb&$-{)0!iz+H@-?bR+*30eeG6)}Lg_b0uAN5YQ#g2{Nuwlct6~5I0nC$3b zFc5M$UEJjfxfa(Dz}rNt`m^1FMpd5PU(K-D^}}!d^92$1kD5>(6QMTcOCF?C0%O{Oyuqo*7hiV2d|_cgQcbe0*;)0$Z~gJ< 
z&gr}!Vz$rZReQeprRl30y=Gk*+*&7|n|gfN(1}l}$;%t@V_Sbto>oy7dgz{Fo?^h$ zBVfJw=EykjgXn5 zl0W|(3fd2}=L;l*{{ zubDqiAlDp!Qc~SrbQmOfh+68*KUL|{{N#i)j8)&VZu%J(7wQBAkGld?6Xf#}!9K|2q3f#>pQR#A!D9Q66$f z`qnr*^vdVmKOYw{1DjD4AeD3OrE(OHKEc2rQ6_xi@BuU&a0L zTAQg36v|2R9x+$8(8s6dcw_uWMI}FNtGYFZQ2#Mv)_FAUR1DmWb|pqZq;Ae zGja8|!@<$-ObH9ZSLy#Ae!R@}`q$y{O2kFz?N8u2V)EWaQ8>c08yUuq=kp4}rW8xq zZ^D)@T2#*(Nxl@mY}KZDUn}ynw~P&N$i2us59F6FGbdjjtSD>TKXUG3?8wWkfNA2M zcxb)f>A?q1aPy=$vFq=}Zkt?f_1k%&ZOtOYtwXQp#{TL;K%%I`S903L;tC*bN z4;;<(&y~kyr=W|xA zx&q|RZug6`k#-d(%AJ6ex*ApybKES~j8{9?>2FG({_>bPa~(aOZbZf$%KpQ7z4y?u z8e^Xrz-tU!>}QxN)_K{b{Gir!ADau0KaKx4@NVYSr4`H1kB?5Ro(^H)xnx zn&ULR2@2%&CNkOV#UZ*AFI2j%*}87sS^w0THQuv5CqNNHAFO+0{^iSE zw(Ehh_x?ZzHJ+Bu)pbtiu!vasxV07M?tZz`*LnY2C;J_Vrcpq8me}!PZFoby3lkb0 zK4V@Ve7G%LY1w$@a?{y*C8GajuOGIPUA^T?*3h?mvp?*;Y#yGEk6LAO^g-V2!_R-x zoUo{yTG)a+Z|zw0sHmswHadTIzHnsM z#_ZFdTW!b`%A)%F_cJ@IGR0^h+wu59d}G%(55=QA*Mp@CQGw@!J5x&o7Icv_k7j+p zu=%dTR?V!J&vTqv9|p@lK_RUzUzTsx&}K`Pc93hCPk*OY{GI_@vl5g`3F|`#FxTa7 z>;Y0Y`QCfi6?wS+X%f%amfjY0ec;!TGjpo+J|(@ADqY>(p3eDnb4>NJy+fUQ#p8Z& z7<-cT)R%om>Ig3^v;45kC{h>n+lxtxUkT_<{|4F6oHNHIj~_l<9>_eg!ntB@`1XU} zZ78%&*dVbZH?h#;(V8>gK^W3b@nRQ{FL{FjBQ2^=c^vw;-UYO>CkZb42OE0A$u4?$ z@|p2$YQr9Rwats%ygaYcr}Ea!AxNrQitlkat5K=iBw!_2%16}_`kV>|0}y)h%-c6av^ z5<+&wB~_=^TZ5=?x?L8%-mu4KbJ({Q%*NwmvY!^_RV6>2C$6iTjHx}(|6-eS+&B+P zUWI%tOLI?Mw75Z7kt*9ZD2>kjNt}%lsa0iMy#DilN>bV?FHWo*%w(IcQ1PD zPF6(Dq!kCokdTj>pZ8pA=Z)`bXqo)yMM*1l%t$h#w->gmd}LaR&joPU{IQVLHJ)$% zIKju~lSGI-*~gr#XwuG47p~mKT(x5Isy8$(06^}0kTsny!2X(mj>@G|B=HUF4_4@)py?6V84Y>DN-rTrz zA*+0lo^K&cvat(vRMdQ))0|HOur+0HAhDrYv*Chy?4wI1KR-_z_n0=lt34iDmc{%#YvN@R>Q(k-L6d@$@D3AmfDB4Fw*{TFc;Rawe#A z*JbQdw_`t}FWyoxfo>E13S!YR_xBu+Ol9s-f2S-ykg`BZD{Qi5}!1nUb#x2cr z7RXQR_bkwsYG*v;ADknCx-AZ4N2rKu?Tq+(yts`%7@JAq7kP-T!OKI$R>->QDxYHlxB z(%odQZ(X2Yh5uA_*b?nNlpQr@Hs#a%G0v~isY^e7#^@=32cP1&SNok-27*)hMK&d= zX>Qwc9C{j33Bl&dAT;L`*bF+esrr)7t7=n!ThD}XnklnjHp`Y)&pzHd^CQ$mO#5Ls z2jp7HUZ{a&)32T7os&vR*IWOH=r+haWUp0IP`zLmv&XGflbF39}e@8x}hP#q7t*OW(+lk1t$ 
z_G{LCzl^MY`)0yj%>LEgA*Q0|DRbVjYaAyye`tT;)KP-$G_xm-16>_Y%r0t-fByKy z@9L_zQx`mSBL>`K`Pq54#CZ2kbGc}C+LZXOZX=_0U&RdX>%VVqMyz10XqN49+V3Xr z__{f9O;K7?PxpW!;ZOIYB{sVjr15t=0)svt@l0nmSB2K4pQ&??DtMXSlH+3-y9kB4 zG~+?fb23P}Zguy%W4b+l3kEurMd88QSJu9-${et{b!YhMt5=xhoaDl_XV9ll2U#W! z@BL6|UOM0w^Uv(t_xQ$H(C}Zcr=II3ZtV_ymr99X{GCuy{4T8h!IJClcFv6*o6)^D zZdNBdAM4#R4udZto=Ds!RwskM#m=nh3&ZM@>k91t=GM~eZso!FuSLDkA zKarQ8teu^0iZ2FuFu@I%_I>;yEmSwsL+4e0>v-*b``%L@n+ISa&ReuH_DPuU3{N) z=;l+v<*mZ+@#nPvnYOyW>Sy!a!y8Ivuk=d^uV-gBhI8C!Y$Ke%p-cV(M)`aqWoG%Hec0Et zp!t(ULSbh7Y1*B+Mf=?!t&uWZ#UbCnXfI&+EeANk>z-`v4*YH}4qjx2eeJV7t)@{N zt(!tt`qn;SF?mL?>G!X%jOrHST;lJuBto>Sf)@=rZgMPXu+a5phK$%PTeyZDTc4xx12*{w%BCAUi zaQu1>kH;?`2%PDJP7lb;(Z>v4O@OW%e{kC#@fD|@K3U>}Y$4{%vCg!uw)NH_^Zwuc z*Sdq&oX%YFGkEQwOGoi_pXjt-;5ckZ{X{1x>yhF6PUyQc)9_P6y@fd|XWp4bL7JvuLdoYK7!dI5n~=eKDeen?8l3v#x)yoh;aZT%BYdrk@M zC=T9HwJbbY(-k;Bzj;CGU1#~vilxa_QAhI+7e0&syHS<#fDM)3@q&HPe4d2Z;Zsvk zeyEYQ>ZPvq$LYXb4IgmVWZII??Kc8B@4%S5Fx1=H&kxe;W9E$aDz6{vE63-pO{yF! zym37v{DPr~d?D@6j{mz0AicWoSLxkFL*rF3J}GM*mtMP+eTnz|PvFc+JtMxoj9li1bt(z z^?3UXXO`cEdDlI!4s;Y{Om*4_AU;Pb%xy5#D47kra9{5+o)zq#(lhw8dv-Ci_~gI) zMJdGxye9PjTarDjKV^jPy7MJVH0(BYLAJ}M*Grdg9bCB0w&#nASg_`D_o7*CU7Ffc zM|&6G2Rwi=qFsgBJoG>F@2($H$G!xQy^&m!NK_FD|}w*1(bNSD$R`>Rjk< zg7vsPTXnwp**FVDDe}*~ zc(gkFYxmCAC||o8-UQmbq7npe?@qrf>73VJ7CO#Zz6e%ppZiitE`Kp|O!lOgzL#tM zkP2_w^CXq1#?OBCK8CLv$SX!EJ7kGie>eZB-SN-o<7YtHzAX*h7&!QnH9g~R#ehpi zd3EgP%lL{g21xZ#4huRqzl+mb^RkHOj&DEq=fa(ZG%Uq$Cva#~UgraCWH$ezbqy#n zf6M1|tCugQ&R`0<_0HM5||){&+rC_~v2I+R1X8~ozJqWo|AuUW%a zhir2L^}+KsgVVPXI&jWcI=f7;6RN-=VQva;a?Ll$nB-LkHu6`zFdEN{MD#;f#fv;KYKHf5GA z{rt;<;v>k~6jE5sLviZ>c2(zrLq=z2?E6(0=cicTWqo>D+X5|F?ElKOim9TP{c5Rw z+CQIedC~o1y~na)W8k)XkfZrs40nhs@5g5It?Y7@9s0>PUd}2!1y{l@Ssstb(?}=M z9U-ULc_Ymp;Vm(AIN_{_Ed7{kz-Yu|Gb&+PJ8#E@mmi8M4E)Ye%#A z$m-C^>XYtNw!fUIOo(vY{yju29~&JnYPfalR`*uzPpV^uKY8!E^7zJL*=G|-B=;!R z){dLb|Nd|)Wca23$=B<4jxhI$pB_LG*P6=IudOxYCD$Uxrf<{e+E!+teh~V;)YRhN zi0as-cKM?;DkpB9xc$dMapvZH8`3Q9qP_pQad8bDz)yaZw`b~-?a3&j(=YF!AAR`2 
zv5@=v#jU){b$h<`jr{$JE&rL_9s0+|;P5j}wf(?I=$c6@lZX{d4BM0Id{?Jl_jUWW zUZ%YlrTz3hXfU!sw96;7YR?;=Xk42s<=wb7XQ!O;>u0yS-8&c$@?<*}k?owl_Wqcf zx#oM}Wxod_hIw(1%zO4Ge#*}vslQ^|PJMWtOnNYqTv}wCHDO1_Y}z_XSQa?lPhQqP zQM~_h`ZnTFpF{3$rws3T*tZiwT{#O_+mi+TNb$*}HQ-J911s+}0M2Fj&!enV?2MSD z&zeV%Q5L+ud+X&;NBJN3CYQNyd#n89UheG%B5*48Zt<1dmpn%S+nG6soib9gfja<8 zn1Opr0``abmJ}5gulifp?Hy8zpZGrPW&WG`ef-Jdhqj)NhCg4u)n~ZNrzelO`y_kQ zU)PEk9~m=s=b~r1j`Kd=e&X~dAK>cC6EP4Be@pSsZM$>lPUL*|%*LMc@FaBl-NyCT zhd!NkzCk%K?cTnBomtOzR_BNx+I!L-)s7TEF6e)jL~ z>gr;uTV{*|%@%8}5C3Y}x7xae=DL&7o;|JNz<~q5&rIW-oFne(RG9uW+>LB+@pw#8 z*4qymTiWfb1804nP}{Wn0Wt1AKD0Q(9J2GuMDxL6SEiGFtK^ftgKECZ^j9PIdY9aJ zSJpM`GQZ&K#+x^9Mr0nVc6)KW9^&b&%u4Btf?zJVobkK$j0zlsCCo&zeCnoc8N2IB zA7@hK0wF$Z^|jwW-nV_Z{{V<7X9v%IEjKUiGOyU6uF85pCOs_C@9(r1hTd`XNlsNE zXQthb#{&1@>-%o|G)yhq5F~%nG$)|FV1vtcpG&j;M34i2*E+4P>06h(;04dFVr7os zz4kaCcf8BZ_zy?c^`8`HTwGb3mdNrH-@#t^IMh(Tq@#bo zs6NVgx24H$IO$U1w6|{y578BdC3}DO)qE7lI)BrnVw|Ar^)Bp@uH4TL>+#u;8Wc)U zo+m@Qic7~BD_p~d`)*Bqv-+~}GZ_~9`bxr*(1hNV?|}ng{j^2gC*i_P*da9@hr2yy z4{1-9KkV%IJLiFnpO=tc6 zv>WZ%ly-CM@R1ZUUe=E5)dcNlX4-t3)I8fEVdi+R?PV#9=+~bYIALrfUDi0y9=>eT zaH4+E1Jfg;FshUFa5O$&u`6KL%oz_W0a?E zkUVQ$09x;TWf^jN8Oo=>Gpnar5DakEF*K}v>*1Rj4x!`j8Ghf`^X&TDN5_dz0+wyu zOKC1SV@tXKO$-+>pQV;})&R-&5|-9k&fG{>6&-Cqk2}sb+ zEq=dBmj14L<#SfFY9qSZ`9NoKecr6njQrV)Z&WWI3_ZW?lik>k=-7_gM{33nOEE=7 zZqFWTQkr~qeUAkb-k#jLn*F%<44d>yNV!ohZ~kmJPdbEp zXBv2av3b>-_5wG&w0`0I8{V^E6ZU@HuyHRt()rND*~5b~mc2*{I2w2C>V^Hi z)26zQmzlCr~rU7dcVcAgM+=Ggok|6Inz1G|@9ea|X*8l1NTcOWvSz386B(v*+Q zu&%Ajl-BRpt#7}%Nqz4JiM6j@z};8IUVCud=Ly8UNnaq!w(|QT*XAIvok4}9Bdtg- z?fk+kWyl&g&qIWX{&2#lHv&6(TI&v$my1}=j z3(aR0vu0R^_U;ojR^y+*1E$kw%sD08297SDLOj0WMAtRf{kO_*V~%g~c~D=!%nCr& z4%^D}xnm-q&G4E$`TMU;6Q+TJ3XrlVs~i18){VQq@IzH>&pcVepU>mUtTS`AfAkOi zUW-LubgymtFtcIHrA)WtHr2Qrvg=C<(RoY7LF*Fbs2}kmj%r%fERw>>_uSiwo=;7S zn7sS3|JKC#p7hINr+OY}Sbkx)a=3qCRl(mdpRysyLFD1xcAn;(G1Cidf23Y~Yi*&q z;a1h~4y|%ujyRmb{$BbrD(B9aa=J+n8lZ7X^@i$sRWAi&Q(-Suo}7GwbQ4REGP(WotX0!`EQUq5Z+fzpng>D6^FI 
z9SnIK)*O~!b;^Im-p>64mD~K54H<=?wD_1C$pe$luOb&Y+GHM^xXk9)=r7RD+ z;!+f&hLg=ZE#(fkkJ<5OtMl2JJoum5yn&3)^wJNPGM_&eeM?>DR~(zal;t@8X2uiU z8rO_e&e7Urif4H`2DrW((9!PPb8hbXqM$#|{Z=O_k1Xt;Jv|3|WP9Vm>_meHF-lS0 zQ^2P0i8RhU@$LcnZK|`-n`n!v3pM8ZW`BWtI?n|$x3YJR9p^eU^66>8#O*Ut`j5S@ zuY481Il7X_W-@=IGK#LgQJOHFYktgl+z4E?RTF|&+hNx`M=`rNM;6;>-RUu_D(~~o z|JbDYvePZ`&dgqa`atM|yf}t$#{);1PwD|t z(W?u?J>cs%X2^aW8$Z22Zwf?n=Klpc62K|QdKlb<)?@~QfxZ=D)TmL^u3fum!0;bm2&~a)9L`r1Md-MhqN1Yjl}hD< ze3PZoXz&zOm&L!bvNF}_^emNxWtBogLNFG|RfB?-P9{^zv(G-8T2fM%aKdg)0U4I` zHk-`}7K_EBQqS5G%fMt4-|Y?RI1)NV^`P|!njFxYkBj<8Z=MaCPJRY8)=D4<1 zfHtP0)&D|a25o@7H+eFsRe|(w^FOD*q^nf598J`Fh30iauv1MEgJVsue2RB16MNtT1*yry~efFpQr6YwV z7Y;%KzU_sZBK|< zU!-VmB;SJ-IDw1zKnMIjyT4S(7{IVT*w=qf5QPkXqibyCx6?;l`Z==a3)A@yi{kF4 z@H-8KObTF$F@5Mbu3B3S?8;mZ!@J%~JFoE{w=sZa7oUR%`#%j=%PvBnwzoh|$$8kD zwS`uvVl5~j1Rm-&$3w&T6S9JW0{X-?gTX-C16>iV!-fs}>HF`$AImdf4_A?hu}$ch zH=jY)p+g4@p$KCN&`E&|?+GQ^9m9I+hYal1JxCkicALI}%$uu-JGZ#58s2p*oj5x> zX%gtv!SL7hKWPUxHU_X7OC{XieFE(e$2HjP-Nw@v{?C4YQ`Ev=4;WsmNg+dk2MieS zM?paW9+eKbGq@E178{MmT!D2~i-9aED(YpGN~K}|!`VAp{_w=L+bbR}DR0IE*#n!!k|^S&f}DfhDb8Gt|AOLM29s-Nkr&cLweRGm@#Z2e05|wn5@+< zz%E|@lMYV2R(6?=1w-HD!vj;{#Fc&Y`b?;?0P`DUd`l_(F=pxI5tA>&6(Hs|@S7yl zXfzi?Lqiv^D;(ktVSTG@Cv>S>cAo*W#r-&CQ9V-sM+{z6kk#j$mK#qjBG_KkCQ?z= zzQyq=t_e3zQ09wmm*XnJ;KP{Ec65*-I>B*~|NF&lFn#E&u>QnHP-QN60frFa z1_2lMJ>v$#IpZG?p^Jh&=QfJEkvM=gHDt8l;!XN!$&g`b{ zN~6-kn4S;Qp*o*=rCJY#kqM)Dv6D*(Sxq)U?mk1bb_TZ^-$Uuh0 zc(3M{!+{eykdc`WQdOrn9M}G9Y>rq0g;c=2{SNbY_AU}t;`zdpaPrE2FoboW7ya+; zc~SIIDuudYN}u^*Je?Y~CX)zR9bj6m_K4YR9vB-NyTWKRx@Bog&GN<@@bSWb1Y3s?UZs?`3cn=J(+R#Qx}xFS#M~9(?dYHetd9IDGgp5V5hbSw^F=E7y`Q z02yO5k88FxoqTwE~YDiV@5%>|iMSJB8G4?tX4 zJe?C+T2n;3iqIB@M%KMqn`!tqom>qup>65^AG3=v`+LKQl`tSag-%(+L=bO{Z3p8D z3tvG70OJe>1489$XnmM4VZvluMM`lMaY67mWMaD>y2cZ$|I4|JP*hz&XL_Rn7M<8nk61!WC@dFnTjDs-2ZqDt;!OB6?+Be7f^ae=%*7z% z8e07X8E?2&Vq#)pR#sNDWLHs3flRGd*8~Lxy`uO%+Eojqr9dXtRfI{jVcJ$;ciQO! 
zCRT?Tp1W-+tU34=UHk;A@Puk3=yI5iX?&LaCWMU6w{<1nqO+SRKz_z1#=bn1#QuE%}Xs3%@JiD)QI``bb@J zB^jM>>ya1fZSCYwNyeAJ$-Q~T&`K3=HwBb#Y!qa2c8pT`wlu&*Ttz~V@zr1T*u04> zk=T%l1BRf9kj=OQXai*Z4`SR%d!DyW*6z|9WNwyx%2uQ+{MU2L32OM;awVA*aF~1Y z)_1vW;sz%W+Qv#AAAVoVRpbGgi2J7@kYV8ALSovqX*p}wtm!ag#*D&w^X7HqRydR@ zKE?c4-k^~KB~#5u>@X~!L1sGmCO4#746={&O769SI>Cp4H1<;7Tt$9^Z0OLT>n~rv zJi{$*O{l_QOM(mlzCbBPRb>EWEuJZ(CNZSP|#~?1?9yfY)Dt9k0(@O~+r5F)`eHwbz9tpIWW{0uzVH zVps8(76F;XX@@Vf4`DIUCOy(mgk~3DULcd>kCdb?Q&Ur6!-fq&plbZ&lTV^KXB9Om zWOCJOQBl!zN~LlIIBI|_cO4$vA|T7IDS^G$&spsxeYSbCHm)zSDIs%N@mi3g7E`YLHa0N z++YF-_iPpbxTE7Fk8FJ(h3|+;GIrV;))1g$ZxJvAjn%do42E&s*U(?;Qm!N`EiJ|J zj?3xsQ6x~+XMnOATZAOTAxVwGz`l5loa8ai}nH~PK+z;*1$`t|EgLx&D^ z3%(P|8fhtz8I8uNN~LlQmX2Z?F8+zC&=A02E-%Ohfo0#k_1Yv zAw4WC3~fPi6%g@j6txza1ipAUcK%#zPe?i7~-8OC7Je-%8_caPDH^2rC9Jp}v z=FKP-xHscw_hKfi1wlraHDD?&GUHPca3r|ogEt*lKLl{r6=Cs(3BxCl=|?6w+Ptt7 zw4HQ7C|vwGozCxDw{GorVv3wlW-&jpOvrMrR;$~=91>&k@m}aW*uH)HUPHj}ETB{M zclZ;=fueDk^V%)92T=gfnKNoh!UZzke>ZF3x3eLif;K+Fk%)w*FI<(!N;Hn*e znlNZ}S+g@;6+aJ@G55PvpxJC*N|L0`+i9{sglutKT%7AnDlZFvNe3^`D-?=iul2BY z7bcr5W~!2)MINOV{%^R*TkyAnj9vo}1lEuJsa301pQ6c(fyz`1h&axxAy!mv+M0S&HCsW6z5UL1(z&Ie(t>{D5S(Sxw#g3(MF68IucjHrmeA32O zknshRO4UXIaNRQ!RL@kghs*w(oKSZ;>e8FqW~}!(mKR=l;lR7^zB`m%8IDV?b@~dg9Z(9 z+3SuTJqm5wv{}mrj5&}2=MUjR$RtS+nQ3lr(n(DH`4ch#;2vZy7@j0#3;ZvG%+28~ zwkG47kMQvDz2)WQG&2u8@BpNx)twQ~1`M~A>{C8aO}bSOk^2@1t;ysBE&3C(#;ON@ zN;09kxZczhXT2v!lJQDZk@YOw+`$%3uxHO6=+UDG4&@5BNWZ8Q!*Bc#vZhm)ayBb+ zs)5U0S9>X?iAyWSECE`r_UPrym+3evf*?}4fT0v~MW$lCin(*=zRfkv{T(2aBc0^* z6;YRbNy&{~lld7k)MIoNYF3jh$ucuDVa%8@wRXEbsQ&v{(-U2fE=N4y!A;45fq_RY z7K<9cgyODKr%vJANZ0>C<}S&6WSo}#aPis5RhL$)HD2PG>|V#Qe*JowHER}!O~4Q8 z7GL#^_hhh@D_8EEIdkSH?1K(Vyebvwvkn00(>=TbhWdQfs#R;=eDh7VVGWJ3 zD`Y?>dODlOKnP-LY;3H{3XOl4x=HY|HuU;7x=jcte3+S3z8YKq?T}fmR<7nkb&Bis z4xH!53m5kBcD5lJVA&N`a94>xfERQ%xr;_QCrLB|WU_$7myR8mVMWaX9 zhnRHYvv3QQ^Yn!T;Vz=Fo&rgdHQ#;rUFxPyn=Y{@ZNY*Cqmz@9=W!=8BO~LjB}mLG=1YqgFHXI9@gn>55?Hx%<-F+VXgqP))mY>Boc+0+ 
z0}@065k0&m_0Q$L-+ue8)1*n0up$dC*fIT#ubr+}iNa;BB}g##h4IIbX#7ELyclEk z&zLl6(r|;pfRfMEOeRy|#~*+EF?W=G`}Uot(P-MSKX*Evj`QcwXH->Hxjg`b!4T7- zLx)7BFZ#Xk-~XE644@F8?U(@p0ji{=q-3pDi-lw8bv4?l3JMA`zWeSwT Date: Mon, 13 Nov 2023 18:34:54 +0100 Subject: [PATCH 80/97] Fixed #8 --- pytest.ini | 2 +- src/codext/VERSION.txt | 2 +- src/codext/__common__.py | 46 ++++++++++++++++++++-------------------- src/codext/base/_base.py | 6 +++++- tests/test_generated.py | 1 - 5 files changed, 30 insertions(+), 27 deletions(-) diff --git a/pytest.ini b/pytest.ini index fcccae1..ab4c198 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,2 +1,2 @@ [pytest] -pythonpath = src +python_paths = src diff --git a/src/codext/VERSION.txt b/src/codext/VERSION.txt index 37e98a8..318dd9d 100644 --- a/src/codext/VERSION.txt +++ b/src/codext/VERSION.txt @@ -1 +1 @@ -1.15.1 +1.15.3 diff --git a/src/codext/__common__.py b/src/codext/__common__.py index cb32c75..275dafd 100644 --- a/src/codext/__common__.py +++ b/src/codext/__common__.py @@ -104,9 +104,9 @@ def __new__(cls, name): try: self.codecs = MACROS[name] except KeyError: - raise LookupError("unknown macro: %s" % name) + raise LookupError(f"unknown macro: {name}") if not isinstance(self.codecs, (tuple, list)): - raise ValueError("bad macro list: %s" % str(self.codecs)) + raise ValueError(f"bad macro list: {self.codecs}") self.codecs = [lookup(e, False) for e in self.codecs] # lookup(e, False) self.parameters = {'name': name, 'category': "macro"} # ^ means that macros won't be nestable # test examples to check that the chain of encodings works @@ -158,7 +158,7 @@ def encode(self, input, error="strict"): return input, l def __repr__(self): - return "" % (self.name, id(self)) + return f"" # inspired from: https://stackoverflow.com/questions/10875442/possible-to-change-a-functions-repr-in-python @@ -172,7 +172,7 @@ def __call__(self, *args, **kwargs): return self.__func(*args, **kwargs) def __repr__(self): - return "" % (self.__name, id(self)) + return f"" def 
__stdin_pipe(): @@ -200,7 +200,7 @@ def _input(infile): def _set_exc(name, etype="ValueError"): if not hasattr(builtins, name): - exec("class %s(%s): __module__ = 'builtins'" % (name, etype)) + exec(f"class {name}({etype}): __module__ = 'builtins'") setattr(builtins, name, locals()[name]) _set_exc("InputSizeLimitError") _set_exc("ParameterError") @@ -237,11 +237,11 @@ def add(ename, encode=None, decode=None, pattern=None, text=True, add_to_codecs= if encode: if not isinstance(encode, FunctionType): raise ValueError("Bad 'encode' function") - _set_exc("%sEncodeError" % exc_name(ename)) # create the custom encode exception as a builtin + _set_exc(f"{exc_name(ename)}EncodeError") # create the custom encode exception as a builtin if decode: if not isinstance(decode, FunctionType): raise ValueError("Bad 'decode' function") - _set_exc("%sDecodeError" % exc_name(ename)) # create the custom decode exception as a builtin + _set_exc(f"{exc_name(ename)}DecodeError") # create the custom decode exception as a builtin if not encode and not decode: raise ValueError("At least one en/decoding function must be defined") for exc in kwargs.get('extra_exceptions', []): @@ -375,7 +375,7 @@ def add_macro(mname, *encodings): raise ValueError("Macro name already exists") try: ci = lookup(mname, False) - raise ValueError("Macro name clashes with codec '%s'" % ci.name) + raise ValueError(f"Macro name clashes with codec '{ci.name}'") except LookupError: pass try: @@ -463,7 +463,7 @@ def _wrapper(param): isinstance(mapdict, dict) and p in mapdict.keys(): smapdict = {k: v for k, v in mapdict[p].items()} else: - raise LookupError("Bad parameter for encoding '{}': '{}'".format(ename, p)) + raise LookupError(f"Bad parameter for encoding '{ename}': '{p}'") # case 3: dictionary of regex-selected encoding mappings elif isinstance(mapdict, dict) and isinstance(list(mapdict.values())[0], dict): tmp = None @@ -474,7 +474,7 @@ def _wrapper(param): tmp = d break if tmp is None: - raise LookupError("Bad 
parameter for encoding '{}': '{}'".format(ename, p)) + raise LookupError(f"Bad parameter for encoding '{ename}': '{p}'") smapdict = tmp # case 4: encoding characters translation else: @@ -494,7 +494,7 @@ def _wrapper(param): for k, v in smapdict.items(): smapdict[k] = [x.translate(t) for x in v] if isinstance(v, list) else v.translate(t) else: - raise LookupError("Bad parameter for encoding '{}': '{}'".format(ename, p)) + raise LookupError(f"Bad parameter for encoding '{ename}': '{p}'") if ignore_case is not None: cases = ["upper", "lower"] case_d = cases[any(c in str(list(smapdict.values())) for c in "abcdefghijklmnopqrstuvwxyz")] @@ -538,7 +538,7 @@ def code(text, errors="strict"): text = ensure_str(text) if not decode: if intype == "bin": - text = "".join("{:0>8}".format(bin(ord(c))[2:]) for c in text) + text = "".join(f"{bin(ord(c))[2:]:0>8}" for c in text) elif intype == "ord": text = "".join(str(ord(c)).zfill(3) for c in text) r = "" @@ -720,7 +720,7 @@ def list_encodings(*categories): enc.append(name) for category in categories: if category not in CODECS_CATEGORIES: - raise ValueError("Category '%s' does not exist" % category) + raise ValueError(f"Category '{category}' does not exist") return sorted(list(set(enc)), key=_human_keys) @@ -755,7 +755,7 @@ def remove(name): pass for s in ["En", "De"]: try: - delattr(builtins, "%s%scodeError" % (name.capitalize(), s)) + delattr(builtins, f"{name.capitalize()}{s}codeError") except AttributeError: pass codecs.remove = remove @@ -801,7 +801,7 @@ def b(s): return s -def ensure_str(s, encoding='utf-8', errors='strict'): +def ensure_str(s, encoding="utf-8", errors='strict'): """ Dummy str conversion function. """ if isinstance(s, bytes): try: @@ -859,7 +859,7 @@ def handle_error(ename, errors, sep="", repl_char="?", repl_minlen=1, decode=Fal :param decode: whether we are encoding or decoding :param item: position item description (for describing the error ; e.g. 
"group" or "token") """ - exc = "%s%scodeError" % (exc_name(ename), ["En", "De"][decode]) + exc = f"{exc_name(ename)}{['En','De'][decode]}codeError" def _handle_error(token, position, output="", eename=None): """ This handles an encoding/decoding error according to the selected handling mode. @@ -883,7 +883,7 @@ def _handle_error(token, position, output="", eename=None): elif errors == "ignore": return "" else: - raise ValueError("Unsupported error handling '{}'".format(errors)) + raise ValueError(f"Unsupported error handling '{errors}'") return _handle_error @@ -950,7 +950,7 @@ def lookup(encoding, macro=True): try: return CodecMacro(encoding) except LookupError: - e = LookupError("unknown encoding: %s" % encoding) + e = LookupError(f"unknown encoding: {encoding}") e.__cause__ = e # stop exception chaining raise e codecs.lookup = lookup @@ -1112,7 +1112,7 @@ def __gen_str_from_re(regex, star_plus_max, repeat_max, yield_max, parsed=False) __groups[value[0]] = result tokens.append(result) else: - raise NotImplementedError("Unhandled code '{}'".format(code)) + raise NotImplementedError(f"Unhandled code '{code}'") if len(tokens) == 0: tokens = [""] i = 0 @@ -1231,11 +1231,11 @@ def _load_lang_backend(backend=None): stopfunc.CLD3_LANGUAGES if _lb == "cld3" else \ stopfunc.TEXTBLOB_LANGUAGES if _lb == "textblob" else \ []): - n = "lang_%s" % lang + n = f"lang_{lang}" setattr(stopfunc, n, _lang(lang)) getattr(stopfunc, n).__name__ = getattr(stopfunc, n).__qualname__ = n if LANG: - flng = "lang_%s" % LANG + flng = f"lang_{LANG}" if getattr(stopfunc, flng, None): stopfunc.default = getattr(stopfunc, flng) stopfunc._reload_lang = _load_lang_backend @@ -1263,7 +1263,7 @@ def __guess(prev_input, input, stop_func, depth, max_depth, min_depth, encodings if not stop and (show or debug) and found not in result: s = repr(input) s = s[2:-1] if s.startswith("b'") and s.endswith("'") else s - s = "[+] %s: %s" % (", ".join(found), s) + s = "[+] {', '.join(found)}: {s}" print(s if 
len(s) <= 80 else s[:77] + "...") result[found] = input if depth >= max_depth or len(result) > 0 and stop: @@ -1274,7 +1274,7 @@ def __guess(prev_input, input, stop_func, depth, max_depth, min_depth, encodings if len(result) > 0 and stop: return if debug: - print("[*] Depth %0{}d/%d: %s".format(len(str(max_depth))) % (depth+1, max_depth, encoding)) + print(f"[*] Depth %0{len(str(max_depth))}d/%d: {encoding}" % (depth+1, max_depth)) __guess(input, new_input, stop_func, depth+1, max_depth, min_depth, encodings, result, found + (encoding, ), stop, show, scoring_heuristic, extended, debug) diff --git a/src/codext/base/_base.py b/src/codext/base/_base.py index 27a31e3..f41df0b 100755 --- a/src/codext/base/_base.py +++ b/src/codext/base/_base.py @@ -5,6 +5,7 @@ from argparse import ArgumentParser, RawTextHelpFormatter from math import log from string import ascii_lowercase as lower, ascii_uppercase as upper, digits, printable +from sys import stdout from textwrap import wrap as wraptext from types import FunctionType, MethodType @@ -280,8 +281,11 @@ def _main(): except Exception as err: print("%sbase%s: invalid input" % (getattr(err, "output", ""), base)) return 1 + if args.decode: + stdout.buffer.write(c) + return 0 c = ensure_str(c) - if swap and args.swapcase and not args.decode: + if swap and args.swapcase: c = codecs.encode(c, "swapcase") for l in (wraptext(c, args.wrap) if args.wrap > 0 else [c]) if wrap else c.split("\n"): print(l) diff --git a/tests/test_generated.py b/tests/test_generated.py index e8eaf10..57b7b4e 100644 --- a/tests/test_generated.py +++ b/tests/test_generated.py @@ -32,7 +32,6 @@ def _template(self): f1 = getattr(codecs, ["decode", "encode"][k.startswith("enc")]) f2 = getattr(codecs, ["encode", "decode"][k.startswith("enc")]) for ename in m.groups(): - #FIXME if ename == "*": # ignore mode only continue From 3e837f45d799fdea61dad4f43b7c1f90701b29d7 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 13 Nov 2023 23:14:41 +0000 
Subject: [PATCH 81/97] Updated coverage.svg --- docs/coverage.svg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/coverage.svg b/docs/coverage.svg index 3033e1b..f637fde 100644 --- a/docs/coverage.svg +++ b/docs/coverage.svg @@ -1 +1 @@ -coverage: 99.16%coverage99.16% \ No newline at end of file +coverage: 99.13%coverage99.13% \ No newline at end of file From f415240a94e7d9c9f3dbb21244028eaa063c3759 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 2 Jan 2024 16:43:19 +0000 Subject: [PATCH 82/97] Bump tj-actions/verify-changed-files from 12 to 17 in /.github/workflows Bumps [tj-actions/verify-changed-files](https://github.com/tj-actions/verify-changed-files) from 12 to 17. - [Release notes](https://github.com/tj-actions/verify-changed-files/releases) - [Changelog](https://github.com/tj-actions/verify-changed-files/blob/main/HISTORY.md) - [Commits](https://github.com/tj-actions/verify-changed-files/compare/v12...v17) --- updated-dependencies: - dependency-name: tj-actions/verify-changed-files dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 62476a7..de73aff 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -60,7 +60,7 @@ jobs: pytest --cov=$package --cov-report=xml genbadge coverage -i coverage.xml -o $cov_badge_path - name: Verify Changed files - uses: tj-actions/verify-changed-files@v12 + uses: tj-actions/verify-changed-files@v17 id: changed_files with: files: ${{ env.cov_badge_path }} From 58789428a760399fb02cb9dd1c9fbe9e892db306 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 3 Jan 2024 07:14:25 +0000 Subject: [PATCH 83/97] Updated coverage.svg --- docs/coverage.svg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/coverage.svg b/docs/coverage.svg index f637fde..fa2dd63 100644 --- a/docs/coverage.svg +++ b/docs/coverage.svg @@ -1 +1 @@ -coverage: 99.13%coverage99.13% \ No newline at end of file +coverage: 98.92%coverage98.92% \ No newline at end of file From 3f1733f5b9e8b29565c6ce00d21af5f5dc165a20 Mon Sep 17 00:00:00 2001 From: dhondta Date: Wed, 10 Jan 2024 22:17:48 +0100 Subject: [PATCH 84/97] Fixed requirements.txt --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index ffe2fce..b5db972 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1 @@ -six +markdown2>=2.4.0 From 2a48f1a25f3293096f5bf831401bdcf74be31d8d Mon Sep 17 00:00:00 2001 From: dhondta Date: Sun, 7 Jul 2024 15:27:19 +0200 Subject: [PATCH 85/97] Added support for Python 3.12 --- .github/workflows/python-package.yml | 2 +- pyproject.toml | 1 + src/codext/VERSION.txt | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index de73aff..4947463 100644 --- 
a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -19,7 +19,7 @@ jobs: fail-fast: false matrix: os: [ubuntu-latest] - python-version: ["3.8", "3.9", "3.10", "3.11"] + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] steps: - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} diff --git a/pyproject.toml b/pyproject.toml index b204596..7c5d1b6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,6 +27,7 @@ classifiers = [ "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", "Topic :: Software Development :: Libraries :: Python Modules", ] dependencies = [ diff --git a/src/codext/VERSION.txt b/src/codext/VERSION.txt index 318dd9d..701a6a4 100644 --- a/src/codext/VERSION.txt +++ b/src/codext/VERSION.txt @@ -1 +1 @@ -1.15.3 +1.15.4 From 98ea759553b3cbf7a97a6299ffe3cb721c0cef6a Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sun, 7 Jul 2024 13:29:15 +0000 Subject: [PATCH 86/97] Updated coverage.svg --- docs/coverage.svg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/coverage.svg b/docs/coverage.svg index fa2dd63..8cef4a9 100644 --- a/docs/coverage.svg +++ b/docs/coverage.svg @@ -1 +1 @@ -coverage: 98.92%coverage98.92% \ No newline at end of file +coverage: 99.04%coverage99.04% \ No newline at end of file From 06d7ca58336fa5479bdd0ca53a56605f6b2ad1a4 Mon Sep 17 00:00:00 2001 From: dhondta Date: Mon, 6 Jan 2025 23:49:03 +0100 Subject: [PATCH 87/97] Fixed #10 --- .github/workflows/python-package.yml | 2 +- pyproject.toml | 5 ----- src/codext/VERSION.txt | 2 +- src/codext/__common__.py | 5 +++-- 4 files changed, 5 insertions(+), 9 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 4947463..bdf7f9c 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -19,7 
+19,7 @@ jobs: fail-fast: false matrix: os: [ubuntu-latest] - python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"] steps: - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} diff --git a/pyproject.toml b/pyproject.toml index 7c5d1b6..f8ad01b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,11 +23,6 @@ classifiers = [ "Intended Audience :: Developers", "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: 3.12", "Topic :: Software Development :: Libraries :: Python Modules", ] dependencies = [ diff --git a/src/codext/VERSION.txt b/src/codext/VERSION.txt index 701a6a4..be2c181 100644 --- a/src/codext/VERSION.txt +++ b/src/codext/VERSION.txt @@ -1 +1 @@ -1.15.4 +1.15.5 diff --git a/src/codext/__common__.py b/src/codext/__common__.py index 275dafd..7ad45d9 100644 --- a/src/codext/__common__.py +++ b/src/codext/__common__.py @@ -200,8 +200,9 @@ def _input(infile): def _set_exc(name, etype="ValueError"): if not hasattr(builtins, name): - exec(f"class {name}({etype}): __module__ = 'builtins'") - setattr(builtins, name, locals()[name]) + ns = {} + exec(f"class {name}({etype}): __module__ = 'builtins'", {}, ns) + setattr(builtins, name, ns[name]) _set_exc("InputSizeLimitError") _set_exc("ParameterError") From 6b4da6a6ef5affc1083dbeb3b60613d5c4293be1 Mon Sep 17 00:00:00 2001 From: dhondta Date: Tue, 7 Jan 2025 00:09:11 +0100 Subject: [PATCH 88/97] Fixed dependency to removed crypt module --- pyproject.toml | 1 + src/codext/hashing/crypt.py | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index f8ad01b..2323ece 100644 --- a/pyproject.toml +++ 
b/pyproject.toml @@ -26,6 +26,7 @@ classifiers = [ "Topic :: Software Development :: Libraries :: Python Modules", ] dependencies = [ + "crypt-r; python_version >= '3.13'", "markdown2>=2.4.0", ] dynamic = ["version"] diff --git a/src/codext/hashing/crypt.py b/src/codext/hashing/crypt.py index 0d44d8e..eddc668 100644 --- a/src/codext/hashing/crypt.py +++ b/src/codext/hashing/crypt.py @@ -12,7 +12,10 @@ if UNIX: - import crypt + try: + import crypt + except ImportError: + import crypt_r as crypt METHODS = [x[7:].lower() for x in crypt.__dict__ if x.startswith("METHOD_")] From 9811df6922b7abdb2252289c104ff09a508b3fbb Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 6 Jan 2025 23:11:03 +0000 Subject: [PATCH 89/97] Updated coverage.svg --- docs/coverage.svg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/coverage.svg b/docs/coverage.svg index 8cef4a9..1006657 100644 --- a/docs/coverage.svg +++ b/docs/coverage.svg @@ -1 +1 @@ -coverage: 99.04%coverage99.04% \ No newline at end of file +coverage: 99.11%coverage99.11% \ No newline at end of file From 46748f226a37f67f6a9e8f5048ed25d30d89d257 Mon Sep 17 00:00:00 2001 From: dhondta Date: Mon, 9 Jun 2025 17:49:32 +0200 Subject: [PATCH 90/97] Fixed dependency to removed crypt module (2) --- pyproject.toml | 2 +- src/codext/hashing/crypt.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 2323ece..849d94a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ classifiers = [ "Topic :: Software Development :: Libraries :: Python Modules", ] dependencies = [ - "crypt-r; python_version >= '3.13'", + "legacycrypt; python_version >= '3.13'", "markdown2>=2.4.0", ] dynamic = ["version"] diff --git a/src/codext/hashing/crypt.py b/src/codext/hashing/crypt.py index eddc668..9ef8ed5 100644 --- a/src/codext/hashing/crypt.py +++ b/src/codext/hashing/crypt.py @@ -15,7 +15,7 @@ try: import crypt except ImportError: - import crypt_r as 
crypt + import legacycrypt as crypt METHODS = [x[7:].lower() for x in crypt.__dict__ if x.startswith("METHOD_")] From cc05c071ec0b769da8bf6bcc428986d7a1b5f0ba Mon Sep 17 00:00:00 2001 From: dhondta Date: Mon, 9 Jun 2025 18:04:15 +0200 Subject: [PATCH 91/97] Fixed dependency to removed crypt module (2) --- ...python-package.yml => publish-package.yml} | 29 ++++++++++++++- .github/workflows/pypi-publish.yml | 37 ------------------- src/codext/VERSION.txt | 2 +- src/codext/__common__.py | 8 ++-- tests/test_manual.py | 5 ++- 5 files changed, 37 insertions(+), 44 deletions(-) rename .github/workflows/{python-package.yml => publish-package.yml} (71%) delete mode 100644 .github/workflows/pypi-publish.yml diff --git a/.github/workflows/python-package.yml b/.github/workflows/publish-package.yml similarity index 71% rename from .github/workflows/python-package.yml rename to .github/workflows/publish-package.yml index bdf7f9c..8d9914d 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/publish-package.yml @@ -19,7 +19,7 @@ jobs: fail-fast: false matrix: os: [ubuntu-latest] - python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"] + python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] steps: - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} @@ -77,3 +77,30 @@ jobs: with: github_token: ${{ secrets.github_token }} branch: ${{ github.ref }} + deploy: + runs-on: ubuntu-latest + if: ${{ github.event.workflow_run.conclusion == 'success' }} + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: 0 + - name: Check for version change + uses: dorny/paths-filter@v2 + id: filter + with: + filters: | + version: + - '**/VERSION.txt' + - if: steps.filter.outputs.version == 'true' + name: Cleanup README + run: | + sed -ri 's/^(##*)\s*:.*:\s*/\1 /g' README.md + awk '{if (match($0,"## Supporters")) exit; print}' README.md > README + mv -f README README.md + - run: python3 -m pip install --upgrade build && python3 -m build + - 
name: Upload ${{ env.package }} to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + password: ${{ secrets.PYPI_API_TOKEN }} + verbose: true + verify_metadata: false diff --git a/.github/workflows/pypi-publish.yml b/.github/workflows/pypi-publish.yml deleted file mode 100644 index 392e026..0000000 --- a/.github/workflows/pypi-publish.yml +++ /dev/null @@ -1,37 +0,0 @@ -# This workflow will deploy the Python package to PyPi.org - -name: deploy - -env: - package: codext - -on: - push: - branches: - - main - paths: - - '**/VERSION.txt' - workflow_run: - workflows: ["build"] - types: [completed] - -jobs: - deploy: - runs-on: ubuntu-latest - if: ${{ github.event.workflow_run.conclusion == 'success' }} - steps: - - uses: actions/checkout@v3 - with: - fetch-depth: 0 - - name: Cleanup README - run: | - sed -ri 's/^(##*)\s*:.*:\s*/\1 /g' README.md - awk '{if (match($0,"## Supporters")) exit; print}' README.md > README - mv -f README README.md - - run: python3 -m pip install --upgrade build && python3 -m build - - name: Upload ${{ env.package }} to PyPI - uses: pypa/gh-action-pypi-publish@release/v1 - with: - password: ${{ secrets.PYPI_API_TOKEN }} - verbose: true - verify_metadata: false diff --git a/src/codext/VERSION.txt b/src/codext/VERSION.txt index be2c181..ab826b5 100644 --- a/src/codext/VERSION.txt +++ b/src/codext/VERSION.txt @@ -1 +1 @@ -1.15.5 +1.15.6 diff --git a/src/codext/__common__.py b/src/codext/__common__.py index 7ad45d9..3042950 100644 --- a/src/codext/__common__.py +++ b/src/codext/__common__.py @@ -370,7 +370,7 @@ def add_macro(mname, *encodings): :param mname: macro name :param encodings: encoding names of the encodings to be chained with the macro """ - global PERS_MACROS + global PERS_MACROS # noqa: F824 # check for name clash with alreday existing macros and codecs if mname in MACROS or mname in PERS_MACROS: raise ValueError("Macro name already exists") @@ -630,7 +630,7 @@ def __get_value(token, position, case_changed=False): def 
clear(): """ Clear codext's local registry of search functions. """ - global __codecs_registry, MACROS, PERS_MACROS + global __codecs_registry, MACROS, PERS_MACROS # noqa: F824 __codecs_registry, MACROS, PERS_MACROS = [], {}, {} codecs.clear = clear @@ -733,7 +733,7 @@ def list_macros(): def remove(name): """ Remove all search functions matching the input encoding name from codext's local registry or any macro with the given name. """ - global __codecs_registry, MACROS, PERS_MACROS + global __codecs_registry, MACROS, PERS_MACROS # noqa: F824 tbr = [] for search_function in __codecs_registry: if search_function(name) is not None: @@ -764,7 +764,7 @@ def remove(name): def reset(): """ Reset codext's local registry of search functions and macros. """ - global __codecs_registry, CODECS_REGISTRY, MACROS, PERS_MACROS + global __codecs_registry, CODECS_REGISTRY, MACROS, PERS_MACROS # noqa: F824 clear() d = os.path.dirname(__file__) for pkg in sorted(os.listdir(d)): diff --git a/tests/test_manual.py b/tests/test_manual.py index bed4884..c6e3c74 100644 --- a/tests/test_manual.py +++ b/tests/test_manual.py @@ -125,7 +125,10 @@ def test_codec_hash_functions(self): self.assertIsNotNone(codecs.encode(STR, h)) self.assertRaises(NotImplementedError, codecs.decode, STR, h) if UNIX: - import crypt + try: + import crypt + except ImportError: + import legacycrypt as crypt METHODS = [x[7:].lower() for x in crypt.__dict__ if x.startswith("METHOD_")] for m in METHODS: h = "crypt-" + m From f1b9b6b250effd29010fde717ea5801716df40bf Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 9 Jun 2025 16:12:13 +0000 Subject: [PATCH 92/97] Updated coverage.svg --- ...publish-package.yml => python-package.yml} | 25 +++++++++++-------- docs/coverage.svg | 2 +- 2 files changed, 16 insertions(+), 11 deletions(-) rename .github/workflows/{publish-package.yml => python-package.yml} (79%) diff --git a/.github/workflows/publish-package.yml b/.github/workflows/python-package.yml similarity 
index 79% rename from .github/workflows/publish-package.yml rename to .github/workflows/python-package.yml index 8d9914d..bbf5a50 100644 --- a/.github/workflows/publish-package.yml +++ b/.github/workflows/python-package.yml @@ -26,18 +26,14 @@ jobs: uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} + - name: Install pandoc + run: sudo apt-get install -y pandoc - name: Install ${{ env.package }} run: | python -m pip install --upgrade pip - python -m pip install flake8 pytest pytest-cov pytest-pythonpath coverage + python -m pip install pytest pytest-cov pytest-pythonpath coverage pip install -r requirements.txt pip install . - - name: Lint with flake8 - run: | - # stop the build if there are Python syntax errors or undefined names - flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics - # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide - flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - name: Test ${{ env.package }} with pytest run: | pytest --cov=$package @@ -48,6 +44,12 @@ jobs: cov_badge_path: docs/coverage.svg steps: - uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: "3.12" + - name: Install pandoc + run: sudo apt-get install -y pandoc notification-daemon - name: Install ${{ env.package }} run: | python -m pip install --upgrade pip @@ -79,7 +81,7 @@ jobs: branch: ${{ github.ref }} deploy: runs-on: ubuntu-latest - if: ${{ github.event.workflow_run.conclusion == 'success' }} + needs: coverage steps: - uses: actions/checkout@v3 with: @@ -97,8 +99,11 @@ jobs: sed -ri 's/^(##*)\s*:.*:\s*/\1 /g' README.md awk '{if (match($0,"## Supporters")) exit; print}' README.md > README mv -f README README.md - - run: python3 -m pip install --upgrade build && python3 -m build - - name: Upload ${{ env.package }} to PyPI + - if: steps.filter.outputs.version == 'true' + name: Build ${{ 
env.package }} package + run: python3 -m pip install --upgrade build && python3 -m build + - if: steps.filter.outputs.version == 'true' + name: Upload ${{ env.package }} to PyPi uses: pypa/gh-action-pypi-publish@release/v1 with: password: ${{ secrets.PYPI_API_TOKEN }} diff --git a/docs/coverage.svg b/docs/coverage.svg index 1006657..4d30c44 100644 --- a/docs/coverage.svg +++ b/docs/coverage.svg @@ -1 +1 @@ -coverage: 99.11%coverage99.11% \ No newline at end of file +coverage: 98.90%coverage98.90% \ No newline at end of file From dac31601077326f54a3dad91c3d279a0a91c01ee Mon Sep 17 00:00:00 2001 From: dhondta Date: Mon, 9 Jun 2025 18:20:01 +0200 Subject: [PATCH 93/97] New release --- .github/workflows/python-package.yml | 4 ---- src/codext/VERSION.txt | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index bbf5a50..85432f3 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -26,8 +26,6 @@ jobs: uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - - name: Install pandoc - run: sudo apt-get install -y pandoc - name: Install ${{ env.package }} run: | python -m pip install --upgrade pip @@ -48,8 +46,6 @@ jobs: uses: actions/setup-python@v4 with: python-version: "3.12" - - name: Install pandoc - run: sudo apt-get install -y pandoc notification-daemon - name: Install ${{ env.package }} run: | python -m pip install --upgrade pip diff --git a/src/codext/VERSION.txt b/src/codext/VERSION.txt index ab826b5..d86159f 100644 --- a/src/codext/VERSION.txt +++ b/src/codext/VERSION.txt @@ -1 +1 @@ -1.15.6 +1.15.7 From 3859ebe131ccffb4805b4747af3b174014271449 Mon Sep 17 00:00:00 2001 From: dhondta Date: Mon, 16 Jun 2025 23:50:52 +0200 Subject: [PATCH 94/97] Fixed #11 --- src/codext/VERSION.txt | 2 +- src/codext/__common__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/codext/VERSION.txt 
b/src/codext/VERSION.txt index d86159f..51c7561 100644 --- a/src/codext/VERSION.txt +++ b/src/codext/VERSION.txt @@ -1 +1 @@ -1.15.7 +1.15.8 diff --git a/src/codext/__common__.py b/src/codext/__common__.py index 3042950..ae92325 100644 --- a/src/codext/__common__.py +++ b/src/codext/__common__.py @@ -261,7 +261,7 @@ def getregentry(encoding): while True: try: g = m.group(i) or "" - if g.isdigit() and not g.startswith("0") and "".join(set(g)) != "01": + if g.isdigit() and not g.startswith("0") and (re.match(r"10+", g) or "".join(set(g)) != "01"): g = int(g) args += [g] i += 1 From a1c0eea919b8ab8b13e1c91e84f3d724536c7c75 Mon Sep 17 00:00:00 2001 From: dhondta Date: Wed, 18 Jun 2025 00:17:09 +0200 Subject: [PATCH 95/97] Fixed README --- README.md | 686 +++++++++++++++++++++++++++--------------------------- 1 file changed, 343 insertions(+), 343 deletions(-) diff --git a/README.md b/README.md index 35aa6c2..58c1c9d 100644 --- a/README.md +++ b/README.md @@ -1,343 +1,343 @@ -

-

CodExt Tweet

-

Encode/decode anything.

- -[![PyPi](https://img.shields.io/pypi/v/codext.svg)](https://pypi.python.org/pypi/codext/) -[![Read The Docs](https://readthedocs.org/projects/python-codext/badge/?version=latest)](https://python-codext.readthedocs.io/en/latest/?badge=latest) -[![Build Status](https://github.com/dhondta/python-codext/actions/workflows/python-package.yml/badge.svg)](https://github.com/dhondta/python-codext/actions/workflows/python-package.yml) -[![Coverage Status](https://raw.githubusercontent.com/dhondta/python-codext/main/docs/coverage.svg)](#) -[![Python Versions](https://img.shields.io/pypi/pyversions/codext.svg)](https://pypi.python.org/pypi/codext/) -[![Known Vulnerabilities](https://snyk.io/test/github/dhondta/python-codext/badge.svg?targetFile=requirements.txt)](https://snyk.io/test/github/dhondta/python-codext?targetFile=requirements.txt) -[![DOI](https://zenodo.org/badge/236679865.svg)](https://zenodo.org/badge/latestdoi/236679865) -[![License](https://img.shields.io/pypi/l/codext.svg)](https://pypi.python.org/pypi/codext/) - -[**CodExt**](https://github.com/dhondta/python-codext) is a (Python2-3 compatible) library that extends the native [`codecs`](https://docs.python.org/3/library/codecs.html) library (namely for adding new custom encodings and character mappings) and provides **120+ new codecs**, hence its name combining *CODecs EXTension*. It also features a **guess mode** for decoding multiple layers of encoding and **CLI tools** for convenience. - -```sh -$ pip install codext -``` - -Want to contribute a new codec ? | Want to contribute a new macro ? -:----------------------------------:|:------------------------------------: -Check the [documentation](https://python-codext.readthedocs.io/en/latest/howto.html) first
Then [PR](https://github.com/dhondta/python-codext/pulls) your new codec | [PR](https://github.com/dhondta/python-codext/pulls) your updated version of [`macros.json`](https://github.com/dhondta/python-codext/blob/main/codext/macros.json) - -## :mag: Demonstrations - -

Using CodExt from the command line

-

Using base tools from the command line

-

Using the unbase command line tool

- -## :computer: Usage (main CLI tool) Tweet on codext - -```session -$ codext -i test.txt encode dna-1 -GTGAGCGGGTATGTGA - -$ echo -en "test" | codext encode morse -- . ... - - -$ echo -en "test" | codext encode braille -⠞⠑⠎⠞ - -$ echo -en "test" | codext encode base100 -👫👜👪👫 -``` - -### Chaining codecs - -```sh -$ echo -en "Test string" | codext encode reverse -gnirts tseT - -$ echo -en "Test string" | codext encode reverse morse ---. -. .. .-. - ... / - ... . - - -$ echo -en "Test string" | codext encode reverse morse dna-2 -AGTCAGTCAGTGAGAAAGTCAGTGAGAAAGTGAGTGAGAAAGTGAGTCAGTGAGAAAGTCAGAAAGTGAGTGAGTGAGAAAGTTAGAAAGTCAGAAAGTGAGTGAGTGAGAAAGTGAGAAAGTC - -$ echo -en "Test string" | codext encode reverse morse dna-2 octal -101107124103101107124103101107124107101107101101101107124103101107124107101107101101101107124107101107124107101107101101101107124107101107124103101107124107101107101101101107124103101107101101101107124107101107124107101107124107101107101101101107124124101107101101101107124103101107101101101107124107101107124107101107124107101107101101101107124107101107101101101107124103 - -$ echo -en "AGTCAGTCAGTGAGAAAGTCAGTGAGAAAGTGAGTGAGAAAGTGAGTCAGTGAGAAAGTCAGAAAGTGAGTGAGTGAGAAAGTTAGAAAGTCAGAAAGTGAGTGAGTGAGAAAGTGAGAAAGTC" | codext -d dna-2 morse reverse -test string -``` - -### Using macros - -```sh -$ codext add-macro my-encoding-chain gzip base63 lzma base64 - -$ codext list macros -example-macro, my-encoding-chain - -$ echo -en "Test string" | codext encode my-encoding-chain -CQQFAF0AAIAAABuTgySPa7WaZC5Sunt6FS0ko71BdrYE8zHqg91qaqadZIR2LafUzpeYDBalvE///ug4AA== - -$ codext remove-macro my-encoding-chain - -$ codext list macros -example-macro -``` - -## :computer: Usage (base CLI tool) Tweet on unbase - -```session -$ echo "Test string !" | base122 -*.7!ft9�-f9Â - -$ echo "Test string !" | base91 -"ONK;WDZM%Z%xE7L - -$ echo "Test string !" | base91 | base85 -B2P|BJ6A+nO(j|-cttl% - -$ echo "Test string !" 
| base91 | base85 | base36 | base58-flickr -QVx5tvgjvCAkXaMSuKoQmCnjeCV1YyyR3WErUUErFf - -$ echo "Test string !" | base91 | base85 | base36 | base58-flickr | base58-flickr -d | base36 -d | base85 -d | base91 -d -Test string ! -``` - -```session -$ echo "Test string !" | base91 | base85 | base36 | base58-flickr | unbase -m 3 -Test string ! - -$ echo "Test string !" | base91 | base85 | base36 | base58-flickr | unbase -f Test -Test string ! -``` - -## :computer: Usage (Python) - -Getting the list of available codecs: - -```python ->>> import codext - ->>> codext.list() -['ascii85', 'base85', 'base100', 'base122', ..., 'tomtom', 'dna', 'html', 'markdown', 'url', 'resistor', 'sms', 'whitespace', 'whitespace-after-before'] - ->>> codext.encode("this is a test", "base58-bitcoin") -'jo91waLQA1NNeBmZKUF' - ->>> codext.encode("this is a test", "base58-ripple") -'jo9rA2LQwr44eBmZK7E' - ->>> codext.encode("this is a test", "base58-url") -'JN91Wzkpa1nnDbLyjtf' - ->>> codecs.encode("this is a test", "base100") -'👫👟👠👪🐗👠👪🐗👘🐗👫👜👪👫' - ->>> codecs.decode("👫👟👠👪🐗👠👪🐗👘🐗👫👜👪👫", "base100") -'this is a test' - ->>> for i in range(8): - print(codext.encode("this is a test", "dna-%d" % (i + 1))) -GTGAGCCAGCCGGTATACAAGCCGGTATACAAGCAGACAAGTGAGCGGGTATGTGA -CTCACGGACGGCCTATAGAACGGCCTATAGAACGACAGAACTCACGCCCTATCTCA -ACAGATTGATTAACGCGTGGATTAACGCGTGGATGAGTGGACAGATAAACGCACAG -AGACATTCATTAAGCGCTCCATTAAGCGCTCCATCACTCCAGACATAAAGCGAGAC -TCTGTAAGTAATTCGCGAGGTAATTCGCGAGGTAGTGAGGTCTGTATTTCGCTCTG -TGTCTAACTAATTGCGCACCTAATTGCGCACCTACTCACCTGTCTATTTGCGTGTC -GAGTGCCTGCCGGATATCTTGCCGGATATCTTGCTGTCTTGAGTGCGGGATAGAGT -CACTCGGTCGGCCATATGTTCGGCCATATGTTCGTCTGTTCACTCGCCCATACACT ->>> codext.decode("GTGAGCCAGCCGGTATACAAGCCGGTATACAAGCAGACAAGTGAGCGGGTATGTGA", "dna-1") -'this is a test' - ->>> codecs.encode("this is a test", "morse") -'- .... .. ... / .. ... / .- / - . ... -' - ->>> codecs.decode("- .... .. ... / .. ... / .- / - . ... 
-", "morse") -'this is a test' - ->>> with open("morse.txt", 'w', encoding="morse") as f: - f.write("this is a test") -14 - ->>> with open("morse.txt",encoding="morse") as f: - f.read() -'this is a test' - ->>> codext.decode(""" - = - X - : - x - n - r - y - Y - y - p - a - ` - n - | - a -o - h - ` - g - o - z """, "whitespace-after+before") -'CSC{not_so_invisible}' - ->>> print(codext.encode("An example test string", "baudot-tape")) -***.** - . * -***.* -* . - .* -* .* - . * -** .* -***.** -** .** - .* -* . -* *. * - .* -* *. -* *. * -* . -* *. -* *. * -***. - *.* -***.* - * .* -``` - -## :page_with_curl: List of codecs - -#### [BaseXX](https://python-codext.readthedocs.io/en/latest/enc/base.html) - -- [X] `base1`: useless, but for the sake of completeness -- [X] `base2`: simple conversion to binary (with a variant with a reversed alphabet) -- [X] `base3`: conversion to ternary (with a variant with a reversed alphabet) -- [X] `base4`: conversion to quarternary (with a variant with a reversed alphabet) -- [X] `base8`: simple conversion to octal (with a variant with a reversed alphabet) -- [X] `base10`: simple conversion to decimal -- [X] `base11`: conversion to digits with a "*a*" -- [X] `base16`: simple conversion to hexadecimal (with a variant holding an alphabet with digits and letters inverted) -- [X] `base26`: conversion to alphabet letters -- [X] `base32`: classical conversion according to the RFC4648 with all its variants ([zbase32](https://philzimmermann.com/docs/human-oriented-base-32-encoding.txt), extended hexadecimal, [geohash](https://en.wikipedia.org/wiki/Geohash), [Crockford](https://www.crockford.com/base32.html)) -- [X] `base36`: [Base36](https://en.wikipedia.org/wiki/Base36) conversion to letters and digits (with a variant inverting both groups) -- [X] `base45`: [Base45](https://datatracker.ietf.org/doc/html/draft-faltstrom-base45-04.txt) DRAFT algorithm (with a variant inverting letters and digits) -- [X] `base58`: multiple versions of 
[Base58](https://en.bitcoinwiki.org/wiki/Base58) (bitcoin, flickr, ripple) -- [X] `base62`: [Base62](https://en.wikipedia.org/wiki/Base62) conversion to lower- and uppercase letters and digits (with a variant with letters and digits inverted) -- [X] `base63`: similar to `base62` with the "`_`" added -- [X] `base64`: classical conversion according to RFC4648 with its variant URL (or *file*) (it also holds a variant with letters and digits inverted) -- [X] `base67`: custom conversion using some more special characters (also with a variant with letters and digits inverted) -- [X] `base85`: all variants of Base85 ([Ascii85](https://fr.wikipedia.org/wiki/Ascii85), [z85](https://rfc.zeromq.org/spec/32), [Adobe](https://dencode.com/string/ascii85), [(x)btoa](https://dencode.com/string/ascii85), [RFC1924](https://datatracker.ietf.org/doc/html/rfc1924), [XML](https://datatracker.ietf.org/doc/html/draft-kwiatkowski-base85-for-xml-00)) -- [X] `base91`: [Base91](http://base91.sourceforge.net) custom conversion -- [X] `base100` (or *emoji*): [Base100](https://github.com/AdamNiederer/base100) custom conversion -- [X] `base122`: [Base100](http://blog.kevinalbs.com/base122) custom conversion -- [X] `base-genericN`: see [base encodings](https://python-codext.readthedocs.io/en/latest/enc/base.html) ; supports any possible base - -This category also contains `ascii85`, `adobe`, `[x]btoa`, `zeromq` with the `base85` codec. - -#### [Binary](https://python-codext.readthedocs.io/en/latest/enc/binary.html) - -- [X] `baudot`: supports CCITT-1, CCITT-2, EU/FR, ITA1, ITA2, MTK-2 (Python3 only), UK, ... 
-- [X] `baudot-spaced`: variant of `baudot` ; groups of 5 bits are whitespace-separated -- [X] `baudot-tape`: variant of `baudot` ; outputs a string that looks like a perforated tape -- [X] `bcd`: _Binary Coded Decimal_, encodes characters from their (zero-left-padded) ordinals -- [X] `bcd-extended0`: variant of `bcd` ; encodes characters from their (zero-left-padded) ordinals using prefix bits `0000` -- [X] `bcd-extended1`: variant of `bcd` ; encodes characters from their (zero-left-padded) ordinals using prefix bits `1111` -- [X] `excess3`: uses Excess-3 (aka Stibitz code) binary encoding to convert characters from their ordinals -- [X] `gray`: aka reflected binary code -- [X] `manchester`: XORes each bit of the input with `01` -- [X] `manchester-inverted`: variant of `manchester` ; XORes each bit of the input with `10` -- [X] `rotateN`: rotates characters by the specified number of bits (*N* belongs to [1, 7] ; Python 3 only) - -#### [Common](https://python-codext.readthedocs.io/en/latest/enc/common.html) - -- [X] `a1z26`: keeps words whitespace-separated and uses a custom character separator -- [X] `cases`: set of case-related encodings (including camel-, kebab-, lower-, pascal-, upper-, snake- and swap-case, slugify, capitalize, title) -- [X] `dummy`: set of simple encodings (including integer, replace, reverse, word-reverse, substite and strip-spaces) -- [X] `octal`: dummy octal conversion (converts to 3-digits groups) -- [X] `octal-spaced`: variant of `octal` ; dummy octal conversion, handling whitespace separators -- [X] `ordinal`: dummy character ordinals conversion (converts to 3-digits groups) -- [X] `ordinal-spaced`: variant of `ordinal` ; dummy character ordinals conversion, handling whitespace separators - -#### [Compression](https://python-codext.readthedocs.io/en/latest/enc/compressions.html) - -- [X] `gzip`: standard Gzip compression/decompression -- [X] `lz77`: compresses the given data with the algorithm of Lempel and Ziv of 1977 -- [X] `lz78`: 
compresses the given data with the algorithm of Lempel and Ziv of 1978 -- [X] `pkzip_deflate`: standard Zip-deflate compression/decompression -- [X] `pkzip_bzip2`: standard BZip2 compression/decompression -- [X] `pkzip_lzma`: standard LZMA compression/decompression - -> :warning: Compression functions are of course definitely **NOT** encoding functions ; they are implemented for leveraging the `.encode(...)` API from `codecs`. - -#### [Cryptography](https://python-codext.readthedocs.io/en/latest/enc/crypto.html) - -- [X] `affine`: aka Affine Cipher -- [X] `atbash`: aka Atbash Cipher -- [X] `bacon`: aka Baconian Cipher -- [X] `barbie-N`: aka Barbie Typewriter (*N* belongs to [1, 4]) -- [X] `citrix`: aka Citrix CTX1 password encoding -- [X] `railfence`: aka Rail Fence Cipher -- [X] `rotN`: aka Caesar cipher (*N* belongs to [1,25]) -- [X] `scytaleN`: encrypts using the number of letters on the rod (*N* belongs to [1,[) -- [X] `shiftN`: shift ordinals (*N* belongs to [1,255]) -- [X] `xorN`: XOR with a single byte (*N* belongs to [1,255]) - -> :warning: Crypto functions are of course definitely **NOT** encoding functions ; they are implemented for leveraging the `.encode(...)` API from `codecs`. 
- -#### [Hashing](https://python-codext.readthedocs.io/en/latest/enc/hashing.html) - -- [X] `blake`: includes BLAKE2b and BLAKE2s (Python 3 only ; relies on `hashlib`) -- [X] `checksums`: includes Adler32 and CRC32 (relies on `zlib`) -- [X] `crypt`: Unix's crypt hash for passwords (Python 3 and Unix only ; relies on `crypt`) -- [X] `md`: aka Message Digest ; includes MD4 and MD5 (relies on `hashlib`) -- [X] `sha`: aka Secure Hash Algorithms ; includes SHA1, 224, 256, 384, 512 (Python2/3) but also SHA3-224, -256, -384 and -512 (Python 3 only ; relies on `hashlib`) -- [X] `shake`: aka SHAKE hashing (Python 3 only ; relies on `hashlib`) - -> :warning: Hash functions are of course definitely **NOT** encoding functions ; they are implemented for convenience with the `.encode(...)` API from `codecs` and useful for chaning codecs. - -#### [Languages](https://python-codext.readthedocs.io/en/latest/enc/languages.html) - -- [X] `braille`: well-known braille language (Python 3 only) -- [X] `ipsum`: aka lorem ipsum -- [X] `galactic`: aka galactic alphabet or Minecraft enchantment language (Python 3 only) -- [X] `leetspeak`: based on minimalistic elite speaking rules -- [X] `morse`: uses whitespace as a separator -- [X] `navajo`: only handles letters (not full words from the Navajo dictionary) -- [X] `radio`: aka NATO or radio phonetic alphabet -- [X] `southpark`: converts letters to Kenny's language from Southpark (whitespace is also handled) -- [X] `southpark-icase`: case insensitive variant of `southpark` -- [X] `tap`: converts text to tap/knock code, commonly used by prisoners -- [X] `tomtom`: similar to `morse`, using slashes and backslashes - -#### [Others](https://python-codext.readthedocs.io/en/latest/enc/others.html) - -- [X] `dna`: implements the 8 rules of DNA sequences (N belongs to [1,8]) -- [X] `letter-indices`: encodes consonants and/or vowels with their corresponding indices -- [X] `markdown`: unidirectional encoding from Markdown to HTML - -#### 
[Steganography](https://python-codext.readthedocs.io/en/latest/enc/stegano.html) - -- [X] `hexagram`: uses Base64 and encodes the result to a charset of [I Ching hexagrams](https://en.wikipedia.org/wiki/Hexagram_%28I_Ching%29) (as implemented [here](https://github.com/qntm/hexagram-encode)) -- [X] `klopf`: aka Klopf code ; Polybius square with trivial alphabetical distribution -- [X] `resistor`: aka resistor color codes -- [X] `rick`: aka Rick cipher (in reference to Rick Astley's song "*Never gonna give you up*") -- [X] `sms`: also called _T9 code_ ; uses "`-`" as a separator for encoding, "`-`" or "`_`" or whitespace for decoding -- [X] `whitespace`: replaces bits with whitespaces and tabs -- [X] `whitespace_after_before`: variant of `whitespace` ; encodes characters as new characters with whitespaces before and after according to an equation described in the codec name (e.g. "`whitespace+2*after-3*before`") - -#### [Web](https://python-codext.readthedocs.io/en/latest/enc/web.html) - -- [X] `html`: implements entities according to [this reference](https://dev.w3.org/html5/html-author/charref) -- [X] `url`: aka URL encoding - - -## :clap: Supporters - -[![Stargazers repo roster for @dhondta/python-codext](https://reporoster.com/stars/dark/dhondta/python-codext)](https://github.com/dhondta/python-codext/stargazers) - -[![Forkers repo roster for @dhondta/python-codext](https://reporoster.com/forks/dark/dhondta/python-codext)](https://github.com/dhondta/python-codext/network/members) - -

Back to top

+

+

CodExt Tweet

+

Encode/decode anything.

+ +[![PyPi](https://img.shields.io/pypi/v/codext.svg)](https://pypi.python.org/pypi/codext/) +[![Read The Docs](https://readthedocs.org/projects/python-codext/badge/?version=latest)](https://python-codext.readthedocs.io/en/latest/?badge=latest) +[![Build Status](https://github.com/dhondta/python-codext/actions/workflows/python-package.yml/badge.svg)](https://github.com/dhondta/python-codext/actions/workflows/python-package.yml) +[![Coverage Status](https://raw.githubusercontent.com/dhondta/python-codext/main/docs/coverage.svg)](#) +[![Python Versions](https://img.shields.io/pypi/pyversions/codext.svg)](https://pypi.python.org/pypi/codext/) +[![Known Vulnerabilities](https://snyk.io/test/github/dhondta/python-codext/badge.svg?targetFile=requirements.txt)](https://snyk.io/test/github/dhondta/python-codext?targetFile=requirements.txt) +[![DOI](https://zenodo.org/badge/236679865.svg)](https://zenodo.org/badge/latestdoi/236679865) +[![License](https://img.shields.io/pypi/l/codext.svg)](https://pypi.python.org/pypi/codext/) + +[**CodExt**](https://github.com/dhondta/python-codext) is a (Python2-3 compatible) library that extends the native [`codecs`](https://docs.python.org/3/library/codecs.html) library (namely for adding new custom encodings and character mappings) and provides **120+ new codecs**, hence its name combining *CODecs EXTension*. It also features a **guess mode** for decoding multiple layers of encoding and **CLI tools** for convenience. + +```sh +$ pip install codext +``` + +Want to contribute a new codec ? | Want to contribute a new macro ? +:----------------------------------:|:------------------------------------: +Check the [documentation](https://python-codext.readthedocs.io/en/latest/howto.html) first
Then [PR](https://github.com/dhondta/python-codext/pulls) your new codec | [PR](https://github.com/dhondta/python-codext/pulls) your updated version of [`macros.json`](https://github.com/dhondta/python-codext/blob/main/codext/macros.json) + +## :mag: Demonstrations + +

Using CodExt from the command line

+

Using base tools from the command line

+

Using the unbase command line tool

+ +## :computer: Usage (main CLI tool) Tweet on codext + +```session +$ codext -i test.txt encode dna-1 +GTGAGCGGGTATGTGA + +$ echo -en "test" | codext encode morse +- . ... - + +$ echo -en "test" | codext encode braille +⠞⠑⠎⠞ + +$ echo -en "test" | codext encode base100 +👫👜👪👫 +``` + +### Chaining codecs + +```sh +$ echo -en "Test string" | codext encode reverse +gnirts tseT + +$ echo -en "Test string" | codext encode reverse morse +--. -. .. .-. - ... / - ... . - + +$ echo -en "Test string" | codext encode reverse morse dna-2 +AGTCAGTCAGTGAGAAAGTCAGTGAGAAAGTGAGTGAGAAAGTGAGTCAGTGAGAAAGTCAGAAAGTGAGTGAGTGAGAAAGTTAGAAAGTCAGAAAGTGAGTGAGTGAGAAAGTGAGAAAGTC + +$ echo -en "Test string" | codext encode reverse morse dna-2 octal +101107124103101107124103101107124107101107101101101107124103101107124107101107101101101107124107101107124107101107101101101107124107101107124103101107124107101107101101101107124103101107101101101107124107101107124107101107124107101107101101101107124124101107101101101107124103101107101101101107124107101107124107101107124107101107101101101107124107101107101101101107124103 + +$ echo -en "AGTCAGTCAGTGAGAAAGTCAGTGAGAAAGTGAGTGAGAAAGTGAGTCAGTGAGAAAGTCAGAAAGTGAGTGAGTGAGAAAGTTAGAAAGTCAGAAAGTGAGTGAGTGAGAAAGTGAGAAAGTC" | codext -d dna-2 morse reverse +test string +``` + +### Using macros + +```sh +$ codext add-macro my-encoding-chain gzip base63 lzma base64 + +$ codext list macros +example-macro, my-encoding-chain + +$ echo -en "Test string" | codext encode my-encoding-chain +CQQFAF0AAIAAABuTgySPa7WaZC5Sunt6FS0ko71BdrYE8zHqg91qaqadZIR2LafUzpeYDBalvE///ug4AA== + +$ codext remove-macro my-encoding-chain + +$ codext list macros +example-macro +``` + +## :computer: Usage (base CLI tool) Tweet on unbase + +```session +$ echo "Test string !" | base122 +*.7!ft9�-f9Â + +$ echo "Test string !" | base91 +"ONK;WDZM%Z%xE7L + +$ echo "Test string !" | base91 | base85 +B2P|BJ6A+nO(j|-cttl% + +$ echo "Test string !" 
| base91 | base85 | base36 | base58-flickr +QVx5tvgjvCAkXaMSuKoQmCnjeCV1YyyR3WErUUErFf + +$ echo "Test string !" | base91 | base85 | base36 | base58-flickr | base58-flickr -d | base36 -d | base85 -d | base91 -d +Test string ! +``` + +```session +$ echo "Test string !" | base91 | base85 | base36 | base58-flickr | unbase -m 3 +Test string ! + +$ echo "Test string !" | base91 | base85 | base36 | base58-flickr | unbase -f Test +Test string ! +``` + +## :computer: Usage (Python) + +Getting the list of available codecs: + +```python +>>> import codext + +>>> codext.list() +['ascii85', 'base85', 'base100', 'base122', ..., 'tomtom', 'dna', 'html', 'markdown', 'url', 'resistor', 'sms', 'whitespace', 'whitespace-after-before'] + +>>> codext.encode("this is a test", "base58-bitcoin") +'jo91waLQA1NNeBmZKUF' + +>>> codext.encode("this is a test", "base58-ripple") +'jo9rA2LQwr44eBmZK7E' + +>>> codext.encode("this is a test", "base58-url") +'JN91Wzkpa1nnDbLyjtf' + +>>> codecs.encode("this is a test", "base100") +'👫👟👠👪🐗👠👪🐗👘🐗👫👜👪👫' + +>>> codecs.decode("👫👟👠👪🐗👠👪🐗👘🐗👫👜👪👫", "base100") +'this is a test' + +>>> for i in range(8): + print(codext.encode("this is a test", "dna-%d" % (i + 1))) +GTGAGCCAGCCGGTATACAAGCCGGTATACAAGCAGACAAGTGAGCGGGTATGTGA +CTCACGGACGGCCTATAGAACGGCCTATAGAACGACAGAACTCACGCCCTATCTCA +ACAGATTGATTAACGCGTGGATTAACGCGTGGATGAGTGGACAGATAAACGCACAG +AGACATTCATTAAGCGCTCCATTAAGCGCTCCATCACTCCAGACATAAAGCGAGAC +TCTGTAAGTAATTCGCGAGGTAATTCGCGAGGTAGTGAGGTCTGTATTTCGCTCTG +TGTCTAACTAATTGCGCACCTAATTGCGCACCTACTCACCTGTCTATTTGCGTGTC +GAGTGCCTGCCGGATATCTTGCCGGATATCTTGCTGTCTTGAGTGCGGGATAGAGT +CACTCGGTCGGCCATATGTTCGGCCATATGTTCGTCTGTTCACTCGCCCATACACT +>>> codext.decode("GTGAGCCAGCCGGTATACAAGCCGGTATACAAGCAGACAAGTGAGCGGGTATGTGA", "dna-1") +'this is a test' + +>>> codecs.encode("this is a test", "morse") +'- .... .. ... / .. ... / .- / - . ... -' + +>>> codecs.decode("- .... .. ... / .. ... / .- / - . ... 
-", "morse") +'this is a test' + +>>> with open("morse.txt", 'w', encoding="morse") as f: + f.write("this is a test") +14 + +>>> with open("morse.txt",encoding="morse") as f: + f.read() +'this is a test' + +>>> codext.decode(""" + = + X + : + x + n + r + y + Y + y + p + a + ` + n + | + a +o + h + ` + g + o + z """, "whitespace-after+before") +'CSC{not_so_invisible}' + +>>> print(codext.encode("An example test string", "baudot-tape")) +***.** + . * +***.* +* . + .* +* .* + . * +** .* +***.** +** .** + .* +* . +* *. * + .* +* *. +* *. * +* . +* *. +* *. * +***. + *.* +***.* + * .* +``` + +## :page_with_curl: List of codecs + +#### [BaseXX](https://python-codext.readthedocs.io/en/latest/enc/base.html) + +- [X] `base1`: useless, but for the sake of completeness +- [X] `base2`: simple conversion to binary (with a variant with a reversed alphabet) +- [X] `base3`: conversion to ternary (with a variant with a reversed alphabet) +- [X] `base4`: conversion to quarternary (with a variant with a reversed alphabet) +- [X] `base8`: simple conversion to octal (with a variant with a reversed alphabet) +- [X] `base10`: simple conversion to decimal +- [X] `base11`: conversion to digits with a "*a*" +- [X] `base16`: simple conversion to hexadecimal (with a variant holding an alphabet with digits and letters inverted) +- [X] `base26`: conversion to alphabet letters +- [X] `base32`: classical conversion according to the RFC4648 with all its variants ([zbase32](https://philzimmermann.com/docs/human-oriented-base-32-encoding.txt), extended hexadecimal, [geohash](https://en.wikipedia.org/wiki/Geohash), [Crockford](https://www.crockford.com/base32.html)) +- [X] `base36`: [Base36](https://en.wikipedia.org/wiki/Base36) conversion to letters and digits (with a variant inverting both groups) +- [X] `base45`: [Base45](https://datatracker.ietf.org/doc/html/draft-faltstrom-base45-04.txt) DRAFT algorithm (with a variant inverting letters and digits) +- [X] `base58`: multiple versions of 
[Base58](https://en.bitcoinwiki.org/wiki/Base58) (bitcoin, flickr, ripple) +- [X] `base62`: [Base62](https://en.wikipedia.org/wiki/Base62) conversion to lower- and uppercase letters and digits (with a variant with letters and digits inverted) +- [X] `base63`: similar to `base62` with the "`_`" added +- [X] `base64`: classical conversion according to RFC4648 with its variant URL (or *file*) (it also holds a variant with letters and digits inverted) +- [X] `base67`: custom conversion using some more special characters (also with a variant with letters and digits inverted) +- [X] `base85`: all variants of Base85 ([Ascii85](https://fr.wikipedia.org/wiki/Ascii85), [z85](https://rfc.zeromq.org/spec/32), [Adobe](https://dencode.com/string/ascii85), [(x)btoa](https://dencode.com/string/ascii85), [RFC1924](https://datatracker.ietf.org/doc/html/rfc1924), [XML](https://datatracker.ietf.org/doc/html/draft-kwiatkowski-base85-for-xml-00)) +- [X] `base91`: [Base91](http://base91.sourceforge.net) custom conversion +- [X] `base100` (or *emoji*): [Base100](https://github.com/AdamNiederer/base100) custom conversion +- [X] `base122`: [Base100](http://blog.kevinalbs.com/base122) custom conversion +- [X] `base-genericN`: see [base encodings](https://python-codext.readthedocs.io/en/latest/enc/base.html) ; supports any possible base + +This category also contains `ascii85`, `adobe`, `[x]btoa`, `zeromq` with the `base85` codec. + +#### [Binary](https://python-codext.readthedocs.io/en/latest/enc/binary.html) + +- [X] `baudot`: supports CCITT-1, CCITT-2, EU/FR, ITA1, ITA2, MTK-2 (Python3 only), UK, ... 
+- [X] `baudot-spaced`: variant of `baudot` ; groups of 5 bits are whitespace-separated +- [X] `baudot-tape`: variant of `baudot` ; outputs a string that looks like a perforated tape +- [X] `bcd`: _Binary Coded Decimal_, encodes characters from their (zero-left-padded) ordinals +- [X] `bcd-extended0`: variant of `bcd` ; encodes characters from their (zero-left-padded) ordinals using prefix bits `0000` +- [X] `bcd-extended1`: variant of `bcd` ; encodes characters from their (zero-left-padded) ordinals using prefix bits `1111` +- [X] `excess3`: uses Excess-3 (aka Stibitz code) binary encoding to convert characters from their ordinals +- [X] `gray`: aka reflected binary code +- [X] `manchester`: XORes each bit of the input with `01` +- [X] `manchester-inverted`: variant of `manchester` ; XORes each bit of the input with `10` +- [X] `rotateN`: rotates characters by the specified number of bits (*N* belongs to [1, 7] ; Python 3 only) + +#### [Common](https://python-codext.readthedocs.io/en/latest/enc/common.html) + +- [X] `a1z26`: keeps words whitespace-separated and uses a custom character separator +- [X] `cases`: set of case-related encodings (including camel-, kebab-, lower-, pascal-, upper-, snake- and swap-case, slugify, capitalize, title) +- [X] `dummy`: set of simple encodings (including integer, replace, reverse, word-reverse, substite and strip-spaces) +- [X] `octal`: dummy octal conversion (converts to 3-digits groups) +- [X] `octal-spaced`: variant of `octal` ; dummy octal conversion, handling whitespace separators +- [X] `ordinal`: dummy character ordinals conversion (converts to 3-digits groups) +- [X] `ordinal-spaced`: variant of `ordinal` ; dummy character ordinals conversion, handling whitespace separators + +#### [Compression](https://python-codext.readthedocs.io/en/latest/enc/compressions.html) + +- [X] `gzip`: standard Gzip compression/decompression +- [X] `lz77`: compresses the given data with the algorithm of Lempel and Ziv of 1977 +- [X] `lz78`: 
compresses the given data with the algorithm of Lempel and Ziv of 1978 +- [X] `pkzip_deflate`: standard Zip-deflate compression/decompression +- [X] `pkzip_bzip2`: standard BZip2 compression/decompression +- [X] `pkzip_lzma`: standard LZMA compression/decompression + +> :warning: Compression functions are of course definitely **NOT** encoding functions ; they are implemented for leveraging the `.encode(...)` API from `codecs`. + +#### [Cryptography](https://python-codext.readthedocs.io/en/latest/enc/crypto.html) + +- [X] `affine`: aka Affine Cipher +- [X] `atbash`: aka Atbash Cipher +- [X] `bacon`: aka Baconian Cipher +- [X] `barbie-N`: aka Barbie Typewriter (*N* belongs to [1, 4]) +- [X] `citrix`: aka Citrix CTX1 password encoding +- [X] `railfence`: aka Rail Fence Cipher +- [X] `rotN`: aka Caesar cipher (*N* belongs to [1,25]) +- [X] `scytaleN`: encrypts using the number of letters on the rod (*N* belongs to [1,[) +- [X] `shiftN`: shift ordinals (*N* belongs to [1,255]) +- [X] `xorN`: XOR with a single byte (*N* belongs to [1,255]) + +> :warning: Crypto functions are of course definitely **NOT** encoding functions ; they are implemented for leveraging the `.encode(...)` API from `codecs`. 
+ +#### [Hashing](https://python-codext.readthedocs.io/en/latest/enc/hashing.html) + +- [X] `blake`: includes BLAKE2b and BLAKE2s (Python 3 only ; relies on `hashlib`) +- [X] `checksums`: includes Adler32 and CRC32 (relies on `zlib`) +- [X] `crypt`: Unix's crypt hash for passwords (Python 3 and Unix only ; relies on `crypt`) +- [X] `md`: aka Message Digest ; includes MD4 and MD5 (relies on `hashlib`) +- [X] `sha`: aka Secure Hash Algorithms ; includes SHA1, 224, 256, 384, 512 (Python2/3) but also SHA3-224, -256, -384 and -512 (Python 3 only ; relies on `hashlib`) +- [X] `shake`: aka SHAKE hashing (Python 3 only ; relies on `hashlib`) + +> :warning: Hash functions are of course definitely **NOT** encoding functions ; they are implemented for convenience with the `.encode(...)` API from `codecs` and useful for chaining codecs. + +#### [Languages](https://python-codext.readthedocs.io/en/latest/enc/languages.html) + +- [X] `braille`: well-known braille language (Python 3 only) +- [X] `ipsum`: aka lorem ipsum +- [X] `galactic`: aka galactic alphabet or Minecraft enchantment language (Python 3 only) +- [X] `leetspeak`: based on minimalistic elite speaking rules +- [X] `morse`: uses whitespace as a separator +- [X] `navajo`: only handles letters (not full words from the Navajo dictionary) +- [X] `radio`: aka NATO or radio phonetic alphabet +- [X] `southpark`: converts letters to Kenny's language from Southpark (whitespace is also handled) +- [X] `southpark-icase`: case insensitive variant of `southpark` +- [X] `tap`: converts text to tap/knock code, commonly used by prisoners +- [X] `tomtom`: similar to `morse`, using slashes and backslashes + +#### [Others](https://python-codext.readthedocs.io/en/latest/enc/others.html) + +- [X] `dna`: implements the 8 rules of DNA sequences (N belongs to [1,8]) +- [X] `letter-indices`: encodes consonants and/or vowels with their corresponding indices +- [X] `markdown`: unidirectional encoding from Markdown to HTML + +#### 
[Steganography](https://python-codext.readthedocs.io/en/latest/enc/stegano.html) + +- [X] `hexagram`: uses Base64 and encodes the result to a charset of [I Ching hexagrams](https://en.wikipedia.org/wiki/Hexagram_%28I_Ching%29) (as implemented [here](https://github.com/qntm/hexagram-encode)) +- [X] `klopf`: aka Klopf code ; Polybius square with trivial alphabetical distribution +- [X] `resistor`: aka resistor color codes +- [X] `rick`: aka Rick cipher (in reference to Rick Astley's song "*Never gonna give you up*") +- [X] `sms`: also called _T9 code_ ; uses "`-`" as a separator for encoding, "`-`" or "`_`" or whitespace for decoding +- [X] `whitespace`: replaces bits with whitespaces and tabs +- [X] `whitespace_after_before`: variant of `whitespace` ; encodes characters as new characters with whitespaces before and after according to an equation described in the codec name (e.g. "`whitespace+2*after-3*before`") + +#### [Web](https://python-codext.readthedocs.io/en/latest/enc/web.html) + +- [X] `html`: implements entities according to [this reference](https://dev.w3.org/html5/html-author/charref) +- [X] `url`: aka URL encoding + + +## :clap: Supporters + +[![Stargazers repo roster for @dhondta/python-codext](https://reporoster.com/stars/dark/dhondta/python-codext)](https://github.com/dhondta/python-codext/stargazers) + +[![Forkers repo roster for @dhondta/python-codext](https://reporoster.com/forks/dark/dhondta/python-codext)](https://github.com/dhondta/python-codext/network/members) + +

Back to top

From 935cd8c9100e7009bac2c67b406501937a1abac5 Mon Sep 17 00:00:00 2001 From: dhondta Date: Wed, 17 Sep 2025 22:41:28 +0200 Subject: [PATCH 96/97] Fixed some vulnerabilities --- .github/workflows/python-package.yml | 4 ++-- pyproject.toml | 4 ++-- pytest.ini | 2 +- requirements.txt | 3 ++- src/codext/VERSION.txt | 2 +- 5 files changed, 8 insertions(+), 7 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 85432f3..96f267f 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -29,7 +29,7 @@ jobs: - name: Install ${{ env.package }} run: | python -m pip install --upgrade pip - python -m pip install pytest pytest-cov pytest-pythonpath coverage + python -m pip install pytest pytest-cov coverage pip install -r requirements.txt pip install . - name: Test ${{ env.package }} with pytest @@ -49,7 +49,7 @@ jobs: - name: Install ${{ env.package }} run: | python -m pip install --upgrade pip - python -m pip install pytest pytest-cov pytest-pythonpath + python -m pip install pytest pytest-cov pip install -r requirements.txt pip install . 
- name: Make coverage badge for ${{ env.package }} diff --git a/pyproject.toml b/pyproject.toml index 849d94a..1644aee 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [build-system] -requires = ["setuptools>=61.0", "setuptools-scm"] +requires = ["setuptools>=80.0.0", "setuptools-scm"] build-backend = "setuptools.build_meta" [tool.setuptools.dynamic] @@ -27,7 +27,7 @@ classifiers = [ ] dependencies = [ "legacycrypt; python_version >= '3.13'", - "markdown2>=2.4.0", + "markdown2>=2.5.4", ] dynamic = ["version"] diff --git a/pytest.ini b/pytest.ini index ab4c198..fcccae1 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,2 +1,2 @@ [pytest] -python_paths = src +pythonpath = src diff --git a/requirements.txt b/requirements.txt index b5db972..51e438c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,2 @@ -markdown2>=2.4.0 +markdown2>=2.5.4 +legacycrypt; python_version >= '3.13' diff --git a/src/codext/VERSION.txt b/src/codext/VERSION.txt index 51c7561..054a2bd 100644 --- a/src/codext/VERSION.txt +++ b/src/codext/VERSION.txt @@ -1 +1 @@ -1.15.8 +1.15.9 From 5aaec705c823ea2f906a50b643ca3608684ac918 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 17 Sep 2025 20:43:39 +0000 Subject: [PATCH 97/97] Updated coverage.svg --- docs/coverage.svg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/coverage.svg b/docs/coverage.svg index 4d30c44..9f90515 100644 --- a/docs/coverage.svg +++ b/docs/coverage.svg @@ -1 +1 @@ -coverage: 98.90%coverage98.90% \ No newline at end of file +coverage: 99.04%coverage99.04% \ No newline at end of file