/usr/lib/python3/dist-packages/skytools/utf8.py is in python3-skytools 3.3-2.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
r"""UTF-8 sanitizer.

Python's UTF-8 parser is quite relaxed; this creates problems when
talking with other software that uses stricter parsers.

>>> _norm(safe_utf8_decode(b"foobar"))
(True, ['f', 'o', 'o', 'b', 'a', 'r'])
>>> _norm(safe_utf8_decode(b'X\0Z'))
(False, ['X', 65533, 'Z'])
>>> _norm(safe_utf8_decode(b'OK'))
(True, ['O', 'K'])
>>> _norm(safe_utf8_decode(b'X\xF1Y'))
(False, ['X', 65533, 'Y'])
>>> _norm_str(sanitize_unicode(u'\uD801\uDC01'))
[66561]
>>> sanitize_unicode(b'qwe')
Traceback (most recent call last):
...
TypeError: Need unicode string
"""
## these give different results in py27 and py35
# >>> _norm(safe_utf8_decode(b'X\xed\xa0\x80Y\xed\xb0\x89Z'))
# (False, ['X', 65533, 65533, 65533, 'Y', 65533, 65533, 65533, 'Z'])
# >>> _norm(safe_utf8_decode(b'X\xed\xa0\x80\xed\xb0\x89Z'))
# (False, ['X', 65533, 65533, 65533, 65533, 65533, 65533, 'Z'])
# from __future__ import division, absolute_import, print_function
import re
import codecs
# Python 2/3 compatibility: when the ``unichr`` builtin is missing
# (Python 3), alias ``unichr`` and ``unicode`` to ``chr`` and ``str``.
try:
    unichr
except NameError:
    unichr, unicode = chr, str  # noqa
def _norm_char(uchr):
    """Return a doctest-friendly form of a single character.

    Printable ASCII (code points 0x20 up to, but excluding, 0x7f) is
    returned as a one-character string.  Every other character is
    returned as its integer code point, so expected doctest output
    stays plain ASCII.
    """
    code = ord(uchr)
    if 0x20 <= code < 0x7f:
        return chr(code)
    return code
def _norm_str(ustr):
    """Return a list holding the _norm_char() form of each character."""
    return list(map(_norm_char, ustr))
def _norm(tup):
    """Normalize a ``(flag, string)`` pair for doctest comparison.

    The flag is passed through untouched; the string is expanded into
    a list by _norm_str().
    """
    valid, text = tup
    normalized = _norm_str(text)
    return (valid, normalized)
__all__ = ['safe_utf8_decode']

# Symbol substituted for invalid input; by default the same one that
# the 'replace' error handler uses (U+FFFD, decimal 65533).
REPLACEMENT_SYMBOL = u'\uFFFD'
def _fix_utf8(m):
    """Regex substitution callback: merge UTF16 surrogates, replace others.

    A two-character match is treated as a high/low surrogate pair and
    folded into the single code point it encodes.  Any other match is
    swapped for REPLACEMENT_SYMBOL.
    """
    matched = m.group()
    if len(matched) != 2:
        # not a pair: use replacement symbol
        return REPLACEMENT_SYMBOL

    # Keep the low 10 bits of each half and rebase above the BMP.
    high = ord(matched[0]) & 0x3FF
    low = ord(matched[1]) & 0x3FF
    return unichr(0x10000 + (high << 10) + low)
# Compiled pattern for invalid symbols; built on first use.
_urc = None


def sanitize_unicode(u):
    """Fix invalid symbols in unicode string.

    Surrogate pairs are merged into one symbol; NUL characters and
    unpaired surrogates are swapped for the replacement symbol.  Input
    without any such symbols is returned as the same object.

    @param u: unicode string to check
    @return: the fixed unicode string
    @raise TypeError: if u is not a unicode string
    """
    global _urc

    if not isinstance(u, unicode):
        raise TypeError('Need unicode string')

    if _urc is None:
        # The pattern is compiled with re.X, so the spaces inside it are
        # layout only.  Alternatives: a high surrogate with an optional
        # low surrogate after it, or a lone NUL / low surrogate.
        pattern = u"[\uD800-\uDBFF] [\uDC00-\uDFFF]? | [\0\uDC00-\uDFFF]"
        _urc = re.compile(pattern, re.X)

    # Substitute only when something matches, so clean input is handed
    # back untouched.
    if _urc.search(u) is None:
        return u
    return _urc.sub(_fix_utf8, u)
def safe_replace(exc):
    """Codec error handler that replaces only one symbol at a time.

    The builtin .decode('xxx', 'replace') replaces several symbols
    together, which is unsafe; this handler always resumes one
    position after the start of the error.

    @param exc: the unicode error raised by the codec
    @return: tuple of (replacement string, position to resume at)
    """
    # An alternative would be to assume latin1 and map the offending
    # byte to the code point of the same value; that variant is left
    # disabled.
    replacement = REPLACEMENT_SYMBOL
    resume_at = exc.start + 1
    return (replacement, resume_at)
# Register the handler under the name "safe_replace"; once registered
# it is globally available as an ``errors`` value for codec calls.
codecs.register_error("safe_replace", safe_replace)
def safe_utf8_decode(s):
    """Decode UTF-8 safely.

    Acts like .decode('utf8', 'replace') but also fixes
    UTF16 surrogates and NUL bytes, which Python's default
    decoder does not do.

    @param s: utf8-encoded byte string
    @return: tuple of (was_valid_utf8, unicode_string)
    """
    try:
        # expect no errors by default
        u = s.decode('utf8')
        ok = True
    except UnicodeDecodeError:
        # retry, replacing each undecodable byte individually
        u = s.decode('utf8', 'safe_replace')
        ok = False

    u2 = sanitize_unicode(u)
    # Compare by value, not identity: whether the sanitizer hands back
    # the same object for clean input is an implementation detail.
    if u2 != u:
        ok = False
    return (ok, u2)
|