-
Notifications
You must be signed in to change notification settings - Fork 165
Expand file tree
/
Copy pathunicode_dammit.py
More file actions
78 lines (63 loc) · 2 KB
/
unicode_dammit.py
File metadata and controls
78 lines (63 loc) · 2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# -*- coding: utf-8 -*-
import os
import codecs
# Alternate charset labels and the name that is looked up in their place
# (consulted by _find_codec before it tries other spellings).
CHARSET_ALIASES = {
    'macintosh': 'mac-roman',
    'x-sjis': 'shift-jis',
}

# Fallback encodings, listed in the order unicode_dammit iterates over them.
ENCODINGS = [
    'windows-1252',
    'iso-8859-1',
    'iso-8859-2',
]
def unicode_dammit(string, logger=None):
    """Strip line-separator characters from *string*, decoding it if needed.

    The string is first stripped of leading/trailing ``os.linesep``
    characters.  Only if that raises ``UnicodeDecodeError`` is the input
    decoded instead, trying each entry of ``ENCODINGS`` in order and
    returning the first non-empty result.

    :param string: the text (or byte string) to clean up.
    :param logger: accepted for interface compatibility; not used here.
    :returns: the stripped string; or, on the fallback path, the decoded
        text exactly as produced by ``_convert_from`` (it is not stripped
        again); or the original *string* if no encoding yields a result.
    """
    # The strip does not depend on the candidate encoding, so attempt it
    # once up front instead of once per entry of ENCODINGS.
    # NOTE(review): stripping a byte string with a byte-string separator
    # does not normally decode anything, so this fallback is presumably
    # rarely reached -- confirm against real inputs.
    try:
        return string.strip(os.linesep)
    except UnicodeDecodeError:
        pass

    for encoding in ENCODINGS:
        decoded = _convert_from(string, encoding)
        if decoded:
            return decoded
    return string
def _convert_from(markup, proposed, errors='strict'):
    """Try to decode *markup* with the *proposed* encoding.

    :param markup: the byte string to decode.
    :param proposed: name of the encoding to try; it is normalised through
        ``_find_codec`` before use.
    :param errors: error-handling scheme forwarded to the decoder.
    :returns: the decoded text, or ``None`` if decoding failed for any
        reason.
    """
    # _find_codec and _to_unicode both declare a leading ``self`` parameter
    # even though they are called as plain functions.  Pass an explicit
    # placeholder for it: without one, _find_codec is short an argument and
    # _to_unicode receives every argument shifted one slot to the left.
    codec = _find_codec(None, proposed)
    try:
        return _to_unicode(None, markup, codec, errors)
    except Exception:
        # Deliberate best effort: any failure simply means this encoding
        # does not fit, and the caller moves on to the next candidate.
        return None
def _to_unicode(self, data, encoding, errors='strict'):
'''Given a string and its encoding, decodes the string into Unicode.
%encoding is a string recognized by encodings.aliases'''
# strip Byte Order Mark (if present)
if (len(data) >= 4) and (data[:2] == '\xfe\xff') and (data[2:4] != '\x00\x00'):
encoding = 'utf-16be'
data = data[2:]
elif (len(data) >= 4) and (data[:2] == '\xff\xfe') and (data[2:4] != '\x00\x00'):
encoding = 'utf-16le'
data = data[2:]
elif data[:3] == '\xef\xbb\xbf':
encoding = 'utf-8'
data = data[3:]
elif data[:4] == '\x00\x00\xfe\xff':
encoding = 'utf-32be'
data = data[4:]
elif data[:4] == '\xff\xfe\x00\x00':
encoding = 'utf-32le'
data = data[4:]
newdata = unicode(data, encoding, errors)
return newdata
def _find_codec(self, charset):
    """Return a spelling of *charset* that ``_codec`` accepts.

    Candidates are tried in order: the ``CHARSET_ALIASES`` entry for
    *charset* (or *charset* itself when it has no alias), *charset* with
    hyphens removed, and *charset* with hyphens turned into underscores.

    :param self: unused; kept so existing positional calls keep working.
    :param charset: the encoding name to resolve; may be empty or ``None``.
    :returns: the first candidate ``_codec`` accepts, or *charset*
        unchanged when none is accepted or *charset* is empty.
    """
    if not charset:
        return charset

    candidates = (
        CHARSET_ALIASES.get(charset, charset),
        charset.replace('-', ''),
        charset.replace('-', '_'),
    )
    for candidate in candidates:
        # _codec also declares a leading ``self``; this is a plain function
        # with no instance to hand over, so pass a placeholder.
        found = _codec(None, candidate)
        if found:
            return found
    return charset
def _codec(self, charset):
if not charset:
return charset
codec = None
try:
codecs.lookup(charset)
codec = charset
except (LookupError, ValueError):
pass
return codec