Skip to content

Commit ec731f4

Browse files
committed
Merge with #532, fix unicode filenames with surrogateescape
2 parents b2efa1b + 9e4a454 commit ec731f4

File tree

7 files changed

+209
-18
lines changed

7 files changed

+209
-18
lines changed

‎VERSION‎

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
2.0.9dev0
1+
2.0.10dev0

‎git/compat.py‎

Lines changed: 191 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
importlocale
1111
importos
1212
importsys
13+
importcodecs
14+
1315

1416
fromgitdb.utils.compatimport (
1517
xrange,
@@ -67,7 +69,7 @@ def safe_decode(s):
6769
ifisinstance(s, unicode):
6870
returns
6971
elifisinstance(s, bytes):
70-
returns.decode(defenc, 'replace')
72+
returns.decode(defenc, 'surrogateescape')
7173
elifsisnotNone:
7274
raiseTypeError('Expected bytes or text, but got %r'% (s,))
7375

@@ -121,3 +123,191 @@ def __str__(self):
121123
else: # Python 2
122124
def__str__(self):
123125
returnself.__unicode__().encode(defenc)
126+
127+
128+
"""
129+
This is Victor Stinner's pure-Python implementation of PEP 383: the "surrogateescape" error
130+
handler of Python 3.
131+
Source: misc/python/surrogateescape.py in https://bitbucket.org/haypo/misc
132+
"""
133+
134+
# This code is released under the Python license and the BSD 2-clause license
135+
136+
137+
# Name of the codec error handler passed to encode()/decode() for file names.
FS_ERRORS = "surrogateescape"
138+
139+
# # -- Python 2/3 compatibility -------------------------------------
140+
# FS_ERRORS = 'my_surrogateescape'
141+
142+
def u(text):
    """Return *text* as a text (unicode) string.

    On Python 3 the argument is returned unchanged; otherwise it is decoded
    with the ``unicode_escape`` codec.
    """
    return text if PY3 else text.decode('unicode_escape')
147+
148+
def b(data):
    """Return *data* as a byte string.

    On Python 3 the argument is encoded with ``latin1``; otherwise it is
    returned unchanged.
    """
    return data.encode('latin1') if PY3 else data
153+
154+
# Select the per-interpreter primitives used by the helpers below:
#   _unichr(code)   -> one-character text string for a code point
#   bytes_chr(code) -> one-byte byte string for a byte value
if not PY3:
    _unichr = unichr
    bytes_chr = chr
else:
    _unichr = chr
    bytes_chr = lambda code: bytes((code,))
160+
161+
def surrogateescape_handler(exc):
    """
    Codec error handler implementing PEP 383 ("surrogateescape") in pure Python.

    While decoding, undecodable bytes are replaced by U+DCxx characters; while
    encoding, those characters are translated back into the original bytes.

    :param exc: the UnicodeDecodeError or UnicodeEncodeError being handled
    :return: tuple ``(replacement, exc.end)``
    :raise: *exc* itself when it is neither a decode nor an encode error, or
        when the offending slice cannot be translated (NotASurrogateError)
    """
    offending = exc.object[exc.start:exc.end]

    if isinstance(exc, UnicodeDecodeError):
        # The offending slice is a byte string in this case.
        translate = replace_surrogate_decode
    elif isinstance(exc, UnicodeEncodeError):
        # NOTE (kept from the original author): for something like
        # u'\udcc3'.encode('ascii', <this handler>), both Python 2.x and 3.x
        # appear to raise anyway after this function has been called - it
        # seems the strict encoder is applied to the unicode string that is
        # returned from here.
        translate = replace_surrogate_encode
    else:
        raise exc

    try:
        replacement = translate(offending)
    except NotASurrogateError:
        raise exc
    return (replacement, exc.end)
186+
187+
188+
class NotASurrogateError(Exception):
    """Raised by the replace_surrogate_* helpers when the input cannot be
    translated; surrogateescape_handler catches it and re-raises the original
    codec exception."""
190+
191+
192+
def replace_surrogate_encode(mystring):
    """
    Translate escaped surrogate characters back to their original byte values.

    Returns a (unicode) string, not the more logical bytes, because the codecs
    register_error functionality expects this.

    :param mystring: text slice the encoder could not handle
    :return: text string in which every U+DCxx character of the input has
        been replaced by the character with code point 0xXX
    :raise NotASurrogateError: if any character lies outside U+DC00-U+DCFF
    """
    decoded = []
    for ch in mystring:
        code = ord(ch)

        # Only U+DC00-U+DCFF can be mapped back to a byte value.  Anything
        # else - including the high surrogates U+D800-U+DBFF, which would
        # yield a negative code point below - is not an escaped byte.  Signal
        # that with NotASurrogateError so the calling error handler can
        # re-raise the original codec exception.  (The previous code did
        # ``raise exc`` here, but no ``exc`` exists in this scope, so it
        # failed with NameError instead.)
        if not 0xDC00 <= code <= 0xDCFF:
            raise NotASurrogateError
        decoded.append(_unichr(code - 0xDC00))
    return str().join(decoded)
219+
220+
221+
def replace_surrogate_decode(mybytes):
    """
    Translate undecodable bytes into a (unicode) string.

    Values up to 0x7F map to the character with the same code point, values
    0x80-0xFF map to U+DC80-U+DCFF.

    :param mybytes: byte slice the decoder could not handle
    :return: a (unicode) string
    :raise NotASurrogateError: if an element has a value above 0xFF
    """
    pieces = []
    for item in mybytes:
        # Iterating yields ints for Python 3 style bytes and one-character
        # strings for a native Python 2 str.
        value = item if isinstance(item, int) else ord(item)
        if value <= 0x7F:
            pieces.append(_unichr(value))
        elif value <= 0xFF:
            pieces.append(_unichr(0xDC00 + value))
        else:
            raise NotASurrogateError
    return str().join(pieces)
244+
245+
246+
def encodefilename(fn):
    """
    Encode the text file name *fn* to bytes according to ``FS_ENCODING``.

    For the 'ascii' and 'utf-8' encodings the conversion is done by hand so
    that U+DC80-U+DCFF characters become the single bytes 0x80-0xFF; every
    other encoding is delegated to ``fn.encode(FS_ENCODING, FS_ERRORS)``.

    :raise UnicodeEncodeError: for a character that cannot be represented
        (non-ASCII for 'ascii', any other surrogate for 'utf-8')
    """
    if FS_ENCODING not in ('ascii', 'utf-8'):
        return fn.encode(FS_ENCODING, FS_ERRORS)

    chunks = []
    if FS_ENCODING == 'ascii':
        # The ASCII encoder of Python 2 expects the error handler to return a
        # Unicode string encodable to ASCII, whereas a surrogateescape handler
        # has to produce bytes in the 0x80-0xFF range - so encode manually.
        for pos, char in enumerate(fn):
            point = ord(char)
            if point < 128:
                chunks.append(bytes_chr(point))
            elif 0xDC80 <= point <= 0xDCFF:
                chunks.append(bytes_chr(point - 0xDC00))
            else:
                raise UnicodeEncodeError(FS_ENCODING,
                                         fn, pos, pos + 1,
                                         'ordinal not in range(128)')
    else:
        # The UTF-8 encoder of Python 2 encodes surrogates itself, so
        # U+DC80-U+DCFF would never reach the error handler.
        for pos, char in enumerate(fn):
            point = ord(char)
            if not 0xD800 <= point <= 0xDFFF:
                chunks.append(char.encode('utf-8'))
            elif 0xDC80 <= point <= 0xDCFF:
                chunks.append(bytes_chr(point - 0xDC00))
            else:
                raise UnicodeEncodeError(
                    FS_ENCODING,
                    fn, pos, pos + 1, 'surrogates not allowed')
    return bytes().join(chunks)
284+
285+
def decodefilename(fn):
    """Decode the byte file name *fn* using ``FS_ENCODING`` with the
    ``FS_ERRORS`` error handler and return the resulting text."""
    decoded_name = fn.decode(FS_ENCODING, FS_ERRORS)
    return decoded_name
287+
288+
# Encoding used for file names, plus a sample byte name and its expected
# escaped text form.
FS_ENCODING = 'ascii'
fn = b('[abc\xff]')
encoded = u('[abc\udcff]')
# Alternative settings left disabled by the original author:
# FS_ENCODING = 'cp932'; fn = b('[abc\x81\x00]'); encoded = u('[abc\udc81\x00]')
# FS_ENCODING = 'UTF-8'; fn = b('[abc\xff]'); encoded = u('[abc\udcff]')


# Normalize the encoding name through the codec registry so that later
# comparisons see the canonical spelling (e.g. "utf-8" rather than "UTF8").
FS_ENCODING = codecs.lookup(FS_ENCODING).name
296+
297+
298+
def register_surrogateescape():
    """
    Register surrogateescape_handler under the name held in ``FS_ERRORS``.

    Does nothing on Python 3, and nothing if a handler with that name is
    already known to the codecs module.
    """
    if not PY3:
        try:
            codecs.lookup_error(FS_ERRORS)
        except LookupError:
            # No handler of that name yet - install the pure-Python one.
            codecs.register_error(FS_ERRORS, surrogateescape_handler)
308+
309+
310+
# Probe whether the interpreter can already decode with "surrogateescape":
# the sample contains the byte 0x9f.  If the attempt fails, fall back to
# registering the pure-Python handler.
try:
    b"100644 \x9f\0aaa".decode(defenc, "surrogateescape")
except Exception:
    # Deliberately not a bare ``except:`` so that KeyboardInterrupt and
    # SystemExit raised during import are not swallowed.
    register_surrogateescape()

‎git/objects/fun.py‎

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
fromstatimportS_ISDIR
33
fromgit.compatimport (
44
byte_ord,
5+
safe_decode,
56
defenc,
67
xrange,
78
text_type,
@@ -76,11 +77,7 @@ def tree_entries_from_data(data):
7677
# default encoding for strings in git is utf8
7778
# Only use the respective unicode object if the byte stream was encoded
7879
name=data[ns:i]
79-
try:
80-
name=name.decode(defenc)
81-
exceptUnicodeDecodeError:
82-
pass
83-
# END handle encoding
80+
name=safe_decode(name)
8481

8582
# byte is NULL, get next 20
8683
i+=1

‎git/test/performance/test_commit.py‎

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ def test_iteration(self):
5252
# END for each object
5353
# END for each commit
5454
elapsed_time=time() -st
55-
print("Traversed %i Trees and a total of %i unchached objects in %s [s] ( %f objs/s )"
55+
print("Traversed %i Trees and a total of %i uncached objects in %s [s] ( %f objs/s )"
5656
% (nc, no, elapsed_time, no/elapsed_time), file=sys.stderr)
5757

5858
deftest_commit_traversal(self):

‎git/test/test_fun.py‎

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,8 @@
11
fromioimportBytesIO
2-
fromstatimport (
3-
S_IFDIR,
4-
S_IFREG,
5-
S_IFLNK
6-
)
2+
fromstatimportS_IFDIR, S_IFREG, S_IFLNK
3+
fromunittest.caseimportskipIf
74

5+
fromgit.compatimportPY3
86
fromgit.indeximportIndexFile
97
fromgit.index.funimport (
108
aggressive_tree_merge
@@ -253,6 +251,12 @@ def test_tree_traversal_single(self):
253251
assertentries
254252
# END for each commit
255253

256-
deftest_tree_entries_from_data_with_failing_name_decode(self):
254+
@skipIf(PY3, 'odd types returned ... maybe figure it out one day')
255+
deftest_tree_entries_from_data_with_failing_name_decode_py2(self):
256+
r=tree_entries_from_data(b'100644 \x9f\0aaa')
257+
assertr== [('aaa', 33188, u'\udc9f')], r
258+
259+
@skipIf(notPY3, 'odd types returned ... maybe figure it out one day')
260+
deftest_tree_entries_from_data_with_failing_name_decode_py3(self):
257261
r=tree_entries_from_data(b'100644 \x9f\0aaa')
258-
assertr== [(b'aaa', 33188, b'\x9f')], r
262+
assertr== [(b'aaa', 33188, '\udc9f')], r

‎setup.py‎

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ def _stamp_version(filename):
6464
else:
6565
print("WARNING: Couldn't find version line in file %s"%filename, file=sys.stderr)
6666

67-
install_requires= ['gitdb >= 0.6.4']
67+
install_requires= ['gitdb2 >= 2.0.0']
6868
extras_require={
6969
':python_version == "2.6"': ['ordereddict'],
7070
}
@@ -100,7 +100,7 @@ def _stamp_version(filename):
100100
package_data={'git.test': ['fixtures/*']},
101101
package_dir={'git': 'git'},
102102
license="BSD License",
103-
requires=['gitdb (>=0.6.4)'],
103+
requires=['gitdb2 (>=2.0.0)'],
104104
install_requires=install_requires,
105105
test_requirements=test_requires+install_requires,
106106
zip_safe=False,

0 commit comments

Comments
(0)