Merge with #532, fix unicode filenames with surrogateescape

ankostis · ankostis · commit ec731f448d30 · 2016-10-16T19:25:20.000+02:00
diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-2.0.9dev0
+2.0.10dev0
diff --git a/git/compat.py b/git/compat.py
@@ -10,6 +10,8 @@
 import locale
 import os
 import sys
+import codecs
+
 
 from gitdb.utils.compat import (
 xrange,
@@ -67,7 +69,7 @@ def safe_decode(s):
 if isinstance(s, unicode):
 return s
 elif isinstance(s, bytes):
- return s.decode(defenc, 'replace')
+ return s.decode(defenc, 'surrogateescape')
 elif s is not None:
 raise TypeError('Expected bytes or text, but got %r' % (s,))
 
@@ -121,3 +123,191 @@ def __str__(self):
 else: # Python 2
 def __str__(self):
 return self.__unicode__().encode(defenc)
+ 
+ 
+"""
+This is Victor Stinner's pure-Python implementation of PEP 383: the "surrogateescape" error
+handler of Python 3.
+Source: misc/python/surrogateescape.py in https://bitbucket.org/haypo/misc
+"""
+
+# This code is released under the Python license and the BSD 2-clause license
+
+
+FS_ERRORS = 'surrogateescape'
+
+# # -- Python 2/3 compatibility -------------------------------------
+# FS_ERRORS = 'my_surrogateescape'
+
+def u(text):
+    """
+    Return *text* as a unicode string on both Python 2 and Python 3.
+
+    On Python 3 the argument is handed back untouched; on Python 2 it is
+    decoded with the 'unicode_escape' codec.
+    """
+    return text if PY3 else text.decode('unicode_escape')
+
+def b(data):
+    """
+    Return *data* as a byte string on both Python 2 and Python 3.
+
+    On Python 3 the text is encoded as latin1; on Python 2 the native str is
+    handed back untouched.
+    """
+    return data.encode('latin1') if PY3 else data
+
+# Pick the per-interpreter primitives used by the helpers in this module:
+# _unichr maps a code point to a unicode character, bytes_chr maps an integer
+# to a byte string of length one.
+if not PY3:
+    _unichr = unichr
+    bytes_chr = chr
+else:
+    _unichr = chr
+
+    def bytes_chr(code):
+        return bytes((code,))
+
+def surrogateescape_handler(exc):
+    """
+    Codec error callback emulating the "surrogateescape" handler of Python 3
+    (PEP 383).
+
+    On decoding, undecodable bytes are replaced by the Unicode characters
+    U+DCxx; on encoding, those characters are translated back (see
+    replace_surrogate_encode for the type that is returned).
+
+    :param exc: the UnicodeDecodeError or UnicodeEncodeError being handled.
+    :return: tuple of (replacement, position at which the codec resumes).
+    :raise: *exc* itself when it is of another type, or when the offending
+        slice cannot be translated.
+    """
+    offending = exc.object[exc.start:exc.end]
+
+    if isinstance(exc, UnicodeDecodeError):
+        # offending is a byte-string in this case
+        translate = replace_surrogate_decode
+    elif isinstance(exc, UnicodeEncodeError):
+        # In the case of u'\udcc3'.encode('ascii',
+        # 'this_surrogateescape_handler'), both Python 2.x and 3.x raise an
+        # exception anyway after this function is called, even though I think
+        # it's doing what it should. It seems that the strict encoder is called
+        # to encode the unicode string that this function returns ...
+        translate = replace_surrogate_encode
+    else:
+        raise exc
+
+    try:
+        replacement = translate(offending)
+    except NotASurrogateError:
+        raise exc
+    return (replacement, exc.end)
+
+
+class NotASurrogateError(Exception):
+ # Raised by the replace_surrogate_* helpers for input they cannot translate;
+ # surrogateescape_handler catches it and re-raises the original codec error.
+ pass
+
+
+def replace_surrogate_encode(mystring):
+    """
+    Translate escaped surrogates back into the characters for their bytes.
+
+    Returns a (unicode) string, not the more logical bytes, because the codecs
+    register_error functionality expects this.
+
+    :param mystring: the unencodable slice of the text being encoded.
+    :return: text in which every U+DCxx character is replaced by U+00xx.
+    :raise NotASurrogateError: if a character lies outside U+DC00-U+DCFF, so
+        that surrogateescape_handler can re-raise the original exception.
+    """
+    decoded = []
+    for ch in mystring:
+        code = ord(ch)
+
+        # The original exception is not in scope here, so failure is signalled
+        # with NotASurrogateError. Rejecting everything below U+DC00 also keeps
+        # a negative value from ever reaching _unichr().
+        if not 0xDC00 <= code <= 0xDCFF:
+            raise NotASurrogateError
+
+        # NOTE(review): PEP 383 only produces U+DC80-U+DCFF; U+DC00-U+DC7F is
+        # still accepted here, as before -- confirm whether that is wanted.
+        decoded.append(_unichr(code - 0xDC00))
+    return str().join(decoded)
+
+
+def replace_surrogate_decode(mybytes):
+    """
+    Map a run of undecodable bytes to escaped characters.
+
+    Values 0x80-0xFF become U+DC80-U+DCFF, values up to 0x7F are kept as the
+    matching character.
+
+    :param mybytes: byte string; iterating it may yield ints or, for a native
+        str on Py2, one-character strings.
+    :return: a (unicode) string
+    :raise NotASurrogateError: for a value above 0xFF.
+    """
+    chars = []
+    for item in mybytes:
+        code = item if isinstance(item, int) else ord(item)
+        if code <= 0x7F:
+            chars.append(_unichr(code))
+        elif code <= 0xFF:
+            chars.append(_unichr(0xDC00 + code))
+        else:
+            raise NotASurrogateError
+    return str().join(chars)
+
+
+def encodefilename(fn):
+    """
+    Encode the unicode filename *fn* to bytes using FS_ENCODING.
+
+    Characters U+DC80-U+DCFF are written back as the single bytes 0x80-0xFF.
+    The 'ascii' and 'utf-8' encodings are handled by hand; every other
+    encoding goes through the codec with the FS_ERRORS error handler.
+
+    :raise UnicodeEncodeError: for a character the hand-written paths reject.
+    """
+    if FS_ENCODING not in ('ascii', 'utf-8'):
+        return fn.encode(FS_ENCODING, FS_ERRORS)
+
+    pieces = []
+    if FS_ENCODING == 'ascii':
+        # ASCII encoder of Python 2 expects that the error handler returns a
+        # Unicode string encodable to ASCII, whereas our surrogateescape error
+        # handler has to return bytes in 0x80-0xFF range.
+        for pos, char in enumerate(fn):
+            point = ord(char)
+            if 0xDC80 <= point <= 0xDCFF:
+                pieces.append(bytes_chr(point - 0xDC00))
+            elif point < 128:
+                pieces.append(bytes_chr(point))
+            else:
+                raise UnicodeEncodeError(FS_ENCODING,
+                                         fn, pos, pos + 1,
+                                         'ordinal not in range(128)')
+    else:
+        # UTF-8 encoder of Python 2 encodes surrogates, so U+DC80-U+DCFF
+        # doesn't go through our error handler
+        for pos, char in enumerate(fn):
+            point = ord(char)
+            if not 0xD800 <= point <= 0xDFFF:
+                pieces.append(char.encode('utf-8'))
+            elif 0xDC80 <= point <= 0xDCFF:
+                pieces.append(bytes_chr(point - 0xDC00))
+            else:
+                raise UnicodeEncodeError(
+                    FS_ENCODING,
+                    fn, pos, pos + 1, 'surrogates not allowed')
+    return bytes().join(pieces)
+
+def decodefilename(fn):
+ # Decode the byte string fn with FS_ENCODING, using the FS_ERRORS error
+ # handler for bytes that the codec rejects.
+ return fn.decode(FS_ENCODING, FS_ERRORS)
+
+# NOTE(review): FS_ENCODING is hard-coded to 'ascii' here, and the same line
+# also binds the module-level names fn and encoded. Presumably these are
+# sample values left over from the upstream script -- confirm whether the
+# encoding should be configurable and whether the extra names are needed.
+FS_ENCODING = 'ascii'; fn = b('[abc\xff]'); encoded = u('[abc\udcff]')
+# FS_ENCODING = 'cp932'; fn = b('[abc\x81\x00]'); encoded = u('[abc\udc81\x00]')
+# FS_ENCODING = 'UTF-8'; fn = b('[abc\xff]'); encoded = u('[abc\udcff]')
+
+
+# normalize the filesystem encoding name.
+# For example, we expect "utf-8", not "UTF8".
+FS_ENCODING = codecs.lookup(FS_ENCODING).name
+
+
+def register_surrogateescape():
+    """
+    Install surrogateescape_handler under the name FS_ERRORS.
+
+    Nothing is done on Python 3, nor when an error handler of that name is
+    already registered.
+    """
+    if not PY3:
+        try:
+            codecs.lookup_error(FS_ERRORS)
+        except LookupError:
+            codecs.register_error(FS_ERRORS, surrogateescape_handler)
+
+
+# Probe at import time whether the "surrogateescape" error handler can be used
+# for decoding; the sample contains the byte 0x9f so that an error handler is
+# actually needed. If the name is unknown, register the pure-Python fallback.
+try:
+    b"100644 \x9f\0aaa".decode(defenc, "surrogateescape")
+except LookupError:
+    # An unknown error handler name is reported as LookupError; anything else
+    # (KeyboardInterrupt, SystemExit, ...) must not be swallowed here.
+    register_surrogateescape()
diff --git a/git/ext/gitdb b/git/ext/gitdb
@@ -1 +1 @@
-Subproject commit 97035c64f429c229629c25becc54ae44dd95e49d
+Subproject commit 38866bc7c4956170c681a62c4508f934ac826469
diff --git a/git/objects/fun.py b/git/objects/fun.py
@@ -2,6 +2,7 @@
 from stat import S_ISDIR
 from git.compat import (
 byte_ord,
+ safe_decode,
 defenc,
 xrange,
 text_type,
@@ -76,11 +77,7 @@ def tree_entries_from_data(data):
 # default encoding for strings in git is utf8
 # Only use the respective unicode object if the byte stream was encoded
 name = data[ns:i]
- try:
- name = name.decode(defenc)
- except UnicodeDecodeError:
- pass
- # END handle encoding
+ name = safe_decode(name)
 
 # byte is NULL, get next 20
 i += 1
diff --git a/git/test/performance/test_commit.py b/git/test/performance/test_commit.py
@@ -52,7 +52,7 @@ def test_iteration(self):
 # END for each object
 # END for each commit
 elapsed_time = time() - st
- print("Traversed %i Trees and a total of %i unchached objects in %s [s] ( %f objs/s )"
+ print("Traversed %i Trees and a total of %i uncached objects in %s [s] ( %f objs/s )"
 % (nc, no, elapsed_time, no / elapsed_time), file=sys.stderr)
 
 def test_commit_traversal(self):
diff --git a/git/test/test_fun.py b/git/test/test_fun.py
@@ -1,10 +1,8 @@
 from io import BytesIO
-from stat import (
- S_IFDIR,
- S_IFREG,
- S_IFLNK
-)
+from stat import S_IFDIR, S_IFREG, S_IFLNK
+from unittest.case import skipIf
 
+from git.compat import PY3
 from git.index import IndexFile
 from git.index.fun import (
 aggressive_tree_merge
@@ -253,6 +251,12 @@ def test_tree_traversal_single(self):
 assert entries
 # END for each commit
 
- def test_tree_entries_from_data_with_failing_name_decode(self):
+ # Python 2 only: the entry name consists of the single byte 0x9f, and the
+ # parsed entry is expected to carry it as the escaped character U+DC9F.
+ @skipIf(PY3, 'odd types returned ... maybe figure it out one day')
+ def test_tree_entries_from_data_with_failing_name_decode_py2(self):
+ r = tree_entries_from_data(b'100644 \x9f\0aaa')
+ # The list itself is passed as the assertion message to show the actual value.
+ assert r == [('aaa', 33188, u'\udc9f')], r
+
+ @skipIf(not PY3, 'odd types returned ... maybe figure it out one day')
+ def test_tree_entries_from_data_with_failing_name_decode_py3(self):
 r = tree_entries_from_data(b'100644 \x9f\0aaa')
- assert r == [(b'aaa', 33188, b'\x9f')], r
+ assert r == [(b'aaa', 33188, '\udc9f')], r
diff --git a/setup.py b/setup.py
@@ -64,7 +64,7 @@ def _stamp_version(filename):
 else:
 print("WARNING: Couldn't find version line in file %s" % filename, file=sys.stderr)
 
-install_requires = ['gitdb >= 0.6.4']
+install_requires = ['gitdb2 >= 2.0.0']
 extras_require ={
 ':python_version == "2.6"': ['ordereddict'],
 }
@@ -100,7 +100,7 @@ def _stamp_version(filename):
 package_data={'git.test': ['fixtures/*']},
 package_dir={'git': 'git'},
 license="BSD License",
- requires=['gitdb (>=0.6.4)'],
+ requires=['gitdb2 (>=2.0.0)'],
 install_requires=install_requires,
 test_requirements=test_requires + install_requires,
 zip_safe=False,

-Original file line number
+Diff line change
 importlocale
 importos
 importsys
 +importcodecs
++
 fromgitdb.utils.compatimport (
 xrange,
 ifisinstance(s, unicode):
 returns
 elifisinstance(s, bytes):
 -returns.decode(defenc, 'replace')
 +returns.decode(defenc, 'surrogateescape')
 elifsisnotNone:
 raiseTypeError('Expected bytes or text, but got %r'% (s,))
 else: # Python 2
 def__str__(self):
 returnself.__unicode__().encode(defenc)
++
++
 +"""
 +This is Victor Stinner's pure-Python implementation of PEP 383: the "surrogateescape" error
 +handler of Python 3.
 +Source: misc/python/surrogateescape.py in https://bitbucket.org/haypo/misc
 +"""
++
 +# This code is released under the Python license and the BSD 2-clause license
++
++
 +FS_ERRORS='surrogateescape'
++
 +# # -- Python 2/3 compatibility -------------------------------------
 +# FS_ERRORS = 'my_surrogateescape'
++
 +defu(text):
 +ifPY3:
 +returntext
 +else:
 +returntext.decode('unicode_escape')
++
 +defb(data):
 +ifPY3:
 +returndata.encode('latin1')
 +else:
 +returndata
++
 +ifPY3:
 +_unichr=chr
 +bytes_chr=lambdacode: bytes((code,))
 +else:
 +_unichr=unichr
 +bytes_chr=chr
++
 +defsurrogateescape_handler(exc):
 +"""
 + Pure Python implementation of the PEP 383: the "surrogateescape" error
 + handler of Python 3. Undecodable bytes will be replaced by a Unicode
 + character U+DCxx on decoding, and these are translated into the
 + original bytes on encoding.
 + """
 +mystring=exc.object[exc.start:exc.end]
++
 +try:
 +ifisinstance(exc, UnicodeDecodeError):
 +# mystring is a byte-string in this case
 +decoded=replace_surrogate_decode(mystring)
 +elifisinstance(exc, UnicodeEncodeError):
 +# In the case of u'\udcc3'.encode('ascii',
 +# 'this_surrogateescape_handler'), both Python 2.x and 3.x raise an
 +# exception anyway after this function is called, even though I think
 +# it's doing what it should. It seems that the strict encoder is called
 +# to encode the unicode string that this function returns ...
 +decoded=replace_surrogate_encode(mystring)
 +else:
 +raiseexc
 +exceptNotASurrogateError:
 +raiseexc
 +return (decoded, exc.end)
++
++
 +classNotASurrogateError(Exception):
 +pass
++
++
 +defreplace_surrogate_encode(mystring):
 +"""
 + Returns a (unicode) string, not the more logical bytes, because the codecs
 + register_error functionality expects this.
 + """
 +decoded= []
 +forchinmystring:
 +# if PY3:
 +# code = ch
 +# else:
 +code=ord(ch)
++
 +# The following magic comes from Py3.3's Python/codecs.c file:
 +ifnot0xD800<=code<=0xDCFF:
 +# Not a surrogate. Fail with the original exception.
 +raiseexc
 +# mybytes = [0xe0 | (code >> 12),
 +# 0x80 | ((code >> 6) & 0x3f),
 +# 0x80 | (code & 0x3f)]
 +# Is this a good idea?
 +if0xDC00<=code<=0xDC7F:
 +decoded.append(_unichr(code-0xDC00))
 +elifcode<=0xDCFF:
 +decoded.append(_unichr(code-0xDC00))
 +else:
 +raiseNotASurrogateError
 +returnstr().join(decoded)
++
++
 +defreplace_surrogate_decode(mybytes):
 +"""
 + Returns a (unicode) string
 + """
 +decoded= []
 +forchinmybytes:
 +# We may be parsing newbytes (in which case ch is an int) or a native
 +# str on Py2
 +ifisinstance(ch, int):
 +code=ch
 +else:
 +code=ord(ch)
 +if0x80<=code<=0xFF:
 +decoded.append(_unichr(0xDC00+code))
 +elifcode<=0x7F:
 +decoded.append(_unichr(code))
 +else:
 +# # It may be a bad byte
 +# # Try swallowing it.
 +# continue
 +# print("RAISE!")
 +raiseNotASurrogateError
 +returnstr().join(decoded)
++
++
 +defencodefilename(fn):
 +ifFS_ENCODING=='ascii':
 +# ASCII encoder of Python 2 expects that the error handler returns a
 +# Unicode string encodable to ASCII, whereas our surrogateescape error
 +# handler has to return bytes in 0x80-0xFF range.
 +encoded= []
 +forindex, chinenumerate(fn):
 +code=ord(ch)
 +ifcode<128:
 +ch=bytes_chr(code)
 +elif0xDC80<=code<=0xDCFF:
 +ch=bytes_chr(code-0xDC00)
 +else:
 +raiseUnicodeEncodeError(FS_ENCODING,
 +fn, index, index+1,
 +'ordinal not in range(128)')
 +encoded.append(ch)
 +returnbytes().join(encoded)
 +elifFS_ENCODING=='utf-8':
 +# UTF-8 encoder of Python 2 encodes surrogates, so U+DC80-U+DCFF
 +# doesn't go through our error handler
 +encoded= []
 +forindex, chinenumerate(fn):
 +code=ord(ch)
 +if0xD800<=code<=0xDFFF:
 +if0xDC80<=code<=0xDCFF:
 +ch=bytes_chr(code-0xDC00)
 +encoded.append(ch)
 +else:
 +raiseUnicodeEncodeError(
 +FS_ENCODING,
 +fn, index, index+1, 'surrogates not allowed')
 +else:
 +ch_utf8=ch.encode('utf-8')
 +encoded.append(ch_utf8)
 +returnbytes().join(encoded)
 +else:
 +returnfn.encode(FS_ENCODING, FS_ERRORS)
++
 +defdecodefilename(fn):
 +returnfn.decode(FS_ENCODING, FS_ERRORS)
++
 +FS_ENCODING='ascii'; fn=b('[abc\xff]'); encoded=u('[abc\udcff]')
 +# FS_ENCODING = 'cp932' fn = b('[abc\x81\x00]'); encoded = u('[abc\udc81\x00]')
 +# FS_ENCODING = 'UTF-8' fn = b('[abc\xff]'); encoded = u('[abc\udcff]')
++
++
 +# normalize the filesystem encoding name.
 +# For example, we expect "utf-8", not "UTF8".
 +FS_ENCODING=codecs.lookup(FS_ENCODING).name
++
++
 +defregister_surrogateescape():
 +"""
 + Registers the surrogateescape error handler on Python 2 (only)
 + """
 +ifPY3:
 +return
 +try:
 +codecs.lookup_error(FS_ERRORS)
 +exceptLookupError:
 +codecs.register_error(FS_ERRORS, surrogateescape_handler)
++
++
 +try:
 +b"100644 \x9f\0aaa".decode(defenc, "surrogateescape")
 +except:
 +register_surrogateescape()
-Original file line number
+Diff line change
@@ @@ -1,10 +1,8 @@ @@
 fromioimportBytesIO
 -fromstatimport (
 -S_IFDIR,
 -S_IFREG,
 -S_IFLNK
 -)
 +fromstatimportS_IFDIR, S_IFREG, S_IFLNK
 +fromunittest.caseimportskipIf
 +fromgit.compatimportPY3
 fromgit.indeximportIndexFile
 fromgit.index.funimport (
 aggressive_tree_merge
 assertentries
 # END for each commit
 -deftest_tree_entries_from_data_with_failing_name_decode(self):
 +@skipIf(PY3, 'odd types returned ... maybe figure it out one day')
 +deftest_tree_entries_from_data_with_failing_name_decode_py2(self):
 +r=tree_entries_from_data(b'100644 \x9f\0aaa')
 +assertr== [('aaa', 33188, u'\udc9f')], r
++
 +@skipIf(notPY3, 'odd types returned ... maybe figure it out one day')
 +deftest_tree_entries_from_data_with_failing_name_decode_py3(self):
 r=tree_entries_from_data(b'100644 \x9f\0aaa')
 -assertr== [(b'aaa', 33188, b'\x9f')], r
 +assertr== [(b'aaa', 33188, '\udc9f')], r