mirror of
https://github.com/python/cpython.git
synced 2025-09-26 10:19:53 +00:00
[3.12] gh-113028: Correctly memoize str in pickle when escapes added (GH-113436) (GH-113448)
This fixes a divergence between the Python and C implementations of pickle
for protocol 0, such that pickle.py fails to re-use the first pickled
representation of strings involving characters that have to be escaped.
(cherry picked from commit 08398631a0)
Co-authored-by: Jeff Allen <ja.py@farowl.co.uk>
This commit is contained in:
parent
15ea4a4292
commit
269cb342ad
3 changed files with 21 additions and 7 deletions
|
@ -855,13 +855,13 @@ class _Pickler:
|
||||||
else:
|
else:
|
||||||
self.write(BINUNICODE + pack("<I", n) + encoded)
|
self.write(BINUNICODE + pack("<I", n) + encoded)
|
||||||
else:
|
else:
|
||||||
obj = obj.replace("\\", "\\u005c")
|
# Escape what raw-unicode-escape doesn't, but memoize the original.
|
||||||
obj = obj.replace("\0", "\\u0000")
|
tmp = obj.replace("\\", "\\u005c")
|
||||||
obj = obj.replace("\n", "\\u000a")
|
tmp = tmp.replace("\0", "\\u0000")
|
||||||
obj = obj.replace("\r", "\\u000d")
|
tmp = tmp.replace("\n", "\\u000a")
|
||||||
obj = obj.replace("\x1a", "\\u001a") # EOF on DOS
|
tmp = tmp.replace("\r", "\\u000d")
|
||||||
self.write(UNICODE + obj.encode('raw-unicode-escape') +
|
tmp = tmp.replace("\x1a", "\\u001a") # EOF on DOS
|
||||||
b'\n')
|
self.write(UNICODE + tmp.encode('raw-unicode-escape') + b'\n')
|
||||||
self.memoize(obj)
|
self.memoize(obj)
|
||||||
dispatch[str] = save_str
|
dispatch[str] = save_str
|
||||||
|
|
||||||
|
|
|
@ -1825,6 +1825,14 @@ class AbstractPickleTests:
|
||||||
t2 = self.loads(p)
|
t2 = self.loads(p)
|
||||||
self.assert_is_copy(t, t2)
|
self.assert_is_copy(t, t2)
|
||||||
|
|
||||||
|
def test_unicode_memoization(self):
|
||||||
|
# Repeated str is re-used (even when escapes added).
|
||||||
|
for proto in protocols:
|
||||||
|
for s in '', 'xyz', 'xyz\n', 'x\\yz', 'x\xa1yz\r':
|
||||||
|
p = self.dumps((s, s), proto)
|
||||||
|
s1, s2 = self.loads(p)
|
||||||
|
self.assertIs(s1, s2)
|
||||||
|
|
||||||
def test_bytes(self):
|
def test_bytes(self):
|
||||||
for proto in protocols:
|
for proto in protocols:
|
||||||
for s in b'', b'xyz', b'xyz'*100:
|
for s in b'', b'xyz', b'xyz'*100:
|
||||||
|
|
|
@ -0,0 +1,6 @@
|
||||||
|
When a second reference to a string appears in the input to :mod:`pickle`,
|
||||||
|
and the Python implementation is in use,
|
||||||
|
we are guaranteed that a single copy gets pickled
|
||||||
|
and a single object is shared when reloaded.
|
||||||
|
Previously, in protocol 0, when a string contained certain characters
|
||||||
|
(e.g. newline) it resulted in duplicate objects.
|
Loading…
Add table
Add a link
Reference in a new issue