From 7be0ee8272a541e291f13ed67d69b951ae42a9da Mon Sep 17 00:00:00 2001 From: Andreas Eriksen Date: Thu, 18 Dec 2025 16:48:26 +0100 Subject: [PATCH] Merge commit from fork * track depth of recursive encode/decode, clear shared refs on start * test that shared refs are cleared on start * add fix-shared-state-reset to version history * clear shared state _after_ encode/decode * use PY_SSIZE_T_MAX to clear shareables list * use context manager for python decoder depth tracking * use context manager for python encoder depth tracking CVE: CVE-2025-68131 Upstream-Status: Backport [https://github.com/agronholm/cbor2/commit/f1d701cd2c411ee40bb1fe383afe7f365f35abf0] Signed-off-by: Hitendra Prajapati --- cbor2/decoder.py | 26 ++++++++++++++-- cbor2/encoder.py | 42 +++++++++++++++++++++----- source/decoder.c | 28 +++++++++++++++++- source/decoder.h | 1 + source/encoder.c | 23 +++++++++++++-- source/encoder.h | 1 + tests/test_decoder.py | 62 ++++++++++++++++++++++++++++++++++++++ tests/test_encoder.py | 69 +++++++++++++++++++++++++++++++++++++++++++ 8 files changed, 239 insertions(+), 13 deletions(-) diff --git a/cbor2/decoder.py b/cbor2/decoder.py index be7198b..f2d818c 100644 --- a/cbor2/decoder.py +++ b/cbor2/decoder.py @@ -2,6 +2,7 @@ import re import struct import sys from collections.abc import Mapping +from contextlib import contextmanager from datetime import datetime, timedelta, timezone from io import BytesIO @@ -40,7 +41,7 @@ class CBORDecoder: __slots__ = ( '_tag_hook', '_object_hook', '_share_index', '_shareables', '_fp_read', - '_immutable', '_str_errors', '_stringref_namespace') + '_immutable', '_str_errors', '_stringref_namespace', '_decode_depth') def __init__(self, fp, tag_hook=None, object_hook=None, str_errors='strict'): @@ -52,6 +53,7 @@ class CBORDecoder: self._shareables = [] self._stringref_namespace = None self._immutable = False + self._decode_depth = 0 @property def immutable(self): @@ -173,13 +175,32 @@ class CBORDecoder: if unshared: self._share_index = old_index + @contextmanager + def _decoding_context(self): + """ + Context manager for tracking decode depth and clearing shared state. + Shared state is cleared at the end of each top-level decode to prevent + shared references from leaking between independent decode operations. + Nested calls (from hooks) must preserve the state. + """ + self._decode_depth += 1 + try: + yield + finally: + self._decode_depth -= 1 + assert self._decode_depth >= 0 + if self._decode_depth == 0: + self._shareables.clear() + self._share_index = None + def decode(self): """ Decode the next value from the stream. :raises CBORDecodeError: if there is any problem decoding the stream """ - return self._decode() + with self._decoding_context(): + return self._decode() def decode_from_bytes(self, buf): """ @@ -190,6 +211,7 @@ class CBORDecoder: object needs to be decoded separately from the rest but while still taking advantage of the shared value registry. """ + with self._decoding_context(): with BytesIO(buf) as fp: old_fp = self.fp self.fp = fp diff --git a/cbor2/encoder.py b/cbor2/encoder.py index 42526c0..0a5722d 100644 --- a/cbor2/encoder.py +++ b/cbor2/encoder.py @@ -109,7 +109,7 @@ class CBOREncoder: __slots__ = ( 'datetime_as_timestamp', '_timezone', '_default', 'value_sharing', '_fp_write', '_shared_containers', '_encoders', '_canonical', - 'string_referencing', 'string_namespacing', '_string_references') + 'string_referencing', 'string_namespacing', '_string_references', '_encode_depth') def __init__(self, fp, datetime_as_timestamp=False, timezone=None, value_sharing=False, default=None, canonical=False, @@ -124,6 +124,7 @@ class CBOREncoder: self._canonical = canonical self._shared_containers = {} # indexes used for value sharing self._string_references = {} # indexes used for string references + self._encode_depth = 0 self._encoders = default_encoders.copy() if canonical: self._encoders.update(canonical_encoders) @@ -236,6 +237,23 @@ class CBOREncoder: """ self._fp_write(data) + @contextmanager + def _encoding_context(self): + """ + Context manager for tracking encode depth and clearing shared state. + Shared state is cleared at the end of each top-level encode to prevent + shared references from leaking between independent encode operations. + Nested calls (from hooks) must preserve the state. + """ + self._encode_depth += 1 + try: + yield + finally: + self._encode_depth -= 1 + if self._encode_depth == 0: + self._shared_containers.clear() + self._string_references.clear() + def encode(self, obj): """ Encode the given object using CBOR. @@ -243,6 +261,14 @@ class CBOREncoder: :param obj: the object to encode """ + with self._encoding_context(): + self._encode_value(obj) + def _encode_value(self, obj: Any) -> None: + """ + Internal fast path for encoding - used by built-in encoders. + External code should use encode() instead, which properly manages + shared state between independent encode operations. + """ obj_type = obj.__class__ encoder = ( self._encoders.get(obj_type) or @@ -390,14 +416,14 @@ class CBOREncoder: def encode_array(self, value): self.encode_length(4, len(value)) for item in value: - self.encode(item) + self._encode_value(item) @container_encoder def encode_map(self, value): self.encode_length(5, len(value)) for key, val in value.items(): - self.encode(key) - self.encode(val) + self._encode_value(key) + self._encode_value(val) def encode_sortable_key(self, value): """ @@ -422,10 +448,10 @@ class CBOREncoder: # String referencing requires that the order encoded is # the same as the order emitted so string references are # generated after an order is determined - self.encode(realkey) + self._encode_value(realkey) else: self._fp_write(sortkey[1]) - self.encode(value) + self._encode_value(value) def encode_semantic(self, value): # Nested string reference domains are distinct @@ -436,7 +462,7 @@ class CBOREncoder: self._string_references = {} self.encode_length(6, value.tag) - self.encode(value.value) + self._encode_value(value.value) self.string_referencing = old_string_referencing self._string_references = old_string_references @@ -489,7 +515,7 @@ class CBOREncoder: def encode_stringref(self, value): # Semantic tag 25 if not self._stringref(value): - self.encode(value) + self._encode_value(value) def encode_rational(self, value): # Semantic tag 30 diff --git a/source/decoder.c b/source/decoder.c index d4da393..d8df2b5 100644 --- a/source/decoder.c +++ b/source/decoder.c @@ -139,6 +139,7 @@ CBORDecoder_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) self->str_errors = PyBytes_FromString("strict"); self->immutable = false; self->shared_index = -1; + self->decode_depth = 0; } return (PyObject *) self; error: @@ -1747,11 +1748,30 @@ decode(CBORDecoderObject *self, DecodeOptions options) } +// Reset shared state at the end of each top-level decode to prevent +// shared references from leaking between independent decode operations. +// Nested calls (from hooks) must preserve the state. +static inline void +clear_shareable_state(CBORDecoderObject *self) +{ + PyList_SetSlice(self->shareables, 0, PY_SSIZE_T_MAX, NULL); + self->shared_index = -1; +} + + // CBORDecoder.decode(self) -> obj PyObject * CBORDecoder_decode(CBORDecoderObject *self) { - return decode(self, DECODE_NORMAL); + PyObject *ret; + self->decode_depth++; + ret = decode(self, DECODE_NORMAL); + self->decode_depth--; + assert(self->decode_depth >= 0); + if (self->decode_depth == 0) { + clear_shareable_state(self); + } + return ret; } @@ -1764,6 +1784,7 @@ CBORDecoder_decode_from_bytes(CBORDecoderObject *self, PyObject *data) if (!_CBOR2_BytesIO && _CBOR2_init_BytesIO() == -1) return NULL; + self->decode_depth++; save_read = self->read; buf = PyObject_CallFunctionObjArgs(_CBOR2_BytesIO, data, NULL); if (buf) { @@ -1775,6 +1796,11 @@ CBORDecoder_decode_from_bytes(CBORDecoderObject *self, PyObject *data) Py_DECREF(buf); } self->read = save_read; + self->decode_depth--; + assert(self->decode_depth >= 0); + if (self->decode_depth == 0) { + clear_shareable_state(self); + } return ret; } diff --git a/source/decoder.h b/source/decoder.h index 6bb6d52..a2f1bcb 100644 --- a/source/decoder.h +++ b/source/decoder.h @@ -13,6 +13,7 @@ typedef struct { PyObject *str_errors; bool immutable; Py_ssize_t shared_index; + Py_ssize_t decode_depth; } CBORDecoderObject; extern PyTypeObject CBORDecoderType; diff --git a/source/encoder.c b/source/encoder.c index af2b9c5..ae3fd64 100644 --- a/source/encoder.c +++ b/source/encoder.c @@ -112,6 +112,7 @@ CBOREncoder_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) self->shared_handler = NULL; self->string_referencing = false; self->string_namespacing = false; + self->encode_depth = 0; } return (PyObject *) self; } @@ -2016,17 +2017,35 @@ encode(CBOREncoderObject *self, PyObject *value) } +// Reset shared state at the end of each top-level encode to prevent +// shared references from leaking between independent encode operations. +// Nested calls (from hooks or recursive encoding) must preserve the state. +static inline void +clear_shared_state(CBOREncoderObject *self) +{ + PyDict_Clear(self->shared); + PyDict_Clear(self->string_references); +} + + // CBOREncoder.encode(self, value) PyObject * CBOREncoder_encode(CBOREncoderObject *self, PyObject *value) { PyObject *ret; - // TODO reset shared dict? - if (Py_EnterRecursiveCall(" in CBOREncoder.encode")) + self->encode_depth++; + if (Py_EnterRecursiveCall(" in CBOREncoder.encode")) { + self->encode_depth--; return NULL; + } ret = encode(self, value); Py_LeaveRecursiveCall(); + self->encode_depth--; + assert(self->encode_depth >= 0); + if (self->encode_depth == 0) { + clear_shared_state(self); + } return ret; } diff --git a/source/encoder.h b/source/encoder.h index b994ed9..7a68e2f 100644 --- a/source/encoder.h +++ b/source/encoder.h @@ -23,6 +23,7 @@ typedef struct { bool value_sharing; bool string_referencing; bool string_namespacing; + Py_ssize_t encode_depth; } CBOREncoderObject; extern PyTypeObject CBOREncoderType; diff --git a/tests/test_decoder.py b/tests/test_decoder.py index d03e288..cc3af11 100644 --- a/tests/test_decoder.py +++ b/tests/test_decoder.py @@ -713,6 +713,68 @@ def test_reserved_special_tags(impl, data, expected): ('c400', '4'), ('c500', '5') ], ) + +class TestDecoderReuse: + """ + Tests for correct behavior when reusing CBORDecoder instances. + """ + + def test_decoder_reuse_resets_shared_refs(self, impl): + """ + Shared references should be scoped to a single decode operation, + not persist across multiple decodes on the same decoder instance. + """ + # Message with shareable tag (28) + msg1 = impl.dumps(impl.CBORTag(28, "first_value")) + + # Message with sharedref tag (29) referencing index 0 + msg2 = impl.dumps(impl.CBORTag(29, 0)) + + # Reuse decoder across messages + decoder = impl.CBORDecoder(BytesIO(msg1)) + result1 = decoder.decode() + assert result1 == "first_value" + + # Second decode should fail - sharedref(0) doesn't exist in this context + decoder.fp = BytesIO(msg2) + with pytest.raises(impl.CBORDecodeValueError, match="shared reference"): + decoder.decode() + + def test_decode_from_bytes_resets_shared_refs(self, impl): + """ + decode_from_bytes should also reset shared references between calls. + """ + msg1 = impl.dumps(impl.CBORTag(28, "value")) + msg2 = impl.dumps(impl.CBORTag(29, 0)) + + decoder = impl.CBORDecoder(BytesIO(b"")) + decoder.decode_from_bytes(msg1) + + with pytest.raises(impl.CBORDecodeValueError, match="shared reference"): + decoder.decode_from_bytes(msg2) + + def test_shared_refs_within_single_decode(self, impl): + """ + Shared references must work correctly within a single decode operation. + + Note: This tests non-cyclic sibling references [shareable(x), sharedref(0)], + which is a different pattern from test_cyclic_array/test_cyclic_map that + test self-referencing structures like shareable([sharedref(0)]). + """ + # [shareable("hello"), sharedref(0)] -> ["hello", "hello"] + data = unhexlify( + "82" # array(2) + "d81c" # tag(28) shareable + "65" # text(5) + "68656c6c6f" # "hello" + "d81d" # tag(29) sharedref + "00" # unsigned(0) + ) + + result = impl.loads(data) + assert result == ["hello", "hello"] + assert result[0] is result[1] # Same object reference + def test_decimal_payload_unpacking(impl, data, expected): with pytest.raises(impl.CBORDecodeValueError) as exc_info: impl.loads(unhexlify(data)) diff --git a/tests/test_encoder.py b/tests/test_encoder.py index 8c40000..c76d5e0 100644 --- a/tests/test_encoder.py +++ b/tests/test_encoder.py @@ -522,6 +522,75 @@ def test_encode_stringrefs_dict(impl): assert impl.dumps(value, string_referencing=True, canonical=True) == expected +class TestEncoderReuse: + """ + Tests for correct behavior when reusing CBOREncoder instances. + """ + + def test_encoder_reuse_resets_shared_containers(self, impl): + """ + Shared container tracking should be scoped to a single encode operation, + not persist across multiple encodes on the same encoder instance. + """ + fp = BytesIO() + encoder = impl.CBOREncoder(fp, value_sharing=True) + shared_obj = ["hello"] + + # First encode: object is tracked in shared containers + encoder.encode([shared_obj, shared_obj]) + + # Second encode on new fp: should produce valid standalone CBOR + # (not a sharedref pointing to stale first-encode data) + encoder.fp = BytesIO() + encoder.encode(shared_obj) + second_output = encoder.fp.getvalue() + + # The second output must be decodable on its own + result = impl.loads(second_output) + assert result == ["hello"] + + def test_encode_to_bytes_resets_shared_containers(self, impl): + """ + encode_to_bytes should also reset shared container tracking between calls. + """ + fp = BytesIO() + encoder = impl.CBOREncoder(fp, value_sharing=True) + shared_obj = ["hello"] + + # First encode + encoder.encode_to_bytes([shared_obj, shared_obj]) + + # Second encode should produce valid standalone CBOR + result_bytes = encoder.encode_to_bytes(shared_obj) + result = impl.loads(result_bytes) + assert result == ["hello"] + + def test_encoder_hook_does_not_reset_state(self, impl): + """ + When a custom encoder hook calls encode(), the shared container + tracking should be preserved (not reset mid-operation). + """ + + class Custom: + def __init__(self, value): + self.value = value + + def custom_encoder(encoder, obj): + # Hook encodes the wrapped value + encoder.encode(obj.value) + + # Encode a Custom wrapping a list + data = impl.dumps(Custom(["a", "b"]), default=custom_encoder) + + # Verify the output decodes correctly + result = impl.loads(data) + assert result == ["a", "b"] + + # Test nested Custom objects - hook should work recursively + data2 = impl.dumps(Custom(Custom(["x"])), default=custom_encoder) + result2 = impl.loads(data2) + assert result2 == ["x"] + @pytest.mark.parametrize('tag', [-1, 2**64, 'f'], ids=['too small', 'too large', 'wrong type']) def test_invalid_tag(impl, tag): with pytest.raises(TypeError): -- 2.50.1