cgcardona / muse public
test_stress_object_store.py python
301 lines 10.7 KB
8d5137ed fix(security): full surface hardening — validation, path containment, p… Gabriel Cardona <cgcardona@gmail.com> 10h ago
1 """Stress tests for the content-addressed object store.
2
3 Exercises:
4 - Write-then-read round-trip for varied payload sizes (0 bytes … 1 MB).
5 - Idempotency: writing the same object ID twice is a no-op.
6 - has_object before and after writes.
7 - object_path sharding: first two hex chars as directory.
8 - read_object returns None for absent objects.
9 - restore_object copies bytes faithfully.
10 - write_object_from_path uses copy semantics, not load.
11 - Content integrity: read(write(content)) == content.
12 - Multiple distinct objects coexist without collision.
13 """
14
15 import hashlib
16 import os
17 import pathlib
18 import secrets
19
20 import pytest
21
22 from muse.core.object_store import (
23 has_object,
24 object_path,
25 objects_dir,
26 read_object,
27 restore_object,
28 write_object,
29 write_object_from_path,
30 )
31
32
33 # ---------------------------------------------------------------------------
34 # Helpers
35 # ---------------------------------------------------------------------------
36
37
38 def _sha256(data: bytes) -> str:
39 return hashlib.sha256(data).hexdigest()
40
41
@pytest.fixture
def repo(tmp_path: pathlib.Path) -> pathlib.Path:
    """Provide a temporary repository root that already contains a ``.muse`` directory."""
    muse_dir = tmp_path / ".muse"
    muse_dir.mkdir()
    return tmp_path
46
47
48 # ---------------------------------------------------------------------------
49 # Basic round-trip
50 # ---------------------------------------------------------------------------
51
52
class TestRoundTrip:
    """Bytes written to the store must read back exactly as written."""

    def test_write_then_read_small(self, repo: pathlib.Path) -> None:
        """A short ASCII payload survives a write/read cycle."""
        payload = b"hello muse"
        digest = _sha256(payload)
        write_object(repo, digest, payload)
        stored = read_object(repo, digest)
        assert stored == payload

    def test_write_then_read_empty(self, repo: pathlib.Path) -> None:
        """The empty payload is a valid object."""
        payload = b""
        digest = _sha256(payload)
        write_object(repo, digest, payload)
        stored = read_object(repo, digest)
        assert stored == payload

    def test_write_then_read_single_byte(self, repo: pathlib.Path) -> None:
        """A lone NUL byte survives a write/read cycle."""
        payload = b"\x00"
        digest = _sha256(payload)
        write_object(repo, digest, payload)
        stored = read_object(repo, digest)
        assert stored == payload

    def test_write_then_read_binary(self, repo: pathlib.Path) -> None:
        """Every possible byte value, repeated, survives a write/read cycle."""
        payload = bytes(range(256)) * 100
        digest = _sha256(payload)
        write_object(repo, digest, payload)
        stored = read_object(repo, digest)
        assert stored == payload

    @pytest.mark.parametrize("size", [1, 100, 4096, 65536, 1_000_000])
    def test_write_then_read_various_sizes(self, repo: pathlib.Path, size: int) -> None:
        """Random payloads of several sizes are written once and read back intact."""
        payload = secrets.token_bytes(size)
        digest = _sha256(payload)
        was_written = write_object(repo, digest, payload)
        assert was_written is True
        assert read_object(repo, digest) == payload

    def test_content_integrity(self, repo: pathlib.Path) -> None:
        """Read back exactly what was written — not a truncated or padded version."""
        # Lazy generator: each payload is built, written and checked before the next one.
        payloads = (f"object-content-{i}-{'x' * i}".encode() for i in range(20))
        for payload in payloads:
            digest = _sha256(payload)
            write_object(repo, digest, payload)
            roundtripped = read_object(repo, digest)
            assert roundtripped == payload
            assert len(roundtripped) == len(payload)
94
95
96 # ---------------------------------------------------------------------------
97 # Idempotency
98 # ---------------------------------------------------------------------------
99
100
class TestIdempotency:
    """Repeated writes of an existing object ID must leave the stored bytes alone."""

    def test_double_write_returns_false_second_time(self, repo: pathlib.Path) -> None:
        """The first write reports True; an identical second write reports False."""
        payload = b"idempotent"
        digest = _sha256(payload)
        first_result = write_object(repo, digest, payload)
        assert first_result is True
        second_result = write_object(repo, digest, payload)
        assert second_result is False

    def test_double_write_does_not_corrupt(self, repo: pathlib.Path) -> None:
        """A mismatched payload for an existing ID is rejected and nothing is overwritten."""
        original = b"original content"
        digest = _sha256(original)
        write_object(repo, digest, original)
        # The second payload does not hash to `digest`, so the write must raise
        # ValueError and the bytes already on disk must survive unchanged.
        with pytest.raises(ValueError, match="Content integrity failure"):
            write_object(repo, digest, b"different content")
        stored = read_object(repo, digest)
        assert stored == original

    def test_triple_write_stays_stable(self, repo: pathlib.Path) -> None:
        """Three identical writes in a row still read back the original bytes."""
        payload = b"triple-write"
        digest = _sha256(payload)
        for _attempt in range(3):
            write_object(repo, digest, payload)
        stored = read_object(repo, digest)
        assert stored == payload
124
125
126 # ---------------------------------------------------------------------------
127 # has_object
128 # ---------------------------------------------------------------------------
129
130
class TestHasObject:
    """has_object must reflect exactly which object IDs have been written."""

    def test_absent_before_write(self, repo: pathlib.Path) -> None:
        """An ID that was never written is reported as absent."""
        digest = _sha256(b"not yet written")
        present = has_object(repo, digest)
        assert not present

    def test_present_after_write(self, repo: pathlib.Path) -> None:
        """An ID is reported as present once its object has been written."""
        payload = b"present"
        digest = _sha256(payload)
        write_object(repo, digest, payload)
        present = has_object(repo, digest)
        assert present

    def test_other_objects_dont_shadow(self, repo: pathlib.Path) -> None:
        """Writing one object must not make a different ID appear present."""
        first_payload = b"object-a"
        second_payload = b"object-b"
        first_id = _sha256(first_payload)
        second_id = _sha256(second_payload)

        # Only the first object exists at this point.
        write_object(repo, first_id, first_payload)
        assert has_object(repo, first_id)
        assert not has_object(repo, second_id)

        # After the second write, the second ID becomes visible too.
        write_object(repo, second_id, second_payload)
        assert has_object(repo, second_id)
152
153
154 # ---------------------------------------------------------------------------
155 # Absent objects
156 # ---------------------------------------------------------------------------
157
158
class TestAbsentObjects:
    """Lookups for IDs that were never written must fail without side effects."""

    def test_read_absent_returns_none(self, repo: pathlib.Path) -> None:
        """read_object yields None for an unknown ID."""
        missing_id = "a" * 64
        outcome = read_object(repo, missing_id)
        assert outcome is None

    def test_restore_absent_returns_false(self, repo: pathlib.Path, tmp_path: pathlib.Path) -> None:
        """restore_object reports False and creates no destination file for an unknown ID."""
        missing_id = "b" * 64
        target = tmp_path / "restored.bin"
        restored = restore_object(repo, missing_id, target)
        assert restored is False
        assert not target.exists()

    def test_has_object_false_for_random_id(self, repo: pathlib.Path) -> None:
        """Ten random 64-hex-char IDs are all reported as absent."""
        random_ids = [secrets.token_hex(32) for _ in range(10)]
        for candidate in random_ids:
            assert not has_object(repo, candidate)
174
175
176 # ---------------------------------------------------------------------------
177 # Sharding layout
178 # ---------------------------------------------------------------------------
179
180
class TestSharding:
    """The on-disk layout shards objects by the first two hex characters of the ID."""

    def test_object_path_uses_first_two_chars_as_dir(self, repo: pathlib.Path) -> None:
        """The shard directory is the 2-char prefix; the file name is the remaining 62 chars."""
        shard, remainder = "ab", "c" * 62
        located = object_path(repo, shard + remainder)
        assert located.parent.name == shard
        assert located.name == remainder

    def test_objects_with_same_prefix_go_to_same_shard(self, repo: pathlib.Path) -> None:
        """Two IDs sharing a prefix resolve to the same parent directory."""
        first_id = "ff" + "0" * 62
        second_id = "ff" + "1" * 62
        first_shard = object_path(repo, first_id).parent
        second_shard = object_path(repo, second_id).parent
        assert first_shard == second_shard

    def test_objects_with_different_prefix_go_to_different_shards(self, repo: pathlib.Path) -> None:
        """Two IDs with different prefixes resolve to different parent directories."""
        # Use valid 64-char hex IDs with different first-two-char prefixes.
        first_id = "aa" + "f" * 62
        second_id = "bb" + "f" * 62
        first_shard = object_path(repo, first_id).parent
        second_shard = object_path(repo, second_id).parent
        assert first_shard != second_shard

    def test_256_shards_can_all_be_created(self, repo: pathlib.Path) -> None:
        """Write one object per possible shard prefix (00-ff).

        Finds data whose SHA-256 starts with each 2-hex prefix by brute-force,
        using a counter to stay deterministic.
        """
        seen_prefixes: set[str] = set()
        n = 0
        while len(seen_prefixes) < 256:
            payload = f"shard-seed-{n}".encode()
            n += 1
            digest = _sha256(payload)
            shard = digest[:2]
            if shard in seen_prefixes:
                # This prefix already has an object; try the next seed.
                continue
            write_object(repo, digest, payload)
            seen_prefixes.add(shard)
        # Verify all 256 shard dirs exist.
        shard_names = [entry.name for entry in objects_dir(repo).iterdir() if entry.is_dir()]
        assert len(shard_names) == 256
219
220
221 # ---------------------------------------------------------------------------
222 # write_object_from_path
223 # ---------------------------------------------------------------------------
224
225
class TestWriteObjectFromPath:
    """write_object_from_path stores the contents of an existing file."""

    def test_from_path_round_trip(self, repo: pathlib.Path, tmp_path: pathlib.Path) -> None:
        """A file ingested by path reads back as the file's bytes."""
        payload = b"from-path-content"
        source_file = tmp_path / "source.bin"
        source_file.write_bytes(payload)
        digest = _sha256(payload)
        was_written = write_object_from_path(repo, digest, source_file)
        assert was_written is True
        assert read_object(repo, digest) == payload

    def test_from_path_idempotent(self, repo: pathlib.Path, tmp_path: pathlib.Path) -> None:
        """Ingesting the same file twice reports False the second time."""
        payload = b"idempotent-from-path"
        source_file = tmp_path / "idem.bin"
        source_file.write_bytes(payload)
        digest = _sha256(payload)
        write_object_from_path(repo, digest, source_file)
        repeat_result = write_object_from_path(repo, digest, source_file)
        assert repeat_result is False
242
243
244 # ---------------------------------------------------------------------------
245 # restore_object
246 # ---------------------------------------------------------------------------
247
248
class TestRestoreObject:
    """restore_object must copy a stored object's bytes to a destination path.

    Every test checks both the boolean result and the restored bytes, so a
    restore that reports failure or writes the wrong content cannot pass.
    """

    def test_restore_round_trip(self, repo: pathlib.Path, tmp_path: pathlib.Path) -> None:
        """A stored object is restored into a not-yet-existing subdirectory."""
        data = b"restore-me"
        oid = _sha256(data)
        write_object(repo, oid, data)
        dest = tmp_path / "sub" / "restored.bin"
        assert restore_object(repo, oid, dest) is True
        assert dest.read_bytes() == data

    def test_restore_creates_parent_dirs(self, repo: pathlib.Path, tmp_path: pathlib.Path) -> None:
        """Missing intermediate directories are created and the content is intact."""
        data = b"deep-restore"
        oid = _sha256(data)
        write_object(repo, oid, data)
        dest = tmp_path / "a" / "b" / "c" / "file.bin"
        # Precondition: the directory chain must not exist yet, otherwise the
        # test would not show that restore_object created it.
        assert not dest.parent.exists()
        assert restore_object(repo, oid, dest) is True
        assert dest.is_file()
        # Existence alone would also pass for an empty or wrong file.
        assert dest.read_bytes() == data

    def test_restore_large_object_intact(self, repo: pathlib.Path, tmp_path: pathlib.Path) -> None:
        """A 2,000,000-byte random payload is restored without truncation or corruption."""
        data = secrets.token_bytes(2_000_000)
        oid = _sha256(data)
        write_object(repo, oid, data)
        dest = tmp_path / "large.bin"
        assert restore_object(repo, oid, dest) is True
        assert dest.read_bytes() == data
273
274
275 # ---------------------------------------------------------------------------
276 # Multiple distinct objects
277 # ---------------------------------------------------------------------------
278
279
class TestMultipleObjects:
    """Many distinct objects must coexist in one store without clobbering each other."""

    def test_100_distinct_objects_coexist(self, repo: pathlib.Path) -> None:
        """Write 100 distinct payloads, then read each one back."""
        written: dict[str, bytes] = {}
        for i in range(100):
            data = f"payload-{i:03d}-{'z' * i}".encode()
            oid = _sha256(data)
            write_object(repo, oid, data)
            written[oid] = data

        # Read back only after all writes, so a later write that damaged an
        # earlier object would be detected.
        for oid, data in written.items():
            assert read_object(repo, oid) == data

    def test_all_objects_independently_addressable(self, repo: pathlib.Path) -> None:
        """Verify no two distinct objects collide in the store.

        Each of 50 random payloads must be present under its own ID and read
        back as its own bytes once all writes have completed.
        """
        written: dict[str, bytes] = {}
        for _ in range(50):
            data = secrets.token_bytes(64)
            oid = _sha256(data)
            write_object(repo, oid, data)
            written[oid] = data
        # All OIDs should be unique (probabilistic but essentially certain).
        assert len(written) == 50
        # Unique hashes alone say nothing about the store; check each object
        # is actually retrievable under its own ID.
        for oid, data in written.items():
            assert has_object(repo, oid)
            assert read_object(repo, oid) == data