test_stress_object_store.py
python
| 1 | """Stress tests for the content-addressed object store. |
| 2 | |
| 3 | Exercises: |
| 4 | - Write-then-read round-trip for varied payload sizes (0 bytes … 1 MB). |
| 5 | - Idempotency: writing the same object ID twice is a no-op. |
| 6 | - has_object before and after writes. |
| 7 | - object_path sharding: first two hex chars as directory. |
| 8 | - read_object returns None for absent objects. |
| 9 | - restore_object copies bytes faithfully. |
| 10 | - write_object_from_path uses copy semantics, not load. |
| 11 | - Content integrity: read(write(content)) == content. |
| 12 | - Multiple distinct objects coexist without collision. |
| 13 | """ |
| 14 | |
| 15 | import hashlib |
| 16 | import os |
| 17 | import pathlib |
| 18 | import secrets |
| 19 | |
| 20 | import pytest |
| 21 | |
| 22 | from muse.core.object_store import ( |
| 23 | has_object, |
| 24 | object_path, |
| 25 | objects_dir, |
| 26 | read_object, |
| 27 | restore_object, |
| 28 | write_object, |
| 29 | write_object_from_path, |
| 30 | ) |
| 31 | |
| 32 | |
| 33 | # --------------------------------------------------------------------------- |
| 34 | # Helpers |
| 35 | # --------------------------------------------------------------------------- |
| 36 | |
| 37 | |
| 38 | def _sha256(data: bytes) -> str: |
| 39 | return hashlib.sha256(data).hexdigest() |
| 40 | |
| 41 | |
@pytest.fixture
def repo(tmp_path: pathlib.Path) -> pathlib.Path:
    """Return pytest's per-test temporary directory with a ``.muse`` subdirectory created in it."""
    muse_dir = tmp_path.joinpath(".muse")
    muse_dir.mkdir()
    return tmp_path
| 46 | |
| 47 | |
| 48 | # --------------------------------------------------------------------------- |
| 49 | # Basic round-trip |
| 50 | # --------------------------------------------------------------------------- |
| 51 | |
| 52 | |
class TestRoundTrip:
    """Round-trip checks: bytes written under an ID must be read back unchanged."""

    def test_write_then_read_small(self, repo: pathlib.Path) -> None:
        """A short ASCII payload survives a write followed by a read."""
        payload = b"hello muse"
        object_id = _sha256(payload)
        write_object(repo, object_id, payload)
        stored = read_object(repo, object_id)
        assert stored == payload

    def test_write_then_read_empty(self, repo: pathlib.Path) -> None:
        """A zero-length payload is read back as empty bytes."""
        payload = b""
        object_id = _sha256(payload)
        write_object(repo, object_id, payload)
        stored = read_object(repo, object_id)
        assert stored == payload

    def test_write_then_read_single_byte(self, repo: pathlib.Path) -> None:
        """A single NUL byte is read back intact."""
        payload = b"\x00"
        object_id = _sha256(payload)
        write_object(repo, object_id, payload)
        stored = read_object(repo, object_id)
        assert stored == payload

    def test_write_then_read_binary(self, repo: pathlib.Path) -> None:
        """A payload covering every byte value (repeated 100 times) is read back intact."""
        payload = bytes(range(256)) * 100
        object_id = _sha256(payload)
        write_object(repo, object_id, payload)
        stored = read_object(repo, object_id)
        assert stored == payload

    @pytest.mark.parametrize("size", [1, 100, 4096, 65536, 1_000_000])
    def test_write_then_read_various_sizes(self, repo: pathlib.Path, size: int) -> None:
        """Random payloads of several sizes report a fresh write and read back intact."""
        payload = secrets.token_bytes(size)
        object_id = _sha256(payload)
        created = write_object(repo, object_id, payload)
        assert created is True
        stored = read_object(repo, object_id)
        assert stored == payload

    def test_content_integrity(self, repo: pathlib.Path) -> None:
        """Read back exactly what was written — not a truncated or padded version."""
        for i in range(20):
            payload = f"object-content-{i}-{'x' * i}".encode()
            object_id = _sha256(payload)
            write_object(repo, object_id, payload)
            recovered = read_object(repo, object_id)
            assert recovered == payload
            # Explicit length check so a size mismatch is called out on its own line.
            assert len(recovered) == len(payload)
| 94 | |
| 95 | |
| 96 | # --------------------------------------------------------------------------- |
| 97 | # Idempotency |
| 98 | # --------------------------------------------------------------------------- |
| 99 | |
| 100 | |
class TestIdempotency:
    """Repeated writes to an existing object ID are expected to leave the stored bytes alone."""

    def test_double_write_returns_false_second_time(self, repo: pathlib.Path) -> None:
        """The first write reports True; the repeat write reports False."""
        payload = b"idempotent"
        object_id = _sha256(payload)
        first_result = write_object(repo, object_id, payload)
        assert first_result is True
        second_result = write_object(repo, object_id, payload)
        assert second_result is False

    def test_double_write_does_not_corrupt(self, repo: pathlib.Path) -> None:
        """A second write under the same ID must not replace the original bytes."""
        original = b"original content"
        object_id = _sha256(original)
        write_object(repo, object_id, original)
        # Attempt to overwrite with different content using the same ID — should be silently skipped.
        write_object(repo, object_id, b"different content")
        stored = read_object(repo, object_id)
        assert stored == original

    def test_triple_write_stays_stable(self, repo: pathlib.Path) -> None:
        """Writing the same object three times still yields the same bytes on read."""
        payload = b"triple-write"
        object_id = _sha256(payload)
        attempts = 3
        for _ in range(attempts):
            write_object(repo, object_id, payload)
        stored = read_object(repo, object_id)
        assert stored == payload
| 122 | |
| 123 | |
| 124 | # --------------------------------------------------------------------------- |
| 125 | # has_object |
| 126 | # --------------------------------------------------------------------------- |
| 127 | |
| 128 | |
class TestHasObject:
    """Presence checks before and after objects are written."""

    def test_absent_before_write(self, repo: pathlib.Path) -> None:
        """An ID that was never written is reported as absent."""
        unwritten_id = _sha256(b"not yet written")
        present = has_object(repo, unwritten_id)
        assert not present

    def test_present_after_write(self, repo: pathlib.Path) -> None:
        """An ID is reported as present once its object has been written."""
        payload = b"present"
        object_id = _sha256(payload)
        write_object(repo, object_id, payload)
        present = has_object(repo, object_id)
        assert present

    def test_other_objects_dont_shadow(self, repo: pathlib.Path) -> None:
        """Writing one object must not make a different, unwritten ID appear present."""
        first_payload = b"object-a"
        second_payload = b"object-b"
        first_id = _sha256(first_payload)
        second_id = _sha256(second_payload)

        write_object(repo, first_id, first_payload)
        assert has_object(repo, first_id)
        assert not has_object(repo, second_id)

        write_object(repo, second_id, second_payload)
        assert has_object(repo, second_id)
| 150 | |
| 151 | |
| 152 | # --------------------------------------------------------------------------- |
| 153 | # Absent objects |
| 154 | # --------------------------------------------------------------------------- |
| 155 | |
| 156 | |
class TestAbsentObjects:
    """Expected results when the requested object was never written."""

    def test_read_absent_returns_none(self, repo: pathlib.Path) -> None:
        """Reading an unknown ID is expected to return None."""
        missing_id = "a" * 64
        outcome = read_object(repo, missing_id)
        assert outcome is None

    def test_restore_absent_returns_false(self, repo: pathlib.Path, tmp_path: pathlib.Path) -> None:
        """Restoring an unknown ID is expected to return False and create no file."""
        missing_id = "b" * 64
        target = tmp_path.joinpath("restored.bin")
        outcome = restore_object(repo, missing_id, target)
        assert outcome is False
        assert not target.exists()

    def test_has_object_false_for_random_id(self, repo: pathlib.Path) -> None:
        """Ten freshly generated random 64-hex-character IDs are all reported absent."""
        for _ in range(10):
            random_id = secrets.token_hex(32)
            assert not has_object(repo, random_id)
| 172 | |
| 173 | |
| 174 | # --------------------------------------------------------------------------- |
| 175 | # Sharding layout |
| 176 | # --------------------------------------------------------------------------- |
| 177 | |
| 178 | |
class TestSharding:
    """Layout checks: the first two characters of an ID name its shard directory."""

    def test_object_path_uses_first_two_chars_as_dir(self, repo: pathlib.Path) -> None:
        """The parent directory is the two-character prefix; the file name is the rest."""
        shard, remainder = "ab", "c" * 62
        location = object_path(repo, shard + remainder)
        assert location.parent.name == shard
        assert location.name == remainder

    def test_objects_with_same_prefix_go_to_same_shard(self, repo: pathlib.Path) -> None:
        """Two IDs sharing a prefix resolve to the same parent directory."""
        first_dir = object_path(repo, "ff" + "0" * 62).parent
        second_dir = object_path(repo, "ff" + "1" * 62).parent
        assert first_dir == second_dir

    def test_objects_with_different_prefix_go_to_different_shards(self, repo: pathlib.Path) -> None:
        """Two IDs with different prefixes resolve to different parent directories."""
        first_dir = object_path(repo, "aa" + "x" * 62).parent
        second_dir = object_path(repo, "bb" + "x" * 62).parent
        assert first_dir != second_dir

    def test_256_shards_can_all_be_created(self, repo: pathlib.Path) -> None:
        """Write one object per possible shard prefix (00-ff)."""
        for i in range(256):
            prefix = f"{i:02x}"
            payload = f"shard-{prefix}".encode()
            # Force the shard by swapping the digest's first two characters for the prefix.
            object_id = prefix + _sha256(payload)[2:]
            write_object(repo, object_id, payload)
        # Verify all 256 shard dirs exist.
        shard_names = [entry.name for entry in objects_dir(repo).iterdir() if entry.is_dir()]
        assert len(shard_names) == 256
| 205 | |
| 206 | |
| 207 | # --------------------------------------------------------------------------- |
| 208 | # write_object_from_path |
| 209 | # --------------------------------------------------------------------------- |
| 210 | |
| 211 | |
class TestWriteObjectFromPath:
    """Checks for storing an object whose bytes come from a file on disk."""

    def test_from_path_round_trip(self, repo: pathlib.Path, tmp_path: pathlib.Path) -> None:
        """Content stored from a source file reports a fresh write and reads back unchanged."""
        payload = b"from-path-content"
        source_file = tmp_path.joinpath("source.bin")
        source_file.write_bytes(payload)
        object_id = _sha256(payload)
        created = write_object_from_path(repo, object_id, source_file)
        assert created is True
        assert read_object(repo, object_id) == payload

    def test_from_path_idempotent(self, repo: pathlib.Path, tmp_path: pathlib.Path) -> None:
        """A second store of the same ID from the same file is expected to report False."""
        payload = b"idempotent-from-path"
        source_file = tmp_path.joinpath("idem.bin")
        source_file.write_bytes(payload)
        object_id = _sha256(payload)
        write_object_from_path(repo, object_id, source_file)
        repeat_result = write_object_from_path(repo, object_id, source_file)
        assert repeat_result is False
| 228 | |
| 229 | |
| 230 | # --------------------------------------------------------------------------- |
| 231 | # restore_object |
| 232 | # --------------------------------------------------------------------------- |
| 233 | |
| 234 | |
class TestRestoreObject:
    """Checks for copying a stored object out to a destination path."""

    def test_restore_round_trip(self, repo: pathlib.Path, tmp_path: pathlib.Path) -> None:
        """A restored file holds the written bytes and the call reports True."""
        payload = b"restore-me"
        object_id = _sha256(payload)
        write_object(repo, object_id, payload)
        target = tmp_path.joinpath("sub", "restored.bin")
        restored = restore_object(repo, object_id, target)
        assert restored is True
        assert target.read_bytes() == payload

    def test_restore_creates_parent_dirs(self, repo: pathlib.Path, tmp_path: pathlib.Path) -> None:
        """Restoring into a not-yet-existing nested directory still produces the file."""
        payload = b"deep-restore"
        object_id = _sha256(payload)
        write_object(repo, object_id, payload)
        target = tmp_path.joinpath("a", "b", "c", "file.bin")
        restore_object(repo, object_id, target)
        assert target.exists()

    def test_restore_large_object_intact(self, repo: pathlib.Path, tmp_path: pathlib.Path) -> None:
        """A 2,000,000-byte random payload is restored byte-for-byte."""
        payload = secrets.token_bytes(2_000_000)
        object_id = _sha256(payload)
        write_object(repo, object_id, payload)
        target = tmp_path.joinpath("large.bin")
        restore_object(repo, object_id, target)
        assert target.read_bytes() == payload
| 259 | |
| 260 | |
| 261 | # --------------------------------------------------------------------------- |
| 262 | # Multiple distinct objects |
| 263 | # --------------------------------------------------------------------------- |
| 264 | |
| 265 | |
class TestMultipleObjects:
    """Many distinct objects written to one store must each remain retrievable."""

    def test_100_distinct_objects_coexist(self, repo: pathlib.Path) -> None:
        """Write 100 payloads of growing length, then read every one back unchanged."""
        written: dict[str, bytes] = {}
        for i in range(100):
            data = f"payload-{i:03d}-{'z' * i}".encode()
            oid = _sha256(data)
            write_object(repo, oid, data)
            written[oid] = data

        # Read only after all writes so a later write clobbering an earlier object is caught.
        for oid, data in written.items():
            assert read_object(repo, oid) == data

    def test_all_objects_independently_addressable(self, repo: pathlib.Path) -> None:
        """Verify no two distinct objects collide in the store."""
        written: dict[str, bytes] = {}
        for _ in range(50):
            data = secrets.token_bytes(64)
            oid = _sha256(data)
            write_object(repo, oid, data)
            written[oid] = data

        # All OIDs should be unique (probabilistic but essentially certain).
        assert len(written) == 50

        # Unique IDs alone say nothing about the store: fetch each object and
        # check it still maps to its own payload.
        for oid, data in written.items():
            assert has_object(repo, oid)
            assert read_object(repo, oid) == data