gabriel / muse public
object_store.py python
291 lines 9.7 KB
8d5137ed fix(security): full surface hardening — validation, path containment, p… Gabriel Cardona <cgcardona@gmail.com> 4d ago
1 """Canonical content-addressed object store for the Muse VCS.
2
3 All Muse commands that read or write blobs — ``muse commit``, ``muse read-tree``,
4 ``muse reset`` — go through this module exclusively. No command may implement
5 its own path logic or copy its own blobs.
6
7 Layout
8 ------
9 Objects are stored under ``<repo_root>/.muse/objects/`` using a two-character
10 sharded directory layout that mirrors Git's loose-object format::
11
12 .muse/objects/<sha2>/<sha62>
13
14 where ``<sha2>`` is the first two hex characters of the SHA-256 digest and
15 ``<sha62>`` is the remaining 62 characters. For example, the object with
16 digest ``ab1234...`` is stored at ``.muse/objects/ab/1234...``.
17
18 Why sharding?
19 -------------
20 Music repositories accumulate objects at a far higher rate than code
21 repositories: every generated take, every variation, every rendered clip is a
22 new blob. A single recording session can produce tens of thousands of objects.
23 Without sharding, a flat directory exceeds filesystem limits (ext4, APFS, HFS+
24 all degrade or hard-limit above ~32,000 entries per directory). Two hex
25 characters yield 256 subdirectories — the same trade-off Git settled on after
26 years of production use.
27
28 This module is the single source of truth for all local object I/O.
29 The store is append-only: writing the same object twice is always a no-op.
30 """
31
32 from __future__ import annotations
33
34 import hashlib
35 import logging
36 import os
37 import pathlib
38 import shutil
39 import tempfile
40
41 from muse.core.validation import MAX_FILE_BYTES, validate_object_id
42
43 logger = logging.getLogger(__name__)
44
45 _OBJECTS_DIR = "objects"
46
47
def objects_dir(repo_root: pathlib.Path) -> pathlib.Path:
    """Locate the root directory of the local object store.

    The store root is ``<repo_root>/.muse/objects/``. This function only
    builds the path; it never touches the filesystem, so the directory may
    not exist yet.

    Args:
        repo_root: Root of the Muse repository (the directory containing
            ``.muse/``).

    Returns:
        Path to the objects directory. It is absolute only when *repo_root*
        is absolute.
    """
    muse_dir = repo_root / ".muse"
    return muse_dir / _OBJECTS_DIR
62
63
def object_path(repo_root: pathlib.Path, object_id: str) -> pathlib.Path:
    """Compute where a single object lives on disk.

    The first two characters of *object_id* name the shard directory and the
    remaining characters name the file inside it::

        .muse/objects/<first 2 chars>/<remaining chars>

    The path is only computed; nothing is created or checked on disk.

    Args:
        repo_root: Root of the Muse repository.
        object_id: SHA-256 hex digest of the object's content (64 chars).

    Returns:
        Path to the object file under the store root (may not yet exist).

    Raises:
        ValueError: If ``validate_object_id`` rejects *object_id*.
    """
    # Validate before slicing so a malformed ID can never shape the path.
    validate_object_id(object_id)
    shard, remainder = object_id[:2], object_id[2:]
    return objects_dir(repo_root) / shard / remainder
86
87
def has_object(repo_root: pathlib.Path, object_id: str) -> bool:
    """Report whether *object_id* is present in the local store.

    Only checks that the object's path exists; the content is neither read
    nor verified.

    Args:
        repo_root: Root of the Muse repository.
        object_id: SHA-256 hex digest to check.

    Returns:
        ``True`` if the object's path exists, ``False`` otherwise.
    """
    candidate = object_path(repo_root, object_id)
    return candidate.exists()
100
101
def write_object(repo_root: pathlib.Path, object_id: str, content: bytes) -> bool:
    """Write *content* to the local object store under *object_id*.

    If the object already exists (same ID = same content, content-addressed)
    the write is skipped and ``False`` is returned. Returns ``True`` when a
    new object was written.

    The SHA-256 of *content* is checked against *object_id* before anything
    is written, so a blob whose bytes do not match its ID is rejected.

    The shard directory is created on first write. Content is written to a
    temp file inside the shard directory and then moved into place with
    ``os.replace``, so the final path never holds a partially written file.
    If the write is interrupted for any reason, including
    ``KeyboardInterrupt``, the temp file is removed before the exception
    propagates.

    Args:
        repo_root: Root of the Muse repository.
        object_id: SHA-256 hex digest that identifies this object (64 chars).
        content: Raw bytes to persist.

    Returns:
        ``True`` if the object was newly written, ``False`` if it already
        existed (idempotent).

    Raises:
        ValueError: If *object_id* is not a valid 64-char hex string, or if
            the hash of *content* does not match *object_id*.
    """
    validate_object_id(object_id)

    actual = hashlib.sha256(content).hexdigest()
    if actual != object_id:
        raise ValueError(
            f"Content integrity failure: expected object {object_id[:8]}…, "
            f"got {actual[:8]}…"
        )

    dest = object_path(repo_root, object_id)
    if dest.exists():
        logger.debug("⚠️ Object %s already in store — skipped", object_id[:8])
        return False

    dest.parent.mkdir(parents=True, exist_ok=True)

    # The temp file lives in the shard directory so the final os.replace is a
    # rename within one directory rather than a cross-filesystem move.
    fd, tmp_str = tempfile.mkstemp(dir=dest.parent, prefix=".obj-tmp-")
    tmp = pathlib.Path(tmp_str)
    try:
        with os.fdopen(fd, "wb") as fh:
            fh.write(content)
        os.replace(tmp, dest)
    except BaseException:
        # BaseException rather than Exception: Ctrl-C or SystemExit during a
        # large write must not leave a stray ".obj-tmp-*" file in the store.
        # The exception is always re-raised, so nothing is swallowed.
        tmp.unlink(missing_ok=True)
        raise

    logger.debug("✅ Stored object %s (%d bytes)", object_id[:8], len(content))
    return True
159
160
def write_object_from_path(
    repo_root: pathlib.Path,
    object_id: str,
    src: pathlib.Path,
) -> bool:
    """Copy *src* into the object store without loading it into memory.

    Intended for large blobs: the file is copied with ``shutil.copy2`` and
    hashed in 64 KiB chunks, so the whole content is never held in memory.

    Idempotent: if the object already exists it is never overwritten.

    Integrity is checked on the bytes that were actually copied. *src* is
    first copied to a private temp file in the shard directory, that copy is
    hashed, and only a copy whose SHA-256 equals *object_id* is moved into
    place with ``os.replace``. Hashing the copy rather than *src* closes the
    window in which *src* could change between verification and copying.

    On any failure (hash mismatch, I/O error, ``KeyboardInterrupt``) the temp
    file is removed and the final object path is left untouched. The shard
    directory may already have been created at that point.

    Args:
        repo_root: Root of the Muse repository.
        object_id: SHA-256 hex digest of *src*'s content (64 chars).
        src: Path of the source file to store.

    Returns:
        ``True`` if the object was newly written, ``False`` if it already
        existed (idempotent).

    Raises:
        ValueError: If *object_id* is invalid or the copied content's hash
            does not match it.
        OSError: If *src* cannot be read or the store cannot be written.
    """
    validate_object_id(object_id)

    dest = object_path(repo_root, object_id)
    if dest.exists():
        logger.debug("⚠️ Object %s already in store — skipped", object_id[:8])
        return False

    dest.parent.mkdir(parents=True, exist_ok=True)

    fd, tmp_str = tempfile.mkstemp(dir=dest.parent, prefix=".obj-tmp-")
    tmp = pathlib.Path(tmp_str)
    try:
        # mkstemp hands back an open descriptor; copy2 reopens the path
        # itself, so release the descriptor first.
        os.close(fd)
        shutil.copy2(src, tmp)

        # Verify the private copy, not *src*: these are the exact bytes that
        # will become the object, so a concurrent change to *src* cannot
        # slip mismatching content into the store.
        h = hashlib.sha256()
        with tmp.open("rb") as fh:
            for chunk in iter(lambda: fh.read(65536), b""):
                h.update(chunk)
        actual = h.hexdigest()
        if actual != object_id:
            raise ValueError(
                f"Content integrity failure for {src}: expected {object_id[:8]}…, "
                f"got {actual[:8]}…"
            )

        os.replace(tmp, dest)
    except BaseException:
        # BaseException rather than Exception so an interrupt mid-copy does
        # not leave a ".obj-tmp-*" file behind. Always re-raised.
        tmp.unlink(missing_ok=True)
        raise

    logger.debug("✅ Stored object %s (%s)", object_id[:8], src.name)
    return True
222
223
def read_object(repo_root: pathlib.Path, object_id: str) -> bytes | None:
    """Load the raw bytes stored under *object_id*.

    A missing object yields ``None`` rather than ``FileNotFoundError`` so the
    caller can decide how to report it. Objects larger than
    ``MAX_FILE_BYTES`` are refused instead of being read into memory.

    Args:
        repo_root: Root of the Muse repository.
        object_id: SHA-256 hex digest of the desired object.

    Returns:
        The object's bytes, or ``None`` when the object is absent from the
        store.

    Raises:
        ValueError: If *object_id* is not a valid 64-char hex string.
        OSError: If the object file exceeds MAX_FILE_BYTES.
    """
    validate_object_id(object_id)
    blob_path = object_path(repo_root, object_id)

    if blob_path.exists():
        n_bytes = blob_path.stat().st_size
        if n_bytes <= MAX_FILE_BYTES:
            return blob_path.read_bytes()
        # Too large: report the limit in whole MiB.
        limit_mib = MAX_FILE_BYTES // (1024 * 1024)
        raise OSError(
            f"Object {object_id[:8]} is {n_bytes} bytes, exceeding the "
            f"{limit_mib} MiB read limit."
        )

    logger.debug("⚠️ Object %s not found in local store", object_id[:8])
    return None
253
254
def restore_object(
    repo_root: pathlib.Path,
    object_id: str,
    dest: pathlib.Path,
) -> bool:
    """Materialise a stored object at *dest* via a file-to-file copy.

    The object is copied with ``shutil.copy2`` rather than being read into
    memory first. Missing parent directories of *dest* are created. Nothing
    is created or written when the object is not in the store.

    This function does not check where *dest* points. The caller must ensure
    it lies inside a safe base directory (use
    :func:`muse.core.validation.contain_path` before calling).

    Args:
        repo_root: Root of the Muse repository.
        object_id: SHA-256 hex digest of the desired object (64 chars).
        dest: Path to write the restored file to.

    Returns:
        ``True`` on success, ``False`` if the object is not in the store.

    Raises:
        ValueError: If *object_id* is not a valid 64-char hex string.
    """
    validate_object_id(object_id)
    stored = object_path(repo_root, object_id)

    if stored.exists():
        dest.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(stored, dest)
        logger.debug("✅ Restored object %s → %s", object_id[:8], dest)
        return True

    logger.debug(
        "⚠️ Object %s not found in local store — cannot restore", object_id[:8]
    )
    return False