gabriel / muse public
store.py python
964 lines 34.9 KB
afdf7861 fix: correct .muse init layout — bare repos, encoding, type safety, constants Gabriel Cardona <gabriel@tellurstori.com> 2d ago
1 """File-based commit and snapshot store for the Muse VCS.
2
3 All commit and snapshot metadata is stored as JSON files under ``.muse/`` —
4 no external database required.
5
6 Layout
7 ------
8
9 .muse/
10 commits/<commit_id>.json — one JSON file per commit
11 snapshots/<snapshot_id>.json — one JSON file per snapshot manifest
12 tags/<repo_id>/<tag_id>.json — tag records
13 objects/<sha2>/<sha62> — content-addressed blobs (via object_store.py)
14 refs/heads/<branch> — branch HEAD pointers (plain text commit IDs)
15 HEAD — "ref: refs/heads/main" | "commit: <sha256>"
16 repo.json — repository identity
17
18 Commit JSON schema
19 ------------------
20
21 {
22 "commit_id": "<sha256>",
23 "repo_id": "<uuid>",
24 "branch": "main",
25 "parent_commit_id": null | "<sha256>",
26 "parent2_commit_id": null | "<sha256>",
27 "snapshot_id": "<sha256>",
28 "message": "Add verse melody",
29 "author": "gabriel",
30 "committed_at": "2026-03-16T12:00:00+00:00",
31 "metadata": {}
32 }
33
34 Snapshot JSON schema
35 --------------------
36
37 {
38 "snapshot_id": "<sha256>",
39 "manifest": {"tracks/drums.mid": "<sha256>", ...},
40 "created_at": "2026-03-16T12:00:00+00:00"
41 }
42
43 All functions are synchronous — file I/O on a local ``.muse/`` directory
44 does not require async. This removes the SQLAlchemy/asyncpg dependency from
45 the CLI entirely.
46 """
47
48 from __future__ import annotations
49
50 import datetime
51 import json
52 import logging
53 import pathlib
54 import re
55 import uuid
56 from dataclasses import dataclass, field
57 from typing import Literal, TypedDict
58
59 from muse.core.validation import (
60 sanitize_glob_prefix,
61 validate_branch_name,
62 validate_ref_id,
63 validate_repo_id,
64 )
65 from muse.domain import SemVerBump, StructuredDelta
66
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)

# Names of the per-record-type sub-directories kept under ``.muse/``.
_COMMITS_DIR, _SNAPSHOTS_DIR, _TAGS_DIR = "commits", "snapshots", "tags"
72
73 # ---------------------------------------------------------------------------
74 # HEAD file — typed I/O
75 # ---------------------------------------------------------------------------
76 #
77 # Muse HEAD format
78 # ----------------
79 # The ``.muse/HEAD`` file is always one of two self-describing forms:
80 #
81 # ref: refs/heads/<branch> — symbolic ref; HEAD points to a branch
82 # commit: <sha256> — detached HEAD; HEAD points to a commit
83 #
84 # The ``ref:`` prefix is adopted from Git because it is the right design:
85 # a file that can hold two semantically different things should say which
86 # one it holds. The ``commit:`` prefix for detached HEAD is a Muse
87 # extension — Git uses a bare SHA, which is ambiguous (SHA-1? SHA-256?).
88 # Muse makes the hash algorithm implicit in the prefix, leaving the door
89 # open for future algorithm identifiers without changing the parsing rule.
90 #
91 # There is no backward-compatibility layer; every write site uses
92 # ``write_head_branch`` / ``write_head_commit`` and every read site uses
93 # ``read_head`` / ``read_current_branch``.
94
95
class SymbolicHead(TypedDict):
    """Parsed HEAD state for the ``ref: refs/heads/<branch>`` form."""

    # Discriminator; always the literal "branch" for this variant.
    kind: Literal["branch"]
    # Name of the branch HEAD refers to.
    branch: str
101
102
class DetachedHead(TypedDict):
    """Parsed HEAD state for the ``commit: <sha256>`` (detached) form."""

    # Discriminator; always the literal "commit" for this variant.
    kind: Literal["commit"]
    # ID of the commit HEAD refers to directly.
    commit_id: str
108
109
# Return type of ``read_head``: exactly one of the two HEAD forms, told apart
# by the ``kind`` key ("branch" or "commit").
HeadState = SymbolicHead | DetachedHead
111
112
def read_head(repo_root: pathlib.Path) -> HeadState:
    """Parse ``.muse/HEAD`` and return a typed :data:`HeadState`.

    Args:
        repo_root: Repository root (the directory containing ``.muse/``).

    Returns:
        A :class:`SymbolicHead` when the file holds
        ``ref: refs/heads/<branch>``, or a :class:`DetachedHead` when it
        holds ``commit: <sha256>``.

    Raises:
        ValueError: When the HEAD file is missing (including the case where
            ``.muse`` exists but is not a directory), when the content
            matches neither form, or when the detached commit ID is not 64
            lowercase hex characters.  The original OS error, if any, is
            attached as ``__cause__``.  The branch name is additionally
            passed through ``validate_branch_name``, which may raise.
    """
    head_path = repo_root / ".muse" / "HEAD"
    try:
        raw = head_path.read_text(encoding="utf-8").strip()
    except (FileNotFoundError, NotADirectoryError) as exc:
        # Chain explicitly so the traceback shows the OS error as the direct
        # cause instead of "another exception occurred during handling".
        raise ValueError(
            f"Repository HEAD file missing: {head_path}\n"
            "The repository may be uninitialised. Run 'muse init' to fix it."
        ) from exc
    if raw.startswith("ref: refs/heads/"):
        branch = raw.removeprefix("ref: refs/heads/").strip()
        validate_branch_name(branch)
        return SymbolicHead(kind="branch", branch=branch)
    if raw.startswith("commit: "):
        commit_id = raw.removeprefix("commit: ").strip()
        if not re.fullmatch(r"[0-9a-f]{64}", commit_id):
            raise ValueError(f"Malformed commit ID in HEAD: {commit_id!r}")
        return DetachedHead(kind="commit", commit_id=commit_id)
    raise ValueError(
        f"Malformed HEAD: {raw!r}. "
        "Expected 'ref: refs/heads/<branch>' or 'commit: <sha256>'."
    )
142
143
def read_current_branch(repo_root: pathlib.Path) -> str:
    """Return the name of the branch HEAD currently points to.

    Delegates parsing to :func:`read_head`.

    Raises:
        ValueError: When HEAD is detached (it holds a commit, not a branch),
            so callers never mistake a commit ID for a branch name.  Errors
            raised by :func:`read_head` propagate unchanged.
    """
    head = read_head(repo_root)
    if head["kind"] == "branch":
        return head["branch"]
    raise ValueError(
        "Repository is in detached HEAD state. "
        "Run 'muse checkout <branch>' to return to a branch."
    )
158
159
def write_head_branch(repo_root: pathlib.Path, branch: str) -> None:
    """Point ``.muse/HEAD`` at *branch* using the symbolic-ref form.

    The file content becomes ``ref: refs/heads/<branch>`` followed by a
    newline.  *branch* is passed through ``validate_branch_name`` before
    anything is written.
    """
    validate_branch_name(branch)
    head_file = repo_root / ".muse" / "HEAD"
    content = f"ref: refs/heads/{branch}\n"
    head_file.write_text(content, encoding="utf-8")
168
169
def write_head_commit(repo_root: pathlib.Path, commit_id: str) -> None:
    """Put ``.muse/HEAD`` into detached state, pointing at *commit_id*.

    The file content becomes ``commit: <sha256>`` followed by a newline; the
    ``commit:`` prefix keeps the entry self-describing, unlike a bare hash.

    Raises:
        ValueError: When *commit_id* is not exactly 64 lowercase hex
            characters.  Nothing is written in that case.
    """
    looks_like_sha256 = re.fullmatch(r"[0-9a-f]{64}", commit_id) is not None
    if not looks_like_sha256:
        raise ValueError(f"commit_id must be a 64-char hex string, got: {commit_id!r}")
    head_file = repo_root / ".muse" / "HEAD"
    head_file.write_text(f"commit: {commit_id}\n", encoding="utf-8")
181
182
183 # ---------------------------------------------------------------------------
184 # Wire-format TypedDicts (JSON-serialisable, used by to_dict / from_dict)
185 # ---------------------------------------------------------------------------
186
187
class CommitDict(TypedDict, total=False):
    """Wire (JSON) shape of a ``CommitRecord``; every key is optional.

    Field groups
    ------------
    Delta
        ``structured_delta`` is the typed delta produced by the domain
        plugin's ``diff()`` at commit time; ``None`` on the initial commit,
        which has no parent to diff against.

    Semantic versioning
        ``sem_ver_bump`` and ``breaking_changes``.  When absent (older
        records, non-code domains) they are treated as ``"none"`` / ``[]``.

    Agent provenance (all default to ``""`` for older records)
        ``agent_id``       Stable identity of the committing agent or human
                           (e.g. ``"counterpoint-bot"`` or ``"gabriel"``).
        ``model_id``       Model identifier when the author is an AI agent
                           (e.g. ``"claude-opus-4"``); empty for humans.
        ``toolchain_id``   Toolchain that produced the commit
                           (e.g. ``"cursor-agent-v2"``).
        ``prompt_hash``    SHA-256 of the instruction/prompt that triggered
                           the commit; identifies the prompt without storing
                           its content.
        ``signature``      HMAC-SHA256 hex digest of ``commit_id`` using the
                           agent's shared key.  Verifiable with
                           :func:`muse.core.provenance.verify_commit_hmac`.
        ``signer_key_id``  Fingerprint of the signing key
                           (SHA-256[:16] of the raw key bytes).

    Schema version
        ``format_version`` is bumped by each phase of the Muse supercharge
        plan that extends the record, so readers know which optional fields
        to expect:

        - ``1`` — base record (commit_id, snapshot_id, parent, message, author)
        - ``2`` — adds ``structured_delta`` (Phase 1: Typed Delta Algebra)
        - ``3`` — adds ``sem_ver_bump``, ``breaking_changes``
          (Phase 2: Domain Schema)
        - ``4`` — adds agent provenance: ``agent_id``, ``model_id``,
          ``toolchain_id``, ``prompt_hash``, ``signature``,
          ``signer_key_id`` (Phase 4: Agent Identity)
        - ``5`` — adds CRDT annotation fields: ``reviewed_by`` (ORSet of
          reviewer IDs) and ``test_runs`` (GCounter of test-run events)

        Records without the field are treated as version ``1``.
    """

    # --- identity and history -------------------------------------------
    commit_id: str
    repo_id: str
    branch: str
    snapshot_id: str
    message: str
    committed_at: str  # ISO-8601 text, as produced by ``datetime.isoformat()``
    parent_commit_id: str | None
    parent2_commit_id: str | None
    author: str
    metadata: dict[str, str]
    # --- delta and semantic versioning ----------------------------------
    structured_delta: StructuredDelta | None
    sem_ver_bump: SemVerBump
    breaking_changes: list[str]
    # --- agent provenance -----------------------------------------------
    agent_id: str
    model_id: str
    toolchain_id: str
    prompt_hash: str
    signature: str
    signer_key_id: str
    # --- schema version -------------------------------------------------
    format_version: int
    # --- CRDT-backed annotations (format_version >= 5) -------------------
    # ``reviewed_by`` holds the logical state of an ORSet: unique reviewer
    # identifiers; merging two records takes the union.  ``test_runs`` is a
    # GCounter: a monotonically increasing test-run count.  Older records
    # lack both; they default to [] and 0.
    reviewed_by: list[str]
    test_runs: int
261
262
class SnapshotDict(TypedDict):
    """Wire (JSON) shape of a ``SnapshotRecord``; all keys are required."""

    snapshot_id: str
    # Maps a tracked path (e.g. "tracks/drums.mid") to its object ID.
    manifest: dict[str, str]
    # ISO-8601 text, as produced by ``datetime.isoformat()``.
    created_at: str
269
270
class TagDict(TypedDict):
    """Wire (JSON) shape of a ``TagRecord``; all keys are required."""

    tag_id: str
    repo_id: str
    # Commit the tag is attached to.
    commit_id: str
    # The tag text itself.
    tag: str
    # ISO-8601 text, as produced by ``datetime.isoformat()``.
    created_at: str
279
280
class RemoteCommitPayload(TypedDict, total=False):
    """Commit payload as received from a remote during push/pull.

    Every key is optional (``total=False``) because peers speaking an older
    protocol version may omit fields they do not know.  Code that turns a
    payload into a ``CommitRecord`` is responsible for checking the fields
    it requires first.
    """

    commit_id: str
    repo_id: str
    branch: str
    snapshot_id: str
    message: str
    committed_at: str
    parent_commit_id: str | None
    parent2_commit_id: str | None
    author: str
    metadata: dict[str, str]
    # Snapshot manifest shipped inline with the commit.
    manifest: dict[str, str]
300
301
302 # ---------------------------------------------------------------------------
303 # Data classes
304 # ---------------------------------------------------------------------------
305
306
@dataclass
class CommitRecord:
    """In-memory form of one commit stored under ``.muse/commits/``.

    Records are immutable by convention only: the dataclass is not frozen,
    and history-defining fields (``commit_id``, parents, ``snapshot_id``,
    ``message``) must never be changed after creation.

    ``sem_ver_bump`` and ``breaking_changes`` are populated by the commit
    command when a code-domain delta is available; they default to ``"none"``
    and ``[]`` for older records and non-code domains.

    Agent provenance fields default to ``""`` so JSON written before they
    existed still deserialises.  See :class:`CommitDict` for field semantics.
    """

    commit_id: str
    repo_id: str
    branch: str
    snapshot_id: str
    message: str
    committed_at: datetime.datetime
    parent_commit_id: str | None = None
    parent2_commit_id: str | None = None
    author: str = ""
    metadata: dict[str, str] = field(default_factory=dict)
    structured_delta: StructuredDelta | None = None
    sem_ver_bump: SemVerBump = "none"
    breaking_changes: list[str] = field(default_factory=list)
    agent_id: str = ""
    model_id: str = ""
    toolchain_id: str = ""
    prompt_hash: str = ""
    signature: str = ""
    signer_key_id: str = ""
    #: Schema evolution counter — see :class:`CommitDict` for the version table.
    #: Version 5 adds ``reviewed_by`` (ORSet) and ``test_runs`` (GCounter).
    format_version: int = 5
    reviewed_by: list[str] = field(default_factory=list)
    test_runs: int = 0

    def to_dict(self) -> CommitDict:
        """Return the JSON-serialisable form of this record.

        Mutable containers are copied so the result does not alias the
        record's own ``metadata`` / ``breaking_changes`` / ``reviewed_by``.
        """
        return CommitDict(
            commit_id=self.commit_id,
            repo_id=self.repo_id,
            branch=self.branch,
            snapshot_id=self.snapshot_id,
            message=self.message,
            committed_at=self.committed_at.isoformat(),
            parent_commit_id=self.parent_commit_id,
            parent2_commit_id=self.parent2_commit_id,
            author=self.author,
            metadata=dict(self.metadata),
            structured_delta=self.structured_delta,
            sem_ver_bump=self.sem_ver_bump,
            breaking_changes=list(self.breaking_changes),
            agent_id=self.agent_id,
            model_id=self.model_id,
            toolchain_id=self.toolchain_id,
            prompt_hash=self.prompt_hash,
            signature=self.signature,
            signer_key_id=self.signer_key_id,
            format_version=self.format_version,
            reviewed_by=list(self.reviewed_by),
            test_runs=self.test_runs,
        )

    @classmethod
    def from_dict(cls, d: CommitDict) -> "CommitRecord":
        """Build a record from decoded JSON, tolerating older/partial data.

        Raises:
            KeyError: When ``commit_id``, ``snapshot_id`` or ``branch`` is
                missing.
            TypeError: When any of those three is not a ``str`` (or *d* is
                not subscriptable by key).

        A missing, non-string, or unparseable ``committed_at`` is replaced
        by the current UTC time with a warning.  A ``test_runs`` value that
        cannot be converted to ``int`` falls back to ``0``.  Other optional
        fields fall back to their documented defaults when absent.
        """
        # Runtime type guards — JSON can contain anything; fail loud rather
        # than silently carrying non-string IDs into path construction.
        commit_id = d["commit_id"]
        if not isinstance(commit_id, str):
            raise TypeError(f"commit_id must be str, got {type(commit_id).__name__}")
        snapshot_id = d["snapshot_id"]
        if not isinstance(snapshot_id, str):
            raise TypeError(f"snapshot_id must be str, got {type(snapshot_id).__name__}")
        branch = d["branch"]
        if not isinstance(branch, str):
            raise TypeError(f"branch must be str, got {type(branch).__name__}")

        try:
            committed_at = datetime.datetime.fromisoformat(d["committed_at"])
        except (ValueError, KeyError, TypeError):
            # TypeError covers a JSON null / number here: fromisoformat()
            # only accepts str, and such a value is just as "unparseable".
            logger.warning(
                "⚠️ Commit record has missing or unparseable committed_at; "
                "substituting current time. The record may have been tampered with."
            )
            committed_at = datetime.datetime.now(datetime.timezone.utc)

        # int() raises ValueError/TypeError on garbage such as "abc" or [1];
        # treat that like an absent counter instead of aborting the load.
        try:
            test_runs = int(d.get("test_runs") or 0)
        except (TypeError, ValueError):
            test_runs = 0

        return cls(
            commit_id=commit_id,
            repo_id=d["repo_id"] if isinstance(d.get("repo_id"), str) else "",
            branch=branch,
            snapshot_id=snapshot_id,
            message=d["message"] if isinstance(d.get("message"), str) else "",
            committed_at=committed_at,
            parent_commit_id=d.get("parent_commit_id"),
            parent2_commit_id=d.get("parent2_commit_id"),
            author=d.get("author", ""),
            metadata=dict(d.get("metadata") or {}),
            structured_delta=d.get("structured_delta"),
            sem_ver_bump=d.get("sem_ver_bump", "none"),
            breaking_changes=list(d.get("breaking_changes") or []),
            agent_id=d.get("agent_id", ""),
            model_id=d.get("model_id", ""),
            toolchain_id=d.get("toolchain_id", ""),
            prompt_hash=d.get("prompt_hash", ""),
            signature=d.get("signature", ""),
            signer_key_id=d.get("signer_key_id", ""),
            format_version=d.get("format_version", 1),
            reviewed_by=list(d.get("reviewed_by") or []),
            test_runs=test_runs,
        )
417
418
@dataclass
class SnapshotRecord:
    """In-memory form of one snapshot stored under ``.muse/snapshots/``.

    ``manifest`` maps tracked paths to object IDs.  ``created_at`` defaults
    to the current UTC time.
    """

    snapshot_id: str
    manifest: dict[str, str]
    created_at: datetime.datetime = field(
        default_factory=lambda: datetime.datetime.now(datetime.timezone.utc)
    )

    def to_dict(self) -> SnapshotDict:
        """Return the JSON-serialisable form of this record.

        The manifest is copied so mutating the result cannot alter the
        record.
        """
        return SnapshotDict(
            snapshot_id=self.snapshot_id,
            manifest=dict(self.manifest),
            created_at=self.created_at.isoformat(),
        )

    @classmethod
    def from_dict(cls, d: SnapshotDict) -> "SnapshotRecord":
        """Build a record from decoded JSON.

        Raises:
            KeyError: When ``snapshot_id`` is missing.
            TypeError: When ``snapshot_id`` is not a ``str`` (or *d* is not
                subscriptable by key).

        A missing, non-string, or unparseable ``created_at`` is replaced by
        the current UTC time; a missing or null ``manifest`` becomes ``{}``.
        """
        # The ID ends up in a file name; refuse anything that is not a str.
        snapshot_id = d["snapshot_id"]
        if not isinstance(snapshot_id, str):
            raise TypeError(f"snapshot_id must be str, got {type(snapshot_id).__name__}")
        try:
            created_at = datetime.datetime.fromisoformat(d["created_at"])
        except (ValueError, KeyError, TypeError):
            # TypeError: fromisoformat() rejects non-str input such as null.
            created_at = datetime.datetime.now(datetime.timezone.utc)
        return cls(
            snapshot_id=snapshot_id,
            manifest=dict(d.get("manifest") or {}),
            created_at=created_at,
        )
447
448
@dataclass
class TagRecord:
    """A semantic tag attached to a commit.

    ``created_at`` defaults to the current UTC time.
    """

    tag_id: str
    repo_id: str
    commit_id: str
    tag: str
    created_at: datetime.datetime = field(
        default_factory=lambda: datetime.datetime.now(datetime.timezone.utc)
    )

    def to_dict(self) -> TagDict:
        """Return the JSON-serialisable form of this record."""
        return TagDict(
            tag_id=self.tag_id,
            repo_id=self.repo_id,
            commit_id=self.commit_id,
            tag=self.tag,
            created_at=self.created_at.isoformat(),
        )

    @classmethod
    def from_dict(cls, d: TagDict) -> "TagRecord":
        """Build a record from decoded JSON.

        Raises:
            KeyError: When ``repo_id``, ``commit_id`` or ``tag`` is missing.
            TypeError: When *d* is not subscriptable by key.

        A missing ``tag_id`` is replaced by a fresh UUID4 string.  A missing,
        non-string, or unparseable ``created_at`` is replaced by the current
        UTC time.
        """
        # Read the required keys first so malformed input fails with
        # KeyError/TypeError before any fallback logic runs.
        repo_id = d["repo_id"]
        commit_id = d["commit_id"]
        tag = d["tag"]
        try:
            created_at = datetime.datetime.fromisoformat(d["created_at"])
        except (ValueError, KeyError, TypeError):
            # TypeError: fromisoformat() rejects non-str input such as null.
            created_at = datetime.datetime.now(datetime.timezone.utc)
        return cls(
            tag_id=d.get("tag_id", str(uuid.uuid4())),
            repo_id=repo_id,
            commit_id=commit_id,
            tag=tag,
            created_at=created_at,
        )
483
484
485 # ---------------------------------------------------------------------------
486 # Path helpers
487 # ---------------------------------------------------------------------------
488
489
def _commits_dir(repo_root: pathlib.Path) -> pathlib.Path:
    """Return the directory that holds commit JSON files for *repo_root*."""
    return repo_root.joinpath(".muse", _COMMITS_DIR)
492
493
def _snapshots_dir(repo_root: pathlib.Path) -> pathlib.Path:
    """Return the directory that holds snapshot JSON files for *repo_root*."""
    return repo_root.joinpath(".muse", _SNAPSHOTS_DIR)
496
497
def _tags_dir(repo_root: pathlib.Path, repo_id: str) -> pathlib.Path:
    """Return the per-repository tag directory ``.muse/tags/<repo_id>``.

    Raises:
        ValueError: When *repo_id* is empty or contains ``/``, ``\\`` or
            ``..``.  This is a best-effort guard against path traversal via
            crafted IDs from remote data.
    """
    forbidden = ("/", "\\", "..")
    if any(token in repo_id for token in forbidden) or not repo_id:
        raise ValueError(f"repo_id {repo_id!r} contains unsafe path components.")
    return repo_root.joinpath(".muse", _TAGS_DIR, repo_id)
504
505
def _commit_path(repo_root: pathlib.Path, commit_id: str) -> pathlib.Path:
    """Return the path of the JSON file for *commit_id* (no existence check)."""
    filename = "{}.json".format(commit_id)
    return _commits_dir(repo_root) / filename
508
509
def _snapshot_path(repo_root: pathlib.Path, snapshot_id: str) -> pathlib.Path:
    """Return the path of the JSON file for *snapshot_id* (no existence check)."""
    filename = "{}.json".format(snapshot_id)
    return _snapshots_dir(repo_root) / filename
512
513
514 # ---------------------------------------------------------------------------
515 # Commit operations
516 # ---------------------------------------------------------------------------
517
518
def write_commit(repo_root: pathlib.Path, commit: CommitRecord) -> None:
    """Persist a commit record to ``.muse/commits/<commit_id>.json``.

    Existing files are never overwritten: if the target already exists the
    call is a no-op.  Because of that rule a half-written file would never
    be repaired, so the JSON is first written to a uniquely named temporary
    sibling and then moved into place with a single rename.

    Args:
        repo_root: Repository root.
        commit: The record to store.
    """
    _commits_dir(repo_root).mkdir(parents=True, exist_ok=True)
    path = _commit_path(repo_root, commit.commit_id)
    if path.exists():
        logger.debug("⚠️ Commit %s already exists — skipped", commit.commit_id[:8])
        return
    payload = json.dumps(commit.to_dict(), indent=2) + "\n"
    # The ".tmp" suffix keeps the scratch file out of every "*.json" glob.
    tmp_path = path.with_name(f"{path.name}.{uuid.uuid4().hex}.tmp")
    try:
        tmp_path.write_text(payload, encoding="utf-8")
        tmp_path.replace(path)
    finally:
        # No-op after a successful replace; removes the scratch file on error.
        tmp_path.unlink(missing_ok=True)
    logger.debug("✅ Stored commit %s branch=%r", commit.commit_id[:8], commit.branch)
528
529
def read_commit(repo_root: pathlib.Path, commit_id: str) -> CommitRecord | None:
    """Load a commit record by ID.

    Callers that accept user-supplied or remote-supplied commit IDs should
    validate the ID with :func:`~muse.core.validation.validate_ref_id` before
    calling this function.  This function itself accepts any string to
    support internal uses with computed IDs.

    Returns:
        The record, or ``None`` when the file does not exist or cannot be
        decoded (a warning is logged in the latter case).
    """
    path = _commit_path(repo_root, commit_id)
    if not path.exists():
        return None
    try:
        return CommitRecord.from_dict(json.loads(path.read_text(encoding="utf-8")))
    except (ValueError, KeyError, TypeError) as exc:
        # ValueError also covers json.JSONDecodeError and UnicodeDecodeError
        # (both are subclasses), so invalid UTF-8 counts as corrupt too.
        logger.warning("⚠️ Corrupt commit file %s: %s", path, exc)
        return None
546
547
def overwrite_commit(repo_root: pathlib.Path, commit: CommitRecord) -> None:
    """Overwrite an existing commit record on disk (e.g. for annotation updates).

    Unlike :func:`write_commit`, this function always writes the record even
    if the file already exists.  Use only for annotation fields
    (``reviewed_by``, ``test_runs``) that are semantically additive — never
    for changing history (commit_id, parent, snapshot, message).

    The new content is written to a temporary sibling file and renamed over
    the target, so an interrupted write cannot leave the existing commit
    truncated.

    Args:
        repo_root: Repository root.
        commit: The updated commit record to persist.
    """
    _commits_dir(repo_root).mkdir(parents=True, exist_ok=True)
    path = _commit_path(repo_root, commit.commit_id)
    payload = json.dumps(commit.to_dict(), indent=2) + "\n"
    # The ".tmp" suffix keeps the scratch file out of every "*.json" glob.
    tmp_path = path.with_name(f"{path.name}.{uuid.uuid4().hex}.tmp")
    try:
        tmp_path.write_text(payload, encoding="utf-8")
        tmp_path.replace(path)
    finally:
        # No-op after a successful replace; removes the scratch file on error.
        tmp_path.unlink(missing_ok=True)
    logger.debug("✅ Updated annotation on commit %s", commit.commit_id[:8])
564
565
def update_commit_metadata(
    repo_root: pathlib.Path,
    commit_id: str,
    key: str,
    value: str,
) -> bool:
    """Set a single string key in a commit's metadata dict.

    The commit is loaded with :func:`read_commit`, its ``metadata`` is
    updated in memory, and the record is written back through
    :func:`overwrite_commit` so this module has one rewrite path.

    Returns:
        ``True`` on success, ``False`` if the commit is not found (or its
        file could not be decoded).
    """
    commit = read_commit(repo_root, commit_id)
    if commit is None:
        logger.warning("⚠️ Commit %s not found — cannot update metadata", commit_id[:8])
        return False
    commit.metadata[key] = value
    overwrite_commit(repo_root, commit)
    logger.debug("✅ Set %s=%r on commit %s", key, value, commit_id[:8])
    return True
585
586
def get_head_commit_id(repo_root: pathlib.Path, branch: str) -> str | None:
    """Return the commit ID stored in ``.muse/refs/heads/<branch>``.

    *branch* is passed through ``validate_branch_name`` first.  ``None`` is
    returned when the ref file is absent or contains only whitespace.
    """
    validate_branch_name(branch)
    ref_file = repo_root.joinpath(".muse", "refs", "heads", branch)
    if ref_file.exists():
        content = ref_file.read_text(encoding="utf-8").strip()
        return content or None
    return None
595
596
def get_head_snapshot_id(
    repo_root: pathlib.Path,
    repo_id: str,
    branch: str,
) -> str | None:
    """Return the ``snapshot_id`` of the commit at the tip of *branch*.

    ``None`` is returned when the branch has no tip commit or the tip commit
    cannot be loaded.  *repo_id* is accepted but not used by this function.
    """
    tip_id = get_head_commit_id(repo_root, branch)
    tip = None if tip_id is None else read_commit(repo_root, tip_id)
    return None if tip is None else tip.snapshot_id
610
611
def resolve_commit_ref(
    repo_root: pathlib.Path,
    repo_id: str,
    branch: str,
    ref: str | None,
) -> CommitRecord | None:
    """Resolve a commit reference to a ``CommitRecord``.

    *ref* may be:
    - ``None`` / ``"HEAD"`` (any letter case) — the tip commit of *branch*.
    - A full or abbreviated commit SHA — resolved by exact lookup, then by
      prefix scan.

    The reference is passed through ``sanitize_glob_prefix`` before any
    filesystem use.  If nothing is left after sanitising (for example the
    caller passed ``""`` or only glob metacharacters), ``None`` is returned:
    an empty prefix would otherwise match every commit file and hand back an
    arbitrary commit.

    *repo_id* is accepted but not used by this function.

    Returns:
        The matching record, or ``None`` when nothing matches.
    """
    if ref is None or ref.upper() == "HEAD":
        commit_id = get_head_commit_id(repo_root, branch)
        if commit_id is None:
            return None
        return read_commit(repo_root, commit_id)

    # Sanitize user-supplied ref before using it in any filesystem operation.
    safe_ref = sanitize_glob_prefix(ref)
    if not safe_ref:
        return None

    # Try exact match — only if it passes full-ID validation.  The try body
    # holds the validator alone so errors from the lookup are not masked.
    try:
        validate_ref_id(safe_ref)
    except ValueError:
        pass  # Not a full ID — fall through to prefix scan.
    else:
        commit = read_commit(repo_root, safe_ref)
        if commit is not None:
            return commit

    # Prefix scan with sanitized prefix.
    return _find_commit_by_prefix(repo_root, safe_ref)
647
648
def _find_commit_by_prefix(
    repo_root: pathlib.Path, prefix: str
) -> CommitRecord | None:
    """Find the first readable commit whose file name starts with *prefix*.

    *prefix* is passed through ``sanitize_glob_prefix`` before being used in
    a glob pattern.  An empty sanitized prefix yields ``None`` rather than
    matching every file.  Candidates are visited in sorted path order so the
    result is the same on every run when several commits share the prefix.
    Files that cannot be decoded are skipped.

    Returns:
        The first decodable matching record, or ``None``.
    """
    commits_dir = _commits_dir(repo_root)
    if not commits_dir.exists():
        return None
    safe_prefix = sanitize_glob_prefix(prefix)
    if not safe_prefix:
        # "*.json" would turn a targeted lookup into a whole-directory scan.
        return None
    for path in sorted(commits_dir.glob(f"{safe_prefix}*.json")):
        try:
            return CommitRecord.from_dict(json.loads(path.read_text(encoding="utf-8")))
        except (ValueError, KeyError, TypeError):
            # ValueError covers JSONDecodeError and UnicodeDecodeError.
            continue
    return None
667
668
def find_commits_by_prefix(
    repo_root: pathlib.Path, prefix: str
) -> list[CommitRecord]:
    """Return all readable commits whose file name starts with *prefix*.

    *prefix* is passed through ``sanitize_glob_prefix`` before being used in
    a glob pattern.  Files that cannot be decoded (invalid UTF-8, invalid
    JSON, missing or mistyped required fields) are skipped.  Result order
    follows directory iteration order and is not guaranteed.
    """
    commits_dir = _commits_dir(repo_root)
    if not commits_dir.exists():
        return []
    safe_prefix = sanitize_glob_prefix(prefix)
    results: list[CommitRecord] = []
    for path in commits_dir.glob(f"{safe_prefix}*.json"):
        try:
            record = CommitRecord.from_dict(json.loads(path.read_text(encoding="utf-8")))
        except (ValueError, KeyError, TypeError):
            # ValueError covers JSONDecodeError and UnicodeDecodeError.
            continue
        results.append(record)
    return results
684
685
def get_commits_for_branch(
    repo_root: pathlib.Path,
    repo_id: str,
    branch: str,
) -> list[CommitRecord]:
    """Return the commits on *branch*, newest first.

    Starts at the branch tip and follows ``parent_commit_id`` only (the
    second parent of a merge is not followed).  The walk stops at a commit
    with no parent, at a commit that cannot be loaded, or when an ID
    repeats.  *repo_id* is accepted but not used by this function.
    """
    history: list[CommitRecord] = []
    visited: set[str] = set()
    cursor = get_head_commit_id(repo_root, branch)
    while True:
        if not cursor or cursor in visited:
            break
        visited.add(cursor)
        record = read_commit(repo_root, cursor)
        if record is None:
            break
        history.append(record)
        cursor = record.parent_commit_id
    return history
703
704
def get_all_commits(repo_root: pathlib.Path) -> list[CommitRecord]:
    """Return every readable commit in the store (order not guaranteed).

    Files that cannot be decoded (invalid UTF-8, invalid JSON, missing or
    mistyped required fields) are skipped, so one corrupt file cannot abort
    the scan.
    """
    commits_dir = _commits_dir(repo_root)
    if not commits_dir.exists():
        return []
    results: list[CommitRecord] = []
    for path in commits_dir.glob("*.json"):
        try:
            record = CommitRecord.from_dict(json.loads(path.read_text(encoding="utf-8")))
        except (ValueError, KeyError, TypeError):
            # ValueError covers JSONDecodeError and UnicodeDecodeError.
            continue
        results.append(record)
    return results
717
718
def walk_commits_between(
    repo_root: pathlib.Path,
    to_commit_id: str,
    from_commit_id: str | None = None,
    max_commits: int = 10_000,
) -> list[CommitRecord]:
    """Collect commits from *to_commit_id* back towards *from_commit_id*.

    Follows ``parent_commit_id`` links starting at *to_commit_id*.  The walk
    ends when it reaches *from_commit_id* (which is not included), a commit
    without a parent, a commit that cannot be loaded, a repeated ID, or
    *max_commits* collected records.

    Args:
        repo_root: Repository root.
        to_commit_id: Inclusive end of the range.
        from_commit_id: Exclusive start; ``None`` means walk to the initial
            commit.
        max_commits: Safety cap on the number of records returned.

    Returns:
        List of ``CommitRecord`` objects, newest first (reverse it for
        oldest-first).
    """
    collected: list[CommitRecord] = []
    visited: set[str] = set()
    cursor: str | None = to_commit_id
    while True:
        if not cursor or cursor in visited or len(collected) >= max_commits:
            break
        visited.add(cursor)
        if cursor == from_commit_id:
            break
        record = read_commit(repo_root, cursor)
        if record is None:
            break
        collected.append(record)
        cursor = record.parent_commit_id
    return collected
752
753
754 # ---------------------------------------------------------------------------
755 # Snapshot operations
756 # ---------------------------------------------------------------------------
757
758
def write_snapshot(repo_root: pathlib.Path, snapshot: SnapshotRecord) -> None:
    """Persist a snapshot record to ``.muse/snapshots/<snapshot_id>.json``.

    Existing files are never overwritten: if the target already exists the
    call is a no-op.  Because of that rule a half-written file would never
    be repaired, so the JSON is first written to a uniquely named temporary
    sibling and then moved into place with a single rename.

    Args:
        repo_root: Repository root.
        snapshot: The record to store.
    """
    _snapshots_dir(repo_root).mkdir(parents=True, exist_ok=True)
    path = _snapshot_path(repo_root, snapshot.snapshot_id)
    if path.exists():
        logger.debug("⚠️ Snapshot %s already exists — skipped", snapshot.snapshot_id[:8])
        return
    payload = json.dumps(snapshot.to_dict(), indent=2) + "\n"
    # The ".tmp" suffix keeps the scratch file out of every "*.json" glob.
    tmp_path = path.with_name(f"{path.name}.{uuid.uuid4().hex}.tmp")
    try:
        tmp_path.write_text(payload, encoding="utf-8")
        tmp_path.replace(path)
    finally:
        # No-op after a successful replace; removes the scratch file on error.
        tmp_path.unlink(missing_ok=True)
    logger.debug("✅ Stored snapshot %s (%d files)", snapshot.snapshot_id[:8], len(snapshot.manifest))
768
769
def read_snapshot(repo_root: pathlib.Path, snapshot_id: str) -> SnapshotRecord | None:
    """Load a snapshot record by ID.

    Callers that accept user-supplied or remote-supplied snapshot IDs should
    validate the ID with :func:`~muse.core.validation.validate_ref_id` before
    calling this function.  This function itself accepts any string to
    support internal uses with computed IDs.

    Returns:
        The record, or ``None`` when the file does not exist or cannot be
        decoded (a warning is logged in the latter case).
    """
    path = _snapshot_path(repo_root, snapshot_id)
    if not path.exists():
        return None
    try:
        return SnapshotRecord.from_dict(json.loads(path.read_text(encoding="utf-8")))
    except (ValueError, KeyError, TypeError) as exc:
        # ValueError also covers json.JSONDecodeError and UnicodeDecodeError
        # (both are subclasses), so invalid UTF-8 counts as corrupt too.
        logger.warning("⚠️ Corrupt snapshot file %s: %s", path, exc)
        return None
786
787
def get_commit_snapshot_manifest(
    repo_root: pathlib.Path, commit_id: str
) -> dict[str, str] | None:
    """Return a copy of the manifest of the snapshot that *commit_id* points to.

    ``None`` is returned, with a warning logged, when either the commit or
    its snapshot cannot be loaded.
    """
    record = read_commit(repo_root, commit_id)
    if record is None:
        logger.warning("⚠️ Commit %s not found", commit_id[:8])
        return None
    snap = read_snapshot(repo_root, record.snapshot_id)
    if snap is not None:
        return dict(snap.manifest)
    logger.warning(
        "⚠️ Snapshot %s referenced by commit %s not found",
        record.snapshot_id[:8],
        commit_id[:8],
    )
    return None
805
806
def get_head_snapshot_manifest(
    repo_root: pathlib.Path, repo_id: str, branch: str
) -> dict[str, str] | None:
    """Return a copy of the manifest at the tip of *branch*.

    ``None`` is returned when the branch has no resolvable tip snapshot or
    that snapshot cannot be loaded.
    """
    tip_snapshot_id = get_head_snapshot_id(repo_root, repo_id, branch)
    if tip_snapshot_id is None:
        return None
    snap = read_snapshot(repo_root, tip_snapshot_id)
    return None if snap is None else dict(snap.manifest)
818
819
def get_all_object_ids(repo_root: pathlib.Path, repo_id: str) -> list[str]:
    """Return the sorted object IDs referenced by commit-reachable snapshots.

    Only snapshots named by a readable commit are considered; snapshot files
    that no commit points to are not scanned.  Each distinct snapshot is
    loaded once, even when many commits share it.  *repo_id* is accepted but
    not used by this function.
    """
    snapshot_ids = {commit.snapshot_id for commit in get_all_commits(repo_root)}
    object_ids: set[str] = set()
    for snapshot_id in snapshot_ids:
        snapshot = read_snapshot(repo_root, snapshot_id)
        if snapshot is not None:
            object_ids.update(snapshot.manifest.values())
    return sorted(object_ids)
828
829
830 # ---------------------------------------------------------------------------
831 # Tag operations
832 # ---------------------------------------------------------------------------
833
834
def write_tag(repo_root: pathlib.Path, tag: TagRecord) -> None:
    """Persist a tag record to ``.muse/tags/<repo_id>/<tag_id>.json``.

    An existing file for the same ``tag_id`` is overwritten.

    Raises:
        ValueError: When ``tag.tag_id`` is not a non-empty string free of
            ``/``, ``\\`` and ``..`` — it becomes part of a file name, so
            this applies the same best-effort traversal guard already used
            for ``repo_id``.  Also raised by the ``repo_id`` check.
    """
    tag_id = tag.tag_id
    if (
        not isinstance(tag_id, str)
        or not tag_id
        or "/" in tag_id
        or "\\" in tag_id
        or ".." in tag_id
    ):
        raise ValueError(f"tag_id {tag_id!r} contains unsafe path components.")
    tags_dir = _tags_dir(repo_root, tag.repo_id)
    tags_dir.mkdir(parents=True, exist_ok=True)
    path = tags_dir / f"{tag_id}.json"
    path.write_text(json.dumps(tag.to_dict(), indent=2) + "\n", encoding="utf-8")
    logger.debug("✅ Stored tag %r on commit %s", tag.tag, tag.commit_id[:8])
842
843
def get_tags_for_commit(
    repo_root: pathlib.Path, repo_id: str, commit_id: str
) -> list[TagRecord]:
    """Return all readable tags attached to *commit_id*.

    Scans ``.muse/tags/<repo_id>/*.json``.  Files that cannot be decoded
    (invalid UTF-8, invalid JSON, missing required fields) are skipped.

    Raises:
        ValueError: When *repo_id* fails the tag-directory safety check.
    """
    tags_dir = _tags_dir(repo_root, repo_id)
    if not tags_dir.exists():
        return []
    results: list[TagRecord] = []
    for path in tags_dir.glob("*.json"):
        try:
            record = TagRecord.from_dict(json.loads(path.read_text(encoding="utf-8")))
        except (ValueError, KeyError, TypeError):
            # ValueError covers JSONDecodeError and UnicodeDecodeError.
            continue
        if record.commit_id == commit_id:
            results.append(record)
    return results
860
861
def get_all_tags(repo_root: pathlib.Path, repo_id: str) -> list[TagRecord]:
    """Return all readable tags stored for *repo_id*.

    Scans ``.muse/tags/<repo_id>/*.json``.  Files that cannot be decoded
    (invalid UTF-8, invalid JSON, missing required fields) are skipped.

    Raises:
        ValueError: When *repo_id* fails the tag-directory safety check.
    """
    tags_dir = _tags_dir(repo_root, repo_id)
    if not tags_dir.exists():
        return []
    results: list[TagRecord] = []
    for path in tags_dir.glob("*.json"):
        try:
            record = TagRecord.from_dict(json.loads(path.read_text(encoding="utf-8")))
        except (ValueError, KeyError, TypeError):
            # ValueError covers JSONDecodeError and UnicodeDecodeError.
            continue
        results.append(record)
    return results
874
875
876 # ---------------------------------------------------------------------------
877 # Remote sync helpers (push/pull)
878 # ---------------------------------------------------------------------------
879
880
def store_pulled_commit(
    repo_root: pathlib.Path, commit_data: RemoteCommitPayload
) -> bool:
    """Persist a commit received from a remote into local storage.

    Idempotent — silently skips if the commit already exists.

    All ID fields from the remote payload — ``commit_id``, ``snapshot_id``,
    ``branch`` and both parent IDs — are validated before any filesystem
    operation.  Parent IDs matter because history walks later use them to
    build file paths.  An empty or absent parent is stored as ``None``.

    When the payload names a snapshot that is not yet stored locally, a
    snapshot record is written from the payload's ``manifest`` (an empty
    manifest if none was sent).

    Returns:
        ``True`` if the commit was newly stored; ``False`` if it already
        existed or the payload was rejected (a warning is logged).
    """
    commit_id = commit_data.get("commit_id") or ""
    if not commit_id:
        logger.warning("⚠️ store_pulled_commit: missing commit_id — skipping")
        return False

    try:
        validate_ref_id(commit_id)
    except ValueError as exc:
        logger.warning("⚠️ store_pulled_commit: invalid commit_id %r — %s", commit_id, exc)
        return False

    snapshot_id = commit_data.get("snapshot_id") or ""
    if snapshot_id:
        try:
            validate_ref_id(snapshot_id)
        except ValueError as exc:
            logger.warning(
                "⚠️ store_pulled_commit: invalid snapshot_id %r — %s", snapshot_id, exc
            )
            return False

    branch = commit_data.get("branch") or ""
    if branch:
        try:
            validate_branch_name(branch)
        except ValueError as exc:
            logger.warning(
                "⚠️ store_pulled_commit: invalid branch %r — %s", branch, exc
            )
            return False

    # Parent IDs are followed by history walks and end up in file paths, so
    # they get the same validation as the commit's own ID.
    parent_ids: list[str | None] = []
    for field_name, raw_parent in (
        ("parent_commit_id", commit_data.get("parent_commit_id")),
        ("parent2_commit_id", commit_data.get("parent2_commit_id")),
    ):
        if not raw_parent:
            parent_ids.append(None)
            continue
        try:
            if not isinstance(raw_parent, str):
                raise ValueError(f"expected str, got {type(raw_parent).__name__}")
            validate_ref_id(raw_parent)
        except ValueError as exc:
            logger.warning(
                "⚠️ store_pulled_commit: invalid %s %r — %s", field_name, raw_parent, exc
            )
            return False
        parent_ids.append(raw_parent)
    parent_commit_id, parent2_commit_id = parent_ids

    if read_commit(repo_root, commit_id) is not None:
        logger.debug("⚠️ Pulled commit %s already exists — skipped", commit_id[:8])
        return False

    # Build the record from the validated locals, not by re-reading the payload.
    commit_dict = CommitDict(
        commit_id=commit_id,
        repo_id=commit_data.get("repo_id") or "",
        branch=branch,
        snapshot_id=snapshot_id,
        message=commit_data.get("message") or "",
        committed_at=commit_data.get("committed_at") or "",
        parent_commit_id=parent_commit_id,
        parent2_commit_id=parent2_commit_id,
        author=commit_data.get("author") or "",
        metadata=dict(commit_data.get("metadata") or {}),
        structured_delta=None,
    )
    write_commit(repo_root, CommitRecord.from_dict(commit_dict))

    # Ensure a (possibly stub) snapshot record exists.
    if snapshot_id and read_snapshot(repo_root, snapshot_id) is None:
        manifest: dict[str, str] = dict(commit_data.get("manifest") or {})
        write_snapshot(repo_root, SnapshotRecord(
            snapshot_id=snapshot_id,
            manifest=manifest,
        ))

    return True
952
953
def store_pulled_object_metadata(
    repo_root: pathlib.Path, object_data: dict[str, str]
) -> bool:
    """Acknowledge an object descriptor received from a remote.

    Currently a no-op that always returns ``True``: objects are
    content-addressed files, so presence in ``.muse/objects/`` is the ground
    truth and nothing extra is recorded here.  The blob bytes themselves are
    stored by ``object_store.write_object``.  The hook exists for GC and
    push-delta computation.
    """
    # Both arguments are intentionally unused.
    acknowledged = True
    return acknowledged