snapshot.py
python
| 1 | """Pure filesystem snapshot logic for ``muse commit``. |
| 2 | |
| 3 | All functions here are side-effect-free (no DB, no I/O besides reading |
| 4 | files under ``workdir``). They are kept separate so they can be |
| 5 | unit-tested without a database. |
| 6 | |
| 7 | ID derivation contract (deterministic, no random/UUID components): |
| 8 | |
| 9 | object_id = sha256(file_bytes).hexdigest() |
| 10 | |
| 11 | snapshot_id = sha256( |
| 12 | NUL.join(sorted(f"{path}NUL{oid}" for path, oid in manifest.items())) |
| 13 | ).hexdigest() |
| 14 | |
| 15 | commit_id = sha256( |
| 16 | NUL.join([NUL.join(sorted(parent_ids)), |
| 17 | snapshot_id, message, committed_at_iso]) |
| 18 | ).hexdigest() |
| 19 | |
| 20 | The null byte (\\x00) is used as the field separator because it is: |
| 21 | - Illegal in POSIX filenames (preventing separator-injection attacks from |
| 22 | crafted file paths). |
| 23 | - Absent from SHA-256 hex strings (preventing injection via object IDs). |
| 24 | - Absent from ISO-8601 timestamps and typical message text. |
| 25 | |
| 26 | This replaces the previous ``|`` / ``:`` separator scheme which allowed two |
| 27 | distinct manifests or commit inputs to produce the same hash if filenames |
| 28 | contained those characters. |
| 29 | |
| 30 | Symlinks in the working tree are excluded from snapshots. Following a |
| 31 | symlink that points outside state/ would silently commit the contents |
| 32 | of arbitrary filesystem paths. |
| 33 | |
| 34 | Hidden directories (any path component starting with ``.``) are also excluded |
| 35 | to prevent accidental inclusion of ``.git/``, ``.env``, and similar. |
| 36 | """ |
| 37 | |
| 38 | from __future__ import annotations |
| 39 | |
| 40 | import hashlib |
| 41 | import os |
| 42 | import pathlib |
| 43 | import stat as _stat |
| 44 | |
| 45 | from muse.core.stat_cache import load_cache |
| 46 | |
| 47 | _SEP = "\x00" |
| 48 | |
| 49 | |
| 50 | def hash_file(path: pathlib.Path) -> str: |
| 51 | """Return the sha256 hex digest of a file's raw bytes. |
| 52 | |
| 53 | This is the ``object_id`` for the given file. Reading in chunks |
| 54 | keeps memory usage constant regardless of file size. |
| 55 | """ |
| 56 | h = hashlib.sha256() |
| 57 | with path.open("rb") as fh: |
| 58 | for chunk in iter(lambda: fh.read(65536), b""): |
| 59 | h.update(chunk) |
| 60 | return h.hexdigest() |
| 61 | |
| 62 | |
| 63 | def build_snapshot_manifest(workdir: pathlib.Path) -> dict[str, str]: |
| 64 | """Alias for ``walk_workdir`` — preferred name in public API.""" |
| 65 | return walk_workdir(workdir) |
| 66 | |
| 67 | |
| 68 | def walk_workdir(workdir: pathlib.Path) -> dict[str, str]: |
| 69 | """Walk *workdir* recursively and return ``{rel_path: object_id}``. |
| 70 | |
| 71 | Exclusions (all silent, no warning emitted): |
| 72 | - Symlinks — following them could commit content from outside state/. |
| 73 | - Directories — only regular files are included. |
| 74 | - Hidden files — names starting with ``.``. |
| 75 | - Hidden directories — any path component starting with ``.``. |
| 76 | |
| 77 | Paths use POSIX separators regardless of host OS for cross-platform |
| 78 | reproducibility. |
| 79 | |
| 80 | When a ``.muse/`` directory exists under *workdir*, the stat cache is |
| 81 | consulted to skip re-hashing files whose ``(mtime, size)`` is unchanged |
| 82 | since the last walk. The cache is saved atomically after every walk so |
| 83 | subsequent calls benefit immediately. |
| 84 | |
| 85 | Performance note: ``os.walk`` with in-place ``dirnames`` pruning is used |
| 86 | instead of ``pathlib.rglob`` so that hidden directories (e.g. ``.venv/``, |
| 87 | ``.muse/objects/``, ``node_modules/``) are never descended into. On a |
| 88 | Python project with a virtualenv, ``rglob`` would visit tens of thousands |
| 89 | of site-package files before the hidden-dir filter could discard them. |
| 90 | """ |
| 91 | cache = load_cache(workdir) |
| 92 | manifest: dict[str, str] = {} |
| 93 | root_str = str(workdir) |
| 94 | prefix_len = len(root_str) + 1 # +1 for the path separator |
| 95 | |
| 96 | for dirpath, dirnames, filenames in os.walk(root_str, followlinks=False): |
| 97 | # Prune hidden subdirectories in-place before os.walk descends into them. |
| 98 | dirnames[:] = sorted(d for d in dirnames if not d.startswith(".")) |
| 99 | |
| 100 | for fname in sorted(filenames): |
| 101 | if fname.startswith("."): |
| 102 | continue |
| 103 | abs_str = os.path.join(dirpath, fname) |
| 104 | try: |
| 105 | st = os.lstat(abs_str) |
| 106 | except OSError: |
| 107 | continue |
| 108 | # os.lstat lets us check for symlinks and regular files with one |
| 109 | # syscall, replacing the separate is_symlink() + is_file() pair. |
| 110 | if not _stat.S_ISREG(st.st_mode): |
| 111 | continue |
| 112 | rel = abs_str[prefix_len:] |
| 113 | if os.sep != "/": |
| 114 | rel = rel.replace(os.sep, "/") |
| 115 | manifest[rel] = cache.get_cached(rel, abs_str, st.st_mtime, st.st_size) |
| 116 | |
| 117 | cache.prune(set(manifest)) |
| 118 | cache.save() |
| 119 | return manifest |
| 120 | |
| 121 | |
| 122 | def compute_snapshot_id(manifest: dict[str, str]) -> str: |
| 123 | """Return sha256 of the sorted ``path NUL object_id`` pairs. |
| 124 | |
| 125 | The null-byte separator prevents collisions from filenames or object IDs |
| 126 | that contain the previous ``|`` / ``:`` separators. |
| 127 | |
| 128 | Sorting ensures two identical working trees always produce the same |
| 129 | snapshot_id, regardless of filesystem traversal order. |
| 130 | """ |
| 131 | parts = sorted(f"{path}{_SEP}{oid}" for path, oid in manifest.items()) |
| 132 | payload = _SEP.join(parts).encode() |
| 133 | return hashlib.sha256(payload).hexdigest() |
| 134 | |
| 135 | |
| 136 | def diff_workdir_vs_snapshot( |
| 137 | workdir: pathlib.Path, |
| 138 | last_manifest: dict[str, str], |
| 139 | ) -> tuple[set[str], set[str], set[str], set[str]]: |
| 140 | """Compare *workdir* against *last_manifest* from the previous commit. |
| 141 | |
| 142 | Returns a tuple of four disjoint path sets: |
| 143 | |
| 144 | - ``added`` — files in *workdir* absent from *last_manifest* |
| 145 | (new files since the last commit). |
| 146 | - ``modified`` — files present in both but with a differing sha256 hash. |
| 147 | - ``deleted`` — files in *last_manifest* absent from *workdir*. |
| 148 | - ``untracked`` — non-empty only when *last_manifest* is empty (i.e. the |
| 149 | branch has no commits yet): every file in *workdir* is |
| 150 | treated as untracked rather than as newly-added. |
| 151 | |
| 152 | All paths use POSIX separators for cross-platform reproducibility. |
| 153 | """ |
| 154 | if not workdir.exists(): |
| 155 | # Nothing on disk — every previously committed path is deleted. |
| 156 | return set(), set(), set(last_manifest.keys()), set() |
| 157 | |
| 158 | current_manifest = walk_workdir(workdir) |
| 159 | current_paths = set(current_manifest.keys()) |
| 160 | last_paths = set(last_manifest.keys()) |
| 161 | |
| 162 | if not last_paths: |
| 163 | # No prior snapshot — all working-tree files are untracked. |
| 164 | return set(), set(), set(), current_paths |
| 165 | |
| 166 | added = current_paths - last_paths |
| 167 | deleted = last_paths - current_paths |
| 168 | common = current_paths & last_paths |
| 169 | modified = {p for p in common if current_manifest[p] != last_manifest[p]} |
| 170 | return added, modified, deleted, set() |
| 171 | |
| 172 | |
| 173 | def compute_commit_id( |
| 174 | parent_ids: list[str], |
| 175 | snapshot_id: str, |
| 176 | message: str, |
| 177 | committed_at_iso: str, |
| 178 | ) -> str: |
| 179 | """Return sha256 of the commit's canonical inputs. |
| 180 | |
| 181 | Uses null bytes as field separators to prevent separator-injection |
| 182 | attacks from commit messages, author names, or branch names containing |
| 183 | ``|`` characters. |
| 184 | |
| 185 | Given the same arguments on two machines the result is identical. |
| 186 | ``parent_ids`` is sorted before hashing so insertion order does not |
| 187 | affect determinism. |
| 188 | """ |
| 189 | parts = [ |
| 190 | _SEP.join(sorted(parent_ids)), |
| 191 | snapshot_id, |
| 192 | message, |
| 193 | committed_at_iso, |
| 194 | ] |
| 195 | payload = _SEP.join(parts).encode() |
| 196 | return hashlib.sha256(payload).hexdigest() |
| 197 | |
| 198 | |
| 199 | def compute_commit_tree_id( |
| 200 | parent_ids: list[str], |
| 201 | snapshot_id: str, |
| 202 | message: str, |
| 203 | author: str, |
| 204 | ) -> str: |
| 205 | """Return a deterministic commit ID for a raw plumbing commit (no timestamp). |
| 206 | |
| 207 | Unlike ``compute_commit_id``, this function omits ``committed_at`` so that |
| 208 | the same (parent_ids, snapshot_id, message, author) tuple always produces |
| 209 | the same commit_id. This guarantees idempotency for ``muse commit-tree``: |
| 210 | re-running with identical inputs returns the same ID without inserting a |
| 211 | duplicate row. |
| 212 | |
| 213 | Args: |
| 214 | parent_ids: Zero or more parent commit IDs. Sorted before hashing. |
| 215 | snapshot_id: The sha256 ID of the snapshot this commit points to. |
| 216 | message: The commit message. |
| 217 | author: The author name string. |
| 218 | |
| 219 | Returns: |
| 220 | A 64-character lowercase hex SHA-256 digest. |
| 221 | """ |
| 222 | parts = [ |
| 223 | _SEP.join(sorted(parent_ids)), |
| 224 | snapshot_id, |
| 225 | message, |
| 226 | author, |
| 227 | ] |
| 228 | payload = _SEP.join(parts).encode() |
| 229 | return hashlib.sha256(payload).hexdigest() |