gabriel / muse public
snapshot.py python
229 lines 8.3 KB
f288fedc perf+fix: muse status — eliminate rglob descent, stat cache, single rep… Gabriel Cardona <gabriel@tellurstori.com> 2d ago
1 """Pure filesystem snapshot logic for ``muse commit``.
2
3 All functions here are side-effect-free (no DB, no I/O besides reading
4 files under ``workdir``). They are kept separate so they can be
5 unit-tested without a database.
6
7 ID derivation contract (deterministic, no random/UUID components):
8
9 object_id = sha256(file_bytes).hexdigest()
10
11 snapshot_id = sha256(
12 NUL.join(sorted(f"{path}NUL{oid}" for path, oid in manifest.items()))
13 ).hexdigest()
14
15 commit_id = sha256(
16 NUL.join([NUL.join(sorted(parent_ids)),
17 snapshot_id, message, committed_at_iso])
18 ).hexdigest()
19
20 The null byte (\\x00) is used as the field separator because it is:
21 - Illegal in POSIX filenames (preventing separator-injection attacks from
22 crafted file paths).
23 - Absent from SHA-256 hex strings (preventing injection via object IDs).
24 - Absent from ISO-8601 timestamps and typical message text.
25
26 This replaces the previous ``|`` / ``:`` separator scheme which allowed two
27 distinct manifests or commit inputs to produce the same hash if filenames
28 contained those characters.
29
30 Symlinks in the working tree are excluded from snapshots. Following a
31 symlink that points outside state/ would silently commit the contents
32 of arbitrary filesystem paths.
33
34 Hidden directories (any path component starting with ``.``) are also excluded
35 to prevent accidental inclusion of ``.git/``, ``.env``, and similar.
36 """
37
38 from __future__ import annotations
39
40 import hashlib
41 import os
42 import pathlib
43 import stat as _stat
44
45 from muse.core.stat_cache import load_cache
46
47 _SEP = "\x00"
48
49
50 def hash_file(path: pathlib.Path) -> str:
51 """Return the sha256 hex digest of a file's raw bytes.
52
53 This is the ``object_id`` for the given file. Reading in chunks
54 keeps memory usage constant regardless of file size.
55 """
56 h = hashlib.sha256()
57 with path.open("rb") as fh:
58 for chunk in iter(lambda: fh.read(65536), b""):
59 h.update(chunk)
60 return h.hexdigest()
61
62
63 def build_snapshot_manifest(workdir: pathlib.Path) -> dict[str, str]:
64 """Alias for ``walk_workdir`` — preferred name in public API."""
65 return walk_workdir(workdir)
66
67
68 def walk_workdir(workdir: pathlib.Path) -> dict[str, str]:
69 """Walk *workdir* recursively and return ``{rel_path: object_id}``.
70
71 Exclusions (all silent, no warning emitted):
72 - Symlinks — following them could commit content from outside state/.
73 - Directories — only regular files are included.
74 - Hidden files — names starting with ``.``.
75 - Hidden directories — any path component starting with ``.``.
76
77 Paths use POSIX separators regardless of host OS for cross-platform
78 reproducibility.
79
80 When a ``.muse/`` directory exists under *workdir*, the stat cache is
81 consulted to skip re-hashing files whose ``(mtime, size)`` is unchanged
82 since the last walk. The cache is saved atomically after every walk so
83 subsequent calls benefit immediately.
84
85 Performance note: ``os.walk`` with in-place ``dirnames`` pruning is used
86 instead of ``pathlib.rglob`` so that hidden directories (e.g. ``.venv/``,
87 ``.muse/objects/``, ``node_modules/``) are never descended into. On a
88 Python project with a virtualenv, ``rglob`` would visit tens of thousands
89 of site-package files before the hidden-dir filter could discard them.
90 """
91 cache = load_cache(workdir)
92 manifest: dict[str, str] = {}
93 root_str = str(workdir)
94 prefix_len = len(root_str) + 1 # +1 for the path separator
95
96 for dirpath, dirnames, filenames in os.walk(root_str, followlinks=False):
97 # Prune hidden subdirectories in-place before os.walk descends into them.
98 dirnames[:] = sorted(d for d in dirnames if not d.startswith("."))
99
100 for fname in sorted(filenames):
101 if fname.startswith("."):
102 continue
103 abs_str = os.path.join(dirpath, fname)
104 try:
105 st = os.lstat(abs_str)
106 except OSError:
107 continue
108 # os.lstat lets us check for symlinks and regular files with one
109 # syscall, replacing the separate is_symlink() + is_file() pair.
110 if not _stat.S_ISREG(st.st_mode):
111 continue
112 rel = abs_str[prefix_len:]
113 if os.sep != "/":
114 rel = rel.replace(os.sep, "/")
115 manifest[rel] = cache.get_cached(rel, abs_str, st.st_mtime, st.st_size)
116
117 cache.prune(set(manifest))
118 cache.save()
119 return manifest
120
121
122 def compute_snapshot_id(manifest: dict[str, str]) -> str:
123 """Return sha256 of the sorted ``path NUL object_id`` pairs.
124
125 The null-byte separator prevents collisions from filenames or object IDs
126 that contain the previous ``|`` / ``:`` separators.
127
128 Sorting ensures two identical working trees always produce the same
129 snapshot_id, regardless of filesystem traversal order.
130 """
131 parts = sorted(f"{path}{_SEP}{oid}" for path, oid in manifest.items())
132 payload = _SEP.join(parts).encode()
133 return hashlib.sha256(payload).hexdigest()
134
135
136 def diff_workdir_vs_snapshot(
137 workdir: pathlib.Path,
138 last_manifest: dict[str, str],
139 ) -> tuple[set[str], set[str], set[str], set[str]]:
140 """Compare *workdir* against *last_manifest* from the previous commit.
141
142 Returns a tuple of four disjoint path sets:
143
144 - ``added`` — files in *workdir* absent from *last_manifest*
145 (new files since the last commit).
146 - ``modified`` — files present in both but with a differing sha256 hash.
147 - ``deleted`` — files in *last_manifest* absent from *workdir*.
148 - ``untracked`` — non-empty only when *last_manifest* is empty (i.e. the
149 branch has no commits yet): every file in *workdir* is
150 treated as untracked rather than as newly-added.
151
152 All paths use POSIX separators for cross-platform reproducibility.
153 """
154 if not workdir.exists():
155 # Nothing on disk — every previously committed path is deleted.
156 return set(), set(), set(last_manifest.keys()), set()
157
158 current_manifest = walk_workdir(workdir)
159 current_paths = set(current_manifest.keys())
160 last_paths = set(last_manifest.keys())
161
162 if not last_paths:
163 # No prior snapshot — all working-tree files are untracked.
164 return set(), set(), set(), current_paths
165
166 added = current_paths - last_paths
167 deleted = last_paths - current_paths
168 common = current_paths & last_paths
169 modified = {p for p in common if current_manifest[p] != last_manifest[p]}
170 return added, modified, deleted, set()
171
172
173 def compute_commit_id(
174 parent_ids: list[str],
175 snapshot_id: str,
176 message: str,
177 committed_at_iso: str,
178 ) -> str:
179 """Return sha256 of the commit's canonical inputs.
180
181 Uses null bytes as field separators to prevent separator-injection
182 attacks from commit messages, author names, or branch names containing
183 ``|`` characters.
184
185 Given the same arguments on two machines the result is identical.
186 ``parent_ids`` is sorted before hashing so insertion order does not
187 affect determinism.
188 """
189 parts = [
190 _SEP.join(sorted(parent_ids)),
191 snapshot_id,
192 message,
193 committed_at_iso,
194 ]
195 payload = _SEP.join(parts).encode()
196 return hashlib.sha256(payload).hexdigest()
197
198
199 def compute_commit_tree_id(
200 parent_ids: list[str],
201 snapshot_id: str,
202 message: str,
203 author: str,
204 ) -> str:
205 """Return a deterministic commit ID for a raw plumbing commit (no timestamp).
206
207 Unlike ``compute_commit_id``, this function omits ``committed_at`` so that
208 the same (parent_ids, snapshot_id, message, author) tuple always produces
209 the same commit_id. This guarantees idempotency for ``muse commit-tree``:
210 re-running with identical inputs returns the same ID without inserting a
211 duplicate row.
212
213 Args:
214 parent_ids: Zero or more parent commit IDs. Sorted before hashing.
215 snapshot_id: The sha256 ID of the snapshot this commit points to.
216 message: The commit message.
217 author: The author name string.
218
219 Returns:
220 A 64-character lowercase hex SHA-256 digest.
221 """
222 parts = [
223 _SEP.join(sorted(parent_ids)),
224 snapshot_id,
225 message,
226 author,
227 ]
228 payload = _SEP.join(parts).encode()
229 return hashlib.sha256(payload).hexdigest()