muse/core/snapshot.py · gabriel/muse

snapshot.py python

229 lines 8.3 KB

f288fedc perf+fix: muse status — eliminate rglob descent, stat cache, single rep… Gabriel Cardona <gabriel@tellurstori.com> 2d ago

1	"""Pure filesystem snapshot logic for ``muse commit``.
2
3	All functions here are side-effect-free (no DB, no I/O besides reading
4	files under ``workdir``). They are kept separate so they can be
5	unit-tested without a database.
6
7	ID derivation contract (deterministic, no random/UUID components):
8
9	object_id = sha256(file_bytes).hexdigest()
10
11	snapshot_id = sha256(
12	NUL.join(sorted(f"{path}NUL{oid}" for path, oid in manifest.items()))
13	).hexdigest()
14
15	commit_id = sha256(
16	NUL.join([NUL.join(sorted(parent_ids)),
17	snapshot_id, message, committed_at_iso])
18	).hexdigest()
19
20	The null byte (\\x00) is used as the field separator because it is:
21	- Illegal in POSIX filenames (preventing separator-injection attacks from
22	crafted file paths).
23	- Absent from SHA-256 hex strings (preventing injection via object IDs).
24	- Absent from ISO-8601 timestamps and typical message text.
25
26	This replaces the previous ``\|`` / ``:`` separator scheme which allowed two
27	distinct manifests or commit inputs to produce the same hash if filenames
28	contained those characters.
29
30	Symlinks in the working tree are excluded from snapshots. Following a
31	symlink that points outside state/ would silently commit the contents
32	of arbitrary filesystem paths.
33
34	Hidden directories (any path component starting with ``.``) are also excluded
35	to prevent accidental inclusion of ``.git/``, ``.env``, and similar.
36	"""
37
38	from __future__ import annotations
39
40	import hashlib
41	import os
42	import pathlib
43	import stat as _stat
44
45	from muse.core.stat_cache import load_cache
46
47	_SEP = "\x00"
48
49
50	def hash_file(path: pathlib.Path) -> str:
51	"""Return the sha256 hex digest of a file's raw bytes.
52
53	This is the ``object_id`` for the given file. Reading in chunks
54	keeps memory usage constant regardless of file size.
55	"""
56	h = hashlib.sha256()
57	with path.open("rb") as fh:
58	for chunk in iter(lambda: fh.read(65536), b""):
59	h.update(chunk)
60	return h.hexdigest()
61
62
63	def build_snapshot_manifest(workdir: pathlib.Path) -> dict[str, str]:
64	"""Alias for ``walk_workdir`` — preferred name in public API."""
65	return walk_workdir(workdir)
66
67
68	def walk_workdir(workdir: pathlib.Path) -> dict[str, str]:
69	"""Walk workdir recursively and return ``{rel_path: object_id}``.
70
71	Exclusions (all silent, no warning emitted):
72	- Symlinks — following them could commit content from outside state/.
73	- Directories — only regular files are included.
74	- Hidden files — names starting with ``.``.
75	- Hidden directories — any path component starting with ``.``.
76
77	Paths use POSIX separators regardless of host OS for cross-platform
78	reproducibility.
79
80	When a ``.muse/`` directory exists under workdir, the stat cache is
81	consulted to skip re-hashing files whose ``(mtime, size)`` is unchanged
82	since the last walk. The cache is saved atomically after every walk so
83	subsequent calls benefit immediately.
84
85	Performance note: ``os.walk`` with in-place ``dirnames`` pruning is used
86	instead of ``pathlib.rglob`` so that hidden directories (e.g. ``.venv/``,
87	``.muse/objects/``, ``node_modules/``) are never descended into. On a
88	Python project with a virtualenv, ``rglob`` would visit tens of thousands
89	of site-package files before the hidden-dir filter could discard them.
90	"""
91	cache = load_cache(workdir)
92	manifest: dict[str, str] = {}
93	root_str = str(workdir)
94	prefix_len = len(root_str) + 1 # +1 for the path separator
95
96	for dirpath, dirnames, filenames in os.walk(root_str, followlinks=False):
97	# Prune hidden subdirectories in-place before os.walk descends into them.
98	dirnames[:] = sorted(d for d in dirnames if not d.startswith("."))
99
100	for fname in sorted(filenames):
101	if fname.startswith("."):
102	continue
103	abs_str = os.path.join(dirpath, fname)
104	try:
105	st = os.lstat(abs_str)
106	except OSError:
107	continue
108	# os.lstat lets us check for symlinks and regular files with one
109	# syscall, replacing the separate is_symlink() + is_file() pair.
110	if not _stat.S_ISREG(st.st_mode):
111	continue
112	rel = abs_str[prefix_len:]
113	if os.sep != "/":
114	rel = rel.replace(os.sep, "/")
115	manifest[rel] = cache.get_cached(rel, abs_str, st.st_mtime, st.st_size)
116
117	cache.prune(set(manifest))
118	cache.save()
119	return manifest
120
121
122	def compute_snapshot_id(manifest: dict[str, str]) -> str:
123	"""Return sha256 of the sorted ``path NUL object_id`` pairs.
124
125	The null-byte separator prevents collisions from filenames or object IDs
126	that contain the previous ``\|`` / ``:`` separators.
127
128	Sorting ensures two identical working trees always produce the same
129	snapshot_id, regardless of filesystem traversal order.
130	"""
131	parts = sorted(f"{path}{_SEP}{oid}" for path, oid in manifest.items())
132	payload = _SEP.join(parts).encode()
133	return hashlib.sha256(payload).hexdigest()
134
135
136	def diff_workdir_vs_snapshot(
137	workdir: pathlib.Path,
138	last_manifest: dict[str, str],
139	) -> tuple[set[str], set[str], set[str], set[str]]:
140	"""Compare workdir against last_manifest from the previous commit.
141
142	Returns a tuple of four disjoint path sets:
143
144	- ``added`` — files in workdir absent from last_manifest
145	(new files since the last commit).
146	- ``modified`` — files present in both but with a differing sha256 hash.
147	- ``deleted`` — files in last_manifest absent from workdir.
148	- ``untracked`` — non-empty only when last_manifest is empty (i.e. the
149	branch has no commits yet): every file in workdir is
150	treated as untracked rather than as newly-added.
151
152	All paths use POSIX separators for cross-platform reproducibility.
153	"""
154	if not workdir.exists():
155	# Nothing on disk — every previously committed path is deleted.
156	return set(), set(), set(last_manifest.keys()), set()
157
158	current_manifest = walk_workdir(workdir)
159	current_paths = set(current_manifest.keys())
160	last_paths = set(last_manifest.keys())
161
162	if not last_paths:
163	# No prior snapshot — all working-tree files are untracked.
164	return set(), set(), set(), current_paths
165
166	added = current_paths - last_paths
167	deleted = last_paths - current_paths
168	common = current_paths & last_paths
169	modified = {p for p in common if current_manifest[p] != last_manifest[p]}
170	return added, modified, deleted, set()
171
172
173	def compute_commit_id(
174	parent_ids: list[str],
175	snapshot_id: str,
176	message: str,
177	committed_at_iso: str,
178	) -> str:
179	"""Return sha256 of the commit's canonical inputs.
180
181	Uses null bytes as field separators to prevent separator-injection
182	attacks from commit messages, author names, or branch names containing
183	``\|`` characters.
184
185	Given the same arguments on two machines the result is identical.
186	``parent_ids`` is sorted before hashing so insertion order does not
187	affect determinism.
188	"""
189	parts = [
190	_SEP.join(sorted(parent_ids)),
191	snapshot_id,
192	message,
193	committed_at_iso,
194	]
195	payload = _SEP.join(parts).encode()
196	return hashlib.sha256(payload).hexdigest()
197
198
199	def compute_commit_tree_id(
200	parent_ids: list[str],
201	snapshot_id: str,
202	message: str,
203	author: str,
204	) -> str:
205	"""Return a deterministic commit ID for a raw plumbing commit (no timestamp).
206
207	Unlike ``compute_commit_id``, this function omits ``committed_at`` so that
208	the same (parent_ids, snapshot_id, message, author) tuple always produces
209	the same commit_id. This guarantees idempotency for ``muse commit-tree``:
210	re-running with identical inputs returns the same ID without inserting a
211	duplicate row.
212
213	Args:
214	parent_ids: Zero or more parent commit IDs. Sorted before hashing.
215	snapshot_id: The sha256 ID of the snapshot this commit points to.
216	message: The commit message.
217	author: The author name string.
218
219	Returns:
220	A 64-character lowercase hex SHA-256 digest.
221	"""
222	parts = [
223	_SEP.join(sorted(parent_ids)),
224	snapshot_id,
225	message,
226	author,
227	]
228	payload = _SEP.join(parts).encode()
229	return hashlib.sha256(payload).hexdigest()

Content Address

Object ID (SHA-256)

e8fd427a57ce1ef2572cef7ec3b86a382dc8e6c8cf0ac6ef3b1a93ddc6de2fb4

This file is immutable and content-addressed. The same SHA always refers to the same bytes, across every clone and every time.

File Info

Path muse/core/snapshot.py

Lines 229

Size 8.3 KB

Language python

Ref f288fedc

Snapshot b33f1b88c658…

Last Modified

f288fedc

perf+fix: muse status — eliminate rglob descent, stat cache, single repo.json read

Gabriel Cardona <gabriel@tellurstori.com> 2d ago

View commit →

Links

Browse tree at f288fedc All commits View raw