cgcardona / muse public
snapshot.py python
161 lines 5.4 KB
bda49bdb feat: redesign .museignore as TOML with domain-scoped sections (#100) Gabriel Cardona <cgcardona@gmail.com> 1d ago
1 """Pure filesystem snapshot logic for ``muse commit``.
2
3 All functions here are side-effect-free (no DB, no I/O besides reading
4 files under ``workdir``). They are kept separate so they can be
5 unit-tested without a database.
6
7 ID derivation contract (deterministic, no random/UUID components):
8
9 object_id = sha256(file_bytes).hexdigest()
10 snapshot_id = sha256("|".join(sorted(f"{path}:{oid}" for path, oid in manifest.items()))).hexdigest()
11 commit_id = sha256(
12 "|".join(sorted(parent_ids))
13 + "|" + snapshot_id
14 + "|" + message
15 + "|" + committed_at_iso
16 ).hexdigest()
17 """
18
19 from __future__ import annotations
20
21 import hashlib
22 import pathlib
23
24
def hash_file(path: pathlib.Path) -> str:
    """Compute the ``object_id`` of *path*: the sha256 hex digest of its bytes.

    The file is streamed through the hasher in fixed-size blocks, so peak
    memory is bounded by the block size no matter how large the file is.
    """
    block_size = 65536
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        while True:
            block = stream.read(block_size)
            if not block:
                # An empty read signals end-of-file.
                break
            digest.update(block)
    return digest.hexdigest()
36
37
def build_snapshot_manifest(workdir: pathlib.Path) -> dict[str, str]:
    """Build the ``{rel_path: object_id}`` manifest for *workdir*.

    This is the preferred public-API name for ``walk_workdir``; it delegates
    to that function and returns its result unchanged.
    """
    manifest = walk_workdir(workdir)
    return manifest
41
42
def walk_workdir(workdir: pathlib.Path) -> dict[str, str]:
    """Walk *workdir* recursively and return ``{rel_path: object_id}``.

    Only regular files are included; directories and symlinks are skipped.
    Symlinks are rejected explicitly because ``Path.is_file()`` follows them,
    so a link to a regular file would otherwise be hashed and recorded.

    Files whose own name starts with ``.`` are excluded. Only the final path
    component is checked, so a non-dot file that lives inside a dot-directory
    is still included.

    Args:
        workdir: Root directory to scan.

    Returns:
        A dict mapping each file's path relative to *workdir* (POSIX
        separators, regardless of host OS) to the sha256 hex digest of its
        contents. Entries are inserted in sorted path order.
    """
    manifest: dict[str, str] = {}
    for file_path in sorted(workdir.rglob("*")):
        # is_file() resolves symlinks, so the symlink test must come first.
        if file_path.is_symlink() or not file_path.is_file():
            continue
        if file_path.name.startswith("."):
            continue
        rel = file_path.relative_to(workdir).as_posix()
        manifest[rel] = hash_file(file_path)
    return manifest
59
60
def compute_snapshot_id(manifest: dict[str, str]) -> str:
    """Derive the deterministic snapshot ID for *manifest*.

    Each ``path -> object_id`` pair is rendered as ``"path:object_id"``. The
    rendered entries are sorted, joined with ``"|"`` and sha256-hashed, so the
    dict's insertion order has no influence on the resulting ID.
    """
    entries = [f"{rel_path}:{object_id}" for rel_path, object_id in manifest.items()]
    # The sort is over the rendered entries themselves, not the bare paths.
    entries.sort()
    hasher = hashlib.sha256()
    hasher.update("|".join(entries).encode())
    return hasher.hexdigest()
70
71
def diff_workdir_vs_snapshot(
    workdir: pathlib.Path,
    last_manifest: dict[str, str],
) -> tuple[set[str], set[str], set[str], set[str]]:
    """Classify how *workdir* differs from the previous commit's manifest.

    The result is ``(added, modified, deleted, untracked)``, four disjoint
    sets of POSIX-style relative paths:

    - ``added``     — on disk but not in *last_manifest*.
    - ``modified``  — in both, but the sha256 hashes differ.
    - ``deleted``   — in *last_manifest* but no longer on disk.
    - ``untracked`` — populated only when *last_manifest* is empty (the
                      branch has no commits yet); every file on disk is then
                      reported here instead of as ``added``.

    If *workdir* does not exist, every path in *last_manifest* is reported
    as deleted and the other three sets are empty.
    """
    committed = set(last_manifest)

    if not workdir.exists():
        # No working tree at all: everything previously committed is gone.
        return set(), set(), committed, set()

    on_disk = walk_workdir(workdir)
    present = set(on_disk)

    if not committed:
        # Nothing committed yet, so nothing can be added/modified/deleted.
        return set(), set(), set(), present

    modified: set[str] = set()
    for rel_path in present & committed:
        if on_disk[rel_path] != last_manifest[rel_path]:
            modified.add(rel_path)

    return present - committed, modified, committed - present, set()
107
108
def compute_commit_id(
    parent_ids: list[str],
    snapshot_id: str,
    message: str,
    committed_at_iso: str,
) -> str:
    """Derive the deterministic commit ID from the commit's canonical inputs.

    The hashed payload is the ``"|"``-joined sequence of: the sorted parent
    IDs (themselves joined with ``"|"``), the snapshot ID, the message and the
    ISO commit timestamp. Sorting the parents makes the result independent of
    the order in which they are supplied.
    """
    canonical_parents = "|".join(sorted(parent_ids))
    fields = (canonical_parents, snapshot_id, message, committed_at_iso)
    digest = hashlib.sha256("|".join(fields).encode())
    return digest.hexdigest()
129
130
def compute_commit_tree_id(
    parent_ids: list[str],
    snapshot_id: str,
    message: str,
    author: str,
) -> str:
    """Derive a timestamp-free commit ID for a raw plumbing commit.

    In contrast to ``compute_commit_id``, no ``committed_at`` value enters the
    hash. The ID therefore depends only on (parent_ids, snapshot_id, message,
    author), and repeating ``muse commit-tree`` with identical inputs yields
    the identical ID.

    Args:
        parent_ids: Zero or more parent commit IDs; sorted before hashing.
        snapshot_id: The sha256 ID of the snapshot this commit points to.
        message: The commit message.
        author: The author name string.

    Returns:
        A 64-character lowercase hex SHA-256 digest.
    """
    segments = ["|".join(sorted(parent_ids))]
    segments.extend((snapshot_id, message, author))
    hasher = hashlib.sha256()
    hasher.update("|".join(segments).encode())
    return hasher.hexdigest()