"""Stat-based file hash cache — fast snapshot computation for all domains.

Architecture
------------
Every ``plugin.snapshot()`` call must hash every tracked file to detect
changes. On a repository with hundreds of files this is the dominant cost of
``muse status``, ``muse diff``, and any command that calls ``snapshot()``.

``StatCache`` eliminates redundant I/O by persisting two classes of hash per
file between invocations:

Object hash
    SHA-256 of raw bytes. Used by the content-addressed object store.
    Recomputed only when ``(mtime, size)`` changes.

Dimension hashes
    Domain-specific semantic hashes. For the code domain these might be the
    SHA-256 of the AST symbol set, the import set, and so on. For the MIDI
    domain they might be the hash of parsed note events, tempo map, and
    harmony analysis. Populated by domain plugins after parsing; consumed by
    ``diff()`` and ``merge()`` to skip re-parsing unchanged files entirely.

    An empty ``dimensions`` dict means no semantic hashes are cached yet —
    this is the baseline state and is always safe.

Cache validity
--------------
A cache entry is valid when the file's current ``st_mtime`` and ``st_size``
exactly match the stored values — the same contract Git's index uses. The
cache is **self-healing**: writing a file (e.g. ``muse checkout``) always
updates ``mtime``, causing a cache miss on the next scan.

Known corner-case ("racy Muse"): a file modified within the same filesystem
timestamp quantum *and* with identical size could be served a stale cache
entry. This is identical to "racy git" and is not defended against here.

Storage
-------
``.muse/stat_cache.json`` — a versioned JSON document::

    {
      "version": 1,
      "entries": {
        "muse/core/snapshot.py": {
          "mtime": 1710000000.123456,
          "size": 4321,
          "object_hash": "<sha256-of-raw-bytes>",
          "dimensions": {
            "symbols": "<sha256-of-ast-symbol-set>",
            "imports": "<sha256-of-import-set>"
          }
        }
      }
    }

Writes are atomic: data is flushed to a ``.tmp`` sibling then renamed over
the target, so a crash mid-write never corrupts the cache.
"""
| 59 | |
| 60 | from __future__ import annotations |
| 61 | |
| 62 | import hashlib |
| 63 | import json |
| 64 | import logging |
| 65 | import pathlib |
| 66 | from typing import TypedDict |
| 67 | |
| 68 | logger = logging.getLogger(__name__) |
| 69 | |
| 70 | _CACHE_VERSION = 1 |
| 71 | _CACHE_FILENAME = "stat_cache.json" |
| 72 | _CHUNK = 65_536 |
| 73 | |
| 74 | |
class FileCacheEntry(TypedDict):
    """Persisted metadata for a single workspace file."""

    # Stat values captured when the hashes were computed; the entry is only
    # valid while the file's current (st_mtime, st_size) match both exactly.
    mtime: float
    size: int
    # SHA-256 hex digest of the file's raw bytes.
    object_hash: str
    # Semantic hashes written by domain plugins after parsing, keyed by
    # dimension name ("symbols", "imports", "notes", ...). Empty dict means
    # no dimension hashes are cached yet; lookups then simply return None.
    dimensions: dict[str, str]
| 85 | |
| 86 | |
| 87 | class _CacheDoc(TypedDict): |
| 88 | """On-disk JSON document shape.""" |
| 89 | |
| 90 | version: int |
| 91 | entries: dict[str, FileCacheEntry] |
| 92 | |
| 93 | |
def _hash_bytes(path: pathlib.Path) -> str:
    """Return the SHA-256 hex digest of *path*'s raw bytes.

    Thin ``pathlib.Path`` adapter over :func:`_hash_str`, which performs the
    actual chunked read (constant memory regardless of file size). Keeping a
    single hashing implementation means no duplicated ``_hash_file`` helpers
    across the cache and the domain plugins.
    """
    path_str = str(path)
    return _hash_str(path_str)
| 102 | |
| 103 | |
| 104 | def _hash_str(path_str: str) -> str: |
| 105 | """String-path variant of ``_hash_bytes`` — avoids constructing a Path object. |
| 106 | |
| 107 | Used in the hot inner loop of ``walk_workdir`` and plugin snapshot methods |
| 108 | where the file path is already a plain string from ``os.walk``. |
| 109 | """ |
| 110 | h = hashlib.sha256() |
| 111 | with open(path_str, "rb") as fh: |
| 112 | for chunk in iter(lambda: fh.read(_CHUNK), b""): |
| 113 | h.update(chunk) |
| 114 | return h.hexdigest() |
| 115 | |
| 116 | |
class StatCache:
    """Shared stat-based hash cache for all domain plugin ``snapshot()`` calls.

    Typical lifecycle inside a plugin's ``snapshot()``::

        cache = StatCache.load(root / ".muse")
        for file_path in walk(...):
            files[rel] = cache.get_object_hash(root, file_path)
        cache.prune(set(files))
        cache.save()

    The same instance can be passed to ``diff()`` or ``merge()`` logic to
    retrieve already-computed dimension hashes without re-parsing files.
    """

    def __init__(
        self, muse_dir: pathlib.Path | None, entries: dict[str, FileCacheEntry]
    ) -> None:
        # ``muse_dir is None`` marks a no-op cache (see ``empty()``);
        # ``save()`` silently skips the disk write in that case.
        self._muse_dir = muse_dir
        self._entries = entries
        # Set by every mutating method; ``save()`` is a no-op while False.
        self._dirty = False

    # ------------------------------------------------------------------
    # Construction
    # ------------------------------------------------------------------

    @classmethod
    def load(cls, muse_dir: pathlib.Path) -> StatCache:
        """Load the cache from *muse_dir*/stat_cache.json.

        Validates the version field and every entry's field types on load so
        a corrupt or future-format file never poisons the cache. Returns a
        fresh empty cache if the file is absent, unreadable, or version
        mismatches — never raises.

        Parsing is done inline (not via a typed helper) so that isinstance
        checks narrow from ``Any`` — the type returned by ``json.loads`` —
        giving mypy accurate control-flow narrowing without unreachable-branch
        false positives.
        """
        cache_file = muse_dir / _CACHE_FILENAME
        if cache_file.is_file():
            try:
                raw = json.loads(cache_file.read_text(encoding="utf-8"))
                if not (isinstance(raw, dict) and raw.get("version") == _CACHE_VERSION):
                    return cls(muse_dir, {})
                raw_entries = raw.get("entries")
                if not isinstance(raw_entries, dict):
                    return cls(muse_dir, {})
                entries: dict[str, FileCacheEntry] = {}
                for rel, ev in raw_entries.items():
                    if not isinstance(rel, str) or not isinstance(ev, dict):
                        continue
                    mtime = ev.get("mtime")
                    size = ev.get("size")
                    obj_hash = ev.get("object_hash")
                    dims = ev.get("dimensions")
                    if not (
                        isinstance(mtime, (int, float))
                        and isinstance(size, int)
                        and isinstance(obj_hash, str)
                        and isinstance(dims, dict)
                    ):
                        continue
                    entries[rel] = FileCacheEntry(
                        mtime=float(mtime),
                        size=size,
                        object_hash=obj_hash,
                        # Coerce dimension keys/values to str — guards against
                        # a cache written by a future version with non-str values.
                        dimensions={str(k): str(v) for k, v in dims.items()},
                    )
                return cls(muse_dir, entries)
            # OSError: the file can vanish or lose read permission between the
            # is_file() check above and read_text() — the "never raises"
            # contract must survive that race. ValueError also covers
            # json.JSONDecodeError and UnicodeDecodeError (both subclasses),
            # so a binary or mis-encoded cache file is handled too.
            except (OSError, ValueError, KeyError, TypeError):
                logger.debug("⚠️ stat_cache.json unreadable — starting fresh")
        return cls(muse_dir, {})

    @classmethod
    def empty(cls) -> StatCache:
        """Return a no-op cache for contexts without a ``.muse`` directory."""
        return cls(None, {})

    # ------------------------------------------------------------------
    # Object hash — raw-bytes SHA-256
    # ------------------------------------------------------------------

    def get_cached(
        self, rel: str, abs_path_str: str, mtime: float, size: int
    ) -> str:
        """Fast inner-loop hash lookup with pre-computed stat values.

        Callers that already have ``(mtime, size)`` from an ``os.stat`` or
        ``os.walk`` call should use this method to avoid a redundant
        ``stat()`` syscall inside :meth:`get_object_hash`.

        Args:
            rel: Workspace-relative POSIX path (cache key).
            abs_path_str: Absolute file path as a plain string — avoids
                constructing a ``pathlib.Path`` in the hot loop.
            mtime: ``st_mtime`` from the caller's stat result.
            size: ``st_size`` from the caller's stat result.

        Returns:
            64-character lowercase hex SHA-256 digest.
        """
        entry = self._entries.get(rel)
        if entry is not None and entry["mtime"] == mtime and entry["size"] == size:
            return entry["object_hash"]

        # Cache miss: re-hash and replace the entry. Dimension hashes are
        # intentionally dropped — they described the old content.
        obj_hash = _hash_str(abs_path_str)
        self._entries[rel] = FileCacheEntry(
            mtime=mtime,
            size=size,
            object_hash=obj_hash,
            dimensions={},
        )
        self._dirty = True
        return obj_hash

    def get_object_hash(self, root: pathlib.Path, file_path: pathlib.Path) -> str:
        """Return the SHA-256 of *file_path*, using the cache when valid.

        Convenience wrapper around :meth:`get_cached` for callers that work
        with ``pathlib.Path`` objects. The hot inner loops of ``walk_workdir``
        and plugin snapshot methods call :meth:`get_cached` directly to skip
        pathlib overhead.

        Args:
            root: Repository root — used to compute the workspace-relative
                POSIX key.
            file_path: Absolute path to the file.

        Returns:
            64-character lowercase hex SHA-256 digest.
        """
        rel = file_path.relative_to(root).as_posix()
        st = file_path.stat()
        return self.get_cached(rel, str(file_path), st.st_mtime, st.st_size)

    # ------------------------------------------------------------------
    # Dimension hashes — domain-specific semantic hashes
    # ------------------------------------------------------------------

    def get_dimension(
        self,
        root: pathlib.Path,
        file_path: pathlib.Path,
        dimension: str,
    ) -> str | None:
        """Return a cached dimension hash, or ``None`` if not yet computed.

        Callers must verify that the entry is still valid by checking that
        the object hash hasn't changed (i.e. call ``get_object_hash`` first
        to ensure the entry is fresh).

        Args:
            root: Repository root.
            file_path: Absolute path to the file.
            dimension: Dimension name, e.g. ``"symbols"`` or ``"notes"``.

        Returns:
            Cached hash string, or ``None`` if absent.
        """
        rel = file_path.relative_to(root).as_posix()
        entry = self._entries.get(rel)
        if entry is None:
            return None
        return entry["dimensions"].get(dimension)

    def set_dimension(
        self,
        root: pathlib.Path,
        file_path: pathlib.Path,
        dimension: str,
        hash_value: str,
    ) -> None:
        """Store a semantic hash for a specific dimension of *file_path*.

        Should be called by domain plugins after parsing a file whose object
        hash triggered a cache miss. Silently ignored if the file has no
        entry (which should not happen in normal operation).

        Args:
            root: Repository root.
            file_path: Absolute path to the file.
            dimension: Dimension name, e.g. ``"symbols"``.
            hash_value: Hash string to store.
        """
        rel = file_path.relative_to(root).as_posix()
        entry = self._entries.get(rel)
        if entry is None:
            return
        entry["dimensions"][dimension] = hash_value
        self._dirty = True

    # ------------------------------------------------------------------
    # Lifecycle helpers
    # ------------------------------------------------------------------

    def prune(self, known_paths: set[str]) -> None:
        """Remove entries for paths no longer present in the working tree.

        Call this after a full directory walk, passing the set of
        workspace-relative POSIX paths that were found. Keeps the cache
        lean by evicting stale entries for deleted files.

        Args:
            known_paths: Set of rel-posix paths observed during the walk.
        """
        stale = set(self._entries) - known_paths
        if stale:
            for k in stale:
                del self._entries[k]
            self._dirty = True

    def save(self) -> None:
        """Atomically persist the cache to disk if it has changed.

        Uses a temp-file-then-rename pattern so a crash mid-write never
        leaves a corrupt cache file. Silently skips when there is no
        ``.muse`` directory (e.g. in-memory unit tests).
        """
        if not self._dirty or self._muse_dir is None:
            return
        doc = _CacheDoc(version=_CACHE_VERSION, entries=self._entries)
        cache_file = self._muse_dir / _CACHE_FILENAME
        tmp = self._muse_dir / (_CACHE_FILENAME + ".tmp")
        tmp.write_text(
            json.dumps(doc, separators=(",", ":"), ensure_ascii=False),
            encoding="utf-8",
        )
        # Path.replace is an atomic rename on POSIX; readers see either the
        # old document or the new one, never a torn write.
        tmp.replace(cache_file)
        self._dirty = False
        logger.debug("✅ stat_cache saved (%d entries)", len(self._entries))
| 351 | |
| 352 | |
def load_cache(root: pathlib.Path) -> StatCache:
    """Convenience loader: return a ``StatCache`` for a repository root.

    Falls back to ``StatCache.empty()`` when *root* has no ``.muse``
    directory so callers never need to guard against a missing repo.

    Args:
        root: Repository root (the directory that contains ``.muse/``).

    Returns:
        A ``StatCache`` instance ready for use.
    """
    muse_dir = root / ".muse"
    return StatCache.load(muse_dir) if muse_dir.is_dir() else StatCache.empty()