cgcardona / muse public
test_core_stat_cache.py python
400 lines 14.5 KB
71e26490 feat: stat cache — 86x faster snapshot via os.walk + persistent hash cache 6h ago
1 """Tests for muse.core.stat_cache.
2
3 Coverage
4 --------
5 - Cache hit: file with unchanged (mtime, size) returns stored hash without I/O.
6 - Cache miss: new or modified file is re-hashed and entry is updated.
7 - Stale-entry pruning: entries for deleted files are removed.
8 - Dimension hash round-trip: set_dimension / get_dimension.
9 - Dimension eviction on object-hash miss: dimensions reset when file changes.
10 - Persistence: save() / load() round-trip via .muse/stat_cache.json.
11 - Atomic write: temp file is cleaned up; no corruption on concurrent use.
12 - empty(): cache has no muse_dir, so save() is a no-op.
13 - load_cache() convenience helper.
14 - walk_workdir() integration: cache is used and persisted automatically.
15 """
16
17 from __future__ import annotations
18
19 import json
20 import pathlib
21 import time
22
23 import pytest
24
25 from muse.core.stat_cache import FileCacheEntry, StatCache, _hash_bytes, load_cache
26 from muse.core.snapshot import walk_workdir
27
28
29 # ---------------------------------------------------------------------------
30 # Helpers
31 # ---------------------------------------------------------------------------
32
33
def _make_muse_dir(tmp_path: pathlib.Path) -> pathlib.Path:
    """Create the ``.muse`` directory under *tmp_path* and return its path.

    The call is idempotent: an already-existing ``.muse`` directory or a
    missing parent is tolerated, matching how ``_write`` creates the parent
    directories of the files it writes.
    """
    muse_dir = tmp_path / ".muse"
    muse_dir.mkdir(parents=True, exist_ok=True)
    return muse_dir
38
39
def _write(path: pathlib.Path, content: str = "hello") -> pathlib.Path:
    """Write *content* to *path* as UTF-8, creating parent directories first.

    Returns *path* so the call can be used inline as an expression.
    """
    target_dir = path.parent
    target_dir.mkdir(exist_ok=True, parents=True)
    path.write_text(content, encoding="utf-8")
    return path
44
45
46 # ---------------------------------------------------------------------------
47 # _hash_bytes — canonical hash function
48 # ---------------------------------------------------------------------------
49
50
class TestHashBytes:
    """``_hash_bytes`` must agree with a plain ``hashlib.sha256`` hex digest."""

    def test_matches_hashlib(self, tmp_path: pathlib.Path) -> None:
        """A small text file hashes to the SHA-256 of its encoded bytes."""
        from hashlib import sha256

        target = _write(tmp_path / "f.txt", "muse")
        assert _hash_bytes(target) == sha256(b"muse").hexdigest()

    def test_empty_file(self, tmp_path: pathlib.Path) -> None:
        """A zero-byte file hashes to the SHA-256 of the empty byte string."""
        from hashlib import sha256

        target = tmp_path / "empty.txt"
        target.write_bytes(b"")
        assert _hash_bytes(target) == sha256(b"").hexdigest()

    def test_large_file_chunked(self, tmp_path: pathlib.Path) -> None:
        """A 200 KiB file hashes the same as hashing its bytes in one shot."""
        from hashlib import sha256

        # 200 KiB — meant to span several read chunks (64 KiB each per the
        # original note; confirm against _hash_bytes).
        payload = b"x" * (200 * 1024)
        target = tmp_path / "big.bin"
        target.write_bytes(payload)
        assert _hash_bytes(target) == sha256(payload).hexdigest()
73
74
75 # ---------------------------------------------------------------------------
76 # StatCache — construction
77 # ---------------------------------------------------------------------------
78
79
class TestStatCacheConstruction:
    """Building a ``StatCache`` via ``load``, ``empty`` and ``load_cache``."""

    def test_load_missing_file_returns_empty(self, tmp_path: pathlib.Path) -> None:
        """With no cache file on disk, the loaded cache has no entries."""
        loaded = StatCache.load(_make_muse_dir(tmp_path))
        assert loaded._entries == {}

    def test_load_corrupt_json_returns_empty(self, tmp_path: pathlib.Path) -> None:
        """Unparseable cache content loads as an empty cache."""
        muse_dir = _make_muse_dir(tmp_path)
        cache_file = muse_dir / "stat_cache.json"
        cache_file.write_text("not json", encoding="utf-8")
        assert StatCache.load(muse_dir)._entries == {}

    def test_load_wrong_version_returns_empty(self, tmp_path: pathlib.Path) -> None:
        """A cache file declaring version 99 loads as an empty cache."""
        muse_dir = _make_muse_dir(tmp_path)
        payload = '{"version": 99, "entries": {}}'
        (muse_dir / "stat_cache.json").write_text(payload, encoding="utf-8")
        assert StatCache.load(muse_dir)._entries == {}

    def test_empty_has_no_muse_dir(self, tmp_path: pathlib.Path) -> None:
        """``StatCache.empty()`` has neither a muse dir nor any entries."""
        blank = StatCache.empty()
        assert blank._muse_dir is None
        assert blank._entries == {}

    def test_load_cache_helper_with_muse_dir(self, tmp_path: pathlib.Path) -> None:
        """``load_cache`` binds the cache to an existing ``.muse`` directory."""
        _make_muse_dir(tmp_path)
        loaded = load_cache(tmp_path)
        assert isinstance(loaded, StatCache)
        assert loaded._muse_dir == tmp_path / ".muse"

    def test_load_cache_helper_without_muse_dir(self, tmp_path: pathlib.Path) -> None:
        """Without a ``.muse`` directory, ``load_cache`` has no muse dir."""
        assert load_cache(tmp_path)._muse_dir is None
114
115
116 # ---------------------------------------------------------------------------
117 # StatCache — get_object_hash (hit / miss)
118 # ---------------------------------------------------------------------------
119
120
class TestGetObjectHash:
    """Hit / miss behaviour of ``StatCache.get_object_hash``."""

    @staticmethod
    def _rewrite_with_new_mtime(path: pathlib.Path, content: str) -> None:
        """Overwrite *path* with *content* and force a different mtime.

        Sleeping a few milliseconds does not guarantee a new mtime on
        filesystems with coarse timestamp granularity, so the mtime is set
        explicitly to ten seconds past the previous value instead.
        """
        import os

        bump_ns = 10 * 1_000_000_000
        before = path.stat()
        path.write_text(content, encoding="utf-8")
        os.utime(path, ns=(before.st_atime_ns, before.st_mtime_ns + bump_ns))

    def test_first_call_is_cache_miss(self, tmp_path: pathlib.Path) -> None:
        """The first lookup hashes the file, records it and marks the cache dirty."""
        muse_dir = _make_muse_dir(tmp_path)
        f = _write(tmp_path / "a.py", "x = 1")
        cache = StatCache.load(muse_dir)

        h = cache.get_object_hash(tmp_path, f)

        assert h == _hash_bytes(f)
        assert cache._dirty is True
        assert "a.py" in cache._entries

    def test_second_call_is_cache_hit_no_dirty(self, tmp_path: pathlib.Path) -> None:
        """A repeated lookup of an unchanged file leaves the dirty flag clear."""
        muse_dir = _make_muse_dir(tmp_path)
        f = _write(tmp_path / "a.py", "x = 1")
        cache = StatCache.load(muse_dir)
        cache.get_object_hash(tmp_path, f)
        cache._dirty = False  # reset after first miss

        h2 = cache.get_object_hash(tmp_path, f)

        assert h2 == _hash_bytes(f)
        assert cache._dirty is False  # no re-hash, no dirty flag

    def test_modified_file_triggers_miss(self, tmp_path: pathlib.Path) -> None:
        """Changed content (same size, new mtime) yields the new hash."""
        muse_dir = _make_muse_dir(tmp_path)
        f = _write(tmp_path / "a.py", "x = 1")
        cache = StatCache.load(muse_dir)
        h1 = cache.get_object_hash(tmp_path, f)

        # Same byte length as before, so only the mtime distinguishes the
        # two versions; set it explicitly rather than relying on a sleep.
        self._rewrite_with_new_mtime(f, "x = 2")
        h2 = cache.get_object_hash(tmp_path, f)

        assert h1 != h2
        assert h2 == _hash_bytes(f)

    def test_same_content_new_mtime_triggers_miss_but_same_hash(
        self, tmp_path: pathlib.Path
    ) -> None:
        """Rewriting identical content re-hashes the file but keeps the hash."""
        muse_dir = _make_muse_dir(tmp_path)
        f = _write(tmp_path / "a.py", "identical")
        cache = StatCache.load(muse_dir)
        h1 = cache.get_object_hash(tmp_path, f)
        cache._dirty = False  # so the flag reflects only the second lookup

        self._rewrite_with_new_mtime(f, "identical")
        h2 = cache.get_object_hash(tmp_path, f)

        # Equal hashes alone would also hold on a cache hit; the dirty flag
        # shows the entry was actually refreshed because the mtime changed.
        assert h1 == h2
        assert cache._dirty is True
173
174
175 # ---------------------------------------------------------------------------
176 # StatCache — dimension hashes
177 # ---------------------------------------------------------------------------
178
179
class TestDimensionHashes:
    """Per-file dimension hashes stored via ``set_dimension`` / ``get_dimension``."""

    def test_set_and_get_dimension(self, tmp_path: pathlib.Path) -> None:
        """A dimension hash set on a cached file can be read back."""
        muse_dir = _make_muse_dir(tmp_path)
        f = _write(tmp_path / "src.py")
        cache = StatCache.load(muse_dir)
        cache.get_object_hash(tmp_path, f)  # ensure entry exists

        cache.set_dimension(tmp_path, f, "symbols", "abc123")

        assert cache.get_dimension(tmp_path, f, "symbols") == "abc123"

    def test_get_dimension_missing_key_returns_none(self, tmp_path: pathlib.Path) -> None:
        """An unknown dimension name on a cached file reads as ``None``."""
        muse_dir = _make_muse_dir(tmp_path)
        f = _write(tmp_path / "src.py")
        cache = StatCache.load(muse_dir)
        cache.get_object_hash(tmp_path, f)

        assert cache.get_dimension(tmp_path, f, "nonexistent") is None

    def test_get_dimension_missing_entry_returns_none(self, tmp_path: pathlib.Path) -> None:
        """A file that was never hashed has no dimensions."""
        muse_dir = _make_muse_dir(tmp_path)
        f = _write(tmp_path / "src.py")
        cache = StatCache.load(muse_dir)
        # Never called get_object_hash, so no entry exists.
        assert cache.get_dimension(tmp_path, f, "symbols") is None

    def test_dimension_evicted_on_object_hash_miss(self, tmp_path: pathlib.Path) -> None:
        """When a file changes, its dimension hashes must be cleared."""
        import os

        muse_dir = _make_muse_dir(tmp_path)
        f = _write(tmp_path / "src.py", "v1")
        cache = StatCache.load(muse_dir)
        cache.get_object_hash(tmp_path, f)
        cache.set_dimension(tmp_path, f, "symbols", "stale-hash")

        # "v1" -> "v2" keeps the size identical, so the mtime must differ for
        # the change to be seen. Set it explicitly (ten seconds later) instead
        # of sleeping, which is unreliable on coarse-granularity filesystems.
        before = f.stat()
        f.write_text("v2", encoding="utf-8")
        os.utime(f, ns=(before.st_atime_ns, before.st_mtime_ns + 10 * 1_000_000_000))
        cache.get_object_hash(tmp_path, f)  # triggers miss → evicts dimensions

        assert cache.get_dimension(tmp_path, f, "symbols") is None

    def test_multiple_dimensions(self, tmp_path: pathlib.Path) -> None:
        """Several dimensions on one file are stored independently."""
        muse_dir = _make_muse_dir(tmp_path)
        f = _write(tmp_path / "src.py")
        cache = StatCache.load(muse_dir)
        cache.get_object_hash(tmp_path, f)
        cache.set_dimension(tmp_path, f, "symbols", "sym-hash")
        cache.set_dimension(tmp_path, f, "imports", "imp-hash")

        assert cache.get_dimension(tmp_path, f, "symbols") == "sym-hash"
        assert cache.get_dimension(tmp_path, f, "imports") == "imp-hash"

    def test_set_dimension_noop_for_unknown_file(self, tmp_path: pathlib.Path) -> None:
        """set_dimension on a file with no entry must not crash."""
        muse_dir = _make_muse_dir(tmp_path)
        f = _write(tmp_path / "ghost.py")
        cache = StatCache.load(muse_dir)
        # No get_object_hash call → no entry.
        cache.set_dimension(tmp_path, f, "symbols", "x")  # must not raise
238
239
240 # ---------------------------------------------------------------------------
241 # StatCache — prune
242 # ---------------------------------------------------------------------------
243
244
class TestPrune:
    """``StatCache.prune`` drops entries whose keys are not in the known set."""

    def test_prune_removes_stale_entries(self, tmp_path: pathlib.Path) -> None:
        """Only the entry named in the known set survives pruning."""
        muse_dir = _make_muse_dir(tmp_path)
        tracked = [_write(tmp_path / name) for name in ("keep.py", "gone.py")]
        cache = StatCache.load(muse_dir)
        for source in tracked:
            cache.get_object_hash(tmp_path, source)

        cache.prune({"keep.py"})

        remaining = cache._entries
        assert "keep.py" in remaining
        assert "gone.py" not in remaining

    def test_prune_noop_when_all_present(self, tmp_path: pathlib.Path) -> None:
        """Pruning with every entry still known leaves the dirty flag clear."""
        muse_dir = _make_muse_dir(tmp_path)
        source = _write(tmp_path / "a.py")
        cache = StatCache.load(muse_dir)
        cache.get_object_hash(tmp_path, source)
        cache._dirty = False  # ignore the dirtiness caused by the first hash

        cache.prune({"a.py"})

        assert cache._dirty is False

    def test_prune_empty_known_set_clears_all(self, tmp_path: pathlib.Path) -> None:
        """An empty known set removes every entry."""
        muse_dir = _make_muse_dir(tmp_path)
        source = _write(tmp_path / "a.py")
        cache = StatCache.load(muse_dir)
        cache.get_object_hash(tmp_path, source)

        nothing_known: set[str] = set()
        cache.prune(nothing_known)

        assert cache._entries == {}
279
280
281 # ---------------------------------------------------------------------------
282 # StatCache — persistence (save / load round-trip)
283 # ---------------------------------------------------------------------------
284
285
class TestPersistence:
    """``save()`` / ``load()`` round-trips through ``.muse/stat_cache.json``."""

    def test_save_and_reload(self, tmp_path: pathlib.Path) -> None:
        """A saved entry is served by a freshly loaded cache without dirtying it."""
        muse_dir = _make_muse_dir(tmp_path)
        source = _write(tmp_path / "mod.py", "print('hi')")
        writer = StatCache.load(muse_dir)
        stored_hash = writer.get_object_hash(tmp_path, source)
        writer.save()

        assert (muse_dir / "stat_cache.json").is_file()

        reader = StatCache.load(muse_dir)
        reader._dirty = False
        reloaded_hash = reader.get_object_hash(tmp_path, source)

        assert reloaded_hash == stored_hash
        assert reader._dirty is False  # served from cache, no re-hash

    def test_save_is_atomic_no_tmp_left(self, tmp_path: pathlib.Path) -> None:
        """No ``stat_cache.json.tmp`` file remains after ``save()``."""
        muse_dir = _make_muse_dir(tmp_path)
        source = _write(tmp_path / "x.py")
        cache = StatCache.load(muse_dir)
        cache.get_object_hash(tmp_path, source)
        cache.save()

        leftover = muse_dir / "stat_cache.json.tmp"
        assert not leftover.exists()

    def test_save_noop_when_not_dirty(self, tmp_path: pathlib.Path) -> None:
        """Saving an untouched cache writes no file."""
        muse_dir = _make_muse_dir(tmp_path)
        StatCache.load(muse_dir).save()  # nothing written
        assert not (muse_dir / "stat_cache.json").exists()

    def test_empty_cache_save_is_noop(self) -> None:
        """``save()`` on ``StatCache.empty()`` must not raise."""
        StatCache.empty().save()

    def test_dimensions_persisted(self, tmp_path: pathlib.Path) -> None:
        """Dimension hashes survive a save / load round-trip."""
        muse_dir = _make_muse_dir(tmp_path)
        source = _write(tmp_path / "s.py")
        writer = StatCache.load(muse_dir)
        writer.get_object_hash(tmp_path, source)
        writer.set_dimension(tmp_path, source, "symbols", "sym42")
        writer.save()

        reader = StatCache.load(muse_dir)
        # The file is untouched since the save, so its mtime/size still match
        # the stored entry and the dimension is readable straight away.
        assert reader.get_dimension(tmp_path, source, "symbols") == "sym42"

    def test_json_format_is_versioned(self, tmp_path: pathlib.Path) -> None:
        """The on-disk JSON carries ``version`` 1 and an ``entries`` mapping."""
        muse_dir = _make_muse_dir(tmp_path)
        source = _write(tmp_path / "v.py")
        cache = StatCache.load(muse_dir)
        cache.get_object_hash(tmp_path, source)
        cache.save()

        with (muse_dir / "stat_cache.json").open(encoding="utf-8") as handle:
            document = json.load(handle)
        assert document["version"] == 1
        assert "v.py" in document["entries"]
344
345
346 # ---------------------------------------------------------------------------
347 # walk_workdir integration
348 # ---------------------------------------------------------------------------
349
350
class TestWalkWorkdirCacheIntegration:
    """``walk_workdir`` used together with the on-disk stat cache."""

    def test_walk_creates_cache_file(self, tmp_path: pathlib.Path) -> None:
        """Walking a workdir that has ``.muse`` writes ``stat_cache.json``."""
        muse_dir = _make_muse_dir(tmp_path)
        for name, body in (("a.py", "x = 1"), ("b.py", "y = 2")):
            _write(tmp_path / name, body)

        walk_workdir(tmp_path)

        assert (muse_dir / "stat_cache.json").is_file()

    def test_walk_second_call_uses_cache(self, tmp_path: pathlib.Path) -> None:
        """After one walk, a freshly loaded cache serves the file without going dirty."""
        muse_dir = _make_muse_dir(tmp_path)
        _write(tmp_path / "a.py", "x = 1")

        walk_workdir(tmp_path)  # cold — populates cache

        reloaded = StatCache.load(muse_dir)
        reloaded._dirty = False
        reloaded.get_object_hash(tmp_path, tmp_path / "a.py")
        # mtime/size are unchanged since the walk, so the dirty flag stays clear.
        assert reloaded._dirty is False

    def test_walk_excludes_hidden_paths_from_cache(self, tmp_path: pathlib.Path) -> None:
        """Dot-prefixed files are absent from the manifest returned by the walk."""
        _make_muse_dir(tmp_path)
        for name in ("visible.py", ".hidden.py"):
            _write(tmp_path / name)

        manifest = walk_workdir(tmp_path)

        assert "visible.py" in manifest
        assert ".hidden.py" not in manifest

    def test_walk_without_muse_dir_still_works(self, tmp_path: pathlib.Path) -> None:
        """walk_workdir must work correctly even with no .muse directory."""
        _write(tmp_path / "a.py", "ok")
        assert "a.py" in walk_workdir(tmp_path)

    def test_walk_hashes_match_direct_hash(self, tmp_path: pathlib.Path) -> None:
        """The manifest hash for a file equals ``_hash_bytes`` of that file."""
        _make_muse_dir(tmp_path)
        source = _write(tmp_path / "c.py", "content")

        manifest = walk_workdir(tmp_path)

        assert manifest["c.py"] == _hash_bytes(source)