cgcardona / muse public
muse_validate.py python
615 lines 21.3 KB
d87ef453 Introduce Muse v2 architecture: domain-agnostic VCS with plugin interface Gabriel Cardona <gabriel@tellurstori.com> 4d ago
1 """muse validate — musical integrity checks for the working tree.
2
3 This module provides the core validation logic that ``muse validate`` invokes.
4 It is intentionally kept separate from the CLI layer so the checks can be
5 called from tests and future automation pipelines without spawning a subprocess.
6
7 Named result types registered in ``docs/reference/type_contracts.md``:
8 - ``ValidationSeverity``
9 - ``ValidationIssue``
10 - ``ValidationCheckResult``
11 - ``MuseValidateResult``
12
13 Exit-code contract (mirrors git-fsck conventions):
14 - 0 — all checks passed (no errors, no warnings)
15 - 1 — one or more ERROR issues found
16 - 2 — one or more WARN issues found and ``--strict`` was requested
17 """
18 from __future__ import annotations
19
20 import dataclasses
21 import enum
22 import json
23 import logging
24 import pathlib
25 import re
26 import struct
27
28 logger = logging.getLogger(__name__)
29
30 # ---------------------------------------------------------------------------
31 # Types
32 # ---------------------------------------------------------------------------
33
#: Closed vocabulary of emotion labels accepted by the ``emotion_tags`` check.
#: Membership is tested against the lower-cased tag, so entries stay lowercase.
ALLOWED_EMOTION_TAGS: frozenset[str] = frozenset(
    {
        "aggressive",
        "bright",
        "calm",
        "dark",
        "energetic",
        "happy",
        "melancholic",
        "mysterious",
        "peaceful",
        "playful",
        "relaxed",
        "romantic",
        "sad",
        "tense",
        "triumphant",
    }
)

#: Pattern a section directory name must satisfy: a lowercase letter followed
#: by lowercase letters, digits, underscores or hyphens
#: (e.g. "verse", "chorus-01", "bridge_02").
_SECTION_NAME_RE = re.compile(r"^[a-z][a-z0-9_-]*$")
56
57
class ValidationSeverity(str, enum.Enum):
    """How serious a single validation finding is.

    Members are also ``str`` instances; ``.value`` is the lowercase label
    that ends up in serialised output.
    """

    # Highest level; sets ``has_errors`` on the aggregated result.
    ERROR = "error"
    # Sets ``has_warnings`` on the aggregated result.
    WARN = "warn"
    # Purely informational.
    INFO = "info"
64
65
@dataclasses.dataclass
class ValidationIssue:
    """One finding reported by a validation check.

    ERROR findings are meant to block ``muse commit``; WARN findings are
    advisory unless ``--strict`` mode is active.
    """

    # How serious the finding is.
    severity: ValidationSeverity
    # Name of the check that produced the finding (e.g. "midi_integrity").
    check: str
    # Location the finding refers to, as a display string.
    path: str
    # Human-readable explanation.
    message: str

    def to_dict(self) -> dict[str, str]:
        """Return the issue as a plain mapping, with severity as its string value."""
        payload: dict[str, str] = {"severity": self.severity.value}
        for attr in ("check", "path", "message"):
            payload[attr] = getattr(self, attr)
        return payload
86
87
@dataclasses.dataclass
class ValidationCheckResult:
    """Result of running one named category of checks.

    By convention ``passed`` is True exactly when ``issues`` is empty; the
    class itself does not enforce this, the check functions set both fields.
    """

    # Identifier of the check category (e.g. "section_naming").
    name: str
    # Whether the check found nothing to report.
    passed: bool
    # Findings produced by the check, in the order they were detected.
    issues: list[ValidationIssue]

    def to_dict(self) -> dict[str, object]:
        """Return the result as a plain mapping with every issue serialised."""
        issue_dicts = []
        for issue in self.issues:
            issue_dicts.append(issue.to_dict())
        return dict(name=self.name, passed=self.passed, issues=issue_dicts)
105
106
@dataclasses.dataclass
class MuseValidateResult:
    """Combined outcome of every validation check run against a working tree.

    The three flags summarise ``checks``: ``has_errors`` / ``has_warnings``
    report whether any ERROR / WARN issue was found, and ``clean`` reports
    that the run produced nothing to act on.
    """

    # True when the run found no problems.
    clean: bool
    # True when at least one ERROR-severity issue was found.
    has_errors: bool
    # True when at least one WARN-severity issue was found.
    has_warnings: bool
    # Per-category results, in execution order.
    checks: list[ValidationCheckResult]
    # Human-readable descriptions of automatic fixes that were applied.
    fixes_applied: list[str]

    def to_dict(self) -> dict[str, object]:
        """Return the aggregate as a plain mapping with every check serialised."""
        payload: dict[str, object] = {
            flag: getattr(self, flag)
            for flag in ("clean", "has_errors", "has_warnings")
        }
        payload["checks"] = [check.to_dict() for check in self.checks]
        payload["fixes_applied"] = self.fixes_applied
        return payload
130
131
132 # ---------------------------------------------------------------------------
133 # MIDI integrity check
134 # ---------------------------------------------------------------------------
135
def _is_valid_midi(path: pathlib.Path) -> bool:
    """Return True iff *path* starts with a complete Standard MIDI File header chunk.

    This is a fast structural check, not a parse. It requires:

    - the 4-byte ``MThd`` magic,
    - a big-endian 32-bit chunk length equal to 6, and
    - the 6 bytes of header data that length announces to actually be present.

    The last point rejects files truncated right after the length field,
    which would otherwise look valid. Anything beyond the header chunk is
    not inspected.

    Args:
        path: File to inspect.

    Returns:
        True when the first 14 bytes form a well-formed header chunk; False
        when the file is shorter, has a different magic or length, or cannot
        be opened/read (any ``OSError``).
    """
    # 4 (magic) + 4 (chunk length) + 6 (format, track count, division).
    header_size = 14
    try:
        with path.open("rb") as fh:
            header = fh.read(header_size)
    except OSError:
        return False

    if len(header) < header_size:
        # Truncated before the end of the header chunk.
        return False

    magic, chunk_len = struct.unpack_from(">4sI", header)
    return magic == b"MThd" and chunk_len == 6
156
157
def check_midi_integrity(
    workdir: pathlib.Path,
    track_filter: str | None = None,
) -> ValidationCheckResult:
    """Verify that every ``*.mid`` / ``*.midi`` file under *workdir* has a valid header.

    The tree is searched recursively. Each regular file that passes the
    optional filter is handed to ``_is_valid_midi``; every file it rejects
    yields one ERROR issue and one warning log line.

    Args:
        workdir: The ``muse-work/`` directory to scan. A missing directory
            is treated as "nothing to check" and passes.
        track_filter: If given (and non-empty), only files whose path
            relative to *workdir* contains this string, compared
            case-insensitively, are validated.

    Returns:
        ValidationCheckResult with check name ``"midi_integrity"``. Issues
        are ordered with all ``*.mid`` files (sorted) before all ``*.midi``
        files (sorted).
    """
    check_name = "midi_integrity"
    if not workdir.exists():
        return ValidationCheckResult(name=check_name, passed=True, issues=[])

    # Lower-case the filter once instead of once per file.
    needle = track_filter.lower() if track_filter else None

    issues: list[ValidationIssue] = []
    candidates = sorted(workdir.rglob("*.mid")) + sorted(workdir.rglob("*.midi"))
    for midi_path in candidates:
        if not midi_path.is_file():
            # A directory that happens to be named like a MIDI file.
            continue
        rel = midi_path.relative_to(workdir).as_posix()
        if needle is not None and needle not in rel.lower():
            continue
        if _is_valid_midi(midi_path):
            continue
        issues.append(
            ValidationIssue(
                severity=ValidationSeverity.ERROR,
                check=check_name,
                path=rel,
                message="Invalid or corrupted MIDI file: missing or malformed MThd header.",
            )
        )
        logger.warning("❌ MIDI integrity failure: %s", rel)

    return ValidationCheckResult(name=check_name, passed=not issues, issues=issues)
201
202
203 # ---------------------------------------------------------------------------
204 # Manifest consistency check
205 # ---------------------------------------------------------------------------
206
def check_manifest_consistency(
    root: pathlib.Path,
    track_filter: str | None = None,
) -> ValidationCheckResult:
    """Compare the cached committed manifest against the files in the working tree.

    Only the *sets of paths* are compared; file contents are not hashed or
    compared here. Two kinds of drift are reported:

    - a path in the manifest but not in the working tree ("orphaned") is
      an ERROR;
    - a path in the working tree but not in the manifest ("unregistered")
      is a WARN.

    The check passes without inspecting anything when there is nothing to
    compare against: ``.muse/HEAD`` is missing, the ref it names is missing
    or empty, or ``.muse/snapshot_manifest.json`` does not exist.

    Args:
        root: Repository root (contains ``.muse/`` and ``muse-work/``).
        track_filter: If given (and non-empty), only paths containing this
            string, compared case-insensitively, are reported.

    Returns:
        ValidationCheckResult with check name ``"manifest_consistency"``.
        An unreadable, undecodable or non-object manifest file produces a
        single ERROR issue against ``.muse/snapshot_manifest.json``.
    """
    check_name = "manifest_consistency"
    manifest_label = ".muse/snapshot_manifest.json"
    muse_dir = root / ".muse"
    workdir = root / "muse-work"
    issues: list[ValidationIssue] = []

    needle = track_filter.lower() if track_filter else None

    def _in_scope(path: str) -> bool:
        """Return True when *path* passes the optional track filter."""
        return needle is None or needle in path.lower()

    def _report(severity: ValidationSeverity, path: str, message: str) -> None:
        """Record one issue for this check."""
        issues.append(
            ValidationIssue(
                severity=severity, check=check_name, path=path, message=message
            )
        )

    def _result() -> ValidationCheckResult:
        """Wrap the issues collected so far."""
        return ValidationCheckResult(
            name=check_name, passed=not issues, issues=issues
        )

    # Resolve HEAD and the ref it points at; without a commit there is no
    # baseline to compare the working tree against.
    head_path = muse_dir / "HEAD"
    if not head_path.exists():
        return _result()
    ref_file = muse_dir / pathlib.Path(head_path.read_text().strip())
    if not ref_file.exists() or not ref_file.read_text().strip():
        return _result()

    # Commits reference snapshots stored in the DB; this check only uses the
    # on-disk manifest cache (written by ``muse commit``). Without it the
    # comparison is not possible, so the check passes.
    snapshot_cache = muse_dir / "snapshot_manifest.json"
    if not snapshot_cache.exists():
        return _result()

    try:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        loaded: object = json.loads(snapshot_cache.read_text())
    except (ValueError, OSError) as exc:
        _report(
            ValidationSeverity.ERROR,
            manifest_label,
            f"Cannot read cached snapshot manifest: {exc}",
        )
        return _result()

    if not isinstance(loaded, dict):
        # Valid JSON of the wrong shape (list, string, number, ...) would
        # otherwise fail later with an AttributeError.
        _report(
            ValidationSeverity.ERROR,
            manifest_label,
            "Cached snapshot manifest must be a JSON object, "
            f"got {type(loaded).__name__}.",
        )
        return _result()

    committed_paths = set(loaded)

    if not workdir.exists():
        # With no working tree at all, every committed file is missing.
        for path in sorted(committed_paths):
            if _in_scope(path):
                _report(
                    ValidationSeverity.ERROR,
                    path,
                    "File is in committed manifest but muse-work/ does not exist.",
                )
        return _result()

    # Imported lazily so the early exits above do not need the snapshot module.
    from muse.core.snapshot import walk_workdir

    current_paths = set(walk_workdir(workdir))

    for path in sorted(committed_paths - current_paths):
        if _in_scope(path):
            _report(
                ValidationSeverity.ERROR,
                path,
                "File in committed manifest is missing from working tree (orphaned).",
            )

    for path in sorted(current_paths - committed_paths):
        if _in_scope(path):
            _report(
                ValidationSeverity.WARN,
                path,
                "File in working tree is not recorded in committed manifest (unregistered).",
            )

    return _result()
318
319
320 # ---------------------------------------------------------------------------
321 # Duplicate tracks check
322 # ---------------------------------------------------------------------------
323
def check_no_duplicate_tracks(
    workdir: pathlib.Path,
    track_filter: str | None = None,
) -> ValidationCheckResult:
    """Detect MIDI files in the working tree that define the same instrument role.

    A file's *role* is its lower-cased filename stem with one trailing
    numeric suffix removed (digits, optionally preceded by ``_`` or ``-``).
    For example ``bass.mid``, ``bass_2.mid`` and ``bass-01.midi`` all define
    the role ``bass``. ``bass_alt.mid`` defines the separate role
    ``bass_alt``, because only numeric suffixes are stripped.

    The role comes from the filename alone, so same-named files in
    different subdirectories count as duplicates of each other.

    A stem made only of digits (``01.mid``) has no name left after
    stripping; such files keep their full stem as the role so they are not
    all lumped together under an empty role.

    Args:
        workdir: The ``muse-work/`` directory to scan recursively. A missing
            directory passes.
        track_filter: If given (and non-empty), only files whose path
            relative to *workdir* contains this string, compared
            case-insensitively, are considered.

    Returns:
        ValidationCheckResult with check name ``"no_duplicate_tracks"``,
        holding one WARN issue per role defined by more than one file. The
        issue path is the comma-separated list of the files involved.
    """
    check_name = "no_duplicate_tracks"
    if not workdir.exists():
        return ValidationCheckResult(name=check_name, passed=True, issues=[])

    needle = track_filter.lower() if track_filter else None

    # role -> relative paths of the files defining it, in scan order.
    role_to_paths: dict[str, list[str]] = {}
    for midi_path in sorted(workdir.rglob("*.mid")) + sorted(workdir.rglob("*.midi")):
        if not midi_path.is_file():
            continue
        rel = midi_path.relative_to(workdir).as_posix()
        if needle is not None and needle not in rel.lower():
            continue
        stem = midi_path.stem.lower()
        # Fall back to the whole stem when stripping leaves nothing.
        role = re.sub(r"[_\-]?\d+$", "", stem) or stem
        role_to_paths.setdefault(role, []).append(rel)

    issues: list[ValidationIssue] = []
    for role, paths in sorted(role_to_paths.items()):
        if len(paths) < 2:
            continue
        issues.append(
            ValidationIssue(
                severity=ValidationSeverity.WARN,
                check=check_name,
                path=", ".join(paths),
                message=f"Duplicate instrument role '{role}' defined by {len(paths)} files.",
            )
        )
        logger.warning("⚠️ Duplicate track role: %s → %s", role, paths)

    return ValidationCheckResult(name=check_name, passed=not issues, issues=issues)
381
382
383 # ---------------------------------------------------------------------------
384 # Section naming convention check
385 # ---------------------------------------------------------------------------
386
def check_section_naming(
    workdir: pathlib.Path,
    section_filter: str | None = None,
) -> ValidationCheckResult:
    """Verify that section directories follow the naming convention.

    Only the immediate subdirectories of *workdir* are inspected. Each name
    must match ``[a-z][a-z0-9_-]*`` in full: a lowercase letter followed by
    lowercase letters, digits, underscores or hyphens.

    Args:
        workdir: The ``muse-work/`` directory to scan. A missing directory
            passes.
        section_filter: If given (and non-empty), only directories whose
            name contains this string, compared case-insensitively, are
            evaluated.

    Returns:
        ValidationCheckResult with check name ``"section_naming"``, holding
        one WARN issue per non-conforming directory, in sorted order.
    """
    check_name = "section_naming"
    if not workdir.exists():
        return ValidationCheckResult(name=check_name, passed=True, issues=[])

    needle = section_filter.lower() if section_filter else None

    issues: list[ValidationIssue] = []
    for entry in sorted(workdir.iterdir()):
        if not entry.is_dir():
            continue
        name = entry.name
        if needle is not None and needle not in name.lower():
            continue
        # fullmatch, not match: with match() the pattern's "$" also accepts
        # a name that ends in a newline character.
        if _SECTION_NAME_RE.fullmatch(name):
            continue
        issues.append(
            ValidationIssue(
                severity=ValidationSeverity.WARN,
                check=check_name,
                path=name,
                message=(
                    f"Section directory '{name}' does not follow naming convention "
                    f"[a-z][a-z0-9_-]* (lowercase, no spaces or uppercase letters)."
                ),
            )
        )
        logger.warning("⚠️ Section naming violation: %s", name)

    return ValidationCheckResult(name=check_name, passed=not issues, issues=issues)
435
436
437 # ---------------------------------------------------------------------------
438 # Emotion tags check
439 # ---------------------------------------------------------------------------
440
def check_emotion_tags(
    root: pathlib.Path,
    track_filter: str | None = None,
) -> ValidationCheckResult:
    """Verify that recorded emotion tags come from the allowed vocabulary.

    Reads ``.muse/tags.json`` if it exists (presumably written by
    ``muse tag`` — confirm against the CLI). The file is expected to hold a
    JSON list of objects, each with a string ``"tag"`` field. Every tag whose
    lower-cased form is not in :data:`ALLOWED_EMOTION_TAGS` produces a WARN
    issue.

    Data of an unexpected shape is skipped rather than reported: a top-level
    value that is not a list passes the check, and list entries that are not
    objects or lack a string ``"tag"`` are ignored.

    Args:
        root: Repository root (contains ``.muse/``).
        track_filter: Unused for this check (included for API symmetry).

    Returns:
        ValidationCheckResult with check name ``"emotion_tags"``. A tag file
        that cannot be read or decoded yields a single WARN issue and
        ``passed=False``.
    """
    check_name = "emotion_tags"
    tag_label = ".muse/tags.json"
    tag_cache = root / ".muse" / "tags.json"

    if not tag_cache.exists():
        return ValidationCheckResult(name=check_name, passed=True, issues=[])

    try:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        tags_data: object = json.loads(tag_cache.read_text())
    except (ValueError, OSError) as exc:
        unreadable = ValidationIssue(
            severity=ValidationSeverity.WARN,
            check=check_name,
            path=tag_label,
            message=f"Cannot read tag cache: {exc}",
        )
        return ValidationCheckResult(
            name=check_name, passed=False, issues=[unreadable]
        )

    if not isinstance(tags_data, list):
        return ValidationCheckResult(name=check_name, passed=True, issues=[])

    # Build the vocabulary listing once; it is the same for every message.
    allowed_listing = ", ".join(sorted(ALLOWED_EMOTION_TAGS))

    issues: list[ValidationIssue] = []
    for entry in tags_data:
        if not isinstance(entry, dict):
            continue
        tag_name: object = entry.get("tag")
        if not isinstance(tag_name, str):
            continue
        if tag_name.lower() in ALLOWED_EMOTION_TAGS:
            continue
        issues.append(
            ValidationIssue(
                severity=ValidationSeverity.WARN,
                check=check_name,
                path=tag_label,
                message=(
                    f"Emotion tag '{tag_name}' is not in the allowed vocabulary. "
                    f"Allowed: {allowed_listing}"
                ),
            )
        )
        logger.warning("⚠️ Unknown emotion tag: %s", tag_name)

    return ValidationCheckResult(name=check_name, passed=not issues, issues=issues)
508
509
510 # ---------------------------------------------------------------------------
511 # Auto-fix: quantise slightly off-grid notes (stub — full impl requires mido)
512 # ---------------------------------------------------------------------------
513
def apply_fixes(
    workdir: pathlib.Path,
    issues: list[ValidationIssue],
) -> list[str]:
    """Apply automatic corrections for issues that can be fixed safely.

    This is currently a stub: no category has fix logic, so nothing on disk
    is changed and the returned list is always empty. Deliberately left
    without an automatic fix:

    - Malformed MIDI files (rewriting them risks data loss).
    - Section naming (renaming would break references in other files).
    - Duplicate tracks (ambiguous which file to keep).

    When issues from a category earmarked as fixable are present, an
    informational log line records that no fix was performed.

    Args:
        workdir: The ``muse-work/`` working tree directory (not used yet).
        issues: The full list of issues found during validation.

    Returns:
        List of human-readable strings describing each fix applied.
    """
    # Categories earmarked for future auto-fix support.
    fixable_checks = {"manifest_consistency"}

    candidates = []
    for issue in issues:
        if issue.check in fixable_checks:
            candidates.append(issue)

    if candidates:
        logger.info(
            "⚠️ --fix: %d fixable issue(s) found but no auto-fix logic is "
            "implemented yet for check categories: %s",
            len(candidates),
            {candidate.check for candidate in candidates},
        )

    # Nothing is ever fixed yet, so there is nothing to describe.
    return []
550
551
552 # ---------------------------------------------------------------------------
553 # Orchestrator
554 # ---------------------------------------------------------------------------
555
def run_validate(
    root: pathlib.Path,
    *,
    strict: bool = False,
    track_filter: str | None = None,
    section_filter: str | None = None,
    auto_fix: bool = False,
) -> MuseValidateResult:
    """Run every integrity check against the repository at *root*.

    Single entry point of the validate subsystem. The checks run in a fixed
    order (MIDI integrity, manifest consistency, duplicate tracks, section
    naming, emotion tags) and their results are folded into one
    :class:`MuseValidateResult`.

    Args:
        root: Repository root (contains ``.muse/`` and ``muse-work/``).
        strict: Treat WARN-severity issues as fatal (exit 2 in CLI). Not
            consulted inside this function; the result reports
            ``has_warnings`` and the caller applies the policy.
        track_filter: Restrict checks to files/paths containing this string.
        section_filter: Restrict section-naming check to dirs matching this.
        auto_fix: When true and at least one issue was found, call
            :func:`apply_fixes`. The checks are not re-run afterwards, so
            the reported issues describe the state before any fix.

    Returns:
        MuseValidateResult with all check outcomes and any fixes applied.
    """
    workdir = root / "muse-work"

    check_results: list[ValidationCheckResult] = []
    check_results.append(check_midi_integrity(workdir, track_filter=track_filter))
    check_results.append(check_manifest_consistency(root, track_filter=track_filter))
    check_results.append(check_no_duplicate_tracks(workdir, track_filter=track_filter))
    check_results.append(check_section_naming(workdir, section_filter=section_filter))
    check_results.append(check_emotion_tags(root, track_filter=track_filter))

    # Flatten the per-check issue lists, preserving check order.
    all_issues: list[ValidationIssue] = []
    for result in check_results:
        all_issues.extend(result.issues)

    if auto_fix and all_issues:
        fixes_applied: list[str] = apply_fixes(workdir, all_issues)
    else:
        fixes_applied = []

    has_errors = False
    has_warnings = False
    for issue in all_issues:
        if issue.severity == ValidationSeverity.ERROR:
            has_errors = True
        elif issue.severity == ValidationSeverity.WARN:
            has_warnings = True

    logger.info(
        "✅ muse validate: %d check(s), errors=%s, warnings=%s",
        len(check_results),
        has_errors,
        has_warnings,
    )

    return MuseValidateResult(
        clean=not (has_errors or has_warnings),
        has_errors=has_errors,
        has_warnings=has_warnings,
        checks=check_results,
        fixes_applied=fixes_applied,
    )