cgcardona / muse public
git2muse.py python
576 lines 17.8 KB
dfaf1b77 refactor: rename muse-work/ → state/ Gabriel Cardona <gabriel@tellurstori.com> 8h ago
1 """git2muse — Replay a Git commit graph into a Muse repository.
2
3 Usage
4 -----
5 ::
6
7 python tools/git2muse.py [--repo-root PATH] [--branch {main,dev,all}] [--dry-run] [--verbose]
8
9 Strategy
10 --------
11 1. Walk ``main`` branch commits oldest-first and create Muse commits on the
12 Muse ``main`` branch preserving the original author, timestamp, and message.
13 2. Walk ``dev`` branch commits oldest-first that are not already on ``main``
14 and replay them onto a Muse ``dev`` branch, branching from the correct
15 ancestor.
16 3. Skip merge commits (commits with more than one parent) — they carry no
17 unique file-state delta; the Muse DAG is reconstructed faithfully through
18 the parent chain on each branch.
19
20 For each Git commit the tool:
21 - Extracts the commit's file tree into ``state/`` using ``git archive``.
22 - Removes files that Muse should not snapshot (build artefacts, caches, IDE
23 files, etc.) according to a hard-coded exclusion list that mirrors
24 ``.museignore``.
25 - Calls the Muse Python API directly (bypassing the CLI) so the original
26 Git author name, e-mail, and author timestamp are preserved verbatim in
27 the Muse ``CommitRecord``.
28 - Updates the Muse branch HEAD ref so the Muse repo tracks the same history.
29
30 After a successful run the Muse repo under ``.muse/`` contains a full code-
31 domain representation of the project history and is ready to push to MuseHub.
32 """
33
34 from __future__ import annotations
35
36 import argparse
37 import datetime
38 import hashlib
39 import logging
40 import pathlib
41 import shutil
42 import subprocess
43 import sys
44 import tarfile
45 import tempfile
46
47 # ---------------------------------------------------------------------------
48 # Bootstrap: make sure the project root is on sys.path so we can import muse
49 # even when running from the tools/ directory.
50 # ---------------------------------------------------------------------------
# The directory two levels above this file is treated as the project root.
_REPO_ROOT = pathlib.Path(__file__).parent.parent
# Prepend it only when absent so the path is not added twice.
if str(_REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(_REPO_ROOT))
54
55 from muse.core.object_store import write_object
56 from muse.core.store import (
57 CommitRecord,
58 SnapshotRecord,
59 get_head_commit_id,
60 write_commit,
61 write_snapshot,
62 )
63 from muse.core.snapshot import compute_commit_id, compute_snapshot_id
64
# Named logger used by the helpers below; main() configures output through
# logging.basicConfig().
logger = logging.getLogger("git2muse")
66
67 # ---------------------------------------------------------------------------
68 # Files / dirs that should never end up in a Muse snapshot.
69 # These mirror .museignore + the hidden-directory exclusion in walk_workdir.
70 # ---------------------------------------------------------------------------
71
_EXCLUDE_PREFIXES: tuple[str, ...] = (
    ".git/",
    ".muse/",
    ".muse",
    ".venv/",
    ".tox/",
    ".mypy_cache/",
    ".pytest_cache/",
    ".hypothesis/",
    ".github/",
    ".DS_Store",
    "state/",
    "artifacts/",
    "__pycache__/",
)

_EXCLUDE_SUFFIXES: tuple[str, ...] = (
    ".pyc",
    ".pyo",
    ".egg-info",
    ".swp",
    ".swo",
    ".tmp",
    "Thumbs.db",
    ".DS_Store",
)


def _should_exclude(rel_path: str) -> bool:
    """Decide whether *rel_path* must be left out of the Muse snapshot.

    *rel_path* is a "/"-separated path relative to the tree root.  It is
    excluded when any of the following holds:

    - it starts with an entry of ``_EXCLUDE_PREFIXES``;
    - it equals such an entry with its trailing "/" removed (so the bare name
      ``state`` is caught as well as ``state/...``);
    - it ends with an entry of ``_EXCLUDE_SUFFIXES``;
    - its first path component starts with "." (hidden at the top level).
    """
    # str.startswith / str.endswith accept a tuple of candidates.
    if rel_path.startswith(_EXCLUDE_PREFIXES):
        return True
    if any(rel_path == prefix.rstrip("/") for prefix in _EXCLUDE_PREFIXES):
        return True
    if rel_path.endswith(_EXCLUDE_SUFFIXES):
        return True
    # Only the top-level component is inspected for the hidden-file rule.
    top_level, _, _ = rel_path.partition("/")
    return top_level.startswith(".")
113
114
115 # ---------------------------------------------------------------------------
116 # Git helpers
117 # ---------------------------------------------------------------------------
118
119
def _git(repo_root: pathlib.Path, *args: str) -> str:
    """Invoke ``git`` with *args* inside *repo_root* and return its stdout.

    Output is decoded as text and stripped of surrounding whitespace.
    Because the command runs with ``check=True``, a non-zero exit status
    raises ``subprocess.CalledProcessError``.
    """
    command = ["git", *args]
    completed = subprocess.run(
        command,
        cwd=repo_root,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        check=True,
    )
    return completed.stdout.strip()
130
131
def _git_commits_oldest_first(
    repo_root: pathlib.Path,
    branch: str,
    exclude_branches: list[str] | None = None,
) -> list[str]:
    """List the commit hashes of *branch*, oldest first.

    The hashes come from ``git log --topo-order --reverse``.  Each name in
    *exclude_branches* is passed to git as ``^name``, which drops every commit
    reachable from that branch (used to extract dev-only commits).
    """
    # The branch itself always comes first, followed by any negated revisions.
    revisions = [branch, *(f"^{name}" for name in exclude_branches or ())]
    output = _git(
        repo_root, "log", "--topo-order", "--reverse", "--format=%H", *revisions
    )
    return [entry for entry in output.splitlines() if entry.strip()]
151
152
# Field separator placed between the git format placeholders; the message is
# the last field, so the separator appearing inside it does not break parsing.
_META_SEP = "|||GIT2MUSE|||"


def _git_commit_meta(repo_root: pathlib.Path, sha: str) -> dict[str, str]:
    """Look up author name, email, timestamp, and message for *sha*.

    The result has the keys ``name`` (``%an``), ``email`` (``%ae``), ``ts``
    (``%at``, the author date as Unix seconds, kept as a string) and
    ``message`` (``%B``), each stripped of surrounding whitespace.

    If git's output does not split into four fields, a placeholder record is
    returned instead: name ``"unknown"``, empty email, timestamp ``"0"`` and
    the first 12 characters of *sha* as the message.
    """
    pretty = _META_SEP.join(("%an", "%ae", "%at", "%B"))
    output = _git(repo_root, "show", "-s", f"--format={pretty}", sha)

    # maxsplit=3 keeps everything after the third separator in the message.
    fields = output.split(_META_SEP, 3)
    if len(fields) < 4:
        return {"name": "unknown", "email": "", "ts": "0", "message": sha[:12]}

    keys = ("name", "email", "ts", "message")
    return {key: value.strip() for key, value in zip(keys, fields)}
170
171
def _git_parent_shas(repo_root: pathlib.Path, sha: str) -> list[str]:
    """List the parent hashes of *sha*; a root commit yields an empty list."""
    # %P prints the parents separated by spaces; str.split() with no argument
    # never produces empty strings, so no further filtering is required.
    return _git(repo_root, "log", "-1", "--format=%P", sha).split()
176
177
def _is_merge_commit(repo_root: pathlib.Path, sha: str) -> bool:
    """Report whether *sha* has more than one parent."""
    parents = _git_parent_shas(repo_root, sha)
    return len(parents) > 1
180
181
def _extract_tree_to(
    repo_root: pathlib.Path,
    sha: str,
    dest: pathlib.Path,
) -> None:
    """Extract the git tree for *sha* into *dest*, applying exclusions.

    *dest* is deleted if present and recreated empty, then filled from the
    output of ``git archive --format=tar``.  Only regular-file members are
    written; directories, symlinks and other member types are skipped, as is
    every path rejected by ``_should_exclude``.

    Raises ``subprocess.CalledProcessError`` if ``git archive`` fails.
    """
    import io  # local import: only this function needs an in-memory stream

    # Wipe and recreate dest for a clean slate.
    if dest.exists():
        shutil.rmtree(dest)
    dest.mkdir(parents=True)

    # git archive produces a tar stream of the commit tree.
    archive = subprocess.run(
        ["git", "archive", "--format=tar", sha],
        cwd=repo_root,
        capture_output=True,
        check=True,
    )

    # The archive is already fully in memory, so read it from there instead of
    # bouncing it through a temporary file that could be left behind on error.
    with tarfile.open(fileobj=io.BytesIO(archive.stdout)) as tf:
        for member in tf.getmembers():
            if not member.isfile():
                continue

            # Normalise the member name WITHOUT touching a leading dot that
            # belongs to the file name itself.  str.lstrip("./") strips a
            # *set of characters*, which would turn ".gitignore" into
            # "gitignore" and let hidden files slip past _should_exclude().
            rel = member.name
            while rel.startswith("./"):
                rel = rel[2:]
            # A leading "/" would make `dest / rel` ignore dest entirely.
            rel = rel.lstrip("/")
            if not rel or _should_exclude(rel):
                continue

            target = dest / rel
            target.parent.mkdir(parents=True, exist_ok=True)
            f = tf.extractfile(member)
            if f is not None:
                target.write_bytes(f.read())
219
220
221 # ---------------------------------------------------------------------------
222 # Muse snapshot helpers (bypass CLI to preserve git metadata)
223 # ---------------------------------------------------------------------------
224
225
def _sha256_file(path: pathlib.Path) -> str:
    """Return the hex-encoded SHA-256 digest of the bytes stored at *path*."""
    # The whole file is read in one go and hashed in a single call.
    return hashlib.sha256(path.read_bytes()).hexdigest()
230
231
def _build_manifest(workdir: pathlib.Path) -> dict[str, str]:
    """Walk *workdir* and return a ``{rel_path: sha256}`` manifest.

    Keys are paths relative to *workdir*, always written with "/" separators
    regardless of the host OS.  Entries are visited in sorted order.  Anything
    that is not a regular file, any symlink, and anything whose top-level path
    component starts with "." is left out.
    """
    manifest: dict[str, str] = {}
    for fpath in sorted(workdir.rglob("*")):
        # is_file() follows symlinks, so symlinks are rejected explicitly.
        if not fpath.is_file() or fpath.is_symlink():
            continue
        # as_posix() keeps keys "/"-separated on every platform; str() would
        # yield backslashes on Windows and defeat the split("/") check below.
        rel = fpath.relative_to(workdir).as_posix()
        first = rel.split("/")[0]
        if first.startswith("."):
            continue
        manifest[rel] = _sha256_file(fpath)
    return manifest
246
247
def _store_objects(
    repo_root: pathlib.Path,
    workdir: pathlib.Path,
    manifest: dict[str, str],
) -> None:
    """Copy every file named in *manifest* into the object store.

    For each ``rel_path -> object id`` entry the file's bytes are read from
    *workdir* and handed to ``write_object`` under that id.  A manifest entry
    whose file no longer exists is logged as a warning and skipped.
    """
    for rel_path, object_id in manifest.items():
        source = workdir / rel_path
        if source.exists():
            write_object(repo_root, object_id, source.read_bytes())
        else:
            logger.warning("⚠️ Missing file in workdir: %s", rel_path)
261
262
263 # ---------------------------------------------------------------------------
264 # Branch ref helpers (direct file I/O — mirrors store.py internal logic)
265 # ---------------------------------------------------------------------------
266
267
def _refs_dir(repo_root: pathlib.Path) -> pathlib.Path:
    """Return the ``.muse/refs/heads`` directory path under *repo_root*."""
    return repo_root.joinpath(".muse", "refs", "heads")
270
271
def _set_branch_head(
    repo_root: pathlib.Path, branch: str, commit_id: str
) -> None:
    """Point *branch* at *commit_id* by rewriting its ref file.

    The ref file's parent directory is created if needed, and the file ends
    up containing the commit id followed by a newline.
    """
    target = _refs_dir(repo_root) / branch
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open("w") as handle:
        handle.write(commit_id + "\n")
278
279
def _get_branch_head(repo_root: pathlib.Path, branch: str) -> str | None:
    """Read the commit id stored in *branch*'s ref file.

    Returns ``None`` when the ref file does not exist or holds only
    whitespace.
    """
    ref_file = _refs_dir(repo_root) / branch
    if ref_file.exists():
        commit_id = ref_file.read_text().strip()
        if commit_id:
            return commit_id
    return None
285
286
def _set_head_ref(repo_root: pathlib.Path, branch: str) -> None:
    """Rewrite ``.muse/HEAD`` so it names ``refs/heads/<branch>``."""
    muse_dir = repo_root / ".muse"
    (muse_dir / "HEAD").write_text(f"refs/heads/{branch}\n")
290
291
def _ensure_branch_exists(repo_root: pathlib.Path, branch: str) -> None:
    """Create an empty ref file for *branch* if it does not exist yet.

    An existing ref file is left untouched.
    """
    ref_path = _refs_dir(repo_root) / branch
    # Create the ref's own parent directory (as _set_branch_head does) rather
    # than only refs/heads, so a branch name containing "/" also works.
    ref_path.parent.mkdir(parents=True, exist_ok=True)
    if not ref_path.exists():
        ref_path.write_text("")
297
298
299 # ---------------------------------------------------------------------------
300 # Core replay logic
301 # ---------------------------------------------------------------------------
302
303
def _replay_commit(
    repo_root: pathlib.Path,
    workdir: pathlib.Path,
    git_sha: str,
    muse_branch: str,
    parent_muse_id: str | None,
    meta: dict[str, str],
    repo_id: str,
    dry_run: bool,
) -> str:
    """Turn one Git commit into a Muse commit and return the Muse commit ID.

    The caller must already have populated *workdir* with the commit's files.
    *meta* supplies ``name``, ``email``, ``ts`` (Unix seconds as a string) and
    ``message``; an empty message falls back to the first 12 characters of
    *git_sha*.

    With *dry_run* set, the would-be commit is only logged and its ID
    returned.  Otherwise the objects, the snapshot record and the commit
    record are written, and the *muse_branch* ref is moved to the new commit.
    """
    # Snapshot identity comes from the {path: sha256} manifest of workdir.
    file_manifest = _build_manifest(workdir)
    snap_id = compute_snapshot_id(file_manifest)

    # Carry the original Git metadata over; the timestamp is made UTC-aware.
    when = datetime.datetime.fromtimestamp(
        int(meta["ts"]), tz=datetime.timezone.utc
    )
    author = f"{meta['name']} <{meta['email']}>"
    message = meta["message"] or git_sha[:12]

    parents = [parent_muse_id] if parent_muse_id else []
    commit_id = compute_commit_id(
        parent_ids=parents,
        snapshot_id=snap_id,
        message=message,
        committed_at_iso=when.isoformat(),
    )

    if dry_run:
        logger.info(
            "[dry-run] Would create commit %s (git: %s) on %s | %s",
            commit_id[:12],
            git_sha[:12],
            muse_branch,
            message[:60],
        )
        return commit_id

    # Persist in order: file objects, snapshot record, commit record, ref.
    _store_objects(repo_root, workdir, file_manifest)
    write_snapshot(
        repo_root,
        SnapshotRecord(snapshot_id=snap_id, manifest=file_manifest),
    )
    write_commit(
        repo_root,
        CommitRecord(
            commit_id=commit_id,
            repo_id=repo_id,
            branch=muse_branch,
            snapshot_id=snap_id,
            message=message,
            committed_at=when,
            parent_commit_id=parent_muse_id,
            author=author,
        ),
    )
    _set_branch_head(repo_root, muse_branch, commit_id)

    return commit_id
374
375
def _replay_branch(
    repo_root: pathlib.Path,
    workdir: pathlib.Path,
    git_shas: list[str],
    muse_branch: str,
    start_parent_muse_id: str | None,
    repo_id: str,
    dry_run: bool,
    verbose: bool,
) -> dict[str, str]:
    """Replay a list of git SHAs (oldest first) onto *muse_branch*.

    The commits are chained linearly: the first one gets
    *start_parent_muse_id* as its parent, each later one gets the Muse commit
    created just before it.  Progress is logged for every commit when
    *verbose* is set, otherwise for the first, the last and every tenth.

    With *dry_run* set nothing is written: the branch ref is not created and
    *workdir* is not repopulated, so the IDs reported are computed from
    whatever *workdir* currently holds.

    Returns a mapping of git_sha → muse_commit_id for every replayed commit.
    """
    # Creating the ref file is a write, so it must be skipped on a dry run.
    if not dry_run:
        _ensure_branch_exists(repo_root, muse_branch)

    git_to_muse: dict[str, str] = {}
    parent_muse_id = start_parent_muse_id
    total = len(git_shas)

    for i, git_sha in enumerate(git_shas, 1):
        meta = _git_commit_meta(repo_root, git_sha)

        if verbose or i % 10 == 0 or i == 1 or i == total:
            logger.info(
                "[%s] %d/%d git:%s '%s'",
                muse_branch,
                i,
                total,
                git_sha[:12],
                meta["message"][:60],
            )

        # Populate state/ with this commit's tree.
        if not dry_run:
            _extract_tree_to(repo_root, git_sha, workdir)

        muse_id = _replay_commit(
            repo_root=repo_root,
            workdir=workdir,
            git_sha=git_sha,
            muse_branch=muse_branch,
            parent_muse_id=parent_muse_id,
            meta=meta,
            repo_id=repo_id,
            dry_run=dry_run,
        )

        git_to_muse[git_sha] = muse_id
        # The commit just created becomes the parent of the next one.
        parent_muse_id = muse_id

    return git_to_muse
428
429
430 # ---------------------------------------------------------------------------
431 # Entry point
432 # ---------------------------------------------------------------------------
433
434
def _load_repo_id(repo_root: pathlib.Path) -> str:
    """Return the ``repo_id`` value stored in ``.muse/repo.json``.

    Raises ``KeyError`` when the JSON document has no ``repo_id`` entry.
    """
    import json

    with (repo_root / ".muse" / "repo.json").open() as handle:
        config: dict[str, str] = json.load(handle)
    return config["repo_id"]
440
441
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point: replay Git ``main`` and ``dev`` into Muse.

    *argv* is the argument list to parse; ``None`` lets argparse read
    ``sys.argv``.  Phase 1 replays the non-merge commits of ``main``; phase 2
    replays the non-merge commits of ``dev`` that are not reachable from
    ``main``, parented on the Muse commit of the oldest dev-only commit's Git
    parent (or on the current Muse ``main`` head when that parent was not
    replayed in this run).  HEAD is left pointing at ``main``.

    With ``--dry-run`` nothing under ``.muse/`` or ``state/`` is written.

    Returns 0 on success and 1 when ``.muse/repo.json`` is missing.  An
    invalid ``--branch`` value makes argparse exit with a usage error.
    """
    parser = argparse.ArgumentParser(
        description="Replay a Git commit graph into a Muse repository."
    )
    parser.add_argument(
        "--repo-root",
        type=pathlib.Path,
        default=_REPO_ROOT,
        help="Path to the repository root (default: parent of this script).",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Log what would happen without writing anything.",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Log every commit (default: log every 10 + first/last).",
    )
    parser.add_argument(
        "--branch",
        default="all",
        # Reject typos up front; an unknown value would otherwise match
        # neither phase and the run would "succeed" without doing anything.
        choices=("main", "dev", "all"),
        help="Which git branch(es) to replay: 'main', 'dev', or 'all' (default).",
    )
    args = parser.parse_args(argv)

    logging.basicConfig(
        level=logging.INFO,
        format="%(levelname)s %(message)s",
    )

    repo_root: pathlib.Path = args.repo_root.resolve()
    workdir = repo_root / "state"
    dry_run: bool = args.dry_run
    verbose: bool = args.verbose
    branch_arg: str = args.branch

    # Verify .muse/ exists.
    if not (repo_root / ".muse" / "repo.json").exists():
        logger.error(
            "❌ No .muse/repo.json found in %s — run 'muse init' first.", repo_root
        )
        return 1

    repo_id = _load_repo_id(repo_root)
    logger.info("✅ Muse repo ID: %s", repo_id)

    # -----------------------------------------------------------------------
    # Phase 1: main branch
    # -----------------------------------------------------------------------
    all_git_to_muse: dict[str, str] = {}

    if branch_arg in ("main", "all"):
        logger.info("━━━ Phase 1: replaying main branch ━━━")
        main_shas = _git_commits_oldest_first(repo_root, "main")
        # Skip merge commits — they add no unique tree delta.
        main_shas = [
            s for s in main_shas
            if not _is_merge_commit(repo_root, s)
        ]
        logger.info(" %d non-merge commits on main", len(main_shas))

        # Moving HEAD is a write, so it is skipped on a dry run.
        if not dry_run:
            _set_head_ref(repo_root, "main")
        mapping = _replay_branch(
            repo_root=repo_root,
            workdir=workdir,
            git_shas=main_shas,
            muse_branch="main",
            start_parent_muse_id=None,
            repo_id=repo_id,
            dry_run=dry_run,
            verbose=verbose,
        )
        all_git_to_muse.update(mapping)
        logger.info("✅ main: %d commits written", len(mapping))

    # -----------------------------------------------------------------------
    # Phase 2: dev branch (commits not reachable from main)
    # -----------------------------------------------------------------------
    if branch_arg in ("dev", "all"):
        logger.info("━━━ Phase 2: replaying dev branch ━━━")
        dev_only_shas = _git_commits_oldest_first(
            repo_root, "dev", exclude_branches=["main"]
        )
        dev_only_shas = [
            s for s in dev_only_shas
            if not _is_merge_commit(repo_root, s)
        ]
        logger.info(" %d dev-only non-merge commits", len(dev_only_shas))

        if dev_only_shas:
            # Find the git parent of the oldest dev-only commit — it should
            # already be in all_git_to_muse (it's a main commit).
            oldest_dev_sha = dev_only_shas[0]
            git_parents = _git_parent_shas(repo_root, oldest_dev_sha)
            branch_parent_muse_id: str | None = None
            for gp in git_parents:
                if gp in all_git_to_muse:
                    branch_parent_muse_id = all_git_to_muse[gp]
                    break
            if branch_parent_muse_id is None:
                # Fall back to current main HEAD.
                branch_parent_muse_id = _get_branch_head(repo_root, "main")

            # Moving HEAD is a write, so it is skipped on a dry run.
            if not dry_run:
                _set_head_ref(repo_root, "dev")
            mapping = _replay_branch(
                repo_root=repo_root,
                workdir=workdir,
                git_shas=dev_only_shas,
                muse_branch="dev",
                start_parent_muse_id=branch_parent_muse_id,
                repo_id=repo_id,
                dry_run=dry_run,
                verbose=verbose,
            )
            all_git_to_muse.update(mapping)
            logger.info("✅ dev: %d commits written", len(mapping))
        else:
            logger.info(" dev has no unique commits beyond main — skipping")

    # Leave HEAD pointing at main.
    if not dry_run:
        _set_head_ref(repo_root, "main")

    # Summary: one entry per replayed commit across both phases.
    logger.info(
        "━━━ Done ━━━ total Muse commits written: %d", len(all_git_to_muse)
    )

    return 0
573
574
# When executed as a script, use main()'s return value as the exit status.
if __name__ == "__main__":
    sys.exit(main())