gabriel / muse public
git2muse.py python
582 lines 18.4 KB
f7645c07 feat(store): self-describing HEAD format with typed read/write API (#163) Gabriel Cardona <cgcardona@gmail.com> 3d ago
1 """git2muse — Replay a Git commit graph into a Muse repository.
2
3 Usage
4 -----
5 ::
6
7 python tools/git2muse.py [--repo-root PATH] [--branch {main,dev,all}] [--dry-run] [--verbose]
8
9 Strategy
10 --------
11 1. Walk ``main`` branch commits oldest-first and create Muse commits on the
12 Muse ``main`` branch preserving the original author, timestamp, and message.
13 2. Walk ``dev`` branch commits oldest-first that are not already on ``main``
14 and replay them onto a Muse ``dev`` branch, branching from the correct
15 ancestor.
16 3. Skip merge commits (commits with more than one parent) — they carry no
17 unique file-state delta; the Muse DAG is reconstructed faithfully through
18 the parent chain on each branch.
19
20 For each Git commit the tool:
21 - Extracts the commit's file tree into a temporary working directory using ``git archive``.
22 - Removes files that Muse should not snapshot (build artefacts, caches, IDE
23 files, etc.) according to a hard-coded exclusion list that mirrors
24 ``.museignore``.
25 - Calls the Muse Python API directly (bypassing the CLI) so the original
26 Git author name, e-mail, and author timestamp are preserved verbatim in
27 the Muse ``CommitRecord``.
28 - Updates the Muse branch HEAD ref so the Muse repo tracks the same history.
29
30 After a successful run the Muse repo under ``.muse/`` contains a full code-
31 domain representation of the project history and is ready to push to MuseHub.
32 """
33
34 from __future__ import annotations
35
36 import argparse
37 import datetime
38 import hashlib
39 import logging
40 import pathlib
41 import shutil
42 import subprocess
43 import sys
44 import tarfile
45 import tempfile
46
47 # ---------------------------------------------------------------------------
48 # Bootstrap: make sure the project root is on sys.path so we can import muse
49 # even when running from the tools/ directory.
50 # ---------------------------------------------------------------------------
# Two levels up from this script, i.e. the directory containing ``tools/``.
_REPO_ROOT = pathlib.Path(__file__).parent.parent
if sys.path.count(str(_REPO_ROOT)) == 0:
    # Prepend so the project root is searched before the other entries.
    sys.path.insert(0, str(_REPO_ROOT))
54
55 from muse.core.object_store import write_object
56 from muse.core.store import (
57 CommitRecord,
58 SnapshotRecord,
59 get_head_commit_id,
60 write_commit,
61 write_head_branch,
62 write_snapshot,
63 )
64 from muse.core.snapshot import compute_commit_id, compute_snapshot_id
65
# Module-wide logger; every record from this tool is emitted as "git2muse".
logger: logging.Logger = logging.getLogger("git2muse")
67
68 # ---------------------------------------------------------------------------
69 # Files / dirs that should never end up in a Muse snapshot.
70 # These mirror .museignore + the hidden-directory exclusion in walk_workdir.
71 # ---------------------------------------------------------------------------
72
# Path prefixes (relative to the tree root) that are never snapshotted.
# Entries ending in "/" name directories; `_should_exclude` also matches the
# bare name with the trailing slash removed.
_EXCLUDE_PREFIXES: tuple[str, ...] = (
    # Version-control metadata.
    ".git/",
    ".muse/",
    ".muse",
    # Virtual environments and tool caches.
    ".venv/",
    ".tox/",
    ".mypy_cache/",
    ".pytest_cache/",
    ".hypothesis/",
    # CI configuration and OS droppings.
    ".github/",
    ".DS_Store",
    # Build output and bytecode caches.
    "artifacts/",
    "__pycache__/",
)

# Path suffixes that are never snapshotted, whatever directory they live in.
_EXCLUDE_SUFFIXES: tuple[str, ...] = (
    # Python bytecode and packaging metadata.
    ".pyc",
    ".pyo",
    ".egg-info",
    # Editor swap files and scratch files.
    ".swp",
    ".swo",
    ".tmp",
    # OS-generated files.
    "Thumbs.db",
    ".DS_Store",
)
98
99
def _should_exclude(rel_path: str) -> bool:
    """Decide whether *rel_path* must be left out of the Muse snapshot.

    A path is excluded when it starts with an entry of ``_EXCLUDE_PREFIXES``
    (or equals such an entry with its trailing ``/`` removed), when it ends
    with an entry of ``_EXCLUDE_SUFFIXES``, or when its first path component
    begins with a dot.
    """
    # str.startswith / str.endswith accept a tuple of candidates.
    if rel_path.startswith(_EXCLUDE_PREFIXES):
        return True
    # A directory entry such as "artifacts/" also matches the bare "artifacts".
    if any(rel_path == prefix.rstrip("/") for prefix in _EXCLUDE_PREFIXES):
        return True
    if rel_path.endswith(_EXCLUDE_SUFFIXES):
        return True
    # Hidden files/dirs at the top level are skipped; nested ones are not.
    top_level, _, _ = rel_path.partition("/")
    return top_level.startswith(".")
113
114
115 # ---------------------------------------------------------------------------
116 # Git helpers
117 # ---------------------------------------------------------------------------
118
119
def _git(repo_root: pathlib.Path, *args: str) -> str:
    """Run ``git <args>`` with *repo_root* as the working directory.

    Args:
        repo_root: Directory in which git is executed.
        *args: Arguments passed to git verbatim (no shell is involved).

    Returns:
        The command's stdout with leading/trailing whitespace stripped.

    Raises:
        subprocess.CalledProcessError: If git exits with a non-zero status.
    """
    completed = subprocess.run(
        ["git", *args],
        cwd=repo_root,
        capture_output=True,
        check=True,
        # Decode explicitly instead of relying on the locale's preferred
        # encoding: author names and commit messages may contain non-ASCII
        # text, and a non-UTF-8 locale would otherwise mangle or reject it.
        encoding="utf-8",
        # Bytes that are not valid UTF-8 become U+FFFD rather than aborting
        # the whole replay with UnicodeDecodeError.
        errors="replace",
    )
    return completed.stdout.strip()
130
131
def _git_commits_oldest_first(
    repo_root: pathlib.Path,
    branch: str,
    exclude_branches: list[str] | None = None,
) -> list[str]:
    """List the commit hashes of *branch*, oldest first.

    The listing comes from ``git log --topo-order --reverse``.  Each name in
    *exclude_branches* is passed to git as ``^name``, which removes commits
    reachable from that branch (used to extract dev-only commits).
    """
    revisions = [branch]
    if exclude_branches:
        revisions.extend(f"^{name}" for name in exclude_branches)
    listing = _git(
        repo_root, "log", "--topo-order", "--reverse", "--format=%H", *revisions
    )
    # Drop blank lines so an empty history yields an empty list.
    return [sha for sha in listing.splitlines() if sha.strip()]
151
152
# Delimiter placed between the fields of the ``git show`` format string built
# in `_git_commit_meta`, and used there to split the output back apart.
_META_SEP = "|||GIT2MUSE|||"
154
155
def _git_commit_meta(repo_root: pathlib.Path, sha: str) -> dict[str, str]:
    """Fetch author name, author e-mail, author timestamp and message of *sha*.

    Returns a dict with the keys ``name``, ``email``, ``ts`` (the ``%at``
    value as a string) and ``message`` (the ``%B`` body), each stripped of
    surrounding whitespace.  If git's output does not split into four fields,
    a placeholder record is returned with ``ts`` set to ``"0"`` and the first
    12 characters of *sha* as the message.
    """
    placeholders = ("%an", "%ae", "%at", "%B")
    output = _git(
        repo_root, "show", "-s", "--format=" + _META_SEP.join(placeholders), sha
    )
    # maxsplit=3 keeps any separator-like text inside the message intact.
    pieces = output.split(_META_SEP, 3)
    if len(pieces) < 4:
        return {"name": "unknown", "email": "", "ts": "0", "message": sha[:12]}
    keys = ("name", "email", "ts", "message")
    return {key: piece.strip() for key, piece in zip(keys, pieces)}
170
171
def _git_parent_shas(repo_root: pathlib.Path, sha: str) -> list[str]:
    """List the parent hashes of *sha*; a root commit yields an empty list."""
    # %P prints the parents separated by spaces; str.split() with no argument
    # never produces empty strings, so no further filtering is needed.
    return _git(repo_root, "log", "-1", "--format=%P", sha).split()
176
177
def _is_merge_commit(repo_root: pathlib.Path, sha: str) -> bool:
    """Return True when *sha* has more than one parent."""
    parents = _git_parent_shas(repo_root, sha)
    return len(parents) >= 2
180
181
def _extract_tree_to(
    repo_root: pathlib.Path,
    sha: str,
    dest: pathlib.Path,
) -> None:
    """Extract the git tree for *sha* into *dest*, applying exclusions.

    *dest* is deleted and recreated first, so it only ever holds the tree of
    the most recently extracted commit.  Only regular-file tar members are
    written; anything else (directories, symlinks, ...) is skipped, as is
    every path rejected by ``_should_exclude``.

    Args:
        repo_root: Git working tree in which ``git archive`` is run.
        sha: Commit whose tree is extracted.
        dest: Directory that receives the files.

    Raises:
        subprocess.CalledProcessError: If ``git archive`` fails.
    """
    import io  # local import: only this helper needs it

    # Wipe and recreate dest for a clean slate.
    if dest.exists():
        shutil.rmtree(dest)
    dest.mkdir(parents=True)

    # git archive produces a tar stream of the commit tree.
    archive = subprocess.run(
        ["git", "archive", "--format=tar", sha],
        cwd=repo_root,
        capture_output=True,
        check=True,
    )

    # The archive bytes are already in memory, so read the tar straight from
    # them.  This avoids a round trip through a temporary file, which could
    # be left behind on disk if writing it failed.
    with tarfile.open(fileobj=io.BytesIO(archive.stdout)) as tf:
        for member in tf.getmembers():
            if not member.isfile():
                continue
            # removeprefix strips only the literal "./" tar prefix, not
            # individual characters, so ".cursorignore" and ".github/" keep
            # their leading dot and are still recognised as hidden.
            rel = member.name.removeprefix("./")
            if _should_exclude(rel):
                continue
            target = dest / rel
            target.parent.mkdir(parents=True, exist_ok=True)
            reader = tf.extractfile(member)
            if reader is None:
                continue
            with reader:
                target.write_bytes(reader.read())
222
223
224 # ---------------------------------------------------------------------------
225 # Muse snapshot helpers (bypass CLI to preserve git metadata)
226 # ---------------------------------------------------------------------------
227
228
def _sha256_file(path: pathlib.Path) -> str:
    """Return the hex SHA-256 digest of the full contents of *path*."""
    return hashlib.sha256(path.read_bytes()).hexdigest()
233
234
def _build_manifest(workdir: pathlib.Path) -> dict[str, str]:
    """Walk *workdir* and return a ``{rel_path: sha256}`` manifest.

    Keys are paths relative to *workdir*, always written with ``/`` as the
    separator so the manifest does not depend on the host platform.  Symlinks
    and non-files are skipped, as is anything whose first path component
    starts with a dot.  Entries are inserted in sorted path order.
    """
    manifest: dict[str, str] = {}
    for fpath in sorted(workdir.rglob("*")):
        # is_file() follows symlinks, so links are rejected explicitly.
        if fpath.is_symlink() or not fpath.is_file():
            continue
        # as_posix() rather than str(): str() uses backslashes on Windows,
        # which would break the "/" split below and change the manifest keys.
        rel = fpath.relative_to(workdir).as_posix()
        first = rel.split("/")[0]
        if first.startswith("."):
            continue
        manifest[rel] = _sha256_file(fpath)
    return manifest
249
250
def _store_objects(
    repo_root: pathlib.Path,
    workdir: pathlib.Path,
    manifest: dict[str, str],
) -> None:
    """Store the content of every manifest entry via ``write_object``.

    Each entry maps a path relative to *workdir* to an object ID.  A path
    that no longer exists in *workdir* is logged as a warning and skipped.
    """
    for rel_path, object_id in manifest.items():
        source = workdir / rel_path
        if source.exists():
            write_object(repo_root, object_id, source.read_bytes())
        else:
            logger.warning("⚠️ Missing file in workdir: %s", rel_path)
264
265
266 # ---------------------------------------------------------------------------
267 # Branch ref helpers (direct file I/O — mirrors store.py internal logic)
268 # ---------------------------------------------------------------------------
269
270
def _refs_dir(repo_root: pathlib.Path) -> pathlib.Path:
    """Return ``<repo_root>/.muse/refs/heads``, where branch ref files live."""
    return repo_root.joinpath(".muse", "refs", "heads")
273
274
def _set_branch_head(
    repo_root: pathlib.Path, branch: str, commit_id: str
) -> None:
    """Write *commit_id* plus a newline to the ref file of *branch*.

    Missing parent directories of the ref file are created first.
    """
    target = _refs_dir(repo_root) / branch
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(f"{commit_id}\n")
281
282
def _get_branch_head(repo_root: pathlib.Path, branch: str) -> str | None:
    """Read the commit ID stored in the ref file of *branch*.

    Returns ``None`` when the ref file is missing or contains only
    whitespace.
    """
    ref_file = _refs_dir(repo_root) / branch
    if ref_file.exists():
        commit_id = ref_file.read_text().strip()
        return commit_id if commit_id else None
    return None
288
289
def _set_head_ref(repo_root: pathlib.Path, branch: str) -> None:
    """Make *branch* the current HEAD by delegating to ``write_head_branch``."""
    write_head_branch(repo_root, branch)
292
293
def _ensure_branch_exists(repo_root: pathlib.Path, branch: str) -> None:
    """Create an empty ref file for *branch* unless one already exists.

    An existing ref file is left untouched, so a branch that already points
    at a commit keeps its head.
    """
    ref_path = _refs_dir(repo_root) / branch
    # Create the ref's own parent directory (not just refs/heads) so branch
    # names containing "/" work, matching what _set_branch_head does.
    ref_path.parent.mkdir(parents=True, exist_ok=True)
    if not ref_path.exists():
        ref_path.write_text("")
299
300
301 # ---------------------------------------------------------------------------
302 # Core replay logic
303 # ---------------------------------------------------------------------------
304
305
def _replay_commit(
    repo_root: pathlib.Path,
    workdir: pathlib.Path,
    git_sha: str,
    muse_branch: str,
    parent_muse_id: str | None,
    meta: dict[str, str],
    repo_id: str,
    dry_run: bool,
) -> str:
    """Turn the tree currently in *workdir* into one Muse commit.

    The caller is responsible for populating *workdir* beforehand.  *meta*
    must provide the keys ``name``, ``email``, ``ts`` (Unix seconds as a
    string) and ``message``; an empty message falls back to the first 12
    characters of *git_sha*.

    With *dry_run* set, the commit ID is computed and logged but nothing is
    stored.  Otherwise the objects, the snapshot record and the commit record
    are written and the head of *muse_branch* is advanced.

    Returns:
        The Muse commit ID.
    """
    # Snapshot identity is derived purely from the manifest.
    manifest = _build_manifest(workdir)
    snapshot_id = compute_snapshot_id(manifest)

    # Carry the original Git metadata over; the timestamp is made UTC-aware.
    epoch_seconds = int(meta["ts"])
    committed_at = datetime.datetime.fromtimestamp(
        epoch_seconds, tz=datetime.timezone.utc
    )
    author = f"{meta['name']} <{meta['email']}>"
    message = meta["message"] or git_sha[:12]

    parent_ids = [] if not parent_muse_id else [parent_muse_id]
    commit_id = compute_commit_id(
        parent_ids=parent_ids,
        snapshot_id=snapshot_id,
        message=message,
        committed_at_iso=committed_at.isoformat(),
    )

    if dry_run:
        logger.info(
            "[dry-run] Would create commit %s (git: %s) on %s | %s",
            commit_id[:12],
            git_sha[:12],
            muse_branch,
            message[:60],
        )
        return commit_id

    # Persist in dependency order: objects, snapshot, commit, then the ref.
    _store_objects(repo_root, workdir, manifest)
    write_snapshot(
        repo_root, SnapshotRecord(snapshot_id=snapshot_id, manifest=manifest)
    )
    write_commit(
        repo_root,
        CommitRecord(
            commit_id=commit_id,
            repo_id=repo_id,
            branch=muse_branch,
            snapshot_id=snapshot_id,
            message=message,
            committed_at=committed_at,
            parent_commit_id=parent_muse_id,
            author=author,
        ),
    )
    _set_branch_head(repo_root, muse_branch, commit_id)

    return commit_id
376
377
def _replay_branch(
    repo_root: pathlib.Path,
    workdir: pathlib.Path,
    git_shas: list[str],
    muse_branch: str,
    start_parent_muse_id: str | None,
    repo_id: str,
    dry_run: bool,
    verbose: bool,
) -> dict[str, str]:
    """Replay a list of git SHAs (oldest first) onto *muse_branch*.

    Each commit becomes the parent of the next; the first one is parented on
    *start_parent_muse_id* (``None`` for a root commit).

    In dry-run mode nothing is written: the branch ref is not created and the
    tree is not extracted.  ``_replay_commit`` therefore sees *workdir*
    exactly as the caller left it, so the IDs it logs describe that content
    rather than the real commit tree.

    Args:
        repo_root: Repository root containing ``.muse/``.
        workdir: Scratch directory that receives each commit's tree.
        git_shas: Commits to replay, oldest first.
        muse_branch: Muse branch that receives the commits.
        start_parent_muse_id: Muse commit to parent the first replayed commit on.
        repo_id: Repository ID recorded in every commit.
        dry_run: Log what would happen without writing anything.
        verbose: Log every commit instead of the first, last and every tenth.

    Returns:
        A mapping of git_sha → muse_commit_id for every replayed commit.
    """
    # Creating the ref file is a write, so it must not happen on a dry run.
    if not dry_run:
        _ensure_branch_exists(repo_root, muse_branch)

    git_to_muse: dict[str, str] = {}
    parent_muse_id = start_parent_muse_id
    total = len(git_shas)

    for i, git_sha in enumerate(git_shas, 1):
        meta = _git_commit_meta(repo_root, git_sha)

        if verbose or i % 10 == 0 or i == 1 or i == total:
            logger.info(
                "[%s] %d/%d git:%s '%s'",
                muse_branch,
                i,
                total,
                git_sha[:12],
                meta["message"][:60],
            )

        # Populate the work directory with this commit's tree.
        if not dry_run:
            _extract_tree_to(repo_root, git_sha, workdir)

        muse_id = _replay_commit(
            repo_root=repo_root,
            workdir=workdir,
            git_sha=git_sha,
            muse_branch=muse_branch,
            parent_muse_id=parent_muse_id,
            meta=meta,
            repo_id=repo_id,
            dry_run=dry_run,
        )

        git_to_muse[git_sha] = muse_id
        # Chain: the commit just written parents the next one.
        parent_muse_id = muse_id

    return git_to_muse
430
431
432 # ---------------------------------------------------------------------------
433 # Entry point
434 # ---------------------------------------------------------------------------
435
436
def _load_repo_id(repo_root: pathlib.Path) -> str:
    """Return the ``repo_id`` value stored in ``<repo_root>/.muse/repo.json``.

    Raises:
        FileNotFoundError: If ``repo.json`` does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If the document has no ``repo_id`` key.
    """
    import json

    repo_json = repo_root / ".muse" / "repo.json"
    # Read as UTF-8 explicitly; the bare read_text() default follows the
    # locale and can misdecode non-ASCII content elsewhere in the document.
    data: dict[str, str] = json.loads(repo_json.read_text(encoding="utf-8"))
    return data["repo_id"]
442
443
def main(argv: list[str] | None = None) -> int:
    """Parse the command line and replay the requested Git branches into Muse.

    Phase 1 replays the non-merge commits of ``main``.  Phase 2 replays the
    non-merge commits of ``dev`` that are not reachable from ``main``,
    parented on the Muse commit of the oldest dev-only commit's Git parent
    when that parent was replayed in this run, otherwise on the current Muse
    ``main`` head.  With ``--dry-run`` no ref, HEAD or object is written.

    Args:
        argv: Arguments to parse; ``None`` lets argparse use ``sys.argv``.

    Returns:
        ``0`` on success, ``1`` when ``.muse/repo.json`` is missing under the
        repository root.
    """
    parser = argparse.ArgumentParser(
        description="Replay a Git commit graph into a Muse repository."
    )
    parser.add_argument(
        "--repo-root",
        type=pathlib.Path,
        default=_REPO_ROOT,
        help="Path to the repository root (default: parent of this script).",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Log what would happen without writing anything.",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Log every commit (default: log every 10 + first/last).",
    )
    parser.add_argument(
        "--branch",
        default="all",
        # Reject typos up front; an unknown value used to replay nothing and
        # still report success.
        choices=("main", "dev", "all"),
        help="Which git branch(es) to replay: 'main', 'dev', or 'all' (default).",
    )
    args = parser.parse_args(argv)

    logging.basicConfig(
        level=logging.INFO,
        format="%(levelname)s %(message)s",
    )

    repo_root: pathlib.Path = args.repo_root.resolve()
    dry_run: bool = args.dry_run
    verbose: bool = args.verbose
    branch_arg: str = args.branch

    # Verify .muse/ exists.
    if not (repo_root / ".muse" / "repo.json").exists():
        logger.error(
            "❌ No .muse/repo.json found in %s — run 'muse init' first.", repo_root
        )
        return 1

    repo_id = _load_repo_id(repo_root)
    logger.info("✅ Muse repo ID: %s", repo_id)

    # git SHA → Muse commit ID for everything replayed in this run.
    all_git_to_muse: dict[str, str] = {}

    # Use a temp directory for git archive extraction — the repo root IS the
    # working tree and must never be wiped between replays.
    with tempfile.TemporaryDirectory(prefix="git2muse-") as _tmpdir:
        workdir = pathlib.Path(_tmpdir)

        # -------------------------------------------------------------------
        # Phase 1: main branch
        # -------------------------------------------------------------------
        if branch_arg in ("main", "all"):
            logger.info("━━━ Phase 1: replaying main branch ━━━")
            main_shas = _git_commits_oldest_first(repo_root, "main")
            # Skip merge commits — they add no unique tree delta.
            main_shas = [
                s for s in main_shas
                if not _is_merge_commit(repo_root, s)
            ]
            logger.info(" %d non-merge commits on main", len(main_shas))

            # Moving HEAD is a write, so it is skipped on a dry run.
            if not dry_run:
                _set_head_ref(repo_root, "main")
            mapping = _replay_branch(
                repo_root=repo_root,
                workdir=workdir,
                git_shas=main_shas,
                muse_branch="main",
                start_parent_muse_id=None,
                repo_id=repo_id,
                dry_run=dry_run,
                verbose=verbose,
            )
            all_git_to_muse.update(mapping)
            logger.info("✅ main: %d commits written", len(mapping))

        # -------------------------------------------------------------------
        # Phase 2: dev branch (commits not reachable from main)
        # -------------------------------------------------------------------
        if branch_arg in ("dev", "all"):
            logger.info("━━━ Phase 2: replaying dev branch ━━━")
            dev_only_shas = _git_commits_oldest_first(
                repo_root, "dev", exclude_branches=["main"]
            )
            dev_only_shas = [
                s for s in dev_only_shas
                if not _is_merge_commit(repo_root, s)
            ]
            logger.info(" %d dev-only non-merge commits", len(dev_only_shas))

            if dev_only_shas:
                # Find the git parent of the oldest dev-only commit — it
                # should already be in all_git_to_muse (it's a main commit).
                oldest_dev_sha = dev_only_shas[0]
                git_parents = _git_parent_shas(repo_root, oldest_dev_sha)
                branch_parent_muse_id: str | None = None
                for gp in git_parents:
                    if gp in all_git_to_muse:
                        branch_parent_muse_id = all_git_to_muse[gp]
                        break
                if branch_parent_muse_id is None:
                    # Fall back to current main HEAD (e.g. when only dev is
                    # replayed, or the parent was a skipped merge commit).
                    branch_parent_muse_id = _get_branch_head(repo_root, "main")

                # Without this guard a dry run would leave HEAD on dev,
                # because the reset to main below is itself skipped.
                if not dry_run:
                    _set_head_ref(repo_root, "dev")
                mapping = _replay_branch(
                    repo_root=repo_root,
                    workdir=workdir,
                    git_shas=dev_only_shas,
                    muse_branch="dev",
                    start_parent_muse_id=branch_parent_muse_id,
                    repo_id=repo_id,
                    dry_run=dry_run,
                    verbose=verbose,
                )
                all_git_to_muse.update(mapping)
                logger.info("✅ dev: %d commits written", len(mapping))
            else:
                logger.info(" dev has no unique commits beyond main — skipping")

    # Leave HEAD pointing at main.
    if not dry_run:
        _set_head_ref(repo_root, "main")

    # Summary across every replayed branch.
    total_commits = len(all_git_to_muse)
    logger.info("━━━ Done ━━━ total Muse commits written: %d", total_commits)

    return 0
579
580
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())