gabriel / muse public
git2muse.py python
579 lines 18.5 KB
5392be66 fix(git2muse): use walk_workdir, drop hashlib, write reflog entries Gabriel Cardona <gabriel@tellurstori.com> 20h ago
1 """git2muse — Replay a Git commit graph into a Muse repository.
2
3 Usage
4 -----
5 ::
6
7 python tools/git2muse.py [--repo-root PATH] [--branch {main,dev,all}] [--dry-run] [--verbose]
8
9 Strategy
10 --------
11 1. Walk ``main`` branch commits oldest-first and create Muse commits on the
12 Muse ``main`` branch preserving the original author, timestamp, and message.
13 2. Walk ``dev`` branch commits oldest-first that are not already on ``main``
14 and replay them onto a Muse ``dev`` branch, branching from the correct
15 ancestor.
16 3. Skip merge commits (commits with more than one parent) — they carry no
17 unique file-state delta; the Muse DAG is reconstructed faithfully through
18 the parent chain on each branch.
19
20 For each Git commit the tool:
21 - Extracts the commit's file tree into a temporary working directory using ``git archive``.
22 - Removes files that Muse should not snapshot (build artefacts, caches, IDE
23 files, etc.) according to a hard-coded exclusion list that mirrors
24 ``.museignore``.
25 - Calls the Muse Python API directly (bypassing the CLI) so the original
26 Git author name, e-mail, and author timestamp are preserved verbatim in
27 the Muse ``CommitRecord``.
28 - Updates the Muse branch HEAD ref so the Muse repo tracks the same history.
29
30 After a successful run the Muse repo under ``.muse/`` contains a full code-
31 domain representation of the project history and is ready to push to MuseHub.
32 """
33
34 from __future__ import annotations
35
36 import argparse
37 import datetime
38 import logging
39 import pathlib
40 import shutil
41 import subprocess
42 import sys
43 import tarfile
44 import tempfile
45
# ---------------------------------------------------------------------------
# Bootstrap: make sure the project root is on sys.path so we can import muse
# even when running from the tools/ directory.
# ---------------------------------------------------------------------------
# The project root is taken to be the parent of the directory containing this
# script; it is also the default for the --repo-root option.
_REPO_ROOT = pathlib.Path(__file__).parent.parent
if str(_REPO_ROOT) not in sys.path:
    # Prepend so the root is searched before any other sys.path entry.
    sys.path.insert(0, str(_REPO_ROOT))
53
54 from muse.core.object_store import write_object
55 from muse.core.reflog import append_reflog
56 from muse.core.store import (
57 CommitRecord,
58 SnapshotRecord,
59 get_head_commit_id,
60 write_commit,
61 write_head_branch,
62 write_snapshot,
63 )
64 from muse.core.snapshot import compute_commit_id, compute_snapshot_id, walk_workdir
65
# Module-level logger; level and format are configured in main() through
# logging.basicConfig.
logger = logging.getLogger("git2muse")
67
68 # ---------------------------------------------------------------------------
69 # Files / dirs that should never end up in a Muse snapshot.
70 # These mirror .museignore + the hidden-directory exclusion in walk_workdir.
71 # ---------------------------------------------------------------------------
72
73 _EXCLUDE_PREFIXES: tuple[str, ...] = (
74 ".git/",
75 ".muse/",
76 ".muse",
77 ".venv/",
78 ".tox/",
79 ".mypy_cache/",
80 ".pytest_cache/",
81 ".hypothesis/",
82 ".github/",
83 ".DS_Store",
84 "artifacts/",
85 "__pycache__/",
86 )
87
88 _EXCLUDE_SUFFIXES: tuple[str, ...] = (
89 ".pyc",
90 ".pyo",
91 ".egg-info",
92 ".swp",
93 ".swo",
94 ".tmp",
95 "Thumbs.db",
96 ".DS_Store",
97 )
98
99
100 def _should_exclude(rel_path: str) -> bool:
101 """Return True if *rel_path* should be excluded from the Muse snapshot."""
102 for prefix in _EXCLUDE_PREFIXES:
103 if rel_path.startswith(prefix) or rel_path == prefix.rstrip("/"):
104 return True
105 for suffix in _EXCLUDE_SUFFIXES:
106 if rel_path.endswith(suffix):
107 return True
108 # Skip hidden files/dirs at the top level (mirrors walk_workdir behaviour).
109 first_component = rel_path.split("/")[0]
110 if first_component.startswith("."):
111 return True
112 return False
113
114
115 # ---------------------------------------------------------------------------
116 # Git helpers
117 # ---------------------------------------------------------------------------
118
119
def _git(repo_root: pathlib.Path, *args: str) -> str:
    """Invoke ``git`` with *args* inside *repo_root* and return trimmed stdout.

    Raises:
        subprocess.CalledProcessError: if git exits with a non-zero status
            (the call uses ``check=True``).
    """
    command = ["git"]
    command.extend(args)
    completed = subprocess.run(
        command,
        cwd=repo_root,
        check=True,
        text=True,
        capture_output=True,
    )
    return completed.stdout.strip()
130
131
def _git_commits_oldest_first(
    repo_root: pathlib.Path,
    branch: str,
    exclude_branches: list[str] | None = None,
) -> list[str]:
    """Return SHA1 hashes oldest-first for *branch*.

    Args:
        repo_root: Directory in which ``git log`` is run.
        branch: Revision whose history is listed.
        exclude_branches: When given, commits reachable from any of these
            branches are excluded (used to extract dev-only commits).

    Returns:
        Commit hashes in topological order, oldest first.
    """
    cmd = ["log", "--topo-order", "--reverse", "--format=%H", branch]
    # "^<rev>" removes everything reachable from <rev> from the listing.
    cmd.extend(f"^{excl}" for excl in exclude_branches or ())
    # End the revision list explicitly so a working-tree path that happens to
    # share a branch name cannot make the argument ambiguous to git.
    cmd.append("--")
    raw = _git(repo_root, *cmd)
    return [line for line in raw.splitlines() if line.strip()]
151
152
# Field delimiter for the custom ``git show`` format; chosen so it is unlikely
# to occur in a name or e-mail address.
_META_SEP = "|||GIT2MUSE|||"


def _git_commit_meta(repo_root: pathlib.Path, sha: str) -> dict[str, str]:
    """Look up author name, e-mail, author timestamp and message for *sha*.

    Returns:
        A dict with keys ``name``, ``email``, ``ts`` and ``message``, each
        value stripped of surrounding whitespace. If git's output cannot be
        split into four fields, a placeholder record is returned whose message
        is the abbreviated SHA.
    """
    placeholders = ("%an", "%ae", "%at", "%B")
    output = _git(
        repo_root,
        "show",
        "-s",
        "--format=" + _META_SEP.join(placeholders),
        sha,
    )
    # Split at most three times so a separator inside the message body stays
    # part of the message.
    fields = output.split(_META_SEP, 3)
    if len(fields) < 4:
        return {"name": "unknown", "email": "", "ts": "0", "message": sha[:12]}
    keys = ("name", "email", "ts", "message")
    return {key: value.strip() for key, value in zip(keys, fields)}
170
171
def _git_parent_shas(repo_root: pathlib.Path, sha: str) -> list[str]:
    """List the parent hashes of *sha*; a root commit yields an empty list."""
    # str.split() with no argument never produces empty strings, so blank
    # output simply becomes [].
    return _git(repo_root, "log", "-1", "--format=%P", sha).split()
176
177
def _is_merge_commit(repo_root: pathlib.Path, sha: str) -> bool:
    """Tell whether *sha* is a merge, i.e. has two or more parents."""
    parent_count = len(_git_parent_shas(repo_root, sha))
    return parent_count >= 2
180
181
def _extract_tree_to(
    repo_root: pathlib.Path,
    sha: str,
    dest: pathlib.Path,
) -> None:
    """Extract the git tree for *sha* into *dest*, applying exclusions.

    *dest* is removed and recreated first, so it contains only this commit's
    files afterwards. Only regular-file tar members are written; paths
    rejected by ``_should_exclude`` are skipped.

    Raises:
        subprocess.CalledProcessError: if ``git archive`` fails.
    """
    import io  # local import: only this helper needs an in-memory buffer

    # Wipe and recreate dest for a clean slate.
    if dest.exists():
        shutil.rmtree(dest)
    dest.mkdir(parents=True)

    # git archive produces a tar stream of the commit tree.
    archive = subprocess.run(
        ["git", "archive", "--format=tar", sha],
        cwd=repo_root,
        capture_output=True,
        check=True,
    )

    # The whole archive is already in memory, so read it from a buffer rather
    # than spilling it to a temporary file that could be left behind if the
    # write failed part-way.
    with tarfile.open(fileobj=io.BytesIO(archive.stdout), mode="r") as tf:
        for member in tf:
            if not member.isfile():
                continue
            # removeprefix strips only the literal "./" tar prefix, not
            # individual characters, so ".cursorignore" and ".github/" keep
            # their leading dot (lstrip("./") would have removed it).
            rel = member.name.removeprefix("./")
            if _should_exclude(rel):
                continue
            target = dest / rel
            target.parent.mkdir(parents=True, exist_ok=True)
            source = tf.extractfile(member)
            if source is not None:
                with source:
                    target.write_bytes(source.read())
222
223
224 # ---------------------------------------------------------------------------
225 # Muse snapshot helpers (bypass CLI to preserve git metadata)
226 # ---------------------------------------------------------------------------
227
228
def _build_manifest(workdir: pathlib.Path) -> dict[str, str]:
    """Produce the snapshot manifest for *workdir*.

    The work is handed to :func:`muse.core.snapshot.walk_workdir` rather than
    re-implemented here, so this tool applies whatever exclusion and path
    rules that walker applies and cannot drift away from it.
    """
    manifest = walk_workdir(workdir)
    return manifest
238
239
def _store_objects(
    repo_root: pathlib.Path,
    workdir: pathlib.Path,
    manifest: dict[str, str],
) -> None:
    """Copy every file named in *manifest* from *workdir* into the object store.

    Manifest keys are joined onto *workdir* to locate each file; the value is
    passed to ``write_object`` as the object ID. Entries whose file is absent
    are reported with a warning and skipped.
    """
    for rel_path, object_id in manifest.items():
        source = workdir / rel_path
        if source.exists():
            write_object(repo_root, object_id, source.read_bytes())
        else:
            logger.warning("⚠️ Missing file in workdir: %s", rel_path)
253
254
255 # ---------------------------------------------------------------------------
256 # Branch ref helpers (direct file I/O — mirrors store.py internal logic)
257 # ---------------------------------------------------------------------------
258
259
260 def _refs_dir(repo_root: pathlib.Path) -> pathlib.Path:
261 return repo_root / ".muse" / "refs" / "heads"
262
263
def _set_branch_head(
    repo_root: pathlib.Path, branch: str, commit_id: str
) -> None:
    """Point *branch* at *commit_id* by rewriting its ref file.

    Missing parent directories of the ref file are created first. The file
    holds the commit ID followed by a newline.
    """
    target = _refs_dir(repo_root) / branch
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(f"{commit_id}\n")
270
271
def _get_branch_head(repo_root: pathlib.Path, branch: str) -> str | None:
    """Read the commit ID stored in *branch*'s ref file.

    Returns ``None`` when the ref file does not exist or contains only
    whitespace.
    """
    ref_file = _refs_dir(repo_root) / branch
    if ref_file.exists():
        content = ref_file.read_text().strip()
        if content:
            return content
    return None
277
278
def _set_head_ref(repo_root: pathlib.Path, branch: str) -> None:
    """Record *branch* as the current HEAD branch via ``write_head_branch``."""
    write_head_branch(repo_root, branch)
281
282
def _ensure_branch_exists(repo_root: pathlib.Path, branch: str) -> None:
    """Create an empty ref file for *branch* if it has none yet.

    An existing ref file is left untouched, so a branch that already points
    at a commit keeps its head.
    """
    ref_path = _refs_dir(repo_root) / branch
    # Create the ref file's own parent (not just the refs root) so branch
    # names containing "/" work, matching _set_branch_head.
    ref_path.parent.mkdir(parents=True, exist_ok=True)
    if not ref_path.exists():
        ref_path.write_text("")
288
289
290 # ---------------------------------------------------------------------------
291 # Core replay logic
292 # ---------------------------------------------------------------------------
293
294
def _replay_commit(
    repo_root: pathlib.Path,
    workdir: pathlib.Path,
    git_sha: str,
    muse_branch: str,
    parent_muse_id: str | None,
    meta: dict[str, str],
    repo_id: str,
    dry_run: bool,
) -> str:
    """Turn one Git commit into a Muse commit and return the Muse commit ID.

    Args:
        repo_root: Root of the repository containing ``.muse/``.
        workdir: Directory holding this commit's files; the caller populates it.
        git_sha: Hash of the Git commit being replayed (used for logging and
            as the fallback message).
        muse_branch: Muse branch whose head is advanced.
        parent_muse_id: Muse ID of the parent commit, or ``None`` for a root.
        meta: Dict with ``name``, ``email``, ``ts`` (epoch seconds) and
            ``message`` entries.
        repo_id: Identifier stored on the commit record.
        dry_run: When true, only compute and log the ID; nothing is written.

    Returns:
        The computed Muse commit ID (also in dry-run mode).
    """
    short_sha = git_sha[:12]

    # The snapshot ID is derived from the manifest of the populated workdir.
    manifest = _build_manifest(workdir)
    snapshot_id = compute_snapshot_id(manifest)

    # Carry the original Git metadata over; the timestamp is interpreted as UTC.
    committed_at = datetime.datetime.fromtimestamp(
        int(meta["ts"]), tz=datetime.timezone.utc
    )
    author = f"{meta['name']} <{meta['email']}>"
    message = meta["message"] if meta["message"] else short_sha

    parent_ids: list[str] = []
    if parent_muse_id:
        parent_ids.append(parent_muse_id)

    commit_id = compute_commit_id(
        parent_ids=parent_ids,
        snapshot_id=snapshot_id,
        message=message,
        committed_at_iso=committed_at.isoformat(),
    )

    if dry_run:
        logger.info(
            "[dry-run] Would create commit %s (git: %s) on %s | %s",
            commit_id[:12],
            short_sha,
            muse_branch,
            message[:60],
        )
        return commit_id

    # Persist in dependency order: file objects, then the snapshot that
    # references them, then the commit that references the snapshot.
    _store_objects(repo_root, workdir, manifest)
    write_snapshot(
        repo_root,
        SnapshotRecord(snapshot_id=snapshot_id, manifest=manifest),
    )
    write_commit(
        repo_root,
        CommitRecord(
            commit_id=commit_id,
            repo_id=repo_id,
            branch=muse_branch,
            snapshot_id=snapshot_id,
            message=message,
            committed_at=committed_at,
            parent_commit_id=parent_muse_id,
            author=author,
        ),
    )

    # Move the branch head last, and log the move so `muse reflog` works.
    _set_branch_head(repo_root, muse_branch, commit_id)
    append_reflog(
        repo_root,
        muse_branch,
        old_id=parent_muse_id,
        new_id=commit_id,
        author=author,
        operation=f"git2muse: {message[:60]}",
    )

    return commit_id
373
374
def _replay_branch(
    repo_root: pathlib.Path,
    workdir: pathlib.Path,
    git_shas: list[str],
    muse_branch: str,
    start_parent_muse_id: str | None,
    repo_id: str,
    dry_run: bool,
    verbose: bool,
) -> dict[str, str]:
    """Replay a list of git SHAs (oldest first) onto *muse_branch*.

    Args:
        repo_root: Root of the repository containing ``.git`` and ``.muse``.
        workdir: Scratch directory each commit's tree is extracted into.
        git_shas: Git commit hashes to replay, oldest first.
        muse_branch: Name of the Muse branch receiving the commits.
        start_parent_muse_id: Muse commit ID used as parent of the first
            replayed commit, or ``None`` to start a new root.
        repo_id: Identifier stored on every commit record.
        dry_run: When true, nothing is written to the Muse repository.
        verbose: When true, log every commit instead of every tenth.

    Returns:
        A mapping of git_sha → muse_commit_id for every replayed commit.
    """
    # Creating the ref file is a write to .muse/, so skip it on a dry run.
    if not dry_run:
        _ensure_branch_exists(repo_root, muse_branch)

    git_to_muse: dict[str, str] = {}
    parent_muse_id = start_parent_muse_id
    total = len(git_shas)

    for i, git_sha in enumerate(git_shas, 1):
        meta = _git_commit_meta(repo_root, git_sha)

        # Without --verbose, log only the first, last and every tenth commit.
        if verbose or i % 10 == 0 or i == 1 or i == total:
            logger.info(
                "[%s] %d/%d git:%s '%s'",
                muse_branch,
                i,
                total,
                git_sha[:12],
                meta["message"][:60],
            )

        # Populate the scratch workdir with this commit's tree.
        # NOTE(review): on a dry run the tree is not extracted, so
        # _replay_commit builds its manifest from whatever workdir already
        # holds; the IDs it logs then presumably differ from a real run's.
        if not dry_run:
            _extract_tree_to(repo_root, git_sha, workdir)

        muse_id = _replay_commit(
            repo_root=repo_root,
            workdir=workdir,
            git_sha=git_sha,
            muse_branch=muse_branch,
            parent_muse_id=parent_muse_id,
            meta=meta,
            repo_id=repo_id,
            dry_run=dry_run,
        )

        git_to_muse[git_sha] = muse_id
        # Chain linearly: each replayed commit becomes the next one's parent.
        parent_muse_id = muse_id

    return git_to_muse
427
428
429 # ---------------------------------------------------------------------------
430 # Entry point
431 # ---------------------------------------------------------------------------
432
433
434 def _load_repo_id(repo_root: pathlib.Path) -> str:
435 import json
436 repo_json = repo_root / ".muse" / "repo.json"
437 data: dict[str, str] = json.loads(repo_json.read_text())
438 return data["repo_id"]
439
440
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point: replay Git ``main``/``dev`` history into Muse.

    Args:
        argv: Arguments to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        ``0`` on success, ``1`` when ``.muse/repo.json`` is missing under the
        repository root.
    """
    parser = argparse.ArgumentParser(
        description="Replay a Git commit graph into a Muse repository."
    )
    parser.add_argument(
        "--repo-root",
        type=pathlib.Path,
        default=_REPO_ROOT,
        help="Path to the repository root (default: parent of this script).",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Log what would happen without writing anything.",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Log every commit (default: log every 10 + first/last).",
    )
    parser.add_argument(
        "--branch",
        default="all",
        # Reject unknown values up front; otherwise neither phase would run
        # and the tool would report success having done nothing.
        choices=("main", "dev", "all"),
        help="Which git branch(es) to replay: 'main', 'dev', or 'all' (default).",
    )
    args = parser.parse_args(argv)

    logging.basicConfig(
        level=logging.INFO,
        format="%(levelname)s %(message)s",
    )

    repo_root: pathlib.Path = args.repo_root.resolve()
    dry_run: bool = args.dry_run
    verbose: bool = args.verbose
    branch_arg: str = args.branch

    # Verify .muse/ exists.
    if not (repo_root / ".muse" / "repo.json").exists():
        logger.error(
            "❌ No .muse/repo.json found in %s — run 'muse init' first.", repo_root
        )
        return 1

    repo_id = _load_repo_id(repo_root)
    logger.info("✅ Muse repo ID: %s", repo_id)

    # Git SHA → Muse commit ID for everything replayed in this run.
    all_git_to_muse: dict[str, str] = {}

    # Use a temp directory for git archive extraction — the repo root IS the
    # working tree and must never be wiped between replays.
    with tempfile.TemporaryDirectory(prefix="git2muse-") as _tmpdir:
        workdir = pathlib.Path(_tmpdir)

        # -------------------------------------------------------------------
        # Phase 1: main branch
        # -------------------------------------------------------------------
        if branch_arg in ("main", "all"):
            logger.info("━━━ Phase 1: replaying main branch ━━━")
            main_shas = _git_commits_oldest_first(repo_root, "main")
            # Skip merge commits — they add no unique tree delta.
            main_shas = [
                s for s in main_shas
                if not _is_merge_commit(repo_root, s)
            ]
            logger.info(" %d non-merge commits on main", len(main_shas))

            # Switching HEAD is a write; a dry run must leave .muse/ untouched.
            if not dry_run:
                _set_head_ref(repo_root, "main")
            mapping = _replay_branch(
                repo_root=repo_root,
                workdir=workdir,
                git_shas=main_shas,
                muse_branch="main",
                start_parent_muse_id=None,
                repo_id=repo_id,
                dry_run=dry_run,
                verbose=verbose,
            )
            all_git_to_muse.update(mapping)
            logger.info("✅ main: %d commits written", len(mapping))

        # -------------------------------------------------------------------
        # Phase 2: dev branch (commits not reachable from main)
        # -------------------------------------------------------------------
        if branch_arg in ("dev", "all"):
            logger.info("━━━ Phase 2: replaying dev branch ━━━")
            dev_only_shas = _git_commits_oldest_first(
                repo_root, "dev", exclude_branches=["main"]
            )
            dev_only_shas = [
                s for s in dev_only_shas
                if not _is_merge_commit(repo_root, s)
            ]
            logger.info(" %d dev-only non-merge commits", len(dev_only_shas))

            if dev_only_shas:
                # Find the git parent of the oldest dev-only commit — it should
                # already be in all_git_to_muse (it's a main commit).
                oldest_dev_sha = dev_only_shas[0]
                git_parents = _git_parent_shas(repo_root, oldest_dev_sha)
                branch_parent_muse_id: str | None = None
                for gp in git_parents:
                    if gp in all_git_to_muse:
                        branch_parent_muse_id = all_git_to_muse[gp]
                        break
                if branch_parent_muse_id is None:
                    # Fall back to current main HEAD. This is the path taken
                    # when main was not replayed in this run (--branch dev),
                    # because the mapping is then empty.
                    branch_parent_muse_id = _get_branch_head(repo_root, "main")

                if not dry_run:
                    _set_head_ref(repo_root, "dev")
                mapping = _replay_branch(
                    repo_root=repo_root,
                    workdir=workdir,
                    git_shas=dev_only_shas,
                    muse_branch="dev",
                    start_parent_muse_id=branch_parent_muse_id,
                    repo_id=repo_id,
                    dry_run=dry_run,
                    verbose=verbose,
                )
                all_git_to_muse.update(mapping)
                logger.info("✅ dev: %d commits written", len(mapping))
            else:
                logger.info(" dev has no unique commits beyond main — skipping")

    # Leave HEAD pointing at main.
    if not dry_run:
        _set_head_ref(repo_root, "main")

    # Summary.
    total_count = len(all_git_to_muse)
    logger.info("━━━ Done ━━━ total Muse commits written: %d", total_count)

    return 0
576
577
# Run the CLI only when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())