cgcardona / muse public
git2muse.py python
579 lines 18.2 KB
b105da0f fix(git2muse): use temp dir for extraction — repo root is now the worki… Gabriel Cardona <gabriel@tellurstori.com> 6h ago
1 """git2muse — Replay a Git commit graph into a Muse repository.
2
3 Usage
4 -----
5 ::
6
7 python tools/git2muse.py [--repo-root PATH] [--branch BRANCH] [--dry-run] [--verbose]
8
9 Strategy
10 --------
11 1. Walk ``main`` branch commits oldest-first and create Muse commits on the
12 Muse ``main`` branch preserving the original author, timestamp, and message.
13 2. Walk ``dev`` branch commits oldest-first that are not already on ``main``
14 and replay them onto a Muse ``dev`` branch, branching from the correct
15 ancestor.
16 3. Skip merge commits (commits with more than one parent) — they carry no
17 unique file-state delta; the Muse DAG is reconstructed faithfully through
18 the parent chain on each branch.
19
20 For each Git commit the tool:
21 - Extracts the commit's file tree into a temporary directory using ``git archive``.
22 - Removes files that Muse should not snapshot (build artefacts, caches, IDE
23 files, etc.) according to a hard-coded exclusion list that mirrors
24 ``.museignore``.
25 - Calls the Muse Python API directly (bypassing the CLI) so the original
26 Git author name, e-mail, and committer timestamp are preserved verbatim in
27 the Muse ``CommitRecord``.
28 - Updates the Muse branch HEAD ref so the Muse repo tracks the same history.
29
30 After a successful run the Muse repo under ``.muse/`` contains a full code-
31 domain representation of the project history and is ready to push to MuseHub.
32 """
33
34 from __future__ import annotations
35
36 import argparse
37 import datetime
38 import hashlib
39 import logging
40 import pathlib
41 import shutil
42 import subprocess
43 import sys
44 import tarfile
45 import tempfile
46
# ---------------------------------------------------------------------------
# Bootstrap: make sure the project root is on sys.path so we can import muse
# even when running from the tools/ directory.
# ---------------------------------------------------------------------------
# Two directory levels above this file; also the default for ``--repo-root``.
_REPO_ROOT = pathlib.Path(__file__).parent.parent
if str(_REPO_ROOT) not in sys.path:
    # Prepend (index 0) so this entry is searched before the rest of sys.path.
    sys.path.insert(0, str(_REPO_ROOT))
54
55 from muse.core.object_store import write_object
56 from muse.core.store import (
57 CommitRecord,
58 SnapshotRecord,
59 get_head_commit_id,
60 write_commit,
61 write_snapshot,
62 )
63 from muse.core.snapshot import compute_commit_id, compute_snapshot_id
64
# Module-level logger; handlers/level are configured in ``main`` via
# ``logging.basicConfig``.
logger = logging.getLogger("git2muse")
66
# ---------------------------------------------------------------------------
# Files / dirs that should never end up in a Muse snapshot.
# These mirror .museignore + the hidden-directory exclusion in walk_workdir.
# ---------------------------------------------------------------------------

# Matched by ``_should_exclude`` with ``str.startswith`` against the
# "/"-separated path relative to the tree root, so an entry only matches at
# the top level of the tree.  For each entry the bare name without the
# trailing "/" is matched exactly as well.
_EXCLUDE_PREFIXES: tuple[str, ...] = (
    ".git/",
    ".muse/",
    ".muse",
    ".venv/",
    ".tox/",
    ".mypy_cache/",
    ".pytest_cache/",
    ".hypothesis/",
    ".github/",
    ".DS_Store",
    "artifacts/",
    "__pycache__/",
)

# Matched by ``_should_exclude`` with ``str.endswith`` against the whole
# relative path, so these apply at any depth.  Note that a suffix only matches
# the path's final characters (e.g. ".egg-info" does not match files *inside*
# an ``*.egg-info/`` directory).
_EXCLUDE_SUFFIXES: tuple[str, ...] = (
    ".pyc",
    ".pyo",
    ".egg-info",
    ".swp",
    ".swo",
    ".tmp",
    "Thumbs.db",
    ".DS_Store",
)
97
98
def _should_exclude(rel_path: str) -> bool:
    """Tell whether *rel_path* must be kept out of the Muse snapshot.

    A path is excluded when any of the following holds:

    - it starts with an entry of ``_EXCLUDE_PREFIXES``, or equals such an
      entry with its trailing ``/`` removed;
    - it ends with an entry of ``_EXCLUDE_SUFFIXES``;
    - its first ``/``-separated component begins with a dot (hidden files
      and directories at the top level, mirroring walk_workdir behaviour).
    """
    # str.startswith / str.endswith accept a tuple of alternatives.
    if rel_path.startswith(_EXCLUDE_PREFIXES):
        return True
    if rel_path in {entry.rstrip("/") for entry in _EXCLUDE_PREFIXES}:
        return True
    if rel_path.endswith(_EXCLUDE_SUFFIXES):
        return True
    top_level_name = rel_path.partition("/")[0]
    return top_level_name.startswith(".")
112
113
114 # ---------------------------------------------------------------------------
115 # Git helpers
116 # ---------------------------------------------------------------------------
117
118
def _git(repo_root: pathlib.Path, *args: str) -> str:
    """Invoke ``git`` with *args* inside *repo_root* and return its output.

    Standard output is captured as text and returned with surrounding
    whitespace stripped.  A non-zero exit status raises
    ``subprocess.CalledProcessError`` (``check=True``).
    """
    command = ["git"]
    command.extend(args)
    completed = subprocess.run(
        command,
        cwd=repo_root,
        capture_output=True,
        text=True,
        check=True,
    )
    output = completed.stdout
    return output.strip()
129
130
def _git_commits_oldest_first(
    repo_root: pathlib.Path,
    branch: str,
    exclude_branches: list[str] | None = None,
) -> list[str]:
    """List the commit hashes of *branch*, oldest first.

    Runs ``git log --topo-order --reverse --format=%H`` on *branch*.  Each
    name in *exclude_branches* is appended as ``^name`` so that commits
    reachable from it are left out (used to obtain dev-only commits).
    """
    log_args = ["log", "--topo-order", "--reverse", "--format=%H", branch]
    # A None/empty exclusion list contributes no extra arguments.
    log_args.extend(f"^{name}" for name in (exclude_branches or ()))
    output = _git(repo_root, *log_args)
    return [sha for sha in output.splitlines() if sha.strip()]
150
151
# Delimiter inserted between the ``git show`` format fields in
# ``_git_commit_meta``; presumably chosen because it is unlikely to occur in
# an author name, e-mail, or commit message.
_META_SEP = "|||GIT2MUSE|||"
153
154
def _git_commit_meta(repo_root: pathlib.Path, sha: str) -> dict[str, str]:
    """Fetch author name, e-mail, author timestamp and message for *sha*.

    The result has the keys ``name``, ``email``, ``ts`` (the ``%at`` Unix
    timestamp as a string) and ``message`` (the ``%B`` raw body), each
    stripped of surrounding whitespace.  If the output cannot be split into
    four fields, placeholder values are returned with the abbreviated hash
    as the message.
    """
    pretty = _META_SEP.join(("%an", "%ae", "%at", "%B"))
    output = _git(repo_root, "show", "-s", f"--format={pretty}", sha)
    # maxsplit=3 keeps any separator text inside the message intact.
    pieces = output.split(_META_SEP, 3)
    if len(pieces) < 4:
        return {"name": "unknown", "email": "", "ts": "0", "message": sha[:12]}
    keys = ("name", "email", "ts", "message")
    return {key: piece.strip() for key, piece in zip(keys, pieces)}
169
170
def _git_parent_shas(repo_root: pathlib.Path, sha: str) -> list[str]:
    """List the parent hashes of *sha*; a root commit yields an empty list."""
    parents_line = _git(repo_root, "log", "-1", "--format=%P", sha)
    # Whitespace splitting never produces empty strings, so no filter needed.
    return parents_line.split()
175
176
def _is_merge_commit(repo_root: pathlib.Path, sha: str) -> bool:
    """Report whether *sha* has more than one parent."""
    parent_count = len(_git_parent_shas(repo_root, sha))
    return parent_count > 1
179
180
def _extract_tree_to(
    repo_root: pathlib.Path,
    sha: str,
    dest: pathlib.Path,
) -> None:
    """Extract the git tree for *sha* into *dest*, applying exclusions.

    *dest* is deleted and recreated first, so it must be a scratch directory.
    Only regular-file archive members are written; every other member type
    is skipped, and parent directories are created on demand.  Paths for
    which ``_should_exclude`` returns True are not written.

    Raises:
        subprocess.CalledProcessError: if ``git archive`` exits non-zero.
    """
    # Wipe and recreate dest for a clean slate.
    if dest.exists():
        shutil.rmtree(dest)
    dest.mkdir(parents=True)

    # git archive produces a tar stream of the commit tree.
    archive = subprocess.run(
        ["git", "archive", "--format=tar", sha],
        cwd=repo_root,
        capture_output=True,
        check=True,
    )
    with tempfile.NamedTemporaryFile(suffix=".tar", delete=False) as tmp:
        tmp.write(archive.stdout)
        tmp_path = pathlib.Path(tmp.name)

    try:
        with tarfile.open(tmp_path) as tf:
            for member in tf.getmembers():
                if not member.isfile():
                    continue
                # Drop only a literal "./" prefix.  str.lstrip("./") would
                # strip *any* run of leading "." and "/" characters, turning
                # ".github/ci.yml" into "github/ci.yml" (and ".gitignore"
                # into "gitignore"), which defeats the hidden-path
                # exclusions and stores files under the wrong name.
                rel = member.name
                if rel.startswith("./"):
                    rel = rel[2:]
                if _should_exclude(rel):
                    continue
                target = dest / rel
                target.parent.mkdir(parents=True, exist_ok=True)
                f = tf.extractfile(member)
                if f is not None:
                    target.write_bytes(f.read())
    finally:
        # delete=False above means the temp archive must be removed by hand.
        tmp_path.unlink(missing_ok=True)
218
219
220 # ---------------------------------------------------------------------------
221 # Muse snapshot helpers (bypass CLI to preserve git metadata)
222 # ---------------------------------------------------------------------------
223
224
def _sha256_file(path: pathlib.Path) -> str:
    """Return the hexadecimal SHA-256 digest of the bytes stored at *path*."""
    payload = path.read_bytes()
    return hashlib.sha256(payload).hexdigest()
229
230
def _build_manifest(workdir: pathlib.Path) -> dict[str, str]:
    """Walk *workdir* and return a ``{rel_path: sha256}`` manifest.

    Keys are paths relative to *workdir* written with ``/`` separators on
    every platform, so the manifest does not depend on the host OS.
    Symlinks, non-files, and anything whose first path component starts
    with a dot are skipped.  Entries are inserted in sorted path order.
    """
    manifest: dict[str, str] = {}
    for fpath in sorted(workdir.rglob("*")):
        if fpath.is_symlink() or not fpath.is_file():
            continue
        # as_posix() rather than str(): str() yields "\\"-separated paths on
        # Windows, which would break the "/" split below and make manifest
        # keys platform-dependent.
        rel = fpath.relative_to(workdir).as_posix()
        if rel.split("/", 1)[0].startswith("."):
            continue
        manifest[rel] = _sha256_file(fpath)
    return manifest
245
246
def _store_objects(
    repo_root: pathlib.Path,
    workdir: pathlib.Path,
    manifest: dict[str, str],
) -> None:
    """Hand every file listed in *manifest* to ``write_object``.

    For each ``rel_path -> object id`` entry the file is read from
    *workdir* and passed, together with its id, to ``write_object``.  An
    entry whose file is no longer present is logged as a warning and
    skipped.
    """
    for rel, oid in manifest.items():
        source = workdir / rel
        if source.exists():
            write_object(repo_root, oid, source.read_bytes())
        else:
            logger.warning("⚠️ Missing file in workdir: %s", rel)
260
261
262 # ---------------------------------------------------------------------------
263 # Branch ref helpers (direct file I/O — mirrors store.py internal logic)
264 # ---------------------------------------------------------------------------
265
266
def _refs_dir(repo_root: pathlib.Path) -> pathlib.Path:
    """Return the ``.muse/refs/heads`` directory beneath *repo_root*."""
    return repo_root.joinpath(".muse", "refs", "heads")
269
270
def _set_branch_head(
    repo_root: pathlib.Path, branch: str, commit_id: str
) -> None:
    """Write *commit_id* (newline-terminated) to the ref file for *branch*.

    The ref file's parent directory is created when missing.
    """
    target = _refs_dir(repo_root) / branch
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(f"{commit_id}\n")
277
278
def _get_branch_head(repo_root: pathlib.Path, branch: str) -> str | None:
    """Read the commit ID stored in the ref file for *branch*.

    Returns ``None`` when the ref file is absent or holds only whitespace.
    """
    ref_file = _refs_dir(repo_root) / branch
    if ref_file.exists():
        stored = ref_file.read_text().strip()
        if stored:
            return stored
    return None
284
285
def _set_head_ref(repo_root: pathlib.Path, branch: str) -> None:
    """Point ``.muse/HEAD`` at *branch* by writing ``refs/heads/<branch>``."""
    head_file = repo_root.joinpath(".muse", "HEAD")
    head_file.write_text("refs/heads/" + branch + "\n")
289
290
def _ensure_branch_exists(repo_root: pathlib.Path, branch: str) -> None:
    """Create an empty ref file for *branch* unless one already exists.

    An existing ref file is left untouched.  The ref file's own parent
    directory is created (not just ``refs/heads``), so a branch name that
    contains ``/`` such as ``feature/x`` works here the same way it does in
    ``_set_branch_head``.
    """
    ref_path = _refs_dir(repo_root) / branch
    ref_path.parent.mkdir(parents=True, exist_ok=True)
    if not ref_path.exists():
        ref_path.write_text("")
296
297
298 # ---------------------------------------------------------------------------
299 # Core replay logic
300 # ---------------------------------------------------------------------------
301
302
def _replay_commit(
    repo_root: pathlib.Path,
    workdir: pathlib.Path,
    git_sha: str,
    muse_branch: str,
    parent_muse_id: str | None,
    meta: dict[str, str],
    repo_id: str,
    dry_run: bool,
) -> str:
    """Turn the tree currently in *workdir* into one Muse commit.

    The caller is expected to have populated *workdir* already.  *meta*
    supplies the Git author name, e-mail, Unix timestamp and message (keys
    ``name``, ``email``, ``ts``, ``message``); an empty message falls back
    to the abbreviated *git_sha*.

    With *dry_run* set, the commit ID is computed and logged but nothing is
    written.  Otherwise objects, the snapshot record and the commit record
    are written, and the ref for *muse_branch* is advanced.

    Returns the new Muse commit ID.
    """
    # Snapshot identity comes from the files present in workdir.
    file_manifest = _build_manifest(workdir)
    snap_id = compute_snapshot_id(file_manifest)

    # Carry the original Git metadata over (timestamp interpreted as UTC).
    when = datetime.datetime.fromtimestamp(
        int(meta["ts"]), tz=datetime.timezone.utc
    )
    author_line = f"{meta['name']} <{meta['email']}>"
    commit_message = meta["message"] or git_sha[:12]

    parents = [parent_muse_id] if parent_muse_id else []
    new_commit_id = compute_commit_id(
        parent_ids=parents,
        snapshot_id=snap_id,
        message=commit_message,
        committed_at_iso=when.isoformat(),
    )

    if dry_run:
        logger.info(
            "[dry-run] Would create commit %s (git: %s) on %s | %s",
            new_commit_id[:12],
            git_sha[:12],
            muse_branch,
            commit_message[:60],
        )
        return new_commit_id

    # Persist in dependency order: objects, then snapshot, then commit.
    _store_objects(repo_root, workdir, file_manifest)
    write_snapshot(
        repo_root,
        SnapshotRecord(snapshot_id=snap_id, manifest=file_manifest),
    )
    write_commit(
        repo_root,
        CommitRecord(
            commit_id=new_commit_id,
            repo_id=repo_id,
            branch=muse_branch,
            snapshot_id=snap_id,
            message=commit_message,
            committed_at=when,
            parent_commit_id=parent_muse_id,
            author=author_line,
        ),
    )

    # Finally move the branch ref to the commit just written.
    _set_branch_head(repo_root, muse_branch, new_commit_id)
    return new_commit_id
373
374
def _replay_branch(
    repo_root: pathlib.Path,
    workdir: pathlib.Path,
    git_shas: list[str],
    muse_branch: str,
    start_parent_muse_id: str | None,
    repo_id: str,
    dry_run: bool,
    verbose: bool,
) -> dict[str, str]:
    """Replay a list of git SHAs (oldest first) onto *muse_branch*.

    Each commit is chained onto the previous one in *git_shas*; the first
    is chained onto *start_parent_muse_id* (``None`` for a root commit).
    *workdir* is a scratch directory that is wiped and refilled for every
    commit.  Progress is logged for every commit when *verbose* is set,
    otherwise for the first, the last, and every tenth commit.

    With *dry_run* set nothing is written: the branch ref is not created
    and no tree is extracted, so commit IDs are computed from whatever
    *workdir* already contains.

    Returns a mapping of git_sha → muse_commit_id for every replayed commit.
    """
    # Creating the ref file is a write, so it must not happen in dry-run.
    if not dry_run:
        _ensure_branch_exists(repo_root, muse_branch)

    git_to_muse: dict[str, str] = {}
    parent_muse_id = start_parent_muse_id
    total = len(git_shas)

    for i, git_sha in enumerate(git_shas, 1):
        meta = _git_commit_meta(repo_root, git_sha)

        if verbose or i % 10 == 0 or i == 1 or i == total:
            logger.info(
                "[%s] %d/%d git:%s '%s'",
                muse_branch,
                i,
                total,
                git_sha[:12],
                meta["message"][:60],
            )

        # Populate the scratch workdir with this commit's tree.
        if not dry_run:
            _extract_tree_to(repo_root, git_sha, workdir)

        muse_id = _replay_commit(
            repo_root=repo_root,
            workdir=workdir,
            git_sha=git_sha,
            muse_branch=muse_branch,
            parent_muse_id=parent_muse_id,
            meta=meta,
            repo_id=repo_id,
            dry_run=dry_run,
        )

        git_to_muse[git_sha] = muse_id
        # The commit just replayed becomes the parent of the next one.
        parent_muse_id = muse_id

    return git_to_muse
427
428
429 # ---------------------------------------------------------------------------
430 # Entry point
431 # ---------------------------------------------------------------------------
432
433
def _load_repo_id(repo_root: pathlib.Path) -> str:
    """Read the ``repo_id`` field from ``.muse/repo.json`` under *repo_root*."""
    import json

    config_path = repo_root.joinpath(".muse", "repo.json")
    with open(config_path) as handle:
        payload: dict[str, str] = json.load(handle)
    return payload["repo_id"]
439
440
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point: replay Git history into the Muse repo.

    Parses *argv* (``sys.argv[1:]`` when ``None``), checks that
    ``.muse/repo.json`` exists, then replays ``main`` and/or the dev-only
    commits of ``dev`` according to ``--branch``.  Merge commits are
    skipped on both branches.

    With ``--dry-run`` nothing under ``.muse/`` is modified by this
    function, including ``HEAD``.

    Returns the process exit status: 0 on success, 1 when the Muse repo is
    not initialised.
    """
    parser = argparse.ArgumentParser(
        description="Replay a Git commit graph into a Muse repository."
    )
    parser.add_argument(
        "--repo-root",
        type=pathlib.Path,
        default=_REPO_ROOT,
        help="Path to the repository root (default: parent of this script).",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Log what would happen without writing anything.",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Log every commit (default: log every 10 + first/last).",
    )
    parser.add_argument(
        "--branch",
        default="all",
        # Reject typos up front; an unknown value used to replay nothing
        # and still report success.
        choices=("main", "dev", "all"),
        help="Which git branch(es) to replay: 'main', 'dev', or 'all' (default).",
    )
    args = parser.parse_args(argv)

    logging.basicConfig(
        level=logging.INFO,
        format="%(levelname)s %(message)s",
    )

    repo_root: pathlib.Path = args.repo_root.resolve()
    dry_run: bool = args.dry_run
    verbose: bool = args.verbose
    branch_arg: str = args.branch

    # Verify .muse/ exists.
    if not (repo_root / ".muse" / "repo.json").exists():
        logger.error(
            "❌ No .muse/repo.json found in %s — run 'muse init' first.", repo_root
        )
        return 1

    repo_id = _load_repo_id(repo_root)
    logger.info("✅ Muse repo ID: %s", repo_id)

    # Use a temp directory for git archive extraction — the repo root IS the
    # working tree and must never be wiped between replays.
    with tempfile.TemporaryDirectory(prefix="git2muse-") as _tmpdir:
        workdir = pathlib.Path(_tmpdir)

        # -------------------------------------------------------------------
        # Phase 1: main branch
        # -------------------------------------------------------------------
        all_git_to_muse: dict[str, str] = {}

        if branch_arg in ("main", "all"):
            logger.info("━━━ Phase 1: replaying main branch ━━━")
            main_shas = _git_commits_oldest_first(repo_root, "main")
            # Skip merge commits — they add no unique tree delta.
            main_shas = [
                s for s in main_shas
                if not _is_merge_commit(repo_root, s)
            ]
            logger.info(" %d non-merge commits on main", len(main_shas))

            # HEAD is repo state: leave it alone during a dry run.
            if not dry_run:
                _set_head_ref(repo_root, "main")
            mapping = _replay_branch(
                repo_root=repo_root,
                workdir=workdir,
                git_shas=main_shas,
                muse_branch="main",
                start_parent_muse_id=None,
                repo_id=repo_id,
                dry_run=dry_run,
                verbose=verbose,
            )
            all_git_to_muse.update(mapping)
            logger.info("✅ main: %d commits written", len(mapping))

        # -------------------------------------------------------------------
        # Phase 2: dev branch (commits not reachable from main)
        # -------------------------------------------------------------------
        if branch_arg in ("dev", "all"):
            logger.info("━━━ Phase 2: replaying dev branch ━━━")
            dev_only_shas = _git_commits_oldest_first(
                repo_root, "dev", exclude_branches=["main"]
            )
            dev_only_shas = [
                s for s in dev_only_shas
                if not _is_merge_commit(repo_root, s)
            ]
            logger.info(" %d dev-only non-merge commits", len(dev_only_shas))

            if dev_only_shas:
                # Find the git parent of the oldest dev-only commit — it should
                # already be in all_git_to_muse (it's a main commit).
                oldest_dev_sha = dev_only_shas[0]
                git_parents = _git_parent_shas(repo_root, oldest_dev_sha)
                branch_parent_muse_id: str | None = None
                for gp in git_parents:
                    if gp in all_git_to_muse:
                        branch_parent_muse_id = all_git_to_muse[gp]
                        break
                if branch_parent_muse_id is None:
                    # Fall back to current main HEAD.
                    branch_parent_muse_id = _get_branch_head(repo_root, "main")

                # Same rule as phase 1.  Previously a dry run moved HEAD to
                # dev here and never moved it back.
                if not dry_run:
                    _set_head_ref(repo_root, "dev")
                mapping = _replay_branch(
                    repo_root=repo_root,
                    workdir=workdir,
                    git_shas=dev_only_shas,
                    muse_branch="dev",
                    start_parent_muse_id=branch_parent_muse_id,
                    repo_id=repo_id,
                    dry_run=dry_run,
                    verbose=verbose,
                )
                all_git_to_muse.update(mapping)
                logger.info("✅ dev: %d commits written", len(mapping))
            else:
                logger.info(" dev has no unique commits beyond main — skipping")

        # Leave HEAD pointing at main.
        if not dry_run:
            _set_head_ref(repo_root, "main")

        # Summary.
        total_commits = len(all_git_to_muse)
        logger.info("━━━ Done ━━━ total Muse commits written: %d", total_commits)

    return 0


if __name__ == "__main__":
    sys.exit(main())
576
577
578 if __name__ == "__main__":
579 sys.exit(main())