muse/plugins/code/_refactor_classify.py · gabriel/muse

_refactor_classify.py python

229 lines 8.6 KB

d855b718 refactor: strip phase/v2 workflow labels from all source, tests, and docs Gabriel Cardona <cgcardona@gmail.com> 5d ago

1	"""Composite refactor classification for the code domain.
2
3	Provides two tiers of classification:
4
5	Exact classification (deterministic, hash-based):
6
7	``rename`` same body_hash, different name
8	``move`` same content_id, different file path
9	``rename+move`` same body_hash, different name AND different file
10	``signature_only`` same body_hash, different signature_id
11	``impl_only`` same signature_id, different body_hash
12	``metadata_only`` same body_hash + signature_id, different metadata_id
13	``full_rewrite`` both signature and body changed
14
15	Inferred refactor (best-effort, heuristic):
16
17	``extract`` a new symbol appeared whose body is a strict subset of an
18	existing deleted/modified symbol's body
19	``inline`` a symbol disappeared and its known callers expanded
20	``split`` one symbol became two — each shares a portion of the body
21	``merge`` two symbols became one — body is a union of the two old bodies
22
23	Each inferred classification carries a ``confidence`` float (0.0–1.0) and an
24	``evidence`` list of strings explaining the reasoning.
25
26	These are used by ``muse detect-refactor`` to produce the enhanced JSON output.
27	"""
28	from __future__ import annotations
29
30	import logging
31	from typing import Literal
32
33	from muse.plugins.code.ast_parser import SymbolRecord
34
logger = logging.getLogger(__name__)

# Labels emitted by the deterministic, hash-based tier (see ``classify_exact``).
ExactClassification = Literal[
    # Identity / location changes.
    "rename", "move", "rename+move",
    # Partial changes to one facet of the symbol.
    "signature_only", "impl_only", "metadata_only",
    # Catch-all and no-op outcomes.
    "full_rewrite", "unchanged",
]

# Labels emitted by the heuristic tier; "none" is the default used when no
# inferred refactor is attached to a classification.
InferredRefactor = Literal[
    "extract",
    "inline",
    "split",
    "merge",
    "none",
]
49
50
class RefactorClassification:
    """One refactoring event: an old/new symbol pair and how it was classified.

    Attributes:
        old_address: Address of the symbol before the change.
        new_address: Address of the symbol after the change.
        old_rec: Symbol record for the old side.
        new_rec: Symbol record for the new side.
        exact: Deterministic, hash-based label.
        inferred: Heuristic label, ``"none"`` when nothing was inferred.
        confidence: Score attached to the classification (defaults to 1.0).
        evidence: Human-readable strings explaining the classification.
    """

    def __init__(
        self,
        old_address: str,
        new_address: str,
        old_rec: SymbolRecord,
        new_rec: SymbolRecord,
        exact: ExactClassification,
        inferred: InferredRefactor = "none",
        confidence: float = 1.0,
        evidence: list[str] | None = None,
    ) -> None:
        self.old_address, self.new_address = old_address, new_address
        self.old_rec, self.new_rec = old_rec, new_rec
        self.exact, self.inferred = exact, inferred
        self.confidence = confidence
        # A missing (or empty) evidence argument becomes a fresh empty list.
        self.evidence: list[str] = evidence if evidence else []

    def to_dict(self) -> dict[str, str | float | list[str]]:
        """Return a flat dict view of this classification.

        Confidence is rounded to three decimals and every hash/id taken from
        the records is truncated to its first eight characters.
        """
        before, after = self.old_rec, self.new_rec
        payload: dict[str, str | float | list[str]] = {
            "old_address": self.old_address,
            "new_address": self.new_address,
            "old_kind": before["kind"],
            "new_kind": after["kind"],
            "exact_classification": self.exact,
            "inferred_refactor": self.inferred,
            "confidence": round(self.confidence, 3),
            "evidence": self.evidence,
        }
        # Emit old_/new_ pairs in a fixed order so the key order stays stable.
        for field in ("content_id", "body_hash", "signature_id"):
            payload[f"old_{field}"] = before[field][:8]
            payload[f"new_{field}"] = after[field][:8]
        return payload
91
92
def classify_exact(
    old_addr: str,
    new_addr: str,
    old: SymbolRecord,
    new: SymbolRecord,
) -> ExactClassification:
    """Return the deterministic hash-based refactor classification.

    The file of each symbol is the part of its address before the first
    ``"::"``. Checks run in this order and the first hit wins:

    1. Equal ``content_id``: ``"unchanged"`` in the same file, ``"move"``
       when the file differs.
    2. Different file with equal ``body_hash``: ``"move"`` when the name is
       also equal, otherwise ``"rename+move"``.
    3. Equal body, different signature: ``"signature_only"``.
    4. Equal body and signature, different ``metadata_id``:
       ``"metadata_only"``.
    5. Equal signature, different body: ``"impl_only"``.
    6. Equal body, different name: ``"rename"``.
    7. Anything else: ``"full_rewrite"``.

    Args:
        old_addr: Address of the symbol before the change.
        new_addr: Address of the symbol after the change.
        old: Record of the old symbol.
        new: Record of the new symbol.

    Returns:
        The matching :data:`ExactClassification` label.
    """
    old_file = old_addr.split("::")[0]
    new_file = new_addr.split("::")[0]
    same_file = old_file == new_file
    same_name = old["name"] == new["name"]
    same_body = old["body_hash"] == new["body_hash"]
    same_sig = old["signature_id"] == new["signature_id"]
    # metadata_id may be absent from a record; two absent values compare equal.
    same_meta = old.get("metadata_id", "") == new.get("metadata_id", "")

    if old["content_id"] == new["content_id"]:
        # Identical content that now lives in another file is a pure move;
        # reporting it as "unchanged" would hide the relocation.
        return "unchanged" if same_file else "move"

    # Cross-file move detection.
    if not same_file:
        if same_name and same_body:
            return "move"
        if same_body:
            return "rename+move"

    # Intra-file (also reached for cross-file pairs whose bodies differ).
    if same_body and not same_sig:
        return "signature_only"
    if same_body and same_sig and not same_meta:
        return "metadata_only"
    if same_sig and not same_body:
        return "impl_only"
    if same_body and not same_name:
        return "rename"

    return "full_rewrite"
129
130
131	def _body_tokens(body_hash: str, body_src: str) -> frozenset[str]:
132	"""Very rough body tokenisation for subset detection (split words on spaces)."""
133	return frozenset(body_src.split())
134
135
def classify_composite(
    removed: dict[str, SymbolRecord],
    added: dict[str, SymbolRecord],
) -> list[RefactorClassification]:
    """Pair removed symbols with added ones and classify each pairing.

    Two passes are made. The first pairs symbols whose ``content_id`` or
    ``body_hash`` is identical and labels them via ``classify_exact``. The
    second looks at whatever is still unpaired and reports an ``extract``
    (confidence 0.5) when an added symbol's name occurs, case-insensitively,
    inside a removed symbol's ``qualified_name``.

    Args:
        removed: Symbols deleted in this diff (address → record).
        added: Symbols inserted in this diff (address → record).

    Returns:
        List of :class:`RefactorClassification` objects. Only pairs that
        pass the confidence threshold are included.
    """
    found: list[RefactorClassification] = []
    paired_old: set[str] = set()
    paired_new: set[str] = set()

    def _pair(old_addr: str, new_addr: str, reason: str) -> None:
        # Record an exact pairing and mark both sides as consumed.
        old_rec, new_rec = removed[old_addr], added[new_addr]
        found.append(RefactorClassification(
            old_address=old_addr,
            new_address=new_addr,
            old_rec=old_rec,
            new_rec=new_rec,
            exact=classify_exact(old_addr, new_addr, old_rec, new_rec),
            evidence=[reason],
        ))
        paired_old.add(old_addr)
        paired_new.add(new_addr)

    # ── Pass 1: exact hash matches (rename / move / rename+move) ───────────
    # On duplicate hashes the last added symbol in dict order wins.
    body_index: dict[str, str] = {rec["body_hash"]: addr for addr, rec in added.items()}
    content_index: dict[str, str] = {rec["content_id"]: addr for addr, rec in added.items()}

    for old_addr in sorted(removed):
        old_rec = removed[old_addr]

        # Identical content → moved/copied. This branch does not consult
        # paired_new, so one added symbol can pair with several removed ones.
        content_id = old_rec["content_id"]
        if content_id in content_index:
            _pair(old_addr, content_index[content_id], f"content_id matches {content_id[:8]}")
            continue

        # Identical body → rename (possibly combined with a cross-file move),
        # but only if the added symbol has not been claimed already.
        body_hash = old_rec["body_hash"]
        if body_hash in body_index and body_index[body_hash] not in paired_new:
            _pair(old_addr, body_index[body_hash], f"body_hash matches {body_hash[:8]}")

    # ── Pass 2: inferred extract, by name heuristic only ───────────────────
    leftover_new = {addr: rec for addr, rec in added.items() if addr not in paired_new}
    leftover_old = {addr: rec for addr, rec in removed.items() if addr not in paired_old}
    ordered_old = sorted(leftover_old)

    for new_addr in sorted(leftover_new):
        new_rec = leftover_new[new_addr]
        top_score = 0.0
        source_addr: str | None = None

        for old_addr in ordered_old:
            candidate = leftover_old[old_addr]
            if new_rec["name"].lower() not in candidate["qualified_name"].lower():
                continue
            score = 0.5  # Low confidence — name heuristic only.
            # Strict comparison keeps the first candidate in sorted order.
            if score > top_score:
                top_score, source_addr = score, old_addr

        if not source_addr or top_score < 0.5:
            continue

        source_rec = leftover_old[source_addr]
        found.append(RefactorClassification(
            old_address=source_addr,
            new_address=new_addr,
            old_rec=source_rec,
            new_rec=new_rec,
            exact="full_rewrite",
            inferred="extract",
            confidence=top_score,
            evidence=[
                f"new symbol '{new_rec['name']}' found inside "
                f"old qualified_name '{source_rec['qualified_name']}'"
            ],
        ))

    return found

Content Address

Object ID (SHA-256)

a47529dbf8955624eca468b4635e7e6a501e3cfd9562d0e2ecfe0a3b100a9a0a

This file is immutable and content-addressed. The same SHA always refers to the same bytes, across every clone and every time.

File Info

Path muse/plugins/code/_refactor_classify.py

Lines 229

Size 8.6 KB

Language python

Ref d855b718

Snapshot 7cc0502e0ca8…

Last Modified

d855b718

refactor: strip phase/v2 workflow labels from all source, tests, and docs

Gabriel Cardona <cgcardona@gmail.com> 5d ago

View commit →

Links

Browse tree at d855b718 All commits View raw