gabriel / muse public
_refactor_classify.py python
229 lines 8.6 KB
d855b718 refactor: strip phase/v2 workflow labels from all source, tests, and docs Gabriel Cardona <cgcardona@gmail.com> 5d ago
1 """Composite refactor classification for the code domain.
2
3 Provides two tiers of classification:
4
5 **Exact classification** (deterministic, hash-based):
6
7 ``rename`` same body_hash, different name
8 ``move`` same content_id, different file path
9 ``rename+move`` same body_hash, different name AND different file
10 ``signature_only`` same body_hash, different signature_id
11 ``impl_only`` same signature_id, different body_hash
12 ``metadata_only`` same body_hash + signature_id, different metadata_id
13 ``full_rewrite`` both signature and body changed
14
**Inferred refactor** (best-effort, heuristic):

``extract``   a new symbol appeared whose body is a strict subset of an
              existing deleted/modified symbol's body
``inline``    a symbol disappeared and its known callers expanded
``split``     one symbol became two — each shares a portion of the body
``merge``     two symbols became one — body is a union of the two old bodies

Note: in this module only ``extract`` is currently emitted by
``classify_composite``, and it is detected with a name-containment heuristic
(fixed confidence 0.5) rather than a body comparison.
22
23 Each inferred classification carries a ``confidence`` float (0.0–1.0) and an
24 ``evidence`` list of strings explaining the reasoning.
25
26 These are used by ``muse detect-refactor`` to produce the enhanced JSON output.
27 """
28 from __future__ import annotations
29
30 import logging
31 from typing import Literal
32
33 from muse.plugins.code.ast_parser import SymbolRecord
34
35 logger = logging.getLogger(__name__)
36
# Labels produced by the deterministic, hash-based classifier.
ExactClassification = Literal[
    # Identity changes: the body is the same, the name and/or file differ.
    "rename", "move", "rename+move",
    # Partial edits: exactly one facet of the symbol changed.
    "signature_only", "impl_only", "metadata_only",
    # Everything changed / nothing changed.
    "full_rewrite", "unchanged",
]

# Labels for heuristic refactor inference; "none" is the default used when no
# inference is attached to a classification.
InferredRefactor = Literal[
    "extract",
    "inline",
    "split",
    "merge",
    "none",
]
49
50
class RefactorClassification:
    """One refactoring event: an old/new symbol pair together with its labels.

    Holds the two symbol addresses, their records, the exact (hash-based)
    label, an optional inferred label with its confidence, and a list of
    human-readable evidence strings.
    """

    def __init__(
        self,
        old_address: str,
        new_address: str,
        old_rec: SymbolRecord,
        new_rec: SymbolRecord,
        exact: ExactClassification,
        inferred: InferredRefactor = "none",
        confidence: float = 1.0,
        evidence: list[str] | None = None,
    ) -> None:
        """Store the pairing; a missing or empty ``evidence`` becomes a fresh list."""
        self.old_address, self.new_address = old_address, new_address
        self.old_rec, self.new_rec = old_rec, new_rec
        self.exact = exact
        self.inferred = inferred
        self.confidence = confidence
        self.evidence: list[str] = evidence if evidence else []

    def to_dict(self) -> dict[str, str | float | list[str]]:
        """Return a flat dict view of this classification.

        Confidence is rounded to three decimals and every hash/id from the
        records is truncated to its first eight characters.
        """
        before, after = self.old_rec, self.new_rec
        payload: dict[str, str | float | list[str]] = {
            "old_address": self.old_address,
            "new_address": self.new_address,
            "old_kind": before["kind"],
            "new_kind": after["kind"],
            "exact_classification": self.exact,
            "inferred_refactor": self.inferred,
            "confidence": round(self.confidence, 3),
            "evidence": self.evidence,
        }
        # Short (8-char) prefixes of each identifier, old then new, in the
        # same key order as the hand-written mapping this replaces.
        for field in ("content_id", "body_hash", "signature_id"):
            payload[f"old_{field}"] = before[field][:8]
            payload[f"new_{field}"] = after[field][:8]
        return payload
91
92
def classify_exact(
    old_addr: str,
    new_addr: str,
    old: SymbolRecord,
    new: SymbolRecord,
) -> ExactClassification:
    """Return the deterministic hash-based refactor classification.

    Args:
        old_addr: Address of the symbol before the change; the file path is
            the part before the first ``"::"``.
        new_addr: Address of the symbol after the change (same format).
        old: Record of the symbol before the change.
        new: Record of the symbol after the change.

    Returns:
        The first label that applies, checked in this order:

        1. ``"unchanged"``      same ``content_id`` and same file
        2. ``"move"``           different file, same ``content_id`` or same
           ``body_hash``, same name
        3. ``"rename+move"``    different file, same ``content_id`` or same
           ``body_hash``, different name
        4. ``"signature_only"`` same ``body_hash``, different ``signature_id``
        5. ``"metadata_only"``  same body and signature, different
           ``metadata_id`` (a missing ``metadata_id`` is treated as ``""``)
        6. ``"impl_only"``      same ``signature_id``, different ``body_hash``
        7. ``"rename"``         same ``body_hash``, different name
        8. ``"full_rewrite"``   anything else
    """
    old_file = old_addr.partition("::")[0]
    new_file = new_addr.partition("::")[0]
    same_file = old_file == new_file
    same_name = old["name"] == new["name"]
    same_body = old["body_hash"] == new["body_hash"]
    same_sig = old["signature_id"] == new["signature_id"]
    same_meta = old.get("metadata_id", "") == new.get("metadata_id", "")
    same_content = old["content_id"] == new["content_id"]

    # Identical content only means "unchanged" when the symbol also stayed in
    # the same file; identical content in another file is a move and is
    # handled by the cross-file branch below.
    if same_content and same_file:
        return "unchanged"

    # Cross-file move detection.
    if not same_file and (same_content or same_body):
        return "move" if same_name else "rename+move"

    # Remaining cases (same file, or cross-file with a different body).
    # NOTE(review): if signature_id incorporates the symbol name, an in-file
    # rename is reported as "signature_only" because that check runs first —
    # confirm against how signature_id is computed.
    if same_body and not same_sig:
        return "signature_only"
    if same_body and same_sig and not same_meta:
        return "metadata_only"
    if same_sig and not same_body:
        return "impl_only"
    if same_body and not same_name:
        return "rename"

    return "full_rewrite"
129
130
def _body_tokens(body_hash: str, body_src: str) -> frozenset[str]:
    """Return the distinct whitespace-separated tokens of ``body_src``.

    A deliberately rough tokenisation intended for subset detection.
    ``body_hash`` is accepted but not used by this function.
    """
    words = body_src.split()
    return frozenset(words)
134
135
def classify_composite(
    removed: dict[str, SymbolRecord],
    added: dict[str, SymbolRecord],
) -> list[RefactorClassification]:
    """Pair up removed and added symbols and classify each pairing.

    Two passes are made:

    1. Exact pairing. Removed symbols are visited in address order. A removed
       symbol whose ``content_id`` equals that of an added symbol is paired
       with it; otherwise one whose ``body_hash`` equals that of a not yet
       paired added symbol is paired with that. Each pairing is labelled by
       ``classify_exact``.
    2. Inferred ``extract``. Every still-unpaired added symbol whose
       lower-cased ``name`` occurs inside the lower-cased ``qualified_name``
       of a still-unpaired removed symbol is reported against the first such
       removed symbol (address order) with confidence 0.5.

    Args:
        removed: Symbols deleted in this diff (address → record).
        added: Symbols inserted in this diff (address → record).

    Returns:
        List of :class:`RefactorClassification` objects: exact pairings first,
        followed by inferred extracts.
    """
    results: list[RefactorClassification] = []
    paired_old: set[str] = set()
    paired_new: set[str] = set()

    def pair_up(old_addr: str, new_addr: str, reason: str) -> None:
        """Record an exact pairing and mark both addresses as consumed."""
        old_rec = removed[old_addr]
        new_rec = added[new_addr]
        label = classify_exact(old_addr, new_addr, old_rec, new_rec)
        results.append(
            RefactorClassification(
                old_address=old_addr,
                new_address=new_addr,
                old_rec=old_rec,
                new_rec=new_rec,
                exact=label,
                evidence=[reason],
            )
        )
        paired_old.add(old_addr)
        paired_new.add(new_addr)

    # ── Pass 1: exact matches (rename / move / rename+move) ────────────────
    # When several added symbols share a hash, the last one in ``added``'s
    # iteration order is the one kept in the index.
    body_index: dict[str, str] = {}
    content_index: dict[str, str] = {}
    for addr, rec in added.items():
        body_index[rec["body_hash"]] = addr
        content_index[rec["content_id"]] = addr

    for old_addr in sorted(removed):
        old_rec = removed[old_addr]

        # Identical content → moved/copied. This branch does not check
        # whether the added symbol has already been paired.
        content_id = old_rec["content_id"]
        if content_id in content_index:
            pair_up(
                old_addr,
                content_index[content_id],
                f"content_id matches {content_id[:8]}",
            )
            continue

        # Same body → rename (possibly combined with a cross-file move), but
        # only if the added symbol is still free.
        body_hash = old_rec["body_hash"]
        if body_hash in body_index and body_index[body_hash] not in paired_new:
            pair_up(
                old_addr,
                body_index[body_hash],
                f"body_hash matches {body_hash[:8]}",
            )

    # ── Pass 2: inferred extract among whatever is left ────────────────────
    spare_new = {addr: rec for addr, rec in added.items() if addr not in paired_new}
    spare_old = {addr: rec for addr, rec in removed.items() if addr not in paired_old}
    name_only_confidence = 0.5  # Low confidence — name heuristic only.

    for new_addr in sorted(spare_new):
        new_rec = spare_new[new_addr]
        scored = [
            (name_only_confidence, old_addr)
            for old_addr in sorted(spare_old)
            if new_rec["name"].lower() in spare_old[old_addr]["qualified_name"].lower()
        ]
        if not scored:
            continue
        # max() keeps the first of equally scored candidates, i.e. the
        # lowest address.
        score, origin_addr = max(scored, key=lambda pair: pair[0])
        if not origin_addr or score < 0.5:
            continue
        origin_rec = spare_old[origin_addr]
        results.append(
            RefactorClassification(
                old_address=origin_addr,
                new_address=new_addr,
                old_rec=origin_rec,
                new_rec=new_rec,
                exact="full_rewrite",
                inferred="extract",
                confidence=score,
                evidence=[
                    f"new symbol '{new_rec['name']}' found inside "
                    f"old qualified_name '{origin_rec['qualified_name']}'"
                ],
            )
        )

    return results