cgcardona / muse public
typing_audit.py python
354 lines 13.4 KB
e6786943 feat: upgrade to Python 3.14, drop from __future__ import annotations Gabriel Cardona <cgcardona@gmail.com> 1d ago
1 #!/usr/bin/env python3
2 """Typing audit — find and count all banned type patterns in the codebase.
3
4 Checks every rule from .cursorrules and AGENTS.md:
5 - Any as a type (param, return, collection value)
6 - object as a type (same severity as Any)
7 - cast() usage (all usage banned — fix the callee)
8 - # type: ignore (blanket and specific)
9 - Bare collections (list, dict, set, tuple without type parameters)
10 - Optional[X] (use X | None instead)
11 - Legacy typing imports (List, Dict, Set, Tuple — use lowercase builtins)
12
13 Outputs JSON (machine-readable) + a human summary to stdout.
14
15 Usage:
16 python tools/typing_audit.py # muse/ + tests/
17 python tools/typing_audit.py --json artifacts/typing_audit.json
18 python tools/typing_audit.py --dirs muse/ tests/
19 python tools/typing_audit.py --dirs muse/ --max-any 0
20 """
21
22 import argparse
23 import ast
24 import json
25 import re
26 import sys
27 from collections import defaultdict
28 from pathlib import Path
29 from typing import Any
30
31
32 # ── Pattern matchers ──────────────────────────────────────────────────────────
33 # Grouped by category. Every key contributes to the total violation count.
34
35 _PATTERNS: dict[str, re.Pattern[str]] = {
36 # ── Any-as-type ──────────────────────────────────────────────────────
37 "dict_str_any": re.compile(
38 r"\bdict\[str,\s*Any\]|\bDict\[str,\s*Any\]", re.IGNORECASE
39 ),
40 "list_any": re.compile(r"\blist\[Any\]|\bList\[Any\]", re.IGNORECASE),
41 "return_any": re.compile(r"->\s*Any\b"),
42 "param_any": re.compile(r":\s*Any\b"),
43 "mapping_any": re.compile(
44 r"\bMapping\[str,\s*Any\]", re.IGNORECASE
45 ),
46 "optional_any": re.compile(r"\bOptional\[Any\]", re.IGNORECASE),
47 "sequence_any": re.compile(
48 r"\bSequence\[Any\]|\bIterable\[Any\]", re.IGNORECASE
49 ),
50 "tuple_any": re.compile(r"\btuple\[.*Any.*\]|\bTuple\[.*Any.*\]"),
51
52 # ── object-as-type (same severity as Any) ────────────────────────────
53 "param_object": re.compile(r":\s*object\b"),
54 "return_object": re.compile(r"->\s*object\b"),
55 "collection_object": re.compile(
56 r"\b(?:dict|list|set|tuple|Sequence|Mapping)\[[^]]*\bobject\b"
57 ),
58
59 # ── cast() — all usage banned ────────────────────────────────────────
60 "cast_usage": re.compile(r"\bcast\("),
61
62 # ── type: ignore — suppresses real errors ────────────────────────────
63 "type_ignore": re.compile(r"#\s*type:\s*ignore"),
64
65 # ── Bare collections (no type parameters) ────────────────────────────
66 # Negative lookaheads: exclude parameterized [, constructor calls (,
67 # and prose patterns (": list of items" in docstrings).
68 "bare_list": re.compile(r"(?::\s*|->\s*)list\b(?!\[|\(|\s+[a-z])"),
69 "bare_dict": re.compile(r"(?::\s*|->\s*)dict\b(?!\[|\(|\s+[a-z])"),
70 "bare_set": re.compile(r"(?::\s*|->\s*)set\b(?!\[|\(|\s+[a-z])"),
71 "bare_tuple": re.compile(r"(?::\s*|->\s*)tuple\b(?!\[|\(|\s+[a-z])"),
72
73 # ── Optional[X] — use X | None instead ───────────────────────────────
74 # Excludes Optional[Any] which is already caught by optional_any.
75 "optional_usage": re.compile(r"\bOptional\[(?!Any\b)"),
76
77 # ── Legacy typing imports (use lowercase builtins) ───────────────────
78 "legacy_List": re.compile(r"\bList\["),
79 "legacy_Dict": re.compile(r"\bDict\["),
80 "legacy_Set": re.compile(r"\bSet\["),
81 "legacy_Tuple": re.compile(r"\bTuple\["),
82 }
83
84
85 def _count_pattern_in_line(line: str, pattern: re.Pattern[str]) -> int:
86 return len(pattern.findall(line))
87
88
89 def _imports_any(source: str) -> bool:
90 """Check if file imports Any from typing."""
91 return bool(re.search(r"from\s+typing\s+import\s+.*\bAny\b", source))
92
93
94 def _classify_type_ignores(line: str) -> str:
95 """Return the ignore variant (blanket vs specific)."""
96 m = re.search(r"#\s*type:\s*ignore\[([^\]]+)\]", line)
97 if m:
98 return f"type_ignore[{m.group(1)}]"
99 return "type_ignore[blanket]"
100
101
102 # ── AST-based detection ──────────────────────────────────────────────────────
103
104
105 def _find_untyped_defs(source: str, filepath: str) -> list[dict[str, Any]]:
106 """Find function defs missing return type or param annotations."""
107 results: list[dict[str, Any]] = []
108 try:
109 tree = ast.parse(source)
110 except SyntaxError:
111 return results
112
113 for node in ast.walk(tree):
114 if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
115 if node.returns is None:
116 results.append({
117 "file": filepath,
118 "line": node.lineno,
119 "name": node.name,
120 "issue": "missing_return_type",
121 })
122 for arg in node.args.args + node.args.kwonlyargs:
123 if arg.annotation is None and arg.arg != "self" and arg.arg != "cls":
124 results.append({
125 "file": filepath,
126 "line": node.lineno,
127 "name": f"{node.name}.{arg.arg}",
128 "issue": "missing_param_type",
129 })
130 return results
131
132
133 # ── File scanner ──────────────────────────────────────────────────────────────
134
135
def scan_file(filepath: Path) -> dict[str, Any]:
    """Scan a single Python file for typing violations.

    Returns a result dict with per-pattern counts, the line numbers where
    each pattern fired, ``# type: ignore`` variant tallies, and untyped
    defs. Returns ``{}`` if the file cannot be read or decoded.
    """
    try:
        source = filepath.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError):
        return {}

    lines = source.splitlines()
    result: dict[str, Any] = {
        "file": str(filepath),
        "imports_any": _imports_any(source),
        "patterns": defaultdict(int),
        "pattern_lines": defaultdict(list),
        "type_ignore_variants": defaultdict(int),
        "untyped_defs": [],
    }

    for lineno, line in enumerate(lines, 1):
        stripped = line.strip()
        if not stripped:
            continue
        # Comment-only lines are skipped for most patterns (prose mentions of
        # e.g. "dict[str, Any]" are not violations), but a standalone
        # "# type: ignore" comment is still meaningful (at module top it
        # suppresses checking for the whole file), so that pattern is
        # scanned on every non-blank line. Previously comment lines were
        # skipped wholesale and such ignores went uncounted.
        is_comment = stripped.startswith("#")

        for name, pattern in _PATTERNS.items():
            if is_comment and name != "type_ignore":
                continue
            count = _count_pattern_in_line(line, pattern)
            if count > 0:
                result["patterns"][name] += count
                result["pattern_lines"][name].append(lineno)

                if name == "type_ignore":
                    variant = _classify_type_ignores(line)
                    result["type_ignore_variants"][variant] += 1

    result["untyped_defs"] = _find_untyped_defs(source, str(filepath))
    return result
170
171
def scan_directory(directory: Path) -> list[dict[str, Any]]:
    """Scan all Python files in a directory tree.

    Skips anything under venv/, __pycache__/ or .git/; unreadable files
    (for which scan_file returns {}) are dropped from the results.
    """
    skip_parts = {"venv", "__pycache__", ".git"}
    scanned: list[dict[str, Any]] = []
    for path in sorted(directory.rglob("*.py")):
        if skip_parts.intersection(path.parts):
            continue
        report = scan_file(path)
        if report:
            scanned.append(report)
    return scanned
184
185
186 # ── Report generation ─────────────────────────────────────────────────────────
187
# Display order: group patterns into logical categories for the report.
# Each tuple is (human-readable category label, keys into _PATTERNS).
# print_human_summary() sums each category from these keys; a _PATTERNS key
# missing from every category would still count toward the grand total but
# never appear in the per-category breakdown, so keep the two in sync.
_CATEGORY_ORDER: list[tuple[str, list[str]]] = [
    ("Any-as-type", [
        "dict_str_any", "list_any", "return_any", "param_any",
        "mapping_any", "optional_any", "sequence_any", "tuple_any",
    ]),
    ("object-as-type", [
        "param_object", "return_object", "collection_object",
    ]),
    ("cast() usage", ["cast_usage"]),
    ("type: ignore", ["type_ignore"]),
    ("Bare collections", [
        "bare_list", "bare_dict", "bare_set", "bare_tuple",
    ]),
    ("Optional (use X | None)", ["optional_usage"]),
    ("Legacy typing imports", [
        "legacy_List", "legacy_Dict", "legacy_Set", "legacy_Tuple",
    ]),
]
207
208
def generate_report(results: list[dict[str, Any]]) -> dict[str, Any]:
    """Aggregate per-file scan results into a single report dict.

    Produces the summary block, per-pattern totals, type-ignore variant
    tallies, a violation-sorted offender list (top 30), per-file pattern
    counts, and the first 50 untyped defs.
    """
    pattern_totals: dict[str, int] = defaultdict(int)
    ignore_variants: dict[str, int] = defaultdict(int)
    per_file: dict[str, dict[str, int]] = {}
    offenders: list[dict[str, Any]] = []
    untyped: list[dict[str, Any]] = []
    any_importers = 0

    for entry in results:
        path = entry["file"]
        if entry.get("imports_any"):
            any_importers += 1

        counts = dict(entry.get("patterns", {}))
        for name, n in counts.items():
            pattern_totals[name] += n
        subtotal = sum(counts.values())

        if subtotal > 0:
            per_file[path] = counts
            offenders.append({"file": path, "total": subtotal, "patterns": counts})

        for variant, n in entry.get("type_ignore_variants", {}).items():
            ignore_variants[variant] += n

        untyped.extend(entry.get("untyped_defs", []))

    # Worst files first; Python's sort is stable so ties keep scan order.
    offenders.sort(key=lambda o: o["total"], reverse=True)

    return {
        "summary": {
            "total_files_scanned": len(results),
            "files_importing_any": any_importers,
            "total_any_patterns": sum(pattern_totals.values()),
            "untyped_defs": len(untyped),
        },
        "pattern_totals": dict(pattern_totals),
        "type_ignore_variants": dict(ignore_variants),
        "top_offenders": offenders[:30],
        "per_file": per_file,
        "untyped_defs": untyped[:50],
    }
254
255
def print_human_summary(report: dict[str, Any]) -> None:
    """Print a human-readable summary."""
    summary = report["summary"]
    pattern_totals = report["pattern_totals"]
    divider = "=" * 70

    print("\n" + divider)
    print(" TYPING AUDIT — Violation Report")
    print(divider)
    print(f" Files scanned: {summary['total_files_scanned']}")
    print(f" Files importing Any: {summary['files_importing_any']}")
    print(f" Total violations: {summary['total_any_patterns']}")
    print(f" Untyped defs: {summary['untyped_defs']}")
    print()

    # Per-category breakdown; categories with zero hits are omitted.
    for category, pattern_names in _CATEGORY_ORDER:
        if sum(pattern_totals.get(name, 0) for name in pattern_names) == 0:
            continue
        print(f" {category}:")
        for name in pattern_names:
            hits = pattern_totals.get(name, 0)
            if hits > 0:
                print(f" {name:30s} {hits:5d}")
        print()

    grand_total = sum(
        pattern_totals.get(name, 0)
        for _, names in _CATEGORY_ORDER
        for name in names
    )
    if grand_total == 0:
        print(" Pattern breakdown: (none)")
        print()

    variants = report["type_ignore_variants"]
    if variants:
        print(" # type: ignore variants:")
        for variant, hits in sorted(variants.items(), key=lambda kv: kv[1], reverse=True):
            print(f" {variant:40s} {hits:5d}")
        print()
    print(" Top 15 offenders:")
    for offender in report["top_offenders"][:15]:
        print(f" {offender['total']:4d} {offender['file']}")
    print(divider + "\n")
293
294
295 # ── CLI ───────────────────────────────────────────────────────────────────────
296
297
def main() -> None:
    """CLI entry point: scan, report, optionally write JSON and enforce a ratchet.

    Exits with status 1 when --max-any is set and the violation total
    exceeds it.
    """
    parser = argparse.ArgumentParser(
        description="Audit typing violations: Any, object, cast, bare collections, "
        "Optional, legacy imports, type: ignore, untyped defs",
    )
    parser.add_argument(
        "--dirs",
        nargs="+",
        default=["muse/", "tests/"],
        help="Directories to scan",
    )
    parser.add_argument("--json", type=str, help="Write JSON report to file")
    parser.add_argument(
        "--max-any",
        type=int,
        default=None,
        help="Fail (exit 1) if total violations exceed this threshold",
    )
    options = parser.parse_args()

    scan_results: list[dict[str, Any]] = []
    for raw_dir in options.dirs:
        root = Path(raw_dir)
        if not root.exists():
            print(f"WARNING: {raw_dir} does not exist, skipping", file=sys.stderr)
            continue
        scan_results.extend(scan_directory(root))

    report = generate_report(scan_results)
    print_human_summary(report)

    if options.json:
        out_path = Path(options.json)
        out_path.parent.mkdir(parents=True, exist_ok=True)
        out_path.write_text(
            json.dumps(report, indent=2, default=str),
            encoding="utf-8",
        )
        print(f" JSON report written to {options.json}")

    if options.max_any is None:
        return
    total = report["summary"]["total_any_patterns"]
    if total > options.max_any:
        print(
            f"\n❌ RATCHET FAILED: {total} violations exceed "
            f"threshold of {options.max_any}",
            file=sys.stderr,
        )
        sys.exit(1)
    print(
        f"\n✅ RATCHET OK: {total} violations within "
        f"threshold of {options.max_any}",
    )
351
352
# Script entry point — guard keeps the module importable without side effects.
if __name__ == "__main__":
    main()