cgcardona / muse public
typing_audit.py python
356 lines 13.4 KB
cf6e72f3 Add tools/typing_audit.py — regex + AST violation scanner Gabriel Cardona <gabriel@tellurstori.com> 3d ago
1 #!/usr/bin/env python3
2 """Typing audit — find and count all banned type patterns in the codebase.
3
4 Checks every rule from .cursorrules and AGENTS.md:
5 - Any as a type (param, return, collection value)
6 - object as a type (same severity as Any)
7 - cast() usage (all usage banned — fix the callee)
8 - # type: ignore (blanket and specific)
9 - Bare collections (list, dict, set, tuple without type parameters)
10 - Optional[X] (use X | None instead)
11 - Legacy typing imports (List, Dict, Set, Tuple — use lowercase builtins)
12
13 Outputs JSON (machine-readable) + a human summary to stdout.
14
15 Usage:
16 python tools/typing_audit.py # muse/ + tests/
17 python tools/typing_audit.py --json artifacts/typing_audit.json
18 python tools/typing_audit.py --dirs muse/ tests/
19 python tools/typing_audit.py --dirs muse/ --max-any 0
20 """
21
22 from __future__ import annotations
23
24 import argparse
25 import ast
26 import json
27 import re
28 import sys
29 from collections import defaultdict
30 from pathlib import Path
31 from typing import Any
32
33
34 # ── Pattern matchers ──────────────────────────────────────────────────────────
35 # Grouped by category. Every key contributes to the total violation count.
36
37 _PATTERNS: dict[str, re.Pattern[str]] = {
38 # ── Any-as-type ──────────────────────────────────────────────────────
39 "dict_str_any": re.compile(
40 r"\bdict\[str,\s*Any\]|\bDict\[str,\s*Any\]", re.IGNORECASE
41 ),
42 "list_any": re.compile(r"\blist\[Any\]|\bList\[Any\]", re.IGNORECASE),
43 "return_any": re.compile(r"->\s*Any\b"),
44 "param_any": re.compile(r":\s*Any\b"),
45 "mapping_any": re.compile(
46 r"\bMapping\[str,\s*Any\]", re.IGNORECASE
47 ),
48 "optional_any": re.compile(r"\bOptional\[Any\]", re.IGNORECASE),
49 "sequence_any": re.compile(
50 r"\bSequence\[Any\]|\bIterable\[Any\]", re.IGNORECASE
51 ),
52 "tuple_any": re.compile(r"\btuple\[.*Any.*\]|\bTuple\[.*Any.*\]"),
53
54 # ── object-as-type (same severity as Any) ────────────────────────────
55 "param_object": re.compile(r":\s*object\b"),
56 "return_object": re.compile(r"->\s*object\b"),
57 "collection_object": re.compile(
58 r"\b(?:dict|list|set|tuple|Sequence|Mapping)\[[^]]*\bobject\b"
59 ),
60
61 # ── cast() — all usage banned ────────────────────────────────────────
62 "cast_usage": re.compile(r"\bcast\("),
63
64 # ── type: ignore — suppresses real errors ────────────────────────────
65 "type_ignore": re.compile(r"#\s*type:\s*ignore"),
66
67 # ── Bare collections (no type parameters) ────────────────────────────
68 # Negative lookaheads: exclude parameterized [, constructor calls (,
69 # and prose patterns (": list of items" in docstrings).
70 "bare_list": re.compile(r"(?::\s*|->\s*)list\b(?!\[|\(|\s+[a-z])"),
71 "bare_dict": re.compile(r"(?::\s*|->\s*)dict\b(?!\[|\(|\s+[a-z])"),
72 "bare_set": re.compile(r"(?::\s*|->\s*)set\b(?!\[|\(|\s+[a-z])"),
73 "bare_tuple": re.compile(r"(?::\s*|->\s*)tuple\b(?!\[|\(|\s+[a-z])"),
74
75 # ── Optional[X] — use X | None instead ───────────────────────────────
76 # Excludes Optional[Any] which is already caught by optional_any.
77 "optional_usage": re.compile(r"\bOptional\[(?!Any\b)"),
78
79 # ── Legacy typing imports (use lowercase builtins) ───────────────────
80 "legacy_List": re.compile(r"\bList\["),
81 "legacy_Dict": re.compile(r"\bDict\["),
82 "legacy_Set": re.compile(r"\bSet\["),
83 "legacy_Tuple": re.compile(r"\bTuple\["),
84 }
85
86
87 def _count_pattern_in_line(line: str, pattern: re.Pattern[str]) -> int:
88 return len(pattern.findall(line))
89
90
91 def _imports_any(source: str) -> bool:
92 """Check if file imports Any from typing."""
93 return bool(re.search(r"from\s+typing\s+import\s+.*\bAny\b", source))
94
95
96 def _classify_type_ignores(line: str) -> str:
97 """Return the ignore variant (blanket vs specific)."""
98 m = re.search(r"#\s*type:\s*ignore\[([^\]]+)\]", line)
99 if m:
100 return f"type_ignore[{m.group(1)}]"
101 return "type_ignore[blanket]"
102
103
104 # ── AST-based detection ──────────────────────────────────────────────────────
105
106
107 def _find_untyped_defs(source: str, filepath: str) -> list[dict[str, Any]]:
108 """Find function defs missing return type or param annotations."""
109 results: list[dict[str, Any]] = []
110 try:
111 tree = ast.parse(source)
112 except SyntaxError:
113 return results
114
115 for node in ast.walk(tree):
116 if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
117 if node.returns is None:
118 results.append({
119 "file": filepath,
120 "line": node.lineno,
121 "name": node.name,
122 "issue": "missing_return_type",
123 })
124 for arg in node.args.args + node.args.kwonlyargs:
125 if arg.annotation is None and arg.arg != "self" and arg.arg != "cls":
126 results.append({
127 "file": filepath,
128 "line": node.lineno,
129 "name": f"{node.name}.{arg.arg}",
130 "issue": "missing_param_type",
131 })
132 return results
133
134
135 # ── File scanner ──────────────────────────────────────────────────────────────
136
137
def scan_file(filepath: Path) -> dict[str, Any]:
    """Scan a single Python file for typing violations."""
    try:
        source = filepath.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError):
        # Unreadable/binary files: empty dict signals "skip" to the caller.
        return {}

    result: dict[str, Any] = {
        "file": str(filepath),
        "imports_any": _imports_any(source),
        "patterns": defaultdict(int),
        "pattern_lines": defaultdict(list),
        "type_ignore_variants": defaultdict(int),
        "untyped_defs": [],
    }

    for lineno, raw in enumerate(source.splitlines(), 1):
        text = raw.strip()
        # Whole-line comments are skipped so prose that merely mentions a
        # banned pattern is not counted as a violation.
        if not text or text.startswith("#"):
            continue

        for name, pattern in _PATTERNS.items():
            hits = _count_pattern_in_line(raw, pattern)
            if not hits:
                continue
            result["patterns"][name] += hits
            result["pattern_lines"][name].append(lineno)
            # Break the ignore count down into blanket vs [code] variants.
            if name == "type_ignore":
                result["type_ignore_variants"][_classify_type_ignores(raw)] += 1

    result["untyped_defs"] = _find_untyped_defs(source, str(filepath))
    return result
172
173
def scan_directory(directory: Path) -> list[dict[str, Any]]:
    """Scan all Python files in a directory tree."""
    # Skip virtualenvs, bytecode caches, and git internals anywhere
    # along the file's path.
    excluded = {"venv", "__pycache__", ".git"}
    results: list[dict[str, Any]] = []
    for py_file in sorted(directory.rglob("*.py")):
        if excluded.intersection(py_file.parts):
            continue
        scanned = scan_file(py_file)
        if scanned:
            results.append(scanned)
    return results
186
187
188 # ── Report generation ─────────────────────────────────────────────────────────
189
190 # Display order: group patterns into logical categories for the report.
191 _CATEGORY_ORDER: list[tuple[str, list[str]]] = [
192 ("Any-as-type", [
193 "dict_str_any", "list_any", "return_any", "param_any",
194 "mapping_any", "optional_any", "sequence_any", "tuple_any",
195 ]),
196 ("object-as-type", [
197 "param_object", "return_object", "collection_object",
198 ]),
199 ("cast() usage", ["cast_usage"]),
200 ("type: ignore", ["type_ignore"]),
201 ("Bare collections", [
202 "bare_list", "bare_dict", "bare_set", "bare_tuple",
203 ]),
204 ("Optional (use X | None)", ["optional_usage"]),
205 ("Legacy typing imports", [
206 "legacy_List", "legacy_Dict", "legacy_Set", "legacy_Tuple",
207 ]),
208 ]
209
210
def generate_report(results: list[dict[str, Any]]) -> dict[str, Any]:
    """Generate aggregate report from scan results."""
    pattern_totals: dict[str, int] = defaultdict(int)
    ignore_variants: dict[str, int] = defaultdict(int)
    per_file: dict[str, dict[str, int]] = {}
    offenders: list[dict[str, Any]] = []
    untyped: list[dict[str, Any]] = []
    any_import_count = 0

    for entry in results:
        path = entry["file"]
        if entry.get("imports_any"):
            any_import_count += 1

        # Per-file pattern counts roll up into the global totals.
        counts: dict[str, int] = dict(entry.get("patterns", {}))
        for name, n in counts.items():
            pattern_totals[name] += n
        violation_count = sum(counts.values())
        if violation_count:
            per_file[path] = counts
            offenders.append(
                {"file": path, "total": violation_count, "patterns": counts}
            )

        for variant, n in entry.get("type_ignore_variants", {}).items():
            ignore_variants[variant] += n
        untyped.extend(entry.get("untyped_defs", []))

    # Worst files first.
    offenders.sort(key=lambda o: o["total"], reverse=True)

    return {
        "summary": {
            "total_files_scanned": len(results),
            "files_importing_any": any_import_count,
            "total_any_patterns": sum(pattern_totals.values()),
            "untyped_defs": len(untyped),
        },
        "pattern_totals": dict(pattern_totals),
        "type_ignore_variants": dict(ignore_variants),
        # Cap the noisier lists so the JSON stays readable.
        "top_offenders": offenders[:30],
        "per_file": per_file,
        "untyped_defs": untyped[:50],
    }
256
257
def print_human_summary(report: dict[str, Any]) -> None:
    """Print a human-readable summary."""
    summary = report["summary"]
    totals = report["pattern_totals"]
    bar = "=" * 70

    print("\n" + bar)
    print(" TYPING AUDIT — Violation Report")
    print(bar)
    print(f" Files scanned: {summary['total_files_scanned']}")
    print(f" Files importing Any: {summary['files_importing_any']}")
    print(f" Total violations: {summary['total_any_patterns']}")
    print(f" Untyped defs: {summary['untyped_defs']}")
    print()

    # One section per category; categories with zero hits are omitted.
    for category, names in _CATEGORY_ORDER:
        section = [(p, totals.get(p, 0)) for p in names]
        if sum(n for _, n in section) == 0:
            continue
        print(f" {category}:")
        for p, count in section:
            if count > 0:
                print(f" {p:30s} {count:5d}")
        print()

    # Completely clean run: say so explicitly instead of printing nothing.
    if not any(totals.get(p, 0) for _, pats in _CATEGORY_ORDER for p in pats):
        print(" Pattern breakdown: (none)")
        print()

    variants = report["type_ignore_variants"]
    if variants:
        print(" # type: ignore variants:")
        for variant, count in sorted(variants.items(), key=lambda kv: -kv[1]):
            print(f" {variant:40s} {count:5d}")
        print()
    print(" Top 15 offenders:")
    for entry in report["top_offenders"][:15]:
        print(f" {entry['total']:4d} {entry['file']}")
    print(bar + "\n")
295
296
297 # ── CLI ───────────────────────────────────────────────────────────────────────
298
299
def main() -> None:
    """CLI entry point: scan, report, optionally write JSON, enforce ratchet."""
    parser = argparse.ArgumentParser(
        description="Audit typing violations: Any, object, cast, bare collections, "
        "Optional, legacy imports, type: ignore, untyped defs",
    )
    parser.add_argument(
        "--dirs",
        nargs="+",
        default=["muse/", "tests/"],
        help="Directories to scan",
    )
    parser.add_argument("--json", type=str, help="Write JSON report to file")
    parser.add_argument(
        "--max-any",
        type=int,
        default=None,
        help="Fail (exit 1) if total violations exceed this threshold",
    )
    args = parser.parse_args()

    # Gather results from every requested directory that exists.
    scan_results: list[dict[str, Any]] = []
    for dirname in args.dirs:
        root = Path(dirname)
        if not root.exists():
            print(f"WARNING: {dirname} does not exist, skipping", file=sys.stderr)
            continue
        scan_results.extend(scan_directory(root))

    report = generate_report(scan_results)
    print_human_summary(report)

    if args.json:
        out_path = Path(args.json)
        out_path.parent.mkdir(parents=True, exist_ok=True)
        # default=str so any non-JSON-native values don't crash the dump.
        out_path.write_text(
            json.dumps(report, indent=2, default=str),
            encoding="utf-8",
        )
        print(f" JSON report written to {args.json}")

    # Ratchet: non-zero exit lets CI fail the build on regressions.
    if args.max_any is not None:
        total = report["summary"]["total_any_patterns"]
        if total > args.max_any:
            print(
                f"\n❌ RATCHET FAILED: {total} violations exceed "
                f"threshold of {args.max_any}",
                file=sys.stderr,
            )
            sys.exit(1)
        else:
            print(
                f"\n✅ RATCHET OK: {total} violations within "
                f"threshold of {args.max_any}",
            )
353
354
# Script entry point: run the audit when invoked directly.
if __name__ == "__main__":
    main()