gabriel / muse public
ast_parser.py python
934 lines 32.3 KB
062ae392 feat: code-domain semantic commands + code tour de force demo (#54) Gabriel Cardona <cgcardona@gmail.com> 5d ago
1 """AST parsing and symbol extraction for the code domain plugin.
2
3 This module provides the :class:`LanguageAdapter` protocol and concrete
4 adapters for parsing source files into :type:`SymbolTree` structures.
5
6 Language support matrix
7 -----------------------
8 - **Python** (``*.py``, ``*.pyi``): Full AST-based extraction using the
9 stdlib :mod:`ast` module. Content IDs are hashes of normalized (unparsed)
10 AST text — insensitive to whitespace, comments, and formatting.
11 - **JavaScript / TypeScript** (``*.js``, ``*.jsx``, ``*.mjs``, ``*.cjs``,
12 ``*.ts``, ``*.tsx``): tree-sitter based.
13 - **Go** (``*.go``): tree-sitter based. Method qualified names carry the
14 receiver type (e.g. ``Dog.Bark``).
15 - **Rust** (``*.rs``): tree-sitter based. Functions inside ``impl`` blocks
16 are qualified with the implementing type (e.g. ``Dog.bark``).
17 - **Java** (``*.java``), **C#** (``*.cs``): tree-sitter based.
18 - **C** (``*.c``, ``*.h``), **C++** (``*.cpp``, ``*.cc``, ``*.cxx``,
19 ``*.hpp``, ``*.hxx``): tree-sitter based.
20 - **Ruby** (``*.rb``), **Kotlin** (``*.kt``, ``*.kts``): tree-sitter based.
21
22 Symbol addresses
23 ----------------
24 Every extracted symbol is stored in the :type:`SymbolTree` dict under a
25 stable *address* key of the form::
26
27 "<workspace-relative-posix-path>::<qualified-symbol-name>"
28
29 Nested symbols (class methods) use dotted qualified names::
30
31 "src/models.py::User.save"
32 "src/models.py::User.__init__"
33
34 Top-level symbols::
35
36 "src/utils.py::calculate_total"
37 "src/utils.py::import::pathlib"
38
39 Content IDs and rename / move detection
40 ----------------------------------------
41 Each :class:`SymbolRecord` carries three hashes:
42
43 ``content_id``
44 SHA-256 of the full normalized AST of the symbol (includes name,
45 signature, and body). Two symbols are "the same thing" when their
46 ``content_id`` matches — regardless of where in the repo they live.
47
48 ``body_hash``
49 SHA-256 of the normalized body statements only (excludes the ``def``
50 line). Used to detect *renames*: same body, different name.
51
52 ``signature_id``
53 SHA-256 of ``"name(args) -> return"``. Used to detect *implementation-
54 only changes*: signature unchanged, body changed.
55
56 Extending
57 ---------
58 Implement :class:`LanguageAdapter` and append an instance to
59 :data:`ADAPTERS`. The adapter is selected by the file's suffix, with the
60 first matching adapter taking priority.
61 """
62 from __future__ import annotations
63
64 import ast
65 import hashlib
66 import importlib
67 import logging
68 import pathlib
69 import re
70 from typing import Literal, Protocol, TypedDict, runtime_checkable
71
72 from tree_sitter import Language, Node, Parser, Query, QueryCursor
73
74 logger = logging.getLogger(__name__)
75
76 # ---------------------------------------------------------------------------
77 # Symbol record types
78 # ---------------------------------------------------------------------------
79
#: Closed vocabulary of symbol kinds produced by all adapters. The
#: "*_method" kinds are used for definitions nested inside a class; the
#: "variable" and "import" kinds only ever appear at module top level
#: (see _extract_stmts).
SymbolKind = Literal[
    "function",
    "async_function",
    "class",
    "method",
    "async_method",
    "variable",
    "import",
]


class SymbolRecord(TypedDict):
    """Content-addressed record for a single named symbol in source code."""

    kind: SymbolKind
    name: str
    qualified_name: str  # "ClassName.method" for nested; flat name for top-level
    content_id: str  # SHA-256 of full normalized AST (name + signature + body)
    body_hash: str  # SHA-256 of body stmts only — for rename detection
    signature_id: str  # SHA-256 of "name(args)->return" — for impl-only changes
    lineno: int  # 1-based first line of the definition
    end_lineno: int  # 1-based last line of the definition (inclusive)


#: Flat map from symbol address ("<path>::<qualified-name>") to
#: :class:`SymbolRecord`. Nested symbols (methods) appear at their qualified
#: address alongside the parent class.
SymbolTree = dict[str, SymbolRecord]
108
109
110 # ---------------------------------------------------------------------------
111 # Language adapter protocol
112 # ---------------------------------------------------------------------------
113
114
@runtime_checkable
class LanguageAdapter(Protocol):
    """Protocol every language adapter must implement.

    Adapters are stateless. The same instance may be called concurrently
    for different files without synchronization.

    Adapter selection is by file suffix: :func:`adapter_for_path` scans
    :data:`ADAPTERS` in order and returns the first adapter whose
    :meth:`supported_extensions` set contains the lowercase suffix.
    """

    def supported_extensions(self) -> frozenset[str]:
        """Return the set of lowercase file suffixes this adapter handles."""
        ...

    def parse_symbols(self, source: bytes, file_path: str) -> SymbolTree:
        """Extract the symbol tree from raw source bytes.

        Args:
            source: Raw bytes of the source file.
            file_path: Workspace-relative POSIX path — used to build the
                symbol address prefix.

        Returns:
            A :type:`SymbolTree` mapping symbol addresses to
            :class:`SymbolRecord` dicts. Returns an empty dict on parse
            errors so that the caller can fall through to file-level ops.
        """
        ...

    def file_content_id(self, source: bytes) -> str:
        """Return a stable content identifier for the whole file.

        For AST-capable adapters: hash of the normalized (unparsed) module
        AST — insensitive to formatting and comments.
        For non-AST adapters: SHA-256 of raw bytes.

        Args:
            source: Raw bytes of the file.

        Returns:
            Hex-encoded SHA-256 digest.
        """
        ...
156
157
158 # ---------------------------------------------------------------------------
159 # Helpers
160 # ---------------------------------------------------------------------------
161
162
163 def _sha256(text: str) -> str:
164 return hashlib.sha256(text.encode("utf-8", errors="replace")).hexdigest()
165
166
167 def _sha256_bytes(data: bytes) -> str:
168 return hashlib.sha256(data).hexdigest()
169
170
171 # ---------------------------------------------------------------------------
172 # Python adapter
173 # ---------------------------------------------------------------------------
174
175
class PythonAdapter:
    """Python language adapter — AST-based, zero external dependencies.

    Uses :func:`ast.parse` for parsing and :func:`ast.unparse` for
    normalization. The result is a deterministic, whitespace-insensitive
    representation that strips comments and normalizes indentation.

    ``ast.unparse`` is available since Python 3.9; Muse requires 3.12.
    """

    def supported_extensions(self) -> frozenset[str]:
        """Return the Python source suffixes this adapter claims."""
        return frozenset({".py", ".pyi"})

    def parse_symbols(self, source: bytes, file_path: str) -> SymbolTree:
        """Extract top-level and class-nested symbols from *source*.

        Returns an empty tree when the file cannot be parsed so the
        caller can fall back to file-level operations.
        """
        try:
            tree = ast.parse(source, filename=file_path)
        except (SyntaxError, ValueError):
            # SyntaxError: malformed code. ValueError: ast.parse rejects
            # sources containing null bytes — treat both as unparseable
            # rather than crashing on binary-ish *.py files.
            return {}
        symbols: SymbolTree = {}
        _extract_stmts(tree.body, file_path, "", symbols)
        return symbols

    def file_content_id(self, source: bytes) -> str:
        """Hash of the normalized module AST; raw-bytes hash if unparseable."""
        try:
            tree = ast.parse(source)
            return _sha256(ast.unparse(tree))
        except (SyntaxError, ValueError):
            # Same rationale as parse_symbols: null bytes raise ValueError.
            return _sha256_bytes(source)
204
205
206 # ---------------------------------------------------------------------------
207 # AST extraction helpers (module-level so they can be tested independently)
208 # ---------------------------------------------------------------------------
209
210
def _extract_stmts(
    stmts: list[ast.stmt],
    file_path: str,
    class_prefix: str,
    out: SymbolTree,
) -> None:
    """Walk *stmts* depth-first and record every named symbol into *out*.

    Args:
        stmts: Body of an :class:`ast.Module` or :class:`ast.ClassDef`.
        file_path: Workspace-relative POSIX path used as the address prefix.
        class_prefix: Dotted class path (e.g. ``"MyClass."``) for nested
            symbols; empty string at module level.
        out: Accumulator mapping addresses to records — mutated in place.
    """
    for stmt in stmts:
        if isinstance(stmt, (ast.FunctionDef, ast.AsyncFunctionDef)):
            is_async = isinstance(stmt, ast.AsyncFunctionDef)
            if class_prefix:
                kind: SymbolKind = "async_method" if is_async else "method"
            else:
                kind = "async_function" if is_async else "function"
            qualified = class_prefix + stmt.name
            out[f"{file_path}::{qualified}"] = _make_func_record(
                stmt, stmt.name, qualified, kind
            )
        elif isinstance(stmt, ast.ClassDef):
            qualified = class_prefix + stmt.name
            out[f"{file_path}::{qualified}"] = _make_class_record(stmt, qualified)
            # Recurse so methods land at "Class.method" addresses.
            _extract_stmts(stmt.body, file_path, qualified + ".", out)
        elif class_prefix:
            # Class-level assignments and imports are not recorded on their
            # own — they are folded into the parent class's content_id.
            continue
        elif isinstance(stmt, (ast.Assign, ast.AnnAssign)):
            for var_name in _assignment_names(stmt):
                out[f"{file_path}::{var_name}"] = _make_var_record(stmt, var_name)
        elif isinstance(stmt, (ast.Import, ast.ImportFrom)):
            for mod_name in _import_names(stmt):
                out[f"{file_path}::import::{mod_name}"] = _make_import_record(
                    stmt, mod_name
                )
255
256
def _make_func_record(
    node: ast.FunctionDef | ast.AsyncFunctionDef,
    name: str,
    qualified_name: str,
    kind: SymbolKind,
) -> SymbolRecord:
    """Build a :class:`SymbolRecord` for one function or method definition.

    ``content_id`` hashes the whole normalized definition; ``body_hash``
    hashes the body statements alone (rename detection); ``signature_id``
    hashes the ``name(args)->return`` string (impl-only change detection).
    """
    body_parts = [ast.unparse(stmt) for stmt in node.body]
    returns = ast.unparse(node.returns) if node.returns else ""
    signature = f"{name}({ast.unparse(node.args)})->{returns}"
    return SymbolRecord(
        kind=kind,
        name=name,
        qualified_name=qualified_name,
        content_id=_sha256(ast.unparse(node)),
        body_hash=_sha256("\n".join(body_parts)),
        signature_id=_sha256(signature),
        lineno=node.lineno,
        end_lineno=node.end_lineno or node.lineno,
    )
277
278
def _make_class_record(node: ast.ClassDef, qualified_name: str) -> SymbolRecord:
    """Build a :class:`SymbolRecord` for one class definition.

    ``body_hash`` covers only the class *structure* — bases plus the sorted
    method names — so editing a method body does not disturb the class
    record; methods carry their own records.
    """
    bases = ", ".join(ast.unparse(b) for b in node.bases) if node.bases else ""
    methods = sorted(
        stmt.name
        for stmt in node.body
        if isinstance(stmt, (ast.FunctionDef, ast.AsyncFunctionDef))
    )
    if node.bases:
        header = f"class {node.name}({bases})"
    else:
        header = f"class {node.name}"
    return SymbolRecord(
        kind="class",
        name=node.name,
        qualified_name=qualified_name,
        content_id=_sha256(ast.unparse(node)),
        body_hash=_sha256(f"class {node.name}({bases}):{methods}"),
        signature_id=_sha256(header),
        lineno=node.lineno,
        end_lineno=node.end_lineno or node.lineno,
    )
301
302
def _make_var_record(node: ast.Assign | ast.AnnAssign, name: str) -> SymbolRecord:
    """Build a :class:`SymbolRecord` for one top-level assignment target.

    A variable has no separable body or signature, so ``content_id`` and
    ``body_hash`` are both the hash of the whole normalized statement.
    """
    digest = _sha256(ast.unparse(node))
    return SymbolRecord(
        kind="variable",
        name=name,
        qualified_name=name,
        content_id=digest,
        body_hash=digest,
        signature_id=_sha256(name),
        lineno=node.lineno,
        end_lineno=node.end_lineno or node.lineno,
    )
315
316
def _make_import_record(
    node: ast.Import | ast.ImportFrom, name: str
) -> SymbolRecord:
    """Build a :class:`SymbolRecord` for one name bound by an import."""
    digest = _sha256(ast.unparse(node))
    return SymbolRecord(
        kind="import",
        name=name,
        qualified_name=f"import::{name}",
        content_id=digest,
        body_hash=digest,
        signature_id=_sha256(name),
        lineno=node.lineno,
        end_lineno=node.lineno,
    )
331
332
333 def _assignment_names(node: ast.Assign | ast.AnnAssign) -> list[str]:
334 if isinstance(node, ast.Assign):
335 return [t.id for t in node.targets if isinstance(t, ast.Name)]
336 if isinstance(node.target, ast.Name):
337 return [node.target.id]
338 return []
339
340
341 def _import_names(node: ast.Import | ast.ImportFrom) -> list[str]:
342 if isinstance(node, ast.Import):
343 return [a.asname or a.name for a in node.names]
344 # ImportFrom
345 if node.names and node.names[0].name == "*":
346 return [f"*:{node.module or '?'}"]
347 return [a.asname or a.name for a in node.names]
348
349
350 # ---------------------------------------------------------------------------
351 # Fallback adapter — file-level identity only, no symbol extraction
352 # ---------------------------------------------------------------------------
353
354
class FallbackAdapter:
    """Adapter of last resort — file-level identity only.

    Serves suffixes with no AST or tree-sitter support: yields no symbols,
    and identifies file content by a hash of the raw bytes.
    """

    def __init__(self, extensions: frozenset[str]) -> None:
        # The suffix set is injected so one class covers both "known but
        # unparsed" languages and the catch-all empty-set instance.
        self._extensions = extensions

    def supported_extensions(self) -> frozenset[str]:
        return self._extensions

    def parse_symbols(self, source: bytes, file_path: str) -> SymbolTree:  # noqa: ARG002
        # No parser available — callers fall back to whole-file diffing.
        return {}

    def file_content_id(self, source: bytes) -> str:
        return _sha256_bytes(source)
373
374
375 # ---------------------------------------------------------------------------
376 # tree-sitter adapter — shared infrastructure for all non-Python languages
377 # ---------------------------------------------------------------------------
378
379 _WS_RE: re.Pattern[bytes] = re.compile(rb"\s+")
380
381
382 def _norm_ws(src: bytes) -> bytes:
383 """Collapse all whitespace runs to a single space and strip the result."""
384 return _WS_RE.sub(b" ", src).strip()
385
386
387 def _node_text(src: bytes, node: Node) -> bytes:
388 """Extract the raw source bytes covered by a tree-sitter node."""
389 return src[node.start_byte : node.end_byte]
390
391
392 def _class_name_from(src: bytes, node: Node, field: str) -> str | None:
393 """Extract a class/struct name from a parent CST node.
394
395 Tries ``child_by_field_name(field)`` first (covers Java, C#, C++, Rust).
396 Falls back to the first ``identifier``-typed named child to handle
397 languages like Kotlin where the class name is not a named field.
398 """
399 child = node.child_by_field_name(field)
400 if child is None:
401 for c in node.named_children:
402 if c.type == "identifier":
403 child = c
404 break
405 if child is None:
406 return None
407 return _node_text(src, child).decode("utf-8", errors="replace")
408
409
410 def _qualified_name_ts(
411 src: bytes,
412 sym_node: Node,
413 name: str,
414 class_node_types: frozenset[str],
415 class_name_field: str,
416 ) -> str:
417 """Walk the CST parent chain to build a dotted qualified name.
418
419 For a method ``bark`` inside ``class Dog``, returns ``"Dog.bark"``.
420 For a top-level function, returns just ``"standalone"``.
421 """
422 parts = [name]
423 parent = sym_node.parent
424 while parent is not None:
425 if parent.type in class_node_types:
426 class_name = _class_name_from(src, parent, class_name_field)
427 if class_name:
428 parts.insert(0, class_name)
429 parent = parent.parent
430 return ".".join(parts)
431
432
class LangSpec(TypedDict):
    """Per-language tree-sitter configuration consumed by :class:`TreeSitterAdapter`."""

    extensions: frozenset[str]  # Lowercase suffixes this language claims
    module_name: str  # Python import name, e.g. ``"tree_sitter_javascript"``
    lang_func: str  # Attribute on the module returning the raw capsule
    query_str: str  # tree-sitter S-expr query — must capture ``@sym`` and ``@name``
    kind_map: dict[str, SymbolKind]  # CST node type → SymbolKind
    class_node_types: frozenset[str]  # Ancestor types that scope methods
    class_name_field: str  # Field name for the class name (e.g. ``"name"`` or ``"type"``)
    receiver_capture: str  # Capture name for Go-style method receivers; ``""`` to skip
444
445
class TreeSitterAdapter:
    """Implements :class:`LanguageAdapter` using tree-sitter for real CST parsing.

    tree-sitter is the same parsing technology used by GitHub Copilot, VS Code,
    Neovim, and Zed. It produces a concrete syntax tree from every source file,
    even if the file has syntax errors — making it suitable for real-world repos
    that may contain partially-written code.

    Parsing is error-tolerant: individual file failures are logged at DEBUG
    level and return an empty :type:`SymbolTree` so the caller falls back to
    file-level diffing rather than crashing.
    """

    def __init__(
        self,
        spec: LangSpec,
        parser: Parser,
        language: Language,
    ) -> None:
        """Store the spec and compile the symbol query once per adapter."""
        self._spec = spec
        self._parser = parser
        self._language = language
        # Compiled here, once; reused for every file this adapter parses.
        self._query = Query(language, spec["query_str"])

    def supported_extensions(self) -> frozenset[str]:
        """Return the suffix set declared by the language spec."""
        return self._spec["extensions"]

    def parse_symbols(self, source: bytes, file_path: str) -> SymbolTree:
        """Run the spec's query over the CST and build one record per match.

        Each match must bind ``@sym`` (the definition node) and ``@name``
        (its identifier); matches missing either capture are skipped.
        """
        try:
            tree = self._parser.parse(source)
            cursor = QueryCursor(self._query)
            symbols: SymbolTree = {}
            recv_cap = self._spec["receiver_capture"]

            for _pat, caps in cursor.matches(tree.root_node):
                sym_list = caps.get("sym", [])
                name_list = caps.get("name", [])
                if not sym_list or not name_list:
                    continue
                sym_node = sym_list[0]
                name_node = name_list[0]

                name_txt = _node_text(source, name_node).decode(
                    "utf-8", errors="replace"
                )
                # Node types absent from kind_map default to "function"
                # rather than failing the whole file.
                kind = self._spec["kind_map"].get(sym_node.type, "function")

                # Build qualified name — walking ancestor chain for methods.
                qualified = _qualified_name_ts(
                    source,
                    sym_node,
                    name_txt,
                    self._spec["class_node_types"],
                    self._spec["class_name_field"],
                )

                # Go-style receiver prefix: (d *Dog) → "Dog.Bark"
                if recv_cap:
                    recv_list = caps.get(recv_cap, [])
                    if recv_list:
                        recv_txt = (
                            _node_text(source, recv_list[0])
                            .decode("utf-8", errors="replace")
                            .lstrip("*")
                            .strip()
                        )
                        if recv_txt:
                            qualified = f"{recv_txt}.{qualified}"

                addr = f"{file_path}::{qualified}"
                node_bytes = _node_text(source, sym_node)
                name_bytes = _node_text(source, name_node)
                # Substitute the name with a placeholder to isolate the body
                # from the identifier — two symbols with the same body but
                # different names share the same body_hash, signalling a rename.
                body_bytes = node_bytes.replace(name_bytes, b"\xfe", 1)

                # The parameter-list field name varies by grammar; try the
                # spellings used across the registered languages.
                params_node = (
                    sym_node.child_by_field_name("parameters")
                    or sym_node.child_by_field_name("formal_parameters")
                    or sym_node.child_by_field_name("function_value_parameters")
                )
                params_bytes = (
                    _node_text(source, params_node)
                    if params_node is not None
                    else b""
                )

                symbols[addr] = SymbolRecord(
                    kind=kind,
                    name=name_txt,
                    qualified_name=qualified,
                    content_id=_sha256_bytes(_norm_ws(node_bytes)),
                    body_hash=_sha256_bytes(_norm_ws(body_bytes)),
                    signature_id=_sha256_bytes(_norm_ws(name_bytes + params_bytes)),
                    # tree-sitter rows are 0-based; records use 1-based lines.
                    lineno=sym_node.start_point[0] + 1,
                    end_lineno=sym_node.end_point[0] + 1,
                )
            return symbols
        except Exception as exc:  # noqa: BLE001
            logger.debug("tree-sitter parse error in %s: %s", file_path, exc)
            return {}

    def file_content_id(self, source: bytes) -> str:
        """Whitespace-normalised SHA-256 of the source — insensitive to reformatting."""
        return _sha256_bytes(_norm_ws(source))
552
553
def _make_ts_adapter(spec: LangSpec) -> LanguageAdapter:
    """Instantiate a :class:`TreeSitterAdapter` for *spec*, degrading gracefully.

    The grammar import happens here — not at module import time — so that a
    missing or incompatible grammar package downgrades just this language to
    :class:`FallbackAdapter` instead of preventing the plugin from loading.
    """
    try:
        grammar_mod = importlib.import_module(spec["module_name"])
        capsule = getattr(grammar_mod, spec["lang_func"])()
        language = Language(capsule)
        return TreeSitterAdapter(spec, Parser(language), language)
    except Exception as exc:  # noqa: BLE001
        logger.debug(
            "tree-sitter grammar %s.%s unavailable — using file-level fallback: %s",
            spec["module_name"],
            spec["lang_func"],
            exc,
        )
        return FallbackAdapter(spec["extensions"])
575
576
577 # ---------------------------------------------------------------------------
578 # Per-language tree-sitter specs
579 # ---------------------------------------------------------------------------
580
# --- JavaScript (.js/.jsx/.mjs/.cjs) ---------------------------------------
_JS_SPEC: LangSpec = {
    "extensions": frozenset({".js", ".jsx", ".mjs", ".cjs"}),
    "module_name": "tree_sitter_javascript",
    "lang_func": "language",
    # Note: tree-sitter-javascript uses "class" for both class declarations and
    # named class expressions. "class_expression" is not a valid node type.
    "query_str": (
        "(function_declaration name: (identifier) @name) @sym\n"
        "(function_expression name: (identifier) @name) @sym\n"
        "(generator_function_declaration name: (identifier) @name) @sym\n"
        "(class_declaration name: (identifier) @name) @sym\n"
        "(class name: (identifier) @name) @sym\n"
        "(method_definition name: (property_identifier) @name) @sym"
    ),
    "kind_map": {
        "function_declaration": "function",
        "function_expression": "function",
        "generator_function_declaration": "function",
        "class_declaration": "class",
        "class": "class",
        "method_definition": "method",
    },
    "class_node_types": frozenset({"class_declaration", "class"}),
    "class_name_field": "name",
    "receiver_capture": "",
}

# Query shared by the TypeScript and TSX specs below.
_TS_QUERY = (
    # TypeScript uses type_identifier (not identifier) for class names.
    "(function_declaration name: (identifier) @name) @sym\n"
    "(function_expression name: (identifier) @name) @sym\n"
    "(generator_function_declaration name: (identifier) @name) @sym\n"
    "(class_declaration name: (type_identifier) @name) @sym\n"
    "(class name: (type_identifier) @name) @sym\n"
    "(abstract_class_declaration name: (type_identifier) @name) @sym\n"
    "(method_definition name: (property_identifier) @name) @sym\n"
    "(interface_declaration name: (type_identifier) @name) @sym\n"
    "(type_alias_declaration name: (type_identifier) @name) @sym\n"
    "(enum_declaration name: (identifier) @name) @sym"
)

# Interfaces and enums map to "class" (the closest SymbolKind); type
# aliases map to "variable".
_TS_KIND_MAP: dict[str, SymbolKind] = {
    "function_declaration": "function",
    "function_expression": "function",
    "generator_function_declaration": "function",
    "class_declaration": "class",
    "class": "class",
    "abstract_class_declaration": "class",
    "method_definition": "method",
    "interface_declaration": "class",
    "type_alias_declaration": "variable",
    "enum_declaration": "class",
}

_TS_CLASS_NODES: frozenset[str] = frozenset(
    {"class_declaration", "class", "abstract_class_declaration"}
)

# --- TypeScript (.ts) ------------------------------------------------------
_TS_SPEC: LangSpec = {
    "extensions": frozenset({".ts"}),
    "module_name": "tree_sitter_typescript",
    "lang_func": "language_typescript",
    "query_str": _TS_QUERY,
    "kind_map": _TS_KIND_MAP,
    "class_node_types": _TS_CLASS_NODES,
    "class_name_field": "name",
    "receiver_capture": "",
}

# --- TSX (.tsx) — same grammar package, different language function --------
_TSX_SPEC: LangSpec = {
    "extensions": frozenset({".tsx"}),
    "module_name": "tree_sitter_typescript",
    "lang_func": "language_tsx",
    "query_str": _TS_QUERY,
    "kind_map": _TS_KIND_MAP,
    "class_node_types": _TS_CLASS_NODES,
    "class_name_field": "name",
    "receiver_capture": "",
}
660
# --- Go (.go) — methods are qualified via the @recv receiver capture -------
_GO_SPEC: LangSpec = {
    "extensions": frozenset({".go"}),
    "module_name": "tree_sitter_go",
    "lang_func": "language",
    "query_str": (
        "(function_declaration name: (identifier) @name) @sym\n"
        "(method_declaration\n"
        " receiver: (parameter_list\n"
        " (parameter_declaration type: _ @recv))\n"
        " name: (field_identifier) @name) @sym\n"
        "(type_spec name: (type_identifier) @name) @sym"
    ),
    "kind_map": {
        "function_declaration": "function",
        "method_declaration": "method",
        "type_spec": "class",
    },
    # No ancestor scoping: Go has no nested classes; receiver types come
    # from the @recv capture instead.
    "class_node_types": frozenset(),
    "class_name_field": "name",
    "receiver_capture": "recv",
}

# --- Rust (.rs) ------------------------------------------------------------
_RUST_SPEC: LangSpec = {
    "extensions": frozenset({".rs"}),
    "module_name": "tree_sitter_rust",
    "lang_func": "language",
    "query_str": (
        "(function_item name: (identifier) @name) @sym\n"
        "(struct_item name: (type_identifier) @name) @sym\n"
        "(enum_item name: (type_identifier) @name) @sym\n"
        "(trait_item name: (type_identifier) @name) @sym"
    ),
    "kind_map": {
        "function_item": "function",
        "struct_item": "class",
        "enum_item": "class",
        "trait_item": "class",
    },
    # impl_item scopes methods; its implementing type is in the "type" field.
    "class_node_types": frozenset({"impl_item"}),
    "class_name_field": "type",
    "receiver_capture": "",
}

# --- Java (.java) ----------------------------------------------------------
_JAVA_SPEC: LangSpec = {
    "extensions": frozenset({".java"}),
    "module_name": "tree_sitter_java",
    "lang_func": "language",
    "query_str": (
        "(method_declaration name: (identifier) @name) @sym\n"
        "(constructor_declaration name: (identifier) @name) @sym\n"
        "(class_declaration name: (identifier) @name) @sym\n"
        "(interface_declaration name: (identifier) @name) @sym\n"
        "(enum_declaration name: (identifier) @name) @sym"
    ),
    "kind_map": {
        "method_declaration": "method",
        "constructor_declaration": "function",
        "class_declaration": "class",
        "interface_declaration": "class",
        "enum_declaration": "class",
    },
    "class_node_types": frozenset({"class_declaration", "interface_declaration"}),
    "class_name_field": "name",
    "receiver_capture": "",
}
727
# --- C (.c/.h) — free functions only; no class scoping -----------------------
_C_SPEC: LangSpec = {
    "extensions": frozenset({".c", ".h"}),
    "module_name": "tree_sitter_c",
    "lang_func": "language",
    "query_str": (
        "(function_definition\n"
        " declarator: (function_declarator\n"
        " declarator: (identifier) @name)) @sym"
    ),
    "kind_map": {"function_definition": "function"},
    "class_node_types": frozenset(),
    "class_name_field": "name",
    "receiver_capture": "",
}

# --- C++ (.cpp/.cc/.cxx/.hpp/.hxx) -----------------------------------------
_CPP_SPEC: LangSpec = {
    "extensions": frozenset({".cpp", ".cc", ".cxx", ".hpp", ".hxx"}),
    "module_name": "tree_sitter_cpp",
    "lang_func": "language",
    "query_str": (
        "(function_definition\n"
        " declarator: (function_declarator\n"
        " declarator: (identifier) @name)) @sym\n"
        "(class_specifier name: (type_identifier) @name) @sym\n"
        "(struct_specifier name: (type_identifier) @name) @sym"
    ),
    "kind_map": {
        "function_definition": "function",
        "class_specifier": "class",
        "struct_specifier": "class",
    },
    "class_node_types": frozenset({"class_specifier", "struct_specifier"}),
    "class_name_field": "name",
    "receiver_capture": "",
}

# --- C# (.cs) ---------------------------------------------------------------
_CS_SPEC: LangSpec = {
    "extensions": frozenset({".cs"}),
    "module_name": "tree_sitter_c_sharp",
    "lang_func": "language",
    "query_str": (
        "(method_declaration name: (identifier) @name) @sym\n"
        "(constructor_declaration name: (identifier) @name) @sym\n"
        "(class_declaration name: (identifier) @name) @sym\n"
        "(interface_declaration name: (identifier) @name) @sym\n"
        "(enum_declaration name: (identifier) @name) @sym\n"
        "(struct_declaration name: (identifier) @name) @sym"
    ),
    "kind_map": {
        "method_declaration": "method",
        "constructor_declaration": "function",
        "class_declaration": "class",
        "interface_declaration": "class",
        "enum_declaration": "class",
        "struct_declaration": "class",
    },
    "class_node_types": frozenset(
        {"class_declaration", "interface_declaration", "struct_declaration"}
    ),
    "class_name_field": "name",
    "receiver_capture": "",
}
790
# --- Ruby (.rb) — modules scope methods like classes do ----------------------
_RUBY_SPEC: LangSpec = {
    "extensions": frozenset({".rb"}),
    "module_name": "tree_sitter_ruby",
    "lang_func": "language",
    "query_str": (
        "(method name: (identifier) @name) @sym\n"
        "(singleton_method name: (identifier) @name) @sym\n"
        "(class name: (constant) @name) @sym\n"
        "(module name: (constant) @name) @sym"
    ),
    "kind_map": {
        "method": "method",
        "singleton_method": "method",
        "class": "class",
        "module": "class",
    },
    "class_node_types": frozenset({"class", "module"}),
    "class_name_field": "name",
    "receiver_capture": "",
}

# --- Kotlin (.kt/.kts) -------------------------------------------------------
_KT_SPEC: LangSpec = {
    "extensions": frozenset({".kt", ".kts"}),
    "module_name": "tree_sitter_kotlin",
    "lang_func": "language",
    "query_str": (
        "(function_declaration (identifier) @name) @sym\n"
        "(class_declaration (identifier) @name) @sym"
    ),
    "kind_map": {
        "function_declaration": "function",
        "class_declaration": "class",
    },
    # Kotlin methods are function_declaration nodes inside class_body.
    # child_by_field_name("name") is None for Kotlin classes; _class_name_from
    # falls back to the first identifier-typed named child automatically.
    "class_node_types": frozenset({"class_declaration"}),
    "class_name_field": "name",
    "receiver_capture": "",
}

#: All tree-sitter language specs, loaded in registration order.
_TS_LANG_SPECS: list[LangSpec] = [
    _JS_SPEC,
    _TS_SPEC,
    _TSX_SPEC,
    _GO_SPEC,
    _RUST_SPEC,
    _JAVA_SPEC,
    _C_SPEC,
    _CPP_SPEC,
    _CS_SPEC,
    _RUBY_SPEC,
    _KT_SPEC,
]
846
847
848 # ---------------------------------------------------------------------------
849 # Adapter registry and public helpers
850 # ---------------------------------------------------------------------------
851
_PYTHON = PythonAdapter()
_FALLBACK = FallbackAdapter(frozenset())

#: Adapters checked in order; first match wins.
ADAPTERS: list[LanguageAdapter] = [_PYTHON]

# Register one tree-sitter adapter per language spec; _make_ts_adapter
# degrades to FallbackAdapter when a grammar package isn't installed.
ADAPTERS.extend(_make_ts_adapter(spec) for spec in _TS_LANG_SPECS)

#: File extensions that receive semantic (AST-based) symbol extraction.
SEMANTIC_EXTENSIONS: frozenset[str] = frozenset().union(
    *(
        adapter.supported_extensions()
        for adapter in ADAPTERS
        if not isinstance(adapter, FallbackAdapter)
    )
)
867
#: Source extensions tracked as first-class files (raw-bytes identity for
#: languages without an AST adapter, AST identity for Python).
SOURCE_EXTENSIONS: frozenset[str] = frozenset({
    ".py", ".pyi",
    ".ts", ".tsx", ".js", ".jsx", ".mjs", ".cjs",
    ".swift",
    ".go",
    ".rs",
    ".java",
    # .hxx added for parity with the C++ adapter's extension set.
    ".c", ".cpp", ".cc", ".cxx", ".h", ".hpp", ".hxx",
    ".rb",
    # .kts added for parity with the Kotlin adapter's extension set.
    ".kt", ".kts",
    ".cs",
    ".sh", ".bash", ".zsh",
    ".toml", ".yaml", ".yml", ".json", ".jsonc",
    ".md", ".rst", ".txt",
    ".css", ".scss", ".html",
    ".sql",
    ".proto",
    ".tf",
})
889
890
def adapter_for_path(file_path: str) -> LanguageAdapter:
    """Pick the registered adapter that claims *file_path*'s suffix.

    Adapters are consulted in :data:`ADAPTERS` order (first match wins);
    suffixes nobody claims get the shared catch-all fallback instance.

    Args:
        file_path: Workspace-relative POSIX path (e.g. ``"src/utils.py"``).

    Returns:
        The first adapter whose :meth:`~LanguageAdapter.supported_extensions`
        set contains the file's lowercase suffix; never ``None``.
    """
    suffix = pathlib.PurePosixPath(file_path).suffix.lower()
    return next(
        (adapter for adapter in ADAPTERS if suffix in adapter.supported_extensions()),
        _FALLBACK,
    )
909
910
def parse_symbols(source: bytes, file_path: str) -> SymbolTree:
    """Parse *source* with the adapter registered for *file_path*'s suffix.

    Args:
        source: Raw bytes of the source file.
        file_path: Workspace-relative POSIX path.

    Returns:
        A :type:`SymbolTree`; empty for unsupported or unparseable files.
    """
    adapter = adapter_for_path(file_path)
    return adapter.parse_symbols(source, file_path)
922
923
def file_content_id(source: bytes, file_path: str) -> str:
    """Compute the semantic content ID of *file_path* from its raw bytes.

    Args:
        source: Raw bytes of the file.
        file_path: Workspace-relative POSIX path.

    Returns:
        Hex-encoded SHA-256 digest — AST-normalized for Python,
        whitespace-normalized for tree-sitter languages, raw bytes for
        everything else.
    """
    adapter = adapter_for_path(file_path)
    return adapter.file_content_id(source)