ast_parser.py
python
| 1 | """AST parsing and symbol extraction for the code domain plugin. |
| 2 | |
| 3 | This module provides the :class:`LanguageAdapter` protocol and concrete |
| 4 | adapters for parsing source files into :type:`SymbolTree` structures. |
| 5 | |
| 6 | Language support matrix |
| 7 | ----------------------- |
| 8 | - **Python** (``*.py``, ``*.pyi``): Full AST-based extraction using the |
| 9 | stdlib :mod:`ast` module. Content IDs are hashes of normalized (unparsed) |
| 10 | AST text — insensitive to whitespace, comments, and formatting. |
| 11 | - **JavaScript / TypeScript** (``*.js``, ``*.jsx``, ``*.mjs``, ``*.cjs``, |
| 12 | ``*.ts``, ``*.tsx``): tree-sitter based. |
| 13 | - **Go** (``*.go``): tree-sitter based. Method qualified names carry the |
| 14 | receiver type (e.g. ``Dog.Bark``). |
| 15 | - **Rust** (``*.rs``): tree-sitter based. Functions inside ``impl`` blocks |
| 16 | are qualified with the implementing type (e.g. ``Dog.bark``). |
| 17 | - **Java** (``*.java``), **C#** (``*.cs``): tree-sitter based. |
| 18 | - **C** (``*.c``, ``*.h``), **C++** (``*.cpp``, ``*.cc``, ``*.cxx``, |
| 19 | ``*.hpp``, ``*.hxx``): tree-sitter based. |
| 20 | - **Ruby** (``*.rb``), **Kotlin** (``*.kt``, ``*.kts``): tree-sitter based. |
| 21 | |
| 22 | Symbol addresses |
| 23 | ---------------- |
| 24 | Every extracted symbol is stored in the :type:`SymbolTree` dict under a |
| 25 | stable *address* key of the form:: |
| 26 | |
| 27 | "<workspace-relative-posix-path>::<qualified-symbol-name>" |
| 28 | |
| 29 | Nested symbols (class methods) use dotted qualified names:: |
| 30 | |
| 31 | "src/models.py::User.save" |
| 32 | "src/models.py::User.__init__" |
| 33 | |
| 34 | Top-level symbols:: |
| 35 | |
| 36 | "src/utils.py::calculate_total" |
| 37 | "src/utils.py::import::pathlib" |
| 38 | |
| 39 | Content IDs and rename / move detection |
| 40 | ---------------------------------------- |
| 41 | Each :class:`SymbolRecord` carries three hashes: |
| 42 | |
| 43 | ``content_id`` |
| 44 | SHA-256 of the full normalized AST of the symbol (includes name, |
| 45 | signature, and body). Two symbols are "the same thing" when their |
| 46 | ``content_id`` matches — regardless of where in the repo they live. |
| 47 | |
| 48 | ``body_hash`` |
| 49 | SHA-256 of the normalized body statements only (excludes the ``def`` |
| 50 | line). Used to detect *renames*: same body, different name. |
| 51 | |
| 52 | ``signature_id`` |
| 53 | SHA-256 of ``"name(args) -> return"``. Used to detect *implementation- |
| 54 | only changes*: signature unchanged, body changed. |
| 55 | |
| 56 | Extending |
| 57 | --------- |
| 58 | Implement :class:`LanguageAdapter` and append an instance to |
| 59 | :data:`ADAPTERS`. The adapter is selected by the file's suffix, with the |
| 60 | first matching adapter taking priority. |
| 61 | """ |
| 62 | from __future__ import annotations |
| 63 | |
| 64 | import ast |
| 65 | import hashlib |
| 66 | import importlib |
| 67 | import logging |
| 68 | import pathlib |
| 69 | import re |
| 70 | from typing import Literal, Protocol, TypedDict, runtime_checkable |
| 71 | |
| 72 | from tree_sitter import Language, Node, Parser, Query, QueryCursor |
| 73 | |
| 74 | logger = logging.getLogger(__name__) |
| 75 | |
| 76 | # --------------------------------------------------------------------------- |
| 77 | # Symbol record types |
| 78 | # --------------------------------------------------------------------------- |
| 79 | |
# Closed set of categories a symbol record can carry.  "method" and
# "async_method" are assigned to functions found inside a class scope;
# "function" and "async_function" to the same constructs elsewhere.
SymbolKind = Literal[
    "function",
    "async_function",
    "class",
    "method",
    "async_method",
    "variable",
    "import",
]
| 89 | |
| 90 | |
class SymbolRecord(TypedDict):
    """Content-addressed record for a single named symbol in source code.

    The three hash fields are hex-encoded SHA-256 digests.  Comparing them
    between two revisions distinguishes an unchanged symbol (same
    ``content_id``), a rename (same ``body_hash``), and an
    implementation-only change (same ``signature_id``).
    """

    kind: SymbolKind
    name: str  # unqualified identifier as written in the source
    qualified_name: str  # "ClassName.method" for nested; flat name for top-level
    content_id: str  # SHA-256 of full normalized AST (name + signature + body)
    body_hash: str  # SHA-256 of body stmts only — for rename detection
    signature_id: str  # SHA-256 of "name(args)->return" — for impl-only changes
    lineno: int  # 1-based line on which the symbol starts
    end_lineno: int  # 1-based line on which the symbol ends
| 102 | |
| 103 | |
#: Flat map from symbol address to :class:`SymbolRecord`.
#: Addresses have the form ``"<file_path>::<qualified_name>"``.
#: Nested symbols (methods) appear at their qualified address alongside the
#: parent class.
SymbolTree = dict[str, SymbolRecord]
| 108 | |
| 109 | |
| 110 | # --------------------------------------------------------------------------- |
| 111 | # Language adapter protocol |
| 112 | # --------------------------------------------------------------------------- |
| 113 | |
| 114 | |
@runtime_checkable
class LanguageAdapter(Protocol):
    """Protocol every language adapter must implement.

    Adapters are stateless.  The same instance may be called concurrently
    for different files without synchronization.

    The protocol is ``runtime_checkable``: ``isinstance(obj, LanguageAdapter)``
    only verifies that the three methods exist, not their signatures.
    """

    def supported_extensions(self) -> frozenset[str]:
        """Return the set of lowercase file suffixes this adapter handles.

        Suffixes include the leading dot (e.g. ``".py"``).
        """
        ...

    def parse_symbols(self, source: bytes, file_path: str) -> SymbolTree:
        """Extract the symbol tree from raw source bytes.

        Args:
            source: Raw bytes of the source file.
            file_path: Workspace-relative POSIX path — used to build the
                symbol address prefix.

        Returns:
            A :type:`SymbolTree` mapping symbol addresses to
            :class:`SymbolRecord` dicts.  Returns an empty dict on parse
            errors so that the caller can fall through to file-level ops.
        """
        ...

    def file_content_id(self, source: bytes) -> str:
        """Return a stable content identifier for the whole file.

        For AST-capable adapters: hash of the normalized (unparsed) module
        AST — insensitive to formatting and comments.
        For non-AST adapters: SHA-256 of raw bytes.

        Args:
            source: Raw bytes of the file.

        Returns:
            Hex-encoded SHA-256 digest.
        """
        ...
| 156 | |
| 157 | |
| 158 | # --------------------------------------------------------------------------- |
| 159 | # Helpers |
| 160 | # --------------------------------------------------------------------------- |
| 161 | |
| 162 | |
| 163 | def _sha256(text: str) -> str: |
| 164 | return hashlib.sha256(text.encode("utf-8", errors="replace")).hexdigest() |
| 165 | |
| 166 | |
| 167 | def _sha256_bytes(data: bytes) -> str: |
| 168 | return hashlib.sha256(data).hexdigest() |
| 169 | |
| 170 | |
| 171 | # --------------------------------------------------------------------------- |
| 172 | # Python adapter |
| 173 | # --------------------------------------------------------------------------- |
| 174 | |
| 175 | |
class PythonAdapter:
    """Python language adapter — AST-based, zero external dependencies.

    Uses :func:`ast.parse` for parsing and :func:`ast.unparse` for
    normalization.  The result is a deterministic, whitespace-insensitive
    representation that strips comments and normalizes indentation.

    ``ast.unparse`` is available since Python 3.9; Muse requires 3.12.

    Both entry points degrade gracefully on source that cannot be analysed:
    they never propagate a parse failure to the caller.
    """

    #: Exceptions meaning "this source cannot be analysed as Python".
    #: ``SyntaxError`` covers invalid code and undecodable bytes;
    #: ``ValueError`` is how some interpreter versions report NUL bytes;
    #: ``RecursionError`` is raised by ``ast.parse`` / ``ast.unparse`` for
    #: very deeply nested expressions.
    _PARSE_ERRORS = (SyntaxError, ValueError, RecursionError)

    def supported_extensions(self) -> frozenset[str]:
        """Return the suffixes handled by this adapter (``.py`` and ``.pyi``)."""
        return frozenset({".py", ".pyi"})

    def parse_symbols(self, source: bytes, file_path: str) -> SymbolTree:
        """Extract top-level and class-nested symbols from Python *source*.

        Args:
            source: Raw bytes of the Python file.
            file_path: Workspace-relative POSIX path used as address prefix.

        Returns:
            The populated symbol tree, or an empty dict when the source
            cannot be parsed or normalized, so the caller can fall back to
            file-level tracking.
        """
        symbols: SymbolTree = {}
        try:
            tree = ast.parse(source, filename=file_path)
            # Extraction calls ast.unparse, which can itself hit the
            # recursion limit, so it stays inside the guarded region.
            _extract_stmts(tree.body, file_path, "", symbols)
        except self._PARSE_ERRORS:
            # Discard any partially collected symbols: a half-populated tree
            # would look like mass deletions to a differ.
            return {}
        return symbols

    def file_content_id(self, source: bytes) -> str:
        """Return a formatting-insensitive content hash for the whole file.

        Args:
            source: Raw bytes of the Python file.

        Returns:
            SHA-256 of the unparsed module AST, or SHA-256 of the raw bytes
            when the source cannot be parsed or normalized.
        """
        try:
            tree = ast.parse(source)
            return _sha256(ast.unparse(tree))
        except self._PARSE_ERRORS:
            return _sha256_bytes(source)
| 204 | |
| 205 | |
| 206 | # --------------------------------------------------------------------------- |
| 207 | # AST extraction helpers (module-level so they can be tested independently) |
| 208 | # --------------------------------------------------------------------------- |
| 209 | |
| 210 | |
| 211 | def _extract_stmts( |
| 212 | stmts: list[ast.stmt], |
| 213 | file_path: str, |
| 214 | class_prefix: str, |
| 215 | out: SymbolTree, |
| 216 | ) -> None: |
| 217 | """Recursively walk *stmts* and populate *out* with symbol records. |
| 218 | |
| 219 | Args: |
| 220 | stmts: Statement list from an :class:`ast.Module` or |
| 221 | :class:`ast.ClassDef` body. |
| 222 | file_path: Workspace-relative POSIX path — used as address prefix. |
| 223 | class_prefix: Dotted class path for methods (e.g. ``"MyClass."``). |
| 224 | Empty string at top-level. |
| 225 | out: Accumulator — modified in place. |
| 226 | """ |
| 227 | for node in stmts: |
| 228 | if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)): |
| 229 | is_async = isinstance(node, ast.AsyncFunctionDef) |
| 230 | if class_prefix: |
| 231 | kind: SymbolKind = "async_method" if is_async else "method" |
| 232 | else: |
| 233 | kind = "async_function" if is_async else "function" |
| 234 | qualified = f"{class_prefix}{node.name}" |
| 235 | addr = f"{file_path}::{qualified}" |
| 236 | out[addr] = _make_func_record(node, node.name, qualified, kind) |
| 237 | |
| 238 | elif isinstance(node, ast.ClassDef): |
| 239 | qualified = f"{class_prefix}{node.name}" |
| 240 | addr = f"{file_path}::{qualified}" |
| 241 | out[addr] = _make_class_record(node, qualified) |
| 242 | _extract_stmts(node.body, file_path, f"{qualified}.", out) |
| 243 | |
| 244 | elif isinstance(node, (ast.Assign, ast.AnnAssign)) and not class_prefix: |
| 245 | # Only top-level assignments — class-level attributes are captured |
| 246 | # as part of the parent class's content_id. |
| 247 | for name in _assignment_names(node): |
| 248 | addr = f"{file_path}::{name}" |
| 249 | out[addr] = _make_var_record(node, name) |
| 250 | |
| 251 | elif isinstance(node, (ast.Import, ast.ImportFrom)) and not class_prefix: |
| 252 | for name in _import_names(node): |
| 253 | addr = f"{file_path}::import::{name}" |
| 254 | out[addr] = _make_import_record(node, name) |
| 255 | |
| 256 | |
def _make_func_record(
    node: ast.FunctionDef | ast.AsyncFunctionDef,
    name: str,
    qualified_name: str,
    kind: SymbolKind,
) -> SymbolRecord:
    """Build the :class:`SymbolRecord` for a function or method definition.

    Args:
        node: The (async) function definition node.
        name: Unqualified function name.
        qualified_name: Dotted name including any enclosing classes.
        kind: Symbol kind to store on the record.

    Returns:
        A record whose ``content_id`` hashes the whole unparsed definition,
        ``body_hash`` hashes the unparsed body statements joined by
        newlines, and ``signature_id`` hashes ``"name(args)->return"``.
    """
    body_text = "\n".join(map(ast.unparse, node.body))
    params_text = ast.unparse(node.args)
    # A missing return annotation contributes an empty string.
    return_text = ast.unparse(node.returns) if node.returns else ""
    signature_text = f"{name}({params_text})->{return_text}"
    last_line = node.end_lineno or node.lineno
    return SymbolRecord(
        kind=kind,
        name=name,
        qualified_name=qualified_name,
        content_id=_sha256(ast.unparse(node)),
        body_hash=_sha256(body_text),
        signature_id=_sha256(signature_text),
        lineno=node.lineno,
        end_lineno=last_line,
    )
| 277 | |
| 278 | |
def _make_class_record(node: ast.ClassDef, qualified_name: str) -> SymbolRecord:
    """Build the :class:`SymbolRecord` for a class definition.

    ``content_id`` hashes the whole unparsed class.  ``body_hash`` hashes a
    structural summary (class name, bases, sorted names of directly defined
    methods) and deliberately ignores method bodies, which have their own
    records.  ``signature_id`` hashes the class header.

    Args:
        node: The class definition node.
        qualified_name: Dotted name including any enclosing classes.

    Returns:
        The populated record.
    """
    bases_text = ", ".join(ast.unparse(base) for base in node.bases)
    direct_methods = [
        member.name
        for member in node.body
        if isinstance(member, (ast.FunctionDef, ast.AsyncFunctionDef))
    ]
    direct_methods.sort()
    # NOTE(review): the class name is part of this summary, so renaming a
    # class changes body_hash — confirm this is intended given body_hash's
    # rename-detection role for functions.
    structure = f"class {node.name}({bases_text}):{direct_methods}"
    if node.bases:
        header = f"class {node.name}({bases_text})"
    else:
        header = f"class {node.name}"
    last_line = node.end_lineno or node.lineno
    return SymbolRecord(
        kind="class",
        name=node.name,
        qualified_name=qualified_name,
        content_id=_sha256(ast.unparse(node)),
        body_hash=_sha256(structure),
        signature_id=_sha256(header),
        lineno=node.lineno,
        end_lineno=last_line,
    )
| 301 | |
| 302 | |
def _make_var_record(node: ast.Assign | ast.AnnAssign, name: str) -> SymbolRecord:
    """Build the :class:`SymbolRecord` for one name bound by an assignment.

    The whole unparsed statement is hashed for both ``content_id`` and
    ``body_hash``; ``signature_id`` hashes the variable name alone.

    Args:
        node: The assignment statement binding *name*.
        name: The bound identifier.

    Returns:
        The populated record.
    """
    statement_digest = _sha256(ast.unparse(node))
    last_line = node.end_lineno or node.lineno
    return SymbolRecord(
        kind="variable",
        name=name,
        qualified_name=name,
        content_id=statement_digest,
        body_hash=statement_digest,
        signature_id=_sha256(name),
        lineno=node.lineno,
        end_lineno=last_line,
    )
| 315 | |
| 316 | |
def _make_import_record(
    node: ast.Import | ast.ImportFrom, name: str
) -> SymbolRecord:
    """Build the :class:`SymbolRecord` for one name bound by an import.

    The whole unparsed import statement is hashed for both ``content_id``
    and ``body_hash``; ``signature_id`` hashes the bound name alone.

    Args:
        node: The ``import`` / ``from ... import`` statement.
        name: The name the statement binds (alias if present), or the
            ``"*:<module>"`` marker for star imports.

    Returns:
        The populated record.  ``end_lineno`` reflects the statement's real
        last line, so parenthesised multi-line imports span correctly.
    """
    normalized = ast.unparse(node)
    return SymbolRecord(
        kind="import",
        name=name,
        qualified_name=f"import::{name}",
        content_id=_sha256(normalized),
        body_hash=_sha256(normalized),
        signature_id=_sha256(name),
        lineno=node.lineno,
        # Same fallback as the other record builders: end_lineno may be
        # absent on synthesized nodes.
        end_lineno=node.end_lineno or node.lineno,
    )
| 331 | |
| 332 | |
| 333 | def _assignment_names(node: ast.Assign | ast.AnnAssign) -> list[str]: |
| 334 | if isinstance(node, ast.Assign): |
| 335 | return [t.id for t in node.targets if isinstance(t, ast.Name)] |
| 336 | if isinstance(node.target, ast.Name): |
| 337 | return [node.target.id] |
| 338 | return [] |
| 339 | |
| 340 | |
| 341 | def _import_names(node: ast.Import | ast.ImportFrom) -> list[str]: |
| 342 | if isinstance(node, ast.Import): |
| 343 | return [a.asname or a.name for a in node.names] |
| 344 | # ImportFrom |
| 345 | if node.names and node.names[0].name == "*": |
| 346 | return [f"*:{node.module or '?'}"] |
| 347 | return [a.asname or a.name for a in node.names] |
| 348 | |
| 349 | |
| 350 | # --------------------------------------------------------------------------- |
| 351 | # Fallback adapter — file-level identity only, no symbol extraction |
| 352 | # --------------------------------------------------------------------------- |
| 353 | |
| 354 | |
class FallbackAdapter:
    """Adapter for file types that have no symbol-level parser available.

    It reports no symbols (so such files are tracked at file level only)
    and identifies file content by the SHA-256 of the raw bytes.
    """

    def __init__(self, extensions: frozenset[str]) -> None:
        """Remember the suffixes this fallback instance stands in for."""
        self._extensions = extensions

    def supported_extensions(self) -> frozenset[str]:
        """Return the suffix set given at construction time."""
        return self._extensions

    def parse_symbols(self, source: bytes, file_path: str) -> SymbolTree:  # noqa: ARG002
        """Return an empty symbol tree; both arguments are ignored."""
        no_symbols: SymbolTree = {}
        return no_symbols

    def file_content_id(self, source: bytes) -> str:
        """Return the hex SHA-256 of *source* exactly as given."""
        digest = _sha256_bytes(source)
        return digest
| 373 | |
| 374 | |
| 375 | # --------------------------------------------------------------------------- |
| 376 | # tree-sitter adapter — shared infrastructure for all non-Python languages |
| 377 | # --------------------------------------------------------------------------- |
| 378 | |
| 379 | _WS_RE: re.Pattern[bytes] = re.compile(rb"\s+") |
| 380 | |
| 381 | |
| 382 | def _norm_ws(src: bytes) -> bytes: |
| 383 | """Collapse all whitespace runs to a single space and strip the result.""" |
| 384 | return _WS_RE.sub(b" ", src).strip() |
| 385 | |
| 386 | |
| 387 | def _node_text(src: bytes, node: Node) -> bytes: |
| 388 | """Extract the raw source bytes covered by a tree-sitter node.""" |
| 389 | return src[node.start_byte : node.end_byte] |
| 390 | |
| 391 | |
| 392 | def _class_name_from(src: bytes, node: Node, field: str) -> str | None: |
| 393 | """Extract a class/struct name from a parent CST node. |
| 394 | |
| 395 | Tries ``child_by_field_name(field)`` first (covers Java, C#, C++, Rust). |
| 396 | Falls back to the first ``identifier``-typed named child to handle |
| 397 | languages like Kotlin where the class name is not a named field. |
| 398 | """ |
| 399 | child = node.child_by_field_name(field) |
| 400 | if child is None: |
| 401 | for c in node.named_children: |
| 402 | if c.type == "identifier": |
| 403 | child = c |
| 404 | break |
| 405 | if child is None: |
| 406 | return None |
| 407 | return _node_text(src, child).decode("utf-8", errors="replace") |
| 408 | |
| 409 | |
| 410 | def _qualified_name_ts( |
| 411 | src: bytes, |
| 412 | sym_node: Node, |
| 413 | name: str, |
| 414 | class_node_types: frozenset[str], |
| 415 | class_name_field: str, |
| 416 | ) -> str: |
| 417 | """Walk the CST parent chain to build a dotted qualified name. |
| 418 | |
| 419 | For a method ``bark`` inside ``class Dog``, returns ``"Dog.bark"``. |
| 420 | For a top-level function, returns just ``"standalone"``. |
| 421 | """ |
| 422 | parts = [name] |
| 423 | parent = sym_node.parent |
| 424 | while parent is not None: |
| 425 | if parent.type in class_node_types: |
| 426 | class_name = _class_name_from(src, parent, class_name_field) |
| 427 | if class_name: |
| 428 | parts.insert(0, class_name) |
| 429 | parent = parent.parent |
| 430 | return ".".join(parts) |
| 431 | |
| 432 | |
class LangSpec(TypedDict):
    """Per-language tree-sitter configuration consumed by :class:`TreeSitterAdapter`.

    One spec describes how to load a grammar and how to turn its query
    matches into symbol records.
    """

    extensions: frozenset[str]  # File suffixes (with leading dot) served by this grammar
    module_name: str  # Python import name, e.g. ``"tree_sitter_javascript"``
    lang_func: str  # Attribute on the module returning the raw capsule
    query_str: str  # tree-sitter S-expr query — must capture ``@sym`` and ``@name``
    kind_map: dict[str, SymbolKind]  # CST node type → SymbolKind
    class_node_types: frozenset[str]  # Ancestor types that scope methods
    class_name_field: str  # Field name for the class name (e.g. ``"name"`` or ``"type"``)
    receiver_capture: str  # Capture name for Go-style method receivers; ``""`` to skip
| 444 | |
| 445 | |
class TreeSitterAdapter:
    """Implements :class:`LanguageAdapter` using tree-sitter for real CST parsing.

    tree-sitter is the same parsing technology used by GitHub Copilot, VS Code,
    Neovim, and Zed.  It produces a concrete syntax tree from every source file,
    even if the file has syntax errors — making it suitable for real-world repos
    that may contain partially-written code.

    Parsing is error-tolerant: individual file failures are logged at DEBUG
    level and return an empty :type:`SymbolTree` so the caller falls back to
    file-level diffing rather than crashing.
    """

    def __init__(
        self,
        spec: LangSpec,
        parser: Parser,
        language: Language,
    ) -> None:
        """Store the configuration and compile the spec's query once.

        Args:
            spec: Per-language configuration.
            parser: Parser already bound to *language*.
            language: Grammar the query is compiled against.
        """
        self._spec = spec
        self._parser = parser
        self._language = language
        self._query = Query(language, spec["query_str"])

    def supported_extensions(self) -> frozenset[str]:
        """Return the suffix set declared by the language spec."""
        return self._spec["extensions"]

    @staticmethod
    def _mask_name(node_bytes: bytes, name_bytes: bytes, rel_start: int) -> bytes:
        """Replace the symbol's own name inside *node_bytes* with a placeholder.

        The name is located by its byte offset within the symbol, not by
        text search: a search would hit an earlier look-alike (the ``f`` in
        ``function f() {}``, the ``i`` in ``int i(void)``) and leave the real
        name in place, defeating rename detection.

        Args:
            node_bytes: Source bytes covered by the symbol node.
            name_bytes: Source bytes covered by the name node.
            rel_start: Offset of the name relative to the start of
                *node_bytes*.

        Returns:
            *node_bytes* with the name swapped for ``b"\\xfe"``.  When the
            offset does not point at the name (e.g. the name node lies
            outside the symbol node), the first textual occurrence is
            replaced instead.
        """
        rel_end = rel_start + len(name_bytes)
        if (
            name_bytes
            and rel_start >= 0
            and node_bytes[rel_start:rel_end] == name_bytes
        ):
            return node_bytes[:rel_start] + b"\xfe" + node_bytes[rel_end:]
        return node_bytes.replace(name_bytes, b"\xfe", 1)

    def parse_symbols(self, source: bytes, file_path: str) -> SymbolTree:
        """Run the spec's query over *source* and build one record per match.

        Args:
            source: Raw bytes of the source file.
            file_path: Workspace-relative POSIX path used as address prefix.

        Returns:
            Mapping of symbol address to record.  Matches lacking a ``@sym``
            or ``@name`` capture are skipped.  Any exception raised while
            parsing or querying is logged at DEBUG level and yields ``{}``.
        """
        try:
            tree = self._parser.parse(source)
            cursor = QueryCursor(self._query)
            symbols: SymbolTree = {}

            # Spec lookups are loop-invariant.
            spec = self._spec
            kind_map = spec["kind_map"]
            class_node_types = spec["class_node_types"]
            class_name_field = spec["class_name_field"]
            recv_cap = spec["receiver_capture"]

            for _pat, caps in cursor.matches(tree.root_node):
                sym_list = caps.get("sym", [])
                name_list = caps.get("name", [])
                if not sym_list or not name_list:
                    continue
                sym_node = sym_list[0]
                name_node = name_list[0]

                name_bytes = _node_text(source, name_node)
                name_txt = name_bytes.decode("utf-8", errors="replace")
                # Node types missing from the map default to "function".
                kind = kind_map.get(sym_node.type, "function")

                # Build qualified name — walking ancestor chain for methods.
                qualified = _qualified_name_ts(
                    source,
                    sym_node,
                    name_txt,
                    class_node_types,
                    class_name_field,
                )

                # Go-style receiver prefix: (d *Dog) → "Dog.Bark"
                if recv_cap:
                    recv_list = caps.get(recv_cap, [])
                    if recv_list:
                        recv_txt = (
                            _node_text(source, recv_list[0])
                            .decode("utf-8", errors="replace")
                            .lstrip("*")
                            .strip()
                        )
                        if recv_txt:
                            qualified = f"{recv_txt}.{qualified}"

                addr = f"{file_path}::{qualified}"
                node_bytes = _node_text(source, sym_node)
                # Substitute the name with a placeholder to isolate the body
                # from the identifier — two symbols with the same body but
                # different names share the same body_hash, signalling a rename.
                body_bytes = self._mask_name(
                    node_bytes,
                    name_bytes,
                    name_node.start_byte - sym_node.start_byte,
                )

                # Grammars name the parameter-list field differently.
                params_node = (
                    sym_node.child_by_field_name("parameters")
                    or sym_node.child_by_field_name("formal_parameters")
                    or sym_node.child_by_field_name("function_value_parameters")
                )
                params_bytes = (
                    _node_text(source, params_node)
                    if params_node is not None
                    else b""
                )

                symbols[addr] = SymbolRecord(
                    kind=kind,
                    name=name_txt,
                    qualified_name=qualified,
                    content_id=_sha256_bytes(_norm_ws(node_bytes)),
                    body_hash=_sha256_bytes(_norm_ws(body_bytes)),
                    signature_id=_sha256_bytes(_norm_ws(name_bytes + params_bytes)),
                    # tree-sitter rows are 0-based; records use 1-based lines.
                    lineno=sym_node.start_point[0] + 1,
                    end_lineno=sym_node.end_point[0] + 1,
                )
            return symbols
        except Exception as exc:  # noqa: BLE001
            # Deliberate best-effort: one unparsable file must not abort the run.
            logger.debug("tree-sitter parse error in %s: %s", file_path, exc)
            return {}

    def file_content_id(self, source: bytes) -> str:
        """Whitespace-normalised SHA-256 of the source — insensitive to reformatting."""
        return _sha256_bytes(_norm_ws(source))
| 552 | |
| 553 | |
def _make_ts_adapter(spec: LangSpec) -> LanguageAdapter:
    """Return a tree-sitter adapter for *spec*, or a file-level fallback.

    The grammar package is imported here rather than at module import time,
    so a missing or incompatible grammar only disables symbol extraction
    for that language instead of preventing the entire plugin from loading.
    Any failure while importing the grammar, building the parser, or
    compiling the query is logged at DEBUG level and results in a
    :class:`FallbackAdapter` for the same extensions.
    """
    module_name = spec["module_name"]
    factory_name = spec["lang_func"]
    try:
        grammar_module = importlib.import_module(module_name)
        capsule_factory = getattr(grammar_module, factory_name)
        language = Language(capsule_factory())
        return TreeSitterAdapter(spec, Parser(language), language)
    except Exception as exc:  # noqa: BLE001
        logger.debug(
            "tree-sitter grammar %s.%s unavailable — using file-level fallback: %s",
            module_name,
            factory_name,
            exc,
        )
        return FallbackAdapter(spec["extensions"])
| 575 | |
| 576 | |
| 577 | # --------------------------------------------------------------------------- |
| 578 | # Per-language tree-sitter specs |
| 579 | # --------------------------------------------------------------------------- |
| 580 | |
| 581 | _JS_SPEC: LangSpec = { |
| 582 | "extensions": frozenset({".js", ".jsx", ".mjs", ".cjs"}), |
| 583 | "module_name": "tree_sitter_javascript", |
| 584 | "lang_func": "language", |
| 585 | # Note: tree-sitter-javascript uses "class" for both class declarations and |
| 586 | # named class expressions. "class_expression" is not a valid node type. |
| 587 | "query_str": ( |
| 588 | "(function_declaration name: (identifier) @name) @sym\n" |
| 589 | "(function_expression name: (identifier) @name) @sym\n" |
| 590 | "(generator_function_declaration name: (identifier) @name) @sym\n" |
| 591 | "(class_declaration name: (identifier) @name) @sym\n" |
| 592 | "(class name: (identifier) @name) @sym\n" |
| 593 | "(method_definition name: (property_identifier) @name) @sym" |
| 594 | ), |
| 595 | "kind_map": { |
| 596 | "function_declaration": "function", |
| 597 | "function_expression": "function", |
| 598 | "generator_function_declaration": "function", |
| 599 | "class_declaration": "class", |
| 600 | "class": "class", |
| 601 | "method_definition": "method", |
| 602 | }, |
| 603 | "class_node_types": frozenset({"class_declaration", "class"}), |
| 604 | "class_name_field": "name", |
| 605 | "receiver_capture": "", |
| 606 | } |
| 607 | |
| 608 | _TS_QUERY = ( |
| 609 | # TypeScript uses type_identifier (not identifier) for class names. |
| 610 | "(function_declaration name: (identifier) @name) @sym\n" |
| 611 | "(function_expression name: (identifier) @name) @sym\n" |
| 612 | "(generator_function_declaration name: (identifier) @name) @sym\n" |
| 613 | "(class_declaration name: (type_identifier) @name) @sym\n" |
| 614 | "(class name: (type_identifier) @name) @sym\n" |
| 615 | "(abstract_class_declaration name: (type_identifier) @name) @sym\n" |
| 616 | "(method_definition name: (property_identifier) @name) @sym\n" |
| 617 | "(interface_declaration name: (type_identifier) @name) @sym\n" |
| 618 | "(type_alias_declaration name: (type_identifier) @name) @sym\n" |
| 619 | "(enum_declaration name: (identifier) @name) @sym" |
| 620 | ) |
| 621 | |
| 622 | _TS_KIND_MAP: dict[str, SymbolKind] = { |
| 623 | "function_declaration": "function", |
| 624 | "function_expression": "function", |
| 625 | "generator_function_declaration": "function", |
| 626 | "class_declaration": "class", |
| 627 | "class": "class", |
| 628 | "abstract_class_declaration": "class", |
| 629 | "method_definition": "method", |
| 630 | "interface_declaration": "class", |
| 631 | "type_alias_declaration": "variable", |
| 632 | "enum_declaration": "class", |
| 633 | } |
| 634 | |
| 635 | _TS_CLASS_NODES: frozenset[str] = frozenset( |
| 636 | {"class_declaration", "class", "abstract_class_declaration"} |
| 637 | ) |
| 638 | |
| 639 | _TS_SPEC: LangSpec = { |
| 640 | "extensions": frozenset({".ts"}), |
| 641 | "module_name": "tree_sitter_typescript", |
| 642 | "lang_func": "language_typescript", |
| 643 | "query_str": _TS_QUERY, |
| 644 | "kind_map": _TS_KIND_MAP, |
| 645 | "class_node_types": _TS_CLASS_NODES, |
| 646 | "class_name_field": "name", |
| 647 | "receiver_capture": "", |
| 648 | } |
| 649 | |
| 650 | _TSX_SPEC: LangSpec = { |
| 651 | "extensions": frozenset({".tsx"}), |
| 652 | "module_name": "tree_sitter_typescript", |
| 653 | "lang_func": "language_tsx", |
| 654 | "query_str": _TS_QUERY, |
| 655 | "kind_map": _TS_KIND_MAP, |
| 656 | "class_node_types": _TS_CLASS_NODES, |
| 657 | "class_name_field": "name", |
| 658 | "receiver_capture": "", |
| 659 | } |
| 660 | |
| 661 | _GO_SPEC: LangSpec = { |
| 662 | "extensions": frozenset({".go"}), |
| 663 | "module_name": "tree_sitter_go", |
| 664 | "lang_func": "language", |
| 665 | "query_str": ( |
| 666 | "(function_declaration name: (identifier) @name) @sym\n" |
| 667 | "(method_declaration\n" |
| 668 | " receiver: (parameter_list\n" |
| 669 | " (parameter_declaration type: _ @recv))\n" |
| 670 | " name: (field_identifier) @name) @sym\n" |
| 671 | "(type_spec name: (type_identifier) @name) @sym" |
| 672 | ), |
| 673 | "kind_map": { |
| 674 | "function_declaration": "function", |
| 675 | "method_declaration": "method", |
| 676 | "type_spec": "class", |
| 677 | }, |
| 678 | "class_node_types": frozenset(), |
| 679 | "class_name_field": "name", |
| 680 | "receiver_capture": "recv", |
| 681 | } |
| 682 | |
| 683 | _RUST_SPEC: LangSpec = { |
| 684 | "extensions": frozenset({".rs"}), |
| 685 | "module_name": "tree_sitter_rust", |
| 686 | "lang_func": "language", |
| 687 | "query_str": ( |
| 688 | "(function_item name: (identifier) @name) @sym\n" |
| 689 | "(struct_item name: (type_identifier) @name) @sym\n" |
| 690 | "(enum_item name: (type_identifier) @name) @sym\n" |
| 691 | "(trait_item name: (type_identifier) @name) @sym" |
| 692 | ), |
| 693 | "kind_map": { |
| 694 | "function_item": "function", |
| 695 | "struct_item": "class", |
| 696 | "enum_item": "class", |
| 697 | "trait_item": "class", |
| 698 | }, |
| 699 | # impl_item scopes methods; its implementing type is in the "type" field. |
| 700 | "class_node_types": frozenset({"impl_item"}), |
| 701 | "class_name_field": "type", |
| 702 | "receiver_capture": "", |
| 703 | } |
| 704 | |
| 705 | _JAVA_SPEC: LangSpec = { |
| 706 | "extensions": frozenset({".java"}), |
| 707 | "module_name": "tree_sitter_java", |
| 708 | "lang_func": "language", |
| 709 | "query_str": ( |
| 710 | "(method_declaration name: (identifier) @name) @sym\n" |
| 711 | "(constructor_declaration name: (identifier) @name) @sym\n" |
| 712 | "(class_declaration name: (identifier) @name) @sym\n" |
| 713 | "(interface_declaration name: (identifier) @name) @sym\n" |
| 714 | "(enum_declaration name: (identifier) @name) @sym" |
| 715 | ), |
| 716 | "kind_map": { |
| 717 | "method_declaration": "method", |
| 718 | "constructor_declaration": "function", |
| 719 | "class_declaration": "class", |
| 720 | "interface_declaration": "class", |
| 721 | "enum_declaration": "class", |
| 722 | }, |
| 723 | "class_node_types": frozenset({"class_declaration", "interface_declaration"}), |
| 724 | "class_name_field": "name", |
| 725 | "receiver_capture": "", |
| 726 | } |
| 727 | |
| 728 | _C_SPEC: LangSpec = { |
| 729 | "extensions": frozenset({".c", ".h"}), |
| 730 | "module_name": "tree_sitter_c", |
| 731 | "lang_func": "language", |
| 732 | "query_str": ( |
| 733 | "(function_definition\n" |
| 734 | " declarator: (function_declarator\n" |
| 735 | " declarator: (identifier) @name)) @sym" |
| 736 | ), |
| 737 | "kind_map": {"function_definition": "function"}, |
| 738 | "class_node_types": frozenset(), |
| 739 | "class_name_field": "name", |
| 740 | "receiver_capture": "", |
| 741 | } |
| 742 | |
| 743 | _CPP_SPEC: LangSpec = { |
| 744 | "extensions": frozenset({".cpp", ".cc", ".cxx", ".hpp", ".hxx"}), |
| 745 | "module_name": "tree_sitter_cpp", |
| 746 | "lang_func": "language", |
| 747 | "query_str": ( |
| 748 | "(function_definition\n" |
| 749 | " declarator: (function_declarator\n" |
| 750 | " declarator: (identifier) @name)) @sym\n" |
| 751 | "(class_specifier name: (type_identifier) @name) @sym\n" |
| 752 | "(struct_specifier name: (type_identifier) @name) @sym" |
| 753 | ), |
| 754 | "kind_map": { |
| 755 | "function_definition": "function", |
| 756 | "class_specifier": "class", |
| 757 | "struct_specifier": "class", |
| 758 | }, |
| 759 | "class_node_types": frozenset({"class_specifier", "struct_specifier"}), |
| 760 | "class_name_field": "name", |
| 761 | "receiver_capture": "", |
| 762 | } |
| 763 | |
# C# spec: captures methods, constructors, and the four type declarations
# (class, interface, enum, struct).
_CS_SPEC: LangSpec = {
    "extensions": frozenset({".cs"}),
    "module_name": "tree_sitter_c_sharp",
    "lang_func": "language",
    # One query pattern per line; every pattern captures @name and @sym.
    "query_str": "\n".join((
        "(method_declaration name: (identifier) @name) @sym",
        "(constructor_declaration name: (identifier) @name) @sym",
        "(class_declaration name: (identifier) @name) @sym",
        "(interface_declaration name: (identifier) @name) @sym",
        "(enum_declaration name: (identifier) @name) @sym",
        "(struct_declaration name: (identifier) @name) @sym",
    )),
    "kind_map": {
        "method_declaration": "method",
        "constructor_declaration": "function",
        # Every type declaration is reported with the "class" kind.
        **dict.fromkeys(
            (
                "class_declaration",
                "interface_declaration",
                "enum_declaration",
                "struct_declaration",
            ),
            "class",
        ),
    },
    # enum_declaration is mapped to "class" above but is not listed here.
    "class_node_types": frozenset(
        {"class_declaration", "interface_declaration", "struct_declaration"}
    ),
    "class_name_field": "name",
    "receiver_capture": "",
}
| 790 | |
# Ruby spec: captures methods, singleton methods, classes, and modules.
_RUBY_SPEC: LangSpec = {
    "extensions": frozenset({".rb"}),
    "module_name": "tree_sitter_ruby",
    "lang_func": "language",
    # One query pattern per line; every pattern captures @name and @sym.
    "query_str": "\n".join((
        "(method name: (identifier) @name) @sym",
        "(singleton_method name: (identifier) @name) @sym",
        "(class name: (constant) @name) @sym",
        "(module name: (constant) @name) @sym",
    )),
    # Singleton methods share the "method" kind; modules share the "class" kind.
    "kind_map": {
        "method": "method",
        "singleton_method": "method",
        "class": "class",
        "module": "class",
    },
    "class_node_types": frozenset({"class", "module"}),
    "class_name_field": "name",
    "receiver_capture": "",
}
| 811 | |
# Kotlin spec: captures function and class declarations.
_KT_SPEC: LangSpec = {
    "extensions": frozenset({".kt", ".kts"}),
    "module_name": "tree_sitter_kotlin",
    "lang_func": "language",
    # These patterns match the identifier as a child node rather than through
    # a ``name:`` field.
    "query_str": "\n".join((
        "(function_declaration (identifier) @name) @sym",
        "(class_declaration (identifier) @name) @sym",
    )),
    "kind_map": {
        "function_declaration": "function",
        "class_declaration": "class",
    },
    # Kotlin methods are function_declaration nodes inside class_body.
    # For Kotlin classes child_by_field_name("name") is None, so
    # _class_name_from falls back to the first identifier-typed named child.
    "class_node_types": frozenset({"class_declaration"}),
    "class_name_field": "name",
    "receiver_capture": "",
}
| 831 | |
#: Every tree-sitter language spec, in the order the adapters are registered.
#: Order matters: adapter lookup returns the first adapter claiming a suffix.
_TS_LANG_SPECS: list[LangSpec] = [
    _JS_SPEC, _TS_SPEC, _TSX_SPEC,
    _GO_SPEC, _RUST_SPEC,
    _JAVA_SPEC,
    _C_SPEC, _CPP_SPEC, _CS_SPEC,
    _RUBY_SPEC, _KT_SPEC,
]
| 846 | |
| 847 | |
| 848 | # --------------------------------------------------------------------------- |
| 849 | # Adapter registry and public helpers |
| 850 | # --------------------------------------------------------------------------- |
| 851 | |
# Shared adapter instances.  ``_FALLBACK`` is built with an empty extension
# set and is what ``adapter_for_path`` returns when no registered adapter
# claims a file's suffix.
_PYTHON = PythonAdapter()
_FALLBACK = FallbackAdapter(frozenset())

#: Adapters checked in order; first match wins.
ADAPTERS: list[LanguageAdapter] = [_PYTHON]

# Build and register tree-sitter adapters, preserving the order of
# _TS_LANG_SPECS.  _make_ts_adapter degrades to FallbackAdapter if a grammar
# package isn't installed.  A generator is used so that no loop variable is
# left behind in the module namespace.
ADAPTERS.extend(_make_ts_adapter(spec) for spec in _TS_LANG_SPECS)
| 862 | |
#: File extensions that receive semantic (AST-based) symbol extraction:
#: the union of the extensions claimed by every registered adapter that is
#: not a :class:`FallbackAdapter`.
SEMANTIC_EXTENSIONS: frozenset[str] = frozenset(
    extension
    for adapter in ADAPTERS
    if not isinstance(adapter, FallbackAdapter)
    for extension in adapter.supported_extensions()
)
| 867 | |
#: Source extensions tracked as first-class files (raw-bytes identity for
#: languages without an AST adapter, AST identity for Python).
#:
#: Every extension claimed by a language spec in this module should also be
#: listed here, otherwise a file could have a semantic adapter without being
#: tracked as source.  ``.hxx`` (C++) and ``.kts`` (Kotlin script) are listed
#: for that reason.
SOURCE_EXTENSIONS: frozenset[str] = frozenset({
    # Python
    ".py", ".pyi",
    # JavaScript / TypeScript
    ".ts", ".tsx", ".js", ".jsx", ".mjs", ".cjs",
    ".swift",
    ".go",
    ".rs",
    ".java",
    # C / C++
    ".c", ".cpp", ".cc", ".cxx", ".h", ".hpp", ".hxx",
    ".rb",
    # Kotlin
    ".kt", ".kts",
    ".cs",
    # Shell
    ".sh", ".bash", ".zsh",
    # Configuration and data
    ".toml", ".yaml", ".yml", ".json", ".jsonc",
    # Documentation
    ".md", ".rst", ".txt",
    # Web
    ".css", ".scss", ".html",
    ".sql",
    ".proto",
    ".tf",
})
| 889 | |
| 890 | |
def adapter_for_path(file_path: str) -> LanguageAdapter:
    """Select the :class:`LanguageAdapter` responsible for *file_path*.

    The file's suffix is lowercased and tested against each adapter in
    ``ADAPTERS`` in registration order; the first adapter that claims it wins.

    Args:
        file_path: Workspace-relative POSIX path (e.g. ``"src/utils.py"``).

    Returns:
        The first registered adapter whose
        :meth:`~LanguageAdapter.supported_extensions` set contains the
        lowercase suffix, or the module-level fallback adapter when no
        registered adapter claims it.
    """
    ext = pathlib.PurePosixPath(file_path).suffix.lower()
    claiming = (
        candidate
        for candidate in ADAPTERS
        if ext in candidate.supported_extensions()
    )
    # next() stops at the first match, so registration order decides ties.
    return next(claiming, _FALLBACK)
| 909 | |
| 910 | |
def parse_symbols(source: bytes, file_path: str) -> SymbolTree:
    """Extract symbols from *source* using the adapter chosen for *file_path*.

    Adapter selection is delegated to :func:`adapter_for_path`.

    Args:
        source: Raw bytes of the source file.
        file_path: Workspace-relative POSIX path.

    Returns:
        A :type:`SymbolTree` (may be empty for unsupported file types).
    """
    adapter = adapter_for_path(file_path)
    return adapter.parse_symbols(source, file_path)
| 922 | |
| 923 | |
def file_content_id(source: bytes, file_path: str) -> str:
    """Compute the semantic content ID of a file from its raw *source*.

    The adapter is chosen from *file_path* via :func:`adapter_for_path`; only
    *source* is passed on to the adapter's own ``file_content_id``.

    Args:
        source: Raw bytes of the file.
        file_path: Workspace-relative POSIX path.

    Returns:
        Hex-encoded SHA-256 digest — AST-based for Python, raw-bytes for others.
    """
    adapter = adapter_for_path(file_path)
    return adapter.file_content_id(source)