gabriel / musehub public
musehub_search.py python
331 lines 11.1 KB
6b53f1af feat: supercharge all pages, full SOC refactor, and Python 3.14 upgrade (#7) Gabriel Cardona <cgcardona@gmail.com> 5d ago
1 """MuseHub in-repo search service.
2
3 Provides four search modes over a repo's commit history, all operating on the
4 shared ``muse_cli_commits`` table and scoped to a single ``repo_id``.
5
6 Modes and their underlying algorithms:
7 - ``property`` — musical property filter (currently a stub that returns no matches; ``muse_find`` was extracted to cgcardona/muse and is not yet re-integrated)
8 - ``ask`` — natural-language query; keyword extraction + overlap scoring
9 - ``keyword`` — raw keyword/phrase overlap (normalised overlap coefficient)
10 - ``pattern`` — substring pattern match against message and branch name
11
12 All four modes return :class:`~musehub.models.musehub.SearchResponse` so the
13 UI can render results with a single shared commit-row template regardless of mode.
14
15 Date-range filtering (``since`` / ``until``) is applied at the SQL layer for
16 efficiency before any Python-level scoring.
17 """
18
19 import logging
20 import re
21 from datetime import datetime
22
23 from sqlalchemy import and_
24 from sqlalchemy.ext.asyncio import AsyncSession
25 from sqlalchemy.future import select
26
27 from musehub.models.musehub import SearchCommitMatch, SearchResponse
28 from musehub.muse_cli.models import MuseCliCommit
29 # TODO(muse-extraction): muse_find extracted to cgcardona/muse — re-integrate via service API
30
# Module-scoped logger shared by every search mode in this file.
logger = logging.getLogger(__name__)

# Result-count ceiling applied when a caller does not pass ``limit``.
_DEFAULT_LIMIT = 20
# A token is a maximal run of ASCII letters and digits; anything else separates.
_TOKEN_RE = re.compile(r"[a-zA-Z0-9]+")

# Stop-words stripped during NL ask-mode keyword extraction.
# Written as whitespace-separated groups and split once at import time.
_STOP_WORDS = frozenset(
    (
        # Articles.
        "a an the "
        # Forms of "to be".
        "is are was were be been being "
        # Auxiliary and modal verbs.
        "have has had do does did will would could "
        "should may might shall can need dare ought "
        # Pronouns and possessives.
        "i my me we our you your he she it they their them "
        # Interrogatives.
        "what when where who which how why "
        # Prepositions.
        "in on at to of for with from by about into through "
        # Conjunctions and negation.
        "and or but not "
        # Filler words common in questions about history.
        "make made last any all that this"
    ).split()
)
47
48
49 # ---------------------------------------------------------------------------
50 # Internal helpers
51 # ---------------------------------------------------------------------------
52
53
def _tokenize(text: str) -> set[str]:
    """Split *text* into its distinct lowercase word tokens.

    A token is whatever ``_TOKEN_RE`` matches. Each match is lowercased
    individually, and duplicates collapse because the result is a set.
    """
    return set(map(str.lower, _TOKEN_RE.findall(text)))
57
58
def _overlap_score(query_tokens: set[str], message: str) -> float:
    """Score *message* by the share of query tokens it contains.

    Computes ``|Q ∩ M| / |Q|`` where ``M`` is the token set of *message*
    as produced by ``_tokenize``. The score is 1.0 when the message covers
    the whole query and 0.0 when it covers none of it. An empty query
    scores 0.0, which also avoids dividing by zero.
    """
    if not query_tokens:
        return 0.0
    shared = query_tokens.intersection(_tokenize(message))
    return len(shared) / len(query_tokens)
69
70
async def _fetch_candidates(
    session: AsyncSession,
    *,
    repo_id: str,
    since: datetime | None,
    until: datetime | None,
    cap: int = 5000,
) -> tuple[list[MuseCliCommit], int]:
    """Load the newest commits of one repo, optionally bounded by date.

    The query is deliberately simple: it filters by ``repo_id`` and the
    optional inclusive ``since`` / ``until`` bounds on ``committed_at``,
    orders newest first and stops at ``cap`` rows. Ranking and the
    caller-facing ``limit`` are left to the individual search modes.

    Returns:
        ``(rows, total_scanned)`` where ``total_scanned`` is the number of
        rows fetched, so it never exceeds ``cap``.
    """
    # Every filter is collected first so the statement is built in one chain.
    filters = [MuseCliCommit.repo_id == repo_id]
    if since is not None:
        filters.append(MuseCliCommit.committed_at >= since)
    if until is not None:
        filters.append(MuseCliCommit.committed_at <= until)

    query = (
        select(MuseCliCommit)
        .where(and_(*filters))
        .order_by(MuseCliCommit.committed_at.desc())
        .limit(cap)
    )

    outcome = await session.execute(query)
    commits = list(outcome.scalars().all())
    return commits, len(commits)
100
101
def _commit_to_match(
    commit: MuseCliCommit,
    *,
    score: float = 1.0,
    match_source: str = "message",
) -> SearchCommitMatch:
    """Build the wire-format :class:`SearchCommitMatch` for one DB row.

    Commit fields are copied across unchanged (``committed_at`` is exposed
    as ``timestamp``). *score* is rounded to four decimal places, and
    *match_source* records which field produced the hit.
    """
    rounded_score = round(score, 4)
    return SearchCommitMatch(
        commit_id=commit.commit_id,
        branch=commit.branch,
        message=commit.message,
        author=commit.author,
        timestamp=commit.committed_at,
        score=rounded_score,
        match_source=match_source,
    )
118
119
120 # ---------------------------------------------------------------------------
121 # Search modes
122 # ---------------------------------------------------------------------------
123
124
async def search_by_property(
    session: AsyncSession,
    *,
    repo_id: str,
    harmony: str | None = None,
    rhythm: str | None = None,
    melody: str | None = None,
    structure: str | None = None,
    dynamic: str | None = None,
    emotion: str | None = None,
    since: datetime | None = None,
    until: datetime | None = None,
    limit: int = _DEFAULT_LIMIT,
) -> SearchResponse:
    """Musical property filter (currently a stub).

    TODO(muse-extraction): muse_find was extracted to cgcardona/muse.
    Re-integrate via the Muse service API once available.

    Until then no commits are queried: the function logs a warning and
    returns an empty :class:`SearchResponse` with ``total_scanned=0``.
    The ``query`` field echoes the filters that were supplied, joined with
    ``" AND "``, or ``"(all commits)"`` when none were given.
    """
    # Fixed order so the echoed query string is stable across calls.
    requested = (
        ("harmony", harmony),
        ("rhythm", rhythm),
        ("melody", melody),
        ("structure", structure),
        ("dynamic", dynamic),
        ("emotion", emotion),
    )
    clauses = [f"{name}={value}" for name, value in requested if value is not None]
    query_echo = " AND ".join(clauses) if clauses else "(all commits)"

    logger.warning("⚠️ musehub search property: muse_find not available (muse-extraction)")
    return SearchResponse(
        mode="property",
        query=query_echo,
        matches=[],
        total_scanned=0,
        limit=limit,
    )
164
165
async def search_by_ask(
    session: AsyncSession,
    *,
    repo_id: str,
    question: str,
    since: datetime | None = None,
    until: datetime | None = None,
    limit: int = _DEFAULT_LIMIT,
) -> SearchResponse:
    """Natural-language query — keyword extraction + overlap scoring.

    Strips stop-words from the question to produce a focused keyword set,
    then ranks commits by overlap coefficient. Commits with zero overlap
    are excluded. Returns at most ``limit`` results ordered by score desc,
    with ties broken by most recent ``committed_at``.

    The question is tokenised with the same ``_tokenize`` helper that
    ``_overlap_score`` applies to commit messages, so every extracted
    keyword is a token shape that can actually occur on the message side.

    If the question contains no usable keywords (only stop-words or
    single-character tokens), every candidate commit is returned with a
    neutral score of 1.0.

    This is a stub implementation; LLM-powered answer generation is a planned
    enhancement that will replace the keyword scoring step.

    Args:
        session: Async SQLAlchemy session.
        repo_id: Repo to search.
        question: Natural-language question string.
        since: Earliest committed_at (inclusive).
        until: Latest committed_at (inclusive).
        limit: Maximum results to return.

    Returns:
        :class:`~musehub.models.musehub.SearchResponse` with mode="ask".
    """
    rows, total_scanned = await _fetch_candidates(
        session, repo_id=repo_id, since=since, until=until
    )

    # Extract meaningful keywords after stop-word removal. Splitting on
    # ``\W`` here would keep tokens such as "bass_line" or "café" whole,
    # while messages are split into ASCII alphanumeric runs; such keywords
    # could never match and would only drag the score down.
    keywords: set[str] = {
        token
        for token in _tokenize(question)
        if len(token) > 1 and token not in _STOP_WORDS
    }

    scored: list[tuple[float, MuseCliCommit]] = []
    for commit in rows:
        # No useful tokens → include all commits with neutral score.
        score = _overlap_score(keywords, commit.message) if keywords else 1.0
        if score > 0.0:
            scored.append((score, commit))

    # Best score first; among equal scores the newest commit wins.
    scored.sort(key=lambda x: (x[0], x[1].committed_at.timestamp()), reverse=True)
    top = scored[:limit]

    matches = [_commit_to_match(c, score=s, match_source="message") for s, c in top]

    logger.info("✅ musehub search ask: %d matches (repo=%s)", len(matches), repo_id[:8])
    return SearchResponse(
        mode="ask",
        query=question,
        matches=matches,
        total_scanned=total_scanned,
        limit=limit,
    )
226
227
async def search_by_keyword(
    session: AsyncSession,
    *,
    repo_id: str,
    keyword: str,
    threshold: float = 0.0,
    since: datetime | None = None,
    until: datetime | None = None,
    limit: int = _DEFAULT_LIMIT,
) -> SearchResponse:
    """Keyword search — rank commits by token overlap with *keyword*.

    Both *keyword* and every candidate commit message are tokenised, and
    each commit is scored with the overlap coefficient. A commit is kept
    only when its score is positive and at least *threshold*. The best
    scores come first, with ties broken by most recent ``committed_at``.

    Args:
        session: Async SQLAlchemy session.
        repo_id: Repo to search.
        keyword: Keyword or phrase to search for.
        threshold: Minimum overlap score [0, 1] to include a commit (default 0 = any match).
        since: Earliest committed_at (inclusive).
        until: Latest committed_at (inclusive).
        limit: Maximum results to return.

    Returns:
        :class:`~musehub.models.musehub.SearchResponse` with mode="keyword".
    """
    candidates, total_scanned = await _fetch_candidates(
        session, repo_id=repo_id, since=since, until=until
    )

    wanted = _tokenize(keyword)
    # Zero-score commits are dropped even when threshold is 0.
    hits: list[tuple[float, MuseCliCommit]] = [
        (relevance, commit)
        for commit in candidates
        if (relevance := _overlap_score(wanted, commit.message)) > 0.0
        and relevance >= threshold
    ]

    ranked = sorted(
        hits,
        key=lambda pair: (pair[0], pair[1].committed_at.timestamp()),
        reverse=True,
    )
    matches = [
        _commit_to_match(commit, score=relevance, match_source="message")
        for relevance, commit in ranked[:limit]
    ]

    logger.info("✅ musehub search keyword: %d matches (repo=%s)", len(matches), repo_id[:8])
    return SearchResponse(
        mode="keyword",
        query=keyword,
        matches=matches,
        total_scanned=total_scanned,
        limit=limit,
    )
279
280
async def search_by_pattern(
    session: AsyncSession,
    *,
    repo_id: str,
    pattern: str,
    since: datetime | None = None,
    until: datetime | None = None,
    limit: int = _DEFAULT_LIMIT,
) -> SearchResponse:
    """Pattern search — case-insensitive substring match on message and branch.

    A commit matches when the lowercased *pattern* occurs in its lowercased
    message or, failing that, in its lowercased branch name. Each commit is
    reported once, tagged with the field that matched. All message matches
    are listed before any branch matches, and the combined list is cut to
    ``limit``.

    Args:
        session: Async SQLAlchemy session.
        repo_id: Repo to search.
        pattern: Substring pattern to search for.
        since: Earliest committed_at (inclusive).
        until: Latest committed_at (inclusive).
        limit: Maximum results to return.

    Returns:
        :class:`~musehub.models.musehub.SearchResponse` with mode="pattern".
    """
    candidates, total_scanned = await _fetch_candidates(
        session, repo_id=repo_id, since=since, until=until
    )

    needle = pattern.lower()
    # One bucket per match source; candidate order is kept inside each bucket.
    buckets: dict[str, list[SearchCommitMatch]] = {"message": [], "branch": []}

    for commit in candidates:
        if needle in commit.message.lower():
            source = "message"
        elif needle in commit.branch.lower():
            source = "branch"
        else:
            continue
        buckets[source].append(_commit_to_match(commit, match_source=source))

    # Message matches come first, then branch matches.
    all_matches = [*buckets["message"], *buckets["branch"]][:limit]

    logger.info("✅ musehub search pattern: %d matches (repo=%s)", len(all_matches), repo_id[:8])
    return SearchResponse(
        mode="pattern",
        query=pattern,
        matches=all_matches,
        total_scanned=total_scanned,
        limit=limit,
    )