musehub_search.py
python
| 1 | """MuseHub in-repo search service. |
| 2 | |
| 3 | Provides four search modes over a repo's commit history, all operating on the |
| 4 | shared ``muse_cli_commits`` table and scoped to a single ``repo_id``. |
| 5 | |
| 6 | Modes and their underlying algorithms: |
| 7 | - ``property`` — musical property filter (currently a stub; ``muse_find`` was extracted to cgcardona/muse and will be re-integrated via its service API) |
| 8 | - ``ask`` — natural-language query; keyword extraction + overlap scoring |
| 9 | - ``keyword`` — raw keyword/phrase overlap (normalised overlap coefficient) |
| 10 | - ``pattern`` — substring pattern match against message and branch name |
| 11 | |
| 12 | All four modes return :class:`~musehub.models.musehub.SearchResponse` so the |
| 13 | UI can render results with a single shared commit-row template regardless of mode. |
| 14 | |
| 15 | Date-range filtering (``since`` / ``until``) is applied at the SQL layer for |
| 16 | efficiency before any Python-level scoring. |
| 17 | """ |
| 18 | |
| 19 | import logging |
| 20 | import re |
| 21 | from datetime import datetime |
| 22 | |
| 23 | from sqlalchemy import and_ |
| 24 | from sqlalchemy.ext.asyncio import AsyncSession |
| 25 | from sqlalchemy.future import select |
| 26 | |
| 27 | from musehub.models.musehub import SearchCommitMatch, SearchResponse |
| 28 | from musehub.muse_cli.models import MuseCliCommit |
| 29 | # TODO(muse-extraction): muse_find extracted to cgcardona/muse — re-integrate via service API |
| 30 | |
| 31 | logger = logging.getLogger(__name__) |
| 32 | |
| 33 | _DEFAULT_LIMIT = 20 |
| 34 | _TOKEN_RE = re.compile(r"[a-zA-Z0-9]+") |
| 35 | |
| 36 | # Stop-words stripped during NL ask-mode keyword extraction. |
| 37 | _STOP_WORDS = frozenset({ |
| 38 | "a", "an", "the", "is", "are", "was", "were", "be", "been", "being", |
| 39 | "have", "has", "had", "do", "does", "did", "will", "would", "could", |
| 40 | "should", "may", "might", "shall", "can", "need", "dare", "ought", |
| 41 | "i", "my", "me", "we", "our", "you", "your", "he", "she", "it", |
| 42 | "they", "their", "them", "what", "when", "where", "who", "which", |
| 43 | "how", "why", "in", "on", "at", "to", "of", "for", "and", "or", |
| 44 | "but", "not", "with", "from", "by", "about", "into", "through", |
| 45 | "did", "make", "made", "last", "any", "all", "that", "this", |
| 46 | }) |
| 47 | |
| 48 | |
| 49 | # --------------------------------------------------------------------------- |
| 50 | # Internal helpers |
| 51 | # --------------------------------------------------------------------------- |
| 52 | |
| 53 | |
| 54 | def _tokenize(text: str) -> set[str]: |
| 55 | """Return a set of lowercase word tokens from *text*.""" |
| 56 | return {m.group().lower() for m in _TOKEN_RE.finditer(text)} |
| 57 | |
| 58 | |
| 59 | def _overlap_score(query_tokens: set[str], message: str) -> float: |
| 60 | """Normalised overlap coefficient: |Q ∩ M| / |Q|. |
| 61 | |
| 62 | Returns 1.0 when every query token appears in the message, 0.0 when |
| 63 | none do. Returns 0.0 for an empty query set to avoid division by zero. |
| 64 | """ |
| 65 | if not query_tokens: |
| 66 | return 0.0 |
| 67 | message_tokens = _tokenize(message) |
| 68 | return len(query_tokens & message_tokens) / len(query_tokens) |
| 69 | |
| 70 | |
async def _fetch_candidates(
    session: AsyncSession,
    *,
    repo_id: str,
    since: datetime | None,
    until: datetime | None,
    cap: int = 5000,
) -> tuple[list[MuseCliCommit], int]:
    """Load candidate commits for *repo_id*, newest first, date-filtered in SQL.

    Returns ``(rows, total_scanned)`` where ``total_scanned`` is the number of
    rows actually fetched (bounded by ``cap``). Callers apply their own
    ranking/limit so this query stays simple and index-friendly.
    """
    # Chained .where() calls are ANDed together by SQLAlchemy.
    stmt = select(MuseCliCommit).where(MuseCliCommit.repo_id == repo_id)
    if since is not None:
        stmt = stmt.where(MuseCliCommit.committed_at >= since)
    if until is not None:
        stmt = stmt.where(MuseCliCommit.committed_at <= until)
    stmt = stmt.order_by(MuseCliCommit.committed_at.desc()).limit(cap)

    result = await session.execute(stmt)
    rows = list(result.scalars().all())
    return rows, len(rows)
| 100 | |
| 101 | |
def _commit_to_match(
    commit: MuseCliCommit,
    *,
    score: float = 1.0,
    match_source: str = "message",
) -> SearchCommitMatch:
    """Build the wire-format :class:`SearchCommitMatch` for a DB row.

    ``score`` is rounded to 4 decimal places for stable serialised output.
    """
    payload = {
        "commit_id": commit.commit_id,
        "branch": commit.branch,
        "message": commit.message,
        "author": commit.author,
        "timestamp": commit.committed_at,
        "score": round(score, 4),
        "match_source": match_source,
    }
    return SearchCommitMatch(**payload)
| 118 | |
| 119 | |
| 120 | # --------------------------------------------------------------------------- |
| 121 | # Search modes |
| 122 | # --------------------------------------------------------------------------- |
| 123 | |
| 124 | |
async def search_by_property(
    session: AsyncSession,
    *,
    repo_id: str,
    harmony: str | None = None,
    rhythm: str | None = None,
    melody: str | None = None,
    structure: str | None = None,
    dynamic: str | None = None,
    emotion: str | None = None,
    since: datetime | None = None,
    until: datetime | None = None,
    limit: int = _DEFAULT_LIMIT,
) -> SearchResponse:
    """Musical property filter.

    TODO(muse-extraction): muse_find was extracted to cgcardona/muse.
    Re-integrate via the Muse service API once available.
    Currently returns an empty result set; the query string is still echoed
    back so the UI can display what was asked for.
    """
    supplied = (
        ("harmony", harmony),
        ("rhythm", rhythm),
        ("melody", melody),
        ("structure", structure),
        ("dynamic", dynamic),
        ("emotion", emotion),
    )
    parts = [f"{name}={value}" for name, value in supplied if value is not None]
    query_echo = " AND ".join(parts) if parts else "(all commits)"
    logger.warning("⚠️ musehub search property: muse_find not available (muse-extraction)")
    return SearchResponse(
        mode="property",
        query=query_echo,
        matches=[],
        total_scanned=0,
        limit=limit,
    )
| 164 | |
| 165 | |
async def search_by_ask(
    session: AsyncSession,
    *,
    repo_id: str,
    question: str,
    since: datetime | None = None,
    until: datetime | None = None,
    limit: int = _DEFAULT_LIMIT,
) -> SearchResponse:
    """Natural-language query — keyword extraction + overlap scoring.

    Strips stop-words from the question to produce a focused keyword set,
    then ranks commits by overlap coefficient. Commits with zero overlap
    are excluded. Returns at most ``limit`` results ordered by score desc.

    This is a stub implementation; LLM-powered answer generation is a planned
    enhancement that will replace the keyword scoring step.

    Args:
        session: Async SQLAlchemy session.
        repo_id: Repo to search.
        question: Natural-language question string.
        since: Earliest committed_at (inclusive).
        until: Latest committed_at (inclusive).
        limit: Maximum results to return.

    Returns:
        :class:`~musehub.models.musehub.SearchResponse` with mode="ask".
    """
    rows, total_scanned = await _fetch_candidates(
        session, repo_id=repo_id, since=since, until=until
    )

    # Tokenise the question with the same _tokenize used for commit messages.
    # (Previously this used re.split(r"[\s\W]+"), which keeps underscores
    # inside tokens — `\W` excludes "_" — so a query token like "foo_bar"
    # could never match message tokens produced by _TOKEN_RE.) Then drop
    # stop-words and single-character tokens.
    keywords: set[str] = {
        t for t in _tokenize(question) if t not in _STOP_WORDS and len(t) > 1
    }

    scored: list[tuple[float, MuseCliCommit]] = []
    for commit in rows:
        if keywords:
            score = _overlap_score(keywords, commit.message)
        else:
            # No useful tokens → include all commits with neutral score.
            score = 1.0
        if score > 0.0:
            scored.append((score, commit))

    # Highest score first; ties broken by most recent commit.
    scored.sort(key=lambda x: (x[0], x[1].committed_at.timestamp()), reverse=True)
    top = scored[:limit]

    matches = [_commit_to_match(c, score=s, match_source="message") for s, c in top]

    logger.info("✅ musehub search ask: %d matches (repo=%s)", len(matches), repo_id[:8])
    return SearchResponse(
        mode="ask",
        query=question,
        matches=matches,
        total_scanned=total_scanned,
        limit=limit,
    )
| 226 | |
| 227 | |
async def search_by_keyword(
    session: AsyncSession,
    *,
    repo_id: str,
    keyword: str,
    threshold: float = 0.0,
    since: datetime | None = None,
    until: datetime | None = None,
    limit: int = _DEFAULT_LIMIT,
) -> SearchResponse:
    """Keyword search — overlap coefficient over commit messages.

    Both *keyword* and each commit message are tokenised; a commit's score is
    the fraction of query tokens present in its message. Commits scoring zero
    or below *threshold* are dropped.

    Args:
        session: Async SQLAlchemy session.
        repo_id: Repo to search.
        keyword: Keyword or phrase to search for.
        threshold: Minimum overlap score [0, 1] to include a commit (default 0 = any match).
        since: Earliest committed_at (inclusive).
        until: Latest committed_at (inclusive).
        limit: Maximum results to return.

    Returns:
        :class:`~musehub.models.musehub.SearchResponse` with mode="keyword".
    """
    rows, total_scanned = await _fetch_candidates(
        session, repo_id=repo_id, since=since, until=until
    )

    wanted = _tokenize(keyword)
    # Keep only commits whose score is non-zero and clears the threshold.
    scored = [
        (s, commit)
        for commit in rows
        if (s := _overlap_score(wanted, commit.message)) >= threshold and s > 0.0
    ]

    # Highest score first; ties broken by most recent commit (reverse=True on
    # the composite key keeps the sort stable for fully-tied rows).
    scored.sort(key=lambda pair: (pair[0], pair[1].committed_at.timestamp()), reverse=True)

    matches = [
        _commit_to_match(commit, score=s, match_source="message")
        for s, commit in scored[:limit]
    ]

    logger.info("✅ musehub search keyword: %d matches (repo=%s)", len(matches), repo_id[:8])
    return SearchResponse(
        mode="keyword",
        query=keyword,
        matches=matches,
        total_scanned=total_scanned,
        limit=limit,
    )
| 279 | |
| 280 | |
async def search_by_pattern(
    session: AsyncSession,
    *,
    repo_id: str,
    pattern: str,
    since: datetime | None = None,
    until: datetime | None = None,
    limit: int = _DEFAULT_LIMIT,
) -> SearchResponse:
    """Pattern search — case-insensitive substring match against message and branch.

    A commit matches when *pattern* appears anywhere in its message or branch
    name. Message matches are ranked ahead of branch-name matches.

    Args:
        session: Async SQLAlchemy session.
        repo_id: Repo to search.
        pattern: Substring pattern to search for.
        since: Earliest committed_at (inclusive).
        until: Latest committed_at (inclusive).
        limit: Maximum results to return.

    Returns:
        :class:`~musehub.models.musehub.SearchResponse` with mode="pattern".
    """
    rows, total_scanned = await _fetch_candidates(
        session, repo_id=repo_id, since=since, until=until
    )

    needle = pattern.lower()
    # Bucket hits by where the pattern was found; message wins over branch.
    buckets: dict[str, list[SearchCommitMatch]] = {"message": [], "branch": []}

    for commit in rows:
        if needle in commit.message.lower():
            buckets["message"].append(_commit_to_match(commit, match_source="message"))
        elif needle in commit.branch.lower():
            buckets["branch"].append(_commit_to_match(commit, match_source="branch"))

    # Message matches come first, then branch matches.
    all_matches = (buckets["message"] + buckets["branch"])[:limit]

    logger.info("✅ musehub search pattern: %d matches (repo=%s)", len(all_matches), repo_id[:8])
    return SearchResponse(
        mode="pattern",
        query=pattern,
        matches=all_matches,
        total_scanned=total_scanned,
        limit=limit,
    )