musehub/services/musehub_search.py · gabriel/musehub

musehub_search.py python

331 lines 11.1 KB

6b53f1af feat: supercharge all pages, full SOC refactor, and Python 3.14 upgrade (#7) Gabriel Cardona <cgcardona@gmail.com> 5d ago

1	"""MuseHub in-repo search service.
2
3	Provides four search modes over a repo's commit history, all operating on the
4	shared ``muse_cli_commits`` table and scoped to a single ``repo_id``.
5
6	Modes and their underlying algorithms:
7	- ``property`` — musical property filter (currently a stub that returns no matches; :mod:`musehub.services.muse_find` was extracted and is not yet re-integrated)
8	- ``ask`` — natural-language query; keyword extraction + overlap scoring
9	- ``keyword`` — raw keyword/phrase overlap (normalised overlap coefficient)
10	- ``pattern`` — substring pattern match against message and branch name
11
12	All four modes return :class:`~musehub.models.musehub.SearchResponse` so the
13	UI can render results with a single shared commit-row template regardless of mode.
14
15	Date-range filtering (``since`` / ``until``) is applied at the SQL layer for
16	efficiency before any Python-level scoring.
17	"""
18
19	import logging
20	import re
21	from datetime import datetime
22
23	from sqlalchemy import and_
24	from sqlalchemy.ext.asyncio import AsyncSession
25	from sqlalchemy.future import select
26
27	from musehub.models.musehub import SearchCommitMatch, SearchResponse
28	from musehub.muse_cli.models import MuseCliCommit
29	# TODO(muse-extraction): muse_find extracted to cgcardona/muse — re-integrate via service API
30
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Result cap used by every search mode when the caller does not pass ``limit``.
_DEFAULT_LIMIT = 20

# A token is a maximal run of ASCII letters and digits; anything else separates tokens.
_TOKEN_RE = re.compile(r"[a-zA-Z0-9]+")

# Stop-words stripped during NL ask-mode keyword extraction.
# Written as whitespace-separated text and split once at import time.
_STOP_WORDS = frozenset(
    (
        "a an the is are was were be been being "
        "have has had do does did will would could "
        "should may might shall can need dare ought "
        "i my me we our you your he she it "
        "they their them what when where who which "
        "how why in on at to of for and or "
        "but not with from by about into through "
        "make made last any all that this"
    ).split()
)
47
48
49	# ---------------------------------------------------------------------------
50	# Internal helpers
51	# ---------------------------------------------------------------------------
52
53
def _tokenize(text: str) -> set[str]:
    """Split *text* into its distinct lowercase word tokens.

    Tokens are whatever ``_TOKEN_RE`` matches; each match is lowercased
    individually and duplicates collapse because the result is a set.
    """
    words = _TOKEN_RE.findall(text)
    return {word.lower() for word in words}
57
58
def _overlap_score(query_tokens: set[str], message: str) -> float:
    """Score *message* against a query as the overlap coefficient |Q ∩ M| / |Q|.

    ``Q`` is ``query_tokens`` and ``M`` is the token set of ``message``.
    The result is 1.0 when every query token occurs in the message and 0.0
    when none does. An empty query yields 0.0, which also sidesteps a
    division by zero.
    """
    if not query_tokens:
        return 0.0
    shared = query_tokens.intersection(_tokenize(message))
    return len(shared) / len(query_tokens)
69
70
async def _fetch_candidates(
    session: AsyncSession,
    *,
    repo_id: str,
    since: datetime | None,
    until: datetime | None,
    cap: int = 5000,
) -> tuple[list[MuseCliCommit], int]:
    """Load the newest commits of one repo, optionally bounded by date.

    The query is restricted to ``repo_id``; ``since`` and ``until`` add
    inclusive bounds on ``committed_at`` when they are not ``None``. Rows
    come back newest first and at most ``cap`` of them are fetched, leaving
    ranking and the final result limit to the caller.

    Returns:
        ``(rows, total_scanned)`` where ``total_scanned`` is the number of
        rows actually fetched (so it never exceeds ``cap``).
    """
    query = select(MuseCliCommit).where(MuseCliCommit.repo_id == repo_id)

    # Collect whichever date bounds were supplied, lower bound first.
    lower_bound = [MuseCliCommit.committed_at >= since] if since is not None else []
    upper_bound = [MuseCliCommit.committed_at <= until] if until is not None else []
    date_filters = lower_bound + upper_bound
    if date_filters:
        query = query.where(and_(*date_filters))

    newest_first = query.order_by(MuseCliCommit.committed_at.desc()).limit(cap)

    fetched = await session.execute(newest_first)
    commits = list(fetched.scalars().all())
    return commits, len(commits)
100
101
def _commit_to_match(
    commit: MuseCliCommit,
    *,
    score: float = 1.0,
    match_source: str = "message",
) -> SearchCommitMatch:
    """Build the wire-format :class:`SearchCommitMatch` for one commit row.

    Identity fields are copied straight from ``commit``; ``score`` is
    rounded to four decimal places and ``match_source`` is passed through.
    """
    payload = {
        "commit_id": commit.commit_id,
        "branch": commit.branch,
        "message": commit.message,
        "author": commit.author,
        "timestamp": commit.committed_at,
        "score": round(score, 4),
        "match_source": match_source,
    }
    return SearchCommitMatch(**payload)
118
119
120	# ---------------------------------------------------------------------------
121	# Search modes
122	# ---------------------------------------------------------------------------
123
124
async def search_by_property(
    session: AsyncSession,
    *,
    repo_id: str,
    harmony: str | None = None,
    rhythm: str | None = None,
    melody: str | None = None,
    structure: str | None = None,
    dynamic: str | None = None,
    emotion: str | None = None,
    since: datetime | None = None,
    until: datetime | None = None,
    limit: int = _DEFAULT_LIMIT,
) -> SearchResponse:
    """Musical property filter (currently a stub).

    TODO(muse-extraction): muse_find was extracted to cgcardona/muse.
    Re-integrate via the Muse service API once available.

    Until then no query is run: the function logs a warning and returns an
    empty result set with ``total_scanned=0``. The supplied filters are only
    echoed back in ``query`` as ``name=value`` pairs joined by ``" AND "``,
    or ``"(all commits)"`` when no filter was given.
    """
    # Fixed order so the echoed query string is stable.
    requested = (
        ("harmony", harmony),
        ("rhythm", rhythm),
        ("melody", melody),
        ("structure", structure),
        ("dynamic", dynamic),
        ("emotion", emotion),
    )
    echoed_terms = [f"{name}={value}" for name, value in requested if value is not None]
    query_echo = " AND ".join(echoed_terms) or "(all commits)"

    logger.warning("⚠️ musehub search property: muse_find not available (muse-extraction)")
    return SearchResponse(
        mode="property",
        query=query_echo,
        matches=[],
        total_scanned=0,
        limit=limit,
    )
164
165
async def search_by_ask(
    session: AsyncSession,
    *,
    repo_id: str,
    question: str,
    since: datetime | None = None,
    until: datetime | None = None,
    limit: int = _DEFAULT_LIMIT,
) -> SearchResponse:
    """Natural-language query — keyword extraction + overlap scoring.

    Strips stop-words from the question to produce a focused keyword set,
    then ranks commits by overlap coefficient. Commits with zero overlap
    are excluded. Returns at most ``limit`` results ordered by score desc,
    with newer commits first among equal scores.

    If the question yields no usable keywords (for example it consists only
    of stop-words), every candidate commit is returned with a neutral score
    of 1.0.

    This is a stub implementation; LLM-powered answer generation is a planned
    enhancement that will replace the keyword scoring step.

    Args:
        session: Async SQLAlchemy session.
        repo_id: Repo to search.
        question: Natural-language question string.
        since: Earliest committed_at (inclusive).
        until: Latest committed_at (inclusive).
        limit: Maximum results to return.

    Returns:
        :class:`~musehub.models.musehub.SearchResponse` with mode="ask".
    """
    rows, total_scanned = await _fetch_candidates(
        session, repo_id=repo_id, since=since, until=until
    )

    # Extract keywords with the SAME tokenizer that _overlap_score applies to
    # commit messages. Splitting the question on ``\W`` instead would keep
    # tokens such as "foo_bar" or "café" intact, and those can never equal a
    # message token, so they would drag every score down or hide all results.
    keywords: set[str] = {
        token
        for token in _tokenize(question)
        if len(token) > 1 and token not in _STOP_WORDS
    }

    scored: list[tuple[float, MuseCliCommit]]
    if keywords:
        scored = [
            (score, commit)
            for commit in rows
            if (score := _overlap_score(keywords, commit.message)) > 0.0
        ]
    else:
        # No useful tokens → include all commits with neutral score.
        scored = [(1.0, commit) for commit in rows]

    # Highest score first; ties broken by most recent commit.
    scored.sort(key=lambda x: (x[0], x[1].committed_at.timestamp()), reverse=True)
    top = scored[:limit]

    matches = [_commit_to_match(c, score=s, match_source="message") for s, c in top]

    logger.info("✅ musehub search ask: %d matches (repo=%s)", len(matches), repo_id[:8])
    return SearchResponse(
        mode="ask",
        query=question,
        matches=matches,
        total_scanned=total_scanned,
        limit=limit,
    )
226
227
async def search_by_keyword(
    session: AsyncSession,
    *,
    repo_id: str,
    keyword: str,
    threshold: float = 0.0,
    since: datetime | None = None,
    until: datetime | None = None,
    limit: int = _DEFAULT_LIMIT,
) -> SearchResponse:
    """Keyword search — rank commits by token overlap with ``keyword``.

    Both the keyword phrase and each commit message are tokenised, and each
    commit is scored with the overlap coefficient. A commit is kept only if
    its score is positive and at least ``threshold``. Results are ordered by
    score, then by recency, and cut to ``limit``.

    Args:
        session: Async SQLAlchemy session.
        repo_id: Repo to search.
        keyword: Keyword or phrase to search for.
        threshold: Minimum overlap score [0, 1] to include a commit (default 0 = any match).
        since: Earliest committed_at (inclusive).
        until: Latest committed_at (inclusive).
        limit: Maximum results to return.

    Returns:
        :class:`~musehub.models.musehub.SearchResponse` with mode="keyword".
    """
    candidates, total_scanned = await _fetch_candidates(
        session, repo_id=repo_id, since=since, until=until
    )

    wanted = _tokenize(keyword)
    scored_rows = ((_overlap_score(wanted, row.message), row) for row in candidates)
    kept = [
        (relevance, row)
        for relevance, row in scored_rows
        if relevance > 0.0 and relevance >= threshold
    ]

    def _rank_key(pair: tuple[float, MuseCliCommit]) -> tuple[float, float]:
        # Score first, commit time as the tie-breaker.
        relevance, row = pair
        return relevance, row.committed_at.timestamp()

    best = sorted(kept, key=_rank_key, reverse=True)[:limit]

    matches = [
        _commit_to_match(row, score=relevance, match_source="message")
        for relevance, row in best
    ]

    logger.info("✅ musehub search keyword: %d matches (repo=%s)", len(matches), repo_id[:8])
    return SearchResponse(
        mode="keyword",
        query=keyword,
        matches=matches,
        total_scanned=total_scanned,
        limit=limit,
    )
279
280
async def search_by_pattern(
    session: AsyncSession,
    *,
    repo_id: str,
    pattern: str,
    since: datetime | None = None,
    until: datetime | None = None,
    limit: int = _DEFAULT_LIMIT,
) -> SearchResponse:
    """Pattern search — case-insensitive substring match on message and branch.

    A commit matches when the lowercased ``pattern`` occurs in its lowercased
    message or, failing that, in its lowercased branch name. All message
    matches are listed before all branch-only matches; within each group the
    candidate order from the fetch is kept. The combined list is cut to
    ``limit``.

    Args:
        session: Async SQLAlchemy session.
        repo_id: Repo to search.
        pattern: Substring pattern to search for.
        since: Earliest committed_at (inclusive).
        until: Latest committed_at (inclusive).
        limit: Maximum results to return.

    Returns:
        :class:`~musehub.models.musehub.SearchResponse` with mode="pattern".
    """
    candidates, total_scanned = await _fetch_candidates(
        session, repo_id=repo_id, since=since, until=until
    )

    needle = pattern.lower()
    in_message: list[SearchCommitMatch] = []
    in_branch: list[SearchCommitMatch] = []

    for row in candidates:
        # The branch name is only consulted when the message does not match.
        if needle in row.message.lower():
            source, bucket = "message", in_message
        elif needle in row.branch.lower():
            source, bucket = "branch", in_branch
        else:
            continue
        bucket.append(_commit_to_match(row, match_source=source))

    # Message matches come first, then branch matches.
    all_matches = [*in_message, *in_branch][:limit]

    logger.info("✅ musehub search pattern: %d matches (repo=%s)", len(all_matches), repo_id[:8])
    return SearchResponse(
        mode="pattern",
        query=pattern,
        matches=all_matches,
        total_scanned=total_scanned,
        limit=limit,
    )

Content Address

Object ID (SHA-256)

e3ed345ae32dd74d7bb69cffe49f99494a519ff98534b00b0a06c1ea62be4c2f

This file is immutable and content-addressed. The same SHA always refers to the same bytes, across every clone and every time.

File Info

Path musehub/services/musehub_search.py

Lines 331

Size 11.1 KB

Language python

Ref 6b53f1af

Snapshot f8ff37c1b40f…

Last Modified

6b53f1af

feat: supercharge all pages, full SOC refactor, and Python 3.14 upgrade (#7)

Gabriel Cardona <cgcardona@gmail.com> 5d ago

View commit →

Links

Browse tree at 6b53f1af All commits View raw