gabriel / musehub public
test_musehub_embeddings.py python
278 lines 9.5 KB
cd448303 Initial extraction of MuseHub from maestro monorepo. Gabriel Cardona <gabriel@tellurstori.com> 7d ago
1 """Tests for MuseHub musical feature vector extraction.
2
3 Covers acceptance criteria:
4 - Musical feature extraction from commit messages
5 - Deterministic, reproducible embeddings for the same input
6 - Correct feature parsing (key, tempo, mode, chord complexity)
7 - Vector dimensionality and normalisation
8 - push triggers feature extraction (embed_push_commits integration)
9 """
10 from __future__ import annotations
11
12 import math
13 from unittest.mock import MagicMock, patch
14
15 import pytest
16
17 from musehub.services.musehub_embeddings import (
18 VECTOR_DIM,
19 MusicalFeatures,
20 _encode_text_fingerprint,
21 _l2_normalise,
22 compute_embedding,
23 extract_features_from_message,
24 features_to_vector,
25 )
26
27
28 # ---------------------------------------------------------------------------
29 # extract_features_from_message
30 # ---------------------------------------------------------------------------
31
32
def test_embedding_computed_on_push_extracts_key() -> None:
    """A commit message naming Db major is parsed to key index 1."""
    message = "Jazz ballad in Db major at 72 BPM"
    parsed = extract_features_from_message(message)
    # Db is expected at index 1 of the chromatic table (_CHROMATIC).
    expected_key_index = 1
    assert parsed.key_index == expected_key_index
37
38
def test_extract_features_major_mode_score() -> None:
    """A G-major commit message yields key index 7 and mode_score of at least 0.5."""
    parsed = extract_features_from_message("Composition in G major 120 BPM")
    g_index = 7  # G
    assert parsed.key_index == g_index
    # Major modes are expected on the upper half of the mode scale.
    assert 0.5 <= parsed.mode_score
44
45
def test_extract_features_minor_mode_score() -> None:
    """A minor-key commit message yields a mode_score strictly below 0.5."""
    message = "Dark theme in A minor 80 BPM"
    mode = extract_features_from_message(message).mode_score
    assert 0.5 > mode
50
51
def test_extract_features_tempo_normalisation() -> None:
    """The parsed tempo is mapped into the [0, 1] interval."""
    parsed = extract_features_from_message("Fast tempo at 180 BPM")
    tempo = parsed.tempo_norm
    assert 0.0 <= tempo <= 1.0
    # Expected mapping for 180 BPM: (180 - 20) / 280, roughly 0.571.
    expected = (180 - 20) / 280.0
    deviation = abs(tempo - expected)
    assert deviation < 0.01
58
59
def test_extract_features_tempo_clamped() -> None:
    """A tempo above the supported range saturates at a normalised value of 1.0."""
    message = "Extreme tempo 350 BPM"
    normalised_tempo = extract_features_from_message(message).tempo_norm
    assert normalised_tempo == 1.0
64
65
def test_extract_features_chord_complexity_extended() -> None:
    """Mentioning extended chords scores a higher chord_complexity than plain triads."""
    triad_message = "Simple triads in C major"
    extended_message = "Jazz chords: 7th 9th 11th 13th in C major"
    baseline = extract_features_from_message(triad_message)
    enriched = extract_features_from_message(extended_message)
    assert enriched.chord_complexity > baseline.chord_complexity
71
72
def test_extract_features_chroma_populated_for_known_key() -> None:
    """A commit with a recognised key produces non-zero chroma bins for that key."""
    parsed = extract_features_from_message("Piece in C major 120 BPM")
    # Index 0 is C (the tonic); index 7 is G (the perfect fifth).
    for pitch_class in (0, 7):
        assert parsed.chroma[pitch_class] > 0
78
79
def test_extract_features_unknown_message() -> None:
    """A commit message with no musical content parses to neutral defaults."""
    parsed = extract_features_from_message("fix: typo in README")
    # No key could be detected, which is reported as -1.
    assert parsed.key_index == -1
    # The remaining scores must still be valid values in [0, 1].
    for field_name in ("mode_score", "valence"):
        score = getattr(parsed, field_name)
        assert 0.0 <= score <= 1.0
86
87
def test_extract_features_valence_matches_mode() -> None:
    """Valence is expected to mirror mode_score rather than be computed separately."""
    message = "Piece in F major 120 BPM"
    parsed = extract_features_from_message(message)
    valence, mode = parsed.valence, parsed.mode_score
    assert valence == mode
92
93
94 # ---------------------------------------------------------------------------
95 # features_to_vector
96 # ---------------------------------------------------------------------------
97
98
def test_features_to_vector_correct_dimension() -> None:
    """A default MusicalFeatures instance encodes to exactly VECTOR_DIM components."""
    encoded = features_to_vector(MusicalFeatures())
    assert len(encoded) == VECTOR_DIM
104
105
def test_features_to_vector_l2_unit_norm() -> None:
    """A non-zero feature vector comes back with an L2 norm of approximately 1.0."""
    populated = MusicalFeatures(key_index=0, mode_score=0.7, tempo_norm=0.5)
    encoded = features_to_vector(populated)
    sum_of_squares = sum(component * component for component in encoded)
    length = math.sqrt(sum_of_squares)
    assert abs(length - 1.0) < 1e-6
112
113
def test_features_to_vector_all_finite() -> None:
    """No component of the encoded vector is NaN or infinite."""
    message = "F# minor at 90 BPM with sus chords"
    encoded = features_to_vector(extract_features_from_message(message))
    # Collect offenders so a failure shows which values were non-finite.
    non_finite = [value for value in encoded if not math.isfinite(value)]
    assert not non_finite
119
120
121 # ---------------------------------------------------------------------------
122 # compute_embedding
123 # ---------------------------------------------------------------------------
124
125
def test_embedding_stored_in_qdrant_deterministic() -> None:
    """Embedding the same commit message twice gives identical vectors."""
    commit_message = "Jazz ballad in Db major at 72 BPM"
    first_run = compute_embedding(commit_message)
    second_run = compute_embedding(commit_message)
    assert first_run == second_run
132
133
def test_compute_embedding_different_messages_differ() -> None:
    """Two musically different commit messages do not share an embedding."""
    major_message = "Piece in C major 120 BPM"
    minor_message = "Piece in F# minor 60 BPM with 9th 11th chords"
    major_vector = compute_embedding(major_message)
    minor_vector = compute_embedding(minor_message)
    assert major_vector != minor_vector
139
140
def test_compute_embedding_returns_vector_dim() -> None:
    """The embedding returned by compute_embedding has VECTOR_DIM components."""
    embedding = compute_embedding("Test composition")
    assert len(embedding) == VECTOR_DIM
145
146
147 # ---------------------------------------------------------------------------
148 # embed_push_commits — integration with MusehubQdrantClient
149 # ---------------------------------------------------------------------------
150
151
def test_embedding_computed_on_push_calls_upsert() -> None:
    """Pushing two commits results in two upsert_embedding calls on the Qdrant client."""
    from musehub.models.musehub import CommitInput
    from datetime import datetime, timezone
    from musehub.services.musehub_sync import embed_push_commits

    # (commit_id, parent_ids, message, day of January 2024)
    commit_specs = (
        ("abc123", [], "Jazz ballad in Db major at 72 BPM", 1),
        ("def456", ["abc123"], "Variation in A minor at 90 BPM", 2),
    )
    pushed = [
        CommitInput(
            commit_id=commit_id,
            parent_ids=parents,
            message=message,
            timestamp=datetime(2024, 1, day, tzinfo=timezone.utc),
            snapshot_id=None,
            author=None,
        )
        for commit_id, parents, message, day in commit_specs
    ]

    # Replace the client factory so no real Qdrant connection is attempted.
    qdrant = MagicMock()
    client_factory = patch(
        "musehub.services.musehub_sync.get_qdrant_client",
        return_value=qdrant,
    )
    with client_factory:
        embed_push_commits(
            commits=pushed,
            repo_id="repo-001",
            branch="main",
            author="composer@stori",
            is_public=True,
        )

    assert qdrant.upsert_embedding.call_count == 2
191
192
def test_embedding_computed_on_push_empty_commits_is_noop() -> None:
    """An empty push payload never reaches upsert_embedding on the Qdrant client."""
    from musehub.services.musehub_sync import embed_push_commits

    # Replace the client factory so any upsert would be recorded on the mock.
    qdrant = MagicMock()
    client_factory = patch(
        "musehub.services.musehub_sync.get_qdrant_client",
        return_value=qdrant,
    )
    push_kwargs = {
        "commits": [],
        "repo_id": "repo-001",
        "branch": "main",
        "author": "composer@stori",
        "is_public": True,
    }
    with client_factory:
        embed_push_commits(**push_kwargs)

    qdrant.upsert_embedding.assert_not_called()
211
212
def test_embedding_computed_on_push_qdrant_error_does_not_raise() -> None:
    """embed_push_commits swallows Qdrant errors instead of propagating them.

    The push response must not be blocked or failed by an embedding error.
    The test also checks that the failing upsert was actually attempted;
    without that check it would pass even if the error path never ran.
    """
    from datetime import datetime, timezone
    from musehub.models.musehub import CommitInput
    from musehub.services.musehub_sync import embed_push_commits

    commits = [
        CommitInput(
            commit_id="fail-commit",
            parent_ids=[],
            message="C major 120 BPM",
            timestamp=datetime(2024, 1, 1, tzinfo=timezone.utc),
            snapshot_id=None,
            author=None,
        )
    ]

    # Every upsert attempt raises, simulating an unreachable Qdrant.
    mock_client = MagicMock()
    mock_client.upsert_embedding.side_effect = RuntimeError("Qdrant unavailable")

    with patch(
        "musehub.services.musehub_sync.get_qdrant_client",
        return_value=mock_client,
    ):
        # Must return normally; an escaping exception fails the test.
        embed_push_commits(
            commits=commits,
            repo_id="repo-001",
            branch="main",
            author="composer@stori",
            is_public=False,
        )

    # Guard against a vacuous pass: the failing call must have been made.
    mock_client.upsert_embedding.assert_called()
248
249
250 # ---------------------------------------------------------------------------
251 # Private helper unit tests
252 # ---------------------------------------------------------------------------
253
254
def test_text_fingerprint_length() -> None:
    """The text fingerprint of an arbitrary string has 16 entries."""
    expected_length = 16
    fingerprint = _encode_text_fingerprint("anything goes")
    assert len(fingerprint) == expected_length
259
260
def test_text_fingerprint_values_in_range() -> None:
    """Every fingerprint component lies within [0, 1]."""
    fingerprint = _encode_text_fingerprint("jazz composition in C major")
    # Collect offenders so a failure shows which values fell outside the range.
    out_of_range = [value for value in fingerprint if not (0.0 <= value <= 1.0)]
    assert not out_of_range
265
266
def test_l2_normalise_unit_vector() -> None:
    """Normalising a non-zero vector gives a result whose L2 norm is 1."""
    normalised = _l2_normalise([3.0, 4.0])
    sum_of_squares = sum(component * component for component in normalised)
    length = math.sqrt(sum_of_squares)
    assert abs(length - 1.0) < 1e-9
272
273
def test_l2_normalise_zero_vector_unchanged() -> None:
    """An all-zero vector is returned as-is rather than triggering a division by zero."""
    zeros = [0.0] * 10
    assert _l2_normalise(zeros) == zeros