sander-wood committed on
Commit
2a2e7c2
·
verified ·
1 Parent(s): ad822ab

Upload 12 files

Browse files
Files changed (6) hide show
  1. .gitattributes +1 -0
  2. 4tDYMayp6Dk.jpg +0 -0
  3. BuYf0taXoNw.jpg +0 -0
  4. Kw-_Ew5bVxs.jpg +0 -0
  5. V4EauuhVEw4.jpg +3 -0
  6. app.py +84 -37
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ V4EauuhVEw4.jpg filter=lfs diff=lfs merge=lfs -text
4tDYMayp6Dk.jpg ADDED
BuYf0taXoNw.jpg ADDED
Kw-_Ew5bVxs.jpg ADDED
V4EauuhVEw4.jpg ADDED

Git LFS Details

  • SHA256: c8773f450b8d2adc2e041c9f21bcb88cc861ee34300dae04734ff9ae57f413a7
  • Pointer size: 131 Bytes
  • Size of remote file: 292 kB
app.py CHANGED
@@ -12,26 +12,42 @@ from transformers import BlipProcessor, BlipForConditionalGeneration
12
  title = "# 🗜️ CLaMP 3 - Multimodal & Multilingual Semantic Music Search"
13
 
14
  badges = """
15
- <div style="text-align: center;">
16
- <a href="#"><img src="https://img.shields.io/badge/CLaMP%203%20Homepage-Coming%20Soon-lightgrey?style=for-the-badge&logo=home-assistant" alt="Homepage"></a>
17
- <a href="#"><img src="https://img.shields.io/badge/CLaMP%203%20Paper-Coming%20Soon-lightgrey?style=for-the-badge&logo=arxiv" alt="Paper"></a>
18
- <a href="https://github.com/sanderwood/clamp3"><img src="https://img.shields.io/badge/CLaMP%203%20Code-GitHub-181717?style=for-the-badge&logo=github" alt="GitHub"></a>
19
- <a href="https://huggingface.co/sander-wood/clamp3/tree/main"><img src="https://img.shields.io/badge/Model%20Weights-Hugging%20Face-ffcc00?style=for-the-badge&logo=huggingface" alt="Model Weights"></a>
20
- <a href="https://huggingface.co/datasets/sander-wood/m4-rag"><img src="https://img.shields.io/badge/M4--RAG%20Pretraining%20Dataset-Hugging%20Face-ffcc00?style=for-the-badge&logo=huggingface" alt="Dataset"></a>
21
- <a href="https://huggingface.co/datasets/sander-wood/wikimt-x"><img src="https://img.shields.io/badge/WikiMT--X%20Evaluation%20Benchmark-Hugging%20Face-ffcc00?style=for-the-badge&logo=huggingface" alt="Benchmark"></a>
22
- </div>
23
- <style>
24
- div a {
25
- display: inline-block;
26
- margin: 5px;
27
- }
28
- div a img {
29
- height: 30px;
30
- }
31
- </style>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  """
33
 
34
- description = """CLaMP 3 is a **multimodal and multilingual** music information retrieval (MIR) framework, supporting **sheet music, audio, and performance signals** in over **100 languages**. Using **contrastive learning**, it aligns these modalities in a shared space for **cross-modal retrieval**.
35
 
36
  ### 🔍 **How This Demo Works**
37
  - You can **retrieve music using any text input (in any language) or an image** (`.png`, `.jpg`).
@@ -41,10 +57,13 @@ description = """CLaMP 3 is a **multimodal and multilingual** music information
41
  ### ⚠️ **Limitations**
42
  - This demo retrieves music **only from the WikiMT-X benchmark (1,000 pieces)**.
43
  - These pieces are **mainly from the U.S. and Western Europe (especially the U.S.)** and **mostly from the 20th century**.
44
- - The retrieval results are **mostly limited to Western 20th-century music**, so you **won’t** find music from **other regions or historical periods**.
45
- - If you need retrieval for a **different music collection**, deploy **CLaMP 3 on your own dataset**.
 
 
46
 
47
- This demo is for **research purposes only**."""
 
48
 
49
  # Load BLIP image captioning model and processor
50
  processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
@@ -246,20 +265,27 @@ def search_wrapper(search_mode, text_input, image_input):
246
  details = show_details(top_candidate)
247
  return text_to_use, gr.update(choices=choices), *details
248
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
249
  with gr.Blocks() as demo:
250
  gr.Markdown(title)
251
  gr.HTML(badges)
252
  gr.Markdown(description)
253
- gr.HTML(
254
- """
255
- <style>
256
- .vertical-radio .gradio-radio label {
257
- display: block !important;
258
- margin-bottom: 5px;
259
- }
260
- </style>
261
- """
262
- )
263
  with gr.Row():
264
  with gr.Column():
265
  search_mode = gr.Radio(
@@ -270,13 +296,13 @@ with gr.Blocks() as demo:
270
  elem_classes=["vertical-radio"]
271
  )
272
  text_input = gr.Textbox(
273
- placeholder="Describe the music you're looking for (in any language)",
274
- lines=4
275
- )
276
  image_input = gr.Image(
277
- label="Or upload an image (PNG, JPG)",
278
- type="pil"
279
- )
280
  search_button = gr.Button("Search")
281
  candidate_radio = gr.Radio(choices=[], label="Select Retrieval Result", interactive=True, elem_classes=["vertical-radio"])
282
  with gr.Column():
@@ -290,11 +316,32 @@ with gr.Blocks() as demo:
290
  analysis_box = gr.Textbox(label="Analysis", interactive=False)
291
  description_box = gr.Textbox(label="Description", interactive=False)
292
  scene_box = gr.Textbox(label="Scene", interactive=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
293
  search_button.click(
294
  fn=search_wrapper,
295
  inputs=[search_mode, text_input, image_input],
296
  outputs=[text_input, candidate_radio, title_box, artists_box, genre_box, background_box, analysis_box, description_box, scene_box, youtube_box]
297
  )
 
298
  candidate_radio.change(
299
  fn=show_details,
300
  inputs=candidate_radio,
 
12
  title = "# 🗜️ CLaMP 3 - Multimodal & Multilingual Semantic Music Search"
13
 
14
  badges = """
15
+ <div style="text-align: center;">
16
+ <a href="https://sanderwood.github.io/clamp3/">
17
+ <img src="https://img.shields.io/badge/CLaMP%203%20Homepage-GitHub-181717?style=for-the-badge&logo=home-assistant" alt="Homepage">
18
+ </a>
19
+ <a href="https://arxiv.org/abs/2502.10362">
20
+ <img src="https://img.shields.io/badge/CLaMP%203%20Paper-Arxiv-red?style=for-the-badge&logo=arxiv" alt="Paper">
21
+ </a>
22
+ <a href="https://github.com/sanderwood/clamp3">
23
+ <img src="https://img.shields.io/badge/CLaMP%203%20Code-GitHub-181717?style=for-the-badge&logo=github" alt="GitHub">
24
+ </a>
25
+ <a href="https://huggingface.co/spaces/sander-wood/clamp3">
26
+ <img src="https://img.shields.io/badge/CLaMP%203%20Demo-Gradio-green?style=for-the-badge&logo=gradio" alt="Demo">
27
+ </a>
28
+ <a href="https://huggingface.co/sander-wood/clamp3/tree/main">
29
+ <img src="https://img.shields.io/badge/Model%20Weights-Hugging%20Face-ffcc00?style=for-the-badge&logo=huggingface" alt="Model Weights">
30
+ </a>
31
+ <a href="https://huggingface.co/datasets/sander-wood/m4-rag">
32
+ <img src="https://img.shields.io/badge/M4--RAG%20Dataset-Hugging%20Face-ffcc00?style=for-the-badge&logo=huggingface" alt="Dataset">
33
+ </a>
34
+ <a href="https://huggingface.co/datasets/sander-wood/wikimt-x">
35
+ <img src="https://img.shields.io/badge/WikiMT--X%20Benchmark-Hugging%20Face-ffcc00?style=for-the-badge&logo=huggingface" alt="Benchmark">
36
+ </a>
37
+ </div>
38
+
39
+ <style>
40
+ div a {
41
+ display: inline-block;
42
+ margin: 5px;
43
+ }
44
+ div a img {
45
+ height: 30px;
46
+ }
47
+ </style>
48
  """
49
 
50
+ description = """CLaMP 3 is a **multimodal and multilingual** music information retrieval (MIR) framework, supporting **sheet music, audio, and performance signals** in **100 languages**. Using **contrastive learning**, it aligns these modalities in a shared space for **cross-modal retrieval**.
51
 
52
  ### 🔍 **How This Demo Works**
53
  - You can **retrieve music using any text input (in any language) or an image** (`.png`, `.jpg`).
 
57
  ### ⚠️ **Limitations**
58
  - This demo retrieves music **only from the WikiMT-X benchmark (1,000 pieces)**.
59
  - These pieces are **mainly from the U.S. and Western Europe (especially the U.S.)** and **mostly from the 20th century**.
60
+ - Thus, retrieval results are **mostly limited to Western 20th-century music**, so you **won’t** find music from **other regions or historical periods**.
61
+
62
+ 🔧 **Need retrieval for a different music collection?** Deploy **[CLaMP 3](https://github.com/sanderwood/clamp3)** on your own dataset.
63
+ Generally, the larger and more diverse the reference music dataset, the better the retrieval quality, increasing the likelihood of finding relevant and accurately matched music.
64
 
65
+ **Note: This project is for research use only.**
66
+ """
67
 
68
  # Load BLIP image captioning model and processor
69
  processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
 
265
  details = show_details(top_candidate)
266
  return text_to_use, gr.update(choices=choices), *details
267
 
268
+ # Define the example data (placing the examples after the component definitions also works fine)
269
+ examples = [
270
+ ["Text", "classic rock, British, 1960s, upbeat", None],
271
+ ["Text", "A Latin jazz piece with rhythmic percussion and brass", None],
272
+ ["Text", "big band, major key, swing, brass-heavy, syncopation, baritone vocal", None],
273
+ ["Text", "Heartfelt and nostalgic, with a bittersweet, melancholic feel", None],
274
+ ["Text", "Melodía instrumental en re mayor con progresión armónica repetitiva y fluida", None],
275
+ ["Text", "D大调四四拍的爱尔兰舞曲", None],
276
+ ["Text", "Ιερή μουσική με πνευματική ατμόσφαιρα", None],
277
+ ["Text", "የፍቅር ሙዚቃ ሞቅ እና ስሜታማ ከሆነ ነገር ግን ድንቅ እና አስደሳች ቃላት ያካትታል", None],
278
+ ["Image", None, "V4EauuhVEw4.jpg"],
279
+ ["Image", None, "Kw-_Ew5bVxs.jpg"],
280
+ ["Image", None, "BuYf0taXoNw.jpg"],
281
+ ["Image", None, "4tDYMayp6Dk.jpg"],
282
+ ]
283
+
284
  with gr.Blocks() as demo:
285
  gr.Markdown(title)
286
  gr.HTML(badges)
287
  gr.Markdown(description)
288
+
 
 
 
 
 
 
 
 
 
289
  with gr.Row():
290
  with gr.Column():
291
  search_mode = gr.Radio(
 
296
  elem_classes=["vertical-radio"]
297
  )
298
  text_input = gr.Textbox(
299
+ placeholder="Describe the music you're looking for (in any language)",
300
+ lines=4
301
+ )
302
  image_input = gr.Image(
303
+ label="Or upload an image (PNG, JPG)",
304
+ type="pil"
305
+ )
306
  search_button = gr.Button("Search")
307
  candidate_radio = gr.Radio(choices=[], label="Select Retrieval Result", interactive=True, elem_classes=["vertical-radio"])
308
  with gr.Column():
 
316
  analysis_box = gr.Textbox(label="Analysis", interactive=False)
317
  description_box = gr.Textbox(label="Description", interactive=False)
318
  scene_box = gr.Textbox(label="Scene", interactive=False)
319
+
320
+ gr.HTML(
321
+ """
322
+ <style>
323
+ .vertical-radio .gradio-radio label {
324
+ display: block !important;
325
+ margin-bottom: 5px;
326
+ }
327
+ </style>
328
+ """
329
+ )
330
+
331
+ gr.Examples(
332
+ examples=examples,
333
+ inputs=[search_mode, text_input, image_input],
334
+ outputs=[text_input, candidate_radio, title_box, artists_box, genre_box, background_box, analysis_box, description_box, scene_box, youtube_box],
335
+ fn=search_wrapper,
336
+ cache_examples=False,
337
+ )
338
+
339
  search_button.click(
340
  fn=search_wrapper,
341
  inputs=[search_mode, text_input, image_input],
342
  outputs=[text_input, candidate_radio, title_box, artists_box, genre_box, background_box, analysis_box, description_box, scene_box, youtube_box]
343
  )
344
+
345
  candidate_radio.change(
346
  fn=show_details,
347
  inputs=candidate_radio,