Delete semantic_search
Browse files
semantic_search/README.md
DELETED
@@ -1,62 +0,0 @@
|
|
1 |
-
# Semantic Search Codebase
|
2 |
-
|
3 |
-
## Overview
|
4 |
-
CLaMP 2 is a state-of-the-art multimodal music information retrieval system designed to work with 101 languages. This codebase includes scripts for evaluating model performance, performing semantic searches, and calculating similarity metrics based on CLaMP2-extracted **normalized** feature vectors from music or text data. Below is a description of the scripts contained in the `semantic_search/` folder.
|
5 |
-
|
6 |
-
## Repository Structure
|
7 |
-
The `semantic_search/` folder contains the following scripts:
|
8 |
-
|
9 |
-
### 1. `clamp2_score.py`
|
10 |
-
This script calculates the cosine similarity between the average feature vectors extracted from two sets of `.npy` files, serving as a measure of similarity between the reference and test datasets.
|
11 |
-
|
12 |
-
It can be used to validate the semantic similarity between generated music and ground truth, providing an objective metric. Through empirical observation, we found that this metric aligns well with subjective judgments made by individuals with professional music expertise.
|
13 |
-
|
14 |
-
**Usage:**
|
15 |
-
```bash
|
16 |
-
python clamp2_score.py <reference_folder> <test_folder>
|
17 |
-
```
|
18 |
-
- `reference_folder`: Path to the folder containing reference `.npy` files.
|
19 |
-
- `test_folder`: Path to the folder containing test `.npy` files.
|
20 |
-
|
21 |
-
**Functionality:**
|
22 |
-
- Loads all `.npy` files from the specified folders.
|
23 |
-
- Computes the average feature vector for each folder.
|
24 |
-
- Calculates the cosine similarity between the two averaged vectors.
|
25 |
-
- Outputs the similarity score rounded to four decimal places.
|
26 |
-
|
27 |
-
### 2. `semantic_search.py`
|
28 |
-
This script performs semantic search by calculating the cosine similarity between a query feature and a set of features stored in `.npy` files.
|
29 |
-
|
30 |
-
**Usage:**
|
31 |
-
```bash
|
32 |
-
python semantic_search.py <query_file> <features_folder> [--top_k TOP_K]
|
33 |
-
```
|
34 |
-
- `query_file`: Path to the query feature file (e.g., `ballad.npy`).
|
35 |
-
- `features_folder`: Path to the folder containing feature files for comparison.
|
36 |
-
- `--top_k`: (Optional) Number of top similar items to display. Defaults to 10 if not specified.
|
37 |
-
|
38 |
-
**Functionality:**
|
39 |
-
- Loads a query feature from the specified file.
|
40 |
-
- Loads feature vectors from the given folder.
|
41 |
-
- Computes cosine similarity between the query feature and each loaded feature vector.
|
42 |
-
- Displays the top K most similar features along with their similarity scores.
|
43 |
-
|
44 |
-
### 3. `semantic_search_metrics.py`
|
45 |
-
This script calculates evaluation metrics for semantic search by comparing query features to reference features.
|
46 |
-
|
47 |
-
**Usage:**
|
48 |
-
```bash
|
49 |
-
python semantic_search_metrics.py <query_folder> <reference_folder>
|
50 |
-
```
|
51 |
-
- `query_folder`: Path to the folder containing query features (in `.npy` format).
|
52 |
-
- `reference_folder`: Path to the folder containing reference features (in `.npy` format).
|
53 |
-
|
54 |
-
**Functionality:**
|
55 |
-
- Loads query features from the specified folder.
|
56 |
-
- Loads reference features from the given folder.
|
57 |
-
- Computes the following metrics based on cosine similarity:
|
58 |
-
- **Mean Reciprocal Rank (MRR)**
|
59 |
-
- **Hit@1**
|
60 |
-
- **Hit@10**
|
61 |
-
- **Hit@100**
|
62 |
-
- Outputs the calculated metrics to the console.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
semantic_search/clamp2_score.py
DELETED
@@ -1,57 +0,0 @@
|
|
1 |
-
import os
|
2 |
-
import numpy as np
|
3 |
-
import argparse
|
4 |
-
|
5 |
-
def load_npy_files(folder_path):
    """
    Load the first entry of every .npy file found in a folder.

    Args:
        folder_path: Directory to scan (non-recursively) for files whose
            name ends with '.npy'.

    Returns:
        A list with one item per .npy file: element [0] of the loaded array.
        Files are visited in sorted filename order so the result does not
        depend on the order in which the filesystem lists entries.
    """
    npy_list = []
    # os.listdir() gives entries in arbitrary order; sort for reproducibility.
    for file_name in sorted(os.listdir(folder_path)):
        if file_name.endswith('.npy'):
            file_path = os.path.join(folder_path, file_name)
            # Only index 0 of each stored array is kept.
            # NOTE(review): presumably each file holds a (1, dim) feature array - confirm.
            npy_list.append(np.load(file_path)[0])
    return npy_list
|
16 |
-
|
17 |
-
def average_npy(npy_list):
    """
    Compute the element-wise average of a list of numpy arrays.

    Args:
        npy_list: Sequence of equally-shaped numpy arrays.

    Returns:
        The mean taken along axis 0 (i.e. across the list items).

    Raises:
        ValueError: If npy_list is empty. Without this check np.mean would
            return NaN with only a RuntimeWarning, and that NaN would flow
            silently into later computations.
    """
    if len(npy_list) == 0:
        raise ValueError("Cannot average an empty list of arrays.")
    return np.mean(npy_list, axis=0)
|
22 |
-
|
23 |
-
def cosine_similarity(vec1, vec2):
    """
    Compute cosine similarity between two numpy arrays.

    Args:
        vec1: First vector.
        vec2: Second vector, same length as vec1.

    Returns:
        dot(vec1, vec2) / (||vec1|| * ||vec2||). If either vector has zero
        norm the similarity is undefined; 0.0 is returned in that case
        instead of dividing by zero and producing NaN.
    """
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)

    # Guard against division by zero for all-zero vectors.
    if norm_vec1 == 0 or norm_vec2 == 0:
        return 0.0

    return np.dot(vec1, vec2) / (norm_vec1 * norm_vec2)
|
35 |
-
|
36 |
-
if __name__ == '__main__':
    # Command-line interface: two positional folder paths.
    cli = argparse.ArgumentParser(description="Calculate cosine similarity between average feature vectors.")
    cli.add_argument('reference', type=str, help='Path to the reference folder containing .npy files.')
    cli.add_argument('test', type=str, help='Path to the test folder containing .npy files.')
    options = cli.parse_args()
    reference, test = options.reference, options.test

    # Read both feature sets before doing any arithmetic on them.
    reference_vectors = load_npy_files(reference)
    test_vectors = load_npy_files(test)

    # Collapse each set to its mean vector, then compare the two means.
    similarity = cosine_similarity(
        average_npy(reference_vectors),
        average_npy(test_vectors),
    )

    # Report the score with four decimal places.
    print(f"Cosine similarity between '{reference}' and '{test}': {similarity:.4f}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
semantic_search/semantic_search.py
DELETED
@@ -1,51 +0,0 @@
|
|
1 |
-
import os
|
2 |
-
import torch
|
3 |
-
import numpy as np
|
4 |
-
import argparse
|
5 |
-
|
6 |
-
def get_info(folder_path):
    """
    Load all .npy files from a specified folder and return a dictionary of features.

    Args:
        folder_path: Directory to scan (non-recursively) for '.npy' files.

    Returns:
        A dict mapping each filename with only its final extension removed
        to element [0] of the array stored in that file. Entries are
        inserted in sorted filename order.
    """
    features = {}

    for file in sorted(os.listdir(folder_path)):
        if file.endswith(".npy"):
            # splitext drops only the trailing ".npy"; splitting on the first
            # dot would map "a.v1.npy" and "a.v2.npy" to the same key "a"
            # and silently overwrite one of them.
            key = os.path.splitext(file)[0]
            features[key] = np.load(os.path.join(folder_path, file))[0]

    return features
|
19 |
-
|
20 |
-
def main(query_file, features_folder, top_k=10):
    """
    Print the features in a folder that are most similar to a query feature.

    Args:
        query_file: Path to a .npy file; element [0] of the stored array is
            used as the query vector.
        features_folder: Folder of .npy feature files, loaded via get_info().
        top_k: Maximum number of results to print. If it exceeds the number
            of available features, all features are printed.

    Raises:
        ValueError: If features_folder contains no .npy files.
    """
    # Load query feature and add a leading batch dimension.
    query_feature = np.load(query_file)[0]
    query_tensor = torch.tensor(query_feature).unsqueeze(dim=0)

    # Load candidate features from the specified folder.
    key_features = get_info(features_folder)
    if not key_features:
        # Fail with a clear message rather than an opaque tensor-shape error.
        raise ValueError(f"No .npy feature files found in '{features_folder}'.")

    # Keys and tensor rows share the same order, so row i belongs to keys[i].
    keys = list(key_features)
    key_feats_tensor = torch.tensor(np.array([key_features[k] for k in keys]))

    # Cosine similarity of the query against every candidate, best first.
    similarities = torch.cosine_similarity(query_tensor, key_feats_tensor)
    ranked_indices = torch.argsort(similarities, descending=True).tolist()

    # Never index past the number of candidates actually loaded.
    top_k = min(top_k, len(keys))

    print(f"Top {top_k} similar items:")
    for i in range(top_k):
        best = ranked_indices[i]
        print(keys[best], similarities[best].item())
|
41 |
-
|
42 |
-
if __name__ == '__main__':
    # CLI: a query feature file, a folder of candidate features, and an
    # optional cap on how many matches to show.
    cli = argparse.ArgumentParser(description="Find top similar features based on cosine similarity.")
    cli.add_argument('query_file', type=str, help='Path to the query feature file (e.g., ballad.npy).')
    cli.add_argument('features_folder', type=str, help='Path to the folder containing feature files for comparison.')
    cli.add_argument('--top_k', type=int, default=10, help='Number of top similar items to display (default: 10).')
    options = cli.parse_args()

    # Hand the parsed values to the search routine.
    main(options.query_file, options.features_folder, options.top_k)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
semantic_search/semantic_search_metrics.py
DELETED
@@ -1,78 +0,0 @@
|
|
1 |
-
import os
|
2 |
-
import torch
|
3 |
-
import numpy as np
|
4 |
-
import argparse
|
5 |
-
|
6 |
-
def get_features(path):
    """
    Load and return feature data from .npy files in the given directory.

    Args:
        path: Directory to scan (non-recursively) for '.npy' files.

    Returns:
        A dict mapping each filename with only its final extension removed
        to element [0] of the array stored in that file. Entries are
        inserted in sorted filename order.
    """
    features = {}

    for file in sorted(os.listdir(path)):
        if file.endswith(".npy"):
            # splitext drops only the trailing ".npy"; splitting on the first
            # dot would collapse "a.v1.npy" and "a.v2.npy" onto one key and
            # silently discard a feature.
            key = os.path.splitext(file)[0]
            features[key] = np.load(os.path.join(path, file))[0]

    return features
|
20 |
-
|
21 |
-
def calculate_metrics(query_features, reference_features):
    """
    Calculate MRR, Hit@1, Hit@10, and Hit@100 metrics based on the similarity
    between query and reference features, and print them to the console.

    Only keys present in both dictionaries are evaluated. For each such key
    the query vector is compared (cosine similarity) against the reference
    vectors of all common keys, and the rank of the reference sharing the
    query's key is recorded.

    Args:
        query_features: Dict mapping key -> query feature vector.
        reference_features: Dict mapping key -> reference feature vector.

    Raises:
        ValueError: If the two dictionaries share no keys (the metrics would
            otherwise require dividing by zero).
    """
    # Follow the query dict's order so results do not depend on set ordering.
    common_keys = [k for k in query_features if k in reference_features]
    total_keys = len(common_keys)
    if total_keys == 0:
        raise ValueError("No common keys between query and reference features.")

    # Loop-invariant: row i holds the reference feature for common_keys[i].
    ref_feats = torch.tensor(np.array([reference_features[k] for k in common_keys]))

    mrr, hit_1, hit_10, hit_100 = 0, 0, 0, 0

    for idx, key in enumerate(common_keys):
        # Convert query feature to tensor and add batch dimension
        query_feat = torch.tensor(query_features[key]).unsqueeze(dim=0)

        # Compute cosine similarity between the query and all reference features
        similarities = torch.cosine_similarity(query_feat, ref_feats)

        # 1-based rank of the matching reference. Only strictly higher scores
        # are counted, so ties are resolved in favour of the correct item,
        # which is the same outcome as sorting with the match placed first.
        rank = int((similarities > similarities[idx]).sum().item()) + 1

        # Accumulate reciprocal rank for MRR
        mrr += 1 / rank

        # Accumulate Hit@1, Hit@10, Hit@100
        if rank <= 100:
            hit_100 += 1
        if rank <= 10:
            hit_10 += 1
        if rank <= 1:
            hit_1 += 1

    # Compute the final metrics
    print(f"MRR: {round(mrr / total_keys, 4)}")
    print(f"Hit@1: {round(hit_1 / total_keys, 4)}")
    print(f"Hit@10: {round(hit_10 / total_keys, 4)}")
    print(f"Hit@100: {round(hit_100 / total_keys, 4)}")
|
65 |
-
|
66 |
-
if __name__ == '__main__':
    # CLI: one folder of query features and one folder of reference features.
    cli = argparse.ArgumentParser(description="Calculate similarity metrics between query and reference features.")
    cli.add_argument('query_folder', type=str, help='Path to the folder containing query features (.npy files).')
    cli.add_argument('reference_folder', type=str, help='Path to the folder containing reference features (.npy files).')
    options = cli.parse_args()

    # Read both feature sets, queries first, then references.
    queries = get_features(options.query_folder)
    references = get_features(options.reference_folder)

    # Evaluate retrieval quality and print the results.
    calculate_metrics(queries, references)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|