sander-wood commited on
Commit
96e13a0
·
verified ·
1 Parent(s): e3b6675

Delete semantic_search

Browse files
semantic_search/README.md DELETED
@@ -1,62 +0,0 @@
1
- # Semantic Search Codebase
2
-
3
- ## Overview
4
- CLaMP 2 is a state-of-the-art multimodal music information retrieval system designed to work with 101 languages. This codebase includes scripts for evaluating model performance, performing semantic searches, and calculating similarity metrics based on CLaMP2-extracted **normalized** feature vectors from music or text data. Below is a description of the scripts contained in the `semantic_search/` folder.
5
-
6
- ## Repository Structure
7
- The `semantic_search/` folder contains the following scripts:
8
-
9
- ### 1. `clamp2_score.py`
10
- This script calculates the cosine similarity between the average feature vectors extracted from two sets of `.npy` files, serving as a measure of similarity between the reference and test datasets.
11
-
12
- It can be used to validate the semantic similarity between generated music and ground truth, providing an objective metric. Through empirical observation, we found that this metric aligns well with subjective judgments made by individuals with professional music expertise.
13
-
14
- **Usage:**
15
- ```bash
16
- python clamp2_score.py <reference_folder> <test_folder>
17
- ```
18
- - `reference_folder`: Path to the folder containing reference `.npy` files.
19
- - `test_folder`: Path to the folder containing test `.npy` files.
20
-
21
- **Functionality:**
22
- - Loads all `.npy` files from the specified folders.
23
- - Computes the average feature vector for each folder.
24
- - Calculates the cosine similarity between the two averaged vectors.
25
- - Outputs the similarity score rounded to four decimal places.
26
-
27
- ### 2. `semantic_search.py`
28
- This script performs semantic search by calculating the cosine similarity between a query feature and a set of features stored in `.npy` files.
29
-
30
- **Usage:**
31
- ```bash
32
- python semantic_search.py <query_file> <features_folder> [--top_k TOP_K]
33
- ```
34
- - `query_file`: Path to the query feature file (e.g., `ballad.npy`).
35
- - `features_folder`: Path to the folder containing feature files for comparison.
36
- - `--top_k`: (Optional) Number of top similar items to display. Defaults to 10 if not specified.
37
-
38
- **Functionality:**
39
- - Loads a query feature from the specified file.
40
- - Loads feature vectors from the given folder.
41
- - Computes cosine similarity between the query feature and each loaded feature vector.
42
- - Displays the top K most similar features along with their similarity scores.
43
-
44
- ### 3. `semantic_search_metrics.py`
45
- This script calculates evaluation metrics for semantic search by comparing query features to reference features.
46
-
47
- **Usage:**
48
- ```bash
49
- python semantic_search_metrics.py <query_folder> <reference_folder>
50
- ```
51
- - `query_folder`: Path to the folder containing query features (in `.npy` format).
52
- - `reference_folder`: Path to the folder containing reference features (in `.npy` format).
53
-
54
- **Functionality:**
55
- - Loads query features from the specified folder.
56
- - Loads reference features from the given folder.
57
- - Computes the following metrics based on cosine similarity:
58
- - **Mean Reciprocal Rank (MRR)**
59
- - **Hit@1**
60
- - **Hit@10**
61
- - **Hit@100**
62
- - Outputs the calculated metrics to the console.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
semantic_search/clamp2_score.py DELETED
@@ -1,57 +0,0 @@
1
- import os
2
- import numpy as np
3
- import argparse
4
-
5
def load_npy_files(folder_path):
    """
    Load the first entry of every .npy file in a folder.

    Files are visited in sorted filename order so the returned list (and any
    floating-point reduction computed from it) is reproducible across runs
    and platforms; os.listdir() alone gives no ordering guarantee.

    Args:
        folder_path: Path to the folder to scan. Entries whose name does not
            end with '.npy' are ignored.

    Returns:
        A list with one element per .npy file: index [0] of the loaded array
        (presumably the single feature vector stored in the file -- confirm
        against the feature extractor).
    """
    npy_list = []
    for file_name in sorted(os.listdir(folder_path)):
        if file_name.endswith('.npy'):
            file_path = os.path.join(folder_path, file_name)
            # Only the first entry along axis 0 of each file is used.
            npy_list.append(np.load(file_path)[0])
    return npy_list
16
-
17
def average_npy(npy_list):
    """
    Compute the element-wise average of a list of numpy arrays.

    Args:
        npy_list: Non-empty sequence of equally shaped numpy arrays.

    Returns:
        The mean taken along axis 0, i.e. across the arrays in the list.

    Raises:
        ValueError: If npy_list is empty. Without this check np.mean would
            only emit a RuntimeWarning and return NaN, which then surfaces
            as a meaningless "nan" similarity score.
    """
    if len(npy_list) == 0:
        raise ValueError("Cannot average an empty list of feature arrays.")
    return np.mean(npy_list, axis=0)
22
-
23
def cosine_similarity(vec1, vec2):
    """
    Compute the cosine similarity between two numpy vectors.

    Args:
        vec1: First vector.
        vec2: Second vector, same length as vec1.

    Returns:
        dot(vec1, vec2) / (||vec1|| * ||vec2||). If either vector has zero
        norm the cosine is undefined; 0.0 is returned in that case instead of
        dividing by zero and propagating NaN.
    """
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if denominator == 0:
        # Zero-length vector: no direction to compare against.
        return 0.0
    return np.dot(vec1, vec2) / denominator
35
-
36
if __name__ == '__main__':
    # Command-line interface: two positional folder paths.
    arg_parser = argparse.ArgumentParser(description="Calculate cosine similarity between average feature vectors.")
    arg_parser.add_argument('reference', type=str, help='Path to the reference folder containing .npy files.')
    arg_parser.add_argument('test', type=str, help='Path to the test folder containing .npy files.')
    cli_args = arg_parser.parse_args()
    reference, test = cli_args.reference, cli_args.test

    # Read both folders first, then reduce each one to a single mean vector.
    ref_npy, test_npy = load_npy_files(reference), load_npy_files(test)
    avg_ref, avg_test = average_npy(ref_npy), average_npy(test_npy)

    # Report the similarity of the two mean vectors with four decimals.
    similarity = cosine_similarity(avg_ref, avg_test)
    print(f"Cosine similarity between '{reference}' and '{test}': {similarity:.4f}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
semantic_search/semantic_search.py DELETED
@@ -1,51 +0,0 @@
1
- import os
2
- import torch
3
- import numpy as np
4
- import argparse
5
-
6
def get_info(folder_path):
    """
    Load all .npy files from a folder into a dictionary of features.

    Args:
        folder_path: Path to the folder to scan. Entries whose name does not
            end with ".npy" are ignored.

    Returns:
        A dict mapping each filename with its ".npy" extension removed to
        index [0] of the array stored in that file. Files are processed in
        sorted filename order, which is also the dict's iteration order.
    """
    features = {}

    for file in sorted(os.listdir(folder_path)):
        if file.endswith(".npy"):
            # splitext strips only the final extension, so a dotted name such
            # as "song.v2.npy" keeps the key "song.v2" rather than collapsing
            # to "song" and silently overwriting other "song.*" entries.
            key = os.path.splitext(file)[0]
            features[key] = np.load(os.path.join(folder_path, file))[0]

    return features
19
-
20
def main(query_file, features_folder, top_k=10):
    """
    Rank the features in a folder by cosine similarity to a query feature
    and print the best matches.

    Args:
        query_file: Path to a .npy file; index [0] of the loaded array is
            used as the query vector.
        features_folder: Folder of .npy feature files, loaded via get_info().
        top_k: Maximum number of results to print (default 10). When fewer
            features are available, all of them are printed instead of
            failing with an IndexError.

    Raises:
        ValueError: If features_folder contains no .npy feature files.
    """
    # Load the query and give it a leading batch dimension for broadcasting.
    query_feature = np.load(query_file)[0]
    query_tensor = torch.tensor(query_feature).unsqueeze(dim=0)

    key_features = get_info(features_folder)
    if not key_features:
        raise ValueError(f"No .npy feature files found in '{features_folder}'.")

    # Fix one key order and build the feature matrix in that same order so
    # row i of the matrix always corresponds to keys[i].
    keys = list(key_features)
    key_feats_tensor = torch.tensor(np.array([key_features[k] for k in keys]))

    # One similarity score per stored feature, best match first.
    similarities = torch.cosine_similarity(query_tensor, key_feats_tensor)
    ranked_indices = torch.argsort(similarities, descending=True).tolist()

    print(f"Top {top_k} similar items:")
    # Slicing clamps to the number of available items; max() keeps a
    # negative top_k printing nothing, as range(top_k) would.
    for index in ranked_indices[:max(top_k, 0)]:
        print(keys[index], similarities[index].item())
41
-
42
if __name__ == '__main__':
    # Command-line interface: query file, features folder, optional --top_k.
    cli = argparse.ArgumentParser(description="Find top similar features based on cosine similarity.")
    cli.add_argument('query_file', type=str, help='Path to the query feature file (e.g., ballad.npy).')
    cli.add_argument('features_folder', type=str, help='Path to the folder containing feature files for comparison.')
    cli.add_argument('--top_k', type=int, default=10, help='Number of top similar items to display (default: 10).')
    options = cli.parse_args()

    # Hand the parsed options straight to the search routine.
    main(options.query_file, options.features_folder, options.top_k)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
semantic_search/semantic_search_metrics.py DELETED
@@ -1,78 +0,0 @@
1
- import os
2
- import torch
3
- import numpy as np
4
- import argparse
5
-
6
def get_features(path):
    """
    Load and return feature data from .npy files in the given directory.

    Args:
        path: Directory to scan. Entries whose name does not end with ".npy"
            are ignored.

    Returns:
        A dict keyed by filename without its ".npy" extension; each value is
        index [0] of the array stored in that file. Files are processed in
        sorted filename order, which is also the dict's iteration order.
    """
    features = {}

    for file in sorted(os.listdir(path)):
        if file.endswith(".npy"):
            # Strip only the final extension: "track.v2.npy" must map to
            # "track.v2", not "track", or dotted names collide and the later
            # file silently replaces the earlier one.
            key = os.path.splitext(file)[0]
            features[key] = np.load(os.path.join(path, file))[0]

    return features
20
-
21
def calculate_metrics(query_features, reference_features):
    """
    Calculate MRR, Hit@1, Hit@10, and Hit@100 metrics based on the cosine
    similarity between query and reference features.

    Only keys present in both dictionaries are evaluated. For each such key
    the query feature is compared against every common reference feature,
    and the rank of the reference with the same key is recorded. Ties are
    resolved in favour of the matching reference.

    Args:
        query_features: Dict mapping key -> query feature vector.
        reference_features: Dict mapping key -> reference feature vector.

    Returns:
        A dict with the unrounded values under "MRR", "Hit@1", "Hit@10" and
        "Hit@100". The same metrics, rounded to four decimals, are printed
        to the console.

    Raises:
        ValueError: If the two dictionaries share no keys, since the metrics
            would otherwise require dividing by zero.
    """
    # Keep a fixed, deterministic key order (query order) so position idx in
    # this list is also the row index of the matching reference feature.
    common_keys = [key for key in query_features if key in reference_features]
    if not common_keys:
        raise ValueError("No common keys between query and reference features.")

    # The reference matrix is identical for every query, so build it once.
    ref_feats = torch.tensor(np.array([reference_features[k] for k in common_keys]))

    mrr, hit_1, hit_10, hit_100 = 0.0, 0, 0, 0

    for idx, key in enumerate(common_keys):
        # Convert query feature to tensor and add batch dimension
        query_feat = torch.tensor(query_features[key]).unsqueeze(dim=0)

        # Cosine similarity between the query and all reference features
        similarities = torch.cosine_similarity(query_feat, ref_feats)

        # 1-based rank of the matching reference: one plus the number of
        # references scoring strictly higher. Equal scores do not push the
        # match down, which is the same tie-breaking a descending sort that
        # prefers the matching index would produce.
        rank = int((similarities > similarities[idx]).sum().item()) + 1

        mrr += 1 / rank
        if rank <= 100:
            hit_100 += 1
        if rank <= 10:
            hit_10 += 1
        if rank <= 1:
            hit_1 += 1

    # Compute the final metrics
    total_keys = len(common_keys)
    metrics = {
        "MRR": mrr / total_keys,
        "Hit@1": hit_1 / total_keys,
        "Hit@10": hit_10 / total_keys,
        "Hit@100": hit_100 / total_keys,
    }
    print(f"MRR: {round(metrics['MRR'], 4)}")
    print(f"Hit@1: {round(metrics['Hit@1'], 4)}")
    print(f"Hit@10: {round(metrics['Hit@10'], 4)}")
    print(f"Hit@100: {round(metrics['Hit@100'], 4)}")
    return metrics
65
-
66
if __name__ == '__main__':
    # Command-line interface: a query folder and a reference folder.
    cli = argparse.ArgumentParser(description="Calculate similarity metrics between query and reference features.")
    cli.add_argument('query_folder', type=str, help='Path to the folder containing query features (.npy files).')
    cli.add_argument('reference_folder', type=str, help='Path to the folder containing reference features (.npy files).')
    options = cli.parse_args()

    # Read both feature sets (query first), then evaluate and print metrics.
    query_features, reference_features = get_features(options.query_folder), get_features(options.reference_folder)
    calculate_metrics(query_features, reference_features)