Figea committed on
Commit
27dc682
·
verified ·
1 Parent(s): 73c6cca

Delete src/synonyms_final_vf.py

Browse files
Files changed (1) hide show
  1. src/synonyms_final_vf.py +0 -125
src/synonyms_final_vf.py DELETED
@@ -1,125 +0,0 @@
1
-
2
- import pandas as pd
3
- import spacy
4
- import numpy as np
5
- from sklearn.cluster import DBSCAN
6
- from sklearn.metrics.pairwise import cosine_distances
7
- import matplotlib.pyplot as plt
8
- import nltk
9
- from nltk.corpus import wordnet
10
-
11
def load_data(file_path):
    """Read a semicolon-delimited CSV and return its unique gloss entries.

    Parameters
    ----------
    file_path : str
        Path to the CSV file to read.

    Returns
    -------
    numpy.ndarray
        Unique values of the ``gloss`` column, in order of first appearance.
    """
    frame = pd.read_csv(file_path, sep=";")
    return frame["gloss"].unique()
21
-
22
def initialize_spacy_model(model_name="en_core_web_md"):
    """Load and return the requested spaCy language model.

    Parameters
    ----------
    model_name : str, optional
        Name of an installed spaCy pipeline (default ``en_core_web_md``).
    """
    return spacy.load(model_name)
24
-
25
def download_wordnet():
    """Fetch the NLTK WordNet corpus, used later for antonym lookup.

    Safe to call repeatedly: nltk skips corpora that are already present.
    """
    nltk.download('wordnet')
30
-
31
def generate_word_vectors(words, model):
    """Embed every word with *model* and stack the results into one array.

    Parameters
    ----------
    words : iterable of str
        Words to embed.
    model : callable
        A spaCy-style pipeline: ``model(word).vector`` yields the embedding.

    Returns
    -------
    numpy.ndarray
        One row per word, in input order.
    """
    vectors = [model(token).vector for token in words]
    return np.array(vectors)
33
-
34
def plot_k_distance_graph(distances, k):
    """Show the sorted k-th nearest-neighbour distance for every point.

    Useful for eyeballing a good DBSCAN ``eps``: look for the "elbow" in
    the curve.

    Parameters
    ----------
    distances : numpy.ndarray
        Pairwise distance matrix (one row per point).
    k : int
        Which neighbour's distance to plot (column k after row-wise sort).
    """
    # Row-wise sort puts each point's k-th closest neighbour in column k;
    # the second sort orders the points themselves for the elbow plot.
    kth_distances = np.sort(np.sort(distances, axis=1)[:, k])
    plt.figure(figsize=(10, 5))
    plt.plot(kth_distances)
    plt.xlabel('Points sorted by distance')
    plt.ylabel(f'{k}-th Nearest Neighbor Distance')
    plt.title(f'k-distance Graph for k={k}')
    plt.grid(True)
    plt.show()
44
-
45
def perform_dbscan_clustering(word_vectors, eps, min_samples=5):
    """Cluster the word vectors with DBSCAN under the cosine metric.

    Parameters
    ----------
    word_vectors : numpy.ndarray
        One embedding per row.
    eps : float
        DBSCAN neighbourhood radius (cosine distance).
    min_samples : int, optional
        Minimum neighbourhood size for a core point (default 5).

    Returns
    -------
    sklearn.cluster.DBSCAN
        The fitted estimator; labels are available via ``.labels_``.
    """
    model = DBSCAN(metric='cosine', eps=eps, min_samples=min_samples)
    # DBSCAN.fit returns the estimator itself, so this is the fitted model.
    return model.fit(word_vectors)
49
-
50
def create_cluster_mapping(words, dbscan_labels):
    """Group words by their DBSCAN cluster label.

    Parameters
    ----------
    words : iterable of str
        Words, aligned index-for-index with *dbscan_labels*.
    dbscan_labels : iterable of int
        Cluster label per word; -1 marks noise points.

    Returns
    -------
    dict
        label -> list of words in that cluster, insertion-ordered.
    """
    cluster_to_words = {}
    for word, label in zip(words, dbscan_labels):
        # setdefault replaces the original membership-check-then-append
        # pattern: one lookup instead of two.
        cluster_to_words.setdefault(label, []).append(word)
    return cluster_to_words
57
-
58
def find_antonyms(word):
    """Collect WordNet antonyms of *word*.

    For every lemma of every synset of *word*, the first listed antonym
    (if any) is included.

    Returns
    -------
    set of str
        Antonym lemma names; empty when WordNet knows none.
    """
    found = set()
    for synset in wordnet.synsets(word):
        for lemma in synset.lemmas():
            opposites = lemma.antonyms()
            if opposites:
                found.add(opposites[0].name())
    return found
65
-
66
def find_synonyms_in_cluster(word, model, cluster_to_words, dbscan_model=None):
    """Return the word most similar to *word* inside its DBSCAN cluster,
    excluding the word itself and its WordNet antonyms.

    Parameters
    ----------
    word : str
        Query word.
    model : spaCy pipeline
        Used for similarity scoring (``model(w).similarity(...)``).
    cluster_to_words : dict
        label -> list of words, as built by ``create_cluster_mapping``.
    dbscan_model : object, optional
        Unused; kept (with a default) so existing call sites still work.

    Returns
    -------
    str or None
        The best candidate, or None when the word is in no cluster or no
        candidate remains after filtering.
    """
    # BUG FIX: the original called dbscan_model.fit_predict([word_vector]),
    # which REFITS DBSCAN on a single point. With min_samples > 1 a lone
    # point can never form a cluster, so the label was always -1 (noise)
    # and the real clustering was thrown away. Instead, find the cluster
    # that already contains the word in the precomputed mapping.
    cluster_label = next(
        (label for label, members in cluster_to_words.items() if word in members),
        None,
    )
    if cluster_label is None:
        return None

    cluster_words = cluster_to_words[cluster_label]
    antonyms = find_antonyms(word)

    # Hoist the query embedding out of the loop; score every other,
    # non-antonym member of the cluster.
    query_doc = model(word)
    candidates = [
        (candidate, model(candidate).similarity(query_doc))
        for candidate in cluster_words
        if candidate != word and candidate not in antonyms
    ]

    if not candidates:
        return None

    # max() keeps the first best on ties, matching the original stable sort.
    return max(candidates, key=lambda pair: pair[1])[0]
85
-
86
def display_clusters(cluster_to_words):
    """Print each cluster's word list; the -1 bucket is reported as noise.

    Parameters
    ----------
    cluster_to_words : dict
        label -> list of words, as built by ``create_cluster_mapping``.
    """
    for cluster_label, words in cluster_to_words.items():
        if cluster_label == -1:
            print(f"Noise: {words}")
        else:
            print(f"Cluster {cluster_label}: {words}")
92
-
93
def main(file_path, model_name="en_core_web_md", eps=0.23, min_samples=5, k=5):
    """Build DBSCAN word clusters for the glosses in *file_path*.

    Side effects: populates the module-level globals ``nlp``,
    ``cluster_to_words`` and ``dbscan`` so the helper functions can be
    used interactively afterwards.

    Parameters
    ----------
    file_path : str
        Semicolon-delimited CSV with a ``gloss`` column.
    model_name : str, optional
        spaCy pipeline used for embeddings.
    eps, min_samples : DBSCAN hyperparameters.
    k : int, optional
        Neighbour index for the (optional) k-distance diagnostic plot.
    """
    global nlp, cluster_to_words, dbscan

    vocabulary = load_data(file_path)
    nlp = initialize_spacy_model(model_name)
    download_wordnet()

    vectors = generate_word_vectors(vocabulary, nlp)

    # Uncomment to inspect the k-distance curve when tuning eps:
    # distances = cosine_distances(vectors)
    # plot_k_distance_graph(distances, k)

    dbscan = perform_dbscan_clustering(vectors, eps, min_samples)
    cluster_to_words = create_cluster_mapping(vocabulary, dbscan.labels_)
107
-
108
# Script entry point: cluster the glosses from the default WLASL CSV.
if __name__ == "__main__":
    main("filtered_WLASL.csv")
110
-
111
- ##TEST##
112
- #target_word = "unhappy"
113
- #synonym = find_synonyms_in_cluster(target_word, nlp, cluster_to_words, dbscan)
114
- #print(f"The most similar word to '{target_word}' is '{synonym}'")
115
-
116
- ##If you want to see clusters##
117
- #num_clusters = len(set(dbscan.labels_)) - (1 if -1 in dbscan.labels_ else 0)
118
- #print(f"Number of clusters: {num_clusters}")
119
-
120
- #cluster_label = dbscan.fit_predict([nlp("unhappy").vector])[0]
121
- #same_cluster_words = cluster_to_words.get(cluster_label, [])
122
- #print(f"Words in the same cluster as 'unhappy': {same_cluster_words}")
123
-
124
- #display_clusters(cluster_to_words)
125
-