Update app.py
Browse files
app.py
CHANGED
@@ -9,6 +9,7 @@ import zipfile
|
|
9 |
import os
|
10 |
import io
|
11 |
from gradio_huggingfacehub_search import HuggingfaceHubSearch
|
|
|
12 |
|
13 |
def calculate_similarity(code1, code2, Ws, Wl, Wj, model_name):
|
14 |
model = SentenceTransformer(model_name)
|
@@ -21,79 +22,8 @@ def calculate_similarity(code1, code2, Ws, Wl, Wj, model_name):
|
|
21 |
|
22 |
return "The similarity score between the two codes is: %.2f" % overall_similarity
|
23 |
|
24 |
-
|
25 |
-
|
26 |
-
file_names = []
|
27 |
-
codes = []
|
28 |
-
|
29 |
-
# Handle .zip files
|
30 |
-
if file_path.endswith('.zip'):
|
31 |
-
with zipfile.ZipFile(file_path, 'r') as z:
|
32 |
-
file_names = z.namelist()
|
33 |
-
codes = [z.read(file).decode('utf-8', errors='ignore') for file in file_names]
|
34 |
-
|
35 |
-
else:
|
36 |
-
raise ValueError("Unsupported file type. Only .zip is supported.")
|
37 |
-
|
38 |
-
return file_names, codes
|
39 |
-
|
40 |
-
def filter_and_return_top(df, similarity_threshold, returned_results):
    """Keep rows scoring above a threshold and return the first few of them.

    Args:
        df: DataFrame with a ``similarity_score`` column.
        similarity_threshold: Rows are kept only when their
            ``similarity_score`` is strictly greater than this value.
        returned_results: Maximum number of rows to return, taken from the
            top of the filtered frame in its existing order. A float value
            (e.g. ``5.0``) is truncated to an int.

    Returns:
        The first ``returned_results`` rows of the filtered DataFrame.
    """
    # DataFrame.head() slices positionally and rejects float bounds, so a
    # float row limit must be converted before use.
    if isinstance(returned_results, float):
        returned_results = int(returned_results)

    filtered_df = df[df['similarity_score'] > similarity_threshold]
    return filtered_df.head(returned_results)
|
43 |
-
|
44 |
-
# Perform paraphrase mining with the specified weights
|
45 |
-
def perform_paraphrase_mining(model, codes_list, weight_semantic, weight_levenshtein, weight_jaro_winkler):
    """Run combined-score paraphrase mining over ``codes_list``.

    Thin wrapper that forwards the model, the list of code strings and the
    three weights to ``paraphrase_mining_with_combined_score`` and returns
    whatever that function returns.
    """
    weights = {
        "weight_semantic": weight_semantic,
        "weight_levenshtein": weight_levenshtein,
        "weight_jaro_winkler": weight_jaro_winkler,
    }
    return paraphrase_mining_with_combined_score(model, codes_list, **weights)
|
53 |
-
|
54 |
-
def paraphrase_mining_with_combined_score(
    model,
    sentences: List[str],
    show_progress_bar: bool = False,
    weight_semantic: float = 1.0,
    weight_levenshtein: float = 0.0,
    weight_jaro_winkler: float = 0.0
):
    """Score candidate sentence pairs with a weighted blend of three measures.

    The sentences are encoded with ``model.encode`` and candidate pairs are
    obtained from ``util.paraphrase_mining_embeddings`` using cosine
    similarity. For each candidate pair, the cosine score is combined with
    the Levenshtein and Jaro-Winkler normalized similarities of the two raw
    strings, each multiplied by its weight.

    Args:
        model: Object exposing ``encode(sentences, show_progress_bar=...,
            convert_to_tensor=True)``.
        sentences: Texts to compare with one another.
        show_progress_bar: Forwarded to ``model.encode``.
        weight_semantic: Multiplier for the embedding cosine score.
        weight_levenshtein: Multiplier for the Levenshtein similarity.
        weight_jaro_winkler: Multiplier for the Jaro-Winkler similarity.

    Returns:
        A list of ``[combined_score, i, j]`` entries, where ``i`` and ``j``
        index into ``sentences``, ordered by ``combined_score`` descending.
    """
    encoded = model.encode(
        sentences,
        show_progress_bar=show_progress_bar,
        convert_to_tensor=True,
    )
    candidate_pairs = util.paraphrase_mining_embeddings(encoded, score_function=util.cos_sim)

    def blend(cosine, left, right):
        # Weighted sum of the embedding score and the two string-level scores.
        lev = Levenshtein.normalized_similarity(left, right)
        jaro = JaroWinkler.normalized_similarity(left, right)
        return (weight_semantic * cosine) + (weight_levenshtein * lev) + (weight_jaro_winkler * jaro)

    scored = [
        [blend(cosine, sentences[a], sentences[b]), a, b]
        for cosine, a, b in candidate_pairs
    ]
    # Highest combined score first.
    scored.sort(key=lambda row: row[0], reverse=True)
    return scored
|
79 |
-
|
80 |
-
def get_sim_list(zipped_file, Ws, Wl, Wj, model_name, threshold, number_results):
    """Rank pairs of files in an archive by combined similarity score.

    Args:
        zipped_file: Archive handed to ``extract_and_read_compressed_file``.
        Ws: Weight passed on as the semantic (embedding) weight.
        Wl: Weight passed on as the Levenshtein weight.
        Wj: Weight passed on as the Jaro-Winkler weight.
        model_name: Name given to ``SentenceTransformer`` to load the model.
        threshold: Only pairs scoring strictly above this value are kept.
        number_results: Maximum number of pairs to return.

    Returns:
        A DataFrame with columns ``file_name_1``, ``file_name_2`` and
        ``similarity_score`` (rounded to 2 decimals), sorted by score
        descending. The frame is empty when the archive holds fewer than
        two files.
    """
    columns = ['file_name_1', 'file_name_2', 'similarity_score']
    file_names, codes = extract_and_read_compressed_file(zipped_file)

    # With fewer than two files there is nothing to pair. Return an empty,
    # correctly-shaped table instead of loading the model and then failing
    # on a DataFrame that has no 'similarity_score' column.
    if len(codes) < 2:
        return pd.DataFrame(columns=columns)

    model = SentenceTransformer(model_name)
    code_pairs = perform_paraphrase_mining(model, codes, Ws, Wl, Wj)

    # Translate the index pairs returned by the mining step into file names.
    pairs_results = [
        {
            'file_name_1': file_names[i],
            'file_name_2': file_names[j],
            'similarity_score': score,
        }
        for score, i, j in code_pairs
    ]

    # Pinning the columns keeps sort/filter valid even if no pairs came back.
    similarity_df = pd.DataFrame(pairs_results, columns=columns)
    similarity_df = similarity_df.sort_values(by='similarity_score', ascending=False)
    result = filter_and_return_top(similarity_df, threshold, number_results).round(2)

    return result
|
98 |
|
99 |
# Define the Gradio app
|
@@ -108,7 +38,7 @@ with gr.Blocks() as demo:
|
|
108 |
label="Pre-Trained Model to use for Embeddings",
|
109 |
placeholder="Search for Pre-Trained models on Hugging Face",
|
110 |
search_type="model",
|
111 |
-
value = "huggingface/CodeBERTa-small-v1"
|
112 |
)
|
113 |
|
114 |
# Accordion for weights and models
|
@@ -159,7 +89,7 @@ with gr.Blocks() as demo:
|
|
159 |
|
160 |
# Button to trigger the file processing
|
161 |
process_btn = gr.Button("Process File")
|
162 |
-
process_btn.click(
|
163 |
|
164 |
# Launch the Gradio app with error display and debug mode enabled
|
165 |
demo.launch(show_error=True,debug=True)
|
|
|
9 |
import os
|
10 |
import io
|
11 |
from gradio_huggingfacehub_search import HuggingfaceHubSearch
|
12 |
+
from matheel.similarity import get_sim_list
|
13 |
|
14 |
def calculate_similarity(code1, code2, Ws, Wl, Wj, model_name):
|
15 |
model = SentenceTransformer(model_name)
|
|
|
22 |
|
23 |
return "The similarity score between the two codes is: %.2f" % overall_similarity
|
24 |
|
25 |
+
def get_sim_list_gradio(zipped_file, Ws, Wl, Wj, model_name, threshold, number_results):
    """Callback wired to the "Process File" button.

    Forwards every argument, in the same order, to ``get_sim_list`` and
    returns its result unchanged.
    """
    return get_sim_list(
        zipped_file, Ws, Wl, Wj, model_name, threshold, number_results
    )
|
28 |
|
29 |
# Define the Gradio app
|
|
|
38 |
label="Pre-Trained Model to use for Embeddings",
|
39 |
placeholder="Search for Pre-Trained models on Hugging Face",
|
40 |
search_type="model",
|
41 |
+
#value = "huggingface/CodeBERTa-small-v1"
|
42 |
)
|
43 |
|
44 |
# Accordion for weights and models
|
|
|
89 |
|
90 |
# Button to trigger the file processing
|
91 |
process_btn = gr.Button("Process File")
|
92 |
+
process_btn.click(get_sim_list_gradio, inputs=[file_uploader, Ws, Wl, Wj, model_dropdown,threshold,number_results], outputs=df_output)
|
93 |
|
94 |
# Launch the Gradio app with error display and debug mode enabled
|
95 |
demo.launch(show_error=True,debug=True)
|