buelfhood committed
Commit 685453f · verified · 1 parent: 924577a

Update app.py

Files changed (1): app.py (+15 −92)
app.py CHANGED

@@ -8,6 +8,8 @@ from typing import List
import zipfile
import os
import io
+from gradio_huggingfacehub_search import HuggingfaceHubSearch
+from matheel.similarity import get_sim_list

def calculate_similarity(code1, code2, Ws, Wl, Wj, model_name):
    model = SentenceTransformer(model_name)
@@ -20,83 +22,12 @@ def calculate_similarity(code1, code2, Ws, Wl, Wj, model_name):

    return "The similarity score between the two codes is: %.2f" % overall_similarity

-# Define the function to process the uploaded file and return a DataFrame
-def extract_and_read_compressed_file(file_path):
-    file_names = []
-    codes = []
-
-    # Handle .zip files
-    if file_path.endswith('.zip'):
-        with zipfile.ZipFile(file_path, 'r') as z:
-            file_names = z.namelist()
-            codes = [z.read(file).decode('utf-8', errors='ignore') for file in file_names]
-
-    else:
-        raise ValueError("Unsupported file type. Only .zip is supported.")
-
-    return file_names, codes
-
-def filter_and_return_top(df, similarity_threshold,returned_results):
-    filtered_df = df[df['similarity_score'] > similarity_threshold]
-    return filtered_df.head(returned_results)
-
-# Perform paraphrase mining with the specified weights
-def perform_paraphrase_mining(model, codes_list, weight_semantic, weight_levenshtein, weight_jaro_winkler):
-    return paraphrase_mining_with_combined_score(
-        model,
-        codes_list,
-        weight_semantic=weight_semantic,
-        weight_levenshtein=weight_levenshtein,
-        weight_jaro_winkler=weight_jaro_winkler
-    )
-
-def paraphrase_mining_with_combined_score(
-    model,
-    sentences: List[str],
-    show_progress_bar: bool = False,
-    weight_semantic: float = 1.0,
-    weight_levenshtein: float = 0.0,
-    weight_jaro_winkler: float = 0.0
-):
-    embeddings = model.encode(
-        sentences, show_progress_bar=show_progress_bar, convert_to_tensor=True)
-    paraphrases = util.paraphrase_mining_embeddings(embeddings, score_function=util.cos_sim)
-
-    results = []
-    for score, i, j in paraphrases:
-        lev_ratio = Levenshtein.normalized_similarity(sentences[i], sentences[j])
-        jaro_winkler_ratio = JaroWinkler.normalized_similarity(sentences[i], sentences[j])
-
-        combined_score = (weight_semantic * score) + \
-                         (weight_levenshtein * lev_ratio) + \
-                         (weight_jaro_winkler * jaro_winkler_ratio)
-
-        results.append([combined_score, i, j])
-
-    results = sorted(results, key=lambda x: x[0], reverse=True)
-    return results
-
-def get_sim_list(zipped_file,Ws, Wl, Wj, model_name,threshold,number_results):
-    file_names, codes = extract_and_read_compressed_file(zipped_file)
-    model = SentenceTransformer(model_name)
-    code_pairs = perform_paraphrase_mining(model, codes,Ws, Wl, Wj)
-    pairs_results = []
-
-    for score, i, j in code_pairs:
-        pairs_results.append({
-            'file_name_1': file_names[i],
-            'file_name_2': file_names[j],
-            'similarity_score': score
-        })
-
-    similarity_df = pd.concat([pd.DataFrame(pairs_results)], ignore_index=True)
-    similarity_df = similarity_df.sort_values(by='similarity_score', ascending=False)
-    result = filter_and_return_top(similarity_df,threshold,number_results).round(2)
-
+def get_sim_list_gradio(zipped_file,Ws, Wl, Wj, model_name,threshold,number_results):
+    result = get_sim_list(zipped_file,Ws, Wl, Wj, model_name,threshold,number_results)
    return result

# Define the Gradio app
-with gr.Blocks(theme=gr.themes.Glass()) as demo:
+with gr.Blocks() as demo:
    # Tab for similarity calculation
    with gr.Tab("Code Pair Similarity"):
        # Input components
@@ -108,15 +39,11 @@ with gr.Blocks(theme=gr.themes.Glass()) as demo:
        Ws = gr.Slider(0, 1, value=0.7, label="Semantic Search Weight", step=0.1)
        Wl = gr.Slider(0, 1, value=0.3, label="Levenshiern Distance Weight", step=0.1)
        Wj = gr.Slider(0, 1, value=0.0, label="Jaro Winkler Weight", step=0.1)
-        model_dropdown = gr.Dropdown(
-            [("codebert", "microsoft/codebert-base"),
-            ("graphcodebert", "microsoft/graphcodebert-base"),
-            ("UnixCoder", "microsoft/unixcoder-base-unimodal"),
-            ("CodeBERTa", "huggingface/CodeBERTa-small-v1"),
-            ("CodeT5 small", "Salesforce/codet5-small"),
-            ("PLBART", "uclanlp/plbart-java-cs"),],
-            label="Select Model",
-            value= "uclanlp/plbart-java-cs"
+        model_dropdown = HuggingfaceHubSearch(
+            label="Pre-Trained Model to use for Embeddings",
+            placeholder="Search for Pre-Trained models on Hugging Face",
+            search_type="model",
+            #value = "huggingface/CodeBERTa-small-v1"
        )

        # Output component
@@ -146,15 +73,11 @@ with gr.Blocks(theme=gr.themes.Glass()) as demo:
        Ws = gr.Slider(0, 1, value=0.7, label="Semantic Search Weight", step=0.1)
        Wl = gr.Slider(0, 1, value=0.3, label="Levenshiern Distance Weight", step=0.1)
        Wj = gr.Slider(0, 1, value=0.0, label="Jaro Winkler Weight", step=0.1)
-        model_dropdown = gr.Dropdown(
-            [("codebert", "microsoft/codebert-base"),
-            ("graphcodebert", "microsoft/graphcodebert-base"),
-            ("UnixCoder", "microsoft/unixcoder-base-unimodal"),
-            ("CodeBERTa", "huggingface/CodeBERTa-small-v1"),
-            ("CodeT5 small", "Salesforce/codet5-small"),
-            ("PLBART", "uclanlp/plbart-java-cs"),],
-            label="Select Model",
-            value= "uclanlp/plbart-java-cs"
+        model_dropdown = HuggingfaceHubSearch(
+            label="Pre-Trained Model to use for Embeddings",
+            placeholder="Search for Pre-Trained models on Hugging Face",
+            search_type="model",
+            #value = "huggingface/CodeBERTa-small-v1"
        )
        threshold = gr.Slider(0, 1, value=0, label="Threshold", step=0.01)
        number_results = gr.Slider(1, 1000, value=10, label="Number of Returned pairs", step=1)
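
For context, a minimal sketch of how the refactored pieces fit together. It assumes matheel.similarity.get_sim_list takes the arguments used by get_sim_list_gradio above and that HuggingfaceHubSearch yields the chosen model id as a plain string; the file input, results table, button, and event wiring below are illustrative stand-ins for parts of app.py this commit does not touch.

# Sketch only — not part of the commit.
import gradio as gr
from gradio_huggingfacehub_search import HuggingfaceHubSearch
from matheel.similarity import get_sim_list

def get_sim_list_gradio(zipped_file, Ws, Wl, Wj, model_name, threshold, number_results):
    # Thin wrapper: pass the UI values straight to the packaged implementation.
    return get_sim_list(zipped_file, Ws, Wl, Wj, model_name, threshold, number_results)

with gr.Blocks() as demo:
    with gr.Tab("Code Cohort Similarity"):  # hypothetical tab name
        zipped_file = gr.File(label="Upload a .zip of code files")  # illustrative input
        Ws = gr.Slider(0, 1, value=0.7, label="Semantic Search Weight", step=0.1)
        Wl = gr.Slider(0, 1, value=0.3, label="Levenshtein Distance Weight", step=0.1)
        Wj = gr.Slider(0, 1, value=0.0, label="Jaro Winkler Weight", step=0.1)
        model_dropdown = HuggingfaceHubSearch(
            label="Pre-Trained Model to use for Embeddings",
            placeholder="Search for Pre-Trained models on Hugging Face",
            search_type="model",
        )
        threshold = gr.Slider(0, 1, value=0, label="Threshold", step=0.01)
        number_results = gr.Slider(1, 1000, value=10, label="Number of Returned pairs", step=1)
        output = gr.Dataframe(label="Similar file pairs")  # illustrative output
        run = gr.Button("Find similar pairs")  # illustrative trigger
        run.click(
            get_sim_list_gradio,
            inputs=[zipped_file, Ws, Wl, Wj, model_dropdown, threshold, number_results],
            outputs=output,
        )

if __name__ == "__main__":
    demo.launch()

The effect of the change is visible here: the zip extraction, paraphrase mining, and score combination that app.py previously implemented inline now sit behind a single get_sim_list call, and the fixed gr.Dropdown of six models is replaced by free-form Hugging Face Hub model search.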