buelfhood committed on
Commit
b6e46d2
·
verified ·
1 Parent(s): ae237a7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +5 -75
app.py CHANGED
@@ -9,6 +9,7 @@ import zipfile
9
  import os
10
  import io
11
  from gradio_huggingfacehub_search import HuggingfaceHubSearch
 
12
 
13
  def calculate_similarity(code1, code2, Ws, Wl, Wj, model_name):
14
  model = SentenceTransformer(model_name)
@@ -21,79 +22,8 @@ def calculate_similarity(code1, code2, Ws, Wl, Wj, model_name):
21
 
22
  return "The similarity score between the two codes is: %.2f" % overall_similarity
23
 
24
- # Define the function to process the uploaded file and return a DataFrame
25
- def extract_and_read_compressed_file(file_path):
26
- file_names = []
27
- codes = []
28
-
29
- # Handle .zip files
30
- if file_path.endswith('.zip'):
31
- with zipfile.ZipFile(file_path, 'r') as z:
32
- file_names = z.namelist()
33
- codes = [z.read(file).decode('utf-8', errors='ignore') for file in file_names]
34
-
35
- else:
36
- raise ValueError("Unsupported file type. Only .zip is supported.")
37
-
38
- return file_names, codes
39
-
40
- def filter_and_return_top(df, similarity_threshold,returned_results):
41
- filtered_df = df[df['similarity_score'] > similarity_threshold]
42
- return filtered_df.head(returned_results)
43
-
44
- # Perform paraphrase mining with the specified weights
45
- def perform_paraphrase_mining(model, codes_list, weight_semantic, weight_levenshtein, weight_jaro_winkler):
46
- return paraphrase_mining_with_combined_score(
47
- model,
48
- codes_list,
49
- weight_semantic=weight_semantic,
50
- weight_levenshtein=weight_levenshtein,
51
- weight_jaro_winkler=weight_jaro_winkler
52
- )
53
-
54
- def paraphrase_mining_with_combined_score(
55
- model,
56
- sentences: List[str],
57
- show_progress_bar: bool = False,
58
- weight_semantic: float = 1.0,
59
- weight_levenshtein: float = 0.0,
60
- weight_jaro_winkler: float = 0.0
61
- ):
62
- embeddings = model.encode(
63
- sentences, show_progress_bar=show_progress_bar, convert_to_tensor=True)
64
- paraphrases = util.paraphrase_mining_embeddings(embeddings, score_function=util.cos_sim)
65
-
66
- results = []
67
- for score, i, j in paraphrases:
68
- lev_ratio = Levenshtein.normalized_similarity(sentences[i], sentences[j])
69
- jaro_winkler_ratio = JaroWinkler.normalized_similarity(sentences[i], sentences[j])
70
-
71
- combined_score = (weight_semantic * score) + \
72
- (weight_levenshtein * lev_ratio) + \
73
- (weight_jaro_winkler * jaro_winkler_ratio)
74
-
75
- results.append([combined_score, i, j])
76
-
77
- results = sorted(results, key=lambda x: x[0], reverse=True)
78
- return results
79
-
80
- def get_sim_list(zipped_file,Ws, Wl, Wj, model_name,threshold,number_results):
81
- file_names, codes = extract_and_read_compressed_file(zipped_file)
82
- model = SentenceTransformer(model_name)
83
- code_pairs = perform_paraphrase_mining(model, codes,Ws, Wl, Wj)
84
- pairs_results = []
85
-
86
- for score, i, j in code_pairs:
87
- pairs_results.append({
88
- 'file_name_1': file_names[i],
89
- 'file_name_2': file_names[j],
90
- 'similarity_score': score
91
- })
92
-
93
- similarity_df = pd.concat([pd.DataFrame(pairs_results)], ignore_index=True)
94
- similarity_df = similarity_df.sort_values(by='similarity_score', ascending=False)
95
- result = filter_and_return_top(similarity_df,threshold,number_results).round(2)
96
-
97
  return result
98
 
99
  # Define the Gradio app
@@ -108,7 +38,7 @@ with gr.Blocks() as demo:
108
  label="Pre-Trained Model to use for Embeddings",
109
  placeholder="Search for Pre-Trained models on Hugging Face",
110
  search_type="model",
111
- value = "huggingface/CodeBERTa-small-v1"
112
  )
113
 
114
  # Accordion for weights and models
@@ -159,7 +89,7 @@ with gr.Blocks() as demo:
159
 
160
  # Button to trigger the file processing
161
  process_btn = gr.Button("Process File")
162
- process_btn.click(get_sim_list, inputs=[file_uploader, Ws, Wl, Wj, model_dropdown,threshold,number_results], outputs=df_output)
163
 
164
  # Launch the Gradio app with error display and debug mode enabled
165
  demo.launch(show_error=True,debug=True)
 
9
  import os
10
  import io
11
  from gradio_huggingfacehub_search import HuggingfaceHubSearch
12
+ from matheel.similarity import get_sim_list
13
 
14
  def calculate_similarity(code1, code2, Ws, Wl, Wj, model_name):
15
  model = SentenceTransformer(model_name)
 
22
 
23
  return "The similarity score between the two codes is: %.2f" % overall_similarity
24
 
25
+ def get_sim_list_gradio(zipped_file,Ws, Wl, Wj, model_name,threshold,number_results):
26
+ result = get_sim_list(zipped_file,Ws, Wl, Wj, model_name,threshold,number_results)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  return result
28
 
29
  # Define the Gradio app
 
38
  label="Pre-Trained Model to use for Embeddings",
39
  placeholder="Search for Pre-Trained models on Hugging Face",
40
  search_type="model",
41
+ #value = "huggingface/CodeBERTa-small-v1"
42
  )
43
 
44
  # Accordion for weights and models
 
89
 
90
  # Button to trigger the file processing
91
  process_btn = gr.Button("Process File")
92
+ process_btn.click(get_sim_list_gradio, inputs=[file_uploader, Ws, Wl, Wj, model_dropdown,threshold,number_results], outputs=df_output)
93
 
94
  # Launch the Gradio app with error display and debug mode enabled
95
  demo.launch(show_error=True,debug=True)