Spaces:

buelfhood
/

Matheel

Running

App Files Files Community

buelfhood committed 6 days ago

Commit

b6e46d2

verified ·

1 Parent(s): ae237a7

Update app.py

Browse files

Files changed (1) hide show

app.py +5 -75

app.py CHANGED Viewed

@@ -9,6 +9,7 @@ import zipfile
 import os
 import io
 from gradio_huggingfacehub_search import HuggingfaceHubSearch
 def calculate_similarity(code1, code2, Ws, Wl, Wj, model_name):
     model = SentenceTransformer(model_name)
@@ -21,79 +22,8 @@ def calculate_similarity(code1, code2, Ws, Wl, Wj, model_name):
     return "The similarity score between the two codes is: %.2f" % overall_similarity
-# Define the function to process the uploaded file and return a DataFrame
-def extract_and_read_compressed_file(file_path):
-    file_names = []
-    codes = []
-    # Handle .zip files
-    if file_path.endswith('.zip'):
-        with zipfile.ZipFile(file_path, 'r') as z:
-            file_names = z.namelist()
-            codes = [z.read(file).decode('utf-8', errors='ignore') for file in file_names]
-    else:
-        raise ValueError("Unsupported file type. Only .zip is supported.")
-    return file_names, codes
-def filter_and_return_top(df, similarity_threshold,returned_results):
-    filtered_df = df[df['similarity_score'] > similarity_threshold]
-    return filtered_df.head(returned_results)
-# Perform paraphrase mining with the specified weights
-def perform_paraphrase_mining(model, codes_list, weight_semantic, weight_levenshtein, weight_jaro_winkler):
-    return paraphrase_mining_with_combined_score(
-        model,
-        codes_list,
-        weight_semantic=weight_semantic,
-        weight_levenshtein=weight_levenshtein,
-        weight_jaro_winkler=weight_jaro_winkler
-    )
-def paraphrase_mining_with_combined_score(
-    model,
-    sentences: List[str],
-    show_progress_bar: bool = False,
-    weight_semantic: float = 1.0,
-    weight_levenshtein: float = 0.0,
-    weight_jaro_winkler: float = 0.0
-):
-    embeddings = model.encode(
-        sentences, show_progress_bar=show_progress_bar, convert_to_tensor=True)
-    paraphrases = util.paraphrase_mining_embeddings(embeddings, score_function=util.cos_sim)
-    results = []
-    for score, i, j in paraphrases:
-        lev_ratio = Levenshtein.normalized_similarity(sentences[i], sentences[j])
-        jaro_winkler_ratio = JaroWinkler.normalized_similarity(sentences[i], sentences[j])
-        combined_score = (weight_semantic * score) + \
-                         (weight_levenshtein * lev_ratio) + \
-                         (weight_jaro_winkler * jaro_winkler_ratio)
-        results.append([combined_score, i, j])
-    results = sorted(results, key=lambda x: x[0], reverse=True)
-    return results
-def get_sim_list(zipped_file,Ws, Wl, Wj, model_name,threshold,number_results):
-    file_names, codes = extract_and_read_compressed_file(zipped_file)
-    model = SentenceTransformer(model_name)
-    code_pairs = perform_paraphrase_mining(model, codes,Ws, Wl, Wj)
-    pairs_results = []
-    for score, i, j in code_pairs:
-      pairs_results.append({
-        'file_name_1': file_names[i],
-        'file_name_2': file_names[j],
-        'similarity_score': score
-    })
-    similarity_df = pd.concat([pd.DataFrame(pairs_results)], ignore_index=True)
-    similarity_df = similarity_df.sort_values(by='similarity_score', ascending=False)
-    result = filter_and_return_top(similarity_df,threshold,number_results).round(2)
     return result
 # Define the Gradio app
@@ -108,7 +38,7 @@ with gr.Blocks() as demo:
                 label="Pre-Trained Model to use for Embeddings",
                 placeholder="Search for Pre-Trained models on Hugging Face",
                 search_type="model",
-                value = "huggingface/CodeBERTa-small-v1"
             )
         # Accordion for weights and models
@@ -159,7 +89,7 @@ with gr.Blocks() as demo:
         # Button to trigger the file processing
         process_btn = gr.Button("Process File")
-        process_btn.click(get_sim_list, inputs=[file_uploader, Ws, Wl, Wj, model_dropdown,threshold,number_results], outputs=df_output)
 # Launch the Gradio app with error display and debug mode enabled
 demo.launch(show_error=True,debug=True)

 import os
 import io
 from gradio_huggingfacehub_search import HuggingfaceHubSearch
+from matheel.similarity import get_sim_list
 def calculate_similarity(code1, code2, Ws, Wl, Wj, model_name):
     model = SentenceTransformer(model_name)
     return "The similarity score between the two codes is: %.2f" % overall_similarity
+def get_sim_list_gradio(zipped_file,Ws, Wl, Wj, model_name,threshold,number_results):
+    result = get_sim_list(zipped_file,Ws, Wl, Wj, model_name,threshold,number_results)
     return result
 # Define the Gradio app
                 label="Pre-Trained Model to use for Embeddings",
                 placeholder="Search for Pre-Trained models on Hugging Face",
                 search_type="model",
+                #value = "huggingface/CodeBERTa-small-v1"
             )
         # Accordion for weights and models
         # Button to trigger the file processing
         process_btn = gr.Button("Process File")
+        process_btn.click(get_sim_list_gradio, inputs=[file_uploader, Ws, Wl, Wj, model_dropdown,threshold,number_results], outputs=df_output)
 # Launch the Gradio app with error display and debug mode enabled
 demo.launch(show_error=True,debug=True)