Spaces:

buelfhood
/

Matheel

Running

App Files Community

buelfhood committed 8 days ago

Commit

685453f

verified ·

1 Parent(s): 924577a

Update app.py

Browse files

Files changed (1) hide show

app.py +15 -92

app.py CHANGED Viewed

@@ -8,6 +8,8 @@ from typing import List
 import zipfile
 import os
 import io
 def calculate_similarity(code1, code2, Ws, Wl, Wj, model_name):
     model = SentenceTransformer(model_name)
@@ -20,83 +22,12 @@ def calculate_similarity(code1, code2, Ws, Wl, Wj, model_name):
     return "The similarity score between the two codes is: %.2f" % overall_similarity
-# Define the function to process the uploaded file and return a DataFrame
-def extract_and_read_compressed_file(file_path):
-    file_names = []
-    codes = []
-    # Handle .zip files
-    if file_path.endswith('.zip'):
-        with zipfile.ZipFile(file_path, 'r') as z:
-            file_names = z.namelist()
-            codes = [z.read(file).decode('utf-8', errors='ignore') for file in file_names]
-    else:
-        raise ValueError("Unsupported file type. Only .zip is supported.")
-    return file_names, codes
-def filter_and_return_top(df, similarity_threshold,returned_results):
-    filtered_df = df[df['similarity_score'] > similarity_threshold]
-    return filtered_df.head(returned_results)
-# Perform paraphrase mining with the specified weights
-def perform_paraphrase_mining(model, codes_list, weight_semantic, weight_levenshtein, weight_jaro_winkler):
-    return paraphrase_mining_with_combined_score(
-        model,
-        codes_list,
-        weight_semantic=weight_semantic,
-        weight_levenshtein=weight_levenshtein,
-        weight_jaro_winkler=weight_jaro_winkler
-    )
-def paraphrase_mining_with_combined_score(
-    model,
-    sentences: List[str],
-    show_progress_bar: bool = False,
-    weight_semantic: float = 1.0,
-    weight_levenshtein: float = 0.0,
-    weight_jaro_winkler: float = 0.0
-):
-    embeddings = model.encode(
-        sentences, show_progress_bar=show_progress_bar, convert_to_tensor=True)
-    paraphrases = util.paraphrase_mining_embeddings(embeddings, score_function=util.cos_sim)
-    results = []
-    for score, i, j in paraphrases:
-        lev_ratio = Levenshtein.normalized_similarity(sentences[i], sentences[j])
-        jaro_winkler_ratio = JaroWinkler.normalized_similarity(sentences[i], sentences[j])
-        combined_score = (weight_semantic * score) + \
-                         (weight_levenshtein * lev_ratio) + \
-                         (weight_jaro_winkler * jaro_winkler_ratio)
-        results.append([combined_score, i, j])
-    results = sorted(results, key=lambda x: x[0], reverse=True)
-    return results
-def get_sim_list(zipped_file,Ws, Wl, Wj, model_name,threshold,number_results):
-    file_names, codes = extract_and_read_compressed_file(zipped_file)
-    model = SentenceTransformer(model_name)
-    code_pairs = perform_paraphrase_mining(model, codes,Ws, Wl, Wj)
-    pairs_results = []
-    for score, i, j in code_pairs:
-      pairs_results.append({
-        'file_name_1': file_names[i],
-        'file_name_2': file_names[j],
-        'similarity_score': score
-    })
-    similarity_df = pd.concat([pd.DataFrame(pairs_results)], ignore_index=True)
-    similarity_df = similarity_df.sort_values(by='similarity_score', ascending=False)
-    result = filter_and_return_top(similarity_df,threshold,number_results).round(2)
     return result
 # Define the Gradio app
-with gr.Blocks(theme=gr.themes.Glass()) as demo:
     # Tab for similarity calculation
     with gr.Tab("Code Pair Similarity"):
         # Input components
@@ -108,15 +39,11 @@ with gr.Blocks(theme=gr.themes.Glass()) as demo:
             Ws = gr.Slider(0, 1, value=0.7, label="Semantic Search Weight", step=0.1)
             Wl = gr.Slider(0, 1, value=0.3, label="Levenshiern Distance Weight", step=0.1)
             Wj = gr.Slider(0, 1, value=0.0, label="Jaro Winkler Weight", step=0.1)
-            model_dropdown = gr.Dropdown(
-            [("codebert", "microsoft/codebert-base"),
-             ("graphcodebert", "microsoft/graphcodebert-base"),
-             ("UnixCoder", "microsoft/unixcoder-base-unimodal"),
-             ("CodeBERTa", "huggingface/CodeBERTa-small-v1"),
-             ("CodeT5 small", "Salesforce/codet5-small"),
-             ("PLBART", "uclanlp/plbart-java-cs"),],
-            label="Select Model",
-            value= "uclanlp/plbart-java-cs"
             )
         # Output component
@@ -146,15 +73,11 @@ with gr.Blocks(theme=gr.themes.Glass()) as demo:
             Ws = gr.Slider(0, 1, value=0.7, label="Semantic Search Weight", step=0.1)
             Wl = gr.Slider(0, 1, value=0.3, label="Levenshiern Distance Weight", step=0.1)
             Wj = gr.Slider(0, 1, value=0.0, label="Jaro Winkler Weight", step=0.1)
-            model_dropdown = gr.Dropdown(
-            [("codebert", "microsoft/codebert-base"),
-             ("graphcodebert", "microsoft/graphcodebert-base"),
-             ("UnixCoder", "microsoft/unixcoder-base-unimodal"),
-             ("CodeBERTa", "huggingface/CodeBERTa-small-v1"),
-             ("CodeT5 small", "Salesforce/codet5-small"),
-             ("PLBART", "uclanlp/plbart-java-cs"),],
-            label="Select Model",
-            value= "uclanlp/plbart-java-cs"
             )
             threshold = gr.Slider(0, 1, value=0, label="Threshold", step=0.01)
             number_results = gr.Slider(1, 1000, value=10, label="Number of Returned pairs", step=1)

 import zipfile
 import os
 import io
+from gradio_huggingfacehub_search import HuggingfaceHubSearch
+from matheel.similarity import get_sim_list
 def calculate_similarity(code1, code2, Ws, Wl, Wj, model_name):
     model = SentenceTransformer(model_name)
     return "The similarity score between the two codes is: %.2f" % overall_similarity
+def get_sim_list_gradio(zipped_file,Ws, Wl, Wj, model_name,threshold,number_results):
+    result = get_sim_list(zipped_file,Ws, Wl, Wj, model_name,threshold,number_results)
     return result
 # Define the Gradio app
+with gr.Blocks() as demo:
     # Tab for similarity calculation
     with gr.Tab("Code Pair Similarity"):
         # Input components
             Ws = gr.Slider(0, 1, value=0.7, label="Semantic Search Weight", step=0.1)
             Wl = gr.Slider(0, 1, value=0.3, label="Levenshiern Distance Weight", step=0.1)
             Wj = gr.Slider(0, 1, value=0.0, label="Jaro Winkler Weight", step=0.1)
+            model_dropdown = HuggingfaceHubSearch(
+                label="Pre-Trained Model to use for Embeddings",
+                placeholder="Search for Pre-Trained models on Hugging Face",
+                search_type="model",
+                #value = "huggingface/CodeBERTa-small-v1"
             )
         # Output component
             Ws = gr.Slider(0, 1, value=0.7, label="Semantic Search Weight", step=0.1)
             Wl = gr.Slider(0, 1, value=0.3, label="Levenshiern Distance Weight", step=0.1)
             Wj = gr.Slider(0, 1, value=0.0, label="Jaro Winkler Weight", step=0.1)
+            model_dropdown = HuggingfaceHubSearch(
+                label="Pre-Trained Model to use for Embeddings",
+                placeholder="Search for Pre-Trained models on Hugging Face",
+                search_type="model",
+                #value = "huggingface/CodeBERTa-small-v1"
             )
             threshold = gr.Slider(0, 1, value=0, label="Threshold", step=0.01)
             number_results = gr.Slider(1, 1000, value=10, label="Number of Returned pairs", step=1)