import gradio as gr
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
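# Gradio demo for AuthScan: classifies a code snippet as human-written or
# machine-generated and, when it looks machine-generated, attributes it to
# one of four candidate LLMs.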

def load_model(model_name):
    # Build a text-classification pipeline that returns scores for every label
    # (top_k=4 covers both the 2-label detector and the 4-label attributor);
    # inputs longer than 512 tokens are truncated.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    return pipeline('text-classification', model=model, tokenizer=tokenizer, truncation=True, max_length=512, top_k=4)

classifier = load_model("ngocminhta/authscan-baseline")           # human vs. machine
classifier2 = load_model("ngocminhta/authscan-baseline-machine")  # which LLM wrote it

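# Human-readable names for the raw LABEL_i outputs of each model.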
TEXT_CLASS_MAPPING_MACHINE = {
    'LABEL_0': 'Gemini 1.5 Pro',
    'LABEL_1': 'Gemini 2.0 Experimental',
    'LABEL_2': 'GPT-4o Mini',
    'LABEL_3': 'Llama 3.1 8B'
}

TEXT_CLASS_MAPPING = {
    'LABEL_0': 'Human-Written',
    'LABEL_1': 'Machine-Generated'
}

def update_language(language):
    # Switch the code editor's syntax highlighting to match the selected
    # language; Java falls back to Python highlighting.
    if language in ('Java', 'Python'):
        return gr.update(language='python')
    elif language == 'C':
        return gr.update(language='c')
    elif language == 'C++':
        return gr.update(language='cpp')
    return gr.update(language='python')

def process_result_detection_tab(text, language):
    # Each pipeline returns one list of {label, score} dicts per input; [0]
    # unwraps the result for the single prompt passed in.
    result = classifier(f"Language: {language}\n\n{text}")[0]
    result_machine = classifier2(f"Language: {language}\n\n{text}")[0]

    labels = [TEXT_CLASS_MAPPING[x['label']] for x in result]
    labels_machine = [TEXT_CLASS_MAPPING_MACHINE[x['label']] for x in result_machine]

    scores = [x['score'] for x in result]
    scores_machine = [x['score'] for x in result_machine]

    final_results = dict(zip(labels, scores))
    # Only show the per-model attribution when the binary detector's top label
    # is "Machine-Generated".
    if max(final_results, key=final_results.get) == 'Machine-Generated':
        final_results_machine = dict(zip(labels_machine, scores_machine))
    else:
        final_results_machine = None
    return final_results, final_results_machine

def clear_detection_tab():
    # Reset the code editor and both result panels.
    return "", None, None

with gr.Blocks() as demo:
    gr.Markdown("""<h1><center>AuthScan</center></h1>""")

    with gr.Row():
        language = gr.Dropdown(
            choices=["C", "C++", "Java", "Python"],
            label="Select Programming Language",
            value="C"
        )

    with gr.Row():
        input_text = gr.Code(
            label="Enter code here",
            language="python",
            elem_id="code_input",
        )

    with gr.Row():
        check_button = gr.Button("Check Origin", variant="primary")
        clear_button = gr.Button("Clear", variant="stop")

    out = gr.Label(label='Result')
    out_machine = gr.Label(label='Detailed Information')
    # When language is changed, update the code component's language
    language.change(update_language, inputs=language, outputs=input_text)

    check_button.click(process_result_detection_tab, inputs=[input_text, language], outputs=[out, out_machine])
    # out_machine.change(lambda x: gr.update(visible=True) if out_machine else gr.update(visible=False), inputs=out_machine, outputs=out_machine)
    clear_button.click(clear_detection_tab, inputs=[], outputs=[input_text, out, out_machine])

# share=True also serves the demo through a temporary public Gradio link.
demo.launch(share=True)