Merge branch 'minko'
Changed files:
- .gitignore    +6 -0
- analysis.py   +0 -2
- app.py        +12 -7
- predictors.py +75 -12
.gitignore (ADDED)

@@ -0,0 +1,6 @@
+__pycache__/analysis.cpython-311.pyc
+__pycache__/app.cpython-311.pyc
+__pycache__/explainability.cpython-311.pyc
+__pycache__/plagiarism.cpython-311.pyc
+__pycache__/predictors.cpython-311.pyc
+__pycache__/utils.cpython-311.pyc
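Review note: these six entries pin the exact CPython 3.11 bytecode files, so an interpreter bump or a new module reintroduces untracked *.pyc noise. A broader pattern (not what this commit does) would cover both cases:

__pycache__/
*.py[cod]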
analysis.py (CHANGED)

@@ -22,12 +22,10 @@ import yaml
 import nltk
 import os
 from explainability import *
-from dotenv import load_dotenv
 import subprocess
 
 nltk.download("punkt")
 nltk.download("stopwords")
-load_dotenv()
 with open("config.yaml", "r") as file:
     params = yaml.safe_load(file)
 device = "cuda" if torch.cuda.is_available() else "cpu"
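Review note: with python-dotenv dropped here and in predictors.py below, nothing reads a local .env anymore; whatever those variables supplied must now be present in the process environment (e.g. via Space secrets). A minimal startup guard, with a placeholder key name since the diff doesn't show which variables the .env actually held:

import os

# "SOME_API_KEY" is illustrative only; substitute the keys the .env provided
missing = [k for k in ("SOME_API_KEY",) if k not in os.environ]
if missing:
    raise RuntimeError(f"Missing environment variables (no .env loading anymore): {missing}")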
app.py (CHANGED)

@@ -1,7 +1,7 @@
 import gradio as gr
 import numpy as np
 from datetime import date
-from predictors import predict_bc_scores, predict_mc_scores
+from predictors import predict_bc_scores, predict_mc_scores, predict_1on1_scores
 from analysis import depth_analysis
 from predictors import predict_quillbot
 from plagiarism import plagiarism_check, build_date

@@ -12,11 +12,12 @@ np.set_printoptions(suppress=True)
 
 def ai_generated_test(option, input):
     if option == "Human vs AI":
-        return predict_bc_scores(input), None
+        return predict_bc_scores(input), None, None
     else:
         return (
             predict_bc_scores(input),
             predict_mc_scores(input),
+            predict_1on1_scores(input),
         )
 
 

@@ -49,11 +50,13 @@ def main(
     depth_analysis_plot = depth_analysis(input)
     bc_score = predict_bc_scores(input)
     mc_score = predict_mc_scores(input)
+    mc_1on1_score = predict_1on1_scores(input)
     quilscore = predict_quillbot(input)
 
     return (
         bc_score,
         mc_score,
+        mc_1on1_score,
         formatted_tokens,
         depth_analysis_plot,
         quilscore,

@@ -147,6 +150,8 @@ with gr.Blocks() as demo:
                 bcLabel = gr.Label(label="Source")
             with gr.Column():
                 mcLabel = gr.Label(label="Creator")
+            with gr.Column():
+                mc1on1Label = gr.Label(label="Creator(1 on 1 Approach)")
         with gr.Row():
             QLabel = gr.Label(label="Humanized")
         with gr.Group():

@@ -213,6 +218,7 @@ with gr.Blocks() as demo:
         outputs=[
             bcLabel,
             mcLabel,
+            mc1on1Label,
             sentenceBreakdown,
             writing_analysis_plot,
             QLabel,

@@ -223,10 +229,7 @@ with gr.Blocks() as demo:
     only_ai_btn.click(
         fn=ai_generated_test,
         inputs=[ai_option, input_text],
-        outputs=[
-            bcLabel,
-            mcLabel,
-        ],
+        outputs=[bcLabel, mcLabel, mc1on1Label],
         api_name="ai_check",
     )
 

@@ -266,4 +269,6 @@ with gr.Blocks() as demo:
     date_from = ""
     date_to = ""
 
-
+
+if __name__ == "__main__":
+    demo.launch(share=True, auth=("polygraf-admin", "test@aisd"))
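Review note: ai_generated_test now always returns three values, matching the widened outputs list, and launch() gains basic auth. A sketch of exercising the ai_check endpoint remotely via gradio_client, assuming that library's standard Client API; the Space id is a placeholder and the credentials mirror the launch() call above:

from gradio_client import Client

client = Client("user/space-name", auth=("polygraf-admin", "test@aisd"))  # placeholder Space id
result = client.predict(
    "Human vs AI",         # ai_option; this choice returns (bc_score, None, None)
    "Text to analyze...",  # input_text
    api_name="/ai_check",
)
print(result)  # three outputs: bcLabel, mcLabel, mc1on1Label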
predictors.py (CHANGED)

@@ -19,19 +19,19 @@ from scipy.special import softmax
 import yaml
 import os
 from utils import *
-from dotenv import load_dotenv
 
 with open("config.yaml", "r") as file:
     params = yaml.safe_load(file)
 nltk.download("punkt")
 nltk.download("stopwords")
-load_dotenv()
 device = "cuda" if torch.cuda.is_available() else "cpu"
 text_bc_model_path = params["TEXT_BC_MODEL_PATH"]
 text_mc_model_path = params["TEXT_MC_MODEL_PATH"]
 text_quillbot_model_path = params["TEXT_QUILLBOT_MODEL_PATH"]
+text_1on1_models = params["TEXT_1ON1_MODEL"]
 quillbot_labels = params["QUILLBOT_LABELS"]
 mc_label_map = params["MC_OUTPUT_LABELS"]
+text_1on1_label_map = params["1ON1_OUTPUT_LABELS"]
 mc_token_size = int(params["MC_TOKEN_SIZE"])
 bc_token_size = int(params["BC_TOKEN_SIZE"])
 text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path)

@@ -46,6 +46,13 @@ quillbot_tokenizer = AutoTokenizer.from_pretrained(text_quillbot_model_path)
 quillbot_model = AutoModelForSequenceClassification.from_pretrained(
     text_quillbot_model_path
 ).to(device)
+tokenizers_1on1 = {}
+models_1on1 = {}
+for model in text_1on1_models:
+    tokenizers_1on1[model] = AutoTokenizer.from_pretrained(model)
+    models_1on1[model] = AutoModelForSequenceClassification.from_pretrained(
+        model
+    ).to(device)
 
 
 def split_text_allow_complete_sentences_nltk(

@@ -234,13 +241,69 @@ def predict_bc_scores(input):
     return bc_score
 
 
-
-
-
-
-
-
-
-
-
+def predict_1on1(model, tokenizer, text):
+    with torch.no_grad():
+        model.eval()
+        tokens = tokenizer(
+            text,
+            padding="max_length",
+            truncation=True,
+            return_tensors="pt",
+            max_length=mc_token_size,
+        ).to(device)
+        output = model(**tokens)
+    output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
+    return output_norm
+
+
+def predict_1on1_combined(input):
+    predictions = []
+    for i, model in enumerate(text_1on1_models):
+        predictions.append(
+            predict_1on1(models_1on1[model], tokenizers_1on1[model], input)[1]
+        )
+    return predictions
+
+
+def predict_1on1_scores(input):
+    # BC SCORE
+    bc_scores = []
+    samples_len_bc = len(
+        split_text_allow_complete_sentences_nltk(input, type_det="bc")
+    )
+    segments_bc = split_text_allow_complete_sentences_nltk(input, type_det="bc")
+    for i in range(samples_len_bc):
+        cleaned_text_bc = remove_special_characters(segments_bc[i])
+        bc_score = predict_bc(text_bc_model, text_bc_tokenizer, cleaned_text_bc)
+        bc_scores.append(bc_score)
+    bc_scores_array = np.array(bc_scores)
+    average_bc_scores = np.mean(bc_scores_array, axis=0)
+    bc_score_list = average_bc_scores.tolist()
+    bc_score = {"AI": bc_score_list[1], "HUMAN": bc_score_list[0]}
+
+    # MC SCORE
+    mc_scores = []
+    segments_mc = split_text_allow_complete_sentences_nltk(input, type_det="mc")
+    samples_len_mc = len(
+        split_text_allow_complete_sentences_nltk(input, type_det="mc")
+    )
+    for i in range(samples_len_mc):
+        cleaned_text_mc = remove_special_characters(segments_mc[i])
+        mc_score = predict_1on1_combined(cleaned_text_mc)
+        mc_scores.append(mc_score)
+    mc_scores_array = np.array(mc_scores)
+    average_mc_scores = np.mean(mc_scores_array, axis=0)
+    normalized_mc_scores = average_mc_scores / np.sum(average_mc_scores)
+    mc_score_list = normalized_mc_scores.tolist()
+    mc_score = {}
+    for score, label in zip(mc_score_list, text_1on1_label_map):
+        mc_score[label.upper()] = score
+
+    print(mc_score)
+    sum_prob = 1 - bc_score["HUMAN"]
+    for key, value in mc_score.items():
+        mc_score[key] = value * sum_prob
+    if sum_prob < 0.01:
+        mc_score = {}
+
+    return mc_score
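Review note: predict_1on1_scores mirrors predict_mc_scores but replaces the single multi-class model with the one-vs-rest ensemble: each model's positive-class probability (index [1] of its softmax) is gathered per segment, averaged over segments, renormalized to sum to one, then scaled by the binary classifier's non-human mass 1 - P(HUMAN), and dropped entirely when that mass falls below 0.01. Incidentally, split_text_allow_complete_sentences_nltk runs twice per pass here; len(segments_bc) would avoid the second call. A toy run of the combination arithmetic with made-up numbers:

import numpy as np

# hypothetical positive-class probs from three 1-on-1 models over two segments
per_segment = np.array([[0.70, 0.20, 0.40],
                        [0.50, 0.30, 0.60]])
avg = per_segment.mean(axis=0)   # [0.60, 0.25, 0.50], one value per creator
norm = avg / avg.sum()           # renormalize so creator probs sum to 1.0
p_human = 0.10                   # HUMAN mass from the binary classifier
final = norm * (1 - p_human)     # scale by the AI mass
print(final, final.sum())        # final sums to 0.90 == 1 - P(HUMAN)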