fix changes in plagiarism check
- app.py +210 -121
- plagiarism.py +0 -0
app.py
CHANGED
@@ -1,4 +1,10 @@
-from utils import
+from utils import (
+    cosineSim,
+    googleSearch,
+    getSentences,
+    parallel_scrap,
+    matchingScore,
+)
 import gradio as gr
 from urllib.request import urlopen, Request
 from googleapiclient.discovery import build
@@ -14,7 +20,7 @@ from scipy.special import softmax
 from evaluate import load
 from datetime import date
 import nltk
-import fitz
+import fitz
 from transformers import GPT2LMHeadModel, GPT2TokenizerFast
 import nltk, spacy, subprocess, torch
 import plotly.graph_objects as go
@@ -27,20 +33,19 @@ import multiprocessing
 from functools import partial
 import concurrent.futures
 
-nltk.download(
+nltk.download("punkt")
 
 from writing_analysis import (
     normalize,
     preprocess_text1,
-    preprocess_text2,
+    preprocess_text2,
     vocabulary_richness_ttr,
     calculate_gunning_fog,
     calculate_average_sentence_length,
     calculate_average_word_length,
     calculate_syntactic_tree_depth,
     calculate_perplexity,
-
-)
+)
 
 np.set_printoptions(suppress=True)
 
@@ -89,7 +94,7 @@ def plagiarism_check(
     )
     print(f"Time for google search: {time.perf_counter()-time1}")
     time1 = time.perf_counter()
-
+
     print("Number of URLs: ", len(urlCount))
     print(urlList)
 
@@ -113,8 +118,8 @@ def plagiarism_check(
             page_content = soup.text
             source_embeddings.append(embed_text(page_content))
         else:
-
-
+            source_embeddings.append(None)
+
     # Populate matching scores for scrapped pages
     # for i, soup in enumerate(soups):
     #     print(f"Analyzing {i+1} of {len(soups)} soups........................")
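Note on this hunk: appending None for pages that could not be scraped keeps source_embeddings index-aligned with soups, so downstream loops can skip failed sources without shifting positions. The same idea in miniature (the lowercasing is a stand-in, not the app's embed_text):

```python
pages = ["first page text", None, "third page text"]  # None marks a failed scrape

embeddings = []
for page in pages:
    if page:
        embeddings.append(page.lower())  # stand-in for embed_text(page)
    else:
        embeddings.append(None)          # placeholder keeps indices aligned

assert len(embeddings) == len(pages)
```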
@@ -126,30 +131,27 @@ def plagiarism_check(
     #         score = cos_sim_torch(embed_text(sent), source_embeddings[i])
     #         ScoreArray[i][j] = score
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        return ScoreArray
+    def compute_cosine_similarity(args):
+        sent, source_embedding, i, j = args
+        score = cos_sim_torch(embed_text(sent), source_embedding)
+        return i, j, score
+
+    def main(soups, sentences):
+        source_embeddings = [preprocess(soup) for soup in soups]
+        ScoreArray = [[0 for _ in sentences] for _ in soups]
+        args_list = []
+        for i, soup in enumerate(soups):
+            if soup:
+                for j, sent in enumerate(sentences):
+                    args_list.append((sent, source_embeddings[i], i, j))
+        with concurrent.futures.ProcessPoolExecutor() as executor:
+            results = executor.map(compute_cosine_similarity, args_list)
+        for i, j, score in results:
+            ScoreArray[i][j] = score
+        return ScoreArray
 
     ScoreArray = main(soups, sentences)
 
-
-
     print(f"Time for matching score: {time.perf_counter()-time1}")
     time1 = time.perf_counter()
 
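The rewritten scorer fans every (sentence, source) pair out to worker processes and folds the results back into ScoreArray by index. One caveat: ProcessPoolExecutor pickles the mapped function and each argument tuple, so large embeddings are serialized to every worker, and the function must be picklable — if compute_cosine_similarity really is nested inside plagiarism_check, as the hunk context suggests, pickle will typically refuse it, and moving it to module level avoids that. A self-contained sketch of the same fan-out/fan-in pattern with a toy scoring function:

```python
import concurrent.futures

def score_pair(args):
    # Stand-in for compute_cosine_similarity: any picklable, module-level function works.
    source, sentence, i, j = args
    return i, j, len(set(source.split()) & set(sentence.split()))  # toy overlap score

if __name__ == "__main__":  # required where workers are spawned, e.g. on Windows/macOS
    sources = ["the cat sat", "dogs bark loudly"]
    sentences = ["the cat sat on the mat", "birds sing"]
    args_list = [(src, sent, i, j)
                 for i, src in enumerate(sources)
                 for j, sent in enumerate(sentences)]
    scores = [[0 for _ in sentences] for _ in sources]
    with concurrent.futures.ProcessPoolExecutor() as executor:
        for i, j, s in executor.map(score_pair, args_list):
            scores[i][j] = s
    print(scores)
```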
@@ -177,7 +179,7 @@ def main(soups, sentences):
             sentenceToMaxURL[j] = i
         if maxScore > 0.5:
             sentencePlag[j] = True
-
+
     if (
         (len(sentences) > 1)
         and (sentenceToMaxURL[1] != sentenceToMaxURL[0])
@@ -216,11 +218,13 @@ def main(soups, sentences):
 
     print(formatted_tokens)
     print(index_descending)
-
+
     for ind in index_descending:
         formatted_tokens.append(
             (
-                urlList[ind]
+                urlList[ind]
+                + " --- Matching Score: "
+                + f"{str(round(urlScore[ind] * 100, 2))}%",
                 "[" + str(urlMap[ind]) + "]",
             )
         )
@@ -232,7 +236,7 @@ def main(soups, sentences):
 
     return formatted_tokens
 
-
+
 """
 AI DETECTION SECTION
 """
@@ -240,73 +244,106 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
 
 text_bc_model_path = "polygraf-ai/text-detect-bc-v11-4m"
 text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path)
-text_bc_model = AutoModelForSequenceClassification.from_pretrained(
+text_bc_model = AutoModelForSequenceClassification.from_pretrained(
+    text_bc_model_path
+).to(device)
 
-text_mc_model_path =
+text_mc_model_path = (
+    "polygraf-ai/ai-text-detection-mc-robert-open-ai-detector-v4"
+)
 text_mc_tokenizer = AutoTokenizer.from_pretrained(text_mc_model_path)
-text_mc_model = AutoModelForSequenceClassification.from_pretrained(
+text_mc_model = AutoModelForSequenceClassification.from_pretrained(
+    text_mc_model_path
+).to(device)
 
 quillbot_labels = ["Original", "QuillBot"]
 quillbot_tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
-quillbot_model = AutoModelForSequenceClassification.from_pretrained(
+quillbot_model = AutoModelForSequenceClassification.from_pretrained(
+    "polygraf-ai/quillbot-detector-28k"
+).to(device)
+
 
 def remove_accents(input_str):
     text_no_accents = unidecode(input_str)
     return text_no_accents
 
+
 def remove_special_characters(text):
     text = remove_accents(text)
     pattern = r'[^\w\s\d.,!?\'"()-;]+'
-    text = re.sub(pattern,
+    text = re.sub(pattern, "", text)
     return text
 
+
 def remove_special_characters_2(text):
-    pattern = r
-    text = re.sub(pattern,
+    pattern = r"[^a-zA-Z0-9 ]+"
+    text = re.sub(pattern, "", text)
    return text
 
+
 def update_character_count(text):
     return f"{len(text)} characters"
 
 
-def split_text_allow_complete_sentences_nltk(
+def split_text_allow_complete_sentences_nltk(
+    text,
+    max_length=256,
+    tolerance=30,
+    min_last_segment_length=100,
+    type_det="bc",
+):
     sentences = nltk.sent_tokenize(text)
     segments = []
     current_segment = []
-    current_length = 0
-
-    if type_det ==
+    current_length = 0
+
+    if type_det == "bc":
         tokenizer = text_bc_tokenizer
         max_length = 333
-
-    elif type_det ==
+
+    elif type_det == "mc":
         tokenizer = text_mc_tokenizer
         max_length = 256
-
+
     for sentence in sentences:
         tokens = tokenizer.tokenize(sentence)
         sentence_length = len(tokens)
-
-        if current_length + sentence_length <= max_length + tolerance - 2:
+
+        if current_length + sentence_length <= max_length + tolerance - 2:
             current_segment.append(sentence)
             current_length += sentence_length
         else:
             if current_segment:
-                encoded_segment = tokenizer.encode(
+                encoded_segment = tokenizer.encode(
+                    " ".join(current_segment),
+                    add_special_tokens=True,
+                    max_length=max_length + tolerance,
+                    truncation=True,
+                )
                 segments.append((current_segment, len(encoded_segment)))
                 current_segment = [sentence]
                 current_length = sentence_length
-
+
     if current_segment:
-        encoded_segment = tokenizer.encode(
+        encoded_segment = tokenizer.encode(
+            " ".join(current_segment),
+            add_special_tokens=True,
+            max_length=max_length + tolerance,
+            truncation=True,
+        )
         segments.append((current_segment, len(encoded_segment)))
 
     final_segments = []
     for i, (seg, length) in enumerate(segments):
-        if i == len(segments) - 1:
+        if i == len(segments) - 1:
             if length < min_last_segment_length and len(final_segments) > 0:
                 prev_seg, prev_length = final_segments[-1]
-                combined_encoded = tokenizer.encode(
+                combined_encoded = tokenizer.encode(
+                    " ".join(prev_seg + seg),
+                    add_special_tokens=True,
+                    max_length=max_length + tolerance,
+                    truncation=True,
+                )
                 if len(combined_encoded) <= max_length + tolerance:
                     final_segments[-1] = (prev_seg + seg, len(combined_encoded))
                 else:
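split_text_allow_complete_sentences_nltk packs whole sentences into token-budgeted segments and, at the end, merges an undersized final segment into its predecessor. The same packing logic in a self-contained sketch with a whitespace "tokenizer" (budget numbers are illustrative only, not the app's 333/256):

```python
def pack_sentences(sentences, max_len=6, min_last=4):
    """Greedily pack whole sentences into segments of at most max_len words."""
    segments, current, length = [], [], 0
    for s in sentences:
        n = len(s.split())
        if length + n <= max_len:
            current.append(s)
            length += n
        else:
            if current:
                segments.append(current)
            current, length = [s], n
    if current:
        segments.append(current)
    # Merge an undersized final segment into the previous one.
    if len(segments) > 1 and sum(len(s.split()) for s in segments[-1]) < min_last:
        segments[-2].extend(segments.pop())
    return segments

print(pack_sentences(["One two three.", "Four five six seven.", "Eight nine."]))
# [['One two three.'], ['Four five six seven.', 'Eight nine.']]
```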
@@ -319,56 +356,86 @@ def split_text_allow_complete_sentences_nltk(text, max_length=256, tolerance=30,
     decoded_segments = []
     encoded_segments = []
     for seg, _ in final_segments:
-        encoded_segment = tokenizer.encode(
+        encoded_segment = tokenizer.encode(
+            " ".join(seg),
+            add_special_tokens=True,
+            max_length=max_length + tolerance,
+            truncation=True,
+        )
         decoded_segment = tokenizer.decode(encoded_segment)
         decoded_segments.append(decoded_segment)
     return decoded_segments
 
+
 def predict_quillbot(text):
     with torch.no_grad():
         quillbot_model.eval()
-        tokenized_text = quillbot_tokenizer(
+        tokenized_text = quillbot_tokenizer(
+            text,
+            padding="max_length",
+            truncation=True,
+            max_length=256,
+            return_tensors="pt",
+        ).to(device)
         output = quillbot_model(**tokenized_text)
         output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
-        q_score = {
+        q_score = {
+            "QuillBot": output_norm[1].item(),
+            "Original": output_norm[0].item(),
+        }
         return q_score
 
+
 def predict_bc(model, tokenizer, text):
     with torch.no_grad():
         model.eval()
         tokens = text_bc_tokenizer(
-            text,
+            text,
+            padding="max_length",
+            truncation=True,
+            max_length=333,
+            return_tensors="pt",
         ).to(device)
         output = model(**tokens)
         output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
         print("BC Score: ", output_norm)
         return output_norm
 
+
 def predict_mc(model, tokenizer, text):
     with torch.no_grad():
         model.eval()
         tokens = text_mc_tokenizer(
-            text,
+            text,
+            padding="max_length",
+            truncation=True,
+            return_tensors="pt",
+            max_length=256,
         ).to(device)
         output = model(**tokens)
         output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
         print("MC Score: ", output_norm)
         return output_norm
 
+
 def ai_generated_test(ai_option, input):
-
+
     bc_scores = []
     mc_scores = []
-    samples_len_bc = len(
-
-
-
-
+    samples_len_bc = len(
+        split_text_allow_complete_sentences_nltk(input, type_det="bc")
+    )
+    samples_len_mc = len(
+        split_text_allow_complete_sentences_nltk(input, type_det="mc")
+    )
+    segments_bc = split_text_allow_complete_sentences_nltk(input, type_det="bc")
+    segments_mc = split_text_allow_complete_sentences_nltk(input, type_det="bc")
+
     for i in range(samples_len_bc):
         cleaned_text_bc = remove_special_characters(segments_bc[i])
-        bc_score = predict_bc(text_bc_model, text_bc_tokenizer,cleaned_text_bc
+        bc_score = predict_bc(text_bc_model, text_bc_tokenizer, cleaned_text_bc)
         bc_scores.append(bc_score)
-
+
     for i in range(samples_len_mc):
         cleaned_text_mc = remove_special_characters(segments_mc[i])
         mc_score = predict_mc(text_mc_model, text_mc_tokenizer, cleaned_text_mc)
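One detail a reviewer may want to double-check in this hunk: segments_mc is built with type_det="bc" even though its length, samples_len_mc, comes from a type_det="mc" split, so the two can disagree in segment count and token budget. If the "bc" on that line is a typo, the intended pairing would presumably be the following (hypothetical fix, also reusing the splits instead of recomputing them):

```python
segments_bc = split_text_allow_complete_sentences_nltk(input, type_det="bc")
segments_mc = split_text_allow_complete_sentences_nltk(input, type_det="mc")  # "mc", not "bc"
samples_len_bc = len(segments_bc)
samples_len_mc = len(segments_mc)
```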
@@ -380,27 +447,28 @@ def ai_generated_test(ai_option, input):
     average_mc_scores = np.mean(mc_scores_array, axis=0)
     bc_score_list = average_bc_scores.tolist()
     mc_score_list = average_mc_scores.tolist()
-
+
     bc_score = {"AI": bc_score_list[1], "HUMAN": bc_score_list[0]}
     mc_score = {}
     label_map = ["OpenAI GPT", "Mistral", "CLAUDE", "Gemini", "LLAMA 2"]
-
+
     for score, label in zip(mc_score_list, label_map):
         mc_score[label.upper()] = score
-
+
     sum_prob = 1 - bc_score["HUMAN"]
     for key, value in mc_score.items():
         mc_score[key] = value * sum_prob
-
+
     if ai_option == "Human vs AI":
         mc_score = {}
 
-    if sum_prob < 0.01
+    if sum_prob < 0.01:
         mc_score = {}
         return bc_score, mc_score
     else:
         return bc_score, mc_score
 
+
 # COMBINED
 def main(
     ai_option,
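The rescaling step makes the per-model scores sum to exactly the binary detector's AI mass: with P(HUMAN) = 0.3, sum_prob is 0.7, and softmax outputs of 0.5/0.2/0.1/0.1/0.1 become 0.35/0.14/0.07/0.07/0.07. A toy check (all numbers illustrative):

```python
bc_score = {"AI": 0.7, "HUMAN": 0.3}                      # illustrative binary output
mc_score = {"OPENAI GPT": 0.5, "MISTRAL": 0.2,            # illustrative softmax scores
            "CLAUDE": 0.1, "GEMINI": 0.1, "LLAMA 2": 0.1}

sum_prob = 1 - bc_score["HUMAN"]                          # AI mass = 0.7
mc_score = {k: v * sum_prob for k, v in mc_score.items()}

print(mc_score)                # {'OPENAI GPT': 0.35, 'MISTRAL': 0.14, ...}
print(sum(mc_score.values()))  # ~0.7, matching bc_score["AI"]
```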
@@ -428,28 +496,30 @@ def main(
         domains_to_skip,
     )
     depth_analysis_plot = depth_analysis(input)
-    bc_score, mc_score = ai_generated_test(ai_option,input)
+    bc_score, mc_score = ai_generated_test(ai_option, input)
     quilscore = predict_quillbot(input)
-
+
     return (
-
-
-
-
-
-
+        bc_score,
+        mc_score,
+        formatted_tokens,
+        depth_analysis_plot,
+        quilscore,
+    )
 
 
 def build_date(year, month, day):
     return f"{year}{months[month]}{day}"
 
+
 def len_validator(text):
-    min_tokens = 200
-    lengt = len(text_bc_tokenizer.tokenize(text
-    if
-        return f"Warning! Input length is {lengt}. Please input a text that is greater than {min_tokens} tokens long. Recommended length {min_tokens*2} tokens."
-    else
-        return f"Input length ({lengt}) is satisified."
+    min_tokens = 200
+    lengt = len(text_bc_tokenizer.tokenize(text=text, return_tensors="pt"))
+    if lengt < min_tokens:
+        return f"Warning! Input length is {lengt}. Please input a text that is greater than {min_tokens} tokens long. Recommended length {min_tokens*2} tokens."
+    else:
+        return f"Input length ({lengt}) is satisified."
+
 
 def extract_text_from_pdf(pdf_path):
     doc = fitz.open(pdf_path)
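A wrinkle in the new len_validator: tokenize() on a Hugging Face tokenizer already returns a plain list of subword strings, and return_tensors belongs to the tokenizer's __call__/encode path, so passing it to tokenize() is most likely ignored. If the goal is only a token count, the plain call suffices (sketch; "bert-base-uncased" is a stand-in checkpoint, not the app's model):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # stand-in model

def token_count(text: str) -> int:
    # tokenize() returns a list of subword strings; len() gives the count.
    return len(tokenizer.tokenize(text))

print(token_count("A short sanity check."))
```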
@@ -461,9 +531,9 @@ def extract_text_from_pdf(pdf_path):
 
 # DEPTH ANALYSIS
 print("loading depth analysis")
-nltk.download(
-nltk.download(
-command = [
+nltk.download("stopwords")
+nltk.download("punkt")
+command = ["python3", "-m", "spacy", "download", "en_core_web_sm"]
 # Execute the command
 subprocess.run(command)
 nlp = spacy.load("en_core_web_sm")
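Running the spaCy download through a subprocess on every startup works, but it shells out even when en_core_web_sm is already installed. A guarded load is a common alternative (sketch, relying on the documented behavior that spacy.load raises OSError when the model is missing):

```python
import spacy
import subprocess

def load_spacy(name: str = "en_core_web_sm"):
    try:
        return spacy.load(name)  # fast path: model already installed
    except OSError:
        subprocess.run(["python3", "-m", "spacy", "download", name], check=True)
        return spacy.load(name)

nlp = load_spacy()
```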
@@ -473,6 +543,7 @@ model_id = "gpt2"
 gpt2_model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
 gpt2_tokenizer = GPT2TokenizerFast.from_pretrained(model_id)
 
+
 def depth_analysis(input_text):
 
     # vocanulary richness
@@ -482,48 +553,59 @@ def depth_analysis(input_text):
     # readability
     gunning_fog = calculate_gunning_fog(input_text)
     gunning_fog_norm = normalize(gunning_fog, min_value=0, max_value=20)
-
+
     # average sentence length and average word length
     words, sentences = preprocess_text2(input_text)
     average_sentence_length = calculate_average_sentence_length(sentences)
     average_word_length = calculate_average_word_length(words)
-    average_sentence_length_norm = normalize(
-
+    average_sentence_length_norm = normalize(
+        average_sentence_length, min_value=0, max_value=40
+    )
+    average_word_length_norm = normalize(
+        average_word_length, min_value=0, max_value=8
+    )
 
     # syntactic_tree_depth
     average_tree_depth = calculate_syntactic_tree_depth(nlp, input_text)
-    average_tree_depth_norm = normalize(
+    average_tree_depth_norm = normalize(
+        average_tree_depth, min_value=0, max_value=10
+    )
 
     # perplexity
-    perplexity = calculate_perplexity(
+    perplexity = calculate_perplexity(
+        input_text, gpt2_model, gpt2_tokenizer, device
+    )
     perplexity_norm = normalize(perplexity, min_value=0, max_value=30)
 
     features = {
-        "readability": gunning_fog_norm,
+        "readability": gunning_fog_norm,
         "syntactic tree depth": average_tree_depth_norm,
         "vocabulary richness": ttr_value,
         "perplexity": perplexity_norm,
         "average sentence length": average_sentence_length_norm,
-        "average word length": average_word_length_norm,
+        "average word length": average_word_length_norm,
     }
 
     print(features)
 
     fig = go.Figure()
 
-    fig.add_trace(
-
-
-
-
-
+    fig.add_trace(
+        go.Scatterpolar(
+            r=list(features.values()),
+            theta=list(features.keys()),
+            fill="toself",
+            name="Radar Plot",
+        )
+    )
 
     fig.update_layout(
         polar=dict(
             radialaxis=dict(
                 visible=True,
                 range=[0, 100],
-            )
+            )
+        ),
         showlegend=False,
         # autosize=False,
         # width=600,
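The new trace draws each normalized feature on a shared 0-100 radial axis, which is presumably why every metric goes through normalize() first. The same figure in standalone form (feature values invented for illustration):

```python
import plotly.graph_objects as go

features = {  # invented values on the 0-100 scale the app normalizes into
    "readability": 62, "syntactic tree depth": 40, "vocabulary richness": 75,
    "perplexity": 55, "average sentence length": 48, "average word length": 58,
}

fig = go.Figure(
    go.Scatterpolar(
        r=list(features.values()), theta=list(features.keys()), fill="toself"
    )
)
fig.update_layout(
    polar=dict(radialaxis=dict(visible=True, range=[0, 100])), showlegend=False
)
fig.show()
```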
@@ -575,16 +657,23 @@ with gr.Blocks() as demo:
         with gr.Row():
             input_text = gr.Textbox(label="Input text", lines=6, placeholder="")
             file_input = gr.File(label="Upload PDF")
-            file_input.change(
+            file_input.change(
+                fn=extract_text_from_pdf, inputs=file_input, outputs=input_text
+            )
 
-        char_count = gr.Textbox(label="Minumum Character Limit Check")
+        char_count = gr.Textbox(label="Minumum Character Limit Check")
         input_text.change(fn=len_validator, inputs=input_text, outputs=char_count)
 
         with gr.Row():
             with gr.Column():
-                ai_option = gr.Radio(
+                ai_option = gr.Radio(
+                    ["Human vs AI", "Human vs AI Source Models"],
+                    label="Choose an option please.",
+                )
             with gr.Column():
-                plag_option = gr.Radio(
+                plag_option = gr.Radio(
+                    ["Standard", "Advanced"], label="Choose an option please."
+                )
 
         with gr.Row():
             with gr.Column():
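The wiring here is Gradio's reactive pattern: file_input.change feeds the PDF extractor's output into the textbox, and input_text.change re-runs the length check on every edit. In miniature (sketch, assuming a current Gradio Blocks API):

```python
import gradio as gr

def shout(s: str) -> str:
    return s.upper()

with gr.Blocks() as demo:
    box_in = gr.Textbox(label="Type here")
    box_out = gr.Textbox(label="Live result")
    # .change() re-runs the handler whenever the component's value changes.
    box_in.change(fn=shout, inputs=box_in, outputs=box_out)

demo.launch()
```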
@@ -607,14 +696,14 @@ with gr.Blocks() as demo:
                 ## Output
                 """
             )
-
+
             # models = gr.Dropdown(
-
-
-
-
-
-
+            #     model_list,
+            #     value=model_list,
+            #     multiselect=True,
+            #     label="Models to test against",
+            # )
+
         with gr.Row():
             with gr.Column():
                 bcLabel = gr.Label(label="Source")
@@ -666,9 +755,7 @@ with gr.Blocks() as demo:
 
         with gr.Row():
             with gr.Column():
-                writing_analysis_plot = gr.Plot(
-                    label="Writing Analysis Plot"
-                )
+                writing_analysis_plot = gr.Plot(label="Writing Analysis Plot")
 
     full_check_btn.click(
         fn=main,
@@ -690,7 +777,7 @@ with gr.Blocks() as demo:
             mcLabel,
             sentenceBreakdown,
             writing_analysis_plot,
-            QLabel
+            QLabel,
         ],
         api_name="main",
     )
@@ -740,5 +827,7 @@ with gr.Blocks() as demo:
 
     date_from = ""
     date_to = ""
-
-demo.launch(
+
+demo.launch(
+    share=True, server_name="0.0.0.0", auth=("polygraf-admin", "test@aisd")
+)
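Last, the launch call bakes the admin credentials into source, so they travel with every copy of the Space. Reading them from environment variables (or Space secrets) is a drop-in alternative (sketch; the variable names are hypothetical):

```python
import os

demo.launch(
    share=True,
    server_name="0.0.0.0",
    auth=(os.environ["APP_USERNAME"], os.environ["APP_PASSWORD"]),  # hypothetical env vars
)
```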
plagiarism.py
ADDED
File without changes