minko186 committed on
Commit fe15d80
1 Parent(s): f6a92d7

fix changes in plagiarism check

Files changed (2)
  1. app.py +210 -121
  2. plagiarism.py +0 -0
app.py CHANGED
@@ -1,4 +1,10 @@
-from utils import cosineSim, googleSearch, getSentences, parallel_scrap, matchingScore
+from utils import (
+    cosineSim,
+    googleSearch,
+    getSentences,
+    parallel_scrap,
+    matchingScore,
+)
 import gradio as gr
 from urllib.request import urlopen, Request
 from googleapiclient.discovery import build
@@ -14,7 +20,7 @@ from scipy.special import softmax
 from evaluate import load
 from datetime import date
 import nltk
-import fitz
+import fitz
 from transformers import GPT2LMHeadModel, GPT2TokenizerFast
 import nltk, spacy, subprocess, torch
 import plotly.graph_objects as go
@@ -27,20 +33,19 @@ import multiprocessing
 from functools import partial
 import concurrent.futures
 
-nltk.download('punkt')
+nltk.download("punkt")
 
 from writing_analysis import (
     normalize,
     preprocess_text1,
-    preprocess_text2,
+    preprocess_text2,
     vocabulary_richness_ttr,
     calculate_gunning_fog,
     calculate_average_sentence_length,
     calculate_average_word_length,
     calculate_syntactic_tree_depth,
     calculate_perplexity,
-
-)
+)
 
 np.set_printoptions(suppress=True)
 
@@ -89,7 +94,7 @@ def plagiarism_check(
     )
     print(f"Time for google search: {time.perf_counter()-time1}")
     time1 = time.perf_counter()
-
+
     print("Number of URLs: ", len(urlCount))
     print(urlList)
 
@@ -113,8 +118,8 @@ def plagiarism_check(
             page_content = soup.text
             source_embeddings.append(embed_text(page_content))
         else:
-            source_embeddings.append(None)
-
+            source_embeddings.append(None)
+
     # Populate matching scores for scrapped pages
     # for i, soup in enumerate(soups):
     #     print(f"Analyzing {i+1} of {len(soups)} soups........................")
@@ -126,30 +131,27 @@ def plagiarism_check(
     #         score = cos_sim_torch(embed_text(sent), source_embeddings[i])
     #         ScoreArray[i][j] = score
 
-
-    def compute_cosine_similarity(args):
-        sent, source_embedding, i, j = args
-        score = cos_sim_torch(embed_text(sent), source_embedding)
-        return i, j, score
-
-    def main(soups, sentences):
-        source_embeddings = [preprocess(soup) for soup in soups]
-        ScoreArray = [[0 for _ in sentences] for _ in soups]
-        args_list = []
-        for i, soup in enumerate(soups):
-            if soup:
-                for j, sent in enumerate(sentences):
-                    args_list.append((sent, source_embeddings[i], i, j))
-        with concurrent.futures.ProcessPoolExecutor() as executor:
-            results = executor.map(compute_cosine_similarity, args_list)
-        for i, j, score in results:
-            ScoreArray[i][j] = score
-        return ScoreArray
+    def compute_cosine_similarity(args):
+        sent, source_embedding, i, j = args
+        score = cos_sim_torch(embed_text(sent), source_embedding)
+        return i, j, score
+
+    def main(soups, sentences):
+        source_embeddings = [preprocess(soup) for soup in soups]
+        ScoreArray = [[0 for _ in sentences] for _ in soups]
+        args_list = []
+        for i, soup in enumerate(soups):
+            if soup:
+                for j, sent in enumerate(sentences):
+                    args_list.append((sent, source_embeddings[i], i, j))
+        with concurrent.futures.ProcessPoolExecutor() as executor:
+            results = executor.map(compute_cosine_similarity, args_list)
+        for i, j, score in results:
+            ScoreArray[i][j] = score
+        return ScoreArray
 
     ScoreArray = main(soups, sentences)
 
-
-
     print(f"Time for matching score: {time.perf_counter()-time1}")
     time1 = time.perf_counter()
 
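The hunk above parallelizes sentence-vs-source scoring across a process pool. A minimal standalone sketch of the same pattern, with plain NumPy cosine similarity standing in for the app's embed_text/cos_sim_torch (which live outside this diff):

# Sketch only: NumPy stands in for the app's embedding + cosine helpers.
# Run under an `if __name__ == "__main__":` guard on spawn-based platforms.
import concurrent.futures
import numpy as np

def _cosine(args):
    sent_vec, source_vec, i, j = args
    score = np.dot(sent_vec, source_vec) / (
        np.linalg.norm(sent_vec) * np.linalg.norm(source_vec)
    )
    return i, j, float(score)

def score_matrix(source_vecs, sent_vecs):
    # One task per (source, sentence) pair; sources that failed to scrape are None.
    tasks = [
        (sent, src, i, j)
        for i, src in enumerate(source_vecs) if src is not None
        for j, sent in enumerate(sent_vecs)
    ]
    scores = [[0.0] * len(sent_vecs) for _ in source_vecs]
    # Top-level functions and plain arrays keep the tasks picklable for the pool.
    with concurrent.futures.ProcessPoolExecutor() as executor:
        for i, j, score in executor.map(_cosine, tasks):
            scores[i][j] = score
    return scores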
@@ -177,7 +179,7 @@ def main(soups, sentences):
                 sentenceToMaxURL[j] = i
         if maxScore > 0.5:
             sentencePlag[j] = True
-
+
     if (
         (len(sentences) > 1)
         and (sentenceToMaxURL[1] != sentenceToMaxURL[0])
@@ -216,11 +218,13 @@ def main(soups, sentences):
 
     print(formatted_tokens)
     print(index_descending)
-
+
     for ind in index_descending:
         formatted_tokens.append(
             (
-                urlList[ind] + " --- Matching Score: " + f"{str(round(urlScore[ind] * 100, 2))}%",
+                urlList[ind]
+                + " --- Matching Score: "
+                + f"{str(round(urlScore[ind] * 100, 2))}%",
                 "[" + str(urlMap[ind]) + "]",
             )
         )
@@ -232,7 +236,7 @@ def main(soups, sentences):
 
     return formatted_tokens
 
-
+
 """
 AI DETECTION SECTION
 """
@@ -240,73 +244,106 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
 
 text_bc_model_path = "polygraf-ai/text-detect-bc-v11-4m"
 text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path)
-text_bc_model = AutoModelForSequenceClassification.from_pretrained(text_bc_model_path).to(device)
+text_bc_model = AutoModelForSequenceClassification.from_pretrained(
+    text_bc_model_path
+).to(device)
 
-text_mc_model_path = "polygraf-ai/ai-text-detection-mc-robert-open-ai-detector-v4"
+text_mc_model_path = (
+    "polygraf-ai/ai-text-detection-mc-robert-open-ai-detector-v4"
+)
 text_mc_tokenizer = AutoTokenizer.from_pretrained(text_mc_model_path)
-text_mc_model = AutoModelForSequenceClassification.from_pretrained(text_mc_model_path).to(device)
+text_mc_model = AutoModelForSequenceClassification.from_pretrained(
+    text_mc_model_path
+).to(device)
 
 quillbot_labels = ["Original", "QuillBot"]
 quillbot_tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
-quillbot_model = AutoModelForSequenceClassification.from_pretrained("polygraf-ai/quillbot-detector-28k").to(device)
+quillbot_model = AutoModelForSequenceClassification.from_pretrained(
+    "polygraf-ai/quillbot-detector-28k"
+).to(device)
+
 
 def remove_accents(input_str):
     text_no_accents = unidecode(input_str)
     return text_no_accents
 
+
 def remove_special_characters(text):
     text = remove_accents(text)
     pattern = r'[^\w\s\d.,!?\'"()-;]+'
-    text = re.sub(pattern, '', text)
+    text = re.sub(pattern, "", text)
     return text
 
+
 def remove_special_characters_2(text):
-    pattern = r'[^a-zA-Z0-9 ]+'
-    text = re.sub(pattern, '', text)
+    pattern = r"[^a-zA-Z0-9 ]+"
+    text = re.sub(pattern, "", text)
     return text
 
+
 def update_character_count(text):
     return f"{len(text)} characters"
 
 
-def split_text_allow_complete_sentences_nltk(text, max_length=256, tolerance=30, min_last_segment_length=100, type_det='bc'):
+def split_text_allow_complete_sentences_nltk(
+    text,
+    max_length=256,
+    tolerance=30,
+    min_last_segment_length=100,
+    type_det="bc",
+):
     sentences = nltk.sent_tokenize(text)
     segments = []
     current_segment = []
-    current_length = 0
-
-    if type_det == 'bc':
+    current_length = 0
+
+    if type_det == "bc":
         tokenizer = text_bc_tokenizer
         max_length = 333
-
-    elif type_det == 'mc':
+
+    elif type_det == "mc":
         tokenizer = text_mc_tokenizer
         max_length = 256
-
+
     for sentence in sentences:
         tokens = tokenizer.tokenize(sentence)
         sentence_length = len(tokens)
-
-        if current_length + sentence_length <= max_length + tolerance - 2:
+
+        if current_length + sentence_length <= max_length + tolerance - 2:
             current_segment.append(sentence)
             current_length += sentence_length
         else:
             if current_segment:
-                encoded_segment = tokenizer.encode(' '.join(current_segment), add_special_tokens=True, max_length=max_length+tolerance, truncation=True)
+                encoded_segment = tokenizer.encode(
+                    " ".join(current_segment),
+                    add_special_tokens=True,
+                    max_length=max_length + tolerance,
+                    truncation=True,
+                )
                 segments.append((current_segment, len(encoded_segment)))
             current_segment = [sentence]
             current_length = sentence_length
-
+
     if current_segment:
-        encoded_segment = tokenizer.encode(' '.join(current_segment), add_special_tokens=True, max_length=max_length+tolerance, truncation=True)
+        encoded_segment = tokenizer.encode(
+            " ".join(current_segment),
+            add_special_tokens=True,
+            max_length=max_length + tolerance,
+            truncation=True,
+        )
         segments.append((current_segment, len(encoded_segment)))
 
     final_segments = []
     for i, (seg, length) in enumerate(segments):
-        if i == len(segments) - 1:
+        if i == len(segments) - 1:
             if length < min_last_segment_length and len(final_segments) > 0:
                 prev_seg, prev_length = final_segments[-1]
-                combined_encoded = tokenizer.encode(' '.join(prev_seg + seg), add_special_tokens=True, max_length=max_length+tolerance, truncation=True)
+                combined_encoded = tokenizer.encode(
+                    " ".join(prev_seg + seg),
+                    add_special_tokens=True,
+                    max_length=max_length + tolerance,
+                    truncation=True,
+                )
                 if len(combined_encoded) <= max_length + tolerance:
                     final_segments[-1] = (prev_seg + seg, len(combined_encoded))
                 else:
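For orientation, the splitter reformatted above packs whole sentences into token-budgeted segments. A rough usage sketch of the same greedy packing, using a generic tokenizer ("bert-base-uncased" is a placeholder; the app binds its own detector tokenizer and max_length from type_det):

# Sketch: greedy sentence packing under a subword-token budget.
import nltk
from transformers import AutoTokenizer

nltk.download("punkt", quiet=True)
tok = AutoTokenizer.from_pretrained("bert-base-uncased")  # placeholder model

def chunk_by_sentence(text, budget=256):
    chunks, current, current_len = [], [], 0
    for sent in nltk.sent_tokenize(text):
        n = len(tok.tokenize(sent))
        if current and current_len + n > budget:
            chunks.append(" ".join(current))  # budget hit: flush the segment
            current, current_len = [], 0
        current.append(sent)
        current_len += n
    if current:
        chunks.append(" ".join(current))  # trailing partial segment
    return chunks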
@@ -319,56 +356,86 @@ def split_text_allow_complete_sentences_nltk(text, max_length=256, tolerance=30,
     decoded_segments = []
     encoded_segments = []
     for seg, _ in final_segments:
-        encoded_segment = tokenizer.encode(' '.join(seg), add_special_tokens=True, max_length=max_length+tolerance, truncation=True)
+        encoded_segment = tokenizer.encode(
+            " ".join(seg),
+            add_special_tokens=True,
+            max_length=max_length + tolerance,
+            truncation=True,
+        )
         decoded_segment = tokenizer.decode(encoded_segment)
         decoded_segments.append(decoded_segment)
     return decoded_segments
 
+
 def predict_quillbot(text):
     with torch.no_grad():
         quillbot_model.eval()
-        tokenized_text = quillbot_tokenizer(text, padding="max_length", truncation=True, max_length=256, return_tensors="pt").to(device)
+        tokenized_text = quillbot_tokenizer(
+            text,
+            padding="max_length",
+            truncation=True,
+            max_length=256,
+            return_tensors="pt",
+        ).to(device)
         output = quillbot_model(**tokenized_text)
         output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
-        q_score = {"QuillBot": output_norm[1].item(), "Original": output_norm[0].item()}
+        q_score = {
+            "QuillBot": output_norm[1].item(),
+            "Original": output_norm[0].item(),
+        }
         return q_score
 
+
 def predict_bc(model, tokenizer, text):
     with torch.no_grad():
         model.eval()
         tokens = text_bc_tokenizer(
-            text, padding='max_length', truncation=True, max_length=333, return_tensors="pt"
+            text,
+            padding="max_length",
+            truncation=True,
+            max_length=333,
+            return_tensors="pt",
        ).to(device)
         output = model(**tokens)
         output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
         print("BC Score: ", output_norm)
         return output_norm
 
+
 def predict_mc(model, tokenizer, text):
     with torch.no_grad():
         model.eval()
         tokens = text_mc_tokenizer(
-            text, padding='max_length', truncation=True, return_tensors="pt", max_length=256
+            text,
+            padding="max_length",
+            truncation=True,
+            return_tensors="pt",
+            max_length=256,
         ).to(device)
         output = model(**tokens)
         output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
         print("MC Score: ", output_norm)
         return output_norm
 
+
 def ai_generated_test(ai_option, input):
-
+
     bc_scores = []
     mc_scores = []
-    samples_len_bc = len(split_text_allow_complete_sentences_nltk(input, type_det = 'bc'))
-    samples_len_mc = len(split_text_allow_complete_sentences_nltk(input, type_det = 'mc'))
-    segments_bc = split_text_allow_complete_sentences_nltk(input, type_det = 'bc')
-    segments_mc = split_text_allow_complete_sentences_nltk(input, type_det = 'bc')
-
+    samples_len_bc = len(
+        split_text_allow_complete_sentences_nltk(input, type_det="bc")
+    )
+    samples_len_mc = len(
+        split_text_allow_complete_sentences_nltk(input, type_det="mc")
+    )
+    segments_bc = split_text_allow_complete_sentences_nltk(input, type_det="bc")
+    segments_mc = split_text_allow_complete_sentences_nltk(input, type_det="bc")
+
     for i in range(samples_len_bc):
         cleaned_text_bc = remove_special_characters(segments_bc[i])
-        bc_score = predict_bc(text_bc_model, text_bc_tokenizer,cleaned_text_bc )
+        bc_score = predict_bc(text_bc_model, text_bc_tokenizer, cleaned_text_bc)
         bc_scores.append(bc_score)
-
+
     for i in range(samples_len_mc):
         cleaned_text_mc = remove_special_characters(segments_mc[i])
         mc_score = predict_mc(text_mc_model, text_mc_tokenizer, cleaned_text_mc)
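Two notes on the loops above: segments_mc is built with type_det="bc" while its length is computed with "mc", which looks unintended; and each chunk contributes one softmax vector that gets averaged downstream. The aggregation step, sketched:

# Sketch: average per-chunk softmax outputs into one document-level vector.
import numpy as np

def aggregate(chunk_probs):
    return np.mean(np.asarray(chunk_probs), axis=0)

aggregate([[0.8, 0.2], [0.6, 0.4]])  # -> array([0.7, 0.3])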
@@ -380,27 +447,28 @@ def ai_generated_test(ai_option, input):
     average_mc_scores = np.mean(mc_scores_array, axis=0)
     bc_score_list = average_bc_scores.tolist()
     mc_score_list = average_mc_scores.tolist()
-
+
     bc_score = {"AI": bc_score_list[1], "HUMAN": bc_score_list[0]}
     mc_score = {}
     label_map = ["OpenAI GPT", "Mistral", "CLAUDE", "Gemini", "LLAMA 2"]
-
+
     for score, label in zip(mc_score_list, label_map):
         mc_score[label.upper()] = score
-
+
     sum_prob = 1 - bc_score["HUMAN"]
     for key, value in mc_score.items():
         mc_score[key] = value * sum_prob
-
+
     if ai_option == "Human vs AI":
         mc_score = {}
 
-    if sum_prob < 0.01 :
+    if sum_prob < 0.01:
         mc_score = {}
         return bc_score, mc_score
     else:
         return bc_score, mc_score
 
+
 # COMBINED
 def main(
     ai_option,
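The rescaling above conditions the source-model shares on the binary detector: with P(HUMAN) = 0.3, sum_prob is 0.7, so a raw share of 0.5 becomes 0.35. The same arithmetic as a standalone sketch (all numbers hypothetical):

# Sketch: weight multi-class shares by the overall AI probability.
bc_score = {"AI": 0.7, "HUMAN": 0.3}
mc_score = {"OPENAI GPT": 0.5, "MISTRAL": 0.5}

sum_prob = 1 - bc_score["HUMAN"]  # probability mass treated as AI-written
mc_score = {label: share * sum_prob for label, share in mc_score.items()}
# -> {"OPENAI GPT": 0.35, "MISTRAL": 0.35}; cleared entirely when sum_prob < 0.01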
@@ -428,28 +496,30 @@ def main(
         domains_to_skip,
     )
     depth_analysis_plot = depth_analysis(input)
-    bc_score, mc_score = ai_generated_test(ai_option,input)
+    bc_score, mc_score = ai_generated_test(ai_option, input)
     quilscore = predict_quillbot(input)
-
+
     return (
-        bc_score,
-        mc_score,
-        formatted_tokens,
-        depth_analysis_plot,
-        quilscore
-    )
+        bc_score,
+        mc_score,
+        formatted_tokens,
+        depth_analysis_plot,
+        quilscore,
+    )
 
 
 def build_date(year, month, day):
     return f"{year}{months[month]}{day}"
 
+
 def len_validator(text):
-    min_tokens = 200
-    lengt = len(text_bc_tokenizer.tokenize(text = text, return_tensors="pt"))
-    if lengt < min_tokens:
-        return f"Warning! Input length is {lengt}. Please input a text that is greater than {min_tokens} tokens long. Recommended length {min_tokens*2} tokens."
-    else :
-        return f"Input length ({lengt}) is satisified."
+    min_tokens = 200
+    lengt = len(text_bc_tokenizer.tokenize(text=text, return_tensors="pt"))
+    if lengt < min_tokens:
+        return f"Warning! Input length is {lengt}. Please input a text that is greater than {min_tokens} tokens long. Recommended length {min_tokens*2} tokens."
+    else:
+        return f"Input length ({lengt}) is satisified."
+
 
 def extract_text_from_pdf(pdf_path):
     doc = fitz.open(pdf_path)
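On len_validator above: tokenize() returns a plain list of subword strings, so the return_tensors="pt" argument appears to be inert and only the list length matters. A minimal equivalent sketch:

# Sketch: gate on subword-token count rather than characters.
def long_enough(text, tokenizer, min_tokens=200):
    n_tokens = len(tokenizer.tokenize(text))
    return n_tokens >= min_tokens, n_tokens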
@@ -461,9 +531,9 @@ def extract_text_from_pdf(pdf_path):
 
 # DEPTH ANALYSIS
 print("loading depth analysis")
-nltk.download('stopwords')
-nltk.download('punkt')
-command = ['python3', '-m', 'spacy', 'download', 'en_core_web_sm']
+nltk.download("stopwords")
+nltk.download("punkt")
+command = ["python3", "-m", "spacy", "download", "en_core_web_sm"]
 # Execute the command
 subprocess.run(command)
 nlp = spacy.load("en_core_web_sm")
@@ -473,6 +543,7 @@ model_id = "gpt2"
 gpt2_model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
 gpt2_tokenizer = GPT2TokenizerFast.from_pretrained(model_id)
 
+
 def depth_analysis(input_text):
 
     # vocanulary richness
@@ -482,48 +553,59 @@ def depth_analysis(input_text):
     # readability
     gunning_fog = calculate_gunning_fog(input_text)
     gunning_fog_norm = normalize(gunning_fog, min_value=0, max_value=20)
-
+
     # average sentence length and average word length
     words, sentences = preprocess_text2(input_text)
     average_sentence_length = calculate_average_sentence_length(sentences)
     average_word_length = calculate_average_word_length(words)
-    average_sentence_length_norm = normalize(average_sentence_length, min_value=0, max_value=40)
-    average_word_length_norm = normalize(average_word_length, min_value=0, max_value=8)
+    average_sentence_length_norm = normalize(
+        average_sentence_length, min_value=0, max_value=40
+    )
+    average_word_length_norm = normalize(
+        average_word_length, min_value=0, max_value=8
+    )
 
     # syntactic_tree_depth
     average_tree_depth = calculate_syntactic_tree_depth(nlp, input_text)
-    average_tree_depth_norm = normalize(average_tree_depth, min_value=0, max_value=10)
+    average_tree_depth_norm = normalize(
+        average_tree_depth, min_value=0, max_value=10
+    )
 
     # perplexity
-    perplexity = calculate_perplexity(input_text, gpt2_model, gpt2_tokenizer, device)
+    perplexity = calculate_perplexity(
+        input_text, gpt2_model, gpt2_tokenizer, device
+    )
     perplexity_norm = normalize(perplexity, min_value=0, max_value=30)
 
     features = {
-        "readability": gunning_fog_norm,
+        "readability": gunning_fog_norm,
         "syntactic tree depth": average_tree_depth_norm,
         "vocabulary richness": ttr_value,
         "perplexity": perplexity_norm,
         "average sentence length": average_sentence_length_norm,
-        "average word length": average_word_length_norm,
+        "average word length": average_word_length_norm,
     }
 
     print(features)
 
     fig = go.Figure()
 
-    fig.add_trace(go.Scatterpolar(
-        r=list(features.values()),
-        theta=list(features.keys()),
-        fill='toself',
-        name='Radar Plot'
-    ))
+    fig.add_trace(
+        go.Scatterpolar(
+            r=list(features.values()),
+            theta=list(features.keys()),
+            fill="toself",
+            name="Radar Plot",
+        )
+    )
 
     fig.update_layout(
         polar=dict(
             radialaxis=dict(
                 visible=True,
                 range=[0, 100],
-        )),
+            )
+        ),
         showlegend=False,
         # autosize=False,
         # width=600,
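Every feature feeding the radar trace is mapped onto the fixed [0, 100] radial axis. writing_analysis.normalize is not part of this diff; a plausible min-max version, assuming values are clamped to the stated range:

# Sketch: min-max scale a raw metric onto the radar plot's 0-100 axis.
def normalize(value, min_value, max_value):
    clamped = max(min_value, min(value, max_value))
    return (clamped - min_value) / (max_value - min_value) * 100

normalize(15, min_value=0, max_value=20)  # Gunning fog of 15 -> 75.0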
@@ -575,16 +657,23 @@ with gr.Blocks() as demo:
     with gr.Row():
         input_text = gr.Textbox(label="Input text", lines=6, placeholder="")
         file_input = gr.File(label="Upload PDF")
-        file_input.change(fn=extract_text_from_pdf, inputs=file_input, outputs=input_text)
+        file_input.change(
+            fn=extract_text_from_pdf, inputs=file_input, outputs=input_text
+        )
 
-        char_count = gr.Textbox(label="Minumum Character Limit Check")
+        char_count = gr.Textbox(label="Minumum Character Limit Check")
         input_text.change(fn=len_validator, inputs=input_text, outputs=char_count)
 
     with gr.Row():
         with gr.Column():
-            ai_option = gr.Radio(["Human vs AI", "Human vs AI Source Models"], label="Choose an option please.")
+            ai_option = gr.Radio(
+                ["Human vs AI", "Human vs AI Source Models"],
+                label="Choose an option please.",
+            )
         with gr.Column():
-            plag_option = gr.Radio(["Standard", "Advanced"], label="Choose an option please.")
+            plag_option = gr.Radio(
+                ["Standard", "Advanced"], label="Choose an option please."
+            )
 
     with gr.Row():
         with gr.Column():
@@ -607,14 +696,14 @@ with gr.Blocks() as demo:
         ## Output
         """
     )
-
+
     # models = gr.Dropdown(
-    #     model_list,
-    #     value=model_list,
-    #     multiselect=True,
-    #     label="Models to test against",
-    # )
-
+    #     model_list,
+    #     value=model_list,
+    #     multiselect=True,
+    #     label="Models to test against",
+    # )
+
     with gr.Row():
         with gr.Column():
             bcLabel = gr.Label(label="Source")
@@ -666,9 +755,7 @@ with gr.Blocks() as demo:
 
     with gr.Row():
         with gr.Column():
-            writing_analysis_plot = gr.Plot(
-                label="Writing Analysis Plot"
-            )
+            writing_analysis_plot = gr.Plot(label="Writing Analysis Plot")
 
     full_check_btn.click(
         fn=main,
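The click wiring here matters because Gradio matches the callback's returned tuple to the outputs list positionally, which is why the trailing QLabel fix in the next hunk lines up with main's five return values. A toy version of the pattern:

# Sketch: Gradio pairs returned values with outputs by position.
import gradio as gr

def analyze(text):
    return f"{len(text)} chars", text.upper()

with gr.Blocks() as toy:
    inp = gr.Textbox(label="Input")
    btn = gr.Button("Run")
    count = gr.Textbox(label="Count")
    upper = gr.Textbox(label="Upper")
    btn.click(fn=analyze, inputs=inp, outputs=[count, upper])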
@@ -690,7 +777,7 @@ with gr.Blocks() as demo:
             mcLabel,
             sentenceBreakdown,
             writing_analysis_plot,
-            QLabel
+            QLabel,
         ],
         api_name="main",
     )
@@ -740,5 +827,7 @@ with gr.Blocks() as demo:
 
     date_from = ""
     date_to = ""
-
-    demo.launch(share=True, server_name="0.0.0.0", auth=("polygraf-admin", "test@aisd"))
+
+    demo.launch(
+        share=True, server_name="0.0.0.0", auth=("polygraf-admin", "test@aisd")
+    )
plagiarism.py ADDED
File without changes