Spaces:

polygraf-ai
/

copyright_checker

Runtime error

App Files Files Community

eljanmahammadli committed on Mar 25, 2024

Commit

9df8406

1 Parent(s): dee0f90

added human vs ai highlighter

Browse files

Files changed (14) hide show

.gitignore +1 -6
__pycache__/analysis.cpython-311.pyc +0 -0
__pycache__/app.cpython-311.pyc +0 -0
__pycache__/explainability.cpython-311.pyc +0 -0
__pycache__/plagiarism.cpython-311.pyc +0 -0
__pycache__/predictors.cpython-311.pyc +0 -0
__pycache__/utils.cpython-310.pyc +0 -0
__pycache__/utils.cpython-311.pyc +0 -0
__pycache__/utils.cpython-39.pyc +0 -0
__pycache__/writing_analysis.cpython-310.pyc +0 -0
__pycache__/writing_analysis.cpython-39.pyc +0 -0
app.py +38 -12
highlighter.py +26 -17
predictors.py +42 -33

.gitignore CHANGED Viewed

@@ -1,6 +1 @@
-__pycache__/analysis.cpython-311.pyc
-__pycache__/app.cpython-311.pyc
-__pycache__/explainability.cpython-311.pyc
-__pycache__/plagiarism.cpython-311.pyc
-__pycache__/predictors.cpython-311.pyc
-__pycache__/utils.cpython-311.pyc


1	+ __pycache__/

__pycache__/analysis.cpython-311.pyc DELETED Viewed

Binary file (4.75 kB)

__pycache__/app.cpython-311.pyc DELETED Viewed

Binary file (10.9 kB)

__pycache__/explainability.cpython-311.pyc DELETED Viewed

Binary file (7.89 kB)

__pycache__/plagiarism.cpython-311.pyc DELETED Viewed

Binary file (14.1 kB)

__pycache__/predictors.cpython-311.pyc DELETED Viewed

Binary file (12 kB)

__pycache__/utils.cpython-310.pyc DELETED Viewed

Binary file (7.17 kB)

__pycache__/utils.cpython-311.pyc DELETED Viewed

Binary file (3.76 kB)

__pycache__/utils.cpython-39.pyc DELETED Viewed

Binary file (7.19 kB)

__pycache__/writing_analysis.cpython-310.pyc DELETED Viewed

Binary file (4.57 kB)

__pycache__/writing_analysis.cpython-39.pyc DELETED Viewed

Binary file (4.64 kB)

app.py CHANGED Viewed

@@ -8,6 +8,8 @@ from plagiarism import plagiarism_check, build_date
 from highlighter import analyze_and_highlight
 from utils import extract_text_from_pdf, len_validator
 import yaml
 np.set_printoptions(suppress=True)
@@ -17,6 +19,10 @@ with open("config.yaml", "r") as file:
 model_list = params["MC_OUTPUT_LABELS"]
 def ai_generated_test(option, input, models):
     if option == "Human vs AI":
         return predict_bc_scores(input), None
@@ -131,15 +137,17 @@ with gr.Blocks() as demo:
     with gr.Row():
         with gr.Column():
             only_ai_btn = gr.Button("AI Check")
         with gr.Column():
             only_plagiarism_btn = gr.Button("Source Check")
-    with gr.Row():
-        quillbot_check = gr.Button("Humanized Text Check")
     with gr.Row():
-        quillbot_highlighter = gr.Button("Humanized Highlighter")
     with gr.Row():
         depth_analysis_btn = gr.Button("Detailed Writing Analysis")
@@ -157,16 +165,21 @@ with gr.Blocks() as demo:
         with gr.Column():
             bcLabel = gr.Label(label="Source")
         with gr.Column():
-            mcLabel = gr.Label(label="Creator")
         # with gr.Column():
         #     mc1on1Label = gr.Label(label="Creator(1 on 1 Approach)")
     with gr.Row():
         with gr.Column():
             QLabel = gr.Label(label="Humanized")
         with gr.Column():
-            highlighter_html = gr.HTML(label='Humanized Highlighter')
     with gr.Group():
         with gr.Row():
             month_from = gr.Dropdown(
@@ -280,13 +293,26 @@ with gr.Blocks() as demo:
         api_name="depth_analysis",
     )
-    quillbot_highlighter.click(
-        fn=analyze_and_highlight,
         inputs=[input_text],
-        outputs=[highlighter_html],
-        api_name="quillbot_highlighter",
     )
     date_from = ""
     date_to = ""

 from highlighter import analyze_and_highlight
 from utils import extract_text_from_pdf, len_validator
 import yaml
+from functools import partial
 np.set_printoptions(suppress=True)
 model_list = params["MC_OUTPUT_LABELS"]
+analyze_and_highlight_bc = partial(analyze_and_highlight, model_type="bc")
+analyze_and_highlight_quillbot = partial(analyze_and_highlight, model_type="quillbot")
 def ai_generated_test(option, input, models):
     if option == "Human vs AI":
         return predict_bc_scores(input), None
     with gr.Row():
         with gr.Column():
             only_ai_btn = gr.Button("AI Check")
         with gr.Column():
             only_plagiarism_btn = gr.Button("Source Check")
+        with gr.Column():
+            quillbot_check = gr.Button("Humanized Text Check")
     with gr.Row():
+        with gr.Column():
+            bc_highlighter_button = gr.Button("Human vs. AI Highlighter")
+        with gr.Column():
+            quillbot_highlighter_button = gr.Button("Humanized Highlighter")
     with gr.Row():
         depth_analysis_btn = gr.Button("Detailed Writing Analysis")
         with gr.Column():
             bcLabel = gr.Label(label="Source")
         with gr.Column():
+            bc_highlighter_output = gr.HTML(label="Human vs. AI Highlighter")
         # with gr.Column():
         #     mc1on1Label = gr.Label(label="Creator(1 on 1 Approach)")
+    with gr.Row():
+        with gr.Column():
+            mcLabel = gr.Label(label="Creator")
     with gr.Row():
         with gr.Column():
             QLabel = gr.Label(label="Humanized")
         with gr.Column():
+            quillbot_highlighter_output = gr.HTML(label="Humanized Highlighter")
     with gr.Group():
         with gr.Row():
             month_from = gr.Dropdown(
         api_name="depth_analysis",
     )
+    quillbot_highlighter_button.click(
+        fn=analyze_and_highlight_quillbot,
         inputs=[input_text],
+        outputs=[quillbot_highlighter_output],
+        api_name="humanized_highlighter",
     )
+    bc_highlighter_button.click(
+        fn=analyze_and_highlight_bc,
+        inputs=[input_text],
+        outputs=[bc_highlighter_output],
+        api_name="bc_highlighter",
+    )
+    # quillbot_highlighter.click(
+    #     fn=analyze_and_highlight,
+    #     inputs=[input_text],
+    #     outputs=[highlighter_html],
+    #     api_name="quillbot_highlighter",
+    # )
     date_from = ""
     date_to = ""

highlighter.py CHANGED Viewed

@@ -1,43 +1,52 @@
 from lime.lime_text import LimeTextExplainer
 from nltk.tokenize import sent_tokenize
-from predictors import predict_proba_quillbot
-def explainer(text):
-    class_names = ['negative', 'positive']
-    explainer = LimeTextExplainer(class_names=class_names, split_expression=sent_tokenize)
-    exp = explainer.explain_instance(text, predict_proba_quillbot, num_features=20, num_samples=300)
     sentences = [sent for sent in sent_tokenize(text)]
     weights_mapping = exp.as_map()[1]
     sentences_weights = {sentence: 0 for sentence in sentences}
     for idx, weight in weights_mapping:
         if 0 <= idx < len(sentences):
             sentences_weights[sentences[idx]] = weight
-    print(sentences_weights)
-    return sentences_weights
-def analyze_and_highlight(text):
     highlighted_text = ""
-    sentences_weights = explainer(text)
     min_weight = min(sentences_weights.values())
     max_weight = max(sentences_weights.values())
     for sentence, weight in sentences_weights.items():
         normalized_weight = (weight - min_weight) / (max_weight - min_weight)
         if weight >= 0:
-            color = f'rgba(255, {255 * (1 - normalized_weight)}, {255 * (1 - normalized_weight)}, 1)'
         else:
-            color = f'rgba({255 * normalized_weight}, 255, {255 * normalized_weight}, 1)'
         sentence = sentence.strip()
         if not sentence:
             continue
-        highlighted_sentence = f'<span style="background-color: {color}; color: black;">{sentence}</span> '
-        highlighted_text += highlighted_sentence
-    return highlighted_text

 from lime.lime_text import LimeTextExplainer
 from nltk.tokenize import sent_tokenize
+from predictors import predict_for_explainanility
def explainer(text, model_type):
    """Attribute the positive-class score of a classifier to sentences of ``text``.

    A ``LimeTextExplainer`` is configured with ``sent_tokenize`` as its split
    expression, so the text is perturbed sentence by sentence. The perturbed
    texts are scored by ``predict_for_explainanility`` with the given
    ``model_type`` and 500 samples are drawn.

    Args:
        text: Raw input text to explain.
        model_type: Forwarded unchanged to ``predict_for_explainanility``.

    Returns:
        A ``(sentences_weights, exp)`` tuple. ``sentences_weights`` maps each
        distinct sentence to its LIME weight for label 1 (0 when LIME reports
        no weight for it); ``exp`` is the LIME explanation object.
    """

    def predictor_wrapper(texts):
        # LIME hands over a batch of perturbed texts; bind the model choice.
        return predict_for_explainanility(text=texts, model_type=model_type)

    class_names = ["negative", "positive"]
    explainer_ = LimeTextExplainer(
        class_names=class_names, split_expression=sent_tokenize
    )
    sentences = sent_tokenize(text)
    num_sentences = len(sentences)
    exp = explainer_.explain_instance(
        text, predictor_wrapper, num_features=num_sentences, num_samples=500
    )
    weights_mapping = exp.as_map()[1]

    # LimeTextExplainer defaults to bag-of-words mode, where a feature id
    # identifies a *distinct* token in order of first appearance. Index into
    # the de-duplicated sentence list so that repeated sentences do not shift
    # the weights onto the wrong sentence.
    distinct_sentences = list(dict.fromkeys(sentences))
    sentences_weights = {sentence: 0 for sentence in distinct_sentences}
    for idx, weight in weights_mapping:
        if 0 <= idx < len(distinct_sentences):
            sentences_weights[distinct_sentences[idx]] = weight
    print(sentences_weights, model_type)
    return sentences_weights, exp


def analyze_and_highlight(text, model_type):
    """Render ``text`` as HTML with each sentence shaded by its LIME weight.

    Sentences with a weight >= 0 are shaded from white towards red, sentences
    with a negative weight from white towards green; the intensity comes from
    min-max normalising the weights returned by ``explainer``.

    Args:
        text: Raw input text. ``None``, empty or whitespace-only input yields
            an empty string without running the explainer.
        model_type: Forwarded unchanged to ``explainer``.

    Returns:
        A string of ``<span>`` elements, one per sentence in original order
        (repeated sentences are kept), each followed by a single space.
    """
    import html  # stdlib; imported locally so the file's import block is untouched

    if not text or not text.strip():
        return ""
    sentences = sent_tokenize(text)
    if not sentences:
        return ""

    sentences_weights, _ = explainer(text, model_type)
    min_weight = min(sentences_weights.values())
    max_weight = max(sentences_weights.values())
    weight_span = max_weight - min_weight

    highlighted_parts = []
    for sentence in sentences:
        stripped = sentence.strip()
        if not stripped:
            continue
        weight = sentences_weights.get(sentence, 0)
        if weight_span:
            normalized_weight = (weight - min_weight) / weight_span
        else:
            # All weights are equal (e.g. single-sentence input): the lone
            # value is both min and max, so treat a non-zero weight as the
            # extreme of its colour and a zero weight as neutral.
            normalized_weight = 1.0 if weight > 0 else 0.0
        if weight >= 0:
            fade = int(round(255 * (1 - normalized_weight)))
            color = f"rgba(255, {fade}, {fade}, 1)"
        else:
            fade = int(round(255 * normalized_weight))
            color = f"rgba({fade}, 255, {fade}, 1)"
        # Escape the sentence: it is user-supplied text placed inside HTML.
        highlighted_parts.append(
            f'<span style="background-color: {color}; color: black;">'
            f"{html.escape(stripped)}</span> "
        )
    return "".join(highlighted_parts)

predictors.py CHANGED Viewed

@@ -50,9 +50,16 @@ tokenizers_1on1 = {}
 models_1on1 = {}
 for model_name, model in zip(mc_label_map, text_1on1_models):
     tokenizers_1on1[model_name] = AutoTokenizer.from_pretrained(model)
-    models_1on1[model_name] = (
-        AutoModelForSequenceClassification.from_pretrained(model).to(device)
-    )
 def split_text_allow_complete_sentences_nltk(
@@ -153,10 +160,30 @@ def predict_quillbot(text):
         return q_score
-def predict_proba_quillbot(text):
     with torch.no_grad():
-        tokenized_text = quillbot_tokenizer(text, return_tensors="pt", padding=True).to(device)
-        outputs = quillbot_model(**tokenized_text)
         tensor_logits = outputs[0]
         probas = F.softmax(tensor_logits).detach().cpu().numpy()
     return probas
@@ -196,9 +223,7 @@ def predict_mc_scores(input):
     bc_scores = []
     mc_scores = []
-    samples_len_bc = len(
-        split_text_allow_complete_sentences_nltk(input, type_det="bc")
-    )
     segments_bc = split_text_allow_complete_sentences_nltk(input, type_det="bc")
     for i in range(samples_len_bc):
         cleaned_text_bc = remove_special_characters(segments_bc[i])
@@ -209,9 +234,7 @@ def predict_mc_scores(input):
     bc_score_list = average_bc_scores.tolist()
     bc_score = {"AI": bc_score_list[1], "HUMAN": bc_score_list[0]}
     segments_mc = split_text_allow_complete_sentences_nltk(input, type_det="mc")
-    samples_len_mc = len(
-        split_text_allow_complete_sentences_nltk(input, type_det="mc")
-    )
     for i in range(samples_len_mc):
         cleaned_text_mc = remove_special_characters(segments_mc[i])
         mc_score = predict_mc(text_mc_model, text_mc_tokenizer, cleaned_text_mc)
@@ -234,9 +257,7 @@ def predict_mc_scores(input):
 def predict_bc_scores(input):
     bc_scores = []
-    samples_len_bc = len(
-        split_text_allow_complete_sentences_nltk(input, type_det="bc")
-    )
     segments_bc = split_text_allow_complete_sentences_nltk(input, type_det="bc")
     for i in range(samples_len_bc):
         cleaned_text_bc = remove_special_characters(segments_bc[i])
@@ -274,9 +295,7 @@ def predict_1on1_combined(input):
 def predict_1on1_single(input, model):
-    predictions = predict_1on1(
-        models_1on1[model], tokenizers_1on1[model], input
-    )[1]
     return predictions
@@ -288,9 +307,7 @@ def predict_1on1_scores(input, models):
     print(f"Models to Test: {models}")
     # BC SCORE
     bc_scores = []
-    samples_len_bc = len(
-        split_text_allow_complete_sentences_nltk(input, type_det="bc")
-    )
     segments_bc = split_text_allow_complete_sentences_nltk(input, type_det="bc")
     for i in range(samples_len_bc):
         cleaned_text_bc = remove_special_characters(segments_bc[i])
@@ -305,17 +322,13 @@ def predict_1on1_scores(input, models):
     if len(models) > 1:
         print("Starting MC")
         mc_scores = []
-        segments_mc = split_text_allow_complete_sentences_nltk(
-            input, type_det="mc"
-        )
         samples_len_mc = len(
             split_text_allow_complete_sentences_nltk(input, type_det="mc")
         )
         for i in range(samples_len_mc):
             cleaned_text_mc = remove_special_characters(segments_mc[i])
-            mc_score = predict_mc(
-                text_mc_model, text_mc_tokenizer, cleaned_text_mc
-            )
             mc_scores.append(mc_score)
         mc_scores_array = np.array(mc_scores)
         average_mc_scores = np.mean(mc_scores_array, axis=0)
@@ -325,9 +338,7 @@ def predict_1on1_scores(input, models):
             mc_score[label.upper()] = score
         mc_score = {
-            key: mc_score[key.upper()]
-            for key in models
-            if key.upper() in mc_score
         }
         total = sum(mc_score.values())
         # Normalize each value by dividing it by the total
@@ -342,9 +353,7 @@ def predict_1on1_scores(input, models):
     elif len(models) == 1:
         print("Starting 1on1")
         mc_scores = []
-        segments_mc = split_text_allow_complete_sentences_nltk(
-            input, type_det="mc"
-        )
         samples_len_mc = len(
             split_text_allow_complete_sentences_nltk(input, type_det="mc")
         )

 models_1on1 = {}
 for model_name, model in zip(mc_label_map, text_1on1_models):
     tokenizers_1on1[model_name] = AutoTokenizer.from_pretrained(model)
+    models_1on1[model_name] = AutoModelForSequenceClassification.from_pretrained(
+        model
+    ).to(device)
+# proxy models for explainability
+mini_model_name = "polygraf-ai/bc-model-bert-mini"
+bc_tokenizer_mini = AutoTokenizer.from_pretrained(mini_model_name)
+bc_model_mini = AutoModelForSequenceClassification.from_pretrained(mini_model_name).to(
+    device
+)
 def split_text_allow_complete_sentences_nltk(
         return q_score
def predict_for_explainanility(text, model_type=None):
    """Return class probabilities for ``text`` from the model chosen by ``model_type``.

    Args:
        text: A string or a list of strings. A bare string is treated as a
            batch of one so that it is never iterated character by character.
        model_type: ``"quillbot"`` selects ``quillbot_model`` /
            ``quillbot_tokenizer`` with ``max_length=256`` and no cleaning;
            ``"bc"`` selects ``bc_model_mini`` / ``bc_tokenizer_mini`` with
            ``max_length=512`` and runs ``remove_special_characters`` on every
            text first.

    Returns:
        A NumPy array holding the softmax over the last axis of the model's
        first output (taken to be the logits), one row per input text.

    Raises:
        ValueError: If ``model_type`` is neither ``"quillbot"`` nor ``"bc"``.
    """
    if model_type == "quillbot":
        cleaning, max_length = False, 256
        model, tokenizer = quillbot_model, quillbot_tokenizer
    elif model_type == "bc":
        cleaning, max_length = True, 512
        model, tokenizer = bc_model_mini, bc_tokenizer_mini
    else:
        raise ValueError("Invalid model type")

    if isinstance(text, str):
        text = [text]
    if cleaning:
        text = [remove_special_characters(t) for t in text]

    with torch.no_grad():
        tokenized_text = tokenizer(
            text,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=max_length,
        ).to(device)
        outputs = model(**tokenized_text)
        tensor_logits = outputs[0]
        # Explicit class axis: the implicit-dim form of softmax is deprecated.
        probas = F.softmax(tensor_logits, dim=-1).detach().cpu().numpy()
    return probas
     bc_scores = []
     mc_scores = []
+    samples_len_bc = len(split_text_allow_complete_sentences_nltk(input, type_det="bc"))
     segments_bc = split_text_allow_complete_sentences_nltk(input, type_det="bc")
     for i in range(samples_len_bc):
         cleaned_text_bc = remove_special_characters(segments_bc[i])
     bc_score_list = average_bc_scores.tolist()
     bc_score = {"AI": bc_score_list[1], "HUMAN": bc_score_list[0]}
     segments_mc = split_text_allow_complete_sentences_nltk(input, type_det="mc")
+    samples_len_mc = len(split_text_allow_complete_sentences_nltk(input, type_det="mc"))
     for i in range(samples_len_mc):
         cleaned_text_mc = remove_special_characters(segments_mc[i])
         mc_score = predict_mc(text_mc_model, text_mc_tokenizer, cleaned_text_mc)
 def predict_bc_scores(input):
     bc_scores = []
+    samples_len_bc = len(split_text_allow_complete_sentences_nltk(input, type_det="bc"))
     segments_bc = split_text_allow_complete_sentences_nltk(input, type_det="bc")
     for i in range(samples_len_bc):
         cleaned_text_bc = remove_special_characters(segments_bc[i])
 def predict_1on1_single(input, model):
     """Run ``predict_1on1`` with the model and tokenizer registered under ``model``.

     Looks up ``models_1on1[model]`` and ``tokenizers_1on1[model]``, applies
     them to ``input`` and returns the element at index 1 of the result.
     """
     result = predict_1on1(models_1on1[model], tokenizers_1on1[model], input)
     return result[1]
     print(f"Models to Test: {models}")
     # BC SCORE
     bc_scores = []
+    samples_len_bc = len(split_text_allow_complete_sentences_nltk(input, type_det="bc"))
     segments_bc = split_text_allow_complete_sentences_nltk(input, type_det="bc")
     for i in range(samples_len_bc):
         cleaned_text_bc = remove_special_characters(segments_bc[i])
     if len(models) > 1:
         print("Starting MC")
         mc_scores = []
+        segments_mc = split_text_allow_complete_sentences_nltk(input, type_det="mc")
         samples_len_mc = len(
             split_text_allow_complete_sentences_nltk(input, type_det="mc")
         )
         for i in range(samples_len_mc):
             cleaned_text_mc = remove_special_characters(segments_mc[i])
+            mc_score = predict_mc(text_mc_model, text_mc_tokenizer, cleaned_text_mc)
             mc_scores.append(mc_score)
         mc_scores_array = np.array(mc_scores)
         average_mc_scores = np.mean(mc_scores_array, axis=0)
             mc_score[label.upper()] = score
         mc_score = {
+            key: mc_score[key.upper()] for key in models if key.upper() in mc_score
         }
         total = sum(mc_score.values())
         # Normalize each value by dividing it by the total
     elif len(models) == 1:
         print("Starting 1on1")
         mc_scores = []
+        segments_mc = split_text_allow_complete_sentences_nltk(input, type_det="mc")
         samples_len_mc = len(
             split_text_allow_complete_sentences_nltk(input, type_det="mc")
         )