minko186 committed
Commit 9c75413
1 Parent(s): aeca56e

Added an option to choose the sentence-block size for source detection.

Files changed (4):
  1. analysis.py +27 -9
  2. app.py +11 -0
  3. isotonic_regression_model.joblib +0 -0
  4. plagiarism.py +22 -14
analysis.py CHANGED
@@ -62,7 +62,10 @@ def depth_analysis(input_text):
         "punctuation_diversity": (-0.21875, 0.53125),
         "type_token_ratio": (0.33002482852189063, 1.0894414982357028),
         "calculate_perplexity": (-25.110544681549072, 82.4620680809021),
-        "calculate_syntactic_tree_depth": (1.8380681818181812, 10.997159090909092),
+        "calculate_syntactic_tree_depth": (
+            1.8380681818181812,
+            10.997159090909092,
+        ),
         "hapax_legomena_ratio": (0.0830971690138207, 1.0302715687215778),
         "mtld": (-84.03125000000001, 248.81875000000002),
     }
@@ -72,14 +75,17 @@ def depth_analysis(input_text):
     determiner_use = determiners_frequency(input_text, nlp)
     punctuation_variety = punctuation_diversity(input_text)
     sentence_depth = calculate_syntactic_tree_depth(input_text, nlp)
-    perplexity = calculate_perplexity(input_text, gpt2_model, gpt2_tokenizer, device)
+    perplexity = calculate_perplexity(
+        input_text, gpt2_model, gpt2_tokenizer, device
+    )
     lexical_diversity = type_token_ratio(input_text)
     unique_words = hapax_legomena_ratio(input_text)
     vocabulary_stability = mtld(input_text)

     # normalize between 0 and 100
     vocabulary_level_norm = normalize(
-        vocabulary_level, *usual_ranges["estimated_slightly_difficult_words_ratio"]
+        vocabulary_level,
+        *usual_ranges["estimated_slightly_difficult_words_ratio"],
     )
     entity_ratio_norm = normalize(entity_ratio, *usual_ranges["entity_density"])
     determiner_use_norm = normalize(
@@ -91,12 +97,18 @@ def depth_analysis(input_text):
     lexical_diversity_norm = normalize(
         lexical_diversity, *usual_ranges["type_token_ratio"]
     )
-    unique_words_norm = normalize(unique_words, *usual_ranges["hapax_legomena_ratio"])
-    vocabulary_stability_norm = normalize(vocabulary_stability, *usual_ranges["mtld"])
+    unique_words_norm = normalize(
+        unique_words, *usual_ranges["hapax_legomena_ratio"]
+    )
+    vocabulary_stability_norm = normalize(
+        vocabulary_stability, *usual_ranges["mtld"]
+    )
     sentence_depth_norm = normalize(
         sentence_depth, *usual_ranges["calculate_syntactic_tree_depth"]
     )
-    perplexity_norm = normalize(perplexity, *usual_ranges["calculate_perplexity"])
+    perplexity_norm = normalize(
+        perplexity, *usual_ranges["calculate_perplexity"]
+    )

     features = {
         "Lexical Diversity": lexical_diversity_norm,
@@ -161,7 +173,8 @@ def depth_analysis(input_text):
             path=Path.unit_regular_polygon(num_vars),
         )
         spine.set_transform(
-            Affine2D().scale(0.5).translate(0.5, 0.5) + self.transAxes
+            Affine2D().scale(0.5).translate(0.5, 0.5)
+            + self.transAxes
         )
         return {"polar": spine}

@@ -172,14 +185,19 @@ def depth_analysis(input_text):
     theta = radar_factory(N, frame="polygon")
     data = features.values()
     labels = features.keys()
-    fig, ax = plt.subplots(subplot_kw=dict(projection="radar"), figsize=(7.5, 5))
+    fig, ax = plt.subplots(
+        subplot_kw=dict(projection="radar"), figsize=(7.5, 5)
+    )
     ax.plot(theta, data)
     ax.fill(theta, data, alpha=0.4)
     ax.set_varlabels(labels)

     rgrids = np.linspace(0, 100, num=6)
     ax.set_rgrids(
-        rgrids, labels=[f"{round(r)}%" for r in rgrids], fontsize=8, color="black"
+        rgrids,
+        labels=[f"{round(r)}%" for r in rgrids],
+        fontsize=8,
+        color="black",
     )
     ax.grid(True, color="black", linestyle="-", linewidth=0.5, alpha=0.5)

app.py CHANGED
@@ -46,6 +46,7 @@ def main(
     month_to,
     day_to,
     domains_to_skip,
+    source_block_size,
 ):

     # formatted_tokens = plagiarism_check(
@@ -69,6 +70,7 @@ def main(
         month_to,
         day_to,
         domains_to_skip,
+        source_block_size,
     )
     depth_analysis_plot = depth_analysis(input)
     bc_score = predict_bc_scores(input)
@@ -146,6 +148,13 @@ with gr.Blocks() as demo:
         plag_option = gr.Radio(
             ["Standard", "Advanced"], label="Choose an option please."
         )
+        with gr.Row():
+            source_block_size = gr.Dropdown(
+                choices=["1", "2", "3", "Paragraph"],
+                label="Source Check Granularity",
+                value="2",
+                interactive=True,
+            )

     with gr.Row():
         with gr.Column():
@@ -300,6 +309,7 @@ with gr.Blocks() as demo:
             month_to,
             day_to,
             domains_to_skip,
+            source_block_size,
         ],
         outputs=[
             bcLabel,
@@ -340,6 +350,7 @@ with gr.Blocks() as demo:
             month_to,
             day_to,
             domains_to_skip,
+            source_block_size,
         ],
         outputs=[
             sentenceBreakdown,
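
For orientation, a minimal sketch of how the new dropdown reaches the handler, assuming only what the hunks above show: main is reduced to a stub, and every widget except source_block_size is a hypothetical placeholder standing in for app.py's full inputs list.

import gradio as gr

def main(domains_to_skip, source_block_size):
    # Stub for app.py's main(); echoes the granularity so the round trip
    # from dropdown value to handler argument is visible.
    return f"block size = {source_block_size!r}, skipping = {domains_to_skip!r}"

with gr.Blocks() as demo:
    domains_to_skip = gr.Textbox(label="Domains to skip")  # placeholder input
    with gr.Row():
        source_block_size = gr.Dropdown(
            choices=["1", "2", "3", "Paragraph"],  # values arrive as strings
            label="Source Check Granularity",
            value="2",
            interactive=True,
        )
    result = gr.Textbox(label="Result")
    run = gr.Button("Run")
    run.click(fn=main, inputs=[domains_to_skip, source_block_size], outputs=[result])

# demo.launch()  # uncomment to serve locally

Note that gr.Dropdown delivers its value as a string, which is why split_sentence_blocks in plagiarism.py (diffed below) calls int(size) for the numeric choices.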
isotonic_regression_model.joblib CHANGED
Binary files a/isotonic_regression_model.joblib and b/isotonic_regression_model.joblib differ
 
plagiarism.py CHANGED
@@ -66,16 +66,21 @@ def get_cosine(vec1, vec2):
     return float(numerator) / denominator


-def split_sentence_blocks(text):
-    two_sents = []
-    for para in text.split("\n\n"):
-        sents = sent_tokenize(para)
-        for i in range(len(sents)):
-            if (i % 2) == 0:
-                two_sents.append(sents[i])
-            else:
-                two_sents[len(two_sents) - 1] += " " + sents[i]
-    return two_sents
+def split_sentence_blocks(text, size):
+    if size == "Paragraph":
+        blocks = text.split("\n")
+        return blocks
+    else:
+        blocks = []
+        size = int(size)
+        for para in text.split("\n\n"):
+            sents = sent_tokenize(para)
+            for i in range(len(sents)):
+                if (i % size) == 0:
+                    blocks.append(sents[i])
+                else:
+                    blocks[int(i / size)] += " " + sents[i]
+        return blocks


 def build_date(year=2024, month="March", day=1):
@@ -177,7 +182,7 @@ def google_search(
             if count >= 3:
                 break
             # skip user selected domains
-            if any(
+            if (domains_to_skip is not None) and any(
                 ("." + domain) in link["link"] for domain in domains_to_skip
             ):
                 continue
@@ -217,6 +222,7 @@ def plagiarism_check(
     month_to,
     day_to,
     domains_to_skip,
+    source_block_size,
 ):
     # api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
     # api_key = "AIzaSyCS1WQDMl1IMjaXtwSd_2rA195-Yc4psQE"
@@ -227,7 +233,7 @@ def plagiarism_check(

     url_scores = []
     sentence_scores = []
-    sentences = split_sentence_blocks(input)
+    sentences = split_sentence_blocks(input, source_block_size)
     url_count = {}
     score_array = []
     url_list = []
@@ -308,6 +314,7 @@ def html_highlight(
     month_to,
     day_to,
     domains_to_skip,
+    source_block_size,
 ):
     sentence_scores, url_scores = plagiarism_check(
         plag_option,
@@ -319,6 +326,7 @@ def html_highlight(
         month_to,
         day_to,
         domains_to_skip,
+        source_block_size,
     )

     html_content = "<link href='https://fonts.googleapis.com/css?family=Roboto' rel='stylesheet'>\n<div style='font-family: {font}; border: 2px solid black; background-color: #333333; padding: 10px; color: #FFFFFF;'>"
@@ -347,5 +355,5 @@ def html_highlight(
         html_content += formatted_url

     html_content += "</div>"
-
-    return html_content
+
+    return html_content
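
To make the granularity options concrete, here is a small self-contained sketch of the block-splitting step, with illustrative output. It mirrors the committed split_sentence_blocks with one deviation named explicitly: follow-on sentences are attached with blocks[-1] rather than blocks[int(i / size)], since the committed index is computed from a per-paragraph counter and stops lining up with the accumulated list after the first paragraph.

from nltk.tokenize import sent_tokenize  # same tokenizer plagiarism.py uses
# assumes NLTK's punkt data is installed: nltk.download("punkt")

def split_blocks(text, size):
    # "Paragraph" granularity: one block per newline-separated paragraph.
    if size == "Paragraph":
        return text.split("\n")
    blocks = []
    size = int(size)  # dropdown choices "1", "2", "3" arrive as strings
    for para in text.split("\n\n"):
        for i, sent in enumerate(sent_tokenize(para)):
            if i % size == 0:
                blocks.append(sent)       # start a new block every `size` sentences
            else:
                blocks[-1] += " " + sent  # attach to the block just opened
    return blocks

print(split_blocks("One. Two. Three. Four. Five.", "2"))
# ['One. Two.', 'Three. Four.', 'Five.']
print(split_blocks("First paragraph.\nSecond paragraph.", "Paragraph"))
# ['First paragraph.', 'Second paragraph.']

plagiarism_check then scores each returned block through the same path it previously used for fixed two-sentence blocks, so the dropdown trades highlighting granularity against the number of blocks checked.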