Spaces:

polygraf-ai
/

copyright_checker

Runtime error

App Files Files Community

eljanmahammadli committed on Feb 8, 2024

Commit

d176253

1 Parent(s): 42d5442

implemented depth analysis

Browse files

Files changed (2) hide show

app.py +115 -1
writing_analysis.py +3 -97

app.py CHANGED Viewed

@@ -13,7 +13,22 @@ from scipy.special import softmax
 from evaluate import load
 from datetime import date
 import nltk
-import os
 np.set_printoptions(suppress=True)
@@ -240,6 +255,90 @@ def build_date(year, month, day):
     return f"{year}{months[month]}{day}"
 # START OF GRADIO
 title = "Copyright Checker"
@@ -281,6 +380,8 @@ with gr.Blocks() as demo:
             only_plagiarism_btn = gr.Button("Plagiarism Check")
         with gr.Column():
             submit_btn = gr.Button("Full Check")
     gr.Markdown(
         """
         ## Output
@@ -341,6 +442,12 @@ with gr.Blocks() as demo:
                 },
             )
     submit_btn.click(
         fn=main,
         inputs=[
@@ -390,6 +497,13 @@ with gr.Blocks() as demo:
         api_name="plagiarism_check",
     )
     date_from = ""
     date_to = ""

 from evaluate import load
 from datetime import date
 import nltk
+from transformers import GPT2LMHeadModel, GPT2TokenizerFast
+import nltk, spacy, subprocess, torch
+import plotly.graph_objects as go
+from writing_analysis import (
+    normalize,
+    preprocess_text1,
+    preprocess_text2,
+    vocabulary_richness_ttr,
+    calculate_gunning_fog,
+    calculate_average_sentence_length,
+    calculate_average_word_length,
+    calculate_syntactic_tree_depth,
+    calculate_perplexity,
+    )
 np.set_printoptions(suppress=True)
     return f"{year}{months[month]}{day}"
+# DEPTH ANALYSIS
+print("loading depth analysis")
+nltk.download('stopwords')
+nltk.download('punkt')
+nlp = spacy.load("en_core_web_sm")
+command = ['python', '-m', 'spacy', 'download', 'en_core_web_sm']
+# Execute the command
+subprocess.run(command)
+# for perplexity
+device = "cuda" if torch.cuda.is_available() else "cpu"
+model_id = "gpt2"
+gpt2_model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
+gpt2_tokenizer = GPT2TokenizerFast.from_pretrained(model_id)
+def depth_analysis(input_text):
+    # vocabulary richness
+    processed_words = preprocess_text1(input_text)
+    ttr_value = vocabulary_richness_ttr(processed_words)
+    # readability
+    gunning_fog = calculate_gunning_fog(input_text)
+    gunning_fog_norm = normalize(gunning_fog, min_value=0, max_value=20)
+    # average sentence length and average word length
+    words, sentences = preprocess_text2(input_text)
+    average_sentence_length = calculate_average_sentence_length(sentences)
+    average_word_length = calculate_average_word_length(words)
+    average_sentence_length_norm = normalize(average_sentence_length, min_value=0, max_value=40)
+    average_word_length_norm = normalize(average_word_length, min_value=0, max_value=8)
+    # syntactic_tree_depth
+    average_tree_depth = calculate_syntactic_tree_depth(nlp, input_text)
+    average_tree_depth_norm = normalize(average_tree_depth, min_value=0, max_value=10)
+    # perplexity
+    perplexity = calculate_perplexity(input_text, gpt2_model, gpt2_tokenizer, device)
+    perplexity_norm = normalize(perplexity, min_value=0, max_value=30)
+    features = {
+        "readability": gunning_fog_norm,
+        "syntactic tree depth": average_tree_depth_norm,
+        "vocabulary richness": ttr_value,
+        "perplexity": perplexity_norm,
+        "average sentence length": average_sentence_length_norm,
+        "average word length": average_word_length_norm,
+    }
+    print(features)
+    fig = go.Figure()
+    fig.add_trace(go.Scatterpolar(
+        r=list(features.values()),
+        theta=list(features.keys()),
+        fill='toself',
+        name='Radar Plot'
+    ))
+    fig.update_layout(
+        polar=dict(
+            radialaxis=dict(
+                visible=True,
+                range=[0, 100],
+            )),
+        showlegend=False,
+        # autosize=False,
+        # width=600,
+        # height=600,
+        margin=dict(
+            l=10,
+            r=20,
+            b=10,
+            t=10,
+            # pad=100
+        ),
+    )
+    return fig
 # START OF GRADIO
 title = "Copyright Checker"
             only_plagiarism_btn = gr.Button("Plagiarism Check")
         with gr.Column():
             submit_btn = gr.Button("Full Check")
+        with gr.Column():
+            depth_analysis_btn = gr.Button("Depth Analysis")
     gr.Markdown(
         """
         ## Output
                 },
             )
+    with gr.Row():
+        with gr.Column():
+            writing_analysis_plot = gr.Plot(
+                label="Radar Plot"
+            )
     submit_btn.click(
         fn=main,
         inputs=[
         api_name="plagiarism_check",
     )
+    depth_analysis_btn.click(
+        fn=depth_analysis,
+        inputs=[input_text],
+        outputs=[writing_analysis_plot],
+        api_name="depth_analysis",
+    )
     date_from = ""
     date_to = ""

writing_analysis.py CHANGED Viewed

@@ -1,26 +1,10 @@
-import re, nltk, spacy, textstat, subprocess
 from nltk import FreqDist
 from nltk.corpus import stopwords
 from nltk.tokenize import word_tokenize, sent_tokenize
-from transformers import GPT2LMHeadModel, GPT2TokenizerFast
 import torch
 from tqdm import tqdm
-import gradio as gr
-import plotly.graph_objects as go
-nltk.download('stopwords')
-nltk.download('punkt')
-nlp = spacy.load("en_core_web_sm")
-command = ['python', '-m', 'spacy', 'download', 'en_core_web_sm']
-# Execute the command
-subprocess.run(command)
-# for perplexity
-device = "cuda" if torch.cuda.is_available() else "cpu"
-model_id = "gpt2"
-model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
-tokenizer = GPT2TokenizerFast.from_pretrained(model_id)
 def normalize(value, min_value, max_value):
     normalized_value = ((value - min_value) * 100) / (max_value - min_value)
@@ -79,7 +63,7 @@ def calculate_average_word_length(words):
 def calculate_max_depth(sent):
     return max(len(list(token.ancestors)) for token in sent)
-def calculate_syntactic_tree_depth(text):
     """0-10 based on the histogram"""
     doc = nlp(text)
     sentence_depths = [calculate_max_depth(sent) for sent in doc.sents]
@@ -87,7 +71,7 @@ def calculate_syntactic_tree_depth(text):
     return average_depth
 # reference: https://huggingface.co/docs/transformers/perplexity
-def calculate_perplexity(text, stride=512):
     """range 0-30 based on the histogram"""
     encodings = tokenizer(text, return_tensors="pt")
     max_length = model.config.n_positions
@@ -114,81 +98,3 @@ def calculate_perplexity(text, stride=512):
     ppl = torch.exp(torch.stack(nlls).mean())
     return ppl.item()
-def radar_plot(input_text):
-    # vocanulary richness
-    processed_words = preprocess_text1(input_text)
-    ttr_value = vocabulary_richness_ttr(processed_words)
-    # readability
-    gunning_fog = calculate_gunning_fog(input_text)
-    gunning_fog_norm = normalize(gunning_fog, min_value=0, max_value=20)
-    # average sentence length and average word length
-    words, sentences = preprocess_text2(input_text)
-    average_sentence_length = calculate_average_sentence_length(sentences)
-    average_word_length = calculate_average_word_length(words)
-    average_sentence_length_norm = normalize(average_sentence_length, min_value=0, max_value=40)
-    average_word_length_norm = normalize(average_word_length, min_value=0, max_value=8)
-    # syntactic_tree_depth
-    average_tree_depth = calculate_syntactic_tree_depth(input_text)
-    average_tree_depth_norm = normalize(average_tree_depth, min_value=0, max_value=10)
-    # perplexity
-    perplexity = calculate_perplexity(input_text)
-    perplexity_norm = normalize(perplexity, min_value=0, max_value=30)
-    features = {
-        "readability": gunning_fog_norm,
-        "syntactic tree depth": average_tree_depth_norm,
-        "vocabulary richness": ttr_value,
-        "perplexity": perplexity_norm,
-        "average sentence length": average_sentence_length_norm,
-        "average word length": average_word_length_norm,
-    }
-    print(features)
-    fig = go.Figure()
-    fig.add_trace(go.Scatterpolar(
-        r=list(features.values()),
-        theta=list(features.keys()),
-        fill='toself',
-        name='Radar Plot'
-    ))
-    fig.update_layout(
-        polar=dict(
-            radialaxis=dict(
-                visible=True,
-                range=[0, 100],
-            )),
-        showlegend=False,
-        # autosize=False,
-        # width=600,
-        # height=600,
-        margin=dict(
-            l=10,
-            r=20,
-            b=10,
-            t=10,
-            # pad=100
-        ),
-    )
-    return fig
-# Gradio Interface
-interface = gr.Interface(
-    fn=radar_plot,
-    inputs=gr.Textbox(label="Input text"),
-    outputs=gr.Plot(label="Radar Plot"),
-    title="Writing analysis",
-    description="Enter text for writing analysis",
-)
-interface.launch()

+import re, textstat
 from nltk import FreqDist
 from nltk.corpus import stopwords
 from nltk.tokenize import word_tokenize, sent_tokenize
 import torch
 from tqdm import tqdm
 def normalize(value, min_value, max_value):
     normalized_value = ((value - min_value) * 100) / (max_value - min_value)
 def calculate_max_depth(sent):
     return max(len(list(token.ancestors)) for token in sent)
+def calculate_syntactic_tree_depth(nlp, text):
     """0-10 based on the histogram"""
     doc = nlp(text)
     sentence_depths = [calculate_max_depth(sent) for sent in doc.sents]
     return average_depth
 # reference: https://huggingface.co/docs/transformers/perplexity
+def calculate_perplexity(text, model, tokenizer, device, stride=512):
     """range 0-30 based on the histogram"""
     encodings = tokenizer(text, return_tensors="pt")
     max_length = model.config.n_positions
     ppl = torch.exp(torch.stack(nlls).mean())
     return ppl.item()