eljanmahammadli committed
Commit d176253 · 1 Parent(s): 42d5442

implemented depth analysis

Files changed (2)
  1. app.py +115 -1
  2. writing_analysis.py +3 -97
app.py CHANGED

@@ -13,7 +13,22 @@ from scipy.special import softmax
 from evaluate import load
 from datetime import date
 import nltk
-import os
+
+from transformers import GPT2LMHeadModel, GPT2TokenizerFast
+import nltk, spacy, subprocess, torch
+import plotly.graph_objects as go
+from writing_analysis import (
+    normalize,
+    preprocess_text1,
+    preprocess_text2,
+    vocabulary_richness_ttr,
+    calculate_gunning_fog,
+    calculate_average_sentence_length,
+    calculate_average_word_length,
+    calculate_syntactic_tree_depth,
+    calculate_perplexity,
+
+)
 
 np.set_printoptions(suppress=True)
 
@@ -240,6 +255,90 @@ def build_date(year, month, day):
     return f"{year}{months[month]}{day}"
 
 
+# DEPTH ANALYSIS
+print("loading depth analysis")
+nltk.download('stopwords')
+nltk.download('punkt')
+nlp = spacy.load("en_core_web_sm")
+command = ['python', '-m', 'spacy', 'download', 'en_core_web_sm']
+
+# Execute the command
+subprocess.run(command)
+
+# for perplexity
+device = "cuda" if torch.cuda.is_available() else "cpu"
+model_id = "gpt2"
+gpt2_model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
+gpt2_tokenizer = GPT2TokenizerFast.from_pretrained(model_id)
+
+def depth_analysis(input_text):
+
+    # vocanulary richness
+    processed_words = preprocess_text1(input_text)
+    ttr_value = vocabulary_richness_ttr(processed_words)
+
+    # readability
+    gunning_fog = calculate_gunning_fog(input_text)
+    gunning_fog_norm = normalize(gunning_fog, min_value=0, max_value=20)
+
+    # average sentence length and average word length
+    words, sentences = preprocess_text2(input_text)
+    average_sentence_length = calculate_average_sentence_length(sentences)
+    average_word_length = calculate_average_word_length(words)
+    average_sentence_length_norm = normalize(average_sentence_length, min_value=0, max_value=40)
+    average_word_length_norm = normalize(average_word_length, min_value=0, max_value=8)
+
+    # syntactic_tree_depth
+    average_tree_depth = calculate_syntactic_tree_depth(nlp, input_text)
+    average_tree_depth_norm = normalize(average_tree_depth, min_value=0, max_value=10)
+
+    # perplexity
+    perplexity = calculate_perplexity(input_text, gpt2_model, gpt2_tokenizer, device)
+    perplexity_norm = normalize(perplexity, min_value=0, max_value=30)
+
+    features = {
+        "readability": gunning_fog_norm,
+        "syntactic tree depth": average_tree_depth_norm,
+        "vocabulary richness": ttr_value,
+        "perplexity": perplexity_norm,
+        "average sentence length": average_sentence_length_norm,
+        "average word length": average_word_length_norm,
+    }
+
+    print(features)
+
+    fig = go.Figure()
+
+    fig.add_trace(go.Scatterpolar(
+        r=list(features.values()),
+        theta=list(features.keys()),
+        fill='toself',
+        name='Radar Plot'
+    ))
+
+    fig.update_layout(
+        polar=dict(
+            radialaxis=dict(
+                visible=True,
+                range=[0, 100],
+            )),
+        showlegend=False,
+        # autosize=False,
+        # width=600,
+        # height=600,
+        margin=dict(
+            l=10,
+            r=20,
+            b=10,
+            t=10,
+            # pad=100
+        ),
+    )
+
+    return fig
+
+
+
 # START OF GRADIO
 
 title = "Copyright Checker"
@@ -281,6 +380,8 @@ with gr.Blocks() as demo:
             only_plagiarism_btn = gr.Button("Plagiarism Check")
         with gr.Column():
            submit_btn = gr.Button("Full Check")
+        with gr.Column():
+            depth_analysis_btn = gr.Button("Depth Analysis")
     gr.Markdown(
         """
         ## Output
@@ -341,6 +442,12 @@ with gr.Blocks() as demo:
         },
     )
 
+    with gr.Row():
+        with gr.Column():
+            writing_analysis_plot = gr.Plot(
+                label="Radar Plot"
+            )
+
     submit_btn.click(
         fn=main,
         inputs=[
@@ -390,6 +497,13 @@ with gr.Blocks() as demo:
         api_name="plagiarism_check",
     )
 
+    depth_analysis_btn.click(
+        fn=depth_analysis,
+        inputs=[input_text],
+        outputs=[writing_analysis_plot],
+        api_name="depth_analysis",
+    )
+
     date_from = ""
     date_to = ""
 
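The new depth_analysis function takes only the raw text and returns a Plotly figure, so it can also be exercised outside the Gradio UI. A minimal sketch, assuming the module-level objects defined above in app.py (nlp, gpt2_model, gpt2_tokenizer and the imported writing_analysis helpers) are already loaded in the same session; the sample string is arbitrary:

    # Hypothetical smoke test for the new depth_analysis() entry point.
    sample_text = (
        "The quick brown fox jumps over the lazy dog. "
        "This paragraph exists only to exercise the six writing-depth features."
    )

    fig = depth_analysis(sample_text)   # plotly.graph_objects.Figure with one Scatterpolar trace
    fig.show()                          # renders the radar plot outside of Gradio

This mirrors what the "Depth Analysis" button does: Gradio passes input_text to depth_analysis and displays the returned figure in the writing_analysis_plot component.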
writing_analysis.py CHANGED

@@ -1,26 +1,10 @@
-import re, nltk, spacy, textstat, subprocess
+import re, textstat
 from nltk import FreqDist
 from nltk.corpus import stopwords
 from nltk.tokenize import word_tokenize, sent_tokenize
-from transformers import GPT2LMHeadModel, GPT2TokenizerFast
 import torch
 from tqdm import tqdm
-import gradio as gr
-import plotly.graph_objects as go
 
-nltk.download('stopwords')
-nltk.download('punkt')
-nlp = spacy.load("en_core_web_sm")
-command = ['python', '-m', 'spacy', 'download', 'en_core_web_sm']
-
-# Execute the command
-subprocess.run(command)
-
-# for perplexity
-device = "cuda" if torch.cuda.is_available() else "cpu"
-model_id = "gpt2"
-model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
-tokenizer = GPT2TokenizerFast.from_pretrained(model_id)
 
 def normalize(value, min_value, max_value):
     normalized_value = ((value - min_value) * 100) / (max_value - min_value)
@@ -79,7 +63,7 @@ def calculate_average_word_length(words):
 def calculate_max_depth(sent):
     return max(len(list(token.ancestors)) for token in sent)
 
-def calculate_syntactic_tree_depth(text):
+def calculate_syntactic_tree_depth(nlp, text):
     """0-10 based on the histogram"""
     doc = nlp(text)
     sentence_depths = [calculate_max_depth(sent) for sent in doc.sents]
@@ -87,7 +71,7 @@ def calculate_syntactic_tree_depth(text):
     return average_depth
 
 # reference: https://huggingface.co/docs/transformers/perplexity
-def calculate_perplexity(text, stride=512):
+def calculate_perplexity(text, model, tokenizer, device, stride=512):
     """range 0-30 based on the histogram"""
     encodings = tokenizer(text, return_tensors="pt")
     max_length = model.config.n_positions
@@ -114,81 +98,3 @@ def calculate_perplexity(text, stride=512):
 
     ppl = torch.exp(torch.stack(nlls).mean())
     return ppl.item()
-
-
-def radar_plot(input_text):
-
-    # vocanulary richness
-    processed_words = preprocess_text1(input_text)
-    ttr_value = vocabulary_richness_ttr(processed_words)
-
-    # readability
-    gunning_fog = calculate_gunning_fog(input_text)
-    gunning_fog_norm = normalize(gunning_fog, min_value=0, max_value=20)
-
-    # average sentence length and average word length
-    words, sentences = preprocess_text2(input_text)
-    average_sentence_length = calculate_average_sentence_length(sentences)
-    average_word_length = calculate_average_word_length(words)
-    average_sentence_length_norm = normalize(average_sentence_length, min_value=0, max_value=40)
-    average_word_length_norm = normalize(average_word_length, min_value=0, max_value=8)
-
-    # syntactic_tree_depth
-    average_tree_depth = calculate_syntactic_tree_depth(input_text)
-    average_tree_depth_norm = normalize(average_tree_depth, min_value=0, max_value=10)
-
-    # perplexity
-    perplexity = calculate_perplexity(input_text)
-    perplexity_norm = normalize(perplexity, min_value=0, max_value=30)
-
-    features = {
-        "readability": gunning_fog_norm,
-        "syntactic tree depth": average_tree_depth_norm,
-        "vocabulary richness": ttr_value,
-        "perplexity": perplexity_norm,
-        "average sentence length": average_sentence_length_norm,
-        "average word length": average_word_length_norm,
-    }
-
-    print(features)
-
-    fig = go.Figure()
-
-    fig.add_trace(go.Scatterpolar(
-        r=list(features.values()),
-        theta=list(features.keys()),
-        fill='toself',
-        name='Radar Plot'
-    ))
-
-    fig.update_layout(
-        polar=dict(
-            radialaxis=dict(
-                visible=True,
-                range=[0, 100],
-            )),
-        showlegend=False,
-        # autosize=False,
-        # width=600,
-        # height=600,
-        margin=dict(
-            l=10,
-            r=20,
-            b=10,
-            t=10,
-            # pad=100
-        ),
-    )
-
-    return fig
-
-# Gradio Interface
-interface = gr.Interface(
-    fn=radar_plot,
-    inputs=gr.Textbox(label="Input text"),
-    outputs=gr.Plot(label="Radar Plot"),
-    title="Writing analysis",
-    description="Enter text for writing analysis",
-)
-
-interface.launch()
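Each radar axis is rescaled to roughly 0-100 by the normalize helper kept in writing_analysis.py, using the fixed ranges hard-coded in depth_analysis (Gunning Fog 0-20, average sentence length 0-40, average word length 0-8, tree depth 0-10, perplexity 0-30). Only the first line of normalize is visible in this diff, so any clamping that may follow is not shown; a small worked example of the visible linear rescaling, with illustrative input values:

    def normalize(value, min_value, max_value):
        # same rescaling shown in writing_analysis.py: map [min_value, max_value] onto [0, 100]
        return ((value - min_value) * 100) / (max_value - min_value)

    # Illustrative inputs, not taken from a real run:
    print(normalize(12.0, 0, 20))   # Gunning Fog 12          -> 60.0
    print(normalize(4.0, 0, 8))     # average word length 4.0 -> 50.0
    print(normalize(45.0, 0, 30))   # perplexity 45           -> 150.0 (past the radar's 0-100 axis)

Values that fall outside the assumed ranges overshoot the plot's fixed range=[0, 100] radial axis, which is worth keeping in mind for extreme texts.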