minko186 committed
Commit 9c75413
1 Parent(s): aeca56e

Added an option to choose the sentence-block size for source detection.

Files changed (4):
  1. analysis.py +27 -9
  2. app.py +11 -0
  3. isotonic_regression_model.joblib +0 -0
  4. plagiarism.py +22 -14
analysis.py CHANGED
@@ -62,7 +62,10 @@ def depth_analysis(input_text):
         "punctuation_diversity": (-0.21875, 0.53125),
         "type_token_ratio": (0.33002482852189063, 1.0894414982357028),
         "calculate_perplexity": (-25.110544681549072, 82.4620680809021),
-        "calculate_syntactic_tree_depth": (1.8380681818181812, 10.997159090909092),
+        "calculate_syntactic_tree_depth": (
+            1.8380681818181812,
+            10.997159090909092,
+        ),
         "hapax_legomena_ratio": (0.0830971690138207, 1.0302715687215778),
         "mtld": (-84.03125000000001, 248.81875000000002),
     }
@@ -72,14 +75,17 @@ def depth_analysis(input_text):
     determiner_use = determiners_frequency(input_text, nlp)
     punctuation_variety = punctuation_diversity(input_text)
     sentence_depth = calculate_syntactic_tree_depth(input_text, nlp)
-    perplexity = calculate_perplexity(input_text, gpt2_model, gpt2_tokenizer, device)
+    perplexity = calculate_perplexity(
+        input_text, gpt2_model, gpt2_tokenizer, device
+    )
     lexical_diversity = type_token_ratio(input_text)
     unique_words = hapax_legomena_ratio(input_text)
     vocabulary_stability = mtld(input_text)

     # normalize between 0 and 100
     vocabulary_level_norm = normalize(
-        vocabulary_level, *usual_ranges["estimated_slightly_difficult_words_ratio"]
+        vocabulary_level,
+        *usual_ranges["estimated_slightly_difficult_words_ratio"],
     )
     entity_ratio_norm = normalize(entity_ratio, *usual_ranges["entity_density"])
     determiner_use_norm = normalize(
@@ -91,12 +97,18 @@ def depth_analysis(input_text):
     lexical_diversity_norm = normalize(
         lexical_diversity, *usual_ranges["type_token_ratio"]
     )
-    unique_words_norm = normalize(unique_words, *usual_ranges["hapax_legomena_ratio"])
-    vocabulary_stability_norm = normalize(vocabulary_stability, *usual_ranges["mtld"])
+    unique_words_norm = normalize(
+        unique_words, *usual_ranges["hapax_legomena_ratio"]
+    )
+    vocabulary_stability_norm = normalize(
+        vocabulary_stability, *usual_ranges["mtld"]
+    )
     sentence_depth_norm = normalize(
         sentence_depth, *usual_ranges["calculate_syntactic_tree_depth"]
     )
-    perplexity_norm = normalize(perplexity, *usual_ranges["calculate_perplexity"])
+    perplexity_norm = normalize(
+        perplexity, *usual_ranges["calculate_perplexity"]
+    )

     features = {
         "Lexical Diversity": lexical_diversity_norm,
@@ -161,7 +173,8 @@ def depth_analysis(input_text):
             path=Path.unit_regular_polygon(num_vars),
         )
         spine.set_transform(
-            Affine2D().scale(0.5).translate(0.5, 0.5) + self.transAxes
+            Affine2D().scale(0.5).translate(0.5, 0.5)
+            + self.transAxes
         )
         return {"polar": spine}

@@ -172,14 +185,19 @@ def depth_analysis(input_text):
     theta = radar_factory(N, frame="polygon")
     data = features.values()
     labels = features.keys()
-    fig, ax = plt.subplots(subplot_kw=dict(projection="radar"), figsize=(7.5, 5))
+    fig, ax = plt.subplots(
+        subplot_kw=dict(projection="radar"), figsize=(7.5, 5)
+    )
     ax.plot(theta, data)
     ax.fill(theta, data, alpha=0.4)
     ax.set_varlabels(labels)

     rgrids = np.linspace(0, 100, num=6)
     ax.set_rgrids(
-        rgrids, labels=[f"{round(r)}%" for r in rgrids], fontsize=8, color="black"
+        rgrids,
+        labels=[f"{round(r)}%" for r in rgrids],
+        fontsize=8,
+        color="black",
     )
     ax.grid(True, color="black", linestyle="-", linewidth=0.5, alpha=0.5)

app.py CHANGED
@@ -46,6 +46,7 @@ def main(
     month_to,
     day_to,
     domains_to_skip,
+    source_block_size,
 ):

     # formatted_tokens = plagiarism_check(
@@ -69,6 +70,7 @@ def main(
         month_to,
         day_to,
         domains_to_skip,
+        source_block_size,
     )
     depth_analysis_plot = depth_analysis(input)
     bc_score = predict_bc_scores(input)
@@ -146,6 +148,13 @@ with gr.Blocks() as demo:
         plag_option = gr.Radio(
             ["Standard", "Advanced"], label="Choose an option please."
         )
+        with gr.Row():
+            source_block_size = gr.Dropdown(
+                choices=["1", "2", "3", "Paragraph"],
+                label="Source Check Granularity",
+                value="2",
+                interactive=True,
+            )

     with gr.Row():
         with gr.Column():
@@ -300,6 +309,7 @@ with gr.Blocks() as demo:
             month_to,
             day_to,
             domains_to_skip,
+            source_block_size,
         ],
         outputs=[
             bcLabel,
@@ -340,6 +350,7 @@ with gr.Blocks() as demo:
             month_to,
             day_to,
             domains_to_skip,
+            source_block_size,
         ],
         outputs=[
             sentenceBreakdown,
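
For orientation, a minimal sketch of how the new dropdown reaches the handler, assuming only what the hunks above show: main is reduced to a stub, and every widget except source_block_size is a hypothetical placeholder standing in for app.py's full inputs list.

import gradio as gr

def main(domains_to_skip, source_block_size):
    # Stub for app.py's main(); echoes the granularity so the round trip
    # from dropdown value to handler argument is visible.
    return f"block size = {source_block_size!r}, skipping = {domains_to_skip!r}"

with gr.Blocks() as demo:
    domains_to_skip = gr.Textbox(label="Domains to skip")  # placeholder input
    with gr.Row():
        source_block_size = gr.Dropdown(
            choices=["1", "2", "3", "Paragraph"],  # values arrive as strings
            label="Source Check Granularity",
            value="2",
            interactive=True,
        )
    result = gr.Textbox(label="Result")
    run = gr.Button("Run")
    run.click(fn=main, inputs=[domains_to_skip, source_block_size], outputs=[result])

# demo.launch()  # uncomment to serve locally

Note that gr.Dropdown delivers its value as a string, which is why split_sentence_blocks in plagiarism.py (diffed below) calls int(size) for the numeric choices.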
isotonic_regression_model.joblib CHANGED
Binary files a/isotonic_regression_model.joblib and b/isotonic_regression_model.joblib differ
 
plagiarism.py CHANGED
@@ -66,16 +66,21 @@ def get_cosine(vec1, vec2):
     return float(numerator) / denominator


-def split_sentence_blocks(text):
-    two_sents = []
-    for para in text.split("\n\n"):
-        sents = sent_tokenize(para)
-        for i in range(len(sents)):
-            if (i % 2) == 0:
-                two_sents.append(sents[i])
-            else:
-                two_sents[len(two_sents) - 1] += " " + sents[i]
-    return two_sents
+def split_sentence_blocks(text, size):
+    if size == "Paragraph":
+        blocks = text.split("\n")
+        return blocks
+    else:
+        blocks = []
+        size = int(size)
+        for para in text.split("\n\n"):
+            sents = sent_tokenize(para)
+            for i in range(len(sents)):
+                if (i % size) == 0:
+                    blocks.append(sents[i])
+                else:
+                    blocks[int(i / size)] += " " + sents[i]
+        return blocks


 def build_date(year=2024, month="March", day=1):
@@ -177,7 +182,7 @@ def google_search(
             if count >= 3:
                 break
             # skip user selected domains
-            if any(
+            if (domains_to_skip is not None) and any(
                 ("." + domain) in link["link"] for domain in domains_to_skip
             ):
                 continue
@@ -217,6 +222,7 @@ def plagiarism_check(
     month_to,
     day_to,
     domains_to_skip,
+    source_block_size,
 ):
     # api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
     # api_key = "AIzaSyCS1WQDMl1IMjaXtwSd_2rA195-Yc4psQE"
@@ -227,7 +233,7 @@ def plagiarism_check(

     url_scores = []
     sentence_scores = []
-    sentences = split_sentence_blocks(input)
+    sentences = split_sentence_blocks(input, source_block_size)
     url_count = {}
     score_array = []
     url_list = []
@@ -308,6 +314,7 @@ def html_highlight(
     month_to,
     day_to,
     domains_to_skip,
+    source_block_size,
 ):
     sentence_scores, url_scores = plagiarism_check(
         plag_option,
@@ -319,6 +326,7 @@ def html_highlight(
         month_to,
         day_to,
         domains_to_skip,
+        source_block_size,
     )

     html_content = "<link href='https://fonts.googleapis.com/css?family=Roboto' rel='stylesheet'>\n<div style='font-family: {font}; border: 2px solid black; background-color: #333333; padding: 10px; color: #FFFFFF;'>"
@@ -347,5 +355,5 @@ def html_highlight(
         html_content += formatted_url

     html_content += "</div>"
-
-    return html_content
+
+    return html_content
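
To make the granularity options concrete, here is a small self-contained sketch of the block-splitting step, with illustrative output. It mirrors the committed split_sentence_blocks with one deviation named explicitly: follow-on sentences are attached with blocks[-1] rather than blocks[int(i / size)], since the committed index is computed from a per-paragraph counter and stops lining up with the accumulated list after the first paragraph.

from nltk.tokenize import sent_tokenize  # same tokenizer plagiarism.py uses
# assumes NLTK's punkt data is installed: nltk.download("punkt")

def split_blocks(text, size):
    # "Paragraph" granularity: one block per newline-separated paragraph.
    if size == "Paragraph":
        return text.split("\n")
    blocks = []
    size = int(size)  # dropdown choices "1", "2", "3" arrive as strings
    for para in text.split("\n\n"):
        for i, sent in enumerate(sent_tokenize(para)):
            if i % size == 0:
                blocks.append(sent)       # start a new block every `size` sentences
            else:
                blocks[-1] += " " + sent  # attach to the block just opened
    return blocks

print(split_blocks("One. Two. Three. Four. Five.", "2"))
# ['One. Two.', 'Three. Four.', 'Five.']
print(split_blocks("First paragraph.\nSecond paragraph.", "Paragraph"))
# ['First paragraph.', 'Second paragraph.']

plagiarism_check then scores each returned block through the same path it previously used for fixed two-sentence blocks, so the dropdown trades highlighting granularity against the number of blocks checked.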