minko186 committed
Commit c0a6bc9
1 Parent(s): e3b9187

change highlight from gradio to html

Files changed (2):
  1. app.py +34 -14
  2. plagiarism.py +135 -58
app.py CHANGED
@@ -4,7 +4,7 @@ from datetime import date
 from predictors import predict_bc_scores, predict_mc_scores, predict_1on1_scores
 from analysis import depth_analysis
 from predictors import predict_quillbot
-from plagiarism import plagiarism_check, build_date
+from plagiarism import plagiarism_check, build_date, html_highlight
 from highlighter import analyze_and_highlight
 from utils import extract_text_from_pdf, len_validator
 import yaml
@@ -20,7 +20,9 @@ model_list = params["MC_OUTPUT_LABELS"]
 
 
 analyze_and_highlight_bc = partial(analyze_and_highlight, model_type="bc")
-analyze_and_highlight_quillbot = partial(analyze_and_highlight, model_type="quillbot")
+analyze_and_highlight_quillbot = partial(
+    analyze_and_highlight, model_type="quillbot"
+)
 
 
 def ai_generated_test(option, input, models):
@@ -46,7 +48,18 @@ def main(
     domains_to_skip,
 ):
 
-    formatted_tokens = plagiarism_check(
+    # formatted_tokens = plagiarism_check(
+    #     plag_option,
+    #     input,
+    #     year_from,
+    #     month_from,
+    #     day_from,
+    #     year_to,
+    #     month_to,
+    #     day_to,
+    #     domains_to_skip,
+    # )
+    formatted_tokens = html_highlight(
         plag_option,
         input,
         year_from,
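Editor's note: the practical effect of this hunk is a change in the shape of formatted_tokens. A minimal illustration with hypothetical values (the sentences and scores are invented; the two formats come from the tuple-based code commented out above and from html_highlight in plagiarism.py below):

# Hypothetical values; only the two shapes are taken from the commit.

# Before: a list of (text, label) tuples for gr.HighlightedText.
formatted_tokens = [
    ("The quick brown fox jumps over the lazy dog.", "[1]"),
    ("Colorless green ideas sleep furiously.", "[2]"),
]

# After: a single pre-rendered HTML string for gr.HTML.
formatted_tokens = (
    "<div style='font-family: Roboto; border: 2px solid black; "
    "background-color: #333333; padding: 10px; color: #FFFFFF;'>"
    '<p style="background-color: #e06b63; padding: 5px;">'
    "The quick brown fox jumps over the lazy dog. [1]</p>"
    "</div>"
)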
@@ -211,15 +224,19 @@ with gr.Blocks() as demo:
 
     with gr.Row():
         with gr.Column():
-            sentenceBreakdown = gr.HighlightedText(
+            # sentenceBreakdown = gr.HighlightedText(
+            #     label="Source Detection Sentence Breakdown",
+            #     combine_adjacent=True,
+            #     color_map={
+            #         "[1]": "red",
+            #         "[2]": "orange",
+            #         "[3]": "yellow",
+            #         "[4]": "green",
+            #     },
+            # )
+            sentenceBreakdown = gr.HTML(
                 label="Source Detection Sentence Breakdown",
-                combine_adjacent=True,
-                color_map={
-                    "[1]": "red",
-                    "[2]": "orange",
-                    "[3]": "yellow",
-                    "[4]": "green",
-                },
+                value="Source Detection Sentence Breakdown",
             )
 
     with gr.Row():
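Editor's note: for readers unfamiliar with the two components, a standalone sketch of the swap (a hypothetical demo, assuming a gradio version close to what this Space uses; not code from the commit):

import gradio as gr

with gr.Blocks() as sketch:
    # gr.HighlightedText styles (text, label) tuples itself via color_map.
    # gr.HTML renders whatever markup the backend hands it, so styling
    # moves out of the component and into the generated HTML string.
    gr.HTML(
        value="<p style='background-color: #e06b63; padding: 5px;'>"
        "A flagged sentence. [1]</p>"
    )

sketch.launch()

The trade-off is flexibility over integration: gr.HTML gives the app full control of colors and layout, at the cost of the built-in legend and the combine_adjacent behavior that HighlightedText provided.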
@@ -268,7 +285,8 @@ with gr.Blocks() as demo:
     )
 
     only_plagiarism_btn.click(
-        fn=plagiarism_check,
+        # fn=plagiarism_check,
+        fn=html_highlight,
         inputs=[
             plag_option,
             input_text,
@@ -311,5 +329,7 @@ with gr.Blocks() as demo:
         date_to = ""
 
 
-if __name__ == "__main__":
-    demo.launch(share=True, server_name="0.0.0.0", auth=("polygraf-admin", "test@aisd"))
+if __name__ == "__main__":
+    demo.launch(
+        share=True, server_name="0.0.0.0", auth=("polygraf-admin", "test@aisd")
+    )
plagiarism.py CHANGED
@@ -20,6 +20,7 @@ model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
 # input: two vectors
 # output: integer between 0 and 1.
 
+
 def get_cosine(vec1, vec2):
     intersection = set(vec1.keys()) & set(vec2.keys())
 
@@ -75,9 +76,9 @@ def sentence_similarity(text1, text2):
 def google_search(
     plag_option,
     sentences,
-    urlCount,
-    scoreArray,
-    urlList,
+    url_count,
+    score_array,
+    url_list,
     sorted_date,
     domains_to_skip,
     api_key,
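Editor's note: the three renamed parameters are accumulators threaded through the search loop. A hypothetical snapshot of their state after a few results, to make the shapes concrete (URLs and numbers are invented; the structure follows the append logic in the next hunk):

sentences = ["Sentence one.", "Sentence two.", "Sentence three."]

# One entry per distinct URL seen in the search results.
url_list = ["https://example.com/a", "https://example.org/b"]

# One row per URL, one column per sentence, as created by
# score_array.append([0] * len(sentences)); cells are filled in with
# cosineSim or sentence_similarity values as snippets come back.
score_array = [
    [0.82, 0.10, 0.00],  # example.com/a against each sentence
    [0.05, 0.67, 0.71],  # example.org/b against each sentence
]

# How many result snippets pointed at each URL.
url_count = {"https://example.com/a": 1, "https://example.org/b": 2}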
@@ -112,19 +113,19 @@
 
             # update cosine similarity between snippet and given text
             url = link["link"]
-            if url not in urlList:
-                urlList.append(url)
-                scoreArray.append([0] * len(sentences))
-            urlCount[url] = urlCount[url] + 1 if url in urlCount else 1
+            if url not in url_list:
+                url_list.append(url)
+                score_array.append([0] * len(sentences))
+            url_count[url] = url_count[url] + 1 if url in url_count else 1
             if plag_option == "Standard":
-                scoreArray[urlList.index(url)][i] = cosineSim(
+                score_array[url_list.index(url)][i] = cosineSim(
                     sentence, snippet
                 )
             else:
-                scoreArray[urlList.index(url)][i] = sentence_similarity(
+                score_array[url_list.index(url)][i] = sentence_similarity(
                     sentence, snippet
                 )
-    return urlCount, scoreArray
+    return url_count, score_array
 
 
 def split_sentence_blocks(text):
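Editor's note: the "Standard" branch scores a snippet with cosineSim, which get_cosine (whose body sits mostly outside these hunks) evidently computes over term-count vectors; the "integer between 0 and 1" comment above it presumably means a float in [0, 1]. A self-contained sketch of that kind of cosine, sharing the one visible intersection line (the rest is the standard recipe, not code from this repo):

import math
from collections import Counter

def cosine_from_counts(vec1, vec2):
    # Same first step as get_cosine in the diff: shared terms only.
    intersection = set(vec1.keys()) & set(vec2.keys())
    numerator = sum(vec1[x] * vec2[x] for x in intersection)
    denominator = math.sqrt(sum(v * v for v in vec1.values())) * math.sqrt(
        sum(v * v for v in vec2.values())
    )
    return numerator / denominator if denominator else 0.0

vec1 = Counter("a b a".split())  # {'a': 2, 'b': 1}
vec2 = Counter("a b c".split())  # {'a': 1, 'b': 1, 'c': 1}
print(cosine_from_counts(vec1, vec2))  # 3 / sqrt(5 * 3) ~= 0.775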
@@ -191,7 +192,6 @@ async def parallel_scrap(urls):
     return results
 
 
-
 def matching_score(sentence_content_tuple):
     sentence, content = sentence_content_tuple
     if sentence in content:
@@ -204,11 +204,65 @@ def matching_score(sentence_content_tuple):
     matched = [x for x in ngrams if " ".join(x) in content]
     return len(matched) / len(ngrams)
 
+
 def process_with_multiprocessing(input_data):
-    with Pool(processes=4) as pool:
+    with Pool(processes=1) as pool:
         scores = pool.map(matching_score, input_data)
     return scores
-
+
+
+def print2d(array):
+    for row in array:
+        print(row)
+
+
+def html_highlight(
+    plag_option,
+    input,
+    year_from,
+    month_from,
+    day_from,
+    year_to,
+    month_to,
+    day_to,
+    domains_to_skip,
+):
+    sentence_scores, url_scores = plagiarism_check(
+        plag_option,
+        input,
+        year_from,
+        month_from,
+        day_from,
+        year_to,
+        month_to,
+        day_to,
+        domains_to_skip,
+    )
+    color_map = [
+        "#e06b63",
+        "#eb9d59",
+        "#c2ad36",
+        "#e1ed72",
+        "#c2db76",
+        "#a2db76",
+    ]
+    html_content = "<div style='font-family: Roboto; border: 2px solid black; background-color: #333333; padding: 10px; color: #FFFFFF;'>"
+    for sentence, _, _, idx in sentence_scores:
+        color = color_map[idx - 1]
+        formatted_sentence = f'<p style="background-color: {color}; padding: 5px;">{sentence} [{idx}]</p>'
+        html_content += formatted_sentence
+
+    html_content += "<hr>"
+    for url, score, idx in url_scores:
+        color = color_map[idx - 1]
+        formatted_name = f'<p style="background-color: {color}; padding: 5px;">({idx}) {url} --- Matching Score:{score}</p>'
+        html_content += formatted_name
+
+    html_content += "</div>"
+
+    return html_content
+
+
 def plagiarism_check(
     plag_option,
     input,
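Editor's note: two things happen in this hunk. The worker pool shrinks from four processes to one, which keeps the Pool.map interface but effectively serializes the scoring (a common workaround when multiprocessing misbehaves in a hosted Space), and html_highlight is introduced as a wrapper that turns plagiarism_check's new return values into markup. A runnable sketch of the scoring path; the word-trigram choice and the 1.0 shortcut value are assumptions, since the hunk only shows matching_score's exact-match test and final ratio:

from multiprocessing import Pool

def matching_score_sketch(pair):
    sentence, content = pair
    if sentence in content:  # exact-match shortcut visible in the diff
        return 1.0
    words = sentence.split()
    # Word trigrams; the n-gram size is assumed, not shown in the hunk.
    ngrams = [tuple(words[i:i + 3]) for i in range(len(words) - 2)]
    if not ngrams:
        return 0.0
    matched = [x for x in ngrams if " ".join(x) in content]
    return len(matched) / len(ngrams)

if __name__ == "__main__":
    pairs = [("the cat sat on the mat", "yesterday the cat sat on a mat")]
    with Pool(processes=1) as pool:  # serial, as in the commit
        print(pool.map(matching_score_sketch, pairs))  # [0.5]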
@@ -227,41 +281,44 @@ def plagiarism_check(
     api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
     cse_id = "851813e81162b4ed4"
 
+    url_scores = []
+    sentence_scores = []
+    # for input in input.split("\n\n"):
+    print(input)
     sentences = split_sentence_blocks(input)
-    urlCount = {}
-    ScoreArray = []
-    urlList = []
+    url_count = {}
+    score_array = []
+    url_list = []
     date_from = build_date(year_from, month_from, day_from)
     date_to = build_date(year_to, month_to, day_to)
     sort_date = f"date:r:{date_from}:{date_to}"
     # get list of URLS to check
-    urlCount, ScoreArray = google_search(
+    url_count, score_array = google_search(
         plag_option,
         sentences,
-        urlCount,
-        ScoreArray,
-        urlList,
+        url_count,
+        score_array,
+        url_list,
         sort_date,
         domains_to_skip,
         api_key,
         cse_id,
     )
-
     # Scrape URLs in list
     formatted_tokens = []
-    soups = asyncio.run(parallel_scrap(urlList))
-
+    soups = asyncio.run(parallel_scrap(url_list))
+
     # # Populate matching scores for scrapped pages
     # for i, soup in enumerate(soups):
     #     print(f"Analyzing {i+1} of {len(soups)} soups........................")
     #     if soup:
     #         page_content = soup.text
-
+
     #         for j, sent in enumerate(sentences):
     #             args_list = (sent, page_content)
    #             score = matching_score(args_list)
     #             # score = cos_sim_torch(embed_text(sent), source_embeddings[i])
-    #             ScoreArray[i][j] = score
+    #             score_array[i][j] = score
 
     input_data = []
     for i, soup in enumerate(soups):
@@ -269,69 +326,89 @@ def plagiarism_check(
             page_content = soup.text
             for j, sent in enumerate(sentences):
                 input_data.append((sent, page_content))
-
     scores = process_with_multiprocessing(input_data)
-    k = 0
+
+    k = 0
+    # Update score array for each (soup, sentence)
     for i, soup in enumerate(soups):
         if soup:
             for j, _ in enumerate(sentences):
-                ScoreArray[i][j] = scores[k]
-                k += 1
-
+                score_array[i][j] = scores[k]
+                k += 1
+
+    # Map sentence with max URL with small margin to keep consider same URL
+    # for consecutive sentences
     sentenceToMaxURL = [-1] * len(sentences)
-
     for j in range(len(sentences)):
         if j > 0:
-            maxScore = ScoreArray[sentenceToMaxURL[j - 1]][j]
+            maxScore = score_array[sentenceToMaxURL[j - 1]][j]
             sentenceToMaxURL[j] = sentenceToMaxURL[j - 1]
         else:
            maxScore = -1
 
-        for i in range(len(ScoreArray)):
+        for i in range(len(score_array)):
             margin = (
-                0.1
+                0.05
                 if (j > 0 and sentenceToMaxURL[j] == sentenceToMaxURL[j - 1])
                 else 0
             )
-            if ScoreArray[i][j] - maxScore > margin:
-                maxScore = ScoreArray[i][j]
+            if score_array[i][j] - maxScore > margin:
+                maxScore = score_array[i][j]
                 sentenceToMaxURL[j] = i
+            # if score_array[i][j] > maxScore:
+            #     maxScore = score_array[i][j]
+            #     sentenceToMaxURL[j] = i
 
     index = np.unique(sentenceToMaxURL)
 
-    urlScore = {}
+    url_source = {}
     for url in index:
         s = [
-            ScoreArray[url][sen]
+            score_array[url][sen]
             for sen in range(len(sentences))
             if sentenceToMaxURL[sen] == url
         ]
-        urlScore[url] = sum(s) / len(s)
+        url_source[url] = sum(s) / len(s)
 
-    index_descending = sorted(urlScore, key=urlScore.get, reverse=True)
+    index_descending = sorted(url_source, key=url_source.get, reverse=True)
 
     urlMap = {}
     for count, i in enumerate(index_descending):
         urlMap[i] = count + 1
-
+
     for i, sent in enumerate(sentences):
-        formatted_tokens.append(
-            (sent, "[" + str(urlMap[sentenceToMaxURL[i]]) + "]")
-        )
-
-    formatted_tokens.append(("\n", None))
-    formatted_tokens.append(("\n", None))
-    formatted_tokens.append(("\n", None))
-
-    for ind in index_descending:
-        formatted_tokens.append(
-            (
-                urlList[ind]
-                + " --- Matching Score: "
-                + f"{str(round(urlScore[ind] * 100, 2))}%",
-                "[" + str(urlMap[ind]) + "]",
-            )
-        )
-    formatted_tokens.append(("\n", None))
+        ind = sentenceToMaxURL[i]
+        if url_source[ind] > 0.1:
+            sentence_scores.append(
+                [sent, url_source[ind], url_list[ind], urlMap[ind]]
+            )
+        else:
+            sentence_scores.append([sent, None, url_list[ind], urlMap[ind]])
+    for ind in index_descending:
+        url_scores.append(
+            [url_list[ind], round(url_source[ind] * 100, 2), urlMap[ind]]
+        )
 
-    return formatted_tokens
+    return sentence_scores, url_scores
+
+    # for i, sent in enumerate(sentences):
+    #     formatted_tokens.append(
+    #         (sent, "[" + str(urlMap[sentenceToMaxURL[i]]) + "]")
+    #     )
+
+    # formatted_tokens.append(("\n", None))
+    # formatted_tokens.append(("\n", None))
+    # formatted_tokens.append(("\n", None))
+
+    # for ind in index_descending:
+    #     formatted_tokens.append(
+    #         (
+    #             url_list[ind]
+    #             + " --- Matching Score: "
+    #             + f"{str(round(url_source[ind] * 100, 2))}%",
+    #             "[" + str(urlMap[ind]) + "]",
+    #         )
+    #     )
+    #     formatted_tokens.append(("\n", None))
+
+    # return formatted_tokens
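Editor's note: the sentence-to-source assignment above deserves a worked example, since the margin change (0.1 to 0.05) is easy to miss. A sentence inherits its predecessor's URL unless some other URL beats the inherited score by more than the margin, which keeps runs of consecutive sentences attributed to the same source (values hypothetical):

# score_array rows are URLs, columns are sentences.
score_array = [
    [0.60, 0.58],  # URL 0
    [0.10, 0.61],  # URL 1
]
# Sentence 0: no predecessor, so the best raw score wins -> URL 0 (0.60).
# Sentence 1: inherits URL 0 at 0.58; URL 1 scores 0.61, but
# 0.61 - 0.58 = 0.03 <= 0.05, so sentence 1 stays on URL 0. The smaller
# margin simply makes it easier for a genuinely better source to break
# such a streak than the old 0.1 did.

plagiarism_check then averages each source's per-sentence scores into url_source, ranks sources by that average, and returns sentence_scores entries of the form [sentence, score or None when the average is at most 0.1, url, rank] alongside url_scores entries of [url, percentage, rank], which html_highlight renders into the HTML shown earlier.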