aliasgerovs committed on
Commit 0eaca07
1 Parent(s): 173f4a0
Files changed (6)
  1. .env +1 -0
  2. README.md +1 -1
  3. app.py +35 -1
  4. const.py +97 -0
  5. nohup.out +0 -0
  6. plagiarism.py +197 -27
.env ADDED
@@ -0,0 +1 @@
+ASSEMBLYAI_API_KEY = 'f9d0fe8c23304ae193d694294b615dcc'
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: ©
 colorFrom: gray
 colorTo: pink
 sdk: gradio
-sdk_version: 4.17.0
+sdk_version: 4.36.0
 app_file: app.py
 pinned: false
 license: mit
app.py CHANGED
@@ -13,6 +13,9 @@ from functools import partial
 from audio import assemblyai_transcribe
 import yt_dlp
 import os
+import pandas as pd
+from const import plag_script
+from datasets import load_dataset, Dataset
 
 np.set_printoptions(suppress=True)
 
@@ -28,6 +31,21 @@ analyze_and_highlight_quillbot = partial(
 )
 
 
+
+def save_request(email, video_url):
+    # Save the email and video URL to the CSV file
+    if email is None or email == "":
+        return "Please enter your email.", gr.update(visible=True)
+    dat = load_dataset(requests_repo)["train"]
+    df = dat.to_pandas()
+    new_row = pd.DataFrame(
+        {"email": [email], "video_url": [video_url], "status": "pending"}
+    )
+    df = pd.concat([df, new_row], ignore_index=True)
+    dat = Dataset.from_pandas(df)
+    dat.push_to_hub(requests_repo)
+    return "Your request has been saved.", gr.update(visible=False)
+
 def ai_generated_test(option, bias_buster_selected, input):
     if bias_buster_selected:
         input = update(input)
@@ -118,6 +136,14 @@ with gr.Blocks() as demo:
     )
 
 
+    with gr.Column(visible=False) as request_row:
+        with gr.Row():
+            email_input = gr.Textbox(label="Email")
+            youtube_url_input = gr.Textbox(label="YouTube Video URL")
+        with gr.Row():
+            video_submit_btn = gr.Button("Submit Video Request")
+
+
     with gr.Row():
         url_input = gr.Textbox(
             label="Input Page URL to check", lines=1, placeholder="")
@@ -128,7 +154,15 @@ with gr.Blocks() as demo:
     audio_url_input.change(
         fn=assemblyai_transcribe, inputs=audio_url_input, outputs=input_text
    )
-
+
+
+    video_submit_btn.click(
+        fn=save_request,
+        inputs=[email_input, youtube_url_input],
+        outputs=[input_text, request_row],
+        api_name="video_request",
+    )
+
     char_count = gr.Textbox(label="Minumum Character Limit Check")
     input_text.change(fn=len_validator, inputs=input_text, outputs=char_count)
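The new save_request handler returns a pair (status message, gr.update(...)), and video_submit_btn.click routes that pair to input_text and request_row. requests_repo is referenced by save_request but is not defined anywhere in this diff, so it is presumably a Hugging Face dataset repo id set elsewhere in app.py. A minimal, self-contained sketch of the same click-handler pattern (component names illustrative, dataset writing omitted):

# Sketch (not part of the commit): how a callback returning
# (message, gr.update(...)) maps onto the two components in `outputs`.
import gradio as gr

def save_request_demo(email, video_url):
    if not email:
        # Keep the request form visible and ask for an email.
        return "Please enter your email.", gr.update(visible=True)
    # In app.py this is where the new row is pushed to the requests dataset.
    return "Your request has been saved.", gr.update(visible=False)

with gr.Blocks() as demo:
    with gr.Column(visible=True) as request_row:
        email_input = gr.Textbox(label="Email")
        youtube_url_input = gr.Textbox(label="YouTube Video URL")
        video_submit_btn = gr.Button("Submit Video Request")
    status = gr.Textbox(label="Status")
    video_submit_btn.click(
        fn=save_request_demo,
        inputs=[email_input, youtube_url_input],
        outputs=[status, request_row],
    )

if __name__ == "__main__":
    demo.launch()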
const.py ADDED
@@ -0,0 +1,97 @@
+url_types = {
+    "Student Publications": [
+        "studentpulse.com",
+        "undergraduateresearch.ucsd.edu",
+        "jmurj.jmu.edu",
+        "cur.org",
+        "urj.ucf.edu",
+        "ugresearch.umn.edu",
+        "undergraduateresearch.wustl.edu",
+        "ugresearch.ucla.edu",
+        "jur.byu.edu",
+        "undergradresearch.ncsu.edu",
+    ],
+    "Wikihost": [
+        "fandom.com",
+        "wikidot.com",
+        "wikia.org",
+        "wikispaces.com",
+        "gamepedia.com",
+        "wikibooks.org",
+        "wikiversity.org",
+        "wikitravel.org",
+        "wikinews.org",
+        "wiktionary.org",
+        "wikipedia.org",
+    ],
+    "Official News": [
+        "bbc.com",
+        "cnn.com",
+        "nytimes.com",
+        "reuters.com",
+        "theguardian.com",
+        "washingtonpost.com",
+        "foxnews.com",
+        "aljazeera.com",
+        "bloomberg.com",
+        "npr.org",
+    ],
+    "Online Learning": [
+        "coursera.org",
+        "edx.org",
+        "udacity.com",
+        "udemy.com",
+        "khanacademy.org",
+        "futurelearn.com",
+        "skillshare.com",
+        "linkedin.com/learning",
+        "pluralsight.com",
+        "codecademy.com",
+    ],
+    "Government Official": [
+        "usa.gov",
+        "gov.uk",
+        "europa.eu",
+        "canada.ca",
+        "australia.gov.au",
+        "india.gov.in",
+        "japan.go.jp",
+        "korea.go.kr",
+        "gov.sg",
+        "nz.govt.nz",
+        "defense.gov",
+        ".gov",
+    ],
+    "Publications": [
+        "scholar.google.com",
+        "pubmed.ncbi.nlm.nih.gov",
+        "researchgate.net",
+        "jstor.org",
+        "ieeexplore.ieee.org",
+        "sciencedirect.com",
+        "arxiv.org",
+        "link.springer.com",
+        "onlinelibrary.wiley.com",
+        "doaj.org",
+        "journals.plos.org/plosone",
+        "journals.sagepub.com",
+        "dl.acm.org",
+        "biorxiv.org",
+        "tandfonline.com",
+    ],
+}
+
+
+plag_script = """
+async () => {
+    globalThis.toggleDetails = (event) => {
+        event.preventDefault(); // Prevent the default link behavior
+        let detailsContainer = document.getElementById("detailsContainer");
+        if (detailsContainer.style.display === "none") {
+            detailsContainer.style.display = "block";
+        } else {
+            detailsContainer.style.display = "none";
+        }
+    }
+}
+"""
nohup.out CHANGED
The diff for this file is too large to render. See raw diff
 
plagiarism.py CHANGED
@@ -10,7 +10,8 @@ from bs4 import BeautifulSoup
 import numpy as np
 import concurrent
 from multiprocessing import Pool
-
+from const import url_types
+from collections import defaultdict
 
 WORD = re.compile(r"\w+")
 model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
@@ -73,10 +74,10 @@ def get_cosine(vec1, vec2):
 
 def split_sentence_blocks(text, size):
     if size == "Paragraph":
-        blocks = text.split("\n")
+        blocks = text.strip().split("\n")
         return blocks
     else:
-        sents = sent_tokenize(text)
+        sents = sent_tokenize(text.strip())
         return sents
 
 
@@ -115,12 +116,36 @@ async def parallel_scrap(urls):
     return results
 
 
+def merge_ngrams_into_sentence(ngrams):
+    if ngrams == None:
+        return ""
+    if len(ngrams) > 20:
+        ngrams = ngrams[:20]
+    merged_sentence = []
+    i = 0
+    for ngram in ngrams:
+        overlap = len(set(ngram) & set(merged_sentence[-len(ngram) :]))
+        if overlap == 0:
+            merged_sentence.extend(ngram)
+        elif overlap < len(ngram):
+            merged_sentence.extend(ngram[overlap:])
+    return " ".join(merged_sentence)
+
+
+def remove_ngrams_after(ngrams, target_ngram):
+    try:
+        index = ngrams.index(target_ngram)
+        return ngrams[: index + 1]
+    except ValueError:
+        return None
+
+
 def matching_score(sentence_content_tuple):
     sentence, content, score = sentence_content_tuple
     if sentence in content:
-        return 1
-    if score > 0.9:
-        return score
+        return 1, sentence
+    # if score > 0.9:
+    #     return score
     else:
         n = 5
 
@@ -132,12 +157,28 @@ def matching_score(sentence_content_tuple):
 
         ngrams_sentence = split_ngrams(sentence, n)
         if len(ngrams_sentence) == 0:
-            return 0
-        ngrams_content = set(tuple(ngram) for ngram in split_ngrams(content, n))
-        matched_count = sum(
-            1 for ngram in ngrams_sentence if tuple(ngram) in ngrams_content
+            return 0, ""
+        ngrams_content = [tuple(ngram) for ngram in split_ngrams(content, n)]
+        matched_content_ngrams = []
+        found = False
+        last_found = None
+        for ngram in ngrams_sentence:
+            for ngram_content in ngrams_content:
+                if tuple(ngram) == ngram_content:
+                    found = True
+                    last_found = ngram_content
+                if found:
+                    matched_content_ngrams.append(ngram_content)
+        matched_content_ngrams = remove_ngrams_after(
+            matched_content_ngrams, last_found
         )
-        return matched_count / len(ngrams_sentence)
+        matched_content = merge_ngrams_into_sentence(matched_content_ngrams)
+
+        matched_ngrams = [
+            1 for ngram in ngrams_sentence if tuple(ngram) in ngrams_content
+        ]
+        matched_count = sum(matched_ngrams)
+        return matched_count / len(ngrams_sentence), matched_content
 
 
 def process_with_multiprocessing(input_data):
@@ -166,12 +207,21 @@ def map_sentence_url(sentences, score_array):
     return sentenceToMaxURL
 
 
+def check_url_category(url):
+    for category, urls in url_types.items():
+        for u in urls:
+            if u in url:
+                return category
+    return "Internet Source"
+
+
 def google_search(
     plag_option,
     sentences,
     url_count,
     score_array,
     url_list,
+    snippets,
     sorted_date,
     domains_to_skip,
     api_key,
@@ -209,7 +259,9 @@ def google_search(
             if url not in url_list:
                 url_list.append(url)
                 score_array.append([0] * len(sentences))
+                snippets.append([""] * len(sentences))
             url_count[url] = url_count[url] + 1 if url in url_count else 1
+            snippets[url_list.index(url)][i] = snippet
             if plag_option == "Standard":
                 score_array[url_list.index(url)][i] = cosineSim(
                     sentence, snippet
@@ -234,21 +286,22 @@ def plagiarism_check(
     source_block_size,
 ):
     # api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
-    api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
+    # api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
     # api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
     # api_key = "AIzaSyCS1WQDMl1IMjaXtwSd_2rA195-Yc4psQE"
     # api_key = "AIzaSyCB61O70B8AC3l5Kk3KMoLb6DN37B7nqIk"
     # api_key = "AIzaSyCg1IbevcTAXAPYeYreps6wYWDbU0Kz8tg"
     # api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
+    api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
     cse_id = "851813e81162b4ed4"
 
     url_scores = []
     sentence_scores = []
     sentences = split_sentence_blocks(input, source_block_size)
-    print(sentences)
     url_count = {}
     score_array = []
     url_list = []
+    snippets = []
     date_from = build_date(year_from, month_from, day_from)
     date_to = build_date(year_to, month_to, day_to)
     sort_date = f"date:r:{date_from}:{date_to}"
@@ -259,6 +312,7 @@ def plagiarism_check(
         url_count,
         score_array,
         url_list,
+        snippets,
         sort_date,
         domains_to_skip,
         api_key,
@@ -273,13 +327,18 @@ def plagiarism_check(
             for j, sent in enumerate(sentences):
                 input_data.append((sent, page_content, score_array[i][j]))
     scores = process_with_multiprocessing(input_data)
+    matched_sentence_array = [
+        ["" for _ in range(len(score_array[0]))]
+        for _ in range(len(score_array))
+    ]
 
     k = 0
     # Update score array for each (soup, sentence)
    for i, soup in enumerate(soups):
         if soup:
             for j, _ in enumerate(sentences):
-                score_array[i][j] = scores[k]
+                score_array[i][j] = scores[k][0]
+                matched_sentence_array[i][j] = scores[k][1]
                 k += 1
 
     sentenceToMaxURL = map_sentence_url(sentences, score_array)
@@ -303,14 +362,35 @@ def plagiarism_check(
         ind = sentenceToMaxURL[i]
         if url_source[ind] > 0.1:
             sentence_scores.append(
-                [sent, url_source[ind], url_list[ind], urlMap[ind]]
+                [
+                    sent,
+                    round(url_source[ind] * 100, 2),
+                    url_list[ind],
+                    urlMap[ind],
+                ]
             )
         else:
             sentence_scores.append([sent, None, url_list[ind], -1])
+    print("SNIPPETS: ", snippets)
+    snippets = [[item for item in sublist if item] for sublist in snippets]
     for ind in index_descending:
-        if url_source[ind] > 0.1:
+        if url_source[ind] > 0.35:
+            matched_sentence_array = [
+                [item for item in sublist if item]
+                for sublist in matched_sentence_array
+            ]
+            matched_sentence = "...".join(
+                [sent for sent in matched_sentence_array[ind]]
+            )
+            if matched_sentence == "":
+                matched_sentence = "...".join([sent for sent in snippets[ind]])
             url_scores.append(
-                [url_list[ind], round(url_source[ind] * 100, 2), urlMap[ind]]
+                [
+                    url_list[ind],
+                    round(url_source[ind] * 100, 2),
+                    urlMap[ind],
+                    matched_sentence,
+                ]
             )
 
     return sentence_scores, url_scores
@@ -342,33 +422,123 @@ def html_highlight(
         source_block_size,
     )
 
-    html_content = "<link href='https://fonts.googleapis.com/css?family=Roboto' rel='stylesheet'>\n<div style='font-family: {font}; border: 2px solid black; padding: 10px; color: #FFFFFF;'>"
+    html_content = """
+    <link href='https://fonts.googleapis.com/css?family=Roboto' rel='stylesheet'>
+    <div style='font-family: {font}; border: 2px solid black; padding: 10px; color: #FFFFFF;'>
+    <html>
+    <head>
+    <title>Toggle Details</title>
+    <style>
+        .score-container {
+            display: flex;
+            justify-content: space-around;
+            align-items: left;
+            padding: 20px;
+        }
+        .score-item {
+            text-align: center;
+            padding: 10px;
+            background-color: #636362;
+            border-radius: 5px;
+            flex-grow: 1;
+            margin: 0 5px;
+        }
+        .details {
+            display: none;
+            padding: 10px;
+        }
+        .url-link {
+            font-size: 1.2em;
+        }
+        .url-link span {
+            margin-right: 10px;
+        }
+        .toggle-button {
+            color: #333;
+            border: none;
+            padding: 5px 10px;
+            text-align: center;
+            text-decoration: none;
+            display: inline-block;
+            cursor: pointer;
+        }
+    </style>
+    </head>
+    """
+
     prev_idx = None
     combined_sentence = ""
-    for sentence, _, _, idx in sentence_scores:
+    total_score = 0
+    total_count = 0
+    category_scores = defaultdict(list)
+    for sentence, score, url, idx in sentence_scores:
+        category = check_url_category(url)
+        if score is None:
+            total_score += 0
+        else:
+            total_score += score
+            total_count += 1
+            category_scores[category].append(score)
         if idx != prev_idx and prev_idx is not None:
             color = color_map[prev_idx - 1]
-            index_part = f'<span style="background-color: {color}; padding: 2px;">[{prev_idx}]</span>'
-            formatted_sentence = f"<p>{combined_sentence} {index_part}</p>"
+            index_part = f"<span>[{prev_idx}]</span>"
+            formatted_sentence = f'<p style="background-color: {color}; padding: 2px;">{combined_sentence} {index_part}</p>'
            html_content += formatted_sentence
            combined_sentence = ""
        combined_sentence += " " + sentence
        prev_idx = idx
 
+    total_average_score = round(total_score / total_count, 2)
+    category_averages = {
+        category: round((sum(scores) / len(scores)), 2)
+        for category, scores in category_scores.items()
+    }
+
     if combined_sentence:
         color = color_map[prev_idx - 1]
-        index_part = f'<span style="background-color: {color}; padding: 2px;">[{prev_idx}]</span>'
-        formatted_sentence = f"<p>{combined_sentence} {index_part}</p>"
+        index_part = ""
+        if prev_idx != -1:
+            index_part = f"<span>[{prev_idx}]</span>"
+        formatted_sentence = f'<p style="background-color: {color}; padding: 2px;">{combined_sentence} {index_part}</p>'
         html_content += formatted_sentence
 
     html_content += "<hr>"
-    for url, score, idx in url_scores:
+
+    html_content += f"""
+    <div class="score-container">
+        <div class="score-item">
+            <h3>Overall Similarity</h3>
+            <p>{total_average_score}%</p>
+        </div>
+    """
+    for category, score in category_averages.items():
+        html_content += f"""
+        <div class="score-item"><h3>{category}</h3><p>{score}%</p></div>
+        """
+    html_content += "</div>"
+
+    for url, score, idx, sentence in url_scores:
+        url_category = check_url_category(url)
         color = color_map[idx - 1]
-        formatted_url = f'<p style="background-color: {color}; padding: 5px;">({idx}) <b>{url}</b></p><p> --- Matching Score: {score}%</p>'
+        formatted_url = f"""
+        <p style="background-color: {color}; padding: 5px; font-size: 1.2em">[{idx}] <b>{url}</b></p><p><i>{url_category}</i></p>
+        <p> --- <b>Matching Score: </b>{score}%</p>
+        <p> --- <b>Original Source Content: </b>{sentence}</p>
+        """
+        # formatted_url = f"""
+        # <div class="url-link">
+        #     <p style="background-color: {color}; padding: 5px; font-size: 1.2em">[{idx}] <b>{url}</b></p><p>{url_category}</p>
+        #     <a href="#" onclick="toggleDetails(event)" class="toggle-button">&gt;</a>
+        # </div>
        # <div id="detailsContainer" class="details">
        #     <p> --- <b>Matching Score: </b>{score}%</p>
        #     <p> --- <b>Original Source Content: </b>{sentence}</p>
        # </div>
        # """
         html_content += formatted_url
 
-    html_content += "</div>"
+    html_content += "</html>"
 
     print("PLAGIARISM PROCESSING TIME: ", time.perf_counter() - start_time)
 
-    return html_content
+    return html_content
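
With this commit, matching_score returns a (score, matched_content) tuple: the fraction of the sentence's 5-grams found in the scraped page, plus a reconstruction of the overlapping source text, which plagiarism_check stores in matched_sentence_array and reports as "Original Source Content". A standalone sketch of the overlap score follows, with a simplified whitespace-token split_ngrams standing in for the helper used in plagiarism.py; the committed version also reconstructs the matched text.

# Sketch (not part of the commit) of the n-gram overlap score.
def split_ngrams(text, n):
    tokens = text.split()
    return [tuple(tokens[i : i + n]) for i in range(len(tokens) - n + 1)]

def ngram_overlap_score(sentence, content, n=5):
    ngrams_sentence = split_ngrams(sentence, n)
    if not ngrams_sentence:
        return 0.0
    ngrams_content = set(split_ngrams(content, n))
    matched = sum(1 for ng in ngrams_sentence if ng in ngrams_content)
    return matched / len(ngrams_sentence)

source = "the quick brown fox jumps over the lazy dog near the river bank"
candidate = "a quick brown fox jumps over the lazy dog today"
print(round(ngram_overlap_score(candidate, source), 3))  # 0.667 (4 of 6 five-grams match)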