Spaces:
Running
Running
Update predictors.py
Browse files- predictors.py +24 -564
predictors.py
CHANGED
@@ -1,17 +1,4 @@
|
|
1 |
-
import time
|
2 |
-
from nltk.tokenize import sent_tokenize
|
3 |
-
from googleapiclient.discovery import build
|
4 |
-
from collections import Counter
|
5 |
-
import re, math
|
6 |
-
from sentence_transformers import SentenceTransformer, util
|
7 |
-
import asyncio
|
8 |
-
import httpx
|
9 |
-
from bs4 import BeautifulSoup
|
10 |
-
import numpy as np
|
11 |
-
import concurrent
|
12 |
-
from multiprocessing import Pool
|
13 |
-
from const import url_types
|
14 |
-
from collections import defaultdict
import torch
|
15 |
import numpy as np
|
16 |
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
17 |
import nltk
|
@@ -307,555 +294,28 @@ def predict_mc_scores(input):
|
|
307 |
mc_scores = []
|
308 |
segments_mc = split_text_allow_complete_sentences_nltk(
|
309 |
input, type_det="mc"
|
310 |
-
|
311 |
-
|
312 |
-
# Matches runs of word characters; text_to_vector uses it to tokenise text.
WORD = re.compile(r"\w+")
|
313 |
-
# Sentence-embedding model used by sentence_similarity to encode both texts.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
|
314 |
-
|
315 |
-
|
316 |
-
# English month name -> zero-padded two-digit month number, used by
# build_date when assembling date strings.
months = {
    "January": "01",
    "February": "02",
    "March": "03",
    "April": "04",
    "May": "05",
    "June": "06",
    "July": "07",
    "August": "08",
    "September": "09",
    "October": "10",
    "November": "11",
    "December": "12",
}
|
330 |
-
|
331 |
-
# Background colours for highlighted text, looked up as color_map[rank - 1]
# where rank is the 1-based position of a matched source.
color_map = [
    "#cf2323",
    "#d65129",
    "#d66329",
    "#d67129",
    "#eb9d59",
    "#c2ad36",
    "#d6ae29",
    "#d6b929",
    "#e1ed72",
    "#c2db76",
    "#a2db76",
]
|
344 |
-
|
345 |
-
|
346 |
-
def text_to_vector(text):
    """Return a bag-of-words ``Counter`` for *text*.

    Words are the matches of the module-level ``WORD`` pattern; no case
    folding is applied, so ``"The"`` and ``"the"`` are counted separately.
    """
    return Counter(WORD.findall(text))
|
349 |
-
|
350 |
-
|
351 |
-
def cosineSim(text1, text2):
    """Return the bag-of-words cosine similarity between two strings.

    Each text is turned into word counts with ``text_to_vector`` and the two
    count vectors are compared with ``get_cosine``.
    """
    return get_cosine(text_to_vector(text1), text_to_vector(text2))
|
357 |
-
|
358 |
-
|
359 |
-
def get_cosine(vec1, vec2):
    """Return the cosine similarity of two sparse count vectors.

    Args:
        vec1: Mapping of term -> count (e.g. a ``collections.Counter``).
        vec2: Mapping of term -> count.

    Returns:
        The cosine of the angle between the vectors as a float. ``0.0`` is
        returned when either vector has zero magnitude (for example when it
        is empty), which avoids a division by zero.
    """
    shared_terms = vec1.keys() & vec2.keys()
    numerator = sum(vec1[term] * vec2[term] for term in shared_terms)
    sum1 = sum(count ** 2 for count in vec1.values())
    sum2 = sum(count ** 2 for count in vec2.values())
    denominator = math.sqrt(sum1) * math.sqrt(sum2)
    if denominator == 0:
        return 0.0
    return float(numerator) / denominator
|
369 |
-
|
370 |
-
|
371 |
-
def split_sentence_blocks(text, size):
    """Split *text* into the blocks that are checked individually.

    Args:
        text: The input document.
        size: ``"Paragraph"`` splits the stripped text on newlines; any other
            value splits it into sentences with ``sent_tokenize``.

    Returns:
        A list of strings. In paragraph mode, whitespace-only lines are
        dropped.
    """
    stripped = text.strip()
    if size == "Paragraph":
        # Blank lines between paragraphs would otherwise become empty blocks,
        # and each block is later used verbatim as a search query.
        return [block for block in stripped.split("\n") if block.strip()]
    return sent_tokenize(stripped)
|
378 |
-
|
379 |
-
|
380 |
-
def build_date(year=2024, month="March", day=1):
    """Return the given date as a ``YYYYMMDD`` string.

    Args:
        year: Year, inserted as-is.
        month: English month name; must be a key of the module-level
            ``months`` mapping (``KeyError`` otherwise).
        day: Day of month as an int or a numeric string.

    Returns:
        The concatenated date, e.g. ``"20240301"``.
    """
    # Zero-pad the day: without it, day 1 produced "2024031", which is not a
    # valid YYYYMMDD value for the "date:r:<from>:<to>" sort range.
    return f"{year}{months[month]}{int(day):02d}"
|
382 |
-
|
383 |
-
|
384 |
-
def split_ngrams(text, n):
    """Return every run of *n* consecutive whitespace-separated words.

    Args:
        text: String to split on whitespace.
        n: Number of words per n-gram.

    Returns:
        A list of word tuples in reading order; empty when *text* has fewer
        than *n* words.
    """
    words = text.split()
    return [tuple(words[i : i + n]) for i in range(len(words) - n + 1)]
|
387 |
-
|
388 |
-
|
389 |
-
def sentence_similarity(text1, text2):
    """Return the embedding cosine similarity between two texts.

    Both texts are encoded with the module-level ``model`` and compared with
    ``util.pytorch_cos_sim``; the single resulting value is returned as a
    Python number via ``.item()``.
    """
    embedding_1 = model.encode(text1, convert_to_tensor=True)
    embedding_2 = model.encode(text2, convert_to_tensor=True)
    similarity = util.pytorch_cos_sim(embedding_1, embedding_2)
    return similarity.item()
|
394 |
-
|
395 |
-
|
396 |
-
async def get_url_data(url, client):
    """Fetch *url* and return its parsed HTML, or ``None`` on failure.

    Args:
        url: Address to request.
        client: Object exposing an awaitable ``get(url)``; ``parallel_scrap``
            passes an ``httpx.AsyncClient``.

    Returns:
        A ``BeautifulSoup`` tree when the response status is 200; ``None``
        for any other status or when the request or parsing raises.
    """
    try:
        response = await client.get(url)
        if response.status_code == 200:
            return BeautifulSoup(response.content, "html.parser")
    except Exception:
        # Deliberate best-effort: one unreachable or malformed page must not
        # abort the whole batch of downloads.
        return None
    return None
|
404 |
-
|
405 |
-
|
406 |
-
async def parallel_scrap(urls):
    """Download and parse all *urls* concurrently.

    Args:
        urls: Iterable of addresses to fetch.

    Returns:
        A list aligned with *urls*: each entry is whatever ``get_url_data``
        returned for that address (a parsed page or ``None``).
    """
    async with httpx.AsyncClient(timeout=30) as client:
        tasks = [get_url_data(url=url, client=client) for url in urls]
        results = await asyncio.gather(*tasks, return_exceptions=True)
    # return_exceptions=True hands raised exceptions back as ordinary values.
    # Callers only test truthiness before reading ``.text``, so convert any
    # such value to None instead of letting it masquerade as a page.
    return [
        None if isinstance(result, BaseException) else result
        for result in results
    ]
|
413 |
-
|
414 |
-
|
415 |
-
def merge_ngrams_into_sentence(ngrams):
    """Stitch a sequence of overlapping n-grams back into one string.

    Only the first 20 n-grams are used. For each n-gram, the number of
    distinct words it shares with the last ``len(ngram)`` merged words is
    taken as the overlap: with no overlap the whole n-gram is appended, with
    a partial overlap only ``ngram[overlap:]`` is appended, and an n-gram
    that is fully covered adds nothing.

    Args:
        ngrams: List of word tuples, or ``None``.

    Returns:
        The merged words joined by single spaces; ``""`` for ``None`` or an
        empty list.
    """
    if ngrams is None:
        return ""
    merged_sentence = []
    for ngram in ngrams[:20]:
        overlap = len(set(ngram) & set(merged_sentence[-len(ngram) :]))
        if overlap == 0:
            merged_sentence.extend(ngram)
        elif overlap < len(ngram):
            merged_sentence.extend(ngram[overlap:])
    return " ".join(merged_sentence)
|
429 |
-
|
430 |
-
|
431 |
-
def remove_ngrams_after(ngrams, target_ngram):
    """Truncate *ngrams* just after the first occurrence of *target_ngram*.

    Args:
        ngrams: List to truncate.
        target_ngram: Element to look for.

    Returns:
        A new list ending with the first occurrence of *target_ngram*, or
        ``None`` when it does not occur.
    """
    try:
        index = ngrams.index(target_ngram)
    except ValueError:
        return None
    return ngrams[: index + 1]
|
437 |
-
|
438 |
-
|
439 |
-
def matching_score(sentence_content_tuple):
    """Score how much of a sentence appears verbatim in a page's text.

    Args:
        sentence_content_tuple: ``(sentence, content, score)``. ``score`` is
            accepted so callers can pass their existing tuples, but it is not
            used in the computation.

    Returns:
        A ``(ratio, matched_content)`` tuple:

        * ``(1, sentence)`` when *sentence* is a substring of *content*;
        * ``(0, "")`` when *sentence* has fewer than five words;
        * otherwise the fraction of the sentence's distinct word 5-grams that
          also occur in *content*, together with the stretch of *content*
          running from its first to its last matching 5-gram, rebuilt with
          ``merge_ngrams_into_sentence`` (``""`` when nothing matches).
    """
    sentence, content, _score = sentence_content_tuple
    if sentence in content:
        return 1, sentence

    n = 5
    ngrams_sentence = set(split_ngrams(sentence, n))
    if not ngrams_sentence:
        return 0, ""

    # Keep the page's n-grams in reading order. Walking two unordered sets,
    # as was done before, made the reconstructed excerpt depend on hash
    # iteration order and filled it with duplicates.
    content_ngrams = split_ngrams(content, n)
    matched_count = len(ngrams_sentence.intersection(content_ngrams))
    matched_positions = [
        position
        for position, ngram in enumerate(content_ngrams)
        if ngram in ngrams_sentence
    ]

    matched_content = ""
    if matched_positions:
        first, last = matched_positions[0], matched_positions[-1]
        matched_content = merge_ngrams_into_sentence(
            content_ngrams[first : last + 1]
        )

    return matched_count / len(ngrams_sentence), matched_content
|
489 |
-
|
490 |
-
|
491 |
-
def process_with_multiprocessing(input_data):
    """Apply ``matching_score`` to every item using a one-process ``Pool``.

    Args:
        input_data: Iterable of tuples accepted by ``matching_score``.

    Returns:
        The list produced by ``pool.map``, in the same order as *input_data*.
    """
    with Pool(processes=1) as pool:
        return pool.map(matching_score, input_data)
|
495 |
-
|
496 |
-
|
497 |
-
def map_sentence_url(sentences, score_array):
    """Pick, for each sentence, the index of its best-scoring URL.

    Consecutive sentences are biased towards the same URL: sentence ``j``
    starts from the URL chosen for sentence ``j - 1`` and only switches when
    another URL beats the running best score by more than 0.05. Once a switch
    has happened for that sentence, any strictly higher score wins.

    Args:
        sentences: Sequence of sentences; only its length is used.
        score_array: ``score_array[i][j]`` is the score of URL ``i`` for
            sentence ``j``.

    Returns:
        A list with one URL index per sentence; every entry is ``-1`` when
        *score_array* is empty.
    """
    sentenceToMaxURL = [-1] * len(sentences)
    if not score_array:
        # No candidate URLs: previously this raised IndexError as soon as a
        # second sentence tried to read score_array[-1].
        return sentenceToMaxURL

    for j in range(len(sentences)):
        if j > 0:
            previous_url = sentenceToMaxURL[j - 1]
            sentenceToMaxURL[j] = previous_url
            max_score = score_array[previous_url][j]
        else:
            max_score = -1
        for i, url_scores in enumerate(score_array):
            # The stickiness margin applies only while this sentence is still
            # attached to the previous sentence's URL.
            still_on_previous = (
                j > 0 and sentenceToMaxURL[j] == sentenceToMaxURL[j - 1]
            )
            margin = 0.05 if still_on_previous else 0
            if url_scores[j] - max_score > margin:
                max_score = url_scores[j]
                sentenceToMaxURL[j] = i
    return sentenceToMaxURL
|
515 |
-
|
516 |
-
|
517 |
-
def check_url_category(url):
    """Return the category of *url* according to ``url_types``.

    ``url_types`` maps a category name to a collection of substrings; the
    first category (in mapping order) with a substring contained in *url*
    wins.

    Returns:
        The matching category name, or ``"Internet Source"`` when no
        substring matches.
    """
    for category, fragments in url_types.items():
        if any(fragment in url for fragment in fragments):
            return category
    return "Internet Source"
|
523 |
-
|
524 |
-
|
525 |
-
def _clean_snippet(snippet):
    """Strip a short leading "<prefix> ... " and a trailing "..." from a snippet."""
    ind = snippet.find("...")
    # An ellipsis 10-19 characters in is treated as the end of a leading
    # prefix (presumably a date stamp - confirm against real API output).
    if 9 < ind < 20:
        snippet = snippet[ind + len("... ") :]
    ind = snippet.find("...")
    # Require an actual match: with find() == -1 a snippet shorter than four
    # characters used to lose its last character.
    if ind != -1 and ind > len(snippet) - 5:
        snippet = snippet[:ind]
    return snippet


def google_search(
    plag_option,
    sentences,
    url_count,
    score_array,
    url_list,
    snippets,
    sorted_date,
    domains_to_skip,
    api_key,
    cse_id,
    **kwargs,
):
    """Search each sentence with Google Custom Search and score the top hit.

    For every sentence the first result (``num_pages = 1``) is considered.
    Unless its link contains ``"." + domain`` for one of *domains_to_skip*,
    its URL is registered and the similarity between the sentence and the
    cleaned result snippet is stored.

    Args:
        plag_option: ``"Standard"`` scores with ``cosineSim``; any other value
            scores with ``sentence_similarity``.
        sentences: Query strings, one search per entry.
        url_count: Dict URL -> number of sentences that hit it; updated in
            place.
        score_array: ``score_array[u][s]`` similarity of URL ``u`` for
            sentence ``s``; a zero-filled row is appended per new URL.
        url_list: List of distinct URLs; extended in place, its order defines
            the row order of *score_array* and *snippets*.
        snippets: ``snippets[u][s]`` cleaned snippet text; a row of empty
            strings is appended per new URL.
        sorted_date: Value passed as the ``sort`` argument of the search.
        domains_to_skip: Iterable of domains to ignore, or ``None``.
        api_key: Developer key for the Custom Search service.
        cse_id: Search engine id passed as ``cx``.
        **kwargs: Extra arguments forwarded to ``cse().list``.

    Returns:
        ``(url_count, score_array)`` - the same objects that were passed in.
    """
    service = build("customsearch", "v1", developerKey=api_key)
    num_pages = 1
    for i, sentence in enumerate(sentences):
        results = (
            service.cse()
            .list(q=sentence, cx=cse_id, sort=sorted_date, **kwargs)
            .execute()
        )
        for link in results.get("items", [])[:num_pages]:
            url = link["link"]
            # skip user selected domains
            if domains_to_skip is not None and any(
                ("." + domain) in url for domain in domains_to_skip
            ):
                continue

            # A result item is not guaranteed to carry a snippet.
            snippet = _clean_snippet(link.get("snippet", ""))

            if url not in url_list:
                url_list.append(url)
                score_array.append([0] * len(sentences))
                snippets.append([""] * len(sentences))
            url_count[url] = url_count.get(url, 0) + 1

            row = url_list.index(url)
            snippets[row][i] = snippet
            if plag_option == "Standard":
                score_array[row][i] = cosineSim(sentence, snippet)
            else:
                score_array[row][i] = sentence_similarity(sentence, snippet)
    return url_count, score_array
|
581 |
-
|
582 |
-
|
583 |
-
def plagiarism_check(
    plag_option,
    input,
    year_from,
    month_from,
    day_from,
    year_to,
    month_to,
    day_to,
    domains_to_skip,
    source_block_size,
):
    """Find likely online sources for each block of *input*.

    The text is split into blocks, each block is searched with
    ``google_search``, the hit pages are downloaded with ``parallel_scrap``
    and every (page, block) pair is re-scored with ``matching_score``. Each
    block is then attributed to one URL by ``map_sentence_url``.

    Args:
        plag_option: Scoring mode forwarded to ``google_search``.
        input: The text to check.
        year_from, month_from, day_from: Start of the search date range.
        year_to, month_to, day_to: End of the search date range.
        domains_to_skip: Domains to ignore, or ``None``.
        source_block_size: Block mode forwarded to ``split_sentence_blocks``.

    Returns:
        ``(sentence_scores, url_scores)`` where

        * ``sentence_scores`` has one ``[block, score_percent, url, rank]``
          entry per block; ``score_percent`` is ``None`` and ``rank`` is
          ``-1`` when the attributed URL's mean score is not above 0.1;
        * ``url_scores`` has one ``[url, score_percent, rank, matched_text]``
          entry per attributed URL whose mean score is above 0.1, best first.

        When the search yields no URL at all, every block is reported as
        unmatched with an empty URL and ``url_scores`` is empty.
    """
    import os

    # SECURITY(review): credentials are committed in source. Prefer supplying
    # GOOGLE_API_KEY / GOOGLE_CSE_ID through the environment; the literals
    # below are kept only as a fallback so existing deployments keep working
    # and should be rotated and removed.
    api_key = os.environ.get(
        "GOOGLE_API_KEY", "AIzaSyCS1WQDMl1IMjaXtwSd_2rA195-Yc4psQE"
    )
    cse_id = os.environ.get("GOOGLE_CSE_ID", "851813e81162b4ed4")

    url_scores = []
    sentence_scores = []
    sentences = split_sentence_blocks(input, source_block_size)
    url_count = {}
    score_array = []
    url_list = []
    snippets = []
    date_from = build_date(year_from, month_from, day_from)
    date_to = build_date(year_to, month_to, day_to)
    sort_date = f"date:r:{date_from}:{date_to}"

    # get list of URLS to check
    url_count, score_array = google_search(
        plag_option,
        sentences,
        url_count,
        score_array,
        url_list,
        snippets,
        sort_date,
        domains_to_skip,
        api_key,
        cse_id,
    )

    if not url_list:
        # Nothing to compare against; the code below indexes score_array[0]
        # and url_list[...] and would raise IndexError.
        return [[sent, None, "", -1] for sent in sentences], []

    start_time = time.perf_counter()
    soups = asyncio.run(parallel_scrap(url_list))
    print("SCRAPING PROCESSING TIME: ", time.perf_counter() - start_time)

    # Replace the snippet-based score of every (page, block) pair whose page
    # could be downloaded with the full-page matching score.
    start_time = time.perf_counter()
    matched_sentence_array = [[""] * len(sentences) for _ in score_array]
    for i, soup in enumerate(soups):
        if not soup or isinstance(soup, BaseException):
            continue
        page_content = soup.text
        for j, sent in enumerate(sentences):
            score, matched = matching_score(
                (sent, page_content, score_array[i][j])
            )
            score_array[i][j] = score
            matched_sentence_array[i][j] = matched
    print("MATCHING SCORE PROCESSING TIME: ", time.perf_counter() - start_time)

    sentenceToMaxURL = map_sentence_url(sentences, score_array)

    # Mean score of each URL over the blocks attributed to it.
    scores_per_url = {}
    for sen, url in enumerate(sentenceToMaxURL):
        scores_per_url.setdefault(url, []).append(score_array[url][sen])
    url_source = {
        url: sum(scores_per_url[url]) / len(scores_per_url[url])
        for url in sorted(scores_per_url)
    }
    index_descending = sorted(url_source, key=url_source.get, reverse=True)
    urlMap = {url: rank for rank, url in enumerate(index_descending, start=1)}

    # build results
    for i, sent in enumerate(sentences):
        ind = sentenceToMaxURL[i]
        if url_source[ind] > 0.1:
            sentence_scores.append(
                [
                    sent,
                    round(url_source[ind] * 100, 2),
                    url_list[ind],
                    urlMap[ind],
                ]
            )
        else:
            sentence_scores.append([sent, None, url_list[ind], -1])

    print("SNIPPETS: ", snippets)
    snippets = [[item for item in sublist if item] for sublist in snippets]
    matched_sentence_array = [
        [item for item in sublist if item] for sublist in matched_sentence_array
    ]
    for ind in index_descending:
        if url_source[ind] > 0.1:
            matched_sentence = "...".join(matched_sentence_array[ind])
            if matched_sentence == "":
                # Fall back to the search snippets when the page itself
                # yielded no matched text.
                matched_sentence = "...".join(snippets[ind])
            url_scores.append(
                [
                    url_list[ind],
                    round(url_source[ind] * 100, 2),
                    urlMap[ind],
                    matched_sentence,
                ]
            )

    return sentence_scores, url_scores
|
712 |
-
|
713 |
-
|
714 |
-
def html_highlight(
|
715 |
-
plag_option,
|
716 |
-
input,
|
717 |
-
year_from,
|
718 |
-
month_from,
|
719 |
-
day_from,
|
720 |
-
year_to,
|
721 |
-
month_to,
|
722 |
-
day_to,
|
723 |
-
domains_to_skip,
|
724 |
-
source_block_size,
|
725 |
-
):
|
726 |
-
start_time = time.perf_counter()
|
727 |
-
sentence_scores, url_scores = plagiarism_check(
|
728 |
-
plag_option,
|
729 |
-
input,
|
730 |
-
year_from,
|
731 |
-
month_from,
|
732 |
-
day_from,
|
733 |
-
year_to,
|
734 |
-
month_to,
|
735 |
-
day_to,
|
736 |
-
domains_to_skip,
|
737 |
-
source_block_size,
|
738 |
)
|
739 |
-
|
740 |
-
|
741 |
-
|
742 |
-
|
743 |
-
|
744 |
-
|
745 |
-
|
746 |
-
|
747 |
-
|
748 |
-
|
749 |
-
|
750 |
-
|
751 |
-
|
752 |
-
|
753 |
-
|
754 |
-
|
755 |
-
|
756 |
-
|
757 |
-
|
758 |
-
|
759 |
-
|
760 |
-
}
|
761 |
-
.details {
|
762 |
-
display: none;
|
763 |
-
padding: 10px;
|
764 |
-
}
|
765 |
-
.url-link {
|
766 |
-
font-size: 1.2em;
|
767 |
-
}
|
768 |
-
.url-link span {
|
769 |
-
margin-right: 10px;
|
770 |
-
}
|
771 |
-
.toggle-button {
|
772 |
-
color: #333;
|
773 |
-
border: none;
|
774 |
-
padding: 5px 10px;
|
775 |
-
text-align: center;
|
776 |
-
text-decoration: none;
|
777 |
-
display: inline-block;
|
778 |
-
cursor: pointer;
|
779 |
-
}
|
780 |
-
</style>
|
781 |
-
</head>
|
782 |
-
"""
|
783 |
-
|
784 |
-
prev_idx = None
|
785 |
-
combined_sentence = ""
|
786 |
-
total_score = 0
|
787 |
-
total_count = 0
|
788 |
-
category_scores = defaultdict(set)
|
789 |
-
for sentence, score, url, idx in sentence_scores:
|
790 |
-
category = check_url_category(url)
|
791 |
-
if score is None:
|
792 |
-
total_score += 0
|
793 |
-
else:
|
794 |
-
total_score += score
|
795 |
-
category_scores[category].add(score)
|
796 |
-
total_count += 1
|
797 |
-
|
798 |
-
if idx != prev_idx and prev_idx is not None:
|
799 |
-
color = color_map[prev_idx - 1]
|
800 |
-
index_part = f"<span>[{prev_idx}]</span>"
|
801 |
-
formatted_sentence = f'<p style="background-color: {color}; padding: 2px;">{combined_sentence} {index_part}</p>'
|
802 |
-
html_content += formatted_sentence
|
803 |
-
combined_sentence = ""
|
804 |
-
combined_sentence += " " + sentence
|
805 |
-
prev_idx = idx
|
806 |
-
|
807 |
-
print(category_scores)
|
808 |
-
total_average_score = round(total_score / total_count, 2)
|
809 |
-
category_averages = {
|
810 |
-
category: round((sum(scores) / len(scores)), 2)
|
811 |
-
for category, scores in category_scores.items()
|
812 |
-
}
|
813 |
-
|
814 |
-
if combined_sentence:
|
815 |
-
color = color_map[prev_idx - 1]
|
816 |
-
index_part = ""
|
817 |
-
if prev_idx != -1:
|
818 |
-
index_part = f"<span>[{prev_idx}]</span>"
|
819 |
-
formatted_sentence = f'<p style="background-color: {color}; padding: 2px;">{combined_sentence} {index_part}</p>'
|
820 |
-
html_content += formatted_sentence
|
821 |
-
|
822 |
-
html_content += "<hr>"
|
823 |
-
|
824 |
-
html_content += f"""
|
825 |
-
<div class="score-container">
|
826 |
-
<div class="score-item">
|
827 |
-
<h3>Overall Similarity</h3>
|
828 |
-
<p>{total_average_score}%</p>
|
829 |
-
</div>
|
830 |
-
"""
|
831 |
-
for category, score in category_averages.items():
|
832 |
-
html_content += f"""
|
833 |
-
<div class="score-item"><h3>{category}</h3><p>{score}%</p></div>
|
834 |
-
"""
|
835 |
-
html_content += "</div>"
|
836 |
-
|
837 |
-
for url, score, idx, sentence in url_scores:
|
838 |
-
url_category = check_url_category(url)
|
839 |
-
color = color_map[idx - 1]
|
840 |
-
formatted_url = f"""
|
841 |
-
<p style="background-color: {color}; padding: 5px; font-size: 1.2em">[{idx}] <b>{url}</b></p><p><i>{url_category}</i></p>
|
842 |
-
<p> --- <b>Matching Score: </b>{score}%</p>
|
843 |
-
<p> --- <b>Original Source Content: </b>{sentence}</p>
|
844 |
-
"""
|
845 |
-
# formatted_url = f"""
|
846 |
-
# <div class="url-link">
|
847 |
-
# <p style="background-color: {color}; padding: 5px; font-size: 1.2em">[{idx}] <b>{url}</b></p><p>{url_category}</p>
|
848 |
-
# <a href="#" onclick="toggleDetails(event)" class="toggle-button">></a>
|
849 |
-
# </div>
|
850 |
-
# <div id="detailsContainer" class="details">
|
851 |
-
# <p> --- <b>Matching Score: </b>{score}%</p>
|
852 |
-
# <p> --- <b>Original Source Content: </b>{sentence}</p>
|
853 |
-
# </div>
|
854 |
-
# """
|
855 |
-
html_content += formatted_url
|
856 |
-
|
857 |
-
html_content += "</html>"
|
858 |
-
|
859 |
-
print("PLAGIARISM PROCESSING TIME: ", time.perf_counter() - start_time)
|
860 |
-
|
861 |
-
return html_content
|
|
|
1 |
+
import torch
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
import numpy as np
|
3 |
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
4 |
import nltk
|
|
|
294 |
mc_scores = []
|
295 |
segments_mc = split_text_allow_complete_sentences_nltk(
|
296 |
input, type_det="mc"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
297 |
)
|
298 |
+
samples_len_mc = len(
|
299 |
+
split_text_allow_complete_sentences_nltk(input, type_det="mc")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
300 |
)
|
301 |
+
for i in range(samples_len_mc):
|
302 |
+
cleaned_text_mc = remove_special_characters(segments_mc[i])
|
303 |
+
mc_score = predict_mc(
|
304 |
+
text_mc_model, text_mc_tokenizer, cleaned_text_mc
|
305 |
+
)
|
306 |
+
mc_scores.append(mc_score)
|
307 |
+
mc_scores_array = np.array(mc_scores)
|
308 |
+
average_mc_scores = np.mean(mc_scores_array, axis=0)
|
309 |
+
mc_score_list = average_mc_scores.tolist()
|
310 |
+
mc_score = {}
|
311 |
+
for score, label in zip(mc_score_list, mc_label_map):
|
312 |
+
mc_score[label.upper()] = score
|
313 |
+
|
314 |
+
sum_prob = 1 - bc_score["HUMAN"]
|
315 |
+
for key, value in mc_score.items():
|
316 |
+
mc_score[key] = value * sum_prob
|
317 |
+
print("MC Score:", mc_score)
|
318 |
+
if sum_prob < 0.01:
|
319 |
+
mc_score = {}
|
320 |
+
|
321 |
+
return mc_score
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|