import time
import re
import math
import asyncio
import gc

import httpx
import joblib
import nltk
import numpy as np
import torch
import torch.nn.functional as F
import yaml
import gradio as gr
from bs4 import BeautifulSoup
from cleantext import clean
from collections import Counter, defaultdict
from googleapiclient.discovery import build
from multiprocessing import Pool
from nltk.tokenize import sent_tokenize
from optimum.bettertransformer import BetterTransformer
from optimum.pipelines import pipeline  # ORT-accelerated pipeline; supersedes transformers.pipeline
from scipy.special import softmax
from sentence_transformers import SentenceTransformer, util
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification

from const import url_types
from utils import *  # provides remove_special_characters()
with open("config.yaml", "r") as file:
    params = yaml.safe_load(file)
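# The YAML config is expected to supply the keys read below. A minimal sketch
# (values here are illustrative placeholders, not the real configuration):
#
#   TEXT_BC_MODEL_PATH: "org/bc-model"            # binary human/AI classifier
#   TEXT_MC_MODEL_PATH: "org/mc-model"            # multi-class source classifier
#   TEXT_QUILLBOT_MODEL_PATH: "org/quillbot-model"
#   QUILLBOT_LABELS: ["Original", "Humanized"]
#   MC_OUTPUT_LABELS: ["label-a", "label-b"]
#   MC_TOKEN_SIZE: 256
#   BC_TOKEN_SIZE: 512
#   BIAS_CHECKER_MODEL_PATH: "org/bias-checker"
#   BIAS_CORRECTOR_MODEL_PATH: "org/bias-corrector"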
nltk.download("punkt")
nltk.download("stopwords")

device = "cuda" if torch.cuda.is_available() else "cpu"
device_needed = device  # alias retained for the proxy models below
print("DEVICE IS:", device)
text_bc_model_path = params["TEXT_BC_MODEL_PATH"]
text_mc_model_path = params["TEXT_MC_MODEL_PATH"]
text_quillbot_model_path = params["TEXT_QUILLBOT_MODEL_PATH"]
quillbot_labels = params["QUILLBOT_LABELS"]
mc_label_map = params["MC_OUTPUT_LABELS"]
mc_token_size = int(params["MC_TOKEN_SIZE"])
bc_token_size = int(params["BC_TOKEN_SIZE"])
bias_checker_model_name = params["BIAS_CHECKER_MODEL_PATH"]
bias_corrector_model_name = params["BIAS_CORRECTOR_MODEL_PATH"]
# access_token = params['HF_TOKEN']

text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path)
text_bc_model = AutoModelForSequenceClassification.from_pretrained(text_bc_model_path).to(device)
text_mc_tokenizer = AutoTokenizer.from_pretrained(text_mc_model_path)
text_mc_model = AutoModelForSequenceClassification.from_pretrained(text_mc_model_path).to(device)
quillbot_tokenizer = AutoTokenizer.from_pretrained(text_quillbot_model_path)
quillbot_model = AutoModelForSequenceClassification.from_pretrained(text_quillbot_model_path).to(device)

# proxy models for explainability
mini_bc_model_name = "polygraf-ai/bc-model"
bc_tokenizer_mini = AutoTokenizer.from_pretrained(mini_bc_model_name)
bc_model_mini = AutoModelForSequenceClassification.from_pretrained(mini_bc_model_name).to(device_needed)
mini_humanizer_model_name = "polygraf-ai/humanizer-model"
humanizer_tokenizer_mini = AutoTokenizer.from_pretrained(mini_humanizer_model_name)
humanizer_model_mini = AutoModelForSequenceClassification.from_pretrained(mini_humanizer_model_name).to(device_needed)

# convert all models to BetterTransformer kernels
bc_model_mini = BetterTransformer.transform(bc_model_mini)
humanizer_model_mini = BetterTransformer.transform(humanizer_model_mini)
text_bc_model = BetterTransformer.transform(text_bc_model)
text_mc_model = BetterTransformer.transform(text_mc_model)
quillbot_model = BetterTransformer.transform(quillbot_model)

bias_model_checker = AutoModelForSequenceClassification.from_pretrained(bias_checker_model_name)
tokenizer = AutoTokenizer.from_pretrained(bias_checker_model_name)
bias_model_checker = BetterTransformer.transform(bias_model_checker, keep_original_model=False)
# NOTE: the pipeline below reloads the checker by name; the transformed
# bias_model_checker above is kept in memory but not wired into it.
bias_checker = pipeline(
    "text-classification",
    model=bias_checker_model_name,
    tokenizer=bias_checker_model_name,
)
gc.collect()
bias_corrector = pipeline("text2text-generation", model=bias_corrector_model_name, accelerator="ort")

# model score calibration
iso_reg = joblib.load("isotonic_regression_model.joblib")
def split_text(text: str) -> list:
    sentences = sent_tokenize(text)
    return [[sentence] for sentence in sentences]


def correct_text(text: str, bias_checker, bias_corrector, separator: str = " ") -> tuple:
    sentence_batches = split_text(text)
    corrected_text = []
    corrections = []
    for batch in tqdm(sentence_batches, total=len(sentence_batches), desc="correcting text.."):
        raw_text = " ".join(batch)
        results = bias_checker(raw_text)
        # rewrite unless the checker confidently (score >= 0.9) predicts
        # LABEL_1 (presumably the unbiased class)
        if results[0]["label"] != "LABEL_1" or results[0]["score"] < 0.9:
            corrected_batch = bias_corrector(raw_text)
            corrected_version = corrected_batch[0]["generated_text"]
            corrected_text.append(corrected_version)
            corrections.append((raw_text, corrected_version))
        else:
            corrected_text.append(raw_text)
    corrected_text = separator.join(corrected_text)
    return corrected_text, corrections
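# Hypothetical usage of correct_text (output invented for illustration):
#   corrected, pairs = correct_text("Sentence one. Sentence two.", bias_checker, bias_corrector)
#   # `corrected` is the full text with flagged sentences rewritten;
#   # `pairs` holds (original, corrected) tuples for the rewritten sentences only.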
def update(text: str):
    text = clean(text, lower=False)
    corrected_text, corrections = correct_text(text, bias_checker, bias_corrector)
    corrections_display = "".join(corr for _, corr in corrections)
    if corrections_display == "":
        corrections_display = text
    return corrections_display


def update_main(text: str):
    text = clean(text, lower=False)
    corrected_text, corrections = correct_text(text, bias_checker, bias_corrector)
    corrections_display = "\n\n".join(
        f"Original: {orig}\nCorrected: {corr}" for orig, corr in corrections
    )
    return corrected_text, corrections_display
def get_token_length(tokenizer, sentence):
    return len(tokenizer.tokenize(sentence))


def split_text_allow_complete_sentences_nltk(text, type_det="bc"):
    sentences = sent_tokenize(text)
    chunks = []
    current_chunk = []
    current_length = 0
    if type_det == "bc":
        tokenizer = text_bc_tokenizer
        max_tokens = bc_token_size
    elif type_det == "mc":
        tokenizer = text_mc_tokenizer
        max_tokens = mc_token_size
    elif type_det == "quillbot":
        tokenizer = quillbot_tokenizer
        max_tokens = 256
    else:
        raise ValueError(f"Unknown type_det: {type_det}")

    def add_sentence_to_chunk(sentence):
        nonlocal current_chunk, current_length
        sentence_length = get_token_length(tokenizer, sentence)
        # close the current chunk before it would overflow the token budget
        if current_length + sentence_length > max_tokens:
            chunks.append((current_chunk, current_length))
            current_chunk = []
            current_length = 0
        current_chunk.append(sentence)
        current_length += sentence_length

    for sentence in sentences:
        add_sentence_to_chunk(sentence)
    if current_chunk:
        chunks.append((current_chunk, current_length))

    # second pass: merge undersized chunks (< max_tokens / 2) into their
    # successor when the combined chunk still fits within the budget
    adjusted_chunks = []
    while chunks:
        chunk = chunks.pop(0)
        if len(chunks) > 0 and chunk[1] < max_tokens / 2:
            next_chunk = chunks.pop(0)
            combined_length = chunk[1] + next_chunk[1]
            if combined_length <= max_tokens:
                adjusted_chunks.append((chunk[0] + next_chunk[0], combined_length))
            else:
                adjusted_chunks.append(chunk)
                chunks.insert(0, next_chunk)
        else:
            adjusted_chunks.append(chunk)
    result_chunks = [" ".join(chunk[0]) for chunk in adjusted_chunks]
    return result_chunks
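# Chunking illustration (hypothetical token counts): with max_tokens=10 and
# sentence lengths [6, 5, 3], the first pass closes a chunk of length 6 when
# adding the 5 would overflow, then packs [5, 3] into a second chunk of 8.
# The second pass then merges any non-final chunk shorter than max_tokens / 2
# into its successor, provided the combined length still fits the budget.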
def predict_quillbot(text, bias_buster_selected):
    if bias_buster_selected:
        text = update(text)
    with torch.no_grad():
        quillbot_model.eval()
        tokenized_text = quillbot_tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=256,
            return_tensors="pt",
        ).to(device)
        output = quillbot_model(**tokenized_text)
        output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
        q_score = {
            "Humanized": output_norm[1].item(),
            "Original": output_norm[0].item(),
        }
        return q_score
def predict_for_explainanility(text, model_type=None):
    if model_type == "quillbot":
        cleaning = False
        max_length = 256
        model = humanizer_model_mini
        tokenizer = humanizer_tokenizer_mini
    elif model_type == "bc":
        cleaning = True
        max_length = bc_token_size
        model = bc_model_mini
        tokenizer = bc_tokenizer_mini
    else:
        raise ValueError("Invalid model type")
    with torch.no_grad():
        if cleaning:
            text = [remove_special_characters(t) for t in text]
        tokenized_text = tokenizer(
            text,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=max_length,
        ).to(device_needed)
        outputs = model(**tokenized_text)
        tensor_logits = outputs[0]
        probas = F.softmax(tensor_logits, dim=-1).detach().cpu().numpy()
    return probas
def predict_bc(model, tokenizer, text):
    with torch.no_grad():
        model.eval()
        tokens = tokenizer(  # use the tokenizer passed in, not the global
            text,
            padding="max_length",
            truncation=True,
            max_length=bc_token_size,
            return_tensors="pt",
        ).to(device)
        output = model(**tokens)
    output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
    return output_norm


def predict_mc(model, tokenizer, text):
    with torch.no_grad():
        model.eval()
        tokens = tokenizer(
            text,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
            max_length=mc_token_size,
        ).to(device)
        output = model(**tokens)
    output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
    return output_norm
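# Both predictors return a 1-D probability vector over the model's classes;
# for the bc model, index 0 is read as HUMAN and index 1 as AI by the scoring
# functions below.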
def predict_bc_scores(input):
    bc_scores = []
    segments_bc = split_text_allow_complete_sentences_nltk(input, type_det="bc")
    for segment in segments_bc:
        cleaned_text_bc = remove_special_characters(segment)
        bc_score = predict_bc(text_bc_model, text_bc_tokenizer, cleaned_text_bc)
        bc_scores.append(bc_score)
    bc_scores_array = np.array(bc_scores)
    average_bc_scores = np.mean(bc_scores_array, axis=0)
    bc_score_list = average_bc_scores.tolist()
    print(f"Original BC scores: AI: {bc_score_list[1]}, HUMAN: {bc_score_list[0]}")
    # isotonic regression calibration
    ai_score = iso_reg.predict([bc_score_list[1]])[0]
    human_score = 1 - ai_score
    bc_score = {"AI": ai_score, "HUMAN": human_score}
    print(f"Calibration BC scores: AI: {ai_score}, HUMAN: {human_score}")
    print(f"Input Text: {cleaned_text_bc}")
    return bc_score
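# The calibrator loaded above is a pre-fitted scikit-learn IsotonicRegression.
# A sketch of how such a calibrator could be produced offline (variable names
# hypothetical; the actual training data is not part of this file):
#
#   from sklearn.isotonic import IsotonicRegression
#   iso = IsotonicRegression(out_of_bounds="clip")
#   iso.fit(raw_ai_scores, ground_truth_labels)  # raw model scores vs. 0/1 labels
#   joblib.dump(iso, "isotonic_regression_model.joblib")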
def predict_mc_scores(input):
    # BC SCORE: same segment-average-calibrate path as predict_bc_scores
    bc_score = predict_bc_scores(input)
    # MC SCORE: mirrors the bc scoring loop above with the multi-class model
    mc_scores = []
    segments_mc = split_text_allow_complete_sentences_nltk(input, type_det="mc")
    for segment in segments_mc:
        cleaned_text_mc = remove_special_characters(segment)
        mc_score = predict_mc(text_mc_model, text_mc_tokenizer, cleaned_text_mc)
        mc_scores.append(mc_score)
    mc_scores_array = np.array(mc_scores)
    average_mc_scores = np.mean(mc_scores_array, axis=0)
    mc_score_list = average_mc_scores.tolist()
    # assumption: MC_OUTPUT_LABELS is ordered to match the model's output logits
    mc_score = {label: score for label, score in zip(mc_label_map, mc_score_list)}
    return bc_score, mc_score
WORD = re.compile(r"\w+")
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

months = {
    "January": "01",
    "February": "02",
    "March": "03",
    "April": "04",
    "May": "05",
    "June": "06",
    "July": "07",
    "August": "08",
    "September": "09",
    "October": "10",
    "November": "11",
    "December": "12",
}

# per-source highlight colors, ordered roughly red -> green
color_map = [
    "#cf2323",
    "#d65129",
    "#d66329",
    "#d67129",
    "#eb9d59",
    "#c2ad36",
    "#d6ae29",
    "#d6b929",
    "#e1ed72",
    "#c2db76",
    "#a2db76",
]
def text_to_vector(text):
    words = WORD.findall(text)
    return Counter(words)


def cosineSim(text1, text2):
    vector1 = text_to_vector(text1)
    vector2 = text_to_vector(text2)
    return get_cosine(vector1, vector2)


def get_cosine(vec1, vec2):
    intersection = set(vec1.keys()) & set(vec2.keys())
    numerator = sum(vec1[x] * vec2[x] for x in intersection)
    sum1 = sum(v ** 2 for v in vec1.values())
    sum2 = sum(v ** 2 for v in vec2.values())
    denominator = math.sqrt(sum1) * math.sqrt(sum2)
    if denominator == 0:
        return 0.0
    return float(numerator) / denominator
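# Worked example: for "a b a" vs. "a b c", vec1 = {a: 2, b: 1} and
# vec2 = {a: 1, b: 1, c: 1}; numerator = 2*1 + 1*1 = 3 and
# denominator = sqrt(5) * sqrt(3) ≈ 3.873, so the cosine is ≈ 0.77.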
def split_sentence_blocks(text, size):
    if size == "Paragraph":
        return text.strip().split("\n")
    return sent_tokenize(text.strip())


def build_date(year=2024, month="March", day=1):
    return f"{year}{months[month]}{day}"


def split_ngrams(text, n):
    words = text.split()
    return [tuple(words[i : i + n]) for i in range(len(words) - n + 1)]
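# e.g. split_ngrams("a b c d", 2) -> [("a", "b"), ("b", "c"), ("c", "d")]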
def sentence_similarity(text1, text2):
    embedding_1 = model.encode(text1, convert_to_tensor=True)
    embedding_2 = model.encode(text2, convert_to_tensor=True)
    o = util.pytorch_cos_sim(embedding_1, embedding_2)
    return o.item()
async def get_url_data(url, client):
    try:
        r = await client.get(url)
        if r.status_code == 200:
            soup = BeautifulSoup(r.content, "html.parser")
            return soup
    except Exception:
        return None


async def parallel_scrap(urls):
    async with httpx.AsyncClient(timeout=30) as client:
        tasks = [get_url_data(url=url, client=client) for url in urls]
        results = await asyncio.gather(*tasks, return_exceptions=True)
    return results
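# Called from synchronous code as: soups = asyncio.run(parallel_scrap(url_list)).
# Each result is a BeautifulSoup document, or None when the fetch failed or
# returned a non-200 status.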
def merge_ngrams_into_sentence(ngrams):
    if ngrams is None:
        return ""
    if len(ngrams) > 20:
        ngrams = ngrams[:20]
    merged_sentence = []
    for ngram in ngrams:
        # count how many words of this ngram already end the merged sentence
        overlap = len(set(ngram) & set(merged_sentence[-len(ngram):]))
        if overlap == 0:
            merged_sentence.extend(ngram)
        elif overlap < len(ngram):
            merged_sentence.extend(ngram[overlap:])
    return " ".join(merged_sentence)


def remove_ngrams_after(ngrams, target_ngram):
    try:
        index = ngrams.index(target_ngram)
        return ngrams[: index + 1]
    except ValueError:
        return None
def matching_score(sentence_content_tuple):
    sentence, content, score = sentence_content_tuple
    if sentence in content:
        return 1, sentence
    n = 5
    # set intersection matching over word 5-grams
    ngrams_sentence = set(split_ngrams(sentence, n))
    ngrams_content = set(split_ngrams(content, n))
    if len(ngrams_sentence) == 0:
        return 0, ""
    matched_ngrams = ngrams_sentence.intersection(ngrams_content)
    matched_count = len(matched_ngrams)
    # collect the content ngrams from the first match through the last match
    matched_content_ngrams = []
    found = False
    last_found = None
    for ngram in ngrams_sentence:
        for ngram_content in ngrams_content:
            if ngram == ngram_content:
                found = True
                last_found = ngram_content
            if found:
                matched_content_ngrams.append(ngram_content)
    matched_content_ngrams = remove_ngrams_after(matched_content_ngrams, last_found)
    matched_content = merge_ngrams_into_sentence(matched_content_ngrams)
    return matched_count / len(ngrams_sentence), matched_content
def process_with_multiprocessing(input_data):
    with Pool(processes=1) as pool:
        scores = pool.map(matching_score, input_data)
    return scores
def map_sentence_url(sentences, score_array):
    sentenceToMaxURL = [-1] * len(sentences)
    for j in range(len(sentences)):
        if j > 0:
            maxScore = score_array[sentenceToMaxURL[j - 1]][j]
            sentenceToMaxURL[j] = sentenceToMaxURL[j - 1]
        else:
            maxScore = -1
        for i in range(len(score_array)):
            margin = (
                0.05
                if (j > 0 and sentenceToMaxURL[j] == sentenceToMaxURL[j - 1])
                else 0
            )
            if score_array[i][j] - maxScore > margin:
                maxScore = score_array[i][j]
                sentenceToMaxURL[j] = i
    return sentenceToMaxURL
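# The 0.05 margin makes attribution "sticky": once a sentence inherits the
# previous sentence's source, another URL must beat that score by more than
# 0.05 to take over, which keeps consecutive sentences mapped to one source.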
def check_url_category(url):
    for category, urls in url_types.items():
        for u in urls:
            if u in url:
                return category
    return "Internet Source"
def google_search(
    plag_option,
    sentences,
    url_count,
    score_array,
    url_list,
    snippets,
    sorted_date,
    domains_to_skip,
    api_key,
    cse_id,
    **kwargs,
):
    service = build("customsearch", "v1", developerKey=api_key)
    num_pages = 1
    for i, sentence in enumerate(sentences):
        results = (
            service.cse()
            .list(q=sentence, cx=cse_id, sort=sorted_date, **kwargs)
            .execute()
        )
        if "items" in results and len(results["items"]) > 0:
            for count, link in enumerate(results["items"]):
                if count >= num_pages:
                    break
                # skip user-selected domains
                if (domains_to_skip is not None) and any(
                    ("." + domain) in link["link"] for domain in domains_to_skip
                ):
                    continue
                # clean up snippet of '...'
                snippet = link["snippet"]
                ind = snippet.find("...")
                if 9 < ind < 20:
                    snippet = snippet[ind + len("... "):]
                ind = snippet.find("...")
                if ind > len(snippet) - 5:
                    snippet = snippet[:ind]
                # update cosine similarity between snippet and given text
                url = link["link"]
                if url not in url_list:
                    url_list.append(url)
                    score_array.append([0] * len(sentences))
                    snippets.append([""] * len(sentences))
                url_count[url] = url_count[url] + 1 if url in url_count else 1
                snippets[url_list.index(url)][i] = snippet
                if plag_option == "Standard":
                    score_array[url_list.index(url)][i] = cosineSim(sentence, snippet)
                else:
                    score_array[url_list.index(url)][i] = sentence_similarity(sentence, snippet)
    return url_count, score_array
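# score_array and snippets are indexed [url][sentence]: one row per discovered
# URL, one column per query sentence, so a run over 2 URLs and 3 sentences
# fills a 2x3 grid of similarity scores.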
def plagiarism_check(
    plag_option,
    input,
    year_from,
    month_from,
    day_from,
    year_to,
    month_to,
    day_to,
    domains_to_skip,
    source_block_size,
):
    api_key = "AIzaSyCS1WQDMl1IMjaXtwSd_2rA195-Yc4psQE"
    cse_id = "851813e81162b4ed4"
    url_scores = []
    sentence_scores = []
    sentences = split_sentence_blocks(input, source_block_size)
    url_count = {}
    score_array = []
    url_list = []
    snippets = []
    date_from = build_date(year_from, month_from, day_from)
    date_to = build_date(year_to, month_to, day_to)
    sort_date = f"date:r:{date_from}:{date_to}"
    # get list of URLs to check
    start_time = time.perf_counter()
    url_count, score_array = google_search(
        plag_option,
        sentences,
        url_count,
        score_array,
        url_list,
        snippets,
        sort_date,
        domains_to_skip,
        api_key,
        cse_id,
    )
    print("GOOGLE SEARCH PROCESSING TIME: ", time.perf_counter() - start_time)
    # scrape URLs in list
    start_time = time.perf_counter()
    soups = asyncio.run(parallel_scrap(url_list))
    print("SCRAPING PROCESSING TIME: ", time.perf_counter() - start_time)
    input_data = []
    for i, soup in enumerate(soups):
        if soup:
            page_content = soup.text
            for j, sent in enumerate(sentences):
                input_data.append((sent, page_content, score_array[i][j]))
    start_time = time.perf_counter()
    # scores = process_with_multiprocessing(input_data)
    scores = []
    for item in input_data:
        scores.append(matching_score(item))
    print("MATCHING SCORE PROCESSING TIME: ", time.perf_counter() - start_time)
    matched_sentence_array = [
        ["" for _ in range(len(score_array[0]))] for _ in range(len(score_array))
    ]
    # update score array for each (soup, sentence)
    k = 0
    for i, soup in enumerate(soups):
        if soup:
            for j, _ in enumerate(sentences):
                score_array[i][j] = scores[k][0]
                matched_sentence_array[i][j] = scores[k][1]
                k += 1
    sentenceToMaxURL = map_sentence_url(sentences, score_array)
    index = np.unique(sentenceToMaxURL)
    url_source = {}
    for url in index:
        s = [
            score_array[url][sen]
            for sen in range(len(sentences))
            if sentenceToMaxURL[sen] == url
        ]
        url_source[url] = sum(s) / len(s)
    index_descending = sorted(url_source, key=url_source.get, reverse=True)
    urlMap = {}
    for count, i in enumerate(index_descending):
        urlMap[i] = count + 1
    # build results
    for i, sent in enumerate(sentences):
        ind = sentenceToMaxURL[i]
        if url_source[ind] > 0.1:
            sentence_scores.append(
                [sent, round(url_source[ind] * 100, 2), url_list[ind], urlMap[ind]]
            )
        else:
            sentence_scores.append([sent, None, url_list[ind], -1])
    print("SNIPPETS: ", snippets)
    snippets = [[item for item in sublist if item] for sublist in snippets]
    matched_sentence_array = [
        [item for item in sublist if item] for sublist in matched_sentence_array
    ]
    for ind in index_descending:
        if url_source[ind] > 0.1:
            matched_sentence = "...".join(matched_sentence_array[ind])
            if matched_sentence == "":
                matched_sentence = "...".join(snippets[ind])
            url_scores.append(
                [
                    url_list[ind],
                    round(url_source[ind] * 100, 2),
                    urlMap[ind],
                    matched_sentence,
                ]
            )
    return sentence_scores, url_scores
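# Return shapes: sentence_scores rows are [sentence, percent, url, source_index]
# (percent is None and source_index is -1 below the 0.1 threshold);
# url_scores rows are [url, percent, source_index, matched_source_content].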
def html_highlight(
    plag_option,
    input,
    year_from,
    month_from,
    day_from,
    year_to,
    month_to,
    day_to,
    domains_to_skip,
    source_block_size,
):
    start_time = time.perf_counter()
    sentence_scores, url_scores = plagiarism_check(
        plag_option,
        input,
        year_from,
        month_from,
        day_from,
        year_to,
        month_to,
        day_to,
        domains_to_skip,
        source_block_size,
    )
    html_content = """
    <link href='https://fonts.googleapis.com/css?family=Roboto' rel='stylesheet'>
    <div style='font-family: Roboto; border: 2px solid black; padding: 10px; color: #FFFFFF;'>
    <html>
    <head>
    <title>Toggle Details</title>
    <style>
        .score-container {
            display: flex;
            justify-content: space-around;
            align-items: left;
            padding: 20px;
        }
        .score-item {
            text-align: center;
            padding: 10px;
            background-color: #636362;
            border-radius: 5px;
            flex-grow: 1;
            margin: 0 5px;
        }
        .details {
            display: none;
            padding: 10px;
        }
        .url-link {
            font-size: 1.2em;
        }
        .url-link span {
            margin-right: 10px;
        }
        .toggle-button {
            color: #333;
            border: none;
            padding: 5px 10px;
            text-align: center;
            text-decoration: none;
            display: inline-block;
            cursor: pointer;
        }
    </style>
    </head>
    """
    prev_idx = None
    combined_sentence = ""
    total_score = 0
    total_count = 0
    category_scores = defaultdict(set)
    for sentence, score, url, idx in sentence_scores:
        category = check_url_category(url)
        if score is not None:
            total_score += score
            category_scores[category].add(score)
        total_count += 1
        if idx != prev_idx and prev_idx is not None:
            color = color_map[prev_idx - 1]
            index_part = f"<span>[{prev_idx}]</span>"
            formatted_sentence = f'<p style="background-color: {color}; padding: 2px;">{combined_sentence} {index_part}</p>'
            html_content += formatted_sentence
            combined_sentence = ""
        combined_sentence += " " + sentence
        prev_idx = idx
    print(category_scores)
    total_average_score = round(total_score / total_count, 2) if total_count else 0
    category_averages = {
        category: round(sum(scores) / len(scores), 2)
        for category, scores in category_scores.items()
    }
    if combined_sentence:
        color = color_map[prev_idx - 1]
        index_part = ""
        if prev_idx != -1:
            index_part = f"<span>[{prev_idx}]</span>"
        formatted_sentence = f'<p style="background-color: {color}; padding: 2px;">{combined_sentence} {index_part}</p>'
        html_content += formatted_sentence
    html_content += "<hr>"
html_content += f""" | |
<div class="score-container"> | |
<div class="score-item"> | |
<h3>Overall Similarity</h3> | |
<p>{total_average_score}%</p> | |
</div> | |
""" | |
for category, score in category_averages.items(): | |
html_content += f""" | |
<div class="score-item"><h3>{category}</h3><p>{score}%</p></div> | |
""" | |
html_content += "</div>" | |
for url, score, idx, sentence in url_scores: | |
url_category = check_url_category(url) | |
color = color_map[idx - 1] | |
formatted_url = f""" | |
<p style="background-color: {color}; padding: 5px; font-size: 1.2em">[{idx}] <b>{url}</b></p><p><i>{url_category}</i></p> | |
<p> --- <b>Matching Score: </b>{score}%</p> | |
<p> --- <b>Original Source Content: </b>{sentence}</p> | |
""" | |
# formatted_url = f""" | |
# <div class="url-link"> | |
# <p style="background-color: {color}; padding: 5px; font-size: 1.2em">[{idx}] <b>{url}</b></p><p>{url_category}</p> | |
# <a href="#" onclick="toggleDetails(event)" class="toggle-button">></a> | |
# </div> | |
# <div id="detailsContainer" class="details"> | |
# <p> --- <b>Matching Score: </b>{score}%</p> | |
# <p> --- <b>Original Source Content: </b>{sentence}</p> | |
# </div> | |
# """ | |
html_content += formatted_url | |
html_content += "</html>" | |
print("PLAGIARISM PROCESSING TIME: ", time.perf_counter() - start_time) | |
return html_content | |