minko186 committed on
Commit
c6bd7c4
·
verified ·
1 Parent(s): 4d41695

Update predictors.py

Browse files
Files changed (1) hide show
  1. predictors.py +561 -317
predictors.py CHANGED
@@ -1,321 +1,565 @@
1
- import torch
2
- import numpy as np
3
- from transformers import AutoTokenizer, AutoModelForSequenceClassification
4
- import nltk
5
- import torch.nn.functional as F
6
- import nltk
7
- from scipy.special import softmax
8
- import yaml
9
- from utils import *
10
- import joblib
11
- from optimum.bettertransformer import BetterTransformer
12
- import gc
13
- from cleantext import clean
14
- import gradio as gr
15
- from tqdm.auto import tqdm
16
- from transformers import pipeline
17
- from transformers import AutoModelForSequenceClassification, AutoTokenizer
18
- import nltk
19
  from nltk.tokenize import sent_tokenize
20
- from optimum.pipelines import pipeline
21
-
22
- with open("config.yaml", "r") as file:
23
- params = yaml.safe_load(file)
24
-
25
- nltk.download("punkt")
26
- nltk.download("stopwords")
27
- device_needed = "cuda" if torch.cuda.is_available() else "cpu"
28
- device = "cuda" if torch.cuda.is_available() else "cpu"
29
- print('DEVICE IS :' , device)
30
-
31
- text_bc_model_path = params["TEXT_BC_MODEL_PATH"]
32
- text_mc_model_path = params["TEXT_MC_MODEL_PATH"]
33
- text_quillbot_model_path = params["TEXT_QUILLBOT_MODEL_PATH"]
34
- quillbot_labels = params["QUILLBOT_LABELS"]
35
- mc_label_map = params["MC_OUTPUT_LABELS"]
36
- mc_token_size = int(params["MC_TOKEN_SIZE"])
37
- bc_token_size = int(params["BC_TOKEN_SIZE"])
38
- bias_checker_model_name = params['BIAS_CHECKER_MODEL_PATH']
39
- bias_corrector_model_name = params['BIAS_CORRECTOR_MODEL_PATH']
40
- # access_token = params['HF_TOKEN']
41
-
42
- text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path)
43
- text_bc_model = AutoModelForSequenceClassification.from_pretrained(text_bc_model_path).to(device)
44
- text_mc_tokenizer = AutoTokenizer.from_pretrained(text_mc_model_path)
45
- text_mc_model = AutoModelForSequenceClassification.from_pretrained(text_mc_model_path).to(device)
46
- quillbot_tokenizer = AutoTokenizer.from_pretrained(text_quillbot_model_path)
47
- quillbot_model = AutoModelForSequenceClassification.from_pretrained(text_quillbot_model_path).to(device)
48
-
49
- # proxy models for explainability
50
- mini_bc_model_name = "polygraf-ai/bc-model"
51
- bc_tokenizer_mini = AutoTokenizer.from_pretrained(mini_bc_model_name)
52
- bc_model_mini = AutoModelForSequenceClassification.from_pretrained(mini_bc_model_name).to(device_needed)
53
- mini_humanizer_model_name = "polygraf-ai/humanizer-model"
54
- humanizer_tokenizer_mini = AutoTokenizer.from_pretrained(mini_humanizer_model_name)
55
- humanizer_model_mini = AutoModelForSequenceClassification.from_pretrained(mini_humanizer_model_name).to(device_needed)
56
-
57
- bc_model_mini = BetterTransformer.transform(bc_model_mini)
58
- humanizer_model_mini = BetterTransformer.transform(humanizer_model_mini)
59
- text_bc_model = BetterTransformer.transform(text_bc_model)
60
- text_mc_model = BetterTransformer.transform(text_mc_model)
61
- quillbot_model = BetterTransformer.transform(quillbot_model)
62
-
63
- bias_model_checker = AutoModelForSequenceClassification.from_pretrained(bias_checker_model_name)
64
- tokenizer = AutoTokenizer.from_pretrained(bias_checker_model_name)
65
- bias_model_checker = BetterTransformer.transform(bias_model_checker, keep_original_model=False)
66
- bias_checker = pipeline(
67
- "text-classification",
68
- model=bias_checker_model_name,
69
- tokenizer=bias_checker_model_name,
70
- )
71
- gc.collect()
72
- bias_corrector = pipeline( "text2text-generation", model=bias_corrector_model_name, accelerator="ort")
73
-
74
- # model score calibration
75
- iso_reg = joblib.load("isotonic_regression_model.joblib")
76
-
77
-
78
- def split_text(text: str) -> list:
79
- sentences = sent_tokenize(text)
80
- return [[sentence] for sentence in sentences]
81
-
82
- def correct_text(text: str, bias_checker, bias_corrector, separator: str = " ") -> tuple:
83
- sentence_batches = split_text(text)
84
- corrected_text = []
85
- corrections = []
86
- for batch in tqdm(sentence_batches, total=len(sentence_batches), desc="correcting text.."):
87
- raw_text = " ".join(batch)
88
- results = bias_checker(raw_text)
89
- if results[0]["label"] != "LABEL_1" or (results[0]["label"] == "LABEL_1" and results[0]["score"] < 0.9):
90
- corrected_batch = bias_corrector(raw_text)
91
- corrected_version = corrected_batch[0]["generated_text"]
92
- corrected_text.append(corrected_version)
93
- corrections.append((raw_text, corrected_version))
94
- else:
95
- corrected_text.append(raw_text)
96
- corrected_text = separator.join(corrected_text)
97
- return corrected_text, corrections
98
-
99
- def update(text: str):
100
- text = clean(text, lower=False)
101
- corrected_text, corrections = correct_text(text, bias_checker, bias_corrector)
102
- corrections_display = "".join([f"{corr}" for orig, corr in corrections])
103
- if corrections_display == "":
104
- corrections_display = text
105
- return corrections_display
106
-
107
- def update_main(text: str):
108
- text = clean(text, lower=False)
109
- corrected_text, corrections = correct_text(text, bias_checker, bias_corrector)
110
- corrections_display = "\n\n".join([f"Original: {orig}\nCorrected: {corr}" for orig, corr in corrections])
111
- return corrected_text, corrections_display
112
-
113
- def split_text(text: str) -> list:
114
- sentences = sent_tokenize(text)
115
- return [[sentence] for sentence in sentences]
116
-
117
- def get_token_length(tokenizer, sentence):
118
- return len(tokenizer.tokenize(sentence))
119
-
120
- def split_text_allow_complete_sentences_nltk(text, type_det="bc"):
121
- sentences = sent_tokenize(text)
122
- chunks = []
123
- current_chunk = []
124
- current_length = 0
125
- if type_det == "bc":
126
- tokenizer = text_bc_tokenizer
127
- max_tokens = bc_token_size
128
- elif type_det == "mc":
129
- tokenizer = text_mc_tokenizer
130
- max_tokens = mc_token_size
131
-
132
- elif type_det == "quillbot":
133
- tokenizer = quillbot_tokenizer
134
- max_tokens = 256
135
-
136
- def add_sentence_to_chunk(sentence):
137
- nonlocal current_chunk, current_length
138
- sentence_length = get_token_length(tokenizer, sentence)
139
- if current_length + sentence_length > max_tokens:
140
- chunks.append((current_chunk, current_length))
141
- current_chunk = []
142
- current_length = 0
143
- current_chunk.append(sentence)
144
- current_length += sentence_length
145
-
146
- for sentence in sentences:
147
- add_sentence_to_chunk(sentence)
148
- if current_chunk:
149
- chunks.append((current_chunk, current_length))
150
- adjusted_chunks = []
151
- while chunks:
152
- chunk = chunks.pop(0)
153
- if len(chunks) > 0 and chunk[1] < max_tokens / 2:
154
- next_chunk = chunks.pop(0)
155
- combined_length = chunk[1] + next_chunk[1]
156
- if combined_length <= max_tokens:
157
- adjusted_chunks.append((chunk[0] + next_chunk[0], combined_length))
158
- else:
159
- adjusted_chunks.append(chunk)
160
- chunks.insert(0, next_chunk)
161
- else:
162
- adjusted_chunks.append(chunk)
163
- result_chunks = [" ".join(chunk[0]) for chunk in adjusted_chunks]
164
- return result_chunks
165
-
166
-
167
- def predict_quillbot(text, bias_buster_selected):
168
- if bias_buster_selected:
169
- text = update(text)
170
- with torch.no_grad():
171
- quillbot_model.eval()
172
- tokenized_text = quillbot_tokenizer(
173
- text,
174
- padding="max_length",
175
- truncation=True,
176
- max_length=256,
177
- return_tensors="pt",
178
- ).to(device)
179
- output = quillbot_model(**tokenized_text)
180
- output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
181
- q_score = {
182
- "Humanized": output_norm[1].item(),
183
- "Original": output_norm[0].item(),
184
- }
185
- return q_score
186
-
187
-
188
- def predict_for_explainanility(text, model_type=None):
189
- if model_type == "quillbot":
190
- cleaning = False
191
- max_length = 256
192
- model = humanizer_model_mini
193
- tokenizer = humanizer_tokenizer_mini
194
- elif model_type == "bc":
195
- cleaning = True
196
- max_length = bc_token_size
197
- model = bc_model_mini
198
- tokenizer = bc_tokenizer_mini
199
  else:
200
- raise ValueError("Invalid model type")
201
- with torch.no_grad():
202
- if cleaning:
203
- text = [remove_special_characters(t) for t in text]
204
- tokenized_text = tokenizer(
205
- text,
206
- return_tensors="pt",
207
- padding="max_length",
208
- truncation=True,
209
- max_length=max_length,
210
- ).to(device_needed)
211
- outputs = model(**tokenized_text)
212
- tensor_logits = outputs[0]
213
- probas = F.softmax(tensor_logits).detach().cpu().numpy()
214
- return probas
215
-
216
-
217
- def predict_bc(model, tokenizer, text):
218
- with torch.no_grad():
219
- model.eval()
220
- tokens = text_bc_tokenizer(
221
- text,
222
- padding="max_length",
223
- truncation=True,
224
- max_length=bc_token_size,
225
- return_tensors="pt",
226
- ).to(device)
227
- output = model(**tokens)
228
- output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
229
- return output_norm
230
-
231
-
232
- def predict_mc(model, tokenizer, text):
233
- with torch.no_grad():
234
- model.eval()
235
- tokens = text_mc_tokenizer(
236
- text,
237
- padding="max_length",
238
- truncation=True,
239
- return_tensors="pt",
240
- max_length=mc_token_size,
241
- ).to(device)
242
- output = model(**tokens)
243
- output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
244
- return output_norm
245
-
246
-
247
- def predict_bc_scores(input):
248
- bc_scores = []
249
- samples_len_bc = len(
250
- split_text_allow_complete_sentences_nltk(input, type_det="bc")
251
- )
252
- segments_bc = split_text_allow_complete_sentences_nltk(input, type_det="bc")
253
- for i in range(samples_len_bc):
254
- cleaned_text_bc = remove_special_characters(segments_bc[i])
255
- bc_score = predict_bc(text_bc_model, text_bc_tokenizer, cleaned_text_bc)
256
- bc_scores.append(bc_score)
257
- bc_scores_array = np.array(bc_scores)
258
- average_bc_scores = np.mean(bc_scores_array, axis=0)
259
- bc_score_list = average_bc_scores.tolist()
260
- print(
261
- f"Original BC scores: AI: {bc_score_list[1]}, HUMAN: {bc_score_list[0]}"
262
- )
263
- # isotonic regression calibration
264
- ai_score = iso_reg.predict([bc_score_list[1]])[0]
265
- human_score = 1 - ai_score
266
- bc_score = {"AI": ai_score, "HUMAN": human_score}
267
- print(f"Calibration BC scores: AI: {ai_score}, HUMAN: {human_score}")
268
- print(f"Input Text: {cleaned_text_bc}")
269
- return bc_score
270
-
271
-
272
- def predict_mc_scores(input):
273
- # BC SCORE
274
- bc_scores = []
275
- samples_len_bc = len(
276
- split_text_allow_complete_sentences_nltk(input, type_det="bc")
277
- )
278
- segments_bc = split_text_allow_complete_sentences_nltk(input, type_det="bc")
279
- for i in range(samples_len_bc):
280
- cleaned_text_bc = remove_special_characters(segments_bc[i])
281
- bc_score = predict_bc(text_bc_model, text_bc_tokenizer, cleaned_text_bc)
282
- bc_scores.append(bc_score)
283
- bc_scores_array = np.array(bc_scores)
284
- average_bc_scores = np.mean(bc_scores_array, axis=0)
285
- bc_score_list = average_bc_scores.tolist()
286
- print(
287
- f"Original BC scores: AI: {bc_score_list[1]}, HUMAN: {bc_score_list[0]}"
288
- )
289
- # isotonic regression calibration
290
- ai_score = iso_reg.predict([bc_score_list[1]])[0]
291
- human_score = 1 - ai_score
292
- bc_score = {"AI": ai_score, "HUMAN": human_score}
293
- print(f"Calibration BC scores: AI: {ai_score}, HUMAN: {human_score}")
294
- mc_scores = []
295
- segments_mc = split_text_allow_complete_sentences_nltk(
296
- input, type_det="mc"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
297
  )
298
- samples_len_mc = len(
299
- split_text_allow_complete_sentences_nltk(input, type_det="mc")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
300
  )
301
- for i in range(samples_len_mc):
302
- cleaned_text_mc = remove_special_characters(segments_mc[i])
303
- mc_score = predict_mc(
304
- text_mc_model, text_mc_tokenizer, cleaned_text_mc
305
- )
306
- mc_scores.append(mc_score)
307
- mc_scores_array = np.array(mc_scores)
308
- average_mc_scores = np.mean(mc_scores_array, axis=0)
309
- mc_score_list = average_mc_scores.tolist()
310
- mc_score = {}
311
- for score, label in zip(mc_score_list, mc_label_map):
312
- mc_score[label.upper()] = score
313
-
314
- sum_prob = 1 - bc_score["HUMAN"]
315
- for key, value in mc_score.items():
316
- mc_score[key] = value * sum_prob
317
- print("MC Score:", mc_score)
318
- if sum_prob < 0.01:
319
- mc_score = {}
320
-
321
- return mc_score
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  from nltk.tokenize import sent_tokenize
3
+ from googleapiclient.discovery import build
4
+ from collections import Counter
5
+ import re, math
6
+ from sentence_transformers import SentenceTransformer, util
7
+ import asyncio
8
+ import httpx
9
+ from bs4 import BeautifulSoup
10
+ import numpy as np
11
+ import concurrent
12
+ from multiprocessing import Pool
13
+ from const import url_types
14
+ from collections import defaultdict
15
+
16
# Word-token pattern used to build bag-of-words vectors for cosine similarity.
WORD = re.compile(r"\w+")
# Sentence-embedding model backing sentence_similarity() (non-"Standard"
# plagiarism option). NOTE(review): loaded at import time — first run may
# download the model.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")


# English month name -> zero-padded month number; consumed by build_date()
# when constructing Google CSE date-restrict strings.
months = {
    "January": "01",
    "February": "02",
    "March": "03",
    "April": "04",
    "May": "05",
    "June": "06",
    "July": "07",
    "August": "08",
    "September": "09",
    "October": "10",
    "November": "11",
    "December": "12",
}

# Highlight palette indexed by (source rank - 1): rank 1 (strongest match)
# gets the first, reddest color; later ranks shade toward green.
color_map = [
    "#cf2323",
    "#d65129",
    "#d66329",
    "#d67129",
    "#eb9d59",
    "#c2ad36",
    "#d6ae29",
    "#d6b929",
    "#e1ed72",
    "#c2db76",
    "#a2db76",
]
48
+
49
+
50
def text_to_vector(text):
    """Turn *text* into a bag-of-words Counter keyed by word tokens."""
    return Counter(WORD.findall(text))
53
+
54
+
55
def cosineSim(text1, text2):
    """Cosine similarity between the word-count vectors of two texts."""
    return get_cosine(text_to_vector(text1), text_to_vector(text2))
61
+
62
+
63
def get_cosine(vec1, vec2):
    """Cosine similarity between two sparse word-count vectors (dict-like).

    Returns 0.0 when either vector is empty (zero denominator).
    """
    shared = set(vec1.keys()) & set(vec2.keys())
    numerator = sum(vec1[word] * vec2[word] for word in shared)
    norm1 = math.sqrt(sum(count ** 2 for count in vec1.values()))
    norm2 = math.sqrt(sum(count ** 2 for count in vec2.values()))
    denominator = norm1 * norm2
    if denominator == 0:
        return 0.0
    return float(numerator) / denominator
73
+
74
+
75
def split_sentence_blocks(text, size):
    """Split *text* into scoring units.

    "Paragraph" splits the stripped text on newlines; anything else
    tokenizes it into sentences with NLTK.
    """
    stripped = text.strip()
    if size == "Paragraph":
        return stripped.split("\n")
    return sent_tokenize(stripped)
82
+
83
+
84
def build_date(year=2024, month="March", day=1):
    """Build a YYYYMMDD date string for Google CSE ``sort=date:r:`` restricts.

    *month* is an English month name resolved through the module-level
    ``months`` map. The day is zero-padded: the original f-string emitted
    e.g. "2024031" for March 1, which is not a valid 8-digit date for the
    CSE date-range syntax.
    """
    return f"{year}{months[month]}{int(day):02d}"
86
+
87
+
88
def split_ngrams(text, n):
    """Return every word n-gram of *text* as a list of tuples (in order)."""
    words = text.split()
    last_start = len(words) - n + 1
    return [tuple(words[start : start + n]) for start in range(last_start)]
91
+
92
+
93
def sentence_similarity(text1, text2):
    """Semantic similarity of two texts via MiniLM sentence embeddings."""
    emb1 = model.encode(text1, convert_to_tensor=True)
    emb2 = model.encode(text2, convert_to_tensor=True)
    return util.pytorch_cos_sim(emb1, emb2).item()
98
+
99
+
100
async def get_url_data(url, client):
    """Fetch *url* with *client* and return a parsed BeautifulSoup document.

    Best-effort: returns None for non-200 responses and swallows any
    request or parse error.
    """
    try:
        response = await client.get(url)
        if response.status_code == 200:
            return BeautifulSoup(response.content, "html.parser")
    except Exception:
        return None
    return None
108
+
109
+
110
async def parallel_scrap(urls):
    """Fetch all *urls* concurrently.

    Returns one entry per URL, aligned with the input order: a soup, None,
    or the raised exception (return_exceptions=True).
    """
    async with httpx.AsyncClient(timeout=30) as client:
        tasks = [get_url_data(url=url, client=client) for url in urls]
        return await asyncio.gather(*tasks, return_exceptions=True)
117
+
118
+
119
def merge_ngrams_into_sentence(ngrams):
    """Stitch an ordered list of word n-gram tuples back into one string.

    An n-gram whose words overlap the tail of the merged word list only
    contributes its non-overlapping suffix; at most the first 20 n-grams
    are considered. Returns "" when *ngrams* is None.
    """
    if ngrams is None:  # was `== None`; identity test is the Python idiom
        return ""
    merged_words = []
    for ngram in ngrams[:20]:  # cap work for very long matches
        # Overlap between this n-gram and the last len(ngram) merged words.
        overlap = len(set(ngram) & set(merged_words[-len(ngram):]))
        if overlap == 0:
            merged_words.extend(ngram)
        elif overlap < len(ngram):
            merged_words.extend(ngram[overlap:])
    return " ".join(merged_words)
133
+
134
+
135
def remove_ngrams_after(ngrams, target_ngram):
    """Truncate *ngrams* just after the first occurrence of *target_ngram*.

    Returns None when the target is absent (the original swallowed the
    ValueError from list.index).
    """
    if target_ngram not in ngrams:
        return None
    cut = ngrams.index(target_ngram) + 1
    return ngrams[:cut]
141
+
142
+
143
def matching_score(sentence_content_tuple):
    """Score how much of *sentence* appears in *content*.

    Input is a (sentence, content, prior_score) tuple; the prior score is
    accepted for interface compatibility but unused. Returns
    (score, matched_text): score is 1 for a verbatim substring, otherwise
    the fraction of the sentence's distinct 5-grams found in the content;
    matched_text is the content span covering those matches.

    Fix over the original: the matched-content reconstruction previously
    iterated Python sets, so the resulting text was nondeterministic and
    contained duplicated n-grams. It now walks the content n-grams in
    document order, keeping everything from the first to the last match.
    """
    sentence, content, _prior_score = sentence_content_tuple
    if sentence in content:
        return 1, sentence

    n = 5
    sentence_ngrams = set(split_ngrams(sentence, n))
    if len(sentence_ngrams) == 0:
        return 0, ""

    content_ngrams_ordered = split_ngrams(content, n)
    matched = sentence_ngrams.intersection(content_ngrams_ordered)

    # Reconstruct the matched span deterministically: keep content n-grams
    # from the first match up to (and including) the last match.
    matched_span = []
    last_found = None
    found = False
    for ngram in content_ngrams_ordered:
        if ngram in matched:
            found = True
            last_found = ngram
        if found:
            matched_span.append(ngram)
    matched_span = remove_ngrams_after(matched_span, last_found)
    matched_content = merge_ngrams_into_sentence(matched_span)

    return len(matched) / len(sentence_ngrams), matched_content
193
+
194
+
195
def process_with_multiprocessing(input_data):
    """Run matching_score over *input_data* in a single-worker process pool."""
    with Pool(processes=1) as worker_pool:
        return worker_pool.map(matching_score, input_data)
199
+
200
+
201
def map_sentence_url(sentences, score_array):
    """Assign each sentence index the index of its best-matching URL.

    score_array[url][sentence] holds match scores. Each sentence starts
    from the previous sentence's pick; while that pick is unchanged, a
    challenger URL must beat the running best by a 0.05 margin — a
    stickiness bias that keeps consecutive sentences attributed to the
    same source.
    """
    assignments = [-1] * len(sentences)
    for sent_idx in range(len(sentences)):
        if sent_idx > 0:
            previous = assignments[sent_idx - 1]
            best = score_array[previous][sent_idx]
            assignments[sent_idx] = previous
        else:
            best = -1
        for url_idx in range(len(score_array)):
            sticky = (
                sent_idx > 0
                and assignments[sent_idx] == assignments[sent_idx - 1]
            )
            margin = 0.05 if sticky else 0
            if score_array[url_idx][sent_idx] - best > margin:
                best = score_array[url_idx][sent_idx]
                assignments[sent_idx] = url_idx
    return assignments
219
+
220
+
221
def check_url_category(url):
    """Classify *url* via the substring patterns in const.url_types.

    Falls back to "Internet Source" when no pattern matches.
    """
    for category, patterns in url_types.items():
        if any(pattern in url for pattern in patterns):
            return category
    return "Internet Source"
227
+
228
+
229
def google_search(
    plag_option,
    sentences,
    url_count,
    score_array,
    url_list,
    snippets,
    sorted_date,
    domains_to_skip,
    api_key,
    cse_id,
    **kwargs,
):
    """Query Google Custom Search once per sentence and accumulate matches.

    Mutates `url_count`, `score_array`, `url_list` and `snippets` in place
    (the first two are also returned). For each sentence, the top
    `num_pages` results are scored against the sentence with plain cosine
    similarity when plag_option == "Standard", otherwise with embedding
    similarity.

    NOTE(review): rows are addressed by url_list.index(url), so this
    assumes url_list is only ever appended to, never reordered.
    """
    service = build("customsearch", "v1", developerKey=api_key)
    num_pages = 1  # only the top result per sentence is considered
    for i, sentence in enumerate(sentences):
        results = (
            service.cse()
            .list(q=sentence, cx=cse_id, sort=sorted_date, **kwargs)
            .execute()
        )
        if "items" in results and len(results["items"]) > 0:
            for count, link in enumerate(results["items"]):
                if count >= num_pages:
                    break
                # skip user selected domains
                if (domains_to_skip is not None) and any(
                    ("." + domain) in link["link"] for domain in domains_to_skip
                ):
                    continue
                # clean up snippet of '...'
                snippet = link["snippet"]
                ind = snippet.find("...")
                if ind < 20 and ind > 9:
                    snippet = snippet[ind + len("... ") :]
                ind = snippet.find("...")
                if ind > len(snippet) - 5:
                    snippet = snippet[:ind]

                # update cosine similarity between snippet and given text
                url = link["link"]
                if url not in url_list:
                    # first sighting: grow the parallel per-URL rows
                    url_list.append(url)
                    score_array.append([0] * len(sentences))
                    snippets.append([""] * len(sentences))
                url_count[url] = url_count[url] + 1 if url in url_count else 1
                snippets[url_list.index(url)][i] = snippet
                if plag_option == "Standard":
                    score_array[url_list.index(url)][i] = cosineSim(
                        sentence, snippet
                    )
                else:
                    score_array[url_list.index(url)][i] = sentence_similarity(
                        sentence, snippet
                    )
    return url_count, score_array
285
+
286
+
287
def plagiarism_check(
    plag_option,
    input,
    year_from,
    month_from,
    day_from,
    year_to,
    month_to,
    day_to,
    domains_to_skip,
    source_block_size,
):
    """Run the full plagiarism pipeline over `input`.

    Steps: split the text into blocks -> Google CSE search per block ->
    scrape the candidate URLs concurrently -> n-gram match each block
    against each scraped page -> attribute each block to its best URL.
    Returns (sentence_scores, url_scores) as consumed by html_highlight.
    """
    # SECURITY(review): live Google API key hard-coded in source — this
    # credential is leaked and must be rotated, then loaded from env/config.
    api_key = "AIzaSyCS1WQDMl1IMjaXtwSd_2rA195-Yc4psQE"
    cse_id = "851813e81162b4ed4"

    url_scores = []
    sentence_scores = []
    sentences = split_sentence_blocks(input, source_block_size)
    url_count = {}
    score_array = []
    url_list = []
    snippets = []
    date_from = build_date(year_from, month_from, day_from)
    date_to = build_date(year_to, month_to, day_to)
    sort_date = f"date:r:{date_from}:{date_to}"
    # get list of URLS to check
    start_time = time.perf_counter()
    url_count, score_array = google_search(
        plag_option,
        sentences,
        url_count,
        score_array,
        url_list,
        snippets,
        sort_date,
        domains_to_skip,
        api_key,
        cse_id,
    )
    print("GOOGLE SEARCH PROCESSING TIME: ", time.perf_counter() - start_time)
    # Scrape URLs in list
    start_time = time.perf_counter()
    soups = asyncio.run(parallel_scrap(url_list))
    print("SCRAPING PROCESSING TIME: ", time.perf_counter() - start_time)
    # Build one (sentence, page_text, prior_score) task per scraped page
    # and sentence; pages that failed to scrape (falsy soup) are skipped
    # and keep their search-snippet scores in score_array.
    input_data = []
    for i, soup in enumerate(soups):
        if soup:
            page_content = soup.text
            for j, sent in enumerate(sentences):
                input_data.append((sent, page_content, score_array[i][j]))
    start_time = time.perf_counter()
    # scores = process_with_multiprocessing(input_data)
    scores = []
    for i in input_data:
        scores.append(matching_score(i))
    print("MATCHING SCORE PROCESSING TIME: ", time.perf_counter() - start_time)
    matched_sentence_array = [
        ["" for _ in range(len(score_array[0]))]
        for _ in range(len(score_array))
    ]

    k = 0
    # Update score array for each (soup, sentence); k walks the flat
    # scores list in the same order input_data was built.
    for i, soup in enumerate(soups):
        if soup:
            for j, _ in enumerate(sentences):
                score_array[i][j] = scores[k][0]
                matched_sentence_array[i][j] = scores[k][1]
                k += 1

    sentenceToMaxURL = map_sentence_url(sentences, score_array)
    index = np.unique(sentenceToMaxURL)

    # Mean score per attributed URL (only over its own sentences).
    url_source = {}
    for url in index:
        s = [
            score_array[url][sen]
            for sen in range(len(sentences))
            if sentenceToMaxURL[sen] == url
        ]
        url_source[url] = sum(s) / len(s)
    index_descending = sorted(url_source, key=url_source.get, reverse=True)
    # urlMap: URL index -> 1-based display rank (also color_map index + 1).
    urlMap = {}
    for count, i in enumerate(index_descending):
        urlMap[i] = count + 1

    # build results: per-sentence rows; sources under 10% mean score are
    # reported with score None and rank -1.
    for i, sent in enumerate(sentences):
        ind = sentenceToMaxURL[i]
        if url_source[ind] > 0.1:
            sentence_scores.append(
                [
                    sent,
                    round(url_source[ind] * 100, 2),
                    url_list[ind],
                    urlMap[ind],
                ]
            )
        else:
            sentence_scores.append([sent, None, url_list[ind], -1])
    print("SNIPPETS: ", snippets)
    snippets = [[item for item in sublist if item] for sublist in snippets]
    for ind in index_descending:
        if url_source[ind] > 0.1:
            matched_sentence_array = [
                [item for item in sublist if item]
                for sublist in matched_sentence_array
            ]
            matched_sentence = "...".join(
                [sent for sent in matched_sentence_array[ind]]
            )
            # Fall back to the raw search snippets when no page content matched.
            if matched_sentence == "":
                matched_sentence = "...".join([sent for sent in snippets[ind]])
            url_scores.append(
                [
                    url_list[ind],
                    round(url_source[ind] * 100, 2),
                    urlMap[ind],
                    matched_sentence,
                ]
            )

    return sentence_scores, url_scores
416
+
417
+
418
def html_highlight(
    plag_option,
    input,
    year_from,
    month_from,
    day_from,
    year_to,
    month_to,
    day_to,
    domains_to_skip,
    source_block_size,
):
    """Run plagiarism_check and render its results as an HTML report.

    The report highlights consecutive sentences attributed to the same
    source in that source's rank color, then shows overall and per-category
    similarity averages, then one detail block per matched URL.

    NOTE(review): `total_average_score` divides by `total_count`, which is
    0 when sentence_scores is empty — confirm callers never pass empty
    input. Also, unmatched runs have prev_idx == -1, so color_map[-2] is
    used for their highlight — presumably unintended; verify.
    """
    start_time = time.perf_counter()
    sentence_scores, url_scores = plagiarism_check(
        plag_option,
        input,
        year_from,
        month_from,
        day_from,
        year_to,
        month_to,
        day_to,
        domains_to_skip,
        source_block_size,
    )

    # Static header/styles. Plain string (not an f-string): the `{font}`
    # placeholder is emitted literally and never substituted.
    html_content = """
    <link href='https://fonts.googleapis.com/css?family=Roboto' rel='stylesheet'>
    <div style='font-family: {font}; border: 2px solid black; padding: 10px; color: #FFFFFF;'>
    <html>
    <head>
    <title>Toggle Details</title>
    <style>
    .score-container {
        display: flex;
        justify-content: space-around;
        align-items: left;
        padding: 20px;
    }
    .score-item {
        text-align: center;
        padding: 10px;
        background-color: #636362;
        border-radius: 5px;
        flex-grow: 1;
        margin: 0 5px;
    }
    .details {
        display: none;
        padding: 10px;
    }
    .url-link {
        font-size: 1.2em;
    }
    .url-link span {
        margin-right: 10px;
    }
    .toggle-button {
        color: #333;
        border: none;
        padding: 5px 10px;
        text-align: center;
        text-decoration: none;
        display: inline-block;
        cursor: pointer;
    }
    </style>
    </head>
    """

    # Merge consecutive sentences with the same source rank into one
    # highlighted paragraph, accumulating score totals as we go.
    prev_idx = None
    combined_sentence = ""
    total_score = 0
    total_count = 0
    category_scores = defaultdict(set)
    for sentence, score, url, idx in sentence_scores:
        category = check_url_category(url)
        if score is None:
            total_score += 0
        else:
            total_score += score
            category_scores[category].add(score)
        total_count += 1

        # Rank changed: flush the previous run as a colored paragraph.
        if idx != prev_idx and prev_idx is not None:
            color = color_map[prev_idx - 1]
            index_part = f"<span>[{prev_idx}]</span>"
            formatted_sentence = f'<p style="background-color: {color}; padding: 2px;">{combined_sentence} {index_part}</p>'
            html_content += formatted_sentence
            combined_sentence = ""
        combined_sentence += " " + sentence
        prev_idx = idx

    print(category_scores)
    total_average_score = round(total_score / total_count, 2)
    category_averages = {
        category: round((sum(scores) / len(scores)), 2)
        for category, scores in category_scores.items()
    }

    # Flush the trailing run (no [rank] tag for unmatched, idx == -1).
    if combined_sentence:
        color = color_map[prev_idx - 1]
        index_part = ""
        if prev_idx != -1:
            index_part = f"<span>[{prev_idx}]</span>"
        formatted_sentence = f'<p style="background-color: {color}; padding: 2px;">{combined_sentence} {index_part}</p>'
        html_content += formatted_sentence

    html_content += "<hr>"

    # Summary cards: overall similarity plus one card per URL category.
    html_content += f"""
    <div class="score-container">
        <div class="score-item">
            <h3>Overall Similarity</h3>
            <p>{total_average_score}%</p>
        </div>
    """
    for category, score in category_averages.items():
        html_content += f"""
        <div class="score-item"><h3>{category}</h3><p>{score}%</p></div>
        """
    html_content += "</div>"

    # One detail block per matched source URL (rank color, score, matched text).
    for url, score, idx, sentence in url_scores:
        url_category = check_url_category(url)
        color = color_map[idx - 1]
        formatted_url = f"""
        <p style="background-color: {color}; padding: 5px; font-size: 1.2em">[{idx}] <b>{url}</b></p><p><i>{url_category}</i></p>
        <p> --- <b>Matching Score: </b>{score}%</p>
        <p> --- <b>Original Source Content: </b>{sentence}</p>
        """
        html_content += formatted_url

    html_content += "</html>"

    print("PLAGIARISM PROCESSING TIME: ", time.perf_counter() - start_time)

    return html_content