minko186 committed on
Commit
350b1a0
·
1 Parent(s): a224fbc

refactored plagiarism

Browse files
Files changed (3) hide show
  1. plagiarism.py +149 -184
  2. predictors.py +41 -29
  3. utils.py +2 -22
plagiarism.py CHANGED
@@ -16,37 +16,36 @@ WORD = re.compile(r"\w+")
16
  model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
17
 
18
 
19
- # returns cosine similarity of two vectors
20
- # input: two vectors
21
- # output: integer between 0 and 1.
22
- def get_cosine(vec1, vec2):
23
- intersection = set(vec1.keys()) & set(vec2.keys())
24
-
25
- # calculating numerator
26
- numerator = sum([vec1[x] * vec2[x] for x in intersection])
27
-
28
- # calculating denominator
29
- sum1 = sum([vec1[x] ** 2 for x in vec1.keys()])
30
- sum2 = sum([vec2[x] ** 2 for x in vec2.keys()])
31
- denominator = math.sqrt(sum1) * math.sqrt(sum2)
 
32
 
33
- # checking for divide by zero
34
- if denominator == 0:
35
- return 0.0
36
- else:
37
- return float(numerator) / denominator
 
 
 
38
 
39
 
40
- # converts given text into a vector
41
  def text_to_vector(text):
42
- # uses the Regular expression above and gets all words
43
  words = WORD.findall(text)
44
- # returns a counter of all the words (count of number of occurences)
45
  return Counter(words)
46
 
47
 
48
- # returns cosine similarity of two words
49
- # uses: text_to_vector(text) and get_cosine(v1,v2)
50
  def cosineSim(text1, text2):
51
  vector1 = text_to_vector(text1)
52
  vector2 = text_to_vector(text2)
@@ -55,75 +54,16 @@ def cosineSim(text1, text2):
55
  return cosine
56
 
57
 
58
- def cos_sim_torch(embedding_1, embedding_2):
59
- return util.pytorch_cos_sim(embedding_1, embedding_2).item()
60
-
61
-
62
- def embed_text(text):
63
- return model.encode(text, convert_to_tensor=True)
64
-
65
-
66
- def sentence_similarity(text1, text2):
67
- embedding_1 = model.encode(text1, convert_to_tensor=True)
68
- embedding_2 = model.encode(text2, convert_to_tensor=True)
69
-
70
- o = util.pytorch_cos_sim(embedding_1, embedding_2)
71
- return o.item()
72
-
73
-
74
- def google_search(
75
- plag_option,
76
- sentences,
77
- url_count,
78
- score_array,
79
- url_list,
80
- sorted_date,
81
- domains_to_skip,
82
- api_key,
83
- cse_id,
84
- **kwargs,
85
- ):
86
- service = build("customsearch", "v1", developerKey=api_key)
87
- for i, sentence in enumerate(sentences):
88
- results = (
89
- service.cse()
90
- .list(q=sentence, cx=cse_id, sort=sorted_date, **kwargs)
91
- .execute()
92
- )
93
- if "items" in results and len(results["items"]) > 0:
94
- for count, link in enumerate(results["items"]):
95
- # stop after 3 pages
96
- if count >= 3:
97
- break
98
- # skip user selected domains
99
- if any(
100
- ("." + domain) in link["link"] for domain in domains_to_skip
101
- ):
102
- continue
103
- # clean up snippet of '...'
104
- snippet = link["snippet"]
105
- ind = snippet.find("...")
106
- if ind < 20 and ind > 9:
107
- snippet = snippet[ind + len("... ") :]
108
- ind = snippet.find("...")
109
- if ind > len(snippet) - 5:
110
- snippet = snippet[:ind]
111
-
112
- # update cosine similarity between snippet and given text
113
- url = link["link"]
114
- if url not in url_list:
115
- url_list.append(url)
116
- score_array.append([0] * len(sentences))
117
- url_count[url] = url_count[url] + 1 if url in url_count else 1
118
- if plag_option == "Standard":
119
- score_array[url_list.index(url)][i] = cosineSim(
120
- sentence, snippet
121
- )
122
- else:
123
- score_array[url_list.index(url)][i] = sentence_similarity(
124
- sentence, snippet
125
- )
126
- return url_count, score_array
127
 
128
 
129
  def split_sentence_blocks(text):
@@ -138,49 +78,32 @@ def split_sentence_blocks(text):
138
  return two_sents
139
 
140
 
141
- months = {
142
- "January": "01",
143
- "February": "02",
144
- "March": "03",
145
- "April": "04",
146
- "May": "05",
147
- "June": "06",
148
- "July": "07",
149
- "August": "08",
150
- "September": "09",
151
- "October": "10",
152
- "November": "11",
153
- "December": "12",
154
- }
155
-
156
-
157
  def build_date(year=2024, month="March", day=1):
158
  return f"{year}{months[month]}{day}"
159
 
160
 
 
 
 
 
 
 
 
 
 
 
 
 
161
  async def get_url_data(url, client):
162
  try:
163
  r = await client.get(url)
164
- # print(r.status_code)
165
  if r.status_code == 200:
166
- # print("in")
167
  soup = BeautifulSoup(r.content, "html.parser")
168
  return soup
169
  except Exception:
170
  return None
171
 
172
 
173
- def remove_punc(text):
174
- res = re.sub(r"[^\w\s]", "", text)
175
- return res
176
-
177
-
178
- def split_ngrams(text, n):
179
- # return n-grams of size n
180
- words = text.split()
181
- return [words[i : i + n] for i in range(len(words) - n + 1)]
182
-
183
-
184
  async def parallel_scrap(urls):
185
  async with httpx.AsyncClient(timeout=30) as client:
186
  tasks = []
@@ -209,11 +132,6 @@ def process_with_multiprocessing(input_data):
209
  return scores
210
 
211
 
212
- def print2d(array):
213
- for row in array:
214
- print(row)
215
-
216
-
217
  def map_sentence_url(sentences, score_array):
218
  sentenceToMaxURL = [-1] * len(sentences)
219
  for j in range(len(sentences)):
@@ -234,65 +152,59 @@ def map_sentence_url(sentences, score_array):
234
  return sentenceToMaxURL
235
 
236
 
237
- def html_highlight(
238
  plag_option,
239
- input,
240
- year_from,
241
- month_from,
242
- day_from,
243
- year_to,
244
- month_to,
245
- day_to,
246
  domains_to_skip,
 
 
 
247
  ):
248
- sentence_scores, url_scores = plagiarism_check(
249
- plag_option,
250
- input,
251
- year_from,
252
- month_from,
253
- day_from,
254
- year_to,
255
- month_to,
256
- day_to,
257
- domains_to_skip,
258
- )
259
- color_map = [
260
- "#cf2323",
261
- "#eb9d59",
262
- "#c2ad36",
263
- "#e1ed72",
264
- "#c2db76",
265
- "#a2db76",
266
- ]
267
- font = "Roboto"
268
- html_content = "<link href='https://fonts.googleapis.com/css?family=Roboto' rel='stylesheet'>\n<div style='font-family: {font}; border: 2px solid black; background-color: #333333; padding: 10px; color: #FFFFFF;'>"
269
- prev_idx = None
270
- combined_sentence = ""
271
- for sentence, _, _, idx in sentence_scores:
272
- if idx != prev_idx and prev_idx is not None:
273
- color = color_map[prev_idx - 1]
274
- index_part = f'<span style="background-color: {color}; padding: 2px;">[{prev_idx}]</span>'
275
- formatted_sentence = f"<p>{combined_sentence} {index_part}</p>"
276
- html_content += formatted_sentence
277
- combined_sentence = ""
278
- combined_sentence += " " + sentence
279
- prev_idx = idx
280
-
281
- if combined_sentence:
282
- color = color_map[prev_idx - 1]
283
- index_part = f'<span style="background-color: {color}; padding: 2px;">[{prev_idx}]</span>'
284
- formatted_sentence = f"<p>{combined_sentence} {index_part}</p>"
285
- html_content += formatted_sentence
286
-
287
- html_content += "<hr>"
288
- for url, score, idx in url_scores:
289
- color = color_map[idx - 1]
290
- formatted_url = f'<p style="background-color: {color}; padding: 5px;">({idx}) <b>{url}</b></p><p> --- Matching Score: {score}%</p>'
291
- html_content += formatted_url
292
-
293
- html_content += "</div>"
294
 
295
- return html_content
 
 
 
 
 
 
 
 
 
 
 
 
 
 
296
 
297
 
298
  def plagiarism_check(
@@ -306,11 +218,11 @@ def plagiarism_check(
306
  day_to,
307
  domains_to_skip,
308
  ):
309
- api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
310
- api_key = "AIzaSyCS1WQDMl1IMjaXtwSd_2rA195-Yc4psQE"
311
  # api_key = "AIzaSyCB61O70B8AC3l5Kk3KMoLb6DN37B7nqIk"
312
  # api_key = "AIzaSyCg1IbevcTAXAPYeYreps6wYWDbU0Kz8tg"
313
- # api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
314
  cse_id = "851813e81162b4ed4"
315
 
316
  url_scores = []
@@ -384,3 +296,56 @@ def plagiarism_check(
384
  )
385
 
386
  return sentence_scores, url_scores
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
17
 
18
 
19
+ months = {
20
+ "January": "01",
21
+ "February": "02",
22
+ "March": "03",
23
+ "April": "04",
24
+ "May": "05",
25
+ "June": "06",
26
+ "July": "07",
27
+ "August": "08",
28
+ "September": "09",
29
+ "October": "10",
30
+ "November": "11",
31
+ "December": "12",
32
+ }
33
 
34
+ color_map = [
35
+ "#cf2323",
36
+ "#eb9d59",
37
+ "#c2ad36",
38
+ "#e1ed72",
39
+ "#c2db76",
40
+ "#a2db76",
41
+ ]
42
 
43
 
 
44
  def text_to_vector(text):
 
45
  words = WORD.findall(text)
 
46
  return Counter(words)
47
 
48
 
 
 
49
  def cosineSim(text1, text2):
50
  vector1 = text_to_vector(text1)
51
  vector2 = text_to_vector(text2)
 
54
  return cosine
55
 
56
 
57
+ def get_cosine(vec1, vec2):
58
+ intersection = set(vec1.keys()) & set(vec2.keys())
59
+ numerator = sum([vec1[x] * vec2[x] for x in intersection])
60
+ sum1 = sum([vec1[x] ** 2 for x in vec1.keys()])
61
+ sum2 = sum([vec2[x] ** 2 for x in vec2.keys()])
62
+ denominator = math.sqrt(sum1) * math.sqrt(sum2)
63
+ if denominator == 0:
64
+ return 0.0
65
+ else:
66
+ return float(numerator) / denominator
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
68
 
69
  def split_sentence_blocks(text):
 
78
  return two_sents
79
 
80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  def build_date(year=2024, month="March", day=1):
82
  return f"{year}{months[month]}{day}"
83
 
84
 
85
+ def split_ngrams(text, n):
86
+ words = text.split()
87
+ return [words[i : i + n] for i in range(len(words) - n + 1)]
88
+
89
+
90
+ def sentence_similarity(text1, text2):
91
+ embedding_1 = model.encode(text1, convert_to_tensor=True)
92
+ embedding_2 = model.encode(text2, convert_to_tensor=True)
93
+ o = util.pytorch_cos_sim(embedding_1, embedding_2)
94
+ return o.item()
95
+
96
+
97
  async def get_url_data(url, client):
98
  try:
99
  r = await client.get(url)
 
100
  if r.status_code == 200:
 
101
  soup = BeautifulSoup(r.content, "html.parser")
102
  return soup
103
  except Exception:
104
  return None
105
 
106
 
 
 
 
 
 
 
 
 
 
 
 
107
  async def parallel_scrap(urls):
108
  async with httpx.AsyncClient(timeout=30) as client:
109
  tasks = []
 
132
  return scores
133
 
134
 
 
 
 
 
 
135
  def map_sentence_url(sentences, score_array):
136
  sentenceToMaxURL = [-1] * len(sentences)
137
  for j in range(len(sentences)):
 
152
  return sentenceToMaxURL
153
 
154
 
155
+ def google_search(
156
  plag_option,
157
+ sentences,
158
+ url_count,
159
+ score_array,
160
+ url_list,
161
+ sorted_date,
 
 
162
  domains_to_skip,
163
+ api_key,
164
+ cse_id,
165
+ **kwargs,
166
  ):
167
+ service = build("customsearch", "v1", developerKey=api_key)
168
+ for i, sentence in enumerate(sentences):
169
+ results = (
170
+ service.cse()
171
+ .list(q=sentence, cx=cse_id, sort=sorted_date, **kwargs)
172
+ .execute()
173
+ )
174
+ if "items" in results and len(results["items"]) > 0:
175
+ for count, link in enumerate(results["items"]):
176
+ # stop after 3 pages
177
+ if count >= 3:
178
+ break
179
+ # skip user selected domains
180
+ if any(
181
+ ("." + domain) in link["link"] for domain in domains_to_skip
182
+ ):
183
+ continue
184
+ # clean up snippet of '...'
185
+ snippet = link["snippet"]
186
+ ind = snippet.find("...")
187
+ if ind < 20 and ind > 9:
188
+ snippet = snippet[ind + len("... ") :]
189
+ ind = snippet.find("...")
190
+ if ind > len(snippet) - 5:
191
+ snippet = snippet[:ind]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
192
 
193
+ # update cosine similarity between snippet and given text
194
+ url = link["link"]
195
+ if url not in url_list:
196
+ url_list.append(url)
197
+ score_array.append([0] * len(sentences))
198
+ url_count[url] = url_count[url] + 1 if url in url_count else 1
199
+ if plag_option == "Standard":
200
+ score_array[url_list.index(url)][i] = cosineSim(
201
+ sentence, snippet
202
+ )
203
+ else:
204
+ score_array[url_list.index(url)][i] = sentence_similarity(
205
+ sentence, snippet
206
+ )
207
+ return url_count, score_array
208
 
209
 
210
  def plagiarism_check(
 
218
  day_to,
219
  domains_to_skip,
220
  ):
221
+ # api_key = "AIzaSyCLyCCpOPLZWuptuPAPSg8cUIZhdEMVf6g"
222
+ # api_key = "AIzaSyCS1WQDMl1IMjaXtwSd_2rA195-Yc4psQE"
223
  # api_key = "AIzaSyCB61O70B8AC3l5Kk3KMoLb6DN37B7nqIk"
224
  # api_key = "AIzaSyCg1IbevcTAXAPYeYreps6wYWDbU0Kz8tg"
225
+ api_key = "AIzaSyA5VVwY1eEoIoflejObrxFDI0DJvtbmgW8"
226
  cse_id = "851813e81162b4ed4"
227
 
228
  url_scores = []
 
296
  )
297
 
298
  return sentence_scores, url_scores
299
+
300
+
301
+ def html_highlight(
302
+ plag_option,
303
+ input,
304
+ year_from,
305
+ month_from,
306
+ day_from,
307
+ year_to,
308
+ month_to,
309
+ day_to,
310
+ domains_to_skip,
311
+ ):
312
+ sentence_scores, url_scores = plagiarism_check(
313
+ plag_option,
314
+ input,
315
+ year_from,
316
+ month_from,
317
+ day_from,
318
+ year_to,
319
+ month_to,
320
+ day_to,
321
+ domains_to_skip,
322
+ )
323
+
324
+ html_content = "<link href='https://fonts.googleapis.com/css?family=Roboto' rel='stylesheet'>\n<div style='font-family: {font}; border: 2px solid black; background-color: #333333; padding: 10px; color: #FFFFFF;'>"
325
+ prev_idx = None
326
+ combined_sentence = ""
327
+ for sentence, _, _, idx in sentence_scores:
328
+ if idx != prev_idx and prev_idx is not None:
329
+ color = color_map[prev_idx - 1]
330
+ index_part = f'<span style="background-color: {color}; padding: 2px;">[{prev_idx}]</span>'
331
+ formatted_sentence = f"<p>{combined_sentence} {index_part}</p>"
332
+ html_content += formatted_sentence
333
+ combined_sentence = ""
334
+ combined_sentence += " " + sentence
335
+ prev_idx = idx
336
+
337
+ if combined_sentence:
338
+ color = color_map[prev_idx - 1]
339
+ index_part = f'<span style="background-color: {color}; padding: 2px;">[{prev_idx}]</span>'
340
+ formatted_sentence = f"<p>{combined_sentence} {index_part}</p>"
341
+ html_content += formatted_sentence
342
+
343
+ html_content += "<hr>"
344
+ for url, score, idx in url_scores:
345
+ color = color_map[idx - 1]
346
+ formatted_url = f'<p style="background-color: {color}; padding: 5px;">({idx}) <b>{url}</b></p><p> --- Matching Score: {score}%</p>'
347
+ html_content += formatted_url
348
+
349
+ html_content += "</div>"
350
+
351
+ return html_content
predictors.py CHANGED
@@ -1,23 +1,11 @@
1
- import requests
2
- import httpx
3
  import torch
4
- import re
5
- from bs4 import BeautifulSoup
6
  import numpy as np
7
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
8
- import asyncio
9
- from evaluate import load
10
- from datetime import date
11
  import nltk
12
- from transformers import GPT2LMHeadModel, GPT2TokenizerFast
13
- import plotly.graph_objects as go
14
  import torch.nn.functional as F
15
  import nltk
16
- from unidecode import unidecode
17
- import time
18
  from scipy.special import softmax
19
  import yaml
20
- import os
21
  from utils import *
22
  import joblib
23
 
@@ -51,9 +39,9 @@ tokenizers_1on1 = {}
51
  models_1on1 = {}
52
  for model_name, model in zip(mc_label_map, text_1on1_models):
53
  tokenizers_1on1[model_name] = AutoTokenizer.from_pretrained(model)
54
- models_1on1[model_name] = AutoModelForSequenceClassification.from_pretrained(
55
- model
56
- ).to(device)
57
 
58
  # proxy models for explainability
59
  mini_bc_model_name = "polygraf-ai/bc-model-bert-mini"
@@ -62,7 +50,9 @@ bc_model_mini = AutoModelForSequenceClassification.from_pretrained(
62
  mini_bc_model_name
63
  ).to(device)
64
  mini_humanizer_model_name = "polygraf-ai/quillbot-detector-bert-mini-9K"
65
- humanizer_tokenizer_mini = AutoTokenizer.from_pretrained(mini_humanizer_model_name)
 
 
66
  humanizer_model_mini = AutoModelForSequenceClassification.from_pretrained(
67
  mini_humanizer_model_name
68
  ).to(device)
@@ -232,7 +222,9 @@ def predict_mc_scores(input):
232
  bc_scores = []
233
  mc_scores = []
234
 
235
- samples_len_bc = len(split_text_allow_complete_sentences_nltk(input, type_det="bc"))
 
 
236
  segments_bc = split_text_allow_complete_sentences_nltk(input, type_det="bc")
237
  for i in range(samples_len_bc):
238
  cleaned_text_bc = remove_special_characters(segments_bc[i])
@@ -243,7 +235,9 @@ def predict_mc_scores(input):
243
  bc_score_list = average_bc_scores.tolist()
244
  bc_score = {"AI": bc_score_list[1], "HUMAN": bc_score_list[0]}
245
  segments_mc = split_text_allow_complete_sentences_nltk(input, type_det="mc")
246
- samples_len_mc = len(split_text_allow_complete_sentences_nltk(input, type_det="mc"))
 
 
247
  for i in range(samples_len_mc):
248
  cleaned_text_mc = remove_special_characters(segments_mc[i])
249
  mc_score = predict_mc(text_mc_model, text_mc_tokenizer, cleaned_text_mc)
@@ -266,7 +260,9 @@ def predict_mc_scores(input):
266
 
267
  def predict_bc_scores(input):
268
  bc_scores = []
269
- samples_len_bc = len(split_text_allow_complete_sentences_nltk(input, type_det="bc"))
 
 
270
  segments_bc = split_text_allow_complete_sentences_nltk(input, type_det="bc")
271
  for i in range(samples_len_bc):
272
  cleaned_text_bc = remove_special_characters(segments_bc[i])
@@ -275,7 +271,9 @@ def predict_bc_scores(input):
275
  bc_scores_array = np.array(bc_scores)
276
  average_bc_scores = np.mean(bc_scores_array, axis=0)
277
  bc_score_list = average_bc_scores.tolist()
278
- print(f"Original BC scores: AI: {bc_score_list[1]}, HUMAN: {bc_score_list[0]}")
 
 
279
  # isotonic regression calibration
280
  ai_score = iso_reg.predict([bc_score_list[1]])[0]
281
  human_score = 1 - ai_score
@@ -309,7 +307,9 @@ def predict_1on1_combined(input):
309
 
310
 
311
  def predict_1on1_single(input, model):
312
- predictions = predict_1on1(models_1on1[model], tokenizers_1on1[model], input)[1]
 
 
313
  return predictions
314
 
315
 
@@ -321,7 +321,9 @@ def predict_1on1_scores(input, models):
321
  print(f"Models to Test: {models}")
322
  # BC SCORE
323
  bc_scores = []
324
- samples_len_bc = len(split_text_allow_complete_sentences_nltk(input, type_det="bc"))
 
 
325
  segments_bc = split_text_allow_complete_sentences_nltk(input, type_det="bc")
326
  for i in range(samples_len_bc):
327
  cleaned_text_bc = remove_special_characters(segments_bc[i])
@@ -330,24 +332,30 @@ def predict_1on1_scores(input, models):
330
  bc_scores_array = np.array(bc_scores)
331
  average_bc_scores = np.mean(bc_scores_array, axis=0)
332
  bc_score_list = average_bc_scores.tolist()
333
- print(f"Original BC scores: AI: {bc_score_list[1]}, HUMAN: {bc_score_list[0]}")
 
 
334
  # isotonic regression calibration
335
  ai_score = iso_reg.predict([bc_score_list[1]])[0]
336
  human_score = 1 - ai_score
337
  bc_score = {"AI": ai_score, "HUMAN": human_score}
338
  print(f"Calibration BC scores: AI: {ai_score}, HUMAN: {human_score}")
339
-
340
  # MC SCORE
341
  if len(models) > 1:
342
  print("Starting MC")
343
  mc_scores = []
344
- segments_mc = split_text_allow_complete_sentences_nltk(input, type_det="mc")
 
 
345
  samples_len_mc = len(
346
  split_text_allow_complete_sentences_nltk(input, type_det="mc")
347
  )
348
  for i in range(samples_len_mc):
349
  cleaned_text_mc = remove_special_characters(segments_mc[i])
350
- mc_score = predict_mc(text_mc_model, text_mc_tokenizer, cleaned_text_mc)
 
 
351
  mc_scores.append(mc_score)
352
  mc_scores_array = np.array(mc_scores)
353
  average_mc_scores = np.mean(mc_scores_array, axis=0)
@@ -357,7 +365,9 @@ def predict_1on1_scores(input, models):
357
  mc_score[label.upper()] = score
358
 
359
  mc_score = {
360
- key: mc_score[key.upper()] for key in models if key.upper() in mc_score
 
 
361
  }
362
  total = sum(mc_score.values())
363
  # Normalize each value by dividing it by the total
@@ -365,14 +375,16 @@ def predict_1on1_scores(input, models):
365
  sum_prob = 1 - bc_score["HUMAN"]
366
  for key, value in mc_score.items():
367
  mc_score[key] = value * sum_prob
368
- print('MC Score:',mc_score)
369
  if sum_prob < 0.01:
370
  mc_score = {}
371
 
372
  elif len(models) == 1:
373
  print("Starting 1on1")
374
  mc_scores = []
375
- segments_mc = split_text_allow_complete_sentences_nltk(input, type_det="mc")
 
 
376
  samples_len_mc = len(
377
  split_text_allow_complete_sentences_nltk(input, type_det="mc")
378
  )
 
 
 
1
  import torch
 
 
2
  import numpy as np
3
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
 
 
 
4
  import nltk
 
 
5
  import torch.nn.functional as F
6
  import nltk
 
 
7
  from scipy.special import softmax
8
  import yaml
 
9
  from utils import *
10
  import joblib
11
 
 
39
  models_1on1 = {}
40
  for model_name, model in zip(mc_label_map, text_1on1_models):
41
  tokenizers_1on1[model_name] = AutoTokenizer.from_pretrained(model)
42
+ models_1on1[model_name] = (
43
+ AutoModelForSequenceClassification.from_pretrained(model).to(device)
44
+ )
45
 
46
  # proxy models for explainability
47
  mini_bc_model_name = "polygraf-ai/bc-model-bert-mini"
 
50
  mini_bc_model_name
51
  ).to(device)
52
  mini_humanizer_model_name = "polygraf-ai/quillbot-detector-bert-mini-9K"
53
+ humanizer_tokenizer_mini = AutoTokenizer.from_pretrained(
54
+ mini_humanizer_model_name
55
+ )
56
  humanizer_model_mini = AutoModelForSequenceClassification.from_pretrained(
57
  mini_humanizer_model_name
58
  ).to(device)
 
222
  bc_scores = []
223
  mc_scores = []
224
 
225
+ samples_len_bc = len(
226
+ split_text_allow_complete_sentences_nltk(input, type_det="bc")
227
+ )
228
  segments_bc = split_text_allow_complete_sentences_nltk(input, type_det="bc")
229
  for i in range(samples_len_bc):
230
  cleaned_text_bc = remove_special_characters(segments_bc[i])
 
235
  bc_score_list = average_bc_scores.tolist()
236
  bc_score = {"AI": bc_score_list[1], "HUMAN": bc_score_list[0]}
237
  segments_mc = split_text_allow_complete_sentences_nltk(input, type_det="mc")
238
+ samples_len_mc = len(
239
+ split_text_allow_complete_sentences_nltk(input, type_det="mc")
240
+ )
241
  for i in range(samples_len_mc):
242
  cleaned_text_mc = remove_special_characters(segments_mc[i])
243
  mc_score = predict_mc(text_mc_model, text_mc_tokenizer, cleaned_text_mc)
 
260
 
261
  def predict_bc_scores(input):
262
  bc_scores = []
263
+ samples_len_bc = len(
264
+ split_text_allow_complete_sentences_nltk(input, type_det="bc")
265
+ )
266
  segments_bc = split_text_allow_complete_sentences_nltk(input, type_det="bc")
267
  for i in range(samples_len_bc):
268
  cleaned_text_bc = remove_special_characters(segments_bc[i])
 
271
  bc_scores_array = np.array(bc_scores)
272
  average_bc_scores = np.mean(bc_scores_array, axis=0)
273
  bc_score_list = average_bc_scores.tolist()
274
+ print(
275
+ f"Original BC scores: AI: {bc_score_list[1]}, HUMAN: {bc_score_list[0]}"
276
+ )
277
  # isotonic regression calibration
278
  ai_score = iso_reg.predict([bc_score_list[1]])[0]
279
  human_score = 1 - ai_score
 
307
 
308
 
309
  def predict_1on1_single(input, model):
310
+ predictions = predict_1on1(
311
+ models_1on1[model], tokenizers_1on1[model], input
312
+ )[1]
313
  return predictions
314
 
315
 
 
321
  print(f"Models to Test: {models}")
322
  # BC SCORE
323
  bc_scores = []
324
+ samples_len_bc = len(
325
+ split_text_allow_complete_sentences_nltk(input, type_det="bc")
326
+ )
327
  segments_bc = split_text_allow_complete_sentences_nltk(input, type_det="bc")
328
  for i in range(samples_len_bc):
329
  cleaned_text_bc = remove_special_characters(segments_bc[i])
 
332
  bc_scores_array = np.array(bc_scores)
333
  average_bc_scores = np.mean(bc_scores_array, axis=0)
334
  bc_score_list = average_bc_scores.tolist()
335
+ print(
336
+ f"Original BC scores: AI: {bc_score_list[1]}, HUMAN: {bc_score_list[0]}"
337
+ )
338
  # isotonic regression calibration
339
  ai_score = iso_reg.predict([bc_score_list[1]])[0]
340
  human_score = 1 - ai_score
341
  bc_score = {"AI": ai_score, "HUMAN": human_score}
342
  print(f"Calibration BC scores: AI: {ai_score}, HUMAN: {human_score}")
343
+
344
  # MC SCORE
345
  if len(models) > 1:
346
  print("Starting MC")
347
  mc_scores = []
348
+ segments_mc = split_text_allow_complete_sentences_nltk(
349
+ input, type_det="mc"
350
+ )
351
  samples_len_mc = len(
352
  split_text_allow_complete_sentences_nltk(input, type_det="mc")
353
  )
354
  for i in range(samples_len_mc):
355
  cleaned_text_mc = remove_special_characters(segments_mc[i])
356
+ mc_score = predict_mc(
357
+ text_mc_model, text_mc_tokenizer, cleaned_text_mc
358
+ )
359
  mc_scores.append(mc_score)
360
  mc_scores_array = np.array(mc_scores)
361
  average_mc_scores = np.mean(mc_scores_array, axis=0)
 
365
  mc_score[label.upper()] = score
366
 
367
  mc_score = {
368
+ key: mc_score[key.upper()]
369
+ for key in models
370
+ if key.upper() in mc_score
371
  }
372
  total = sum(mc_score.values())
373
  # Normalize each value by dividing it by the total
 
375
  sum_prob = 1 - bc_score["HUMAN"]
376
  for key, value in mc_score.items():
377
  mc_score[key] = value * sum_prob
378
+ print("MC Score:", mc_score)
379
  if sum_prob < 0.01:
380
  mc_score = {}
381
 
382
  elif len(models) == 1:
383
  print("Starting 1on1")
384
  mc_scores = []
385
+ segments_mc = split_text_allow_complete_sentences_nltk(
386
+ input, type_det="mc"
387
+ )
388
  samples_len_mc = len(
389
  split_text_allow_complete_sentences_nltk(input, type_det="mc")
390
  )
utils.py CHANGED
@@ -1,28 +1,11 @@
1
- from urllib.request import urlopen, Request
2
- from googleapiclient.discovery import build
3
- import requests
4
- import httpx
5
  import re
6
- from bs4 import BeautifulSoup
7
- import re, math
8
- from collections import Counter
9
- import numpy as np
10
- import asyncio
11
- import nltk
12
  from sentence_transformers import SentenceTransformer, util
13
- import threading
14
- import torch
15
  import re
16
- import numpy as np
17
- import asyncio
18
- from datetime import date
19
- import nltk
20
  from unidecode import unidecode
21
- from scipy.special import softmax
22
  from transformers import AutoTokenizer
23
  import yaml
24
  import fitz
25
- import os
26
 
27
 
28
  def remove_accents(input_str):
@@ -48,9 +31,6 @@ def update_character_count(text):
48
  return f"{len(text)} characters"
49
 
50
 
51
- nltk.download("punkt")
52
-
53
-
54
  with open("config.yaml", "r") as file:
55
  params = yaml.safe_load(file)
56
 
@@ -77,4 +57,4 @@ def extract_text_from_pdf(pdf_path):
77
 
78
 
79
  WORD = re.compile(r"\w+")
80
- model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
 
 
 
 
 
1
  import re
2
+ import re
 
 
 
 
 
3
  from sentence_transformers import SentenceTransformer, util
 
 
4
  import re
 
 
 
 
5
  from unidecode import unidecode
 
6
  from transformers import AutoTokenizer
7
  import yaml
8
  import fitz
 
9
 
10
 
11
  def remove_accents(input_str):
 
31
  return f"{len(text)} characters"
32
 
33
 
 
 
 
34
  with open("config.yaml", "r") as file:
35
  params = yaml.safe_load(file)
36
 
 
57
 
58
 
59
  WORD = re.compile(r"\w+")
60
+ model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")