aliasgerovs committed
Commit 155bd85 · 2 Parent(s): e14644a 9532776

Merge branch 'main' into demo

Files changed (5):
  1. .gitignore +6 -0
  2. analysis.py +0 -2
  3. app.py +19 -11
  4. plagiarism.py +41 -48
  5. predictors.py +74 -12
.gitignore ADDED
@@ -0,0 +1,6 @@
+__pycache__/analysis.cpython-311.pyc
+__pycache__/app.cpython-311.pyc
+__pycache__/explainability.cpython-311.pyc
+__pycache__/plagiarism.cpython-311.pyc
+__pycache__/predictors.cpython-311.pyc
+__pycache__/utils.cpython-311.pyc
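(These entries are pinned to CPython 3.11 bytecode; the single pattern `__pycache__/` would cover all cached bytecode regardless of interpreter version. A suggestion, not part of this commit.)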
analysis.py CHANGED
@@ -22,12 +22,10 @@ import yaml
 import nltk
 import os
 from explainability import *
-from dotenv import load_dotenv
 import subprocess
 
 nltk.download("punkt")
 nltk.download("stopwords")
-load_dotenv()
 with open("config.yaml", "r") as file:
     params = yaml.safe_load(file)
 device = "cuda" if torch.cuda.is_available() else "cpu"
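(With the dotenv import and `load_dotenv()` call removed, analysis.py no longer reads a local `.env` file at import time; it relies only on `config.yaml` and whatever is already set in the process environment.)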
app.py CHANGED
@@ -1,7 +1,7 @@
 import gradio as gr
 import numpy as np
 from datetime import date
-from predictors import predict_bc_scores, predict_mc_scores
+from predictors import predict_bc_scores, predict_mc_scores, predict_1on1_scores
 from analysis import depth_analysis
 from predictors import predict_quillbot
 from plagiarism import plagiarism_check, build_date
@@ -13,11 +13,12 @@ np.set_printoptions(suppress=True)
 def ai_generated_test(option, input):
     if option == "Human vs AI":
         return predict_bc_scores(input), None
-    else:
-        return (
-            predict_bc_scores(input),
-            predict_mc_scores(input),
-        )
+    elif option == "Human vs AI Source Models":
+        return predict_bc_scores(input), predict_mc_scores(input)
+    # elif option == "Human vs AI Source Models (1 on 1)":
+    #     return predict_bc_scores(input), None, predict_1on1_scores(input)
+
+    return None, None
 
 
 # COMBINED
@@ -49,11 +50,13 @@ def main(
     depth_analysis_plot = depth_analysis(input)
     bc_score = predict_bc_scores(input)
     mc_score = predict_mc_scores(input)
+    mc_1on1_score = predict_1on1_scores(input)
     quilscore = predict_quillbot(input)
 
     return (
         bc_score,
         mc_score,
+        mc_1on1_score,
         formatted_tokens,
         depth_analysis_plot,
         quilscore,
@@ -105,7 +108,11 @@ with gr.Blocks() as demo:
     with gr.Row():
         with gr.Column():
             ai_option = gr.Radio(
-                ["Human vs AI", "Human vs AI Source Models"],
+                [
+                    "Human vs AI",
+                    "Human vs AI Source Models",
+                    # "Human vs AI Source Models (1 on 1)",
+                ],
                 label="Choose an option please.",
             )
         with gr.Column():
@@ -147,6 +154,8 @@
             bcLabel = gr.Label(label="Source")
         with gr.Column():
             mcLabel = gr.Label(label="Creator")
+        # with gr.Column():
+        #     mc1on1Label = gr.Label(label="Creator(1 on 1 Approach)")
     with gr.Row():
         QLabel = gr.Label(label="Humanized")
     with gr.Group():
@@ -213,6 +222,7 @@
         outputs=[
             bcLabel,
             mcLabel,
+            # mc1on1Label,
             sentenceBreakdown,
             writing_analysis_plot,
             QLabel,
@@ -223,10 +233,8 @@
     only_ai_btn.click(
         fn=ai_generated_test,
         inputs=[ai_option, input_text],
-        outputs=[
-            bcLabel,
-            mcLabel,
-        ],
+        # outputs=[bcLabel, mcLabel, mc1on1Label],
+        outputs=[bcLabel, mcLabel],
         api_name="ai_check",
     )
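A quick sketch of the reworked dispatch in `ai_generated_test` (illustrative only, not part of this commit; assumes the function and its predictors are in scope):

sample = "Short passage to classify..."  # hypothetical input

ai_generated_test("Human vs AI", sample)
# -> (predict_bc_scores(sample), None)

ai_generated_test("Human vs AI Source Models", sample)
# -> (predict_bc_scores(sample), predict_mc_scores(sample))

# Any other option now falls through explicitly instead of hitting the old bare else:
ai_generated_test("unexpected option", sample)
# -> (None, None)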
 
plagiarism.py CHANGED
@@ -9,6 +9,7 @@ import httpx
 from bs4 import BeautifulSoup
 import numpy as np
 import concurrent
+from multiprocessing import Pool
 
 
 WORD = re.compile(r"\w+")
@@ -18,6 +19,7 @@ model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
 # returns cosine similarity of two vectors
 # input: two vectors
 # output: integer between 0 and 1.
+
 def get_cosine(vec1, vec2):
     intersection = set(vec1.keys()) & set(vec2.keys())
 
@@ -130,7 +132,7 @@ def split_sentence_blocks(text):
     sents = sent_tokenize(text)
     two_sents = []
     for i in range(len(sents)):
-        if (i % 4) == 0:
+        if (i % 2) == 0:
             two_sents.append(sents[i])
         else:
             two_sents[len(two_sents) - 1] += " " + sents[i]
@@ -189,9 +191,9 @@ async def parallel_scrap(urls):
     return results
 
 
-def matching_score(args_list):
-    sentence = remove_punc(args_list[0])
-    content = remove_punc(args_list[1])
+def matching_score(sentence_content_tuple):
+    sentence, content = sentence_content_tuple
     if sentence in content:
         return 1
     else:
@@ -200,9 +202,13 @@
         if len(ngrams) == 0:
             return 0
         matched = [x for x in ngrams if " ".join(x) in content]
         return len(matched) / len(ngrams)
+
+
+def process_with_multiprocessing(input_data):
+    with Pool(processes=4) as pool:
+        scores = pool.map(matching_score, input_data)
+    return scores
+
 def plagiarism_check(
     plag_option,
     input,
@@ -244,55 +250,36 @@
     # Scrape URLs in list
     formatted_tokens = []
     soups = asyncio.run(parallel_scrap(urlList))
 
-    # Populate matching scores for scrapped pages
-    for i, soup in enumerate(soups):
-        print(f"Analyzing {i+1} of {len(soups)} soups........................")
-        if soup:
-            page_content = soup.text
-            for j, sent in enumerate(sentences):
-                args_list = (sent, page_content)
-                score = matching_score(args_list)
-                # score = cos_sim_torch(embed_text(sent), source_embeddings[i])
-                ScoreArray[i][j] = score
-
-    # with concurrent.futures.ProcessPoolExecutor() as executor:
-    #     results = executor.map(matching_score, args_list)
-
-    # *****IF THIS IS TO BE USED, PLEASE PROVIDE "preprocess()" FUNCTION IN LINE 248**************
-    # source_embeddings = []
-    # for i, soup in enumerate(soups):
-    #     if soup:
-    #         page_content = soup.text
-    #         source_embeddings.append(embed_text(page_content))
-    #     else:
-    #         source_embeddings.append(None)
-
-    # def compute_cosine_similarity(args):
-    #     sent, source_embedding, i, j = args
-    #     score = cos_sim_torch(embed_text(sent), source_embedding)
-    #     return i, j, score
-
-    # def main(soups, sentences):
-    #     source_embeddings = [preprocess(soup) for soup in soups]
-    #     ScoreArray = [[0 for _ in sentences] for _ in soups]
-    #     args_list = []
-    #     for i, soup in enumerate(soups):
-    #         if soup:
-    #             for j, sent in enumerate(sentences):
-    #                 args_list.append((sent, source_embeddings[i], i, j))
-    #     with concurrent.futures.ProcessPoolExecutor() as executor:
-    #         results = executor.map(compute_cosine_similarity, args_list)
-    #     for i, j, score in results:
-    #         ScoreArray[i][j] = score
-    #     return ScoreArray
-
-    # # Populate matching scores for scrapped pages
-    # ScoreArray = main(soups, sentences)
-    # *******************************************************************************************
-
-    # Calculate URL of max matching score for each sentence chunk
+    # # Populate matching scores for scrapped pages
+    # for i, soup in enumerate(soups):
+    #     print(f"Analyzing {i+1} of {len(soups)} soups........................")
+    #     if soup:
+    #         page_content = soup.text
+    #         for j, sent in enumerate(sentences):
+    #             args_list = (sent, page_content)
+    #             score = matching_score(args_list)
+    #             # score = cos_sim_torch(embed_text(sent), source_embeddings[i])
+    #             ScoreArray[i][j] = score
+
+    input_data = []
+    for i, soup in enumerate(soups):
+        if soup:
+            page_content = soup.text
+            for j, sent in enumerate(sentences):
+                input_data.append((sent, page_content))
+
+    scores = process_with_multiprocessing(input_data)
+    k = 0
+    for i, soup in enumerate(soups):
+        if soup:
+            for j, _ in enumerate(sentences):
+                ScoreArray[i][j] = scores[k]
+                k += 1
+
     sentenceToMaxURL = [-1] * len(sentences)
+
     for j in range(len(sentences)):
         if j > 0:
             maxScore = ScoreArray[sentenceToMaxURL[j - 1]][j]
@@ -326,10 +313,16 @@
     urlMap = {}
     for count, i in enumerate(index_descending):
         urlMap[i] = count + 1
+
     for i, sent in enumerate(sentences):
         formatted_tokens.append(
             (sent, "[" + str(urlMap[sentenceToMaxURL[i]]) + "]")
         )
+
+    formatted_tokens.append(("\n", None))
+    formatted_tokens.append(("\n", None))
+    formatted_tokens.append(("\n", None))
+
     for ind in index_descending:
         formatted_tokens.append(
            (
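The new scoring path flattens (sentence, page) pairs in row-major order, scores them in a four-process pool, then writes them back using the same `if soup:` filter so the flat index stays aligned. A toy sketch (hypothetical data, not part of this commit; assumes plagiarism.py's helpers are importable):

pages = ["the quick brown fox jumps over the lazy dog", None, "lorem ipsum"]
sentences = ["quick brown fox", "over the lazy dog"]

# Flatten, skipping pages that failed to scrape.
input_data = [(sent, page) for page in pages if page for sent in sentences]
scores = process_with_multiprocessing(input_data)  # flat list of match ratios

# Unflatten with the identical filter so scores[k] lines up with (i, j).
ScoreArray = [[0] * len(sentences) for _ in pages]
k = 0
for i, page in enumerate(pages):
    if page:
        for j, _ in enumerate(sentences):
            ScoreArray[i][j] = scores[k]
            k += 1

One caveat: under the spawn start method (Windows, recent macOS), `multiprocessing.Pool` re-imports the calling module in each worker, so this path generally needs to run behind an `if __name__ == "__main__":` guard or only inside functions that are not executed at import time.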
predictors.py CHANGED
@@ -19,19 +19,19 @@ from scipy.special import softmax
 import yaml
 import os
 from utils import *
-from dotenv import load_dotenv
 
 with open("config.yaml", "r") as file:
     params = yaml.safe_load(file)
 nltk.download("punkt")
 nltk.download("stopwords")
-load_dotenv()
 device = "cuda" if torch.cuda.is_available() else "cpu"
 text_bc_model_path = params["TEXT_BC_MODEL_PATH"]
 text_mc_model_path = params["TEXT_MC_MODEL_PATH"]
 text_quillbot_model_path = params["TEXT_QUILLBOT_MODEL_PATH"]
+text_1on1_models = params["TEXT_1ON1_MODEL"]
 quillbot_labels = params["QUILLBOT_LABELS"]
 mc_label_map = params["MC_OUTPUT_LABELS"]
+text_1on1_label_map = params["1ON1_OUTPUT_LABELS"]
 mc_token_size = int(params["MC_TOKEN_SIZE"])
 bc_token_size = int(params["BC_TOKEN_SIZE"])
 text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path)
@@ -46,6 +46,13 @@ quillbot_tokenizer = AutoTokenizer.from_pretrained(text_quillbot_model_path)
 quillbot_model = AutoModelForSequenceClassification.from_pretrained(
     text_quillbot_model_path
 ).to(device)
+# tokenizers_1on1 = {}
+# models_1on1 = {}
+# for model in text_1on1_models:
+#     tokenizers_1on1[model] = AutoTokenizer.from_pretrained(model)
+#     models_1on1[model] = AutoModelForSequenceClassification.from_pretrained(
+#         model
+#     ).to(device)
 
 
 def split_text_allow_complete_sentences_nltk(
@@ -234,13 +241,68 @@ def predict_bc_scores(input):
     return bc_score
 
 
-# def predict_1on1(input):
-#     models = ['bard', 'claude', 'gpt4', 'mistral_ai', 'llama2']
-#     text = str(row["text"])
-#     predictions = {}
-#     prediction = predict(text, bard_model, bard_tokenizer) predictions['bard'] = prediction[1]
-#     prediction = predict(text, claude_model, claude_tokenizer) predictions['claude'] = prediction[1]
-#     prediction = predict(text, gpt4_model, gpt4_tokenizer) predictions['gpt4'] = prediction[1]
-#     prediction = predict(text, mistral_ai_model, mistral_ai_tokenizer) predictions['mistral_ai'] = prediction[1]
-#     prediction = predict(text, llama2_model, llama2_tokenizer) predictions['llama2'] = prediction[1]
-#     max_key = max(predictions, key=predictions.get)
+def predict_1on1(model, tokenizer, text):
+    with torch.no_grad():
+        model.eval()
+        tokens = tokenizer(
+            text,
+            padding="max_length",
+            truncation=True,
+            return_tensors="pt",
+            max_length=mc_token_size,
+        ).to(device)
+        output = model(**tokens)
+        output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
+    return output_norm
+
+
+def predict_1on1_combined(input):
+    predictions = []
+    for i, model in enumerate(text_1on1_models):
+        predictions.append(
+            predict_1on1(models_1on1[model], tokenizers_1on1[model], input)[1]
+        )
+    return predictions
+
+
+def predict_1on1_scores(input):
+    # BC SCORE
+    bc_scores = []
+    samples_len_bc = len(
+        split_text_allow_complete_sentences_nltk(input, type_det="bc")
+    )
+    segments_bc = split_text_allow_complete_sentences_nltk(input, type_det="bc")
+    for i in range(samples_len_bc):
+        cleaned_text_bc = remove_special_characters(segments_bc[i])
+        bc_score = predict_bc(text_bc_model, text_bc_tokenizer, cleaned_text_bc)
+        bc_scores.append(bc_score)
+    bc_scores_array = np.array(bc_scores)
+    average_bc_scores = np.mean(bc_scores_array, axis=0)
+    bc_score_list = average_bc_scores.tolist()
+    bc_score = {"AI": bc_score_list[1], "HUMAN": bc_score_list[0]}
+
+    # MC SCORE
+    mc_scores = []
+    segments_mc = split_text_allow_complete_sentences_nltk(input, type_det="mc")
+    samples_len_mc = len(
+        split_text_allow_complete_sentences_nltk(input, type_det="mc")
+    )
+    for i in range(samples_len_mc):
+        cleaned_text_mc = remove_special_characters(segments_mc[i])
+        mc_score = predict_1on1_combined(cleaned_text_mc)
+        mc_scores.append(mc_score)
+    mc_scores_array = np.array(mc_scores)
+    average_mc_scores = np.mean(mc_scores_array, axis=0)
+    normalized_mc_scores = average_mc_scores / np.sum(average_mc_scores)
+    mc_score_list = normalized_mc_scores.tolist()
+    mc_score = {}
+    for score, label in zip(mc_score_list, text_1on1_label_map):
+        mc_score[label.upper()] = score
+
+    sum_prob = 1 - bc_score["HUMAN"]
+    for key, value in mc_score.items():
+        mc_score[key] = value * sum_prob
+    if sum_prob < 0.01:
+        mc_score = {}
+
+    return mc_score
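How the new 1-on-1 path composes (illustrative, not part of this commit): each per-model "AI" probability is averaged over segments, normalized, then scaled by the binary classifier's overall AI mass. Note that `predict_1on1_combined` reads `models_1on1`/`tokenizers_1on1`, whose loading loop is still commented out above, so that block must be re-enabled before this runs:

text = "Passage whose AI creator we want to attribute..."  # hypothetical input
scores = predict_1on1_scores(text)

# Values sum to 1 - P(HUMAN); an empty dict means the binary classifier
# judged the text almost certainly human (sum_prob < 0.01).
for label, prob in sorted(scores.items(), key=lambda kv: -kv[1]):
    print(f"{label}: {prob:.3f}")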