minko186 committed
Commit 45d10c4
1 Parent(s): 029c7a1

refactoring

__pycache__/analysis.cpython-311.pyc ADDED
Binary file (4.75 kB)

__pycache__/app.cpython-311.pyc ADDED
Binary file (10.9 kB)

__pycache__/explainability.cpython-311.pyc ADDED
Binary file (7.89 kB)

__pycache__/plagiarism.cpython-311.pyc ADDED
Binary file (14.1 kB)

__pycache__/predictors.cpython-311.pyc ADDED
Binary file (12 kB)

__pycache__/utils.cpython-311.pyc ADDED
Binary file (3.76 kB)
analysis.py ADDED
@@ -0,0 +1,98 @@
+ import requests
+ import httpx
+ import torch
+ import re
+ from bs4 import BeautifulSoup
+ import numpy as np
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
+ import asyncio
+ from scipy.special import softmax
+ from evaluate import load
+ from datetime import date
+ import nltk
+ import fitz
+ from transformers import GPT2LMHeadModel, GPT2TokenizerFast
+ import nltk, spacy, subprocess, torch
+ import plotly.graph_objects as go
+ import torch.nn.functional as F
+ import nltk
+ from unidecode import unidecode
+ import time
+ import yaml
+ import nltk
+ import os
+ from explainability import *
+ from dotenv import load_dotenv
+ import subprocess
+
+ nltk.download("punkt")
+ nltk.download("stopwords")
+ load_dotenv()
+ with open("config.yaml", "r") as file:
+     params = yaml.safe_load(file)
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ readability_model_id = params["READABILITY_MODEL_ID"]
+ gpt2_model = GPT2LMHeadModel.from_pretrained(readability_model_id).to(device)
+ gpt2_tokenizer = GPT2TokenizerFast.from_pretrained(readability_model_id)
+
+ command = ["python", "-m", "spacy", "download", "en_core_web_sm"]
+ subprocess.run(command)
+ nlp = spacy.load("en_core_web_sm")
+
+
+ def depth_analysis(input_text):
+     processed_words = preprocess_text1(input_text)
+     ttr_value = vocabulary_richness_ttr(processed_words)
+     gunning_fog = calculate_gunning_fog(input_text)
+     gunning_fog_norm = normalize(gunning_fog, min_value=0, max_value=20)
+     words, sentences = preprocess_text2(input_text)
+     average_sentence_length = calculate_average_sentence_length(sentences)
+     average_word_length = calculate_average_word_length(words)
+     average_sentence_length_norm = normalize(
+         average_sentence_length, min_value=0, max_value=40
+     )
+     average_word_length_norm = normalize(
+         average_word_length, min_value=0, max_value=8
+     )
+     average_tree_depth = calculate_syntactic_tree_depth(nlp, input_text)
+     average_tree_depth_norm = normalize(
+         average_tree_depth, min_value=0, max_value=10
+     )
+     perplexity = calculate_perplexity(
+         input_text, gpt2_model, gpt2_tokenizer, device
+     )
+     perplexity_norm = normalize(perplexity, min_value=0, max_value=30)
+
+     features = {
+         "readability": gunning_fog_norm,
+         "syntactic tree depth": average_tree_depth_norm,
+         "vocabulary richness": ttr_value,
+         "perplexity": perplexity_norm,
+         "average sentence length": average_sentence_length_norm,
+         "average word length": average_word_length_norm,
+     }
+     fig = go.Figure()
+     fig.add_trace(
+         go.Scatterpolar(
+             r=list(features.values()),
+             theta=list(features.keys()),
+             fill="toself",
+             name="Radar Plot",
+         )
+     )
+     fig.update_layout(
+         polar=dict(
+             radialaxis=dict(
+                 visible=True,
+                 range=[0, 100],
+             )
+         ),
+         showlegend=False,
+         margin=dict(
+             l=10,
+             r=20,
+             b=10,
+             t=10,
+         ),
+     )
+     return fig
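
The new analysis.depth_analysis helper returns a Plotly figure rather than raw scores, so callers (here, the Gradio app) just hand it text and render the result. A minimal usage sketch, assuming the repo's config.yaml and models can be loaded locally; the sample string is illustrative only:

# Minimal sketch of how app.py consumes the new module (hypothetical standalone use).
from analysis import depth_analysis

sample = "Large language models generate fluent text. Their output can be hard to tell from human writing."
fig = depth_analysis(sample)  # radar plot of six writing features, each normalized to 0-100
fig.show()                    # or fig.write_html("depth_analysis.html") outside a notebook
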
app.py CHANGED
@@ -1,286 +1,23 @@
1
- from utils import (
2
- cosineSim,
3
- googleSearch,
4
- getSentences,
5
- parallel_scrap,
6
- matchingScore,
7
- )
8
  import gradio as gr
9
- from urllib.request import urlopen, Request
10
- from googleapiclient.discovery import build
11
- import requests
12
- import httpx
13
- import torch
14
- import re
15
- from bs4 import BeautifulSoup
16
  import numpy as np
17
- from transformers import AutoTokenizer, AutoModelForSequenceClassification
18
- import asyncio
19
- from scipy.special import softmax
20
- from evaluate import load
21
  from datetime import date
22
- import nltk
23
- import fitz
24
- from transformers import GPT2LMHeadModel, GPT2TokenizerFast
25
- import nltk, spacy, subprocess, torch
26
- import plotly.graph_objects as go
27
- import torch.nn.functional as F
28
- import nltk
29
- from unidecode import unidecode
30
- import time
31
- from utils import cos_sim_torch, embed_text
32
- import multiprocessing
33
- from functools import partial
34
- import concurrent.futures
35
- from plagiarism import plagiarism_check
36
-
37
- nltk.download("punkt")
38
-
39
- from writing_analysis import (
40
- normalize,
41
- preprocess_text1,
42
- preprocess_text2,
43
- vocabulary_richness_ttr,
44
- calculate_gunning_fog,
45
- calculate_average_sentence_length,
46
- calculate_average_word_length,
47
- calculate_syntactic_tree_depth,
48
- calculate_perplexity,
49
- )
50
 
51
  np.set_printoptions(suppress=True)
52
 
53
 
54
- """
55
- AI DETECTION SECTION
56
- """
57
- device = "cuda" if torch.cuda.is_available() else "cpu"
58
-
59
- text_bc_model_path = "polygraf-ai/text-detect-bc-v11-4m"
60
- text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path)
61
- text_bc_model = AutoModelForSequenceClassification.from_pretrained(
62
- text_bc_model_path
63
- ).to(device)
64
-
65
- text_mc_model_path = (
66
- "polygraf-ai/ai-text-detection-mc-robert-open-ai-detector-v4"
67
- )
68
- text_mc_tokenizer = AutoTokenizer.from_pretrained(text_mc_model_path)
69
- text_mc_model = AutoModelForSequenceClassification.from_pretrained(
70
- text_mc_model_path
71
- ).to(device)
72
-
73
- quillbot_labels = ["Original", "QuillBot"]
74
- quillbot_tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
75
- quillbot_model = AutoModelForSequenceClassification.from_pretrained(
76
- "polygraf-ai/quillbot-detector-28k"
77
- ).to(device)
78
-
79
-
80
- def remove_accents(input_str):
81
- text_no_accents = unidecode(input_str)
82
- return text_no_accents
83
-
84
-
85
- def remove_special_characters(text):
86
- text = remove_accents(text)
87
- pattern = r'[^\w\s\d.,!?\'"()-;]+'
88
- text = re.sub(pattern, "", text)
89
- return text
90
-
91
-
92
- def remove_special_characters_2(text):
93
- pattern = r"[^a-zA-Z0-9 ]+"
94
- text = re.sub(pattern, "", text)
95
- return text
96
-
97
-
98
- def update_character_count(text):
99
- return f"{len(text)} characters"
100
-
101
-
102
- def split_text_allow_complete_sentences_nltk(
103
- text,
104
- max_length=256,
105
- tolerance=30,
106
- min_last_segment_length=100,
107
- type_det="bc",
108
- ):
109
- sentences = nltk.sent_tokenize(text)
110
- segments = []
111
- current_segment = []
112
- current_length = 0
113
-
114
- if type_det == "bc":
115
- tokenizer = text_bc_tokenizer
116
- max_length = 333
117
-
118
- elif type_det == "mc":
119
- tokenizer = text_mc_tokenizer
120
- max_length = 256
121
-
122
- for sentence in sentences:
123
- tokens = tokenizer.tokenize(sentence)
124
- sentence_length = len(tokens)
125
-
126
- if current_length + sentence_length <= max_length + tolerance - 2:
127
- current_segment.append(sentence)
128
- current_length += sentence_length
129
- else:
130
- if current_segment:
131
- encoded_segment = tokenizer.encode(
132
- " ".join(current_segment),
133
- add_special_tokens=True,
134
- max_length=max_length + tolerance,
135
- truncation=True,
136
- )
137
- segments.append((current_segment, len(encoded_segment)))
138
- current_segment = [sentence]
139
- current_length = sentence_length
140
-
141
- if current_segment:
142
- encoded_segment = tokenizer.encode(
143
- " ".join(current_segment),
144
- add_special_tokens=True,
145
- max_length=max_length + tolerance,
146
- truncation=True,
147
- )
148
- segments.append((current_segment, len(encoded_segment)))
149
-
150
- final_segments = []
151
- for i, (seg, length) in enumerate(segments):
152
- if i == len(segments) - 1:
153
- if length < min_last_segment_length and len(final_segments) > 0:
154
- prev_seg, prev_length = final_segments[-1]
155
- combined_encoded = tokenizer.encode(
156
- " ".join(prev_seg + seg),
157
- add_special_tokens=True,
158
- max_length=max_length + tolerance,
159
- truncation=True,
160
- )
161
- if len(combined_encoded) <= max_length + tolerance:
162
- final_segments[-1] = (prev_seg + seg, len(combined_encoded))
163
- else:
164
- final_segments.append((seg, length))
165
- else:
166
- final_segments.append((seg, length))
167
- else:
168
- final_segments.append((seg, length))
169
-
170
- decoded_segments = []
171
- encoded_segments = []
172
- for seg, _ in final_segments:
173
- encoded_segment = tokenizer.encode(
174
- " ".join(seg),
175
- add_special_tokens=True,
176
- max_length=max_length + tolerance,
177
- truncation=True,
178
- )
179
- decoded_segment = tokenizer.decode(encoded_segment)
180
- decoded_segments.append(decoded_segment)
181
- return decoded_segments
182
-
183
-
184
- def predict_quillbot(text):
185
- with torch.no_grad():
186
- quillbot_model.eval()
187
- tokenized_text = quillbot_tokenizer(
188
- text,
189
- padding="max_length",
190
- truncation=True,
191
- max_length=256,
192
- return_tensors="pt",
193
- ).to(device)
194
- output = quillbot_model(**tokenized_text)
195
- output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
196
- q_score = {
197
- "QuillBot": output_norm[1].item(),
198
- "Original": output_norm[0].item(),
199
- }
200
- return q_score
201
-
202
-
203
- def predict_bc(model, tokenizer, text):
204
- with torch.no_grad():
205
- model.eval()
206
- tokens = text_bc_tokenizer(
207
- text,
208
- padding="max_length",
209
- truncation=True,
210
- max_length=333,
211
- return_tensors="pt",
212
- ).to(device)
213
- output = model(**tokens)
214
- output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
215
- print("BC Score: ", output_norm)
216
- return output_norm
217
-
218
-
219
- def predict_mc(model, tokenizer, text):
220
- with torch.no_grad():
221
- model.eval()
222
- tokens = text_mc_tokenizer(
223
- text,
224
- padding="max_length",
225
- truncation=True,
226
- return_tensors="pt",
227
- max_length=256,
228
- ).to(device)
229
- output = model(**tokens)
230
- output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
231
- print("MC Score: ", output_norm)
232
- return output_norm
233
-
234
-
235
- def ai_generated_test(ai_option, input):
236
-
237
- bc_scores = []
238
- mc_scores = []
239
- samples_len_bc = len(
240
- split_text_allow_complete_sentences_nltk(input, type_det="bc")
241
- )
242
- samples_len_mc = len(
243
- split_text_allow_complete_sentences_nltk(input, type_det="mc")
244
- )
245
- segments_bc = split_text_allow_complete_sentences_nltk(input, type_det="bc")
246
- segments_mc = split_text_allow_complete_sentences_nltk(input, type_det="bc")
247
-
248
- for i in range(samples_len_bc):
249
- cleaned_text_bc = remove_special_characters(segments_bc[i])
250
- bc_score = predict_bc(text_bc_model, text_bc_tokenizer, cleaned_text_bc)
251
- bc_scores.append(bc_score)
252
-
253
- for i in range(samples_len_mc):
254
- cleaned_text_mc = remove_special_characters(segments_mc[i])
255
- mc_score = predict_mc(text_mc_model, text_mc_tokenizer, cleaned_text_mc)
256
- mc_scores.append(mc_score)
257
-
258
- bc_scores_array = np.array(bc_scores)
259
- mc_scores_array = np.array(mc_scores)
260
- average_bc_scores = np.mean(bc_scores_array, axis=0)
261
- average_mc_scores = np.mean(mc_scores_array, axis=0)
262
- bc_score_list = average_bc_scores.tolist()
263
- mc_score_list = average_mc_scores.tolist()
264
-
265
- bc_score = {"AI": bc_score_list[1], "HUMAN": bc_score_list[0]}
266
- mc_score = {}
267
- label_map = ["OpenAI GPT", "Mistral", "CLAUDE", "Gemini", "LLAMA 2"]
268
-
269
- for score, label in zip(mc_score_list, label_map):
270
- mc_score[label.upper()] = score
271
-
272
- sum_prob = 1 - bc_score["HUMAN"]
273
- for key, value in mc_score.items():
274
- mc_score[key] = value * sum_prob
275
-
276
- if ai_option == "Human vs AI":
277
- mc_score = {}
278
-
279
- if sum_prob < 0.01:
280
- mc_score = {}
281
- return bc_score, mc_score
282
  else:
283
- return bc_score, mc_score
 
 
 
284
 
285
 
286
  # COMBINED
@@ -310,7 +47,8 @@ def main(
310
  domains_to_skip,
311
  )
312
  depth_analysis_plot = depth_analysis(input)
313
- bc_score, mc_score = ai_generated_test(ai_option, input)
 
314
  quilscore = predict_quillbot(input)
315
 
316
  return (
@@ -322,120 +60,6 @@ def main(
322
  )
323
 
324
 
325
- def build_date(year, month, day):
326
- return f"{year}{months[month]}{day}"
327
-
328
-
329
- def len_validator(text):
330
- min_tokens = 200
331
- lengt = len(text_bc_tokenizer.tokenize(text=text, return_tensors="pt"))
332
- if lengt < min_tokens:
333
- return f"Warning! Input length is {lengt}. Please input a text that is greater than {min_tokens} tokens long. Recommended length {min_tokens*2} tokens."
334
- else:
335
- return f"Input length ({lengt}) is satisified."
336
-
337
-
338
- def extract_text_from_pdf(pdf_path):
339
- doc = fitz.open(pdf_path)
340
- text = ""
341
- for page in doc:
342
- text += page.get_text()
343
- return text
344
-
345
-
346
- # DEPTH ANALYSIS
347
- print("loading depth analysis")
348
- nltk.download("stopwords")
349
- nltk.download("punkt")
350
- command = ["python3", "-m", "spacy", "download", "en_core_web_sm"]
351
- # Execute the command
352
- subprocess.run(command)
353
- nlp = spacy.load("en_core_web_sm")
354
-
355
- # for perplexity
356
- model_id = "gpt2"
357
- gpt2_model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
358
- gpt2_tokenizer = GPT2TokenizerFast.from_pretrained(model_id)
359
-
360
-
361
- def depth_analysis(input_text):
362
-
363
- # vocanulary richness
364
- processed_words = preprocess_text1(input_text)
365
- ttr_value = vocabulary_richness_ttr(processed_words)
366
-
367
- # readability
368
- gunning_fog = calculate_gunning_fog(input_text)
369
- gunning_fog_norm = normalize(gunning_fog, min_value=0, max_value=20)
370
-
371
- # average sentence length and average word length
372
- words, sentences = preprocess_text2(input_text)
373
- average_sentence_length = calculate_average_sentence_length(sentences)
374
- average_word_length = calculate_average_word_length(words)
375
- average_sentence_length_norm = normalize(
376
- average_sentence_length, min_value=0, max_value=40
377
- )
378
- average_word_length_norm = normalize(
379
- average_word_length, min_value=0, max_value=8
380
- )
381
-
382
- # syntactic_tree_depth
383
- average_tree_depth = calculate_syntactic_tree_depth(nlp, input_text)
384
- average_tree_depth_norm = normalize(
385
- average_tree_depth, min_value=0, max_value=10
386
- )
387
-
388
- # perplexity
389
- perplexity = calculate_perplexity(
390
- input_text, gpt2_model, gpt2_tokenizer, device
391
- )
392
- perplexity_norm = normalize(perplexity, min_value=0, max_value=30)
393
-
394
- features = {
395
- "readability": gunning_fog_norm,
396
- "syntactic tree depth": average_tree_depth_norm,
397
- "vocabulary richness": ttr_value,
398
- "perplexity": perplexity_norm,
399
- "average sentence length": average_sentence_length_norm,
400
- "average word length": average_word_length_norm,
401
- }
402
-
403
- print(features)
404
-
405
- fig = go.Figure()
406
-
407
- fig.add_trace(
408
- go.Scatterpolar(
409
- r=list(features.values()),
410
- theta=list(features.keys()),
411
- fill="toself",
412
- name="Radar Plot",
413
- )
414
- )
415
-
416
- fig.update_layout(
417
- polar=dict(
418
- radialaxis=dict(
419
- visible=True,
420
- range=[0, 100],
421
- )
422
- ),
423
- showlegend=False,
424
- # autosize=False,
425
- # width=600,
426
- # height=600,
427
- margin=dict(
428
- l=10,
429
- r=20,
430
- b=10,
431
- t=10,
432
- # pad=100
433
- ),
434
- )
435
-
436
- return fig
437
-
438
-
439
  # START OF GRADIO
440
 
441
  title = "Copyright Checker"
@@ -497,7 +121,7 @@ with gr.Blocks() as demo:
497
  only_plagiarism_btn = gr.Button("Source Check")
498
 
499
  with gr.Row():
500
- quillbot_check = gr.Button("Humanized Text Check (Quillbot)")
501
 
502
  with gr.Row():
503
  depth_analysis_btn = gr.Button("Detailed Writing Analysis")
@@ -642,6 +266,4 @@ with gr.Blocks() as demo:
642
  date_from = ""
643
  date_to = ""
644
 
645
- demo.launch(
646
- share=True, server_name="0.0.0.0", auth=("polygraf-admin", "test@aisd")
647
- )
 
1
  import gradio as gr
2
  import numpy as np
3
  from datetime import date
4
+ from predictors import predict_bc_scores, predict_mc_scores
5
+ from analysis import depth_analysis
6
+ from predictors import predict_quillbot
7
+ from plagiarism import plagiarism_check, build_date
8
+ from utils import extract_text_from_pdf, len_validator
9
 
10
  np.set_printoptions(suppress=True)
11
 
12
 
13
+ def ai_generated_test(option, input):
14
+ if option == "Human vs AI":
15
+ return predict_bc_scores(input), None
16
  else:
17
+ return (
18
+ predict_bc_scores(input),
19
+ predict_mc_scores(input),
20
+ )
21
 
22
 
23
  # COMBINED
 
47
  domains_to_skip,
48
  )
49
  depth_analysis_plot = depth_analysis(input)
50
+ bc_score = predict_bc_scores(input)
51
+ mc_score = predict_mc_scores(input)
52
  quilscore = predict_quillbot(input)
53
 
54
  return (
 
60
  )
61
 
62
 
63
  # START OF GRADIO
64
 
65
  title = "Copyright Checker"
 
121
  only_plagiarism_btn = gr.Button("Source Check")
122
 
123
  with gr.Row():
124
+ quillbot_check = gr.Button("Humanized Text Check")
125
 
126
  with gr.Row():
127
  depth_analysis_btn = gr.Button("Detailed Writing Analysis")
 
266
  date_from = ""
267
  date_to = ""
268
 
269
+ demo.launch(share=True, auth=("polygraf-admin", "test@aisd"))
 
 
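After the refactor, app.py keeps only the Gradio wiring plus a thin ai_generated_test wrapper; scoring, plagiarism, and writing analysis live in the new modules. A rough sketch of the resulting call surface, assuming the models referenced in config.yaml are available (the placeholder text and comments are illustrative, not from the diff):

# Hypothetical standalone use of the refactored modules, mirroring what app.py's callbacks do.
from predictors import predict_bc_scores, predict_mc_scores, predict_quillbot
from analysis import depth_analysis

text = "..."  # any input longer than the 200-token minimum enforced by utils.len_validator

bc = predict_bc_scores(text)    # {"AI": ..., "HUMAN": ...}
mc = predict_mc_scores(text)    # per-model probabilities, rescaled by 1 - P(HUMAN)
quill = predict_quillbot(text)  # {"Humanized": ..., "Original": ...}
fig = depth_analysis(text)      # Plotly radar plot of writing features
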
explainability.py ADDED
@@ -0,0 +1,119 @@
+ import re, textstat
+ from nltk import FreqDist
+ from nltk.corpus import stopwords
+ from nltk.tokenize import word_tokenize, sent_tokenize
+ import torch
+ import nltk
+ from tqdm import tqdm
+
+ nltk.download("punkt")
+
+
+ def normalize(value, min_value, max_value):
+     normalized_value = ((value - min_value) * 100) / (max_value - min_value)
+     return max(0, min(100, normalized_value))
+
+
+ def preprocess_text1(text):
+     text = text.lower()
+     text = re.sub(r"[^\w\s]", "", text)  # remove punctuation
+     stop_words = set(stopwords.words("english"))  # remove stopwords
+     words = [word for word in text.split() if word not in stop_words]
+     words = [word for word in words if not word.isdigit()]  # remove numbers
+     return words
+
+
+ def vocabulary_richness_ttr(words):
+     unique_words = set(words)
+     ttr = len(unique_words) / len(words) * 100
+     return ttr
+
+
+ def calculate_gunning_fog(text):
+     """range 0-20"""
+     gunning_fog = textstat.gunning_fog(text)
+     return gunning_fog
+
+
+ def calculate_automated_readability_index(text):
+     """range 1-20"""
+     ari = textstat.automated_readability_index(text)
+     return ari
+
+
+ def calculate_flesch_reading_ease(text):
+     """range 0-100"""
+     fre = textstat.flesch_reading_ease(text)
+     return fre
+
+
+ def preprocess_text2(text):
+     sentences = sent_tokenize(text)
+     words = [
+         word.lower()
+         for sent in sentences
+         for word in word_tokenize(sent)
+         if word.isalnum()
+     ]
+     stop_words = set(stopwords.words("english"))
+     words = [word for word in words if word not in stop_words]
+     return words, sentences
+
+
+ def calculate_average_sentence_length(sentences):
+     """range 0-40 or 50 based on the histogram"""
+     total_words = sum(len(word_tokenize(sent)) for sent in sentences)
+     average_sentence_length = total_words / (len(sentences) + 0.0000001)
+     return average_sentence_length
+
+
+ def calculate_average_word_length(words):
+     """range 0-8 based on the histogram"""
+     total_characters = sum(len(word) for word in words)
+     average_word_length = total_characters / (len(words) + 0.0000001)
+     return average_word_length
+
+
+ def calculate_max_depth(sent):
+     return max(len(list(token.ancestors)) for token in sent)
+
+
+ def calculate_syntactic_tree_depth(nlp, text):
+     """0-10 based on the histogram"""
+     doc = nlp(text)
+     sentence_depths = [calculate_max_depth(sent) for sent in doc.sents]
+     average_depth = (
+         sum(sentence_depths) / len(sentence_depths) if sentence_depths else 0
+     )
+     return average_depth
+
+
+ def calculate_perplexity(text, model, tokenizer, device, stride=512):
+     """range 0-30 based on the histogram"""
+     encodings = tokenizer(text, return_tensors="pt")
+     max_length = model.config.n_positions
+     seq_len = encodings.input_ids.size(1)
+
+     nlls = []
+     prev_end_loc = 0
+     for begin_loc in tqdm(range(0, seq_len, stride)):
+         end_loc = min(begin_loc + max_length, seq_len)
+         trg_len = (
+             end_loc - prev_end_loc
+         )  # may be different from stride on last loop
+         input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
+         target_ids = input_ids.clone()
+         target_ids[:, :-trg_len] = -100
+
+         with torch.no_grad():
+             outputs = model(input_ids, labels=target_ids)
+             neg_log_likelihood = outputs.loss
+
+         nlls.append(neg_log_likelihood)
+
+         prev_end_loc = end_loc
+         if end_loc == seq_len:
+             break
+
+     ppl = torch.exp(torch.stack(nlls).mean())
+     return ppl.item()
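
Each raw metric above is squashed onto a common 0-100 scale before plotting, using the empirical ranges noted in the docstrings. A quick worked example of that scaling (input values chosen for illustration):

from explainability import normalize

# A Gunning fog score of 12 on its assumed 0-20 range lands at 60 on the radar axis.
print(normalize(12, min_value=0, max_value=20))  # 60.0
# Values outside the range are clipped to the [0, 100] band.
print(normalize(45, min_value=0, max_value=30))  # 100
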
plagiarism.py CHANGED
@@ -8,6 +8,7 @@ import asyncio
  import httpx
  from bs4 import BeautifulSoup
  import numpy as np
+ import concurrent
 
 
  WORD = re.compile(r"\w+")
@@ -129,7 +130,7 @@ def split_sentence_blocks(text):
      sents = sent_tokenize(text)
      two_sents = []
      for i in range(len(sents)):
-         if (i % 2) == 0:
+         if (i % 4) == 0:
              two_sents.append(sents[i])
          else:
              two_sents[len(two_sents) - 1] += " " + sents[i]
@@ -188,9 +189,9 @@ async def parallel_scrap(urls):
      return results
 
 
- def matching_score(sentence, content):
-     sentence = remove_punc(sentence)
-     content = remove_punc(content)
+ def matching_score(args_list):
+     sentence = remove_punc(args_list[0])
+     content = remove_punc(args_list[1])
      if sentence in content:
          return 1
      else:
@@ -250,11 +251,14 @@ def plagiarism_check(
      if soup:
          page_content = soup.text
          for j, sent in enumerate(sentences):
-             score = matching_score(sent, page_content)
-             score = matching_score(sent, page_content)
+             args_list = (sent, page_content)
+             score = matching_score(args_list)
              # score = cos_sim_torch(embed_text(sent), source_embeddings[i])
              ScoreArray[i][j] = score
 
+     # with concurrent.futures.ProcessPoolExecutor() as executor:
+     #     results = executor.map(matching_score, args_list)
+
      # *****IF THIS IS TO BE USED, PLEASE PROVIDE "preprocess()" FUNCTION IN LINE 248**************
      # source_embeddings = []
      # for i, soup in enumerate(soups):
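
The signature change to matching_score (a single (sentence, content) tuple instead of two arguments) lines up with the commented-out ProcessPoolExecutor block: executor.map passes one item per call, so packing the pair into a tuple lets the same function run serially or in a pool. A minimal sketch of that intended pattern, with toy data and a toy scorer standing in for the real scraped pages:

import concurrent.futures

def matching_score(args_list):
    # Same calling shape as the refactored plagiarism.matching_score: unpack the pair.
    sentence, content = args_list
    return 1 if sentence in content else 0  # toy scoring for illustration

if __name__ == "__main__":
    sentences = ["the quick brown fox", "jumps over the lazy dog"]
    page_content = "the quick brown fox jumps over something else"
    pairs = [(sent, page_content) for sent in sentences]

    # Serial path (what plagiarism_check does today).
    serial_scores = [matching_score(p) for p in pairs]

    # Parallel path that the commented-out executor block points at.
    with concurrent.futures.ProcessPoolExecutor() as executor:
        parallel_scores = list(executor.map(matching_score, pairs))

    print(serial_scores, parallel_scores)  # [1, 0] [1, 0]
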
predictors.py ADDED
@@ -0,0 +1,246 @@
+ import requests
+ import httpx
+ import torch
+ import re
+ from bs4 import BeautifulSoup
+ import numpy as np
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
+ import asyncio
+ from evaluate import load
+ from datetime import date
+ import nltk
+ from transformers import GPT2LMHeadModel, GPT2TokenizerFast
+ import plotly.graph_objects as go
+ import torch.nn.functional as F
+ import nltk
+ from unidecode import unidecode
+ import time
+ from scipy.special import softmax
+ import yaml
+ import os
+ from utils import *
+ from dotenv import load_dotenv
+
+ with open("config.yaml", "r") as file:
+     params = yaml.safe_load(file)
+ nltk.download("punkt")
+ nltk.download("stopwords")
+ load_dotenv()
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ text_bc_model_path = params["TEXT_BC_MODEL_PATH"]
+ text_mc_model_path = params["TEXT_MC_MODEL_PATH"]
+ text_quillbot_model_path = params["TEXT_QUILLBOT_MODEL_PATH"]
+ quillbot_labels = params["QUILLBOT_LABELS"]
+ mc_label_map = params["MC_OUTPUT_LABELS"]
+ mc_token_size = int(params["MC_TOKEN_SIZE"])
+ bc_token_size = int(params["BC_TOKEN_SIZE"])
+ text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path)
+ text_bc_model = AutoModelForSequenceClassification.from_pretrained(
+     text_bc_model_path
+ ).to(device)
+ text_mc_tokenizer = AutoTokenizer.from_pretrained(text_mc_model_path)
+ text_mc_model = AutoModelForSequenceClassification.from_pretrained(
+     text_mc_model_path
+ ).to(device)
+ quillbot_tokenizer = AutoTokenizer.from_pretrained(text_quillbot_model_path)
+ quillbot_model = AutoModelForSequenceClassification.from_pretrained(
+     text_quillbot_model_path
+ ).to(device)
+
+
+ def split_text_allow_complete_sentences_nltk(
+     text,
+     max_length=256,
+     tolerance=30,
+     min_last_segment_length=100,
+     type_det="bc",
+ ):
+     sentences = nltk.sent_tokenize(text)
+     segments = []
+     current_segment = []
+     current_length = 0
+     if type_det == "bc":
+         tokenizer = text_bc_tokenizer
+         max_length = bc_token_size
+     elif type_det == "mc":
+         tokenizer = text_mc_tokenizer
+         max_length = mc_token_size
+     for sentence in sentences:
+         tokens = tokenizer.tokenize(sentence)
+         sentence_length = len(tokens)
+
+         if current_length + sentence_length <= max_length + tolerance - 2:
+             current_segment.append(sentence)
+             current_length += sentence_length
+         else:
+             if current_segment:
+                 encoded_segment = tokenizer.encode(
+                     " ".join(current_segment),
+                     add_special_tokens=True,
+                     max_length=max_length + tolerance,
+                     truncation=True,
+                 )
+                 segments.append((current_segment, len(encoded_segment)))
+             current_segment = [sentence]
+             current_length = sentence_length
+
+     if current_segment:
+         encoded_segment = tokenizer.encode(
+             " ".join(current_segment),
+             add_special_tokens=True,
+             max_length=max_length + tolerance,
+             truncation=True,
+         )
+         segments.append((current_segment, len(encoded_segment)))
+
+     final_segments = []
+     for i, (seg, length) in enumerate(segments):
+         if i == len(segments) - 1:
+             if length < min_last_segment_length and len(final_segments) > 0:
+                 prev_seg, prev_length = final_segments[-1]
+                 combined_encoded = tokenizer.encode(
+                     " ".join(prev_seg + seg),
+                     add_special_tokens=True,
+                     max_length=max_length + tolerance,
+                     truncation=True,
+                 )
+                 if len(combined_encoded) <= max_length + tolerance:
+                     final_segments[-1] = (prev_seg + seg, len(combined_encoded))
+                 else:
+                     final_segments.append((seg, length))
+             else:
+                 final_segments.append((seg, length))
+         else:
+             final_segments.append((seg, length))
+
+     decoded_segments = []
+     encoded_segments = []
+     for seg, _ in final_segments:
+         encoded_segment = tokenizer.encode(
+             " ".join(seg),
+             add_special_tokens=True,
+             max_length=max_length + tolerance,
+             truncation=True,
+         )
+         decoded_segment = tokenizer.decode(encoded_segment)
+         decoded_segments.append(decoded_segment)
+     return decoded_segments
+
+
+ def predict_quillbot(text):
+     with torch.no_grad():
+         quillbot_model.eval()
+         tokenized_text = quillbot_tokenizer(
+             text,
+             padding="max_length",
+             truncation=True,
+             max_length=256,
+             return_tensors="pt",
+         ).to(device)
+         output = quillbot_model(**tokenized_text)
+         output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
+         q_score = {
+             "Humanized": output_norm[1].item(),
+             "Original": output_norm[0].item(),
+         }
+         return q_score
+
+
+ def predict_bc(model, tokenizer, text):
+     with torch.no_grad():
+         model.eval()
+         tokens = text_bc_tokenizer(
+             text,
+             padding="max_length",
+             truncation=True,
+             max_length=bc_token_size,
+             return_tensors="pt",
+         ).to(device)
+         output = model(**tokens)
+         output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
+         return output_norm
+
+
+ def predict_mc(model, tokenizer, text):
+     with torch.no_grad():
+         model.eval()
+         tokens = text_mc_tokenizer(
+             text,
+             padding="max_length",
+             truncation=True,
+             return_tensors="pt",
+             max_length=mc_token_size,
+         ).to(device)
+         output = model(**tokens)
+         output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
+         return output_norm
+
+
+ def predict_mc_scores(input):
+     bc_scores = []
+     mc_scores = []
+
+     samples_len_bc = len(
+         split_text_allow_complete_sentences_nltk(input, type_det="bc")
+     )
+     segments_bc = split_text_allow_complete_sentences_nltk(input, type_det="bc")
+     for i in range(samples_len_bc):
+         cleaned_text_bc = remove_special_characters(segments_bc[i])
+         bc_score = predict_bc(text_bc_model, text_bc_tokenizer, cleaned_text_bc)
+         bc_scores.append(bc_score)
+     bc_scores_array = np.array(bc_scores)
+     average_bc_scores = np.mean(bc_scores_array, axis=0)
+     bc_score_list = average_bc_scores.tolist()
+     bc_score = {"AI": bc_score_list[1], "HUMAN": bc_score_list[0]}
+     segments_mc = split_text_allow_complete_sentences_nltk(input, type_det="mc")
+     samples_len_mc = len(
+         split_text_allow_complete_sentences_nltk(input, type_det="mc")
+     )
+     for i in range(samples_len_mc):
+         cleaned_text_mc = remove_special_characters(segments_mc[i])
+         mc_score = predict_mc(text_mc_model, text_mc_tokenizer, cleaned_text_mc)
+         mc_scores.append(mc_score)
+     mc_scores_array = np.array(mc_scores)
+     average_mc_scores = np.mean(mc_scores_array, axis=0)
+     mc_score_list = average_mc_scores.tolist()
+     mc_score = {}
+     for score, label in zip(mc_score_list, mc_label_map):
+         mc_score[label.upper()] = score
+
+     sum_prob = 1 - bc_score["HUMAN"]
+     for key, value in mc_score.items():
+         mc_score[key] = value * sum_prob
+     if sum_prob < 0.01:
+         mc_score = {}
+
+     return mc_score
+
+
+ def predict_bc_scores(input):
+     bc_scores = []
+     mc_scores = []
+     samples_len_bc = len(
+         split_text_allow_complete_sentences_nltk(input, type_det="bc")
+     )
+     segments_bc = split_text_allow_complete_sentences_nltk(input, type_det="bc")
+     for i in range(samples_len_bc):
+         cleaned_text_bc = remove_special_characters(segments_bc[i])
+         bc_score = predict_bc(text_bc_model, text_bc_tokenizer, cleaned_text_bc)
+         bc_scores.append(bc_score)
+     bc_scores_array = np.array(bc_scores)
+     average_bc_scores = np.mean(bc_scores_array, axis=0)
+     bc_score_list = average_bc_scores.tolist()
+     bc_score = {"AI": bc_score_list[1], "HUMAN": bc_score_list[0]}
+     return bc_score
+
+
+ # def predict_1on1(input):
+ #     models = ['bard', 'claude', 'gpt4', 'mistral_ai', 'llama2']
+ #     text = str(row["text"])
+ #     predictions = {}
+ #     prediction = predict(text, bard_model, bard_tokenizer) predictions['bard'] = prediction[1]
+ #     prediction = predict(text, claude_model, claude_tokenizer) predictions['claude'] = prediction[1]
+ #     prediction = predict(text, gpt4_model, gpt4_tokenizer) predictions['gpt4'] = prediction[1]
+ #     prediction = predict(text, mistral_ai_model, mistral_ai_tokenizer) predictions['mistral_ai'] = prediction[1]
+ #     prediction = predict(text, llama2_model, llama2_tokenizer) predictions['llama2'] = prediction[1]
+ #     max_key = max(predictions, key=predictions.get)
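
predictors.py (like analysis.py above) now pulls every model path, label set, and token size from config.yaml instead of hard-coding them. That file is not part of this diff, so its exact contents are unknown; a hypothetical stand-in with the keys the code reads might look like the dict below, with values guessed from the constants removed from app.py rather than taken from the real config:

# Hypothetical stand-in for config.yaml, written as the dict yaml.safe_load would return.
# Keys match what predictors.py / analysis.py index into `params`; values are placeholders
# inferred from the old hard-coded app.py constants, not the project's actual settings.
params = {
    "TEXT_BC_MODEL_PATH": "polygraf-ai/text-detect-bc-v11-4m",
    "TEXT_MC_MODEL_PATH": "polygraf-ai/ai-text-detection-mc-robert-open-ai-detector-v4",
    "TEXT_QUILLBOT_MODEL_PATH": "polygraf-ai/quillbot-detector-28k",
    "READABILITY_MODEL_ID": "gpt2",  # perplexity model used by analysis.py
    "QUILLBOT_LABELS": ["Original", "QuillBot"],
    "MC_OUTPUT_LABELS": ["OpenAI GPT", "Mistral", "CLAUDE", "Gemini", "LLAMA 2"],
    "BC_TOKEN_SIZE": 333,
    "MC_TOKEN_SIZE": 256,
}
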
requirements.txt CHANGED
@@ -6,8 +6,8 @@ BeautifulSoup4
  scrapingbee
  requests
  numpy
- torch==1.13.0
- transformers==4.25.1
+ torch
+ transformers
  transformers-interpret
  textstat
  scipy
utils.py CHANGED
@@ -11,284 +11,354 @@ import asyncio
11
  import nltk
12
  from sentence_transformers import SentenceTransformer, util
13
  import threading
14
 
15
- nltk.download('punkt')
16
 
17
  WORD = re.compile(r"\w+")
18
- model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
19
 
20
 
21
  # returns cosine similarity of two vectors
22
  # input: two vectors
23
  # output: integer between 0 and 1.
24
- def get_cosine(vec1, vec2):
25
- intersection = set(vec1.keys()) & set(vec2.keys())
26
 
27
- # calculating numerator
28
- numerator = sum([vec1[x] * vec2[x] for x in intersection])
29
 
30
- # calculating denominator
31
- sum1 = sum([vec1[x] ** 2 for x in vec1.keys()])
32
- sum2 = sum([vec2[x] ** 2 for x in vec2.keys()])
33
- denominator = math.sqrt(sum1) * math.sqrt(sum2)
 
34
 
35
- # checking for divide by zero
36
- if denominator == 0:
37
- return 0.0
38
- else:
39
- return float(numerator) / denominator
40
-
41
-
42
- # converts given text into a vector
43
- def text_to_vector(text):
44
- # uses the Regular expression above and gets all words
45
- words = WORD.findall(text)
46
- # returns a counter of all the words (count of number of occurences)
47
- return Counter(words)
48
-
49
-
50
- # returns cosine similarity of two words
51
- # uses: text_to_vector(text) and get_cosine(v1,v2)
52
- def cosineSim(text1, text2):
53
- vector1 = text_to_vector(text1)
54
- vector2 = text_to_vector(text2)
55
- # print vector1,vector2
56
- cosine = get_cosine(vector1, vector2)
57
- return cosine
58
-
59
- def cos_sim_torch(embedding_1, embedding_2):
60
- return util.pytorch_cos_sim(embedding_1, embedding_2).item()
61
-
62
- def embed_text(text):
63
- return model.encode(text, convert_to_tensor=True)
64
-
65
- def sentence_similarity(text1, text2):
66
- embedding_1= model.encode(text1, convert_to_tensor=True)
67
- embedding_2 = model.encode(text2, convert_to_tensor=True)
68
-
69
- o = util.pytorch_cos_sim(embedding_1, embedding_2)
70
- return o.item()
71
-
72
- def get_soup_requests(url):
73
- page = requests.get(url)
74
- if page.status_code == 200:
75
- soup = BeautifulSoup(page.content, "html.parser")
76
- return soup
77
- print("HTML soup failed")
78
- return None
79
-
80
-
81
- def get_soup_httpx(url):
82
- client = httpx.Client(timeout=30)
83
- try:
84
- page = client.get(url)
85
- if page.status_code == httpx.codes.OK:
86
- soup = BeautifulSoup(page.content, "html.parser")
87
- return soup
88
- except:
89
- print("HTTPx soup failed")
90
- return None
91
-
92
- def getSentences(text):
93
- from nltk.tokenize import sent_tokenize
94
-
95
- sents = sent_tokenize(text)
96
- two_sents = []
97
- for i in range(len(sents)):
98
- if (i % 2) == 0:
99
- two_sents.append(sents[i])
100
- else:
101
- two_sents[len(two_sents) - 1] += " " + sents[i]
102
- return two_sents
103
-
104
-
105
- def googleSearch(
106
- plag_option,
107
- sentences,
108
- urlCount,
109
- scoreArray,
110
- urlList,
111
- sorted_date,
112
- domains_to_skip,
113
- api_key,
114
- cse_id,
115
- **kwargs,
116
- ):
117
- service = build("customsearch", "v1", developerKey=api_key)
118
- for i, sentence in enumerate(sentences):
119
- results = (
120
- service.cse()
121
- .list(q=sentence, cx=cse_id, sort=sorted_date, **kwargs)
122
- .execute()
123
- )
124
- if "items" in results and len(results["items"]) > 0:
125
- for count, link in enumerate(results["items"]):
126
- # stop after 3 pages
127
- if count >= 3:
128
- break
129
- # skip user selected domains
130
- if any(
131
- ("." + domain) in link["link"]
132
- for domain in domains_to_skip
133
- ):
134
- continue
135
- # clean up snippet of '...'
136
- snippet = link["snippet"]
137
- ind = snippet.find("...")
138
- if ind < 20 and ind > 9:
139
- snippet = snippet[ind + len("... ") :]
140
- ind = snippet.find("...")
141
- if ind > len(snippet) - 5:
142
- snippet = snippet[:ind]
143
-
144
- # update cosine similarity between snippet and given text
145
- url = link["link"]
146
- if url not in urlList:
147
- urlList.append(url)
148
- scoreArray.append([0] * len(sentences))
149
- urlCount[url] = urlCount[url] + 1 if url in urlCount else 1
150
- if plag_option == 'Standard':
151
- scoreArray[urlList.index(url)][i] = cosineSim(
152
- sentence, snippet)
153
- else :
154
- scoreArray[urlList.index(url)][i] = sentence_similarity(
155
- sentence, snippet
156
- )
157
- else:
158
- print("Google Search failed")
159
- return urlCount, scoreArray
160
-
161
-
162
- def getQueries(text, n):
163
- # return n-grams of size n
164
- words = text.split()
165
- return [words[i : i + n] for i in range(len(words) - n + 1)]
166
-
167
-
168
- def print2D(array):
169
- print(np.array(array))
170
-
171
-
172
- def removePunc(text):
173
- res = re.sub(r"[^\w\s]", "", text)
174
- return res
175
-
176
-
177
- async def get_url_data(url, client):
178
- try:
179
- r = await client.get(url)
180
- # print(r.status_code)
181
- if r.status_code == 200:
182
- # print("in")
183
- soup = BeautifulSoup(r.content, "html.parser")
184
- return soup
185
- except Exception:
186
- print("HTTPx parallel soup failed")
187
- return None
188
-
189
-
190
- async def parallel_scrap(urls):
191
- async with httpx.AsyncClient(timeout=30) as client:
192
- tasks = []
193
- for url in urls:
194
- tasks.append(get_url_data(url=url, client=client))
195
- results = await asyncio.gather(*tasks, return_exceptions=True)
196
- return results
197
-
198
-
199
- class TimeoutError(Exception):
200
- pass
201
-
202
-
203
-
204
- def matchingScore(sentence, content):
205
- if sentence in content:
206
- return 1
207
- sentence = removePunc(sentence)
208
- content = removePunc(content)
209
- if sentence in content:
210
- return 1
211
- else:
212
- n = 5
213
- ngrams = getQueries(sentence, n)
214
- if len(ngrams) == 0:
215
- return 0
216
- matched = [x for x in ngrams if " ".join(x) in content]
217
- return len(matched) / len(ngrams)
218
 
 
219
 
220
- # def matchingScoreWithTimeout(sentence, content):
221
- # def timeout_handler():
222
- # raise TimeoutError("Function timed out")
223
 
224
- # timer = threading.Timer(10, timeout_handler) # Set a timer for 2 seconds
225
- # timer.start()
 
226
  # try:
227
- # score = sentence_similarity(sentence, content)
228
- # # score = matchingScore(sentence, content)
229
- # timer.cancel() # Cancel the timer if calculation completes before timeout
230
- # return score
231
- # except TimeoutError:
232
- # return 0
233
 
 
 
234
 
235
- # async def matchingScoreAsync(sentences, content, content_idx, ScoreArray):
236
  # content = removePunc(content)
237
  # for j, sentence in enumerate(sentences):
238
  # sentence = removePunc(sentence)
239
- # if sentence in content:
240
- # ScoreArray[content_idx][j] = 1
241
- # else:
242
- # n = 5
243
- # ngrams = getQueries(sentence, n)
244
- # if len(ngrams) == 0:
245
- # return 0
246
- # matched = [x for x in ngrams if " ".join(x) in content]
247
- # ScoreArray[content_idx][j] = len(matched) / len(ngrams)
248
  # print(
249
- # f"Analyzed {content_idx+1} of soups (SOUP SUCCEEDED)........................"
250
  # )
251
  # return ScoreArray
252
 
253
- async def matchingScoreAsync(sentences, content, content_idx, ScoreArray, model, util):
254
- content = removePunc(content)
255
- for j, sentence in enumerate(sentences):
256
- sentence = removePunc(sentence)
257
- similarity_score = sentence_similarity(sentence, content, model, util)
258
- ScoreArray[content_idx][j] = similarity_score
259
- print(f"Analyzed {content_idx+1} of contents (CONTENT ANALYZED)........................")
260
- return ScoreArray
261
-
262
-
263
- async def parallel_analyze(soups, sentences, ScoreArray):
264
- tasks = []
265
- for i, soup in enumerate(soups):
266
- if soup:
267
- page_content = soup.text
268
- tasks.append(
269
- matchingScoreAsync(sentences, page_content, i, ScoreArray)
270
- )
271
- else:
272
- print(
273
- f"Analyzed {i+1} of soups (SOUP FAILED)........................"
274
- )
275
- ScoreArray = await asyncio.gather(*tasks, return_exceptions=True)
276
- return ScoreArray
277
-
278
-
279
- async def parallel_analyze_2(soups, sentences, ScoreArray):
280
- tasks = [[0] * len(ScoreArray[0]) for i in range(len(ScoreArray))]
281
- for i, soup in enumerate(soups):
282
- if soup:
283
- page_content = soup.text
284
- for j, sent in enumerate(sentences):
285
- print(
286
- f"Analyzing {i+1} of {len(soups)} soups with {j+1} of {len(sentences)} sentences........................"
287
- )
288
- tasks[i][j] = sentence_similarity(sent, page_content)
289
- else:
290
- print(
291
- f"Analyzed {i+1} of soups (SOUP FAILED)........................"
292
- )
293
- ScoreArray = await asyncio.gather(*tasks, return_exceptions=True)
294
- return ScoreArray
 
11
  import nltk
12
  from sentence_transformers import SentenceTransformer, util
13
  import threading
14
+ import torch
15
+ import re
16
+ import numpy as np
17
+ import asyncio
18
+ from datetime import date
19
+ import nltk
20
+ from unidecode import unidecode
21
+ from scipy.special import softmax
22
+ from transformers import AutoTokenizer
23
+ import yaml
24
+ import fitz
25
+ import os
26
+
27
+
28
+ def remove_accents(input_str):
29
+ text_no_accents = unidecode(input_str)
30
+ return text_no_accents
31
+
32
+
33
+ def remove_special_characters(text):
34
+ text = remove_accents(text)
35
+ pattern = r'[^\w\s\d.,!?\'"()-;]+'
36
+ text = re.sub(pattern, "", text)
37
+ return text
38
+
39
+
40
+ def remove_special_characters_2(text):
41
+ pattern = r"[^a-zA-Z0-9 ]+"
42
+ text = re.sub(pattern, "", text)
43
+ return text
44
+
45
+
46
+ def update_character_count(text):
47
+ return f"{len(text)} characters"
48
+
49
+
50
+ nltk.download("punkt")
51
+
52
+
53
+ with open("config.yaml", "r") as file:
54
+ params = yaml.safe_load(file)
55
+
56
+ text_bc_model_path = params["TEXT_BC_MODEL_PATH"]
57
+
58
+ text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path)
59
+
60
+
61
+ def len_validator(text):
62
+ min_tokens = 200
63
+ lengt = len(text_bc_tokenizer.tokenize(text=text, return_tensors="pt"))
64
+ if lengt < min_tokens:
65
+ return f"Warning! Input length is {lengt}. Please input a text that is greater than {min_tokens} tokens long. Recommended length {min_tokens*2} tokens."
66
+ else:
67
+ return f"Input length ({lengt}) is satisified."
68
+
69
+
70
+ def extract_text_from_pdf(pdf_path):
71
+ doc = fitz.open(pdf_path)
72
+ text = ""
73
+ for page in doc:
74
+ text += page.get_text()
75
+ return text
76
 
 
77
 
78
  WORD = re.compile(r"\w+")
79
+ model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
80
 
81
 
82
  # returns cosine similarity of two vectors
83
  # input: two vectors
84
  # output: integer between 0 and 1.
85
+ # def get_cosine(vec1, vec2):
86
+ # intersection = set(vec1.keys()) & set(vec2.keys())
87
 
88
+ # # calculating numerator
89
+ # numerator = sum([vec1[x] * vec2[x] for x in intersection])
90
 
91
+ # # calculating denominator
92
+ # sum1 = sum([vec1[x] ** 2 for x in vec1.keys()])
93
+ # sum2 = sum([vec2[x] ** 2 for x in vec2.keys()])
94
+ # denominator = math.sqrt(sum1) * math.sqrt(sum2)
95
+
96
+ # # checking for divide by zero
97
+ # if denominator == 0:
98
+ # return 0.0
99
+ # else:
100
+ # return float(numerator) / denominator
101
+
102
+
103
+ # # converts given text into a vector
104
+ # def text_to_vector(text):
105
+ # # uses the Regular expression above and gets all words
106
+ # words = WORD.findall(text)
107
+ # # returns a counter of all the words (count of number of occurences)
108
+ # return Counter(words)
109
 
110
 
111
+ # # returns cosine similarity of two words
112
+ # # uses: text_to_vector(text) and get_cosine(v1,v2)
113
+ # def cosineSim(text1, text2):
114
+ # vector1 = text_to_vector(text1)
115
+ # vector2 = text_to_vector(text2)
116
+ # # print vector1,vector2
117
+ # cosine = get_cosine(vector1, vector2)
118
+ # return cosine
119
 
 
 
 
120
 
121
+ # def cos_sim_torch(embedding_1, embedding_2):
122
+ # return util.pytorch_cos_sim(embedding_1, embedding_2).item()
123
+
124
+
125
+ # def embed_text(text):
126
+ # return model.encode(text, convert_to_tensor=True)
127
+
128
+
129
+ # def sentence_similarity(text1, text2):
130
+ # embedding_1 = model.encode(text1, convert_to_tensor=True)
131
+ # embedding_2 = model.encode(text2, convert_to_tensor=True)
132
+
133
+ # o = util.pytorch_cos_sim(embedding_1, embedding_2)
134
+ # return o.item()
135
+
136
+
137
+ # def get_soup_requests(url):
138
+ # page = requests.get(url)
139
+ # if page.status_code == 200:
140
+ # soup = BeautifulSoup(page.content, "html.parser")
141
+ # return soup
142
+ # print("HTML soup failed")
143
+ # return None
144
+
145
+
146
+ # def get_soup_httpx(url):
147
+ # client = httpx.Client(timeout=30)
148
  # try:
149
+ # page = client.get(url)
150
+ # if page.status_code == httpx.codes.OK:
151
+ # soup = BeautifulSoup(page.content, "html.parser")
152
+ # return soup
153
+ # except:
154
+ # print("HTTPx soup failed")
155
+ # return None
156
+
157
+
158
+ # def getSentences(text):
159
+ # from nltk.tokenize import sent_tokenize
160
+
161
+ # sents = sent_tokenize(text)
162
+ # two_sents = []
163
+ # for i in range(len(sents)):
164
+ # if (i % 2) == 0:
165
+ # two_sents.append(sents[i])
166
+ # else:
167
+ # two_sents[len(two_sents) - 1] += " " + sents[i]
168
+ # return two_sents
169
+
170
+
171
+ # def googleSearch(
172
+ # plag_option,
173
+ # sentences,
174
+ # urlCount,
175
+ # scoreArray,
176
+ # urlList,
177
+ # sorted_date,
178
+ # domains_to_skip,
179
+ # api_key,
180
+ # cse_id,
181
+ # **kwargs,
182
+ # ):
183
+ # service = build("customsearch", "v1", developerKey=api_key)
184
+ # for i, sentence in enumerate(sentences):
185
+ # results = (
186
+ # service.cse()
187
+ # .list(q=sentence, cx=cse_id, sort=sorted_date, **kwargs)
188
+ # .execute()
189
+ # )
190
+ # if "items" in results and len(results["items"]) > 0:
191
+ # for count, link in enumerate(results["items"]):
192
+ # # stop after 3 pages
193
+ # if count >= 3:
194
+ # break
195
+ # # skip user selected domains
196
+ # if any(
197
+ # ("." + domain) in link["link"] for domain in domains_to_skip
198
+ # ):
199
+ # continue
200
+ # # clean up snippet of '...'
201
+ # snippet = link["snippet"]
202
+ # ind = snippet.find("...")
203
+ # if ind < 20 and ind > 9:
204
+ # snippet = snippet[ind + len("... ") :]
205
+ # ind = snippet.find("...")
206
+ # if ind > len(snippet) - 5:
207
+ # snippet = snippet[:ind]
208
+
209
+ # # update cosine similarity between snippet and given text
210
+ # url = link["link"]
211
+ # if url not in urlList:
212
+ # urlList.append(url)
213
+ # scoreArray.append([0] * len(sentences))
214
+ # urlCount[url] = urlCount[url] + 1 if url in urlCount else 1
215
+ # if plag_option == "Standard":
216
+ # scoreArray[urlList.index(url)][i] = cosineSim(
217
+ # sentence, snippet
218
+ # )
219
+ # else:
220
+ # scoreArray[urlList.index(url)][i] = sentence_similarity(
221
+ # sentence, snippet
222
+ # )
223
+ # else:
224
+ # print("Google Search failed")
225
+ # return urlCount, scoreArray
226
+
227
+
228
+ # def getQueries(text, n):
229
+ # # return n-grams of size n
230
+ # words = text.split()
231
+ # return [words[i : i + n] for i in range(len(words) - n + 1)]
232
+
233
 
234
+ # def print2D(array):
235
+ # print(np.array(array))
236
 
237
+
238
+ # def removePunc(text):
239
+ # res = re.sub(r"[^\w\s]", "", text)
240
+ # return res
241
+
242
+
243
+ # async def get_url_data(url, client):
244
+ # try:
245
+ # r = await client.get(url)
246
+ # # print(r.status_code)
247
+ # if r.status_code == 200:
248
+ # # print("in")
249
+ # soup = BeautifulSoup(r.content, "html.parser")
250
+ # return soup
251
+ # except Exception:
252
+ # print("HTTPx parallel soup failed")
253
+ # return None
254
+
255
+
256
+ # async def parallel_scrap(urls):
257
+ # async with httpx.AsyncClient(timeout=30) as client:
258
+ # tasks = []
259
+ # for url in urls:
260
+ # tasks.append(get_url_data(url=url, client=client))
261
+ # results = await asyncio.gather(*tasks, return_exceptions=True)
262
+ # return results
263
+
264
+
265
+ # class TimeoutError(Exception):
266
+ # pass
267
+
268
+
269
+ # def matchingScore(sentence, content):
270
+ # if sentence in content:
271
+ # return 1
272
+ # sentence = removePunc(sentence)
273
+ # content = removePunc(content)
274
+ # if sentence in content:
275
+ # return 1
276
+ # else:
277
+ # n = 5
278
+ # ngrams = getQueries(sentence, n)
279
+ # if len(ngrams) == 0:
280
+ # return 0
281
+ # matched = [x for x in ngrams if " ".join(x) in content]
282
+ # return len(matched) / len(ngrams)
283
+
284
+
285
+ # # def matchingScoreWithTimeout(sentence, content):
286
+ # # def timeout_handler():
287
+ # # raise TimeoutError("Function timed out")
288
+
289
+ # # timer = threading.Timer(10, timeout_handler) # Set a timer for 2 seconds
290
+ # # timer.start()
291
+ # # try:
292
+ # # score = sentence_similarity(sentence, content)
293
+ # # # score = matchingScore(sentence, content)
294
+ # # timer.cancel() # Cancel the timer if calculation completes before timeout
295
+ # # return score
296
+ # # except TimeoutError:
297
+ # # return 0
298
+
299
+
300
+ # # async def matchingScoreAsync(sentences, content, content_idx, ScoreArray):
301
+ # # content = removePunc(content)
302
+ # # for j, sentence in enumerate(sentences):
303
+ # # sentence = removePunc(sentence)
304
+ # # if sentence in content:
305
+ # # ScoreArray[content_idx][j] = 1
306
+ # # else:
307
+ # # n = 5
308
+ # # ngrams = getQueries(sentence, n)
309
+ # # if len(ngrams) == 0:
310
+ # # return 0
311
+ # # matched = [x for x in ngrams if " ".join(x) in content]
312
+ # # ScoreArray[content_idx][j] = len(matched) / len(ngrams)
313
+ # # print(
314
+ # # f"Analyzed {content_idx+1} of soups (SOUP SUCCEEDED)........................"
315
+ # # )
316
+ # # return ScoreArray
317
+
318
+
319
+ # async def matchingScoreAsync(
320
+ # sentences, content, content_idx, ScoreArray, model, util
321
+ # ):
322
  # content = removePunc(content)
323
  # for j, sentence in enumerate(sentences):
324
  # sentence = removePunc(sentence)
325
+ # similarity_score = sentence_similarity(sentence, content, model, util)
326
+ # ScoreArray[content_idx][j] = similarity_score
327
  # print(
328
+ # f"Analyzed {content_idx+1} of contents (CONTENT ANALYZED)........................"
329
  # )
330
  # return ScoreArray
331
 
332
+
333
+ # async def parallel_analyze(soups, sentences, ScoreArray):
334
+ # tasks = []
335
+ # for i, soup in enumerate(soups):
336
+ # if soup:
337
+ # page_content = soup.text
338
+ # tasks.append(
339
+ # matchingScoreAsync(sentences, page_content, i, ScoreArray)
340
+ # )
341
+ # else:
342
+ # print(
343
+ # f"Analyzed {i+1} of soups (SOUP FAILED)........................"
344
+ # )
345
+ # ScoreArray = await asyncio.gather(*tasks, return_exceptions=True)
346
+ # return ScoreArray
347
+
348
+
349
+ # async def parallel_analyze_2(soups, sentences, ScoreArray):
350
+ # tasks = [[0] * len(ScoreArray[0]) for i in range(len(ScoreArray))]
351
+ # for i, soup in enumerate(soups):
352
+ # if soup:
353
+ # page_content = soup.text
354
+ # for j, sent in enumerate(sentences):
355
+ # print(
356
+ # f"Analyzing {i+1} of {len(soups)} soups with {j+1} of {len(sentences)} sentences........................"
357
+ # )
358
+ # tasks[i][j] = sentence_similarity(sent, page_content)
359
+ # else:
360
+ # print(
361
+ # f"Analyzed {i+1} of soups (SOUP FAILED)........................"
362
+ # )
363
+ # ScoreArray = await asyncio.gather(*tasks, return_exceptions=True)
364
+ # return ScoreArray