Spaces:
Sleeping
Sleeping
refactoring
Browse files- __pycache__/analysis.cpython-311.pyc +0 -0
- __pycache__/app.cpython-311.pyc +0 -0
- __pycache__/explainability.cpython-311.pyc +0 -0
- __pycache__/plagiarism.cpython-311.pyc +0 -0
- __pycache__/predictors.cpython-311.pyc +0 -0
- __pycache__/utils.cpython-311.pyc +0 -0
- analysis.py +98 -0
- app.py +16 -394
- explainability.py +119 -0
- plagiarism.py +10 -6
- predictors.py +246 -0
- requirements.txt +2 -2
- utils.py +327 -257
__pycache__/analysis.cpython-311.pyc
ADDED
Binary file (4.75 kB). View file
|
|
__pycache__/app.cpython-311.pyc
ADDED
Binary file (10.9 kB). View file
|
|
__pycache__/explainability.cpython-311.pyc
ADDED
Binary file (7.89 kB). View file
|
|
__pycache__/plagiarism.cpython-311.pyc
ADDED
Binary file (14.1 kB). View file
|
|
__pycache__/predictors.cpython-311.pyc
ADDED
Binary file (12 kB). View file
|
|
__pycache__/utils.cpython-311.pyc
ADDED
Binary file (3.76 kB). View file
|
|
analysis.py
ADDED
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import requests
|
2 |
+
import httpx
|
3 |
+
import torch
|
4 |
+
import re
|
5 |
+
from bs4 import BeautifulSoup
|
6 |
+
import numpy as np
|
7 |
+
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
8 |
+
import asyncio
|
9 |
+
from scipy.special import softmax
|
10 |
+
from evaluate import load
|
11 |
+
from datetime import date
|
12 |
+
import nltk
|
13 |
+
import fitz
|
14 |
+
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
|
15 |
+
import nltk, spacy, subprocess, torch
|
16 |
+
import plotly.graph_objects as go
|
17 |
+
import torch.nn.functional as F
|
18 |
+
import nltk
|
19 |
+
from unidecode import unidecode
|
20 |
+
import time
|
21 |
+
import yaml
|
22 |
+
import nltk
|
23 |
+
import os
|
24 |
+
from explainability import *
|
25 |
+
from dotenv import load_dotenv
|
26 |
+
import subprocess
|
27 |
+
|
28 |
+
import sys  # local to this setup block: used to re-invoke the running interpreter

# NLTK resources needed by the text helpers star-imported from explainability.
nltk.download("punkt")
nltk.download("stopwords")
load_dotenv()

# Model identifiers live in config.yaml rather than in code.
with open("config.yaml", "r") as file:
    params = yaml.safe_load(file)

device = "cuda" if torch.cuda.is_available() else "cpu"

# Language model + tokenizer used by depth_analysis for the perplexity metric.
readability_model_id = params["READABILITY_MODEL_ID"]
gpt2_model = GPT2LMHeadModel.from_pretrained(readability_model_id).to(device)
gpt2_tokenizer = GPT2TokenizerFast.from_pretrained(readability_model_id)

# Load the spaCy pipeline, downloading it only when it is not installed yet.
# The download is run with the interpreter that is executing this module
# (sys.executable) so the package lands in the active environment, and
# check=True surfaces a failed download instead of a confusing load error.
command = [sys.executable, "-m", "spacy", "download", "en_core_web_sm"]
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # spaCy reports a missing model package as OSError.
    subprocess.run(command, check=True)
    nlp = spacy.load("en_core_web_sm")
|
41 |
+
|
42 |
+
|
43 |
+
def depth_analysis(input_text):
    """Build a radar chart of writing-style metrics for ``input_text``.

    Six metrics are plotted on a shared 0-100 radial axis: readability
    (Gunning fog), syntactic tree depth, vocabulary richness (type-token
    ratio, already a percentage), perplexity, average sentence length and
    average word length. All but the type-token ratio are rescaled with
    ``normalize`` using fixed bounds.

    Returns:
        The plotly ``Figure`` holding a single filled ``Scatterpolar`` trace.
    """
    richness = vocabulary_richness_ttr(preprocess_text1(input_text))

    readability = normalize(
        calculate_gunning_fog(input_text), min_value=0, max_value=20
    )

    tokens, sents = preprocess_text2(input_text)
    mean_sentence_len = calculate_average_sentence_length(sents)
    mean_word_len = calculate_average_word_length(tokens)

    tree_depth = normalize(
        calculate_syntactic_tree_depth(nlp, input_text),
        min_value=0,
        max_value=10,
    )

    ppl = normalize(
        calculate_perplexity(input_text, gpt2_model, gpt2_tokenizer, device),
        min_value=0,
        max_value=30,
    )

    # Insertion order fixes the order of the axes around the radar chart.
    features = {
        "readability": readability,
        "syntactic tree depth": tree_depth,
        "vocabulary richness": richness,
        "perplexity": ppl,
        "average sentence length": normalize(
            mean_sentence_len, min_value=0, max_value=40
        ),
        "average word length": normalize(
            mean_word_len, min_value=0, max_value=8
        ),
    }

    radar = go.Scatterpolar(
        r=list(features.values()),
        theta=list(features.keys()),
        fill="toself",
        name="Radar Plot",
    )

    fig = go.Figure()
    fig.add_trace(radar)
    fig.update_layout(
        polar={"radialaxis": {"visible": True, "range": [0, 100]}},
        showlegend=False,
        margin={"l": 10, "r": 20, "b": 10, "t": 10},
    )
    return fig
|
app.py
CHANGED
@@ -1,286 +1,23 @@
|
|
1 |
-
from utils import (
|
2 |
-
cosineSim,
|
3 |
-
googleSearch,
|
4 |
-
getSentences,
|
5 |
-
parallel_scrap,
|
6 |
-
matchingScore,
|
7 |
-
)
|
8 |
import gradio as gr
|
9 |
-
from urllib.request import urlopen, Request
|
10 |
-
from googleapiclient.discovery import build
|
11 |
-
import requests
|
12 |
-
import httpx
|
13 |
-
import torch
|
14 |
-
import re
|
15 |
-
from bs4 import BeautifulSoup
|
16 |
import numpy as np
|
17 |
-
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
18 |
-
import asyncio
|
19 |
-
from scipy.special import softmax
|
20 |
-
from evaluate import load
|
21 |
from datetime import date
|
22 |
-
import
|
23 |
-
import
|
24 |
-
from
|
25 |
-
|
26 |
-
import
|
27 |
-
import torch.nn.functional as F
|
28 |
-
import nltk
|
29 |
-
from unidecode import unidecode
|
30 |
-
import time
|
31 |
-
from utils import cos_sim_torch, embed_text
|
32 |
-
import multiprocessing
|
33 |
-
from functools import partial
|
34 |
-
import concurrent.futures
|
35 |
-
from plagiarism import plagiarism_check
|
36 |
-
|
37 |
-
nltk.download("punkt")
|
38 |
-
|
39 |
-
from writing_analysis import (
|
40 |
-
normalize,
|
41 |
-
preprocess_text1,
|
42 |
-
preprocess_text2,
|
43 |
-
vocabulary_richness_ttr,
|
44 |
-
calculate_gunning_fog,
|
45 |
-
calculate_average_sentence_length,
|
46 |
-
calculate_average_word_length,
|
47 |
-
calculate_syntactic_tree_depth,
|
48 |
-
calculate_perplexity,
|
49 |
-
)
|
50 |
|
51 |
np.set_printoptions(suppress=True)
|
52 |
|
53 |
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
device = "cuda" if torch.cuda.is_available() else "cpu"
|
58 |
-
|
59 |
-
text_bc_model_path = "polygraf-ai/text-detect-bc-v11-4m"
|
60 |
-
text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path)
|
61 |
-
text_bc_model = AutoModelForSequenceClassification.from_pretrained(
|
62 |
-
text_bc_model_path
|
63 |
-
).to(device)
|
64 |
-
|
65 |
-
text_mc_model_path = (
|
66 |
-
"polygraf-ai/ai-text-detection-mc-robert-open-ai-detector-v4"
|
67 |
-
)
|
68 |
-
text_mc_tokenizer = AutoTokenizer.from_pretrained(text_mc_model_path)
|
69 |
-
text_mc_model = AutoModelForSequenceClassification.from_pretrained(
|
70 |
-
text_mc_model_path
|
71 |
-
).to(device)
|
72 |
-
|
73 |
-
quillbot_labels = ["Original", "QuillBot"]
|
74 |
-
quillbot_tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
|
75 |
-
quillbot_model = AutoModelForSequenceClassification.from_pretrained(
|
76 |
-
"polygraf-ai/quillbot-detector-28k"
|
77 |
-
).to(device)
|
78 |
-
|
79 |
-
|
80 |
-
def remove_accents(input_str):
|
81 |
-
text_no_accents = unidecode(input_str)
|
82 |
-
return text_no_accents
|
83 |
-
|
84 |
-
|
85 |
-
def remove_special_characters(text):
|
86 |
-
text = remove_accents(text)
|
87 |
-
pattern = r'[^\w\s\d.,!?\'"()-;]+'
|
88 |
-
text = re.sub(pattern, "", text)
|
89 |
-
return text
|
90 |
-
|
91 |
-
|
92 |
-
def remove_special_characters_2(text):
|
93 |
-
pattern = r"[^a-zA-Z0-9 ]+"
|
94 |
-
text = re.sub(pattern, "", text)
|
95 |
-
return text
|
96 |
-
|
97 |
-
|
98 |
-
def update_character_count(text):
|
99 |
-
return f"{len(text)} characters"
|
100 |
-
|
101 |
-
|
102 |
-
def split_text_allow_complete_sentences_nltk(
|
103 |
-
text,
|
104 |
-
max_length=256,
|
105 |
-
tolerance=30,
|
106 |
-
min_last_segment_length=100,
|
107 |
-
type_det="bc",
|
108 |
-
):
|
109 |
-
sentences = nltk.sent_tokenize(text)
|
110 |
-
segments = []
|
111 |
-
current_segment = []
|
112 |
-
current_length = 0
|
113 |
-
|
114 |
-
if type_det == "bc":
|
115 |
-
tokenizer = text_bc_tokenizer
|
116 |
-
max_length = 333
|
117 |
-
|
118 |
-
elif type_det == "mc":
|
119 |
-
tokenizer = text_mc_tokenizer
|
120 |
-
max_length = 256
|
121 |
-
|
122 |
-
for sentence in sentences:
|
123 |
-
tokens = tokenizer.tokenize(sentence)
|
124 |
-
sentence_length = len(tokens)
|
125 |
-
|
126 |
-
if current_length + sentence_length <= max_length + tolerance - 2:
|
127 |
-
current_segment.append(sentence)
|
128 |
-
current_length += sentence_length
|
129 |
-
else:
|
130 |
-
if current_segment:
|
131 |
-
encoded_segment = tokenizer.encode(
|
132 |
-
" ".join(current_segment),
|
133 |
-
add_special_tokens=True,
|
134 |
-
max_length=max_length + tolerance,
|
135 |
-
truncation=True,
|
136 |
-
)
|
137 |
-
segments.append((current_segment, len(encoded_segment)))
|
138 |
-
current_segment = [sentence]
|
139 |
-
current_length = sentence_length
|
140 |
-
|
141 |
-
if current_segment:
|
142 |
-
encoded_segment = tokenizer.encode(
|
143 |
-
" ".join(current_segment),
|
144 |
-
add_special_tokens=True,
|
145 |
-
max_length=max_length + tolerance,
|
146 |
-
truncation=True,
|
147 |
-
)
|
148 |
-
segments.append((current_segment, len(encoded_segment)))
|
149 |
-
|
150 |
-
final_segments = []
|
151 |
-
for i, (seg, length) in enumerate(segments):
|
152 |
-
if i == len(segments) - 1:
|
153 |
-
if length < min_last_segment_length and len(final_segments) > 0:
|
154 |
-
prev_seg, prev_length = final_segments[-1]
|
155 |
-
combined_encoded = tokenizer.encode(
|
156 |
-
" ".join(prev_seg + seg),
|
157 |
-
add_special_tokens=True,
|
158 |
-
max_length=max_length + tolerance,
|
159 |
-
truncation=True,
|
160 |
-
)
|
161 |
-
if len(combined_encoded) <= max_length + tolerance:
|
162 |
-
final_segments[-1] = (prev_seg + seg, len(combined_encoded))
|
163 |
-
else:
|
164 |
-
final_segments.append((seg, length))
|
165 |
-
else:
|
166 |
-
final_segments.append((seg, length))
|
167 |
-
else:
|
168 |
-
final_segments.append((seg, length))
|
169 |
-
|
170 |
-
decoded_segments = []
|
171 |
-
encoded_segments = []
|
172 |
-
for seg, _ in final_segments:
|
173 |
-
encoded_segment = tokenizer.encode(
|
174 |
-
" ".join(seg),
|
175 |
-
add_special_tokens=True,
|
176 |
-
max_length=max_length + tolerance,
|
177 |
-
truncation=True,
|
178 |
-
)
|
179 |
-
decoded_segment = tokenizer.decode(encoded_segment)
|
180 |
-
decoded_segments.append(decoded_segment)
|
181 |
-
return decoded_segments
|
182 |
-
|
183 |
-
|
184 |
-
def predict_quillbot(text):
|
185 |
-
with torch.no_grad():
|
186 |
-
quillbot_model.eval()
|
187 |
-
tokenized_text = quillbot_tokenizer(
|
188 |
-
text,
|
189 |
-
padding="max_length",
|
190 |
-
truncation=True,
|
191 |
-
max_length=256,
|
192 |
-
return_tensors="pt",
|
193 |
-
).to(device)
|
194 |
-
output = quillbot_model(**tokenized_text)
|
195 |
-
output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
|
196 |
-
q_score = {
|
197 |
-
"QuillBot": output_norm[1].item(),
|
198 |
-
"Original": output_norm[0].item(),
|
199 |
-
}
|
200 |
-
return q_score
|
201 |
-
|
202 |
-
|
203 |
-
def predict_bc(model, tokenizer, text):
|
204 |
-
with torch.no_grad():
|
205 |
-
model.eval()
|
206 |
-
tokens = text_bc_tokenizer(
|
207 |
-
text,
|
208 |
-
padding="max_length",
|
209 |
-
truncation=True,
|
210 |
-
max_length=333,
|
211 |
-
return_tensors="pt",
|
212 |
-
).to(device)
|
213 |
-
output = model(**tokens)
|
214 |
-
output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
|
215 |
-
print("BC Score: ", output_norm)
|
216 |
-
return output_norm
|
217 |
-
|
218 |
-
|
219 |
-
def predict_mc(model, tokenizer, text):
|
220 |
-
with torch.no_grad():
|
221 |
-
model.eval()
|
222 |
-
tokens = text_mc_tokenizer(
|
223 |
-
text,
|
224 |
-
padding="max_length",
|
225 |
-
truncation=True,
|
226 |
-
return_tensors="pt",
|
227 |
-
max_length=256,
|
228 |
-
).to(device)
|
229 |
-
output = model(**tokens)
|
230 |
-
output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
|
231 |
-
print("MC Score: ", output_norm)
|
232 |
-
return output_norm
|
233 |
-
|
234 |
-
|
235 |
-
def ai_generated_test(ai_option, input):
|
236 |
-
|
237 |
-
bc_scores = []
|
238 |
-
mc_scores = []
|
239 |
-
samples_len_bc = len(
|
240 |
-
split_text_allow_complete_sentences_nltk(input, type_det="bc")
|
241 |
-
)
|
242 |
-
samples_len_mc = len(
|
243 |
-
split_text_allow_complete_sentences_nltk(input, type_det="mc")
|
244 |
-
)
|
245 |
-
segments_bc = split_text_allow_complete_sentences_nltk(input, type_det="bc")
|
246 |
-
segments_mc = split_text_allow_complete_sentences_nltk(input, type_det="bc")
|
247 |
-
|
248 |
-
for i in range(samples_len_bc):
|
249 |
-
cleaned_text_bc = remove_special_characters(segments_bc[i])
|
250 |
-
bc_score = predict_bc(text_bc_model, text_bc_tokenizer, cleaned_text_bc)
|
251 |
-
bc_scores.append(bc_score)
|
252 |
-
|
253 |
-
for i in range(samples_len_mc):
|
254 |
-
cleaned_text_mc = remove_special_characters(segments_mc[i])
|
255 |
-
mc_score = predict_mc(text_mc_model, text_mc_tokenizer, cleaned_text_mc)
|
256 |
-
mc_scores.append(mc_score)
|
257 |
-
|
258 |
-
bc_scores_array = np.array(bc_scores)
|
259 |
-
mc_scores_array = np.array(mc_scores)
|
260 |
-
average_bc_scores = np.mean(bc_scores_array, axis=0)
|
261 |
-
average_mc_scores = np.mean(mc_scores_array, axis=0)
|
262 |
-
bc_score_list = average_bc_scores.tolist()
|
263 |
-
mc_score_list = average_mc_scores.tolist()
|
264 |
-
|
265 |
-
bc_score = {"AI": bc_score_list[1], "HUMAN": bc_score_list[0]}
|
266 |
-
mc_score = {}
|
267 |
-
label_map = ["OpenAI GPT", "Mistral", "CLAUDE", "Gemini", "LLAMA 2"]
|
268 |
-
|
269 |
-
for score, label in zip(mc_score_list, label_map):
|
270 |
-
mc_score[label.upper()] = score
|
271 |
-
|
272 |
-
sum_prob = 1 - bc_score["HUMAN"]
|
273 |
-
for key, value in mc_score.items():
|
274 |
-
mc_score[key] = value * sum_prob
|
275 |
-
|
276 |
-
if ai_option == "Human vs AI":
|
277 |
-
mc_score = {}
|
278 |
-
|
279 |
-
if sum_prob < 0.01:
|
280 |
-
mc_score = {}
|
281 |
-
return bc_score, mc_score
|
282 |
else:
|
283 |
-
return
|
|
|
|
|
|
|
284 |
|
285 |
|
286 |
# COMBINED
|
@@ -310,7 +47,8 @@ def main(
|
|
310 |
domains_to_skip,
|
311 |
)
|
312 |
depth_analysis_plot = depth_analysis(input)
|
313 |
-
bc_score
|
|
|
314 |
quilscore = predict_quillbot(input)
|
315 |
|
316 |
return (
|
@@ -322,120 +60,6 @@ def main(
|
|
322 |
)
|
323 |
|
324 |
|
325 |
-
def build_date(year, month, day):
|
326 |
-
return f"{year}{months[month]}{day}"
|
327 |
-
|
328 |
-
|
329 |
-
def len_validator(text):
|
330 |
-
min_tokens = 200
|
331 |
-
lengt = len(text_bc_tokenizer.tokenize(text=text, return_tensors="pt"))
|
332 |
-
if lengt < min_tokens:
|
333 |
-
return f"Warning! Input length is {lengt}. Please input a text that is greater than {min_tokens} tokens long. Recommended length {min_tokens*2} tokens."
|
334 |
-
else:
|
335 |
-
return f"Input length ({lengt}) is satisified."
|
336 |
-
|
337 |
-
|
338 |
-
def extract_text_from_pdf(pdf_path):
|
339 |
-
doc = fitz.open(pdf_path)
|
340 |
-
text = ""
|
341 |
-
for page in doc:
|
342 |
-
text += page.get_text()
|
343 |
-
return text
|
344 |
-
|
345 |
-
|
346 |
-
# DEPTH ANALYSIS
|
347 |
-
print("loading depth analysis")
|
348 |
-
nltk.download("stopwords")
|
349 |
-
nltk.download("punkt")
|
350 |
-
command = ["python3", "-m", "spacy", "download", "en_core_web_sm"]
|
351 |
-
# Execute the command
|
352 |
-
subprocess.run(command)
|
353 |
-
nlp = spacy.load("en_core_web_sm")
|
354 |
-
|
355 |
-
# for perplexity
|
356 |
-
model_id = "gpt2"
|
357 |
-
gpt2_model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
|
358 |
-
gpt2_tokenizer = GPT2TokenizerFast.from_pretrained(model_id)
|
359 |
-
|
360 |
-
|
361 |
-
def depth_analysis(input_text):
|
362 |
-
|
363 |
-
# vocanulary richness
|
364 |
-
processed_words = preprocess_text1(input_text)
|
365 |
-
ttr_value = vocabulary_richness_ttr(processed_words)
|
366 |
-
|
367 |
-
# readability
|
368 |
-
gunning_fog = calculate_gunning_fog(input_text)
|
369 |
-
gunning_fog_norm = normalize(gunning_fog, min_value=0, max_value=20)
|
370 |
-
|
371 |
-
# average sentence length and average word length
|
372 |
-
words, sentences = preprocess_text2(input_text)
|
373 |
-
average_sentence_length = calculate_average_sentence_length(sentences)
|
374 |
-
average_word_length = calculate_average_word_length(words)
|
375 |
-
average_sentence_length_norm = normalize(
|
376 |
-
average_sentence_length, min_value=0, max_value=40
|
377 |
-
)
|
378 |
-
average_word_length_norm = normalize(
|
379 |
-
average_word_length, min_value=0, max_value=8
|
380 |
-
)
|
381 |
-
|
382 |
-
# syntactic_tree_depth
|
383 |
-
average_tree_depth = calculate_syntactic_tree_depth(nlp, input_text)
|
384 |
-
average_tree_depth_norm = normalize(
|
385 |
-
average_tree_depth, min_value=0, max_value=10
|
386 |
-
)
|
387 |
-
|
388 |
-
# perplexity
|
389 |
-
perplexity = calculate_perplexity(
|
390 |
-
input_text, gpt2_model, gpt2_tokenizer, device
|
391 |
-
)
|
392 |
-
perplexity_norm = normalize(perplexity, min_value=0, max_value=30)
|
393 |
-
|
394 |
-
features = {
|
395 |
-
"readability": gunning_fog_norm,
|
396 |
-
"syntactic tree depth": average_tree_depth_norm,
|
397 |
-
"vocabulary richness": ttr_value,
|
398 |
-
"perplexity": perplexity_norm,
|
399 |
-
"average sentence length": average_sentence_length_norm,
|
400 |
-
"average word length": average_word_length_norm,
|
401 |
-
}
|
402 |
-
|
403 |
-
print(features)
|
404 |
-
|
405 |
-
fig = go.Figure()
|
406 |
-
|
407 |
-
fig.add_trace(
|
408 |
-
go.Scatterpolar(
|
409 |
-
r=list(features.values()),
|
410 |
-
theta=list(features.keys()),
|
411 |
-
fill="toself",
|
412 |
-
name="Radar Plot",
|
413 |
-
)
|
414 |
-
)
|
415 |
-
|
416 |
-
fig.update_layout(
|
417 |
-
polar=dict(
|
418 |
-
radialaxis=dict(
|
419 |
-
visible=True,
|
420 |
-
range=[0, 100],
|
421 |
-
)
|
422 |
-
),
|
423 |
-
showlegend=False,
|
424 |
-
# autosize=False,
|
425 |
-
# width=600,
|
426 |
-
# height=600,
|
427 |
-
margin=dict(
|
428 |
-
l=10,
|
429 |
-
r=20,
|
430 |
-
b=10,
|
431 |
-
t=10,
|
432 |
-
# pad=100
|
433 |
-
),
|
434 |
-
)
|
435 |
-
|
436 |
-
return fig
|
437 |
-
|
438 |
-
|
439 |
# START OF GRADIO
|
440 |
|
441 |
title = "Copyright Checker"
|
@@ -497,7 +121,7 @@ with gr.Blocks() as demo:
|
|
497 |
only_plagiarism_btn = gr.Button("Source Check")
|
498 |
|
499 |
with gr.Row():
|
500 |
-
quillbot_check = gr.Button("Humanized Text Check
|
501 |
|
502 |
with gr.Row():
|
503 |
depth_analysis_btn = gr.Button("Detailed Writing Analysis")
|
@@ -642,6 +266,4 @@ with gr.Blocks() as demo:
|
|
642 |
date_from = ""
|
643 |
date_to = ""
|
644 |
|
645 |
-
demo.launch(
|
646 |
-
share=True, server_name="0.0.0.0", auth=("polygraf-admin", "test@aisd")
|
647 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import gradio as gr
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
import numpy as np
|
|
|
|
|
|
|
|
|
3 |
from datetime import date
|
4 |
+
from predictors import predict_bc_scores, predict_mc_scores
|
5 |
+
from analysis import depth_analysis
|
6 |
+
from predictors import predict_quillbot
|
7 |
+
from plagiarism import plagiarism_check, build_date
|
8 |
+
from utils import extract_text_from_pdf, len_validator
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
|
10 |
np.set_printoptions(suppress=True)
|
11 |
|
12 |
|
13 |
+
def ai_generated_test(option, input):
    """Run the AI-detection predictors selected by ``option``.

    Returns:
        A 2-tuple ``(bc, mc)``. ``bc`` is always the result of
        ``predict_bc_scores(input)``. ``mc`` is ``None`` when ``option`` is
        exactly ``"Human vs AI"`` (the multi-class predictor is not called in
        that case); otherwise it is the result of ``predict_mc_scores(input)``.
    """
    bc_result = predict_bc_scores(input)
    if option == "Human vs AI":
        return bc_result, None
    return bc_result, predict_mc_scores(input)
|
21 |
|
22 |
|
23 |
# COMBINED
|
|
|
47 |
domains_to_skip,
|
48 |
)
|
49 |
depth_analysis_plot = depth_analysis(input)
|
50 |
+
bc_score = predict_bc_scores(input)
|
51 |
+
mc_score = predict_mc_scores(input)
|
52 |
quilscore = predict_quillbot(input)
|
53 |
|
54 |
return (
|
|
|
60 |
)
|
61 |
|
62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
63 |
# START OF GRADIO
|
64 |
|
65 |
title = "Copyright Checker"
|
|
|
121 |
only_plagiarism_btn = gr.Button("Source Check")
|
122 |
|
123 |
with gr.Row():
|
124 |
+
quillbot_check = gr.Button("Humanized Text Check")
|
125 |
|
126 |
with gr.Row():
|
127 |
depth_analysis_btn = gr.Button("Detailed Writing Analysis")
|
|
|
266 |
date_from = ""
|
267 |
date_to = ""
|
268 |
|
269 |
+
# NOTE(review): the basic-auth username/password are hard-coded in source and
# the app is launched with share=True. Consider reading them from environment
# variables / Space secrets instead of committing them — TODO confirm.
demo.launch(share=True, auth=("polygraf-admin", "test@aisd"))
|
|
|
|
explainability.py
ADDED
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re, textstat
|
2 |
+
from nltk import FreqDist
|
3 |
+
from nltk.corpus import stopwords
|
4 |
+
from nltk.tokenize import word_tokenize, sent_tokenize
|
5 |
+
import torch
|
6 |
+
import nltk
|
7 |
+
from tqdm import tqdm
|
8 |
+
|
9 |
+
# Fetch the NLTK "punkt" resource at import time; presumably required by the
# sent_tokenize / word_tokenize calls below — verify. Note that the stopwords
# corpus read via stopwords.words("english") is not downloaded in this module.
nltk.download("punkt")
|
10 |
+
|
11 |
+
|
12 |
+
def normalize(value, min_value, max_value):
    """Linearly rescale ``value`` from ``[min_value, max_value]`` to 0-100.

    Results outside the target interval are clamped to 0 or 100.
    ``max_value`` must differ from ``min_value``; equal bounds divide by zero.
    """
    span = max_value - min_value
    scaled = ((value - min_value) * 100) / span
    upper_capped = min(100, scaled)
    return max(0, upper_capped)
|
15 |
+
|
16 |
+
|
17 |
+
def preprocess_text1(text):
    """Return the content words of ``text`` as a list of lower-case strings.

    The text is lower-cased, every character that is neither a word character
    nor whitespace is removed, and the result is split on whitespace. English
    stop words and purely numeric tokens are then discarded.
    """
    lowered = text.lower()
    without_punct = re.sub(r"[^\w\s]", "", lowered)
    english_stop = set(stopwords.words("english"))
    return [
        tok
        for tok in without_punct.split()
        if tok not in english_stop and not tok.isdigit()
    ]
|
24 |
+
|
25 |
+
|
26 |
+
def vocabulary_richness_ttr(words):
    """Return the type-token ratio of ``words`` as a percentage.

    Args:
        words: Sequence of tokens (e.g. the output of ``preprocess_text1``).

    Returns:
        ``len(set(words)) / len(words) * 100``, or ``0.0`` when ``words`` is
        empty. An empty list happens for input that consists only of stop
        words, digits or punctuation and used to raise ``ZeroDivisionError``.
    """
    if not words:
        return 0.0
    return len(set(words)) / len(words) * 100
|
30 |
+
|
31 |
+
|
32 |
+
def calculate_gunning_fog(text):
    """Return textstat's Gunning fog index for ``text``.

    Original author's note: expected range 0-20.
    """
    return textstat.gunning_fog(text)
|
36 |
+
|
37 |
+
|
38 |
+
def calculate_automated_readability_index(text):
    """Return textstat's automated readability index for ``text``.

    Original author's note: expected range 1-20.
    """
    return textstat.automated_readability_index(text)
|
42 |
+
|
43 |
+
|
44 |
+
def calculate_flesch_reading_ease(text):
    """Return textstat's Flesch reading-ease score for ``text``.

    Original author's note: expected range 0-100.
    """
    return textstat.flesch_reading_ease(text)
|
48 |
+
|
49 |
+
|
50 |
+
def preprocess_text2(text):
    """Split ``text`` into sentences and a filtered, lower-cased word list.

    Returns:
        ``(words, sentences)``. ``sentences`` is the ``sent_tokenize`` output.
        ``words`` holds the lower-cased ``word_tokenize`` tokens of every
        sentence that are alphanumeric and not English stop words.
    """
    sentences = sent_tokenize(text)
    english_stop = set(stopwords.words("english"))

    words = []
    for sentence in sentences:
        for token in word_tokenize(sentence):
            lowered = token.lower()
            if token.isalnum() and lowered not in english_stop:
                words.append(lowered)
    return words, sentences
|
61 |
+
|
62 |
+
|
63 |
+
def calculate_average_sentence_length(sentences):
    """Return the mean number of ``word_tokenize`` tokens per sentence.

    Original author's note: values fall in roughly 0-40 or 50 "based on the
    histogram".

    Args:
        sentences: List of sentence strings.

    Returns:
        The exact mean token count, or ``0.0`` for an empty list. The earlier
        version avoided division by zero by adding ``0.0000001`` to the
        denominator, which slightly under-reported every non-empty result.
    """
    if not sentences:
        return 0.0
    total_tokens = sum(len(word_tokenize(sentence)) for sentence in sentences)
    return total_tokens / len(sentences)
|
68 |
+
|
69 |
+
|
70 |
+
def calculate_average_word_length(words):
    """Return the mean number of characters per word.

    Original author's note: values fall in roughly 0-8 "based on the
    histogram".

    Args:
        words: List of word strings.

    Returns:
        The exact mean length, or ``0.0`` for an empty list. The earlier
        version avoided division by zero by adding ``0.0000001`` to the
        denominator, which slightly under-reported every non-empty result
        (e.g. 1.9999998 instead of 2.0).
    """
    if not words:
        return 0.0
    return sum(len(word) for word in words) / len(words)
|
75 |
+
|
76 |
+
|
77 |
+
def calculate_max_depth(sent):
    """Return the largest ancestor count among the tokens of ``sent``.

    Args:
        sent: Iterable of tokens exposing an ``ancestors`` iterable
            (presumably a spaCy sentence span — confirm against the caller).

    Returns:
        The maximum number of ancestors of any token, or ``0`` when ``sent``
        has no tokens (a bare ``max()`` raised ``ValueError`` in that case).
    """
    return max((sum(1 for _ in token.ancestors) for token in sent), default=0)
|
79 |
+
|
80 |
+
|
81 |
+
def calculate_syntactic_tree_depth(nlp, text):
    """Return the mean per-sentence maximum tree depth of ``text``.

    ``nlp`` is called on ``text`` and ``calculate_max_depth`` is evaluated for
    every sentence in the result's ``sents``. Returns ``0`` when there are no
    sentences. Original author's note: values fall in roughly 0-10 "based on
    the histogram".
    """
    parsed = nlp(text)
    depths = [calculate_max_depth(sentence) for sentence in parsed.sents]
    if not depths:
        return 0
    return sum(depths) / len(depths)
|
89 |
+
|
90 |
+
|
91 |
+
def calculate_perplexity(text, model, tokenizer, device, stride=512):
    """Return a sliding-window perplexity of ``text`` under ``model``.

    The tokenized text is scored in windows of at most
    ``model.config.n_positions`` tokens that start every ``stride`` tokens.
    In each window only the tokens not covered by the previous window keep
    their labels; the rest are set to -100. The result is the exponential of
    the mean of the per-window losses, as a Python float.

    Original author's note: values fall in roughly 0-30 "based on the
    histogram".
    """
    token_ids = tokenizer(text, return_tensors="pt").input_ids
    window = model.config.n_positions
    total_tokens = token_ids.size(1)

    window_losses = []
    scored_upto = 0
    for start in tqdm(range(0, total_tokens, stride)):
        stop = min(start + window, total_tokens)
        # Number of tokens new to this window; may differ from stride on the
        # last iteration.
        fresh = stop - scored_upto
        chunk = token_ids[:, start:stop].to(device)
        labels = chunk.clone()
        # Overlap with the previous window serves as context only; -100 is
        # presumably the loss's ignore index — confirm against the model docs.
        labels[:, :-fresh] = -100

        with torch.no_grad():
            window_losses.append(model(chunk, labels=labels).loss)

        scored_upto = stop
        if stop == total_tokens:
            break

    return torch.exp(torch.stack(window_losses).mean()).item()
|
plagiarism.py
CHANGED
@@ -8,6 +8,7 @@ import asyncio
|
|
8 |
import httpx
|
9 |
from bs4 import BeautifulSoup
|
10 |
import numpy as np
|
|
|
11 |
|
12 |
|
13 |
WORD = re.compile(r"\w+")
|
@@ -129,7 +130,7 @@ def split_sentence_blocks(text):
|
|
129 |
sents = sent_tokenize(text)
|
130 |
two_sents = []
|
131 |
for i in range(len(sents)):
|
132 |
-
if (i %
|
133 |
two_sents.append(sents[i])
|
134 |
else:
|
135 |
two_sents[len(two_sents) - 1] += " " + sents[i]
|
@@ -188,9 +189,9 @@ async def parallel_scrap(urls):
|
|
188 |
return results
|
189 |
|
190 |
|
191 |
-
def matching_score(
|
192 |
-
sentence = remove_punc(
|
193 |
-
content = remove_punc(
|
194 |
if sentence in content:
|
195 |
return 1
|
196 |
else:
|
@@ -250,11 +251,14 @@ def plagiarism_check(
|
|
250 |
if soup:
|
251 |
page_content = soup.text
|
252 |
for j, sent in enumerate(sentences):
|
253 |
-
|
254 |
-
score = matching_score(
|
255 |
# score = cos_sim_torch(embed_text(sent), source_embeddings[i])
|
256 |
ScoreArray[i][j] = score
|
257 |
|
|
|
|
|
|
|
258 |
# *****IF THIS IS TO BE USED, PLEASE PROVIDE "preprocess()" FUNCTION IN LINE 248**************
|
259 |
# source_embeddings = []
|
260 |
# for i, soup in enumerate(soups):
|
|
|
8 |
import httpx
|
9 |
from bs4 import BeautifulSoup
|
10 |
import numpy as np
|
11 |
+
import concurrent
|
12 |
|
13 |
|
14 |
WORD = re.compile(r"\w+")
|
|
|
130 |
sents = sent_tokenize(text)
|
131 |
two_sents = []
|
132 |
for i in range(len(sents)):
|
133 |
+
if (i % 4) == 0:
|
134 |
two_sents.append(sents[i])
|
135 |
else:
|
136 |
two_sents[len(two_sents) - 1] += " " + sents[i]
|
|
|
189 |
return results
|
190 |
|
191 |
|
192 |
+
def matching_score(args_list):
|
193 |
+
sentence = remove_punc(args_list[0])
|
194 |
+
content = remove_punc(args_list[1])
|
195 |
if sentence in content:
|
196 |
return 1
|
197 |
else:
|
|
|
251 |
if soup:
|
252 |
page_content = soup.text
|
253 |
for j, sent in enumerate(sentences):
|
254 |
+
args_list = (sent, page_content)
|
255 |
+
score = matching_score(args_list)
|
256 |
# score = cos_sim_torch(embed_text(sent), source_embeddings[i])
|
257 |
ScoreArray[i][j] = score
|
258 |
|
259 |
+
# with concurrent.futures.ProcessPoolExecutor() as executor:
|
260 |
+
# results = executor.map(matching_score, args_list)
|
261 |
+
|
262 |
# *****IF THIS IS TO BE USED, PLEASE PROVIDE "preprocess()" FUNCTION IN LINE 248**************
|
263 |
# source_embeddings = []
|
264 |
# for i, soup in enumerate(soups):
|
predictors.py
ADDED
@@ -0,0 +1,246 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import requests
|
2 |
+
import httpx
|
3 |
+
import torch
|
4 |
+
import re
|
5 |
+
from bs4 import BeautifulSoup
|
6 |
+
import numpy as np
|
7 |
+
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
8 |
+
import asyncio
|
9 |
+
from evaluate import load
|
10 |
+
from datetime import date
|
11 |
+
import nltk
|
12 |
+
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
|
13 |
+
import plotly.graph_objects as go
|
14 |
+
import torch.nn.functional as F
|
15 |
+
import nltk
|
16 |
+
from unidecode import unidecode
|
17 |
+
import time
|
18 |
+
from scipy.special import softmax
|
19 |
+
import yaml
|
20 |
+
import os
|
21 |
+
from utils import *
|
22 |
+
from dotenv import load_dotenv
|
23 |
+
|
24 |
+
# --- configuration ---------------------------------------------------------
# Model paths, label lists and token budgets are read from config.yaml.
with open("config.yaml", "r") as file:
    params = yaml.safe_load(file)

# NLTK resources are fetched at import time.
nltk.download("punkt")
nltk.download("stopwords")
load_dotenv()

device = "cuda" if torch.cuda.is_available() else "cpu"

text_bc_model_path = params["TEXT_BC_MODEL_PATH"]
text_mc_model_path = params["TEXT_MC_MODEL_PATH"]
text_quillbot_model_path = params["TEXT_QUILLBOT_MODEL_PATH"]
quillbot_labels = params["QUILLBOT_LABELS"]
mc_label_map = params["MC_OUTPUT_LABELS"]
mc_token_size = int(params["MC_TOKEN_SIZE"])
bc_token_size = int(params["BC_TOKEN_SIZE"])

# --- binary classifier ("bc") ----------------------------------------------
text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path)
text_bc_model = AutoModelForSequenceClassification.from_pretrained(
    text_bc_model_path
)
text_bc_model = text_bc_model.to(device)

# --- multi-class classifier ("mc") -----------------------------------------
text_mc_tokenizer = AutoTokenizer.from_pretrained(text_mc_model_path)
text_mc_model = AutoModelForSequenceClassification.from_pretrained(
    text_mc_model_path
)
text_mc_model = text_mc_model.to(device)

# --- QuillBot detector -------------------------------------------------------
# Tokenizer and model are both loaded from the same configured path.
quillbot_tokenizer = AutoTokenizer.from_pretrained(text_quillbot_model_path)
quillbot_model = AutoModelForSequenceClassification.from_pretrained(
    text_quillbot_model_path
)
quillbot_model = quillbot_model.to(device)
|
49 |
+
|
50 |
+
|
51 |
+
def split_text_allow_complete_sentences_nltk(
    text,
    max_length=256,
    tolerance=30,
    min_last_segment_length=100,
    type_det="bc",
):
    """Split ``text`` into segments made of whole sentences.

    Sentences are packed greedily into a segment until adding the next one
    would exceed the token budget. A short trailing segment is folded into
    the previous one when the combined text still fits the budget.

    Args:
        text: The text to split.
        max_length: Kept for interface compatibility. It is overridden by
            ``bc_token_size`` or ``mc_token_size`` depending on ``type_det``.
        tolerance: Extra tokens allowed on top of the configured size.
        min_last_segment_length: A last segment with fewer encoded tokens
            than this is a candidate for merging into its predecessor.
        type_det: ``"bc"`` to use the binary-classifier tokenizer and size,
            ``"mc"`` to use the multi-class ones.

    Returns:
        A list of strings, one per segment, each produced by decoding the
        (truncated) token ids of the segment.
        NOTE(review): ``decode`` is called with its defaults, so the strings
        may carry the tokenizer's special-token text; confirm downstream
        consumers expect that.

    Raises:
        ValueError: If ``type_det`` is neither ``"bc"`` nor ``"mc"``.
    """
    # Resolve the tokenizer first so an unsupported mode fails immediately
    # with a clear message instead of an unbound-local error further down.
    if type_det == "bc":
        tokenizer = text_bc_tokenizer
        max_length = bc_token_size
    elif type_det == "mc":
        tokenizer = text_mc_tokenizer
        max_length = mc_token_size
    else:
        raise ValueError(
            f"Unsupported type_det {type_det!r}; expected 'bc' or 'mc'."
        )

    token_budget = max_length + tolerance

    def _encode(sentence_group, truncate=True):
        # Encode a group of sentences as one space-joined string.
        limits = {"max_length": token_budget, "truncation": True} if truncate else {}
        return tokenizer.encode(
            " ".join(sentence_group), add_special_tokens=True, **limits
        )

    # Greedy packing: keep adding sentences while the raw token count stays
    # within the budget (minus 2, presumably reserved for special tokens).
    segments = []
    current_segment = []
    current_length = 0
    for sentence in nltk.sent_tokenize(text):
        sentence_length = len(tokenizer.tokenize(sentence))
        if current_length + sentence_length <= token_budget - 2:
            current_segment.append(sentence)
            current_length += sentence_length
            continue
        if current_segment:
            segments.append((current_segment, len(_encode(current_segment))))
        current_segment = [sentence]
        current_length = sentence_length
    if current_segment:
        segments.append((current_segment, len(_encode(current_segment))))

    # Fold a short trailing segment into its predecessor, but only when the
    # combined text really fits. The length is measured WITHOUT truncation:
    # a truncated encoding can never exceed the budget, which made the old
    # check always true and silently cut the merged text.
    final_segments = list(segments)
    if len(final_segments) > 1 and final_segments[-1][1] < min_last_segment_length:
        prev_seg, _ = final_segments[-2]
        last_seg, _ = final_segments[-1]
        combined_length = len(_encode(prev_seg + last_seg, truncate=False))
        if combined_length <= token_budget:
            final_segments[-2:] = [(prev_seg + last_seg, combined_length)]

    return [tokenizer.decode(_encode(seg)) for seg, _ in final_segments]
|
128 |
+
|
129 |
+
|
130 |
+
def predict_quillbot(text):
    """Score ``text`` with the quillbot classifier.

    The text is padded/truncated to 256 tokens, run through
    ``quillbot_model`` without gradient tracking, and the logits are
    turned into probabilities with a softmax.

    Returns:
        A dict with keys ``"Humanized"`` (probability at index 1) and
        ``"Original"`` (probability at index 0).
    """
    quillbot_model.eval()
    with torch.no_grad():
        encoded = quillbot_tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=256,
            return_tensors="pt",
        )
        encoded = encoded.to(device)
        logits = quillbot_model(**encoded).logits
        probabilities = softmax(logits.detach().cpu().numpy(), 1)[0]
    original_prob = probabilities[0].item()
    humanized_prob = probabilities[1].item()
    return {"Humanized": humanized_prob, "Original": original_prob}
|
147 |
+
|
148 |
+
|
149 |
+
def predict_bc(model, tokenizer, text):
    """Return the softmax class probabilities of ``model`` for ``text``.

    Args:
        model: Sequence-classification model to evaluate.
        tokenizer: Tokenizer used to encode ``text``. The supplied tokenizer
            is now honoured; previously the global ``text_bc_tokenizer`` was
            used no matter what was passed in.
        text: The text to classify.

    Returns:
        A 1-D NumPy array of class probabilities for the single input.
    """
    model.eval()
    with torch.no_grad():
        # Pad/truncate to the configured binary-classifier token size.
        tokens = tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=bc_token_size,
            return_tensors="pt",
        ).to(device)
        output = model(**tokens)
    # Softmax over the class axis; [0] selects the only row of the batch.
    return softmax(output.logits.detach().cpu().numpy(), 1)[0]
|
162 |
+
|
163 |
+
|
164 |
+
def predict_mc(model, tokenizer, text):
    """Return the softmax class probabilities of ``model`` for ``text``.

    Args:
        model: Sequence-classification model to evaluate.
        tokenizer: Tokenizer used to encode ``text``. The supplied tokenizer
            is now honoured; previously the global ``text_mc_tokenizer`` was
            used no matter what was passed in.
        text: The text to classify.

    Returns:
        A 1-D NumPy array of class probabilities for the single input.
    """
    model.eval()
    with torch.no_grad():
        # Pad/truncate to the configured multi-class token size.
        tokens = tokenizer(
            text,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
            max_length=mc_token_size,
        ).to(device)
        output = model(**tokens)
    # Softmax over the class axis; [0] selects the only row of the batch.
    return softmax(output.logits.detach().cpu().numpy(), 1)[0]
|
177 |
+
|
178 |
+
|
179 |
+
def predict_mc_scores(input):
    """Return per-label scores for ``input``, scaled by the non-human share.

    The multi-class probabilities are averaged over all segments of the
    text and each is multiplied by ``1 - P(HUMAN)`` from the binary
    classifier.

    Args:
        input: The text to score.

    Returns:
        A dict mapping each upper-cased label from ``mc_label_map`` to its
        scaled score, or an empty dict when ``1 - P(HUMAN)`` is below 0.01.
    """
    # The binary step is exactly what predict_bc_scores computes, so reuse
    # it rather than repeating the segmentation and scoring inline.
    bc_score = predict_bc_scores(input)

    # Segment once; the text used to be split and tokenized twice just to
    # obtain the segment count.
    segments_mc = split_text_allow_complete_sentences_nltk(input, type_det="mc")
    mc_scores = [
        predict_mc(
            text_mc_model, text_mc_tokenizer, remove_special_characters(segment)
        )
        for segment in segments_mc
    ]
    mc_score_list = np.mean(np.array(mc_scores), axis=0).tolist()

    sum_prob = 1 - bc_score["HUMAN"]
    if sum_prob < 0.01:
        return {}
    return {
        label.upper(): score * sum_prob
        for score, label in zip(mc_score_list, mc_label_map)
    }
|
217 |
+
|
218 |
+
|
219 |
+
def predict_bc_scores(input):
    """Return averaged binary-classifier probabilities for ``input``.

    The text is split into segments, each segment is cleaned with
    ``remove_special_characters`` and scored with ``predict_bc``, and the
    per-segment probabilities are averaged.

    Args:
        input: The text to score.

    Returns:
        A dict ``{"AI": p_ai, "HUMAN": p_human}`` where ``p_ai`` is the
        averaged probability at index 1 and ``p_human`` the one at index 0.

    NOTE(review): a text that yields no segments still fails when the
    averaged result is indexed, as before; callers should validate length.
    """
    # Segment once; the text used to be split and tokenized twice just to
    # obtain the segment count.
    segments_bc = split_text_allow_complete_sentences_nltk(input, type_det="bc")
    bc_scores = [
        predict_bc(
            text_bc_model, text_bc_tokenizer, remove_special_characters(segment)
        )
        for segment in segments_bc
    ]
    bc_score_list = np.mean(np.array(bc_scores), axis=0).tolist()
    return {"AI": bc_score_list[1], "HUMAN": bc_score_list[0]}
|
235 |
+
|
236 |
+
|
237 |
+
# def predict_1on1(input):
|
238 |
+
# models = ['bard', 'claude', 'gpt4', 'mistral_ai', 'llama2']
|
239 |
+
# text = str(row["text"])
|
240 |
+
# predictions = {}
|
241 |
+
# prediction = predict(text, bard_model, bard_tokenizer) predictions['bard'] = prediction[1]
|
242 |
+
# prediction = predict(text, claude_model, claude_tokenizer) predictions['claude'] = prediction[1]
|
243 |
+
# prediction = predict(text, gpt4_model, gpt4_tokenizer) predictions['gpt4'] = prediction[1]
|
244 |
+
# prediction = predict(text, mistral_ai_model, mistral_ai_tokenizer) predictions['mistral_ai'] = prediction[1]
|
245 |
+
# prediction = predict(text, llama2_model, llama2_tokenizer) predictions['llama2'] = prediction[1]
|
246 |
+
# max_key = max(predictions, key=predictions.get)
|
requirements.txt
CHANGED
@@ -6,8 +6,8 @@ BeautifulSoup4
|
|
6 |
scrapingbee
|
7 |
requests
|
8 |
numpy
|
9 |
-
torch
|
10 |
-
transformers
|
11 |
transformers-interpret
|
12 |
textstat
|
13 |
scipy
|
|
|
6 |
scrapingbee
|
7 |
requests
|
8 |
numpy
|
9 |
+
torch
|
10 |
+
transformers
|
11 |
transformers-interpret
|
12 |
textstat
|
13 |
scipy
|
utils.py
CHANGED
@@ -11,284 +11,354 @@ import asyncio
|
|
11 |
import nltk
|
12 |
from sentence_transformers import SentenceTransformer, util
|
13 |
import threading
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
|
15 |
-
nltk.download('punkt')
|
16 |
|
17 |
WORD = re.compile(r"\w+")
|
18 |
-
model = SentenceTransformer(
|
19 |
|
20 |
|
21 |
# returns cosine similarity of two vectors
|
22 |
# input: two vectors
|
23 |
# output: integer between 0 and 1.
|
24 |
-
def get_cosine(vec1, vec2):
|
25 |
-
|
26 |
|
27 |
-
|
28 |
-
|
29 |
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
|
35 |
-
# checking for divide by zero
|
36 |
-
if denominator == 0:
|
37 |
-
return 0.0
|
38 |
-
else:
|
39 |
-
return float(numerator) / denominator
|
40 |
-
|
41 |
-
|
42 |
-
# converts given text into a vector
|
43 |
-
def text_to_vector(text):
|
44 |
-
# uses the Regular expression above and gets all words
|
45 |
-
words = WORD.findall(text)
|
46 |
-
# returns a counter of all the words (count of number of occurences)
|
47 |
-
return Counter(words)
|
48 |
-
|
49 |
-
|
50 |
-
# returns cosine similarity of two words
|
51 |
-
# uses: text_to_vector(text) and get_cosine(v1,v2)
|
52 |
-
def cosineSim(text1, text2):
|
53 |
-
vector1 = text_to_vector(text1)
|
54 |
-
vector2 = text_to_vector(text2)
|
55 |
-
# print vector1,vector2
|
56 |
-
cosine = get_cosine(vector1, vector2)
|
57 |
-
return cosine
|
58 |
-
|
59 |
-
def cos_sim_torch(embedding_1, embedding_2):
|
60 |
-
return util.pytorch_cos_sim(embedding_1, embedding_2).item()
|
61 |
-
|
62 |
-
def embed_text(text):
|
63 |
-
return model.encode(text, convert_to_tensor=True)
|
64 |
-
|
65 |
-
def sentence_similarity(text1, text2):
|
66 |
-
embedding_1= model.encode(text1, convert_to_tensor=True)
|
67 |
-
embedding_2 = model.encode(text2, convert_to_tensor=True)
|
68 |
-
|
69 |
-
o = util.pytorch_cos_sim(embedding_1, embedding_2)
|
70 |
-
return o.item()
|
71 |
-
|
72 |
-
def get_soup_requests(url):
|
73 |
-
page = requests.get(url)
|
74 |
-
if page.status_code == 200:
|
75 |
-
soup = BeautifulSoup(page.content, "html.parser")
|
76 |
-
return soup
|
77 |
-
print("HTML soup failed")
|
78 |
-
return None
|
79 |
-
|
80 |
-
|
81 |
-
def get_soup_httpx(url):
|
82 |
-
client = httpx.Client(timeout=30)
|
83 |
-
try:
|
84 |
-
page = client.get(url)
|
85 |
-
if page.status_code == httpx.codes.OK:
|
86 |
-
soup = BeautifulSoup(page.content, "html.parser")
|
87 |
-
return soup
|
88 |
-
except:
|
89 |
-
print("HTTPx soup failed")
|
90 |
-
return None
|
91 |
-
|
92 |
-
def getSentences(text):
|
93 |
-
from nltk.tokenize import sent_tokenize
|
94 |
-
|
95 |
-
sents = sent_tokenize(text)
|
96 |
-
two_sents = []
|
97 |
-
for i in range(len(sents)):
|
98 |
-
if (i % 2) == 0:
|
99 |
-
two_sents.append(sents[i])
|
100 |
-
else:
|
101 |
-
two_sents[len(two_sents) - 1] += " " + sents[i]
|
102 |
-
return two_sents
|
103 |
-
|
104 |
-
|
105 |
-
def googleSearch(
|
106 |
-
plag_option,
|
107 |
-
sentences,
|
108 |
-
urlCount,
|
109 |
-
scoreArray,
|
110 |
-
urlList,
|
111 |
-
sorted_date,
|
112 |
-
domains_to_skip,
|
113 |
-
api_key,
|
114 |
-
cse_id,
|
115 |
-
**kwargs,
|
116 |
-
):
|
117 |
-
service = build("customsearch", "v1", developerKey=api_key)
|
118 |
-
for i, sentence in enumerate(sentences):
|
119 |
-
results = (
|
120 |
-
service.cse()
|
121 |
-
.list(q=sentence, cx=cse_id, sort=sorted_date, **kwargs)
|
122 |
-
.execute()
|
123 |
-
)
|
124 |
-
if "items" in results and len(results["items"]) > 0:
|
125 |
-
for count, link in enumerate(results["items"]):
|
126 |
-
# stop after 3 pages
|
127 |
-
if count >= 3:
|
128 |
-
break
|
129 |
-
# skip user selected domains
|
130 |
-
if any(
|
131 |
-
("." + domain) in link["link"]
|
132 |
-
for domain in domains_to_skip
|
133 |
-
):
|
134 |
-
continue
|
135 |
-
# clean up snippet of '...'
|
136 |
-
snippet = link["snippet"]
|
137 |
-
ind = snippet.find("...")
|
138 |
-
if ind < 20 and ind > 9:
|
139 |
-
snippet = snippet[ind + len("... ") :]
|
140 |
-
ind = snippet.find("...")
|
141 |
-
if ind > len(snippet) - 5:
|
142 |
-
snippet = snippet[:ind]
|
143 |
-
|
144 |
-
# update cosine similarity between snippet and given text
|
145 |
-
url = link["link"]
|
146 |
-
if url not in urlList:
|
147 |
-
urlList.append(url)
|
148 |
-
scoreArray.append([0] * len(sentences))
|
149 |
-
urlCount[url] = urlCount[url] + 1 if url in urlCount else 1
|
150 |
-
if plag_option == 'Standard':
|
151 |
-
scoreArray[urlList.index(url)][i] = cosineSim(
|
152 |
-
sentence, snippet)
|
153 |
-
else :
|
154 |
-
scoreArray[urlList.index(url)][i] = sentence_similarity(
|
155 |
-
sentence, snippet
|
156 |
-
)
|
157 |
-
else:
|
158 |
-
print("Google Search failed")
|
159 |
-
return urlCount, scoreArray
|
160 |
-
|
161 |
-
|
162 |
-
def getQueries(text, n):
|
163 |
-
# return n-grams of size n
|
164 |
-
words = text.split()
|
165 |
-
return [words[i : i + n] for i in range(len(words) - n + 1)]
|
166 |
-
|
167 |
-
|
168 |
-
def print2D(array):
|
169 |
-
print(np.array(array))
|
170 |
-
|
171 |
-
|
172 |
-
def removePunc(text):
|
173 |
-
res = re.sub(r"[^\w\s]", "", text)
|
174 |
-
return res
|
175 |
-
|
176 |
-
|
177 |
-
async def get_url_data(url, client):
|
178 |
-
try:
|
179 |
-
r = await client.get(url)
|
180 |
-
# print(r.status_code)
|
181 |
-
if r.status_code == 200:
|
182 |
-
# print("in")
|
183 |
-
soup = BeautifulSoup(r.content, "html.parser")
|
184 |
-
return soup
|
185 |
-
except Exception:
|
186 |
-
print("HTTPx parallel soup failed")
|
187 |
-
return None
|
188 |
-
|
189 |
-
|
190 |
-
async def parallel_scrap(urls):
|
191 |
-
async with httpx.AsyncClient(timeout=30) as client:
|
192 |
-
tasks = []
|
193 |
-
for url in urls:
|
194 |
-
tasks.append(get_url_data(url=url, client=client))
|
195 |
-
results = await asyncio.gather(*tasks, return_exceptions=True)
|
196 |
-
return results
|
197 |
-
|
198 |
-
|
199 |
-
class TimeoutError(Exception):
|
200 |
-
pass
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
def matchingScore(sentence, content):
|
205 |
-
if sentence in content:
|
206 |
-
return 1
|
207 |
-
sentence = removePunc(sentence)
|
208 |
-
content = removePunc(content)
|
209 |
-
if sentence in content:
|
210 |
-
return 1
|
211 |
-
else:
|
212 |
-
n = 5
|
213 |
-
ngrams = getQueries(sentence, n)
|
214 |
-
if len(ngrams) == 0:
|
215 |
-
return 0
|
216 |
-
matched = [x for x in ngrams if " ".join(x) in content]
|
217 |
-
return len(matched) / len(ngrams)
|
218 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
219 |
|
220 |
-
# def matchingScoreWithTimeout(sentence, content):
|
221 |
-
# def timeout_handler():
|
222 |
-
# raise TimeoutError("Function timed out")
|
223 |
|
224 |
-
#
|
225 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
226 |
# try:
|
227 |
-
#
|
228 |
-
#
|
229 |
-
#
|
230 |
-
#
|
231 |
-
# except
|
232 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
233 |
|
|
|
|
|
234 |
|
235 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
236 |
# content = removePunc(content)
|
237 |
# for j, sentence in enumerate(sentences):
|
238 |
# sentence = removePunc(sentence)
|
239 |
-
#
|
240 |
-
#
|
241 |
-
# else:
|
242 |
-
# n = 5
|
243 |
-
# ngrams = getQueries(sentence, n)
|
244 |
-
# if len(ngrams) == 0:
|
245 |
-
# return 0
|
246 |
-
# matched = [x for x in ngrams if " ".join(x) in content]
|
247 |
-
# ScoreArray[content_idx][j] = len(matched) / len(ngrams)
|
248 |
# print(
|
249 |
-
# f"Analyzed {content_idx+1} of
|
250 |
# )
|
251 |
# return ScoreArray
|
252 |
|
253 |
-
|
254 |
-
|
255 |
-
|
256 |
-
|
257 |
-
|
258 |
-
|
259 |
-
|
260 |
-
|
261 |
-
|
262 |
-
|
263 |
-
|
264 |
-
|
265 |
-
|
266 |
-
|
267 |
-
|
268 |
-
|
269 |
-
|
270 |
-
|
271 |
-
|
272 |
-
|
273 |
-
|
274 |
-
|
275 |
-
|
276 |
-
|
277 |
-
|
278 |
-
|
279 |
-
|
280 |
-
|
281 |
-
|
282 |
-
|
283 |
-
|
284 |
-
|
285 |
-
|
286 |
-
f"Analyzing {i+1} of {len(soups)} soups with {j+1} of {len(sentences)} sentences........................"
|
287 |
-
)
|
288 |
-
tasks[i][j] = sentence_similarity(sent, page_content)
|
289 |
-
else:
|
290 |
-
print(
|
291 |
-
f"Analyzed {i+1} of soups (SOUP FAILED)........................"
|
292 |
-
)
|
293 |
-
ScoreArray = await asyncio.gather(*tasks, return_exceptions=True)
|
294 |
-
return ScoreArray
|
|
|
11 |
import nltk
|
12 |
from sentence_transformers import SentenceTransformer, util
|
13 |
import threading
|
14 |
+
import torch
|
15 |
+
import re
|
16 |
+
import numpy as np
|
17 |
+
import asyncio
|
18 |
+
from datetime import date
|
19 |
+
import nltk
|
20 |
+
from unidecode import unidecode
|
21 |
+
from scipy.special import softmax
|
22 |
+
from transformers import AutoTokenizer
|
23 |
+
import yaml
|
24 |
+
import fitz
|
25 |
+
import os
|
26 |
+
|
27 |
+
|
28 |
+
def remove_accents(input_str):
    """Return ``input_str`` passed through ``unidecode``."""
    return unidecode(input_str)
|
31 |
+
|
32 |
+
|
33 |
+
def remove_special_characters(text):
    """Strip characters outside a small punctuation whitelist from ``text``.

    The text is first passed through ``remove_accents``. Everything that is
    not a word character, whitespace, or one of ``. , ! ? ' " ( ) ; -`` is
    then removed.

    The hyphen now sits at the end of the character class so it is a
    literal. Previously ``)-;`` formed a range that also let ``* + / :``
    through unintentionally; those characters are now removed.

    Args:
        text: The text to clean.

    Returns:
        The cleaned text.
    """
    text = remove_accents(text)
    # `\w` already covers digits, so no separate `\d` is needed.
    pattern = r'[^\w\s.,!?\'"();-]+'
    return re.sub(pattern, "", text)
|
38 |
+
|
39 |
+
|
40 |
+
def remove_special_characters_2(text):
    """Drop every character that is not an ASCII letter, digit, or space."""
    return re.sub(r"[^a-zA-Z0-9 ]+", "", text)
|
44 |
+
|
45 |
+
|
46 |
+
def update_character_count(text):
    """Return a label of the form ``"<n> characters"`` for ``text``."""
    count = len(text)
    return f"{count} characters"
|
48 |
+
|
49 |
+
|
50 |
+
# Fetch the NLTK "punkt" resource at import time.
nltk.download("punkt")


# Read the project configuration; only the binary-classifier model path is
# used in this section.
with open("config.yaml", "r") as file:
    params = yaml.safe_load(file)

text_bc_model_path = params["TEXT_BC_MODEL_PATH"]

# Tokenizer used by len_validator to count tokens.
text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path)
|
59 |
+
|
60 |
+
|
61 |
+
def len_validator(text):
    """Report whether ``text`` meets the minimum token count.

    Tokens are counted with ``text_bc_tokenizer.tokenize``. The stray
    ``return_tensors="pt"`` argument was dropped: ``tokenize`` returns a
    list of token strings and has no use for it.

    Args:
        text: The text to check.

    Returns:
        A warning message when the text has fewer than 200 tokens,
        otherwise a confirmation message.
    """
    min_tokens = 200
    lengt = len(text_bc_tokenizer.tokenize(text=text))
    if lengt < min_tokens:
        return f"Warning! Input length is {lengt}. Please input a text that is greater than {min_tokens} tokens long. Recommended length {min_tokens*2} tokens."
    return f"Input length ({lengt}) is satisified."
|
68 |
+
|
69 |
+
|
70 |
+
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_path: Path of the document, passed to ``fitz.open``.

    Returns:
        The text of all pages joined in page order, with no separator.
    """
    # The context manager closes the document even if a page fails to
    # extract; the handle was previously never closed.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)
|
76 |
|
|
|
77 |
|
78 |
WORD = re.compile(r"\w+")
|
79 |
+
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
|
80 |
|
81 |
|
82 |
# returns cosine similarity of two vectors
|
83 |
# input: two vectors
|
84 |
# output: float between 0 and 1.
|
85 |
+
# def get_cosine(vec1, vec2):
|
86 |
+
# intersection = set(vec1.keys()) & set(vec2.keys())
|
87 |
|
88 |
+
# # calculating numerator
|
89 |
+
# numerator = sum([vec1[x] * vec2[x] for x in intersection])
|
90 |
|
91 |
+
# # calculating denominator
|
92 |
+
# sum1 = sum([vec1[x] ** 2 for x in vec1.keys()])
|
93 |
+
# sum2 = sum([vec2[x] ** 2 for x in vec2.keys()])
|
94 |
+
# denominator = math.sqrt(sum1) * math.sqrt(sum2)
|
95 |
+
|
96 |
+
# # checking for divide by zero
|
97 |
+
# if denominator == 0:
|
98 |
+
# return 0.0
|
99 |
+
# else:
|
100 |
+
# return float(numerator) / denominator
|
101 |
+
|
102 |
+
|
103 |
+
# # converts given text into a vector
|
104 |
+
# def text_to_vector(text):
|
105 |
+
# # uses the Regular expression above and gets all words
|
106 |
+
# words = WORD.findall(text)
|
107 |
+
# # returns a counter of all the words (count of number of occurrences)
|
108 |
+
# return Counter(words)
|
109 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
110 |
|
111 |
+
# # returns cosine similarity of two words
|
112 |
+
# # uses: text_to_vector(text) and get_cosine(v1,v2)
|
113 |
+
# def cosineSim(text1, text2):
|
114 |
+
# vector1 = text_to_vector(text1)
|
115 |
+
# vector2 = text_to_vector(text2)
|
116 |
+
# # print vector1,vector2
|
117 |
+
# cosine = get_cosine(vector1, vector2)
|
118 |
+
# return cosine
|
119 |
|
|
|
|
|
|
|
120 |
|
121 |
+
# def cos_sim_torch(embedding_1, embedding_2):
|
122 |
+
# return util.pytorch_cos_sim(embedding_1, embedding_2).item()
|
123 |
+
|
124 |
+
|
125 |
+
# def embed_text(text):
|
126 |
+
# return model.encode(text, convert_to_tensor=True)
|
127 |
+
|
128 |
+
|
129 |
+
# def sentence_similarity(text1, text2):
|
130 |
+
# embedding_1 = model.encode(text1, convert_to_tensor=True)
|
131 |
+
# embedding_2 = model.encode(text2, convert_to_tensor=True)
|
132 |
+
|
133 |
+
# o = util.pytorch_cos_sim(embedding_1, embedding_2)
|
134 |
+
# return o.item()
|
135 |
+
|
136 |
+
|
137 |
+
# def get_soup_requests(url):
|
138 |
+
# page = requests.get(url)
|
139 |
+
# if page.status_code == 200:
|
140 |
+
# soup = BeautifulSoup(page.content, "html.parser")
|
141 |
+
# return soup
|
142 |
+
# print("HTML soup failed")
|
143 |
+
# return None
|
144 |
+
|
145 |
+
|
146 |
+
# def get_soup_httpx(url):
|
147 |
+
# client = httpx.Client(timeout=30)
|
148 |
# try:
|
149 |
+
# page = client.get(url)
|
150 |
+
# if page.status_code == httpx.codes.OK:
|
151 |
+
# soup = BeautifulSoup(page.content, "html.parser")
|
152 |
+
# return soup
|
153 |
+
# except:
|
154 |
+
# print("HTTPx soup failed")
|
155 |
+
# return None
|
156 |
+
|
157 |
+
|
158 |
+
# def getSentences(text):
|
159 |
+
# from nltk.tokenize import sent_tokenize
|
160 |
+
|
161 |
+
# sents = sent_tokenize(text)
|
162 |
+
# two_sents = []
|
163 |
+
# for i in range(len(sents)):
|
164 |
+
# if (i % 2) == 0:
|
165 |
+
# two_sents.append(sents[i])
|
166 |
+
# else:
|
167 |
+
# two_sents[len(two_sents) - 1] += " " + sents[i]
|
168 |
+
# return two_sents
|
169 |
+
|
170 |
+
|
171 |
+
# def googleSearch(
|
172 |
+
# plag_option,
|
173 |
+
# sentences,
|
174 |
+
# urlCount,
|
175 |
+
# scoreArray,
|
176 |
+
# urlList,
|
177 |
+
# sorted_date,
|
178 |
+
# domains_to_skip,
|
179 |
+
# api_key,
|
180 |
+
# cse_id,
|
181 |
+
# **kwargs,
|
182 |
+
# ):
|
183 |
+
# service = build("customsearch", "v1", developerKey=api_key)
|
184 |
+
# for i, sentence in enumerate(sentences):
|
185 |
+
# results = (
|
186 |
+
# service.cse()
|
187 |
+
# .list(q=sentence, cx=cse_id, sort=sorted_date, **kwargs)
|
188 |
+
# .execute()
|
189 |
+
# )
|
190 |
+
# if "items" in results and len(results["items"]) > 0:
|
191 |
+
# for count, link in enumerate(results["items"]):
|
192 |
+
# # stop after 3 pages
|
193 |
+
# if count >= 3:
|
194 |
+
# break
|
195 |
+
# # skip user selected domains
|
196 |
+
# if any(
|
197 |
+
# ("." + domain) in link["link"] for domain in domains_to_skip
|
198 |
+
# ):
|
199 |
+
# continue
|
200 |
+
# # clean up snippet of '...'
|
201 |
+
# snippet = link["snippet"]
|
202 |
+
# ind = snippet.find("...")
|
203 |
+
# if ind < 20 and ind > 9:
|
204 |
+
# snippet = snippet[ind + len("... ") :]
|
205 |
+
# ind = snippet.find("...")
|
206 |
+
# if ind > len(snippet) - 5:
|
207 |
+
# snippet = snippet[:ind]
|
208 |
+
|
209 |
+
# # update cosine similarity between snippet and given text
|
210 |
+
# url = link["link"]
|
211 |
+
# if url not in urlList:
|
212 |
+
# urlList.append(url)
|
213 |
+
# scoreArray.append([0] * len(sentences))
|
214 |
+
# urlCount[url] = urlCount[url] + 1 if url in urlCount else 1
|
215 |
+
# if plag_option == "Standard":
|
216 |
+
# scoreArray[urlList.index(url)][i] = cosineSim(
|
217 |
+
# sentence, snippet
|
218 |
+
# )
|
219 |
+
# else:
|
220 |
+
# scoreArray[urlList.index(url)][i] = sentence_similarity(
|
221 |
+
# sentence, snippet
|
222 |
+
# )
|
223 |
+
# else:
|
224 |
+
# print("Google Search failed")
|
225 |
+
# return urlCount, scoreArray
|
226 |
+
|
227 |
+
|
228 |
+
# def getQueries(text, n):
|
229 |
+
# # return n-grams of size n
|
230 |
+
# words = text.split()
|
231 |
+
# return [words[i : i + n] for i in range(len(words) - n + 1)]
|
232 |
+
|
233 |
|
234 |
+
# def print2D(array):
|
235 |
+
# print(np.array(array))
|
236 |
|
237 |
+
|
238 |
+
# def removePunc(text):
|
239 |
+
# res = re.sub(r"[^\w\s]", "", text)
|
240 |
+
# return res
|
241 |
+
|
242 |
+
|
243 |
+
# async def get_url_data(url, client):
|
244 |
+
# try:
|
245 |
+
# r = await client.get(url)
|
246 |
+
# # print(r.status_code)
|
247 |
+
# if r.status_code == 200:
|
248 |
+
# # print("in")
|
249 |
+
# soup = BeautifulSoup(r.content, "html.parser")
|
250 |
+
# return soup
|
251 |
+
# except Exception:
|
252 |
+
# print("HTTPx parallel soup failed")
|
253 |
+
# return None
|
254 |
+
|
255 |
+
|
256 |
+
# async def parallel_scrap(urls):
|
257 |
+
# async with httpx.AsyncClient(timeout=30) as client:
|
258 |
+
# tasks = []
|
259 |
+
# for url in urls:
|
260 |
+
# tasks.append(get_url_data(url=url, client=client))
|
261 |
+
# results = await asyncio.gather(*tasks, return_exceptions=True)
|
262 |
+
# return results
|
263 |
+
|
264 |
+
|
265 |
+
# class TimeoutError(Exception):
|
266 |
+
# pass
|
267 |
+
|
268 |
+
|
269 |
+
# def matchingScore(sentence, content):
|
270 |
+
# if sentence in content:
|
271 |
+
# return 1
|
272 |
+
# sentence = removePunc(sentence)
|
273 |
+
# content = removePunc(content)
|
274 |
+
# if sentence in content:
|
275 |
+
# return 1
|
276 |
+
# else:
|
277 |
+
# n = 5
|
278 |
+
# ngrams = getQueries(sentence, n)
|
279 |
+
# if len(ngrams) == 0:
|
280 |
+
# return 0
|
281 |
+
# matched = [x for x in ngrams if " ".join(x) in content]
|
282 |
+
# return len(matched) / len(ngrams)
|
283 |
+
|
284 |
+
|
285 |
+
# # def matchingScoreWithTimeout(sentence, content):
|
286 |
+
# # def timeout_handler():
|
287 |
+
# # raise TimeoutError("Function timed out")
|
288 |
+
|
289 |
+
# # timer = threading.Timer(10, timeout_handler) # Set a timer for 10 seconds
|
290 |
+
# # timer.start()
|
291 |
+
# # try:
|
292 |
+
# # score = sentence_similarity(sentence, content)
|
293 |
+
# # # score = matchingScore(sentence, content)
|
294 |
+
# # timer.cancel() # Cancel the timer if calculation completes before timeout
|
295 |
+
# # return score
|
296 |
+
# # except TimeoutError:
|
297 |
+
# # return 0
|
298 |
+
|
299 |
+
|
300 |
+
# # async def matchingScoreAsync(sentences, content, content_idx, ScoreArray):
|
301 |
+
# # content = removePunc(content)
|
302 |
+
# # for j, sentence in enumerate(sentences):
|
303 |
+
# # sentence = removePunc(sentence)
|
304 |
+
# # if sentence in content:
|
305 |
+
# # ScoreArray[content_idx][j] = 1
|
306 |
+
# # else:
|
307 |
+
# # n = 5
|
308 |
+
# # ngrams = getQueries(sentence, n)
|
309 |
+
# # if len(ngrams) == 0:
|
310 |
+
# # return 0
|
311 |
+
# # matched = [x for x in ngrams if " ".join(x) in content]
|
312 |
+
# # ScoreArray[content_idx][j] = len(matched) / len(ngrams)
|
313 |
+
# # print(
|
314 |
+
# # f"Analyzed {content_idx+1} of soups (SOUP SUCCEEDED)........................"
|
315 |
+
# # )
|
316 |
+
# # return ScoreArray
|
317 |
+
|
318 |
+
|
319 |
+
# async def matchingScoreAsync(
|
320 |
+
# sentences, content, content_idx, ScoreArray, model, util
|
321 |
+
# ):
|
322 |
# content = removePunc(content)
|
323 |
# for j, sentence in enumerate(sentences):
|
324 |
# sentence = removePunc(sentence)
|
325 |
+
# similarity_score = sentence_similarity(sentence, content, model, util)
|
326 |
+
# ScoreArray[content_idx][j] = similarity_score
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
327 |
# print(
|
328 |
+
# f"Analyzed {content_idx+1} of contents (CONTENT ANALYZED)........................"
|
329 |
# )
|
330 |
# return ScoreArray
|
331 |
|
332 |
+
|
333 |
+
# async def parallel_analyze(soups, sentences, ScoreArray):
|
334 |
+
# tasks = []
|
335 |
+
# for i, soup in enumerate(soups):
|
336 |
+
# if soup:
|
337 |
+
# page_content = soup.text
|
338 |
+
# tasks.append(
|
339 |
+
# matchingScoreAsync(sentences, page_content, i, ScoreArray)
|
340 |
+
# )
|
341 |
+
# else:
|
342 |
+
# print(
|
343 |
+
# f"Analyzed {i+1} of soups (SOUP FAILED)........................"
|
344 |
+
# )
|
345 |
+
# ScoreArray = await asyncio.gather(*tasks, return_exceptions=True)
|
346 |
+
# return ScoreArray
|
347 |
+
|
348 |
+
|
349 |
+
# async def parallel_analyze_2(soups, sentences, ScoreArray):
|
350 |
+
# tasks = [[0] * len(ScoreArray[0]) for i in range(len(ScoreArray))]
|
351 |
+
# for i, soup in enumerate(soups):
|
352 |
+
# if soup:
|
353 |
+
# page_content = soup.text
|
354 |
+
# for j, sent in enumerate(sentences):
|
355 |
+
# print(
|
356 |
+
# f"Analyzing {i+1} of {len(soups)} soups with {j+1} of {len(sentences)} sentences........................"
|
357 |
+
# )
|
358 |
+
# tasks[i][j] = sentence_similarity(sent, page_content)
|
359 |
+
# else:
|
360 |
+
# print(
|
361 |
+
# f"Analyzed {i+1} of soups (SOUP FAILED)........................"
|
362 |
+
# )
|
363 |
+
# ScoreArray = await asyncio.gather(*tasks, return_exceptions=True)
|
364 |
+
# return ScoreArray
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|