aliasgerovs committed:
Merge branch 'main' into demo
app.py CHANGED
@@ -19,10 +19,9 @@ import nltk, spacy, subprocess, torch
 import plotly.graph_objects as go
 import nltk
 from unidecode import unidecode
-
+
 
 nltk.download('punkt')
-tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-uncased')
 
 from writing_analysis import (
     normalize,
@@ -189,7 +188,7 @@ text_bc_model_path = "polygraf-ai/v3-bert-3-2m-trun-bc-lighter-spec"
 text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path)
 text_bc_model = AutoModelForSequenceClassification.from_pretrained(text_bc_model_path).to(device)
 
-text_mc_model_path = "polygraf-ai/text-
+text_mc_model_path = "polygraf-ai/ai-text-detection-mc-robert-open-ai-detector-v4"
 text_mc_tokenizer = AutoTokenizer.from_pretrained(text_mc_model_path)
 text_mc_model = AutoModelForSequenceClassification.from_pretrained(text_mc_model_path).to(device)
 
@@ -214,11 +213,17 @@ def update_character_count(text):
     return f"{len(text)} characters"
 
 
-def split_text_allow_complete_sentences_nltk(text, max_length=256, tolerance=
+def split_text_allow_complete_sentences_nltk(text, max_length=256, tolerance=40, min_last_segment_length=150, type_det='bc'):
     sentences = nltk.sent_tokenize(text)
     segments = []
     current_segment = []
-    current_length = 0
+    current_length = 0
+
+    if type_det == 'bc':
+        tokenizer = text_bc_tokenizer
+
+    elif type_det == 'mc':
+        tokenizer = text_mc_tokenizer
 
     for sentence in sentences:
         tokens = tokenizer.tokenize(sentence)
@@ -263,7 +268,7 @@ def split_text_allow_complete_sentences_nltk(text, max_length=256, tolerance=10,
 
 
 def predict_bc(model, tokenizer, text):
-    tokens =
+    tokens = text_bc_tokenizer(
         text, padding='max_length', truncation=True, max_length=256, return_tensors="pt"
     ).to(device)["input_ids"]
 
@@ -273,7 +278,7 @@ def predict_bc(model, tokenizer, text):
     return output_norm
 
 def predict_mc(model, tokenizer, text):
-    tokens =
+    tokens = text_mc_tokenizer(
         text, padding='max_length', truncation=True, return_tensors="pt", max_length=256
     ).to(device)["input_ids"]
     output = model(tokens)
@@ -285,15 +290,19 @@ def ai_generated_test(ai_option, input):
 
     bc_scores = []
     mc_scores = []
-
-
+    samples_len_bc = len(split_text_allow_complete_sentences_nltk(input, type_det = 'bc'))
+    samples_len_mc = len(split_text_allow_complete_sentences_nltk(input, type_det = 'mc'))
+    segments_bc = split_text_allow_complete_sentences_nltk(input, type_det = 'bc')
+    segments_mc = split_text_allow_complete_sentences_nltk(input, type_det = 'bc')
 
-    for i in range(
-        cleaned_text_bc = remove_special_characters(
-        cleaned_text_mc = remove_special_characters(segments[i])
+    for i in range(samples_len_bc):
+        cleaned_text_bc = remove_special_characters(segments_bc[i])
         bc_score = predict_bc(text_bc_model, text_bc_tokenizer,cleaned_text_bc )
-        mc_score = predict_mc(text_mc_model, text_mc_tokenizer, cleaned_text_mc)
         bc_scores.append(bc_score)
+
+    for i in range(samples_len_mc):
+        cleaned_text_mc = remove_special_characters(segments_mc[i])
+        mc_score = predict_mc(text_mc_model, text_mc_tokenizer, cleaned_text_mc)
         mc_scores.append(mc_score)
 
     bc_scores_array = np.array(bc_scores)
@@ -364,12 +373,12 @@ def build_date(year, month, day):
     return f"{year}{months[month]}{day}"
 
 def len_validator(text):
-    min_tokens =
-    lengt = len(
+    min_tokens = 200
+    lengt = len(text_bc_tokenizer.tokenize(text = text, return_tensors="pt"))
     if lengt < min_tokens:
         return f"Warning! Input length is {lengt}. Please input a text that is greater than {min_tokens} tokens long. Recommended length {min_tokens*2} tokens."
     else :
-        return f"Input length is satisified."
+        return f"Input length ({lengt}) is satisified."
 
 def extract_text_from_pdf(pdf_path):
     doc = fitz.open(pdf_path)
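
Note: the hunk for split_text_allow_complete_sentences_nltk ends at the tokenize call, so the packing logic itself is not visible in this diff. Below is a minimal sketch of a plausible completion, assuming the function greedily packs whole sentences into segments of at most max_length + tolerance tokens and folds a too-short final segment into the previous one. The parameter names come from the diff; the packing loop, the tokenizers dict, and the bert-base-uncased checkpoint are assumptions standing in for the Space's own text_bc_tokenizer / text_mc_tokenizer.

    import nltk
    from transformers import AutoTokenizer

    nltk.download('punkt')

    # Stand-in tokenizers; app.py selects text_bc_tokenizer or text_mc_tokenizer
    # via the type_det argument instead.
    tokenizers = {
        'bc': AutoTokenizer.from_pretrained('bert-base-uncased'),
        'mc': AutoTokenizer.from_pretrained('bert-base-uncased'),
    }

    def split_text_allow_complete_sentences_nltk(text, max_length=256, tolerance=40,
                                                 min_last_segment_length=150, type_det='bc'):
        # Greedy sentence packing: keep sentences whole while each segment
        # stays within max_length + tolerance tokens.
        tokenizer = tokenizers[type_det]
        sentences = nltk.sent_tokenize(text)
        segments = []
        current_segment = []
        current_length = 0
        for sentence in sentences:
            tokens = tokenizer.tokenize(sentence)
            if current_length + len(tokens) <= max_length + tolerance:
                current_segment.append(sentence)
                current_length += len(tokens)
            else:
                if current_segment:
                    segments.append(' '.join(current_segment))
                current_segment = [sentence]
                current_length = len(tokens)
        if current_segment:
            # Assumption: min_last_segment_length exists to fold a too-short
            # trailing segment into the previous one.
            if segments and current_length < min_last_segment_length:
                segments[-1] += ' ' + ' '.join(current_segment)
            else:
                segments.append(' '.join(current_segment))
        return segments

Two details of the committed ai_generated_test are worth flagging: segments_mc is built with type_det = 'bc', where 'mc' looks like the intended argument, and the splitter is invoked twice per detector even though samples_len_bc / samples_len_mc could simply be len(segments_bc) / len(segments_mc).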
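
After this commit, predict_bc and predict_mc call the module-level text_bc_tokenizer / text_mc_tokenizer rather than their tokenizer parameter. A generic sketch of the same fixed-length encode-and-score pattern is shown below, using the parameter instead; the distilbert SST-2 checkpoint is a stand-in for the polygraf-ai models, and the softmax is an assumption, since the diff never shows how output_norm is computed.

    import torch
    import torch.nn.functional as F
    from transformers import AutoTokenizer, AutoModelForSequenceClassification

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Stand-in checkpoint; app.py loads the polygraf-ai bc/mc models instead.
    model_path = "distilbert-base-uncased-finetuned-sst-2-english"
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_path).to(device)

    def predict(model, tokenizer, text):
        # Fixed-length encoding, as in the diff's predict_bc / predict_mc.
        tokens = tokenizer(
            text, padding='max_length', truncation=True, max_length=256, return_tensors="pt"
        ).to(device)["input_ids"]
        with torch.no_grad():
            output = model(tokens)
        # Assumed normalization: softmax over the classification logits.
        return F.softmax(output.logits, dim=-1).squeeze(0).tolist()

    print(predict(model, tokenizer, "A short probe sentence."))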
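
The new len_validator counts tokens with text_bc_tokenizer.tokenize(text = text, return_tensors="pt"); tokenize() returns a list of string tokens, so len() counts tokens directly and the return_tensors kwarg appears to have no effect on that call. A minimal equivalent sketch (the tokenizer argument and the 200-token threshold mirror the diff; the function name and signature here are otherwise hypothetical):

    def len_validator(text, tokenizer, min_tokens=200):
        # tokenizer.tokenize() yields a list of subword strings, so len()
        # is already the token count.
        length = len(tokenizer.tokenize(text))
        if length < min_tokens:
            return (f"Warning! Input length is {length}. Please input a text that is "
                    f"greater than {min_tokens} tokens long. Recommended length "
                    f"{min_tokens * 2} tokens.")
        return f"Input length ({length}) is satisfied."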