eljanmahammadli committed a17017c (parent: 9d6deff): Update app.py

app.py CHANGED
@@ -22,7 +22,6 @@ from unidecode import unidecode
 
 
 nltk.download('punkt')
-tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-uncased')
 
 from writing_analysis import (
     normalize,
@@ -189,7 +188,7 @@ text_bc_model_path = "polygraf-ai/v3-bert-3-2m-trun-bc-lighter-spec"
 text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path)
 text_bc_model = AutoModelForSequenceClassification.from_pretrained(text_bc_model_path).to(device)
 
-text_mc_model_path = "polygraf-ai/text-
+text_mc_model_path = "polygraf-ai/ai-text-detection-mc-robert-open-ai-detector-v4"
 text_mc_tokenizer = AutoTokenizer.from_pretrained(text_mc_model_path)
 text_mc_model = AutoModelForSequenceClassification.from_pretrained(text_mc_model_path).to(device)
 
@@ -214,11 +213,17 @@ def update_character_count(text):
     return f"{len(text)} characters"
 
 
-def split_text_allow_complete_sentences_nltk(text, max_length=256, tolerance=40, min_last_segment_length=150):
+def split_text_allow_complete_sentences_nltk(text, max_length=256, tolerance=40, min_last_segment_length=150, type_det='bc'):
     sentences = nltk.sent_tokenize(text)
     segments = []
     current_segment = []
-    current_length = 0
+    current_length = 0
+
+    if type_det == 'bc':
+        tokenizer = text_bc_tokenizer
+
+    elif type_det == 'mc':
+        tokenizer = text_mc_tokenizer
 
     for sentence in sentences:
         tokens = tokenizer.tokenize(sentence)
@@ -263,7 +268,7 @@ def split_text_allow_complete_sentences_nltk(text, max_length=256, tolerance=40, min_last_segment_length=150):
 
 
 def predict_bc(model, tokenizer, text):
-    tokens = tokenizer(
+    tokens = text_bc_tokenizer(
         text, padding='max_length', truncation=True, max_length=256, return_tensors="pt"
     ).to(device)["input_ids"]
 
@@ -273,7 +278,7 @@ def predict_bc(model, tokenizer, text):
     return output_norm
 
 def predict_mc(model, tokenizer, text):
-    tokens = tokenizer(
+    tokens = text_mc_tokenizer(
         text, padding='max_length', truncation=True, return_tensors="pt", max_length=256
     ).to(device)["input_ids"]
     output = model(tokens)
@@ -285,15 +290,19 @@ def ai_generated_test(ai_option, input):
 
     bc_scores = []
     mc_scores = []
-    samples_len = len(split_text_allow_complete_sentences_nltk(input))
-    segments = split_text_allow_complete_sentences_nltk(input)
+    samples_len_bc = len(split_text_allow_complete_sentences_nltk(input, type_det='bc'))
+    samples_len_mc = len(split_text_allow_complete_sentences_nltk(input, type_det='mc'))
+    segments_bc = split_text_allow_complete_sentences_nltk(input, type_det='bc')
+    segments_mc = split_text_allow_complete_sentences_nltk(input, type_det='mc')
 
-    for i in range(samples_len):
-        cleaned_text_bc = remove_special_characters(segments[i])
-        cleaned_text_mc = remove_special_characters(segments[i])
+    for i in range(samples_len_bc):
+        cleaned_text_bc = remove_special_characters(segments_bc[i])
         bc_score = predict_bc(text_bc_model, text_bc_tokenizer, cleaned_text_bc)
-        mc_score = predict_mc(text_mc_model, text_mc_tokenizer, cleaned_text_mc)
         bc_scores.append(bc_score)
+
+    for i in range(samples_len_mc):
+        cleaned_text_mc = remove_special_characters(segments_mc[i])
+        mc_score = predict_mc(text_mc_model, text_mc_tokenizer, cleaned_text_mc)
         mc_scores.append(mc_score)
 
     bc_scores_array = np.array(bc_scores)
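For context, here is a minimal self-contained sketch of the sentence-aware splitting that this commit parameterizes with type_det: whole NLTK sentences are packed greedily into segments under a token budget, so no sentence is cut in half. The checkpoint name and the simplified signature (no tolerance or min_last_segment_length handling) are illustrative assumptions, not the Space's exact code:

# Sketch only: greedy sentence packing under a token budget. Assumption: the
# real function also honors tolerance and min_last_segment_length, omitted
# here for brevity; "bert-base-uncased" is a stand-in checkpoint.
import nltk
from transformers import AutoTokenizer

nltk.download('punkt')

def split_by_token_budget(text, tokenizer, max_length=256):
    segments, current, current_len = [], [], 0
    for sentence in nltk.sent_tokenize(text):
        n_tokens = len(tokenizer.tokenize(sentence))
        # start a new segment rather than splitting a sentence in half
        if current and current_len + n_tokens > max_length:
            segments.append(" ".join(current))
            current, current_len = [], 0
        current.append(sentence)
        current_len += n_tokens
    if current:
        segments.append(" ".join(current))
    return segments

# usage: select the tokenizer per detector type, as the patched function now does
bc_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # stand-in
print(split_by_token_budget("First sentence. Second sentence.", bc_tokenizer, max_length=8))

Selecting the tokenizer by type_det matters because the bc and mc models use different subword vocabularies, so the same text can yield different token counts and therefore different segment boundaries per detector.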
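The reworked ai_generated_test thus runs two independent passes, one per detector, each over its own segment list. A hedged sketch of that flow, using a public stand-in checkpoint instead of the polygraf-ai models, with mean-pooling of per-segment probabilities as an assumed aggregation (the commit itself only shows the scores being collected into arrays):

# Sketch only: per-segment sequence classification and averaging. The
# checkpoint name and the mean aggregation are assumptions for illustration.
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

device = "cuda" if torch.cuda.is_available() else "cpu"

def predict_probs(model, tokenizer, text):
    # tokenize one segment and return softmax class probabilities
    tokens = tokenizer(
        text, padding="max_length", truncation=True, max_length=256, return_tensors="pt"
    ).to(device)["input_ids"]
    with torch.no_grad():
        logits = model(tokens).logits
    return torch.softmax(logits, dim=-1)[0].cpu().numpy()

def score_segments(model, tokenizer, segments):
    # score every segment independently, then average the probabilities
    scores = np.array([predict_probs(model, tokenizer, s) for s in segments])
    return scores.mean(axis=0)

# usage with a stand-in binary checkpoint (the Space uses polygraf-ai models)
bc_name = "distilbert-base-uncased-finetuned-sst-2-english"  # placeholder
bc_tok = AutoTokenizer.from_pretrained(bc_name)
bc_model = AutoModelForSequenceClassification.from_pretrained(bc_name).to(device)
print(score_segments(bc_model, bc_tok, ["Segment one.", "Segment two."]))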