eljanmahammadli committed
Commit a17017c
1 Parent(s): 9d6deff

Update app.py

Files changed (1):
  1. app.py +21 -12
app.py CHANGED
@@ -22,7 +22,6 @@ from unidecode import unidecode
 
 
 nltk.download('punkt')
-tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-uncased')
 
 from writing_analysis import (
     normalize,
@@ -189,7 +188,7 @@ text_bc_model_path = "polygraf-ai/v3-bert-3-2m-trun-bc-lighter-spec"
 text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path)
 text_bc_model = AutoModelForSequenceClassification.from_pretrained(text_bc_model_path).to(device)
 
-text_mc_model_path = "polygraf-ai/text-detect-mc-bert-base-uncased-mistral-openai-447k-256"
+text_mc_model_path = "polygraf-ai/ai-text-detection-mc-robert-open-ai-detector-v4"
 text_mc_tokenizer = AutoTokenizer.from_pretrained(text_mc_model_path)
 text_mc_model = AutoModelForSequenceClassification.from_pretrained(text_mc_model_path).to(device)
 
@@ -214,11 +213,17 @@ def update_character_count(text):
     return f"{len(text)} characters"
 
 
-def split_text_allow_complete_sentences_nltk(text, max_length=256, tolerance=40, min_last_segment_length=150):
+def split_text_allow_complete_sentences_nltk(text, max_length=256, tolerance=40, min_last_segment_length=150, type_det='bc'):
     sentences = nltk.sent_tokenize(text)
     segments = []
     current_segment = []
     current_length = 0
+
+    if type_det == 'bc':
+        tokenizer = text_bc_tokenizer
+
+    elif type_det == 'mc':
+        tokenizer = text_mc_tokenizer
 
     for sentence in sentences:
         tokens = tokenizer.tokenize(sentence)
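For orientation, here is a runnable sketch of the splitter this hunk introduces. The diff elides the middle of the function, so everything past the tokenizer dispatch is an assumption reconstructed from the parameter names (max_length, tolerance, min_last_segment_length); the two checkpoints below are stand-ins for the app's detector tokenizers.

# Hedged sketch: the accumulation and tail-merge policy below is assumed,
# not taken from this commit.
import nltk
from transformers import AutoTokenizer

nltk.download('punkt')

# stand-ins for the bc/mc detector tokenizers loaded earlier in app.py
text_bc_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
text_mc_tokenizer = AutoTokenizer.from_pretrained("roberta-base")

def split_text_allow_complete_sentences_nltk(
    text, max_length=256, tolerance=40, min_last_segment_length=150, type_det='bc'
):
    sentences = nltk.sent_tokenize(text)
    segments, current_segment, current_length = [], [], 0

    # pick the tokenizer whose token counts match the downstream detector
    tokenizer = text_bc_tokenizer if type_det == 'bc' else text_mc_tokenizer

    for sentence in sentences:
        tokens = tokenizer.tokenize(sentence)
        # close the current segment once adding this sentence would
        # overshoot max_length by more than the tolerance (assumed policy)
        if current_length + len(tokens) > max_length + tolerance and current_segment:
            segments.append(" ".join(current_segment))
            current_segment, current_length = [], 0
        current_segment.append(sentence)
        current_length += len(tokens)

    if current_segment:
        last = " ".join(current_segment)
        # fold a too-short tail into the previous segment (assumed policy)
        if len(tokenizer.tokenize(last)) < min_last_segment_length and segments:
            segments[-1] = segments[-1] + " " + last
        else:
            segments.append(last)
    return segments

Routing segmentation through the detector's own tokenizer keeps the token counts the splitter sees consistent with the 256-token truncation limit the predictors apply below.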
@@ -263,7 +268,7 @@ def split_text_allow_complete_sentences_nltk(text, max_length=256, tolerance=40,
 
 
 def predict_bc(model, tokenizer, text):
-    tokens = tokenizer(
+    tokens = text_bc_tokenizer(
         text, padding='max_length', truncation=True, max_length=256, return_tensors="pt"
     ).to(device)["input_ids"]
 
@@ -273,7 +278,7 @@ def predict_bc(model, tokenizer, text):
     return output_norm
 
 def predict_mc(model, tokenizer, text):
-    tokens = tokenizer(
+    tokens = text_mc_tokenizer(
         text, padding='max_length', truncation=True, return_tensors="pt", max_length=256
     ).to(device)["input_ids"]
     output = model(tokens)
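Both predictors now pin the module-level tokenizer that matches their model rather than the tokenizer argument they receive. A hedged sketch of the full functions, assuming the device, text_bc_tokenizer, and text_mc_tokenizer globals from earlier in app.py; the softmax step is inferred from the name output_norm and is not shown in the diff.

# Hedged sketch of the predictors around these hunks; normalization assumed.
import torch.nn.functional as F

def predict_bc(model, tokenizer, text):
    tokens = text_bc_tokenizer(
        text, padding='max_length', truncation=True, max_length=256, return_tensors="pt"
    ).to(device)["input_ids"]
    output = model(tokens)
    # assumed: probabilities over {human, AI}
    output_norm = F.softmax(output.logits, dim=-1).detach().cpu().numpy()
    return output_norm

def predict_mc(model, tokenizer, text):
    tokens = text_mc_tokenizer(
        text, padding='max_length', truncation=True, return_tensors="pt", max_length=256
    ).to(device)["input_ids"]
    output = model(tokens)
    # assumed: probabilities over candidate source-model classes
    output_norm = F.softmax(output.logits, dim=-1).detach().cpu().numpy()
    return output_norm

Note that the tokenizer parameter is now effectively unused in both functions; presumably it is kept so existing call sites keep working.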
@@ -285,15 +290,19 @@ def ai_generated_test(ai_option, input):
 
     bc_scores = []
     mc_scores = []
-    samples_len = len(split_text_allow_complete_sentences_nltk(input))
-    segments = split_text_allow_complete_sentences_nltk(input)
+    samples_len_bc = len(split_text_allow_complete_sentences_nltk(input, type_det='bc'))
+    samples_len_mc = len(split_text_allow_complete_sentences_nltk(input, type_det='mc'))
+    segments_bc = split_text_allow_complete_sentences_nltk(input, type_det='bc')
+    segments_mc = split_text_allow_complete_sentences_nltk(input, type_det='mc')
 
-    for i in range(samples_len):
-        cleaned_text_bc = remove_special_characters(segments[i])
-        cleaned_text_mc = remove_special_characters(segments[i])
+    for i in range(samples_len_bc):
+        cleaned_text_bc = remove_special_characters(segments_bc[i])
         bc_score = predict_bc(text_bc_model, text_bc_tokenizer, cleaned_text_bc)
-        mc_score = predict_mc(text_mc_model, text_mc_tokenizer, cleaned_text_mc)
         bc_scores.append(bc_score)
+
+    for i in range(samples_len_mc):
+        cleaned_text_mc = remove_special_characters(segments_mc[i])
+        mc_score = predict_mc(text_mc_model, text_mc_tokenizer, cleaned_text_mc)
         mc_scores.append(mc_score)
 
     bc_scores_array = np.array(bc_scores)
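The hunk ends at the score collection. As a sketch of one plausible next step, averaging the per-segment probabilities; the aggregation and the array shapes are assumptions, not shown in this commit.

# Hedged sketch: possible aggregation after the two loops above.
import numpy as np

bc_scores_array = np.array(bc_scores)  # shape depends on what predict_bc returns
mc_scores_array = np.array(mc_scores)  # shape depends on what predict_mc returns

# mean probability per class across segments (assumed aggregation)
bc_avg = bc_scores_array.mean(axis=0)
mc_avg = mc_scores_array.mean(axis=0)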
 