aliasgerovs committed on
Commit
162c216
·
2 Parent(s): 7d7dbb0 fa7c450

Merge branch 'main' into demo

Browse files
Files changed (1) hide show
  1. app.py +97 -20
app.py CHANGED
@@ -17,6 +17,11 @@ import fitz
17
  from transformers import GPT2LMHeadModel, GPT2TokenizerFast
18
  import nltk, spacy, subprocess, torch
19
  import plotly.graph_objects as go
 
 
 
 
 
20
  from writing_analysis import (
21
  normalize,
22
  preprocess_text1,
@@ -175,11 +180,11 @@ AI DETECTION SECTION
175
  """
176
  device = "cuda" if torch.cuda.is_available() else "cpu"
177
 
178
- text_bc_model_path = "polygraf-ai/ai-text-bc-bert-2-7m"
179
  text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path)
180
  text_bc_model = AutoModelForSequenceClassification.from_pretrained(text_bc_model_path).to(device)
181
 
182
- text_mc_model_path = "polygraf-ai/ai-text-mc-v5-lighter-spec"
183
  text_mc_tokenizer = AutoTokenizer.from_pretrained(text_mc_model_path)
184
  text_mc_model = AutoModelForSequenceClassification.from_pretrained(text_mc_model_path).to(device)
185
 
@@ -190,34 +195,105 @@ def remove_special_characters(text):
190
  def update_character_count(text):
191
  return f"{len(text)} characters"
192
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
  def predict_bc(model, tokenizer, text):
194
  tokens = tokenizer(
195
- text, padding=True, truncation=True, return_tensors="pt"
196
  ).to(device)["input_ids"]
 
197
  output = model(tokens)
198
  output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
199
  print("BC Score: ", output_norm)
200
- bc_score = {"AI": output_norm[1].item(), "HUMAN": output_norm[0].item()}
201
- return bc_score
202
 
203
  def predict_mc(model, tokenizer, text):
204
  tokens = tokenizer(
205
- text, padding=True, truncation=True, return_tensors="pt"
206
  ).to(device)["input_ids"]
207
  output = model(tokens)
208
  output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
209
  print("MC Score: ", output_norm)
210
- mc_score = {}
211
- label_map = ["GPT 3.5", "GPT 4", "CLAUDE", "BARD", "LLAMA 2"]
212
- for score, label in zip(output_norm, label_map):
213
- mc_score[label.upper()] = score.item()
214
- return mc_score
215
 
216
  def ai_generated_test(ai_option, input):
217
-
218
- cleaned_text = remove_special_characters(input)
219
- bc_score = predict_bc(text_bc_model, text_bc_tokenizer, cleaned_text)
220
- mc_score = predict_mc(text_mc_model, text_mc_tokenizer, cleaned_text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
221
 
222
  sum_prob = 1 - bc_score["HUMAN"]
223
  for key, value in mc_score.items():
@@ -273,9 +349,10 @@ def build_date(year, month, day):
273
  return f"{year}{months[month]}{day}"
274
 
275
  def len_validator(text):
276
- min_chars = 350
277
- if len(text) < min_chars:
278
- return f"Warning! Input length is {len(text)}. Please input a text that is greater than {min_chars} characters long. Recommended length {min_chars*2} characters."
 
279
  else :
280
  return f"Input length is satisified."
281
 
@@ -393,7 +470,7 @@ with gr.Blocks() as demo:
393
  d1 = today.strftime("%d/%B/%Y")
394
  d1 = d1.split("/")
395
 
396
- model_list = ["GPT 3.5", "GPT 4", "CLAUDE", "BARD", "LLAMA2"]
397
  domain_list = ["com", "org", "net", "int", "edu", "gov", "mil"]
398
  gr.Markdown(
399
  """
@@ -557,4 +634,4 @@ with gr.Blocks() as demo:
557
  date_from = ""
558
  date_to = ""
559
 
560
- demo.launch(share=True, server_name="0.0.0.0", server_port = 80, auth=("polygraf-admin", "test@aisd"))
 
17
  from transformers import GPT2LMHeadModel, GPT2TokenizerFast
18
  import nltk, spacy, subprocess, torch
19
  import plotly.graph_objects as go
20
+ import nltk
21
+
22
+ nltk.download('punkt')
23
+ tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-uncased')
24
+
25
  from writing_analysis import (
26
  normalize,
27
  preprocess_text1,
 
180
  """
181
  device = "cuda" if torch.cuda.is_available() else "cpu"
182
 
183
+ text_bc_model_path = "polygraf-ai/v3-bert-3-2m-trun-bc"
184
  text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path)
185
  text_bc_model = AutoModelForSequenceClassification.from_pretrained(text_bc_model_path).to(device)
186
 
187
+ text_mc_model_path = "polygraf-ai/text-detect-mc-bert-base-uncased-v1-bert-429k"
188
  text_mc_tokenizer = AutoTokenizer.from_pretrained(text_mc_model_path)
189
  text_mc_model = AutoModelForSequenceClassification.from_pretrained(text_mc_model_path).to(device)
190
 
 
195
  def update_character_count(text):
196
  return f"{len(text)} characters"
197
 
198
+
199
def split_text_allow_complete_sentences_nltk(text, max_length=256, tolerance=10, min_last_segment_length=120):
    """Split ``text`` into segments made of whole sentences that fit a token budget.

    Sentences (from ``nltk.sent_tokenize``) are packed greedily into segments
    whose token count, measured with the module-level ``tokenizer``, stays
    within ``max_length + tolerance``. If the final segment encodes to fewer
    than ``min_last_segment_length`` tokens it is folded into the previous
    segment.

    Args:
        text: The raw input text.
        max_length: Nominal token budget per segment.
        tolerance: Extra tokens allowed on top of ``max_length``.
        min_last_segment_length: Minimum encoded length for the last segment
            to be kept as a segment of its own.

    Returns:
        A list of strings, one per segment, each obtained by encoding the
        joined sentences (truncated to ``max_length + tolerance`` tokens) and
        decoding the ids back to text. An input with no sentences yields an
        empty list.
    """
    token_limit = max_length + tolerance

    def _encode(sentence_group):
        # Single place for the encode settings used everywhere in this function.
        return tokenizer.encode(
            ' '.join(sentence_group),
            add_special_tokens=True,
            max_length=token_limit,
            truncation=True,
        )

    # Pass 1: greedy packing of whole sentences.
    segments = []
    current_segment = []
    current_length = 0
    for sentence in nltk.sent_tokenize(text):
        sentence_length = len(tokenizer.tokenize(sentence))
        # The "- 2" presumably reserves room for the two special tokens that
        # encode() adds around the segment -- TODO confirm for this tokenizer.
        if current_length + sentence_length <= token_limit - 2:
            current_segment.append(sentence)
            current_length += sentence_length
        else:
            if current_segment:
                segments.append(current_segment)
            # A sentence that alone exceeds the budget still starts its own
            # segment; it is truncated when encoded below.
            current_segment = [sentence]
            current_length = sentence_length
    if current_segment:
        segments.append(current_segment)

    # Pass 2: fold a too-short trailing segment into its predecessor.
    # The previous implementation guarded this merge with
    # ``len(combined_encoded) <= max_length + tolerance``, but the combined
    # text was encoded with truncation to exactly that limit, so the guard was
    # always true. The merge is therefore unconditional here.
    # NOTE(review): because of that truncation, whatever part of the merged
    # tail does not fit in the budget is dropped from the output.
    if len(segments) > 1 and len(_encode(segments[-1])) < min_last_segment_length:
        tail = segments.pop()
        segments[-1] = segments[-1] + tail

    # NOTE(review): decode() is called with its defaults, so the tokenizer's
    # special tokens are presumably present in the returned strings -- confirm
    # this is what downstream consumers expect.
    return [tokenizer.decode(_encode(segment)) for segment in segments]
245
+
246
+
247
  def predict_bc(model, tokenizer, text):
248
  tokens = tokenizer(
249
+ text, padding='max_length', truncation=True, max_length=256, return_tensors="pt"
250
  ).to(device)["input_ids"]
251
+
252
  output = model(tokens)
253
  output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
254
  print("BC Score: ", output_norm)
255
+ return output_norm
 
256
 
257
  def predict_mc(model, tokenizer, text):
258
  tokens = tokenizer(
259
+ text, padding='max_length', truncation=True, return_tensors="pt", max_length=512
260
  ).to(device)["input_ids"]
261
  output = model(tokens)
262
  output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
263
  print("MC Score: ", output_norm)
264
+ return output_norm
 
 
 
 
265
 
266
  def ai_generated_test(ai_option, input):
267
+
268
+ bc_scores = []
269
+ mc_scores = []
270
+ samples_len = len(split_text_allow_complete_sentences_nltk(input))
271
+ segments = split_text_allow_complete_sentences_nltk(input)
272
+
273
+ for i in range(samples_len):
274
+ cleaned_text = remove_special_characters(segments[i])
275
+ bc_score = predict_bc(text_bc_model, text_bc_tokenizer,cleaned_text )
276
+ mc_score = predict_mc(text_mc_model, text_mc_tokenizer, cleaned_text)
277
+ bc_scores.append(bc_score)
278
+ mc_scores.append(mc_score)
279
+
280
+ bc_scores_array = np.array(bc_scores)
281
+ mc_scores_array = np.array(mc_scores)
282
+ average_bc_scores = np.mean(bc_scores_array, axis=0)
283
+ average_mc_scores = np.mean(mc_scores_array, axis=0)
284
+ bc_score_list = average_bc_scores.tolist()
285
+ mc_score_list = average_mc_scores.tolist()
286
+
287
+ # Temporary
288
+ mc_score_list[1] = mc_score_list[0] + mc_score_list[1]
289
+ mc_score_list = mc_score_list[1:]
290
+
291
+ bc_score = {"AI": bc_score[1].item(), "HUMAN": bc_score[0].item()}
292
+ mc_score = {}
293
+ label_map = ["OpenAI GPT", "CLAUDE", "BARD", "LLAMA 2"]
294
+
295
+ for score, label in zip(mc_score_list, label_map):
296
+ mc_score[label.upper()] = score
297
 
298
  sum_prob = 1 - bc_score["HUMAN"]
299
  for key, value in mc_score.items():
 
349
  return f"{year}{months[month]}{day}"
350
 
351
  def len_validator(text):
352
+ min_tokens = 128
353
+ lengt = len(tokenizer.tokenize(text = text, return_tensors="pt"))
354
+ if lengt < min_tokens:
355
+ return f"Warning! Input length is {lengt}. Please input a text that is greater than {min_tokens} tokens long. Recommended length {min_tokens*2} tokens."
356
  else :
357
  return f"Input length is satisified."
358
 
 
470
  d1 = today.strftime("%d/%B/%Y")
471
  d1 = d1.split("/")
472
 
473
+ model_list = ["OpenAI GPT", "CLAUDE", "BARD", "LLAMA2"]
474
  domain_list = ["com", "org", "net", "int", "edu", "gov", "mil"]
475
  gr.Markdown(
476
  """
 
634
  date_from = ""
635
  date_to = ""
636
 
637
# NOTE(review): the auth username and password below are string literals
# committed with the source, while the server binds to 0.0.0.0 with
# share=True. Consider loading the credentials from the environment instead
# -- TODO confirm against the deployment setup.
+ demo.launch(share=True, server_name="0.0.0.0", auth=("polygraf-admin", "test@aisd"))