aliasgerovs committed on
Commit
3d3f6ce
·
2 Parent(s): 84c08ee e4282cc

Merge branch 'main' of https://huggingface.co/spaces/polygraf-ai/article_writer

Browse files
Files changed (1) hide show
  1. app.py +96 -38
app.py CHANGED
@@ -13,33 +13,41 @@ from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipe
13
  from scipy.special import softmax
14
  from collections import defaultdict
15
  import nltk
16
- from utils import remove_special_characters
17
 
18
  # Check if CUDA is available
19
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
20
  print(f"Using device: {device}")
21
 
22
  models = {
23
- 'Polygraf AI Watson (Base Model)': AutoModelForSequenceClassification.from_pretrained('polygraf-ai/bc-roberta-openai-2sent').to(device),
24
- 'Polygraf AI Sherlock (Advanced Model)': AutoModelForSequenceClassification.from_pretrained('polygraf-ai/bc_combined_3sent').to(device),
 
 
 
 
25
  }
26
  tokenizers = {
27
- 'Polygraf AI Watson (Base Model)': AutoTokenizer.from_pretrained('polygraf-ai/bc-roberta-openai-2sent'),
28
- 'Polygraf AI Sherlock (Advanced Model)': AutoTokenizer.from_pretrained('polygraf-ai/bc_combined_3sent'),
29
  }
30
 
 
31
  # Function to move model to the appropriate device
32
  def to_device(model):
33
  return model.to(device)
34
 
 
35
  def copy_to_input(text):
36
  return text
37
 
 
38
  def remove_bracketed_numbers(text):
39
  pattern = r"^\[\d+\]"
40
  cleaned_text = re.sub(pattern, "", text)
41
  return cleaned_text
42
 
 
43
  def clean_text(text: str) -> str:
44
  paragraphs = text.split("\n\n")
45
  cleaned_paragraphs = []
@@ -49,6 +57,26 @@ def clean_text(text: str) -> str:
49
  cleaned_paragraphs.append(cleaned)
50
  return "\n".join(cleaned_paragraphs)
51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  def format_and_correct_language_check(text: str) -> str:
53
  tool = language_tool_python.LanguageTool("en-US")
54
  return tool.correct(text)
@@ -68,60 +96,79 @@ def predict(model, tokenizer, text):
68
  output = model(**tokens)
69
  output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
70
  output_norm = {"HUMAN": output_norm[0], "AI": output_norm[1]}
71
- return output_norm
 
72
 
73
- def ai_generated_test(text, model='BC Original'):
74
  return predict(models[model], tokenizers[model], text)
75
 
76
- def process_text(text, model='BC Original'):
 
 
77
  sentences = nltk.sent_tokenize(text)
78
  num_sentences = len(sentences)
79
  scores = defaultdict(list)
 
80
  overall_scores = []
81
-
 
82
  for i in range(num_sentences):
83
- chunk = ' '.join(sentences[i:i+3])
84
- if chunk:
 
85
  result = ai_generated_test(chunk, model)
86
- score = result['AI']
87
- for j in range(i, min(i+3, num_sentences)):
88
  scores[j].append(score)
89
 
90
- colored_sentences = []
91
- for i, sentence in enumerate(sentences):
92
- if scores[i]:
93
- avg_score = sum(scores[i]) / len(scores[i])
94
- if avg_score >= 0.65:
95
- colored_sentence = f"<span style='background-color:red;'>{sentence}</span>"
96
- else:
97
- colored_sentence = sentence
98
- colored_sentences.append(colored_sentence)
99
- overall_scores.append(avg_score)
100
-
 
 
 
 
 
 
 
 
 
 
 
101
  overall_score = sum(overall_scores) / len(overall_scores)
102
  overall_score = {"HUMAN": 1 - overall_score, "AI": overall_score}
103
- return overall_score, " ".join(colored_sentences)
 
104
 
105
  ai_check_options = [
106
  "Polygraf AI Watson (Base Model)",
107
  "Polygraf AI Sherlock (Advanced Model)",
108
  ]
109
 
 
110
  def ai_generated_test_sapling(text: str) -> Dict:
111
  response = requests.post(
112
- "https://api.sapling.ai/api/v1/aidetect",
113
- json={"key": "60L9BPSVPIIOEZM0CD1DQWRBPJIUR7SB", "text": f"{text}"}
114
  )
115
  return {"AI": response.json()["score"], "HUMAN": 1 - response.json()["score"]}
116
 
 
117
  class GPT2PPL:
118
  def __init__(self):
119
  self.device = device
120
- self.model = to_device(GPT2LMHeadModel.from_pretrained('gpt2'))
121
- self.tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
122
 
123
  def __call__(self, text):
124
- encodings = self.tokenizer(text, return_tensors='pt')
125
  encodings = {k: v.to(self.device) for k, v in encodings.items()}
126
  max_length = self.model.config.n_positions
127
  stride = 512
@@ -145,15 +192,18 @@ class GPT2PPL:
145
  ppl = torch.exp(torch.stack(nlls).sum() / end_loc)
146
  return {"AI": float(ppl), "HUMAN": 1 - float(ppl)}
147
 
 
148
  def ai_generated_test_gptzero(text):
149
  gptzero_model = GPT2PPL()
150
  result = gptzero_model(text)
151
  print(result)
152
  return result, None
153
 
 
154
  def highlighter_polygraf(text, model="Polygraf AI Watson (Base Model)"):
155
  return process_text(text=text, model=model)
156
 
 
157
  def ai_check(text: str, option: str):
158
  if option.startswith("Polygraf AI"):
159
  return highlighter_polygraf(text, option)
@@ -193,6 +243,7 @@ def generate_prompt(settings: Dict[str, str]) -> str:
193
  """
194
  return prompt
195
 
 
196
  def regenerate_prompt(settings: Dict[str, str]) -> str:
197
  prompt = f"""
198
  "{settings['generated_article']}"
@@ -210,6 +261,7 @@ def regenerate_prompt(settings: Dict[str, str]) -> str:
210
  """
211
  return prompt
212
 
 
213
  def generate_article(
214
  topic: str,
215
  keywords: str,
@@ -272,6 +324,7 @@ def generate_article(
272
 
273
  return clean_text(article)
274
 
 
275
  def humanize(
276
  text: str,
277
  model: str,
@@ -290,12 +343,14 @@ def humanize(
290
  )
291
  return format_and_correct_language_check(result)
292
 
 
293
  def update_visibility_api(model: str):
294
  if model in ["OpenAI GPT 3.5", "OpenAI GPT 4"]:
295
  return gr.update(visible=True)
296
  else:
297
  return gr.update(visible=False)
298
 
 
299
  def format_references(text: str) -> str:
300
  lines = text.split("\n")
301
  references = []
@@ -318,6 +373,7 @@ def format_references(text: str) -> str:
318
 
319
  return "\n\n".join(article_text) + "\n\nReferences:\n" + "\n".join(formatted_refs)
320
 
 
321
  def generate_and_format(
322
  topic,
323
  keywords,
@@ -356,6 +412,7 @@ def generate_and_format(
356
  )
357
  return format_references(article)
358
 
 
359
  def create_interface():
360
  with gr.Blocks(
361
  theme=gr.themes.Default(
@@ -404,7 +461,7 @@ def create_interface():
404
  step=50,
405
  value=1000,
406
  label="Article Length",
407
- elem_classes="input-highlight-pink"
408
  )
409
 
410
  with gr.Row():
@@ -536,14 +593,14 @@ def create_interface():
536
  label="Add comments to help edit generated text", interactive=True, visible=False
537
  )
538
  regenerate_btn = gr.Button("Regenerate Article", variant="primary", visible=False)
539
- with gr.Row():
540
- with gr.Column():
541
- ai_detector_dropdown = gr.Radio(
542
- choices=ai_check_options, label="Select AI Detector", value="Polygraf AI Watson (Base Model)"
543
- )
544
- ai_check_btn = gr.Button("AI Check")
545
  ai_check_result = gr.Label(label="AI Check Result")
546
- highlighted_text = gr.HTML(label="Sentence Breakdown", visible=False)
547
  humanize_btn = gr.Button("Humanize")
548
  # humanized_output = gr.Textbox(label="Humanized Article", lines=20, elem_classes=["custom-textbox"])
549
  humanized_output = gr.Markdown(label="Humanized Article", value="\n\n\n\n", render=True)
@@ -564,6 +621,7 @@ def create_interface():
564
  ai_detector_dropdown.change(highlight_visible, inputs=ai_detector_dropdown, outputs=highlighted_text)
565
  output_article.change(become_visible, inputs=output_article, outputs=ai_comments)
566
  ai_comments.change(become_visible, inputs=output_article, outputs=regenerate_btn)
 
567
 
568
  generate_btn.click(
569
  fn=generate_and_format,
 
13
  from scipy.special import softmax
14
  from collections import defaultdict
15
  import nltk
16
+ from utils import remove_special_characters
17
 
18
# Select the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Sequence-classification detectors, keyed by the label used in
# `ai_check_options`; each one is loaded and moved to `device` at import time.
models = {
    "Polygraf AI Watson (Base Model)": AutoModelForSequenceClassification.from_pretrained(
        "polygraf-ai/bc-roberta-openai-2sent"
    ).to(device),
    "Polygraf AI Sherlock (Advanced Model)": AutoModelForSequenceClassification.from_pretrained(
        "polygraf-ai/bc_combined_3sent"
    ).to(device),
}
# Tokenizers for the same checkpoints, stored under the same keys as `models`.
tokenizers = {
    "Polygraf AI Watson (Base Model)": AutoTokenizer.from_pretrained("polygraf-ai/bc-roberta-openai-2sent"),
    "Polygraf AI Sherlock (Advanced Model)": AutoTokenizer.from_pretrained("polygraf-ai/bc_combined_3sent"),
}
34
 
35
+
36
# Function to move model to the appropriate device
def to_device(model):
    """Return ``model.to(device)`` using the module-level ``device``."""
    return model.to(device)
39
 
40
+
41
def copy_to_input(text):
    """Return ``text`` unchanged."""
    return text
43
 
44
+
45
def remove_bracketed_numbers(text):
    """Strip a bracketed number such as ``[12]`` from the start of ``text``.

    The pattern is anchored with ``^`` and no flags are passed, so only a
    marker at the very beginning of the string is removed; markers elsewhere
    in the string are left in place.
    """
    pattern = r"^\[\d+\]"
    cleaned_text = re.sub(pattern, "", text)
    return cleaned_text
49
 
50
+
51
  def clean_text(text: str) -> str:
52
  paragraphs = text.split("\n\n")
53
  cleaned_paragraphs = []
 
57
  cleaned_paragraphs.append(cleaned)
58
  return "\n".join(cleaned_paragraphs)
59
 
60
+
61
def format_and_correct(text: str) -> str:
    """Ask the "Groq" backend to fix formatting, grammar and spelling in ``text``.

    The prompt is sent through ``generate(prompt, "Groq", None)`` and the
    reply is passed through ``clean_text`` before being returned.
    """
    prompt = f"""
    Please correct the formatting, grammar, and spelling errors in the following text without changing its content significantly. Ensure proper paragraph breaks and maintain the original content:
    {text}
    """
    corrected_text = generate(prompt, "Groq", None)
    return clean_text(corrected_text)
68
+
69
+
70
def format_and_correct_para(text: str) -> str:
    """Correct ``text`` one line at a time and join the results with blank lines.

    Each non-blank line of ``text`` is passed to ``format_and_correct``.
    Blank lines are skipped: they carry no content to correct, and sending an
    empty prompt body to the model only costs a request and risks invented
    text being inserted into the article.
    """
    corrected_paragraphs = [
        format_and_correct(paragraph)
        for paragraph in text.split("\n")
        if paragraph.strip()
    ]
    return "\n\n".join(corrected_paragraphs)
78
+
79
+
80
def format_and_correct_language_check(text: str) -> str:
    """Return ``text`` with LanguageTool's ``en-US`` corrections applied."""
    tool = language_tool_python.LanguageTool("en-US")
    try:
        return tool.correct(text)
    finally:
        # A new LanguageTool instance is created on every call; close it so
        # its backing resources are released instead of piling up.
        tool.close()
 
96
  output = model(**tokens)
97
  output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
98
  output_norm = {"HUMAN": output_norm[0], "AI": output_norm[1]}
99
+ return output_norm
100
+
101
 
102
def ai_generated_test(text, model="BC Original"):
    """Run ``predict`` on ``text`` with the model and tokenizer stored under ``model``.

    Args:
        text: Text to classify.
        model: Key into the module-level ``models`` and ``tokenizers`` dicts.

    Returns:
        Whatever ``predict`` returns for the selected model.

    Raises:
        KeyError: If ``model`` is not a key of ``models`` / ``tokenizers``.
    """
    # NOTE(review): the default "BC Original" is not among the keys of the
    # `models` dict visible in this file, so calling without `model` looks
    # like it raises KeyError — confirm and pick a valid default.
    return predict(models[model], tokenizers[model], text)
104
 
105
+
106
def process_text(text, model="BC Original"):
    """Score ``text`` sentence by sentence and build highlighted HTML.

    Sentences are scored in overlapping windows of up to three consecutive
    sentences via ``ai_generated_test``. A sentence's score is the mean of the
    "AI" scores of every window that contains it; sentences whose mean is at
    least 0.65 are wrapped in a red-background ``<span>``.

    Args:
        text: Text to analyse; every non-blank line is treated as a paragraph.
        model: Key forwarded to ``ai_generated_test``.

    Returns:
        A tuple ``(overall_score, html)``. ``overall_score`` is a dict with
        "HUMAN" and "AI" entries derived from the mean sentence score, and
        ``html`` is the paragraphs joined with ``<br><br>`` and passed through
        ``format_references``. For input with no sentences the result is
        ``({"HUMAN": 0.0, "AI": 0.0}, "")``.
    """
    window = 3
    threshold = 0.65

    # Tokenize once, paragraph by paragraph, and score the flattened list.
    # Tokenizing the whole text for scoring and each paragraph again for
    # display can disagree on sentence boundaries, which shifts scores onto
    # the wrong sentences and drops the ones left without a score.
    paragraphs = [p for p in text.split("\n") if p.strip()]
    paragraph_sentences = [nltk.sent_tokenize(p) for p in paragraphs]
    sentences = [s for group in paragraph_sentences for s in group]
    num_sentences = len(sentences)
    if num_sentences == 0:
        # Nothing to score; returning early avoids dividing by zero below.
        return {"HUMAN": 0.0, "AI": 0.0}, ""

    # Score each window and credit that score to every sentence inside it.
    scores = defaultdict(list)
    for start in range(num_sentences):
        end = min(start + window, num_sentences)
        chunk = " ".join(sentences[start:end])
        score = ai_generated_test(chunk, model)["AI"]
        for j in range(start, end):
            scores[j].append(score)

    # Average the per-sentence scores and colour sentences, keeping the
    # paragraph grouping for the rendered output.
    overall_scores = []
    colored_paragraphs = []
    index = 0
    for group in paragraph_sentences:
        colored_sentences = []
        for sentence in group:
            avg_score = sum(scores[index]) / len(scores[index])
            overall_scores.append(avg_score)
            if avg_score >= threshold:
                sentence = f"<span style='background-color:red;'>{sentence}</span>"
            colored_sentences.append(sentence)
            index += 1
        colored_paragraphs.append(" ".join(colored_sentences))

    overall_score = sum(overall_scores) / len(overall_scores)
    overall_score = {"HUMAN": 1 - overall_score, "AI": overall_score}
    return overall_score, format_references("<br><br>".join(colored_paragraphs))
149
+
150
 
151
# Detector labels offered in the UI; they match the keys of `models`.
ai_check_options = [
    "Polygraf AI Watson (Base Model)",
    "Polygraf AI Sherlock (Advanced Model)",
]
155
 
156
+
157
def ai_generated_test_sapling(text: str) -> Dict:
    """Score ``text`` with the Sapling AI-detection endpoint.

    The API key is read from the ``SAPLING_API_KEY`` environment variable so
    that the credential is not stored in source control.

    Args:
        text: Text to send to the detector.

    Returns:
        A dict with "AI" set to the ``score`` field of the JSON response and
        "HUMAN" set to ``1 - score``.

    Raises:
        RuntimeError: If ``SAPLING_API_KEY`` is not set.
        requests.HTTPError: If the endpoint answers with an error status.
    """
    import os  # local import so this block does not depend on file-level imports

    api_key = os.environ.get("SAPLING_API_KEY")
    if not api_key:
        raise RuntimeError("SAPLING_API_KEY environment variable is not set")

    response = requests.post(
        "https://api.sapling.ai/api/v1/aidetect",
        json={"key": api_key, "text": f"{text}"},
        timeout=30,  # do not hang the UI forever on an unresponsive endpoint
    )
    response.raise_for_status()
    score = response.json()["score"]  # parse the body once
    return {"AI": score, "HUMAN": 1 - score}
162
 
163
+
164
  class GPT2PPL:
165
  def __init__(self):
166
  self.device = device
167
+ self.model = to_device(GPT2LMHeadModel.from_pretrained("gpt2"))
168
+ self.tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
169
 
170
  def __call__(self, text):
171
+ encodings = self.tokenizer(text, return_tensors="pt")
172
  encodings = {k: v.to(self.device) for k, v in encodings.items()}
173
  max_length = self.model.config.n_positions
174
  stride = 512
 
192
  ppl = torch.exp(torch.stack(nlls).sum() / end_loc)
193
  return {"AI": float(ppl), "HUMAN": 1 - float(ppl)}
194
 
195
+
196
def ai_generated_test_gptzero(text):
    """Score ``text`` with a freshly constructed ``GPT2PPL`` instance.

    Returns:
        A tuple ``(result, None)`` where ``result`` is whatever calling the
        ``GPT2PPL`` instance on ``text`` returns. The result is also printed.
    """
    # NOTE(review): a new GPT2PPL is built on every call; if construction is
    # expensive, consider reusing a single instance.
    gptzero_model = GPT2PPL()
    result = gptzero_model(text)
    print(result)
    return result, None
201
 
202
+
203
def highlighter_polygraf(text, model="Polygraf AI Watson (Base Model)"):
    """Delegate to ``process_text`` and return its result unchanged."""
    return process_text(text=text, model=model)
205
 
206
+
207
  def ai_check(text: str, option: str):
208
  if option.startswith("Polygraf AI"):
209
  return highlighter_polygraf(text, option)
 
243
  """
244
  return prompt
245
 
246
+
247
  def regenerate_prompt(settings: Dict[str, str]) -> str:
248
  prompt = f"""
249
  "{settings['generated_article']}"
 
261
  """
262
  return prompt
263
 
264
+
265
  def generate_article(
266
  topic: str,
267
  keywords: str,
 
324
 
325
  return clean_text(article)
326
 
327
+
328
  def humanize(
329
  text: str,
330
  model: str,
 
343
  )
344
  return format_and_correct_language_check(result)
345
 
346
+
347
def update_visibility_api(model: str):
    """Return a ``gr.update`` that is visible only for the OpenAI model choices.

    Args:
        model: Selected model label.

    Returns:
        ``gr.update(visible=True)`` when ``model`` is "OpenAI GPT 3.5" or
        "OpenAI GPT 4", otherwise ``gr.update(visible=False)``.
    """
    return gr.update(visible=model in ("OpenAI GPT 3.5", "OpenAI GPT 4"))
352
 
353
+
354
  def format_references(text: str) -> str:
355
  lines = text.split("\n")
356
  references = []
 
373
 
374
  return "\n\n".join(article_text) + "\n\nReferences:\n" + "\n".join(formatted_refs)
375
 
376
+
377
  def generate_and_format(
378
  topic,
379
  keywords,
 
412
  )
413
  return format_references(article)
414
 
415
+
416
  def create_interface():
417
  with gr.Blocks(
418
  theme=gr.themes.Default(
 
461
  step=50,
462
  value=1000,
463
  label="Article Length",
464
+ elem_classes="input-highlight-pink",
465
  )
466
 
467
  with gr.Row():
 
593
  label="Add comments to help edit generated text", interactive=True, visible=False
594
  )
595
  regenerate_btn = gr.Button("Regenerate Article", variant="primary", visible=False)
596
+ ai_detector_dropdown = gr.Radio(
597
+ choices=ai_check_options, label="Select AI Detector", value="Polygraf AI"
598
+ )
599
+ ai_check_btn = gr.Button("AI Check")
600
+
601
+ with gr.Accordion("AI Detection Results", open=True):
602
  ai_check_result = gr.Label(label="AI Check Result")
603
+ highlighted_text = gr.HTML(label="Sentence Breakdown", visible=False)
604
  humanize_btn = gr.Button("Humanize")
605
  # humanized_output = gr.Textbox(label="Humanized Article", lines=20, elem_classes=["custom-textbox"])
606
  humanized_output = gr.Markdown(label="Humanized Article", value="\n\n\n\n", render=True)
 
621
  ai_detector_dropdown.change(highlight_visible, inputs=ai_detector_dropdown, outputs=highlighted_text)
622
  output_article.change(become_visible, inputs=output_article, outputs=ai_comments)
623
  ai_comments.change(become_visible, inputs=output_article, outputs=regenerate_btn)
624
+ ai_check_btn.click(highlight_visible, inputs=ai_detector_dropdown, outputs=highlighted_text)
625
 
626
  generate_btn.click(
627
  fn=generate_and_format,