Spaces:
Runtime error
Runtime error
Merge branch 'main' of https://huggingface.co/spaces/polygraf-ai/article_writer
Browse files
app.py
CHANGED
@@ -13,33 +13,41 @@ from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipe
|
|
13 |
from scipy.special import softmax
|
14 |
from collections import defaultdict
|
15 |
import nltk
|
16 |
-
from utils import remove_special_characters
|
17 |
|
18 |
# Check if CUDA is available
|
19 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
20 |
print(f"Using device: {device}")
|
21 |
|
22 |
models = {
|
23 |
-
|
24 |
-
|
|
|
|
|
|
|
|
|
25 |
}
|
26 |
tokenizers = {
|
27 |
-
|
28 |
-
|
29 |
}
|
30 |
|
|
|
31 |
# Function to move model to the appropriate device
|
32 |
def to_device(model):
|
33 |
return model.to(device)
|
34 |
|
|
|
35 |
def copy_to_input(text):
|
36 |
return text
|
37 |
|
|
|
38 |
def remove_bracketed_numbers(text):
|
39 |
pattern = r"^\[\d+\]"
|
40 |
cleaned_text = re.sub(pattern, "", text)
|
41 |
return cleaned_text
|
42 |
|
|
|
43 |
def clean_text(text: str) -> str:
|
44 |
paragraphs = text.split("\n\n")
|
45 |
cleaned_paragraphs = []
|
@@ -49,6 +57,26 @@ def clean_text(text: str) -> str:
|
|
49 |
cleaned_paragraphs.append(cleaned)
|
50 |
return "\n".join(cleaned_paragraphs)
|
51 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
52 |
def format_and_correct_language_check(text: str) -> str:
|
53 |
tool = language_tool_python.LanguageTool("en-US")
|
54 |
return tool.correct(text)
|
@@ -68,60 +96,79 @@ def predict(model, tokenizer, text):
|
|
68 |
output = model(**tokens)
|
69 |
output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
|
70 |
output_norm = {"HUMAN": output_norm[0], "AI": output_norm[1]}
|
71 |
-
return output_norm
|
|
|
72 |
|
73 |
-
def ai_generated_test(text, model=
|
74 |
return predict(models[model], tokenizers[model], text)
|
75 |
|
76 |
-
|
|
|
|
|
77 |
sentences = nltk.sent_tokenize(text)
|
78 |
num_sentences = len(sentences)
|
79 |
scores = defaultdict(list)
|
|
|
80 |
overall_scores = []
|
81 |
-
|
|
|
82 |
for i in range(num_sentences):
|
83 |
-
chunk =
|
84 |
-
if chunk:
|
|
|
85 |
result = ai_generated_test(chunk, model)
|
86 |
-
score = result[
|
87 |
-
for j in range(i, min(i+3, num_sentences)):
|
88 |
scores[j].append(score)
|
89 |
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
101 |
overall_score = sum(overall_scores) / len(overall_scores)
|
102 |
overall_score = {"HUMAN": 1 - overall_score, "AI": overall_score}
|
103 |
-
return overall_score, "
|
|
|
104 |
|
105 |
ai_check_options = [
|
106 |
"Polygraf AI Watson (Base Model)",
|
107 |
"Polygraf AI Sherlock (Advanced Model)",
|
108 |
]
|
109 |
|
|
|
110 |
def ai_generated_test_sapling(text: str) -> Dict:
|
111 |
response = requests.post(
|
112 |
-
"https://api.sapling.ai/api/v1/aidetect",
|
113 |
-
json={"key": "60L9BPSVPIIOEZM0CD1DQWRBPJIUR7SB", "text": f"{text}"}
|
114 |
)
|
115 |
return {"AI": response.json()["score"], "HUMAN": 1 - response.json()["score"]}
|
116 |
|
|
|
117 |
class GPT2PPL:
|
118 |
def __init__(self):
|
119 |
self.device = device
|
120 |
-
self.model = to_device(GPT2LMHeadModel.from_pretrained(
|
121 |
-
self.tokenizer = GPT2TokenizerFast.from_pretrained(
|
122 |
|
123 |
def __call__(self, text):
|
124 |
-
encodings = self.tokenizer(text, return_tensors=
|
125 |
encodings = {k: v.to(self.device) for k, v in encodings.items()}
|
126 |
max_length = self.model.config.n_positions
|
127 |
stride = 512
|
@@ -145,15 +192,18 @@ class GPT2PPL:
|
|
145 |
ppl = torch.exp(torch.stack(nlls).sum() / end_loc)
|
146 |
return {"AI": float(ppl), "HUMAN": 1 - float(ppl)}
|
147 |
|
|
|
148 |
def ai_generated_test_gptzero(text):
|
149 |
gptzero_model = GPT2PPL()
|
150 |
result = gptzero_model(text)
|
151 |
print(result)
|
152 |
return result, None
|
153 |
|
|
|
154 |
def highlighter_polygraf(text, model="Polygraf AI Watson (Base Model)"):
|
155 |
return process_text(text=text, model=model)
|
156 |
|
|
|
157 |
def ai_check(text: str, option: str):
|
158 |
if option.startswith("Polygraf AI"):
|
159 |
return highlighter_polygraf(text, option)
|
@@ -193,6 +243,7 @@ def generate_prompt(settings: Dict[str, str]) -> str:
|
|
193 |
"""
|
194 |
return prompt
|
195 |
|
|
|
196 |
def regenerate_prompt(settings: Dict[str, str]) -> str:
|
197 |
prompt = f"""
|
198 |
"{settings['generated_article']}"
|
@@ -210,6 +261,7 @@ def regenerate_prompt(settings: Dict[str, str]) -> str:
|
|
210 |
"""
|
211 |
return prompt
|
212 |
|
|
|
213 |
def generate_article(
|
214 |
topic: str,
|
215 |
keywords: str,
|
@@ -272,6 +324,7 @@ def generate_article(
|
|
272 |
|
273 |
return clean_text(article)
|
274 |
|
|
|
275 |
def humanize(
|
276 |
text: str,
|
277 |
model: str,
|
@@ -290,12 +343,14 @@ def humanize(
|
|
290 |
)
|
291 |
return format_and_correct_language_check(result)
|
292 |
|
|
|
293 |
def update_visibility_api(model: str):
|
294 |
if model in ["OpenAI GPT 3.5", "OpenAI GPT 4"]:
|
295 |
return gr.update(visible=True)
|
296 |
else:
|
297 |
return gr.update(visible=False)
|
298 |
|
|
|
299 |
def format_references(text: str) -> str:
|
300 |
lines = text.split("\n")
|
301 |
references = []
|
@@ -318,6 +373,7 @@ def format_references(text: str) -> str:
|
|
318 |
|
319 |
return "\n\n".join(article_text) + "\n\nReferences:\n" + "\n".join(formatted_refs)
|
320 |
|
|
|
321 |
def generate_and_format(
|
322 |
topic,
|
323 |
keywords,
|
@@ -356,6 +412,7 @@ def generate_and_format(
|
|
356 |
)
|
357 |
return format_references(article)
|
358 |
|
|
|
359 |
def create_interface():
|
360 |
with gr.Blocks(
|
361 |
theme=gr.themes.Default(
|
@@ -404,7 +461,7 @@ def create_interface():
|
|
404 |
step=50,
|
405 |
value=1000,
|
406 |
label="Article Length",
|
407 |
-
elem_classes="input-highlight-pink"
|
408 |
)
|
409 |
|
410 |
with gr.Row():
|
@@ -536,14 +593,14 @@ def create_interface():
|
|
536 |
label="Add comments to help edit generated text", interactive=True, visible=False
|
537 |
)
|
538 |
regenerate_btn = gr.Button("Regenerate Article", variant="primary", visible=False)
|
539 |
-
|
540 |
-
|
541 |
-
|
542 |
-
|
543 |
-
|
544 |
-
|
545 |
ai_check_result = gr.Label(label="AI Check Result")
|
546 |
-
|
547 |
humanize_btn = gr.Button("Humanize")
|
548 |
# humanized_output = gr.Textbox(label="Humanized Article", lines=20, elem_classes=["custom-textbox"])
|
549 |
humanized_output = gr.Markdown(label="Humanized Article", value="\n\n\n\n", render=True)
|
@@ -564,6 +621,7 @@ def create_interface():
|
|
564 |
ai_detector_dropdown.change(highlight_visible, inputs=ai_detector_dropdown, outputs=highlighted_text)
|
565 |
output_article.change(become_visible, inputs=output_article, outputs=ai_comments)
|
566 |
ai_comments.change(become_visible, inputs=output_article, outputs=regenerate_btn)
|
|
|
567 |
|
568 |
generate_btn.click(
|
569 |
fn=generate_and_format,
|
|
|
13 |
from scipy.special import softmax
|
14 |
from collections import defaultdict
|
15 |
import nltk
|
16 |
+
from utils import remove_special_characters
|
17 |
|
18 |
# Run on the GPU when one is present, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Display name of each detector -> checkpoint identifier it is loaded from.
_DETECTOR_CHECKPOINTS = {
    "Polygraf AI Watson (Base Model)": "polygraf-ai/bc-roberta-openai-2sent",
    "Polygraf AI Sherlock (Advanced Model)": "polygraf-ai/bc_combined_3sent",
}

# Both lookups are keyed by the detector display name. The classifiers are
# loaded and moved to `device` once, at import time.
models = {
    name: AutoModelForSequenceClassification.from_pretrained(checkpoint).to(device)
    for name, checkpoint in _DETECTOR_CHECKPOINTS.items()
}
tokenizers = {
    name: AutoTokenizer.from_pretrained(checkpoint)
    for name, checkpoint in _DETECTOR_CHECKPOINTS.items()
}
|
34 |
|
35 |
+
|
36 |
# Function to move model to the appropriate device
|
37 |
def to_device(model):
    """Move ``model`` to the module-level ``device`` and return what ``.to()`` returns."""
    moved = model.to(device)
    return moved
|
39 |
|
40 |
+
|
41 |
def copy_to_input(text):
    """Return ``text`` unchanged.

    NOTE(review): presumably wired as a UI callback that copies one
    component's value into another; confirm against the caller.
    """
    return text
|
43 |
|
44 |
+
|
45 |
def remove_bracketed_numbers(text):
    """Strip a ``[<digits>]`` marker from the very start of ``text``.

    The pattern is anchored with ``^`` and no flags are passed, so only a
    marker at position 0 of the whole string is removed; markers elsewhere
    (including at the start of later lines) are left untouched.
    """
    return re.sub(r"^\[\d+\]", "", text)
|
49 |
|
50 |
+
|
51 |
def clean_text(text: str) -> str:
|
52 |
paragraphs = text.split("\n\n")
|
53 |
cleaned_paragraphs = []
|
|
|
57 |
cleaned_paragraphs.append(cleaned)
|
58 |
return "\n".join(cleaned_paragraphs)
|
59 |
|
60 |
+
|
61 |
+
def format_and_correct(text: str) -> str:
    """Ask the "Groq" backend to fix formatting, grammar and spelling in ``text``.

    The reply from ``generate`` is passed through ``clean_text`` before being
    returned.
    """
    # The prompt literal is reproduced exactly as it appears in this view.
    prompt = f"""
Please correct the formatting, grammar, and spelling errors in the following text without changing its content significantly. Ensure proper paragraph breaks and maintain the original content:
{text}
"""
    return clean_text(generate(prompt, "Groq", None))
|
68 |
+
|
69 |
+
|
70 |
+
def format_and_correct_para(text: str) -> str:
    """Correct ``text`` one line at a time and rejoin it with blank lines.

    ``text`` is split on ``"\\n"`` and every non-blank piece is sent through
    ``format_and_correct``. The corrected pieces are joined with ``"\\n\\n"``.

    Whitespace-only pieces are skipped. Text that already separates
    paragraphs with blank lines would otherwise trigger a model call asking
    it to "correct" an empty string, and whatever came back would be inserted
    as a paragraph.
    """
    corrected_paragraphs = [
        format_and_correct(paragraph)
        for paragraph in text.split("\n")
        if paragraph.strip()
    ]
    return "\n\n".join(corrected_paragraphs)
|
78 |
+
|
79 |
+
|
80 |
def format_and_correct_language_check(text: str) -> str:
    """Return ``text`` with LanguageTool's en-US corrections applied.

    A new ``LanguageTool`` instance is created for each call. It is closed in
    a ``finally`` block so that repeated calls do not leave instances (and
    whatever backend they hold) open.
    """
    tool = language_tool_python.LanguageTool("en-US")
    try:
        return tool.correct(text)
    finally:
        tool.close()
|
|
|
96 |
output = model(**tokens)
|
97 |
output_norm = softmax(output.logits.detach().cpu().numpy(), 1)[0]
|
98 |
output_norm = {"HUMAN": output_norm[0], "AI": output_norm[1]}
|
99 |
+
return output_norm
|
100 |
+
|
101 |
|
102 |
+
def ai_generated_test(text, model="Polygraf AI Watson (Base Model)"):
    """Classify ``text`` with the detector registered under ``model``.

    ``model`` must be a key of the module-level ``models`` / ``tokenizers``
    dictionaries; an unknown name raises ``KeyError``. The default used to be
    ``"BC Original"``, which is not one of those keys, so calling the function
    without ``model`` always failed. It now defaults to the base detector.

    Returns whatever ``predict`` returns for the selected model and tokenizer.
    """
    return predict(models[model], tokenizers[model], text)
|
104 |
|
105 |
+
|
106 |
+
def process_text(text, model="Polygraf AI Watson (Base Model)"):
    """Score ``text`` sentence by sentence and highlight likely-AI sentences.

    Every sentence is scored as part of each sliding window of up to three
    consecutive sentences that contains it; a sentence's score is the mean of
    the "AI" scores of those windows. Sentences whose mean is >= 0.65 are
    wrapped in a red-background ``<span>``.

    Args:
        text: The article text to analyse.
        model: Key of the detector to use (see ``ai_generated_test``). The
            previous default, ``"BC Original"``, is not a registered detector.

    Returns:
        A ``(overall_score, html)`` tuple. ``overall_score`` is a dict with
        "HUMAN" and "AI" entries derived from the mean of the per-sentence
        scores. ``html`` is the highlighted paragraphs joined with
        ``<br><br>`` and passed through ``format_references``. When no
        sentence could be scored (e.g. empty text) both scores are 0.0 and
        ``html`` is empty, instead of raising ``ZeroDivisionError``.
    """
    sentences = nltk.sent_tokenize(text)
    num_sentences = len(sentences)
    scores = defaultdict(list)

    # Score each window of up to 3 sentences and credit that score to every
    # sentence inside the window.
    for i in range(num_sentences):
        chunk = " ".join(sentences[i : i + 3])
        if not chunk:
            continue
        score = ai_generated_test(chunk, model)["AI"]
        for j in range(i, min(i + 3, num_sentences)):
            scores[j].append(score)

    # Walk the text paragraph by paragraph, consuming sentence scores in
    # order, and colour each sentence by its average score.
    overall_scores = []
    colored_paragraphs = []
    sentence_index = 0
    for paragraph in (p for p in text.split("\n") if p.strip()):
        colored_sentences = []
        for sentence in nltk.sent_tokenize(paragraph):
            sentence_scores = scores.get(sentence_index)
            sentence_index += 1
            if not sentence_scores:
                # Per-paragraph tokenisation can yield more sentences than the
                # whole-text pass; keep such sentences, unhighlighted, rather
                # than dropping them from the output.
                colored_sentences.append(sentence)
                continue
            avg_score = sum(sentence_scores) / len(sentence_scores)
            overall_scores.append(avg_score)
            # NOTE(review): `sentence` is interpolated into HTML unescaped;
            # confirm whether the input can contain untrusted markup.
            if avg_score >= 0.65:
                sentence = f"<span style='background-color:red;'>{sentence}</span>"
            colored_sentences.append(sentence)
        combined_sentences = " ".join(colored_sentences)
        print(combined_sentences)
        colored_paragraphs.append(combined_sentences)

    if not overall_scores:
        # Nothing was scored, so there is no average to report.
        return {"HUMAN": 0.0, "AI": 0.0}, ""

    overall_score = sum(overall_scores) / len(overall_scores)
    overall_score = {"HUMAN": 1 - overall_score, "AI": overall_score}
    return overall_score, format_references("<br><br>".join(colored_paragraphs))
|
149 |
+
|
150 |
|
151 |
# Detector names offered as choices in the UI; they match the keys of
# `models` and `tokenizers`.
ai_check_options = ["Polygraf AI Watson (Base Model)", "Polygraf AI Sherlock (Advanced Model)"]
|
155 |
|
156 |
+
|
157 |
def ai_generated_test_sapling(text: str) -> Dict:
    """Score ``text`` with the Sapling AI-detection endpoint.

    The API key is read from the ``SAPLING_API_KEY`` environment variable
    rather than being embedded in the source, where anyone with read access
    to the repository could see it.

    Returns:
        ``{"AI": score, "HUMAN": 1 - score}`` where ``score`` is the
        ``"score"`` field of the JSON response.

    Raises:
        RuntimeError: If ``SAPLING_API_KEY`` is not set.
        requests.HTTPError: If the service answers with an error status.
    """
    import os  # local import: the file's import block is outside this block

    api_key = os.environ.get("SAPLING_API_KEY")
    if not api_key:
        raise RuntimeError("SAPLING_API_KEY environment variable is not set")

    response = requests.post(
        "https://api.sapling.ai/api/v1/aidetect",
        json={"key": api_key, "text": f"{text}"},
        timeout=30,  # do not hang the caller forever on a stalled connection
    )
    response.raise_for_status()
    score = response.json()["score"]  # parse the body once
    return {"AI": score, "HUMAN": 1 - score}
|
162 |
|
163 |
+
|
164 |
class GPT2PPL:
|
165 |
def __init__(self):
|
166 |
self.device = device
|
167 |
+
self.model = to_device(GPT2LMHeadModel.from_pretrained("gpt2"))
|
168 |
+
self.tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
|
169 |
|
170 |
def __call__(self, text):
|
171 |
+
encodings = self.tokenizer(text, return_tensors="pt")
|
172 |
encodings = {k: v.to(self.device) for k, v in encodings.items()}
|
173 |
max_length = self.model.config.n_positions
|
174 |
stride = 512
|
|
|
192 |
ppl = torch.exp(torch.stack(nlls).sum() / end_loc)
|
193 |
return {"AI": float(ppl), "HUMAN": 1 - float(ppl)}
|
194 |
|
195 |
+
|
196 |
def ai_generated_test_gptzero(text):
    """Score ``text`` with a freshly constructed ``GPT2PPL`` instance.

    The score dict is printed and returned as the first element of a
    2-tuple; the second element is always ``None``.
    """
    scores = GPT2PPL()(text)
    print(scores)
    return scores, None
|
201 |
|
202 |
+
|
203 |
def highlighter_polygraf(text, model="Polygraf AI Watson (Base Model)"):
    """Delegate to ``process_text`` with the given detector and return its result."""
    outcome = process_text(text=text, model=model)
    return outcome
|
205 |
|
206 |
+
|
207 |
def ai_check(text: str, option: str):
|
208 |
if option.startswith("Polygraf AI"):
|
209 |
return highlighter_polygraf(text, option)
|
|
|
243 |
"""
|
244 |
return prompt
|
245 |
|
246 |
+
|
247 |
def regenerate_prompt(settings: Dict[str, str]) -> str:
|
248 |
prompt = f"""
|
249 |
"{settings['generated_article']}"
|
|
|
261 |
"""
|
262 |
return prompt
|
263 |
|
264 |
+
|
265 |
def generate_article(
|
266 |
topic: str,
|
267 |
keywords: str,
|
|
|
324 |
|
325 |
return clean_text(article)
|
326 |
|
327 |
+
|
328 |
def humanize(
|
329 |
text: str,
|
330 |
model: str,
|
|
|
343 |
)
|
344 |
return format_and_correct_language_check(result)
|
345 |
|
346 |
+
|
347 |
def update_visibility_api(model: str):
    """Return a Gradio update that is visible only for the OpenAI model choices."""
    is_openai_model = model in ("OpenAI GPT 3.5", "OpenAI GPT 4")
    return gr.update(visible=is_openai_model)
|
352 |
|
353 |
+
|
354 |
def format_references(text: str) -> str:
|
355 |
lines = text.split("\n")
|
356 |
references = []
|
|
|
373 |
|
374 |
return "\n\n".join(article_text) + "\n\nReferences:\n" + "\n".join(formatted_refs)
|
375 |
|
376 |
+
|
377 |
def generate_and_format(
|
378 |
topic,
|
379 |
keywords,
|
|
|
412 |
)
|
413 |
return format_references(article)
|
414 |
|
415 |
+
|
416 |
def create_interface():
|
417 |
with gr.Blocks(
|
418 |
theme=gr.themes.Default(
|
|
|
461 |
step=50,
|
462 |
value=1000,
|
463 |
label="Article Length",
|
464 |
+
elem_classes="input-highlight-pink",
|
465 |
)
|
466 |
|
467 |
with gr.Row():
|
|
|
593 |
label="Add comments to help edit generated text", interactive=True, visible=False
|
594 |
)
|
595 |
regenerate_btn = gr.Button("Regenerate Article", variant="primary", visible=False)
|
596 |
+
ai_detector_dropdown = gr.Radio(
|
597 |
+
choices=ai_check_options, label="Select AI Detector", value="Polygraf AI"
|
598 |
+
)
|
599 |
+
ai_check_btn = gr.Button("AI Check")
|
600 |
+
|
601 |
+
with gr.Accordion("AI Detection Results", open=True):
|
602 |
ai_check_result = gr.Label(label="AI Check Result")
|
603 |
+
highlighted_text = gr.HTML(label="Sentence Breakdown", visible=False)
|
604 |
humanize_btn = gr.Button("Humanize")
|
605 |
# humanized_output = gr.Textbox(label="Humanized Article", lines=20, elem_classes=["custom-textbox"])
|
606 |
humanized_output = gr.Markdown(label="Humanized Article", value="\n\n\n\n", render=True)
|
|
|
621 |
ai_detector_dropdown.change(highlight_visible, inputs=ai_detector_dropdown, outputs=highlighted_text)
|
622 |
output_article.change(become_visible, inputs=output_article, outputs=ai_comments)
|
623 |
ai_comments.change(become_visible, inputs=output_article, outputs=regenerate_btn)
|
624 |
+
ai_check_btn.click(highlight_visible, inputs=ai_detector_dropdown, outputs=highlighted_text)
|
625 |
|
626 |
generate_btn.click(
|
627 |
fn=generate_and_format,
|