fix double space in generated text + change humanizer to batched
- ai_generate.py +1 -22
- app.py +22 -18
- humanize.py +51 -36
ai_generate.py CHANGED

@@ -239,13 +239,6 @@ def generate_rag(
         return None
     db = create_db_with_langchain(path, url_content)
     retriever = db.as_retriever(search_type="mmr", search_kwargs={"k": K, "fetch_k": FETCH_K})
-    rag_prompt = hub.pull("rlm/rag-prompt")
-
-    def format_docs(docs):
-        if all(isinstance(doc, Document) for doc in docs):
-            return "\n\n".join(doc.page_content for doc in docs)
-        else:
-            raise TypeError("All items in docs must be instances of Document.")
 
     docs = retriever.get_relevant_documents(topic)
 
@@ -292,18 +285,4 @@ def generate(
     if path or url_content:
         return generate_rag(prompt, topic, model, url_content, path, temperature, max_length, api_key, sys_message)
     else:
-        return generate_base(prompt, topic, model, temperature, max_length, api_key, sys_message)
-
-# prompt = "Write a short 200 word report with an introduction about the current methods of ai detection and the results."
-# topic = "the current methods of ai detection"
-
-# text, citations = generate(
-#     prompt,
-#     topic,
-#     "OpenAI GPT 4o",
-#     None,
-#     ["./final_report.pdf","./detection_tools.pdf"],
-# )
-# from pprint import pprint
-# print(text)
-# print(citations)
+        return generate_base(prompt, topic, model, temperature, max_length, api_key, sys_message)
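For reference, the deleted scratch block at the bottom of the file exercised the dispatch that the second hunk keeps: passing PDF paths routes through generate_rag(), while passing neither path nor url_content falls through to generate_base(). A sketch of that call shape, reusing the example values from the removed comments and assuming the trailing parameters (temperature, max_length, api_key, sys_message) keep their defaults:

from ai_generate import generate

prompt = "Write a short 200 word report with an introduction about the current methods of ai detection and the results."
topic = "the current methods of ai detection"

# path is set, so this call takes the generate_rag() branch.
text, citations = generate(
    prompt,
    topic,
    "OpenAI GPT 4o",
    None,                                             # url_content
    ["./final_report.pdf", "./detection_tools.pdf"],  # path
)
print(text)
print(citations)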
app.py CHANGED

@@ -14,14 +14,15 @@ import torch
 import numpy as np
 from scipy.special import softmax
 import language_tool_python
-from transformers import AutoTokenizer, AutoModelForSequenceClassification
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
 
 from utils import remove_special_characters, split_text_allow_complete_sentences_nltk
 from google_search import google_search, months, domain_list, build_date
-from humanize import humanize_text, device
+from humanize import humanize_text, device
 from ai_generate import generate, citations_to_html, remove_citations, display_cited_text
 import nltk
-
+
+nltk.download("punkt_tab")
 
 print(f"Using device: {device}")
 
@@ -64,7 +65,8 @@ def clean_text(text: str) -> str:
         cleaned = re.sub(r"\s+", " ", paragraph).strip()
         cleaned = re.sub(r"(?<=\.) ([a-z])", lambda x: x.group(1).upper(), cleaned)
         cleaned_paragraphs.append(cleaned)
-
+    cleaned_paragraphs = [item for item in cleaned_paragraphs if item.strip()]
+    return "\n\n".join(cleaned_paragraphs)
 
 
 def format_references(text: str) -> str:
@@ -262,12 +264,12 @@ def ai_check(text: str, option: str):
 
 
 def generate_prompt(settings: Dict[str, str]) -> str:
-    settings[
+    settings["keywords"] = [item for item in settings["keywords"] if item.strip()]
     # - Add a "References" section in the format "References:" on a new line after the requested text, formatted as [1], [2], etc. with each source on their own line
     prompt = f"""
Write a {settings['article_length']} words (around) {settings['format']} on {settings['topic']}.\n
"""
-    if settings[
+    if settings["context"]:
         prompt += f"""
Context:
- {settings['context']}
@@ -282,7 +284,7 @@ Write a {settings['article_length']} words (around) {settings['format']} on {set
    - Depth: {settings['depth_of_content']}
    - Structure: {', '.join(settings['structure'])}
 """
-    if len(settings[
+    if len(settings["keywords"]) > 0:
         prompt += f"""
Keywords to incorporate:
{', '.join(settings['keywords'])}
@@ -384,10 +386,11 @@ def generate_article(
 
 
 def get_history(history):
-
-
-
-
+    return history
+    # history_formatted = []
+    # for entry in history:
+    #     history_formatted.append((entry[0], display_cited_text(entry[1])))
+    # return history_formatted
 
 
 def clear_history():
@@ -397,6 +400,7 @@ def clear_history():
 
 def humanize(
     model: str,
+    cited_text: str,
     temperature: float = 1.2,
    repetition_penalty: float = 1,
    top_k: int = 50,
@@ -405,9 +409,9 @@
 ) -> str:
     print("Humanizing text...")
     # body, references = split_text_from_refs(text)
-    cited_text = history[-1][1]
-    result =
-
+    # cited_text = history[-1][1]
+    result = humanize_text(
+        text=cited_text,
         model_name=model,
         temperature=temperature,
         repetition_penalty=repetition_penalty,
@@ -416,10 +420,9 @@
     )
     # result = result + references
     # corrected_text = format_and_correct_language_check(result)
-
     timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-    history.append((f"Humanized Text | {timestamp}\nInput: {model}", result))
-    return clean_text(
+    history.append((f"Humanized Text | {timestamp}\nInput: {model}", clean_text(result)))
+    return clean_text(result), history
 
 
 def update_visibility_api(model: str):
@@ -609,7 +612,7 @@ def generate_and_format(
 
     # reference_formatted = format_references(article)
     timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-    history.append((f"Generated Text | {timestamp}\nInput: {topic}", article))
+    history.append((f"Generated Text | {timestamp}\nInput: {topic}", clean_text(display_cited_text(article))))
 
     # Save the article and metadata to Cloud Storage
     # We dont save if there is PDF input for privacy reasons
@@ -1028,6 +1031,7 @@ def create_interface():
             fn=humanize,
             inputs=[
                 model_dropdown,
+                output_article,
                 temperature_slider,
                 repetition_penalty_slider,
                 top_k_slider,
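The clean_text hunk is the "double space" fix named in the commit title: empty strings produced by splitting on consecutive newlines used to survive into the final join, yielding doubled separators. A standalone sketch of the patched filter-then-join behavior:

import re

def clean_text(text: str) -> str:
    # Sketch of the patched clean_text: normalize whitespace per paragraph,
    # then drop empty paragraphs so the join cannot emit doubled separators.
    cleaned_paragraphs = []
    for paragraph in text.split("\n"):
        cleaned = re.sub(r"\s+", " ", paragraph).strip()
        cleaned = re.sub(r"(?<=\.) ([a-z])", lambda x: x.group(1).upper(), cleaned)
        cleaned_paragraphs.append(cleaned)
    cleaned_paragraphs = [item for item in cleaned_paragraphs if item.strip()]
    return "\n\n".join(cleaned_paragraphs)

print(clean_text("One  paragraph here.\n\n\nAnother one."))
# One paragraph here.
#
# Another one.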
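With cited_text added to humanize()'s signature, the click event feeds the article component into the handler directly instead of having it read history[-1][1]. A minimal Blocks sketch of that wiring; the button name and the stub handler are assumptions, and the real inputs list continues past top_k_slider and also threads the history state that this hunk truncates:

import gradio as gr

def humanize(model, cited_text, temperature=1.2, repetition_penalty=1.0, top_k=50):
    # Stand-in for the patched handler: it now receives the article text directly.
    return f"[{model}] {cited_text}"

with gr.Blocks() as demo:
    model_dropdown = gr.Dropdown(["Standard Model", "Advanced Model (Beta)"], label="Model")
    output_article = gr.Textbox(label="Article")
    temperature_slider = gr.Slider(0.5, 2.0, value=1.2, label="Temperature")
    repetition_penalty_slider = gr.Slider(1.0, 2.0, value=1.0, label="Repetition penalty")
    top_k_slider = gr.Slider(0, 100, value=50, step=1, label="Top k")
    humanize_btn = gr.Button("Humanize")  # hypothetical button name
    humanize_btn.click(
        fn=humanize,
        inputs=[model_dropdown, output_article, temperature_slider,
                repetition_penalty_slider, top_k_slider],
        outputs=[output_article],
    )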
humanize.py CHANGED

@@ -3,9 +3,9 @@ import torch
 import nltk
 from nltk import sent_tokenize
 import gradio as gr
-from peft import PeftModel
 from transformers import T5ForConditionalGeneration, T5Tokenizer
 import language_tool_python
+import re
 
 nltk.download("punkt")
 
@@ -53,10 +53,31 @@ print(f"Loaded model: {dec_only}, Num. params: {dec_only_model.num_parameters()}
 # grammar correction tool
 tool = language_tool_python.LanguageTool("en-US")
 
+
 def format_and_correct_language_check(text: str) -> str:
     return tool.correct(text)
 
-def humanize_batch_seq2seq(model, tokenizer, sentences, temperature, repetition_penalty, top_k, length_penalty):
+
+def extract_citations(text):
+    citations = re.findall(r"<(\d+)>", text)
+    return [int(citation) for citation in citations]
+
+
+def remove_citations(text):
+    text = re.sub(r"<\d+>", "", text)
+    text = re.sub(r"[\d+]", "", text)
+    return text
+
+
+def humanize_batch_seq2seq(
+    model,
+    tokenizer,
+    sentences,
+    temperature,
+    repetition_penalty,
+    top_k,
+    length_penalty,
+):
     inputs = ["Please paraphrase this sentence: " + sentence for sentence in sentences]
     inputs = tokenizer(inputs, return_tensors="pt", padding=True, truncation=True).to(model.device)
     outputs = model.generate(
@@ -72,7 +93,15 @@ def humanize_batch_seq2seq(model, tokenizer, sentences, temperature, repetition_penalty, top_k, length_penalty):
     return answers
 
 
-def humanize_batch_decoder_only(model, tokenizer, sentences, temperature, repetition_penalty, top_k, length_penalty):
+def humanize_batch_decoder_only(
+    model,
+    tokenizer,
+    sentences,
+    temperature,
+    repetition_penalty,
+    top_k,
+    length_penalty,
+):
     pre_prompt = "As a humanizer model, your task is to rewrite the following sentence to make it more human-like. Return only the paraphrased sentence. \n\n"
     # Construct the messages_batch using the tokenized sentences
     messages_batch = [{"from": "human", "value": f"{pre_prompt}{sentence}"} for sentence in sentences]
@@ -80,7 +109,12 @@ def humanize_batch_decoder_only(model, tokenizer, sentences, temperature, repetition_penalty, top_k, length_penalty):
     tokenizer = get_chat_template(
         tokenizer,
         chat_template="phi-3",
-        mapping={
+        mapping={
+            "role": "from",
+            "content": "value",
+            "user": "human",
+            "assistant": "gpt",
+        },  # ShareGPT style
     )
 
     # Enable native 2x faster inference
@@ -137,9 +171,11 @@ def humanize_text(
     Paragraphs are stored as a number of sentences per paragraph.
     """
     progress(0, desc="Starting to Humanize")
-
     # Map model names to their respective processing functions
-    model_map = {
+    model_map = {
+        "Standard Model": humanize_batch_seq2seq,
+        "Advanced Model (Beta)": humanize_batch_decoder_only,
+    }
     assert model_name in model_map, f"Invalid model name: {model_name}"
     process_function = model_map[model_name]
 
@@ -147,7 +183,10 @@ def humanize_text(
     paragraphs = text.split("\n")
     all_sentences = []
     sentences_per_paragraph = []
+    citations_per_paragraph = []
     for paragraph in paragraphs:
+        citations_per_paragraph.append(extract_citations(paragraph))
+        paragraph = remove_citations(paragraph)
         sentences = sent_tokenize(paragraph)
         sentences_per_paragraph.append(len(sentences))
         all_sentences.extend(sentences)
@@ -163,8 +202,8 @@ def humanize_text(
 
     # Call the selected processing function
     paraphrased_batch = process_function(
-        seq2seq_model if model_name == "Standard Model" else dec_only_model,
-        seq2seq_tokenizer if model_name == "Standard Model" else dec_only_tokenizer,
+        (seq2seq_model if model_name == "Standard Model" else dec_only_model),
+        (seq2seq_tokenizer if model_name == "Standard Model" else dec_only_tokenizer),
        batch_sentences,
        temperature,
        repetition_penalty,
@@ -195,32 +234,8 @@ def humanize_text(
         humanized_paragraph = " ".join(paraphrased_sentences[sentence_index : sentence_index + num_sentences])
         humanized_paragraphs.append(humanized_paragraph)
         sentence_index += num_sentences
-
-
+    for i, paragraph in enumerate(humanized_paragraphs):
+        citation_texts = [f"<{cid}>" for cid in citations_per_paragraph[i]]
+        humanized_paragraphs[i] = paragraph + " " + "".join(citation_texts)
+    humanized_text = "\n\n".join(humanized_paragraphs)
     return humanized_text
-
-
-def humanize_chunk(
-    data,
-    progress=gr.Progress(),
-    model_name="Standard Model",
-    temperature=1.2,
-    repetition_penalty=1.0,
-    top_k=50,
-    length_penalty=1.0,
-):
-    humanized_chunks = {'cited_text': []}
-    if 'cited_text' in data:
-        for item in data['cited_text']:
-            humanized_chunk = {'chunk': [{'text': ""}, {'citations': None}]}
-            if 'chunk' in item and len(item['chunk']) > 0:
-                chunk_text = item['chunk'][0].get('text')
-                humanized_chunk['chunk'][0] = {'text': format_and_correct_language_check(humanize_text(chunk_text))}
-
-            citation_ids = []
-            # Process the citations for the chunk
-            if len(item['chunk']) > 1 and item['chunk'][1]['citations']:
-                humanized_chunk['chunk'][1] = {'citations': item['chunk'][1]['citations']}
-            humanized_chunks['cited_text'].append(humanized_chunk)
-    return humanized_chunks
-
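Taken together, the humanize.py hunks implement a citation round-trip: extract_citations() records the <id> markers in each paragraph, remove_citations() strips them before sentence tokenization (note its second pattern, r"[\d+]", is a character class, so it also deletes bare digits and plus signs), and the tail of humanize_text() re-appends the markers to each paraphrased paragraph. A minimal sketch using the helper definitions from the hunk above, with an illustrative input:

import re

def extract_citations(text):
    # Collect the numeric ids of <id>-style citation markers, in order.
    return [int(c) for c in re.findall(r"<(\d+)>", text)]

def remove_citations(text):
    text = re.sub(r"<\d+>", "", text)  # drop <id> markers
    text = re.sub(r"[\d+]", "", text)  # character class: also drops bare digits and '+'
    return text

paragraph = "AI detectors rely on perplexity signals.<3> Accuracy varies by domain.<7>"
ids = extract_citations(paragraph)      # [3, 7]
stripped = remove_citations(paragraph)  # markers removed before sent_tokenize
rebuilt = stripped.strip() + " " + "".join(f"<{cid}>" for cid in ids)
print(rebuilt)
# AI detectors rely on perplexity signals. Accuracy varies by domain. <3><7>

One consequence of this design is that markers migrate to the end of their paragraph rather than staying attached to individual sentences.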
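Both paraphrasers now consume a whole batch of sentences per call instead of looping one sentence at a time. A self-contained sketch of the seq2seq path; t5-small is a placeholder checkpoint standing in for the Space's paraphrase model, which this diff does not name:

import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("t5-small")  # placeholder checkpoint
model = T5ForConditionalGeneration.from_pretrained("t5-small")

sentences = ["The results were good.", "The method is novel."]
# Same batched pattern as humanize_batch_seq2seq: one padded tokenizer call
# and one generate() call cover every sentence in the batch.
inputs = tokenizer(
    ["Please paraphrase this sentence: " + s for s in sentences],
    return_tensors="pt", padding=True, truncation=True,
)
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        do_sample=True,
        temperature=1.2,
        repetition_penalty=1.0,
        top_k=50,
        length_penalty=1.0,
        max_new_tokens=64,
    )
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))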