minko186 committed on
Commit
24a0ba5
·
1 Parent(s): c1769c1

fix double space on generated text + changed humanizer to batched

Browse files
Files changed (3) hide show
  1. ai_generate.py +1 -22
  2. app.py +22 -18
  3. humanize.py +51 -36
ai_generate.py CHANGED
@@ -239,13 +239,6 @@ def generate_rag(
239
  return None
240
  db = create_db_with_langchain(path, url_content)
241
  retriever = db.as_retriever(search_type="mmr", search_kwargs={"k": K, "fetch_k": FETCH_K})
242
- rag_prompt = hub.pull("rlm/rag-prompt")
243
-
244
- def format_docs(docs):
245
- if all(isinstance(doc, Document) for doc in docs):
246
- return "\n\n".join(doc.page_content for doc in docs)
247
- else:
248
- raise TypeError("All items in docs must be instances of Document.")
249
 
250
  docs = retriever.get_relevant_documents(topic)
251
 
@@ -292,18 +285,4 @@ def generate(
292
  if path or url_content:
293
  return generate_rag(prompt, topic, model, url_content, path, temperature, max_length, api_key, sys_message)
294
  else:
295
- return generate_base(prompt, topic, model, temperature, max_length, api_key, sys_message)
296
-
297
- # prompt = "Write a short 200 word report with an introduction about the current methods of ai detection and the results."
298
- # topic = "the current methods of ai detection"
299
-
300
- # text, citations = generate(
301
- # prompt,
302
- # topic,
303
- # "OpenAI GPT 4o",
304
- # None,
305
- # ["./final_report.pdf","./detection_tools.pdf"],
306
- # )
307
- # from pprint import pprint
308
- # print(text)
309
- # print(citations)
 
239
  return None
240
  db = create_db_with_langchain(path, url_content)
241
  retriever = db.as_retriever(search_type="mmr", search_kwargs={"k": K, "fetch_k": FETCH_K})
 
 
 
 
 
 
 
242
 
243
  docs = retriever.get_relevant_documents(topic)
244
 
 
285
  if path or url_content:
286
  return generate_rag(prompt, topic, model, url_content, path, temperature, max_length, api_key, sys_message)
287
  else:
288
+ return generate_base(prompt, topic, model, temperature, max_length, api_key, sys_message)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py CHANGED
@@ -14,14 +14,15 @@ import torch
14
  import numpy as np
15
  from scipy.special import softmax
16
  import language_tool_python
17
- from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
18
 
19
  from utils import remove_special_characters, split_text_allow_complete_sentences_nltk
20
  from google_search import google_search, months, domain_list, build_date
21
- from humanize import humanize_text, device, humanize_chunk
22
  from ai_generate import generate, citations_to_html, remove_citations, display_cited_text
23
  import nltk
24
- nltk.download('punkt_tab')
 
25
 
26
  print(f"Using device: {device}")
27
 
@@ -64,7 +65,8 @@ def clean_text(text: str) -> str:
64
  cleaned = re.sub(r"\s+", " ", paragraph).strip()
65
  cleaned = re.sub(r"(?<=\.) ([a-z])", lambda x: x.group(1).upper(), cleaned)
66
  cleaned_paragraphs.append(cleaned)
67
- return "\n".join(cleaned_paragraphs)
 
68
 
69
 
70
  def format_references(text: str) -> str:
@@ -262,12 +264,12 @@ def ai_check(text: str, option: str):
262
 
263
 
264
  def generate_prompt(settings: Dict[str, str]) -> str:
265
- settings['keywords'] = [item for item in settings['keywords'] if item.strip()]
266
  # - Add a "References" section in the format "References:" on a new line after the requested text, formatted as [1], [2], etc. with each source on their own line
267
  prompt = f"""
268
  Write a {settings['article_length']} words (around) {settings['format']} on {settings['topic']}.\n
269
  """
270
- if settings['context']:
271
  prompt += f"""
272
  Context:
273
  - {settings['context']}
@@ -282,7 +284,7 @@ Write a {settings['article_length']} words (around) {settings['format']} on {set
282
  - Depth: {settings['depth_of_content']}
283
  - Structure: {', '.join(settings['structure'])}
284
  """
285
- if len(settings['keywords']) > 0:
286
  prompt += f"""
287
  Keywords to incorporate:
288
  {', '.join(settings['keywords'])}
@@ -384,10 +386,11 @@ def generate_article(
384
 
385
 
386
  def get_history(history):
387
- history_formatted = []
388
- for entry in history:
389
- history_formatted.append((entry[0], display_cited_text(entry[1])))
390
- return history_formatted
 
391
 
392
 
393
  def clear_history():
@@ -397,6 +400,7 @@ def clear_history():
397
 
398
  def humanize(
399
  model: str,
 
400
  temperature: float = 1.2,
401
  repetition_penalty: float = 1,
402
  top_k: int = 50,
@@ -405,9 +409,9 @@ def humanize(
405
  ) -> str:
406
  print("Humanizing text...")
407
  # body, references = split_text_from_refs(text)
408
- cited_text = history[-1][1]
409
- result = humanize_chunk(
410
- data = cited_text,
411
  model_name=model,
412
  temperature=temperature,
413
  repetition_penalty=repetition_penalty,
@@ -416,10 +420,9 @@ def humanize(
416
  )
417
  # result = result + references
418
  # corrected_text = format_and_correct_language_check(result)
419
-
420
  timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
421
- history.append((f"Humanized Text | {timestamp}\nInput: {model}", result))
422
- return clean_text(display_cited_text(result)), history
423
 
424
 
425
  def update_visibility_api(model: str):
@@ -609,7 +612,7 @@ def generate_and_format(
609
 
610
  # reference_formatted = format_references(article)
611
  timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
612
- history.append((f"Generated Text | {timestamp}\nInput: {topic}", article))
613
 
614
  # Save the article and metadata to Cloud Storage
615
  # We dont save if there is PDF input for privacy reasons
@@ -1028,6 +1031,7 @@ def create_interface():
1028
  fn=humanize,
1029
  inputs=[
1030
  model_dropdown,
 
1031
  temperature_slider,
1032
  repetition_penalty_slider,
1033
  top_k_slider,
 
14
  import numpy as np
15
  from scipy.special import softmax
16
  import language_tool_python
17
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
18
 
19
  from utils import remove_special_characters, split_text_allow_complete_sentences_nltk
20
  from google_search import google_search, months, domain_list, build_date
21
+ from humanize import humanize_text, device
22
  from ai_generate import generate, citations_to_html, remove_citations, display_cited_text
23
  import nltk
24
+
25
+ nltk.download("punkt_tab")
26
 
27
  print(f"Using device: {device}")
28
 
 
65
  cleaned = re.sub(r"\s+", " ", paragraph).strip()
66
  cleaned = re.sub(r"(?<=\.) ([a-z])", lambda x: x.group(1).upper(), cleaned)
67
  cleaned_paragraphs.append(cleaned)
68
+ cleaned_paragraphs = [item for item in cleaned_paragraphs if item.strip()]
69
+ return "\n\n".join(cleaned_paragraphs)
70
 
71
 
72
  def format_references(text: str) -> str:
 
264
 
265
 
266
  def generate_prompt(settings: Dict[str, str]) -> str:
267
+ settings["keywords"] = [item for item in settings["keywords"] if item.strip()]
268
  # - Add a "References" section in the format "References:" on a new line after the requested text, formatted as [1], [2], etc. with each source on their own line
269
  prompt = f"""
270
  Write a {settings['article_length']} words (around) {settings['format']} on {settings['topic']}.\n
271
  """
272
+ if settings["context"]:
273
  prompt += f"""
274
  Context:
275
  - {settings['context']}
 
284
  - Depth: {settings['depth_of_content']}
285
  - Structure: {', '.join(settings['structure'])}
286
  """
287
+ if len(settings["keywords"]) > 0:
288
  prompt += f"""
289
  Keywords to incorporate:
290
  {', '.join(settings['keywords'])}
 
386
 
387
 
388
  def get_history(history):
389
+ return history
390
+ # history_formatted = []
391
+ # for entry in history:
392
+ # history_formatted.append((entry[0], display_cited_text(entry[1])))
393
+ # return history_formatted
394
 
395
 
396
  def clear_history():
 
400
 
401
  def humanize(
402
  model: str,
403
+ cited_text: str,
404
  temperature: float = 1.2,
405
  repetition_penalty: float = 1,
406
  top_k: int = 50,
 
409
  ) -> str:
410
  print("Humanizing text...")
411
  # body, references = split_text_from_refs(text)
412
+ # cited_text = history[-1][1]
413
+ result = humanize_text(
414
+ text=cited_text,
415
  model_name=model,
416
  temperature=temperature,
417
  repetition_penalty=repetition_penalty,
 
420
  )
421
  # result = result + references
422
  # corrected_text = format_and_correct_language_check(result)
 
423
  timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
424
+ history.append((f"Humanized Text | {timestamp}\nInput: {model}", clean_text(result)))
425
+ return clean_text(result), history
426
 
427
 
428
  def update_visibility_api(model: str):
 
612
 
613
  # reference_formatted = format_references(article)
614
  timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
615
+ history.append((f"Generated Text | {timestamp}\nInput: {topic}", clean_text(display_cited_text(article))))
616
 
617
  # Save the article and metadata to Cloud Storage
618
  # We dont save if there is PDF input for privacy reasons
 
1031
  fn=humanize,
1032
  inputs=[
1033
  model_dropdown,
1034
+ output_article,
1035
  temperature_slider,
1036
  repetition_penalty_slider,
1037
  top_k_slider,
humanize.py CHANGED
@@ -3,9 +3,9 @@ import torch
3
  import nltk
4
  from nltk import sent_tokenize
5
  import gradio as gr
6
- from peft import PeftModel
7
  from transformers import T5ForConditionalGeneration, T5Tokenizer
8
  import language_tool_python
 
9
 
10
  nltk.download("punkt")
11
 
@@ -53,10 +53,31 @@ print(f"Loaded model: {dec_only}, Num. params: {dec_only_model.num_parameters()}
53
  # grammar correction tool
54
  tool = language_tool_python.LanguageTool("en-US")
55
 
 
56
  def format_and_correct_language_check(text: str) -> str:
57
  return tool.correct(text)
58
 
59
- def humanize_batch_seq2seq(model, tokenizer, sentences, temperature, repetition_penalty, top_k, length_penalty):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  inputs = ["Please paraphrase this sentence: " + sentence for sentence in sentences]
61
  inputs = tokenizer(inputs, return_tensors="pt", padding=True, truncation=True).to(model.device)
62
  outputs = model.generate(
@@ -72,7 +93,15 @@ def humanize_batch_seq2seq(model, tokenizer, sentences, temperature, repetition_
72
  return answers
73
 
74
 
75
- def humanize_batch_decoder_only(model, tokenizer, sentences, temperature, repetition_penalty, top_k, length_penalty):
 
 
 
 
 
 
 
 
76
  pre_prompt = "As a humanizer model, your task is to rewrite the following sentence to make it more human-like. Return only the paraphrased sentence. \n\n"
77
  # Construct the messages_batch using the tokenized sentences
78
  messages_batch = [{"from": "human", "value": f"{pre_prompt}{sentence}"} for sentence in sentences]
@@ -80,7 +109,12 @@ def humanize_batch_decoder_only(model, tokenizer, sentences, temperature, repeti
80
  tokenizer = get_chat_template(
81
  tokenizer,
82
  chat_template="phi-3",
83
- mapping={"role": "from", "content": "value", "user": "human", "assistant": "gpt"}, # ShareGPT style
 
 
 
 
 
84
  )
85
 
86
  # Enable native 2x faster inference
@@ -137,9 +171,11 @@ def humanize_text(
137
  Paragraphs are stored as a number of sentences per paragraph.
138
  """
139
  progress(0, desc="Starting to Humanize")
140
-
141
  # Map model names to their respective processing functions
142
- model_map = {"Standard Model": humanize_batch_seq2seq, "Advanced Model (Beta)": humanize_batch_decoder_only}
 
 
 
143
  assert model_name in model_map, f"Invalid model name: {model_name}"
144
  process_function = model_map[model_name]
145
 
@@ -147,7 +183,10 @@ def humanize_text(
147
  paragraphs = text.split("\n")
148
  all_sentences = []
149
  sentences_per_paragraph = []
 
150
  for paragraph in paragraphs:
 
 
151
  sentences = sent_tokenize(paragraph)
152
  sentences_per_paragraph.append(len(sentences))
153
  all_sentences.extend(sentences)
@@ -163,8 +202,8 @@ def humanize_text(
163
 
164
  # Call the selected processing function
165
  paraphrased_batch = process_function(
166
- seq2seq_model if model_name == "Standard Model" else dec_only_model,
167
- seq2seq_tokenizer if model_name == "Standard Model" else dec_only_tokenizer,
168
  batch_sentences,
169
  temperature,
170
  repetition_penalty,
@@ -195,32 +234,8 @@ def humanize_text(
195
  humanized_paragraph = " ".join(paraphrased_sentences[sentence_index : sentence_index + num_sentences])
196
  humanized_paragraphs.append(humanized_paragraph)
197
  sentence_index += num_sentences
198
-
199
- humanized_text = "\n".join(humanized_paragraphs)
 
 
200
  return humanized_text
201
-
202
-
203
- def humanize_chunk(
204
- data,
205
- progress=gr.Progress(),
206
- model_name="Standard Model",
207
- temperature=1.2,
208
- repetition_penalty=1.0,
209
- top_k=50,
210
- length_penalty=1.0,
211
- ):
212
- humanized_chunks = {'cited_text': []}
213
- if 'cited_text' in data:
214
- for item in data['cited_text']:
215
- humanized_chunk = {'chunk': [{'text': ""}, {'citations': None}]}
216
- if 'chunk' in item and len(item['chunk']) > 0:
217
- chunk_text = item['chunk'][0].get('text')
218
- humanized_chunk['chunk'][0] = {'text': format_and_correct_language_check(humanize_text(chunk_text))}
219
-
220
- citation_ids = []
221
- # Process the citations for the chunk
222
- if len(item['chunk']) > 1 and item['chunk'][1]['citations']:
223
- humanized_chunk['chunk'][1] = {'citations': item['chunk'][1]['citations']}
224
- humanized_chunks['cited_text'].append(humanized_chunk)
225
- return humanized_chunks
226
-
 
3
  import nltk
4
  from nltk import sent_tokenize
5
  import gradio as gr
 
6
  from transformers import T5ForConditionalGeneration, T5Tokenizer
7
  import language_tool_python
8
+ import re
9
 
10
  nltk.download("punkt")
11
 
 
53
  # grammar correction tool
54
  tool = language_tool_python.LanguageTool("en-US")
55
 
56
+
57
  def format_and_correct_language_check(text: str) -> str:
58
  return tool.correct(text)
59
 
60
+
61
+ def extract_citations(text):
62
+ citations = re.findall(r"<(\d+)>", text)
63
+ return [int(citation) for citation in citations]
64
+
65
+
66
+ def remove_citations(text):
67
+ text = re.sub(r"<\d+>", "", text)
68
+ text = re.sub(r"[\d+]", "", text)
69
+ return text
70
+
71
+
72
+ def humanize_batch_seq2seq(
73
+ model,
74
+ tokenizer,
75
+ sentences,
76
+ temperature,
77
+ repetition_penalty,
78
+ top_k,
79
+ length_penalty,
80
+ ):
81
  inputs = ["Please paraphrase this sentence: " + sentence for sentence in sentences]
82
  inputs = tokenizer(inputs, return_tensors="pt", padding=True, truncation=True).to(model.device)
83
  outputs = model.generate(
 
93
  return answers
94
 
95
 
96
+ def humanize_batch_decoder_only(
97
+ model,
98
+ tokenizer,
99
+ sentences,
100
+ temperature,
101
+ repetition_penalty,
102
+ top_k,
103
+ length_penalty,
104
+ ):
105
  pre_prompt = "As a humanizer model, your task is to rewrite the following sentence to make it more human-like. Return only the paraphrased sentence. \n\n"
106
  # Construct the messages_batch using the tokenized sentences
107
  messages_batch = [{"from": "human", "value": f"{pre_prompt}{sentence}"} for sentence in sentences]
 
109
  tokenizer = get_chat_template(
110
  tokenizer,
111
  chat_template="phi-3",
112
+ mapping={
113
+ "role": "from",
114
+ "content": "value",
115
+ "user": "human",
116
+ "assistant": "gpt",
117
+ }, # ShareGPT style
118
  )
119
 
120
  # Enable native 2x faster inference
 
171
  Paragraphs are stored as a number of sentences per paragraph.
172
  """
173
  progress(0, desc="Starting to Humanize")
 
174
  # Map model names to their respective processing functions
175
+ model_map = {
176
+ "Standard Model": humanize_batch_seq2seq,
177
+ "Advanced Model (Beta)": humanize_batch_decoder_only,
178
+ }
179
  assert model_name in model_map, f"Invalid model name: {model_name}"
180
  process_function = model_map[model_name]
181
 
 
183
  paragraphs = text.split("\n")
184
  all_sentences = []
185
  sentences_per_paragraph = []
186
+ citations_per_paragraph = []
187
  for paragraph in paragraphs:
188
+ citations_per_paragraph.append(extract_citations(paragraph))
189
+ paragraph = remove_citations(paragraph)
190
  sentences = sent_tokenize(paragraph)
191
  sentences_per_paragraph.append(len(sentences))
192
  all_sentences.extend(sentences)
 
202
 
203
  # Call the selected processing function
204
  paraphrased_batch = process_function(
205
+ (seq2seq_model if model_name == "Standard Model" else dec_only_model),
206
+ (seq2seq_tokenizer if model_name == "Standard Model" else dec_only_tokenizer),
207
  batch_sentences,
208
  temperature,
209
  repetition_penalty,
 
234
  humanized_paragraph = " ".join(paraphrased_sentences[sentence_index : sentence_index + num_sentences])
235
  humanized_paragraphs.append(humanized_paragraph)
236
  sentence_index += num_sentences
237
+ for i, paragraph in enumerate(humanized_paragraphs):
238
+ citation_texts = [f"<{cid}>" for cid in citations_per_paragraph[i]]
239
+ humanized_paragraphs[i] = paragraph + " " + "".join(citation_texts)
240
+ humanized_text = "\n\n".join(humanized_paragraphs)
241
  return humanized_text