Spaces:

polygraf-ai
/

article_writer

Runtime error

App Files Files Community

minko186 committed on Sep 4, 2024

Commit

24a0ba5

1 Parent(s): c1769c1

fix double space on generated text + changed humanizer to batched

Browse files

Files changed (3) hide show

ai_generate.py +1 -22
app.py +22 -18
humanize.py +51 -36

ai_generate.py CHANGED Viewed

@@ -239,13 +239,6 @@ def generate_rag(
         return None
     db = create_db_with_langchain(path, url_content)
     retriever = db.as_retriever(search_type="mmr", search_kwargs={"k": K, "fetch_k": FETCH_K})
-    rag_prompt = hub.pull("rlm/rag-prompt")
-    def format_docs(docs):
-        if all(isinstance(doc, Document) for doc in docs):
-            return "\n\n".join(doc.page_content for doc in docs)
-        else:
-            raise TypeError("All items in docs must be instances of Document.")
     docs = retriever.get_relevant_documents(topic)
@@ -292,18 +285,4 @@ def generate(
     if path or url_content:
         return generate_rag(prompt, topic, model, url_content, path, temperature, max_length, api_key, sys_message)
     else:
-        return generate_base(prompt, topic, model, temperature, max_length, api_key, sys_message)
-# prompt = "Write a short 200 word report with an introduction about the current methods of ai detection and the results."
-# topic = "the current methods of ai detection"
-# text, citations = generate(
-#     prompt,
-#     topic,
-#     "OpenAI GPT 4o",
-#     None,
-#     ["./final_report.pdf","./detection_tools.pdf"],
-# )
-# from pprint import pprint
-# print(text)
-# print(citations)

         return None
     db = create_db_with_langchain(path, url_content)
     retriever = db.as_retriever(search_type="mmr", search_kwargs={"k": K, "fetch_k": FETCH_K})
     docs = retriever.get_relevant_documents(topic)
     if path or url_content:
         return generate_rag(prompt, topic, model, url_content, path, temperature, max_length, api_key, sys_message)
     else:
+        return generate_base(prompt, topic, model, temperature, max_length, api_key, sys_message)

app.py CHANGED Viewed

@@ -14,14 +14,15 @@ import torch
 import numpy as np
 from scipy.special import softmax
 import language_tool_python
-from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
 from utils import remove_special_characters, split_text_allow_complete_sentences_nltk
 from google_search import google_search, months, domain_list, build_date
-from humanize import humanize_text, device, humanize_chunk
 from ai_generate import generate, citations_to_html, remove_citations, display_cited_text
 import nltk
-nltk.download('punkt_tab')
 print(f"Using device: {device}")
@@ -64,7 +65,8 @@ def clean_text(text: str) -> str:
         cleaned = re.sub(r"\s+", " ", paragraph).strip()
         cleaned = re.sub(r"(?<=\.) ([a-z])", lambda x: x.group(1).upper(), cleaned)
         cleaned_paragraphs.append(cleaned)
-    return "\n".join(cleaned_paragraphs)
 def format_references(text: str) -> str:
@@ -262,12 +264,12 @@ def ai_check(text: str, option: str):
 def generate_prompt(settings: Dict[str, str]) -> str:
-    settings['keywords'] = [item for item in settings['keywords'] if item.strip()]
     #    - Add a "References" section in the format "References:" on a new line after the requested text, formatted as [1], [2], etc. with each source on their own line
     prompt = f"""
 Write a {settings['article_length']} words (around) {settings['format']} on {settings['topic']}.\n
     """
-    if settings['context']:
         prompt += f"""
     Context:
     - {settings['context']}
@@ -282,7 +284,7 @@ Write a {settings['article_length']} words (around) {settings['format']} on {set
     - Depth: {settings['depth_of_content']}
     - Structure: {', '.join(settings['structure'])}
     """
-    if len(settings['keywords']) > 0:
         prompt += f"""
     Keywords to incorporate:
     {', '.join(settings['keywords'])}
@@ -384,10 +386,11 @@ def generate_article(
 def get_history(history):
-    history_formatted = []
-    for entry in history:
-        history_formatted.append((entry[0], display_cited_text(entry[1])))
-    return history_formatted
 def clear_history():
@@ -397,6 +400,7 @@ def clear_history():
 def humanize(
     model: str,
     temperature: float = 1.2,
     repetition_penalty: float = 1,
     top_k: int = 50,
@@ -405,9 +409,9 @@ def humanize(
 ) -> str:
     print("Humanizing text...")
     # body, references = split_text_from_refs(text)
-    cited_text = history[-1][1]
-    result = humanize_chunk(
-        data = cited_text,
         model_name=model,
         temperature=temperature,
         repetition_penalty=repetition_penalty,
@@ -416,10 +420,9 @@ def humanize(
     )
     # result = result + references
     # corrected_text = format_and_correct_language_check(result)
     timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-    history.append((f"Humanized Text | {timestamp}\nInput: {model}", result))
-    return clean_text(display_cited_text(result)), history
 def update_visibility_api(model: str):
@@ -609,7 +612,7 @@ def generate_and_format(
     # reference_formatted = format_references(article)
     timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-    history.append((f"Generated Text | {timestamp}\nInput: {topic}", article))
     # Save the article and metadata to Cloud Storage
     # We dont save if there is PDF input for privacy reasons
@@ -1028,6 +1031,7 @@ def create_interface():
             fn=humanize,
             inputs=[
                 model_dropdown,
                 temperature_slider,
                 repetition_penalty_slider,
                 top_k_slider,

 import numpy as np
 from scipy.special import softmax
 import language_tool_python
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
 from utils import remove_special_characters, split_text_allow_complete_sentences_nltk
 from google_search import google_search, months, domain_list, build_date
+from humanize import humanize_text, device
 from ai_generate import generate, citations_to_html, remove_citations, display_cited_text
 import nltk
+nltk.download("punkt_tab")
 print(f"Using device: {device}")
         cleaned = re.sub(r"\s+", " ", paragraph).strip()
         cleaned = re.sub(r"(?<=\.) ([a-z])", lambda x: x.group(1).upper(), cleaned)
         cleaned_paragraphs.append(cleaned)
+    cleaned_paragraphs = [item for item in cleaned_paragraphs if item.strip()]
+    return "\n\n".join(cleaned_paragraphs)
 def format_references(text: str) -> str:
 def generate_prompt(settings: Dict[str, str]) -> str:
+    settings["keywords"] = [item for item in settings["keywords"] if item.strip()]
     #    - Add a "References" section in the format "References:" on a new line after the requested text, formatted as [1], [2], etc. with each source on their own line
     prompt = f"""
 Write a {settings['article_length']} words (around) {settings['format']} on {settings['topic']}.\n
     """
+    if settings["context"]:
         prompt += f"""
     Context:
     - {settings['context']}
     - Depth: {settings['depth_of_content']}
     - Structure: {', '.join(settings['structure'])}
     """
+    if len(settings["keywords"]) > 0:
         prompt += f"""
     Keywords to incorporate:
     {', '.join(settings['keywords'])}
 def get_history(history):
+    return history
+    # history_formatted = []
+    # for entry in history:
+    #     history_formatted.append((entry[0], display_cited_text(entry[1])))
+    # return history_formatted
 def clear_history():
 def humanize(
     model: str,
+    cited_text: str,
     temperature: float = 1.2,
     repetition_penalty: float = 1,
     top_k: int = 50,
 ) -> str:
     print("Humanizing text...")
     # body, references = split_text_from_refs(text)
+    # cited_text = history[-1][1]
+    result = humanize_text(
+        text=cited_text,
         model_name=model,
         temperature=temperature,
         repetition_penalty=repetition_penalty,
     )
     # result = result + references
     # corrected_text = format_and_correct_language_check(result)
     timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    history.append((f"Humanized Text | {timestamp}\nInput: {model}", clean_text(result)))
+    return clean_text(result), history
 def update_visibility_api(model: str):
     # reference_formatted = format_references(article)
     timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    history.append((f"Generated Text | {timestamp}\nInput: {topic}", clean_text(display_cited_text(article))))
     # Save the article and metadata to Cloud Storage
     # We dont save if there is PDF input for privacy reasons
             fn=humanize,
             inputs=[
                 model_dropdown,
+                output_article,
                 temperature_slider,
                 repetition_penalty_slider,
                 top_k_slider,

humanize.py CHANGED Viewed

@@ -3,9 +3,9 @@ import torch
 import nltk
 from nltk import sent_tokenize
 import gradio as gr
-from peft import PeftModel
 from transformers import T5ForConditionalGeneration, T5Tokenizer
 import language_tool_python
 nltk.download("punkt")
@@ -53,10 +53,31 @@ print(f"Loaded model: {dec_only}, Num. params: {dec_only_model.num_parameters()}
 # grammar correction tool
 tool = language_tool_python.LanguageTool("en-US")
 def format_and_correct_language_check(text: str) -> str:
     return tool.correct(text)
-def humanize_batch_seq2seq(model, tokenizer, sentences, temperature, repetition_penalty, top_k, length_penalty):
     inputs = ["Please paraphrase this sentence: " + sentence for sentence in sentences]
     inputs = tokenizer(inputs, return_tensors="pt", padding=True, truncation=True).to(model.device)
     outputs = model.generate(
@@ -72,7 +93,15 @@ def humanize_batch_seq2seq(model, tokenizer, sentences, temperature, repetition_
     return answers
-def humanize_batch_decoder_only(model, tokenizer, sentences, temperature, repetition_penalty, top_k, length_penalty):
     pre_prompt = "As a humanizer model, your task is to rewrite the following sentence to make it more human-like. Return only the paraphrased sentence. \n\n"
     # Construct the messages_batch using the tokenized sentences
     messages_batch = [{"from": "human", "value": f"{pre_prompt}{sentence}"} for sentence in sentences]
@@ -80,7 +109,12 @@ def humanize_batch_decoder_only(model, tokenizer, sentences, temperature, repeti
     tokenizer = get_chat_template(
         tokenizer,
         chat_template="phi-3",
-        mapping={"role": "from", "content": "value", "user": "human", "assistant": "gpt"},  # ShareGPT style
     )
     # Enable native 2x faster inference
@@ -137,9 +171,11 @@ def humanize_text(
     Paragraphs are stored as a number of sentences per paragraph.
     """
     progress(0, desc="Starting to Humanize")
     # Map model names to their respective processing functions
-    model_map = {"Standard Model": humanize_batch_seq2seq, "Advanced Model (Beta)": humanize_batch_decoder_only}
     assert model_name in model_map, f"Invalid model name: {model_name}"
     process_function = model_map[model_name]
@@ -147,7 +183,10 @@ def humanize_text(
     paragraphs = text.split("\n")
     all_sentences = []
     sentences_per_paragraph = []
     for paragraph in paragraphs:
         sentences = sent_tokenize(paragraph)
         sentences_per_paragraph.append(len(sentences))
         all_sentences.extend(sentences)
@@ -163,8 +202,8 @@ def humanize_text(
             # Call the selected processing function
             paraphrased_batch = process_function(
-                seq2seq_model if model_name == "Standard Model" else dec_only_model,
-                seq2seq_tokenizer if model_name == "Standard Model" else dec_only_tokenizer,
                 batch_sentences,
                 temperature,
                 repetition_penalty,
@@ -195,32 +234,8 @@ def humanize_text(
         humanized_paragraph = " ".join(paraphrased_sentences[sentence_index : sentence_index + num_sentences])
         humanized_paragraphs.append(humanized_paragraph)
         sentence_index += num_sentences
-    humanized_text = "\n".join(humanized_paragraphs)
     return humanized_text
-def humanize_chunk(
-    data,
-    progress=gr.Progress(),
-    model_name="Standard Model",
-    temperature=1.2,
-    repetition_penalty=1.0,
-    top_k=50,
-    length_penalty=1.0,
-):
-    humanized_chunks = {'cited_text': []}
-    if 'cited_text' in data:
-        for item in data['cited_text']:
-            humanized_chunk = {'chunk': [{'text': ""}, {'citations': None}]}
-            if 'chunk' in item and len(item['chunk']) > 0:
-                chunk_text = item['chunk'][0].get('text')
-                humanized_chunk['chunk'][0] = {'text': format_and_correct_language_check(humanize_text(chunk_text))}
-                citation_ids = []
-                # Process the citations for the chunk
-                if len(item['chunk']) > 1 and item['chunk'][1]['citations']:
-                    humanized_chunk['chunk'][1] = {'citations': item['chunk'][1]['citations']}
-            humanized_chunks['cited_text'].append(humanized_chunk)
-    return humanized_chunks

 import nltk
 from nltk import sent_tokenize
 import gradio as gr
 from transformers import T5ForConditionalGeneration, T5Tokenizer
 import language_tool_python
+import re
 nltk.download("punkt")
 # grammar correction tool
 tool = language_tool_python.LanguageTool("en-US")
 def format_and_correct_language_check(text: str) -> str:
     return tool.correct(text)
def extract_citations(text):
    """Collect the numeric ids of every ``<N>`` citation marker in ``text``.

    Args:
        text: String that may contain angle-bracket markers such as ``<3>``.

    Returns:
        A list of ints, one per marker, in order of appearance. Repeated
        ids are kept; the list is empty when no marker is found.
    """
    return [int(marker.group(1)) for marker in re.finditer(r"<(\d+)>", text)]
def remove_citations(text):
    """Strip citation markers from ``text``.

    Two marker styles are removed: angle-bracket markers such as ``<12>``
    and square-bracket markers such as ``[12]``. All other characters,
    including ordinary digits and ``+`` signs in the prose, are kept.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every ``<digits>`` and ``[digits]`` marker deleted.
        Whitespace around a removed marker is not collapsed.
    """
    text = re.sub(r"<\d+>", "", text)
    # The square brackets must be escaped: an unescaped ``[\d+]`` is a
    # character class that would delete every digit and plus sign.
    text = re.sub(r"\[\d+\]", "", text)
    return text
+def humanize_batch_seq2seq(
+    model,
+    tokenizer,
+    sentences,
+    temperature,
+    repetition_penalty,
+    top_k,
+    length_penalty,
+):
     inputs = ["Please paraphrase this sentence: " + sentence for sentence in sentences]
     inputs = tokenizer(inputs, return_tensors="pt", padding=True, truncation=True).to(model.device)
     outputs = model.generate(
     return answers
+def humanize_batch_decoder_only(
+    model,
+    tokenizer,
+    sentences,
+    temperature,
+    repetition_penalty,
+    top_k,
+    length_penalty,
+):
     pre_prompt = "As a humanizer model, your task is to rewrite the following sentence to make it more human-like. Return only the paraphrased sentence. \n\n"
     # Construct the messages_batch using the tokenized sentences
     messages_batch = [{"from": "human", "value": f"{pre_prompt}{sentence}"} for sentence in sentences]
     tokenizer = get_chat_template(
         tokenizer,
         chat_template="phi-3",
+        mapping={
+            "role": "from",
+            "content": "value",
+            "user": "human",
+            "assistant": "gpt",
+        },  # ShareGPT style
     )
     # Enable native 2x faster inference
     Paragraphs are stored as a number of sentences per paragraph.
     """
     progress(0, desc="Starting to Humanize")
     # Map model names to their respective processing functions
+    model_map = {
+        "Standard Model": humanize_batch_seq2seq,
+        "Advanced Model (Beta)": humanize_batch_decoder_only,
+    }
     assert model_name in model_map, f"Invalid model name: {model_name}"
     process_function = model_map[model_name]
     paragraphs = text.split("\n")
     all_sentences = []
     sentences_per_paragraph = []
+    citations_per_paragraph = []
     for paragraph in paragraphs:
+        citations_per_paragraph.append(extract_citations(paragraph))
+        paragraph = remove_citations(paragraph)
         sentences = sent_tokenize(paragraph)
         sentences_per_paragraph.append(len(sentences))
         all_sentences.extend(sentences)
             # Call the selected processing function
             paraphrased_batch = process_function(
+                (seq2seq_model if model_name == "Standard Model" else dec_only_model),
+                (seq2seq_tokenizer if model_name == "Standard Model" else dec_only_tokenizer),
                 batch_sentences,
                 temperature,
                 repetition_penalty,
         humanized_paragraph = " ".join(paraphrased_sentences[sentence_index : sentence_index + num_sentences])
         humanized_paragraphs.append(humanized_paragraph)
         sentence_index += num_sentences
+    for i, paragraph in enumerate(humanized_paragraphs):
+        citation_texts = [f"<{cid}>" for cid in citations_per_paragraph[i]]
+        humanized_paragraphs[i] = paragraph + " " + "".join(citation_texts)
+    humanized_text = "\n\n".join(humanized_paragraphs)
     return humanized_text