minko186 committed on
Commit
e76dfe8
·
1 Parent(s): 95168db

add inline citations + page content

Browse files
Files changed (3) hide show
  1. .gitignore +1 -0
  2. ai_generate.py +109 -6
  3. app.py +30 -25
.gitignore CHANGED
@@ -4,4 +4,5 @@ nohup.out
4
  *.out
5
  *.log
6
  *.json
 
7
  temp.py
 
4
  *.out
5
  *.log
6
  *.json
7
+ *.pdf
8
  temp.py
ai_generate.py CHANGED
@@ -15,6 +15,8 @@ from langchain_openai import ChatOpenAI
15
  from langchain_google_genai import ChatGoogleGenerativeAI
16
  from langchain_anthropic import ChatAnthropic
17
  from dotenv import load_dotenv
 
 
18
 
19
  load_dotenv()
20
 
@@ -47,6 +49,99 @@ llm_classes = {
47
  }
48
 
49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  def load_llm(model: str, api_key: str, temperature: float = 1.0, max_length: int = 2048):
51
  model_name = llm_model_translation.get(model)
52
  llm_class = llm_classes.get(model_name)
@@ -108,15 +203,23 @@ def generate_rag(
108
  rag_prompt = hub.pull("rlm/rag-prompt")
109
 
110
  def format_docs(docs):
111
- return "\n\n".join(doc.page_content for doc in docs)
 
 
 
112
 
113
  docs = retriever.get_relevant_documents(topic)
114
- formatted_docs = format_docs(docs)
 
115
  rag_chain = (
116
- {"context": lambda _: formatted_docs, "question": RunnablePassthrough()} | rag_prompt | llm | StrOutputParser()
 
 
 
117
  )
118
- return rag_chain.invoke(prompt)
119
-
 
120
 
121
  def generate_base(
122
  prompt: str, topic: str, model: str, temperature: float, max_length: int, api_key: str, sys_message=""
@@ -147,4 +250,4 @@ def generate(
147
  if path or url_content:
148
  return generate_rag(prompt, topic, model, url_content, path, temperature, max_length, api_key, sys_message)
149
  else:
150
- return generate_base(prompt, topic, model, temperature, max_length, api_key, sys_message)
 
15
  from langchain_google_genai import ChatGoogleGenerativeAI
16
  from langchain_anthropic import ChatAnthropic
17
  from dotenv import load_dotenv
18
+ from langchain_core.output_parsers import XMLOutputParser
19
+ from langchain.prompts import ChatPromptTemplate
20
 
21
  load_dotenv()
22
 
 
49
  }
50
 
51
 
52
# System prompt that instructs the LLM to emit its answer as an XML
# <cited_text> structure: the generated text is split into <chunk> elements,
# each carrying the integer ids of the sources it was grounded on. This lets
# the caller attach citations per paragraph instead of inline in the text.
# NOTE: the exact wording and the literal XML skeleton below are part of the
# model contract parsed by XMLOutputParser downstream — do not edit casually.
xml_system = """You're a helpful AI assistant. Given a user prompt and some related sources, fulfill all the requirements \
of the prompt and provide citations. If a chunk of the generated text does not use any of the sources (for example, \
introductions or general text), don't put a citation for that chunk and just leave citations empty. Otherwise, \
list all sources used for that chunk of the text. Don't add inline citations in the text itself. Add all citations to the separated \
citations section. Use explicit new lines in the text to show paragraph splits. \
Return a citation for every quote across all articles that justify the text. Use the following format for your final output:
<cited_text>
<chunk>
<text></text>
<citations>
<citation><source_id></source_id></citation>
...
</citations>
</chunk>
<chunk>
<text></text>
<citations>
<citation><source_id></source_id></citation>
...
</citations>
</chunk>
...
</cited_text>
The entire text should be wrapped in one cited_text. For References section (if asked by prompt), don't add citations.
For source id, give a valid integer alone without a key.
Here are the sources:{context}"""
# Two-message chat template: the system message above ({context} is filled with
# the XML-formatted sources) plus the human message carrying the user {input}.
xml_prompt = ChatPromptTemplate.from_messages(
    [("system", xml_system), ("human", "{input}")]
)
81
+
82
def format_docs_xml(docs: list[Document]) -> str:
    """Render retrieved documents as an XML ``<sources>`` listing.

    Each document becomes a ``<source>`` element whose ``id`` is its index in
    *docs* — the same integer the model is asked to cite — together with the
    document's file path and text snippet.
    """
    rendered = []
    for idx, document in enumerate(docs):
        rendered.append(
            f'<source id="{idx}">\n'
            f"<path>{document.metadata['source']}</path>\n"
            f"<article_snippet>{document.page_content}</article_snippet>\n"
            f"</source>"
        )
    return "\n\n<sources>" + "\n".join(rendered) + "</sources>"
93
+
94
def get_doc_content(docs, id):
    """Return the raw page text of the document at index *id* in *docs*."""
    document = docs[id]
    return document.page_content
96
+
97
+
98
def process_cited_text(data, docs):
    """Flatten the XML-parsed model output into plain text plus a citation map.

    Args:
        data: dict produced by ``XMLOutputParser``, expected shape
            ``{'cited_text': [{'chunk': [{'text': ...}, {'citations': [...]}]}]}``.
        docs: the retrieved documents; citation ids index into this list.
            Each doc exposes ``metadata['source']`` and ``page_content``.

    Returns:
        Tuple ``(combined_text, citations)``: the chunks joined by blank lines
        with ``<id-source>`` markers appended to cited chunks, and a dict
        mapping each cited id to its source path and page content.
    """
    combined_text = ""
    citations = {}
    for item in data['cited_text']:
        chunk = item['chunk']
        # XMLOutputParser returns None for an empty <text></text> element;
        # coerce to "" instead of crashing on `+=` with None.
        chunk_text = chunk[0].get('text') or ""
        combined_text += chunk_text
        citation_ids = []
        # The <citations> element is the second entry of the chunk list; it
        # may be absent entirely, or parsed as None when empty — guard both.
        if len(chunk) > 1 and chunk[1].get('citations'):
            for c in chunk[1]['citations']:
                if c and 'citation' in c:
                    citation = c['citation']
                    if isinstance(citation, dict) and "source_id" in citation:
                        citation = citation['source_id']
                    if isinstance(citation, str):
                        try:
                            cid = int(citation)
                        except ValueError:
                            continue  # not a valid integer id; skip it
                        # Guard against hallucinated ids outside the doc list.
                        if 0 <= cid < len(docs):
                            citation_ids.append(cid)
        if citation_ids:
            citation_texts = [f"<{cid}-{docs[cid].metadata['source']}>" for cid in citation_ids]
            combined_text += " " + " ".join(citation_texts)
        combined_text += "\n\n"
        # Record each cited source once, keyed by its id.
        for citation_id in citation_ids:
            if citation_id not in citations:
                citations[citation_id] = {
                    'source': docs[citation_id].metadata['source'],
                    'content': docs[citation_id].page_content,
                }
    return combined_text.strip(), citations
129
+
130
+
131
def citations_to_html(citations):
    """Render the citation map as a balanced HTML unordered list.

    Args:
        citations: mapping of source id -> {'source': path, 'content': text},
            as produced by ``process_cited_text``.

    Returns:
        A self-contained HTML fragment, or "" when there are no citations.
        (The previous version closed ``</ul></body></html>`` without ever
        emitting the opening tags, producing unbalanced HTML.)
    """
    if not citations:
        return ""
    items = [
        (
            f"<li><strong>Source ID:</strong> {citation_id}<br>"
            f"<strong>Path:</strong> {info['source']}<br>"
            f"<strong>Page Content:</strong> {info['content']}</li>"
        )
        for citation_id, info in citations.items()
    ]
    return "<ul>" + "".join(items) + "</ul>"
143
+
144
+
145
  def load_llm(model: str, api_key: str, temperature: float = 1.0, max_length: int = 2048):
146
  model_name = llm_model_translation.get(model)
147
  llm_class = llm_classes.get(model_name)
 
203
  rag_prompt = hub.pull("rlm/rag-prompt")
204
 
205
  def format_docs(docs):
206
+ if all(isinstance(doc, Document) for doc in docs):
207
+ return "\n\n".join(doc.page_content for doc in docs)
208
+ else:
209
+ raise TypeError("All items in docs must be instances of Document.")
210
 
211
  docs = retriever.get_relevant_documents(topic)
212
+
213
+ formatted_docs = format_docs_xml(docs)
214
  rag_chain = (
215
+ RunnablePassthrough.assign(context=lambda _: formatted_docs)
216
+ | xml_prompt
217
+ | llm
218
+ | XMLOutputParser()
219
  )
220
+ result = rag_chain.invoke({"input": prompt})
221
+ text, citations = process_cited_text(result, docs)
222
+ return text, citations
223
 
224
  def generate_base(
225
  prompt: str, topic: str, model: str, temperature: float, max_length: int, api_key: str, sys_message=""
 
250
  if path or url_content:
251
  return generate_rag(prompt, topic, model, url_content, path, temperature, max_length, api_key, sys_message)
252
  else:
253
+ return generate_base(prompt, topic, model, temperature, max_length, api_key, sys_message)
app.py CHANGED
@@ -19,7 +19,7 @@ from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipe
19
  from utils import remove_special_characters, split_text_allow_complete_sentences_nltk
20
  from google_search import google_search, months, domain_list, build_date
21
  from humanize import humanize_text, device
22
- from ai_generate import generate
23
 
24
  print(f"Using device: {device}")
25
 
@@ -259,12 +259,16 @@ def ai_check(text: str, option: str):
259
 
260
 
261
  def generate_prompt(settings: Dict[str, str]) -> str:
 
262
  prompt = f"""
263
- I am a {settings['role']}
264
- Write a {settings['article_length']} words (around) {settings['format']} on {settings['topic']}.
 
 
265
  Context:
266
  - {settings['context']}
267
-
 
268
  Style and Tone:
269
  - Writing style: {settings['writing_style']}
270
  - Tone: {settings['tone']}
@@ -273,21 +277,20 @@ def generate_prompt(settings: Dict[str, str]) -> str:
273
  Content:
274
  - Depth: {settings['depth_of_content']}
275
  - Structure: {', '.join(settings['structure'])}
276
-
 
 
277
  Keywords to incorporate:
278
  {', '.join(settings['keywords'])}
279
-
 
280
  Additional requirements:
281
  - Don't start with "Here is a...", start with the requested text directly
282
- - Include {settings['num_examples']} relevant examples or case studies
283
- - Incorporate data or statistics from {', '.join(settings['references'])}
284
  - End with a {settings['conclusion_type']} conclusion
285
- - Add a "References" section in the format "References:" on a new line at the end with at least 3 credible detailed sources, formatted as [1], [2], etc. with each source on their own line
286
- - Do not repeat sources
287
  - Do not make any headline, title bold.
288
-
289
- Ensure proper paragraph breaks for better readability.
290
- Avoid any references to artificial intelligence, language models, or the fact that this is generated by an AI, and do not mention something like here is the article etc.
291
  """
292
  return prompt
293
 
@@ -361,19 +364,19 @@ def generate_article(
361
  prompt = generate_prompt(settings)
362
 
363
  print("Generated Prompt...\n", prompt)
364
- article = generate(
365
  prompt=prompt,
366
  topic=topic,
367
  model=ai_model,
368
  url_content=url_content,
369
  path=pdf_file_input,
 
370
  temperature=1,
371
  max_length=2048,
372
  api_key=api_key,
373
  sys_message="",
374
  )
375
-
376
- return clean_text(article)
377
 
378
 
379
  def get_history(history):
@@ -571,7 +574,7 @@ def generate_and_format(
571
  print(f"Google Search Query: {final_query}")
572
  url_content = google_search(final_query, sorted_date, domains_to_include)
573
  topic_context = topic + ", " + context
574
- article = generate_article(
575
  input_role,
576
  topic_context,
577
  context,
@@ -629,7 +632,7 @@ def generate_and_format(
629
  )
630
  print(save_message)
631
 
632
- return reference_formatted, history
633
 
634
 
635
  def create_interface():
@@ -857,6 +860,8 @@ def create_interface():
857
  with gr.Column(scale=3):
858
  with gr.Tab("Text Generator"):
859
  output_article = gr.Textbox(label="Generated Article", lines=20)
 
 
860
  ai_comments = gr.Textbox(
861
  label="Add comments to help edit generated text", interactive=True, visible=False
862
  )
@@ -966,7 +971,7 @@ def create_interface():
966
  pdf_file_input,
967
  history,
968
  ],
969
- outputs=[output_article, history],
970
  )
971
 
972
  regenerate_btn.click(
@@ -1003,7 +1008,7 @@ def create_interface():
1003
  exclude_sites,
1004
  ai_comments,
1005
  ],
1006
- outputs=[output_article, history],
1007
  )
1008
 
1009
  ai_check_btn.click(
@@ -1035,8 +1040,8 @@ def create_interface():
1035
 
1036
  if __name__ == "__main__":
1037
  demo = create_interface()
1038
- demo.queue(
1039
- max_size=2,
1040
- default_concurrency_limit=2,
1041
- ).launch(server_name="0.0.0.0", share=True, server_port=7890)
1042
- # demo.launch(server_name="0.0.0.0")
 
19
  from utils import remove_special_characters, split_text_allow_complete_sentences_nltk
20
  from google_search import google_search, months, domain_list, build_date
21
  from humanize import humanize_text, device
22
+ from ai_generate import generate, citations_to_html
23
 
24
  print(f"Using device: {device}")
25
 
 
259
 
260
 
261
  def generate_prompt(settings: Dict[str, str]) -> str:
262
+ settings['keywords'] = [item for item in settings['keywords'] if item.strip()]
263
  prompt = f"""
264
+ Write a {settings['article_length']} words (around) {settings['format']} on {settings['topic']}.\n
265
+ """
266
+ if settings['context']:
267
+ prompt += f"""
268
  Context:
269
  - {settings['context']}
270
+ """
271
+ prompt += f"""
272
  Style and Tone:
273
  - Writing style: {settings['writing_style']}
274
  - Tone: {settings['tone']}
 
277
  Content:
278
  - Depth: {settings['depth_of_content']}
279
  - Structure: {', '.join(settings['structure'])}
280
+ """
281
+ if len(settings['keywords']) > 0:
282
+ prompt += f"""
283
  Keywords to incorporate:
284
  {', '.join(settings['keywords'])}
285
+ """
286
+ prompt += f"""
287
  Additional requirements:
288
  - Don't start with "Here is a...", start with the requested text directly
 
 
289
  - End with a {settings['conclusion_type']} conclusion
 
 
290
  - Do not make any headline, title bold.
291
+ - Ensure proper paragraph breaks for better readability.
292
+ - Avoid any references to artificial intelligence, language models, or the fact that this is generated by an AI, and do not mention something like here is the article etc.
293
+ - Adhere to any format structure provided to the system if any.
294
  """
295
  return prompt
296
 
 
364
  prompt = generate_prompt(settings)
365
 
366
  print("Generated Prompt...\n", prompt)
367
+ article, citations = generate(
368
  prompt=prompt,
369
  topic=topic,
370
  model=ai_model,
371
  url_content=url_content,
372
  path=pdf_file_input,
373
+ # path=["./final_report.pdf"], # TODO: reset
374
  temperature=1,
375
  max_length=2048,
376
  api_key=api_key,
377
  sys_message="",
378
  )
379
+ return clean_text(article), citations_to_html(citations)
 
380
 
381
 
382
  def get_history(history):
 
574
  print(f"Google Search Query: {final_query}")
575
  url_content = google_search(final_query, sorted_date, domains_to_include)
576
  topic_context = topic + ", " + context
577
+ article, citations = generate_article(
578
  input_role,
579
  topic_context,
580
  context,
 
632
  )
633
  print(save_message)
634
 
635
+ return reference_formatted, citations, history
636
 
637
 
638
  def create_interface():
 
860
  with gr.Column(scale=3):
861
  with gr.Tab("Text Generator"):
862
  output_article = gr.Textbox(label="Generated Article", lines=20)
863
+ with gr.Accordion("Citations", open=True):
864
+ output_citations = gr.HTML(label="Citations")
865
  ai_comments = gr.Textbox(
866
  label="Add comments to help edit generated text", interactive=True, visible=False
867
  )
 
971
  pdf_file_input,
972
  history,
973
  ],
974
+ outputs=[output_article, output_citations, history],
975
  )
976
 
977
  regenerate_btn.click(
 
1008
  exclude_sites,
1009
  ai_comments,
1010
  ],
1011
+ outputs=[output_article, output_citations, history],
1012
  )
1013
 
1014
  ai_check_btn.click(
 
1040
 
1041
  if __name__ == "__main__":
1042
  demo = create_interface()
1043
+ # demo.queue(
1044
+ # max_size=2,
1045
+ # default_concurrency_limit=2,
1046
+ # ).launch(server_name="0.0.0.0", share=True, server_port=7890)
1047
+ demo.launch(server_name="0.0.0.0")