minko186 commited on
Commit
f6b1cb0
·
1 Parent(s): e76dfe8

updates on prompt + better error handling

Browse files
Files changed (2) hide show
  1. ai_generate.py +61 -43
  2. app.py +8 -4
ai_generate.py CHANGED
@@ -17,6 +17,7 @@ from langchain_anthropic import ChatAnthropic
17
  from dotenv import load_dotenv
18
  from langchain_core.output_parsers import XMLOutputParser
19
  from langchain.prompts import ChatPromptTemplate
 
20
 
21
  load_dotenv()
22
 
@@ -51,10 +52,19 @@ llm_classes = {
51
 
52
  xml_system = """You're a helpful AI assistant. Given a user prompt and some related sources, fulfill all the requirements \
53
  of the prompt and provide citations. If a chunk of the generated text does not use any of the sources (for example, \
54
- introductions or general text), don't put a citation for that chunk and just leave citations empty. Otherwise, \
55
- list all sources used for that chunk of the text. Don't add inline citations in the text itself. Add all citations to the separated \
56
- citations section. Use explicit new lines in the text to show paragraph splits. \
57
- Return a citation for every quote across all articles that justify the text. Use the following format for your final output:
 
 
 
 
 
 
 
 
 
58
  <cited_text>
59
  <chunk>
60
  <text></text>
@@ -95,51 +105,59 @@ def get_doc_content(docs, id):
95
  return docs[id].page_content
96
 
97
 
 
 
 
 
 
 
98
  def process_cited_text(data, docs):
99
  # Initialize variables for the combined text and a dictionary for citations
100
  combined_text = ""
101
  citations = {}
102
  # Iterate through the cited_text list
103
- for item in data['cited_text']:
104
- chunk_text = item['chunk'][0]['text']
105
- combined_text += chunk_text
106
- citation_ids = []
107
- # Process the citations for the chunk
108
- if item['chunk'][1]['citations']:
109
- for c in item['chunk'][1]['citations']:
110
- if c and 'citation' in c:
111
- citation = c['citation']
112
- if isinstance(citation, dict) and "source_id" in citation:
113
- citation = citation['source_id']
114
- if isinstance(citation, str):
115
- try:
116
- citation_ids.append(int(citation))
117
- except ValueError:
118
- pass # Handle cases where the string is not a valid integer
119
- if citation_ids:
120
- citation_texts = [f"<{cid}-{docs[cid].metadata['source']}>" for cid in citation_ids]
121
- combined_text += " " + " ".join(citation_texts)
122
- combined_text += "\n\n"
123
- # Store unique citations in a dictionary
124
- for citation_id in citation_ids:
125
- if citation_id not in citations:
126
- citations[citation_id] = {'source': docs[citation_id].metadata['source'], 'content': docs[citation_id].page_content}
 
127
 
128
  return combined_text.strip(), citations
129
 
130
 
131
  def citations_to_html(citations):
132
- # Generate the HTML for the unique citations
133
- html_content = ""
134
- for citation_id, citation_info in citations.items():
135
- html_content += (
136
- f"<li><strong>Source ID:</strong> {citation_id}<br>"
137
- f"<strong>Path:</strong> {citation_info['source']}<br>"
138
- f"<strong>Page Content:</strong> {citation_info['content']}</li>"
139
- )
140
- html_content += "</ul></body></html>"
141
-
142
- return html_content
 
143
 
144
 
145
  def load_llm(model: str, api_key: str, temperature: float = 1.0, max_length: int = 2048):
@@ -227,13 +245,13 @@ def generate_base(
227
  llm = load_llm(model, api_key, temperature, max_length)
228
  if llm is None:
229
  print("Failed to load LLM. Aborting operation.")
230
- return None
231
  try:
232
  output = llm.invoke(prompt).content
233
- return output
234
  except Exception as e:
235
  print(f"An error occurred while running the model: {e}")
236
- return None
237
 
238
 
239
  def generate(
@@ -250,4 +268,4 @@ def generate(
250
  if path or url_content:
251
  return generate_rag(prompt, topic, model, url_content, path, temperature, max_length, api_key, sys_message)
252
  else:
253
- return generate_base(prompt, topic, model, temperature, max_length, api_key, sys_message)
 
17
  from dotenv import load_dotenv
18
  from langchain_core.output_parsers import XMLOutputParser
19
  from langchain.prompts import ChatPromptTemplate
20
+ import re
21
 
22
  load_dotenv()
23
 
 
52
 
53
  xml_system = """You're a helpful AI assistant. Given a user prompt and some related sources, fulfill all the requirements \
54
  of the prompt and provide citations. If a chunk of the generated text does not use any of the sources (for example, \
55
+ introductions or general text), don't put a citation for that chunk and just leave "citations" section empty. Otherwise, \
56
+ list all sources used for that chunk of the text. Remember, don't add inline citations in the text itself in any circumstant.
57
+ Add all citations to the separate citations section. Use explicit new lines in the text to show paragraph splits. For each chunk use this example format:
58
+ <chunk>
59
+ <text>This is a sample text chunk....</text>
60
+ <citations>
61
+ <citation>1</citation>
62
+ <citation>3</citation>
63
+ ...
64
+ </citations>
65
+ </chunk>
66
+ If the prompt asks for a reference section, add it in a chunk without any citations
67
+ Return a citation for every quote across all articles that justify the text. Remember use the following format for your final output:
68
  <cited_text>
69
  <chunk>
70
  <text></text>
 
105
  return docs[id].page_content
106
 
107
 
108
def remove_citations(text):
    """Strip numeric citation markers (e.g. "<3>" and "[3]") from generated text.

    Args:
        text: Generated article text that may contain citation markers.

    Returns:
        The text with angle-bracket and square-bracket citation markers
        removed; all other characters (including plain digits) are kept.
    """
    # Remove angle-bracket markers such as "<12>".
    text = re.sub(r'<\d+>', '', text)
    # Remove square-bracket markers such as "[12]". The brackets must be
    # escaped: the previous pattern r'[\d+]' was a character class that
    # deleted every digit and '+' anywhere in the text.
    text = re.sub(r'\[\d+\]', '', text)
    return text
112
+
113
+
114
def process_cited_text(data, docs):
    """Flatten parsed XML citation output into display text plus a citation map.

    Args:
        data: Parsed XML output; expected shape is
            ``{'cited_text': [{'chunk': [{'text': str}, {'citations': [...]}]}]}``.
            A missing ``'cited_text'`` key yields an empty result.
        docs: Sequence of source documents indexed by citation id; each is
            assumed to expose ``metadata['source']`` and ``page_content``
            (langchain ``Document``-like — TODO confirm against caller).

    Returns:
        Tuple ``(combined_text, citations)``: the chunk texts joined with
        blank lines, each followed by its inline ``<id>`` markers, and a dict
        mapping each cited id to its source path and page content.
    """
    combined_text = ""
    citations = {}
    if 'cited_text' in data:
        for item in data['cited_text']:
            chunk = item['chunk']
            combined_text += chunk[0]['text']
            citation_ids = []
            # Collect the integer citation ids referenced by this chunk.
            if chunk[1]['citations']:
                for c in chunk[1]['citations']:
                    if c and 'citation' in c:
                        citation = c['citation']
                        if isinstance(citation, dict) and "source_id" in citation:
                            citation = citation['source_id']
                        if isinstance(citation, str):
                            try:
                                cid = int(citation)
                            except ValueError:
                                continue  # skip non-numeric citation strings
                            # Guard against hallucinated ids: an out-of-range
                            # id would raise IndexError below, and a negative
                            # one would silently cite the wrong document.
                            if 0 <= cid < len(docs):
                                citation_ids.append(cid)
            if citation_ids:
                combined_text += " " + "".join(f"<{cid}>" for cid in citation_ids)
            combined_text += "\n\n"
            # Record each cited source once, keyed by id.
            for cid in citation_ids:
                if cid not in citations:
                    citations[cid] = {
                        'source': docs[cid].metadata['source'],
                        'content': docs[cid].page_content,
                    }
    return combined_text.strip(), citations
146
 
147
 
148
def citations_to_html(citations):
    """Render the unique citations as an HTML list.

    Args:
        citations: Mapping of citation id -> ``{'source': path, 'content': text}``,
            as produced by ``process_cited_text``. May be empty or None.

    Returns:
        An HTML string listing every citation, or "" when there are none.
    """
    if not citations:
        return ""
    # Open the tags explicitly so the markup is balanced; the previous
    # version emitted the closing "</ul></body></html>" with no openers.
    html_content = "<html><body><ul>"
    for citation_id, citation_info in citations.items():
        html_content += (
            f"<li><strong>Source ID:</strong> {citation_id}<br>"
            f"<strong>Path:</strong> {citation_info['source']}<br>"
            f"<strong>Page Content:</strong> {citation_info['content']}</li>"
        )
    html_content += "</ul></body></html>"
    return html_content
161
 
162
 
163
  def load_llm(model: str, api_key: str, temperature: float = 1.0, max_length: int = 2048):
 
245
  llm = load_llm(model, api_key, temperature, max_length)
246
  if llm is None:
247
  print("Failed to load LLM. Aborting operation.")
248
+ return None, None
249
  try:
250
  output = llm.invoke(prompt).content
251
+ return output, None
252
  except Exception as e:
253
  print(f"An error occurred while running the model: {e}")
254
+ return None, None
255
 
256
 
257
  def generate(
 
268
  if path or url_content:
269
  return generate_rag(prompt, topic, model, url_content, path, temperature, max_length, api_key, sys_message)
270
  else:
271
+ return generate_base(prompt, topic, model, temperature, max_length, api_key, sys_message)
app.py CHANGED
@@ -19,7 +19,9 @@ from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipe
19
  from utils import remove_special_characters, split_text_allow_complete_sentences_nltk
20
  from google_search import google_search, months, domain_list, build_date
21
  from humanize import humanize_text, device
22
- from ai_generate import generate, citations_to_html
 
 
23
 
24
  print(f"Using device: {device}")
25
 
@@ -244,6 +246,7 @@ def predict_mc_scores(input, bc_score):
244
 
245
 
246
  def highlighter_polygraf(text, model="Polygraf AI (Base Model)"):
 
247
  body, references = split_text_from_refs(text)
248
  score, text = detection_polygraf(text=body, model=model)
249
  mc_score = predict_mc_scores(body, score) # mc score
@@ -260,6 +263,7 @@ def ai_check(text: str, option: str):
260
 
261
  def generate_prompt(settings: Dict[str, str]) -> str:
262
  settings['keywords'] = [item for item in settings['keywords'] if item.strip()]
 
263
  prompt = f"""
264
  Write a {settings['article_length']} words (around) {settings['format']} on {settings['topic']}.\n
265
  """
@@ -273,7 +277,7 @@ Write a {settings['article_length']} words (around) {settings['format']} on {set
273
  - Writing style: {settings['writing_style']}
274
  - Tone: {settings['tone']}
275
  - Target audience: {settings['user_category']}
276
-
277
  Content:
278
  - Depth: {settings['depth_of_content']}
279
  - Structure: {', '.join(settings['structure'])}
@@ -302,7 +306,7 @@ def regenerate_prompt(settings: Dict[str, str]) -> str:
302
  Edit the given text based on user comments.
303
  User Comments:
304
  - {settings['user_comments']}
305
-
306
  Requirements:
307
  - Don't start with "Here is a...", start with the requested text directly
308
  - The original content should not be changed. Make minor modifications based on user comments above.
@@ -310,7 +314,7 @@ def regenerate_prompt(settings: Dict[str, str]) -> str:
310
  - Do not make any headline, title bold.
311
  Context:
312
  - {settings['context']}
313
-
314
  Ensure proper paragraph breaks for better readability.
315
  Avoid any references to artificial intelligence, language models, or the fact that this is generated by an AI, and do not mention something like here is the article etc.
316
  """
 
19
  from utils import remove_special_characters, split_text_allow_complete_sentences_nltk
20
  from google_search import google_search, months, domain_list, build_date
21
  from humanize import humanize_text, device
22
+ from ai_generate import generate, citations_to_html, remove_citations
23
+ import nltk
24
+ nltk.download('punkt_tab')
25
 
26
  print(f"Using device: {device}")
27
 
 
246
 
247
 
248
  def highlighter_polygraf(text, model="Polygraf AI (Base Model)"):
249
+ text = remove_citations(text)
250
  body, references = split_text_from_refs(text)
251
  score, text = detection_polygraf(text=body, model=model)
252
  mc_score = predict_mc_scores(body, score) # mc score
 
263
 
264
  def generate_prompt(settings: Dict[str, str]) -> str:
265
  settings['keywords'] = [item for item in settings['keywords'] if item.strip()]
266
+ # - Add a "References" section in the format "References:" on a new line after the requested text, formatted as [1], [2], etc. with each source on their own line
267
  prompt = f"""
268
  Write a {settings['article_length']} words (around) {settings['format']} on {settings['topic']}.\n
269
  """
 
277
  - Writing style: {settings['writing_style']}
278
  - Tone: {settings['tone']}
279
  - Target audience: {settings['user_category']}
280
+
281
  Content:
282
  - Depth: {settings['depth_of_content']}
283
  - Structure: {', '.join(settings['structure'])}
 
306
  Edit the given text based on user comments.
307
  User Comments:
308
  - {settings['user_comments']}
309
+
310
  Requirements:
311
  - Don't start with "Here is a...", start with the requested text directly
312
  - The original content should not be changed. Make minor modifications based on user comments above.
 
314
  - Do not make any headline, title bold.
315
  Context:
316
  - {settings['context']}
317
+
318
  Ensure proper paragraph breaks for better readability.
319
  Avoid any references to artificial intelligence, language models, or the fact that this is generated by an AI, and do not mention something like here is the article etc.
320
  """