Spaces:

jedick
/

R-help-chat

Running on Zero

App Files Files Community

jedick committed on Jul 24

Commit

e4391fe

1 Parent(s): 3472410

Cleanup data directories

Browse files

Files changed (2) hide show

app.py +23 -3
prompts.py +23 -23

app.py CHANGED Viewed

@@ -8,6 +8,7 @@ from main import openai_model, model_id
 from util import get_sources, get_start_end_months
 from git import Repo
 import zipfile
 import spaces
 import torch
 import uuid
@@ -263,7 +264,7 @@ with gr.Blocks(
         render=False,
     )
     data_error = gr.Textbox(
-        value="App is unavailable. Please contact the maintainer.",
         lines=1,
         label="Error downloading or extracting data",
         visible=False,
@@ -561,6 +562,19 @@ with gr.Blocks(
     # Data loading
     # ------------
     def download():
         """Download the db.zip file"""
@@ -586,8 +600,14 @@ with gr.Blocks(
             zip_file_path = "./R-help-db/db.zip"
             extract_to_path = "./"
-            with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
-                zip_ref.extractall(extract_to_path)
         return None

 from util import get_sources, get_start_end_months
 from git import Repo
 import zipfile
+import shutil
 import spaces
 import torch
 import uuid
         render=False,
     )
     data_error = gr.Textbox(
+        value="App is unavailable because data could not be loaded. Try reloading the page, then contact the maintainer if the problem persists.",
         lines=1,
         label="Error downloading or extracting data",
         visible=False,
     # Data loading
     # ------------
+    def rm_directory(directory_path):
+        """Forcefully and recursively delete a directory, like rm -rf"""
+        try:
+            shutil.rmtree(directory_path)
+            print(f"Successfully deleted: {directory_path}")
+        except FileNotFoundError:
+            print(f"Directory not found: {directory_path}")
+        except PermissionError:
+            print(f"Permission denied: {directory_path}")
+        except Exception as e:
+            print(f"An error occurred: {e}")
     def download():
         """Download the db.zip file"""
             zip_file_path = "./R-help-db/db.zip"
             extract_to_path = "./"
+            try:
+                with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
+                    zip_ref.extractall(extract_to_path)
+            except:
+                # If there were any errors, clean up directories to
+                # initiate a new download when app is reloaded
+                rm_directory("./db")
+                rm_directory("./R-help-db")
         return None

prompts.py CHANGED Viewed

@@ -14,22 +14,22 @@ def retrieve_prompt(compute_mode):
     start, end = get_start_end_months(get_sources())
     retrieve_prompt = (
-        f"Today Date: {date.today()}. "
-        "You are a helpful RAG chatbot designed to answer questions about R programming based on the R-help mailing list. "
-        "Do not ask the user for more information, but retrieve emails from the R-help mailing list archives. "
         # gpt-4o-mini says last two months aren't available with this: Emails from from {start} to {end} are available for retrieval.
-        f"The emails available for retrieval are from {start} to {end}. "
-        "Write a search query based on the user's question, but do not answer the question just yet. "
-        "For questions about differences or comparison between X and Y, retrieve emails about X and Y. "
-        "If the user's question is about years, use retrieve_emails(search_query=, start_year=, end_year=) (this month is this year). "
-        "Example: to retrieve emails about R from a month in any year use retrieve_emails(search_query='R', months=<month>). "
         # This confuses gpt-4o-mini (empty search_query - token problem?)
-        "Use 3-letter month abbreviations: Jan for January, Jul for July. "
-        "If you decide not to retrieve emails, tell the user why and suggest how to improve their question to chat with the R-help mailing list. "
     )
     # A sanity check that we don't have unassigned variables
     # (this causes KeyError in parsing by ToolCallingLLM)
-    matches = re.findall(r"\{.*?\}", "".join(retrieve_prompt))
     if matches:
         raise ValueError(f"Unassigned variables in prompt: {' '.join(matches)}")
     return retrieve_prompt
@@ -38,21 +38,21 @@ def retrieve_prompt(compute_mode):
 def answer_prompt(with_tools=True):
     """Build the system prompt for the generate step.

     The prompt is assembled from individual sentences, each ending in a
     period and a space. With ``with_tools`` set, a final sentence asks for
     the answer to be given through answer_with_citations.
     """
     sentences = [
         f"Today Date: {date.today()}. ",
         "You are a helpful RAG chatbot designed to answer questions about R programming based on the R-help mailing list. ",
         "Summarize the retrieved emails from the R-help mailing list archives to answer the user's question or query. ",
         "If any of the retrieved emails are irrelevant (e.g. wrong dates), then do not use them. ",
         "Tell the user if there are no retrieved emails or if you are unable to answer the question based on the information in the emails. ",
         "Do not give an answer based on your own knowledge or memory, and do not include examples that aren't based on the retrieved emails. ",
         "Example: For a question about writing formulas for lm(), make your answer about formulas for lm() from the retrieved emails. ",
         "Do not respond with packages that are only listed under sessionInfo, session info, or other attached packages. ",
         "Include inline citations (email senders and dates) in your response. ",
         "Only answer general questions about R if the answer is given in the retrieved emails. ",
         "Respond with 300 words maximum and 30 lines of code maximum and include any relevant URLs from the retrieved emails. ",
     ]
     if with_tools:
         sentences.append(
             "Use answer_with_citations to provide the complete answer and all citations used. "
         )
     prompt = "".join(sentences)
     # Any {...} still present means a placeholder was never filled in
     leftovers = re.findall(r"\{.*?\}", prompt)
     if leftovers:
         raise ValueError(f"Unassigned variables in prompt: {' '.join(leftovers)}")
     return prompt

     start, end = get_start_end_months(get_sources())
     retrieve_prompt = (
+        f"Today Date: {date.today()}."
+        "You are a helpful RAG chatbot designed to answer questions about R programming based on the R-help mailing list."
+        "Do not ask the user for more information, but retrieve emails from the R-help mailing list archives."
         # gpt-4o-mini says last two months aren't available with this: Emails from from {start} to {end} are available for retrieval.
+        f"The emails available for retrieval are from {start} to {end}."
+        "Write a search query based on the user's question, but do not answer the question just yet."
+        "For questions about differences or comparison between X and Y, retrieve emails about X and Y."
+        "If the user's question is about years, use retrieve_emails(search_query=, start_year=, end_year=) (this month is this year)."
+        "Example: to retrieve emails about R from a month in any year use retrieve_emails(search_query='R', months=<month>)."
         # This confuses gpt-4o-mini (empty search_query - token problem?)
+        "Use 3-letter month abbreviations: Jan for January, Jul for July."
+        "If you decide not to retrieve emails, tell the user why and suggest how to improve their question to chat with the R-help mailing list."
     )
     # A sanity check that we don't have unassigned variables
     # (this causes KeyError in parsing by ToolCallingLLM)
+    matches = re.findall(r"\{.*?\}", " ".join(retrieve_prompt))
     if matches:
         raise ValueError(f"Unassigned variables in prompt: {' '.join(matches)}")
     return retrieve_prompt
 def answer_prompt(with_tools=True):
     """Return system prompt for generate step.

     Args:
         with_tools: When true, append the sentence telling the model to
             deliver its answer through answer_with_citations.

     Returns:
         The prompt as a single string. Every sentence ends with a period
         followed by a space so that consecutive sentences stay separated.

     Raises:
         ValueError: If the assembled prompt still contains a ``{...}``
             placeholder.
     """
     # Adjacent string literals are concatenated with nothing in between,
     # so each sentence has to carry its own trailing space.
     prompt = (
         f"Today Date: {date.today()}. "
         "You are a helpful RAG chatbot designed to answer questions about R programming based on the R-help mailing list. "
         "Summarize the retrieved emails from the R-help mailing list archives to answer the user's question or query. "
         "If any of the retrieved emails are irrelevant (e.g. wrong dates), then do not use them. "
         "Tell the user if there are no retrieved emails or if you are unable to answer the question based on the information in the emails. "
         "Do not give an answer based on your own knowledge or memory, and do not include examples that aren't based on the retrieved emails. "
         "Example: For a question about writing formulas for lm(), make your answer about formulas for lm() from the retrieved emails. "
         "Do not respond with packages that are only listed under sessionInfo, session info, or other attached packages. "
         "Include inline citations (email senders and dates) in your response. "
         "Only answer general questions about R if the answer is given in the retrieved emails. "
         "Respond with 300 words maximum and 30 lines of code maximum and include any relevant URLs from the retrieved emails. "
     )
     if with_tools:
         prompt += "Use answer_with_citations to provide the complete answer and all citations used. "
     # Scan the string itself: joining a str iterates its characters, which
     # would space out every character in the reported placeholder.
     matches = re.findall(r"\{.*?\}", prompt)
     if matches:
         raise ValueError(f"Unassigned variables in prompt: {' '.join(matches)}")
     return prompt