jedick committed on
Commit
e4391fe
·
1 Parent(s): 3472410

Cleanup data directories

Browse files
Files changed (2) hide show
  1. app.py +23 -3
  2. prompts.py +23 -23
app.py CHANGED
@@ -8,6 +8,7 @@ from main import openai_model, model_id
8
  from util import get_sources, get_start_end_months
9
  from git import Repo
10
  import zipfile
 
11
  import spaces
12
  import torch
13
  import uuid
@@ -263,7 +264,7 @@ with gr.Blocks(
263
  render=False,
264
  )
265
  data_error = gr.Textbox(
266
- value="App is unavailable. Please contact the maintainer.",
267
  lines=1,
268
  label="Error downloading or extracting data",
269
  visible=False,
@@ -561,6 +562,19 @@ with gr.Blocks(
561
  # Data loading
562
  # ------------
563
 
 
 
 
 
 
 
 
 
 
 
 
 
 
564
  def download():
565
  """Download the db.zip file"""
566
 
@@ -586,8 +600,14 @@ with gr.Blocks(
586
 
587
  zip_file_path = "./R-help-db/db.zip"
588
  extract_to_path = "./"
589
- with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
590
- zip_ref.extractall(extract_to_path)
 
 
 
 
 
 
591
 
592
  return None
593
 
 
8
  from util import get_sources, get_start_end_months
9
  from git import Repo
10
  import zipfile
11
+ import shutil
12
  import spaces
13
  import torch
14
  import uuid
 
264
  render=False,
265
  )
266
  data_error = gr.Textbox(
267
+ value="App is unavailable because data could not be loaded. Try reloading the page, then contact the maintainer if the problem persists.",
268
  lines=1,
269
  label="Error downloading or extracting data",
270
  visible=False,
 
562
  # Data loading
563
  # ------------
564
 
565
def rm_directory(directory_path):
    """Recursively delete a directory tree on a best-effort basis.

    Failures are reported with ``print`` and never raised, so this is safe
    to call from cleanup paths that must not mask an earlier error.

    Args:
        directory_path: Path of the directory to remove.

    Returns:
        True if the directory was removed, False if it was missing or could
        not be deleted.
    """

    try:
        shutil.rmtree(directory_path)
    except FileNotFoundError:
        print(f"Directory not found: {directory_path}")
    except PermissionError:
        print(f"Permission denied: {directory_path}")
    except Exception as e:
        # Name the path as well as the error: this helper is called for
        # several directories in a row, and a bare error message does not
        # say which one failed.
        print(f"An error occurred while deleting {directory_path}: {e}")
    else:
        print(f"Successfully deleted: {directory_path}")
        return True
    return False
577
+
578
  def download():
579
  """Download the db.zip file"""
580
 
 
600
 
601
  zip_file_path = "./R-help-db/db.zip"
602
  extract_to_path = "./"
603
+ try:
604
+ with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
605
+ zip_ref.extractall(extract_to_path)
606
+ except:
607
+ # If there were any errors, clean up directories to
608
+ # initiate a new download when app is reloaded
609
+ rm_directory("./db")
610
+ rm_directory("./R-help-db")
611
 
612
  return None
613
 
prompts.py CHANGED
@@ -14,22 +14,22 @@ def retrieve_prompt(compute_mode):
14
  start, end = get_start_end_months(get_sources())
15
 
16
  retrieve_prompt = (
17
- f"Today Date: {date.today()}. "
18
- "You are a helpful RAG chatbot designed to answer questions about R programming based on the R-help mailing list. "
19
- "Do not ask the user for more information, but retrieve emails from the R-help mailing list archives. "
20
  # gpt-4o-mini says last two months aren't available with this: Emails from from {start} to {end} are available for retrieval.
21
- f"The emails available for retrieval are from {start} to {end}. "
22
- "Write a search query based on the user's question, but do not answer the question just yet. "
23
- "For questions about differences or comparison between X and Y, retrieve emails about X and Y. "
24
- "If the user's question is about years, use retrieve_emails(search_query=, start_year=, end_year=) (this month is this year). "
25
- "Example: to retrieve emails about R from a month in any year use retrieve_emails(search_query='R', months=<month>). "
26
  # This confuses gpt-4o-mini (empty search_query - token problem?)
27
- "Use 3-letter month abbreviations: Jan for January, Jul for July. "
28
- "If you decide not to retrieve emails, tell the user why and suggest how to improve their question to chat with the R-help mailing list. "
29
  )
30
  # A sanity check that we don't have unassigned variables
31
  # (this causes KeyError in parsing by ToolCallingLLM)
32
- matches = re.findall(r"\{.*?\}", "".join(retrieve_prompt))
33
  if matches:
34
  raise ValueError(f"Unassigned variables in prompt: {' '.join(matches)}")
35
  return retrieve_prompt
@@ -38,21 +38,21 @@ def retrieve_prompt(compute_mode):
38
  def answer_prompt(with_tools=True):
39
  """Return system prompt for generate step"""
40
  answer_prompt = (
41
- f"Today Date: {date.today()}. "
42
- "You are a helpful RAG chatbot designed to answer questions about R programming based on the R-help mailing list. "
43
- "Summarize the retrieved emails from the R-help mailing list archives to answer the user's question or query. "
44
- "If any of the retrieved emails are irrelevant (e.g. wrong dates), then do not use them. "
45
- "Tell the user if there are no retrieved emails or if you are unable to answer the question based on the information in the emails. "
46
- "Do not give an answer based on your own knowledge or memory, and do not include examples that aren't based on the retrieved emails. "
47
- "Example: For a question about writing formulas for lm(), make your answer about formulas for lm() from the retrieved emails. "
48
- "Do not respond with packages that are only listed under sessionInfo, session info, or other attached packages. "
49
- "Include inline citations (email senders and dates) in your response. "
50
- "Only answer general questions about R if the answer is given in the retrieved emails. "
51
- "Respond with 300 words maximum and 30 lines of code maximum and include any relevant URLs from the retrieved emails. "
52
  )
53
  if with_tools:
54
  answer_prompt += "Use answer_with_citations to provide the complete answer and all citations used. "
55
- matches = re.findall(r"\{.*?\}", "".join(answer_prompt))
56
  if matches:
57
  raise ValueError(f"Unassigned variables in prompt: {' '.join(matches)}")
58
  return answer_prompt
 
14
  start, end = get_start_end_months(get_sources())
15
 
16
  retrieve_prompt = (
17
+ f"Today Date: {date.today()}."
18
+ "You are a helpful RAG chatbot designed to answer questions about R programming based on the R-help mailing list."
19
+ "Do not ask the user for more information, but retrieve emails from the R-help mailing list archives."
20
  # gpt-4o-mini says last two months aren't available with this: Emails from from {start} to {end} are available for retrieval.
21
+ f"The emails available for retrieval are from {start} to {end}."
22
+ "Write a search query based on the user's question, but do not answer the question just yet."
23
+ "For questions about differences or comparison between X and Y, retrieve emails about X and Y."
24
+ "If the user's question is about years, use retrieve_emails(search_query=, start_year=, end_year=) (this month is this year)."
25
+ "Example: to retrieve emails about R from a month in any year use retrieve_emails(search_query='R', months=<month>)."
26
  # This confuses gpt-4o-mini (empty search_query - token problem?)
27
+ "Use 3-letter month abbreviations: Jan for January, Jul for July."
28
+ "If you decide not to retrieve emails, tell the user why and suggest how to improve their question to chat with the R-help mailing list."
29
  )
30
  # A sanity check that we don't have unassigned variables
31
  # (this causes KeyError in parsing by ToolCallingLLM)
32
+ matches = re.findall(r"\{.*?\}", " ".join(retrieve_prompt))
33
  if matches:
34
  raise ValueError(f"Unassigned variables in prompt: {' '.join(matches)}")
35
  return retrieve_prompt
 
38
  def answer_prompt(with_tools=True):
39
  """Return system prompt for generate step"""
40
  answer_prompt = (
41
+ f"Today Date: {date.today()}."
42
+ "You are a helpful RAG chatbot designed to answer questions about R programming based on the R-help mailing list."
43
+ "Summarize the retrieved emails from the R-help mailing list archives to answer the user's question or query."
44
+ "If any of the retrieved emails are irrelevant (e.g. wrong dates), then do not use them."
45
+ "Tell the user if there are no retrieved emails or if you are unable to answer the question based on the information in the emails."
46
+ "Do not give an answer based on your own knowledge or memory, and do not include examples that aren't based on the retrieved emails."
47
+ "Example: For a question about writing formulas for lm(), make your answer about formulas for lm() from the retrieved emails."
48
+ "Do not respond with packages that are only listed under sessionInfo, session info, or other attached packages."
49
+ "Include inline citations (email senders and dates) in your response."
50
+ "Only answer general questions about R if the answer is given in the retrieved emails."
51
+ "Respond with 300 words maximum and 30 lines of code maximum and include any relevant URLs from the retrieved emails."
52
  )
53
  if with_tools:
54
  answer_prompt += "Use answer_with_citations to provide the complete answer and all citations used. "
55
+ matches = re.findall(r"\{.*?\}", " ".join(answer_prompt))
56
  if matches:
57
  raise ValueError(f"Unassigned variables in prompt: {' '.join(matches)}")
58
  return answer_prompt