taesiri committed on
Commit
16c9cff
·
1 Parent(s): 2adf285
Files changed (3) hide show
  1. app.py +47 -61
  2. packages.txt +8 -0
  3. requirements.txt +2 -1
app.py CHANGED
@@ -8,6 +8,10 @@ import gradio as gr
8
  import requests
9
 
10
  import arxiv
 
 
 
 
11
 
12
 
13
  def replace_texttt(text):
@@ -28,58 +32,14 @@ def get_paper_info(paper_id):
28
  return None, None
29
 
30
 
31
- def download_arxiv_source(paper_id):
32
- url = f"https://arxiv.org/e-print/{paper_id}"
33
-
34
- # Get the tar file
35
- response = requests.get(url)
36
- response.raise_for_status()
37
-
38
- # Open the tar file
39
- tar = tarfile.open(fileobj=io.BytesIO(response.content), mode="r")
40
-
41
- # Load all .tex files into memory, including their subdirectories
42
- tex_files = {
43
- member.name: tar.extractfile(member).read().decode("utf-8")
44
- for member in tar.getmembers()
45
- if member.name.endswith(".tex")
46
- }
47
- # Load all .tex files into memory, including their subdirectories
48
- tex_files = {
49
- member.name: tar.extractfile(member).read().decode("utf-8")
50
- for member in tar.getmembers()
51
- if member.isfile() and member.name.endswith(".tex")
52
- }
53
-
54
- # Pattern to match \input{filename} and \include{filename}
55
- pattern = re.compile(r"\\(input|include){(.*?)}")
56
-
57
- # Function to replace \input{filename} and \include{filename} with file contents
58
- def replace_includes(text):
59
- output = []
60
- for line in text.split("\n"):
61
- match = re.search(pattern, line)
62
- if match:
63
- command, filename = match.groups()
64
- # LaTeX automatically adds .tex extension for \input and \include commands
65
- if not filename.endswith(".tex"):
66
- filename += ".tex"
67
- if filename in tex_files:
68
- output.append(replace_includes(tex_files[filename]))
69
- else:
70
- output.append(f"% {line} % FILE NOT FOUND")
71
- else:
72
- output.append(line)
73
- return "\n".join(output)
74
-
75
- if "main.tex" in tex_files:
76
- # Start with the contents of main.tex
77
- main_tex = replace_includes(tex_files["main.tex"])
78
- else:
79
- # No main.tex, concatenate all .tex files
80
- main_tex = "\n".join(replace_includes(text) for text in tex_files.values())
81
-
82
- return main_tex
83
 
84
 
85
  class ContextualQA:
@@ -94,10 +54,28 @@ class ContextualQA:
94
  self.context = text
95
 
96
  def ask_question(self, question):
97
- leading_prompt = "Give the following paper:"
98
- trailing_prompt = "Now, answer the following question based on the content of the paper above. You can optionally use Markdown to format your answer or LaTeX typesetting to improve the presentation of your answer."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
 
100
- prompt = f"{HUMAN_PROMPT} {leading_prompt} {self.context} {trailing_prompt} {HUMAN_PROMPT} {question} {AI_PROMPT}"
101
  response = self.client.completions.create(
102
  prompt=prompt,
103
  stop_sequences=[HUMAN_PROMPT],
@@ -126,14 +104,22 @@ class ContextualQA:
126
 
127
 
128
  def load_context(paper_id):
129
- try:
130
- latex_source = download_arxiv_source(paper_id)
131
- except Exception as e:
132
- return None, [(f"Error loading paper with id {paper_id}.", str(e))]
 
 
 
 
 
 
 
133
 
134
  client = Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"])
135
  qa_model = ContextualQA(client, model="claude-2.0")
136
- qa_model.load_text(latex_source)
 
137
 
138
  # Usage
139
  title, abstract = get_paper_info(paper_id)
@@ -146,7 +132,7 @@ def load_context(paper_id):
146
  [
147
  (
148
  f"Load the paper with id {paper_id}.",
149
- f"\n**Title**: {title}\n\n**Abstract**: {abstract}\n\nPaper loaded, You can now ask questions.",
150
  )
151
  ],
152
  )
 
8
  import requests
9
 
10
  import arxiv
11
+ from arxiv_latex_extractor import get_paper_content
12
+ import requests
13
+
14
+ LEADING_PROMPT = "Read the following paper and answer the question below:"
15
 
16
 
17
  def replace_texttt(text):
 
32
  return None, None
33
 
34
 
35
def get_paper_from_huggingface(paper_id):
    """Fetch a paper's LaTeX source from the Hugging Face dataset.

    Args:
        paper_id: Identifier interpolated into the dataset URL as
            ``papers/{paper_id}.tex``.

    Returns:
        The response body as text, or ``None`` when the request fails
        (connection error, timeout, or a non-success HTTP status), so the
        caller can fall back to another source.
    """
    url = f"https://huggingface.co/datasets/taesiri/arxiv_db/raw/main/papers/{paper_id}.tex"
    try:
        # A timeout keeps a stalled connection from hanging the caller forever.
        response = requests.get(url, timeout=30)
        # Raises requests.HTTPError if the server answered with a 4xx/5xx status.
        response.raise_for_status()
    except requests.RequestException:
        # Best-effort lookup: any request failure means "not available here".
        return None
    return response.text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
 
44
 
45
  class ContextualQA:
 
54
  self.context = text
55
 
56
  def ask_question(self, question):
57
+ if self.questions:
58
+ # For the first question-answer pair, don't add HUMAN_PROMPT before the question
59
+ first_pair = f"Question: {self.questions[0]}\n{AI_PROMPT} Answer: {self.responses[0]}"
60
+ # For subsequent questions, include both HUMAN_PROMPT and AI_PROMPT
61
+ subsequent_pairs = "\n".join(
62
+ [
63
+ f"{HUMAN_PROMPT} Question: {q}\n{AI_PROMPT} Answer: {a}"
64
+ for q, a in zip(self.questions[1:], self.responses[1:])
65
+ ]
66
+ )
67
+ history_context = f"{first_pair}\n{subsequent_pairs}"
68
+ else:
69
+ history_context = ""
70
+
71
+ full_context = f"{self.context}\n\n{history_context}\n"
72
+
73
+ prompt = f"{HUMAN_PROMPT} {full_context} {HUMAN_PROMPT} {question} {AI_PROMPT}"
74
+
75
+ # save prompt on disk for examination
76
+ with open("prompt.txt", "w") as f:
77
+ f.write(prompt)
78
 
 
79
  response = self.client.completions.create(
80
  prompt=prompt,
81
  stop_sequences=[HUMAN_PROMPT],
 
104
 
105
 
106
  def load_context(paper_id):
107
+ global LEADING_PROMPT
108
+
109
+ # First, try to get the paper from Hugging Face
110
+ latex_source = get_paper_from_huggingface(paper_id)
111
+
112
+ # If not found, use arxiv_latex_extractor
113
+ if not latex_source:
114
+ try:
115
+ latex_source = get_paper_content(paper_id)
116
+ except Exception as e:
117
+ return None, [(f"Error loading paper with id {paper_id}: {e}",)]
118
 
119
  client = Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"])
120
  qa_model = ContextualQA(client, model="claude-2.0")
121
+ context = f"{LEADING_PROMPT}\n{latex_source}"
122
+ qa_model.load_text(context)
123
 
124
  # Usage
125
  title, abstract = get_paper_info(paper_id)
 
132
  [
133
  (
134
  f"Load the paper with id {paper_id}.",
135
+ f"\n**Title**: {title}\n\n**Abstract**: {abstract}\n\nPaper loaded. You can now ask questions.",
136
  )
137
  ],
138
  )
packages.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ perl
2
+ cpanminus
3
+ texlive-full
4
+ texlive-fonts-extra
5
+ texlive-font-utils
6
+ pandoc
7
+ poppler-utils
8
+ unzip
requirements.txt CHANGED
@@ -6,4 +6,5 @@ seaborn
6
  tqdm
7
  numpy
8
  arxiv
9
- tiktoken
 
 
6
  tqdm
7
  numpy
8
  arxiv
9
+ tiktoken
10
+ git+https://github.com/taesiri/arxiv_latex_extractor