raannakasturi committed on
Commit
770226f
·
1 Parent(s): 67b6792

Refactor summary generation functions to remove unused parameters and simplify the API

Browse files
Files changed (3) hide show
  1. app.py +5 -7
  2. main.py +5 -7
  3. nlp_processes.py +1 -187
app.py CHANGED
@@ -12,8 +12,8 @@ def installChrome():
12
  subprocess.run(['apt-get', 'clean'])
13
 
14
 
15
- def rexplore_summarizer(url, title, id, citation, access_key):
16
- response = json.loads(main(url, title, id, citation, access_key))
17
  data = json.dumps(response, ensure_ascii=False, indent=4)
18
  if response["mindmap_status"] != "success":
19
  mindmap = "error"
@@ -25,7 +25,7 @@ def rexplore_summarizer(url, title, id, citation, access_key):
25
  summary = response["summary"]
26
  return data, summary, mindmap
27
 
28
- def clear_everything(url, title, id, citation, access_key, raw_data, summary, mindmap):
29
  return None, None, None, None, None, None
30
 
31
  theme = gr.themes.Soft(
@@ -51,9 +51,7 @@ with gr.Blocks(theme=theme, title="ReXplore Summarizer", fill_height=True) as ap
51
  with gr.Row():
52
  with gr.Column():
53
  url = gr.Textbox(label="PDF URL", placeholder="Paste the PDF URL here")
54
- title = gr.Textbox(label="Title", placeholder="Enter the title Research Paper")
55
  id = gr.Textbox(label="DOI/arXiv ID", placeholder="Enter the DOI or arXiv ID of the Research Paper")
56
- citation = gr.Textbox(label="Citation", placeholder="Enter the citation of the Research Paper")
57
  access_key = gr.Textbox(label="Access Key", placeholder="Enter the Access Key", type="password")
58
  with gr.Row():
59
  clear_btn = gr.Button(value="Clear", variant="stop")
@@ -64,7 +62,7 @@ with gr.Blocks(theme=theme, title="ReXplore Summarizer", fill_height=True) as ap
64
  mindmap = gr.TextArea(label="Mindmap", placeholder="The generated mindmap will be displayed here", lines=7, interactive=False, show_copy_button=True)
65
  summarize_btn.click(
66
  rexplore_summarizer,
67
- inputs=[url, title, id, citation, access_key],
68
  outputs=[raw_data, summary, mindmap],
69
  concurrency_limit=25,
70
  scroll_to_output=True,
@@ -72,7 +70,7 @@ with gr.Blocks(theme=theme, title="ReXplore Summarizer", fill_height=True) as ap
72
  api_name="rexplore_summarizer",
73
  show_progress="full",
74
  )
75
- clear_btn.click(clear_everything, inputs=[url, title, id, citation, raw_data, summary, mindmap, access_key], outputs=[url, id, raw_data, summary, mindmap, access_key], show_api=False)
76
 
77
  installChrome()
78
  app.queue(default_concurrency_limit=25).launch(show_api=True, ssr_mode=False)
 
12
  subprocess.run(['apt-get', 'clean'])
13
 
14
 
15
+ def rexplore_summarizer(url, id, access_key):
16
+ response = json.loads(main(url, id, access_key))
17
  data = json.dumps(response, ensure_ascii=False, indent=4)
18
  if response["mindmap_status"] != "success":
19
  mindmap = "error"
 
25
  summary = response["summary"]
26
  return data, summary, mindmap
27
 
28
+ def clear_everything(url, id, access_key, raw_data, summary, mindmap):
29
  return None, None, None, None, None, None
30
 
31
  theme = gr.themes.Soft(
 
51
  with gr.Row():
52
  with gr.Column():
53
  url = gr.Textbox(label="PDF URL", placeholder="Paste the PDF URL here")
 
54
  id = gr.Textbox(label="DOI/arXiv ID", placeholder="Enter the DOI or arXiv ID of the Research Paper")
 
55
  access_key = gr.Textbox(label="Access Key", placeholder="Enter the Access Key", type="password")
56
  with gr.Row():
57
  clear_btn = gr.Button(value="Clear", variant="stop")
 
62
  mindmap = gr.TextArea(label="Mindmap", placeholder="The generated mindmap will be displayed here", lines=7, interactive=False, show_copy_button=True)
63
  summarize_btn.click(
64
  rexplore_summarizer,
65
+ inputs=[url, id, access_key],
66
  outputs=[raw_data, summary, mindmap],
67
  concurrency_limit=25,
68
  scroll_to_output=True,
 
70
  api_name="rexplore_summarizer",
71
  show_progress="full",
72
  )
73
+ clear_btn.click(clear_everything, inputs=[url, id, raw_data, summary, mindmap, access_key], outputs=[url, id, raw_data, summary, mindmap, access_key], show_api=False)
74
 
75
  installChrome()
76
  app.queue(default_concurrency_limit=25).launch(show_api=True, ssr_mode=False)
main.py CHANGED
@@ -9,7 +9,7 @@ import os
9
  dotenv.load_dotenv()
10
  ACCESS_KEY = os.getenv("ACCESS_KEY")
11
 
12
- def generate_summary_mindmap(corpus, title, citation):
13
  response = {}
14
  math_summary = generate_math_summary(corpus)
15
  # print(f'As a text script expert, please help me to write a short text script with the topic \" {math_summary}\".You have three tasks, which are:\\n 1.to summarize the text I provided into a Summary .Please answer within 150-300 characters.\\n 2.to summarize the text I provided, using up to seven Highlight.\\n 3.to summarize the text I provided, using up to seven Key Insights. Each insight should include a brief in-depth analysis. Key Insight should not include timestamps.\\n Your output should use the following template strictly, provide the results for the three tasks:\\n ## Summary\\n ## Highlights\\n - Highlights\\n ## Key Insights\\n - Key Insights .\\n Importantly your output must use language \"English\"')
@@ -22,26 +22,24 @@ def generate_summary_mindmap(corpus, title, citation):
22
  response["mindmap"] = None
23
  return response
24
  else:
25
- response = generate_nlp_summary_and_mindmap(math_summary, title, citation)
26
  return response
27
 
28
- def main(url, title, id, citation, access_key):
29
  if access_key != ACCESS_KEY:
30
  return {"error": "Invalid Access Key", "summary": None, "mindmap": None}
31
  else:
32
  corpus = extract_text_from_pdf(url, id)
33
  start_time = time.time()
34
- response = generate_summary_mindmap(corpus, title, citation)
35
  print(f"Total timetaken: {time.time() - start_time} seconds")
36
  return json.dumps(response, indent=4, ensure_ascii=False)
37
 
38
  if __name__ == "__main__":
39
  url = "https://arxiv.org/pdf/2412.21024"
40
  id = "123"
41
- title = "Trading linearity for ellipticity: a nonsmooth approach to Einstein’s theory of gravity and the Lorentzian splitting theorems"
42
  access_key = os.environ.get("ACCESS_KEY")
43
- citation = "Bykov, D., Krivorol, V., & Kuzovchikov, A. (2024). Oscillator Calculus on Coadjoint Orbits and Index Theorems (Version 1). arXiv. https://doi.org/10.48550/ARXIV.2412.21024"
44
- data = main(url, title, id, citation, access_key)
45
  print((data))
46
  with open("output.json", "w", encoding="utf-8") as f:
47
  json.dump(data, f, ensure_ascii=False, indent=4)
 
9
  dotenv.load_dotenv()
10
  ACCESS_KEY = os.getenv("ACCESS_KEY")
11
 
12
+ def generate_summary_mindmap(corpus):
13
  response = {}
14
  math_summary = generate_math_summary(corpus)
15
  # print(f'As a text script expert, please help me to write a short text script with the topic \" {math_summary}\".You have three tasks, which are:\\n 1.to summarize the text I provided into a Summary .Please answer within 150-300 characters.\\n 2.to summarize the text I provided, using up to seven Highlight.\\n 3.to summarize the text I provided, using up to seven Key Insights. Each insight should include a brief in-depth analysis. Key Insight should not include timestamps.\\n Your output should use the following template strictly, provide the results for the three tasks:\\n ## Summary\\n ## Highlights\\n - Highlights\\n ## Key Insights\\n - Key Insights .\\n Importantly your output must use language \"English\"')
 
22
  response["mindmap"] = None
23
  return response
24
  else:
25
+ response = generate_nlp_summary_and_mindmap(math_summary)
26
  return response
27
 
28
+ def main(url, id, access_key):
29
  if access_key != ACCESS_KEY:
30
  return {"error": "Invalid Access Key", "summary": None, "mindmap": None}
31
  else:
32
  corpus = extract_text_from_pdf(url, id)
33
  start_time = time.time()
34
+ response = generate_summary_mindmap(corpus)
35
  print(f"Total timetaken: {time.time() - start_time} seconds")
36
  return json.dumps(response, indent=4, ensure_ascii=False)
37
 
38
  if __name__ == "__main__":
39
  url = "https://arxiv.org/pdf/2412.21024"
40
  id = "123"
 
41
  access_key = os.environ.get("ACCESS_KEY")
42
+ data = main(url, id, access_key)
 
43
  print((data))
44
  with open("output.json", "w", encoding="utf-8") as f:
45
  json.dump(data, f, ensure_ascii=False, indent=4)
nlp_processes.py CHANGED
@@ -87,192 +87,8 @@ def generate_nlp_mindmap(temp_summary):
87
  print(str(e))
88
  return False
89
 
90
- def fix_title(title):
91
- proxy = get_proxy()
92
- while not get_proxy():
93
- proxy = get_proxy()
94
- try:
95
- try:
96
- client = Client(
97
- provider=RetryProvider(
98
- providers=[Blackbox, Airforce, AmigoChat],
99
- max_retries=4,
100
- ),
101
- )
102
- completion = client.chat.completions.create(
103
- proxy=proxy,
104
- model="llama-3.1-405b",
105
- messages=[
106
- {
107
- "role": "system",
108
- "content": (
109
- "You are a highly advanced language model with strict adherence to precision and accuracy. \n\n"
110
- "Your task is to process input text, identify and correct any encoded or escaped characters, and render the text into a readable format. \n\n"
111
- "**Requirements:**\n"
112
- "1. Correctly decode and render any encoded characters (e.g., \\x sequences or LaTeX-style expressions) into their intended readable forms.\n"
113
- "2. Accurately interpret and render mathematical expressions using MathJax where appropriate.\n"
114
- "3. Produce **only the corrected sequence** as output—no additional commentary, metadata, or extraneous punctuation.\n"
115
- "4. Maintain the structure and style of the original input text, ensuring it remains true to its intended meaning and formatting.\n\n"
116
- "**Input Example:**\n"
117
- "From Painlev\\xe9 equations to ${\\cal N}=2$ susy gauge theories: prolegomena TDI-$\\\\infty$\n\n"
118
- "**Output Example:**\n"
119
- "From Painlevé equations to \\({\\cal N}=2\\) SUSY gauge theories: prolegomena TDI-\\(\\infty\\)"
120
- ),
121
- },
122
- {
123
- "role": "user",
124
- "content": repr(
125
- "Convert the following text into a normal, readable sequence, ensuring accurate interpretation and correction of encoded or escaped characters where necessary. "
126
- "The output must strictly adhere to the input text's original structure, maintaining readability and formatting. Use MathJax where applicable to correctly render mathematical expressions, ensuring the final sequence is flawless. "
127
- "Provide only the corrected sequence as output, with no additional commentary, formatting, or extraneous punctuation beyond what is specified in the input text.\n\n"
128
- f"**Input:** {title}\n\n"
129
- ),
130
- },
131
- ],
132
- )
133
- return completion.choices[0].message.content
134
- except Exception as e:
135
- print(str(e))
136
- client = Client()
137
- completion = client.chat.completions.create(
138
- proxy=proxy,
139
- provider=RetryProvider(
140
- providers=[Blackbox, ChatGptEs, ChatGpt, Copilot, DDG, Liaobots, Mhystical, PollinationsAI],
141
- max_retries=8,
142
- single_provider_retry=True,
143
- ),
144
- model="gpt-4",
145
- messages=[
146
- {
147
- "role": "system",
148
- "content": (
149
- "You are a highly advanced language model with strict adherence to precision and accuracy. \n\n"
150
- "Your task is to process input text, identify and correct any encoded or escaped characters, and render the text into a readable format. \n\n"
151
- "**Requirements:**\n"
152
- "1. Correctly decode and render any encoded characters (e.g., \\x sequences or LaTeX-style expressions) into their intended readable forms.\n"
153
- "2. Accurately interpret and render mathematical expressions using MathJax where appropriate.\n"
154
- "3. Produce **only the corrected sequence** as output—no additional commentary, metadata, or extraneous punctuation.\n"
155
- "4. Maintain the structure and style of the original input text, ensuring it remains true to its intended meaning and formatting.\n\n"
156
- "**Input Example:**\n"
157
- "From Painlev\\xe9 equations to ${\\cal N}=2$ susy gauge theories: prolegomena TDI-$\\\\infty$\n\n"
158
- "**Output Example:**\n"
159
- "From Painlevé equations to \\({\\cal N}=2\\) SUSY gauge theories: prolegomena TDI-\\(\\infty\\)"
160
- ),
161
- },
162
- {
163
- "role": "user",
164
- "content": repr(
165
- "Convert the following text into a normal, readable sequence, ensuring accurate interpretation and correction of encoded or escaped characters where necessary. "
166
- "The output must strictly adhere to the input text's original structure, maintaining readability and formatting. Use MathJax where applicable to correctly render mathematical expressions, ensuring the final sequence is flawless. "
167
- "Provide only the corrected sequence as output, with no additional commentary, formatting, or extraneous punctuation beyond what is specified in the input text.\n\n"
168
- f"**Input:** {title}\n\n"
169
- ),
170
- },
171
- ],
172
- )
173
- return completion.choices[0].message.content
174
- except Exception as e:
175
- print(str(e))
176
- return False
177
-
178
- def fix_citation(citation):
179
- proxy = get_proxy()
180
- while not get_proxy():
181
- proxy = get_proxy()
182
- try:
183
- try:
184
- client = Client(
185
- provider=RetryProvider(
186
- providers=[Blackbox, Airforce, AmigoChat],
187
- max_retries=4,
188
- ),
189
- )
190
- completion = client.chat.completions.create(
191
- proxy=proxy,
192
- model="llama-3.1-405b",
193
- messages=[
194
- {
195
- "role": "system",
196
- "content": (
197
- "You are a highly advanced language model with strict adherence to precision and formatting. Your task is to process input text and correct any encoding errors or formatting issues, rendering it into a readable citation in APA latest edition format. \n\n"
198
- "Requirements:\n"
199
- "Accurately decode and render any encoded characters (e.g., special character codes like ’).\n"
200
- "Correctly format the citation in strict compliance with the APA latest edition guidelines.\n"
201
- "Produce only the corrected citation as output, with no additional commentary, metadata, or extraneous punctuation beyond what is specified in the text.\n"
202
- "Ensure mathematical expressions, if any, are rendered using MathJax where applicable, maintaining their proper APA context.\n"
203
- "Input Example:\n"
204
- "McCann, R. J. (2025). Trading linearity for ellipticity: a nonsmooth approach to Einstein’s theory of gravity and the Lorentzian splitting theorems (Version 1). arXiv. https://doi.org/10.48550/ARXIV.2501.00702"
205
- "Expected Output Example:\n"
206
- "McCann, R. J. (2025). Trading linearity for ellipticity: A nonsmooth approach to Einstein’s theory of gravity and the Lorentzian splitting theorems (Version 1). arXiv. https://doi.org/10.48550/ARXIV.2501.00702"
207
- ),
208
- },
209
- {
210
- "role": "user",
211
- "content": repr(
212
- "Convert the following text into a properly formatted citation in strict compliance with APA latest edition guidelines. Correct any encoding errors (e.g., ’) and ensure the output is clean, readable, and adheres to APA rules. Render mathematical expressions using MathJax where applicable, preserving proper context.\n"
213
- "Provide only the corrected citation as output, with no additional commentary, metadata, or extraneous punctuation beyond what is specified in the text.\n"
214
- f"**Input:** {citation}\n\n"
215
- ),
216
- },
217
- ],
218
- )
219
- return completion.choices[0].message.content
220
- except Exception as e:
221
- print(str(e))
222
- client = Client()
223
- completion = client.chat.completions.create(
224
- proxy=proxy,
225
- provider=RetryProvider(
226
- providers=[Blackbox, ChatGptEs, ChatGpt, Copilot, DDG, Liaobots, Mhystical, PollinationsAI],
227
- max_retries=8,
228
- single_provider_retry=True,
229
- ),
230
- model="gpt-4",
231
- messages=[
232
- {
233
- "role": "system",
234
- "content": (
235
- "You are a highly advanced language model with strict adherence to precision and formatting. Your task is to process input text and correct any encoding errors or formatting issues, rendering it into a readable citation in APA latest edition format. \n\n"
236
- "Requirements:\n"
237
- "Accurately decode and render any encoded characters (e.g., special character codes like ’).\n"
238
- "Correctly format the citation in strict compliance with the APA latest edition guidelines.\n"
239
- "Produce only the corrected citation as output, with no additional commentary, metadata, or extraneous punctuation beyond what is specified in the text.\n"
240
- "Ensure mathematical expressions, if any, are rendered using MathJax where applicable, maintaining their proper APA context.\n"
241
- "Input Example:\n"
242
- "McCann, R. J. (2025). Trading linearity for ellipticity: a nonsmooth approach to Einstein’s theory of gravity and the Lorentzian splitting theorems (Version 1). arXiv. https://doi.org/10.48550/ARXIV.2501.00702"
243
- "Expected Output Example:\n"
244
- "McCann, R. J. (2025). Trading linearity for ellipticity: A nonsmooth approach to Einstein’s theory of gravity and the Lorentzian splitting theorems (Version 1). arXiv. https://doi.org/10.48550/ARXIV.2501.00702"
245
- ),
246
- },
247
- {
248
- "role": "user",
249
- "content": repr(
250
- "Convert the following text into a properly formatted citation in strict compliance with APA latest edition guidelines. Correct any encoding errors (e.g., ’) and ensure the output is clean, readable, and adheres to APA rules. Render mathematical expressions using MathJax where applicable, preserving proper context.\n"
251
- "Provide only the corrected citation as output, with no additional commentary, metadata, or extraneous punctuation beyond what is specified in the text.\n"
252
- f"**Input:** {citation}\n\n"
253
- ),
254
- },
255
- ],
256
- )
257
- return completion.choices[0].message.content
258
- except Exception as e:
259
- print(str(e))
260
- return False
261
-
262
- def generate_nlp_summary_and_mindmap(temp_summary, title, citation):
263
  response = {}
264
- def local_fix_title():
265
- fixed_title = fix_title(title)
266
- if not fixed_title:
267
- response["title"] = title
268
- else:
269
- response["title"] = fixed_title.strip()
270
- def local_fix_citation():
271
- fixed_citation = fix_citation(citation)
272
- if not fixed_citation:
273
- response["citation"] = citation
274
- else:
275
- response["citation"] = fixed_citation.strip()
276
  def local_generate_nlp_summary():
277
  nlp_summary = generate_nlp_summary(temp_summary)
278
  if not nlp_summary:
@@ -292,8 +108,6 @@ def generate_nlp_summary_and_mindmap(temp_summary, title, citation):
292
  threads = []
293
  threads.append(threading.Thread(target=local_generate_nlp_summary))
294
  threads.append(threading.Thread(target=local_generate_nlp_mindmap))
295
- threads.append(threading.Thread(target=local_fix_title))
296
- threads.append(threading.Thread(target=local_fix_citation))
297
  for thread in threads:
298
  thread.start()
299
  for thread in threads:
 
87
  print(str(e))
88
  return False
89
 
90
+ def generate_nlp_summary_and_mindmap(temp_summary):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
  response = {}
 
 
 
 
 
 
 
 
 
 
 
 
92
  def local_generate_nlp_summary():
93
  nlp_summary = generate_nlp_summary(temp_summary)
94
  if not nlp_summary:
 
108
  threads = []
109
  threads.append(threading.Thread(target=local_generate_nlp_summary))
110
  threads.append(threading.Thread(target=local_generate_nlp_mindmap))
 
 
111
  for thread in threads:
112
  thread.start()
113
  for thread in threads: