raannakasturi commited on
Commit
a90f1c4
·
1 Parent(s): 6e23a32

Update dependencies and refactor summarization functions to include title and citation

Browse files
app.py CHANGED
@@ -2,8 +2,8 @@ import gradio as gr
2
  from main import main
3
  import json
4
 
5
- def rexplore_summarizer(url, id, access_key):
6
- response = json.loads(main(url, id, access_key))
7
  data = json.dumps(response, indent=4, ensure_ascii=False)
8
  if response["mindmap_status"] != "success":
9
  mindmap = "error"
@@ -15,7 +15,7 @@ def rexplore_summarizer(url, id, access_key):
15
  summary = response["summary"]
16
  return data, summary, mindmap
17
 
18
- def clear_everything(url, id, access_key, raw_data, summary, mindmap):
19
  return None, None, None, None, None, None
20
 
21
  theme = gr.themes.Soft(
@@ -41,7 +41,9 @@ with gr.Blocks(theme=theme, title="ReXplore Summarizer", fill_height=True) as ap
41
  with gr.Row():
42
  with gr.Column():
43
  url = gr.Textbox(label="PDF URL", placeholder="Paste the PDF URL here")
44
- id = gr.Textbox(label="DOI/arXiv ID", placeholder="Enter the DOI or arXiv ID of the document")
 
 
45
  access_key = gr.Textbox(label="Access Key", placeholder="Enter the Access Key", type="password")
46
  with gr.Row():
47
  clear_btn = gr.Button(value="Clear", variant="stop")
@@ -53,7 +55,7 @@ with gr.Blocks(theme=theme, title="ReXplore Summarizer", fill_height=True) as ap
53
 
54
  summarize_btn.click(
55
  rexplore_summarizer,
56
- inputs=[url, id, access_key],
57
  outputs=[raw_data, summary, mindmap],
58
  concurrency_limit=25,
59
  scroll_to_output=True,
@@ -61,6 +63,6 @@ with gr.Blocks(theme=theme, title="ReXplore Summarizer", fill_height=True) as ap
61
  api_name="rexplore_summarizer",
62
  show_progress="full",
63
  )
64
- clear_btn.click(clear_everything, inputs=[url, id, raw_data, summary, mindmap, access_key], outputs=[url, id, raw_data, summary, mindmap, access_key], show_api=False)
65
 
66
  app.queue(default_concurrency_limit=25).launch(show_api=True, max_threads=500, ssr_mode=False)
 
2
  from main import main
3
  import json
4
 
5
+ def rexplore_summarizer(url, title, id, citation, access_key):
6
+ response = json.loads(main(url, title, id, citation, access_key))
7
  data = json.dumps(response, indent=4, ensure_ascii=False)
8
  if response["mindmap_status"] != "success":
9
  mindmap = "error"
 
15
  summary = response["summary"]
16
  return data, summary, mindmap
17
 
18
def clear_everything(url, title, id, citation, access_key, raw_data, summary, mindmap):
    """Reset the bound Gradio components.

    Gradio passes the current value of every ``inputs`` component in; they
    are deliberately ignored. Returns one ``None`` per ``outputs`` component
    (six of them) so each bound widget is cleared.
    """
    return (None,) * 6
20
 
21
  theme = gr.themes.Soft(
 
41
  with gr.Row():
42
  with gr.Column():
43
  url = gr.Textbox(label="PDF URL", placeholder="Paste the PDF URL here")
44
+ title = gr.Textbox(label="Title", placeholder="Enter the title of the Research Paper")
45
+ id = gr.Textbox(label="DOI/arXiv ID", placeholder="Enter the DOI or arXiv ID of the Research Paper")
46
+ citation = gr.Textbox(label="Citation", placeholder="Enter the citation of the Research Paper")
47
  access_key = gr.Textbox(label="Access Key", placeholder="Enter the Access Key", type="password")
48
  with gr.Row():
49
  clear_btn = gr.Button(value="Clear", variant="stop")
 
55
 
56
  summarize_btn.click(
57
  rexplore_summarizer,
58
+ inputs=[url, title, id, citation, access_key],
59
  outputs=[raw_data, summary, mindmap],
60
  concurrency_limit=25,
61
  scroll_to_output=True,
 
63
  api_name="rexplore_summarizer",
64
  show_progress="full",
65
  )
66
+ clear_btn.click(clear_everything, inputs=[url, title, id, citation, raw_data, summary, mindmap, access_key], outputs=[url, id, raw_data, summary, mindmap, access_key], show_api=False)
67
 
68
  app.queue(default_concurrency_limit=25).launch(show_api=True, max_threads=500, ssr_mode=False)
main.py CHANGED
@@ -1,8 +1,7 @@
1
  from extract_text import extract_text_from_pdf
2
  from math_summarizer import generate_math_summary
3
- from nlp_summarizer import generate_nlp_summary_and_mindmap
4
  import json
5
- import openai
6
  import dotenv
7
  import time
8
  import os
@@ -10,7 +9,7 @@ import os
10
  dotenv.load_dotenv()
11
  ACCESS_KEY = os.getenv("ACCESS_KEY")
12
 
13
- def generate_summary_mindmap(corpus):
14
  response = {}
15
  math_summary = generate_math_summary(corpus)
16
  # print(f'As a text script expert, please help me to write a short text script with the topic \" {math_summary}\".You have three tasks, which are:\\n 1.to summarize the text I provided into a Summary .Please answer within 150-300 characters.\\n 2.to summarize the text I provided, using up to seven Highlight.\\n 3.to summarize the text I provided, using up to seven Key Insights. Each insight should include a brief in-depth analysis. Key Insight should not include timestamps.\\n Your output should use the following template strictly, provide the results for the three tasks:\\n ## Summary\\n ## Highlights\\n - Highlights\\n ## Key Insights\\n - Key Insights .\\n Importantly your output must use language \"English\"')
@@ -23,25 +22,26 @@ def generate_summary_mindmap(corpus):
23
  response["mindmap"] = None
24
  return response
25
  else:
26
- response = generate_nlp_summary_and_mindmap(math_summary)
27
- print(len(response))
28
  return response
29
 
30
- def main(url, id, access_key):
31
  if access_key != ACCESS_KEY:
32
  return {"error": "Invalid Access Key", "summary": None, "mindmap": None}
33
  else:
34
  corpus = extract_text_from_pdf(url, id)
35
  start_time = time.time()
36
- response = generate_summary_mindmap(corpus)
37
  print(f"Total timetaken: {time.time() - start_time} seconds")
38
  return json.dumps(response, indent=4, ensure_ascii=False)
39
 
40
  if __name__ == "__main__":
41
  url = "https://arxiv.org/pdf/2412.21024"
42
  id = "123"
 
43
  access_key = "1234"
44
- data = main(url, id, access_key)
45
  print(len(data))
 
46
  with open("output.json", "w", encoding='utf-8') as f:
47
  json.dump(data, f, ensure_ascii=False, indent=4)
 
1
  from extract_text import extract_text_from_pdf
2
  from math_summarizer import generate_math_summary
3
+ from nlp_processes import generate_nlp_summary_and_mindmap
4
  import json
 
5
  import dotenv
6
  import time
7
  import os
 
9
  dotenv.load_dotenv()
10
  ACCESS_KEY = os.getenv("ACCESS_KEY")
11
 
12
+ def generate_summary_mindmap(corpus, title, citation):
13
  response = {}
14
  math_summary = generate_math_summary(corpus)
15
  # print(f'As a text script expert, please help me to write a short text script with the topic \" {math_summary}\".You have three tasks, which are:\\n 1.to summarize the text I provided into a Summary .Please answer within 150-300 characters.\\n 2.to summarize the text I provided, using up to seven Highlight.\\n 3.to summarize the text I provided, using up to seven Key Insights. Each insight should include a brief in-depth analysis. Key Insight should not include timestamps.\\n Your output should use the following template strictly, provide the results for the three tasks:\\n ## Summary\\n ## Highlights\\n - Highlights\\n ## Key Insights\\n - Key Insights .\\n Importantly your output must use language \"English\"')
 
22
  response["mindmap"] = None
23
  return response
24
  else:
25
+ response = generate_nlp_summary_and_mindmap(math_summary, title, citation)
 
26
  return response
27
 
28
def main(url, title, id, citation, access_key):
    """Summarize the research paper at *url* and return a JSON string.

    Parameters:
        url: location of the paper's PDF.
        title: paper title, forwarded to the NLP cleanup step.
        id: DOI/arXiv identifier used by the text extractor.
        citation: raw citation string, forwarded to the NLP cleanup step.
        access_key: shared secret compared against the ACCESS_KEY env var.

    Always returns a JSON-encoded string, so callers can unconditionally
    ``json.loads()`` the result (app.py does exactly that).
    """
    if access_key != ACCESS_KEY:
        # BUG FIX: this branch previously returned a plain dict while the
        # success branch returned a JSON string; the caller json.loads()-es
        # the result, so the dict crashed it. Encode the error payload too,
        # and include "mindmap_status" because the caller reads that key.
        error = {
            "error": "Invalid Access Key",
            "mindmap_status": "failed",
            "summary": None,
            "mindmap": None,
        }
        return json.dumps(error, indent=4, ensure_ascii=False)
    corpus = extract_text_from_pdf(url, id)
    start_time = time.time()  # timed separately from PDF extraction above
    response = generate_summary_mindmap(corpus, title, citation)
    print(f"Total timetaken: {time.time() - start_time} seconds")
    return json.dumps(response, indent=4, ensure_ascii=False)
37
 
38
if __name__ == "__main__":
    # Smoke-test driver: summarize one arXiv paper and persist the result.
    url = "https://arxiv.org/pdf/2412.21024"
    id = "123"
    title = "Trading linearity for ellipticity: a nonsmooth approach to Einstein’s theory of gravity and the Lorentzian splitting theorems"
    # BUG FIX: main() takes five arguments; the old call omitted `citation`
    # entirely (it was never defined here) and raised a TypeError.
    citation = ""
    access_key = "1234"
    data = main(url, title, id, citation, access_key)
    print(len(data))
    # BUG FIX: main() returns a JSON *string*; the old code did
    # data['title'] (TypeError: string indices must be integers) and then
    # json.dump()-ed the string, producing double-encoded JSON on disk.
    parsed = json.loads(data)
    print(parsed.get("title"))  # .get(): error payloads carry no "title"
    with open("output.json", "w", encoding="utf-8") as f:
        json.dump(parsed, f, ensure_ascii=False, indent=4)
math_summarizer.py CHANGED
@@ -32,7 +32,7 @@ def sanitize_text(input_string):
32
  def generate_math_summary(research_paper_text):
33
  sanitized_text = sanitize_text(research_paper_text)
34
  try:
35
- textrank_summary = luhn_summary = lsa_summary = lexrank_summary = None
36
  def run_textrank():
37
  nonlocal textrank_summary
38
  textrank_summary = generate_textrank_summary(sanitized_text)
 
32
  def generate_math_summary(research_paper_text):
33
  sanitized_text = sanitize_text(research_paper_text)
34
  try:
35
+ textrank_summary = luhn_summary = None
36
  def run_textrank():
37
  nonlocal textrank_summary
38
  textrank_summary = generate_textrank_summary(sanitized_text)
nlp_summarizer.py → nlp_processes.py RENAMED
@@ -1,12 +1,12 @@
1
  from g4f.client import Client
2
- from g4f.Provider import RetryProvider, Blackbox, MetaAI
3
  import threading
4
 
5
  def generate_nlp_summary(temp_summary):
6
  try:
7
  client = Client(
8
  provider=RetryProvider(
9
- providers=[Blackbox, MetaAI],
10
  shuffle=True,
11
  single_provider_retry=True,
12
  max_retries=3,
@@ -28,7 +28,7 @@ def generate_nlp_mindmap(temp_summary):
28
  try:
29
  client = Client(
30
  provider=RetryProvider(
31
- providers=[Blackbox, MetaAI],
32
  shuffle=True,
33
  single_provider_retry=True,
34
  max_retries=3,
@@ -46,8 +46,108 @@ def generate_nlp_mindmap(temp_summary):
46
  print(str(e))
47
  return False
48
 
49
- def generate_nlp_summary_and_mindmap(temp_summary):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  response = {}
 
 
 
 
 
 
 
 
 
 
 
 
51
  def local_generate_nlp_summary():
52
  nlp_summary = generate_nlp_summary(temp_summary)
53
  if not nlp_summary:
@@ -67,6 +167,8 @@ def generate_nlp_summary_and_mindmap(temp_summary):
67
  threads = []
68
  threads.append(threading.Thread(target=local_generate_nlp_summary))
69
  threads.append(threading.Thread(target=local_generate_nlp_mindmap))
 
 
70
  for thread in threads:
71
  thread.start()
72
  for thread in threads:
 
1
  from g4f.client import Client
2
+ from g4f.Provider import RetryProvider, Blackbox, MetaAI, BlackboxCreateAgent
3
  import threading
4
 
5
  def generate_nlp_summary(temp_summary):
6
  try:
7
  client = Client(
8
  provider=RetryProvider(
9
+ providers=[Blackbox, MetaAI, BlackboxCreateAgent],
10
  shuffle=True,
11
  single_provider_retry=True,
12
  max_retries=3,
 
28
  try:
29
  client = Client(
30
  provider=RetryProvider(
31
+ providers=[Blackbox, MetaAI, BlackboxCreateAgent],
32
  shuffle=True,
33
  single_provider_retry=True,
34
  max_retries=3,
 
46
  print(str(e))
47
  return False
48
 
49
def fix_title(title):
    """Use an LLM to decode escaped/encoded characters in *title*.

    Returns the cleaned title text on success, or ``False`` when every
    provider attempt fails (callers fall back to the raw title then).
    """
    system_prompt = (
        "You are a highly advanced language model with strict adherence to precision and accuracy. \n\n"
        "Your task is to process input text, identify and correct any encoded or escaped characters, and render the text into a readable format. \n\n"
        "**Requirements:**\n"
        "1. Correctly decode and render any encoded characters (e.g., \\x sequences or LaTeX-style expressions) into their intended readable forms.\n"
        "2. Accurately interpret and render mathematical expressions using MathJax where appropriate.\n"
        "3. Produce **only the corrected sequence** as output—no additional commentary, metadata, or extraneous punctuation.\n"
        "4. Maintain the structure and style of the original input text, ensuring it remains true to its intended meaning and formatting.\n\n"
        "**Input Example:**\n"
        "From Painlev\\xe9 equations to ${\\cal N}=2$ susy gauge theories: prolegomena TDI-$\\\\infty$\n\n"
        "**Output Example:**\n"
        "From Painlevé equations to \\({\\cal N}=2\\) SUSY gauge theories: prolegomena TDI-\\(\\infty\\)"
    )
    user_prompt = repr(
        "Convert the following text into a normal, readable sequence, ensuring accurate interpretation and correction of encoded or escaped characters where necessary. "
        "The output must strictly adhere to the input text's original structure, maintaining readability and formatting. Use MathJax where applicable to correctly render mathematical expressions, ensuring the final sequence is flawless. "
        "Provide only the corrected sequence as output, with no additional commentary, formatting, or extraneous punctuation beyond what is specified in the input text.\n\n"
        f"**Input:** {title}\n\n"
    )
    try:
        llm = Client(
            provider=RetryProvider(
                providers=[Blackbox, MetaAI, BlackboxCreateAgent],
                shuffle=True,
                single_provider_retry=True,
                max_retries=3,
            ),
        )
        result = llm.chat.completions.create(
            model="llama-3.1-405b",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
        )
        return result.choices[0].message.content
    except Exception as exc:
        # Best-effort: any provider/network failure is reported and mapped
        # to False so the pipeline can fall back to the raw title.
        print(str(exc))
        return False
93
+
94
def fix_citation(citation):
    """Use an LLM to repair encoding glitches and APA-format *citation*.

    Returns the corrected citation text on success, or ``False`` when every
    provider attempt fails (callers fall back to the raw citation then).
    """
    system_prompt = (
        "You are a highly advanced language model with strict adherence to precision and formatting. Your task is to process input text and correct any encoding errors or formatting issues, rendering it into a readable citation in APA latest edition format. \n\n"
        "Requirements:\n"
        "Accurately decode and render any encoded characters (e.g., special character codes like ’).\n"
        "Correctly format the citation in strict compliance with the APA latest edition guidelines.\n"
        "Produce only the corrected citation as output, with no additional commentary, metadata, or extraneous punctuation beyond what is specified in the text.\n"
        "Ensure mathematical expressions, if any, are rendered using MathJax where applicable, maintaining their proper APA context.\n"
        "Input Example:\n"
        "McCann, R. J. (2025). Trading linearity for ellipticity: a nonsmooth approach to Einstein’s theory of gravity and the Lorentzian splitting theorems (Version 1). arXiv. https://doi.org/10.48550/ARXIV.2501.00702"
        "Expected Output Example:\n"
        "McCann, R. J. (2025). Trading linearity for ellipticity: A nonsmooth approach to Einstein’s theory of gravity and the Lorentzian splitting theorems (Version 1). arXiv. https://doi.org/10.48550/ARXIV.2501.00702"
    )
    user_prompt = repr(
        "Convert the following text into a properly formatted citation in strict compliance with APA latest edition guidelines. Correct any encoding errors (e.g., ’) and ensure the output is clean, readable, and adheres to APA rules. Render mathematical expressions using MathJax where applicable, preserving proper context.\n"
        "Provide only the corrected citation as output, with no additional commentary, metadata, or extraneous punctuation beyond what is specified in the text.\n"
        f"**Input:** {citation}\n\n"
    )
    try:
        llm = Client(
            provider=RetryProvider(
                providers=[Blackbox, MetaAI, BlackboxCreateAgent],
                shuffle=True,
                single_provider_retry=True,
                max_retries=3,
            ),
        )
        result = llm.chat.completions.create(
            model="llama-3.1-405b",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
        )
        return result.choices[0].message.content
    except Exception as exc:
        # Best-effort: any provider/network failure is reported and mapped
        # to False so the pipeline can fall back to the raw citation.
        print(str(exc))
        return False
136
+
137
+ def generate_nlp_summary_and_mindmap(temp_summary, title, citation):
138
  response = {}
139
def local_fix_title():
    # Store the LLM-cleaned title; keep the raw title when cleanup
    # returned a falsy failure value.
    cleaned = fix_title(title)
    response["title"] = cleaned.strip() if cleaned else title
145
def local_fix_citation():
    # Store the LLM-cleaned citation; keep the raw citation when cleanup
    # returned a falsy failure value.
    fixed_citation = fix_citation(citation)
    if not fixed_citation:
        response["citation"] = citation
    else:
        # BUG FIX: the original wrote `fixed_citation.strip` (no call),
        # which stored the bound method object instead of the stripped
        # string, corrupting the JSON response.
        response["citation"] = fixed_citation.strip()
151
  def local_generate_nlp_summary():
152
  nlp_summary = generate_nlp_summary(temp_summary)
153
  if not nlp_summary:
 
167
  threads = []
168
  threads.append(threading.Thread(target=local_generate_nlp_summary))
169
  threads.append(threading.Thread(target=local_generate_nlp_mindmap))
170
+ threads.append(threading.Thread(target=local_fix_title))
171
+ threads.append(threading.Thread(target=local_fix_citation))
172
  for thread in threads:
173
  thread.start()
174
  for thread in threads:
requirements.txt CHANGED
@@ -3,4 +3,4 @@ gradio==5.8.0
3
  python-dotenv==1.0.1
4
  pdfplumber==0.11.4
5
  requests==2.32.3
6
- g4f[all]==0.4.0.2
 
3
  python-dotenv==1.0.1
4
  pdfplumber==0.11.4
5
  requests==2.32.3
6
+ g4f[all]==0.4.0.4