Commit a90f1c4 · Parent: 6e23a32
Update dependencies and refactor summarization functions to include title and citation

Files changed:
- app.py (+8 -6)
- main.py (+8 -8)
- math_summarizer.py (+1 -1)
- nlp_summarizer.py → nlp_processes.py (+106 -4)
- requirements.txt (+1 -1)
app.py CHANGED

@@ -2,8 +2,8 @@ import gradio as gr
 from main import main
 import json
 
-def rexplore_summarizer(url, id, access_key):
-    response = json.loads(main(url, id, access_key))
+def rexplore_summarizer(url, title, id, citation, access_key):
+    response = json.loads(main(url, title, id, citation, access_key))
     data = json.dumps(response, indent=4, ensure_ascii=False)
     if response["mindmap_status"] != "success":
         mindmap = "error"
@@ -15,7 +15,7 @@ def rexplore_summarizer(url, id, access_key):
     summary = response["summary"]
     return data, summary, mindmap
 
-def clear_everything(url, id, access_key, raw_data, summary, mindmap):
+def clear_everything(url, title, id, citation, access_key, raw_data, summary, mindmap):
     return None, None, None, None, None, None
 
 theme = gr.themes.Soft(
@@ -41,7 +41,9 @@ with gr.Blocks(theme=theme, title="ReXplore Summarizer", fill_height=True) as ap
     with gr.Row():
         with gr.Column():
             url = gr.Textbox(label="PDF URL", placeholder="Paste the PDF URL here")
-            …
+            title = gr.Textbox(label="Title", placeholder="Enter the title Research Paper")
+            id = gr.Textbox(label="DOI/arXiv ID", placeholder="Enter the DOI or arXiv ID of the Research Paper")
+            citation = gr.Textbox(label="Citation", placeholder="Enter the citation of the Research Paper")
             access_key = gr.Textbox(label="Access Key", placeholder="Enter the Access Key", type="password")
     with gr.Row():
         clear_btn = gr.Button(value="Clear", variant="stop")
@@ -53,7 +55,7 @@ with gr.Blocks(theme=theme, title="ReXplore Summarizer", fill_height=True) as ap
 
     summarize_btn.click(
         rexplore_summarizer,
-        inputs=[url, id, access_key],
+        inputs=[url, title, id, citation, access_key],
        outputs=[raw_data, summary, mindmap],
         concurrency_limit=25,
         scroll_to_output=True,
@@ -61,6 +63,6 @@ with gr.Blocks(theme=theme, title="ReXplore Summarizer", fill_height=True) as ap
         api_name="rexplore_summarizer",
         show_progress="full",
     )
-    clear_btn.click(clear_everything, inputs=[url, id, raw_data, summary, mindmap, access_key], outputs=[url, id, raw_data, summary, mindmap, access_key], show_api=False)
+    clear_btn.click(clear_everything, inputs=[url, title, id, citation, raw_data, summary, mindmap, access_key], outputs=[url, id, raw_data, summary, mindmap, access_key], show_api=False)
 
 app.queue(default_concurrency_limit=25).launch(show_api=True, max_threads=500, ssr_mode=False)
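Because the Blocks app registers the endpoint under api_name="rexplore_summarizer" and launches with show_api=True, the new five-input signature can be exercised from outside the Space with gradio_client. A minimal client-side sketch, assuming a hypothetical Space id and a valid access key:

# Client-side sketch; the Space id and access key below are placeholders.
from gradio_client import Client

client = Client("username/ReXplore-Summarizer")     # hypothetical Space id
raw_data, summary, mindmap = client.predict(
    "https://arxiv.org/pdf/2412.21024",             # url
    "Example Research Paper Title",                 # title (new input in this commit)
    "10.48550/ARXIV.2412.21024",                    # id (DOI or arXiv ID)
    "Author, A. (2024). Example Research Paper.",   # citation (new input in this commit)
    "your-access-key",                              # access_key
    api_name="/rexplore_summarizer",
)
print(summary)

The three return values map onto outputs=[raw_data, summary, mindmap] in the click wiring above.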
main.py CHANGED

@@ -1,8 +1,7 @@
 from extract_text import extract_text_from_pdf
 from math_summarizer import generate_math_summary
-from nlp_summarizer import generate_nlp_summary_and_mindmap
+from nlp_processes import generate_nlp_summary_and_mindmap
 import json
-import openai
 import dotenv
 import time
 import os
@@ -10,7 +9,7 @@ import os
 dotenv.load_dotenv()
 ACCESS_KEY = os.getenv("ACCESS_KEY")
 
-def generate_summary_mindmap(corpus):
+def generate_summary_mindmap(corpus, title, citation):
     response = {}
     math_summary = generate_math_summary(corpus)
     # print(f'As a text script expert, please help me to write a short text script with the topic \" {math_summary}\".You have three tasks, which are:\\n 1.to summarize the text I provided into a Summary .Please answer within 150-300 characters.\\n 2.to summarize the text I provided, using up to seven Highlight.\\n 3.to summarize the text I provided, using up to seven Key Insights. Each insight should include a brief in-depth analysis. Key Insight should not include timestamps.\\n Your output should use the following template strictly, provide the results for the three tasks:\\n ## Summary\\n ## Highlights\\n - Highlights\\n ## Key Insights\\n - Key Insights .\\n Importantly your output must use language \"English\"')
@@ -23,25 +22,26 @@ def generate_summary_mindmap(corpus):
         response["mindmap"] = None
         return response
     else:
-        response = generate_nlp_summary_and_mindmap(math_summary)
-        print(len(response))
+        response = generate_nlp_summary_and_mindmap(math_summary, title, citation)
         return response
 
-def main(url, id, access_key):
+def main(url, title, id, citation, access_key):
     if access_key != ACCESS_KEY:
         return {"error": "Invalid Access Key", "summary": None, "mindmap": None}
     else:
         corpus = extract_text_from_pdf(url, id)
         start_time = time.time()
-        response = generate_summary_mindmap(corpus)
+        response = generate_summary_mindmap(corpus, title, citation)
         print(f"Total timetaken: {time.time() - start_time} seconds")
         return json.dumps(response, indent=4, ensure_ascii=False)
 
 if __name__ == "__main__":
     url = "https://arxiv.org/pdf/2412.21024"
     id = "123"
+    title = "Trading linearity for ellipticity: a nonsmooth approach to Einsteinâs theory of gravity and the Lorentzian splitting theorems"
     access_key = "1234"
-    data = main(url, id, access_key)
+    data = main(url, title, id, access_key)
     print(len(data))
+    print(data['title'])
     with open("output.json", "w", encoding='utf-8') as f:
         json.dump(data, f, ensure_ascii=False, indent=4)
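main() serializes its result with json.dumps before returning, so a caller has to decode the string before indexing keys such as "title"; the committed __main__ block appears to index the string directly and also drops the citation argument. A small driver sketch under that reading, with placeholder inputs:

# Driver sketch; inputs are placeholders and ACCESS_KEY must be set in the environment.
import json
from main import main

raw = main(
    "https://arxiv.org/pdf/2412.21024",                   # url
    "Example Research Paper Title",                       # title
    "10.48550/ARXIV.2412.21024",                          # id
    "Author, A. (2024). Example Research Paper. arXiv.",  # citation
    "1234",                                               # access_key
)
response = json.loads(raw)        # main() returns a JSON string, not a dict
print(response.get("title"))
print(response.get("citation"))

with open("output.json", "w", encoding="utf-8") as f:
    f.write(raw)                  # already JSON-encoded; no second json.dump needed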
math_summarizer.py CHANGED

@@ -32,7 +32,7 @@ def sanitize_text(input_string):
 def generate_math_summary(research_paper_text):
     sanitized_text = sanitize_text(research_paper_text)
     try:
-        textrank_summary = luhn_summary = …
+        textrank_summary = luhn_summary = None
         def run_textrank():
             nonlocal textrank_summary
             textrank_summary = generate_textrank_summary(sanitized_text)
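The changed line pre-binds textrank_summary and luhn_summary to None in the enclosing scope so the threaded worker closures can rebind them via nonlocal. A stripped-down sketch of that pattern, with placeholder work standing in for the real summarizers in math_summarizer.py:

# Illustrative pattern only; the real workers call the TextRank and Luhn summarizers.
import threading

def summarize_both(text):
    textrank_summary = luhn_summary = None   # pre-bind so the closures have a target to rebind

    def run_textrank():
        nonlocal textrank_summary
        textrank_summary = f"textrank summary of {len(text)} chars"   # placeholder work

    def run_luhn():
        nonlocal luhn_summary
        luhn_summary = f"luhn summary of {len(text)} chars"           # placeholder work

    threads = [threading.Thread(target=run_textrank), threading.Thread(target=run_luhn)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    return textrank_summary, luhn_summary

print(summarize_both("some sanitized research-paper text"))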
nlp_summarizer.py → nlp_processes.py RENAMED

@@ -1,12 +1,12 @@
 from g4f.client import Client
-from g4f.Provider import RetryProvider, Blackbox, MetaAI
+from g4f.Provider import RetryProvider, Blackbox, MetaAI, BlackboxCreateAgent
 import threading
 
 def generate_nlp_summary(temp_summary):
     try:
         client = Client(
             provider=RetryProvider(
-                providers=[Blackbox, MetaAI],
+                providers=[Blackbox, MetaAI, BlackboxCreateAgent],
                 shuffle=True,
                 single_provider_retry=True,
                 max_retries=3,
@@ -28,7 +28,7 @@ def generate_nlp_mindmap(temp_summary):
     try:
         client = Client(
             provider=RetryProvider(
-                providers=[Blackbox, MetaAI],
+                providers=[Blackbox, MetaAI, BlackboxCreateAgent],
                 shuffle=True,
                 single_provider_retry=True,
                 max_retries=3,
@@ -46,8 +46,108 @@ def generate_nlp_mindmap(temp_summary):
         print(str(e))
         return False
 
-def generate_nlp_summary_and_mindmap(temp_summary):
+def fix_title(title):
+    try:
+        client = Client(
+            provider=RetryProvider(
+                providers=[Blackbox, MetaAI, BlackboxCreateAgent],
+                shuffle=True,
+                single_provider_retry=True,
+                max_retries=3,
+            ),
+        )
+        completion = client.chat.completions.create(
+            model="llama-3.1-405b",
+            messages=[
+                {
+                    "role": "system",
+                    "content": (
+                        "You are a highly advanced language model with strict adherence to precision and accuracy. \n\n"
+                        "Your task is to process input text, identify and correct any encoded or escaped characters, and render the text into a readable format. \n\n"
+                        "**Requirements:**\n"
+                        "1. Correctly decode and render any encoded characters (e.g., \\x sequences or LaTeX-style expressions) into their intended readable forms.\n"
+                        "2. Accurately interpret and render mathematical expressions using MathJax where appropriate.\n"
+                        "3. Produce **only the corrected sequence** as output—no additional commentary, metadata, or extraneous punctuation.\n"
+                        "4. Maintain the structure and style of the original input text, ensuring it remains true to its intended meaning and formatting.\n\n"
+                        "**Input Example:**\n"
+                        "From Painlev\\xe9 equations to ${\\cal N}=2$ susy gauge theories: prolegomena TDI-$\\\\infty$\n\n"
+                        "**Output Example:**\n"
+                        "From Painlevé equations to \\({\\cal N}=2\\) SUSY gauge theories: prolegomena TDI-\\(\\infty\\)"
+                    ),
+                },
+                {
+                    "role": "user",
+                    "content": repr(
+                        "Convert the following text into a normal, readable sequence, ensuring accurate interpretation and correction of encoded or escaped characters where necessary. "
+                        "The output must strictly adhere to the input text's original structure, maintaining readability and formatting. Use MathJax where applicable to correctly render mathematical expressions, ensuring the final sequence is flawless. "
+                        "Provide only the corrected sequence as output, with no additional commentary, formatting, or extraneous punctuation beyond what is specified in the input text.\n\n"
+                        f"**Input:** {title}\n\n"
+                    ),
+                },
+            ],
+        )
+        return completion.choices[0].message.content
+    except Exception as e:
+        print(str(e))
+        return False
+
+def fix_citation(citation):
+    try:
+        client = Client(
+            provider=RetryProvider(
+                providers=[Blackbox, MetaAI, BlackboxCreateAgent],
+                shuffle=True,
+                single_provider_retry=True,
+                max_retries=3,
+            ),
+        )
+        completion = client.chat.completions.create(
+            model="llama-3.1-405b",
+            messages=[
+                {
+                    "role": "system",
+                    "content": (
+                        "You are a highly advanced language model with strict adherence to precision and formatting. Your task is to process input text and correct any encoding errors or formatting issues, rendering it into a readable citation in APA latest edition format. \n\n"
+                        "Requirements:\n"
+                        "Accurately decode and render any encoded characters (e.g., special character codes like â).\n"
+                        "Correctly format the citation in strict compliance with the APA latest edition guidelines.\n"
+                        "Produce only the corrected citation as output, with no additional commentary, metadata, or extraneous punctuation beyond what is specified in the text.\n"
+                        "Ensure mathematical expressions, if any, are rendered using MathJax where applicable, maintaining their proper APA context.\n"
+                        "Input Example:\n"
+                        "McCann, R. J. (2025). Trading linearity for ellipticity: a nonsmooth approach to Einsteinâs theory of gravity and the Lorentzian splitting theorems (Version 1). arXiv. https://doi.org/10.48550/ARXIV.2501.00702"
+                        "Expected Output Example:\n"
+                        "McCann, R. J. (2025). Trading linearity for ellipticity: A nonsmooth approach to Einstein’s theory of gravity and the Lorentzian splitting theorems (Version 1). arXiv. https://doi.org/10.48550/ARXIV.2501.00702"
+                    ),
+                },
+                {
+                    "role": "user",
+                    "content": repr(
+                        "Convert the following text into a properly formatted citation in strict compliance with APA latest edition guidelines. Correct any encoding errors (e.g., â) and ensure the output is clean, readable, and adheres to APA rules. Render mathematical expressions using MathJax where applicable, preserving proper context.\n"
+                        "Provide only the corrected citation as output, with no additional commentary, metadata, or extraneous punctuation beyond what is specified in the text.\n"
+                        f"**Input:** {citation}\n\n"
+                    ),
+                },
+            ],
+        )
+        return completion.choices[0].message.content
+    except Exception as e:
+        print(str(e))
+        return False
+
+def generate_nlp_summary_and_mindmap(temp_summary, title, citation):
     response = {}
+    def local_fix_title():
+        fixed_title = fix_title(title)
+        if not fixed_title:
+            response["title"] = title
+        else:
+            response["title"] = fixed_title.strip()
+    def local_fix_citation():
+        fixed_citation = fix_citation(citation)
+        if not fixed_citation:
+            response["citation"] = citation
+        else:
+            response["citation"] = fixed_citation.strip
     def local_generate_nlp_summary():
         nlp_summary = generate_nlp_summary(temp_summary)
         if not nlp_summary:
@@ -67,6 +167,8 @@ def generate_nlp_summary_and_mindmap(temp_summary):
     threads = []
     threads.append(threading.Thread(target=local_generate_nlp_summary))
     threads.append(threading.Thread(target=local_generate_nlp_mindmap))
+    threads.append(threading.Thread(target=local_fix_title))
+    threads.append(threading.Thread(target=local_fix_citation))
     for thread in threads:
         thread.start()
     for thread in threads:
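With the rename, generate_nlp_summary_and_mindmap now fans four workers out over one shared response dict: NLP summary, mindmap, title clean-up, and citation clean-up, each falling back to the raw input when its g4f call returns False. A minimal call sketch with placeholder inputs (the summary and mindmap keys come from parts of the file outside this diff); note that local_fix_citation as committed assigns fixed_citation.strip without calling it, so the stored value would be the bound method rather than the cleaned string.

# Sketch of calling the renamed module directly; all inputs are placeholders.
from nlp_processes import generate_nlp_summary_and_mindmap

temp_summary = "Condensed text produced by generate_math_summary(...)"
title = "From Painlev\\xe9 equations to ${\\cal N}=2$ susy gauge theories"      # mojibake-style input
citation = "McCann, R. J. (2025). Trading linearity for ellipticity ... arXiv."  # encoding errors welcome

response = generate_nlp_summary_and_mindmap(temp_summary, title, citation)
print(response.get("title"))       # cleaned title, or the original if fix_title() returned False
print(response.get("citation"))    # cleaned citation, or the original on failure
print(response.get("mindmap_status"), len(response.get("summary") or ""))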
requirements.txt CHANGED

@@ -3,4 +3,4 @@ gradio==5.8.0
 python-dotenv==1.0.1
 pdfplumber==0.11.4
 requests==2.32.3
-g4f[all]==0.4.0.
+g4f[all]==0.4.0.4
|