Commit · 48f0f78
Parent(s): 3c576d2
Refactor summarizer functions to accept URL, ID, and access key; implement PDF text extraction
Files changed:
- app.py +9 -7
- extract_text.py +34 -0
- main.py +12 -6
- requirements.txt +3 -0
app.py
CHANGED
@@ -1,12 +1,12 @@
 import gradio as gr
 from main import main
 
-def rexplore_summarizer(
-    response = main(
+def rexplore_summarizer(url, id, access_key):
+    response = main(url, id, access_key)
     return response, response['summary'], response['mindmap']
 
-def clear_everything(
-    return None, None, None, None
+def clear_everything(url, id, access_key, raw_data, summary, mindmap):
+    return None, None, None, None, None, None
 
 theme = gr.themes.Soft(
     primary_hue="purple",
@@ -30,7 +30,9 @@ with gr.Blocks(theme=theme, title="ReXplore Summarizer", fill_height=True) as app:
     ''')
     with gr.Row():
         with gr.Column():
-
+            url = gr.Textbox(label="PDF URL", placeholder="Paste the PDF URL here")
+            id = gr.Textbox(label="DOI/arXiv ID", placeholder="Enter the DOI or arXiv ID of the document")
+            access_key = gr.Textbox(label="Access Key", placeholder="Enter the Access Key")
             with gr.Row():
                 clear_btn = gr.Button(value="Clear", variant='stop')
                 summarize_btn = gr.Button(value="Summarize", variant='primary')
@@ -41,7 +43,7 @@ with gr.Blocks(theme=theme, title="ReXplore Summarizer", fill_height=True) as app:
 
     summarize_btn.click(
         rexplore_summarizer,
-        inputs=[
+        inputs=[url, id, access_key],
         outputs=[raw_data, summary, mindmap],
         concurrency_limit=25,
         scroll_to_output=True,
@@ -49,6 +51,6 @@ with gr.Blocks(theme=theme, title="ReXplore Summarizer", fill_height=True) as app:
         api_name="rexplore_summarizer",
         show_progress="full",
     )
-    clear_btn.click(clear_everything, inputs=[
+    clear_btn.click(clear_everything, inputs=[url, id, raw_data, summary, mindmap, access_key], outputs=[url, id, raw_data, summary, mindmap, access_key], show_api=False)
 
 app.queue(default_concurrency_limit=25).launch(show_api=True)
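For reference, a minimal sketch of how the updated endpoint could be called from a client once the Space is running; this is not part of the commit, and the Space URL, PDF URL, ID, and access key below are placeholders.

    from gradio_client import Client

    # Placeholder Space URL; substitute the actual deployment address.
    client = Client("http://127.0.0.1:7860/")

    # The click handler exposes api_name="rexplore_summarizer" with three inputs
    # (url, id, access_key) and three outputs (raw_data, summary, mindmap).
    raw_data, summary, mindmap = client.predict(
        "https://arxiv.org/pdf/1706.03762",  # PDF URL (placeholder)
        "1706.03762",                        # DOI/arXiv ID (placeholder)
        "my-access-key",                     # must match the Space's ACCESS_KEY
        api_name="/rexplore_summarizer",
    )
    print(summary)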
extract_text.py
ADDED
@@ -0,0 +1,34 @@
+from pdfplumber import open as pdf_open
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+import requests
+import os
+
+def download_pdf(url, id):
+    file_path = f"{id}.pdf"
+    response = requests.get(url)
+    with open(file_path, 'wb') as file:
+        file.write(response.content)
+    return file_path
+
+def extract_text_from_pdf(url, id):
+    pdf_path = download_pdf(url, id)
+    try:
+        with pdf_open(pdf_path) as pdf:
+            all_text = ""
+            for page in pdf.pages:
+                all_text += page.extract_text() + " "
+        start_index = all_text.find("ABSTRACT")
+        end_index = all_text.find("REFERENCES")
+        if start_index != -1 and end_index != -1 and start_index < end_index:
+            relevant_text = all_text[start_index:end_index]
+        else:
+            relevant_text = all_text
+        text_splitter = RecursiveCharacterTextSplitter(chunk_size=250, chunk_overlap=50)
+        text_list = text_splitter.split_text(relevant_text)
+        research_paper_text = "".join(text_list)
+    except Exception as e:
+        print(f"Error processing PDF: {e}")
+        research_paper_text = ""
+    finally:
+        os.remove(pdf_path)
+    return research_paper_text
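For context, a short usage sketch of the new helper (not part of the commit); the arXiv URL and ID are placeholders.

    from extract_text import extract_text_from_pdf

    # Downloads the PDF to "<id>.pdf", keeps the ABSTRACT..REFERENCES span when both
    # markers are found, splits it into 250-character chunks with 50-character overlap,
    # joins the chunks back into one string, and deletes the temporary file.
    text = extract_text_from_pdf("https://arxiv.org/pdf/1706.03762", "1706.03762")
    print(text[:300])

One consequence of re-joining the overlapping chunks with "".join is that roughly 50 characters are duplicated at each chunk boundary in the returned string.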
main.py
CHANGED
@@ -1,3 +1,4 @@
+from extract_text import extract_text_from_pdf
 from math_summarizer import generate_math_summary
 from nlp_summarizer import generate_nlp_summary_and_mindmap
 import openai
@@ -7,6 +8,7 @@ import os
 
 dotenv.load_dotenv()
 API_KEY = os.getenv('API_KEY')
+ACCESS_KEY = os.getenv('ACCESS_KEY')
 
 def create_client(api_key):
     client = openai.OpenAI(
@@ -29,9 +31,13 @@ def generate_summary(client, corpus):
     response = generate_nlp_summary_and_mindmap(client, corpus)
     return response
 
-def main(
-
-
-
-
-
+def main(url, id, access_key):
+    if access_key != ACCESS_KEY:
+        return {"error": "Invalid Access Key", "summary": None, "mindmap": None}
+    else:
+        corpus = extract_text_from_pdf(url, id)
+        start_time = time.time()
+        client = create_client(API_KEY)
+        response = generate_summary(client, corpus)
+        print(f"Total timetaken: {time.time() - start_time} seconds")
+        return response
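A minimal sketch of exercising main() directly (not part of the commit), assuming API_KEY and ACCESS_KEY are set in the .env file loaded by dotenv; all values below are placeholders.

    from main import main

    # main() rejects the call unless access_key matches the ACCESS_KEY environment
    # variable; otherwise it extracts the PDF text and runs the summarization pipeline.
    result = main(
        "https://arxiv.org/pdf/1706.03762",  # url (placeholder)
        "1706.03762",                        # id (placeholder)
        "my-access-key",                     # access_key; must equal ACCESS_KEY
    )
    print(result["summary"])
    print(result["mindmap"])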
requirements.txt
CHANGED
@@ -1,3 +1,6 @@
 openai==1.57.3
 gradio==5.8.0
 python-dotenv==1.0.1
+pdfplumber==0.11.4
+langchain==0.3.13
+requests==2.32.3
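With these additions in place, the Space's dependencies install as usual via pip install -r requirements.txt: pdfplumber parses the downloaded PDF, langchain supplies the RecursiveCharacterTextSplitter, and requests fetches the file.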