raannakasturi commited on
Commit
48f0f78
·
1 Parent(s): 3c576d2

Refactor summarizer functions to accept URL, ID, and access key; implement PDF text extraction

Browse files
Files changed (4) hide show
  1. app.py +9 -7
  2. extract_text.py +34 -0
  3. main.py +12 -6
  4. requirements.txt +3 -0
app.py CHANGED
@@ -1,12 +1,12 @@
1
  import gradio as gr
2
  from main import main
3
 
4
- def rexplore_summarizer(corpus):
5
- response = main(corpus)
6
  return response, response['summary'], response['mindmap']
7
 
8
- def clear_everything(text_corpus, raw_data, summary, mindmap):
9
- return None, None, None, None
10
 
11
  theme = gr.themes.Soft(
12
  primary_hue="purple",
@@ -30,7 +30,9 @@ with gr.Blocks(theme=theme, title="ReXplore Summarizer", fill_height=True) as ap
30
  ''')
31
  with gr.Row():
32
  with gr.Column():
33
- text_corpus = gr.TextArea(label="Text Corpus", placeholder="Paste the text corpus here", lines=5)
 
 
34
  with gr.Row():
35
  clear_btn = gr.Button(value="Clear", variant='stop')
36
  summarize_btn = gr.Button(value="Summarize", variant='primary')
@@ -41,7 +43,7 @@ with gr.Blocks(theme=theme, title="ReXplore Summarizer", fill_height=True) as ap
41
 
42
  summarize_btn.click(
43
  rexplore_summarizer,
44
- inputs=[text_corpus],
45
  outputs=[raw_data, summary, mindmap],
46
  concurrency_limit=25,
47
  scroll_to_output=True,
@@ -49,6 +51,6 @@ with gr.Blocks(theme=theme, title="ReXplore Summarizer", fill_height=True) as ap
49
  api_name="rexplore_summarizer",
50
  show_progress="full",
51
  )
52
- clear_btn.click(clear_everything, inputs=[text_corpus, raw_data, summary, mindmap], outputs=[text_corpus, raw_data, summary, mindmap], show_api=False)
53
 
54
  app.queue(default_concurrency_limit=25).launch(show_api=True)
 
1
  import gradio as gr
2
  from main import main
3
 
4
def rexplore_summarizer(url, id, access_key):
    """Run the full summarization pipeline and unpack its result for the UI.

    Returns the raw response dict plus its 'summary' and 'mindmap' fields
    separately, so Gradio can route them to three distinct output components.
    """
    result = main(url, id, access_key)
    return result, result['summary'], result['mindmap']
7
 
8
def clear_everything(url, id, access_key, raw_data, summary, mindmap):
    """Reset all six UI components to empty; the incoming values are ignored."""
    return (None,) * 6
10
 
11
  theme = gr.themes.Soft(
12
  primary_hue="purple",
 
30
  ''')
31
  with gr.Row():
32
  with gr.Column():
33
+ url = gr.Textbox(label="PDF URL", placeholder="Paste the PDF URL here")
34
+ id = gr.Textbox(label="DOI/arXiv ID", placeholder="Enter the DOI or arXiv ID of the document")
35
+ access_key = gr.Textbox(label="Access Key", placeholder="Enter the Access Key")
36
  with gr.Row():
37
  clear_btn = gr.Button(value="Clear", variant='stop')
38
  summarize_btn = gr.Button(value="Summarize", variant='primary')
 
43
 
44
  summarize_btn.click(
45
  rexplore_summarizer,
46
+ inputs=[url, id, access_key],
47
  outputs=[raw_data, summary, mindmap],
48
  concurrency_limit=25,
49
  scroll_to_output=True,
 
51
  api_name="rexplore_summarizer",
52
  show_progress="full",
53
  )
54
+ clear_btn.click(clear_everything, inputs=[url, id, raw_data, summary, mindmap, access_key], outputs=[url, id, raw_data, summary, mindmap, access_key], show_api=False)
55
 
56
  app.queue(default_concurrency_limit=25).launch(show_api=True)
extract_text.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pdfplumber import open as pdf_open
2
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
3
+ import requests
4
+ import os
5
+
6
def download_pdf(url, id):
    """Download the PDF at *url* to a local file named ``<id>.pdf``.

    Returns the path of the written file.  Raises ``requests.HTTPError``
    on a non-2xx response so callers never parse an HTML error page as a
    PDF, and ``requests.Timeout`` instead of hanging on a stalled server.
    """
    file_path = f"{id}.pdf"
    # Timeout prevents an unresponsive host from blocking a Gradio worker
    # indefinitely; the original call had no upper bound at all.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    with open(file_path, 'wb') as file:
        file.write(response.content)
    return file_path
+
13
def extract_text_from_pdf(url, id):
    """Download a PDF and return the text between 'ABSTRACT' and 'REFERENCES'.

    Falls back to the full document text when those markers are missing or
    out of order, and returns "" if the download or parsing fails.  The
    temporary PDF file is always deleted before returning.
    """
    pdf_path = None
    research_paper_text = ""
    try:
        # Download inside the try so network failures also yield "" instead
        # of propagating out of this best-effort extractor.
        pdf_path = download_pdf(url, id)
        with pdf_open(pdf_path) as pdf:
            # extract_text() returns None for image-only pages; coerce to "".
            all_text = " ".join((page.extract_text() or "") for page in pdf.pages)
        start_index = all_text.find("ABSTRACT")
        end_index = all_text.find("REFERENCES")
        if start_index != -1 and end_index != -1 and start_index < end_index:
            research_paper_text = all_text[start_index:end_index]
        else:
            research_paper_text = all_text
        # NOTE(review): the previous version split this text into chunks with
        # chunk_overlap=50 and then re-joined them with "", which duplicated
        # ~50 characters at every chunk boundary and glued chunks together
        # with no separator — corrupting the corpus fed to the summarizer.
        # The split/re-join round trip is dropped; the text is returned as-is.
    except Exception as e:
        print(f"Error processing PDF: {e}")
        research_paper_text = ""
    finally:
        # Clean up the downloaded file even when extraction fails, but only
        # if the download actually produced one.
        if pdf_path and os.path.exists(pdf_path):
            os.remove(pdf_path)
    return research_paper_text
main.py CHANGED
@@ -1,3 +1,4 @@
 
1
  from math_summarizer import generate_math_summary
2
  from nlp_summarizer import generate_nlp_summary_and_mindmap
3
  import openai
@@ -7,6 +8,7 @@ import os
7
 
8
  dotenv.load_dotenv()
9
  API_KEY = os.getenv('API_KEY')
 
10
 
11
  def create_client(api_key):
12
  client = openai.OpenAI(
@@ -29,9 +31,13 @@ def generate_summary(client, corpus):
29
  response = generate_nlp_summary_and_mindmap(client, corpus)
30
  return response
31
 
32
- def main(corpus):
33
- start_time = time.time()
34
- client = create_client(API_KEY)
35
- response = generate_summary(client, corpus)
36
- print(f"Total timetaken: {time.time() - start_time} seconds")
37
- return response
 
 
 
 
 
1
+ from extract_text import extract_text_from_pdf
2
  from math_summarizer import generate_math_summary
3
  from nlp_summarizer import generate_nlp_summary_and_mindmap
4
  import openai
 
8
 
9
  dotenv.load_dotenv()
10
  API_KEY = os.getenv('API_KEY')
11
+ ACCESS_KEY = os.getenv('ACCESS_KEY')
12
 
13
  def create_client(api_key):
14
  client = openai.OpenAI(
 
31
  response = generate_nlp_summary_and_mindmap(client, corpus)
32
  return response
33
 
34
def main(url, id, access_key):
    """Summarize the PDF at *url* after validating the caller's access key.

    Returns the summarizer response dict, or an error dict carrying the
    same 'summary'/'mindmap' keys when authorization fails.
    """
    # Reject when ACCESS_KEY is unset: the original `access_key != ACCESS_KEY`
    # check let a request that sent no key through whenever the env var was
    # missing (None != None is False), silently granting access.
    if not ACCESS_KEY or access_key != ACCESS_KEY:
        return {"error": "Invalid Access Key", "summary": None, "mindmap": None}
    # Guard clause above removes the needless `else` indentation; the timer
    # now also covers PDF download/extraction, not just the LLM call.
    start_time = time.time()
    corpus = extract_text_from_pdf(url, id)
    client = create_client(API_KEY)
    response = generate_summary(client, corpus)
    print(f"Total time taken: {time.time() - start_time} seconds")
    return response
requirements.txt CHANGED
@@ -1,3 +1,6 @@
1
  openai==1.57.3
2
  gradio==5.8.0
3
  python-dotenv==1.0.1
 
 
 
 
1
  openai==1.57.3
2
  gradio==5.8.0
3
  python-dotenv==1.0.1
4
+ pdfplumber==0.11.4
5
+ langchain==0.3.13
6
+ requests==2.32.3