raannakasturi committed on
Commit
a58af4a
·
verified ·
1 Parent(s): 48f0f78

Update extract_text.py

Browse files
Files changed (1) hide show
  1. extract_text.py +41 -34
extract_text.py CHANGED
@@ -1,34 +1,41 @@
1
- from pdfplumber import open as pdf_open
2
- from langchain.text_splitter import RecursiveCharacterTextSplitter
3
- import requests
4
- import os
5
-
6
- def download_pdf(url, id):
7
- file_path = f"{id}.pdf"
8
- response = requests.get(url)
9
- with open(file_path, 'wb') as file:
10
- file.write(response.content)
11
- return file_path
12
-
13
- def extract_text_from_pdf(url, id):
14
- pdf_path = download_pdf(url, id)
15
- try:
16
- with pdf_open(pdf_path) as pdf:
17
- all_text = ""
18
- for page in pdf.pages:
19
- all_text += page.extract_text() + " "
20
- start_index = all_text.find("ABSTRACT")
21
- end_index = all_text.find("REFERENCES")
22
- if start_index != -1 and end_index != -1 and start_index < end_index:
23
- relevant_text = all_text[start_index:end_index]
24
- else:
25
- relevant_text = all_text
26
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=250, chunk_overlap=50)
27
- text_list = text_splitter.split_text(relevant_text)
28
- research_paper_text = "".join(text_list)
29
- except Exception as e:
30
- print(f"Error processing PDF: {e}")
31
- research_paper_text = ""
32
- finally:
33
- os.remove(pdf_path)
34
- return research_paper_text
 
 
 
 
 
 
 
 
1
+ from pdfplumber import open as pdf_open
2
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
3
+ import requests
4
+ import os
5
+
6
def download_pdf(url, id):
    """Download the PDF at *url* into ``downloads/<id>.pdf``.

    Args:
        url: Direct link to a PDF resource.
        id: Identifier used to build a unique local filename.

    Returns:
        The local file path on success, or ``None`` if the download failed.
    """
    directory = "downloads"
    os.makedirs(directory, exist_ok=True)
    file_path = os.path.join(directory, f"{id}.pdf")  # Use a unique name based on id
    try:
        # Bound the request so a stalled server cannot hang the caller forever.
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Raise an error for bad responses (HTTP 4xx/5xx)
        with open(file_path, 'wb') as file:
            file.write(response.content)
    except Exception as e:
        print(f"Error downloading PDF: {e}")
        return None
    return file_path
19
+
20
def extract_text_from_pdf(url, id):
    """Download a paper PDF and return its ABSTRACT-to-REFERENCES text.

    Args:
        url: Direct link to the PDF.
        id: Identifier used for the temporary local filename.

    Returns:
        The extracted text, trimmed to the span between the first "ABSTRACT"
        and "REFERENCES" markers when both are present in order; the full
        text otherwise; ``""`` on any failure.
    """
    pdf_path = download_pdf(url, id)
    if pdf_path is None:
        # Download already failed and was logged; nothing to open or clean up.
        return ""
    try:
        with pdf_open(pdf_path) as pdf:
            all_text = ""
            for page in pdf.pages:
                # extract_text() returns None for image-only pages; treat as empty.
                all_text += (page.extract_text() or "") + " "
        start_index = all_text.find("ABSTRACT")
        end_index = all_text.find("REFERENCES")
        if start_index != -1 and end_index != -1 and start_index < end_index:
            relevant_text = all_text[start_index:end_index]
        else:
            relevant_text = all_text
        # NOTE(review): splitting with a 50-char overlap and re-joining with ""
        # duplicates the overlapped text in the output; kept as-is pending
        # confirmation that downstream consumers expect this.
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=250, chunk_overlap=50)
        text_list = text_splitter.split_text(relevant_text)
        research_paper_text = "".join(text_list)
    except Exception as e:
        print(f"Error processing PDF: {e}")
        research_paper_text = ""
    finally:
        # The file may never have been created if the download aborted midway.
        if os.path.exists(pdf_path):
            os.remove(pdf_path)
    return research_paper_text