raannakasturi committed on
Commit
c7bd06b
·
verified ·
1 Parent(s): 66ae8c7

Update extract_text.py

Browse files
Files changed (1) hide show
  1. extract_text.py +6 -4
extract_text.py CHANGED
@@ -19,11 +19,12 @@ def download_pdf(url, id):
19
 
20
  def extract_text_from_pdf(url, id):
21
  pdf_path = download_pdf(url, id)
 
 
 
22
  try:
23
  with pdf_open(pdf_path) as pdf:
24
- all_text = ""
25
- for page in pdf.pages:
26
- all_text += page.extract_text() + " "
27
  start_index = all_text.find("ABSTRACT")
28
  end_index = all_text.find("REFERENCES")
29
  if start_index != -1 and end_index != -1 and start_index < end_index:
@@ -35,5 +36,6 @@ def extract_text_from_pdf(url, id):
35
  print(f"Error processing PDF: {e}")
36
  research_paper_text = ""
37
  finally:
38
- os.remove(pdf_path)
 
39
  return research_paper_text
 
19
 
20
  def extract_text_from_pdf(url, id):
21
  pdf_path = download_pdf(url, id)
22
+ if not pdf_path or not os.path.exists(pdf_path):
23
+ print(f"PDF not found: {pdf_path}")
24
+ return ""
25
  try:
26
  with pdf_open(pdf_path) as pdf:
27
+ all_text = " ".join([page.extract_text() or "" for page in pdf.pages])
 
 
28
  start_index = all_text.find("ABSTRACT")
29
  end_index = all_text.find("REFERENCES")
30
  if start_index != -1 and end_index != -1 and start_index < end_index:
 
36
  print(f"Error processing PDF: {e}")
37
  research_paper_text = ""
38
  finally:
39
+ if os.path.exists(pdf_path):
40
+ os.remove(pdf_path)
41
  return research_paper_text