ScientryAPI

Running

raannakasturi committed on Feb 26

Commit

c7bd06b

verified ·

1 Parent(s): 66ae8c7

Update extract_text.py

Files changed (1) hide show

extract_text.py CHANGED Viewed

@@ -19,11 +19,12 @@ def download_pdf(url, id):
 def extract_text_from_pdf(url, id):
     pdf_path = download_pdf(url, id)
     try:
         with pdf_open(pdf_path) as pdf:
-            all_text = ""
-            for page in pdf.pages:
-                all_text += page.extract_text() + " "
         start_index = all_text.find("ABSTRACT")
         end_index = all_text.find("REFERENCES")
         if start_index != -1 and end_index != -1 and start_index < end_index:
@@ -35,5 +36,6 @@ def extract_text_from_pdf(url, id):
         print(f"Error processing PDF: {e}")
         research_paper_text = ""
     finally:
-        os.remove(pdf_path)
     return research_paper_text

 def extract_text_from_pdf(url, id):
     pdf_path = download_pdf(url, id)
+    if not pdf_path or not os.path.exists(pdf_path):
+        print(f"PDF not found: {pdf_path}")
+        return ""
     try:
         with pdf_open(pdf_path) as pdf:
+            all_text = " ".join([page.extract_text() or "" for page in pdf.pages])
         start_index = all_text.find("ABSTRACT")
         end_index = all_text.find("REFERENCES")
         if start_index != -1 and end_index != -1 and start_index < end_index:
         print(f"Error processing PDF: {e}")
         research_paper_text = ""
     finally:
+        if os.path.exists(pdf_path):
+            os.remove(pdf_path)
     return research_paper_text