ScientryAPI

Running

App Files Files Community

raannakasturi committed on Dec 22, 2024

Commit

a58af4a

verified ·

1 Parent(s): 48f0f78

Update extract_text.py

Browse files

Files changed (1) hide show

extract_text.py +41 -34

extract_text.py CHANGED Viewed

@@ -1,34 +1,41 @@
-from pdfplumber import open as pdf_open
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-import requests
-import os
-def download_pdf(url, id):
-    file_path = f"{id}.pdf"
-    response = requests.get(url)
-    with open(file_path, 'wb') as file:
-        file.write(response.content)
-    return file_path
-def extract_text_from_pdf(url, id):
-    pdf_path = download_pdf(url, id)
-    try:
-        with pdf_open(pdf_path) as pdf:
-            all_text = ""
-            for page in pdf.pages:
-                all_text += page.extract_text() + " "
-        start_index = all_text.find("ABSTRACT")
-        end_index = all_text.find("REFERENCES")
-        if start_index != -1 and end_index != -1 and start_index < end_index:
-            relevant_text = all_text[start_index:end_index]
-        else:
-            relevant_text = all_text
-        text_splitter = RecursiveCharacterTextSplitter(chunk_size=250, chunk_overlap=50)
-        text_list = text_splitter.split_text(relevant_text)
-        research_paper_text = "".join(text_list)
-    except Exception as e:
-        print(f"Error processing PDF: {e}")
-        research_paper_text = ""
-    finally:
-        os.remove(pdf_path)
-    return research_paper_text

+from pdfplumber import open as pdf_open
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+import requests
+import os
+def download_pdf(url, id, timeout=30):
+    """Download the PDF at *url* into ``downloads/<id>.pdf``.
+
+    Args:
+        url: Address of the PDF to fetch.
+        id: Value used as the file stem; it should be unique per request
+            so that downloads do not overwrite each other.
+        timeout: Seconds passed to ``requests.get`` as its timeout.
+
+    Returns:
+        The path of the saved file, or ``None`` when the request or the
+        write failed (the error is printed rather than raised).
+    """
+    directory = "downloads"
+    os.makedirs(directory, exist_ok=True)
+    file_path = os.path.join(directory, f"{id}.pdf")
+    try:
+        # Without a timeout a stalled server would block this call forever.
+        response = requests.get(url, timeout=timeout)
+        response.raise_for_status()  # Treat 4xx/5xx responses as failures
+        with open(file_path, 'wb') as file:
+            file.write(response.content)
+    except Exception as e:
+        print(f"Error downloading PDF: {e}")
+        # The caller receives None and cannot clean up, so drop any
+        # partially written file here (best effort).
+        try:
+            os.remove(file_path)
+        except OSError:
+            pass  # Nothing was written, or the file is already gone
+        return None
+    return file_path
+def extract_text_from_pdf(url, id):
+    """Download a PDF and return the text of its main body.
+
+    The text of all pages is concatenated. If both an "ABSTRACT" and a
+    later "REFERENCES" marker are found, only the span between them is
+    kept; otherwise the whole text is used. The downloaded file is removed
+    before returning.
+
+    Args:
+        url: Address of the PDF to fetch.
+        id: Identifier forwarded to ``download_pdf`` for the file name.
+
+    Returns:
+        The extracted text, or ``""`` when the download or parsing failed.
+    """
+    pdf_path = download_pdf(url, id)
+    if pdf_path is None:
+        # download_pdf has already printed the error. There is no file to
+        # parse, and os.remove(None) in the cleanup would raise TypeError.
+        return ""
+    try:
+        with pdf_open(pdf_path) as pdf:
+            # A page without extractable text may yield None; treat it as
+            # empty instead of losing the whole document to a TypeError.
+            all_text = "".join(
+                (page.extract_text() or "") + " " for page in pdf.pages
+            )
+        start_index = all_text.find("ABSTRACT")
+        end_index = all_text.find("REFERENCES")
+        if start_index != -1 and end_index != -1 and start_index < end_index:
+            relevant_text = all_text[start_index:end_index]
+        else:
+            relevant_text = all_text
+        text_splitter = RecursiveCharacterTextSplitter(chunk_size=250, chunk_overlap=50)
+        text_list = text_splitter.split_text(relevant_text)
+        # NOTE(review): chunks are created with a 50-character overlap, so
+        # joining them repeats the overlapping text - confirm this is intended.
+        research_paper_text = "".join(text_list)
+    except Exception as e:
+        print(f"Error processing PDF: {e}")
+        research_paper_text = ""
+    finally:
+        os.remove(pdf_path)
+    return research_paper_text