ScientryAPI / extract_text.py
raannakasturi's picture
Update extract_text.py to include User-Agent header in PDF download requests
a871aff
raw
history blame
1.41 kB
from pdfplumber import open as pdf_open
import requests
import os
def download_pdf(url, id, timeout=30):
    """Download the PDF at *url* into the local ``downloads`` directory.

    Args:
        url: Address of the PDF to fetch.
        id: Identifier used to build the file name. Every "/" is replaced
            with "-" so the identifier cannot introduce sub-directories.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` can block forever on a stalled
            connection.

    Returns:
        The path of the saved file (``downloads/<id>.pdf``), or ``None`` if
        the request or the write failed (the error is printed).
    """
    id = id.replace("/", "-")
    directory = "downloads"
    os.makedirs(directory, exist_ok=True)
    file_path = os.path.join(directory, f"{id}.pdf")
    # Browser-like User-Agent sent with the request (presumably because some
    # hosts reject the default client User-Agent -- confirm if changing it).
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        # Turn 4xx/5xx responses into exceptions so an error page is never
        # saved as if it were the PDF.
        response.raise_for_status()
        with open(file_path, "wb") as file:
            file.write(response.content)
    except Exception as e:
        # Best-effort boundary: callers rely on a None return, not a raise.
        print(f"Error downloading PDF: {e}")
        return None
    return file_path
def extract_text_from_pdf(url, id):
    """Download a PDF and return the text of its main body.

    The PDF is fetched with ``download_pdf``, the text of every page is
    concatenated (each page followed by a single space), and the temporary
    file is deleted afterwards. If the text contains "ABSTRACT" followed
    later by "REFERENCES", only the span from the first "ABSTRACT" up to
    (not including) the first "REFERENCES" is returned; otherwise the full
    text is returned.

    Args:
        url: Address of the PDF to fetch.
        id: Identifier forwarded to ``download_pdf`` to name the file.

    Returns:
        The extracted text, or an empty string if the download or the PDF
        processing failed (the error is printed).
    """
    pdf_path = download_pdf(url, id)
    if pdf_path is None:
        # The download already failed and was reported; there is no file to
        # parse or delete, so bail out instead of crashing in cleanup.
        return ""
    try:
        with pdf_open(pdf_path) as pdf:
            # extract_text() may yield None for a page without text; treat
            # that as empty so one blank page does not discard the document.
            all_text = "".join(
                (page.extract_text() or "") + " " for page in pdf.pages
            )
        start_index = all_text.find("ABSTRACT")
        end_index = all_text.find("REFERENCES")
        if start_index != -1 and end_index != -1 and start_index < end_index:
            research_paper_text = all_text[start_index:end_index]
        else:
            # Markers missing or out of order: fall back to the whole text.
            research_paper_text = all_text
    except Exception as e:
        print(f"Error processing PDF: {e}")
        research_paper_text = ""
    finally:
        # Cleanup must never mask the result: a failed delete is reported
        # but does not raise.
        try:
            os.remove(pdf_path)
        except OSError as e:
            print(f"Error removing downloaded PDF: {e}")
    return research_paper_text