Spaces:
Running
Running
from pdfplumber import open as pdf_open | |
import requests | |
import os | |
def download_pdf(url, id, timeout=30):
    """Download the PDF at *url* into the local ``downloads`` directory.

    Args:
        url: Address of the PDF to fetch.
        id: Identifier used to build the file name; every "/" is replaced
            with "-" so the identifier cannot introduce sub-directories.
            (The name shadows the builtin but is kept so existing keyword
            callers keep working.)
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely on a stalled connection.

    Returns:
        The path of the saved file, or ``None`` if the request or the write
        failed (the error is printed, not raised).
    """
    safe_id = id.replace("/", "-")
    directory = "downloads"
    os.makedirs(directory, exist_ok=True)
    file_path = os.path.join(directory, f"{safe_id}.pdf")
    # Browser-like User-Agent sent with the request, unchanged from the
    # original implementation.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        # Turn 4xx/5xx responses into exceptions so an error page is never
        # saved as if it were a PDF.
        response.raise_for_status()
        with open(file_path, "wb") as file:
            file.write(response.content)
    except Exception as e:
        # Best-effort contract: report the failure and signal it with None.
        print(f"Error downloading PDF: {e}")
        return None
    return file_path
def extract_text_from_pdf(url, id):
    """Download a PDF and return the text between its abstract and references.

    The PDF is fetched with ``download_pdf``, the text of every page is
    concatenated (pages separated by a single space), and the temporary file
    is deleted afterwards.

    Args:
        url: Address of the PDF to fetch.
        id: Identifier forwarded to ``download_pdf`` for naming the file.

    Returns:
        The text from the first "ABSTRACT" up to (not including) the first
        "REFERENCES" when both markers occur in that order; otherwise the
        full extracted text. An empty string if the download or the PDF
        processing fails.
    """
    pdf_path = download_pdf(url, id)
    if pdf_path is None:
        # Download failed (already reported by download_pdf). There is no
        # file to parse or delete; calling os.remove(None) would raise.
        return ""

    research_paper_text = ""
    try:
        with pdf_open(pdf_path) as pdf:
            # `or ""` guards against pages for which extract_text() yields
            # None (e.g. pages without a text layer in some pdfplumber
            # versions), so one such page does not discard the whole document.
            all_text = "".join(
                (page.extract_text() or "") + " " for page in pdf.pages
            )
        start_index = all_text.find("ABSTRACT")
        end_index = all_text.find("REFERENCES")
        if start_index != -1 and end_index != -1 and start_index < end_index:
            research_paper_text = all_text[start_index:end_index]
        else:
            # Markers missing or out of order: fall back to everything.
            research_paper_text = all_text
    except Exception as e:
        print(f"Error processing PDF: {e}")
        research_paper_text = ""
    finally:
        # Cleanup must never mask the result: a failed delete is reported
        # but does not replace the extracted text with an exception.
        try:
            os.remove(pdf_path)
        except OSError as e:
            print(f"Error removing temporary PDF: {e}")
    return research_paper_text