from pdfplumber import open as pdf_open
import requests
import os

# Browser-like User-Agent sent with every download request.
_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/131.0.0.0 Safari/537.36"
)


def download_pdf(url, id, timeout=30):
    """Download the PDF at *url* into the local ``downloads`` directory.

    Args:
        url: Address of the PDF to fetch.
        id: Identifier used to build the file name; every ``/`` in it is
            replaced by ``-``. (The name shadows the builtin ``id`` but is
            kept so existing keyword callers continue to work.)
        timeout: Seconds passed to ``requests.get`` as its timeout. Without
            one, a stalled server would block the caller indefinitely.

    Returns:
        The path of the saved file, or ``None`` if the request or the write
        raised an exception (the error is printed, not re-raised).
    """
    safe_id = id.replace("/", "-")
    directory = "downloads"
    os.makedirs(directory, exist_ok=True)
    file_path = os.path.join(directory, f"{safe_id}.pdf")

    try:
        response = requests.get(
            url,
            headers={"User-Agent": _USER_AGENT},
            timeout=timeout,
        )
        # Turn HTTP 4xx/5xx responses into exceptions so that an error page
        # is never written to disk as if it were a PDF.
        response.raise_for_status()
        with open(file_path, "wb") as file:
            file.write(response.content)
        print(f"Saving PDF to: {file_path}")
    except Exception as e:
        # Best-effort download: report the failure and signal it with None.
        print(f"Error downloading PDF: {e}")
        return None

    return file_path


def _select_body_text(all_text):
    """Return the span from "ABSTRACT" up to (not including) "REFERENCES".

    Both markers are matched case-sensitively at their first occurrence. If
    either marker is missing, or "REFERENCES" does not come after
    "ABSTRACT", the whole of *all_text* is returned unchanged.
    """
    start_index = all_text.find("ABSTRACT")
    end_index = all_text.find("REFERENCES")
    if start_index != -1 and end_index != -1 and start_index < end_index:
        return all_text[start_index:end_index]
    return all_text


def extract_text_from_pdf(url, id):
    """Download a PDF and return its text, trimmed to the paper body.

    The file is fetched with ``download_pdf``, the text of every page is
    joined with single spaces, and the result is narrowed to the span between
    "ABSTRACT" and "REFERENCES" when both markers are found in that order.
    The downloaded file is deleted before returning.

    Args:
        url: Address of the PDF to fetch.
        id: Identifier forwarded to ``download_pdf`` for the file name.

    Returns:
        The extracted text, or ``""`` if the download failed or the PDF
        could not be processed (the error is printed, not re-raised).
    """
    pdf_path = download_pdf(url, id)
    if not pdf_path or not os.path.exists(pdf_path):
        print(f"PDF not found: {pdf_path}")
        return ""

    try:
        with pdf_open(pdf_path) as pdf:
            # extract_text() may yield None for a page; treat that as empty
            # so the join does not fail.
            all_text = " ".join(page.extract_text() or "" for page in pdf.pages)
        research_paper_text = _select_body_text(all_text)
    except Exception as e:
        print(f"Error processing PDF: {e}")
        research_paper_text = ""
    finally:
        # The download is only a temporary working copy; always remove it.
        if os.path.exists(pdf_path):
            os.remove(pdf_path)

    return research_paper_text