File size: 1,578 Bytes
a58af4a
 
 
 
 
b8aa173
a58af4a
 
a871aff
a58af4a
a871aff
 
9183d8e
a58af4a
8f9eff9
a58af4a
 
 
 
 
 
 
c7bd06b
 
 
a58af4a
 
c7bd06b
a58af4a
 
 
 
 
 
c7c6bac
a58af4a
 
 
 
c7bd06b
 
a58af4a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
from pdfplumber import open as pdf_open
import requests
import os

def download_pdf(url, id, timeout=30):
    """Download the PDF at *url* into the local ``downloads`` directory.

    Args:
        url: Address of the PDF to fetch.
        id: Identifier used to build the file name. Path separators in it
            are replaced with ``-`` so the file stays inside ``downloads``.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        The path of the saved file, or ``None`` if the request or the write
        failed (the error is printed, not raised).
    """
    # Neutralise both separator styles: "/" everywhere, "\\" matters on Windows.
    safe_id = id.replace("/", "-").replace("\\", "-")
    directory = "downloads"
    os.makedirs(directory, exist_ok=True)
    file_path = os.path.join(directory, f"{safe_id}.pdf")
    # A browser-like User-Agent is sent with the request.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"}

    try:
        # Without a timeout, requests waits forever on a stalled server.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        payload = response.content
    except requests.RequestException as e:
        print(f"Error downloading PDF: {e}")
        return None

    try:
        with open(file_path, "wb") as file:
            file.write(payload)
    except OSError as e:
        print(f"Error downloading PDF: {e}")
        # Do not leave a truncated file behind; callers only clean up the
        # path we return, and on failure we return None.
        try:
            os.remove(file_path)
        except OSError:
            pass
        return None

    print(f"Saving PDF to: {file_path}")
    return file_path

def extract_text_from_pdf(url, id):
    """Download a PDF and return its extracted text.

    The PDF is fetched with ``download_pdf``, the text of every page is
    joined with single spaces, and the temporary file is deleted afterwards.
    When the text contains ``ABSTRACT`` followed later by ``REFERENCES``
    (both matched case-sensitively), only the span from ``ABSTRACT`` up to,
    but not including, ``REFERENCES`` is returned; otherwise the full text is.

    Args:
        url: Address of the PDF to fetch.
        id: Identifier forwarded to ``download_pdf`` for the file name.

    Returns:
        The extracted text, or ``""`` if the download or the parsing failed.
    """
    pdf_path = download_pdf(url, id)
    if not pdf_path or not os.path.exists(pdf_path):
        print(f"PDF not found: {pdf_path}")
        return ""
    try:
        with pdf_open(pdf_path) as pdf:
            # extract_text() may yield None for a page; treat that as empty.
            all_text = " ".join(page.extract_text() or "" for page in pdf.pages)

        start_index = all_text.find("ABSTRACT")
        if start_index != -1:
            # Search for the end marker only after the start marker, so an
            # earlier "REFERENCES" (e.g. in a header) cannot disable trimming.
            end_index = all_text.find("REFERENCES", start_index)
            if end_index != -1:
                return all_text[start_index:end_index]
        return all_text
    except Exception as e:
        # Broad on purpose: extraction is best-effort and the PDF library can
        # raise many unrelated error types on malformed files.
        print(f"Error processing PDF: {e}")
        return ""
    finally:
        # Always discard the downloaded file; it may already be gone.
        try:
            os.remove(pdf_path)
        except FileNotFoundError:
            pass