# arXivGPT / pdf2vectorstore.py
# Volko
# Optimised parsing
# 9fe2c04
# raw
# history blame
# 2.63 kB
import os
import requests
from bs4 import BeautifulSoup
from pdf2image import convert_from_path
import pytesseract
import pickle
import concurrent.futures
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredFileLoader
from langchain.vectorstores.faiss import FAISS
from langchain.embeddings import OpenAIEmbeddings
def download_pdf(url, filename, timeout=60):
    """Stream the resource at *url* into the local file *filename*.

    Args:
        url: Address of the PDF to fetch.
        filename: Path of the file to create or overwrite with the body.
        timeout: Seconds ``requests`` waits for the connection and for each
            read from the server before giving up. Without a timeout a
            stalled server would block the caller indefinitely.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
    """
    print("Downloading pdf...")
    # The context manager releases the streamed connection even when the
    # write loop fails half-way through.
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Check the status before touching the disk so an HTML error page
        # is never saved under a .pdf name.
        response.raise_for_status()
        with open(filename, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
def extract_pdf_text(filename):
    """Return the OCR text of every page of the PDF at *filename*.

    The PDF is converted to a list of page images with ``convert_from_path``
    and each image is passed to ``pytesseract.image_to_string`` on a thread
    pool. The per-page results are concatenated in page order with no
    separator.

    Note: this assigns ``pytesseract.pytesseract.tesseract_cmd``, a
    module-level setting, on every call.
    """
    print("Extracting text from pdf...")
    pytesseract.pytesseract.tesseract_cmd = 'tesseract'
    page_images = convert_from_path(filename)
    with concurrent.futures.ThreadPoolExecutor() as pool:
        # Executor.map yields results in input order, so pages stay ordered
        # regardless of which OCR job finishes first.
        page_texts = list(pool.map(pytesseract.image_to_string, page_images))
    return "".join(page_texts)
def get_arxiv_pdf_url(paper_link):
    """Resolve *paper_link* to the URL of the paper's PDF.

    A link that already ends in ``.pdf`` is returned unchanged. Any other
    link is fetched as an HTML page and the ``href`` of the first ``<a>``
    element carrying the ``mobile-submission-download`` class is resolved
    against ``https://arxiv.org``.

    Args:
        paper_link: URL of the paper's landing page or of the PDF itself.

    Returns:
        The absolute PDF URL as a string.

    Raises:
        requests.HTTPError: If the page request returns a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
        ValueError: If the page contains no usable download link.
    """
    from urllib.parse import urljoin

    if paper_link.endswith('.pdf'):
        return paper_link

    print("Getting pdf url...")
    response = requests.get(paper_link, timeout=60)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    download_link = soup.find('a', {'class': 'mobile-submission-download'})
    # find() returns None when nothing matches; fail with a clear message
    # instead of an opaque "NoneType is not subscriptable".
    if download_link is None or not download_link.get('href'):
        raise ValueError(
            "No PDF download link found on page: {}".format(paper_link)
        )
    # urljoin gives the same result as plain concatenation for the usual
    # root-relative href ("/pdf/..."), and additionally copes with absolute
    # hrefs and hrefs lacking a leading slash.
    return urljoin('https://arxiv.org', download_link['href'])
def read_paper(paper_link):
    """Download the paper behind *paper_link* and return its OCR text.

    The PDF URL is resolved with ``get_arxiv_pdf_url``, the file is fetched
    with ``download_pdf`` and its text is extracted with
    ``extract_pdf_text``.

    Args:
        paper_link: URL of the paper's landing page or of the PDF itself.

    Returns:
        The extracted text of the whole paper as a single string.

    Raises:
        Whatever the three helpers raise; the temporary download is removed
        in every case.
    """
    import tempfile

    print("Reading paper...")
    pdf_url = get_arxiv_pdf_url(paper_link)
    # A private temporary directory keeps concurrent calls from overwriting
    # each other's download and guarantees clean-up when a step fails.
    with tempfile.TemporaryDirectory() as workdir:
        pdf_filename = os.path.join(workdir, 'paper.pdf')
        # Extraction needs the finished download, so the two steps run one
        # after the other in the calling thread.
        download_pdf(pdf_url, pdf_filename)
        return extract_pdf_text(pdf_filename)
def convert_to_vectorstore(arxiv_url, api_key):
    """Build a FAISS vector store from the text of an arXiv paper.

    The paper text is obtained with ``read_paper``, written to a text file,
    loaded through ``UnstructuredFileLoader``, split into chunks of 2000
    characters with 200 characters of overlap, and embedded with
    ``OpenAIEmbeddings``.

    Args:
        arxiv_url: URL of the paper's landing page or of the PDF itself.
        api_key: OpenAI API key used for the embedding requests.

    Returns:
        The populated FAISS vector store, or ``None`` when either argument
        is empty or missing.
    """
    import tempfile

    if not arxiv_url or not api_key:
        return None
    print("Converting to vectorstore...")
    # Fetch the text before creating any file so a failed download leaves
    # nothing behind.
    paper_text = read_paper(arxiv_url)

    # A private temporary directory avoids clashes between concurrent calls
    # and is removed even if writing or loading raises.
    with tempfile.TemporaryDirectory() as workdir:
        txtfile = os.path.join(workdir, "paper.txt")
        # OCR output may contain non-ASCII characters; write it as UTF-8
        # rather than relying on the platform's default encoding.
        with open(txtfile, 'w', encoding='utf-8') as f:
            f.write(paper_text)
        raw_documents = UnstructuredFileLoader(txtfile).load()
    # NOTE(review): the loader presumably records the file path under the
    # "source" metadata key -- confirm. If so, restore the stable name that
    # callers saw before the file moved to a temporary directory.
    for doc in raw_documents:
        if doc.metadata.get("source") == txtfile:
            doc.metadata["source"] = "paper.txt"
    print("Loaded document")

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
    documents = text_splitter.split_documents(raw_documents)
    # Hand the key to the embeddings object directly instead of writing it
    # to os.environ: the process-wide variable would be shared by concurrent
    # callers, and blanking it afterwards would wipe a key configured
    # elsewhere.
    embeddings = OpenAIEmbeddings(openai_api_key=api_key)
    return FAISS.from_documents(documents, embeddings)