arXivGPT / pdf2vectorstore.py
Volko
Reverted
0344383
raw
history blame
2.27 kB
import os
import requests
from bs4 import BeautifulSoup
from pdf2image import convert_from_path
import pytesseract
import pickle
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredFileLoader
from langchain.vectorstores.faiss import FAISS
from langchain.embeddings import OpenAIEmbeddings
def download_pdf(url, filename, timeout=30):
    """Download the resource at *url* and save it to *filename*.

    The response body is streamed to disk in 8 KiB chunks, so the whole
    file is never held in memory at once.

    Args:
        url: Address of the PDF to fetch.
        filename: Local path the bytes are written to (opened in ``'wb'``
            mode, so an existing file is overwritten).
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument, so a stalled server cannot hang the caller forever.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection failures or timeouts.
    """
    print("Downloading pdf...")
    # Using the response as a context manager releases the connection even
    # when writing fails part-way through the streamed download.
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Fail loudly rather than saving an HTML error page as "the PDF".
        response.raise_for_status()
        with open(filename, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
def extract_pdf_text(filename):
    """Return the text of the PDF at *filename*, recovered via OCR.

    The PDF is converted to images with ``convert_from_path`` and every
    image is run through ``pytesseract.image_to_string``. The per-image
    results are concatenated in the order the images were returned, with
    no separator added between them.

    Args:
        filename: Path of the PDF file to read.

    Returns:
        A single string holding the OCR output of all images; empty when
        the conversion yields no images.
    """
    print("Extracting text from pdf...")
    # Point pytesseract at the `tesseract` executable on PATH.
    pytesseract.pytesseract.tesseract_cmd = 'tesseract'
    images = convert_from_path(filename)
    # str.join builds the result in one pass instead of re-copying the
    # accumulated text on every `+=`.
    return "".join(pytesseract.image_to_string(image) for image in images)
def get_arxiv_pdf_url(paper_link, timeout=30):
    """Resolve a paper link to the URL of its PDF.

    A link that already ends in ``.pdf`` is returned unchanged without any
    network access. Otherwise the page at *paper_link* is fetched and the
    ``href`` of its ``<a class="mobile-submission-download">`` element is
    resolved against ``https://arxiv.org``.

    Args:
        paper_link: Direct PDF URL or the URL of a paper's web page.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument when the page has to be fetched.

    Returns:
        The PDF URL as a string.

    Raises:
        requests.HTTPError: If the page request returns an error status.
        requests.RequestException: On connection failures or timeouts.
        ValueError: If the page contains no usable download link.
    """
    # Imported locally so this function does not rely on a module-level
    # import that the rest of the file does not otherwise need.
    from urllib.parse import urljoin

    if paper_link.endswith('.pdf'):
        return paper_link

    print("Getting pdf url...")
    response = requests.get(paper_link, timeout=timeout)
    # Do not try to scrape an error page for a download link.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    download_link = soup.find('a', {'class': 'mobile-submission-download'})
    href = download_link.get('href') if download_link is not None else None
    if not href:
        # Previously this surfaced as an opaque TypeError on None[...].
        raise ValueError(
            f"Could not find a PDF download link on page: {paper_link}"
        )
    # urljoin yields the same result as plain concatenation for the usual
    # root-relative href ("/pdf/..."), and also copes with absolute hrefs
    # or ones missing the leading slash.
    return urljoin('https://arxiv.org', href)
def read_paper(paper_link):
    """Download the paper behind *paper_link* and return its OCR'd text.

    The link is resolved with ``get_arxiv_pdf_url``, the PDF is fetched
    with ``download_pdf`` into a scratch directory, and its text is
    recovered with ``extract_pdf_text``.

    Args:
        paper_link: Direct PDF URL or the URL of a paper's web page.

    Returns:
        The extracted text of the paper as a string.

    Raises:
        Whatever the three helpers raise; the downloaded file is removed
        in every case.
    """
    # Imported locally so the block stays self-contained.
    import tempfile

    print("Reading paper...")
    pdf_url = get_arxiv_pdf_url(paper_link)
    # A private temporary directory guarantees cleanup even when the
    # download or OCR step raises, and avoids overwriting or deleting a
    # pre-existing "paper.pdf" in the current working directory.
    with tempfile.TemporaryDirectory() as workdir:
        pdf_filename = os.path.join(workdir, 'paper.pdf')
        download_pdf(pdf_url, pdf_filename)
        return extract_pdf_text(pdf_filename)
def convert_to_vectorstore(arxiv_url, api_key):
    """Build a FAISS vector store from the text of a paper.

    The paper text is obtained with ``read_paper``, written to a
    short-lived text file, loaded through ``UnstructuredFileLoader``,
    split into chunks of 2000 characters with an overlap of 200, and
    embedded with ``OpenAIEmbeddings``.

    Args:
        arxiv_url: Link to the paper (PDF URL or paper page URL).
        api_key: OpenAI API key used for the embedding requests.

    Returns:
        The ``FAISS`` vector store, or ``None`` when *arxiv_url* or
        *api_key* is empty/``None``.
    """
    if not arxiv_url or not api_key:
        return None
    print("Converting to vectorstore...")

    # Fetch the text before touching the filesystem so that a failed
    # download/OCR does not leave an empty file behind.
    paper_text = read_paper(arxiv_url)

    # NOTE(review): the file name is kept as-is because the loader
    # presumably records the path in each document's metadata - confirm
    # before moving this to a temporary directory.
    txtfile = "paper.txt"
    try:
        # Explicit UTF-8: OCR output may contain characters that the
        # platform's default encoding cannot represent.
        with open(txtfile, 'w', encoding='utf-8') as f:
            f.write(paper_text)
        loader = UnstructuredFileLoader(txtfile)
        raw_documents = loader.load()
    finally:
        # open() itself may have failed, so the file need not exist.
        if os.path.exists(txtfile):
            os.remove(txtfile)
    print("Loaded document")

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
    documents = text_splitter.split_documents(raw_documents)

    # Hand the key to the embeddings object directly instead of routing it
    # through os.environ: the old approach overwrote any existing
    # OPENAI_API_KEY with "" for the rest of the process and exposed the
    # caller's key to everything else running in it.
    embeddings = OpenAIEmbeddings(openai_api_key=api_key)
    vectorstore = FAISS.from_documents(documents, embeddings)
    return vectorstore