Volko
committed on
Commit
·
ccc9ab3
1
Parent(s):
0344383
Optimised parsing
Browse files - pdf2vectorstore.py +8 -2
pdf2vectorstore.py
CHANGED
@@ -5,6 +5,7 @@ from bs4 import BeautifulSoup
|
|
5 |
from pdf2image import convert_from_path
|
6 |
import pytesseract
|
7 |
import pickle
|
|
|
8 |
|
9 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
10 |
from langchain.document_loaders import UnstructuredFileLoader
|
@@ -18,14 +19,19 @@ def download_pdf(url, filename):
|
|
18 |
for chunk in response.iter_content(chunk_size=8192):
|
19 |
f.write(chunk)
|
20 |
|
|
|
|
|
|
|
21 |
def extract_pdf_text(filename):
|
22 |
print("Extracting text from pdf...")
|
23 |
pytesseract.pytesseract.tesseract_cmd = 'tesseract'
|
24 |
images = convert_from_path(filename)
|
25 |
text = ""
|
26 |
-
|
27 |
-
|
|
|
28 |
|
|
|
29 |
return text
|
30 |
|
31 |
def get_arxiv_pdf_url(paper_link):
|
|
|
5 |
from pdf2image import convert_from_path
|
6 |
import pytesseract
|
7 |
import pickle
|
8 |
+
from concurrent.futures import ThreadPoolExecutor
|
9 |
|
10 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
11 |
from langchain.document_loaders import UnstructuredFileLoader
|
|
|
19 |
for chunk in response.iter_content(chunk_size=8192):
|
20 |
f.write(chunk)
|
21 |
|
22 |
+
def extract_image_text(image):
|
23 |
+
return pytesseract.image_to_string(image)
|
24 |
+
|
25 |
def extract_pdf_text(filename):
|
26 |
print("Extracting text from pdf...")
|
27 |
pytesseract.pytesseract.tesseract_cmd = 'tesseract'
|
28 |
images = convert_from_path(filename)
|
29 |
text = ""
|
30 |
+
|
31 |
+
with ThreadPoolExecutor() as executor:
|
32 |
+
text_parts = list(executor.map(extract_image_text, images))
|
33 |
|
34 |
+
text = "".join(text_parts)
|
35 |
return text
|
36 |
|
37 |
def get_arxiv_pdf_url(paper_link):
|