Volko
committed on
Commit
•
a58f539
1
Parent(s):
ccc9ab3
Reverted
Browse files
- app.py +3 -3
- pdf2vectorstore.py +2 -8
app.py
CHANGED
@@ -135,11 +135,11 @@ with block:
|
|
135 |
<div style="text-align:center">
|
136 |
<p>Developed by <a href='https://www.linkedin.com/in/dekay/'>Github and Huggingface: Volkopat</a></p>
|
137 |
<p>Powered by <a href='https://openai.com/'>OpenAI</a>, <a href='https://arxiv.org/'>arXiv</a> and <a href='https://github.com/hwchase17/langchain'>LangChain 🦜️🔗</a></p>
|
138 |
-
<p>ArxivGPT is a chatbot that answers questions about research papers
|
139 |
<p>Currently, it can answer questions about the paper you just linked.</p>
|
140 |
-
<p>It's still in development, so please report any bugs you find.</p>
|
141 |
<p>The answers can be quite limited as there is a 4096 token limit for GPT-3.5, hence waiting for GPT-4 access to upgrade.</p>
|
142 |
-
<p>Possible upgrades coming up: GPT-4, status messages, other research paper hubs.</p>
|
143 |
</div>
|
144 |
<style>
|
145 |
p {
|
|
|
135 |
<div style="text-align:center">
|
136 |
<p>Developed by <a href='https://www.linkedin.com/in/dekay/'>Github and Huggingface: Volkopat</a></p>
|
137 |
<p>Powered by <a href='https://openai.com/'>OpenAI</a>, <a href='https://arxiv.org/'>arXiv</a> and <a href='https://github.com/hwchase17/langchain'>LangChain 🦜️🔗</a></p>
|
138 |
+
<p>ArxivGPT is a chatbot that answers questions about research papers. It uses a pretrained GPT-3.5 model to generate answers.</p>
|
139 |
<p>Currently, it can answer questions about the paper you just linked.</p>
|
140 |
+
<p>It's still in development, so please report any bugs you find. It can take up to a minute to start a conversation for every new paper as there is a parsing delay.</p>
|
141 |
<p>The answers can be quite limited as there is a 4096 token limit for GPT-3.5, hence waiting for GPT-4 access to upgrade.</p>
|
142 |
+
<p>Possible upgrades coming up: GPT-4, faster parsing, status messages, other research paper hubs.</p>
|
143 |
</div>
|
144 |
<style>
|
145 |
p {
|
pdf2vectorstore.py
CHANGED
@@ -5,7 +5,6 @@ from bs4 import BeautifulSoup
|
|
5 |
from pdf2image import convert_from_path
|
6 |
import pytesseract
|
7 |
import pickle
|
8 |
-
from concurrent.futures import ThreadPoolExecutor
|
9 |
|
10 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
11 |
from langchain.document_loaders import UnstructuredFileLoader
|
@@ -19,19 +18,14 @@ def download_pdf(url, filename):
|
|
19 |
for chunk in response.iter_content(chunk_size=8192):
|
20 |
f.write(chunk)
|
21 |
|
22 |
-
def extract_image_text(image):
|
23 |
-
return pytesseract.image_to_string(image)
|
24 |
-
|
25 |
def extract_pdf_text(filename):
|
26 |
print("Extracting text from pdf...")
|
27 |
pytesseract.pytesseract.tesseract_cmd = 'tesseract'
|
28 |
images = convert_from_path(filename)
|
29 |
text = ""
|
30 |
-
|
31 |
-
|
32 |
-
text_parts = list(executor.map(extract_image_text, images))
|
33 |
|
34 |
-
text = "".join(text_parts)
|
35 |
return text
|
36 |
|
37 |
def get_arxiv_pdf_url(paper_link):
|
|
|
5 |
from pdf2image import convert_from_path
|
6 |
import pytesseract
|
7 |
import pickle
|
|
|
8 |
|
9 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
10 |
from langchain.document_loaders import UnstructuredFileLoader
|
|
|
18 |
for chunk in response.iter_content(chunk_size=8192):
|
19 |
f.write(chunk)
|
20 |
|
|
|
|
|
|
|
21 |
def extract_pdf_text(filename):
|
22 |
print("Extracting text from pdf...")
|
23 |
pytesseract.pytesseract.tesseract_cmd = 'tesseract'
|
24 |
images = convert_from_path(filename)
|
25 |
text = ""
|
26 |
+
for image in images:
|
27 |
+
text += pytesseract.image_to_string(image)
|
|
|
28 |
|
|
|
29 |
return text
|
30 |
|
31 |
def get_arxiv_pdf_url(paper_link):
|