Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,8 +1,3 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
import os
|
7 |
import requests
|
8 |
from io import BytesIO
|
@@ -23,55 +18,87 @@ messages = [
|
|
23 |
class Chatbot():
|
24 |
|
25 |
def parse_paper(self, pdf):
|
|
|
|
|
26 |
print("Parsing paper")
|
27 |
number_of_pages = len(pdf.pages)
|
28 |
print(f"Total number of pages: {number_of_pages}")
|
|
|
29 |
paper_text = []
|
|
|
30 |
for i in range(number_of_pages):
|
|
|
31 |
page = pdf.pages[i]
|
|
|
32 |
page_text = []
|
33 |
|
34 |
-
def visitor_body(text,
|
|
|
|
|
|
|
|
|
|
|
35 |
x = tm[4]
|
36 |
y = tm[5]
|
37 |
-
|
|
|
|
|
|
|
38 |
if (y > 50 and y < 720) and (len(text.strip()) > 1):
|
39 |
page_text.append({
|
|
|
40 |
'fontsize': fontSize,
|
|
|
41 |
'text': text.strip().replace('\x03', ''),
|
|
|
42 |
'x': x,
|
43 |
'y': y
|
44 |
})
|
45 |
|
|
|
46 |
_ = page.extract_text(visitor_text=visitor_body)
|
47 |
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
59 |
'page': i
|
60 |
})
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
68 |
'page': i
|
69 |
})
|
70 |
-
|
71 |
-
blob_text = t['text']
|
72 |
-
paper_text += processed_text
|
73 |
print("Done parsing paper")
|
74 |
-
|
75 |
return paper_text
|
76 |
|
77 |
def paper_df(self, pdf):
|
@@ -95,13 +122,8 @@ class Chatbot():
|
|
95 |
openai.api_key = os.getenv('OPENAI_API_KEY')
|
96 |
embedding_model = "text-embedding-ada-002"
|
97 |
# This is going to create embeddings for subsets of the PDF
|
98 |
-
embeddings = df.text.apply(
|
99 |
-
|
100 |
-
print('Done calculating embeddings')
|
101 |
-
print(pkg_resources.get_distribution("openai").version)
|
102 |
-
return df
|
103 |
-
|
104 |
-
|
105 |
|
106 |
def search_embeddings(self, df, query, n=3, pprint=True):
|
107 |
|
@@ -110,12 +132,24 @@ class Chatbot():
|
|
110 |
query,
|
111 |
engine="text-embedding-ada-002"
|
112 |
)
|
113 |
-
# Step 2. Create a
|
114 |
-
|
115 |
-
#
|
116 |
-
|
117 |
-
|
118 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
119 |
global sources
|
120 |
sources = []
|
121 |
for i in range(n):
|
@@ -188,6 +222,8 @@ def show_pdf(file_content):
|
|
188 |
|
189 |
def main():
|
190 |
st.title("Research Paper Guru")
|
|
|
|
|
191 |
st.subheader("Upload PDF or Enter URL")
|
192 |
|
193 |
pdf_option = st.selectbox("Choose an option:", ["Upload PDF", "Enter URL"])
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import os
|
2 |
import requests
|
3 |
from io import BytesIO
|
|
|
18 |
class Chatbot():
|
19 |
|
20 |
def parse_paper(self, pdf) -> list:
    """Split a PDF into paragraph-like text blobs.

    Every page is walked with a text visitor that records each text fragment
    together with its font size and position. Consecutive fragments are then
    merged into one paragraph until either the vertical position jumps by
    more than 10 units or the font size changes by more than 1.

    Args:
        pdf: Reader object exposing ``pages``; each page must support
            ``extract_text(visitor_text=...)`` (presumably a pypdf/PyPDF2
            reader -- confirm against the caller).

    Returns:
        A list of dicts with the keys ``'fontsize'``, ``'text'`` and
        ``'page'`` (zero-based page index), in reading order.
    """
    print("Parsing paper")
    number_of_pages = len(pdf.pages)
    print(f"Total number of pages: {number_of_pages}")

    # Accumulates the paragraphs of every page; this is the return value.
    paper_text = []

    for page_index, page in enumerate(pdf.pages):
        # Raw text fragments of the current page, filled in by the visitor.
        page_text = []

        def visitor_body(text, cm, tm, font_dict, font_size):
            # The extraction callback is invoked positionally with five
            # arguments: text, current transformation matrix, text matrix,
            # font dictionary and font size. Only the text matrix and the
            # font size are needed here.
            # tm[4] / tm[5] are the horizontal / vertical translation of
            # the text matrix, i.e. where the fragment sits on the page.
            x = tm[4]
            y = tm[5]

            # Skip fragments in the header/footer bands (outside 50 < y < 720)
            # and fragments that are empty or a single character.
            if 50 < y < 720 and len(text.strip()) > 1:
                page_text.append({
                    'fontsize': font_size,
                    # \x03 shows up as an extraction artifact; drop it.
                    'text': text.strip().replace('\x03', ''),
                    'x': x,
                    'y': y
                })

        # The returned string is not needed; the visitor collects the data.
        _ = page.extract_text(visitor_text=visitor_body)

        prev_y = None
        prev_font_size = None
        paragraph = ''

        for fragment in page_text:
            # Only compare against a previous fragment when there is one;
            # the first fragment of a page always starts a new paragraph.
            if prev_y is not None:
                y_diff = abs(fragment['y'] - prev_y)
                font_size_diff = abs(fragment['fontsize'] - prev_font_size)

                # A large vertical jump or a font-size change ends the
                # paragraph collected so far.
                if y_diff > 10 or font_size_diff > 1:
                    paper_text.append({
                        'fontsize': prev_font_size,
                        'text': paragraph.strip(),
                        'page': page_index
                    })
                    paragraph = ''

            paragraph += f" {fragment['text']}"
            prev_y = fragment['y']
            prev_font_size = fragment['fontsize']

        # Flush the paragraph still being built when the page ends.
        if page_text:
            paper_text.append({
                'fontsize': prev_font_size,
                'text': paragraph.strip(),
                'page': page_index
            })

    print("Done parsing paper")
    return paper_text
|
103 |
|
104 |
def paper_df(self, pdf):
|
|
|
122 |
openai.api_key = os.getenv('OPENAI_API_KEY')
|
123 |
embedding_model = "text-embedding-ada-002"
|
124 |
# This is going to create embeddings for subsets of the PDF
|
125 |
+
embeddings = np.vstack(df.text.apply(lambda x: get_embedding(x, engine=embedding_model)))
|
126 |
+
return embeddings
|
|
|
|
|
|
|
|
|
|
|
127 |
|
128 |
def search_embeddings(self, df, query, n=3, pprint=True):
|
129 |
|
|
|
132 |
query,
|
133 |
engine="text-embedding-ada-002"
|
134 |
)
|
135 |
+
# Step 2. Create a FAISS index and add the embeddings
|
136 |
+
d = embeddings.shape[1]
|
137 |
+
# Use the L2 distance metric
|
138 |
+
index = faiss.IndexFlatL2(d)
|
139 |
+
index.add(embeddings)
|
140 |
+
|
141 |
+
|
142 |
+
# Step 3. Search the index for the embedding of the question
|
143 |
+
|
144 |
+
D, I = index.search(query_embedding.reshape(1,d), n)
|
145 |
+
|
146 |
+
# Step 4. Get the top n results from the dataframe
|
147 |
+
results = df.iloc[I[0]]
|
148 |
+
results['similarity'] = D[0]
|
149 |
+
results = results.reset_index(drop=True)
|
150 |
+
|
151 |
+
# Make a dictionary of the first n results with the page number as the key and the text as the value
|
152 |
+
|
153 |
global sources
|
154 |
sources = []
|
155 |
for i in range(n):
|
|
|
222 |
|
223 |
def main():
|
224 |
st.title("Research Paper Guru")
|
225 |
+
st.subheader("Mike Ion - https://github.com/mikeion")
|
226 |
+
st.subheader("Ask a question about a research paper and get an answer with sources!")
|
227 |
st.subheader("Upload PDF or Enter URL")
|
228 |
|
229 |
pdf_option = st.selectbox("Choose an option:", ["Upload PDF", "Enter URL"])
|