mikeion committed on
Commit 5985946 · 1 Parent(s): efeb6fd

Update app.py

Files changed (1)
  1. app.py +78 -42
app.py CHANGED
@@ -1,8 +1,3 @@
-
-
-
-
-
 import os
 import requests
 from io import BytesIO
@@ -23,55 +18,87 @@ messages = [
 class Chatbot():
 
     def parse_paper(self, pdf):
+        # This function parses the PDF and returns a list of dictionaries with the text,
+        # font size, and x and y coordinates of each text element in the PDF
         print("Parsing paper")
         number_of_pages = len(pdf.pages)
         print(f"Total number of pages: {number_of_pages}")
+        # This list holds every text element in the PDF and is returned by the function
         paper_text = []
+
         for i in range(number_of_pages):
+            # Iterate through each page in the PDF; pdf.pages is a list of Page objects
            page = pdf.pages[i]
+            # This list holds the text elements of the current page
            page_text = []
 
+            # pypdf calls the visitor with (text, cm, tm, fontDict, fontSize).
+            # tm is a 6-element list of floats representing a 2x3 text matrix:
+            # the first four elements encode scale, rotation, and shear, and the
+            # fifth and sixth are the horizontal and vertical translation.
             def visitor_body(text, cm, tm, fontDict, fontSize):
+                # x and y are the page coordinates of the text element
                 x = tm[4]
                 y = tm[5]
-                # ignore header/footer
+
+                # Ignore the header/footer bands and empty text:
+                # the y coordinate filters out the header and footer of the paper,
+                # and the length check filters out empty strings
                 if (y > 50 and y < 720) and (len(text.strip()) > 1):
                     page_text.append({
+                        # The font size is used to separate paragraphs into different elements
                         'fontsize': fontSize,
+                        # The text is stripped of whitespace and the \x03 character
                         'text': text.strip().replace('\x03', ''),
+                        # The x and y coordinates are also used to separate paragraphs
                         'x': x,
                         'y': y
                     })
 
+            # Extract the text elements from the page
             _ = page.extract_text(visitor_text=visitor_body)
 
-            blob_font_size = None
-            blob_text = ''
-            processed_text = []
-
-            for t in page_text:
-                if t['fontsize'] == blob_font_size:
-                    blob_text += f" {t['text']}"
-                    if len(blob_text) >= 2000:
-                        processed_text.append({
-                            'fontsize': blob_font_size,
-                            'text': blob_text,
-                            'page': i
-                        })
-                        blob_font_size = None
-                        blob_text = ''
-                else:
-                    if blob_font_size is not None and len(blob_text) >= 1:
-                        processed_text.append({
-                            'fontsize': blob_font_size,
-                            'text': blob_text,
-                            'page': i
-                        })
-                    blob_font_size = t['fontsize']
-                    blob_text = t['text']
-            paper_text += processed_text
+            # Track the previous element's y coordinate and font size to decide paragraph breaks
+            prev_y = None
+            prev_font_size = None
+            paragraph = ''
+
+            # Iterate through page_text and accumulate text into paragraph.
+            # y_diff and font_size_diff compare each element to the previous one;
+            # a large jump in either marks a paragraph boundary.
+            for idx, t in enumerate(page_text):
+                if prev_y is not None:
+                    y_diff = abs(t['y'] - prev_y)
+                    font_size_diff = abs(t['fontsize'] - prev_font_size)
+
+                    # A vertical jump of more than 10 or a font-size change of more
+                    # than 1 starts a new element in the paper_text list
+                    if y_diff > 10 or font_size_diff > 1:
+                        paper_text.append({
+                            'fontsize': prev_font_size,
+                            'text': paragraph.strip(),
+                            'page': i
+                        })
+                        paragraph = ''
+
+                # Add the text to the paragraph and update the tracking variables
+                paragraph += f" {t['text']}"
+                prev_y = t['y']
+                prev_font_size = t['fontsize']
+
+                # Flush the last paragraph when the end of the page is reached
+                if idx == len(page_text) - 1:
+                    paper_text.append({
+                        'fontsize': prev_font_size,
+                        'text': paragraph.strip(),
+                        'page': i
+                    })
+
         print("Done parsing paper")
-        # print(paper_text)
+
         return paper_text
 
     def paper_df(self, pdf):
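
For context on the hunk above: pypdf invokes a visitor_text callback with five positional arguments, (text, cm, tm, fontDict, fontSize), so the callback needs the full signature even though only tm and fontSize are used here. A minimal standalone sketch of the same filtering idea, assuming a local example.pdf (the file name is hypothetical):

    from pypdf import PdfReader

    reader = PdfReader("example.pdf")  # hypothetical input file
    body_parts = []

    def visitor_body(text, cm, tm, fontDict, fontSize):
        # tm[4] and tm[5] are the x/y translation of the text matrix
        if 50 < tm[5] < 720 and len(text.strip()) > 1:  # skip header/footer bands
            body_parts.append(text.strip())

    reader.pages[0].extract_text(visitor_text=visitor_body)
    print(" ".join(body_parts)[:200])
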
@@ -95,13 +122,8 @@ class Chatbot():
         openai.api_key = os.getenv('OPENAI_API_KEY')
         embedding_model = "text-embedding-ada-002"
         # This is going to create embeddings for subsets of the PDF
-        embeddings = df.text.apply([lambda x: get_embedding(x, engine=embedding_model)])
-        df["embeddings"] = embeddings
-        print('Done calculating embeddings')
-        print(pkg_resources.get_distribution("openai").version)
-        return df
-
-
+        embeddings = np.vstack(df.text.apply(lambda x: get_embedding(x, engine=embedding_model)))
+        return embeddings
 
     def search_embeddings(self, df, query, n=3, pprint=True):
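
The move from a df["embeddings"] column to np.vstack matters because FAISS consumes a contiguous 2-D matrix rather than a pandas Series of lists. A toy sketch of the shape transformation, with random vectors standing in for real get_embedding output:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"text": ["chunk one", "chunk two", "chunk three"]})
    fake_embedding = lambda text: np.random.rand(4).tolist()  # stand-in for get_embedding

    embeddings = np.vstack(df.text.apply(fake_embedding))
    print(embeddings.shape)  # (3, 4): one row per chunk; ada-002 rows would be 1536 wide
    print(embeddings.dtype)  # float64 by default; cast to float32 before handing to FAISS
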
 
@@ -110,12 +132,24 @@
             query,
             engine="text-embedding-ada-002"
         )
-        # Step 2. Create a column in the dataframe with the cosine similarity
-        # (distance) between the query and the text in the dataframe
-        df["similarity"] = df.embeddings.apply(lambda x: cosine_similarity(x, query_embedding))
-        # Step 3. Sort the dataframe by the similarity column
-        results = df.sort_values("similarity", ascending=False, ignore_index=True)
-        # Make a dictionary of the first three results, keyed by page number
-        results = results.head(n)
+        # Step 2. Create a FAISS index and add the embeddings
+        d = embeddings.shape[1]
+        # Use the L2 distance metric
+        index = faiss.IndexFlatL2(d)
+        index.add(embeddings.astype(np.float32))
+
+        # Step 3. Search the index for the embedding of the question.
+        # get_embedding returns a plain list, and FAISS expects a 2-D float32 array
+        D, I = index.search(np.array(query_embedding, dtype=np.float32).reshape(1, d), n)
+
+        # Step 4. Get the top n results from the dataframe
+        results = df.iloc[I[0]]
+        results['similarity'] = D[0]
+        results = results.reset_index(drop=True)
+
+        # Make a dictionary of the first n results with the page number as the key and the text as the value
         global sources
         sources = []
         for i in range(n):
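
As a sanity check on the new FAISS flow, a self-contained toy version follows. Note that IndexFlatL2 returns squared L2 distances, so smaller values mean closer matches, the opposite ordering of the cosine-similarity column it replaces:

    import numpy as np
    import faiss

    d = 4                                              # toy dimensionality; ada-002 uses 1536
    corpus = np.random.rand(10, d).astype(np.float32)  # FAISS requires float32
    query = np.random.rand(1, d).astype(np.float32)

    index = faiss.IndexFlatL2(d)   # exact (brute-force) L2 search, no training step
    index.add(corpus)
    D, I = index.search(query, 3)  # D: squared distances, I: row indices of the 3 nearest
    print(I[0], D[0])
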
@@ -188,6 +222,8 @@ def show_pdf(file_content):
 
 def main():
     st.title("Research Paper Guru")
+    st.subheader("Mike Ion - https://github.com/mikeion")
+    st.subheader("Ask a question about a research paper and get an answer with sources!")
     st.subheader("Upload PDF or Enter URL")
 
     pdf_option = st.selectbox("Choose an option:", ["Upload PDF", "Enter URL"])
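
The main() additions sit on top of Streamlit's upload-or-URL pattern. A minimal sketch of that flow, independent of this app (widget labels are illustrative):

    import streamlit as st
    import requests
    from io import BytesIO

    pdf_option = st.selectbox("Choose an option:", ["Upload PDF", "Enter URL"])
    pdf_bytes = None
    if pdf_option == "Upload PDF":
        uploaded = st.file_uploader("Upload a PDF", type="pdf")
        if uploaded is not None:
            pdf_bytes = BytesIO(uploaded.read())
    else:
        url = st.text_input("Enter the URL of a PDF")
        if url:
            pdf_bytes = BytesIO(requests.get(url).content)  # fetch the PDF over HTTP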