mikeion committed on
Commit 5985946 · 1 Parent(s): efeb6fd

Update app.py

Files changed (1)
  1. app.py +78 -42
app.py CHANGED
@@ -1,8 +1,3 @@
-
-
-
-
-
 import os
 import requests
 from io import BytesIO
@@ -23,55 +18,87 @@ messages = [
 class Chatbot():
 
     def parse_paper(self, pdf):
+        # This function parses the PDF and returns a list of dictionaries with the text,
+        # font size, and x and y coordinates of each text element in the PDF
         print("Parsing paper")
         number_of_pages = len(pdf.pages)
         print(f"Total number of pages: {number_of_pages}")
+        # This list holds every text element in the PDF and is returned by the function
         paper_text = []
+
         for i in range(number_of_pages):
+            # Iterate through each page in the PDF; pdf.pages is a list of Page objects
            page = pdf.pages[i]
+            # This list holds the text elements of the current page
            page_text = []
 
+            # pypdf calls the visitor with (text, cm, tm, fontDict, fontSize).
+            # tm is a 6-element list of floats representing a 2x3 text matrix:
+            # the first four elements encode scale, rotation, and shear, and the
+            # fifth and sixth are the horizontal and vertical translation.
             def visitor_body(text, cm, tm, fontDict, fontSize):
+                # x and y are the page coordinates of the text element
                 x = tm[4]
                 y = tm[5]
-                # ignore header/footer
+
+                # Ignore the header/footer bands and empty text:
+                # the y coordinate filters out the header and footer of the paper,
+                # and the length check filters out empty strings
                 if (y > 50 and y < 720) and (len(text.strip()) > 1):
                     page_text.append({
+                        # The font size is used to separate paragraphs into different elements
                         'fontsize': fontSize,
+                        # The text is stripped of whitespace and the \x03 character
                         'text': text.strip().replace('\x03', ''),
+                        # The x and y coordinates are also used to separate paragraphs
                         'x': x,
                         'y': y
                     })
 
+            # Extract the text elements from the page
             _ = page.extract_text(visitor_text=visitor_body)
 
-            blob_font_size = None
-            blob_text = ''
-            processed_text = []
-
-            for t in page_text:
-                if t['fontsize'] == blob_font_size:
-                    blob_text += f" {t['text']}"
-                    if len(blob_text) >= 2000:
-                        processed_text.append({
-                            'fontsize': blob_font_size,
-                            'text': blob_text,
-                            'page': i
-                        })
-                        blob_font_size = None
-                        blob_text = ''
-                else:
-                    if blob_font_size is not None and len(blob_text) >= 1:
-                        processed_text.append({
-                            'fontsize': blob_font_size,
-                            'text': blob_text,
-                            'page': i
-                        })
-                    blob_font_size = t['fontsize']
-                    blob_text = t['text']
-            paper_text += processed_text
+            # Track the previous element's y coordinate and font size to decide paragraph breaks
+            prev_y = None
+            prev_font_size = None
+            paragraph = ''
+
+            # Iterate through page_text and accumulate text into paragraph.
+            # y_diff and font_size_diff compare each element to the previous one;
+            # a large jump in either marks a paragraph boundary.
+            for idx, t in enumerate(page_text):
+                if prev_y is not None:
+                    y_diff = abs(t['y'] - prev_y)
+                    font_size_diff = abs(t['fontsize'] - prev_font_size)
+
+                    # A vertical jump of more than 10 or a font-size change of more
+                    # than 1 starts a new element in the paper_text list
+                    if y_diff > 10 or font_size_diff > 1:
+                        paper_text.append({
+                            'fontsize': prev_font_size,
+                            'text': paragraph.strip(),
+                            'page': i
+                        })
+                        paragraph = ''
+
+                # Add the text to the paragraph and update the tracking variables
+                paragraph += f" {t['text']}"
+                prev_y = t['y']
+                prev_font_size = t['fontsize']
+
+                # Flush the last paragraph when the end of the page is reached
+                if idx == len(page_text) - 1:
+                    paper_text.append({
+                        'fontsize': prev_font_size,
+                        'text': paragraph.strip(),
+                        'page': i
+                    })
+
         print("Done parsing paper")
-        # print(paper_text)
+
         return paper_text
 
     def paper_df(self, pdf):
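
For context on the hunk above: pypdf invokes a visitor_text callback with five positional arguments, (text, cm, tm, fontDict, fontSize), so the callback needs the full signature even though only tm and fontSize are used here. A minimal standalone sketch of the same filtering idea, assuming a local example.pdf (the file name is hypothetical):

    from pypdf import PdfReader

    reader = PdfReader("example.pdf")  # hypothetical input file
    body_parts = []

    def visitor_body(text, cm, tm, fontDict, fontSize):
        # tm[4] and tm[5] are the x/y translation of the text matrix
        if 50 < tm[5] < 720 and len(text.strip()) > 1:  # skip header/footer bands
            body_parts.append(text.strip())

    reader.pages[0].extract_text(visitor_text=visitor_body)
    print(" ".join(body_parts)[:200])
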
@@ -95,13 +122,8 @@ class Chatbot():
         openai.api_key = os.getenv('OPENAI_API_KEY')
         embedding_model = "text-embedding-ada-002"
         # This is going to create embeddings for subsets of the PDF
-        embeddings = df.text.apply([lambda x: get_embedding(x, engine=embedding_model)])
-        df["embeddings"] = embeddings
-        print('Done calculating embeddings')
-        print(pkg_resources.get_distribution("openai").version)
-        return df
-
-
+        embeddings = np.vstack(df.text.apply(lambda x: get_embedding(x, engine=embedding_model)))
+        return embeddings
 
     def search_embeddings(self, df, query, n=3, pprint=True):
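
The move from a df["embeddings"] column to np.vstack matters because FAISS consumes a contiguous 2-D matrix rather than a pandas Series of lists. A toy sketch of the shape transformation, with random vectors standing in for real get_embedding output:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"text": ["chunk one", "chunk two", "chunk three"]})
    fake_embedding = lambda text: np.random.rand(4).tolist()  # stand-in for get_embedding

    embeddings = np.vstack(df.text.apply(fake_embedding))
    print(embeddings.shape)  # (3, 4): one row per chunk; ada-002 rows would be 1536 wide
    print(embeddings.dtype)  # float64 by default; cast to float32 before handing to FAISS
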
 
@@ -110,12 +132,24 @@
             query,
             engine="text-embedding-ada-002"
         )
-        # Step 2. Create a column in the dataframe with the cosine similarity
-        # (distance) between the query and the text in the dataframe
-        df["similarity"] = df.embeddings.apply(lambda x: cosine_similarity(x, query_embedding))
-        # Step 3. Sort the dataframe by the similarity column
-        results = df.sort_values("similarity", ascending=False, ignore_index=True)
-        # Make a dictionary of the first three results, keyed by page number
-        results = results.head(n)
+        # Step 2. Create a FAISS index and add the embeddings
+        d = embeddings.shape[1]
+        # Use the L2 distance metric
+        index = faiss.IndexFlatL2(d)
+        index.add(embeddings.astype(np.float32))
+
+        # Step 3. Search the index for the embedding of the question.
+        # get_embedding returns a plain list, and FAISS expects a 2-D float32 array
+        D, I = index.search(np.array(query_embedding, dtype=np.float32).reshape(1, d), n)
+
+        # Step 4. Get the top n results from the dataframe
+        results = df.iloc[I[0]]
+        results['similarity'] = D[0]
+        results = results.reset_index(drop=True)
+
+        # Make a dictionary of the first n results with the page number as the key and the text as the value
         global sources
         sources = []
         for i in range(n):
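
As a sanity check on the new FAISS flow, a self-contained toy version follows. Note that IndexFlatL2 returns squared L2 distances, so smaller values mean closer matches, the opposite ordering of the cosine-similarity column it replaces:

    import numpy as np
    import faiss

    d = 4                                              # toy dimensionality; ada-002 uses 1536
    corpus = np.random.rand(10, d).astype(np.float32)  # FAISS requires float32
    query = np.random.rand(1, d).astype(np.float32)

    index = faiss.IndexFlatL2(d)   # exact (brute-force) L2 search, no training step
    index.add(corpus)
    D, I = index.search(query, 3)  # D: squared distances, I: row indices of the 3 nearest
    print(I[0], D[0])
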
@@ -188,6 +222,8 @@ def show_pdf(file_content):
 
 def main():
     st.title("Research Paper Guru")
+    st.subheader("Mike Ion - https://github.com/mikeion")
+    st.subheader("Ask a question about a research paper and get an answer with sources!")
     st.subheader("Upload PDF or Enter URL")
 
     pdf_option = st.selectbox("Choose an option:", ["Upload PDF", "Enter URL"])
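
The main() additions sit on top of Streamlit's upload-or-URL pattern. A minimal sketch of that flow, independent of this app (widget labels are illustrative):

    import streamlit as st
    import requests
    from io import BytesIO

    pdf_option = st.selectbox("Choose an option:", ["Upload PDF", "Enter URL"])
    pdf_bytes = None
    if pdf_option == "Upload PDF":
        uploaded = st.file_uploader("Upload a PDF", type="pdf")
        if uploaded is not None:
            pdf_bytes = BytesIO(uploaded.read())
    else:
        url = st.text_input("Enter the URL of a PDF")
        if url:
            pdf_bytes = BytesIO(requests.get(url).content)  # fetch the PDF over HTTP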