wholewhale committed
Commit 9c04c52 · 1 Parent(s): 8db718c

summary stuffing

Files changed (1)
  1. app.py +26 -7
app.py CHANGED
@@ -23,30 +23,49 @@ def summary(self):
     avg_doc_length = sum(len(doc) for doc in self.documents) / num_documents
     return f"Number of documents: {num_documents}, Average document length: {avg_doc_length}"
 
-# PDF summary and query
+# PDF summary and query using stuffing
 def pdf_changes(pdf_doc):
     try:
         loader = OnlinePDFLoader(pdf_doc.name)
         documents = loader.load()
         text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
         texts = text_splitter.split_documents(documents)
+
+        # Initialize summary variable
+        full_summary = ""
+
+        # Divide the text into smaller chunks, for example 3 pages per chunk
+        for i in range(0, len(texts), 3):
+            chunk = " ".join(texts[i:i+3])
+
+            # Load the summarization chain with stuffing method
+            stuff_chain = load_summarize_chain(vertex_llm_text, chain_type="stuff", prompt=prompt)
+
+            # Generate summary for the chunk
+            chunk_summary = stuff_chain.run(chunk)
+
+            # Add the chunk summary to the full summary
+            full_summary += f"Summary of pages {i+1}-{i+3}:\n{chunk_summary}\n"
+
         embeddings = OpenAIEmbeddings()
         global db
         db = Chroma.from_documents(texts, embeddings)
-        summary = db.summary() # Assuming Chroma has a summary method
-
+
         retriever = db.as_retriever()
         global qa
         qa = ConversationalRetrievalChain.from_llm(
-            llm=OpenAI(temperature=0.2, model_name="gpt-3.5-turbo", max_tokens=-1, n=2),
-            retriever=retriever,
-            return_source_documents=False)
+            llm=OpenAI(temperature=0.2, model_name="gpt-3.5-turbo", max_tokens=-1, n=2),
+            retriever=retriever,
+            return_source_documents=False
+        )
+
+        return f"Ready. Full Summary:\n{full_summary}"
 
-        return f"Ready. {summary}" # Include the summary in the return message
     except Exception as e:
         return f"Error processing PDF: {str(e)}"
 
 
+
 def clear_data():
     global qa, db
     qa = None
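
The added summarization step calls load_summarize_chain(vertex_llm_text, chain_type="stuff", prompt=prompt), but neither vertex_llm_text nor prompt is defined in this hunk, and the classic LangChain "stuff" chain is normally run over a list of Document objects rather than a joined string. The sketch below shows that stuffing pattern under those assumptions, with an OpenAI LLM and a generic prompt standing in for the app's own vertex_llm_text and prompt; it is an illustration, not the commit's exact implementation.

# Minimal "stuff" summarization sketch (classic LangChain API).
# Assumptions: OpenAI stands in for the app's vertex_llm_text, and the
# prompt below stands in for the app's `prompt`, which this hunk never shows.
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate

prompt = PromptTemplate(
    input_variables=["text"],
    template="Write a concise summary of the following:\n\n{text}\n\nCONCISE SUMMARY:",
)
llm = OpenAI(temperature=0.2)  # stand-in for vertex_llm_text

def summarize_documents(docs):
    # The stuff chain concatenates every document into a single prompt,
    # so each group of documents must fit in the model's context window.
    chain = load_summarize_chain(llm, chain_type="stuff", prompt=prompt)
    return chain.run(docs)  # expects a list of Document objects

# Group the split documents three at a time, as the commit does.
texts = [Document(page_content=f"text of chunk {n}") for n in range(7)]
full_summary = ""
for i in range(0, len(texts), 3):
    group = texts[i:i + 3]
    full_summary += f"Summary of chunks {i + 1}-{i + len(group)}:\n"
    full_summary += summarize_documents(group) + "\n"
print(full_summary)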
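
The hunk also rebuilds the global ConversationalRetrievalChain (qa) after indexing the split texts into Chroma. A caller such as a Gradio handler elsewhere in app.py would query it with a question plus the running chat history; the helper below is a hypothetical illustration of that call pattern and is not part of this commit.

# Hypothetical helper; the real Gradio callback lives outside this hunk.
from langchain.chains import ConversationalRetrievalChain

def answer_question(qa_chain: ConversationalRetrievalChain, question: str, chat_history: list) -> str:
    # Classic LangChain call style: pass the question and prior (question, answer) pairs.
    result = qa_chain({"question": question, "chat_history": chat_history})
    return result["answer"]

# Usage, assuming pdf_changes() has already populated the global `qa`:
# history = []
# answer = answer_question(qa, "What is this PDF about?", history)
# history.append(("What is this PDF about?", answer))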