Spaces:

Waseemhassan771
/

chat_document

Running

App Files Files Community

Waseemhassan771 committed on Feb 24

Commit

bbe7373

verified ·

1 Parent(s): 8c6a92e

Update app.py

Browse files

Files changed (1) hide show

app.py +31 -27

app.py CHANGED Viewed

@@ -2,16 +2,24 @@ import os
 import streamlit as st
 import fitz  # PyMuPDF
 from google.cloud import language_v1
-from google.oauth2 import service_account
-from dotenv import load_dotenv
 import requests
 # Load the environment variables from the .env file
 load_dotenv()
 google_api_key = os.getenv('GOOGLE_API_KEY')
-# Function to analyze entities using the API key
-def analyze_entities(text, api_key):
     url = f"https://language.googleapis.com/v1/documents:analyzeEntities?key={api_key}"
     headers = {
         "Content-Type": "application/json",
@@ -24,11 +32,12 @@ def analyze_entities(text, api_key):
         "encodingType": "UTF8"
     }
     response = requests.post(url, headers=headers, json=data)
-    return response.json()
 # Streamlit app
 st.title("Chat with Your Document")
-st.write("Upload a PDF file to chat with its content using Google's Language API.")
 # File upload
 uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
@@ -41,32 +50,27 @@ if uploaded_file is not None:
         page = pdf_document.load_page(page_num)
         pdf_text += page.get_text()
-    # Process the PDF text with Google's API
-    analysis_result = analyze_entities(pdf_text, google_api_key)
-    # Extract entities and their salience
-    pdf_embeddings = []
-    for entity in analysis_result.get('entities', []):
-        pdf_embeddings.append({
-            'name': entity.get('name'),
-            'type': entity.get('type'),
-            'salience': entity.get('salience')
-        })
     # Chat with the document
     user_input = st.text_input("Ask a question about the document:")
     if st.button("Ask"):
         if user_input:
-            # Analyze the user input with Google's API
-            analysis_result = analyze_entities(user_input, google_api_key)
-            user_entities = analysis_result.get('entities', [])
-            # Match user question with PDF content
-            response_text = "Here are some key entities from the document:\n"
-            for entity in user_entities:
-                for pdf_entity in pdf_embeddings:
-                    if pdf_entity['name'] == entity.get('name'):
-                        response_text += f"Entity: {pdf_entity['name']}, Type: {pdf_entity['type']}, Salience: {pdf_entity['salience']}\n"
             st.write(response_text.strip())
         else:
@@ -74,4 +78,4 @@ if uploaded_file is not None:
     # Display the PDF text
     st.write("Extracted Text from PDF:")
-    st.write(pdf_text)

 import streamlit as st
 import fitz  # PyMuPDF
 from google.cloud import language_v1
 import requests
+import json
+from dotenv import load_dotenv
+import pinecone
 # Load the environment variables from the .env file
 load_dotenv()
 google_api_key = os.getenv('GOOGLE_API_KEY')
+pinecone_api_key = os.getenv('PINECONE_API_KEY')
+# Initialize Pinecone
+pinecone.init(api_key=pinecone_api_key, environment='us-west1-gcp')
+index_name = 'pdf-analysis'
+if index_name not in pinecone.list_indexes():
+    pinecone.create_index(index_name, dimension=768)
+# Function to analyze entities and get embeddings using the API key
+def get_embeddings(text, api_key):
     url = f"https://language.googleapis.com/v1/documents:analyzeEntities?key={api_key}"
     headers = {
         "Content-Type": "application/json",
         "encodingType": "UTF8"
     }
     response = requests.post(url, headers=headers, json=data)
+    embeddings = response.json()
+    return embeddings
 # Streamlit app
 st.title("Chat with Your Document")
+st.write("Upload a PDF file to chat with its content using Google's Language API and Pinecone.")
 # File upload
 uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
         page = pdf_document.load_page(page_num)
         pdf_text += page.get_text()
+    # Get embeddings for the PDF text
+    embeddings = get_embeddings(pdf_text, google_api_key)
+    vectors = [(str(i), embedding) for i, embedding in enumerate(embeddings['entities'])]
+    # Create or connect to Pinecone index
+    index = pinecone.Index(index_name)
+    index.upsert(vectors)
     # Chat with the document
     user_input = st.text_input("Ask a question about the document:")
     if st.button("Ask"):
         if user_input:
+            # Get embeddings for the user query
+            user_query_embeddings = get_embeddings(user_input, google_api_key)
+            query_vector = user_query_embeddings['entities'][0]['name']
+            # Perform similarity search
+            results = index.query(query_vector, top_k=5)
+            response_text = "Relevant information from the document:\n"
+            for result in results['matches']:
+                response_text += f"Text: {result['text']}, Score: {result['score']}\n"
             st.write(response_text.strip())
         else:
     # Display the PDF text
     st.write("Extracted Text from PDF:")
+    st.write