Waseemhassan771 committed on
Commit
bbe7373
·
verified ·
1 Parent(s): 8c6a92e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -27
app.py CHANGED
@@ -2,16 +2,24 @@ import os
2
  import streamlit as st
3
  import fitz # PyMuPDF
4
  from google.cloud import language_v1
5
- from google.oauth2 import service_account
6
- from dotenv import load_dotenv
7
  import requests
 
 
 
8
 
9
  # Load the environment variables from the .env file
10
  load_dotenv()
11
  google_api_key = os.getenv('GOOGLE_API_KEY')
 
12
 
13
- # Function to analyze entities using the API key
14
- def analyze_entities(text, api_key):
 
 
 
 
 
 
15
  url = f"https://language.googleapis.com/v1/documents:analyzeEntities?key={api_key}"
16
  headers = {
17
  "Content-Type": "application/json",
@@ -24,11 +32,12 @@ def analyze_entities(text, api_key):
24
  "encodingType": "UTF8"
25
  }
26
  response = requests.post(url, headers=headers, json=data)
27
- return response.json()
 
28
 
29
  # Streamlit app
30
  st.title("Chat with Your Document")
31
- st.write("Upload a PDF file to chat with its content using Google's Language API.")
32
 
33
  # File upload
34
  uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
@@ -41,32 +50,27 @@ if uploaded_file is not None:
41
  page = pdf_document.load_page(page_num)
42
  pdf_text += page.get_text()
43
 
44
- # Process the PDF text with Google's API
45
- analysis_result = analyze_entities(pdf_text, google_api_key)
46
-
47
- # Extract entities and their salience
48
- pdf_embeddings = []
49
- for entity in analysis_result.get('entities', []):
50
- pdf_embeddings.append({
51
- 'name': entity.get('name'),
52
- 'type': entity.get('type'),
53
- 'salience': entity.get('salience')
54
- })
55
 
56
  # Chat with the document
57
  user_input = st.text_input("Ask a question about the document:")
58
  if st.button("Ask"):
59
  if user_input:
60
- # Analyze the user input with Google's API
61
- analysis_result = analyze_entities(user_input, google_api_key)
62
- user_entities = analysis_result.get('entities', [])
63
 
64
- # Match user question with PDF content
65
- response_text = "Here are some key entities from the document:\n"
66
- for entity in user_entities:
67
- for pdf_entity in pdf_embeddings:
68
- if pdf_entity['name'] == entity.get('name'):
69
- response_text += f"Entity: {pdf_entity['name']}, Type: {pdf_entity['type']}, Salience: {pdf_entity['salience']}\n"
70
 
71
  st.write(response_text.strip())
72
  else:
@@ -74,4 +78,4 @@ if uploaded_file is not None:
74
 
75
  # Display the PDF text
76
  st.write("Extracted Text from PDF:")
77
- st.write(pdf_text)
 
2
  import streamlit as st
3
  import fitz # PyMuPDF
4
  from google.cloud import language_v1
 
 
5
  import requests
6
+ import json
7
+ from dotenv import load_dotenv
8
+ import pinecone
9
 
10
  # Load the environment variables from the .env file
11
  load_dotenv()
12
  google_api_key = os.getenv('GOOGLE_API_KEY')
13
+ pinecone_api_key = os.getenv('PINECONE_API_KEY')
14
 
15
+ # Initialize Pinecone
16
+ pinecone.init(api_key=pinecone_api_key, environment='us-west1-gcp')
17
+ index_name = 'pdf-analysis'
18
+ if index_name not in pinecone.list_indexes():
19
+ pinecone.create_index(index_name, dimension=768)
20
+
21
+ # Function to analyze entities and get embeddings using the API key
22
+ def get_embeddings(text, api_key):
23
  url = f"https://language.googleapis.com/v1/documents:analyzeEntities?key={api_key}"
24
  headers = {
25
  "Content-Type": "application/json",
 
32
  "encodingType": "UTF8"
33
  }
34
  response = requests.post(url, headers=headers, json=data)
35
+ embeddings = response.json()
36
+ return embeddings
37
 
38
  # Streamlit app
39
  st.title("Chat with Your Document")
40
+ st.write("Upload a PDF file to chat with its content using Google's Language API and Pinecone.")
41
 
42
  # File upload
43
  uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
 
50
  page = pdf_document.load_page(page_num)
51
  pdf_text += page.get_text()
52
 
53
+ # Get embeddings for the PDF text
54
+ embeddings = get_embeddings(pdf_text, google_api_key)
55
+ vectors = [(str(i), embedding) for i, embedding in enumerate(embeddings['entities'])]
56
+
57
+ # Create or connect to Pinecone index
58
+ index = pinecone.Index(index_name)
59
+ index.upsert(vectors)
 
 
 
 
60
 
61
  # Chat with the document
62
  user_input = st.text_input("Ask a question about the document:")
63
  if st.button("Ask"):
64
  if user_input:
65
+ # Get embeddings for the user query
66
+ user_query_embeddings = get_embeddings(user_input, google_api_key)
67
+ query_vector = user_query_embeddings['entities'][0]['name']
68
 
69
+ # Perform similarity search
70
+ results = index.query(query_vector, top_k=5)
71
+ response_text = "Relevant information from the document:\n"
72
+ for result in results['matches']:
73
+ response_text += f"Text: {result['text']}, Score: {result['score']}\n"
 
74
 
75
  st.write(response_text.strip())
76
  else:
 
78
 
79
  # Display the PDF text
80
  st.write("Extracted Text from PDF:")
81
+ st.write