Commit
·
198bfc2
1
Parent(s):
b222c4c
add app
Browse files- .env copy +5 -0
- .gitignore +12 -1
- Dockerfile +11 -0
- README.md +10 -0
- app.py +90 -0
- langchain_components/ingest.py +213 -0
- langchain_components/replier.py +103 -0
- poetry.lock +0 -0
- pyproject.toml +25 -0
- requirements.txt +112 -0
.env copy
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
OPENAI_API_KEY=Your_OPENAI_API_KEY
|
2 |
+
POSTGRES_URL_EMBEDDINDS=YOUR_POSTGRES_URL
|
3 |
+
POSTGRES_URL=YOUR_POSTGRES_URL
|
4 |
+
PINECONE_API_KEY = YOUR_PINECONE_API_KEY
|
5 |
+
#POSTGRES_URL_SERVER=YOUR_POSTGRES_URL_SERVER
|
.gitignore
CHANGED
@@ -1,2 +1,13 @@
|
|
1 |
.env
|
2 |
-
venv/
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
.env
|
2 |
+
venv/
|
3 |
+
.venv/
|
4 |
+
myvenv/
|
5 |
+
my-env/
|
6 |
+
.env
|
7 |
+
|
8 |
+
data/
|
9 |
+
data_ingest/
|
10 |
+
__pycache__
|
11 |
+
|
12 |
+
images/
|
13 |
+
*.ipynb
|
Dockerfile
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Container image for the Streamlit PDF-chat app.
FROM python:3.9

WORKDIR /code

# Copy only the requirements first so the pip layer is cached across code edits.
COPY ./requirements.txt /code/requirements.txt

RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

COPY . .

# Bind to 0.0.0.0:7860 — the port/address contract for Hugging Face Spaces docker SDK.
CMD ["streamlit", "run", "app.py", "--server.address=0.0.0.0", "--server.port=7860"]
README.md
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: Streamlit Pdf
|
3 |
+
emoji: 📚
|
4 |
+
colorFrom: blue
|
5 |
+
colorTo: yellow
|
6 |
+
sdk: docker
|
7 |
+
pinned: false
|
8 |
+
---
|
9 |
+
|
10 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
ADDED
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import random
|
3 |
+
from langchain_components.replier import get_context_from_vectorstore,get_vectorstore_from_postgres,prepare_prompt_and_chain_with_history,get_vectorstore_from_pinecone
|
4 |
+
import fitz
|
5 |
+
|
6 |
+
def display_pdf(pdf_path):
    """Render every page of the PDF at *pdf_path* as an image in the sidebar.

    Any failure (missing file, corrupt PDF, ...) is reported in the sidebar
    instead of raising, so the chat UI keeps working.
    """
    try:
        doc = fitz.open(pdf_path)
        page_count = doc.page_count
        st.sidebar.write(f"Total pages: {page_count}")

        for index in range(page_count):
            pixmap = doc.load_page(index).get_pixmap()
            st.sidebar.image(
                pixmap.tobytes(),
                caption=f"Page {index + 1}",
                use_column_width=True,
            )

    except Exception as exc:
        # Best-effort viewer: surface the problem to the user, never crash.
        st.sidebar.error(f"Error loading PDF: {exc}")
19 |
+
|
20 |
+
|
21 |
+
|
22 |
+
def main():
    """Streamlit entry point: chat with a PDF (text, tables, graphs) via RAG.

    Session state keys used: ``activate_chat``, ``messages``, ``user_id``,
    ``session_id``, ``vectorstore``, ``chain``, ``pdf_path``.
    Fix vs. original: removed a leftover debug ``print("chain Done")`` and the
    un-idiomatic ``== True`` comparison.
    """
    st.header('Interact with your PDF that includes images, tables, and graphs.')

    if "activate_chat" not in st.session_state:
        st.session_state.activate_chat = False

    if "messages" not in st.session_state:
        st.session_state.messages = []

    with st.sidebar:
        username = st.text_input("Please enter your name here")
        if st.button('Press Button to Start chat with Narendra AI'):
            if "user_id" not in st.session_state:
                st.session_state.user_id = username

            if "session_id" not in st.session_state:
                # Random id keeps chat histories separate between app runs.
                random_number = random.randint(1, 1000000)
                st.session_state.session_id = str(random_number)

            if "vectorstore" not in st.session_state:
                collection_name = "fy2024_chunk_2000"
                pinecone_collection_name = "fy2024"
                st.session_state.vectorstore = get_vectorstore_from_postgres(collection_name)
                # st.session_state.vectorstore = get_vectorstore_from_pinecone(pinecone_collection_name)

            if "chain" not in st.session_state:
                st.session_state.chain = prepare_prompt_and_chain_with_history()

            st.session_state.activate_chat = True

        st.subheader("PDF Viewer")
        pdf_path = "data/fy2024.pdf"
        if st.button('Show PDF'):
            st.session_state.pdf_path = pdf_path

        if "pdf_path" in st.session_state:
            display_pdf(st.session_state.pdf_path)

    # Replay prior turns so the conversation survives Streamlit reruns.
    for message in st.session_state.messages:
        with st.chat_message(message["role"], avatar=message['avatar']):
            st.markdown(message["content"])

    if st.session_state.activate_chat:
        if prompt := st.chat_input("Ask your question from the PDF? "):
            with st.chat_message("user", avatar='👨🏻'):
                st.markdown(prompt)
            st.session_state.messages.append({"role": "user", "avatar": '👨🏻', "content": prompt})

            user_id = st.session_state.user_id
            session_id = st.session_state.session_id
            vectorstore = st.session_state.vectorstore
            chain = st.session_state.chain

            # Retrieve context first, then let the history-aware chain answer.
            data = get_context_from_vectorstore(vectorstore, prompt)
            ai_msg = chain.invoke(
                {"data": data, "input": prompt},
                config={"configurable": {"user_id": user_id, "session_id": session_id}},
            )
            cleaned_response = ai_msg.content
            with st.chat_message("assistant", avatar='🤖'):
                st.markdown(cleaned_response)
            st.session_state.messages.append({"role": "assistant", "avatar": '🤖', "content": cleaned_response})
87 |
+
|
88 |
+
|
89 |
+
# Script entry point.
if __name__ == "__main__":
    main()
langchain_components/ingest.py
ADDED
@@ -0,0 +1,213 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import uuid
|
3 |
+
import base64
|
4 |
+
from unstructured.partition.pdf import partition_pdf
|
5 |
+
from langchain_openai import ChatOpenAI
|
6 |
+
from langchain.chains import LLMChain
|
7 |
+
from langchain.prompts import PromptTemplate
|
8 |
+
from langchain.schema.messages import HumanMessage, SystemMessage
|
9 |
+
from langchain.schema.document import Document
|
10 |
+
from langchain_openai import OpenAIEmbeddings
|
11 |
+
from langchain_postgres.vectorstores import PGVector
|
12 |
+
from pinecone import Pinecone
|
13 |
+
from pinecone import ServerlessSpec
|
14 |
+
from langchain_pinecone import PineconeVectorStore
|
15 |
+
|
16 |
+
from dotenv import load_dotenv
|
17 |
+
load_dotenv()
|
18 |
+
|
19 |
+
# --- configuration ---------------------------------------------------------
openai_api_key = os.getenv("OPENAI_API_KEY")
POSTGRES_URL_EMBEDDINDS = os.getenv("POSTGRES_URL_EMBEDDINDS")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

# Source PDF and the directory where extracted images land. Overridable via
# environment so ingestion is not tied to one developer's machine; defaults
# preserve the original hard-coded paths.
filename = os.getenv(
    "INGEST_PDF_PATH",
    "/home/bluebash-005/code/bluebash/poc/stramlit_pdf/data/fy2024.pdf",
)
output_path = os.getenv(
    "INGEST_IMAGE_DIR",
    "/home/bluebash-005/code/bluebash/poc/stramlit_pdf/images",
)

openai_ef = OpenAIEmbeddings()

# --- module-level accumulators --------------------------------------------
# Raw element contents and their LLM-generated summaries; filled in by the
# *_insert() helpers below and consumed by get_docummets().
text_elements = []
text_summaries = []

table_elements = []
table_summaries = []

image_elements = []
image_summaries = []
|
40 |
+
|
41 |
+
def file_reader(pdf_path=None, image_dir=None):
    """Partition the PDF into title-chunked elements (text, tables, images).

    Args:
        pdf_path: PDF to ingest; defaults to the module-level ``filename``
            (backward compatible — calling ``file_reader()`` is unchanged).
        image_dir: directory for extracted images; defaults to ``output_path``.

    Returns:
        The list of unstructured elements produced by ``partition_pdf``.
    """
    raw_pdf_elements = partition_pdf(
        filename=pdf_path if pdf_path is not None else filename,
        extract_images_in_pdf=True,
        infer_table_structure=True,
        chunking_strategy="by_title",
        max_characters=4000,
        new_after_n_chars=3800,
        combine_text_under_n_chars=2000,
        extract_image_block_output_dir=image_dir if image_dir is not None else output_path,
    )
    return raw_pdf_elements
53 |
+
|
54 |
+
|
55 |
+
|
56 |
+
def text_insert(raw_pdf_elements):
    """Summarize every text and table element with GPT-3.5 and append the
    originals plus summaries to the module-level accumulator lists.

    Element kind is detected from ``repr`` of the unstructured element
    ('CompositeElement' -> text, 'Table' -> table); other kinds are skipped.
    """
    summary_prompt = """
    Summarize the following {element_type}:
    {element}
    """

    prompt = PromptTemplate.from_template(summary_prompt)
    llm = ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=openai_api_key, max_tokens=1024)
    summarize = prompt | llm

    for element in raw_pdf_elements:
        kind = repr(element)

        if 'CompositeElement' in kind:
            text_elements.append(element.text)
            result = summarize.invoke({'element_type': 'text', 'element': element})
            text_summaries.append(result.content)

        elif 'Table' in kind:
            table_elements.append(element.text)
            result = summarize.invoke({'element_type': 'table', 'element': element})
            table_summaries.append(result.content)
76 |
+
|
77 |
+
|
78 |
+
def image_insert():
    """Base64-encode every image extracted from the PDF, summarize it with a
    vision model, and append both to the module-level accumulator lists."""

    def encode_image(image_path):
        # Raw bytes -> base64 text so the image can travel in a JSON payload.
        with open(image_path, "rb") as f:
            return base64.b64encode(f.read()).decode('utf-8')

    def summarize_image(encoded_image):
        prompt = [
            SystemMessage(content="You are a bot that is good at analyzing images."),
            HumanMessage(content=[
                {
                    "type": "text",
                    "text": "Describe the contents of this image."
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{encoded_image}"
                    },
                },
            ]),
        ]
        llm = ChatOpenAI(model="gpt-4-vision-preview", openai_api_key=openai_api_key, max_tokens=1024)
        return llm.invoke(prompt).content

    for entry in os.listdir(output_path):
        if not entry.endswith(('.png', '.jpg', '.jpeg')):
            continue
        encoded = encode_image(os.path.join(output_path, entry))
        image_elements.append(encoded)
        image_summaries.append(summarize_image(encoded))
111 |
+
|
112 |
+
|
113 |
+
# Documents destined for the vector store, plus (id, content) pairs kept
# around for lookup/debugging; both are filled by get_docummets().
documents = []
retrieve_contents = []
115 |
+
|
116 |
+
def get_docummets():
    """Wrap every summarized element in a langchain ``Document`` (summary as
    page_content, original content in metadata) and collect everything in the
    module-level ``documents`` / ``retrieve_contents`` lists.

    Bug fix vs. original: the table and image loops reused the *last* uuid
    generated inside the text loop, so all table/image documents shared one
    stale id — each document now gets its own uuid.
    """

    def _collect(kind, original, summary, retrievable):
        # One fresh uuid per document (the original only did this for text).
        doc_id = str(uuid.uuid4())
        doc = Document(
            page_content=summary,
            metadata={
                'id': doc_id,
                'type': kind,
                'original_content': original,
            },
        )
        retrieve_contents.append((doc_id, retrievable))
        documents.append(doc)

    for e, s in zip(text_elements, text_summaries):
        _collect('text', e, s, e)
    print("text_element done")

    for e, s in zip(table_elements, table_summaries):
        _collect('table', e, s, e)
    print("table_elements done")

    for e, s in zip(image_elements, image_summaries):
        # For images, keep the summary (not the base64 blob) as the
        # retrievable content — matches the original behavior.
        _collect('image', e, s, s)
    print("image_elements Done")
158 |
+
|
159 |
+
def add_docs_to_postgres(collection_name):
    """Push the accumulated ``documents`` into a pgvector collection."""
    store = PGVector(
        embeddings=openai_ef,
        collection_name=collection_name,
        connection=POSTGRES_URL_EMBEDDINDS,
        use_jsonb=True,
    )
    store.add_documents(documents)
162 |
+
|
163 |
+
|
164 |
+
|
165 |
+
def add_docs_to_pinecone(index_name):
    """Recreate the Pinecone index *index_name* and upload all accumulated
    ``documents``.

    Fixes two defects in the original: a leftover ``pdb.set_trace()`` debug
    breakpoint, and the second half of ``documents`` (``doc2``) never being
    uploaded at all.
    """
    pc = Pinecone(api_key=PINECONE_API_KEY)

    spec = ServerlessSpec(cloud='aws', region='us-east-1')
    # Start from a clean index so re-ingestion never mixes stale vectors.
    if index_name in pc.list_indexes().names():
        pc.delete_index(index_name)

    # we create a new index
    pc.create_index(
        index_name,
        dimension=1536,  # matches OpenAI embedding size
        metric='dotproduct',
        spec=spec,
    )

    # Upload in two batches (original intent) — but now BOTH halves are sent.
    half = len(documents) // 2
    for batch in (documents[:half], documents[half:]):
        if batch:
            PineconeVectorStore.from_documents(
                batch,
                index_name=index_name,
                embedding=openai_ef,
            )
190 |
+
|
191 |
+
|
192 |
+
|
193 |
+
|
194 |
+
def main():
    """End-to-end ingestion: parse PDF -> summarize -> build docs -> upload."""
    collection_name = "fy2024"
    print("started file reader")
    elements = file_reader()
    print(elements)
    print()

    text_insert(elements)
    print("text_insert Done")
    image_insert()
    print("image_insert Done")
    print()
    get_docummets()
    print("get_docummets Done")
    # add_docs_to_postgres(collection_name)
    add_docs_to_pinecone(collection_name)
    print("Done")


if __name__ == "__main__":
    main()
langchain_components/replier.py
ADDED
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from langchain_openai import ChatOpenAI
|
3 |
+
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
|
4 |
+
from langchain_core.runnables.history import RunnableWithMessageHistory
|
5 |
+
from langchain_core.runnables import ConfigurableFieldSpec
|
6 |
+
from langchain_community.chat_message_histories import PostgresChatMessageHistory
|
7 |
+
from langchain_openai import OpenAIEmbeddings
|
8 |
+
from langchain_postgres.vectorstores import PGVector
|
9 |
+
from langchain_community.vectorstores import Pinecone
|
10 |
+
|
11 |
+
import logging
|
12 |
+
from dotenv import load_dotenv
|
13 |
+
load_dotenv()
|
14 |
+
|
15 |
+
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Propagate the key into the environment for libraries that read it there.
# Guarded: the original ``os.environ[...] = os.getenv(...)`` raised
# ``TypeError: str expected, not NoneType`` whenever the key was missing
# from the .env file.
_openai_key = os.getenv("OPENAI_API_KEY")
if _openai_key is not None:
    os.environ["OPENAI_API_KEY"] = _openai_key

POSTGRES_URL = os.getenv("POSTGRES_URL")
21 |
+
def create_postgres_chat_message_history(session_id, user_id):
    """History factory for ``RunnableWithMessageHistory``.

    NOTE: ``user_id`` is accepted (the chain is configured with both ids)
    but the Postgres-backed history is keyed by ``session_id`` alone.
    """
    return PostgresChatMessageHistory(
        connection_string=POSTGRES_URL,
        session_id=session_id,
    )
23 |
+
|
24 |
+
def prepare_prompt_and_chain_with_history():
    """Build the GPT-4o RAG chain wrapped with Postgres-backed chat history.

    Callers must supply ``user_id`` and ``session_id`` at invoke time via
    ``config={"configurable": {...}}``; the prompt expects ``data`` (retrieved
    context) and ``input`` (the user question).
    """
    llm = ChatOpenAI(model="gpt-4o")
    prompt = ChatPromptTemplate.from_messages(
        [
            (
                "system",
                """You are an expert in data that hepls to summerize the graph and extract information from that images. Answer the question based only on the following context, which can include text, images and tables: "
            ** IMPORTANT INSTRUCTIONS -->
            1. Reply in 50 words maximum.
            2. Only answer the question related to the context else say you don't know the answer.""",
            ),
            # A bare string entry is treated as a human message by langchain.
            "Here is the context to answer user's questions everytime --> {data}. ",
            MessagesPlaceholder(variable_name="history"),
            ("user", "{input}"),
        ]
    )

    chain = prompt | llm
    history_chain = RunnableWithMessageHistory(
        chain,
        create_postgres_chat_message_history,
        input_messages_key="input",
        history_messages_key="history",
        history_factory_config=[
            ConfigurableFieldSpec(
                id="user_id",
                annotation=str,
                name="User ID",
                description="Unique identifier for the user.",
                default="",
                is_shared=True,
            ),
            ConfigurableFieldSpec(
                id="session_id",
                annotation=str,
                name="Session ID",
                description="Unique identifier for the conversation.",
                default="",
                is_shared=True,
            ),
        ],
        verbose=True,
    )
    return history_chain
68 |
+
|
69 |
+
|
70 |
+
|
71 |
+
|
72 |
+
def get_vectorstore_from_postgres(collection_name):
    """Open the existing pgvector-backed store for *collection_name*."""
    embeddings = OpenAIEmbeddings()
    return PGVector(
        embeddings=embeddings,
        collection_name=collection_name,
        connection=POSTGRES_URL,
        use_jsonb=True,
    )
81 |
+
|
82 |
+
|
83 |
+
def get_vectorstore_from_pinecone(index_name):
    """Attach to an existing Pinecone index as a langchain vector store."""
    embeddings = OpenAIEmbeddings()
    return Pinecone.from_existing_index(index_name, embeddings)
87 |
+
|
88 |
+
|
89 |
+
def get_context_from_vectorstore(vectorstore, user_query):
    """Retrieve the top-4 documents for *user_query* and flatten them into a
    single tagged context string for the LLM prompt.

    Text and table hits contribute their full original content; image hits
    contribute only their summary (``page_content``), since the original
    content is a base64 blob. Documents of any other type are skipped.

    Fix vs. original: removed the dead local ``relevant_images`` that was
    accumulated but never returned or used.

    Returns:
        str: concatenation of '[text]...', '[table]...', '[image]...' chunks.
    """
    logging.info("Start postgres vector search......")
    relevant_docs = vectorstore.similarity_search(user_query, k=4)
    logging.info(relevant_docs)

    context = ""
    for d in relevant_docs:
        doc_type = d.metadata['type']
        if doc_type == 'text':
            context += '[text]' + d.metadata['original_content']
        elif doc_type == 'table':
            context += '[table]' + d.metadata['original_content']
        elif doc_type == 'image':
            context += '[image]' + d.page_content
    return context
poetry.lock
ADDED
The diff for this file is too large to render.
See raw diff
|
|
pyproject.toml
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[tool.poetry]
|
2 |
+
name = "streamlit-unstructed-pdf"
|
3 |
+
version = "0.1.0"
|
4 |
+
description = ""
|
5 |
+
authors = ["narendra-bluebash <[email protected]>"]
|
6 |
+
readme = "README.md"
|
7 |
+
|
8 |
+
[tool.poetry.dependencies]
|
9 |
+
python = ">=3.9,<3.9.7 || >3.9.7,<4.0"
|
10 |
+
streamlit = "1.35.0"
|
11 |
+
langchain-openai = "^0.1.9"
|
12 |
+
langchain = "^0.2.5"
|
13 |
+
langchain-community = "^0.2.5"
|
14 |
+
langchain-postgres = "^0.0.9"
|
15 |
+
python-dotenv = "^1.0.1"
|
16 |
+
psycopg2-binary = "^2.9.9"
|
17 |
+
pinecone-client = "^4.1.1"
|
18 |
+
pymupdf = "^1.24.6"
|
19 |
+
|
20 |
+
[tool.poetry.group.dev.dependencies]
|
21 |
+
ipykernel = "^6.29.4"
|
22 |
+
|
23 |
+
[build-system]
|
24 |
+
requires = ["poetry-core>=1.0.0"]
|
25 |
+
build-backend = "poetry.core.masonry.api"
|
requirements.txt
ADDED
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
aiohttp==3.9.5
|
2 |
+
aiosignal==1.3.1
|
3 |
+
altair==5.3.0
|
4 |
+
annotated-types==0.7.0
|
5 |
+
anyio==4.4.0
|
6 |
+
asttokens==2.4.1
|
7 |
+
async-timeout==4.0.3
|
8 |
+
attrs==23.2.0
|
9 |
+
blinker==1.8.2
|
10 |
+
cachetools==5.3.3
|
11 |
+
certifi==2024.6.2
|
12 |
+
charset-normalizer==3.3.2
|
13 |
+
click==8.1.7
|
14 |
+
comm==0.2.2
|
15 |
+
dataclasses-json==0.6.7
|
16 |
+
debugpy==1.8.1
|
17 |
+
decorator==5.1.1
|
18 |
+
distro==1.9.0
|
19 |
+
exceptiongroup==1.2.1
|
20 |
+
executing==2.0.1
|
21 |
+
frozenlist==1.4.1
|
22 |
+
gitdb==4.0.11
|
23 |
+
GitPython==3.1.43
|
24 |
+
greenlet==3.0.3
|
25 |
+
h11==0.14.0
|
26 |
+
httpcore==1.0.5
|
27 |
+
httpx==0.27.0
|
28 |
+
idna==3.7
|
29 |
+
importlib_metadata==7.2.1
|
30 |
+
ipykernel==6.29.4
|
31 |
+
ipython==8.18.1
|
32 |
+
jedi==0.19.1
|
33 |
+
Jinja2==3.1.4
|
34 |
+
jsonpatch==1.33
|
35 |
+
jsonpointer==3.0.0
|
36 |
+
jsonschema==4.22.0
|
37 |
+
jsonschema-specifications==2023.12.1
|
38 |
+
jupyter_client==8.6.2
|
39 |
+
jupyter_core==5.7.2
|
40 |
+
langchain==0.2.5
|
41 |
+
langchain-community==0.2.5
|
42 |
+
langchain-core==0.2.9
|
43 |
+
langchain-openai==0.1.9
|
44 |
+
langchain-postgres==0.0.9
|
45 |
+
langchain-text-splitters==0.2.1
|
46 |
+
langsmith==0.1.82
|
47 |
+
markdown-it-py==3.0.0
|
48 |
+
MarkupSafe==2.1.5
|
49 |
+
marshmallow==3.21.3
|
50 |
+
matplotlib-inline==0.1.7
|
51 |
+
mdurl==0.1.2
|
52 |
+
multidict==6.0.5
|
53 |
+
mypy-extensions==1.0.0
|
54 |
+
nest-asyncio==1.6.0
|
55 |
+
numpy==1.26.4
|
56 |
+
openai==1.35.3
|
57 |
+
orjson==3.10.5
|
58 |
+
packaging==24.1
|
59 |
+
pandas==2.2.2
|
60 |
+
parso==0.8.4
|
61 |
+
pexpect==4.9.0
|
62 |
+
pgvector==0.2.5
|
63 |
+
pillow==10.3.0
|
64 |
+
pinecone-client==4.1.1
|
65 |
+
pinecone-plugin-interface==0.0.7
|
66 |
+
platformdirs==4.2.2
|
67 |
+
prompt_toolkit==3.0.47
|
68 |
+
protobuf==4.25.3
|
69 |
+
psutil==6.0.0
|
70 |
+
psycopg==3.1.19
|
71 |
+
psycopg-pool==3.2.2
|
72 |
+
psycopg2-binary==2.9.9
|
73 |
+
ptyprocess==0.7.0
|
74 |
+
pure-eval==0.2.2
|
75 |
+
pyarrow==16.1.0
|
76 |
+
pydantic==2.7.4
|
77 |
+
pydantic_core==2.18.4
|
78 |
+
pydeck==0.9.1
|
79 |
+
Pygments==2.18.0
|
80 |
+
PyMuPDF==1.24.6
|
81 |
+
PyMuPDFb==1.24.6
|
82 |
+
python-dateutil==2.9.0.post0
|
83 |
+
python-dotenv==1.0.1
|
84 |
+
pytz==2024.1
|
85 |
+
PyYAML==6.0.1
|
86 |
+
pyzmq==26.0.3
|
87 |
+
referencing==0.35.1
|
88 |
+
regex==2024.5.15
|
89 |
+
requests==2.32.3
|
90 |
+
rich==13.7.1
|
91 |
+
rpds-py==0.18.1
|
92 |
+
six==1.16.0
|
93 |
+
smmap==5.0.1
|
94 |
+
sniffio==1.3.1
|
95 |
+
SQLAlchemy==2.0.31
|
96 |
+
stack-data==0.6.3
|
97 |
+
streamlit==1.35.0
|
98 |
+
tenacity==8.4.2
|
99 |
+
tiktoken==0.7.0
|
100 |
+
toml==0.10.2
|
101 |
+
toolz==0.12.1
|
102 |
+
tornado==6.4.1
|
103 |
+
tqdm==4.66.4
|
104 |
+
traitlets==5.14.3
|
105 |
+
typing-inspect==0.9.0
|
106 |
+
typing_extensions==4.12.2
|
107 |
+
tzdata==2024.1
|
108 |
+
urllib3==2.2.2
|
109 |
+
watchdog==4.0.1
|
110 |
+
wcwidth==0.2.13
|
111 |
+
yarl==1.9.4
|
112 |
+
zipp==3.19.2
|