# Author: narendra-bluebash
# Commit message: add app
# Commit: 198bfc2
import os
import uuid
import base64
from unstructured.partition.pdf import partition_pdf
from langchain_openai import ChatOpenAI
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.schema.messages import HumanMessage, SystemMessage
from langchain.schema.document import Document
from langchain_openai import OpenAIEmbeddings
from langchain_postgres.vectorstores import PGVector
from pinecone import Pinecone
from pinecone import ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment before
# the os.getenv() lookups below.
load_dotenv()
# Credentials / connection settings; os.getenv returns None when a variable
# is not set.
openai_api_key = os.getenv("OPENAI_API_KEY")
# NOTE: the "EMBEDDINDS" spelling is also the environment-variable key that is
# read at runtime, so it has to match whatever the .env file defines.
POSTGRES_URL_EMBEDDINDS=os.getenv("POSTGRES_URL_EMBEDDINDS")
PINECONE_API_KEY=os.getenv("PINECONE_API_KEY")
# NOTE(review): both paths are hard-coded absolute locations; confirm they
# exist on the machine running this script.
# PDF that file_reader() partitions.
filename="/home/bluebash-005/code/bluebash/poc/stramlit_pdf/data/fy2024.pdf"
# Directory passed to partition_pdf for extracted images and scanned by
# image_insert().
output_path = "/home/bluebash-005/code/bluebash/poc/stramlit_pdf/images"
# Embedding object shared by add_docs_to_postgres() and add_docs_to_pinecone().
openai_ef = OpenAIEmbeddings()
# Module-level accumulators: text_insert() fills the text/table lists,
# image_insert() fills the image lists, and get_docummets() reads all of them.
# Each *_elements list is paired index-by-index with its *_summaries list.
text_elements = []
text_summaries = []
table_elements = []
table_summaries = []
image_elements = []
image_summaries = []
def file_reader():
    """Partition the configured PDF and return the resulting elements.

    Reads the module-level ``filename`` as input and passes the module-level
    ``output_path`` as the image-block output directory.

    Returns:
        The value returned by ``partition_pdf`` for the options below.
    """
    partition_options = {
        "filename": filename,
        "extract_images_in_pdf": True,
        "infer_table_structure": True,
        "chunking_strategy": "by_title",
        "max_characters": 4000,
        "new_after_n_chars": 3800,
        "combine_text_under_n_chars": 2000,
        "extract_image_block_output_dir": output_path,
    }
    return partition_pdf(**partition_options)
def text_insert(raw_pdf_elements):
    """Summarize text and table elements with the chat model.

    An element whose ``repr`` contains ``'CompositeElement'`` has its ``.text``
    appended to ``text_elements`` and its summary to ``text_summaries``.
    Otherwise, if the ``repr`` contains ``'Table'``, the table lists are used
    instead. Elements matching neither marker are ignored.

    Args:
        raw_pdf_elements: Iterable of elements exposing a ``.text`` attribute.
    """
    summary_template = (
        "\n"
        "Summarize the following {element_type}:\n"
        "{element}\n"
    )
    template = PromptTemplate.from_template(summary_template)
    chat_model = ChatOpenAI(
        model="gpt-3.5-turbo",
        openai_api_key=openai_api_key,
        max_tokens=1024,
    )
    chain = template | chat_model

    # (repr marker, prompt label, raw-text sink, summary sink); the first
    # matching route wins, mirroring an if/elif chain.
    routes = (
        ('CompositeElement', 'text', text_elements, text_summaries),
        ('Table', 'table', table_elements, table_summaries),
    )

    for element in raw_pdf_elements:
        element_repr = repr(element)
        for marker, label, raw_sink, summary_sink in routes:
            if marker not in element_repr:
                continue
            raw_sink.append(element.text)
            reply = chain.invoke({'element_type': label, 'element': element})
            summary_sink.append(reply.content)
            break
def image_insert():
    """Describe every extracted image with the vision chat model.

    Scans the module-level ``output_path`` for files ending in ``.png``,
    ``.jpg`` or ``.jpeg``. For each one, the base64-encoded file content is
    appended to ``image_elements`` and the model's description to
    ``image_summaries``. Files are processed in sorted name order so the
    result is reproducible across runs.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg')

    def encode_image(image_path):
        """Return the file's bytes as a base64 string."""
        with open(image_path, "rb") as f:
            return base64.b64encode(f.read()).decode('utf-8')

    def summarize_image(encoded_image, mime_type="image/jpeg"):
        """Ask the vision model to describe one base64-encoded image."""
        prompt = [
            SystemMessage(content="You are a bot that is good at analyzing images."),
            HumanMessage(content=[
                {
                    "type": "text",
                    "text": "Describe the contents of this image."
                },
                {
                    "type": "image_url",
                    "image_url": {
                        # Declare the real media type instead of always
                        # claiming JPEG.
                        "url": f"data:{mime_type};base64,{encoded_image}"
                    },
                },
            ])
        ]
        return vision_llm.invoke(prompt).content

    image_names = sorted(
        name for name in os.listdir(output_path)
        if name.endswith(image_suffixes)
    )
    if not image_names:
        # Nothing to describe; avoid constructing the client at all.
        return

    # Built once and reused for every image rather than once per image.
    # NOTE(review): confirm "gpt-4-vision-preview" is still served by the API.
    vision_llm = ChatOpenAI(
        model="gpt-4-vision-preview",
        openai_api_key=openai_api_key,
        max_tokens=1024,
    )

    for name in image_names:
        mime_type = "image/png" if name.endswith('.png') else "image/jpeg"
        encoded_image = encode_image(os.path.join(output_path, name))
        image_elements.append(encoded_image)
        image_summaries.append(summarize_image(encoded_image, mime_type))
# Documents to be embedded (summary as page content, original in metadata).
documents = []
# (document id, content) pairs recorded alongside ``documents``.
retrieve_contents = []


def _append_document(doc_type, original, summary, retrieved):
    """Create one Document with its own fresh id and record it in both lists."""
    doc_id = str(uuid.uuid4())
    doc = Document(
        page_content=summary,
        metadata={
            'id': doc_id,
            'type': doc_type,
            'original_content': original,
        },
    )
    retrieve_contents.append((doc_id, retrieved))
    documents.append(doc)


def get_docummets():
    """Build Documents from the collected text, table and image summaries.

    Each summary becomes the ``page_content`` of a Document whose metadata
    carries a unique ``id``, the element ``type`` and the ``original_content``.
    Results are appended to the module-level ``documents`` and
    ``retrieve_contents`` lists.

    Every document gets its own UUID. Previously only text documents did:
    tables and images reused the id of the last text element, and the function
    raised ``NameError`` when there were no text elements at all.
    """
    for element, summary in zip(text_elements, text_summaries):
        _append_document('text', element, summary, element)
    print("text_element done")

    for element, summary in zip(table_elements, table_summaries):
        _append_document('table', element, summary, element)
    print("table_elements done")

    for element, summary in zip(image_elements, image_summaries):
        # For images, retrieve_contents stores the summary rather than the
        # base64 payload (unchanged from the original behavior).
        _append_document('image', element, summary, summary)
    print("image_elements Done")
def add_docs_to_postgres(collection_name):
    """Store the module-level ``documents`` in a PGVector collection.

    Args:
        collection_name: Name of the PGVector collection to write into.
    """
    store_options = {
        "embeddings": openai_ef,
        "collection_name": collection_name,
        "connection": POSTGRES_URL_EMBEDDINDS,
        "use_jsonb": True,
    }
    pg_store = PGVector(**store_options)
    pg_store.add_documents(documents)
def add_docs_to_pinecone(index_name):
    """Recreate a Pinecone index and upload the module-level ``documents``.

    An existing index with the same name is deleted first, so each run starts
    from an empty index. All documents are uploaded; the earlier version
    stopped at a leftover ``pdb.set_trace()`` breakpoint and then uploaded
    only the first half of ``documents``.

    Args:
        index_name: Name of the Pinecone index to (re)create and fill.
    """
    pc = Pinecone(api_key=PINECONE_API_KEY)
    spec = ServerlessSpec(cloud='aws', region='us-east-1')

    if index_name in pc.list_indexes().names():
        pc.delete_index(index_name)

    # Always create a fresh index.
    # NOTE(review): dimension=1536 must match the output size of ``openai_ef``.
    pc.create_index(
        index_name,
        dimension=1536,
        metric='dotproduct',
        spec=spec
    )

    PineconeVectorStore.from_documents(
        documents,
        index_name=index_name,
        embedding=openai_ef
    )
def main():
    """Run the pipeline: partition, summarize, build documents, upload."""
    collection_name = "fy2024"

    # Stage 1: partition the PDF.
    print("started file reader")
    elements = file_reader()
    print(elements)
    print()

    # Stage 2: summarize text/table elements, then extracted images.
    text_insert(elements)
    print("text_insert Done")
    image_insert()
    print("image_insert Done")
    print()

    # Stage 3: turn the summaries into Documents.
    get_docummets()
    print("get_docummets Done")

    # Stage 4: upload. The Postgres target is currently disabled:
    #add_docs_to_postgres(collection_name)
    add_docs_to_pinecone(collection_name)
    print("Done")


if __name__ == "__main__":
    main()