|
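"""Ingest a PDF into vector stores for multimodal RAG: partition it into text,
tables, and images with unstructured, summarize each element with OpenAI
models, and index the summaries (originals kept in metadata) in Postgres
(PGVector) and/or Pinecone."""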
import os
import uuid
import base64

from dotenv import load_dotenv
from unstructured.partition.pdf import partition_pdf
from langchain.prompts import PromptTemplate
from langchain.schema.messages import HumanMessage, SystemMessage
from langchain.schema.document import Document
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_postgres.vectorstores import PGVector
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec

load_dotenv()

openai_api_key = os.getenv("OPENAI_API_KEY")
POSTGRES_URL_EMBEDDINGS = os.getenv("POSTGRES_URL_EMBEDDINGS")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

filename = "/home/bluebash-005/code/bluebash/poc/stramlit_pdf/data/fy2024.pdf"
output_path = "/home/bluebash-005/code/bluebash/poc/stramlit_pdf/images"

openai_ef = OpenAIEmbeddings()

# Extracted elements and their LLM-generated summaries, filled in by the
# insert functions below.
text_elements = []
text_summaries = []

table_elements = []
table_summaries = []

image_elements = []
image_summaries = []


def file_reader():
    # Partition the PDF into chunked text and table elements; embedded images
    # are extracted to output_path as separate files.
    raw_pdf_elements = partition_pdf(
        filename=filename,
        extract_images_in_pdf=True,
        infer_table_structure=True,
        chunking_strategy="by_title",
        max_characters=4000,
        new_after_n_chars=3800,
        combine_text_under_n_chars=2000,
        extract_image_block_output_dir=output_path,
    )
    return raw_pdf_elements
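
# With chunking_strategy="by_title", unstructured yields CompositeElement
# objects for narrative text and Table elements for tables, which is what the
# repr() checks in text_insert() below rely on; images are not returned inline
# but saved under output_path for image_insert() to pick up.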

def text_insert(raw_pdf_elements):
    summary_prompt = """
    Summarize the following {element_type}:
    {element}
    """
    prompt = PromptTemplate.from_template(summary_prompt)
    llm = ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=openai_api_key, max_tokens=1024)
    runnable = prompt | llm

    # Summarize narrative chunks and tables separately, keeping the original
    # text alongside each summary.
    for e in raw_pdf_elements:
        if 'CompositeElement' in repr(e):
            text_elements.append(e.text)
            summary = runnable.invoke({'element_type': 'text', 'element': e})
            text_summaries.append(summary.content)
        elif 'Table' in repr(e):
            table_elements.append(e.text)
            summary = runnable.invoke({'element_type': 'table', 'element': e})
            table_summaries.append(summary.content)


def image_insert():
    def encode_image(image_path):
        with open(image_path, "rb") as f:
            return base64.b64encode(f.read()).decode('utf-8')

    def summarize_image(encoded_image):
        # Send the image to the vision model as a base64 data URL.
        prompt = [
            SystemMessage(content="You are a bot that is good at analyzing images."),
            HumanMessage(content=[
                {
                    "type": "text",
                    "text": "Describe the contents of this image."
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{encoded_image}"
                    },
                },
            ])
        ]
        response = ChatOpenAI(model="gpt-4-vision-preview", openai_api_key=openai_api_key, max_tokens=1024).invoke(prompt)
        return response.content

    for i in os.listdir(output_path):
        if i.endswith(('.png', '.jpg', '.jpeg')):
            image_path = os.path.join(output_path, i)
            encoded_image = encode_image(image_path)
            image_elements.append(encoded_image)
            summary = summarize_image(encoded_image)
            image_summaries.append(summary)


documents = []
retrieve_contents = []


def get_documents():
    # Index each summary as page_content; stash the original element in
    # metadata so the raw text/table/image can be surfaced after retrieval.
    for e, s in zip(text_elements, text_summaries):
        i = str(uuid.uuid4())
        doc = Document(
            page_content=s,
            metadata={
                'id': i,
                'type': 'text',
                'original_content': e
            }
        )
        retrieve_contents.append((i, e))
        documents.append(doc)
    print("text_elements done")

    for e, s in zip(table_elements, table_summaries):
        i = str(uuid.uuid4())
        doc = Document(
            page_content=s,
            metadata={
                'id': i,
                'type': 'table',
                'original_content': e
            }
        )
        retrieve_contents.append((i, e))
        documents.append(doc)
    print("table_elements done")

    for e, s in zip(image_elements, image_summaries):
        i = str(uuid.uuid4())
        doc = Document(
            page_content=s,
            metadata={
                'id': i,
                'type': 'image',
                'original_content': e  # base64-encoded image
            }
        )
        retrieve_contents.append((i, s))
        documents.append(doc)
    print("image_elements done")


def add_docs_to_postgres(collection_name):
    vectorstore = PGVector(
        embeddings=openai_ef,
        collection_name=collection_name,
        connection=POSTGRES_URL_EMBEDDINGS,
        use_jsonb=True,
    )
    vectorstore.add_documents(documents)
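

# A minimal retrieval sketch to pair with add_docs_to_postgres; this helper is
# not part of the original pipeline, and its name and defaults are illustrative
# assumptions. A query is matched against the indexed summaries, then the
# original element is recovered from metadata.
def query_postgres(collection_name, query, k=4):
    vectorstore = PGVector(
        embeddings=openai_ef,
        collection_name=collection_name,
        connection=POSTGRES_URL_EMBEDDINGS,
        use_jsonb=True,
    )
    results = vectorstore.similarity_search(query, k=k)
    return [(d.metadata.get('type'), d.page_content, d.metadata.get('original_content'))
            for d in results]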


def add_docs_to_pinecone(index_name):
    pc = Pinecone(api_key=PINECONE_API_KEY)

    # Recreate the serverless index from scratch on each run.
    spec = ServerlessSpec(cloud='aws', region='us-east-1')
    if index_name in pc.list_indexes().names():
        pc.delete_index(index_name)
    pc.create_index(
        index_name,
        dimension=1536,  # dimensionality of the default OpenAIEmbeddings model
        metric='dotproduct',
        spec=spec
    )

    # Upsert in two halves so individual requests stay small.
    n = len(documents) // 2
    doc1 = documents[:n]
    doc2 = documents[n:]

    vectorstore_from_docs = PineconeVectorStore.from_documents(
        doc1,
        index_name=index_name,
        embedding=openai_ef
    )
    vectorstore_from_docs.add_documents(doc2)


def main():
    collection_name = "fy2024"

    print("started file reader")
    raw_pdf_elements = file_reader()
    print(raw_pdf_elements)
    print()

    text_insert(raw_pdf_elements)
    print("text_insert Done")
    image_insert()
    print("image_insert Done")
    print()

    get_documents()
    print("get_documents Done")

    add_docs_to_pinecone(collection_name)
    print("Done")


if __name__ == "__main__":
    main()
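
# Example usage after running main() (a sketch, not part of the original
# script; the query string is an illustrative assumption):
#
#   store = PineconeVectorStore(index_name="fy2024", embedding=openai_ef)
#   for doc in store.similarity_search("What were the FY2024 revenue highlights?", k=3):
#       print(doc.metadata['type'], '->', doc.page_content[:80])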