|
import pdfplumber
|
|
import uuid
|
|
from langchain_groq import ChatGroq
|
|
from langchain_core.prompts import ChatPromptTemplate
|
|
from langchain_core.output_parsers import StrOutputParser
|
|
from langchain.storage import InMemoryStore
|
|
from langchain.schema.document import Document
|
|
from langchain.embeddings import OpenAIEmbeddings
|
|
from langchain.retrievers.multi_vector import MultiVectorRetriever
|
|
from langchain_huggingface import HuggingFaceEmbeddings
|
|
from pinecone import Pinecone as pineC, ServerlessSpec
|
|
from langchain_pinecone import Pinecone
|
|
import os
|
|
from dotenv import load_dotenv
|
|
load_dotenv()
|
|
|
|
def extract_pdf(file_path):
    """Extract per-page text and tables from a PDF.

    Args:
        file_path: Path to a PDF file readable by pdfplumber.

    Returns:
        A tuple ``(texts, tables)``:
        - ``texts``: list of page text strings; pages with no extractable
          text (e.g. image-only pages) are skipped.
        - ``tables``: flat list of extracted tables, each a list of rows
          as produced by pdfplumber.
    """
    texts = []
    tables = []

    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            # extract_text() returns None for pages without a text layer;
            # skip those so downstream summarization never receives None.
            page_text = page.extract_text()
            if page_text:
                texts.append(page_text)

            # Call extract_tables() once (it re-parses the page each call)
            # and flatten so every table is its own list element.
            page_tables = page.extract_tables()
            if page_tables:
                tables.extend(page_tables)

    return texts, tables
|
|
def summarize_data(texts, tables):
    """Summarize text chunks and tables with a Groq-hosted LLM.

    Args:
        texts: List of text chunks (falsy entries are dropped).
        tables: List of tables; each is stringified before summarization.

    Returns:
        A tuple ``(texts, text_summaries, tables, table_summaries)``.
        ``texts`` is the filtered input so it stays index-aligned with
        ``text_summaries``; summary lists are empty when input is empty.

    Raises:
        KeyError: If the GROQ_API_KEY environment variable is unset.
    """
    prompt_text = """
    You are an assistant tasked with summarizing tables and text.
    Give a concise summary of the table or text that perfectly describes the table in starting 2 sentences.

    Respond only with the summary, no additional comment.
    Do not start your message by saying "Here is a summary" or anything like that.
    Just give the summary as it is.

    Table or text chunk: {element}
    """
    prompt = ChatPromptTemplate.from_template(prompt_text)

    # temperature=0 keeps the summaries deterministic for a given input.
    model = ChatGroq(
        temperature=0,
        model="llama-3.1-8b-instant",
        api_key=os.environ["GROQ_API_KEY"],
    )
    summarize_chain = {"element": lambda x: x} | prompt | model | StrOutputParser()

    # Drop None/empty chunks so the LLM is never asked to summarize
    # blank input; the filtered list is returned to keep it aligned
    # with text_summaries for downstream id pairing.
    texts = [t for t in texts if t]
    text_summaries = []
    if texts:
        text_summaries = summarize_chain.batch(texts, {"max_concurrency": 5})

    # Tables arrive as nested Python lists; stringify for the prompt.
    tables_html = [str(table) for table in tables]
    table_summaries = []
    if tables_html:
        table_summaries = summarize_chain.batch(tables_html, {"max_concurrency": 5})

    return texts, text_summaries, tables, table_summaries
|
|
|
|
def create_vectorstore():
    """Build a MultiVectorRetriever backed by Pinecone and HF embeddings.

    Creates (if absent) a 1024-dim serverless Pinecone index named
    "gaidorag" and wires it to a multilingual E5 embedding model on CPU,
    with an in-memory docstore for the raw parent documents.

    Returns:
        A configured ``MultiVectorRetriever``.

    Raises:
        KeyError: If the PINECONE_API_KEY environment variable is unset.
    """
    import time  # local: only needed for the index-readiness poll below

    # Embedding model: multilingual E5 large, 1024-dim — must match the
    # Pinecone index dimension configured below.
    hf = HuggingFaceEmbeddings(
        model_name="intfloat/multilingual-e5-large-instruct",
        model_kwargs={"device": "cpu"},
        encode_kwargs={"normalize_embeddings": False},
    )

    store = InMemoryStore()
    id_key = "doc_id"

    pc = pineC(api_key=os.environ["PINECONE_API_KEY"])

    index_name = "gaidorag"
    text_field = "text"
    spec = ServerlessSpec(cloud="aws", region="us-east-1")

    if index_name not in pc.list_indexes().names():
        pc.create_index(
            index_name,
            dimension=1024,   # E5-large-instruct output size
            metric="cosine",
            spec=spec,
        )
        # Serverless index creation is asynchronous; poll until the index
        # reports ready so the first upsert does not fail.
        while not pc.describe_index(index_name).status["ready"]:
            time.sleep(1)

    index = pc.Index(index_name)

    vectorstore = Pinecone(index, hf, text_field)

    retriever = MultiVectorRetriever(
        vectorstore=vectorstore,
        docstore=store,
        id_key=id_key,
    )
    return retriever
|
|
def embed_docs(retriever, texts, text_summaries, tables, table_summaries):
    """Index summaries in the vectorstore and raw content in the docstore.

    For texts and tables alike: each summary becomes a vectorstore
    Document tagged with a fresh UUID, and the same UUID maps to the raw
    item in the retriever's docstore, so retrieval on a summary can
    resolve back to the original content.

    Args:
        retriever: A MultiVectorRetriever with vectorstore and docstore.
        texts: Raw text chunks, aligned with ``text_summaries``.
        text_summaries: One summary string per text chunk.
        tables: Raw tables, aligned with ``table_summaries``.
        table_summaries: One summary string per table.
    """
    id_key = "doc_id"

    def _index(contents, summaries):
        # One UUID per original item links its summary to the raw content.
        ids = [str(uuid.uuid4()) for _ in contents]
        docs = [
            Document(page_content=summary, metadata={id_key: ids[pos]})
            for pos, summary in enumerate(summaries)
        ]
        retriever.vectorstore.add_documents(docs)
        retriever.docstore.mset(list(zip(ids, contents)))

    _index(texts, text_summaries)
    _index(tables, table_summaries)
|
|
|
|
|
|
def process_docs(file_path):
    """Run the full pipeline for one PDF: extract, summarize, index.

    Args:
        file_path: Path to the PDF to ingest.

    Returns:
        A MultiVectorRetriever loaded with the document's summaries
        (vectorstore) and raw content (docstore).
    """
    raw_texts, raw_tables = extract_pdf(file_path)
    texts, text_summaries, tables, table_summaries = summarize_data(
        raw_texts, raw_tables
    )

    retriever = create_vectorstore()
    embed_docs(retriever, texts, text_summaries, tables, table_summaries)
    return retriever
|
|
|