# gaidorag / data_preprocessing.py
# Uploaded by varun1011 ("Upload 4 files", commit fc540fe, verified)
import pdfplumber
import uuid
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain.storage import InMemoryStore
from langchain.schema.document import Document
from langchain.embeddings import OpenAIEmbeddings
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain_huggingface import HuggingFaceEmbeddings
from pinecone import Pinecone as pineC, ServerlessSpec
from langchain_pinecone import Pinecone
import os
from dotenv import load_dotenv
load_dotenv()
def extract_pdf(file_path):
    """Extract the plain text and tables from every page of a PDF.

    Parameters
    ----------
    file_path : str | os.PathLike
        Path to a PDF file readable by pdfplumber.

    Returns
    -------
    tuple[list[str], list]
        ``(texts, tables)`` — one text string per page that yielded text,
        and a flat list of the tables found across all pages (each table
        is pdfplumber's list-of-rows representation).
    """
    texts = []
    tables = []
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            # extract_text() returns None for image-only pages; skip those
            # so downstream summarization never receives a None chunk.
            page_text = page.extract_text()
            if page_text:
                texts.append(page_text)
            # extend (not append) so `tables` is a flat list of tables
            # rather than a list of per-page table lists.
            page_tables = page.extract_tables()
            if page_tables:
                tables.extend(page_tables)
    return texts, tables
def summarize_data(texts, tables):
    """Produce short LLM summaries for extracted text chunks and tables.

    Parameters
    ----------
    texts : list
        Text chunks (e.g. per-page text from ``extract_pdf``). Falsy
        entries (None / empty strings from image-only pages) are skipped.
    tables : list
        Extracted tables; each is converted with ``str()`` before
        summarization.

    Returns
    -------
    tuple
        ``(texts, text_summaries, tables, table_summaries)`` where the
        returned ``texts`` contains only the non-empty chunks that were
        actually summarized, keeping texts and summaries aligned.

    Requires the ``GROQ_API_KEY`` environment variable.
    """
    prompt_text = """
You are an assistant tasked with summarizing tables and text.
Give a concise summary of the table or text that perfectly describes the table in starting 2 sentences.
Respond only with the summary, no additionnal comment.
Do not start your message by saying "Here is a summary" or anything like that.
Just give the summary as it is.
Table or text chunk: {element}
"""
    prompt = ChatPromptTemplate.from_template(prompt_text)

    # Summary chain: identity pass-through feeds each element into the prompt.
    model = ChatGroq(temperature=0, model="llama-3.1-8b-instant", api_key=os.environ["GROQ_API_KEY"])
    summarize_chain = {"element": lambda x: x} | prompt | model | StrOutputParser()

    # Drop None / empty chunks up front — batching a None element would
    # crash the chain, and keeping the filtered list returned keeps
    # texts[i] aligned with text_summaries[i] for the caller.
    texts = [t for t in texts if t]
    text_summaries = []
    if texts:
        text_summaries = summarize_chain.batch(texts, {"max_concurrency": 5})

    # Tables are summarized from their string representation.
    tables_html = [str(table) for table in tables]
    table_summaries = []
    if tables_html:
        table_summaries = summarize_chain.batch(tables_html, {"max_concurrency": 5})

    return texts, text_summaries, tables, table_summaries
def create_vectorstore():
    """Build an empty MultiVectorRetriever backed by a Pinecone index.

    Child-chunk summaries are embedded with a HuggingFace E5 model into
    the "gaidorag" Pinecone serverless index (created on first run);
    parent documents live in an in-memory docstore, linked by "doc_id".

    Requires the ``PINECONE_API_KEY`` environment variable.
    """
    # CPU-hosted multilingual E5 embeddings, unnormalized.
    embeddings = HuggingFaceEmbeddings(
        model_name="intfloat/multilingual-e5-large-instruct",
        model_kwargs={'device': 'cpu'},
        encode_kwargs={'normalize_embeddings': False},
    )

    # Parent-document storage (in memory — not persisted across runs).
    doc_store = InMemoryStore()

    client = pineC(api_key=os.environ["PINECONE_API_KEY"])
    index_name = "gaidorag"
    if index_name not in client.list_indexes().names():
        # First run: provision a serverless index sized for the E5-large
        # embedding dimension (1024), cosine similarity.
        client.create_index(
            index_name,
            dimension=1024,
            metric='cosine',
            spec=ServerlessSpec(cloud='aws', region='us-east-1'),
        )
    index = client.Index(index_name)

    # LangChain wrapper over the raw Pinecone index; summaries are stored
    # under the "text" metadata field.
    vector_store = Pinecone(index, embeddings, "text")

    # Empty retriever to start — documents are added by embed_docs().
    return MultiVectorRetriever(
        vectorstore=vector_store,
        docstore=doc_store,
        id_key="doc_id",
    )
def _index_with_summaries(retriever, elements, summaries, id_key="doc_id"):
    """Embed `summaries` into the vectorstore and stash the raw `elements`
    in the docstore, linked by fresh UUIDs stored under `id_key`."""
    ids = [str(uuid.uuid4()) for _ in elements]
    summary_docs = [
        Document(page_content=summary, metadata={id_key: ids[i]})
        for i, summary in enumerate(summaries)
    ]
    retriever.vectorstore.add_documents(summary_docs)
    retriever.docstore.mset(list(zip(ids, elements)))

def embed_docs(retriever, texts, text_summaries, tables, table_summaries):
    """Index extracted texts and tables into a MultiVectorRetriever.

    For each element, its summary is embedded in the vectorstore (for
    retrieval) while the raw element goes into the docstore (for return),
    both sharing the same generated "doc_id".

    Parameters
    ----------
    retriever : MultiVectorRetriever
        Retriever created by ``create_vectorstore``.
    texts, tables : list
        Raw elements to store; assumed positionally aligned with their
        summary lists (summaries may be shorter, never longer).
    text_summaries, table_summaries : list[str]
        LLM summaries produced by ``summarize_data``.
    """
    # The text path and table path are identical — one shared helper.
    _index_with_summaries(retriever, texts, text_summaries)
    _index_with_summaries(retriever, tables, table_summaries)
def process_docs(file_path):
    """End-to-end ingestion of one PDF: extract, summarize, and index.

    Returns the populated MultiVectorRetriever, ready for querying.
    """
    raw_texts, raw_tables = extract_pdf(file_path)
    raw_texts, text_summaries, raw_tables, table_summaries = summarize_data(raw_texts, raw_tables)
    retriever = create_vectorstore()
    embed_docs(retriever, raw_texts, text_summaries, raw_tables, table_summaries)
    return retriever