Upload 4 files
- app.py +62 -0
- data_preprocessing.py +133 -0
- rag.py +112 -0
- requirements.txt +12 -0
app.py
ADDED
@@ -0,0 +1,62 @@
import streamlit as st
from pathlib import Path
from data_preprocessing import process_docs
from rag import create_rag_chain
import time

# Streamed response emulator: yields the answer word by word with a short delay
def response_generator(prompt, chain):
    response = chain.invoke(prompt)
    for word in response.split():
        yield word + " "
        time.sleep(0.05)


# Directory and path used to save the uploaded file
save_directory = "docs"
save_path = "docs/file.pdf"

st.title("📝 InsureAgent")
with st.sidebar:
    uploaded_file = st.file_uploader("Upload a document", type="pdf")
    if uploaded_file is not None:
        with open(save_path, "wb") as f:
            f.write(uploaded_file.getbuffer())
        st.success(f"File saved successfully: {save_path}")
        retriever = process_docs(save_path)
        chain, chain_with_sources = create_rag_chain(retriever)

# Initialize chat history
if "messages" not in st.session_state:
    st.session_state.messages = []
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

if prompt := st.chat_input("What is up?"):
    if uploaded_file is None:
        st.warning("Please upload a PDF before asking a question.")
    else:
        # Add user message to chat history
        st.session_state.messages.append({"role": "user", "content": prompt})
        # Display user message in chat message container
        with st.chat_message("user"):
            st.markdown(prompt)

        # Display assistant response in chat message container
        with st.chat_message("assistant"):
            response = st.write_stream(response_generator(prompt, chain))
        # Add assistant response to chat history
        st.session_state.messages.append({"role": "assistant", "content": response})
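Note: app.py writes the upload to docs/file.pdf but never creates the docs directory, so the first save can fail on a fresh checkout. A minimal sketch of a guard that could run near the top of app.py (it reuses the Path import already present there); this is a suggestion, not part of the uploaded code:

# Hypothetical guard, not in the upload: make sure the save directory exists
# before the sidebar uploader writes docs/file.pdf.
from pathlib import Path

Path("docs").mkdir(parents=True, exist_ok=True)

With that in place the app is started the usual Streamlit way, e.g. streamlit run app.py.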
data_preprocessing.py
ADDED
@@ -0,0 +1,133 @@
import pdfplumber
import uuid
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain.storage import InMemoryStore
from langchain.schema.document import Document
from langchain.embeddings import OpenAIEmbeddings
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain_huggingface import HuggingFaceEmbeddings
from pinecone import Pinecone as pineC, ServerlessSpec
from langchain_pinecone import Pinecone
import os
from dotenv import load_dotenv

load_dotenv()


def extract_pdf(file_path):
    texts = []
    tables = []
    # Open the PDF and extract text and tables page by page
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            texts.append(page.extract_text())
            if page.extract_tables():
                tables.append(page.extract_tables())
    return texts, tables


def summarize_data(texts, tables):
    prompt_text = """
    You are an assistant tasked with summarizing tables and text.
    Give a concise summary of the table or text chunk; the first 2 sentences should describe it precisely.

    Respond only with the summary, no additional comment.
    Do not start your message by saying "Here is a summary" or anything like that.
    Just give the summary as it is.

    Table or text chunk: {element}
    """
    prompt = ChatPromptTemplate.from_template(prompt_text)

    # Summary chain
    model = ChatGroq(temperature=0, model="llama-3.1-8b-instant", api_key=os.environ["GROQ_API_KEY"])
    summarize_chain = {"element": lambda x: x} | prompt | model | StrOutputParser()

    # Summarize extracted text
    text_summaries = []
    if texts:
        text_summaries = summarize_chain.batch(texts, {"max_concurrency": 5})

    # Summarize extracted tables
    tables_html = [str(table) for table in tables]  # Convert tables to string format
    table_summaries = []
    if tables_html:
        table_summaries = summarize_chain.batch(tables_html, {"max_concurrency": 5})
    return texts, text_summaries, tables, table_summaries


def create_vectorstore():
    model_name = "intfloat/multilingual-e5-large-instruct"
    model_kwargs = {'device': 'cpu'}
    encode_kwargs = {'normalize_embeddings': False}
    hf = HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs=model_kwargs,
        encode_kwargs=encode_kwargs
    )

    # The storage layer for the parent documents
    store = InMemoryStore()
    id_key = "doc_id"

    pc = pineC(api_key=os.environ["PINECONE_API_KEY"])

    index_name = "gaidorag"
    text_field = "text"
    cloud = 'aws'
    region = 'us-east-1'

    spec = ServerlessSpec(cloud=cloud, region=region)
    # Check whether the index already exists (it shouldn't on the first run)
    if index_name not in pc.list_indexes().names():
        # If it does not exist, create the index
        pc.create_index(
            index_name,
            dimension=1024,  # dimensionality of multilingual-e5-large-instruct embeddings
            metric='cosine',
            spec=spec
        )
    # Connect to the index for LangChain
    index = pc.Index(index_name)

    # The vectorstore used to index the summary (child) documents
    vectorstore = Pinecone(
        index, hf, text_field
    )

    # The retriever (empty to start)
    retriever = MultiVectorRetriever(
        vectorstore=vectorstore,
        docstore=store,
        id_key=id_key,
    )
    return retriever


def embed_docs(retriever, texts, text_summaries, tables, table_summaries):
    # Add texts
    id_key = "doc_id"
    doc_ids = [str(uuid.uuid4()) for _ in texts]
    summary_texts = [
        Document(page_content=summary, metadata={id_key: doc_ids[i]}) for i, summary in enumerate(text_summaries)
    ]
    retriever.vectorstore.add_documents(summary_texts)
    retriever.docstore.mset(list(zip(doc_ids, texts)))

    # Add tables
    table_ids = [str(uuid.uuid4()) for _ in tables]
    summary_tables = [
        Document(page_content=summary, metadata={id_key: table_ids[i]}) for i, summary in enumerate(table_summaries)
    ]
    retriever.vectorstore.add_documents(summary_tables)
    retriever.docstore.mset(list(zip(table_ids, tables)))


def process_docs(file_path):
    texts, tables = extract_pdf(file_path)
    texts, text_summaries, tables, table_summaries = summarize_data(texts, tables)
    retriever = create_vectorstore()
    embed_docs(retriever, texts, text_summaries, tables, table_summaries)
    return retriever
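Note: process_docs ties the module together (extract, summarize, index, return a MultiVectorRetriever). A hedged usage sketch, assuming a sample PDF already sits at docs/file.pdf and the GROQ and Pinecone keys are set in the environment; it is not part of the upload:

# Hypothetical standalone check of the ingestion pipeline.
from data_preprocessing import process_docs

retriever = process_docs("docs/file.pdf")
# Retrievers are Runnables in LangChain, so a similarity lookup is a plain invoke;
# the MultiVectorRetriever returns the parent chunks stored in the InMemoryStore.
docs = retriever.invoke("What is the policy start and expiry date?")
print(docs[:2])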
rag.py
ADDED
@@ -0,0 +1,112 @@
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_core.messages import SystemMessage, HumanMessage
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from base64 import b64decode
import os
from dotenv import load_dotenv

load_dotenv()


def parse_docs(docs):
    """Split retrieved documents into base64-encoded images and plain texts."""
    b64 = []
    text = []
    for doc in docs:
        try:
            b64decode(doc)
            b64.append(doc)
        except Exception:
            text.append(doc)
    return {"images": b64, "texts": text}


def build_prompt(kwargs):
    docs_by_type = kwargs["context"]
    user_question = kwargs["question"]

    # Extract text context
    context_text = ""
    if docs_by_type.get("texts"):
        for text_element in docs_by_type["texts"]:
            if isinstance(text_element, list):
                # Flatten nested lists before joining
                flat_text = " ".join(
                    " ".join(map(str, sub_element)) if isinstance(sub_element, list) else str(sub_element)
                    for sub_element in text_element
                )
                context_text += flat_text + "\n"
            else:
                context_text += str(text_element) + "\n"

    # Extract table context
    context_tables = ""
    if docs_by_type.get("tables"):
        for table in docs_by_type["tables"]:
            table_str = "\n".join([" | ".join(map(str, row)) for row in table])  # Convert table rows to strings
            context_tables += f"\nTable:\n{table_str}\n"

    # Construct prompt with context (including tables)
    prompt_template = f"""
    Answer the question based only on the following context, which includes text and tables.
    If you don't know the answer, just say that you don't know; don't try to make up an answer.
    Be specific.

    Context:
    {context_text}

    {context_tables}

    Question: {user_question}
    """

    prompt_content = [{"type": "text", "text": prompt_template}]

    # If images are provided, include them
    # if docs_by_type.get("images"):
    #     for image in docs_by_type["images"]:
    #         prompt_content.append(
    #             {
    #                 "type": "image_url",
    #                 "image_url": {"url": f"data:image/jpeg;base64,{image}"},
    #             }
    #         )

    return ChatPromptTemplate.from_messages(
        [
            HumanMessage(content=prompt_content),
        ]
    )


def create_rag_chain(retriever):
    chain = (
        {
            "context": retriever | RunnableLambda(parse_docs),
            "question": RunnablePassthrough(),
        }
        | RunnableLambda(build_prompt)
        | ChatGroq(model="llama-3.3-70b-versatile", api_key=os.environ["GROQ_API_KEY"])
        | StrOutputParser()
    )

    chain_with_sources = {
        "context": retriever | RunnableLambda(parse_docs),
        "question": RunnablePassthrough(),
    } | RunnablePassthrough().assign(
        response=(
            RunnableLambda(build_prompt)
            | ChatGroq(model="llama-3.3-70b-versatile", api_key=os.environ["GROQ_API_KEY"])
            | StrOutputParser()
        )
    )
    return chain, chain_with_sources


def invoke_chain(chain):
    response = chain.invoke(
        "What is the policy start and expiry date?"
    )
    return response
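Note: create_rag_chain returns two runnables: chain, which yields only the answer string, and chain_with_sources, which keeps the retrieved context alongside the answer. A sketch of how the two modules fit together, assuming a PDF was already saved to docs/file.pdf; not part of the upload:

# Hypothetical end-to-end run outside Streamlit.
from data_preprocessing import process_docs
from rag import create_rag_chain, invoke_chain

retriever = process_docs("docs/file.pdf")
chain, chain_with_sources = create_rag_chain(retriever)

# Plain answer string
print(invoke_chain(chain))

# chain_with_sources returns a dict with "context", "question" and "response" keys,
# because the assign() step adds the answer next to the passthrough inputs.
result = chain_with_sources.invoke("What is the policy start and expiry date?")
print(result["response"])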
requirements.txt
ADDED
@@ -0,0 +1,12 @@
pdfplumber
tiktoken
langchain
langchain-community
langchain-openai
langchain-groq
python-dotenv
langchain-huggingface
ragas
datasets
langchain-pinecone
streamlit
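Note: after pip install -r requirements.txt, the modules still expect GROQ_API_KEY and PINECONE_API_KEY, read via os.environ after load_dotenv(), so a .env file or exported shell variables are needed. A small hypothetical pre-flight check (not part of the upload):

# check_env.py: hypothetical helper that verifies the two keys the code reads.
import os
from dotenv import load_dotenv

load_dotenv()
missing = [k for k in ("GROQ_API_KEY", "PINECONE_API_KEY") if not os.getenv(k)]
if missing:
    raise SystemExit(f"Missing environment variables: {', '.join(missing)}")
print("Environment OK. Launch the app with: streamlit run app.py")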