"""Multi-vector RAG ingestion: extract text and tables from a PDF with
pdfplumber, summarize each chunk with a Groq-hosted Llama model, index the
summaries in Pinecone, and keep the full chunks in an in-memory docstore
behind a LangChain MultiVectorRetriever."""
import os
import uuid

import pdfplumber
from dotenv import load_dotenv
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.documents import Document
from langchain.storage import InMemoryStore
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec

load_dotenv()

def extract_pdf(file_path):
    """Extract the text and tables of every page in a PDF."""
    texts = []
    tables = []
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            # extract_text() returns None for pages without a text layer
            text = page.extract_text()
            if text:
                texts.append(text)
            # Collect each table on the page as its own entry
            page_tables = page.extract_tables()
            if page_tables:
                tables.extend(page_tables)
    return texts, tables

def summarize_data(texts, tables):
    prompt_text = """
    You are an assistant tasked with summarizing tables and text.

    Give a concise summary, at most two sentences, that accurately describes
    the table or text chunk.

    Respond only with the summary, no additional comment.
    Do not start your message by saying "Here is a summary" or anything like that.
    Just give the summary as it is.

    Table or text chunk: {element}
    """
    prompt = ChatPromptTemplate.from_template(prompt_text)

    # Summarization chain: element -> prompt -> Groq LLM -> plain string
    model = ChatGroq(
        temperature=0,
        model="llama-3.1-8b-instant",
        api_key=os.environ["GROQ_API_KEY"],
    )
    summarize_chain = {"element": lambda x: x} | prompt | model | StrOutputParser()
    # Summarize extracted text
    text_summaries = []
    if texts:
        text_summaries = summarize_chain.batch(texts, {"max_concurrency": 5})

    # Summarize extracted tables (stringify the raw cell lists first)
    tables_as_text = [str(table) for table in tables]
    table_summaries = []
    if tables_as_text:
        table_summaries = summarize_chain.batch(tables_as_text, {"max_concurrency": 5})
    return texts, text_summaries, tables, table_summaries

def create_vectorstore():
    """Build an empty MultiVectorRetriever backed by Pinecone and an in-memory docstore."""
    model_name = "intfloat/multilingual-e5-large-instruct"
    model_kwargs = {"device": "cpu"}
    encode_kwargs = {"normalize_embeddings": False}
    hf = HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs=model_kwargs,
        encode_kwargs=encode_kwargs,
    )
    # Storage layer for the full (parent) documents; Pinecone will hold the
    # vectors of the child summaries
    store = InMemoryStore()
    id_key = "doc_id"

    pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

    index_name = "gaidorag"
    text_field = "text"
    cloud = "aws"
    region = "us-east-1"

    spec = ServerlessSpec(cloud=cloud, region=region)
    # Create the serverless index on first run; skip if it already exists
    if index_name not in pc.list_indexes().names():
        pc.create_index(
            index_name,
            dimension=1024,  # embedding size of multilingual-e5-large-instruct
            metric="cosine",
            spec=spec,
        )
    index = pc.Index(index_name)

    vectorstore = PineconeVectorStore(index=index, embedding=hf, text_key=text_field)

    # The retriever (empty to start): summary vectors in Pinecone point back
    # to the full documents in the docstore via id_key
    retriever = MultiVectorRetriever(
        vectorstore=vectorstore,
        docstore=store,
        id_key=id_key,
    )
    return retriever

def embed_docs(retriever, texts, text_summaries, tables, table_summaries):
    id_key = "doc_id"
    # Index the text summaries; each one links back to its full text via doc_id
    doc_ids = [str(uuid.uuid4()) for _ in texts]
    summary_texts = [
        Document(page_content=summary, metadata={id_key: doc_ids[i]})
        for i, summary in enumerate(text_summaries)
    ]
    retriever.vectorstore.add_documents(summary_texts)
    retriever.docstore.mset(list(zip(doc_ids, texts)))

    # Index the table summaries; each one links back to its raw table via doc_id
    table_ids = [str(uuid.uuid4()) for _ in tables]
    summary_tables = [
        Document(page_content=summary, metadata={id_key: table_ids[i]})
        for i, summary in enumerate(table_summaries)
    ]
    retriever.vectorstore.add_documents(summary_tables)
    retriever.docstore.mset(list(zip(table_ids, tables)))


def process_docs(file_path):
    """End-to-end ingestion: extract, summarize, and index a PDF; returns the retriever."""
    texts, tables = extract_pdf(file_path)
    texts, text_summaries, tables, table_summaries = summarize_data(texts, tables)
    retriever = create_vectorstore()
    embed_docs(retriever, texts, text_summaries, tables, table_summaries)
    return retriever
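

# Minimal usage sketch, not part of the pipeline above. Assumptions: the path
# "sample.pdf" is a placeholder, and GROQ_API_KEY / PINECONE_API_KEY are set in
# the environment or a .env file.
if __name__ == "__main__":
    retriever = process_docs("sample.pdf")
    # The retriever searches the summary vectors in Pinecone, then returns the
    # matching full texts/tables from the in-memory docstore.
    for doc in retriever.invoke("What does the first table describe?"):
        print(doc)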