# Import modules and classes
from llama_index.core import VectorStoreIndex, StorageContext, load_index_from_storage
from langchain_nvidia_ai_endpoints import ChatNVIDIA, NVIDIAEmbeddings, NVIDIARerank
from llama_index.core.indices.query.query_transform import HyDEQueryTransform
from llama_index.core.query_engine import TransformQueryEngine
from langchain_core.documents import Document as LangDocument
from llama_index.core import Document as LlamaDocument
from llama_index.core import Settings
from llama_parse import LlamaParse
import streamlit as st
import os

# Read API keys from environment variables
nvidia_api_key = os.getenv("NVIDIA_KEY")
llamaparse_api_key = os.getenv("PARSE_KEY")

# Initialize ChatNVIDIA, NVIDIARerank, and NVIDIAEmbeddings
client = ChatNVIDIA(
    model="meta/llama-3.1-8b-instruct",
    api_key=nvidia_api_key,
    temperature=0.2,
    top_p=0.7,
    max_tokens=1024
)

embed_model = NVIDIAEmbeddings(
    model="nvidia/nv-embedqa-e5-v5",
    api_key=nvidia_api_key,
    truncate="NONE"
)

reranker = NVIDIARerank(
    model="nvidia/nv-rerankqa-mistral-4b-v3",
    api_key=nvidia_api_key,
)

# Set the NVIDIA models globally
Settings.embed_model = embed_model
Settings.llm = client

# Parse the local PDF document
parser = LlamaParse(
    api_key=llamaparse_api_key,
    result_type="markdown",
    verbose=True
)

documents = parser.load_data("C:\\Users\\user\\Documents\\Jan 2024\\Projects\\RAGs\\Files\\PhilDataset.pdf")
print("Document Parsed")

# Split parsed text into chunks sized for the embedding model
def split_text(text, max_tokens=512):
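    """Greedily pack whitespace-separated words into chunks, using character
    count as a rough proxy for token length so each chunk stays within the
    embedding model's input limit."""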
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0

    for word in words:
        word_length = len(word)
        if current_length + word_length + 1 > max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk = [word]
            current_length = word_length + 1
        else:
            current_chunk.append(word)
            current_length += word_length + 1

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

# Generate embeddings for document chunks
all_embeddings = []
all_documents = []

for doc in documents:
    text_chunks = split_text(doc.text)
    for chunk in text_chunks:
        embedding = embed_model.embed_query(chunk)
        all_embeddings.append(embedding)
        all_documents.append(LlamaDocument(text=chunk))
print("Embeddings generated")

# Create and persist index with NVIDIAEmbeddings
index = VectorStoreIndex.from_documents(all_documents, embeddings=all_embeddings, embed_model=embed_model)
index.set_index_id("vector_index")
index.storage_context.persist("./storage")
print("Index created")

# Load index from storage
storage_context = StorageContext.from_defaults(persist_dir="storage")
index = load_index_from_storage(storage_context, index_id="vector_index")
print("Index loaded")

# Initialize HyDEQueryTransform and TransformQueryEngine
hyde = HyDEQueryTransform(include_original=True)
query_engine = index.as_query_engine()
hyde_query_engine = TransformQueryEngine(query_engine, hyde)

# Query the index with HyDE and use output as LLM context
def query_model_with_context(question):
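    """Answer a question via the HyDE + rerank RAG pipeline: query the
    HyDE-transformed engine, retrieve and rerank supporting chunks, then
    stream an answer from the Llama 3.1 model with the top chunk as context."""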
    # Run the question through the HyDE-transformed query engine
    hyde_response = hyde_query_engine.query(question)
    print(f"HyDE Response: {hyde_response}")

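    # The query engine may return a plain string or a Response object; extract the text either way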
    if isinstance(hyde_response, str):
        hyde_query = hyde_response
    else:
        hyde_query = hyde_response.response

    # Use the HyDE response text as the retrieval query
    retriever = index.as_retriever(similarity_top_k=3)
    nodes = retriever.retrieve(hyde_query)

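    # Print each retrieved node for inspection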
    for node in nodes:
        print(node)

    # Rerank the retrieved documents
    ranked_documents = reranker.compress_documents(
        query=question,
        documents=[LangDocument(page_content=node.text) for node in nodes]
    )

    # Print the most relevant node
    print(f"Most relevant node: {ranked_documents[0].page_content}")

    # Use the most relevant node as context
    context = ranked_documents[0].page_content

    # Send context and question to the client (NVIDIA Llama 3.1 8B model)
    messages = [
        {"role": "system", "content": context},
        {"role": "user", "content": str(question)}
    ]
    completion = client.stream(messages)
    
    # Process response
    response_text = ""
    for chunk in completion:
        if chunk.content is not None:
            response_text += chunk.content
    
    return response_text


# Streamlit UI
st.title("Chat with HyDE + Rerank RAG")
question = st.text_input("Enter your question:")

if st.button("Submit"):
    if question:
        st.write("**RAG Response:**")
        response = query_model_with_context(question)
        st.write(response)
    else:
        st.warning("Please enter a question.")
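
# To launch the interface (assuming this script is saved as app.py):
#   streamlit run app.py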