File size: 8,483 Bytes
6df5c93
 
 
75495ad
6df5c93
42309dc
 
 
 
6df5c93
6c87654
57b271f
21b7541
57b271f
d35cd40
2e1563c
 
6df5c93
 
 
 
 
 
 
 
2bbf094
 
6df5c93
b4050b2
 
6df5c93
 
 
 
 
 
 
466808a
6df5c93
 
 
 
 
 
 
 
 
f79e678
4c7739f
57b271f
 
 
 
 
34426fc
 
 
6df5c93
 
 
 
 
 
9ed2e92
 
 
 
 
 
 
 
 
 
6df5c93
 
9ed2e92
0fdd155
6288d92
df02851
eff8daf
c106d4f
ced0582
8aebf77
 
6df5c93
 
 
 
 
326b887
db1cea6
f49831c
db1cea6
 
 
 
 
 
1b8b321
 
db1cea6
f49831c
326b887
 
 
db1cea6
 
dfa472e
db1cea6
f9083b7
db1cea6
1b8b321
db1cea6
dfa472e
db1cea6
 
 
 
 
99bb0aa
fbe4134
 
 
 
 
db1cea6
fbe4134
 
 
8cee1bc
fbe4134
 
 
 
7efc081
fbe4134
 
 
 
99bb0aa
fbe4134
 
 
99bb0aa
fbe4134
99bb0aa
6df5c93
 
bcf5ba4
8917e60
2172305
 
5b2e654
 
 
2172305
9e24330
6089bfa
 
 
9e24330
 
0c35020
9e24330
 
0c35020
9e24330
 
0c35020
7d312dc
 
b700892
2c0ea57
9f52dc4
2c0ea57
93e3091
 
ebb0364
6df5c93
 
 
 
 
 
 
 
 
 
 
506afb0
 
 
 
6df5c93
 
499e447
6df5c93
 
499e447
6df5c93
 
1afdee3
831abbd
 
1afdee3
831abbd
1afdee3
417adb9
40be4b1
1afdee3
 
 
 
 
 
 
 
 
 
 
8fde75c
 
 
1afdee3
 
 
 
 
 
 
 
 
499e447
 
1afdee3
 
 
8c715b2
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
import os
import json
import gradio as gr

from huggingface_hub import HfApi, login
from PyPDF2 import PdfReader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_groq import ChatGroq
from dotenv import load_dotenv
from langchain.docstore.document import Document
from langchain.schema import Document

from chunk_python_code import chunk_python_code_with_metadata
from vectorstore import get_chroma_vectorstore
from download_repo_to_huggingface import download_and_upload_kadiAPY_repo_to_huggingfacespace
from process_repo_zipfile import extract_files_and_filepath_from_dir

# Load environment variables from .env file
load_dotenv()

# Load configuration from the JSON files. JSON text is UTF-8, so the encoding
# is stated explicitly instead of being left to the platform's locale default.
with open("config.json", encoding="utf-8") as config_file:
    config = json.load(config_file)

with open("config2.json", "r", encoding="utf-8") as file:
    config2 = json.load(file)

# Settings taken from config.json. A missing key raises KeyError while the
# module is being imported.
PERSIST_DOC_DIRECTORY = config["persist_doc_directory"]
PERSIST_CODE_DIRECTORY = config["persist_code_directory"]
CHUNK_SIZE = config["chunk_size"]
CHUNK_OVERLAP = config["chunk_overlap"]
EMBEDDING_MODEL_NAME = config["embedding_model"]
LLM_MODEL_NAME = config["llm_model"]
LLM_TEMPERATURE = config["llm_temperature"]
GITLAB_API_URL = config["gitlab_api_url"]
HF_SPACE_NAME = config["hf_space_name"]
DATA_DIR = config["data_dir"]

# Secrets are read from the process environment (load_dotenv above is called
# first). Both are required: os.environ[...] raises KeyError if a variable is
# unset. Note the mixed-case variable name "HF_Token".
GROQ_API_KEY = os.environ["GROQ_API_KEY"]
HF_TOKEN = os.environ["HF_Token"]

# Log in to the Hugging Face Hub with the token and create an API client.
login(HF_TOKEN)
api = HfApi()



def split_python_code_into_chunks(texts, file_paths):
    """Chunk every source text and return all chunks as one flat list.

    ``texts`` and ``file_paths`` are walked in lockstep. Each pair is passed
    to ``chunk_python_code_with_metadata`` and the chunks it returns are
    concatenated in input order. If the two arguments differ in length, the
    surplus items of the longer one are ignored.
    """
    return [
        piece
        for source_text, source_path in zip(texts, file_paths)
        for piece in chunk_python_code_with_metadata(source_text, source_path)
    ]


# Split text into chunks
def split_into_chunks(texts, references, chunk_size, chunk_overlap):
    """Split each text into pieces and wrap every piece in a ``Document``.

    A single ``RecursiveCharacterTextSplitter`` configured with ``chunk_size``
    and ``chunk_overlap`` is applied to every text. ``texts`` and
    ``references`` are paired positionally; each resulting ``Document``
    carries its text's reference under the ``"source"`` metadata key and the
    fixed value ``"doc"`` under ``"usage"``.

    Returns the documents as one flat list, in input order.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    documents = []
    for body, origin in zip(texts, references):
        for fragment in splitter.split_text(body):
            documents.append(
                Document(
                    page_content=fragment,
                    metadata={"source": origin, "usage": "doc"},
                )
            )
    return documents


# Setup Vectorstore
def embed_documents_into_vectorstore(chunks, model_name, persist_directory):
    """Add ``chunks`` to a Chroma vector store and return that store.

    The store is obtained from ``get_chroma_vectorstore`` using a
    ``HuggingFaceEmbeddings`` model named ``model_name`` and the given
    ``persist_directory``.
    """
    print("Start setup_vectorstore_function")
    store = get_chroma_vectorstore(
        HuggingFaceEmbeddings(model_name=model_name),
        persist_directory,
    )
    store.add_documents(chunks)
    return store

# Setup LLM
def setup_llm(model_name, temperature, api_key):
    """Create and return a ``ChatGroq`` client for the given model settings."""
    return ChatGroq(model=model_name, temperature=temperature, api_key=api_key)


def format_kadi_apy_library_context(docs):
    """Render retrieved code documents as commented text blocks.

    Each document becomes one string: three ``#`` header lines (source, class,
    type) taken from its metadata, then the page content, then three newlines.
    Missing metadata keys fall back to placeholder strings. The page content
    of every document is also printed to stdout.

    Returns the rendered strings as a list, in input order.
    """
    rendered = []

    for document in docs:
        meta = document.metadata
        cls_name = meta.get("class", "Unknown Class")
        kind = meta.get("type", "Unknown Type")
        origin = meta.get("source", "Unknown Type")

        print(":}\n\n", document.page_content)

        block_lines = (
            f"# source: {origin}",
            f"# class: {cls_name}",
            f"# type: {kind}",
            f"{document.page_content}",
        )
        rendered.append("\n".join(block_lines) + "\n\n\n")

    return rendered


def format_kadi_api_doc_context(docs):
    """Render retrieved documentation chunks as commented text blocks.

    Each document becomes one string: a ``# source:`` header line taken from
    its metadata (with a placeholder when the key is missing), then the page
    content, then three newlines. The page content of every document is also
    printed to stdout.

    Returns the rendered strings as a list, in input order.
    """
    rendered = []

    for document in docs:
        origin = document.metadata.get("source", "Unknown Type")
        print(":}\n\n", document.page_content)
        header = f"# source: {origin}"
        rendered.append("\n".join((header, f"{document.page_content}")) + "\n\n\n")

    return rendered


                   
def rag_workflow(query):
    """
    Run the complete RAG workflow for one query and return the response.

    Reads the module-level ``llm`` and ``vector_store``, which must already be
    set when this is called. Every intermediate result is printed to stdout.
    """
    # NOTE(review): RAGChain is not imported in the visible part of this file;
    # confirm it is in scope before this function runs.
    chain = RAGChain(llm, vector_store)

    # 1) Decide which library usage the query is about.
    predicted_usage = chain.predict_library_usage(query)
    print(f"Predicted library usage: {predicted_usage}")

    # 2) Fetch matching documentation and code snippets.
    raw_docs, raw_code = chain.retrieve_contexts(query, predicted_usage)
    print("Retrieved Document Contexts:", raw_docs)
    print("Retrieved Code Contexts:", raw_code)

    # 3) Turn the retrieved items into prompt-ready context.
    docs_for_prompt, code_for_prompt = chain.format_context(raw_docs, raw_code)
    print("Formatted Document Contexts:", docs_for_prompt)
    print("Formatted Code Contexts:", code_for_prompt)

    # 4) Produce the answer.
    answer = chain.generate_response(query, docs_for_prompt, code_for_prompt)
    print("Generated Response:", answer)

    return answer


def initialize():
    """Prepare the module-level ``vector_store`` and ``llm``.

    Steps, in order:
      1. Call ``download_and_upload_kadiAPY_repo_to_huggingfacespace`` with
         the GitLab settings from ``config2``.
      2. Read the files under ``DATA_DIR`` for the ``kadi_apy`` and
         ``docs/source/`` selections via
         ``extract_files_and_filepath_from_dir``.
      3. Chunk the code with ``split_python_code_into_chunks`` and the docs
         with ``split_into_chunks``.
      4. Embed doc chunks followed by code chunks into one vector store
         persisted at ``f"{DATA_DIR}/test"``.
      5. Create the LLM client with ``setup_llm``.

    Returns nothing; the results are published through the two globals.
    """
    # Only the names actually assigned below are declared global.
    global vector_store, llm

    download_and_upload_kadiAPY_repo_to_huggingfacespace(
        api_url=config2["gitlab"]["api_url"],
        project_id=config2["gitlab"]["project"]["id"],
        version=config2["gitlab"]["project"]["version"],
    )

    # The meaning of the two list arguments is defined by the helper itself.
    code_texts, code_references = extract_files_and_filepath_from_dir(DATA_DIR, ['kadi_apy'], [])
    doc_texts, kadiAPY_doc_references = extract_files_and_filepath_from_dir(DATA_DIR, ['docs/source/'], [])

    print("LEEEEEEEEEEEENGTH of code_texts: ", len(code_texts))
    print("LEEEEEEEEEEEENGTH of doc_files: ", len(doc_texts))

    code_chunks = split_python_code_into_chunks(code_texts, code_references)
    doc_chunks = split_into_chunks(doc_texts, kadiAPY_doc_references, CHUNK_SIZE, CHUNK_OVERLAP)

    print(f"Total number of code_chunks: {len(code_chunks)}")
    print(f"Total number of doc_chunks: {len(doc_chunks)}")

    # Documentation and code chunks share a single store.
    filename = "test"
    vector_store = embed_documents_into_vectorstore(doc_chunks + code_chunks, EMBEDDING_MODEL_NAME, f"{DATA_DIR}/{filename}")
    llm = setup_llm(LLM_MODEL_NAME, LLM_TEMPERATURE, GROQ_API_KEY)
          
# Runs at import time: assigns the module-level `vector_store` and `llm`
# that rag_workflow() reads.
initialize()


# Gradio utils
def check_input_text(text):
    """Validate that the user entered something.

    Returns ``True`` for any truthy ``text``. For empty input it shows a
    Gradio warning and raises ``TypeError``.
    """
    if text:
        return True

    gr.Warning("Please input a question.")
    raise TypeError

def add_text(history, text):
    """Yield the chat history with the new question appended, plus ``""``.

    The question is added as a ``(text, None)`` pair, leaving the answer slot
    empty. The input ``history`` list is not modified; a new list is built.
    """
    pending_turn = (text, None)
    extended_history = history + [pending_turn]
    yield extended_history, ""


import gradio as gr


def bot_kadi(history):
    """Answer the most recent question in ``history`` and yield the history.

    The question is the first element of the last history entry. That entry
    is replaced in place with a ``(question, answer)`` tuple, where the answer
    comes from ``rag_workflow``.
    """
    latest_turn = history[-1]
    question = latest_turn[0]
    history[-1] = (question, rag_workflow(question))

    yield history

def main():
    """Build the Gradio chat interface and launch it."""
    example_questions = [
        "Who is working on Kadi4Mat?",
        "How do i install the Kadi-Apy library?",
        "How do i install the Kadi-Apy library for development?",
        "I need a method to upload a file to a record",
    ]

    with gr.Blocks() as demo:
        gr.Markdown("## KadiAPY - AI Coding-Assistant")
        gr.Markdown("AI assistant for KadiAPY based on RAG architecture powered by LLM")

        with gr.Tab("KadiAPY - AI Assistant"):
            with gr.Row():
                with gr.Column(scale=10):
                    chatbot = gr.Chatbot(
                        [],
                        elem_id="chatbot",
                        label="Kadi Bot",
                        bubble_full_width=False,
                        show_copy_button=True,
                        height=600,
                    )
                    user_txt = gr.Textbox(
                        label="Question",
                        placeholder="Type in your question and press Enter or click Submit",
                    )

                    with gr.Row():
                        with gr.Column(scale=1):
                            submit_btn = gr.Button("Submit", variant="primary")
                        with gr.Column(scale=1):
                            clear_btn = gr.Button("Clear", variant="stop")

                    gr.Examples(
                        examples=example_questions,
                        inputs=user_txt,
                        outputs=chatbot,
                        fn=add_text,
                        label="Try asking...",
                        cache_examples=False,
                        examples_per_page=3,
                    )

            # Pressing Enter in the textbox and clicking Submit run the same
            # pipeline: validate -> append the question -> generate the answer.
            for register_event in (user_txt.submit, submit_btn.click):
                validated = register_event(check_input_text, user_txt, None)
                appended = validated.success(add_text, [chatbot, user_txt], [chatbot, user_txt])
                appended.then(bot_kadi, [chatbot], [chatbot])

            # Clear resets the chatbot by returning None for it.
            clear_btn.click(lambda: None, None, chatbot, queue=False)

    demo.launch()

    
# Start the UI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()