Spaces:

bupa1018
/

KadiAPY_Coding_Assistant

Sleeping

File size: 8,725 Bytes

6df5c93
 
 
75495ad
6df5c93
42309dc
 
6df5c93
6c87654
57b271f
21b7541
57b271f
d35cd40
a9e4372
624df56
1c96354
fcfb36c
6df5c93
 
 
 
 
 
ae5beeb
2bbf094
6df5c93
 
 
 
 
ae5beeb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6df5c93
 
 
 
f79e678
4c7739f
57b271f
 
 
807a415
 
 
 
 
 
 
 
 
 
 
 
 
57b271f
 
34426fc
 
 
6df5c93
 
 
 
 
 
9ed2e92
 
 
 
 
 
 
 
 
 
6df5c93
 
9ed2e92
0fdd155
6288d92
df02851
eff8daf
c106d4f
ced0582
8aebf77
 
99bb0aa
fbe4134
 
 
 
 
db1cea6
01ba6b0
 
 
 
475d7f4
 
83510c8
475d7f4
1faad78
c2bc215
01ba6b0
 
 
 
00668f3
339ddfc
a7e724a
def348c
 
7efc081
075591d
 
01ba6b0
 
 
 
475d7f4
 
075591d
 
01ba6b0
 
 
 
475d7f4
fbe4134
99bb0aa
fbe4134
99bb0aa
6df5c93
 
bcf5ba4
8917e60
2172305
 
5b2e654
 
 
2172305
9e24330
624df56
2f8bffe
624df56
6089bfa
9e24330
 
0c35020
9e24330
1e6df24
0c35020
9e24330
 
0c35020
ac1c4a0
ae5beeb
2c0ea57
9f52dc4
2c0ea57
93e3091
 
ebb0364
6df5c93
 
 
 
 
 
 
 
 
 
 
506afb0
 
 
 
6df5c93
 
499e447
6df5c93
 
499e447
6df5c93
 
1afdee3
831abbd
 
1afdee3
831abbd
1afdee3
417adb9
40be4b1
1afdee3
 
 
 
 
 
 
 
 
 
 
8fde75c
 
 
1afdee3
 
 
 
 
 
 
 
 
499e447
 
1afdee3
 
 
8c715b2

import os
import json
import gradio as gr

from huggingface_hub import HfApi, login
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv
from langchain.docstore.document import Document
from langchain.schema import Document

from chunk_python_code import chunk_python_code_with_metadata
from vectorstore import get_chroma_vectorstore
from download_repo_to_huggingface import download_gitlab_repo_to_hfspace
from process_repo_zipfile import extract_repo_files
from ragchain import RAGChain
from llm import get_groq_llm

# Load environment variables (API keys / tokens) from a local .env file.
load_dotenv()

# Load the application configuration. The JSON document is nested, e.g.
# config2["gitlab"]["project"]["id"], so values are read through the
# path-walking helper below rather than a flat ``dict.get``.
with open("config.json", "r", encoding="utf-8") as file:
    config2 = json.load(file)


def _config_value(*keys):
    """Return the config value reached by following *keys* through ``config2``.

    Behaves like ``dict.get`` for nested paths: if any key along the path is
    missing, or an intermediate value is not a dict, ``None`` is returned
    instead of raising.
    """
    node = config2
    for key in keys:
        if not isinstance(node, dict) or key not in node:
            return None
        node = node[key]
    return node


# Secrets must be present in the environment; a missing one raises KeyError
# at import time.
GROQ_API_KEY = os.environ["GROQ_API_KEY"]
HF_TOKEN = os.environ["HF_Token"]


VECTORSTORE_DIRECTORY = _config_value("vectorstore_directory")
CHUNK_SIZE = _config_value("chunking", "chunk_size")
CHUNK_OVERLAP = _config_value("chunking", "chunk_overlap")
EMBEDDING_MODEL_NAME = _config_value("embedding_model", "name")
LLM_MODEL_NAME = _config_value("llm_model", "name")
LLM_MODEL_TEMPERATURE = _config_value("llm_model", "temperature")
GITLAB_API_URL = _config_value("gitlab", "api_url")
GITLAB_PROJECT_ID = _config_value("gitlab", "project", "id")
GITLAB_PROJECT_VERSION = _config_value("gitlab", "project", "version")
DATA_DIR = _config_value("data_dir")
HF_SPACE_NAME = _config_value("hf_space_name")
DOCS_FOLDER = _config_value("usage", "docs", "folder")
DOCS_FILE = _config_value("usage", "docs", "file")
KADI_APY_FOLDER = _config_value("usage", "kadi_apy", "folder")
KADI_APY_FILE = _config_value("usage", "kadi_apy", "file")


# Authenticate against the Hugging Face Hub and create the API client.
login(HF_TOKEN)
api = HfApi()



def split_python_code_into_chunks(texts, file_paths):
    """Split Python source texts into chunks via the custom code splitter.

    Each ``(text, file_path)`` pair is handed to
    ``chunk_python_code_with_metadata`` and the resulting chunks are
    collected into one flat list.

    According to the original author's notes, that splitter iterates through
    the child nodes of the AST (max child depth = 2), aims to keep the full
    body of a method together with its signature (decorators included) in a
    single chunk, and adds method-specific metadata such as:

    * visibility: public, _internal
    * type: "class", "methods", "command" (CLI commands)
    * source

    The metadata is intended to be used as a filter when retrieving
    potentially useful snippets.

    Args:
        texts: Iterable of Python source strings.
        file_paths: Iterable of file paths, paired positionally with *texts*.

    Returns:
        A list with the chunks of all files, in input order.
    """
    chunks = []
    for text, file_path in zip(texts, file_paths):
        chunks.extend(chunk_python_code_with_metadata(text, file_path))
    return chunks


# Split text into chunks
def split_into_chunks(texts, references, chunk_size, chunk_overlap):
    """Split documentation texts into ``Document`` chunks.

    Every text is cut with a ``RecursiveCharacterTextSplitter`` configured
    with *chunk_size* and *chunk_overlap*. Each piece becomes a ``Document``
    whose metadata records the paired reference as ``"source"`` and marks
    the chunk with ``"usage": "doc"``.

    Args:
        texts: Iterable of raw text strings.
        references: Iterable of source references, paired positionally
            with *texts*.
        chunk_size: Chunk size passed to the splitter.
        chunk_overlap: Chunk overlap passed to the splitter.

    Returns:
        A flat list of ``Document`` objects in input order.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    documents = []
    for text, reference in zip(texts, references):
        for piece in splitter.split_text(text):
            doc_metadata = {"source": reference, "usage": "doc"}
            documents.append(Document(page_content=piece, metadata=doc_metadata))
    return documents


# Setup Vectorstore
def embed_documents_into_vectorstore(chunks, model_name, persist_directory):
    """Add *chunks* to a Chroma vector store and return the store.

    Args:
        chunks: Documents to add to the store.
        model_name: Name passed to ``HuggingFaceEmbeddings``.
        persist_directory: Directory passed on to ``get_chroma_vectorstore``.

    Returns:
        The vector store after ``add_documents(chunks)`` has been called.
    """
    print("Start setup_vectorstore_function")
    store = get_chroma_vectorstore(
        HuggingFaceEmbeddings(model_name=model_name),
        persist_directory,
    )
    store.add_documents(chunks)
    return store

def rag_workflow(query):
    """Run the complete RAG workflow for *query* and return the response.

    Uses the module-level ``llm`` and ``vector_store`` objects, which must
    already be initialized when this function is called.

    Args:
        query: The user's question.

    Returns:
        Whatever ``RAGChain.generate_response`` returns for the query and
        the retrieved, formatted contexts.
    """
    chain = RAGChain(llm, vector_store)

    # --- Pre-retrieval stage -------------------------------------------
    # Predict which Python library to search in: the (standard)
    # kadiAPY library or the kadiAPY CLI library.
    predicted_usage = chain.predict_library_usage(query)
    print(f"Predicted library usage: {predicted_usage}")

    rewritten = chain.rewrite_query(query)
    print(f"\n\n Rewritten query: {rewritten}\n\n")

    # --- Retrieval stage -----------------------------------------------
    # Documentation is searched with the original query; code is searched
    # with the rewritten query, filtered by the predicted library usage.
    doc_hits = chain.retrieve_contexts(query, k=5, filter={"usage": "doc"})
    code_hits = chain.retrieve_contexts(
        str(rewritten.content),
        k=3,
        filter={"usage": predicted_usage},
    )

    print("Retrieved Document Contexts:", doc_hits)
    print("Retrieved Code Contexts:", code_hits)

    # --- Pre-generation stage ------------------------------------------
    # Add each document's metadata to the retrieved content
    # (docs and code snippets).
    doc_context = chain.format_documents(doc_hits)
    code_context = chain.format_documents(code_hits)

    # --- Generation stage ----------------------------------------------
    answer = chain.generate_response(query, doc_context, code_context)
    print("Generated Response:", answer)

    return answer


def initialize():
    """Prepare the global vector store and LLM used by ``rag_workflow``.

    Steps:
        1. Download the configured GitLab repository version.
        2. Extract code and documentation files from ``DATA_DIR``.
        3. Chunk both sets of files.
        4. Embed all chunks into the vector store.
        5. Create the Groq LLM client.

    Side effects:
        Assigns the module-level names ``vector_store`` and ``llm``.
    """
    global vector_store, llm

    # The helper imported at the top of this file is
    # ``download_gitlab_repo_to_hfspace``; the name called here before was
    # not defined anywhere in the module and raised NameError.
    # NOTE(review): keyword names are kept from the original call -- confirm
    # they match the helper's signature.
    download_gitlab_repo_to_hfspace(
        api_url=config2["gitlab"]["api_url"],
        project_id=config2["gitlab"]["project"]["id"],
        version=config2["gitlab"]["project"]["version"],
    )

    code_texts, code_references = extract_repo_files(DATA_DIR, ['kadi_apy'], [])
    # NOTE(review): docs are extracted with an empty folder list; an earlier
    # variant of this line restricted extraction to 'docs/source/' -- confirm
    # which behaviour is intended.
    doc_texts, doc_references = extract_repo_files(DATA_DIR, [], [])

    print("LEEEEEEEEEEEENGTH of code_texts: ", len(code_texts))
    print("LEEEEEEEEEEEENGTH of doc_files: ", len(doc_texts))

    code_chunks = split_python_code_into_chunks(code_texts, code_references)
    doc_chunks = split_into_chunks(doc_texts, doc_references, CHUNK_SIZE, CHUNK_OVERLAP)

    print(f"Total number of code_chunks: {len(code_chunks)}")
    print(f"Total number of doc_chunks: {len(doc_chunks)}")

    vector_store = embed_documents_into_vectorstore(
        doc_chunks + code_chunks,
        EMBEDDING_MODEL_NAME,
        VECTORSTORE_DIRECTORY,
    )
    llm = get_groq_llm(LLM_MODEL_NAME, LLM_MODEL_TEMPERATURE, GROQ_API_KEY)


# Build the vector store and LLM once, at import time, before the UI starts.
initialize()


# Gradio utils
def check_input_text(text):
    """Validate that the user entered a question.

    Args:
        text: Content of the question textbox.

    Returns:
        ``True`` when *text* is truthy.

    Raises:
        TypeError: When *text* is empty/falsy, after a Gradio warning has
            been issued. Raising stops the ``.success(...)`` event chain.
    """
    if text:
        return True

    gr.Warning("Please input a question.")
    raise TypeError

def add_text(history, text):
    """Append the user's message to the chat history and clear the textbox.

    Yields a single ``(new_history, "")`` pair: a new history list with
    ``(text, None)`` appended (the answer slot is still empty) and an empty
    string for the textbox. The *history* list passed in is not mutated.
    """
    pending_turn = (text, None)
    yield history + [pending_turn], ""


import gradio as gr


def bot_kadi(history):
    """Answer the latest user message and write the reply into the history.

    Reads the question from the last history entry, runs ``rag_workflow``
    on it, replaces that entry in place with ``(question, response)`` and
    yields the updated history once.
    """
    question = history[-1][0]
    history[-1] = (question, rag_workflow(question))
    yield history

def main():
    """Build the Gradio chat interface for the KadiAPY assistant and launch it."""
    example_questions = [
        "Who is working on Kadi4Mat?",
        "How do i install the Kadi-Apy library?",
        "How do i install the Kadi-Apy library for development?",
        "I need a method to upload a file to a record",
    ]

    with gr.Blocks() as demo:
        gr.Markdown("## KadiAPY - AI Coding-Assistant")
        gr.Markdown("AI assistant for KadiAPY based on RAG architecture powered by LLM")

        with gr.Tab("KadiAPY - AI Assistant"):
            with gr.Row():
                with gr.Column(scale=10):
                    chatbot = gr.Chatbot(
                        [],
                        elem_id="chatbot",
                        label="Kadi Bot",
                        bubble_full_width=False,
                        show_copy_button=True,
                        height=600,
                    )
                    user_txt = gr.Textbox(
                        label="Question",
                        placeholder="Type in your question and press Enter or click Submit",
                    )

                    with gr.Row():
                        with gr.Column(scale=1):
                            submit_btn = gr.Button("Submit", variant="primary")
                        with gr.Column(scale=1):
                            clear_btn = gr.Button("Clear", variant="stop")

                    gr.Examples(
                        examples=example_questions,
                        inputs=user_txt,
                        outputs=chatbot,
                        fn=add_text,
                        label="Try asking...",
                        cache_examples=False,
                        examples_per_page=3,
                    )

            # Pressing Enter in the textbox and clicking "Submit" trigger the
            # same chain: validate input -> append the question to the chat
            # -> generate the answer.
            for register in (user_txt.submit, submit_btn.click):
                (
                    register(check_input_text, user_txt, None)
                    .success(add_text, [chatbot, user_txt], [chatbot, user_txt])
                    .then(bot_kadi, [chatbot], [chatbot])
                )

            # "Clear" resets the chatbot component.
            clear_btn.click(lambda: None, None, chatbot, queue=False)

    demo.launch()


if __name__ == "__main__":
    main()