muhammadsalmanalfaridzi committed on
Commit dba1f58 · verified · 1 Parent(s): 8ad304a

Update app.py

Files changed (1)
  1. app.py +123 -277
app.py CHANGED
@@ -1,46 +1,27 @@
 import os
-import sys
 import gc
 import tempfile
 import uuid
 import logging
-import requests
-import time
-from typing import List, Any
+from typing import Any

 import streamlit as st
 from dotenv import load_dotenv
-import openai
+
 from gitingest import ingest
-from llama_index.core import Settings, PromptTemplate, VectorStoreIndex, SimpleDirectoryReader
-from llama_index.core.node_parser import MarkdownNodeParser
-from llama_index.vector_stores.faiss import FaissVectorStore
-from llama_index.embeddings.base import BaseEmbedding
-import faiss
-from llama_index.llms.sambanovasystems import SambaNovaCloud
+from llama_index import (
+    SimpleDirectoryReader,
+    VectorStoreIndex,
+    PromptTemplate,
+    ServiceContext,
+    LLMPredictor,
+)
+from llama_index.node_parser import MarkdownNodeParser
+from llama_index.embeddings import HuggingFaceEmbedding
+from llama_index.llms import OpenAI

-# ------------------ Configuration ------------------
+# Load environment
 load_dotenv()

-# Configure SambaNova OpenAI-compatible client
-SAMBA_API_KEY = os.getenv("SAMBANOVA_API_KEY")
-SAMBA_BASE_URL = os.getenv("SAMBANOVA_BASE_URL", "https://api.sambanova.ai/v1")
-
-# Nomic AI API Key
-NOMIC_API_KEY = os.getenv("NOMIC_API_KEY")
-
-if not SAMBA_API_KEY:
-    raise ValueError("Missing SAMBANOVA_API_KEY in environment")
-
-if not NOMIC_API_KEY:
-    raise ValueError("Missing NOMIC_API_KEY in environment")
-
-# Initialize SambaNova client
-sambanova_client = openai.OpenAI(
-    api_key=SAMBA_API_KEY,
-    base_url=SAMBA_BASE_URL
-)
-
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -49,97 +30,10 @@ logger = logging.getLogger(__name__)
 MAX_REPO_SIZE = 100 * 1024 * 1024  # 100MB
 SUPPORTED_REPO_TYPES = ['.py', '.md', '.ipynb', '.js', '.ts', '.json']

-# ------------------ Exceptions ------------------
 class GitHubRAGError(Exception):
     """Custom exception for GitHub RAG application errors"""
     pass

-# ------------------ Embedding Cache ------------------
-embedding_cache = {}
-
-# ------------------ Nomic AI Embedding Implementation ------------------
-class NomicEmbedding(BaseEmbedding):
-    """Custom embedding class for Nomic AI"""
-    def __init__(self, model_name="nomic-embed-text-v1.5", task_type="search_document"):
-        self.model_name = model_name
-        self.task_type = task_type
-        self.api_key = NOMIC_API_KEY
-        super().__init__()
-
-    def _get_query_embedding(self, query: str) -> List[float]:
-        """Get embedding for a query string"""
-        return self._get_embedding(query)
-
-    def _get_text_embedding(self, text: str) -> List[float]:
-        """Get embedding for a text string"""
-        return self._get_embedding(text)
-
-    def _get_embedding(self, text: str) -> List[float]:
-        """Get embedding from Nomic AI"""
-        # Check if text is already in cache
-        if text in embedding_cache:
-            return embedding_cache[text]
-
-        try:
-            url = "https://api-atlas.nomic.ai/v1/embedding/text"
-            headers = {
-                "Authorization": f"Bearer {self.api_key}",
-                "Content-Type": "application/json",
-                "Accept": "application/json"
-            }
-            payload = {
-                "texts": [text],
-                "model": self.model_name,
-                "task_type": self.task_type
-            }
-
-            # Retry logic with exponential backoff
-            max_retries = 3
-            retry_delay = 1  # Start with 1 second delay
-
-            for retry in range(max_retries):
-                try:
-                    response = requests.post(
-                        url,
-                        headers=headers,
-                        json=payload,
-                        timeout=30  # 30 seconds timeout
-                    )
-
-                    if response.status_code == 200:
-                        embedding = response.json()["embeddings"][0]
-                        # Cache the result
-                        embedding_cache[text] = embedding
-                        return embedding
-                    else:
-                        logger.error(f"Error from Nomic API: {response.status_code} - {response.text}")
-                        if retry < max_retries - 1:
-                            # Wait with exponential backoff before retry
-                            time.sleep(retry_delay)
-                            retry_delay *= 2  # Double the delay for next retry
-                        else:
-                            # Last retry failed
-                            raise Exception(f"Failed to get embedding after {max_retries} attempts")
-                except requests.exceptions.RequestException as e:
-                    logger.error(f"Request error (attempt {retry+1}/{max_retries}): {e}")
-                    if retry < max_retries - 1:
-                        time.sleep(retry_delay)
-                        retry_delay *= 2
-                    else:
-                        raise
-        except Exception as e:
-            logger.error(f"Error connecting to Nomic API: {e}")
-            raise  # Propagate the error without fallback
-
-    async def _aget_query_embedding(self, query: str) -> List[float]:
-        """Async version of get_query_embedding"""
-        return self._get_query_embedding(query)
-
-    async def _aget_text_embedding(self, text: str) -> List[float]:
-        """Async version of get_text_embedding"""
-        return self._get_text_embedding(text)
-
-# ------------------ Utility Functions ------------------

 def validate_github_url(url: str) -> bool:
     return url.startswith(('https://github.com/', 'http://github.com/'))
@@ -147,7 +41,7 @@ def validate_github_url(url: str) -> bool:

 def get_repo_name(url: str) -> str:
     try:
-        return url.rstrip('/').split('/')[-1].replace('.git', '')
+        return url.split('/')[-1].replace('.git', '')
     except Exception as e:
         raise GitHubRAGError(f"Invalid repository URL: {e}")
 
@@ -171,179 +65,131 @@ def process_with_gitingets(github_url: str) -> tuple:
         return summary, tree, content
     except Exception as e:
         logger.error(f"Error processing repository: {e}")
-        raise GitHubRAGError(f"Failed to process repository: {e}")
-
-
-def create_query_engine(content_path: str, repo_name: str) -> Any:
-    """Create and configure LlamaIndex RAG query engine with FAISS vector store."""
-    try:
-        # Load documents from local folder
-        loader = SimpleDirectoryReader(input_dir=content_path)
-        docs = loader.load_data()
-
-        # Create a Nomic embedding instance
-        embed_model = NomicEmbedding()
-
-        # Set up LlamaIndex to use Nomic embeddings
-        Settings.embed_model = embed_model
-
-        # Create FAISS index - using L2 distance (Euclidean)
-        dimension = len(embed_model._get_text_embedding("test"))  # Get dimensionality from a sample embedding
-        faiss_index = faiss.IndexFlatL2(dimension)
-
-        # Initialize FAISS vector store
-        vector_store = FaissVectorStore(faiss_index=faiss_index)
-
-        # Build vector index with markdown parsing and FAISS
-        node_parser = MarkdownNodeParser()
-        index = VectorStoreIndex.from_documents(
-            documents=docs,
-            transformations=[node_parser],
-            vector_store=vector_store,
-            show_progress=True
-        )
-
-        # Custom QA prompt template
-        qa_prompt = PromptTemplate(
-            template_str="""
-            You are an AI assistant specialized in analyzing GitHub repositories.
-
-            Repository structure:
-            {tree}
-
-            Context information:
-            {context_str}
-
-            Answer the following query about the repository. If unknown, say you don't have enough information.
-
-            Query: {query_str}
-            Answer:"""
-        )
-
-        # Configure query engine with streaming and template
-        query_engine = index.as_query_engine(streaming=True)
-        query_engine.update_prompts({
-            "response_synthesizer:text_qa_template": qa_prompt
-        })
-
-        # And then configure it within llama-index
-        llm = SambaNovaCloud(
-            model_name="QwQ-32B",
-            api_key=SAMBA_API_KEY,
-            base_url=SAMBA_BASE_URL
-        )
-        Settings.llm = llm
-
-        return query_engine
-    except Exception as e:
-        logger.error(f"Error creating query engine: {e}")
-        raise GitHubRAGError(f"Failed to create query engine: {e}")
-
-# ------------------ Streamlit App ------------------
-# Initialize session state
+        raise GitHubRAGError(str(e))
+
+
+def create_query_engine(content_dir: str, tree: str) -> Any:
+    """
+    Build an index with Nomic embeddings and query it via the SambaNova LLM.
+    """
+    # Reader & parser
+    loader = SimpleDirectoryReader(input_dir=content_dir)
+    docs = loader.load_data()
+    node_parser = MarkdownNodeParser()
+
+    # Embedding model using Nomic Embed v2 MoE
+    embed_model = HuggingFaceEmbedding(
+        model_name="nomic-ai/nomic-embed-text-v2-moe",
+        device="cpu",  # or 'cuda'
+        normalize=True,
+        trust_remote_code=True,
+    )
+
+    # LLM predictor using SambaNova Cloud via its OpenAI-compatible API
+    llm_predictor = LLMPredictor(
+        llm=OpenAI(
+            api_key=os.environ.get("SAMBANOVA_API_KEY"),
+            api_base="https://api.sambanova.ai/v1",
+            model="QwQ-32B",
+            temperature=0.1,
+            additional_kwargs={"top_p": 0.1},
+        )
+    )
+
+    # Service context
+    service_context = ServiceContext.from_defaults(
+        embed_model=embed_model,
+        llm_predictor=llm_predictor,
+    )
+
+    # Build index
+    index = VectorStoreIndex.from_documents(
+        documents=docs,
+        service_context=service_context,
+        transformations=[node_parser],
+        show_progress=True,
+    )
+
+    # Custom QA prompt; {tree} is filled in up front via partial_format
+    qa_template = PromptTemplate(
+        "You are an AI assistant specialized in analyzing GitHub repositories.\n\n"
+        "Repository files and structure:\n{tree}\n---\n"
+        "Context:\n{context_str}\n---\n"
+        "Question: {query_str}\nAnswer:"
+    )
+
+    # Create query engine with streaming and the custom prompt
+    return index.as_query_engine(
+        streaming=True,
+        service_context=service_context,
+        text_qa_template=qa_template.partial_format(tree=tree),
+    )
+
+
+def reset_chat():
+    """Clear the chat history."""
+    st.session_state.messages = []
+    gc.collect()
+
+
+# Streamlit App
 if "id" not in st.session_state:
     st.session_state.id = uuid.uuid4()
-    st.session_state.file_cache = {}
+    st.session_state.cache = {}
     st.session_state.messages = []

 session_id = st.session_state.id

-# Sidebar inputs
 with st.sidebar:
-    st.header("Add your GitHub repository!")
-    github_url = st.text_input("Enter GitHub repository URL",
-                               placeholder="https://github.com/username/repo")
-
-    load_repo = st.button("Load Repository", type="primary")
-
-    if github_url and load_repo:
-        try:
-            if not validate_github_url(github_url):
-                st.error("Please enter a valid GitHub repository URL")
-                st.stop()
-
-            repo_name = get_repo_name(github_url)
-            file_key = f"{session_id}-{repo_name}"
-
-            if file_key not in st.session_state.file_cache:
-                with st.spinner("Processing your repository..."):
-                    with tempfile.TemporaryDirectory() as temp_dir:
-                        summary, tree, content = process_with_gitingets(github_url)
-                        # Write content for RAG
-                        content_path = temp_dir
-                        # Save full content as a doc
-                        md_path = os.path.join(temp_dir, f"{repo_name}.md")
-                        with open(md_path, "w", encoding="utf-8") as f:
-                            f.write(content)
-
-                        # Create query engine and cache
-                        query_engine = create_query_engine(content_path, repo_name)
-                        st.session_state.file_cache[file_key] = dict(
-                            engine=query_engine,
-                            tree=tree
-                        )
-                        st.success("Repository loaded successfully! Ready to chat.")
-            else:
-                st.info("Repository already loaded. Ready to chat!")
-        except GitHubRAGError as e:
-            st.error(str(e))
+    st.header("GitHub RAG with SambaNova & Nomic Embed")
+    github_url = st.text_input("GitHub Repo URL", help="e.g. https://github.com/user/repo")
+    load_btn = st.button("Load Repository")
+    if github_url and load_btn:
+        if not validate_github_url(github_url):
+            st.error("Invalid GitHub URL")
             st.stop()
-
-# Main chat UI
-col1, col2 = st.columns([6, 1])
+        repo_name = get_repo_name(github_url)
+        key = f"{session_id}-{repo_name}"
+        if key not in st.session_state.cache:
+            with st.spinner("Processing repository..."):
+                try:
+                    summary, tree, content = process_with_gitingets(github_url)
+                    with tempfile.TemporaryDirectory() as td:
+                        # Save the ingested content into a working directory
+                        content_path = os.path.join(td, repo_name)
+                        os.makedirs(content_path, exist_ok=True)
+                        with open(os.path.join(content_path, f"{repo_name}.md"), "w", encoding="utf-8") as f:
+                            f.write(content)
+                        # Build query engine
+                        qe = create_query_engine(content_path, tree)
+                        st.session_state.cache[key] = (qe, tree)
+                    st.success("Repository loaded!")
+                except GitHubRAGError as e:
+                    st.error(str(e))
+                    st.stop()
+        else:
+            st.info("Repository already loaded.")
+
+col1, col2 = st.columns([6, 1])
 with col1:
-    st.header("Chat with GitHub using RAG + Sambanova")
+    st.header("Chat with your Repo")
 with col2:
-    st.button("Clear Chat", on_click=reset_chat)
+    st.button("Clear Chat", on_click=reset_chat)

-# Display chat history
+# Display chat
 for msg in st.session_state.messages:
-    with st.chat_message(msg["role"]):
-        st.markdown(msg["content"])
+    with st.chat_message(msg['role']):
+        st.markdown(msg['content'])

-# Chat input
-if prompt := st.chat_input("Ask your question..."):
-    st.session_state.messages.append({"role": "user", "content": prompt})
+if prompt := st.chat_input("Ask a question about the repository..."):
+    st.session_state.messages.append({"role": "user", "content": prompt})
     with st.chat_message("user"):
         st.markdown(prompt)
-
+    key = f"{session_id}-{get_repo_name(github_url)}"
+    if key not in st.session_state.cache:
+        st.error("Load a repository first.")
+        st.stop()
+    qe, tree = st.session_state.cache[key]
     with st.chat_message("assistant"):
-        file_key = f"{session_id}-{get_repo_name(github_url)}"
-        cache = st.session_state.file_cache.get(file_key)
-        if not cache:
-            st.error("Please load a repository first!")
-            st.stop()
-
-        query_engine = cache['engine']
-        tree_str = cache['tree']
-        # Generate RAG response (streamed chunks)
-        rag_response = query_engine.query(prompt)
-        context_str = rag_response.context_str if hasattr(rag_response, 'context_str') else ''
-
-        # Build the messages for the SambaNova model
-        messages = [
-            {"role": "system", "content": "You are a knowledgeable assistant combining GitHub repository context with user queries."},
-            {"role": "user", "content": f"Repository structure:\n{tree_str}\nContext:\n{context_str}\nQuestion: {prompt}"}
-        ]
-
-        # Call the SambaNova API
-        try:
-            stream = sambanova_client.chat.completions.create(
-                model="QwQ-32B",  # Replace with the appropriate model
-                messages=messages,
-                temperature=0.1,
-                top_p=0.1
-            )
-
-            full_resp = ""
-            for chunk in stream:
-                if chunk.choices[0].delta.content:
-                    full_resp += chunk.choices[0].delta.content
-                    st.write(full_resp + "▌")
-            st.write(full_resp)
-            st.session_state.messages.append({"role": "assistant", "content": full_resp})
-
-        except Exception as e:
-            logger.error(f"API Error: {str(e)}")
-            st.error(f"Error generating response: {str(e)}")
-            st.stop()
+        placeholder = st.empty()
+        answer = ""
+        for chunk in qe.query(prompt).response_gen:
+            answer += chunk
+            placeholder.markdown(answer + "▌")
+        placeholder.markdown(answer)
+        st.session_state.messages.append({"role": "assistant", "content": answer})
 
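For reference, a minimal smoke test of the rewritten pipeline outside Streamlit. This is a sketch, not part of the commit: it assumes llama_index 0.9.x with torch and transformers available, a local docs/ directory holding the gitingest dump, and a guessed context_window. It also swaps OpenAI for OpenAILike, since the plain OpenAI wrapper looks the model name up in OpenAI's own context-window table, and a non-OpenAI id like QwQ-32B will not resolve there.

# Smoke test sketch for the new RAG pipeline (assumptions noted above).
import os

from llama_index import ServiceContext, SimpleDirectoryReader, VectorStoreIndex
from llama_index.embeddings import HuggingFaceEmbedding
from llama_index.llms import OpenAILike

embed_model = HuggingFaceEmbedding(
    model_name="nomic-ai/nomic-embed-text-v2-moe",
    trust_remote_code=True,
)
llm = OpenAILike(
    model="QwQ-32B",
    api_base="https://api.sambanova.ai/v1",  # SambaNova's OpenAI-compatible endpoint
    api_key=os.environ["SAMBANOVA_API_KEY"],
    is_chat_model=True,    # route through the chat completions endpoint
    context_window=32768,  # assumption: adjust to the model's real limit
)
service_context = ServiceContext.from_defaults(embed_model=embed_model, llm=llm)

# "docs" is a placeholder for the directory holding the gitingest markdown dump
docs = SimpleDirectoryReader("docs").load_data()
index = VectorStoreIndex.from_documents(docs, service_context=service_context)

qe = index.as_query_engine(streaming=True)
for token in qe.query("What does this repository do?").response_gen:
    print(token, end="", flush=True)

Run with SAMBANOVA_API_KEY set; the streamed tokens print as they arrive, which exercises the same response_gen path the Streamlit chat loop consumes.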