Spaces:

Agents-MCP-Hackathon
/

OpenSorus

Running

App Files Files Community

halfacupoftea committed on Jun 5

Commit

ccded5c

1 Parent(s): 8f1b296

Optimize retriever and agent for context aware responses

Browse files

Files changed (9) hide show

.gitignore +1 -61
agent/__init__.py +0 -0
agent/agent_config/prompts.py +15 -5
agent/agent_config/tool_schema.py +1 -1
agent/core.py +74 -49
app.py +24 -0
requirements.txt +1 -0
tools/code_index.py +80 -118
tools/github_tools.py +2 -2

.gitignore CHANGED Viewed

@@ -3,26 +3,18 @@ __pycache__/
 *.py[cod]
 *$py.class
-# C extensions
-*.so
 # Distribution / packaging
 .Python
 build/
-develop-eggs/
 dist/
 downloads/
-eggs/
-.eggs/
 lib/
 lib64/
 parts/
 sdist/
 var/
 wheels/
-*.egg-info/
 .installed.cfg
-*.egg
 # PyInstaller
 *.manifest
@@ -32,49 +24,6 @@ wheels/
 pip-log.txt
 pip-delete-this-directory.txt
-# Unit test / coverage reports
-htmlcov/
-.tox/
-.nox/
-.coverage
-.coverage.*
-.cache
-nosetests.xml
-coverage.xml
-*.cover
-*.py,cover
-.hypothesis/
-.pytest_cache/
-# Translations
-*.mo
-*.pot
-# Django stuff:
-*.log
-local_settings.py
-db.sqlite3
-# Flask stuff:
-instance/
-.webassets-cache
-# Scrapy stuff:
-.scrapy
-# Sphinx documentation
-docs/_build/
-# PyBuilder
-target/
-# Jupyter Notebook
-.ipynb_checkpoints
-# IPython
-profile_default/
-ipython_config.py
 # pyenv
 .python-version
@@ -99,13 +48,4 @@ env.bak/
 venv.bak/
 # VS Code settings
-.vscode/
-# PyCharm
-.idea/
-# macOS system files
-.DS_Store
-# Windows system files
-Thumbs.db

 *.py[cod]
 *$py.class
 # Distribution / packaging
 .Python
 build/
 dist/
 downloads/
 lib/
 lib64/
 parts/
 sdist/
 var/
 wheels/
 .installed.cfg
 # PyInstaller
 *.manifest
 pip-log.txt
 pip-delete-this-directory.txt
 # pyenv
 .python-version
 venv.bak/
 # VS Code settings
+.vscode/

agent/__init__.py ADDED Viewed

File without changes

agent/agent_config/prompts.py CHANGED Viewed

@@ -10,16 +10,26 @@ system_message = {
         "- Feature request\n"
         "- Incomplete or unclear\n\n"
-        "Then, based on the classification, write a clear, concise, and friendly response.\n\n"
         "The comment should be well formatted and readable, using Markdown for code blocks and lists where appropriate.\n\n"
         "DO NOT paste or repeat the issue description. DO NOT quote it. Respond entirely in your own words.\n"
         "You can only use the following tools: fetch_github_issue, get_issue_details, retrieve_context, post_comment.\n"
         "Do not attempt to use any other tools such as web_search."
         "DO NOT HALLUCINATE OR MAKE UP TOOLS."
     )
 }
-user_message = {
-    "role": "user",
-    "content": "Please suggest a fix on this issue https://github.com/aditi-dsi/testing-cryptope/issues/4."
-}

         "- Feature request\n"
         "- Incomplete or unclear\n\n"
+        "Then, based on the classification, write a CLEAR, CONCISE, and FRIENDLY response.\n\n"
+        "STEPS TO FOLLOW:\n"
+        "You are an AI agent that assists in resolving GitHub issues.\n"
+        "First, call `get_issue_details` using the `issue_url` to obtain the full issue description and issue title.\n"
+        "Then, when calling `retrieve_context`, always pass the exact `issue_title+issue_description` you got from `get_issue_details`.\n"
+        "Do not fabricate or reuse incorrect descriptions.\n\n"
         "The comment should be well formatted and readable, using Markdown for code blocks and lists where appropriate.\n\n"
         "DO NOT paste or repeat the issue description. DO NOT quote it. Respond entirely in your own words.\n"
         "You can only use the following tools: fetch_github_issue, get_issue_details, retrieve_context, post_comment.\n"
+        "Whenever an issue deals with code or the codebase, use the `retrieve_context` tool to get the relevant code snippets or metadata about the codebase to formulate your response.\n"
+        "STRICTLY READ the context that you get back from `retrieve_context` and use it to inform your response.\n"
+        "If you do not get any relevant context from `retrieve_context` tool then JUST STICK to the context that is provided in the issue description.\n\n"
+        "DO NOT OVERUSE the context retrieved from `retrieve_context`, only extract relevant context that exactly matches to the current issue.\n\n"
+        "DO NOT OVEREXAGGERATE OR MAKE UP INFORMATION.\n"
         "Do not attempt to use any other tools such as web_search."
         "DO NOT HALLUCINATE OR MAKE UP TOOLS."
     )
 }
+#         "STRICTLY use the `retrieve_context` tool to get the relevant code snippets or metadata about the codebase to formulate your response.\n\n"
+        # "Stick to the context that your retri"

agent/agent_config/tool_schema.py CHANGED Viewed

@@ -63,7 +63,7 @@ tools = [
                     },
                     "issue_description": {
                         "type": "string",
-                        "description": "The description of the issue to retrieve context for."
                     }
                 },
                 "required": ["owner", "repo", "ref", "issue_description"]

                     },
                     "issue_description": {
                         "type": "string",
+                        "description": "The exact issue description from the issue the agent is resolving. Must be passed without rephrasing."
                     }
                 },
                 "required": ["owner", "repo", "ref", "issue_description"]

agent/core.py CHANGED Viewed

@@ -17,62 +17,87 @@ names_to_functions = {
 allowed_tools = set(names_to_functions.keys())
 system_message = prompts.system_message
-user_message = {
-    "role": "user",
-    "content": "Please suggest a fix on this issue https://github.com/aditi-dsi/testing-cryptope/issues/4."
-}
-messages = [system_message, user_message]
 api_key = MISTRAL_API_KEY
 model = "devstral-small-latest"
 client = Mistral(api_key=api_key)
-MAX_STEPS = 5
-tool_calls = 0
-while True:
-    response = client.chat.complete(
-        model=model,
-        messages=messages,
-        tools=tools,
-        tool_choice="any",
-    )
-    msg = response.choices[0].message
-    messages.append(msg)
-    if hasattr(msg, "tool_calls") and msg.tool_calls:
-        for tool_call in msg.tool_calls:
-            function_name = tool_call.function.name
-            function_params = json.loads(tool_call.function.arguments)
-            if function_name in allowed_tools:
-                function_result = names_to_functions[function_name](**function_params)
-                print(f"Agent is calling tool: {function_name}")
-                tool_calls += 1
-                messages.append({
-                    "role": "tool",
-                    "tool_call_id": tool_call.id,
-                    "content": str(function_result)
-                })
-                if function_name == "post_comment":
-                    print("OpenSorus (final): ✅ Comment posted successfully. No further action needed.")
-                    exit(0)
-            else:
-                print(f"LLM tried to call unknown tool: {function_name}")
-                tool_error_msg = (
-                    f"Error: Tool '{function_name}' is not available. "
-                    "You can only use the following tools: fetch_github_issue, get_issue_details, post_comment."
-                )
-                messages.append({
-                    "role": "tool",
-                    "tool_call_id": tool_call.id,
-                    "content": tool_error_msg
-                })
-        if tool_calls >= MAX_STEPS:
-            print(f"Agent stopped after {MAX_STEPS} tool calls to protect against rate limiting.")
             break
-    else:
-        print("OpenSorus (final):", msg.content)
-        break

 allowed_tools = set(names_to_functions.keys())
 system_message = prompts.system_message
 api_key = MISTRAL_API_KEY
 model = "devstral-small-latest"
 client = Mistral(api_key=api_key)
+def run_agent(issue_url: str, branch_name: str = "main") -> str:
+    """
+    Run the agent workflow on a given GitHub issue URL.
+
+    The model is asked to resolve the issue using the registered tools. Every
+    tool call it requests is dispatched through ``names_to_functions`` and the
+    result is fed back as a ``tool`` message. The loop ends when
+    ``post_comment`` has been called, when the model answers without a tool
+    call, or when ``MAX_STEPS`` tool calls have been handled.
+
+    Args:
+        issue_url: URL of the GitHub issue the agent should work on.
+        branch_name: Branch the model is told to use when retrieving code
+            context.
+
+    Returns:
+        The string ``"Task Completed"`` once the loop has ended.
+    """
+    MAX_STEPS = 5
+    tool_calls = 0
+    issue_description_cache = None
+    user_message = {
+        "role": "user",
+        "content": f"Please suggest a fix on this issue {issue_url} and use {branch_name} branch for retrieving code context."
+    }
+    messages = [system_message, user_message]
+    while True:
+        response = client.chat.complete(
+            model=model,
+            messages=messages,
+            tools=tools,
+            tool_choice="any",
+        )
+        msg = response.choices[0].message
+        messages.append(msg)
+        if hasattr(msg, "tool_calls") and msg.tool_calls:
+            for tool_call in msg.tool_calls:
+                function_name = tool_call.function.name
+                function_params = json.loads(tool_call.function.arguments)
+                # Count every requested call, including rejected ones, so a model
+                # that keeps asking for unknown tools cannot loop forever.
+                tool_calls += 1
+                if function_name in allowed_tools:
+                    # Correct a rephrased description BEFORE dispatching, so the
+                    # (expensive) retriever runs once instead of twice.
+                    if (
+                        function_name == "retrieve_context"
+                        and issue_description_cache
+                        and "issue_description" in function_params
+                        and function_params["issue_description"] != issue_description_cache
+                    ):
+                        print("🔁 Overriding incorrect issue_description with correct one from cache.")
+                        function_params["issue_description"] = issue_description_cache
+                    print(f"Agent is calling tool: {function_name}")
+                    function_result = names_to_functions[function_name](**function_params)
+                    if function_name == "get_issue_details":
+                        # The tool may hand back the full issue payload (dict) or
+                        # only the body text (str); cache whichever text exists.
+                        if isinstance(function_result, dict):
+                            parts = [function_result.get("title"), function_result.get("body")]
+                        else:
+                            parts = [function_result]
+                        issue_description_cache = "\n".join(
+                            part for part in parts if isinstance(part, str) and part
+                        ) or None
+                        print("ISSUE DESCRIPTION CACHE ✨:", issue_description_cache)
+                    messages.append({
+                        "role": "tool",
+                        "tool_call_id": tool_call.id,
+                        "content": str(function_result)
+                    })
+                    if function_name == "post_comment":
+                        print("OpenSorus (final): ✅ Comment posted successfully. No further action needed.")
+                        return "Task Completed"
+                else:
+                    print(f"LLM tried to call unknown tool: {function_name}")
+                    # Build the list from the registry so it cannot drift from
+                    # the tools that are actually available.
+                    tool_error_msg = (
+                        f"Error: Tool '{function_name}' is not available. "
+                        f"You can only use the following tools: {', '.join(names_to_functions)}."
+                    )
+                    messages.append({
+                        "role": "tool",
+                        "tool_call_id": tool_call.id,
+                        "content": tool_error_msg
+                    })
+            if tool_calls >= MAX_STEPS:
+                print(f"Agent stopped after {MAX_STEPS} tool calls to protect against rate limiting.")
+                break
+        else:
+            print("OpenSorus (final):", msg.content)
             break
+    return "Task Completed"

app.py ADDED Viewed

	@@ -0,0 +1,24 @@

+import gradio as gr
+from agent.core import run_agent
+def respond_to_issue(issue_url, branch_name):
+    """
+    Gradio callback: run the agent on an issue and report the outcome as text.
+
+    Args:
+        issue_url: Contents of the "GitHub Issue URL" textbox.
+        branch_name: Contents of the "Branch Name" textbox; blank means "main".
+
+    Returns:
+        A status message for the output textbox. Exceptions raised by the
+        agent are reported in the message instead of propagating to the UI.
+    """
+    issue_url = (issue_url or "").strip()
+    if not issue_url:
+        return "Please provide a GitHub issue URL."
+    # An untouched textbox yields an empty string, which would otherwise
+    # override run_agent's own "main" default.
+    branch_name = (branch_name or "").strip() or "main"
+    try:
+        run_agent(issue_url, branch_name)
+    except Exception as e:
+        return f"Something went wrong: {e}"
+    return "Agent has successfully processed the issue and posted an update in the comments. Check the GitHub issue for updates."
+# Gradio UI: two text inputs (issue URL, branch name) are passed to
+# respond_to_issue, and its returned string is shown in one output textbox.
+iface = gr.Interface(
+    fn=respond_to_issue,
+    inputs=[
+        gr.Textbox(label="GitHub Issue URL", placeholder="https://github.com/user/repo/issues/123"),
+        gr.Textbox(label="Branch Name", placeholder="main or dev or feature/xyz")
+    ],
+    outputs=gr.Textbox(label="Agent Response"),
+    title="GitHub Issue AI Agent",
+    description="Enter a GitHub issue URL you want to assign to OpenSorus and the branch to refer for code context (default is 'main'). The agent will fetch relevant context and respond."
+)
+# Start the web UI only when this file is executed as a script.
+if __name__ == "__main__":
+    iface.launch()

requirements.txt CHANGED Viewed

@@ -1,3 +1,4 @@
 llama_index==0.12.40
 mistralai==1.8.1
 PyJWT==2.10.1

+gradio==5.33.0
 llama_index==0.12.40
 mistralai==1.8.1
 PyJWT==2.10.1

tools/code_index.py CHANGED Viewed

@@ -1,5 +1,7 @@
 import os
 import re
 import time
 from typing import List, Dict
 from llama_index.core import VectorStoreIndex, Document, Settings, get_response_synthesizer
@@ -12,64 +14,56 @@ from config import MISTRAL_API_KEY
 from tools.utils import fetch_repo_files, fetch_file_content
-repo_indices_cache: Dict[str, VectorStoreIndex] = {}
 INCLUDE_FILE_EXTENSIONS = {".py", ".js", ".ts", ".json", ".md", ".txt"}
-def clean_line(line: str) -> str:
-    line = re.sub(r'^\s*\d+[\.\)]\s*', '', line)
-    line = line.strip(' `"\'')
-    return line.strip()
-def select_relevant_files_mistral(issue_description: str, file_paths: List[str]) -> List[str]:
-    model = "devstral-small-latest"
-    client = Mistral(api_key=MISTRAL_API_KEY)
-    system_prompt = '''
-    You are a code reasoning assistant. Given a GitHub issue description and a list of file paths from a codebase, return a list of top 5 files that are most relevant to solving or understanding the issue, based on naming, possible associations, or inferred logic.
-    DO NOT RETURN ANYTHING ELSE.
-    DO NOT RETURN ANY ADDITIONAL INFORMATION OR EXPLANATIONS.
-    ONLY RETURN THE FILE PATHS, ONE PER LINE, WITHOUT ANY ADDITIONAL TEXT OR FORMATTING.
-    DO NOT HALLUCINATE.
-    '''
-    user_prompt = f"""Issue:
-{issue_description}
-Files:
-{chr(10).join(file_paths)}
-Return the list of most relevant files (only exact paths)."""
-    response = client.chat.complete(
-        model=model,
-        messages=[
-            {"role": "system", "content": system_prompt},
-            {"role": "user", "content": user_prompt},
-        ],
-    )
-    reply = response.choices[0].message.content if hasattr(response.choices[0].message, "content") else str(response.choices[0].message)
-    lines = [line.strip() for line in reply.strip().splitlines()]
-    relevant_files = []
-    for line in lines:
-        cleaned = clean_line(line)
-        if cleaned in file_paths:
-            relevant_files.append(cleaned)
-        # else:
-        #     print(f"[Warning] Ignored unexpected line from LLM response: {line}")
-    if not relevant_files:
-        print("[Info] No valid file paths found in LLM response, defaulting to all files.")
-        return file_paths
-    else:
-        # print("RELEVANT files selected by LLM:")
-        return relevant_files
-# print(select_relevant_files_mistral('''
 # 🛠️ Configuration Error: Placeholder values detected in host_config.json
 # This file still includes default placeholders like:
@@ -87,7 +81,7 @@ def build_repo_index(owner: str, repo: str, ref: str = "main", issue_description
     file_paths = fetch_repo_files(owner, repo, ref)
     if issue_description:
-        file_paths = select_relevant_files_mistral(issue_description, file_paths)
     documents = []
     for path in file_paths:
@@ -108,83 +102,51 @@ def build_repo_index(owner: str, repo: str, ref: str = "main", issue_description
     return index
 # print(build_repo_index("aditi-dsi", "EvalAI-Starters", "master",
-# '''
-# 🛠️ Configuration Error: Placeholder values detected in host_config.json
-# This file still includes default placeholders like:
-# <evalai_user_auth_token>
-# <host_team_pk>
-# <evalai_host_url>
-# Please replace them with real values to proceed.
-# '''))
-def get_repo_index(owner: str, repo: str, ref: str, issue_description: str) -> VectorStoreIndex:
-    cache_key = f"{owner}/{repo}:{hash(issue_description)}"
-    if cache_key in repo_indices_cache:
-        print(f"[Cache] Returning cached index for {cache_key}")
-        return repo_indices_cache[cache_key]
-    index = build_repo_index(owner, repo, ref, issue_description)
-    repo_indices_cache[cache_key] = index
-    return index
-# print(get_repo_index("aditi-dsi", "EvalAI-Starters", "master",
-# '''
-# 🛠️ Configuration Error: Placeholder values detected in host_config.json
-# This file still includes default placeholders like:
-# <evalai_user_auth_token>
-# <host_team_pk>
-# <evalai_host_url>
-# Please replace them with real values to proceed.
-# '''))
 def retrieve_context(owner: str, repo: str, ref: str, issue_description: str) -> List[str]:
-    index = get_repo_index(owner, repo, ref, issue_description)
     Settings.llm = MistralAI(model="codestral-latest", api_key=MISTRAL_API_KEY)
     Settings.embed_model = MistralAIEmbedding(model_name="codestral-embed", api_key=MISTRAL_API_KEY)
-    retriever = index.as_retriever(similarity_top_k=5)
     query_engine = RetrieverQueryEngine(
-    retriever=retriever,
-    response_synthesizer=get_response_synthesizer(),
-    node_postprocessors=[SimilarityPostprocessor(similarity_top_k=5)],
     )
-    query = f"Please give relevant information from the codebase about that can help to solve or understand this issue:{issue_description}"
     response = query_engine.query(query)
     print(response)
-    return None
-# index_tools = [
-#         {
-#         "type": "function",
-#         "function": {
-#             "name": "retrieve_context",
-#             "description": "Fetch relevant context from codebase for a GitHub issue",
-#             "parameters": {
-#                 "type": "object",
-#                 "properties": {
-#                     "owner": {
-#                         "type": "string",
-#                         "description": "The owner of the repository."
-#                     },
-#                     "repo": {
-#                         "type": "string",
-#                         "description": "The name of the repository."
-#                     },
-#                     "ref": {
-#                         "type": "string",
-#                         "description": "The branch or commit reference to index from."
-#                     },
-#                     "issue_description": {
-#                         "type": "string",
-#                         "description": "The description of the issue to retrieve context for."
-#                     }
-#                 },
-#                 "required": ["owner", "repo", "ref", "issue_description"]
-#             },
-#         },
-#     },
-# ]

+import numpy as np
 import os
 import re
+from sklearn.metrics.pairwise import cosine_similarity
 import time
 from typing import List, Dict
 from llama_index.core import VectorStoreIndex, Document, Settings, get_response_synthesizer
 from tools.utils import fetch_repo_files, fetch_file_content
 INCLUDE_FILE_EXTENSIONS = {".py", ".js", ".ts", ".json", ".md", ".txt"}
+def safe_normalize(vec: np.ndarray) -> "np.ndarray | None":
+    """
+    Scale a vector to unit Euclidean length.
+
+    NaN and infinite entries are replaced with 0.0 before the norm is taken.
+
+    Args:
+        vec: Input vector.
+
+    Returns:
+        The normalized vector, or None when the cleaned vector has a zero or
+        non-finite norm and therefore cannot be normalized.
+    """
+    vec = np.nan_to_num(vec, nan=0.0, posinf=0.0, neginf=0.0)
+    norm = np.linalg.norm(vec)
+    # The norm can still overflow to inf for very large finite entries.
+    if norm == 0 or not np.isfinite(norm):
+        return None
+    return vec / norm
+def select_relevant_files_semantic(issue_description: str, file_paths: List[str], top_k: int = 2) -> List[str]:
+    """
+    Pick the file paths whose embeddings are closest to the issue text.
+
+    The issue description and each file path (the path string itself) are
+    embedded with the ``codestral-embed`` model and ranked by cosine
+    similarity. ``README.md`` is put first in the result whenever it is
+    present in ``file_paths`` and did not make the ranking.
+
+    Args:
+        issue_description: Text of the issue to match against.
+        file_paths: Candidate repository paths.
+        top_k: How many best-scoring paths to keep (default 2).
+
+    Returns:
+        Up to ``top_k`` paths in descending similarity order, optionally
+        preceded by ``README.md``. Empty when there are no candidates or the
+        issue embedding is unusable.
+    """
+    # Nothing to rank: skip the embedding API call entirely.
+    if not file_paths:
+        return []
+    embed_model = MistralAIEmbedding(model_name="codestral-embed", api_key=MISTRAL_API_KEY)
+    issue_embedding = safe_normalize(
+        np.array(embed_model.get_text_embedding(issue_description), dtype=np.float64)
+    )
+    if issue_embedding is None:
+        print("[Warning] Issue description embedding invalid (zero or NaN norm). Returning empty list.")
+        return []
+    scored_files = []
+    for path in file_paths:
+        # Best effort: one failing embedding request must not abort the ranking.
+        try:
+            raw_embedding = embed_model.get_text_embedding(path)
+        except Exception as e:
+            print(f"[Warning] Skipping {path} due to error: {e}")
+            continue
+        file_embedding = safe_normalize(np.array(raw_embedding, dtype=np.float64))
+        if file_embedding is None:
+            print(f"[Warning] Skipping {path} due to zero or invalid embedding norm.")
+            continue
+        # Both vectors are unit length, so the dot product is the cosine similarity.
+        score = float(np.dot(issue_embedding, file_embedding))
+        if not np.isfinite(score):
+            print(f"[Warning] Skipping {path} due to invalid similarity score.")
+            continue
+        scored_files.append((path, score))
+    ranked = sorted(scored_files, key=lambda x: x[1], reverse=True)
+    top_files = [path for path, _ in ranked[:max(top_k, 0)]]
+    if "README.md" in file_paths and "README.md" not in top_files:
+        top_files.insert(0, "README.md")
+    return top_files
+# print(select_relevant_files_semantic(
+# '''
 # 🛠️ Configuration Error: Placeholder values detected in host_config.json
 # This file still includes default placeholders like:
     file_paths = fetch_repo_files(owner, repo, ref)
     if issue_description:
+        file_paths = select_relevant_files_semantic(issue_description, file_paths)
     documents = []
     for path in file_paths:
     return index
 # print(build_repo_index("aditi-dsi", "EvalAI-Starters", "master",
+    # '''
+    # 🛠️ Configuration Error: Placeholder values detected in host_config.json
+    # This file still includes default placeholders like:
+    # <evalai_user_auth_token>
+    # <host_team_pk>
+    # <evalai_host_url>
+    # Please replace them with real values to proceed.
+    # '''))
 def retrieve_context(owner: str, repo: str, ref: str, issue_description: str) -> List[str]:
+    """
+    Query an index of the repository for context relevant to an issue.
+
+    An index is built for the given repository and ref via
+    ``build_repo_index``, then queried through a ``RetrieverQueryEngine``
+    (top 3 nodes, similarity cutoff 0.75) with a prompt embedding the issue.
+
+    Args:
+        owner: Repository owner.
+        repo: Repository name.
+        ref: Branch or commit reference to index.
+        issue_description: Issue text used for file selection and the query.
+
+    Returns:
+        The object returned by ``query_engine.query``.
+        NOTE(review): the ``List[str]`` annotation does not match this —
+        confirm what callers expect before tightening it.
+    """
+    print("Issue Description:", issue_description)
+    index = build_repo_index(owner, repo, ref, issue_description)
     Settings.llm = MistralAI(model="codestral-latest", api_key=MISTRAL_API_KEY)
     Settings.embed_model = MistralAIEmbedding(model_name="codestral-embed", api_key=MISTRAL_API_KEY)
+    retriever = index.as_retriever(similarity_top_k=3)
     query_engine = RetrieverQueryEngine(
+        retriever=retriever,
+        response_synthesizer=get_response_synthesizer(),
+        # The top-k limit lives on the retriever; the postprocessor only
+        # applies the similarity cutoff.
+        node_postprocessors=[
+            SimilarityPostprocessor(similarity_cutoff=0.75)
+        ],
+    )
+    query = (
+        # The trailing newline keeps the issue text from running straight into
+        # the "STRICT RULES:" header.
+        f"Please give relevant information from the codebase that highly matches the keywords of this issue and useful for solving or understanding this issue:{issue_description}\n"
+        "STRICT RULES:\n"
+        "- ONLY use information available in the retriever context.\n"
+        "- DO NOT generate or assume any information outside the given context.\n"
+        f"- ONLY include context that is highly relevant and clearly useful for understanding or solving this issue: {issue_description}\n"
+        "- DO NOT include generic, loosely related, or unrelated content.\n"
     )
+    print("query", query)
     response = query_engine.query(query)
     print(response)
+    return response
+# print(retrieve_context("aditi-dsi", "EvalAI-Starters", "master",
+#     '''
+#     🛠️ Configuration Error: Placeholder values detected in host_config.json
+#     This file still includes default placeholders like:
+#     <evalai_user_auth_token>
+#     <host_team_pk>
+#     <evalai_host_url>
+#     Please replace them with real values to proceed.
+#     '''))

tools/github_tools.py CHANGED Viewed

@@ -23,11 +23,11 @@ def get_issue_details(owner, repo, issue_num):
     }
     response = github_request("GET", url, headers=headers)
     if response.status_code == 200:
-        return response.json()
     else:
         raise Exception(f"Failed to fetch issue: {response.status_code} {response.text}")
-# print(get_issue_details("aditi-dsi", "testing-cryptope", "3"))
 def post_comment(owner, repo, issue_num, comment_body):
     installation_id = get_installation_id(owner, repo)

     }
     response = github_request("GET", url, headers=headers)
     if response.status_code == 200:
+        return response.json().get("body")
     else:
         raise Exception(f"Failed to fetch issue: {response.status_code} {response.text}")
+# print(get_issue_details("aditi-dsi", "testing-cryptope", "4"))
 def post_comment(owner, repo, issue_num, comment_body):
     installation_id = get_installation_id(owner, repo)