Commit
·
ccded5c
1
Parent(s):
8f1b296
Optimize retriever and agent for context aware responses
Browse files- .gitignore +1 -61
- agent/__init__.py +0 -0
- agent/agent_config/prompts.py +15 -5
- agent/agent_config/tool_schema.py +1 -1
- agent/core.py +74 -49
- app.py +24 -0
- requirements.txt +1 -0
- tools/code_index.py +80 -118
- tools/github_tools.py +2 -2
.gitignore
CHANGED
@@ -3,26 +3,18 @@ __pycache__/
|
|
3 |
*.py[cod]
|
4 |
*$py.class
|
5 |
|
6 |
-
# C extensions
|
7 |
-
*.so
|
8 |
-
|
9 |
# Distribution / packaging
|
10 |
.Python
|
11 |
build/
|
12 |
-
develop-eggs/
|
13 |
dist/
|
14 |
downloads/
|
15 |
-
eggs/
|
16 |
-
.eggs/
|
17 |
lib/
|
18 |
lib64/
|
19 |
parts/
|
20 |
sdist/
|
21 |
var/
|
22 |
wheels/
|
23 |
-
*.egg-info/
|
24 |
.installed.cfg
|
25 |
-
*.egg
|
26 |
|
27 |
# PyInstaller
|
28 |
*.manifest
|
@@ -32,49 +24,6 @@ wheels/
|
|
32 |
pip-log.txt
|
33 |
pip-delete-this-directory.txt
|
34 |
|
35 |
-
# Unit test / coverage reports
|
36 |
-
htmlcov/
|
37 |
-
.tox/
|
38 |
-
.nox/
|
39 |
-
.coverage
|
40 |
-
.coverage.*
|
41 |
-
.cache
|
42 |
-
nosetests.xml
|
43 |
-
coverage.xml
|
44 |
-
*.cover
|
45 |
-
*.py,cover
|
46 |
-
.hypothesis/
|
47 |
-
.pytest_cache/
|
48 |
-
|
49 |
-
# Translations
|
50 |
-
*.mo
|
51 |
-
*.pot
|
52 |
-
|
53 |
-
# Django stuff:
|
54 |
-
*.log
|
55 |
-
local_settings.py
|
56 |
-
db.sqlite3
|
57 |
-
|
58 |
-
# Flask stuff:
|
59 |
-
instance/
|
60 |
-
.webassets-cache
|
61 |
-
|
62 |
-
# Scrapy stuff:
|
63 |
-
.scrapy
|
64 |
-
|
65 |
-
# Sphinx documentation
|
66 |
-
docs/_build/
|
67 |
-
|
68 |
-
# PyBuilder
|
69 |
-
target/
|
70 |
-
|
71 |
-
# Jupyter Notebook
|
72 |
-
.ipynb_checkpoints
|
73 |
-
|
74 |
-
# IPython
|
75 |
-
profile_default/
|
76 |
-
ipython_config.py
|
77 |
-
|
78 |
# pyenv
|
79 |
.python-version
|
80 |
|
@@ -99,13 +48,4 @@ env.bak/
|
|
99 |
venv.bak/
|
100 |
|
101 |
# VS Code settings
|
102 |
-
.vscode/
|
103 |
-
|
104 |
-
# PyCharm
|
105 |
-
.idea/
|
106 |
-
|
107 |
-
# macOS system files
|
108 |
-
.DS_Store
|
109 |
-
|
110 |
-
# Windows system files
|
111 |
-
Thumbs.db
|
|
|
3 |
*.py[cod]
|
4 |
*$py.class
|
5 |
|
|
|
|
|
|
|
6 |
# Distribution / packaging
|
7 |
.Python
|
8 |
build/
|
|
|
9 |
dist/
|
10 |
downloads/
|
|
|
|
|
11 |
lib/
|
12 |
lib64/
|
13 |
parts/
|
14 |
sdist/
|
15 |
var/
|
16 |
wheels/
|
|
|
17 |
.installed.cfg
|
|
|
18 |
|
19 |
# PyInstaller
|
20 |
*.manifest
|
|
|
24 |
pip-log.txt
|
25 |
pip-delete-this-directory.txt
|
26 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
# pyenv
|
28 |
.python-version
|
29 |
|
|
|
48 |
venv.bak/
|
49 |
|
50 |
# VS Code settings
|
51 |
+
.vscode/
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
agent/__init__.py
ADDED
File without changes
|
agent/agent_config/prompts.py
CHANGED
@@ -10,16 +10,26 @@ system_message = {
|
|
10 |
"- Feature request\n"
|
11 |
"- Incomplete or unclear\n\n"
|
12 |
|
13 |
-
"Then, based on the classification, write a
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
"The comment should be well formatted and readable, using Markdown for code blocks and lists where appropriate.\n\n"
|
15 |
"DO NOT paste or repeat the issue description. DO NOT quote it. Respond entirely in your own words.\n"
|
16 |
"You can only use the following tools: fetch_github_issue, get_issue_details, retrieve_context, post_comment.\n"
|
|
|
|
|
|
|
|
|
|
|
17 |
"Do not attempt to use any other tools such as web_search."
|
18 |
"DO NOT HALLUCINATE OR MAKE UP TOOLS."
|
19 |
)
|
20 |
}
|
21 |
|
22 |
-
|
23 |
-
|
24 |
-
"content": "Please suggest a fix on this issue https://github.com/aditi-dsi/testing-cryptope/issues/4."
|
25 |
-
}
|
|
|
10 |
"- Feature request\n"
|
11 |
"- Incomplete or unclear\n\n"
|
12 |
|
13 |
+
"Then, based on the classification, write a CLEAR, CONCISE, and FRIENDLY response.\n\n"
|
14 |
+
|
15 |
+
"STEPS TO FOLLOW:\n"
|
16 |
+
"You are an AI agent that assists in resolving GitHub issues.\n"
|
17 |
+
"First, call `get_issue_details` using the `issue_url` to obtain the full issue description and issue title.\n"
|
18 |
+
"Then, when calling `retrieve_context`, always pass the exact `issue_title+issue_description` you got from `get_issue_details`.\n"
|
19 |
+
"Do not fabricate or reuse incorrect descriptions.\n\n"
|
20 |
+
|
21 |
"The comment should be well formatted and readable, using Markdown for code blocks and lists where appropriate.\n\n"
|
22 |
"DO NOT paste or repeat the issue description. DO NOT quote it. Respond entirely in your own words.\n"
|
23 |
"You can only use the following tools: fetch_github_issue, get_issue_details, retrieve_context, post_comment.\n"
|
24 |
+
"Whenever an issue involves or deals with code or the codebase, use the `retrieve_context` tool to get the relevant code snippets or metadata about the codebase to formulate your response.\n"
|
25 |
+
"STRICTLY READ the context that you get back from `retrieve_context` and use it to inform your response.\n"
|
26 |
+
"If you do not get any relevant context from `retrieve_context` tool then JUST STICK to the context that is provided in the issue description.\n\n"
|
27 |
+
"DO NOT OVERUSE the context retrieved from `retrieve_context`, only extract relevant context that exactly matches to the current issue.\n\n"
|
28 |
+
"DO NOT OVEREXAGGERATE OR MAKE UP INFORMATION.\n"
|
29 |
"Do not attempt to use any other tools such as web_search."
|
30 |
"DO NOT HALLUCINATE OR MAKE UP TOOLS."
|
31 |
)
|
32 |
}
|
33 |
|
34 |
+
# "STRICTLY use the `retrieve_context` tool to get the relevant code snippets or metadata about the codebase to formulate your response.\n\n"
|
35 |
+
# "Stick to the context that your retri"
|
|
|
|
agent/agent_config/tool_schema.py
CHANGED
@@ -63,7 +63,7 @@ tools = [
|
|
63 |
},
|
64 |
"issue_description": {
|
65 |
"type": "string",
|
66 |
-
"description": "The description
|
67 |
}
|
68 |
},
|
69 |
"required": ["owner", "repo", "ref", "issue_description"]
|
|
|
63 |
},
|
64 |
"issue_description": {
|
65 |
"type": "string",
|
66 |
+
"description": "The exact issue description from the issue the agent is resolving. Must be passed without rephrasing."
|
67 |
}
|
68 |
},
|
69 |
"required": ["owner", "repo", "ref", "issue_description"]
|
agent/core.py
CHANGED
@@ -17,62 +17,87 @@ names_to_functions = {
|
|
17 |
allowed_tools = set(names_to_functions.keys())
|
18 |
|
19 |
system_message = prompts.system_message
|
20 |
-
user_message = {
|
21 |
-
"role": "user",
|
22 |
-
"content": "Please suggest a fix on this issue https://github.com/aditi-dsi/testing-cryptope/issues/4."
|
23 |
-
}
|
24 |
-
|
25 |
-
messages = [system_message, user_message]
|
26 |
|
27 |
api_key = MISTRAL_API_KEY
|
28 |
model = "devstral-small-latest"
|
29 |
client = Mistral(api_key=api_key)
|
30 |
|
31 |
-
|
32 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
function_result = names_to_functions[function_name](**function_params)
|
50 |
-
print(f"Agent is calling tool: {function_name}")
|
51 |
-
tool_calls += 1
|
52 |
-
messages.append({
|
53 |
-
"role": "tool",
|
54 |
-
"tool_call_id": tool_call.id,
|
55 |
-
"content": str(function_result)
|
56 |
-
})
|
57 |
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
|
|
|
|
|
|
75 |
break
|
76 |
-
|
77 |
-
print("OpenSorus (final):", msg.content)
|
78 |
-
break
|
|
|
17 |
allowed_tools = set(names_to_functions.keys())
|
18 |
|
19 |
system_message = prompts.system_message
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
|
21 |
api_key = MISTRAL_API_KEY
|
22 |
model = "devstral-small-latest"
|
23 |
client = Mistral(api_key=api_key)
|
24 |
|
25 |
+
def run_agent(issue_url: str, branch_name: str = "main") -> str:
    """
    Run the agent workflow on a given GitHub issue URL.

    The model is asked to suggest a fix for the issue and is forced to reply
    with tool calls (``tool_choice="any"``). Every requested tool is executed
    and its result is appended to the conversation. The loop ends when a
    comment has been posted, when the model replies without tool calls, or
    once ``MAX_STEPS`` tool calls have been executed.

    Args:
        issue_url: URL of the GitHub issue the agent should respond to.
        branch_name: Branch the agent is told to use when retrieving code
            context.

    Returns:
        The string "Task Completed" on every exit path, including the
        step-limit and plain-text-answer exits.
    """
    MAX_STEPS = 5
    tool_calls = 0
    issue_description_cache = None

    user_message = {
        "role": "user",
        "content": f"Please suggest a fix on this issue {issue_url} and use {branch_name} branch for retrieving code context."
    }
    messages = [system_message, user_message]

    while True:
        response = client.chat.complete(
            model=model,
            messages=messages,
            tools=tools,
            tool_choice="any",
        )
        msg = response.choices[0].message
        messages.append(msg)

        if not getattr(msg, "tool_calls", None):
            print("OpenSorus (final):", msg.content)
            break

        for tool_call in msg.tool_calls:
            function_name = tool_call.function.name
            function_params = json.loads(tool_call.function.arguments)

            if function_name not in allowed_tools:
                print(f"LLM tried to call unknown tool: {function_name}")
                tool_error_msg = (
                    f"Error: Tool '{function_name}' is not available. "
                    "You can only use the following tools: fetch_github_issue, get_issue_details, retrieve_context, post_comment."
                )
                messages.append({
                    "role": "tool",
                    "tool_call_id": tool_call.id,
                    "content": tool_error_msg
                })
                continue

            # Correct a rephrased/fabricated description BEFORE running the
            # tool, so the retrieval is executed only once.
            if (
                function_name == "retrieve_context"
                and issue_description_cache
                and "issue_description" in function_params
                and function_params["issue_description"] != issue_description_cache
            ):
                print("🔁 Overriding incorrect issue_description with correct one from cache.")
                function_params["issue_description"] = issue_description_cache

            function_result = names_to_functions[function_name](**function_params)
            print(f"Agent is calling tool: {function_name}")
            tool_calls += 1

            if function_name == "get_issue_details":
                # The tool may hand back a dict with title/body or just the
                # body text; either field may be missing or None.
                if isinstance(function_result, dict):
                    parts = (function_result.get("title"), function_result.get("body"))
                    issue_description_cache = "\n".join(p for p in parts if p) or None
                    print("ISSUE DESCRIPTION CACHE ✨:", issue_description_cache)
                elif isinstance(function_result, str):
                    issue_description_cache = function_result or None
                    print("ISSUE DESCRIPTION CACHE ✨:", issue_description_cache)

            messages.append({
                "role": "tool",
                "tool_call_id": tool_call.id,
                "content": str(function_result)
            })

            if function_name == "post_comment":
                print("OpenSorus (final): ✅ Comment posted successfully. No further action needed.")
                return "Task Completed"

        if tool_calls >= MAX_STEPS:
            print(f"Agent stopped after {MAX_STEPS} tool calls to protect against rate limiting.")
            break

    return "Task Completed"
|
|
|
|
app.py
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
from agent.core import run_agent
|
3 |
+
|
4 |
+
def respond_to_issue(issue_url, branch_name):
    """
    Gradio callback: run the agent on an issue and report the outcome.

    Args:
        issue_url: Text entered in the "GitHub Issue URL" box.
        branch_name: Text entered in the "Branch Name" box. Blank input
            falls back to "main", matching the default described in the UI.

    Returns:
        A human-readable status message for the output textbox.
    """
    issue_url = (issue_url or "").strip()
    if not issue_url:
        # Nothing to work on; avoid spending model/API calls on an empty prompt.
        return "Please provide a GitHub issue URL."

    # An untouched textbox yields an empty string, which would otherwise
    # override run_agent's own "main" default.
    branch_name = (branch_name or "").strip() or "main"

    try:
        run_agent(issue_url, branch_name)
    except Exception as e:
        # UI boundary: surface any failure to the user instead of crashing.
        return f"Something went wrong: {str(e)}"
    return "Agent has successfully processed the issue and posted an update in the comments. Check the GitHub issue for updates."
|
11 |
+
|
12 |
+
# Widgets for the single-page UI: two text inputs and one text output.
_issue_url_box = gr.Textbox(label="GitHub Issue URL", placeholder="https://github.com/user/repo/issues/123")
_branch_box = gr.Textbox(label="Branch Name", placeholder="main or dev or feature/xyz")
_response_box = gr.Textbox(label="Agent Response")

# Wire the widgets to the callback that drives the agent.
iface = gr.Interface(
    fn=respond_to_issue,
    inputs=[_issue_url_box, _branch_box],
    outputs=_response_box,
    title="GitHub Issue AI Agent",
    description="Enter a GitHub issue URL you want to assign to OpenSorus and the branch to refer for code context (default is 'main'). The agent will fetch relevant context and respond.",
)

# Start the web UI only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()
|
requirements.txt
CHANGED
@@ -1,3 +1,4 @@
|
|
|
|
1 |
llama_index==0.12.40
|
2 |
mistralai==1.8.1
|
3 |
PyJWT==2.10.1
|
|
|
1 |
+
gradio==5.33.0
|
2 |
llama_index==0.12.40
|
3 |
mistralai==1.8.1
|
4 |
PyJWT==2.10.1
|
tools/code_index.py
CHANGED
@@ -1,5 +1,7 @@
|
|
|
|
1 |
import os
|
2 |
import re
|
|
|
3 |
import time
|
4 |
from typing import List, Dict
|
5 |
from llama_index.core import VectorStoreIndex, Document, Settings, get_response_synthesizer
|
@@ -12,64 +14,56 @@ from config import MISTRAL_API_KEY
|
|
12 |
from tools.utils import fetch_repo_files, fetch_file_content
|
13 |
|
14 |
|
15 |
-
repo_indices_cache: Dict[str, VectorStoreIndex] = {}
|
16 |
INCLUDE_FILE_EXTENSIONS = {".py", ".js", ".ts", ".json", ".md", ".txt"}
|
17 |
|
18 |
-
def
|
19 |
-
|
20 |
-
|
|
|
|
|
|
|
21 |
|
22 |
-
|
|
|
23 |
|
24 |
-
|
|
|
|
|
|
|
|
|
25 |
|
26 |
-
|
27 |
-
client = Mistral(api_key=MISTRAL_API_KEY)
|
28 |
-
|
29 |
-
system_prompt = '''
|
30 |
-
You are a code reasoning assistant. Given a GitHub issue description and a list of file paths from a codebase, return a list of top 5 files that are most relevant to solving or understanding the issue, based on naming, possible associations, or inferred logic.
|
31 |
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
],
|
51 |
-
)
|
52 |
|
53 |
-
|
54 |
|
55 |
-
|
56 |
-
|
|
|
57 |
|
58 |
-
|
59 |
-
cleaned = clean_line(line)
|
60 |
-
if cleaned in file_paths:
|
61 |
-
relevant_files.append(cleaned)
|
62 |
-
# else:
|
63 |
-
# print(f"[Warning] Ignored unexpected line from LLM response: {line}")
|
64 |
|
65 |
-
if not relevant_files:
|
66 |
-
print("[Info] No valid file paths found in LLM response, defaulting to all files.")
|
67 |
-
return file_paths
|
68 |
-
else:
|
69 |
-
# print("RELEVANT files selected by LLM:")
|
70 |
-
return relevant_files
|
71 |
|
72 |
-
# print(
|
|
|
73 |
# 🛠️ Configuration Error: Placeholder values detected in host_config.json
|
74 |
# This file still includes default placeholders like:
|
75 |
|
@@ -87,7 +81,7 @@ def build_repo_index(owner: str, repo: str, ref: str = "main", issue_description
|
|
87 |
file_paths = fetch_repo_files(owner, repo, ref)
|
88 |
|
89 |
if issue_description:
|
90 |
-
file_paths =
|
91 |
|
92 |
documents = []
|
93 |
for path in file_paths:
|
@@ -108,83 +102,51 @@ def build_repo_index(owner: str, repo: str, ref: str = "main", issue_description
|
|
108 |
return index
|
109 |
|
110 |
# print(build_repo_index("aditi-dsi", "EvalAI-Starters", "master",
|
111 |
-
# '''
|
112 |
-
# 🛠️ Configuration Error: Placeholder values detected in host_config.json
|
113 |
-
# This file still includes default placeholders like:
|
114 |
|
115 |
-
# <evalai_user_auth_token>
|
116 |
-
# <host_team_pk>
|
117 |
-
# <evalai_host_url>
|
118 |
-
# Please replace them with real values to proceed.
|
119 |
-
# '''))
|
120 |
-
|
121 |
-
|
122 |
-
def get_repo_index(owner: str, repo: str, ref: str, issue_description: str) -> VectorStoreIndex:
|
123 |
-
cache_key = f"{owner}/{repo}:{hash(issue_description)}"
|
124 |
-
if cache_key in repo_indices_cache:
|
125 |
-
print(f"[Cache] Returning cached index for {cache_key}")
|
126 |
-
return repo_indices_cache[cache_key]
|
127 |
-
|
128 |
-
index = build_repo_index(owner, repo, ref, issue_description)
|
129 |
-
repo_indices_cache[cache_key] = index
|
130 |
-
return index
|
131 |
-
|
132 |
-
|
133 |
-
# print(get_repo_index("aditi-dsi", "EvalAI-Starters", "master",
|
134 |
-
# '''
|
135 |
-
# 🛠️ Configuration Error: Placeholder values detected in host_config.json
|
136 |
-
# This file still includes default placeholders like:
|
137 |
-
|
138 |
-
# <evalai_user_auth_token>
|
139 |
-
# <host_team_pk>
|
140 |
-
# <evalai_host_url>
|
141 |
-
# Please replace them with real values to proceed.
|
142 |
-
# '''))
|
143 |
|
144 |
|
145 |
def retrieve_context(owner: str, repo: str, ref: str, issue_description: str) -> List[str]:
|
146 |
-
|
|
|
147 |
Settings.llm = MistralAI(model="codestral-latest", api_key=MISTRAL_API_KEY)
|
148 |
Settings.embed_model = MistralAIEmbedding(model_name="codestral-embed", api_key=MISTRAL_API_KEY)
|
149 |
-
retriever = index.as_retriever(similarity_top_k=
|
|
|
150 |
query_engine = RetrieverQueryEngine(
|
151 |
-
|
152 |
-
|
153 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
154 |
)
|
155 |
-
query
|
156 |
response = query_engine.query(query)
|
157 |
print(response)
|
158 |
-
return
|
159 |
-
|
160 |
-
#
|
161 |
-
#
|
162 |
-
#
|
163 |
-
#
|
164 |
-
|
165 |
-
#
|
166 |
-
#
|
167 |
-
#
|
168 |
-
#
|
169 |
-
#
|
170 |
-
# "type": "string",
|
171 |
-
# "description": "The owner of the repository."
|
172 |
-
# },
|
173 |
-
# "repo": {
|
174 |
-
# "type": "string",
|
175 |
-
# "description": "The name of the repository."
|
176 |
-
# },
|
177 |
-
# "ref": {
|
178 |
-
# "type": "string",
|
179 |
-
# "description": "The branch or commit reference to index from."
|
180 |
-
# },
|
181 |
-
# "issue_description": {
|
182 |
-
# "type": "string",
|
183 |
-
# "description": "The description of the issue to retrieve context for."
|
184 |
-
# }
|
185 |
-
# },
|
186 |
-
# "required": ["owner", "repo", "ref", "issue_description"]
|
187 |
-
# },
|
188 |
-
# },
|
189 |
-
# },
|
190 |
-
# ]
|
|
|
1 |
+
import numpy as np
|
2 |
import os
|
3 |
import re
|
4 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
5 |
import time
|
6 |
from typing import List, Dict
|
7 |
from llama_index.core import VectorStoreIndex, Document, Settings, get_response_synthesizer
|
|
|
14 |
from tools.utils import fetch_repo_files, fetch_file_content
|
15 |
|
16 |
|
|
|
17 |
INCLUDE_FILE_EXTENSIONS = {".py", ".js", ".ts", ".json", ".md", ".txt"}
|
18 |
|
19 |
+
def safe_normalize(vec: np.ndarray) -> "np.ndarray | None":
    """Scale *vec* to unit Euclidean length.

    NaN and +/-inf entries are replaced by 0.0 before the norm is computed.

    Args:
        vec: Input vector.

    Returns:
        The cleaned vector divided by its norm, or ``None`` when the norm is
        zero or not finite, in which case no meaningful direction exists.
    """
    cleaned = np.nan_to_num(vec, nan=0.0, posinf=0.0, neginf=0.0)
    norm = np.linalg.norm(cleaned)
    if norm == 0 or not np.isfinite(norm):
        return None
    return cleaned / norm
|
25 |
|
26 |
+
def select_relevant_files_semantic(issue_description: str, file_paths: List[str], top_k: int = 2) -> List[str]:
    """Pick the file paths whose embeddings are closest to the issue text.

    Only the path strings are embedded (not file contents). Each path is
    scored by cosine similarity against the issue description and the
    ``top_k`` best-scoring paths are kept. "README.md" is always put first
    when it exists in ``file_paths`` and was not already selected.

    Args:
        issue_description: Issue text to compare against.
        file_paths: Candidate repository paths.
        top_k: Number of best-scoring paths to keep (README.md may be added
            on top of this).

    Returns:
        The selected paths, or an empty list when the issue embedding cannot
        be normalised.
    """
    embed_model = MistralAIEmbedding(model_name="codestral-embed", api_key=MISTRAL_API_KEY)

    issue_embedding = safe_normalize(
        np.array(embed_model.get_text_embedding(issue_description), dtype=np.float64)
    )
    if issue_embedding is None:
        print("[Warning] Issue description embedding invalid (zero or NaN norm). Returning empty list.")
        return []

    scored_files = []
    for path in file_paths:
        # Best effort per file: one failing embedding must not abort the scan.
        try:
            file_embedding = safe_normalize(
                np.array(embed_model.get_text_embedding(path), dtype=np.float64)
            )
            if file_embedding is None:
                print(f"[Warning] Skipping {path} due to zero or invalid embedding norm.")
                continue

            # Both vectors are unit length, so their dot product is the
            # cosine similarity.
            with np.errstate(divide='ignore', invalid='ignore', over='ignore'):
                score = float(np.dot(issue_embedding, file_embedding))

            if not np.isfinite(score):
                print(f"[Warning] Skipping {path} due to invalid similarity score.")
                continue

            scored_files.append((path, score))
        except Exception as e:
            print(f"[Warning] Skipping {path} due to error: {e}")

    ranked = sorted(scored_files, key=lambda item: item[1], reverse=True)
    top_files = [path for path, _ in ranked[:max(top_k, 0)]]

    if "README.md" in file_paths and "README.md" not in top_files:
        top_files.insert(0, "README.md")

    return top_files
|
|
|
|
|
|
|
|
|
|
|
63 |
|
|
|
|
|
|
|
|
|
|
|
|
|
64 |
|
65 |
+
# print(select_relevant_files_semantic(
|
66 |
+
# '''
|
67 |
# 🛠️ Configuration Error: Placeholder values detected in host_config.json
|
68 |
# This file still includes default placeholders like:
|
69 |
|
|
|
81 |
file_paths = fetch_repo_files(owner, repo, ref)
|
82 |
|
83 |
if issue_description:
|
84 |
+
file_paths = select_relevant_files_semantic(issue_description, file_paths)
|
85 |
|
86 |
documents = []
|
87 |
for path in file_paths:
|
|
|
102 |
return index
|
103 |
|
104 |
# print(build_repo_index("aditi-dsi", "EvalAI-Starters", "master",
|
105 |
+
# '''
|
106 |
+
# 🛠️ Configuration Error: Placeholder values detected in host_config.json
|
107 |
+
# This file still includes default placeholders like:
|
108 |
|
109 |
+
# <evalai_user_auth_token>
|
110 |
+
# <host_team_pk>
|
111 |
+
# <evalai_host_url>
|
112 |
+
# Please replace them with real values to proceed.
|
113 |
+
# '''))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
114 |
|
115 |
|
116 |
def retrieve_context(owner: str, repo: str, ref: str, issue_description: str) -> List[str]:
    """Query the repository index for context relevant to an issue.

    Builds an index via ``build_repo_index``, retrieves the 3 most similar
    nodes, drops those scoring below 0.75, and asks the query engine to
    summarise what is relevant to the issue.

    Args:
        owner: Repository owner.
        repo: Repository name.
        ref: Branch or commit reference to index from.
        issue_description: Issue text used both to select files and as the
            query.

    Returns:
        The value returned by ``query_engine.query``.
        NOTE(review): this is the query engine's response object rather than
        the annotated ``List[str]``; callers in view only stringify it.
        Confirm before tightening the annotation.
    """
    print("Issue Description:", issue_description)
    index = build_repo_index(owner, repo, ref, issue_description)
    Settings.llm = MistralAI(model="codestral-latest", api_key=MISTRAL_API_KEY)
    Settings.embed_model = MistralAIEmbedding(model_name="codestral-embed", api_key=MISTRAL_API_KEY)
    retriever = index.as_retriever(similarity_top_k=3)

    query_engine = RetrieverQueryEngine(
        retriever=retriever,
        response_synthesizer=get_response_synthesizer(),
        node_postprocessors=[
            # Top-k is already enforced by the retriever; the postprocessor
            # only applies the score cutoff.
            SimilarityPostprocessor(similarity_cutoff=0.75)
        ],
    )
    query = (
        # The blank line keeps the issue text from running into the rules header.
        f"Please give relevant information from the codebase that highly matches the keywords of this issue and useful for solving or understanding this issue:{issue_description}\n\n"
        "STRICT RULES:\n"
        "- ONLY use information available in the retriever context.\n"
        "- DO NOT generate or assume any information outside the given context.\n"
        f"- ONLY include context that is highly relevant and clearly useful for understanding or solving this issue: {issue_description}\n"
        "- DO NOT include generic, loosely related, or unrelated content.\n"
    )
    print("query", query)
    response = query_engine.query(query)
    print(response)
    return response
|
142 |
+
|
143 |
+
# print(retrieve_context("aditi-dsi", "EvalAI-Starters", "master",
|
144 |
+
# '''
|
145 |
+
# 🛠️ Configuration Error: Placeholder values detected in host_config.json
|
146 |
+
# This file still includes default placeholders like:
|
147 |
+
|
148 |
+
# <evalai_user_auth_token>
|
149 |
+
# <host_team_pk>
|
150 |
+
# <evalai_host_url>
|
151 |
+
# Please replace them with real values to proceed.
|
152 |
+
# '''))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tools/github_tools.py
CHANGED
@@ -23,11 +23,11 @@ def get_issue_details(owner, repo, issue_num):
|
|
23 |
}
|
24 |
response = github_request("GET", url, headers=headers)
|
25 |
if response.status_code == 200:
|
26 |
-
return response.json()
|
27 |
else:
|
28 |
raise Exception(f"Failed to fetch issue: {response.status_code} {response.text}")
|
29 |
|
30 |
-
# print(get_issue_details("aditi-dsi", "testing-cryptope", "
|
31 |
|
32 |
def post_comment(owner, repo, issue_num, comment_body):
|
33 |
installation_id = get_installation_id(owner, repo)
|
|
|
23 |
}
|
24 |
response = github_request("GET", url, headers=headers)
|
25 |
if response.status_code == 200:
|
26 |
+
return response.json().get("body")
|
27 |
else:
|
28 |
raise Exception(f"Failed to fetch issue: {response.status_code} {response.text}")
|
29 |
|
30 |
+
# print(get_issue_details("aditi-dsi", "testing-cryptope", "4"))
|
31 |
|
32 |
def post_comment(owner, repo, issue_num, comment_body):
|
33 |
installation_id = get_installation_id(owner, repo)
|