Spaces:
Sleeping
Sleeping
First commit
Browse files- .gitignore +5 -0
- app.py +268 -0
- chat_with_project.py +110 -0
- get_prompts.py +62 -0
- milvus.py +105 -0
- requirements.txt +8 -0
- requirements_dev.txt +143 -0
- utils/extract.py +69 -0
.gitignore
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
/workspace
|
3 |
+
/__pycache__
|
4 |
+
.env
|
5 |
+
/extraction
|
app.py
ADDED
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import zipfile
|
3 |
+
import os
|
4 |
+
import shutil
|
5 |
+
import subprocess
|
6 |
+
from chat_with_project import query_project
|
7 |
+
from get_prompts import get_prompt_for_mode
|
8 |
+
from dotenv import load_dotenv, set_key
|
9 |
+
from milvus import initialize_milvus, DEFAULT_MILVUS_HOST, DEFAULT_MILVUS_PORT, DEFAULT_COLLECTION_NAME, DEFAULT_DIMENSION, DEFAULT_MAX_RETRIES, DEFAULT_RETRY_DELAY
|
10 |
+
|
11 |
+
# --- Configuration and Setup ---
|
12 |
+
|
13 |
+
# Define paths for workspace and extraction directories
|
14 |
+
# Directories used for the uploaded project and the analysis artifacts.
WORKSPACE_DIR = "workspace"
EXTRACTION_DIR = "extraction"

def clear_directories():
    """Delete and recreate the workspace and extraction directories."""
    for path in (WORKSPACE_DIR, EXTRACTION_DIR):
        if os.path.exists(path):
            shutil.rmtree(path)
        os.makedirs(path, exist_ok=True)

# Every app start begins with empty directories.
clear_directories()
|
26 |
+
|
27 |
+
# --- API Key Management ---
|
28 |
+
|
29 |
+
def ensure_env_file_exists():
    """Create an empty .env file in the project root if one is missing."""
    if not os.path.exists(".env"):
        # Touch the file so the dotenv helpers always have something to read.
        with open(".env", "w") as env_file:
            env_file.write("")

def load_api_key():
    """Return the OpenAI API key from the .env file / environment, or None."""
    ensure_env_file_exists()
    load_dotenv()
    return os.environ.get("OPENAI_API_KEY")

def update_api_key(api_key):
    """Persist the given key to .env and report the outcome as a message."""
    if not api_key:
        return "API key cannot be empty."
    set_key(".env", "OPENAI_API_KEY", api_key)
    load_dotenv()  # refresh the process environment with the new value
    return "API key updated successfully."

def is_api_key_set():
    """Return True when an OpenAI API key is currently available."""
    return bool(load_api_key())
|
53 |
+
|
54 |
+
# --- Core Functionalities ---
|
55 |
+
|
56 |
+
def process_zip(zip_file_path):
    """Extract an uploaded zip into the workspace and run the extraction script.

    Args:
        zip_file_path: Path to the uploaded ZIP archive.

    Returns:
        A human-readable status message (success or the error that occurred).
    """
    import sys  # local import keeps the module-level interface unchanged

    try:
        # Clear existing workspace and extraction directories before processing.
        clear_directories()

        # NOTE(review): extractall does not guard against path traversal
        # ("zip slip") in hostile archives -- acceptable only for trusted uploads.
        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            zip_ref.extractall(WORKSPACE_DIR)

        # Run the extraction helper with the interpreter running this app;
        # a bare "python" on PATH may be a different installation entirely.
        subprocess.run([sys.executable, "./utils/extract.py", WORKSPACE_DIR], check=True)

        return "Processing complete! Results saved in the 'extraction' directory."

    except Exception as e:
        return f"An error occurred: {e}"
|
73 |
+
|
74 |
+
def init_milvus(milvus_host, milvus_port, collection_name, dimension, max_retries, retry_delay):
    """Initialize or load the Milvus vector database; return a status string."""
    try:
        # The Gradio textboxes hand every value over as a string, so the
        # numeric settings are converted here before the call.
        initialize_milvus(
            milvus_host,
            int(milvus_port),
            collection_name,
            int(dimension),
            int(max_retries),
            int(retry_delay),
        )
        return "Milvus database initialized or loaded successfully."
    except Exception as e:
        return f"Error initializing Milvus: {e}"
|
88 |
+
|
89 |
+
# --- Chatbot Verification ---
|
90 |
+
|
91 |
+
def is_project_loaded():
    """Return True if a project has been processed.

    A project counts as loaded when the extraction directory contains at
    least one .pkl file produced by utils/extract.py.
    """
    extraction_dir = "extraction"
    # The directory may not exist yet (nothing uploaded, or it was removed);
    # os.listdir would raise FileNotFoundError in that case.
    if not os.path.isdir(extraction_dir):
        return False
    return any(f.endswith('.pkl') for f in os.listdir(extraction_dir))
|
96 |
+
# --- Gradio UI Components ---
|
97 |
+
|
98 |
+
# Chat Interface
|
99 |
+
def chat_ui(query, history, mode):
    """Handle one chat turn for the analyzer / debugger / developer modes.

    Args:
        query: The user's question.
        history: List of (user, assistant) message pairs, or None on the first turn.
        mode: One of "analyzer", "debugger", "developer".

    Returns:
        (history, history): the updated history, once for the Chatbot
        component and once for the gr.State carrying it between turns.
    """
    if history is None:
        history = []

    # Report configuration problems inside the chat so the Chatbot output
    # always receives a list of message pairs. (Returning a bare string
    # here would not match the [Chatbot, state] outputs.)
    if not load_api_key():
        history.append((query, "Error: OpenAI API key not set. Please set the API key in the Settings tab."))
        return history, history
    if not is_project_loaded():
        history.append((query, "Error: No project loaded. Please upload and process a ZIP file first."))
        return history, history

    print(f"Chat Mode: {mode}")
    system_prompt = get_prompt_for_mode(mode)
    print(f"System Prompt: {system_prompt}")

    # Pass the query and system prompt to the LLM.
    response = query_project(query, system_prompt)
    print(f"Response from query_project: {response}")

    if response is None or not response.strip():
        response = "An error occurred during processing. Please check the logs."

    if mode == "developer":
        # Developer answers are split back into per-file code blocks and
        # rendered as fenced markdown code blocks.
        extracted_files = extract_files_from_response(response)
        answer = ""
        for filepath, content in extracted_files.items():
            answer += f"**{filepath}:**\n```python\n{content}\n```\n\n"
    else:
        # Two trailing spaces force a markdown line break.
        answer = response.replace('\n', '  \n')

    history.append((query, answer))
    return history, history
|
141 |
+
|
142 |
+
|
143 |
+
def extract_files_from_response(response):
    """
    Parses the LLM response to extract file paths and their corresponding code content.

    The developer prompt asks the model to delimit each file with
    "--- BEGIN FILE: <path>" and "--- END FILE: <path>" marker lines.

    Args:
        response (str): The raw response string from the LLM.

    Returns:
        dict: A dictionary where keys are file paths and values are the code content of each file.
    """
    begin_marker = "--- BEGIN FILE:"
    end_marker = "--- END FILE:"

    files = {}
    current_file = None
    current_content = []

    for line in response.splitlines():
        if line.startswith(begin_marker):
            # A new BEGIN implicitly closes any file still open.
            if current_file is not None:
                files[current_file] = "\n".join(current_content)
            current_file = line.replace(begin_marker, "").strip()
            current_content = []
        elif line.startswith(end_marker):
            if current_file is not None:
                files[current_file] = "\n".join(current_content)
            current_file = None
            current_content = []
        elif current_file is not None:
            current_content.append(line)

    # Flush a trailing file whose END marker was lost (e.g. the model's
    # output was truncated) instead of silently dropping it.
    if current_file is not None:
        files[current_file] = "\n".join(current_content)

    return files
|
177 |
+
|
178 |
+
# ZIP Processing Interface
|
179 |
+
# --- Gradio UI wiring ---

# Tab: upload a project archive for analysis.
zip_iface = gr.Interface(
    fn=process_zip,
    inputs=gr.File(label="Upload ZIP File"),
    outputs="text",
    title="Zip File Analyzer",
    description="Upload a zip file to analyze and store its contents.",
)

# Tab: connect to / bootstrap the vector database.
milvus_iface = gr.Interface(
    fn=init_milvus,
    inputs=[
        gr.Textbox(label="Milvus Host", placeholder=DEFAULT_MILVUS_HOST, value=DEFAULT_MILVUS_HOST),
        gr.Textbox(label="Milvus Port", placeholder=DEFAULT_MILVUS_PORT, value=DEFAULT_MILVUS_PORT),
        gr.Textbox(label="Collection Name", placeholder=DEFAULT_COLLECTION_NAME, value=DEFAULT_COLLECTION_NAME),
        gr.Textbox(label="Dimension", placeholder=str(DEFAULT_DIMENSION), value=str(DEFAULT_DIMENSION)),
        gr.Textbox(label="Max Retries", placeholder=str(DEFAULT_MAX_RETRIES), value=str(DEFAULT_MAX_RETRIES)),
        gr.Textbox(label="Retry Delay (seconds)", placeholder=str(DEFAULT_RETRY_DELAY), value=str(DEFAULT_RETRY_DELAY)),
    ],
    outputs="text",
    title="Milvus Database Initialization",
    description="Initialize or load the Milvus vector database.",
)

# Example questions shown under the chat box, as [question, mode] pairs.
# NOTE(review): the interface declares three inputs (question, state, mode)
# while each example supplies two values -- confirm Gradio maps these as
# intended across versions.
chat_examples = [
    ["What is this project about?", "analyzer"],
    ["Are there any potential bugs?", "debugger"],
    ["How does the data flow through the application?", "analyzer"],
    ["Explain the main components of the architecture.", "analyzer"],
    ["What are the dependencies of this project?", "analyzer"],
    ["Are there any potential memory leaks?", "debugger"],
    ["Identify any areas where the code could be optimized.", "debugger"],
    ["Implement basic logging for the main application and save logs to a file.", "developer"],
    ["Use try/except blocks in main functions to handle exceptions", "developer"],
]

# Tab: the chat itself.
chat_iface = gr.Interface(
    fn=chat_ui,
    inputs=[
        gr.Textbox(label="Ask a question", placeholder="Type your question here"),
        gr.State(),  # carries the chat history between turns
        gr.Radio(["analyzer", "debugger", "developer"], label="Chat Mode", value="analyzer"),
    ],
    outputs=[
        gr.Chatbot(label="Chat with Project"),
        "state",  # receives the updated history
    ],
    title="Chat with your Project",
    description="Ask questions about the data extracted from the zip file.",
    examples=chat_examples,
)

# Tab: store the OpenAI API key.
settings_iface = gr.Interface(
    fn=update_api_key,
    inputs=gr.Textbox(label="OpenAI API Key", type="password"),
    outputs="text",
    title="Settings",
    description="Set your OpenAI API key.",
)

def get_api_key_status():
    """Report whether an OpenAI API key is currently configured."""
    return "API key status: Set" if is_api_key_set() else "API key status: Not set"

# Tab: live API-key indicator.
status_iface = gr.Interface(
    fn=get_api_key_status,
    inputs=None,
    outputs="text",
    live=True,
    title="API Key Status",
)

# NOTE(review): this Markdown component is created but never placed in a
# layout, so the TabbedInterface below does not render it.
credits = gr.Markdown("## Credits\n\nCreated by [Ruslan Magana Vsevolodovna](https://ruslanmv.com/)")

# Combine all tabs into one application and launch it.
demo = gr.TabbedInterface(
    [zip_iface, milvus_iface, chat_iface, settings_iface, status_iface],
    ["Process ZIP", "Init Milvus", "Chat with Project", "Settings", "Status"],
)

demo.queue().launch()
|
chat_with_project.py
ADDED
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pymilvus import connections, Collection, utility
|
2 |
+
from sentence_transformers import SentenceTransformer
|
3 |
+
from langchain_openai import ChatOpenAI # Updated import
|
4 |
+
from langchain_core.prompts import ChatPromptTemplate
|
5 |
+
from langchain_core.output_parsers import StrOutputParser
|
6 |
+
from langchain_core.messages import HumanMessage, SystemMessage, AIMessage
|
7 |
+
import os
|
8 |
+
|
9 |
+
# Milvus connection details
|
10 |
+
MILVUS_HOST = 'localhost'
|
11 |
+
MILVUS_PORT = '19530'
|
12 |
+
COLLECTION_NAME = 'document_collection'
|
13 |
+
|
14 |
+
def load_api_key():
    """Return the OpenAI API key from the .env file or the environment."""
    from dotenv import load_dotenv  # deferred so import cost is paid on use

    load_dotenv()
    return os.environ.get("OPENAI_API_KEY")
|
19 |
+
|
20 |
+
# Embedding model
|
21 |
+
model = SentenceTransformer('all-MiniLM-L6-v2')
|
22 |
+
|
23 |
+
def retrieve_relevant_documents(query, top_k=5):
    """
    Retrieves the most relevant documents from Milvus based on the query.

    Returns a list of document paths; empty when the collection is missing.
    """
    print(f"Connecting to Milvus at {MILVUS_HOST}:{MILVUS_PORT}...")
    connections.connect(host=MILVUS_HOST, port=MILVUS_PORT)

    # Nothing to search if the collection was never created.
    if not utility.has_collection(COLLECTION_NAME):
        print(f"Collection {COLLECTION_NAME} does not exist.")
        return []

    collection = Collection(COLLECTION_NAME)
    collection.load()

    # Embed the query with the same sentence-transformer used at index time.
    query_vector = model.encode([query]).tolist()
    print(f"Encoded Query Vector: {query_vector}")

    search_results = collection.search(
        data=query_vector,
        anns_field="content_vector",
        param={"metric_type": "L2", "params": {"nprobe": 16}},
        limit=top_k,
        expr=None,
        output_fields=["path"],
    )

    relevant_docs = [hit.entity.get("path") for hit in search_results[0]]

    print(f"Relevant Docs: {relevant_docs}")
    connections.disconnect(alias='default')
    return relevant_docs
|
61 |
+
|
62 |
+
|
63 |
+
def generate_response_with_gpt(query, relevant_docs, system_prompt):
    """
    Generates a response using OpenAI's GPT model, based on the query, relevant documents, and system prompt.
    """
    api_key = load_api_key()
    if not api_key:
        raise ValueError("OpenAI API key not set. Please set it in the .env file or environment variables.")

    print(f"Using OpenAI API Key: {api_key[:5]}...")  # partial key only, for debugging
    chat = ChatOpenAI(temperature=0.7, openai_api_key=api_key, model_name="gpt-3.5-turbo")

    messages = [SystemMessage(content=system_prompt)]

    if relevant_docs:
        # Concatenate the readable retrieved documents into one context message.
        chunks = []
        for doc_path in relevant_docs:
            if not os.path.isfile(doc_path):
                continue
            try:
                with open(doc_path, "r", encoding="utf-8") as f:
                    chunks.append(f.read() + "\n")
            except Exception as e:
                print(f"Error reading document {doc_path}: {e}")
        doc_content = "".join(chunks)
        if doc_content:
            messages.append(HumanMessage(content=f"Relevant documents:\n{doc_content}"))

    messages.append(HumanMessage(content=query))
    print(f"Messages sent to OpenAI API: {messages}")

    try:
        response = chat.invoke(messages)
        print(f"OpenAI API Response: {response.content}")
        print("Type OpenAI API Response", type(response.content))
        return response.content
    except Exception as e:
        print(f"Error during OpenAI API call: {e}")
        return "Error generating response. Please try again later."
|
98 |
+
|
99 |
+
|
100 |
+
def query_project(query, system_prompt):
    """
    Queries the project using a RAG approach with specified system prompt.
    """
    docs = retrieve_relevant_documents(query)
    print(" Starting the query:")
    print(query)
    answer = generate_response_with_gpt(query, docs, system_prompt)
    print(f"Query Response: {answer}")
    print("Type response", type(answer))
    return answer
|
get_prompts.py
ADDED
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from langchain.prompts import PromptTemplate
|
2 |
+
|
3 |
+
ANALYZER_PROMPT_TEMPLATE = """
|
4 |
+
You are a code analyzer AI. Your task is to analyze the project's structure,
|
5 |
+
purpose, and functionality. Explain how different components interact,
|
6 |
+
discuss the overall architecture, and provide insights into the project's design.
|
7 |
+
Consider the context provided and try to be comprehensive in your analysis.
|
8 |
+
|
9 |
+
Relevant context: {context}
|
10 |
+
|
11 |
+
Explain in detail, based on the context provided.
|
12 |
+
"""
|
13 |
+
|
14 |
+
DEBUGGER_PROMPT_TEMPLATE = """
|
15 |
+
You are a code debugger AI. Your task is to identify potential bugs,
|
16 |
+
errors, and areas for improvement in the project's code. Analyze the given code
|
17 |
+
for logic errors, performance bottlenecks, and suggest fixes or improvements.
|
18 |
+
If the user asks how to fix an issue, provide the corrected code snippet.
|
19 |
+
|
20 |
+
Relevant context: {context}
|
21 |
+
|
22 |
+
Focus on identifying issues and providing solutions or improvements based on the context provided.
|
23 |
+
"""
|
24 |
+
|
25 |
+
DEVELOPER_PROMPT_TEMPLATE = """
|
26 |
+
You are a software developer AI. Your task is to modify or extend existing code based on user requests.
|
27 |
+
When a user asks to add a feature or modify existing functionality, you should:
|
28 |
+
|
29 |
+
1. Identify the files that need to be modified or created.
|
30 |
+
2. Output the full, updated code for each file that needs changes.
|
31 |
+
3. Clearly indicate the filename before each code block using this format:
|
32 |
+
```
|
33 |
+
--- BEGIN FILE: <filepath> ---
|
34 |
+
<full code of the file>
|
35 |
+
--- END FILE: <filepath> ---
|
36 |
+
```
|
37 |
+
4. If a new file needs to be created, use the same format and specify the new file's path and name.
|
38 |
+
5. **Do not omit any part of the code**. Output the entire content of each modified or new file.
|
39 |
+
6. Ensure that the generated code is functional, well-structured, and integrates seamlessly with the existing project.
|
40 |
+
7. Explain any additional setup or configuration steps if necessary.
|
41 |
+
|
42 |
+
Remember to consider the existing project's structure and coding style when making modifications.
|
43 |
+
|
44 |
+
Relevant context: {context}
|
45 |
+
|
46 |
+
User request: {question}
|
47 |
+
|
48 |
+
Modify or extend the code as requested, providing the full code for each relevant file.
|
49 |
+
"""
|
50 |
+
|
51 |
+
def get_prompt_for_mode(mode):
    """
    Returns the appropriate prompt template based on the selected mode.

    Raises:
        ValueError: if mode is not one of "analyzer", "debugger", "developer".
    """
    templates = {
        "analyzer": ANALYZER_PROMPT_TEMPLATE,
        "debugger": DEBUGGER_PROMPT_TEMPLATE,
        "developer": DEVELOPER_PROMPT_TEMPLATE,
    }
    try:
        return templates[mode]
    except KeyError:
        raise ValueError(f"Invalid mode: {mode}") from None
|
milvus.py
ADDED
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# milvus.py
|
2 |
+
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection, utility
|
3 |
+
import pandas as pd
|
4 |
+
import os
|
5 |
+
import sys
|
6 |
+
from sentence_transformers import SentenceTransformer
|
7 |
+
import time
|
8 |
+
|
9 |
+
# Default Milvus connection details
|
10 |
+
DEFAULT_MILVUS_HOST = 'localhost'
|
11 |
+
DEFAULT_MILVUS_PORT = '19530'
|
12 |
+
DEFAULT_COLLECTION_NAME = 'document_collection'
|
13 |
+
DEFAULT_DIMENSION = 384 # Adjust based on your embedding model
|
14 |
+
DEFAULT_MAX_RETRIES = 3
|
15 |
+
DEFAULT_RETRY_DELAY = 5 # seconds
|
16 |
+
|
17 |
+
# Embedding model
|
18 |
+
model = SentenceTransformer('all-MiniLM-L6-v2')
|
19 |
+
|
20 |
+
def create_milvus_collection(host, port, collection_name, dimension):
    """
    Creates a new Milvus collection if it doesn't exist.
    """
    if utility.has_collection(collection_name):
        print(f"Collection {collection_name} already exists.")
        return

    schema = CollectionSchema(
        [
            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
            FieldSchema(name="path", dtype=DataType.VARCHAR, max_length=500),
            FieldSchema(name="content_vector", dtype=DataType.FLOAT_VECTOR, dim=dimension),
        ],
        "Document Vector Store",
    )
    collection = Collection(collection_name, schema, consistency_level="Strong")

    # L2 / IVF_FLAT matches the search parameters used at query time.
    collection.create_index(
        field_name="content_vector",
        index_params={"metric_type": "L2", "index_type": "IVF_FLAT", "params": {"nlist": 1024}},
    )
    print(f"Collection {collection_name} created and index built.")
|
42 |
+
|
43 |
+
def load_data_to_milvus(host, port, collection_name):
    """
    Loads data from the DataFrame into Milvus, using sentence embeddings.
    """
    extraction_dir = "extraction"
    pkl_files = [name for name in os.listdir(extraction_dir) if name.endswith('.pkl')]
    if not pkl_files:
        print("No .pkl files found in the 'extraction' directory.")
        return

    # Only the first pickle is loaded (one project at a time).
    df_path = os.path.join(extraction_dir, pkl_files[0])
    df = pd.read_pickle(df_path)

    # Embed each document's text with the module-level sentence transformer.
    df['content_vector'] = df['content'].apply(lambda text: model.encode(text).tolist())

    collection = Collection(collection_name)
    # Column order must match the schema (id is auto-generated).
    collection.insert([df['path'].tolist(), df['content_vector'].tolist()])
    collection.flush()

    print(f"Data from {df_path} loaded into Milvus collection {collection_name}.")
|
69 |
+
|
70 |
+
def connect_to_milvus(host, port, max_retries, retry_delay):
|
71 |
+
"""Connects to Milvus with retries."""
|
72 |
+
retries = 0
|
73 |
+
while retries < max_retries:
|
74 |
+
try:
|
75 |
+
connections.connect(host=host, port=port)
|
76 |
+
print(f"Successfully connected to Milvus at {host}:{port}")
|
77 |
+
return True
|
78 |
+
except Exception as e:
|
79 |
+
print(f"Error connecting to Milvus: {e}")
|
80 |
+
retries += 1
|
81 |
+
if retries < max_retries:
|
82 |
+
print(f"Retrying in {retry_delay} seconds...")
|
83 |
+
time.sleep(retry_delay)
|
84 |
+
else:
|
85 |
+
print("Max retries reached. Could not connect to Milvus.")
|
86 |
+
return False
|
87 |
+
|
88 |
+
def initialize_milvus(host, port, collection_name, dimension, max_retries, retry_delay):
    """Initializes Milvus with parameters."""
    if not connect_to_milvus(host, port, max_retries, retry_delay):
        return
    create_milvus_collection(host, port, collection_name, dimension)
    load_data_to_milvus(host, port, collection_name)
    connections.disconnect(alias='default')
|
94 |
+
|
95 |
+
|
96 |
+
if __name__ == "__main__":
    # Pull each setting from the environment, falling back to module defaults.
    initialize_milvus(
        os.environ.get('MILVUS_HOST', DEFAULT_MILVUS_HOST),
        os.environ.get('MILVUS_PORT', DEFAULT_MILVUS_PORT),
        os.environ.get('COLLECTION_NAME', DEFAULT_COLLECTION_NAME),
        int(os.environ.get('DIMENSION', DEFAULT_DIMENSION)),
        int(os.environ.get('MAX_RETRIES', DEFAULT_MAX_RETRIES)),
        int(os.environ.get('RETRY_DELAY', DEFAULT_RETRY_DELAY)),
    )
|
requirements.txt
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
gradio==5.11.0
|
2 |
+
pymilvus==2.5.3
|
3 |
+
sentence-transformers==3.3.1
|
4 |
+
openai==1.59.5
|
5 |
+
langchain==0.3.14
|
6 |
+
python-dotenv
|
7 |
+
langchain-community==0.3.14
|
8 |
+
langchain-openai==0.2.14
|
requirements_dev.txt
ADDED
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
aiofiles==23.2.1
|
2 |
+
aiohappyeyeballs==2.4.4
|
3 |
+
aiohttp==3.11.11
|
4 |
+
aiosignal==1.3.2
|
5 |
+
altair==5.5.0
|
6 |
+
annotated-types==0.7.0
|
7 |
+
anyio==4.8.0
|
8 |
+
asttokens==2.4.1
|
9 |
+
attrs==24.3.0
|
10 |
+
blinker==1.9.0
|
11 |
+
cachetools==5.5.0
|
12 |
+
certifi==2024.12.14
|
13 |
+
charset-normalizer==3.4.1
|
14 |
+
click==8.1.8
|
15 |
+
colorama==0.4.6
|
16 |
+
comm==0.2.2
|
17 |
+
dataclasses-json==0.6.7
|
18 |
+
debugpy==1.8.1
|
19 |
+
decorator==5.1.1
|
20 |
+
distro==1.9.0
|
21 |
+
executing==2.0.1
|
22 |
+
fastapi==0.115.6
|
23 |
+
ffmpy==0.5.0
|
24 |
+
filelock==3.16.1
|
25 |
+
fpdf==1.7.2
|
26 |
+
frozenlist==1.5.0
|
27 |
+
fsspec==2024.12.0
|
28 |
+
gitdb==4.0.12
|
29 |
+
GitPython==3.1.44
|
30 |
+
gradio==5.11.0
|
31 |
+
gradio_client==1.5.3
|
32 |
+
greenlet==3.1.1
|
33 |
+
grpcio==1.67.1
|
34 |
+
h11==0.14.0
|
35 |
+
httpcore==1.0.7
|
36 |
+
httpx==0.28.1
|
37 |
+
httpx-sse==0.4.0
|
38 |
+
huggingface-hub==0.27.1
|
39 |
+
idna==3.10
|
40 |
+
ipykernel==6.29.4
|
41 |
+
ipython==8.25.0
|
42 |
+
jedi==0.19.1
|
43 |
+
Jinja2==3.1.5
|
44 |
+
jiter==0.8.2
|
45 |
+
joblib==1.4.2
|
46 |
+
jsonpatch==1.33
|
47 |
+
jsonpointer==3.0.0
|
48 |
+
jsonschema==4.23.0
|
49 |
+
jsonschema-specifications==2024.10.1
|
50 |
+
jupyter_client==8.6.2
|
51 |
+
jupyter_core==5.7.2
|
52 |
+
langchain==0.3.14
|
53 |
+
langchain-community==0.3.14
|
54 |
+
langchain-core==0.3.29
|
55 |
+
langchain-openai==0.2.14
|
56 |
+
langchain-text-splitters==0.3.5
|
57 |
+
langsmith==0.2.10
|
58 |
+
markdown-it-py==3.0.0
|
59 |
+
MarkupSafe==2.1.5
|
60 |
+
marshmallow==3.24.2
|
61 |
+
matplotlib-inline==0.1.7
|
62 |
+
mdurl==0.1.2
|
63 |
+
mpmath==1.3.0
|
64 |
+
multidict==6.1.0
|
65 |
+
mypy-extensions==1.0.0
|
66 |
+
narwhals==1.21.1
|
67 |
+
nest-asyncio==1.6.0
|
68 |
+
networkx==3.4.2
|
69 |
+
numpy==2.2.1
|
70 |
+
openai==1.59.5
|
71 |
+
orjson==3.10.14
|
72 |
+
packaging==24.1
|
73 |
+
pandas==2.2.3
|
74 |
+
parso==0.8.4
|
75 |
+
pillow==11.1.0
|
76 |
+
platformdirs==4.2.2
|
77 |
+
prompt_toolkit==3.0.47
|
78 |
+
propcache==0.2.1
|
79 |
+
protobuf==5.29.3
|
80 |
+
psutil==6.0.0
|
81 |
+
pure-eval==0.2.2
|
82 |
+
pyarrow==18.1.0
|
83 |
+
pydantic==2.10.4
|
84 |
+
pydantic-settings==2.7.1
|
85 |
+
pydantic_core==2.27.2
|
86 |
+
pydeck==0.9.1
|
87 |
+
pydub==0.25.1
|
88 |
+
Pygments==2.18.0
|
89 |
+
pymilvus==2.5.3
|
90 |
+
python-dateutil==2.9.0.post0
|
91 |
+
python-dotenv==1.0.1
|
92 |
+
python-multipart==0.0.20
|
93 |
+
pytz==2024.2
|
94 |
+
pywin32==306
|
95 |
+
PyYAML==6.0.2
|
96 |
+
pyzmq==26.0.3
|
97 |
+
referencing==0.35.1
|
98 |
+
regex==2024.11.6
|
99 |
+
requests==2.32.3
|
100 |
+
requests-toolbelt==1.0.0
|
101 |
+
rich==13.9.4
|
102 |
+
rpds-py==0.22.3
|
103 |
+
ruff==0.8.6
|
104 |
+
safehttpx==0.1.6
|
105 |
+
safetensors==0.5.2
|
106 |
+
scikit-learn==1.6.0
|
107 |
+
scipy==1.15.0
|
108 |
+
semantic-version==2.10.0
|
109 |
+
sentence-transformers==3.3.1
|
110 |
+
setuptools==75.1.0
|
111 |
+
shellingham==1.5.4
|
112 |
+
six==1.16.0
|
113 |
+
smmap==5.0.2
|
114 |
+
sniffio==1.3.1
|
115 |
+
SQLAlchemy==2.0.36
|
116 |
+
stack-data==0.6.3
|
117 |
+
starlette==0.41.3
|
118 |
+
streamlit==1.41.1
|
119 |
+
streamlit-pdf-viewer==0.0.20
|
120 |
+
sympy==1.13.1
|
121 |
+
tenacity==9.0.0
|
122 |
+
threadpoolctl==3.5.0
|
123 |
+
tiktoken==0.8.0
|
124 |
+
tokenizers==0.21.0
|
125 |
+
toml==0.10.2
|
126 |
+
tomlkit==0.13.2
|
127 |
+
torch==2.5.1
|
128 |
+
tornado==6.4.1
|
129 |
+
tqdm==4.67.1
|
130 |
+
traitlets==5.14.3
|
131 |
+
transformers==4.47.1
|
132 |
+
typer==0.15.1
|
133 |
+
typing-inspect==0.9.0
|
134 |
+
typing_extensions==4.12.2
|
135 |
+
tzdata==2024.2
|
136 |
+
ujson==5.10.0
|
137 |
+
urllib3==2.3.0
|
138 |
+
uvicorn==0.34.0
|
139 |
+
watchdog==6.0.0
|
140 |
+
wcwidth==0.2.13
|
141 |
+
websockets==14.1
|
142 |
+
wheel==0.44.0
|
143 |
+
yarl==1.18.3
|
utils/extract.py
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import sys
|
3 |
+
import pandas as pd
|
4 |
+
|
5 |
+
def display_and_store_directory_content(base_path):
    """
    Display all paths with directories and files along with their content,
    and store the information in a Pandas DataFrame.

    Args:
        base_path (str): The root directory path to scan.

    Returns:
        None: Prints paths and content, and saves the DataFrame as a pickle file
        under extraction/<basename of base_path>.pkl (relative to the cwd).
    """
    data = []  # one {"path", "content"} row per directory or file

    for root, dirs, files in os.walk(base_path):
        # Directories are recorded with empty content.
        for d in dirs:
            dir_path = os.path.join(root, d)
            data.append({"path": dir_path, "content": ""})
            print(f"Directory: {dir_path}")

        # Files store their full text; unreadable files (e.g. binary data)
        # store the error message instead of aborting the scan.
        for f in files:
            file_path = os.path.join(root, f)
            try:
                with open(file_path, 'r', encoding='utf-8') as file:
                    content = file.read()
            except Exception as e:
                content = f"Error reading file: {e}"

            data.append({"path": file_path, "content": content})
            print(f"\nFile: {file_path}")
            print("-" * 40)
            print(content)
            print("-" * 40)

    df = pd.DataFrame(data)

    # Race-free creation of the output directory (the old exists-then-makedirs
    # check could fail if the directory appeared between the two calls).
    extraction_dir = "extraction"
    os.makedirs(extraction_dir, exist_ok=True)

    # Use the last component of the base path as the file name.
    base_name = os.path.basename(os.path.normpath(base_path))
    output_file = os.path.join(extraction_dir, f"{base_name}.pkl")

    df.to_pickle(output_file)
    print(f"\nDataFrame saved to {output_file}")
|
55 |
+
|
56 |
+
if __name__ == "__main__":
    # A directory path must be supplied on the command line.
    if len(sys.argv) < 2:
        print("Usage: python utils\\extract_all_content.py <directory>")
        sys.exit(1)

    directory_path = sys.argv[1]

    if not os.path.exists(directory_path):
        print(f"Error: The path '{directory_path}' does not exist.")
    else:
        display_and_store_directory_content(directory_path)