ruslanmv committed on
Commit
306849a
·
1 Parent(s): 93c7f73

First commit

Browse files
Files changed (8) hide show
  1. .gitignore +5 -0
  2. app.py +268 -0
  3. chat_with_project.py +110 -0
  4. get_prompts.py +62 -0
  5. milvus.py +105 -0
  6. requirements.txt +8 -0
  7. requirements_dev.txt +143 -0
  8. utils/extract.py +69 -0
.gitignore ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+
2
+ /workspace
3
+ /__pycache__
4
+ .env
5
+ /extraction
app.py ADDED
@@ -0,0 +1,268 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import shutil
import subprocess
import sys
import zipfile

import gradio as gr
from dotenv import load_dotenv, set_key

from chat_with_project import query_project
from get_prompts import get_prompt_for_mode
from milvus import initialize_milvus, DEFAULT_MILVUS_HOST, DEFAULT_MILVUS_PORT, DEFAULT_COLLECTION_NAME, DEFAULT_DIMENSION, DEFAULT_MAX_RETRIES, DEFAULT_RETRY_DELAY
10
+
11
# --- Configuration and Setup ---

# Root folders used by the app: uploaded zips are unpacked into
# WORKSPACE_DIR, and utils/extract.py writes its .pkl analysis output
# into EXTRACTION_DIR.
WORKSPACE_DIR = "workspace"
EXTRACTION_DIR = "extraction"
16
+
17
def clear_directories():
    """Delete and recreate the workspace and extraction directories."""
    for target in (WORKSPACE_DIR, EXTRACTION_DIR):
        # Remove any previous contents before recreating the empty folder.
        if os.path.exists(target):
            shutil.rmtree(target)
        os.makedirs(target, exist_ok=True)
23
+
24
# Reset workspace/extraction on startup so stale results from a previous
# session never leak into a new one.
clear_directories()
26
+
27
# --- API Key Management ---

def ensure_env_file_exists():
    """Create an empty .env file in the project root if one is missing."""
    if os.path.exists(".env"):
        return
    # touch-equivalent: opening in write mode creates the empty file
    with open(".env", "w") as env_file:
        pass
34
+
35
def load_api_key():
    """Return the OpenAI API key from .env/environment (None when unset)."""
    ensure_env_file_exists()
    load_dotenv()  # merge .env values into os.environ
    return os.getenv("OPENAI_API_KEY")
40
+
41
def update_api_key(api_key):
    """Persist the given OpenAI API key to .env and reload the environment.

    Returns a status string suitable for display in the Settings tab.
    """
    if not api_key:
        return "API key cannot be empty."
    set_key(".env", "OPENAI_API_KEY", api_key)
    load_dotenv()  # Reload environment variables so the key takes effect now
    return "API key updated successfully."
49
+
50
def is_api_key_set():
    """Return True when an OpenAI API key is currently available."""
    key = load_api_key()
    return bool(key)
53
+
54
+ # --- Core Functionalities ---
55
+
56
def process_zip(zip_file_path):
    """Extract an uploaded zip into the workspace and run the analysis step.

    Args:
        zip_file_path: Path to the uploaded .zip file.

    Returns:
        A human-readable status string (success or error message).
    """
    try:
        # Start from a clean slate so results never mix across uploads.
        clear_directories()

        # SECURITY NOTE(review): extractall() does not guard against
        # zip-slip (entries containing ../); only feed it trusted archives.
        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            zip_ref.extractall(WORKSPACE_DIR)

        # Bug fix: use the current interpreter rather than whatever
        # "python" resolves to on PATH, so the subprocess runs inside the
        # same virtualenv/environment as this app.
        subprocess.run([sys.executable, "./utils/extract.py", WORKSPACE_DIR], check=True)

        return "Processing complete! Results saved in the 'extraction' directory."

    except Exception as e:
        return f"An error occurred: {e}"
73
+
74
def init_milvus(milvus_host, milvus_port, collection_name, dimension, max_retries, retry_delay):
    """Initialize or load the Milvus vector database from UI inputs.

    All numeric inputs arrive as strings (Gradio textboxes) and are coerced
    to int before being handed to initialize_milvus.
    """
    try:
        port_number = int(milvus_port)
        vector_dim = int(dimension)
        retry_count = int(max_retries)
        delay_seconds = int(retry_delay)

        initialize_milvus(milvus_host, port_number, collection_name,
                          vector_dim, retry_count, delay_seconds)
        return "Milvus database initialized or loaded successfully."

    except Exception as e:
        return f"Error initializing Milvus: {e}"
88
+
89
+ # --- Chatbot Verification ---
90
+
91
def is_project_loaded():
    """Return True if a processed project exists (a .pkl in 'extraction').

    Bug fix: the original called os.listdir unconditionally and raised
    FileNotFoundError when the extraction directory did not exist (e.g.
    before the first upload); now it simply reports False.
    """
    extraction_dir = "extraction"
    if not os.path.isdir(extraction_dir):
        return False
    return any(name.endswith('.pkl') for name in os.listdir(extraction_dir))
96
+ # --- Gradio UI Components ---
97
+
98
+ # Chat Interface
99
def chat_ui(query, history, mode):
    """Handle one chat turn for analyzer/debugger/developer modes.

    Args:
        query: The user's question.
        history: Chat history as a list of (user, assistant) pairs, or None.
        mode: One of "analyzer", "debugger", "developer".

    Returns:
        (history, history): the updated history, delivered both to the
        Chatbot component and back into the gr.State slot.
    """
    # Initialize history first so every path can return a consistent tuple.
    if history is None:
        history = []

    # Bug fix: the original error paths returned a bare string for the
    # Chatbot output (which expects a list of message pairs); surface
    # errors as chat messages instead so the UI renders them correctly.
    if not load_api_key():
        history.append((query, "Error: OpenAI API key not set. Please set the API key in the Settings tab."))
        return history, history

    if not is_project_loaded():
        history.append((query, "Error: No project loaded. Please upload and process a ZIP file first."))
        return history, history

    print(f"Chat Mode: {mode}")
    system_prompt = get_prompt_for_mode(mode)
    print(f"System Prompt: {system_prompt}")

    # Pass the query and system prompt to the LLM (RAG pipeline).
    response = query_project(query, system_prompt)
    print(f"Response from query_project: {response}")

    if response is None or not response.strip():
        response = "An error occurred during processing. Please check the logs."

    if mode == "developer":
        # Developer mode emits whole files; render each one as a code block.
        extracted_files = extract_files_from_response(response)

        developer_response = ""
        for filepath, content in extracted_files.items():
            developer_response += f"**{filepath}:**\n`python\n{content}\n`\n\n"

        history.append((query, developer_response))
        return history, history

    # Bug fix: markdown needs TWO trailing spaces to force a line break
    # (the original comment said so but inserted only one).
    formatted_response = response.replace('\n', '  \n')
    history.append((query, formatted_response))
    return history, history
141
+
142
+
143
def extract_files_from_response(response):
    """Parse an LLM response into {filepath: code} using BEGIN/END markers.

    The developer-mode prompt asks the model to wrap each file in
    "--- BEGIN FILE: <path>" / "--- END FILE: <path>" lines; everything
    between the markers is captured verbatim as that file's content.

    Args:
        response (str): The raw response string from the LLM.

    Returns:
        dict: Mapping of file path -> file content.
    """
    files = {}
    current_file = None
    current_content = []

    for line in response.splitlines():
        if line.startswith("--- BEGIN FILE:"):
            if current_file is not None:
                # A new BEGIN arrived before the previous END: close out
                # the old file so its content is not lost.
                files[current_file] = "\n".join(current_content)

            current_file = line.replace("--- BEGIN FILE:", "").strip()
            current_content = []
        elif line.startswith("--- END FILE:"):
            if current_file is not None:
                files[current_file] = "\n".join(current_content)
                current_file = None
                current_content = []
        elif current_file is not None:
            current_content.append(line)

    # Bug fix: if the response was truncated before the final END marker,
    # keep the partial file instead of silently dropping it.
    if current_file is not None:
        files[current_file] = "\n".join(current_content)

    return files
177
+
178
# ZIP Processing Interface: upload a project archive for extraction/analysis.
zip_iface = gr.Interface(
    fn=process_zip,
    inputs=gr.File(label="Upload ZIP File"),
    outputs="text",
    title="Zip File Analyzer",
    description="Upload a zip file to analyze and store its contents.",
)

# Milvus Initialization Interface: every input is a Textbox, so numeric
# settings arrive as strings and are converted to int inside init_milvus.
milvus_iface = gr.Interface(
    fn=init_milvus,
    inputs=[
        gr.Textbox(label="Milvus Host", placeholder=DEFAULT_MILVUS_HOST, value=DEFAULT_MILVUS_HOST),
        gr.Textbox(label="Milvus Port", placeholder=DEFAULT_MILVUS_PORT, value=DEFAULT_MILVUS_PORT),
        gr.Textbox(label="Collection Name", placeholder=DEFAULT_COLLECTION_NAME, value=DEFAULT_COLLECTION_NAME),
        gr.Textbox(label="Dimension", placeholder=str(DEFAULT_DIMENSION), value=str(DEFAULT_DIMENSION)),
        gr.Textbox(label="Max Retries", placeholder=str(DEFAULT_MAX_RETRIES), value=str(DEFAULT_MAX_RETRIES)),
        gr.Textbox(label="Retry Delay (seconds)", placeholder=str(DEFAULT_RETRY_DELAY), value=str(DEFAULT_RETRY_DELAY))
    ],
    outputs="text",
    title="Milvus Database Initialization",
    description="Initialize or load the Milvus vector database.",
)

# Gradio Chatbot UI Interface: chat_ui returns (history, history), which
# matches the [Chatbot, state] outputs declared below.
chat_iface = gr.Interface(
    fn=chat_ui,
    inputs=[
        gr.Textbox(label="Ask a question", placeholder="Type your question here"),
        gr.State(),  # Maintains chat history across turns
        gr.Radio(["analyzer", "debugger", "developer"], label="Chat Mode", value="analyzer")
    ],
    outputs=[
        gr.Chatbot(label="Chat with Project"),
        "state"  # This is to store the state,
    ],
    title="Chat with your Project",
    description="Ask questions about the data extracted from the zip file.",
    # Example usage - Corrected to only include instruction and mode
    examples=[
        ["What is this project about?", "analyzer"],
        ["Are there any potential bugs?", "debugger"],
        ["How does the data flow through the application?", "analyzer"],
        ["Explain the main components of the architecture.", "analyzer"],
        ["What are the dependencies of this project?", "analyzer"],
        ["Are there any potential memory leaks?", "debugger"],
        ["Identify any areas where the code could be optimized.","debugger"],
        ["Implement basic logging for the main application and save logs to a file.", "developer"],
        ["Use try/except blocks in main functions to handle exceptions", "developer"]

    ],
)

# Settings Interface: stores the OpenAI API key into the local .env file.
settings_iface = gr.Interface(
    fn=update_api_key,
    inputs=gr.Textbox(label="OpenAI API Key", type="password"),
    outputs="text",
    title="Settings",
    description="Set your OpenAI API key.",
)
240
+
241
# Status Interface
def get_api_key_status():
    """Return a one-line, human-readable API-key status for the Status tab."""
    if is_api_key_set():
        return "API key status: Set"
    return "API key status: Not set"
247
+
248
# API key status readout; live=True lets the UI re-evaluate it.
status_iface = gr.Interface(
    fn=get_api_key_status,
    inputs=None,
    outputs="text",
    live=True,
    title="API Key Status"
)

# Add credits to the UI
# NOTE(review): `credits` is created but never included in the
# TabbedInterface below, so it is not rendered anywhere — confirm intent.
credits = gr.Markdown("## Credits\n\nCreated by [Ruslan Magana Vsevolodovna](https://ruslanmv.com/)")

# --- Main Application Launch ---

# Combine the interfaces using Tabs; the label list parallels the
# interface list one-to-one.
demo = gr.TabbedInterface(
    [zip_iface, milvus_iface, chat_iface, settings_iface, status_iface],
    ["Process ZIP", "Init Milvus", "Chat with Project", "Settings", "Status"],
)

# Launch the app with credits
demo.queue().launch()
chat_with_project.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pymilvus import connections, Collection, utility
2
+ from sentence_transformers import SentenceTransformer
3
+ from langchain_openai import ChatOpenAI # Updated import
4
+ from langchain_core.prompts import ChatPromptTemplate
5
+ from langchain_core.output_parsers import StrOutputParser
6
+ from langchain_core.messages import HumanMessage, SystemMessage, AIMessage
7
+ import os
8
+
9
# Milvus connection details — must match the host/port/collection used by
# milvus.py when the collection was created and populated.
MILVUS_HOST = 'localhost'
MILVUS_PORT = '19530'
COLLECTION_NAME = 'document_collection'
13
+
14
def load_api_key():
    """Return the OpenAI API key from the .env file or process environment."""
    from dotenv import load_dotenv

    load_dotenv()  # merge .env values into os.environ
    return os.getenv("OPENAI_API_KEY")
19
+
20
# Sentence-embedding model used to encode queries; its output dimension
# must match the Milvus "content_vector" field (presumably 384, per
# milvus.py's DEFAULT_DIMENSION — verify if the model is changed).
model = SentenceTransformer('all-MiniLM-L6-v2')
22
+
23
def retrieve_relevant_documents(query, top_k=5):
    """Return the paths of the top_k most similar documents from Milvus.

    Args:
        query: Natural-language query string.
        top_k: Maximum number of document paths to return.

    Returns:
        list: Document paths, best match first; empty when the collection
        does not exist.
    """
    print(f"Connecting to Milvus at {MILVUS_HOST}:{MILVUS_PORT}...")
    connections.connect(host=MILVUS_HOST, port=MILVUS_PORT)
    relevant_docs = []
    try:
        if utility.has_collection(COLLECTION_NAME):
            collection = Collection(COLLECTION_NAME)
            collection.load()

            query_vector = model.encode([query]).tolist()
            print(f"Encoded Query Vector: {query_vector}")

            search_params = {
                "metric_type": "L2",
                "params": {"nprobe": 16}
            }
            search_results = collection.search(
                data=query_vector,
                anns_field="content_vector",
                param=search_params,
                limit=top_k,
                expr=None,
                output_fields=["path"]
            )

            for hit in search_results[0]:
                doc_path = hit.entity.get("path")
                relevant_docs.append(doc_path)

            print(f"Relevant Docs: {relevant_docs}")
        else:
            print(f"Collection {COLLECTION_NAME} does not exist.")
    finally:
        # Bug fix: always release the connection; the original only
        # disconnected on the happy path, leaking the connection when the
        # collection was missing or the search raised.
        connections.disconnect(alias='default')

    return relevant_docs
61
+
62
+
63
def generate_response_with_gpt(query, relevant_docs, system_prompt):
    """Generate an answer with OpenAI chat, grounding it in relevant_docs.

    Args:
        query: The user's question.
        relevant_docs: Paths of documents to inline as extra context.
        system_prompt: Mode-specific system instructions.

    Returns:
        str: The model's reply, or a fallback error message on API failure.

    Raises:
        ValueError: If no OpenAI API key is configured.
    """
    api_key = load_api_key()
    if not api_key:
        raise ValueError("OpenAI API key not set. Please set it in the .env file or environment variables.")

    # Security fix: never log key material — the original printed the
    # first five characters of the API key.
    print("OpenAI API key loaded.")
    chat = ChatOpenAI(temperature=0.7, openai_api_key=api_key, model_name="gpt-3.5-turbo")

    messages = [SystemMessage(content=system_prompt)]
    if relevant_docs:
        doc_content = ""
        for doc_path in relevant_docs:
            # Skip non-files: the extraction DataFrame also stores
            # directory entries (with empty content).
            if os.path.isfile(doc_path):
                try:
                    with open(doc_path, "r", encoding="utf-8") as f:
                        doc_content += f.read() + "\n"
                except Exception as e:
                    print(f"Error reading document {doc_path}: {e}")
        if doc_content:
            messages.append(HumanMessage(content=f"Relevant documents:\n{doc_content}"))

    messages.append(HumanMessage(content=query))
    print(f"Messages sent to OpenAI API: {messages}")

    try:
        response = chat.invoke(messages)
        print(f"OpenAI API Response: {response.content}")
        return response.content
    except Exception as e:
        print(f"Error during OpenAI API call: {e}")
        return "Error generating response. Please try again later."
98
+
99
+
100
def query_project(query, system_prompt):
    """Answer a question about the loaded project via RAG.

    Retrieves relevant document paths from Milvus, then asks the LLM with
    those documents inlined as context.
    """
    docs = retrieve_relevant_documents(query)
    print(" Starting the query:")
    print(query)
    answer = generate_response_with_gpt(query, docs, system_prompt)
    print(f"Query Response: {answer}")
    print("Type response", type(answer))
    return answer
get_prompts.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.prompts import PromptTemplate
2
+
3
+ ANALYZER_PROMPT_TEMPLATE = """
4
+ You are a code analyzer AI. Your task is to analyze the project's structure,
5
+ purpose, and functionality. Explain how different components interact,
6
+ discuss the overall architecture, and provide insights into the project's design.
7
+ Consider the context provided and try to be comprehensive in your analysis.
8
+
9
+ Relevant context: {context}
10
+
11
+ Explain in detail, based on the context provided.
12
+ """
13
+
14
+ DEBUGGER_PROMPT_TEMPLATE = """
15
+ You are a code debugger AI. Your task is to identify potential bugs,
16
+ errors, and areas for improvement in the project's code. Analyze the given code
17
+ for logic errors, performance bottlenecks, and suggest fixes or improvements.
18
+ If the user asks how to fix an issue, provide the corrected code snippet.
19
+
20
+ Relevant context: {context}
21
+
22
+ Focus on identifying issues and providing solutions or improvements based on the context provided.
23
+ """
24
+
25
+ DEVELOPER_PROMPT_TEMPLATE = """
26
+ You are a software developer AI. Your task is to modify or extend existing code based on user requests.
27
+ When a user asks to add a feature or modify existing functionality, you should:
28
+
29
+ 1. Identify the files that need to be modified or created.
30
+ 2. Output the full, updated code for each file that needs changes.
31
+ 3. Clearly indicate the filename before each code block using this format:
32
+ ```
33
+ --- BEGIN FILE: <filepath> ---
34
+ <full code of the file>
35
+ --- END FILE: <filepath> ---
36
+ ```
37
+ 4. If a new file needs to be created, use the same format and specify the new file's path and name.
38
+ 5. **Do not omit any part of the code**. Output the entire content of each modified or new file.
39
+ 6. Ensure that the generated code is functional, well-structured, and integrates seamlessly with the existing project.
40
+ 7. Explain any additional setup or configuration steps if necessary.
41
+
42
+ Remember to consider the existing project's structure and coding style when making modifications.
43
+
44
+ Relevant context: {context}
45
+
46
+ User request: {question}
47
+
48
+ Modify or extend the code as requested, providing the full code for each relevant file.
49
+ """
50
+
51
def get_prompt_for_mode(mode):
    """Return the prompt template for the given chat mode.

    Args:
        mode: One of "analyzer", "debugger", "developer".

    Raises:
        ValueError: If mode is not a recognized chat mode.
    """
    if mode == "analyzer":
        return ANALYZER_PROMPT_TEMPLATE
    if mode == "debugger":
        return DEBUGGER_PROMPT_TEMPLATE
    if mode == "developer":
        return DEVELOPER_PROMPT_TEMPLATE
    raise ValueError(f"Invalid mode: {mode}")
milvus.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # milvus.py
2
+ from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection, utility
3
+ import pandas as pd
4
+ import os
5
+ import sys
6
+ from sentence_transformers import SentenceTransformer
7
+ import time
8
+
9
# Default Milvus connection details; each value can be overridden via an
# environment variable in the __main__ block at the bottom of this file.
DEFAULT_MILVUS_HOST = 'localhost'
DEFAULT_MILVUS_PORT = '19530'
DEFAULT_COLLECTION_NAME = 'document_collection'
DEFAULT_DIMENSION = 384  # Adjust based on your embedding model
DEFAULT_MAX_RETRIES = 3
DEFAULT_RETRY_DELAY = 5  # seconds

# Embedding model used to vectorize document content before insertion;
# its output dimension must match DEFAULT_DIMENSION above.
model = SentenceTransformer('all-MiniLM-L6-v2')
19
+
20
def create_milvus_collection(host, port, collection_name, dimension):
    """Create the document collection (with an IVF_FLAT index) if absent.

    Note: host/port are currently unused here — the caller is expected to
    have already opened the default pymilvus connection.
    """
    if utility.has_collection(collection_name):
        print(f"Collection {collection_name} already exists.")
        return

    # Schema: auto-generated primary key, source path, and content vector.
    schema = CollectionSchema(
        [
            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
            FieldSchema(name="path", dtype=DataType.VARCHAR, max_length=500),
            FieldSchema(name="content_vector", dtype=DataType.FLOAT_VECTOR, dim=dimension),
        ],
        "Document Vector Store",
    )
    collection = Collection(collection_name, schema, consistency_level="Strong")

    collection.create_index(
        field_name="content_vector",
        index_params={
            "metric_type": "L2",
            "index_type": "IVF_FLAT",
            "params": {"nlist": 1024},
        },
    )
    print(f"Collection {collection_name} created and index built.")
42
+
43
def load_data_to_milvus(host, port, collection_name):
    """Embed the extracted project contents and insert them into Milvus.

    Reads the first .pkl DataFrame found in the 'extraction' directory;
    each row must provide 'path' and 'content' columns. host/port are
    unused here (the default connection is assumed to be open).
    """
    extraction_dir = "extraction"
    pkl_files = [name for name in os.listdir(extraction_dir) if name.endswith('.pkl')]
    if not pkl_files:
        print("No .pkl files found in the 'extraction' directory.")
        return

    df_path = os.path.join(extraction_dir, pkl_files[0])
    df = pd.read_pickle(df_path)

    # Encode every entry's content into a vector column.
    df['content_vector'] = df['content'].apply(lambda text: model.encode(text).tolist())

    collection = Collection(collection_name)
    collection.insert([
        df['path'].tolist(),
        df['content_vector'].tolist()
    ])
    collection.flush()

    print(f"Data from {df_path} loaded into Milvus collection {collection_name}.")
69
+
70
def connect_to_milvus(host, port, max_retries, retry_delay):
    """Connect to Milvus, retrying up to max_retries times.

    Args:
        host: Milvus server host.
        port: Milvus server port.
        max_retries: Total number of connection attempts.
        retry_delay: Seconds to sleep between attempts.

    Returns:
        bool: True on success, False after exhausting all attempts.
        (Bug fix: with max_retries <= 0 the original while-loop never ran
        and the function fell through, implicitly returning None without
        any log message; now it returns an explicit False.)
    """
    for attempt in range(1, max_retries + 1):
        try:
            connections.connect(host=host, port=port)
            print(f"Successfully connected to Milvus at {host}:{port}")
            return True
        except Exception as e:
            print(f"Error connecting to Milvus: {e}")
            if attempt < max_retries:
                print(f"Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
    print("Max retries reached. Could not connect to Milvus.")
    return False
87
+
88
def initialize_milvus(host, port, collection_name, dimension, max_retries, retry_delay):
    """Connect, ensure the collection exists, load extracted data, disconnect."""
    if not connect_to_milvus(host, port, max_retries, retry_delay):
        return
    create_milvus_collection(host, port, collection_name, dimension)
    load_data_to_milvus(host, port, collection_name)
    connections.disconnect(alias='default')
94
+
95
+
96
+ if __name__ == "__main__":
97
+ # Use default values or environment variables if available
98
+ milvus_host = os.environ.get('MILVUS_HOST', DEFAULT_MILVUS_HOST)
99
+ milvus_port = os.environ.get('MILVUS_PORT', DEFAULT_MILVUS_PORT)
100
+ collection_name = os.environ.get('COLLECTION_NAME', DEFAULT_COLLECTION_NAME)
101
+ dimension = int(os.environ.get('DIMENSION', DEFAULT_DIMENSION))
102
+ max_retries = int(os.environ.get('MAX_RETRIES', DEFAULT_MAX_RETRIES))
103
+ retry_delay = int(os.environ.get('RETRY_DELAY', DEFAULT_RETRY_DELAY))
104
+
105
+ initialize_milvus(milvus_host, milvus_port, collection_name, dimension, max_retries, retry_delay)
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ gradio==5.11.0
2
+ pymilvus==2.5.3
3
+ sentence-transformers==3.3.1
4
+ openai==1.59.5
5
+ langchain==0.3.14
6
+ python-dotenv
7
+ langchain-community==0.3.14
8
+ langchain-openai==0.2.14
requirements_dev.txt ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiofiles==23.2.1
2
+ aiohappyeyeballs==2.4.4
3
+ aiohttp==3.11.11
4
+ aiosignal==1.3.2
5
+ altair==5.5.0
6
+ annotated-types==0.7.0
7
+ anyio==4.8.0
8
+ asttokens==2.4.1
9
+ attrs==24.3.0
10
+ blinker==1.9.0
11
+ cachetools==5.5.0
12
+ certifi==2024.12.14
13
+ charset-normalizer==3.4.1
14
+ click==8.1.8
15
+ colorama==0.4.6
16
+ comm==0.2.2
17
+ dataclasses-json==0.6.7
18
+ debugpy==1.8.1
19
+ decorator==5.1.1
20
+ distro==1.9.0
21
+ executing==2.0.1
22
+ fastapi==0.115.6
23
+ ffmpy==0.5.0
24
+ filelock==3.16.1
25
+ fpdf==1.7.2
26
+ frozenlist==1.5.0
27
+ fsspec==2024.12.0
28
+ gitdb==4.0.12
29
+ GitPython==3.1.44
30
+ gradio==5.11.0
31
+ gradio_client==1.5.3
32
+ greenlet==3.1.1
33
+ grpcio==1.67.1
34
+ h11==0.14.0
35
+ httpcore==1.0.7
36
+ httpx==0.28.1
37
+ httpx-sse==0.4.0
38
+ huggingface-hub==0.27.1
39
+ idna==3.10
40
+ ipykernel==6.29.4
41
+ ipython==8.25.0
42
+ jedi==0.19.1
43
+ Jinja2==3.1.5
44
+ jiter==0.8.2
45
+ joblib==1.4.2
46
+ jsonpatch==1.33
47
+ jsonpointer==3.0.0
48
+ jsonschema==4.23.0
49
+ jsonschema-specifications==2024.10.1
50
+ jupyter_client==8.6.2
51
+ jupyter_core==5.7.2
52
+ langchain==0.3.14
53
+ langchain-community==0.3.14
54
+ langchain-core==0.3.29
55
+ langchain-openai==0.2.14
56
+ langchain-text-splitters==0.3.5
57
+ langsmith==0.2.10
58
+ markdown-it-py==3.0.0
59
+ MarkupSafe==2.1.5
60
+ marshmallow==3.24.2
61
+ matplotlib-inline==0.1.7
62
+ mdurl==0.1.2
63
+ mpmath==1.3.0
64
+ multidict==6.1.0
65
+ mypy-extensions==1.0.0
66
+ narwhals==1.21.1
67
+ nest-asyncio==1.6.0
68
+ networkx==3.4.2
69
+ numpy==2.2.1
70
+ openai==1.59.5
71
+ orjson==3.10.14
72
+ packaging==24.1
73
+ pandas==2.2.3
74
+ parso==0.8.4
75
+ pillow==11.1.0
76
+ platformdirs==4.2.2
77
+ prompt_toolkit==3.0.47
78
+ propcache==0.2.1
79
+ protobuf==5.29.3
80
+ psutil==6.0.0
81
+ pure-eval==0.2.2
82
+ pyarrow==18.1.0
83
+ pydantic==2.10.4
84
+ pydantic-settings==2.7.1
85
+ pydantic_core==2.27.2
86
+ pydeck==0.9.1
87
+ pydub==0.25.1
88
+ Pygments==2.18.0
89
+ pymilvus==2.5.3
90
+ python-dateutil==2.9.0.post0
91
+ python-dotenv==1.0.1
92
+ python-multipart==0.0.20
93
+ pytz==2024.2
94
+ pywin32==306
95
+ PyYAML==6.0.2
96
+ pyzmq==26.0.3
97
+ referencing==0.35.1
98
+ regex==2024.11.6
99
+ requests==2.32.3
100
+ requests-toolbelt==1.0.0
101
+ rich==13.9.4
102
+ rpds-py==0.22.3
103
+ ruff==0.8.6
104
+ safehttpx==0.1.6
105
+ safetensors==0.5.2
106
+ scikit-learn==1.6.0
107
+ scipy==1.15.0
108
+ semantic-version==2.10.0
109
+ sentence-transformers==3.3.1
110
+ setuptools==75.1.0
111
+ shellingham==1.5.4
112
+ six==1.16.0
113
+ smmap==5.0.2
114
+ sniffio==1.3.1
115
+ SQLAlchemy==2.0.36
116
+ stack-data==0.6.3
117
+ starlette==0.41.3
118
+ streamlit==1.41.1
119
+ streamlit-pdf-viewer==0.0.20
120
+ sympy==1.13.1
121
+ tenacity==9.0.0
122
+ threadpoolctl==3.5.0
123
+ tiktoken==0.8.0
124
+ tokenizers==0.21.0
125
+ toml==0.10.2
126
+ tomlkit==0.13.2
127
+ torch==2.5.1
128
+ tornado==6.4.1
129
+ tqdm==4.67.1
130
+ traitlets==5.14.3
131
+ transformers==4.47.1
132
+ typer==0.15.1
133
+ typing-inspect==0.9.0
134
+ typing_extensions==4.12.2
135
+ tzdata==2024.2
136
+ ujson==5.10.0
137
+ urllib3==2.3.0
138
+ uvicorn==0.34.0
139
+ watchdog==6.0.0
140
+ wcwidth==0.2.13
141
+ websockets==14.1
142
+ wheel==0.44.0
143
+ yarl==1.18.3
utils/extract.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import pandas as pd
4
+
5
def display_and_store_directory_content(base_path):
    """
    Walk base_path, print every directory and file (with file contents),
    and persist the listing as a pandas DataFrame pickle.

    Args:
        base_path (str): The root directory path to scan.

    Returns:
        None: The DataFrame is written to extraction/<basename>.pkl.
    """
    rows = []  # one {"path": ..., "content": ...} record per entry

    for root, dirs, files in os.walk(base_path):
        # Directories are recorded with empty content.
        for dirname in dirs:
            full_dir = os.path.join(root, dirname)
            rows.append({"path": full_dir, "content": ""})
            print(f"Directory: {full_dir}")

        # Files are recorded with their text content (or a read error note).
        for filename in files:
            full_file = os.path.join(root, filename)
            try:
                with open(full_file, 'r', encoding='utf-8') as handle:
                    text = handle.read()
            except Exception as e:
                text = f"Error reading file: {e}"

            rows.append({"path": full_file, "content": text})
            print(f"\nFile: {full_file}")
            print("-" * 40)
            print(text)
            print("-" * 40)

    frame = pd.DataFrame(rows)

    # Make sure the output directory exists before writing.
    extraction_dir = "extraction"
    os.makedirs(extraction_dir, exist_ok=True)

    # Use the last component of the scanned path as the pickle's name.
    base_name = os.path.basename(os.path.normpath(base_path))
    output_file = os.path.join(extraction_dir, f"{base_name}.pkl")

    frame.to_pickle(output_file)
    print(f"\nDataFrame saved to {output_file}")
55
+
56
+ if __name__ == "__main__":
57
+ # Ensure a directory path is provided as an argument
58
+ if len(sys.argv) < 2:
59
+ print("Usage: python utils\\extract_all_content.py <directory>")
60
+ sys.exit(1)
61
+
62
+ # Get the directory path from the command-line arguments
63
+ directory_path = sys.argv[1]
64
+
65
+ # Execute the function
66
+ if os.path.exists(directory_path):
67
+ display_and_store_directory_content(directory_path)
68
+ else:
69
+ print(f"Error: The path '{directory_path}' does not exist.")