Update app.py
app.py
CHANGED
@@ -4,21 +4,11 @@ from parser import parse_python_code
 import os
 import json
 import io
-import chromadb
-from sklearn.metrics.pairwise import cosine_similarity
-import numpy as np
-from datasets import Dataset, load_dataset
-from transformers import AutoTokenizer, AutoModel
-import torch
 import subprocess # To call process_hf_dataset.py

 # User-configurable variables
 DB_NAME = "python_programs" # ChromaDB collection name
-HF_DATASET_NAME = "python_program_vectors" # Hugging Face Dataset name
-HF_KEY = "YOUR_HUGGINGFACE_KEY" # Replace with your Hugging Face API key
 UPLOAD_DIR = "./uploads" # Directory for uploads
-PERSIST_DIR = "./chroma_data" # Directory for persistent ChromaDB storage
-USE_GPU = False # Default to CPU, set to True for GPU if available

 app = Flask(__name__)

@@ -27,222 +17,6 @@ def reconstruct_code(parts):
     sorted_parts = sorted(parts, key=lambda p: p['location'][0])
     return ''.join(part['source'] for part in sorted_parts)

-def init_chromadb(persist_dir=PERSIST_DIR):
-    """Initialize ChromaDB client, optionally with persistent storage."""
-    try:
-        # Use persistent storage if directory exists, otherwise in-memory
-        if os.path.exists(persist_dir):
-            client = chromadb.PersistentClient(path=persist_dir)
-        else:
-            client = chromadb.Client()
-        return client
-    except Exception as e:
-        print(f"Error initializing ChromaDB: {e}")
-        return chromadb.Client() # Fallback to in-memory
-
-def create_collection(client, collection_name=DB_NAME):
-    """Create or get a ChromaDB collection for Python programs."""
-    try:
-        collection = client.get_collection(name=collection_name)
-    except:
-        collection = client.create_collection(name=collection_name)
-    return collection
-
-def store_program(client, code, sequence, vectors, collection_name=DB_NAME):
-    """Store a program in ChromaDB with its code, sequence, and vectors."""
-    collection = create_collection(client, collection_name)
-
-    # Flatten vectors to ensure they are a list of numbers (ChromaDB expects flat embeddings)
-    flattened_vectors = [item for sublist in vectors for item in sublist]
-
-    # Store program data (ID, code, sequence, vectors)
-    program_id = str(hash(code)) # Use hash of code as ID for uniqueness
-    collection.add(
-        documents=[code],
-        metadatas=[{"sequence": ",".join(sequence), "description_tokens": " ".join(generate_description_tokens(sequence, vectors))}],
-        ids=[program_id],
-        embeddings=[flattened_vectors] # Pass as flat list
-    )
-    return program_id
-
-def populate_sample_db(client):
-    """Populate ChromaDB with sample Python programs."""
-    samples = [
-        """
-import os
-def add_one(x):
-    y = x + 1
-    return y
-""",
-        """
-def multiply(a, b):
-    c = a * b
-    if c > 0:
-        return c
-"""
-    ]
-
-    for code in samples:
-        parts, sequence = parse_python_code(code)
-        vectors = [part['vector'] for part in parts]
-        store_program(client, code, sequence, vectors)
-
-def query_programs(client, operations, collection_name=DB_NAME, top_k=5, semantic_query=None):
-    """Query ChromaDB for programs matching the operations sequence or semantic description."""
-    collection = create_collection(client, collection_name)
-
-    if semantic_query:
-        # Semantic search using CodeBERT embeddings
-        query_vector = generate_semantic_vector(semantic_query)
-        results = collection.query(
-            query_embeddings=[query_vector],
-            n_results=top_k,
-            include=["documents", "metadatas"]
-        )
-    else:
-        # Vector-based search for operations sequence
-        query_vector = sum([create_vector(op, 0, (1, 1), 100, []) for op in operations], []) / len(operations) if operations else [0] * 6
-        results = collection.query(
-            query_embeddings=[query_vector],
-            n_results=top_k,
-            include=["documents", "metadatas"]
-        )
-
-    # Process results
-    matching_programs = []
-    for doc, meta in zip(results['documents'][0], results['metadatas'][0]):
-        sequence = meta['sequence'].split(',')
-        if not semantic_query or is_subsequence(operations, sequence): # Ensure sequence match for operations
-            try:
-                # Reconstruct program vectors (flatten if needed)
-                doc_vectors = eval(doc['vectors']) if isinstance(doc['vectors'], str) else doc['vectors']
-                if isinstance(doc_vectors, (list, np.ndarray)) and len(doc_vectors) == 6:
-                    program_vector = doc_vectors # Single flat vector
-                else:
-                    program_vector = np.mean([v for v in doc_vectors if isinstance(v, (list, np.ndarray))], axis=0).tolist()
-            except:
-                program_vector = [0] * 6 # Fallback for malformed vectors
-            similarity = cosine_similarity([query_vector], [program_vector])[0][0] if program_vector and query_vector else 0
-            matching_programs.append({'id': meta['id'], 'code': doc, 'similarity': similarity, 'description': meta.get('description_tokens', '')})
-
-    return sorted(matching_programs, key=lambda x: x['similarity'], reverse=True)
-
-def create_vector(category, level, location, total_lines, parent_path):
-    """Helper to create a vector for query (matches parser's create_vector)."""
-    category_map = {
-        'import': 1, 'function': 2, 'async_function': 3, 'class': 4,
-        'if': 5, 'while': 6, 'for': 7, 'try': 8, 'expression': 9, 'spacer': 10,
-        'other': 11, 'elif': 12, 'else': 13, 'except': 14, 'finally': 15, 'return': 16,
-        'assigned_variable': 17, 'input_variable': 18, 'returned_variable': 19
-    }
-    category_id = category_map.get(category, 0)
-    start_line, end_line = location
-    span = (end_line - start_line + 1) / total_lines
-    center_pos = ((start_line + end_line) / 2) / total_lines
-    parent_depth = len(parent_path)
-    parent_weight = sum(category_map.get(parent.split('[')[0].lower(), 0) * (1 / (i + 1))
-                        for i, parent in enumerate(parent_path)) / max(1, len(category_map))
-    return [category_id, level, center_pos, span, parent_depth, parent_weight]
-
-def is_subsequence(subseq, seq):
-    """Check if subseq is a subsequence of seq."""
-    it = iter(seq)
-    return all(item in it for item in subseq)
-
-def generate_description_tokens(sequence, vectors):
-    """Generate semantic description tokens for a program based on its sequence and vectors."""
-    tokens = []
-    category_descriptions = {
-        'import': 'imports module',
-        'function': 'defines function',
-        'assigned_variable': 'assigns variable',
-        'input_variable': 'input parameter',
-        'returned_variable': 'returns value',
-        'if': 'conditional statement',
-        'return': 'returns result',
-        'try': 'try block',
-        'except': 'exception handler',
-        'expression': 'expression statement',
-        'spacer': 'empty line or comment'
-    }
-
-    for cat, vec in zip(sequence, vectors):
-        if cat in category_descriptions:
-            tokens.append(f"{category_descriptions[cat]}:{cat}")
-            # Add vector-derived features (e.g., level, span) as tokens
-            tokens.append(f"level:{vec[1]}")
-            tokens.append(f"span:{vec[3]:.2f}")
-    return tokens
-
-def generate_semantic_vector(description, use_gpu=USE_GPU):
-    """Generate a semantic vector for a textual description using CodeBERT, with CPU/GPU option."""
-    # Load CodeBERT model and tokenizer
-    model_name = "microsoft/codebert-base"
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    device = torch.device("cuda" if use_gpu and torch.cuda.is_available() else "cpu")
-    model = AutoModel.from_pretrained(model_name).to(device)
-
-    # Tokenize and encode the description
-    inputs = tokenizer(description, return_tensors="pt", padding=True, truncation=True, max_length=512)
-    inputs = {k: v.to(device) for k, v in inputs.items()}
-
-    # Generate embeddings
-    with torch.no_grad():
-        outputs = model(**inputs)
-        # Use mean pooling of the last hidden states
-        vector = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy().tolist()
-
-    # Truncate or pad to 6D to match our vectors
-    if len(vector) < 6:
-        vector.extend([0] * (6 - len(vector)))
-    elif len(vector) > 6:
-        vector = vector[:6]
-    return vector
-
-def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=HF_KEY):
-    """Save ChromaDB data to Hugging Face Dataset."""
-    client = init_chromadb()
-    collection = create_collection(client)
-
-    # Fetch all data from ChromaDB
-    results = collection.get(include=["documents", "metadatas", "embeddings"])
-    data = {
-        "code": results["documents"],
-        "sequence": [meta["sequence"] for meta in results["metadatas"]],
-        "vectors": results["embeddings"], # ChromaDB already flattens embeddings
-        "description_tokens": [meta.get('description_tokens', '') for meta in results["metadatas"]]
-    }
-
-    # Create a Hugging Face Dataset
-    dataset = Dataset.from_dict(data)
-
-    # Push to Hugging Face Hub
-    dataset.push_to_hub(dataset_name, token=token)
-    print(f"Dataset pushed to Hugging Face Hub as {dataset_name}")
-
-def load_chromadb_from_hf(dataset_name=HF_DATASET_NAME, token=HF_KEY):
-    """Load ChromaDB data from Hugging Face Dataset, handle empty dataset."""
-    try:
-        dataset = load_dataset(dataset_name, split="train", token=token)
-    except Exception as e:
-        print(f"Error loading dataset from Hugging Face: {e}. Populating with samples...")
-        client = init_chromadb()
-        populate_sample_db(client)
-        save_chromadb_to_hf() # Create and push a new dataset
-        return init_chromadb()
-
-    client = init_chromadb()
-    collection = create_collection(client)
-
-    for item in dataset:
-        collection.add(
-            documents=[item["code"]],
-            metadatas=[{"sequence": item["sequence"], "description_tokens": item["description_tokens"]}],
-            ids=[str(hash(item["code"]))],
-            embeddings=[item["vectors"]]
-        )
-    return client
-
 @app.route('/', methods=['GET', 'POST'])
 def index():
     if request.method == 'POST':
@@ -263,6 +37,7 @@ def index():
             code_input = f.read()
             parts, sequence = parse_python_code(code_input)
             # Store in ChromaDB
+            from database import init_chromadb, store_program
             client = init_chromadb()
             vectors = [part['vector'] for part in parts]
             store_program(client, code_input, sequence, vectors, DB_NAME)
@@ -273,12 +48,14 @@ def index():
             filename += '.py'
             parts, sequence = parse_python_code(code_input)
             vectors = [part['vector'] for part in parts]
+            from database import init_chromadb, store_program
             client = init_chromadb()
             store_program(client, code_input, sequence, vectors, DB_NAME)
         elif 'query_ops' in request.form and request.form['query_ops'].strip():
            # Handle query for operations (category sequence)
            operations = [op.strip() for op in request.form['query_ops'].split(',')]
-
+            from database import load_chromadb_from_hf, query_programs
+            client = load_chromadb_from_hf()
            query_results = query_programs(client, operations, DB_NAME)
            return render_template(
                'results_partial.html',
@@ -291,7 +68,8 @@ def index():
         elif 'semantic_query' in request.form and request.form['semantic_query'].strip():
            # Handle semantic query (natural language description)
            semantic_query = request.form['semantic_query']
-
+            from database import load_chromadb_from_hf, query_programs
+            client = load_chromadb_from_hf()
            query_results = query_programs(client, None, DB_NAME, semantic_query=semantic_query)
            return render_template(
                'results_partial.html',
@@ -331,12 +109,15 @@ def index():
        return 'No file, code, or query provided', 400

    # Initial page load
-
+    from database import load_chromadb_from_hf
+    client = load_chromadb_from_hf()
    # If no dataset exists locally, populate with samples
    try:
        if not client.list_collections()[0].name == DB_NAME:
+            from database import populate_sample_db
            populate_sample_db(client)
    except:
+        from database import populate_sample_db
        populate_sample_db(client)
    return render_template('index.html', parts=None, filename=None, reconstructed_code=None, code_input=None, query_results=None)

@@ -354,6 +135,31 @@ def export_json():
        mimetype='application/json'
    )

+def generate_description_tokens(sequence, vectors):
+    """Generate semantic description tokens for a program based on its sequence and vectors."""
+    tokens = []
+    category_descriptions = {
+        'import': 'imports module',
+        'function': 'defines function',
+        'assigned_variable': 'assigns variable',
+        'input_variable': 'input parameter',
+        'returned_variable': 'returns value',
+        'if': 'conditional statement',
+        'return': 'returns result',
+        'try': 'try block',
+        'except': 'exception handler',
+        'expression': 'expression statement',
+        'spacer': 'empty line or comment'
+    }
+
+    for cat, vec in zip(sequence, vectors):
+        if cat in category_descriptions:
+            tokens.append(f"{category_descriptions[cat]}:{cat}")
+            # Add vector-derived features (e.g., level, span) as tokens
+            tokens.append(f"level:{vec[1]}")
+            tokens.append(f"span:{vec[3]:.2f}")
+    return " ".join(tokens)
+
 if __name__ == '__main__':
    if not os.path.exists(UPLOAD_DIR):
        os.makedirs(UPLOAD_DIR)