broadfield-dev committed
Commit 4026330 · verified · Parent(s): 1c2a481

Update app.py

Files changed (1)
  1. app.py +35 -229
app.py CHANGED
@@ -4,21 +4,11 @@ from parser import parse_python_code
 import os
 import json
 import io
-import chromadb
-from sklearn.metrics.pairwise import cosine_similarity
-import numpy as np
-from datasets import Dataset, load_dataset
-from transformers import AutoTokenizer, AutoModel
-import torch
 import subprocess # To call process_hf_dataset.py
 
 # User-configurable variables
 DB_NAME = "python_programs" # ChromaDB collection name
-HF_DATASET_NAME = "python_program_vectors" # Hugging Face Dataset name
-HF_KEY = "YOUR_HUGGINGFACE_KEY" # Replace with your Hugging Face API key
 UPLOAD_DIR = "./uploads" # Directory for uploads
-PERSIST_DIR = "./chroma_data" # Directory for persistent ChromaDB storage
-USE_GPU = False # Default to CPU, set to True for GPU if available
 
 app = Flask(__name__)
 
@@ -27,222 +17,6 @@ def reconstruct_code(parts):
     sorted_parts = sorted(parts, key=lambda p: p['location'][0])
     return ''.join(part['source'] for part in sorted_parts)
 
-def init_chromadb(persist_dir=PERSIST_DIR):
-    """Initialize ChromaDB client, optionally with persistent storage."""
-    try:
-        # Use persistent storage if directory exists, otherwise in-memory
-        if os.path.exists(persist_dir):
-            client = chromadb.PersistentClient(path=persist_dir)
-        else:
-            client = chromadb.Client()
-        return client
-    except Exception as e:
-        print(f"Error initializing ChromaDB: {e}")
-        return chromadb.Client() # Fallback to in-memory
-
-def create_collection(client, collection_name=DB_NAME):
-    """Create or get a ChromaDB collection for Python programs."""
-    try:
-        collection = client.get_collection(name=collection_name)
-    except:
-        collection = client.create_collection(name=collection_name)
-    return collection
-
-def store_program(client, code, sequence, vectors, collection_name=DB_NAME):
-    """Store a program in ChromaDB with its code, sequence, and vectors."""
-    collection = create_collection(client, collection_name)
-
-    # Flatten vectors to ensure they are a list of numbers (ChromaDB expects flat embeddings)
-    flattened_vectors = [item for sublist in vectors for item in sublist]
-
-    # Store program data (ID, code, sequence, vectors)
-    program_id = str(hash(code)) # Use hash of code as ID for uniqueness
-    collection.add(
-        documents=[code],
-        metadatas=[{"sequence": ",".join(sequence), "description_tokens": " ".join(generate_description_tokens(sequence, vectors))}],
-        ids=[program_id],
-        embeddings=[flattened_vectors] # Pass as flat list
-    )
-    return program_id
-
-def populate_sample_db(client):
-    """Populate ChromaDB with sample Python programs."""
-    samples = [
-        """
-import os
-def add_one(x):
-    y = x + 1
-    return y
-""",
-        """
-def multiply(a, b):
-    c = a * b
-    if c > 0:
-        return c
-"""
-    ]
-
-    for code in samples:
-        parts, sequence = parse_python_code(code)
-        vectors = [part['vector'] for part in parts]
-        store_program(client, code, sequence, vectors)
-
-def query_programs(client, operations, collection_name=DB_NAME, top_k=5, semantic_query=None):
-    """Query ChromaDB for programs matching the operations sequence or semantic description."""
-    collection = create_collection(client, collection_name)
-
-    if semantic_query:
-        # Semantic search using CodeBERT embeddings
-        query_vector = generate_semantic_vector(semantic_query)
-        results = collection.query(
-            query_embeddings=[query_vector],
-            n_results=top_k,
-            include=["documents", "metadatas"]
-        )
-    else:
-        # Vector-based search for operations sequence
-        query_vector = sum([create_vector(op, 0, (1, 1), 100, []) for op in operations], []) / len(operations) if operations else [0] * 6
-        results = collection.query(
-            query_embeddings=[query_vector],
-            n_results=top_k,
-            include=["documents", "metadatas"]
-        )
-
-    # Process results
-    matching_programs = []
-    for doc, meta in zip(results['documents'][0], results['metadatas'][0]):
-        sequence = meta['sequence'].split(',')
-        if not semantic_query or is_subsequence(operations, sequence): # Ensure sequence match for operations
-            try:
-                # Reconstruct program vectors (flatten if needed)
-                doc_vectors = eval(doc['vectors']) if isinstance(doc['vectors'], str) else doc['vectors']
-                if isinstance(doc_vectors, (list, np.ndarray)) and len(doc_vectors) == 6:
-                    program_vector = doc_vectors # Single flat vector
-                else:
-                    program_vector = np.mean([v for v in doc_vectors if isinstance(v, (list, np.ndarray))], axis=0).tolist()
-            except:
-                program_vector = [0] * 6 # Fallback for malformed vectors
-            similarity = cosine_similarity([query_vector], [program_vector])[0][0] if program_vector and query_vector else 0
-            matching_programs.append({'id': meta['id'], 'code': doc, 'similarity': similarity, 'description': meta.get('description_tokens', '')})
-
-    return sorted(matching_programs, key=lambda x: x['similarity'], reverse=True)
-
-def create_vector(category, level, location, total_lines, parent_path):
-    """Helper to create a vector for query (matches parser's create_vector)."""
-    category_map = {
-        'import': 1, 'function': 2, 'async_function': 3, 'class': 4,
-        'if': 5, 'while': 6, 'for': 7, 'try': 8, 'expression': 9, 'spacer': 10,
-        'other': 11, 'elif': 12, 'else': 13, 'except': 14, 'finally': 15, 'return': 16,
-        'assigned_variable': 17, 'input_variable': 18, 'returned_variable': 19
-    }
-    category_id = category_map.get(category, 0)
-    start_line, end_line = location
-    span = (end_line - start_line + 1) / total_lines
-    center_pos = ((start_line + end_line) / 2) / total_lines
-    parent_depth = len(parent_path)
-    parent_weight = sum(category_map.get(parent.split('[')[0].lower(), 0) * (1 / (i + 1))
-                        for i, parent in enumerate(parent_path)) / max(1, len(category_map))
-    return [category_id, level, center_pos, span, parent_depth, parent_weight]
-
-def is_subsequence(subseq, seq):
-    """Check if subseq is a subsequence of seq."""
-    it = iter(seq)
-    return all(item in it for item in subseq)
-
-def generate_description_tokens(sequence, vectors):
-    """Generate semantic description tokens for a program based on its sequence and vectors."""
-    tokens = []
-    category_descriptions = {
-        'import': 'imports module',
-        'function': 'defines function',
-        'assigned_variable': 'assigns variable',
-        'input_variable': 'input parameter',
-        'returned_variable': 'returns value',
-        'if': 'conditional statement',
-        'return': 'returns result',
-        'try': 'try block',
-        'except': 'exception handler',
-        'expression': 'expression statement',
-        'spacer': 'empty line or comment'
-    }
-
-    for cat, vec in zip(sequence, vectors):
-        if cat in category_descriptions:
-            tokens.append(f"{category_descriptions[cat]}:{cat}")
-        # Add vector-derived features (e.g., level, span) as tokens
-        tokens.append(f"level:{vec[1]}")
-        tokens.append(f"span:{vec[3]:.2f}")
-    return tokens
-
-def generate_semantic_vector(description, use_gpu=USE_GPU):
-    """Generate a semantic vector for a textual description using CodeBERT, with CPU/GPU option."""
-    # Load CodeBERT model and tokenizer
-    model_name = "microsoft/codebert-base"
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    device = torch.device("cuda" if use_gpu and torch.cuda.is_available() else "cpu")
-    model = AutoModel.from_pretrained(model_name).to(device)
-
-    # Tokenize and encode the description
-    inputs = tokenizer(description, return_tensors="pt", padding=True, truncation=True, max_length=512)
-    inputs = {k: v.to(device) for k, v in inputs.items()}
-
-    # Generate embeddings
-    with torch.no_grad():
-        outputs = model(**inputs)
-        # Use mean pooling of the last hidden states
-        vector = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy().tolist()
-
-    # Truncate or pad to 6D to match our vectors
-    if len(vector) < 6:
-        vector.extend([0] * (6 - len(vector)))
-    elif len(vector) > 6:
-        vector = vector[:6]
-    return vector
-
-def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=HF_KEY):
-    """Save ChromaDB data to Hugging Face Dataset."""
-    client = init_chromadb()
-    collection = create_collection(client)
-
-    # Fetch all data from ChromaDB
-    results = collection.get(include=["documents", "metadatas", "embeddings"])
-    data = {
-        "code": results["documents"],
-        "sequence": [meta["sequence"] for meta in results["metadatas"]],
-        "vectors": results["embeddings"], # ChromaDB already flattens embeddings
-        "description_tokens": [meta.get('description_tokens', '') for meta in results["metadatas"]]
-    }
-
-    # Create a Hugging Face Dataset
-    dataset = Dataset.from_dict(data)
-
-    # Push to Hugging Face Hub
-    dataset.push_to_hub(dataset_name, token=token)
-    print(f"Dataset pushed to Hugging Face Hub as {dataset_name}")
-
-def load_chromadb_from_hf(dataset_name=HF_DATASET_NAME, token=HF_KEY):
-    """Load ChromaDB data from Hugging Face Dataset, handle empty dataset."""
-    try:
-        dataset = load_dataset(dataset_name, split="train", token=token)
-    except Exception as e:
-        print(f"Error loading dataset from Hugging Face: {e}. Populating with samples...")
-        client = init_chromadb()
-        populate_sample_db(client)
-        save_chromadb_to_hf() # Create and push a new dataset
-        return init_chromadb()
-
-    client = init_chromadb()
-    collection = create_collection(client)
-
-    for item in dataset:
-        collection.add(
-            documents=[item["code"]],
-            metadatas=[{"sequence": item["sequence"], "description_tokens": item["description_tokens"]}],
-            ids=[str(hash(item["code"]))],
-            embeddings=[item["vectors"]]
-        )
-    return client
-
 @app.route('/', methods=['GET', 'POST'])
 def index():
     if request.method == 'POST':
@@ -263,6 +37,7 @@ def index():
             code_input = f.read()
             parts, sequence = parse_python_code(code_input)
             # Store in ChromaDB
+            from database import init_chromadb, store_program
             client = init_chromadb()
             vectors = [part['vector'] for part in parts]
             store_program(client, code_input, sequence, vectors, DB_NAME)
@@ -273,12 +48,14 @@ def index():
                 filename += '.py'
             parts, sequence = parse_python_code(code_input)
             vectors = [part['vector'] for part in parts]
+            from database import init_chromadb, store_program
             client = init_chromadb()
             store_program(client, code_input, sequence, vectors, DB_NAME)
         elif 'query_ops' in request.form and request.form['query_ops'].strip():
             # Handle query for operations (category sequence)
             operations = [op.strip() for op in request.form['query_ops'].split(',')]
-            client = load_chromadb_from_hf(HF_DATASET_NAME, HF_KEY) # Load from Hugging Face
+            from database import load_chromadb_from_hf, query_programs
+            client = load_chromadb_from_hf()
             query_results = query_programs(client, operations, DB_NAME)
             return render_template(
                 'results_partial.html',
@@ -291,7 +68,8 @@ def index():
         elif 'semantic_query' in request.form and request.form['semantic_query'].strip():
             # Handle semantic query (natural language description)
             semantic_query = request.form['semantic_query']
-            client = load_chromadb_from_hf(HF_DATASET_NAME, HF_KEY) # Load from Hugging Face
+            from database import load_chromadb_from_hf, query_programs
+            client = load_chromadb_from_hf()
             query_results = query_programs(client, None, DB_NAME, semantic_query=semantic_query)
             return render_template(
                 'results_partial.html',
@@ -331,12 +109,15 @@ def index():
             return 'No file, code, or query provided', 400
 
     # Initial page load
-    client = load_chromadb_from_hf(HF_DATASET_NAME, HF_KEY) # Load from Hugging Face on startup
+    from database import load_chromadb_from_hf
+    client = load_chromadb_from_hf()
    # If no dataset exists locally, populate with samples
    try:
        if not client.list_collections()[0].name == DB_NAME:
+            from database import populate_sample_db
            populate_sample_db(client)
    except:
+        from database import populate_sample_db
        populate_sample_db(client)
    return render_template('index.html', parts=None, filename=None, reconstructed_code=None, code_input=None, query_results=None)
 
@@ -354,6 +135,31 @@ def export_json():
         mimetype='application/json'
     )
 
+def generate_description_tokens(sequence, vectors):
+    """Generate semantic description tokens for a program based on its sequence and vectors."""
+    tokens = []
+    category_descriptions = {
+        'import': 'imports module',
+        'function': 'defines function',
+        'assigned_variable': 'assigns variable',
+        'input_variable': 'input parameter',
+        'returned_variable': 'returns value',
+        'if': 'conditional statement',
+        'return': 'returns result',
+        'try': 'try block',
+        'except': 'exception handler',
+        'expression': 'expression statement',
+        'spacer': 'empty line or comment'
+    }
+
+    for cat, vec in zip(sequence, vectors):
+        if cat in category_descriptions:
+            tokens.append(f"{category_descriptions[cat]}:{cat}")
+        # Add vector-derived features (e.g., level, span) as tokens
+        tokens.append(f"level:{vec[1]}")
+        tokens.append(f"span:{vec[3]:.2f}")
+    return " ".join(tokens)
+
 if __name__ == '__main__':
     if not os.path.exists(UPLOAD_DIR):
         os.makedirs(UPLOAD_DIR)
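
Note that the reinstated generate_description_tokens now returns a single space-joined string rather than a list (its old caller, store_program, previously did the join itself). A hypothetical call, assuming the parser's 6-field part vectors [category_id, level, center_pos, span, parent_depth, parent_weight] described by the removed create_vector helper:

    # Hypothetical input: one top-level function spanning the whole file.
    sequence = ['function']
    vectors = [[2, 0, 0.5, 1.0, 0, 0.0]]  # [category_id, level, center_pos, span, parent_depth, parent_weight]

    generate_description_tokens(sequence, vectors)
    # -> 'defines function:function level:0 span:1.00'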
 
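The new deferred `from database import ...` statements assume the helpers deleted above now live in a separate database module. A minimal sketch of that module's storage path, reassembled from the removed code; the repo's actual database.py may differ (for example, it presumably also wires description tokens and the Hugging Face dataset sync back in):

    # database.py -- a sketch, not the repo's actual module.
    import os

    import chromadb

    DB_NAME = "python_programs"    # ChromaDB collection name (matches app.py)
    PERSIST_DIR = "./chroma_data"  # Directory for persistent ChromaDB storage

    def init_chromadb(persist_dir=PERSIST_DIR):
        """Return a persistent client if the storage directory exists, else in-memory."""
        try:
            if os.path.exists(persist_dir):
                return chromadb.PersistentClient(path=persist_dir)
            return chromadb.Client()
        except Exception as e:
            print(f"Error initializing ChromaDB: {e}")
            return chromadb.Client()  # Fall back to in-memory

    def create_collection(client, collection_name=DB_NAME):
        """Create the collection on first use, fetch it afterwards."""
        return client.get_or_create_collection(name=collection_name)

    def store_program(client, code, sequence, vectors, collection_name=DB_NAME):
        """Store one program: code as the document, flattened part vectors as the embedding."""
        collection = create_collection(client, collection_name)
        flattened = [x for vec in vectors for x in vec]  # ChromaDB expects a flat embedding
        program_id = str(hash(code))  # Hash of the code doubles as the ID
        collection.add(
            documents=[code],
            metadatas=[{"sequence": ",".join(sequence)}],
            ids=[program_id],
            embeddings=[flattened],
        )
        return program_id

Moving these helpers behind lazy imports is presumably what lets app.py drop chromadb, numpy, scikit-learn, datasets, transformers, and torch from its own header, so the Flask app no longer pays those import costs (or needs the HF_KEY constant) until a database operation actually runs.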