broadfield-dev committed
Commit 12bbd2a · verified · 1 Parent(s): 406abdb

Update app.py

Files changed (1)
  1. app.py +243 -26
app.py CHANGED
@@ -4,10 +4,21 @@ from parser import parse_python_code
 import os
 import json
 import io
-from database import init_chromadb, populate_sample_db, store_program, query_programs, load_chromadb_from_hf, HF_DATASET_NAME, HF_KEY, DB_NAME, USE_GPU
+import chromadb
+from sklearn.metrics.pairwise import cosine_similarity
+import numpy as np
+from datasets import Dataset, load_dataset
+from transformers import AutoTokenizer, AutoModel
+import torch
+import subprocess  # To call process_hf_dataset.py
 
 # User-configurable variables
+DB_NAME = "python_programs"  # ChromaDB collection name
+HF_DATASET_NAME = "python_program_vectors"  # Hugging Face Dataset name
+HF_KEY = "YOUR_HUGGINGFACE_KEY"  # Replace with your Hugging Face API key
 UPLOAD_DIR = "./uploads"  # Directory for uploads
+PERSIST_DIR = "./chroma_data"  # Directory for persistent ChromaDB storage
+USE_GPU = False  # Default to CPU, set to True for GPU if available
 
 app = Flask(__name__)
 
@@ -16,6 +27,222 @@ def reconstruct_code(parts):
     sorted_parts = sorted(parts, key=lambda p: p['location'][0])
     return ''.join(part['source'] for part in sorted_parts)
 
+def init_chromadb(persist_dir=PERSIST_DIR):
+    """Initialize ChromaDB client, optionally with persistent storage."""
+    try:
+        # Use persistent storage if directory exists, otherwise in-memory
+        if os.path.exists(persist_dir):
+            client = chromadb.PersistentClient(path=persist_dir)
+        else:
+            client = chromadb.Client()
+        return client
+    except Exception as e:
+        print(f"Error initializing ChromaDB: {e}")
+        return chromadb.Client()  # Fallback to in-memory
+
+def create_collection(client, collection_name=DB_NAME):
+    """Create or get a ChromaDB collection for Python programs."""
+    try:
+        collection = client.get_collection(name=collection_name)
+    except:
+        collection = client.create_collection(name=collection_name)
+    return collection
+
+def store_program(client, code, sequence, vectors, collection_name=DB_NAME):
+    """Store a program in ChromaDB with its code, sequence, and vectors."""
+    collection = create_collection(client, collection_name)
+
+    # Flatten vectors to ensure they are a list of numbers (ChromaDB expects flat embeddings)
+    flattened_vectors = [item for sublist in vectors for item in sublist]
+
+    # Store program data (ID, code, sequence, vectors)
+    program_id = str(hash(code))  # Use hash of code as ID for uniqueness
+    collection.add(
+        documents=[code],
+        metadatas=[{"sequence": ",".join(sequence), "description_tokens": " ".join(generate_description_tokens(sequence, vectors))}],
+        ids=[program_id],
+        embeddings=[flattened_vectors]  # Pass as flat list
+    )
+    return program_id
+
+def populate_sample_db(client):
+    """Populate ChromaDB with sample Python programs."""
+    samples = [
+        """
+import os
+def add_one(x):
+    y = x + 1
+    return y
+""",
+        """
+def multiply(a, b):
+    c = a * b
+    if c > 0:
+        return c
+"""
+    ]
+
+    for code in samples:
+        parts, sequence = parse_python_code(code)
+        vectors = [part['vector'] for part in parts]
+        store_program(client, code, sequence, vectors)
+
+def query_programs(client, operations, collection_name=DB_NAME, top_k=5, semantic_query=None):
+    """Query ChromaDB for programs matching the operations sequence or semantic description."""
+    collection = create_collection(client, collection_name)
+
+    if semantic_query:
+        # Semantic search using CodeBERT embeddings
+        query_vector = generate_semantic_vector(semantic_query)
+        results = collection.query(
+            query_embeddings=[query_vector],
+            n_results=top_k,
+            include=["documents", "metadatas"]
+        )
+    else:
+        # Vector-based search for operations sequence
+        query_vector = sum([create_vector(op, 0, (1, 1), 100, []) for op in operations], []) / len(operations) if operations else [0] * 6
+        results = collection.query(
+            query_embeddings=[query_vector],
+            n_results=top_k,
+            include=["documents", "metadatas"]
+        )
+
+    # Process results
+    matching_programs = []
+    for doc, meta in zip(results['documents'][0], results['metadatas'][0]):
+        sequence = meta['sequence'].split(',')
+        if not semantic_query or is_subsequence(operations, sequence):  # Ensure sequence match for operations
+            try:
+                # Reconstruct program vectors (flatten if needed)
+                doc_vectors = eval(doc['vectors']) if isinstance(doc['vectors'], str) else doc['vectors']
+                if isinstance(doc_vectors, (list, np.ndarray)) and len(doc_vectors) == 6:
+                    program_vector = doc_vectors  # Single flat vector
+                else:
+                    program_vector = np.mean([v for v in doc_vectors if isinstance(v, (list, np.ndarray))], axis=0).tolist()
+            except:
+                program_vector = [0] * 6  # Fallback for malformed vectors
+            similarity = cosine_similarity([query_vector], [program_vector])[0][0] if program_vector and query_vector else 0
+            matching_programs.append({'id': meta['id'], 'code': doc, 'similarity': similarity, 'description': meta.get('description_tokens', '')})
+
+    return sorted(matching_programs, key=lambda x: x['similarity'], reverse=True)
+
+def create_vector(category, level, location, total_lines, parent_path):
+    """Helper to create a vector for query (matches parser's create_vector)."""
+    category_map = {
+        'import': 1, 'function': 2, 'async_function': 3, 'class': 4,
+        'if': 5, 'while': 6, 'for': 7, 'try': 8, 'expression': 9, 'spacer': 10,
+        'other': 11, 'elif': 12, 'else': 13, 'except': 14, 'finally': 15, 'return': 16,
+        'assigned_variable': 17, 'input_variable': 18, 'returned_variable': 19
+    }
+    category_id = category_map.get(category, 0)
+    start_line, end_line = location
+    span = (end_line - start_line + 1) / total_lines
+    center_pos = ((start_line + end_line) / 2) / total_lines
+    parent_depth = len(parent_path)
+    parent_weight = sum(category_map.get(parent.split('[')[0].lower(), 0) * (1 / (i + 1))
+                        for i, parent in enumerate(parent_path)) / max(1, len(category_map))
+    return [category_id, level, center_pos, span, parent_depth, parent_weight]
+
+def is_subsequence(subseq, seq):
+    """Check if subseq is a subsequence of seq."""
+    it = iter(seq)
+    return all(item in it for item in subseq)
+
+def generate_description_tokens(sequence, vectors):
+    """Generate semantic description tokens for a program based on its sequence and vectors."""
+    tokens = []
+    category_descriptions = {
+        'import': 'imports module',
+        'function': 'defines function',
+        'assigned_variable': 'assigns variable',
+        'input_variable': 'input parameter',
+        'returned_variable': 'returns value',
+        'if': 'conditional statement',
+        'return': 'returns result',
+        'try': 'try block',
+        'except': 'exception handler',
+        'expression': 'expression statement',
+        'spacer': 'empty line or comment'
+    }
+
+    for cat, vec in zip(sequence, vectors):
+        if cat in category_descriptions:
+            tokens.append(f"{category_descriptions[cat]}:{cat}")
+        # Add vector-derived features (e.g., level, span) as tokens
+        tokens.append(f"level:{vec[1]}")
+        tokens.append(f"span:{vec[3]:.2f}")
+    return tokens
+
+def generate_semantic_vector(description, use_gpu=USE_GPU):
+    """Generate a semantic vector for a textual description using CodeBERT, with CPU/GPU option."""
+    # Load CodeBERT model and tokenizer
+    model_name = "microsoft/codebert-base"
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    device = torch.device("cuda" if use_gpu and torch.cuda.is_available() else "cpu")
+    model = AutoModel.from_pretrained(model_name).to(device)
+
+    # Tokenize and encode the description
+    inputs = tokenizer(description, return_tensors="pt", padding=True, truncation=True, max_length=512)
+    inputs = {k: v.to(device) for k, v in inputs.items()}
+
+    # Generate embeddings
+    with torch.no_grad():
+        outputs = model(**inputs)
+        # Use mean pooling of the last hidden states
+        vector = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy().tolist()
+
+    # Truncate or pad to 6D to match our vectors
+    if len(vector) < 6:
+        vector.extend([0] * (6 - len(vector)))
+    elif len(vector) > 6:
+        vector = vector[:6]
+    return vector
+
+def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=HF_KEY):
+    """Save ChromaDB data to Hugging Face Dataset."""
+    client = init_chromadb()
+    collection = create_collection(client)
+
+    # Fetch all data from ChromaDB
+    results = collection.get(include=["documents", "metadatas", "embeddings"])
+    data = {
+        "code": results["documents"],
+        "sequence": [meta["sequence"] for meta in results["metadatas"]],
+        "vectors": results["embeddings"],  # ChromaDB already flattens embeddings
+        "description_tokens": [meta.get('description_tokens', '') for meta in results["metadatas"]]
+    }
+
+    # Create a Hugging Face Dataset
+    dataset = Dataset.from_dict(data)
+
+    # Push to Hugging Face Hub
+    dataset.push_to_hub(dataset_name, token=token)
+    print(f"Dataset pushed to Hugging Face Hub as {dataset_name}")
+
+def load_chromadb_from_hf(dataset_name=HF_DATASET_NAME, token=HF_KEY):
+    """Load ChromaDB data from Hugging Face Dataset, handle empty dataset."""
+    try:
+        dataset = load_dataset(dataset_name, split="train", token=token)
+    except Exception as e:
+        print(f"Error loading dataset from Hugging Face: {e}. Populating with samples...")
+        client = init_chromadb()
+        populate_sample_db(client)
+        save_chromadb_to_hf()  # Create and push a new dataset
+        return init_chromadb()
+
+    client = init_chromadb()
+    collection = create_collection(client)
+
+    for item in dataset:
+        collection.add(
+            documents=[item["code"]],
+            metadatas=[{"sequence": item["sequence"], "description_tokens": item["description_tokens"]}],
+            ids=[str(hash(item["code"]))],
+            embeddings=[item["vectors"]]
+        )
+    return client
+
 @app.route('/', methods=['GET', 'POST'])
 def index():
     if request.method == 'POST':
@@ -74,6 +301,21 @@ def index():
                 code_input=None,
                 query_results=query_results
             )
+        elif 'process_hf' in request.form:
+            # Trigger processing of Hugging Face dataset
+            try:
+                subprocess.run(['python', 'process_hf_dataset.py'], check=True)
+                return render_template(
+                    'results_partial.html',
+                    parts=None,
+                    filename="Hugging Face Dataset Processed",
+                    reconstructed_code=None,
+                    code_input=None,
+                    query_results=None,
+                    message="Hugging Face dataset processed and stored successfully."
+                )
+            except subprocess.CalledProcessError as e:
+                return f"Error processing Hugging Face dataset: {e}", 500
 
         if parts:
             indexed_parts = [{'index': i + 1, **part} for i, part in enumerate(parts)]
@@ -112,31 +354,6 @@ def export_json():
         mimetype='application/json'
     )
 
-def generate_description_tokens(sequence, vectors):
-    """Generate semantic description tokens for a program based on its sequence and vectors."""
-    tokens = []
-    category_descriptions = {
-        'import': 'imports module',
-        'function': 'defines function',
-        'assigned_variable': 'assigns variable',
-        'input_variable': 'input parameter',
-        'returned_variable': 'returns value',
-        'if': 'conditional statement',
-        'return': 'returns result',
-        'try': 'try block',
-        'except': 'exception handler',
-        'expression': 'expression statement',
-        'spacer': 'empty line or comment'
-    }
-
-    for cat, vec in zip(sequence, vectors):
-        if cat in category_descriptions:
-            tokens.append(f"{category_descriptions[cat]}:{cat}")
-        # Add vector-derived features (e.g., level, span) as tokens
-        tokens.append(f"level:{vec[1]}")
-        tokens.append(f"span:{vec[3]:.2f}")
-    return " ".join(tokens)
-
 if __name__ == '__main__':
    if not os.path.exists(UPLOAD_DIR):
        os.makedirs(UPLOAD_DIR)
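
For reference, a minimal sketch of how the pure helpers introduced in this commit (create_vector, generate_description_tokens, is_subsequence) behave on hand-written data. It is illustrative only and not part of the commit; it assumes app.py imports cleanly (flask, chromadb, numpy, scikit-learn, datasets, transformers and torch installed, with the repo's parser.py alongside), and the example "parts" below are made up rather than produced by parse_python_code.

# Illustrative sketch, not part of this commit: exercise the pure helpers added above.
from app import create_vector, generate_description_tokens, is_subsequence

# Hand-written stand-in for parser output: a 5-line file with one import,
# one function spanning lines 2-4, and a return nested inside that function.
sequence = ['import', 'function', 'return']
vectors = [
    create_vector('import', 0, (1, 1), 5, []),    # [category_id, level, center_pos, span, parent_depth, parent_weight]
    create_vector('function', 0, (2, 4), 5, []),
    create_vector('return', 1, (4, 4), 5, ['function']),  # parent-path format is an assumption here
]

print(vectors[0])                                      # -> [1, 0, 0.2, 0.2, 0, 0.0]
print(generate_description_tokens(sequence, vectors))  # category, level and span tokens per part
print(is_subsequence(['import', 'return'], sequence))  # -> True (order-preserving containment)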