broadfield-dev committed on
Commit ecb205b · verified · 1 Parent(s): 275730d

Update process_hf_dataset.py

Files changed (1)
  1. process_hf_dataset.py +83 -41
process_hf_dataset.py CHANGED
@@ -8,10 +8,32 @@ import os
from dotenv import load_dotenv
from transformers import AutoTokenizer, AutoModel
import torch
+from tqdm import tqdm  # For progress bar
+import time

# Load environment variables
load_dotenv()

+# Cache CodeBERT model globally to avoid repeated loading
+model_name = "microsoft/codebert-base"
+tokenizer = None
+model = None
+device = None
+
+def load_codebert_model(use_gpu=False):
+    """Load and cache the CodeBERT model, handling GPU/CPU options."""
+    global tokenizer, model, device
+    if tokenizer is None or model is None:
+        try:
+            device = torch.device("cuda" if use_gpu and torch.cuda.is_available() else "cpu")
+            tokenizer = AutoTokenizer.from_pretrained(model_name)
+            model = AutoModel.from_pretrained(model_name).to(device)
+            print(f"CodeBERT model loaded on {device}")
+        except Exception as e:
+            print(f"Error loading CodeBERT model: {e}")
+            raise
+    return tokenizer, model, device
+
def rename_variables(code, variable_prefixes=None):
    """Rename variables in Python code to align with vector categories (input_variable, assigned_variable, returned_variable)."""
    if variable_prefixes is None:
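
Note: a minimal usage sketch (not part of this commit) of the cached loader added above. Because the tokenizer, model, and device live in module-level globals, only the first call pays the load cost; later calls return the cached objects, and the use_gpu argument is effectively ignored once a device has been chosen.

# Illustrative only -- assumes load_codebert_model() from this file is importable.
tok, mdl, dev = load_codebert_model(use_gpu=False)    # first call: downloads/loads CodeBERT, caches globals
tok2, mdl2, dev2 = load_codebert_model(use_gpu=True)  # cache hit: same objects returned, use_gpu not re-applied
assert tok is tok2 and mdl is mdl2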
@@ -99,11 +121,9 @@ def generate_description_tokens(sequence, vectors, var_map=None):

def generate_semantic_vector(description, total_lines=100, use_gpu=False):
    """Generate a 6D semantic vector for a textual description using CodeBERT, projecting to 6D."""
-    # Load CodeBERT model and tokenizer
-    model_name = "microsoft/codebert-base"
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    device = torch.device("cuda" if use_gpu and torch.cuda.is_available() else "cpu")
-    model = AutoModel.from_pretrained(model_name).to(device)
+    global tokenizer, model, device
+    if tokenizer is None or model is None:
+        tokenizer, model, device = load_codebert_model(use_gpu)

    # Tokenize and encode the description
    inputs = tokenizer(description, return_tensors="pt", padding=True, truncation=True, max_length=512)
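
Note: the part of generate_semantic_vector that reduces CodeBERT's 768-dimensional output to the 6D vector described in the docstring lies outside this hunk. As a hedged sketch only (the file's actual projection may differ), mean pooling the last hidden state and keeping the first six dimensions would look roughly like:

# Sketch under assumptions -- not the file's real projection code.
with torch.no_grad():
    outputs = model(**{k: v.to(device) for k, v in inputs.items()})
pooled = outputs.last_hidden_state.mean(dim=1).squeeze(0)  # (768,) sentence embedding
vector = pooled[:6].tolist()                               # naive 6D reduction: keep the first 6 dims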
@@ -123,49 +143,71 @@ def generate_semantic_vector(description, total_lines=100, use_gpu=False):

    return vector

-def process_hf_dataset():
-    """Process the Hugging Face dataset and store programs in ChromaDB, aligning with vector categories and including instruction in vectors."""
+def process_hf_dataset(batch_size=100, use_gpu=False):
+    """Process the Hugging Face dataset in batches and store programs in ChromaDB, aligning with vector categories."""
    # Load the dataset
    dataset = load_dataset("iamtarun/python_code_instructions_18k_alpaca", split="train")

    # Initialize ChromaDB client
    client = init_chromadb()

-    # Process each entry
-    for entry in dataset:
-        instruction = entry['instruction']
-        output = entry['output']
-
-        # Rename variables to align with vector categories
-        processed_code, var_map = rename_variables(output)
-
-        # Parse the code to get parts and sequence, generating our 6D vectors
-        parts, sequence = parse_python_code(processed_code)
-        program_vectors = [part['vector'] for part in parts]  # Use parser's 6D vectors for program structure
-
-        # Generate description tokens including variable roles
-        description_tokens = f"task:{instruction.replace(' ', '_')}"
-        description_tokens_list = generate_description_tokens(sequence, program_vectors, var_map)
-        description_tokens += " " + " ".join(description_tokens_list)
-
-        # Generate a 6D semantic vector for the instruction
-        semantic_vector = generate_semantic_vector(instruction)
-
-        # Combine program vectors with semantic vector (use semantic vector for semantic search, store program vectors separately)
-        # Store both semantic and program vectors, but ensure ChromaDB uses 6D
-        combined_vector = semantic_vector  # Use semantic vector for ChromaDB embedding (6D)
+    # Clear existing collection (fresh install) if needed
+    try:
+        client.delete_collection(DB_NAME)
+    except:
+        pass  # Collection may not exist
+    collection = client.create_collection(DB_NAME)
+
+    # Process in batches with progress bar
+    total_entries = len(dataset)
+    for i in tqdm(range(0, total_entries, batch_size), desc="Processing Hugging Face Dataset"):
+        batch = dataset[i:i + batch_size]
+        batch_programs = []
+        batch_ids = []
+        batch_documents = []
+        batch_metadatas = []
+        batch_embeddings = []

-        # Store in ChromaDB with description and combined vector
-        store_program(client, processed_code, sequence, [combined_vector], DB_NAME)
+        for entry in batch:
+            instruction = entry['instruction']
+            output = entry['output']
+
+            # Rename variables to align with vector categories
+            processed_code, var_map = rename_variables(output)
+
+            # Parse the code to get parts and sequence, generating our 6D vectors
+            parts, sequence = parse_python_code(processed_code)
+            program_vectors = [part['vector'] for part in parts]  # Use parser's 6D vectors for program structure
+
+            # Generate description tokens including variable roles
+            description_tokens = f"task:{instruction.replace(' ', '_')}"
+            description_tokens_list = generate_description_tokens(sequence, program_vectors, var_map)
+            description_tokens += " " + " ".join(description_tokens_list)
+
+            # Generate a 6D semantic vector for the instruction
+            semantic_vector = generate_semantic_vector(instruction, use_gpu=use_gpu)
+
+            # Store program data
+            program_id = str(hash(processed_code))
+            batch_ids.append(program_id)
+            batch_documents.append(processed_code)
+            batch_metadatas.append({"sequence": ",".join(sequence), "description_tokens": description_tokens, "program_vectors": str(program_vectors)})
+            batch_embeddings.append(semantic_vector)
+
+            # Add small delay to prevent freezing (optional, adjust as needed)
+            time.sleep(0.01)

-        # Update metadata with instruction and variable roles as description, and store program vectors
-        collection = client.get_collection(DB_NAME)
-        program_id = str(hash(processed_code))
-        collection.update(
-            ids=[program_id],
-            metadatas=[{"sequence": ",".join(sequence), "description_tokens": description_tokens, "program_vectors": str(program_vectors)}],
-            embeddings=[combined_vector]  # Ensure 6D embedding for semantic search
-        )
+        # Batch add to ChromaDB
+        try:
+            collection.add(
+                documents=batch_documents,
+                metadatas=batch_metadatas,
+                ids=batch_ids,
+                embeddings=batch_embeddings
+            )
+        except Exception as e:
+            print(f"Error adding batch to ChromaDB: {e}")
+            raise

    # Save to Hugging Face Dataset
    save_chromadb_to_hf()
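
Note: slicing a datasets.Dataset as dataset[i:i + batch_size] returns a dict of column lists rather than a list of row dicts, so iterating the slice yields column names. A hedged alternative (illustrative, not in the commit) that produces per-row dicts for the inner loop:

# Illustrative only -- rebuild row dicts from the sliced column lists.
batch = dataset[i:i + batch_size]        # {"instruction": [...], "output": [...], ...}
rows = [dict(zip(batch.keys(), values)) for values in zip(*batch.values())]
for entry in rows:
    instruction = entry['instruction']
    output = entry['output']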
@@ -193,4 +235,4 @@ def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=os.getenv("HF_KEY"))
    print(f"Dataset pushed to Hugging Face Hub as {dataset_name}")

if __name__ == "__main__":
-    process_hf_dataset()
+    process_hf_dataset(batch_size=100, use_gpu=False)
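
Note: a hedged end-to-end sketch of using the refreshed collection after this script has run. The helper names (init_chromadb, DB_NAME, generate_semantic_vector) come from this file; collection.query is standard chromadb API but is not part of this commit, and the query text and parameters are illustrative.

# Illustrative usage only -- run the batched ingest, then search the stored programs.
process_hf_dataset(batch_size=50, use_gpu=True)   # smaller batches, GPU inference if available

client = init_chromadb()
collection = client.get_collection(DB_NAME)
query_vec = generate_semantic_vector("sort a list of numbers")
results = collection.query(query_embeddings=[query_vec], n_results=3)
for code, meta in zip(results["documents"][0], results["metadatas"][0]):
    print(meta["description_tokens"].split()[0], "->", code.splitlines()[0])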