broadfield-dev committed
Commit 506d255 · verified · 1 Parent(s): 1540ac7

Update process_hf_dataset.py

Files changed (1)
  1. process_hf_dataset.py +21 -46
process_hf_dataset.py CHANGED
@@ -3,11 +3,11 @@ from datasets import load_dataset
 import re
 from parser import parse_python_code, create_vector
 from database import init_chromadb, store_program, DB_NAME, HF_DATASET_NAME
-from transformers import AutoTokenizer, AutoModel
-import torch
 import chromadb
 import os
 from dotenv import load_dotenv
+from transformers import AutoTokenizer, AutoModel
+import torch
 
 # Load environment variables
 load_dotenv()
@@ -97,41 +97,13 @@ def generate_description_tokens(sequence, vectors, var_map=None):
 
     return tokens
 
-def generate_semantic_vector_og(description, total_lines=100):
-    """Generate a 6D semantic vector for a textual description, matching our vector format."""
-    # Use a simplified heuristic to map description to our 6D vector format
-    category_map = {
-        'import': 1, 'function': 2, 'assign': 17, 'input': 18, 'return': 19, 'if': 5, 'try': 8, 'except': 14
-    }
-
-    # Parse description for key terms
-    tokens = description.lower().split()
-    vector = [0] * 6  # Initialize 6D vector
-
-    # Map description tokens to categories and assign basic vector values
-    for token in tokens:
-        for cat, cat_id in category_map.items():
-            if cat in token:
-                vector[0] = cat_id  # category_id
-                vector[1] = 1  # level (assume top-level for simplicity)
-                vector[2] = 0.5  # center_pos (midpoint of code)
-                vector[3] = 0.1  # span (small for simplicity)
-                vector[4] = 1  # parent_depth (shallow)
-                vector[5] = cat_id / len(category_map)  # parent_weight (normalized)
-                break
-
-    return vector
-
-
-"""Generate a 6D semantic vector for a textual description using CodeBERT, projecting to 6D."""
-# Load CodeBERT model and tokenizer
-use_gpu=False
-model_name = "microsoft/codebert-base"
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-device = torch.device("cuda" if use_gpu and torch.cuda.is_available() else "cpu")
-model = AutoModel.from_pretrained(model_name).to(device)
 def generate_semantic_vector(description, total_lines=100, use_gpu=False):
-
+    """Generate a 6D semantic vector for a textual description using CodeBERT, projecting to 6D."""
+    # Load CodeBERT model and tokenizer
+    model_name = "microsoft/codebert-base"
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    device = torch.device("cuda" if use_gpu and torch.cuda.is_available() else "cpu")
+    model = AutoModel.from_pretrained(model_name).to(device)
 
     # Tokenize and encode the description
     inputs = tokenizer(description, return_tensors="pt", padding=True, truncation=True, max_length=512)
@@ -150,6 +122,7 @@ def generate_semantic_vector(description, total_lines=100, use_gpu=False):
     vector = vector[:6]  # Truncate to 6D
 
     return vector
+
 def process_hf_dataset():
     """Process the Hugging Face dataset and store programs in ChromaDB, aligning with vector categories and including instruction in vectors."""
     # Load the dataset
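
Note: the hunks above show only the head and tail of the rewritten generate_semantic_vector; the pooling step between them is outside this diff. A minimal sketch of how the full function plausibly reads after this commit, assuming mean pooling over CodeBERT's last hidden state (the pooling step and the .cpu().tolist() conversion are assumptions, not shown in the hunks):

import torch
from transformers import AutoTokenizer, AutoModel

def generate_semantic_vector(description, total_lines=100, use_gpu=False):
    """Generate a 6D semantic vector for a textual description using CodeBERT, projecting to 6D."""
    # total_lines is kept for signature compatibility; the shown hunks don't use it.
    # Load CodeBERT model and tokenizer (shown in the diff)
    model_name = "microsoft/codebert-base"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    device = torch.device("cuda" if use_gpu and torch.cuda.is_available() else "cpu")
    model = AutoModel.from_pretrained(model_name).to(device)

    # Tokenize and encode the description (shown in the diff)
    inputs = tokenizer(description, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # ASSUMPTION: mean-pool the 768-D last hidden state into a single embedding;
    # this middle step is not visible in the hunks.
    with torch.no_grad():
        outputs = model(**inputs)
    embedding = outputs.last_hidden_state.mean(dim=1).squeeze(0)

    vector = embedding.cpu().tolist()
    vector = vector[:6]  # Truncate to 6D (shown in the diff)

    return vector

One design note: loading the model inside the function, as the diff does, re-initializes CodeBERT on every call; caching the tokenizer and model at module level would amortize that cost.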
@@ -168,29 +141,30 @@ def process_hf_dataset():
 
         # Parse the code to get parts and sequence, generating our 6D vectors
         parts, sequence = parse_python_code(processed_code)
-        vectors = [part['vector'] for part in parts]  # Use parser's 6D vectors
+        program_vectors = [part['vector'] for part in parts]  # Use parser's 6D vectors for program structure
 
         # Generate description tokens including variable roles
         description_tokens = f"task:{instruction.replace(' ', '_')}"
-        description_tokens_list = generate_description_tokens(sequence, vectors, var_map)
+        description_tokens_list = generate_description_tokens(sequence, program_vectors, var_map)
         description_tokens += " " + " ".join(description_tokens_list)
 
-        # Generate a 6D semantic vector for the instruction, incorporating it into the program vector
+        # Generate a 6D semantic vector for the instruction
         semantic_vector = generate_semantic_vector(instruction)
 
-        # Combine program vectors with instruction vector (average or concatenate, but ensure 6D)
-        combined_vector = semantic_vector  # Use semantic vector as primary for semantic search
+        # Combine program vectors with semantic vector (use semantic vector for semantic search, store program vectors separately)
+        # Store both semantic and program vectors, but ensure ChromaDB uses 6D
+        combined_vector = semantic_vector  # Use semantic vector for ChromaDB embedding (6D)
 
         # Store in ChromaDB with description and combined vector
        store_program(client, processed_code, sequence, [combined_vector], DB_NAME)
 
-        # Update metadata with instruction and variable roles as description
+        # Update metadata with instruction and variable roles as description, and store program vectors
        collection = client.get_collection(DB_NAME)
        program_id = str(hash(processed_code))
        collection.update(
            ids=[program_id],
-            metadatas=[{"sequence": ",".join(sequence), "description_tokens": description_tokens}],
-            embeddings=[combined_vector]  # Ensure 6D embedding
+            metadatas=[{"sequence": ",".join(sequence), "description_tokens": description_tokens, "program_vectors": str(program_vectors)}],
+            embeddings=[combined_vector]  # Ensure 6D embedding for semantic search
        )
 
     # Save to Hugging Face Dataset
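
One aside on the ID scheme in this hunk: program_id = str(hash(processed_code)) relies on Python's builtin hash(), which is salted per process for strings (PYTHONHASHSEED), so the same program maps to different IDs across runs. If stable IDs are wanted (an assumption; the commit does not address this), a content hash is deterministic:

import hashlib

# Deterministic alternative to str(hash(processed_code)):
# sha256 of the source text yields the same ID in every process.
program_id = hashlib.sha256(processed_code.encode("utf-8")).hexdigest()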
@@ -206,8 +180,9 @@ def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=os.getenv("HF_KEY"))
     data = {
         "code": results["documents"],
         "sequence": [meta["sequence"] for meta in results["metadatas"]],
-        "vectors": results["embeddings"],  # ChromaDB already flattens embeddings
-        "description_tokens": [meta.get('description_tokens', '') for meta in results["metadatas"]]
+        "vectors": results["embeddings"],  # Semantic 6D vectors
+        "description_tokens": [meta.get('description_tokens', '') for meta in results["metadatas"]],
+        "program_vectors": [eval(meta.get('program_vectors', '[]')) for meta in results["metadatas"]]  # Store structural vectors
     }
 
     # Create a Hugging Face Dataset
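
A final aside on the serialization round-trip: ChromaDB metadata values must be scalar (str/int/float/bool), so the commit stringifies program_vectors with str(...) on write and recovers it with eval(...) on read. eval() executes arbitrary stored text; a json round-trip gives the same result without that risk. A sketch using the names from the hunks above:

import json

# On write (inside process_hf_dataset), serialize the nested list as JSON:
metadatas = [{
    "sequence": ",".join(sequence),
    "description_tokens": description_tokens,
    "program_vectors": json.dumps(program_vectors),  # instead of str(program_vectors)
}]

# On read (inside save_chromadb_to_hf), parse it back without eval():
program_vectors = [json.loads(meta.get("program_vectors", "[]"))
                   for meta in results["metadatas"]]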
 