broadfield-dev committed on
Commit
d6c93c4
·
verified ·
1 Parent(s): e77acbf

Update process_hf_dataset.py

Browse files
Files changed (1) hide show
  1. process_hf_dataset.py +34 -5
process_hf_dataset.py CHANGED
@@ -1,7 +1,7 @@
1
  # process_hf_dataset.py
2
  from datasets import load_dataset
3
  import re
4
- from parser import parse_python_code
5
  from database import init_chromadb, store_program, DB_NAME, HF_DATASET_NAME
6
  import chromadb
7
  import os
@@ -95,6 +95,31 @@ def generate_description_tokens(sequence, vectors, var_map=None):
95
 
96
  return tokens
97
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
  def process_hf_dataset():
99
  """Process the Hugging Face dataset and store programs in ChromaDB, aligning with vector categories."""
100
  # Load the dataset
@@ -111,16 +136,19 @@ def process_hf_dataset():
111
  # Rename variables to align with vector categories
112
  processed_code, var_map = rename_variables(output)
113
 
114
- # Parse the code to get parts and sequence
115
  parts, sequence = parse_python_code(processed_code)
116
- vectors = [part['vector'] for part in parts]
117
 
118
  # Generate description tokens including variable roles
119
  description_tokens = f"task:{instruction.replace(' ', '_')}"
120
  description_tokens_list = generate_description_tokens(sequence, vectors, var_map)
121
  description_tokens += " " + " ".join(description_tokens_list)
122
 
123
- # Store in ChromaDB with description
 
 
 
124
  store_program(client, processed_code, sequence, vectors, DB_NAME)
125
 
126
  # Update metadata with instruction and variable roles as description
@@ -128,7 +156,8 @@ def process_hf_dataset():
128
  program_id = str(hash(processed_code))
129
  collection.update(
130
  ids=[program_id],
131
- metadatas=[{"sequence": ",".join(sequence), "description_tokens": description_tokens}]
 
132
  )
133
 
134
  # Save to Hugging Face Dataset
 
1
  # process_hf_dataset.py
2
  from datasets import load_dataset
3
  import re
4
+ from parser import parse_python_code, create_vector
5
  from database import init_chromadb, store_program, DB_NAME, HF_DATASET_NAME
6
  import chromadb
7
  import os
 
95
 
96
  return tokens
97
 
98
def generate_semantic_vector(description, total_lines=100):
    """Generate a 6D semantic vector for a textual description, matching our vector format.

    A crude keyword heuristic: scan the whitespace-split, lower-cased words of
    *description* for known category keywords and fill in a fixed-shape vector
    [category_id, level, center_pos, span, parent_depth, parent_weight].

    Args:
        description: Free-text instruction/description to vectorize.
        total_lines: Unused; kept for interface compatibility — TODO confirm
            whether callers rely on it.

    Returns:
        A 6-element list; all zeros when no keyword matches.
    """
    # Keyword -> category_id mapping (insertion order matters: the first
    # keyword contained in a word wins for that word).
    keyword_categories = {
        'import': 1, 'function': 2, 'assign': 17, 'input': 18, 'return': 19, 'if': 5, 'try': 8, 'except': 14
    }
    n_categories = len(keyword_categories)

    semantic = [0] * 6  # [category_id, level, center_pos, span, parent_depth, parent_weight]

    for word in description.lower().split():
        # First category keyword that appears as a substring of this word.
        hit = next(
            (cat_id for keyword, cat_id in keyword_categories.items() if keyword in word),
            None,
        )
        if hit is None:
            continue
        # NOTE(review): a later matching word overwrites an earlier one
        # (last match wins) — preserved from the original implementation.
        semantic[0] = hit                   # category_id
        semantic[1] = 1                     # level (assume top-level for simplicity)
        semantic[2] = 0.5                   # center_pos (midpoint of code)
        semantic[3] = 0.1                   # span (small for simplicity)
        semantic[4] = 1                     # parent_depth (shallow)
        semantic[5] = hit / n_categories    # parent_weight (normalized)

    return semantic
122
+
123
  def process_hf_dataset():
124
  """Process the Hugging Face dataset and store programs in ChromaDB, aligning with vector categories."""
125
  # Load the dataset
 
136
  # Rename variables to align with vector categories
137
  processed_code, var_map = rename_variables(output)
138
 
139
+ # Parse the code to get parts and sequence, generating our 6D vectors
140
  parts, sequence = parse_python_code(processed_code)
141
+ vectors = [part['vector'] for part in parts] # Use parser's 6D vectors
142
 
143
  # Generate description tokens including variable roles
144
  description_tokens = f"task:{instruction.replace(' ', '_')}"
145
  description_tokens_list = generate_description_tokens(sequence, vectors, var_map)
146
  description_tokens += " " + " ".join(description_tokens_list)
147
 
148
+ # Generate a 6D semantic vector for the instruction
149
+ semantic_vector = generate_semantic_vector(instruction)
150
+
151
+ # Store in ChromaDB with description and semantic vector
152
  store_program(client, processed_code, sequence, vectors, DB_NAME)
153
 
154
  # Update metadata with instruction and variable roles as description
 
156
  program_id = str(hash(processed_code))
157
  collection.update(
158
  ids=[program_id],
159
+ metadatas=[{"sequence": ",".join(sequence), "description_tokens": description_tokens}],
160
+ embeddings=[semantic_vector] # Update with 6D semantic vector for semantic search
161
  )
162
 
163
  # Save to Hugging Face Dataset