Update process_hf_dataset.py
process_hf_dataset.py  CHANGED  (+21 -46)
@@ -3,11 +3,11 @@ from datasets import load_dataset
 import re
 from parser import parse_python_code, create_vector
 from database import init_chromadb, store_program, DB_NAME, HF_DATASET_NAME
-from transformers import AutoTokenizer, AutoModel
-import torch
 import chromadb
 import os
 from dotenv import load_dotenv
+from transformers import AutoTokenizer, AutoModel
+import torch
 
 # Load environment variables
 load_dotenv()
@@ -97,41 +97,13 @@ def generate_description_tokens(sequence, vectors, var_map=None):
 
     return tokens
 
-def generate_semantic_vector_og(description, total_lines=100):
-    """Generate a 6D semantic vector for a textual description, matching our vector format."""
-    # Use a simplified heuristic to map description to our 6D vector format
-    category_map = {
-        'import': 1, 'function': 2, 'assign': 17, 'input': 18, 'return': 19, 'if': 5, 'try': 8, 'except': 14
-    }
-
-    # Parse description for key terms
-    tokens = description.lower().split()
-    vector = [0] * 6  # Initialize 6D vector
-
-    # Map description tokens to categories and assign basic vector values
-    for token in tokens:
-        for cat, cat_id in category_map.items():
-            if cat in token:
-                vector[0] = cat_id  # category_id
-                vector[1] = 1  # level (assume top-level for simplicity)
-                vector[2] = 0.5  # center_pos (midpoint of code)
-                vector[3] = 0.1  # span (small for simplicity)
-                vector[4] = 1  # parent_depth (shallow)
-                vector[5] = cat_id / len(category_map)  # parent_weight (normalized)
-                break
-
-    return vector
-
-
-    """Generate a 6D semantic vector for a textual description using CodeBERT, projecting to 6D."""
-    # Load CodeBERT model and tokenizer
-    use_gpu=False
-    model_name = "microsoft/codebert-base"
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    device = torch.device("cuda" if use_gpu and torch.cuda.is_available() else "cpu")
-    model = AutoModel.from_pretrained(model_name).to(device)
 def generate_semantic_vector(description, total_lines=100, use_gpu=False):
-
+    """Generate a 6D semantic vector for a textual description using CodeBERT, projecting to 6D."""
+    # Load CodeBERT model and tokenizer
+    model_name = "microsoft/codebert-base"
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    device = torch.device("cuda" if use_gpu and torch.cuda.is_available() else "cpu")
+    model = AutoModel.from_pretrained(model_name).to(device)
 
     # Tokenize and encode the description
     inputs = tokenizer(description, return_tensors="pt", padding=True, truncation=True, max_length=512)
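
Note that after this change generate_semantic_vector loads the CodeBERT tokenizer and weights on every call, which is costly when looping over a whole dataset. A minimal caching sketch (the _load_codebert helper below is hypothetical, not part of this commit):

    from functools import lru_cache

    import torch
    from transformers import AutoTokenizer, AutoModel

    @lru_cache(maxsize=2)
    def _load_codebert(use_gpu=False):
        # Load CodeBERT once per process and reuse it across calls
        model_name = "microsoft/codebert-base"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        device = torch.device("cuda" if use_gpu and torch.cuda.is_available() else "cpu")
        model = AutoModel.from_pretrained(model_name).to(device)
        return tokenizer, model, device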
@@ -150,6 +122,7 @@ def generate_semantic_vector(description, total_lines=100, use_gpu=False):
     vector = vector[:6]  # Truncate to 6D
 
     return vector
+
 def process_hf_dataset():
     """Process the Hugging Face dataset and store programs in ChromaDB, aligning with vector categories and including instruction in vectors."""
     # Load the dataset
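
The middle of the function (new lines 110-121) falls between the two hunks and is not shown in this view. Assuming the usual recipe of mean-pooling CodeBERT's last hidden state, the function plausibly reads as the sketch below; only the model loading, the tokenization, the vector[:6] truncation, and the return are confirmed by the diff, and the pooling step is an assumption:

    import torch
    from transformers import AutoTokenizer, AutoModel

    def generate_semantic_vector(description, total_lines=100, use_gpu=False):
        # Load CodeBERT model and tokenizer (as in the hunk above)
        model_name = "microsoft/codebert-base"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        device = torch.device("cuda" if use_gpu and torch.cuda.is_available() else "cpu")
        model = AutoModel.from_pretrained(model_name).to(device)

        # Tokenize and encode the description (as in the hunk above)
        inputs = tokenizer(description, return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Assumption: mean-pool the last hidden state into a single 768-D vector
        with torch.no_grad():
            outputs = model(**inputs)
        pooled = outputs.last_hidden_state.mean(dim=1).squeeze(0)

        vector = pooled.cpu().tolist()
        vector = vector[:6]  # Truncate to 6D (as in the hunk above)

        return vector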
@@ -168,29 +141,30 @@ def process_hf_dataset():
 
         # Parse the code to get parts and sequence, generating our 6D vectors
         parts, sequence = parse_python_code(processed_code)
-
+        program_vectors = [part['vector'] for part in parts]  # Use parser's 6D vectors for program structure
 
         # Generate description tokens including variable roles
         description_tokens = f"task:{instruction.replace(' ', '_')}"
-        description_tokens_list = generate_description_tokens(sequence,
+        description_tokens_list = generate_description_tokens(sequence, program_vectors, var_map)
         description_tokens += " " + " ".join(description_tokens_list)
 
-        # Generate a 6D semantic vector for the instruction
+        # Generate a 6D semantic vector for the instruction
         semantic_vector = generate_semantic_vector(instruction)
 
-        # Combine program vectors with
-
+        # Combine program vectors with semantic vector (use semantic vector for semantic search, store program vectors separately)
+        # Store both semantic and program vectors, but ensure ChromaDB uses 6D
+        combined_vector = semantic_vector  # Use semantic vector for ChromaDB embedding (6D)
 
         # Store in ChromaDB with description and combined vector
         store_program(client, processed_code, sequence, [combined_vector], DB_NAME)
 
-        # Update metadata with instruction and variable roles as description
+        # Update metadata with instruction and variable roles as description, and store program vectors
        collection = client.get_collection(DB_NAME)
         program_id = str(hash(processed_code))
         collection.update(
             ids=[program_id],
-            metadatas=[{"sequence": ",".join(sequence), "description_tokens": description_tokens}],
-            embeddings=[combined_vector]  # Ensure 6D embedding
+            metadatas=[{"sequence": ",".join(sequence), "description_tokens": description_tokens, "program_vectors": str(program_vectors)}],
+            embeddings=[combined_vector]  # Ensure 6D embedding for semantic search
         )
 
         # Save to Hugging Face Dataset
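
With combined_vector stored as the 6D embedding, retrieval goes through the same collection. A hypothetical query sketch (the instruction text and n_results value are placeholders, not from this commit):

    # Embed a new instruction with the same 6D pipeline, then search ChromaDB
    query_vector = generate_semantic_vector("Create a function that adds two numbers")
    collection = client.get_collection(DB_NAME)
    results = collection.query(query_embeddings=[query_vector], n_results=5)
    # ChromaDB returns one result list per query vector
    for code, meta in zip(results["documents"][0], results["metadatas"][0]):
        print(meta["description_tokens"], "->", code[:60])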
@@ -206,8 +180,9 @@ def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=os.getenv("HF_KEY"))
     data = {
         "code": results["documents"],
         "sequence": [meta["sequence"] for meta in results["metadatas"]],
-        "vectors": results["embeddings"],  #
-        "description_tokens": [meta.get('description_tokens', '') for meta in results["metadatas"]]
+        "vectors": results["embeddings"],  # Semantic 6D vectors
+        "description_tokens": [meta.get('description_tokens', '') for meta in results["metadatas"]],
+        "program_vectors": [eval(meta.get('program_vectors', '[]')) for meta in results["metadatas"]]  # Store structural vectors
     }
 
     # Create a Hugging Face Dataset
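
One caveat with this hunk: eval() executes whatever the metadata string contains, so a tampered or malformed program_vectors entry becomes arbitrary code execution. A safer round-trip under the same schema would serialize with json (a suggested alternative, not what the commit does):

    import json

    # When writing metadata (the commit uses str(program_vectors)):
    meta_value = json.dumps(program_vectors)

    # When reading it back (the commit uses eval(...)):
    program_vectors = [json.loads(meta.get('program_vectors', '[]'))
                       for meta in results["metadatas"]]

ast.literal_eval would also work as a drop-in replacement for eval on the existing str(...) format, since the vectors are plain nested lists of numbers.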