# process_hf_dataset.py
from datasets import load_dataset, Dataset
import re
import os
import torch
from transformers import AutoTokenizer, AutoModel
from dotenv import load_dotenv
from parser import parse_python_code, create_vector
from database import init_chromadb, store_program, DB_NAME, HF_DATASET_NAME
import chromadb

# Load environment variables
load_dotenv()
def rename_variables(code, variable_prefixes=None):
    """Rename variables in Python code to align with vector categories (input_variable, assigned_variable, returned_variable)."""
    if variable_prefixes is None:
        variable_prefixes = {
            'input': 'input_variable',
            'assigned': 'assigned_variable',
            'returned': 'returned_variable'
        }

    # Simple variable name detection and renaming
    pattern = r'\b[a-zA-Z_]\w*\b'  # Match variable names (simple heuristic)
    variables = set()
    code_lines = code.split('\n')

    # Find all variable names (simplified approach, could improve with AST)
    for line in code_lines:
        matches = re.findall(pattern, line)
        for match in matches:
            if match not in ['def', 'if', 'else', 'for', 'while', 'return', 'import', 'print', 'eval', 'str', 'int']:  # Exclude keywords
                variables.add(match)

    # Sort variables alphabetically (ordering by first appearance would need AST)
    sorted_vars = sorted(list(variables))
    var_map = {}
    var_count = {'input_variable': 1, 'assigned_variable': 1, 'returned_variable': 1}

    # Assign variables based on context (simplified heuristic)
    for var in sorted_vars:
        # Determine variable role based on context
        is_input = any(var in line and 'def' in line for line in code_lines)  # Appears on a function definition line (input parameter)
        is_returned = any('return' in line and var in line for line in code_lines)  # Appears in a return statement
        is_assigned = any('=' in line and var in line.split('=')[0].strip() for line in code_lines)  # Appears on the left of an assignment

        if is_input:
            role = 'input_variable'
        elif is_returned:
            role = 'returned_variable'
        elif is_assigned:
            role = 'assigned_variable'
        else:
            role = 'assigned_variable'  # Default to assigned if unclear

        new_name = f"{role}{var_count[role]}"
        var_map[var] = new_name
        var_count[role] += 1

    # Replace variables in code
    new_code = code
    for old_var, new_var in var_map.items():
        new_code = re.sub(r'\b' + old_var + r'\b', new_var, new_code)

    return new_code, var_map
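
# Illustrative sketch (assumed behavior, not part of the pipeline): for a snippet like
# "def add(a, b):\n    result = a + b\n    return result", the names are detected
# alphabetically; 'a', 'add', and 'b' each appear on the 'def' line and 'result' appears
# in the return statement, so the mapping comes out along the lines of
# {'a': 'input_variable1', 'add': 'input_variable2', 'b': 'input_variable3',
#  'result': 'returned_variable1'}. The exact numbering depends on detection order.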
def generate_description_tokens(sequence, vectors, var_map=None):
    """Generate semantic description tokens for a program, including variable roles."""
    tokens = []
    category_descriptions = {
        'import': 'imports module',
        'function': 'defines function',
        'assigned_variable': 'assigns variable',
        'input_variable': 'input parameter',
        'returned_variable': 'returns value',
        'if': 'conditional statement',
        'return': 'returns result',
        'try': 'try block',
        'except': 'exception handler',
        'expression': 'expression statement',
        'spacer': 'empty line or comment'
    }

    for cat, vec in zip(sequence, vectors):
        if cat in category_descriptions:
            tokens.append(f"{category_descriptions[cat]}:{cat}")
        # Add vector-derived features (e.g., level, span) as tokens
        tokens.append(f"level:{vec[1]}")
        tokens.append(f"span:{vec[3]:.2f}")

    # Add variable role tokens if var_map exists
    if var_map:
        for old_var, new_var in var_map.items():
            role = new_var.split('variable')[0] + 'variable'  # Extract role (e.g., 'input_variable')
            tokens.append(f"variable:{old_var}={new_var}:{role}")

    return tokens
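
# For instance (hedged sketch with made-up vector values): with sequence
# ['import', 'function'] and two 6D vectors, the tokens come out roughly as
# ['imports module:import', 'level:1', 'span:0.05',
#  'defines function:function', 'level:1', 'span:0.40'],
# plus one 'variable:old=new:role' token per var_map entry.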
def generate_semantic_vector_og(description, total_lines=100):
    """Generate a 6D semantic vector for a textual description, matching our vector format."""
    # Use a simplified heuristic to map description to our 6D vector format
    category_map = {
        'import': 1, 'function': 2, 'assign': 17, 'input': 18, 'return': 19, 'if': 5, 'try': 8, 'except': 14
    }

    # Parse description for key terms
    tokens = description.lower().split()
    vector = [0] * 6  # Initialize 6D vector

    # Map description tokens to categories and assign basic vector values
    for token in tokens:
        for cat, cat_id in category_map.items():
            if cat in token:
                vector[0] = cat_id  # category_id
                vector[1] = 1  # level (assume top-level for simplicity)
                vector[2] = 0.5  # center_pos (midpoint of code)
                vector[3] = 0.1  # span (small for simplicity)
                vector[4] = 1  # parent_depth (shallow)
                vector[5] = cat_id / len(category_map)  # parent_weight (normalized)
                break
    return vector
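
# Rough example (assumed values): for the description "defines a function", only the
# 'function' term matches, so this heuristic returns [2, 1, 0.5, 0.1, 1, 0.25]
# (category_id 2, level 1, center_pos 0.5, span 0.1, parent_depth 1, parent_weight 2/8).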
"""Generate a 6D semantic vector for a textual description using CodeBERT, projecting to 6D.""" | |
# Load CodeBERT model and tokenizer | |
model_name = "microsoft/codebert-base" | |
tokenizer = AutoTokenizer.from_pretrained(model_name) | |
device = torch.device("cuda" if use_gpu and torch.cuda.is_available() else "cpu") | |
model = AutoModel.from_pretrained(model_name).to(device) | |
def generate_semantic_vector(description, total_lines=100, use_gpu=False): | |
# Tokenize and encode the description | |
inputs = tokenizer(description, return_tensors="pt", padding=True, truncation=True, max_length=512) | |
inputs = {k: v.to(device) for k, v in inputs.items()} | |
# Generate embeddings | |
with torch.no_grad(): | |
outputs = model(**inputs) | |
# Use mean pooling of the last hidden states | |
vector = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy().tolist() | |
# Truncate or project to 6D (simplified projection: take first 6 dimensions) | |
if len(vector) < 6: | |
vector.extend([0] * (6 - len(vector))) | |
elif len(vector) > 6: | |
vector = vector[:6] # Truncate to 6D | |
return vector | |
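
# Usage sketch (assumption: microsoft/codebert-base emits 768-dim hidden states, so
# keeping only the first 6 dimensions discards most of the embedding):
#   vec = generate_semantic_vector("Write a function that adds two numbers")
#   len(vec)  # -> 6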
def process_hf_dataset():
    """Process the Hugging Face dataset and store programs in ChromaDB, aligning with vector categories and including the instruction in vectors."""
    # Load the dataset
    dataset = load_dataset("iamtarun/python_code_instructions_18k_alpaca", split="train")

    # Initialize ChromaDB client
    client = init_chromadb()

    # Process each entry
    for entry in dataset:
        instruction = entry['instruction']
        output = entry['output']

        # Rename variables to align with vector categories
        processed_code, var_map = rename_variables(output)

        # Parse the code to get parts and sequence, generating our 6D vectors
        parts, sequence = parse_python_code(processed_code)
        vectors = [part['vector'] for part in parts]  # Use parser's 6D vectors

        # Generate description tokens including variable roles
        description_tokens = f"task:{instruction.replace(' ', '_')}"
        description_tokens_list = generate_description_tokens(sequence, vectors, var_map)
        description_tokens += " " + " ".join(description_tokens_list)

        # Generate a 6D semantic vector for the instruction; use it as the program's
        # primary vector for semantic search (averaging or concatenating with the
        # parser vectors would also work, as long as the result stays 6D)
        semantic_vector = generate_semantic_vector(instruction)
        combined_vector = semantic_vector

        # Store in ChromaDB with description and combined vector
        store_program(client, processed_code, sequence, [combined_vector], DB_NAME)

        # Update metadata with instruction and variable roles as description
        collection = client.get_collection(DB_NAME)
        program_id = str(hash(processed_code))
        collection.update(
            ids=[program_id],
            metadatas=[{"sequence": ",".join(sequence), "description_tokens": description_tokens}],
            embeddings=[combined_vector]  # Ensure 6D embedding
        )

    # Save to Hugging Face Dataset
    save_chromadb_to_hf()
def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=os.getenv("HF_KEY")):
    """Save ChromaDB data to a Hugging Face Dataset."""
    client = init_chromadb()
    collection = client.get_collection(DB_NAME)

    # Fetch all data from ChromaDB
    results = collection.get(include=["documents", "metadatas", "embeddings"])
    data = {
        "code": results["documents"],
        "sequence": [meta["sequence"] for meta in results["metadatas"]],
        "vectors": results["embeddings"],  # ChromaDB already flattens embeddings
        "description_tokens": [meta.get('description_tokens', '') for meta in results["metadatas"]]
    }

    # Create a Hugging Face Dataset
    dataset = Dataset.from_dict(data)

    # Push to Hugging Face Hub
    dataset.push_to_hub(dataset_name, token=token)
    print(f"Dataset pushed to Hugging Face Hub as {dataset_name}")
if __name__ == "__main__":
    process_hf_dataset()