# process_hf_dataset.py
from datasets import load_dataset, Dataset
import re
from parser import parse_python_code, create_vector
from database import init_chromadb, store_program, DB_NAME, HF_DATASET_NAME
import chromadb
import os
from dotenv import load_dotenv
# Load environment variables
load_dotenv()

def rename_variables(code, variable_prefixes=None):
    """Rename variables in Python code to align with vector categories (input_variable, assigned_variable, returned_variable)."""
    if variable_prefixes is None:
        variable_prefixes = {
            'input': 'input_variable',
            'assigned': 'assigned_variable',
            'returned': 'returned_variable'
        }
    # Simple variable-name detection and renaming
    pattern = r'\b[a-zA-Z_]\w*\b'  # Match identifier-like tokens (simple heuristic)
    variables = set()
    code_lines = code.split('\n')
    # Collect candidate variable names (simplified approach; an AST walk would be more precise)
    for line in code_lines:
        matches = re.findall(pattern, line)
        for match in matches:
            if match not in ['def', 'if', 'else', 'for', 'while', 'return', 'import', 'print', 'eval', 'str', 'int']:  # Exclude keywords and common built-ins
                variables.add(match)
    # Sort variables alphabetically (a stand-in for first-appearance order, which would need an AST)
    sorted_vars = sorted(variables)
    var_map = {}
    var_count = {'input_variable': 1, 'assigned_variable': 1, 'returned_variable': 1}
    # Assign each variable a role based on context (simplified heuristic)
    for var in sorted_vars:
        is_input = any(var in line and 'def' in line for line in code_lines)  # Appears in a function definition (input parameter)
        is_returned = any('return' in line and var in line for line in code_lines)  # Appears in a return statement
        is_assigned = any('=' in line and var in line.split('=')[0].strip() for line in code_lines)  # Appears on the left of an assignment
        if is_input:
            role = 'input_variable'
        elif is_returned:
            role = 'returned_variable'
        elif is_assigned:
            role = 'assigned_variable'
        else:
            role = 'assigned_variable'  # Default to assigned if the role is unclear
        new_name = f"{role}{var_count[role]}"
        var_map[var] = new_name
        var_count[role] += 1
    # Replace whole-word occurrences of each variable in the code
    new_code = code
    for old_var, new_var in var_map.items():
        new_code = re.sub(r'\b' + re.escape(old_var) + r'\b', new_var, new_code)
    return new_code, var_map
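
# A minimal, hypothetical usage sketch (the exact numbering depends on the
# alphabetical order of the detected names, and the simple regex also captures
# the function name itself):
#
#     code = "def add(x, y):\n    total = x + y\n    return total"
#     renamed, var_map = rename_variables(code)
#     # var_map == {'add': 'input_variable1', 'total': 'returned_variable1',
#     #             'x': 'input_variable2', 'y': 'input_variable3'}
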
def generate_description_tokens(sequence, vectors, var_map=None):
    """Generate semantic description tokens for a program, including variable roles."""
    tokens = []
    category_descriptions = {
        'import': 'imports module',
        'function': 'defines function',
        'assigned_variable': 'assigns variable',
        'input_variable': 'input parameter',
        'returned_variable': 'returns value',
        'if': 'conditional statement',
        'return': 'returns result',
        'try': 'try block',
        'except': 'exception handler',
        'expression': 'expression statement',
        'spacer': 'empty line or comment'
    }
    for cat, vec in zip(sequence, vectors):
        if cat in category_descriptions:
            tokens.append(f"{category_descriptions[cat]}:{cat}")
            # Add vector-derived features (e.g., level, span) as tokens
            tokens.append(f"level:{vec[1]}")
            tokens.append(f"span:{vec[3]:.2f}")
    # Add variable role tokens if a rename map exists
    if var_map:
        for old_var, new_var in var_map.items():
            role = new_var.split('variable')[0] + 'variable'  # Extract the role (e.g., 'input_variable')
            tokens.append(f"variable:{old_var}={new_var}:{role}")
    return tokens
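
# A hypothetical sketch of the token stream this produces, assuming the parser
# returned sequence=['import', 'function'] with matching 6D vectors:
#
#     generate_description_tokens(['import', 'function'],
#                                 [[1, 1, 0.1, 0.05, 0, 0], [2, 1, 0.5, 0.40, 0, 0]],
#                                 {'x': 'input_variable1'})
#     # -> ['imports module:import', 'level:1', 'span:0.05',
#     #     'defines function:function', 'level:1', 'span:0.40',
#     #     'variable:x=input_variable1:input_variable']
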
def generate_semantic_vector(description, total_lines=100):
    """Generate a 6D semantic vector for a textual description, matching our vector format."""
    # Use a simplified heuristic to map the description onto our 6D vector format
    category_map = {
        'import': 1, 'function': 2, 'assign': 17, 'input': 18, 'return': 19, 'if': 5, 'try': 8, 'except': 14
    }
    # Scan the description for key terms; each match overwrites the vector before
    # breaking out of the inner loop, so the last matching term wins
    tokens = description.lower().split()
    vector = [0] * 6  # Initialize the 6D vector
    for token in tokens:
        for cat, cat_id in category_map.items():
            if cat in token:
                vector[0] = cat_id  # category_id
                vector[1] = 1  # level (assume top-level for simplicity)
                vector[2] = 0.5  # center_pos (midpoint of code)
                vector[3] = 0.1  # span (small for simplicity)
                vector[4] = 1  # parent_depth (shallow)
                vector[5] = cat_id / len(category_map)  # parent_weight (normalized)
                break
    return vector
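
# A hypothetical sketch: "function" maps to category_id 2, and with eight
# entries in category_map the parent_weight is 2 / 8 = 0.25:
#
#     generate_semantic_vector("Write a function to add two numbers")
#     # -> [2, 1, 0.5, 0.1, 1, 0.25]
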
def process_hf_dataset():
    """Process the Hugging Face dataset and store programs in ChromaDB, aligning with vector categories and encoding the instruction in the stored vector."""
    # Load the dataset
    dataset = load_dataset("iamtarun/python_code_instructions_18k_alpaca", split="train")
    # Initialize the ChromaDB client
    client = init_chromadb()
    # Process each entry
    for entry in dataset:
        instruction = entry['instruction']
        output = entry['output']
        # Rename variables to align with vector categories
        processed_code, var_map = rename_variables(output)
        # Parse the code into parts and a sequence, generating our 6D vectors
        parts, sequence = parse_python_code(processed_code)
        vectors = [part['vector'] for part in parts]  # Use the parser's 6D vectors
        # Generate description tokens, including variable roles
        description_tokens = f"task:{instruction.replace(' ', '_')}"
        description_tokens_list = generate_description_tokens(sequence, vectors, var_map)
        description_tokens += " " + " ".join(description_tokens_list)
        # Generate a 6D semantic vector for the instruction; this (rather than an
        # average or concatenation of the program vectors) is stored as the
        # program's embedding, so search stays 6D and instruction-driven
        semantic_vector = generate_semantic_vector(instruction)
        combined_vector = semantic_vector
        # Store in ChromaDB with the description and combined vector
        store_program(client, processed_code, sequence, [combined_vector], DB_NAME)
        # Update the metadata with the instruction and variable roles as the description
        collection = client.get_collection(DB_NAME)
        program_id = str(hash(processed_code))  # Assumed to match the id scheme used by store_program
        collection.update(
            ids=[program_id],
            metadatas=[{"sequence": ",".join(sequence), "description_tokens": description_tokens}],
            embeddings=[combined_vector]  # Ensure a 6D embedding
        )
    # Save the populated collection to a Hugging Face Dataset
    save_chromadb_to_hf()
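
# The dataset's rows follow the Alpaca layout; a row looks roughly like this
# (abridged, hypothetical values):
#
#     {'instruction': 'Write a function to add two numbers.',
#      'input': '',
#      'output': 'def add(x, y):\n    return x + y'}
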
def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=os.getenv("HF_KEY")):
    """Save the ChromaDB collection to a Hugging Face Dataset."""
    client = init_chromadb()
    collection = client.get_collection(DB_NAME)
    # Fetch all data from ChromaDB
    results = collection.get(include=["documents", "metadatas", "embeddings"])
    data = {
        "code": results["documents"],
        "sequence": [meta["sequence"] for meta in results["metadatas"]],
        "vectors": results["embeddings"],  # ChromaDB returns one flat 6D vector per document
        "description_tokens": [meta.get('description_tokens', '') for meta in results["metadatas"]]
    }
    # Create a Hugging Face Dataset
    dataset = Dataset.from_dict(data)
    # Push to the Hugging Face Hub
    dataset.push_to_hub(dataset_name, token=token)
    print(f"Dataset pushed to Hugging Face Hub as {dataset_name}")
if __name__ == "__main__":
    process_hf_dataset()