# process_hf_datasets.py
from datasets import load_dataset, Dataset
import re
from parser import parse_python_code
from database import init_chromadb, store_program, DB_NAME, HF_DATASET_NAME, HF_KEY
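
# Assumed interfaces of the sibling modules (inferred from how they are used
# below; the modules themselves are not shown in this file):
#   parse_python_code(code) -> (parts, sequence), where each part dict has a 'vector'
#   store_program(client, code, sequence, vectors, db_name) -> writes one program record
#   init_chromadb() -> a ChromaDB client whose DB_NAME collection exists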

def rename_variables(code):
    """Rename variables in Python code to input_var1, input_var2, etc."""
    # Simple variable name detection and renaming
    pattern = r'\b[a-zA-Z_]\w*\b'  # Match identifiers (simple heuristic)
    variables = set()
    code_lines = code.split('\n')
    # Find all variable names (simplified approach; also matches calls and attributes)
    for line in code_lines:
        matches = re.findall(pattern, line)
        for match in matches:
            # Exclude a small set of keywords and builtins
            if match not in ['def', 'if', 'else', 'for', 'while', 'return', 'import', 'print', 'eval', 'str', 'int']:
                variables.add(match)
    # Sort variables alphabetically (first-appearance ordering would need an AST pass)
    sorted_vars = sorted(variables)
    var_map = {var: f"input_var{i+1}" for i, var in enumerate(sorted_vars)}
    # Replace variables in code
    new_code = code
    for old_var, new_var in var_map.items():
        new_code = re.sub(r'\b' + old_var + r'\b', new_var, new_code)
    return new_code
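
# A minimal AST-based alternative, as the comment above suggests (a sketch,
# not wired into the pipeline below; _AstRenamer and rename_variables_ast are
# hypothetical names). Visiting only ast.Name nodes means keywords, attribute
# names, and string contents are never touched, unlike the regex heuristic.
# Requires Python 3.9+ for ast.unparse.
import ast
import builtins

class _AstRenamer(ast.NodeTransformer):
    def __init__(self):
        self.var_map = {}

    def visit_Name(self, node):
        if node.id in dir(builtins):  # leave print, str, int, etc. alone
            return node
        # Assign input_varN in order of first appearance
        if node.id not in self.var_map:
            self.var_map[node.id] = f"input_var{len(self.var_map) + 1}"
        node.id = self.var_map[node.id]
        return node

def rename_variables_ast(code):
    """AST-based variant of rename_variables (sketch)."""
    tree = _AstRenamer().visit(ast.parse(code))
    return ast.unparse(tree)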

def process_hf_dataset():
    """Process the Hugging Face dataset and store programs in ChromaDB."""
    # Load the dataset
    dataset = load_dataset("iamtarun/python_code_instructions_18k_alpaca", split="train")
    # Initialize ChromaDB client and fetch the collection once, outside the loop
    client = init_chromadb()
    collection = client.get_collection(DB_NAME)
    # Process each entry
    for entry in dataset:
        instruction = entry['instruction']
        output = entry['output']
        # Rename variables in the output code
        processed_code = rename_variables(output)
        # Parse the code to get parts and sequence
        parts, sequence = parse_python_code(processed_code)
        vectors = [part['vector'] for part in parts]
        # Generate description tokens from instruction
        description_tokens = f"task:{instruction.replace(' ', '_')}"
        # Store in ChromaDB with description
        store_program(client, processed_code, sequence, vectors, DB_NAME)
        # Update metadata with instruction as description
        # (this hash-based ID must match the scheme store_program uses)
        program_id = str(hash(processed_code))
        collection.update(
            ids=[program_id],
            metadatas=[{"sequence": ",".join(sequence), "description_tokens": description_tokens}]
        )
    # Save to Hugging Face Dataset
    save_chromadb_to_hf()
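
# Sketch: once populated, the collection can be searched by vector. The
# function below is illustrative (find_similar_programs is a hypothetical
# name); it assumes the vectors produced by parse_python_code are the same
# embeddings that ChromaDB indexes via store_program.
def find_similar_programs(query_vector, n_results=5):
    client = init_chromadb()
    collection = client.get_collection(DB_NAME)
    return collection.query(query_embeddings=[query_vector], n_results=n_results)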

def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=HF_KEY):
    """Save ChromaDB data to Hugging Face Dataset."""
    client = init_chromadb()
    collection = client.get_collection(DB_NAME)
    # Fetch all data from ChromaDB
    results = collection.get(include=["documents", "metadatas", "embeddings"])
    data = {
        "code": results["documents"],
        "sequence": [meta["sequence"] for meta in results["metadatas"]],
        "vectors": results["embeddings"],  # ChromaDB already flattens embeddings
        "description_tokens": [meta.get('description_tokens', '') for meta in results["metadatas"]]
    }
    # Create a Hugging Face Dataset
    dataset = Dataset.from_dict(data)
    # Push to Hugging Face Hub
    dataset.push_to_hub(dataset_name, token=token)
    print(f"Dataset pushed to Hugging Face Hub as {dataset_name}")
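
# Sketch: round-trip check by reloading the pushed dataset from the Hub
# (load_chromadb_from_hf is a hypothetical name; the token= keyword requires a
# recent datasets release, older versions used use_auth_token= instead).
def load_chromadb_from_hf(dataset_name=HF_DATASET_NAME, token=HF_KEY):
    dataset = load_dataset(dataset_name, split="train", token=token)
    print(f"Loaded {len(dataset)} programs from {dataset_name}")
    return dataset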

if __name__ == "__main__":
    process_hf_dataset()