# process_hf_dataset.py
from datasets import load_dataset, Dataset
import re
from parser import parse_python_code
from database import init_chromadb, store_program, DB_NAME, HF_DATASET_NAME, HF_KEY


def rename_variables(code):
    """Rename variables in Python code to input_var1, input_var2, etc."""
    # Simple variable name detection and renaming
    pattern = r'\b[a-zA-Z_]\w*\b'  # Match identifiers (simple heuristic)
    variables = set()
    code_lines = code.split('\n')
    # Collect candidate names, excluding a small set of keywords and builtins.
    # This heuristic also catches function names and words inside string
    # literals; see the AST-based sketch below for a stricter alternative.
    for line in code_lines:
        matches = re.findall(pattern, line)
        for match in matches:
            if match not in ['def', 'if', 'else', 'for', 'while', 'return',
                             'import', 'print', 'eval', 'str', 'int']:
                variables.add(match)
    # Sort alphabetically for a deterministic mapping; true first-appearance
    # order would require an AST walk
    sorted_vars = sorted(variables)
    var_map = {var: f"input_var{i+1}" for i, var in enumerate(sorted_vars)}
    # Replace variables in code
    new_code = code
    for old_var, new_var in var_map.items():
        new_code = re.sub(r'\b' + re.escape(old_var) + r'\b', new_var, new_code)
    return new_code
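

# The regex heuristic above also renames function names and words inside
# string literals. Below is a minimal AST-based sketch (a hypothetical
# alternative, not used by the pipeline) that touches only real identifiers
# and skips builtins; ast.unparse requires Python 3.9+.
def rename_variables_ast(code):
    """Sketch: rename only ast.Name identifiers, skipping builtins."""
    import ast
    import builtins
    tree = ast.parse(code)
    builtin_names = set(dir(builtins))
    names = []
    for node in ast.walk(tree):
        # ast.walk is breadth-first, so this ordering approximates (but does
        # not guarantee) first appearance in the source
        if isinstance(node, ast.Name) and node.id not in builtin_names:
            if node.id not in names:
                names.append(node.id)
    var_map = {name: f"input_var{i + 1}" for i, name in enumerate(names)}
    for node in ast.walk(tree):
        if isinstance(node, ast.Name) and node.id in var_map:
            node.id = var_map[node.id]
    return ast.unparse(tree)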


def process_hf_dataset():
    """Process the Hugging Face dataset and store programs in ChromaDB."""
    # Load the dataset
    dataset = load_dataset("iamtarun/python_code_instructions_18k_alpaca", split="train")
    # Initialize ChromaDB client and fetch the collection once, outside the loop
    client = init_chromadb()
    collection = client.get_collection(DB_NAME)
    # Process each entry
    for entry in dataset:
        instruction = entry['instruction']
        output = entry['output']
        # Rename variables in the output code
        processed_code = rename_variables(output)
        # Parse the code to get parts and sequence
        parts, sequence = parse_python_code(processed_code)
        vectors = [part['vector'] for part in parts]
        # Generate description tokens from the instruction
        description_tokens = f"task:{instruction.replace(' ', '_')}"
        # Store in ChromaDB
        store_program(client, processed_code, sequence, vectors, DB_NAME)
        # Attach the instruction-derived description to the stored record.
        # This assumes store_program derives IDs the same way; note that
        # Python's hash() is randomized across processes, so these IDs are
        # only stable within a single run.
        program_id = str(hash(processed_code))
        collection.update(
            ids=[program_id],
            metadatas=[{"sequence": ",".join(sequence), "description_tokens": description_tokens}]
        )
    # Save to Hugging Face Dataset
    save_chromadb_to_hf()
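

# For a quick smoke test before committing to the full 18k-row run, it may
# help to inspect a small slice first. A hypothetical helper (not part of
# the original pipeline), using datasets' select() to take the first n rows:
def preview_dataset(n=5):
    """Print the first n instructions and their renamed output code."""
    dataset = load_dataset("iamtarun/python_code_instructions_18k_alpaca", split="train")
    for entry in dataset.select(range(n)):
        print("instruction:", entry["instruction"])
        print(rename_variables(entry["output"]))
        print("-" * 40)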


def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=HF_KEY):
    """Save ChromaDB data to a Hugging Face Dataset."""
    client = init_chromadb()
    collection = client.get_collection(DB_NAME)
    # Fetch all data from ChromaDB
    results = collection.get(include=["documents", "metadatas", "embeddings"])
    data = {
        "code": results["documents"],
        "sequence": [meta["sequence"] for meta in results["metadatas"]],
        # ChromaDB returns one embedding per stored document (recent versions
        # may return a numpy array, which Dataset.from_dict accepts)
        "vectors": results["embeddings"],
        "description_tokens": [meta.get('description_tokens', '') for meta in results["metadatas"]]
    }
    # Create a Hugging Face Dataset
    dataset = Dataset.from_dict(data)
    # Push to Hugging Face Hub
    dataset.push_to_hub(dataset_name, token=token)
    print(f"Dataset pushed to Hugging Face Hub as {dataset_name}")


if __name__ == "__main__":
    process_hf_dataset()