# process_hf_dataset.py
from datasets import load_dataset, Dataset
import re

from parser import parse_python_code
from database import init_chromadb, store_program, DB_NAME, HF_DATASET_NAME, HF_KEY


def rename_variables(code):
    """Rename variables in Python code to input_var1, input_var2, etc."""
    # Simple variable-name detection via regex (a heuristic, not a real parser)
    pattern = r'\b[a-zA-Z_]\w*\b'
    variables = set()
    code_lines = code.split('\n')

    # Find candidate identifiers, excluding a small set of keywords/builtins
    keywords = {'def', 'if', 'else', 'for', 'while', 'return', 'import',
                'print', 'eval', 'str', 'int'}
    for line in code_lines:
        for match in re.findall(pattern, line):
            if match not in keywords:
                variables.add(match)

    # Sort variables alphabetically for a deterministic mapping
    # (ordering by first appearance would require an AST pass)
    sorted_vars = sorted(variables)
    var_map = {var: f"input_var{i + 1}" for i, var in enumerate(sorted_vars)}

    # Replace each identifier throughout the code, escaping it in case it
    # contains regex metacharacters
    new_code = code
    for old_var, new_var in var_map.items():
        new_code = re.sub(r'\b' + re.escape(old_var) + r'\b', new_var, new_code)

    return new_code


def process_hf_dataset():
    """Process the Hugging Face dataset and store programs in ChromaDB."""
    # Load the dataset
    dataset = load_dataset("iamtarun/python_code_instructions_18k_alpaca", split="train")

    # Initialize ChromaDB client
    client = init_chromadb()

    # Process each entry
    for entry in dataset:
        instruction = entry['instruction']
        output = entry['output']

        # Rename variables in the output code
        processed_code = rename_variables(output)

        # Parse the code to get parts and sequence
        parts, sequence = parse_python_code(processed_code)
        vectors = [part['vector'] for part in parts]

        # Generate description tokens from the instruction
        description_tokens = f"task:{instruction.replace(' ', '_')}"

        # Store in ChromaDB with description
        store_program(client, processed_code, sequence, vectors, DB_NAME)

        # Update metadata with the instruction as a description;
        # the ID must match the scheme used by store_program
        collection = client.get_collection(DB_NAME)
        program_id = str(hash(processed_code))
        collection.update(
            ids=[program_id],
            metadatas=[{"sequence": ",".join(sequence),
                        "description_tokens": description_tokens}]
        )

    # Save to Hugging Face Dataset
    save_chromadb_to_hf()


def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=HF_KEY):
    """Save ChromaDB data to a Hugging Face Dataset."""
    client = init_chromadb()
    collection = client.get_collection(DB_NAME)

    # Fetch all data from ChromaDB
    results = collection.get(include=["documents", "metadatas", "embeddings"])
    data = {
        "code": results["documents"],
        "sequence": [meta["sequence"] for meta in results["metadatas"]],
        "vectors": results["embeddings"],  # ChromaDB already flattens embeddings
        "description_tokens": [meta.get('description_tokens', '')
                               for meta in results["metadatas"]]
    }

    # Create a Hugging Face Dataset and push it to the Hub
    dataset = Dataset.from_dict(data)
    dataset.push_to_hub(dataset_name, token=token)
    print(f"Dataset pushed to Hugging Face Hub as {dataset_name}")


if __name__ == "__main__":
    process_hf_dataset()
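

# Appendix: the comments in rename_variables note that first-appearance
# ordering and safer renaming "could improve with AST". Below is a minimal,
# hedged sketch of that AST-based variant. It is NOT wired into the pipeline
# above; rename_variables_ast is a hypothetical alternative that only renames
# identifiers actually bound in the code (assignment targets and function
# arguments), so builtins and imported names are left untouched. Requires
# Python 3.9+ for ast.unparse.
import ast


def rename_variables_ast(code):
    """AST-based variant: rename bound variables to input_var1, input_var2, ..."""
    tree = ast.parse(code)

    # Collect names that are bound: assignment/loop targets and function args
    bound = set()
    for node in ast.walk(tree):
        if isinstance(node, ast.Name) and isinstance(node.ctx, ast.Store):
            bound.add(node.id)
        elif isinstance(node, ast.arg):
            bound.add(node.arg)

    # Alphabetical mapping, mirroring the heuristic version above
    var_map = {name: f"input_var{i + 1}" for i, name in enumerate(sorted(bound))}

    class Renamer(ast.NodeTransformer):
        def visit_Name(self, node):
            if node.id in var_map:
                node.id = var_map[node.id]
            return node

        def visit_arg(self, node):
            if node.arg in var_map:
                node.arg = var_map[node.arg]
            return node

    return ast.unparse(Renamer().visit(tree))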