# process_hf_datasets.py
from datasets import load_dataset, Dataset
import re
from parser import parse_python_code
from database import init_chromadb, store_program, DB_NAME, HF_DATASET_NAME, HF_KEY
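
# Assumed interfaces of the sibling modules (inferred from how they are used
# below; the modules themselves are not shown in this file):
#   parse_python_code(code) -> (parts, sequence), where each part dict has a 'vector'
#   store_program(client, code, sequence, vectors, db_name) -> writes one program record
#   init_chromadb() -> a ChromaDB client whose DB_NAME collection exists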

def rename_variables(code):
    """Rename variables in Python code to input_var1, input_var2, etc."""
    # Simple variable name detection and renaming
    pattern = r'\b[a-zA-Z_]\w*\b'  # Match identifiers (simple heuristic)
    variables = set()
    code_lines = code.split('\n')
    # Find all variable names (simplified approach; also matches calls and attributes)
    for line in code_lines:
        matches = re.findall(pattern, line)
        for match in matches:
            # Exclude a small set of keywords and builtins
            if match not in ['def', 'if', 'else', 'for', 'while', 'return', 'import', 'print', 'eval', 'str', 'int']:
                variables.add(match)
    # Sort variables alphabetically (first-appearance ordering would need an AST pass)
    sorted_vars = sorted(variables)
    var_map = {var: f"input_var{i+1}" for i, var in enumerate(sorted_vars)}
    # Replace variables in code
    new_code = code
    for old_var, new_var in var_map.items():
        new_code = re.sub(r'\b' + old_var + r'\b', new_var, new_code)
    return new_code
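
# A minimal AST-based alternative, as the comment above suggests (a sketch,
# not wired into the pipeline below; _AstRenamer and rename_variables_ast are
# hypothetical names). Visiting only ast.Name nodes means keywords, attribute
# names, and string contents are never touched, unlike the regex heuristic.
# Requires Python 3.9+ for ast.unparse.
import ast
import builtins

class _AstRenamer(ast.NodeTransformer):
    def __init__(self):
        self.var_map = {}

    def visit_Name(self, node):
        if node.id in dir(builtins):  # leave print, str, int, etc. alone
            return node
        # Assign input_varN in order of first appearance
        if node.id not in self.var_map:
            self.var_map[node.id] = f"input_var{len(self.var_map) + 1}"
        node.id = self.var_map[node.id]
        return node

def rename_variables_ast(code):
    """AST-based variant of rename_variables (sketch)."""
    tree = _AstRenamer().visit(ast.parse(code))
    return ast.unparse(tree)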

def process_hf_dataset():
    """Process the Hugging Face dataset and store programs in ChromaDB."""
    # Load the dataset
    dataset = load_dataset("iamtarun/python_code_instructions_18k_alpaca", split="train")
    # Initialize ChromaDB client and fetch the collection once, outside the loop
    client = init_chromadb()
    collection = client.get_collection(DB_NAME)
    # Process each entry
    for entry in dataset:
        instruction = entry['instruction']
        output = entry['output']
        # Rename variables in the output code
        processed_code = rename_variables(output)
        # Parse the code to get parts and sequence
        parts, sequence = parse_python_code(processed_code)
        vectors = [part['vector'] for part in parts]
        # Generate description tokens from instruction
        description_tokens = f"task:{instruction.replace(' ', '_')}"
        # Store in ChromaDB with description
        store_program(client, processed_code, sequence, vectors, DB_NAME)
        # Update metadata with instruction as description
        # (this hash-based ID must match the scheme store_program uses)
        program_id = str(hash(processed_code))
        collection.update(
            ids=[program_id],
            metadatas=[{"sequence": ",".join(sequence), "description_tokens": description_tokens}]
        )
    # Save to Hugging Face Dataset
    save_chromadb_to_hf()
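
# Sketch: once populated, the collection can be searched by vector. The
# function below is illustrative (find_similar_programs is a hypothetical
# name); it assumes the vectors produced by parse_python_code are the same
# embeddings that ChromaDB indexes via store_program.
def find_similar_programs(query_vector, n_results=5):
    client = init_chromadb()
    collection = client.get_collection(DB_NAME)
    return collection.query(query_embeddings=[query_vector], n_results=n_results)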

def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=HF_KEY):
    """Save ChromaDB data to Hugging Face Dataset."""
    client = init_chromadb()
    collection = client.get_collection(DB_NAME)
    # Fetch all data from ChromaDB
    results = collection.get(include=["documents", "metadatas", "embeddings"])
    data = {
        "code": results["documents"],
        "sequence": [meta["sequence"] for meta in results["metadatas"]],
        "vectors": results["embeddings"],  # ChromaDB already flattens embeddings
        "description_tokens": [meta.get('description_tokens', '') for meta in results["metadatas"]]
    }
    # Create a Hugging Face Dataset
    dataset = Dataset.from_dict(data)
    # Push to Hugging Face Hub
    dataset.push_to_hub(dataset_name, token=token)
    print(f"Dataset pushed to Hugging Face Hub as {dataset_name}")
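
# Sketch: round-trip check by reloading the pushed dataset from the Hub
# (load_chromadb_from_hf is a hypothetical name; the token= keyword requires a
# recent datasets release, older versions used use_auth_token= instead).
def load_chromadb_from_hf(dataset_name=HF_DATASET_NAME, token=HF_KEY):
    dataset = load_dataset(dataset_name, split="train", token=token)
    print(f"Loaded {len(dataset)} programs from {dataset_name}")
    return dataset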

if __name__ == "__main__":
    process_hf_dataset()