# process_hf_dataset.py
from datasets import load_dataset, Dataset
import re
import os
import torch
from transformers import AutoTokenizer, AutoModel
from dotenv import load_dotenv
from parser import parse_python_code, create_vector
from database import init_chromadb, store_program, DB_NAME, HF_DATASET_NAME
import chromadb

# Load environment variables
load_dotenv()
def rename_variables(code, variable_prefixes=None):
    """Rename variables in Python code to align with vector categories (input_variable, assigned_variable, returned_variable)."""
    if variable_prefixes is None:
        variable_prefixes = {
            'input': 'input_variable',
            'assigned': 'assigned_variable',
            'returned': 'returned_variable'
        }

    # Simple variable name detection and renaming
    pattern = r'\b[a-zA-Z_]\w*\b'  # Match variable names (simple heuristic)
    variables = set()
    code_lines = code.split('\n')

    # Find all variable names (simplified approach, could improve with AST)
    for line in code_lines:
        matches = re.findall(pattern, line)
        for match in matches:
            if match not in ['def', 'if', 'else', 'for', 'while', 'return', 'import', 'print', 'eval', 'str', 'int']:  # Exclude keywords
                variables.add(match)

    # Sort variables alphabetically (ordering by first appearance would need AST)
    sorted_vars = sorted(list(variables))
    var_map = {}
    var_count = {'input_variable': 1, 'assigned_variable': 1, 'returned_variable': 1}

    # Assign variables based on context (simplified heuristic)
    for var in sorted_vars:
        # Determine variable role based on context
        is_input = any(var in line and 'def' in line for line in code_lines)  # Appears on a function definition line (input parameter)
        is_returned = any('return' in line and var in line for line in code_lines)  # Appears in a return statement
        is_assigned = any('=' in line and var in line.split('=')[0].strip() for line in code_lines)  # Appears on the left of an assignment

        if is_input:
            role = 'input_variable'
        elif is_returned:
            role = 'returned_variable'
        elif is_assigned:
            role = 'assigned_variable'
        else:
            role = 'assigned_variable'  # Default to assigned if unclear

        new_name = f"{role}{var_count[role]}"
        var_map[var] = new_name
        var_count[role] += 1

    # Replace variables in code
    new_code = code
    for old_var, new_var in var_map.items():
        new_code = re.sub(r'\b' + old_var + r'\b', new_var, new_code)

    return new_code, var_map
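
# Illustrative sketch (assumed behavior, not part of the pipeline): for a snippet like
# "def add(a, b):\n    result = a + b\n    return result", the names are detected
# alphabetically; 'a', 'add', and 'b' each appear on the 'def' line and 'result' appears
# in the return statement, so the mapping comes out along the lines of
# {'a': 'input_variable1', 'add': 'input_variable2', 'b': 'input_variable3',
#  'result': 'returned_variable1'}. The exact numbering depends on detection order.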
def generate_description_tokens(sequence, vectors, var_map=None):
    """Generate semantic description tokens for a program, including variable roles."""
    tokens = []
    category_descriptions = {
        'import': 'imports module',
        'function': 'defines function',
        'assigned_variable': 'assigns variable',
        'input_variable': 'input parameter',
        'returned_variable': 'returns value',
        'if': 'conditional statement',
        'return': 'returns result',
        'try': 'try block',
        'except': 'exception handler',
        'expression': 'expression statement',
        'spacer': 'empty line or comment'
    }

    for cat, vec in zip(sequence, vectors):
        if cat in category_descriptions:
            tokens.append(f"{category_descriptions[cat]}:{cat}")
        # Add vector-derived features (e.g., level, span) as tokens
        tokens.append(f"level:{vec[1]}")
        tokens.append(f"span:{vec[3]:.2f}")

    # Add variable role tokens if var_map exists
    if var_map:
        for old_var, new_var in var_map.items():
            role = new_var.split('variable')[0] + 'variable'  # Extract role (e.g., 'input_variable')
            tokens.append(f"variable:{old_var}={new_var}:{role}")

    return tokens
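
# For instance (hedged sketch with made-up vector values): with sequence
# ['import', 'function'] and two 6D vectors, the tokens come out roughly as
# ['imports module:import', 'level:1', 'span:0.05',
#  'defines function:function', 'level:1', 'span:0.40'],
# plus one 'variable:old=new:role' token per var_map entry.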
def generate_semantic_vector_og(description, total_lines=100):
    """Generate a 6D semantic vector for a textual description, matching our vector format."""
    # Use a simplified heuristic to map description to our 6D vector format
    category_map = {
        'import': 1, 'function': 2, 'assign': 17, 'input': 18, 'return': 19, 'if': 5, 'try': 8, 'except': 14
    }

    # Parse description for key terms
    tokens = description.lower().split()
    vector = [0] * 6  # Initialize 6D vector

    # Map description tokens to categories and assign basic vector values
    for token in tokens:
        for cat, cat_id in category_map.items():
            if cat in token:
                vector[0] = cat_id  # category_id
                vector[1] = 1  # level (assume top-level for simplicity)
                vector[2] = 0.5  # center_pos (midpoint of code)
                vector[3] = 0.1  # span (small for simplicity)
                vector[4] = 1  # parent_depth (shallow)
                vector[5] = cat_id / len(category_map)  # parent_weight (normalized)
                break
    return vector
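
# Rough example (assumed values): for the description "defines a function", only the
# 'function' term matches, so this heuristic returns [2, 1, 0.5, 0.1, 1, 0.25]
# (category_id 2, level 1, center_pos 0.5, span 0.1, parent_depth 1, parent_weight 2/8).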
"""Generate a 6D semantic vector for a textual description using CodeBERT, projecting to 6D.""" | |
# Load CodeBERT model and tokenizer | |
model_name = "microsoft/codebert-base" | |
tokenizer = AutoTokenizer.from_pretrained(model_name) | |
device = torch.device("cuda" if use_gpu and torch.cuda.is_available() else "cpu") | |
model = AutoModel.from_pretrained(model_name).to(device) | |
def generate_semantic_vector(description, total_lines=100, use_gpu=False): | |
# Tokenize and encode the description | |
inputs = tokenizer(description, return_tensors="pt", padding=True, truncation=True, max_length=512) | |
inputs = {k: v.to(device) for k, v in inputs.items()} | |
# Generate embeddings | |
with torch.no_grad(): | |
outputs = model(**inputs) | |
# Use mean pooling of the last hidden states | |
vector = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy().tolist() | |
# Truncate or project to 6D (simplified projection: take first 6 dimensions) | |
if len(vector) < 6: | |
vector.extend([0] * (6 - len(vector))) | |
elif len(vector) > 6: | |
vector = vector[:6] # Truncate to 6D | |
return vector | |
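
# Usage sketch (assumption: microsoft/codebert-base emits 768-dim hidden states, so
# keeping only the first 6 dimensions discards most of the embedding):
#   vec = generate_semantic_vector("Write a function that adds two numbers")
#   len(vec)  # -> 6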
def process_hf_dataset():
    """Process the Hugging Face dataset and store programs in ChromaDB, aligning with vector categories and including the instruction in vectors."""
    # Load the dataset
    dataset = load_dataset("iamtarun/python_code_instructions_18k_alpaca", split="train")

    # Initialize ChromaDB client
    client = init_chromadb()

    # Process each entry
    for entry in dataset:
        instruction = entry['instruction']
        output = entry['output']

        # Rename variables to align with vector categories
        processed_code, var_map = rename_variables(output)

        # Parse the code to get parts and sequence, generating our 6D vectors
        parts, sequence = parse_python_code(processed_code)
        vectors = [part['vector'] for part in parts]  # Use parser's 6D vectors

        # Generate description tokens including variable roles
        description_tokens = f"task:{instruction.replace(' ', '_')}"
        description_tokens_list = generate_description_tokens(sequence, vectors, var_map)
        description_tokens += " " + " ".join(description_tokens_list)

        # Generate a 6D semantic vector for the instruction; use it as the program's
        # primary vector for semantic search (averaging or concatenating with the
        # parser vectors would also work, as long as the result stays 6D)
        semantic_vector = generate_semantic_vector(instruction)
        combined_vector = semantic_vector

        # Store in ChromaDB with description and combined vector
        store_program(client, processed_code, sequence, [combined_vector], DB_NAME)

        # Update metadata with instruction and variable roles as description
        collection = client.get_collection(DB_NAME)
        program_id = str(hash(processed_code))
        collection.update(
            ids=[program_id],
            metadatas=[{"sequence": ",".join(sequence), "description_tokens": description_tokens}],
            embeddings=[combined_vector]  # Ensure 6D embedding
        )

    # Save to Hugging Face Dataset
    save_chromadb_to_hf()
def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=os.getenv("HF_KEY")):
    """Save ChromaDB data to a Hugging Face Dataset."""
    client = init_chromadb()
    collection = client.get_collection(DB_NAME)

    # Fetch all data from ChromaDB
    results = collection.get(include=["documents", "metadatas", "embeddings"])
    data = {
        "code": results["documents"],
        "sequence": [meta["sequence"] for meta in results["metadatas"]],
        "vectors": results["embeddings"],  # ChromaDB already flattens embeddings
        "description_tokens": [meta.get('description_tokens', '') for meta in results["metadatas"]]
    }

    # Create a Hugging Face Dataset
    dataset = Dataset.from_dict(data)

    # Push to Hugging Face Hub
    dataset.push_to_hub(dataset_name, token=token)
    print(f"Dataset pushed to Hugging Face Hub as {dataset_name}")
if __name__ == "__main__":
    process_hf_dataset()