# process_hf_dataset.py
from datasets import load_dataset, Dataset
import re
from parser import parse_python_code
from database import init_chromadb, store_program, DB_NAME, HF_DATASET_NAME, HF_KEY
import chromadb
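
# Assumed interfaces for the local helper modules, inferred from how they are
# used below (not defined in this file):
#   parse_python_code(code) -> (parts, sequence), where each part dict has a 'vector'
#   init_chromadb()         -> a chromadb client
#   store_program(client, code, sequence, vectors, collection_name) -> stores one program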
def rename_variables(code, variable_prefixes=None):
    """Rename variables in Python code to align with vector categories (input_variable, assigned_variable, returned_variable)."""
    if variable_prefixes is None:
        variable_prefixes = {
            'input': 'input_variable',
            'assigned': 'assigned_variable',
            'returned': 'returned_variable'
        }
    # Simple variable name detection and renaming
    pattern = r'\b[a-zA-Z_]\w*\b'  # Match identifier-like tokens (simple heuristic)
    variables = set()
    code_lines = code.split('\n')
    # Find all variable names (simplified approach; could improve with AST)
    for line in code_lines:
        matches = re.findall(pattern, line)
        for match in matches:
            if match not in ['def', 'if', 'else', 'for', 'while', 'return', 'import', 'print', 'eval', 'str', 'int']:  # Exclude keywords and common built-ins
                variables.add(match)
    # Sort variables alphabetically (first-appearance order would need an AST)
    sorted_vars = sorted(variables)
    var_map = {}
    var_count = {prefix: 1 for prefix in variable_prefixes.values()}
    # Assign each variable a role based on context (simplified heuristic)
    for var in sorted_vars:
        if var in ['expression', 'input']:  # Assume input parameters or initial variables
            role = 'input_variable'
        elif any('return' in line and var in line for line in code_lines):  # Variables appearing on return lines
            role = 'returned_variable'
        else:  # Default to assigned variables
            role = 'assigned_variable'
        new_name = f"{role}{var_count[role]}"
        var_map[var] = new_name
        var_count[role] += 1
    # Replace variables in code, matching on word boundaries to avoid partial hits
    new_code = code
    for old_var, new_var in var_map.items():
        new_code = re.sub(r'\b' + re.escape(old_var) + r'\b', new_var, new_code)
    return new_code, var_map
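
# Illustrative example of the heuristic above, using the reconstructed return-line
# check (note the regex approach also renames function names, which an AST-based
# pass would avoid):
#     new_code, var_map = rename_variables("def add(a, b):\n    return a + b")
#     # var_map == {'a': 'returned_variable1', 'add': 'assigned_variable1',
#     #             'b': 'returned_variable2'}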
def generate_description_tokens(sequence, vectors, var_map=None):
    """Generate semantic description tokens for a program, including variable roles."""
    tokens = []
    category_descriptions = {
        'import': 'imports module',
        'function': 'defines function',
        'assigned_variable': 'assigns variable',
        'input_variable': 'input parameter',
        'returned_variable': 'returns value',
        'if': 'conditional statement',
        'return': 'returns result',
        'try': 'try block',
        'except': 'exception handler',
        'expression': 'expression statement',
        'spacer': 'empty line or comment'
    }
    for cat, vec in zip(sequence, vectors):
        if cat in category_descriptions:
            tokens.append(f"{category_descriptions[cat]}:{cat}")
        # Add vector-derived features (e.g., level, span) as tokens
        tokens.append(f"level:{vec[1]}")
        tokens.append(f"span:{vec[3]:.2f}")
    # Add variable role tokens if var_map exists
    if var_map:
        for old_var, new_var in var_map.items():
            role = new_var.split('variable')[0] + 'variable'  # Extract role (e.g., 'input_variable')
            tokens.append(f"variable:{old_var}={new_var}:{role}")
    return tokens
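
# Illustrative output, assuming vectors of the form [category_id, level, x, span, ...]:
#     generate_description_tokens(['import'], [[1, 0, 0, 0.25]], {'a': 'returned_variable1'})
#     # -> ['imports module:import', 'level:0', 'span:0.25',
#     #     'variable:a=returned_variable1:returned_variable']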
def process_hf_dataset():
    """Process the Hugging Face dataset and store programs in ChromaDB, aligning with vector categories."""
    # Load the dataset
    dataset = load_dataset("iamtarun/python_code_instructions_18k_alpaca", split="train")
    # Initialize ChromaDB client
    client = init_chromadb()
    # Process each entry
    for entry in dataset:
        instruction = entry['instruction']
        output = entry['output']
        # Rename variables to align with vector categories
        processed_code, var_map = rename_variables(output)
        # Parse the code to get parts and sequence
        parts, sequence = parse_python_code(processed_code)
        vectors = [part['vector'] for part in parts]
        # Generate description tokens including variable roles
        description_tokens = f"task:{instruction.replace(' ', '_')}"
        description_tokens_list = generate_description_tokens(sequence, vectors, var_map)
        description_tokens += " " + " ".join(description_tokens_list)
        # Store in ChromaDB with description
        store_program(client, processed_code, sequence, vectors, DB_NAME)
        # Update metadata with instruction and variable roles as description.
        # Note: this assumes store_program derives the same hash-based ID;
        # str(hash(...)) is only stable within one interpreter run unless
        # PYTHONHASHSEED is fixed.
        collection = client.get_collection(DB_NAME)
        program_id = str(hash(processed_code))
        collection.update(
            ids=[program_id],
            metadatas=[{"sequence": ",".join(sequence), "description_tokens": description_tokens}]
        )
    # Save to Hugging Face Dataset
    save_chromadb_to_hf()
def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=HF_KEY):
    """Save ChromaDB data to a Hugging Face Dataset."""
    client = init_chromadb()
    collection = client.get_collection(DB_NAME)
    # Fetch all data from ChromaDB
    results = collection.get(include=["documents", "metadatas", "embeddings"])
    data = {
        "code": results["documents"],
        "sequence": [meta["sequence"] for meta in results["metadatas"]],
        "vectors": results["embeddings"],  # ChromaDB already flattens embeddings
        "description_tokens": [meta.get('description_tokens', '') for meta in results["metadatas"]]
    }
    # Create a Hugging Face Dataset
    dataset = Dataset.from_dict(data)
    # Push to Hugging Face Hub
    dataset.push_to_hub(dataset_name, token=token)
    print(f"Dataset pushed to Hugging Face Hub as {dataset_name}")
if __name__ == "__main__":
    process_hf_dataset()