Spaces:
Running
Running
File size: 8,065 Bytes
b284540 d6c93c4 e77acbf b284540 e77acbf b284540 0df5c07 b284540 0df5c07 b284540 0df5c07 e77acbf 0df5c07 e77acbf 0df5c07 e77acbf 0df5c07 e77acbf 0df5c07 e77acbf 0df5c07 b284540 0df5c07 b284540 d6c93c4 b284540 927956e b284540 0df5c07 b284540 d6c93c4 b284540 d6c93c4 b284540 0df5c07 b284540 0df5c07 b284540 927956e d6c93c4 927956e b284540 0df5c07 b284540 d6c93c4 927956e b284540 e77acbf b284540 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 |
# process_hf_dataset.py
import builtins
import keyword
import os
import re

import chromadb
from datasets import Dataset, load_dataset
from dotenv import load_dotenv

from database import init_chromadb, store_program, DB_NAME, HF_DATASET_NAME
from parser import parse_python_code, create_vector
# Load environment variables (python-dotenv) at import time, before any code
# below reads them through os.getenv (e.g. the "HF_KEY" token lookup).
load_dotenv()
# Identifier-shaped words; the same pattern is used to collect names and to
# rewrite them, so both passes agree on what counts as a name.
_IDENTIFIER_RE = re.compile(r'\b[a-zA-Z_]\w*\b')

# Names that must never be rewritten: renaming a keyword makes the code
# unparseable, and renaming a builtin (print, len, range, ...) changes what
# the code calls.
_RESERVED_NAMES = frozenset(keyword.kwlist) | frozenset(dir(builtins))

# Role prefixes used when the caller does not override them.
_DEFAULT_VARIABLE_PREFIXES = {
    'input': 'input_variable',
    'assigned': 'assigned_variable',
    'returned': 'returned_variable',
}


def rename_variables(code, variable_prefixes=None):
    """Rename identifiers in Python source to role-based names.

    Every identifier-shaped word that is not a keyword or builtin is mapped to
    ``<prefix><n>`` where the prefix reflects a heuristic role:

    * ``input``    - the name appears as a whole word on a line containing ``def``
      (this includes the function name itself, defaults and annotations);
    * ``returned`` - otherwise, the name appears as a whole word on a line
      containing ``return``;
    * ``assigned`` - everything else.

    Names are numbered per role in alphabetical order, so the result is
    deterministic for a given input.

    This is a text-level heuristic, not an AST transform: words inside string
    literals and comments, attribute names and imported module names are
    rewritten as well.

    Args:
        code: Python source text.
        variable_prefixes: Optional mapping with any of the keys ``'input'``,
            ``'assigned'``, ``'returned'`` overriding the default prefixes
            (``input_variable``, ``assigned_variable``, ``returned_variable``).

    Returns:
        A ``(new_code, var_map)`` tuple where ``var_map`` maps each original
        name to its replacement.
    """
    prefixes = dict(_DEFAULT_VARIABLE_PREFIXES)
    if variable_prefixes:
        prefixes.update(variable_prefixes)

    # One pass over the lines: collect every name plus the names that share a
    # line with the ``def`` / ``return`` keywords. Whole-word matching avoids
    # false hits such as ``n`` being "found" inside the word ``return``.
    names = set()
    def_line_names = set()
    return_line_names = set()
    for line in code.split('\n'):
        words = _IDENTIFIER_RE.findall(line)
        names.update(words)
        if 'def' in words:
            def_line_names.update(words)
        if 'return' in words:
            return_line_names.update(words)
    names -= _RESERVED_NAMES

    var_map = {}
    counters = {prefix: 1 for prefix in prefixes.values()}
    for name in sorted(names):
        if name in def_line_names:
            role = prefixes['input']
        elif name in return_line_names:
            role = prefixes['returned']
        else:
            # Anything not tied to a def/return line is treated as assigned.
            role = prefixes['assigned']
        var_map[name] = f"{role}{counters[role]}"
        counters[role] += 1

    # Single-pass substitution: each original word is looked up exactly once,
    # so a freshly generated name can never be renamed again by a later rule.
    new_code = _IDENTIFIER_RE.sub(
        lambda match: var_map.get(match.group(0), match.group(0)),
        code,
    )
    return new_code, var_map
def generate_description_tokens(sequence, vectors, var_map=None):
    """Build the list of descriptive tokens for one parsed program.

    Args:
        sequence: Category names, one per parsed part.
        vectors: Per-part vectors paired positionally with ``sequence``;
            index 1 is emitted as ``level`` and index 3 as ``span``.
        var_map: Optional mapping of original variable name -> renamed
            variable; each entry adds a ``variable:old=new:role`` token.

    Returns:
        A list of token strings.
    """
    labels = {
        'import': 'imports module',
        'function': 'defines function',
        'assigned_variable': 'assigns variable',
        'input_variable': 'input parameter',
        'returned_variable': 'returns value',
        'if': 'conditional statement',
        'return': 'returns result',
        'try': 'try block',
        'except': 'exception handler',
        'expression': 'expression statement',
        'spacer': 'empty line or comment'
    }

    tokens = []
    for category, vector in zip(sequence, vectors):
        label = labels.get(category)
        if label is None:
            # Categories without a description contribute nothing.
            # NOTE(review): the source had lost its indentation; the level/span
            # tokens are assumed to belong under the category check - confirm.
            continue
        tokens.extend((
            f"{label}:{category}",
            f"level:{vector[1]}",
            f"span:{vector[3]:.2f}",
        ))

    # Variable-role tokens; the role is the renamed variable up to and
    # including the word 'variable' (e.g. 'input_variable1' -> 'input_variable').
    for original_name, renamed in (var_map or {}).items():
        role_name = renamed.split('variable')[0] + 'variable'
        tokens.append(f"variable:{original_name}={renamed}:{role_name}")

    return tokens
def generate_semantic_vector(description, total_lines=100):
    """Map a free-text description onto a 6-element vector.

    The description is lower-cased and split on whitespace. Each word is
    checked against a fixed keyword table by substring match (so ``'verify'``
    hits ``'if'``); within a word the first keyword in table order wins, and
    across words the last word with a hit determines the result.

    Args:
        description: Text to classify.
        total_lines: Accepted for interface compatibility; not used here.

    Returns:
        ``[category_id, 1, 0.5, 0.1, 1, category_id / 8]`` for the selected
        keyword, or six zeros when no word contains any keyword. The last
        element is the id divided by the number of table entries, so it can
        exceed 1 for the larger ids.
    """
    keyword_ids = {
        'import': 1, 'function': 2, 'assign': 17, 'input': 18, 'return': 19, 'if': 5, 'try': 8, 'except': 14
    }

    selected_id = None
    for word in description.lower().split():
        # First keyword (in table order) contained in this word, if any.
        hit = next((cid for key, cid in keyword_ids.items() if key in word), None)
        if hit is not None:
            selected_id = hit  # later words override earlier ones

    if selected_id is None:
        return [0] * 6

    return [
        selected_id,                     # category_id
        1,                               # level (top-level assumed)
        0.5,                             # center_pos
        0.1,                             # span
        1,                               # parent_depth
        selected_id / len(keyword_ids),  # parent_weight
    ]
def process_hf_dataset(dataset_name="iamtarun/python_code_instructions_18k_alpaca", split="train"):
    """Vectorise a Hugging Face instruction/code dataset into ChromaDB and publish it.

    For every entry the ``output`` code has its variables renamed, is parsed
    into parts and a category sequence, and is stored in ChromaDB together
    with a 6D vector derived from the ``instruction`` text. The stored record
    is then updated with a ``description_tokens`` metadata string. After all
    entries are processed, the collection is pushed to the Hugging Face Hub
    via ``save_chromadb_to_hf``.

    Args:
        dataset_name: Hub id of the dataset to load. Entries are read through
            their ``'instruction'`` and ``'output'`` keys.
        split: Dataset split to load.
    """
    dataset = load_dataset(dataset_name, split=split)
    client = init_chromadb()

    for entry in dataset:
        instruction = entry['instruction']
        output = entry['output']

        # Rename variables so names line up with the vector categories.
        processed_code, var_map = rename_variables(output)

        # Parse into parts + category sequence; each part carries its vector.
        parts, sequence = parse_python_code(processed_code)
        vectors = [part['vector'] for part in parts]

        # Space-separated token string: task token first, then structural and
        # variable-role tokens.
        description_tokens = f"task:{instruction.replace(' ', '_')}"
        description_tokens_list = generate_description_tokens(sequence, vectors, var_map)
        description_tokens += " " + " ".join(description_tokens_list)

        # The instruction-derived vector is used as the program's embedding;
        # the per-part vectors only feed the description tokens.
        semantic_vector = generate_semantic_vector(instruction)
        combined_vector = semantic_vector

        store_program(client, processed_code, sequence, [combined_vector], DB_NAME)

        # Attach the description tokens to the record just stored.
        # NOTE(review): assumes store_program keys records by
        # str(hash(code)); str hashes are salted per process, so this only
        # matches within the same run - confirm against database.py.
        collection = client.get_collection(DB_NAME)
        program_id = str(hash(processed_code))
        collection.update(
            ids=[program_id],
            metadatas=[{"sequence": ",".join(sequence), "description_tokens": description_tokens}],
            embeddings=[combined_vector]
        )

    # Publish once, after every entry has been stored.
    save_chromadb_to_hf()
def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=None):
    """Export the ChromaDB collection and push it to the Hugging Face Hub.

    Args:
        dataset_name: Target dataset repository on the Hub.
        token: Hub access token. When ``None``, the ``HF_KEY`` environment
            variable is read at call time (not at import time), so values
            loaded or changed after this module was imported are honoured.
    """
    if token is None:
        token = os.getenv("HF_KEY")

    client = init_chromadb()
    collection = client.get_collection(DB_NAME)

    # Fetch every stored record with its code, metadata and embedding.
    results = collection.get(include=["documents", "metadatas", "embeddings"])
    metadatas = results["metadatas"]
    data = {
        "code": results["documents"],
        "sequence": [meta["sequence"] for meta in metadatas],
        "vectors": results["embeddings"],
        # Records stored without a later metadata update have no tokens.
        "description_tokens": [meta.get('description_tokens', '') for meta in metadatas]
    }

    dataset = Dataset.from_dict(data)
    dataset.push_to_hub(dataset_name, token=token)
    print(f"Dataset pushed to Hugging Face Hub as {dataset_name}")
# Script entry point: run the full pipeline only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    process_hf_dataset()