Spaces:
Running
Running
File size: 8,476 Bytes
b284540 d6c93c4 e77acbf b284540 e77acbf 506d255 e77acbf b284540 0df5c07 b284540 0df5c07 b284540 0df5c07 e77acbf 0df5c07 e77acbf 0df5c07 e77acbf 0df5c07 e77acbf 0df5c07 e77acbf 0df5c07 b284540 0df5c07 b284540 4058ab2 506d255 4058ab2 506d255 b284540 927956e b284540 0df5c07 b284540 d6c93c4 b284540 506d255 b284540 0df5c07 b284540 506d255 0df5c07 b284540 506d255 d6c93c4 506d255 927956e b284540 506d255 b284540 506d255 b284540 e77acbf b284540 506d255 b284540 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 |
# process_hf_dataset.py
import ast
import builtins
import functools
import keyword
import os
import re

import chromadb
import torch
from datasets import Dataset, load_dataset
from dotenv import load_dotenv
from transformers import AutoTokenizer, AutoModel

from database import init_chromadb, store_program, DB_NAME, HF_DATASET_NAME
from parser import parse_python_code, create_vector
# Load variables from a .env file (if one is present) into the process
# environment. This runs at import time, before the function definitions
# below, so that os.getenv("HF_KEY") can see values defined there.
load_dotenv()
def rename_variables(code, variable_prefixes=None):
    """Rename identifiers in Python source to role-based placeholder names.

    Every identifier that is not a Python keyword or builtin is replaced by
    ``<prefix><n>``, where the prefix reflects a heuristically detected role
    and ``n`` counts up from 1 per prefix, in alphabetical order of the
    original names.

    Roles are detected line by line, in this priority order:

    * input    -- the name occurs on a line that contains the ``def`` keyword
                  (note: this includes the function's own name);
    * returned -- the name occurs on a line that contains ``return``;
    * assigned -- everything else.

    This is a regex heuristic, not a parser: words inside string literals and
    comments, attribute names and imported module names are renamed as well.

    Args:
        code: Python source text.
        variable_prefixes: Optional mapping with any of the keys ``'input'``,
            ``'assigned'`` and ``'returned'`` overriding the default prefixes
            ``input_variable``, ``assigned_variable`` and
            ``returned_variable``.

    Returns:
        A ``(new_code, var_map)`` tuple: the rewritten source and a dict
        mapping each original name to its replacement.
    """
    prefixes = {
        'input': 'input_variable',
        'assigned': 'assigned_variable',
        'returned': 'returned_variable',
    }
    if variable_prefixes:
        prefixes.update(variable_prefixes)

    identifier = re.compile(r'\b[a-zA-Z_]\w*\b')
    # Renaming keywords or builtins (``in``, ``range``, ``None`` ...) would
    # corrupt the code, so none of them is ever treated as a variable.
    reserved = set(keyword.kwlist) | set(dir(builtins))

    # One pass over the lines collects whole-word occurrences; comparing whole
    # words avoids substring false positives such as ``a`` matching ``data``.
    names = set()
    def_names = set()
    return_names = set()
    for line in code.split('\n'):
        words = identifier.findall(line)
        names.update(words)
        if 'def' in words:
            def_names.update(words)
        if 'return' in words:
            return_names.update(words)
    names -= reserved

    counters = dict.fromkeys(prefixes.values(), 1)
    var_map = {}
    # Alphabetical order keeps the numbering deterministic across runs.
    for name in sorted(names):
        if name in def_names:
            prefix = prefixes['input']
        elif name in return_names:
            prefix = prefixes['returned']
        else:
            prefix = prefixes['assigned']
        var_map[name] = f"{prefix}{counters[prefix]}"
        counters[prefix] += 1

    # A single substitution pass guarantees that a freshly inserted
    # placeholder can never be picked up and renamed a second time.
    new_code = identifier.sub(
        lambda match: var_map.get(match.group(0), match.group(0)), code
    )
    return new_code, var_map
def generate_description_tokens(sequence, vectors, var_map=None):
    """Build textual description tokens for a parsed program.

    Args:
        sequence: Iterable of category names, one per program part.
        vectors: Iterable of per-part vectors, paired positionally with
            ``sequence``. Index 1 is emitted as the ``level`` token and
            index 3 as the ``span`` token, so each vector needs at least
            four entries and a numeric value at index 3.
        var_map: Optional mapping of original variable name to placeholder
            name (a role prefix followed by a counter, e.g.
            ``input_variable1``).

    Returns:
        A list of token strings: for every part whose category is known,
        ``"<description>:<category>"``, ``"level:<v[1]>"`` and
        ``"span:<v[3]>"`` (two decimals); then one
        ``"variable:<old>=<new>:<role>"`` token per ``var_map`` entry.
    """
    category_descriptions = {
        'import': 'imports module',
        'function': 'defines function',
        'assigned_variable': 'assigns variable',
        'input_variable': 'input parameter',
        'returned_variable': 'returns value',
        'if': 'conditional statement',
        'return': 'returns result',
        'try': 'try block',
        'except': 'exception handler',
        'expression': 'expression statement',
        'spacer': 'empty line or comment',
    }

    tokens = []
    for cat, vec in zip(sequence, vectors):
        # Parts with an unrecognised category contribute no tokens at all.
        if cat not in category_descriptions:
            continue
        tokens.append(f"{category_descriptions[cat]}:{cat}")
        tokens.append(f"level:{vec[1]}")
        tokens.append(f"span:{vec[3]:.2f}")

    for old_var, new_var in (var_map or {}).items():
        # The role is the placeholder without its trailing counter. Stripping
        # digits (instead of splitting on the literal 'variable') also works
        # for placeholders built from custom prefixes.
        role = new_var.rstrip('0123456789')
        tokens.append(f"variable:{old_var}={new_var}:{role}")
    return tokens
@functools.lru_cache(maxsize=None)
def _load_codebert(model_name, device_type):
    """Load the tokenizer and model once per (model, device) pair and cache them."""
    device = torch.device(device_type)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(device)
    return tokenizer, model, device


def generate_semantic_vector(description, total_lines=100, use_gpu=False):
    """Embed a textual description with CodeBERT and reduce it to 6 dimensions.

    The description is tokenized (truncated to 512 tokens), passed through
    ``microsoft/codebert-base``, and the last hidden states are mean-pooled
    over the token axis. The result is cut down to its first six components
    (or zero-padded if it were shorter).

    Args:
        description: Text to embed.
        total_lines: Unused; kept so existing callers keep working.
        use_gpu: Run on CUDA when requested and available, otherwise on CPU.

    Returns:
        A list of six numbers.
    """
    device_type = "cuda" if use_gpu and torch.cuda.is_available() else "cpu"
    # Loading the model is by far the most expensive step and this function
    # is called once per dataset row, so the loaded objects are cached.
    tokenizer, model, device = _load_codebert("microsoft/codebert-base", device_type)

    inputs = tokenizer(
        description,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )
    inputs = {key: tensor.to(device) for key, tensor in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    # Mean-pool over the token axis, then drop the batch axis.
    pooled = outputs.last_hidden_state.mean(dim=1).squeeze()
    vector = pooled.cpu().numpy().tolist()

    # Simplified projection: keep the first six components, zero-pad if short.
    if len(vector) < 6:
        vector.extend([0] * (6 - len(vector)))
    return vector[:6]
def process_hf_dataset(dataset_name="iamtarun/python_code_instructions_18k_alpaca", split="train"):
    """Process a Hugging Face instruction/code dataset into ChromaDB.

    For every entry the code in ``output`` has its variables renamed, is
    parsed into a category sequence with per-part vectors, and is stored in
    ChromaDB. The stored embedding is the 6D semantic vector of the
    ``instruction``; the structural vectors and the description tokens are
    kept in the metadata. When all entries are processed, the collection is
    pushed to the Hugging Face Hub via ``save_chromadb_to_hf``.

    Args:
        dataset_name: Hub id of the dataset to load. Entries must provide the
            ``instruction`` and ``output`` fields.
        split: Dataset split to process.
    """
    dataset = load_dataset(dataset_name, split=split)
    client = init_chromadb()

    for entry in dataset:
        instruction = entry['instruction']

        # Normalise variable names so they line up with the vector categories.
        processed_code, var_map = rename_variables(entry['output'])

        # Structural view of the program: category sequence + per-part vectors.
        parts, sequence = parse_python_code(processed_code)
        program_vectors = [part['vector'] for part in parts]

        # Description string: task token first, then structure/variable tokens.
        structure_tokens = generate_description_tokens(sequence, program_vectors, var_map)
        description_tokens = f"task:{instruction.replace(' ', '_')}"
        description_tokens += " " + " ".join(structure_tokens)

        # The instruction's semantic vector is the embedding used for search.
        semantic_vector = generate_semantic_vector(instruction)
        store_program(client, processed_code, sequence, [semantic_vector], DB_NAME)

        # Enrich the stored record with the description and structural vectors.
        collection = client.get_collection(DB_NAME)
        # NOTE(review): this id must equal the one store_program() assigned --
        # presumably also str(hash(code)); confirm in database.py. hash() of a
        # str is salted per process unless PYTHONHASHSEED is fixed, so these
        # ids are not stable across runs.
        program_id = str(hash(processed_code))
        collection.update(
            ids=[program_id],
            metadatas=[{
                "sequence": ",".join(sequence),
                "description_tokens": description_tokens,
                "program_vectors": str(program_vectors),
            }],
            embeddings=[semantic_vector],
        )

    # Publish the populated collection to the Hugging Face Hub.
    save_chromadb_to_hf()
def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=os.getenv("HF_KEY")):
    """Export the ChromaDB collection and push it to the Hugging Face Hub.

    Args:
        dataset_name: Target dataset repository on the Hub.
        token: Hub access token. The default is read from the ``HF_KEY``
            environment variable; when it is ``None`` the variable is read
            again at call time.
    """
    # The default above is evaluated once, when the function is defined;
    # re-reading here picks up a key that was set after import.
    if token is None:
        token = os.getenv("HF_KEY")

    client = init_chromadb()
    collection = client.get_collection(DB_NAME)

    # Fetch all records from ChromaDB.
    results = collection.get(include=["documents", "metadatas", "embeddings"])
    metadatas = results["metadatas"]

    data = {
        "code": results["documents"],
        "sequence": [meta["sequence"] for meta in metadatas],
        "vectors": results["embeddings"],  # Semantic 6D vectors
        "description_tokens": [meta.get('description_tokens', '') for meta in metadatas],
        # SECURITY: this used to call eval() on strings read back from the
        # database. The value is written with str(list), so a literal parser
        # is sufficient and cannot execute code.
        "program_vectors": [
            ast.literal_eval(meta.get('program_vectors', '[]')) for meta in metadatas
        ],  # Structural vectors
    }

    # Create a Hugging Face Dataset and push it to the Hub.
    dataset = Dataset.from_dict(data)
    dataset.push_to_hub(dataset_name, token=token)
    print(f"Dataset pushed to Hugging Face Hub as {dataset_name}")
# Script entry point: run the whole pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    process_hf_dataset()