# process_hf_dataset.py
from datasets import load_dataset, Dataset
import re
import ast  # For safely parsing stored program vectors back into Python lists
from parser import parse_python_code, create_vector
from database import init_chromadb, store_program, DB_NAME, HF_DATASET_NAME
import chromadb
import os
from dotenv import load_dotenv
from transformers import AutoTokenizer, AutoModel
import torch
from tqdm import tqdm  # For progress bar
import time
import logging

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Load environment variables
load_dotenv()

# Cache the CodeBERT model globally to avoid repeated loading and reduce freezing
model_name = "microsoft/codebert-base"
tokenizer = None
model = None
device = None

def load_codebert_model(use_gpu=False):
    """Load and cache the CodeBERT model, handling GPU/CPU options."""
    global tokenizer, model, device
    if tokenizer is None or model is None:
        try:
            device = torch.device("cuda" if use_gpu and torch.cuda.is_available() else "cpu")
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            model = AutoModel.from_pretrained(model_name).to(device)
            logger.info(f"CodeBERT model loaded on {device}")
        except Exception as e:
            logger.error(f"Error loading CodeBERT model: {e}")
            raise
    return tokenizer, model, device
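
# Illustrative usage (the model loads lazily on the first call and is reused afterwards):
#   tokenizer, model, device = load_codebert_model(use_gpu=True)  # falls back to CPU if CUDA is unavailable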

def rename_variables(code, variable_prefixes=None):
    """Rename variables in Python code to align with vector categories (input_variable, assigned_variable, returned_variable)."""
    if variable_prefixes is None:
        variable_prefixes = {
            'input': 'input_variable',
            'assigned': 'assigned_variable',
            'returned': 'returned_variable'
        }
    
    # Simple variable name detection and renaming
    pattern = r'\b[a-zA-Z_]\w*\b'  # Match variable names (simple heuristic)
    variables = set()
    code_lines = code.split('\n')
    
    # Find all variable names (simplified approach, could improve with AST)
    for line in code_lines:
        matches = re.findall(pattern, line)
        for match in matches:
            if match not in ['def', 'if', 'else', 'for', 'while', 'return', 'import', 'print', 'eval', 'str', 'int']:  # Exclude keywords and common built-ins
                variables.add(match)
    
    # Sort variables alphabetically (tracking order of first appearance would require AST analysis)
    sorted_vars = sorted(list(variables))
    var_map = {}
    var_count = {'input_variable': 1, 'assigned_variable': 1, 'returned_variable': 1}
    
    # Assign variables based on context (simplified heuristic)
    for var in sorted_vars:
        # Determine variable role based on context
        is_input = any(var in line and 'def' in line for line in code_lines)  # Check if in function definition (input parameter)
        is_returned = any('return' in line and var in line for line in code_lines)  # Check if used in return statement
        is_assigned = any('=' in line and var in line.split('=')[0].strip() for line in code_lines)  # Check if assigned
        
        if is_input:
            role = 'input_variable'
        elif is_returned:
            role = 'returned_variable'
        elif is_assigned:
            role = 'assigned_variable'
        else:
            role = 'assigned_variable'  # Default to assigned if unclear
        
        new_name = f"{role}{var_count[role]}"
        var_map[var] = new_name
        var_count[role] += 1
    
    # Replace variables in code
    new_code = code
    for old_var, new_var in var_map.items():
        new_code = re.sub(r'\b' + old_var + r'\b', new_var, new_code)
    
    return new_code, var_map
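
# Note on the heuristic above: role priority is input > returned > assigned, and the
# substring checks mean any identifier outside the short exclusion list (including
# function names) gets renamed. An AST-based pass would be more precise.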

def generate_description_tokens(sequence, vectors, var_map=None):
    """Generate semantic description tokens for a program, including variable roles."""
    tokens = []
    category_descriptions = {
        'import': 'imports module',
        'function': 'defines function',
        'assigned_variable': 'assigns variable',
        'input_variable': 'input parameter',
        'returned_variable': 'returns value',
        'if': 'conditional statement',
        'return': 'returns result',
        'try': 'try block',
        'except': 'exception handler',
        'expression': 'expression statement',
        'spacer': 'empty line or comment'
    }
    
    for cat, vec in zip(sequence, vectors):
        if cat in category_descriptions:
            tokens.append(f"{category_descriptions[cat]}:{cat}")
            # Add vector-derived features (e.g., level, span) as tokens
            tokens.append(f"level:{vec[1]}")
            tokens.append(f"span:{vec[3]:.2f}")
    
    # Add variable role tokens if var_map exists
    if var_map:
        for old_var, new_var in var_map.items():
            role = new_var.split('variable')[0] + 'variable'  # Extract role (e.g., 'input_variable')
            tokens.append(f"variable:{old_var}={new_var}:{role}")
    
    return tokens
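
# The resulting token stream is schematic, e.g. "imports module:import", "level:<n>",
# "span:<s>" per program part, plus one "variable:<old>=<new>:<role>" token for each
# renamed variable when a var_map is supplied.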

def generate_semantic_vector(description, total_lines=100, use_gpu=False):
    """Generate a 6D semantic vector for a textual description using CodeBERT, projecting to 6D."""
    global tokenizer, model, device
    if tokenizer is None or model is None:
        tokenizer, model, device = load_codebert_model(use_gpu)
    
    # Tokenize and encode the description
    inputs = tokenizer(description, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    # Generate embeddings
    with torch.no_grad():
        outputs = model(**inputs)
        # Use mean pooling of the last hidden states
        vector = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy().tolist()
    
    # Truncate or project to 6D (simplified projection: take first 6 dimensions)
    if len(vector) < 6:
        vector.extend([0] * (6 - len(vector)))
    elif len(vector) > 6:
        vector = vector[:6]  # Truncate to 6D
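    # Note: CodeBERT-base emits 768-dimensional embeddings, so in practice this branch
    # always truncates; the padding branch above is only a safety net.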
    
    # Ensure vector isn’t all zeros or defaults
    if all(v == 0 for v in vector):
        logger.warning(f"Default vector detected for description: {description}")
        # Fallback: Use heuristic if CodeBERT fails to generate meaningful embeddings
        category_map = {
            'import': 1, 'function': 2, 'assign': 17, 'input': 18, 'return': 19, 'if': 5, 'try': 8, 'except': 14
        }
        tokens = description.lower().split()
        vector = [0] * 6
        for token in tokens:
            for cat, cat_id in category_map.items():
                if cat in token:
                    vector[0] = cat_id  # category_id
                    vector[1] = 1  # level
                    vector[2] = 0.5  # center_pos
                    vector[3] = 0.1  # span
                    vector[4] = 1  # parent_depth
                    vector[5] = cat_id / len(category_map)  # parent_weight
                    break
    
    logger.debug(f"Generated semantic vector for '{description}': {vector}")
    return vector

def process_hf_dataset(batch_size=100, use_gpu=False):
    """Process the Hugging Face dataset in batches and store programs in ChromaDB, aligning with vector categories."""
    # Load the dataset
    try:
        dataset = load_dataset("iamtarun/python_code_instructions_18k_alpaca", split="train")
        dataset_list = list(dataset)
        logger.info(f"Loaded dataset with {len(dataset_list)} entries")
    except Exception as e:
        logger.error(f"Error loading dataset: {e}")
        raise
    
    # Initialize ChromaDB client
    client = init_chromadb()
    
    # Do not clear or populate with defaults here—let UI buttons handle this
    collection = client.get_or_create_collection(DB_NAME)
    
    # Process in batches with progress bar
    total_entries = len(dataset_list)
    for i in tqdm(range(0, total_entries, batch_size), desc="Processing Hugging Face Dataset"):
        batch = dataset_list[i:i + batch_size]
        batch_ids = []
        batch_documents = []
        batch_metadatas = []
        batch_embeddings = []
        
        for entry in batch:
            try:
                instruction = entry['instruction']
                output = entry['output']
                
                # Rename variables to align with vector categories
                processed_code, var_map = rename_variables(output)
                
                # Parse the code to get parts and sequence, generating our 6D vectors
                parts, sequence = parse_python_code(processed_code)
                program_vectors = [part['vector'] for part in parts]  # Use parser's 6D vectors for program structure
                
                # Generate description tokens including variable roles
                description_tokens = f"task:{instruction.replace(' ', '_')}"
                description_tokens_list = generate_description_tokens(sequence, program_vectors, var_map)
                description_tokens += " " + " ".join(description_tokens_list)
                
                # Generate a 6D semantic vector for the instruction
                semantic_vector = generate_semantic_vector(instruction, use_gpu=use_gpu)
                
                # Store program data
                program_id = str(hash(processed_code))
                batch_ids.append(program_id)
                batch_documents.append(processed_code)
                batch_metadatas.append({"sequence": ",".join(sequence), "description_tokens": description_tokens, "program_vectors": str(program_vectors)})
                batch_embeddings.append(semantic_vector)
                
                logger.debug(f"Processed entry: {program_id}, Vector: {semantic_vector}")
            except Exception as e:
                logger.error(f"Error processing entry {i}: {e}")
                continue  # Skip failed entries but continue processing
        
        # Batch add to ChromaDB
        try:
            collection.add(
                documents=batch_documents,
                metadatas=batch_metadatas,
                ids=batch_ids,
                embeddings=batch_embeddings
            )
            logger.info(f"Added batch {i//batch_size + 1} to ChromaDB with {len(batch_ids)} entries")
        except Exception as e:
            logger.error(f"Error adding batch to ChromaDB: {e}")
            raise
    
    # Save to Hugging Face Dataset
    save_chromadb_to_hf()

def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=os.getenv("HF_KEY")):
    """Save ChromaDB data to Hugging Face Dataset."""
    client = init_chromadb()
    collection = client.get_collection(DB_NAME)
    
    # Fetch all data from ChromaDB
    results = collection.get(include=["documents", "metadatas", "embeddings"])
    data = {
        "code": results["documents"],
        "sequence": [meta["sequence"] for meta in results["metadatas"]],
        "vectors": results["embeddings"],  # Semantic 6D vectors
        "description_tokens": [meta.get('description_tokens', '') for meta in results["metadatas"]],
        "program_vectors": [eval(meta.get('program_vectors', '[]')) for meta in results["metadatas"]]  # Store structural vectors
    }
    
    # Create a Hugging Face Dataset
    dataset = Dataset.from_dict(data)
    
    # Push to Hugging Face Hub
    try:
        dataset.push_to_hub(dataset_name, token=token)
        logger.info(f"Dataset pushed to Hugging Face Hub as {dataset_name}")
    except Exception as e:
        logger.error(f"Error pushing dataset to Hugging Face Hub: {e}")
        raise
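
# Illustrative helper (an addition for clarity, not part of the original pipeline):
# a minimal sketch of how the stored programs could be retrieved by semantic
# similarity once process_hf_dataset() has populated the collection.
def example_semantic_search(query, n_results=3, use_gpu=False):
    """Embed a free-text query with generate_semantic_vector() and return the
    nearest stored programs from ChromaDB."""
    client = init_chromadb()
    collection = client.get_collection(DB_NAME)
    query_vector = generate_semantic_vector(query, use_gpu=use_gpu)
    return collection.query(
        query_embeddings=[query_vector],
        n_results=n_results,
        include=["documents", "metadatas"]
    )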

if __name__ == "__main__":
    process_hf_dataset(batch_size=100, use_gpu=False)