# process_hf_dataset.py
from datasets import load_dataset, Dataset
import re
from parser import parse_python_code  # local module alongside this script
from database import init_chromadb, store_program, DB_NAME, HF_DATASET_NAME
import chromadb
import os
from dotenv import load_dotenv

# Load environment variables
load_dotenv()
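# A .env file providing HF_KEY (a Hugging Face write token) is assumed for the
# push-to-hub step in save_chromadb_to_hf() below.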

def rename_variables(code, variable_prefixes=None):
    """Rename variables in Python code to align with vector categories (input_variable, assigned_variable, returned_variable)."""
    if variable_prefixes is None:
        variable_prefixes = {
            'input': 'input_variable',
            'assigned': 'assigned_variable',
            'returned': 'returned_variable'
        }
    
    # Simple variable name detection and renaming
    pattern = r'\b[a-zA-Z_]\w*\b'  # Match variable names (simple heuristic)
    variables = set()
    code_lines = code.split('\n')
    
    # Find all variable names (simplified approach, could improve with AST)
    for line in code_lines:
        matches = re.findall(pattern, line)
        for match in matches:
            if match not in ['def', 'if', 'else', 'for', 'while', 'return', 'import', 'print', 'eval', 'str', 'int']:  # Exclude keywords and common builtins
                variables.add(match)
    
    # Sort alphabetically for deterministic naming (ordering by first
    # appearance would need an AST pass)
    sorted_vars = sorted(variables)
    var_map = {}
    var_count = {name: 1 for name in variable_prefixes.values()}
    
    # Assign variables based on context (simplified heuristic)
    for var in sorted_vars:
        var_re = re.compile(r'\b' + re.escape(var) + r'\b')  # Word-boundary match avoids substring false positives (e.g. 'c' inside 'calc')
        # Determine variable role based on context
        is_input = any('def' in line and var_re.search(line) for line in code_lines)  # Appears on a def line (parameter; note this also matches the function name)
        is_returned = any('return' in line and var_re.search(line) for line in code_lines)  # Used in a return statement
        is_assigned = any('=' in line and var_re.search(line.split('=')[0]) for line in code_lines)  # Appears on the left-hand side of an assignment
        
        if is_input:
            role = variable_prefixes['input']
        elif is_returned:
            role = variable_prefixes['returned']
        elif is_assigned:
            role = variable_prefixes['assigned']
        else:
            role = variable_prefixes['assigned']  # Default to assigned if unclear
        
        new_name = f"{role}{var_count[role]}"
        var_map[var] = new_name
        var_count[role] += 1
    
    # Replace variables in code (note: sequential substitution can collide if
    # the source already uses names like 'input_variable1')
    new_code = code
    for old_var, new_var in var_map.items():
        new_code = re.sub(r'\b' + re.escape(old_var) + r'\b', new_var, new_code)
    
    return new_code, var_map
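
# Rough illustration of the heuristic above (not a test). For:
#     def add(a, b):
#         c = a + b
#         return c
# every name on the def line (including the function name 'add') trips the
# input check, so the mapping comes out roughly as:
#     {'a': 'input_variable1', 'add': 'input_variable2',
#      'b': 'input_variable3', 'c': 'returned_variable1'}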

def generate_description_tokens(sequence, vectors, var_map=None):
    """Generate semantic description tokens for a program, including variable roles."""
    tokens = []
    category_descriptions = {
        'import': 'imports module',
        'function': 'defines function',
        'assigned_variable': 'assigns variable',
        'input_variable': 'input parameter',
        'returned_variable': 'returns value',
        'if': 'conditional statement',
        'return': 'returns result',
        'try': 'try block',
        'except': 'exception handler',
        'expression': 'expression statement',
        'spacer': 'empty line or comment'
    }
    
    for cat, vec in zip(sequence, vectors):
        if cat in category_descriptions:
            tokens.append(f"{category_descriptions[cat]}:{cat}")
            # Add vector-derived features (e.g., level, span) as tokens
            tokens.append(f"level:{vec[1]}")
            tokens.append(f"span:{vec[3]:.2f}")
    
    # Add variable role tokens if var_map exists
    if var_map:
        for old_var, new_var in var_map.items():
            role = new_var.rstrip('0123456789')  # Strip the trailing counter to recover the role (e.g., 'input_variable')
            tokens.append(f"variable:{old_var}={new_var}:{role}")
    
    return tokens
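
# Example shape of the output (illustrative values): for sequence ['function'],
# vectors [[2, 1, 0.5, 0.1, 1, 0.25]] and var_map {'a': 'input_variable1'},
# this returns:
#     ['defines function:function', 'level:1', 'span:0.10',
#      'variable:a=input_variable1:input_variable']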

def generate_semantic_vector(description):
    """Generate a 6D semantic vector for a textual description, matching our vector format."""
    # Use a simplified heuristic to map description to our 6D vector format
    category_map = {
        'import': 1, 'function': 2, 'assign': 17, 'input': 18, 'return': 19, 'if': 5, 'try': 8, 'except': 14
    }
    
    # Parse description for key terms
    tokens = description.lower().split()
    vector = [0] * 6  # Initialize 6D vector
    
    # Map description tokens to categories (substring match, so e.g. 'notify' matches 'if'; the last matching token wins)
    for token in tokens:
        for cat, cat_id in category_map.items():
            if cat in token:
                vector[0] = cat_id  # category_id
                vector[1] = 1  # level (assume top-level for simplicity)
                vector[2] = 0.5  # center_pos (midpoint of code)
                vector[3] = 0.1  # span (small for simplicity)
                vector[4] = 1  # parent_depth (shallow)
                vector[5] = cat_id / len(category_map)  # parent_weight (normalized)
                break
    
    return vector
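
# Illustration: generate_semantic_vector("Create a function to add numbers")
# matches 'function' (category_id 2) and returns
# [2, 1, 0.5, 0.1, 1, 0.25], where 0.25 = 2 / len(category_map).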

def process_hf_dataset():
    """Process the Hugging Face dataset and store programs in ChromaDB, aligning with vector categories and including instruction in vectors."""
    # Load the dataset
    dataset = load_dataset("iamtarun/python_code_instructions_18k_alpaca", split="train")
    
    # Initialize ChromaDB client
    client = init_chromadb()
    
    # Process each entry
    for entry in dataset:
        instruction = entry['instruction']
        output = entry['output']
        
        # Rename variables to align with vector categories
        processed_code, var_map = rename_variables(output)
        
        # Parse the code to get parts and sequence, generating our 6D vectors
        parts, sequence = parse_python_code(processed_code)
        vectors = [part['vector'] for part in parts]  # Use parser's 6D vectors
        
        # Generate description tokens including variable roles
        description_tokens = f"task:{instruction.replace(' ', '_')}"
        description_tokens_list = generate_description_tokens(sequence, vectors, var_map)
        description_tokens += " " + " ".join(description_tokens_list)
        
        # Generate a 6D semantic vector for the instruction, incorporating it into the program vector
        semantic_vector = generate_semantic_vector(instruction)
        
        # Use the instruction's semantic vector as the stored embedding; the
        # per-part program vectors could be averaged or concatenated in, but we
        # keep a single 6D vector for semantic search
        combined_vector = semantic_vector
        
        # Store in ChromaDB with description and combined vector
        store_program(client, processed_code, sequence, [combined_vector], DB_NAME)
        
        # Update metadata with instruction and variable roles as description
        collection = client.get_collection(DB_NAME)
        program_id = str(hash(processed_code))  # Assumes store_program derives ids the same way; note hash() is salted per process, so ids are only stable within one run
        collection.update(
            ids=[program_id],
            metadatas=[{"sequence": ",".join(sequence), "description_tokens": description_tokens}],
            embeddings=[combined_vector]  # Ensure 6D embedding
        )
    
    # Save to Hugging Face Dataset
    save_chromadb_to_hf()

def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=os.getenv("HF_KEY")):
    """Save ChromaDB data to Hugging Face Dataset."""
    client = init_chromadb()
    collection = client.get_collection(DB_NAME)
    
    # Fetch all data from ChromaDB
    results = collection.get(include=["documents", "metadatas", "embeddings"])
    data = {
        "code": results["documents"],
        "sequence": [meta["sequence"] for meta in results["metadatas"]],
        "vectors": results["embeddings"],  # ChromaDB already flattens embeddings
        "description_tokens": [meta.get('description_tokens', '') for meta in results["metadatas"]]
    }
    
    # Create a Hugging Face Dataset
    dataset = Dataset.from_dict(data)
    
    # Push to Hugging Face Hub
    dataset.push_to_hub(dataset_name, token=token)
    print(f"Dataset pushed to Hugging Face Hub as {dataset_name}")

if __name__ == "__main__":
    process_hf_dataset()
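
# Usage sketch (assumes parser.py and database.py sit alongside this script and
# expose parse_python_code, init_chromadb, store_program, DB_NAME and
# HF_DATASET_NAME as imported above):
#
#     pip install datasets chromadb python-dotenv
#     echo "HF_KEY=<your-hf-token>" > .env
#     python process_hf_dataset.py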