Update process_hf_dataset.py
process_hf_dataset.py CHANGED (+26 -1)
@@ -95,7 +95,7 @@ def generate_description_tokens(sequence, vectors, var_map=None):
 
     return tokens
 
-def generate_semantic_vector(description, total_lines=100):
+def generate_semantic_vector_og(description, total_lines=100):
     """Generate a 6D semantic vector for a textual description, matching our vector format."""
     # Use a simplified heuristic to map description to our 6D vector format
     category_map = {
@@ -120,6 +120,31 @@ def generate_semantic_vector(description, total_lines=100):
 
     return vector
 
+def generate_semantic_vector(description, total_lines=100, use_gpu=False):
+    """Generate a 6D semantic vector for a textual description using CodeBERT, projecting to 6D."""
+    # Load CodeBERT model and tokenizer
+    model_name = "microsoft/codebert-base"
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    device = torch.device("cuda" if use_gpu and torch.cuda.is_available() else "cpu")
+    model = AutoModel.from_pretrained(model_name).to(device)
+
+    # Tokenize and encode the description
+    inputs = tokenizer(description, return_tensors="pt", padding=True, truncation=True, max_length=512)
+    inputs = {k: v.to(device) for k, v in inputs.items()}
+
+    # Generate embeddings
+    with torch.no_grad():
+        outputs = model(**inputs)
+        # Use mean pooling of the last hidden states
+        vector = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy().tolist()
+
+    # Truncate or project to 6D (simplified projection: take first 6 dimensions)
+    if len(vector) < 6:
+        vector.extend([0] * (6 - len(vector)))
+    elif len(vector) > 6:
+        vector = vector[:6]  # Truncate to 6D
+
+    return vector
+
 def process_hf_dataset():
     """Process the Hugging Face dataset and store programs in ChromaDB, aligning with vector categories and including instruction in vectors."""
     # Load the dataset
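
For reference, a minimal usage sketch that exercises both functions after this change: the renamed heuristic generate_semantic_vector_og and the new CodeBERT-backed generate_semantic_vector. This sketch is not part of the commit; it assumes process_hf_dataset.py already imports torch and transformers' AutoTokenizer/AutoModel at module level (the hunk does not show the import section), and the example description string is hypothetical.

# Hypothetical usage sketch, not part of the commit shown above.
# Assumes process_hf_dataset.py is importable and its module-level imports
# (torch, transformers AutoTokenizer/AutoModel) are already in place.
from process_hf_dataset import generate_semantic_vector, generate_semantic_vector_og

description = "Sort a list of integers in ascending order"  # example input, chosen for illustration

vec_codebert = generate_semantic_vector(description, use_gpu=False)  # first 6 dims of the CodeBERT embedding
vec_heuristic = generate_semantic_vector_og(description)             # 6D vector from the category heuristic

print(len(vec_codebert), vec_codebert)    # 6 floats
print(len(vec_heuristic), vec_heuristic)  # 6 floats

Note that the new function loads the CodeBERT tokenizer and model on every call, so repeated calls will be slow unless the model is cached or hoisted to module scope, and the 6D output is the first-6-dimensions truncation that the commit's own comment describes as a simplified projection.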