broadfield-dev committed on
Commit
4058ab2
·
verified ·
1 Parent(s): 927956e

Update process_hf_dataset.py

Browse files
Files changed (1) hide show
  1. process_hf_dataset.py +26 -1
process_hf_dataset.py CHANGED
@@ -95,7 +95,7 @@ def generate_description_tokens(sequence, vectors, var_map=None):
95
 
96
  return tokens
97
 
98
- def generate_semantic_vector(description, total_lines=100):
99
  """Generate a 6D semantic vector for a textual description, matching our vector format."""
100
  # Use a simplified heuristic to map description to our 6D vector format
101
  category_map = {
@@ -120,6 +120,31 @@ def generate_semantic_vector(description, total_lines=100):
120
 
121
  return vector
122
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
  def process_hf_dataset():
124
  """Process the Hugging Face dataset and store programs in ChromaDB, aligning with vector categories and including instruction in vectors."""
125
  # Load the dataset
 
95
 
96
  return tokens
97
 
98
+ def generate_semantic_vector_og(description, total_lines=100):
99
  """Generate a 6D semantic vector for a textual description, matching our vector format."""
100
  # Use a simplified heuristic to map description to our 6D vector format
101
  category_map = {
 
120
 
121
  return vector
122
 
123
def generate_semantic_vector(description, total_lines=100, use_gpu=False):
    """Generate a 6D semantic vector for a textual description using CodeBERT.

    Encodes the description with ``microsoft/codebert-base``, mean-pools the
    last hidden states into one sentence embedding, and projects the 768-d
    result down to 6 dimensions by truncation so it matches the 6D vector
    format used elsewhere in this module.

    Args:
        description: Text to embed.
        total_lines: Unused; retained for backward compatibility with the
            original heuristic implementation's signature.
        use_gpu: If True and CUDA is available, run the model on the GPU.

    Returns:
        list[float]: A 6-element embedding vector.
    """
    device = torch.device("cuda" if use_gpu and torch.cuda.is_available() else "cpu")

    # Cache the tokenizer/model per device on the function object so repeated
    # calls do not reload CodeBERT from disk every time. The original body
    # re-instantiated both the tokenizer and the model on every single call,
    # which dominates runtime when vectorizing a whole dataset.
    cache = getattr(generate_semantic_vector, "_model_cache", None)
    if cache is None:
        cache = generate_semantic_vector._model_cache = {}
    cache_key = str(device)
    if cache_key not in cache:
        model_name = "microsoft/codebert-base"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModel.from_pretrained(model_name).to(device)
        model.eval()  # ensure eval mode (no dropout) for deterministic embeddings
        cache[cache_key] = (tokenizer, model)
    tokenizer, model = cache[cache_key]

    # Tokenize and move input tensors to the model's device.
    inputs = tokenizer(description, return_tensors="pt", padding=True,
                       truncation=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Mean-pool the final hidden states into a single embedding vector.
    with torch.no_grad():
        outputs = model(**inputs)
        vector = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy().tolist()

    # Project to 6D: pad with zeros if too short, truncate if too long.
    # (CodeBERT's hidden size is 768, so truncation is the normal path.)
    if len(vector) < 6:
        vector.extend([0] * (6 - len(vector)))
    elif len(vector) > 6:
        vector = vector[:6]

    return vector
148
  def process_hf_dataset():
149
  """Process the Hugging Face dataset and store programs in ChromaDB, aligning with vector categories and including instruction in vectors."""
150
  # Load the dataset