broadfield-dev committed on
Commit
9e89af0
·
verified ·
1 Parent(s): 4058ab2

Update process_hf_dataset.py

Browse files
Files changed (1) hide show
  1. process_hf_dataset.py +8 -6
process_hf_dataset.py CHANGED
@@ -120,13 +120,15 @@ def generate_semantic_vector_og(description, total_lines=100):
120
 
121
  return vector
122
 
 
 
 
 
 
 
 
123
  def generate_semantic_vector(description, total_lines=100, use_gpu=False):
124
- """Generate a 6D semantic vector for a textual description using CodeBERT, projecting to 6D."""
125
- # Load CodeBERT model and tokenizer
126
- model_name = "microsoft/codebert-base"
127
- tokenizer = AutoTokenizer.from_pretrained(model_name)
128
- device = torch.device("cuda" if use_gpu and torch.cuda.is_available() else "cpu")
129
- model = AutoModel.from_pretrained(model_name).to(device)
130
 
131
  # Tokenize and encode the description
132
  inputs = tokenizer(description, return_tensors="pt", padding=True, truncation=True, max_length=512)
 
120
 
121
  return vector
122
 
123
+
124
+ """Generate a 6D semantic vector for a textual description using CodeBERT, projecting to 6D."""
125
+ # Load CodeBERT model and tokenizer
126
+ model_name = "microsoft/codebert-base"
127
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
128
+ device = torch.device("cuda" if use_gpu and torch.cuda.is_available() else "cpu")
129
+ model = AutoModel.from_pretrained(model_name).to(device)
130
  def generate_semantic_vector(description, total_lines=100, use_gpu=False):
131
+
 
 
 
 
 
132
 
133
  # Tokenize and encode the description
134
  inputs = tokenizer(description, return_tensors="pt", padding=True, truncation=True, max_length=512)