seanzxj committed on
Commit
895eef3
·
verified ·
1 Parent(s): 0d73126

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +15 -64
README.md CHANGED
@@ -49,6 +49,13 @@ filename='lr_clf_test2.joblib'
49
  model_file_path=hf_hub_download(repo_id=repo_id, filename=filename) <br>
50
  model=joblib.load(model_file_path)
51
  print(model)
 
 
 
 
 
 
 
52
 
53
  #Load test dataset (assuming the name is the same as the one in the Ed post) <br>
54
  test_df = pd.read_csv(file_path)
@@ -215,78 +222,22 @@ X_test = X_test.dropna(subset = ['title'])
215
  X_test = handle_missing_data(X_test, 'title')
216
  X_test = consistency_checks(X_test, 'title') </pre>
217
 
218
- # Load the embedding model from Huggingface. Transformer: DistilBERT
219
-
220
-
221
- <pre>
222
- def get_embeddings(text_all, tokenizer, model, device, max_len=128):
223
- '''
224
- Generate embeddings using a transformer model on GPU if available.
225
- Args:
226
- - text_all: List of input texts
227
- - tokenizer: Tokenizer for the model
228
- - model: Transformer model
229
- - device: torch.device to run the computations
230
- - max_len: Maximum token length for the input
231
- Returns:
232
- - embeddings: List of embeddings for each input text
233
- '''
234
- embeddings = []
235
-
236
- count = 0
237
- print('Start embeddings:')
238
-
239
- for text in text_all:
240
- count += 1
241
- if count % (len(text_all) // 10) == 0:
242
- print(f'{count / len(text_all) * 100:.1f}% done ...')
243
-
244
- # Tokenize the input text
245
- model_input_token = tokenizer(
246
- text,
247
- add_special_tokens=True,
248
- max_length=max_len,
249
- padding='max_length',
250
- truncation=True,
251
- return_tensors='pt'
252
- ).to(device) # Move input tensors to GPU
253
-
254
- # Generate embeddings without gradient computation
255
- with torch.no_grad():
256
- model_output = model(**model_input_token)
257
- cls_embedding = model_output.last_hidden_state[:, 0, :] # Use CLS token embedding
258
- cls_embedding = cls_embedding.squeeze().cpu().numpy() # Move back to CPU for numpy
259
- embeddings.append(cls_embedding)
260
-
261
- return embeddings </pre>
262
-
263
-
264
- # Check for GPU availability
265
- <pre>
266
- device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
267
- print(f'Using device: {device}')
268
-
269
- # Load the tokenizer and model for 'all-mpnet-base-v2'
270
- print("Loading model and tokenizer...")
271
- # Load model and tokenizer
272
- tokenizer_news = AutoTokenizer.from_pretrained('distilbert-base-uncased')
273
- model_news = AutoModel.from_pretrained('distilbert-base-uncased').to(device)
274
 
275
- # Set the model to evaluation mode
276
- model_news.eval()
277
 
278
- ############################################# DBERT UNCASED Embedding #############################################
279
  ############################################# Embedding #############################################
280
- print("Computing DBERT embeddings for training data...")
 
281
 
282
  y_test = X_test['labels']
283
  X_test = X_test['title']
284
-
285
- X_test_embeddings_DBERT = get_embeddings(X_test, tokenizer_news, model_news, device, max_len=128)
286
- print("DBERT embeddings for training data computed!")
287
 
 
288
 
289
- prediction = model.predict(X_test_embeddings_DBERT)
 
 
 
290
  </pre>
291
  # Accuracy
292
  <pre>label_map = {'NBC': 0, 'FoxNews': 1}
 
49
  model_file_path=hf_hub_download(repo_id=repo_id, filename=filename) <br>
50
  model=joblib.load(model_file_path)
51
  print(model)
52
+
53
+ repo_id2='awngsz/tfidf_model' ############# <--- check tfidf model name
54
+ filename2='embed_tfidf.joblib'
55
+
56
+ model_file_path2=hf_hub_download(repo_id=repo_id2, filename=filename2) <br>
57
+ tfidf_model=joblib.load(model_file_path2)
58
+ print(tfidf_model)
59
 
60
  #Load test dataset (assuming the name is the same as the one in the Ed post) <br>
61
  test_df = pd.read_csv(file_path)
 
222
  X_test = handle_missing_data(X_test, 'title')
223
  X_test = consistency_checks(X_test, 'title') </pre>
224
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
225
 
 
 
226
 
227
+ ############################################# TF-IDF Embedding #############################################
228
  ############################################# Embedding #############################################
229
+ from sklearn.feature_extraction.text import TfidfVectorizer
230
+ print("Computing embeddings ...")
231
 
232
  y_test = X_test['labels']
233
  X_test = X_test['title']
 
 
 
234
 
235
+ X_test_tfidf = tfidf_model.transform(X_test)
236
 
237
+ #X_test_embeddings_DBERT = get_embeddings(X_test, tokenizer_news, model_news, device, max_len=128)
238
+ print("Embeddings computed!")
239
+
240
+ prediction = model.predict(X_test_tfidf)
241
  </pre>
242
  # Accuracy
243
  <pre>label_map = {'NBC': 0, 'FoxNews': 1}