# Spaces:
# Sleeping
# Sleeping
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
# Load the CSV file that holds the sentences to embed.
csv_file = "your_file.csv"  # Path to your CSV file
df = pd.read_csv(csv_file)

# The sentences are read from the column named 'text'. pandas represents
# missing cells as float NaN, which the tokenizer cannot process, so they are
# replaced with empty strings (instead of being dropped) to keep the list
# aligned row-for-row with the DataFrame.
sentences = df['text'].fillna("").astype(str).tolist()

# Load the Romanian BERT model and its matching tokenizer.
model_name = 'dumitrescustefan/bert-base-romanian-cased-v1'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
# Function to get sentence embedding | |
def get_sentence_embedding(sentence, model, tokenizer, max_length=128):
    """Return the first-token (CLS) embedding of ``sentence`` as a NumPy array.

    Args:
        sentence: Text to embed; passed unchanged to ``tokenizer``.
        model: Callable that accepts the tokenizer output as keyword
            arguments and returns an object with a ``last_hidden_state``
            tensor indexed as (batch, token, feature).
        tokenizer: Callable that turns ``sentence`` into a mapping of
            PyTorch tensors (it is called with ``return_tensors='pt'``).
        max_length: Maximum number of tokens kept after truncation.
            Defaults to 128.

    Returns:
        A ``numpy.ndarray`` holding position 0 of ``last_hidden_state`` for
        every row of the batch (one row when a single sentence is given).
    """
    inputs = tokenizer(
        sentence,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=max_length,
    )

    # Run the inputs on the same device as the model; a model moved to an
    # accelerator would otherwise fail on CPU-resident input tensors.
    device = getattr(model, "device", None)
    if device is not None:
        inputs = {
            name: value.to(device) if isinstance(value, torch.Tensor) else value
            for name, value in inputs.items()
        }

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**inputs)

    cls_embedding = outputs.last_hidden_state[:, 0, :]  # CLS token embedding

    # Tensor.numpy() only works for CPU tensors; .cpu() is a no-op when the
    # tensor already lives on the CPU.
    return cls_embedding.cpu().numpy()
# Generate one embedding per sentence.
embeddings = [get_sentence_embedding(sentence, model, tokenizer) for sentence in sentences]

# Convert to a 2-D numpy array with one row per sentence.
if embeddings:
    embeddings = np.array(embeddings).reshape(len(sentences), -1)
else:
    # With no sentences, reshape(0, -1) cannot infer the missing dimension and
    # raises ValueError, so build the empty (0, hidden_size) array directly.
    embeddings = np.empty((0, model.config.hidden_size), dtype=np.float32)

# Save embeddings to a file (optional)
np.save("sentence_embeddings.npy", embeddings)

# Save the sentences together with their embeddings for reference (optional).
# Each embedding is stored as a Python list in the 'embeddings' column.
df['embeddings'] = embeddings.tolist()
df.to_csv("embeddings_with_text.csv", index=False)