import nltk
import pandas as pd
import torch
import transformers as trf
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Download the NLTK resources used below (no-op if already present)
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)

# Load the dataset
df = pd.read_csv('data_larazon_publico_v2.csv')
# Define stopwords and stemmer for Spanish
stop_words = set(stopwords.words('spanish'))
stemmer = SnowballStemmer('spanish')
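# Snowball stemming collapses inflected Spanish forms to a shared root, which shrinks
# the vocabulary; note the result is lossy text for the generative summarizer below.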
# Preprocess the text data
for i, row in df.iterrows():
    # Tokenize the text
    text = row['cuerpo']
    tokens = word_tokenize(text.lower(), language='spanish')
    # Remove stopwords and punctuation, then stem the remaining words
    stemmed_tokens = [stemmer.stem(token) for token in tokens
                      if token not in stop_words and token.isalpha()]
    # Rejoin the stemmed tokens into a string and update the DataFrame
    df.at[i, 'cuerpo'] = ' '.join(stemmed_tokens)
# Preprocess the data for summarization
tokenizer = AutoTokenizer.from_pretrained("it5/it5-base-news-summarization")
model = AutoModelForSeq2SeqLM.from_pretrained("it5/it5-base-news-summarization")
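# NOTE: this checkpoint is IT5, an Italian news-summarization model; for a Spanish
# corpus a Spanish or multilingual checkpoint (e.g. mT5) would likely fit better.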
max_input_length = 512
max_output_length = 128
input_ids = []
attention_masks = []
output_ids = []
for i in range(len(df)):
    input_text = df.iloc[i]['cuerpo']
    # NOTE: the target is the same column as the input; a real summarization set-up
    # would use a dedicated summary column as the label.
    output_text = df.iloc[i]['cuerpo']
    input_encoded = tokenizer.encode_plus(input_text, add_special_tokens=True,
                                          max_length=max_input_length, padding='max_length',
                                          truncation=True, return_attention_mask=True,
                                          return_tensors='pt')
    output_encoded = tokenizer.encode_plus(output_text, add_special_tokens=True,
                                           max_length=max_output_length, padding='max_length',
                                           truncation=True, return_attention_mask=True,
                                           return_tensors='pt')
    input_ids.append(input_encoded['input_ids'])
    attention_masks.append(input_encoded['attention_mask'])
    output_ids.append(output_encoded['input_ids'])
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
output_ids = torch.cat(output_ids, dim=0)
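# After concatenation: input_ids and attention_masks are (len(df), 512) tensors and
# output_ids is (len(df), 128), since every example was padded to its max length.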
batch_size = 200
learning_rate = 2e-5
num_epochs = 1
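# A batch size of 200 with 512-token inputs is large for most single GPUs; in practice
# this would typically be reduced (or gradient accumulation used instead).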
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
scheduler = trf.get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0,
    num_training_steps=len(input_ids) // batch_size * num_epochs)
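# With num_warmup_steps=0 this schedule simply decays the learning rate linearly
# from 2e-5 down to 0 over the total number of training batches.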
# Train the model
model.train()
for epoch in range(num_epochs):
    for i in range(0, len(input_ids), batch_size):
        batch_input_ids = input_ids[i:i+batch_size]
        batch_attention_masks = attention_masks[i:i+batch_size]
        batch_output_ids = output_ids[i:i+batch_size]
        model.zero_grad()
        # Mask padding positions so they do not contribute to the loss; the model
        # shifts the labels internally to build the decoder inputs.
        labels = batch_output_ids.clone()
        labels[labels == tokenizer.pad_token_id] = -100
        outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_masks,
                        labels=labels)
        loss = outputs.loss
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        if i % 1000 == 0:
            print(f"Epoch: {epoch+1}, Example: {i}/{len(input_ids)}, Loss: {loss.item():.4f}")
# Save the trained model
model.save_pretrained('/Users/Juanes/Downloads/summarization_model') |
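
# Saving the tokenizer alongside the model makes the checkpoint self-contained (an
# addition to the original script, which saved only the model weights).
tokenizer.save_pretrained('/Users/Juanes/Downloads/summarization_model')

# --- Usage sketch: reload the checkpoint and summarize one article. Here `sample_text`
# is just the preprocessed body of the first row; the generation settings (beam count,
# lengths) are illustrative assumptions, not values from the original script.
model = AutoModelForSeq2SeqLM.from_pretrained('/Users/Juanes/Downloads/summarization_model')
model.eval()
sample_text = df.iloc[0]['cuerpo']
inputs = tokenizer(sample_text, max_length=max_input_length, truncation=True,
                   return_tensors='pt')
with torch.no_grad():
    summary_ids = model.generate(inputs['input_ids'],
                                 attention_mask=inputs['attention_mask'],
                                 max_length=max_output_length,
                                 num_beams=4, early_stopping=True)
print(tokenizer.decode(summary_ids[0], skip_special_tokens=True))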