# Fine-tune a seq2seq model to summarise Spanish news articles.
import pandas as pd
import torch
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
from transformers import (AutoTokenizer, AutoModelForSeq2SeqLM,
                          get_linear_schedule_with_warmup)

# Assumed schema: 'cuerpo' holds the article body, and a separate summary
# column ('resumen' below) serves as the training target; adjust both names
# to match the actual CSV.
df = pd.read_csv('data_larazon_publico_v2.csv')

# Spanish stopword list and stemmer for normalising the article bodies
# (run nltk.download('stopwords') and nltk.download('punkt') once if missing).
stop_words = set(stopwords.words('spanish'))
stemmer = SnowballStemmer('spanish')
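
# Optional sanity check: see how the Snowball stemmer truncates a sample word.
print(stemmer.stem('corriendo'))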

# Normalise each article body in place: lowercase, tokenise, drop stopwords
# and non-alphabetic tokens, then stem.
for i, row in df.iterrows():
    text = row['cuerpo']
    tokens = word_tokenize(text.lower(), language='spanish')
    stemmed_tokens = [stemmer.stem(token) for token in tokens
                      if token not in stop_words and token.isalpha()]
    df.at[i, 'cuerpo'] = ' '.join(stemmed_tokens)
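
# Optional: inspect a few normalised bodies to verify the preprocessing.
print(df['cuerpo'].head())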

# Note: this it5 checkpoint was pretrained for Italian news summarisation;
# a Spanish or multilingual checkpoint may be a better fit for this corpus.
tokenizer = AutoTokenizer.from_pretrained("it5/it5-base-news-summarization")
model = AutoModelForSeq2SeqLM.from_pretrained("it5/it5-base-news-summarization")

# Train on GPU when available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

max_input_length = 512
max_output_length = 128
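
# Optional: check how the pretrained tokenizer splits stemmed Spanish text
# into subwords (the sample string is illustrative).
print(tokenizer.tokenize('gobiern anunci nuev medid'))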

input_ids = []
attention_masks = []
output_ids = []

# Encode every (article, summary) pair. 'resumen' is an assumed column name;
# the target must be a genuine summary rather than the article body itself,
# otherwise the model only learns to copy its input.
for i in range(len(df)):
    input_text = df.iloc[i]['cuerpo']
    output_text = df.iloc[i]['resumen']

    # Tokenise with padding and truncation to the fixed lengths above.
    input_encoded = tokenizer(input_text, add_special_tokens=True,
                              max_length=max_input_length, padding='max_length',
                              truncation=True, return_attention_mask=True,
                              return_tensors='pt')
    output_encoded = tokenizer(output_text, add_special_tokens=True,
                               max_length=max_output_length, padding='max_length',
                               truncation=True, return_attention_mask=True,
                               return_tensors='pt')

    input_ids.append(input_encoded['input_ids'])
    attention_masks.append(input_encoded['attention_mask'])
    output_ids.append(output_encoded['input_ids'])

# Stack the per-example tensors into single (num_examples, seq_len) tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
output_ids = torch.cat(output_ids, dim=0)
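
# Optional sanity check: shapes should be (len(df), 512) and (len(df), 128).
print(input_ids.shape, attention_masks.shape, output_ids.shape)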

# Training hyperparameters. A batch of 200 sequences of 512 tokens far exceeds
# most single-GPU memory budgets; reduce batch_size (e.g. to 8) if training
# runs out of memory.
batch_size = 200
learning_rate = 2e-5
num_epochs = 1

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0,
    num_training_steps=len(input_ids) // batch_size * num_epochs)

# Teacher-forced fine-tuning loop. Passing labels alone is sufficient for
# Hugging Face seq2seq models: the decoder inputs are built internally by
# shifting the labels right, and the cross-entropy loss is returned directly.
model.train()
for epoch in range(num_epochs):
    for i in range(0, len(input_ids), batch_size):
        batch_input_ids = input_ids[i:i+batch_size].to(device)
        batch_attention_masks = attention_masks[i:i+batch_size].to(device)
        batch_output_ids = output_ids[i:i+batch_size].to(device)

        # Mask padding positions so they do not contribute to the loss.
        labels = batch_output_ids.clone()
        labels[labels == tokenizer.pad_token_id] = -100

        model.zero_grad()
        outputs = model(input_ids=batch_input_ids,
                        attention_mask=batch_attention_masks,
                        labels=labels)
        loss = outputs.loss

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        if i % 1000 == 0:
            print(f"Epoch: {epoch+1}, Example: {i}/{len(input_ids)}, "
                  f"Loss: {loss.item():.4f}")
# Persist the fine-tuned weights together with the tokenizer so the model can
# be reloaded as a standalone checkpoint.
model.save_pretrained('/Users/Juanes/Downloads/summarization_model')
tokenizer.save_pretrained('/Users/Juanes/Downloads/summarization_model')
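
# Usage sketch (optional): reload the checkpoint and summarise one article.
# Generation settings (beam search, length limit) are illustrative, not tuned.
model = AutoModelForSeq2SeqLM.from_pretrained(
    '/Users/Juanes/Downloads/summarization_model').to(device)
model.eval()
sample = df.iloc[0]['cuerpo']
inputs = tokenizer(sample, max_length=max_input_length, truncation=True,
                   return_tensors='pt').to(device)
with torch.no_grad():
    summary_ids = model.generate(**inputs, max_length=max_output_length,
                                 num_beams=4)
print(tokenizer.decode(summary_ids[0], skip_special_tokens=True))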