import pandas as pd
import torch
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, get_linear_schedule_with_warmup

# Load the dataset
df = pd.read_csv('data_larazon_publico_v2.csv')

# Define stopwords and stemmer for Spanish
stop_words = set(stopwords.words('spanish'))
stemmer = SnowballStemmer('spanish')

# Preprocess the text data
for i, row in df.iterrows():
    # Tokenize the text
    text = row['cuerpo']
    tokens = word_tokenize(text.lower(), language='spanish')
    # Remove stopwords and punctuation, and stem the remaining words
    stemmed_tokens = [stemmer.stem(token) for token in tokens
                      if token not in stop_words and token.isalpha()]
    # Rejoin the stemmed tokens into a string and update the DataFrame
    df.at[i, 'cuerpo'] = ' '.join(stemmed_tokens)

# Preprocess the data for summarization
tokenizer = AutoTokenizer.from_pretrained("it5/it5-base-news-summarization")
model = AutoModelForSeq2SeqLM.from_pretrained("it5/it5-base-news-summarization")

max_input_length = 512
max_output_length = 128

input_ids = []
attention_masks = []
output_ids = []

for i in range(len(df)):
    # Note: both the input and the target come from 'cuerpo', so the model is
    # trained to reproduce the article body rather than a reference summary.
    input_text = df.iloc[i]['cuerpo']
    output_text = df.iloc[i]['cuerpo']
    input_encoded = tokenizer.encode_plus(input_text,
                                          add_special_tokens=True,
                                          max_length=max_input_length,
                                          padding='max_length',
                                          truncation=True,
                                          return_attention_mask=True,
                                          return_tensors='pt')
    output_encoded = tokenizer.encode_plus(output_text,
                                           add_special_tokens=True,
                                           max_length=max_output_length,
                                           padding='max_length',
                                           truncation=True,
                                           return_attention_mask=True,
                                           return_tensors='pt')
    input_ids.append(input_encoded['input_ids'])
    attention_masks.append(input_encoded['attention_mask'])
    output_ids.append(output_encoded['input_ids'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
output_ids = torch.cat(output_ids, dim=0)

# Mask padding tokens in the targets so they are ignored by the loss
output_ids[output_ids == tokenizer.pad_token_id] = -100

batch_size = 200
learning_rate = 2e-5
num_epochs = 1

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=len(input_ids) // batch_size * num_epochs)

# Train the model
model.train()
for epoch in range(num_epochs):
    for i in range(0, len(input_ids), batch_size):
        batch_input_ids = input_ids[i:i+batch_size]
        batch_attention_masks = attention_masks[i:i+batch_size]
        batch_output_ids = output_ids[i:i+batch_size]

        model.zero_grad()
        # Passing labels lets the model shift them internally to build the
        # decoder inputs and compute the cross-entropy loss.
        outputs = model(input_ids=batch_input_ids,
                        attention_mask=batch_attention_masks,
                        labels=batch_output_ids)
        loss = outputs.loss
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        if i % 1000 == 0:
            print(f"Epoch: {epoch+1}, Example: {i+1}/{len(input_ids)}, Loss: {loss.item()}")

# Save the trained model
model.save_pretrained('/Users/Juanes/Downloads/summarization_model')
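
# A minimal inference sketch, not part of the original script: it assumes the
# model saved above, the same tokenizer, and that the text passed in has been
# preprocessed the same way as the training data. The names `trained_model`,
# `sample_text`, and `summary_ids` are illustrative, and beam search with
# num_beams=4 is just one reasonable decoding choice.
trained_model = AutoModelForSeq2SeqLM.from_pretrained('/Users/Juanes/Downloads/summarization_model')
trained_model.eval()

sample_text = df.iloc[0]['cuerpo']
inputs = tokenizer(sample_text,
                   max_length=max_input_length,
                   truncation=True,
                   return_tensors='pt')

with torch.no_grad():
    # Generate a summary of at most max_output_length tokens
    summary_ids = trained_model.generate(inputs['input_ids'],
                                         attention_mask=inputs['attention_mask'],
                                         max_length=max_output_length,
                                         num_beams=4)

print(tokenizer.decode(summary_ids[0], skip_special_tokens=True))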