juanesbch
/

summarization_model

Model card Files Files and versions Community

juanesbch committed on Apr 10, 2023

Commit

308288b

·

1 Parent(s): 0a74450

Create model.py

Files changed (1) hide show

model.py +81 -0

model.py ADDED Viewed

	@@ -0,0 +1,81 @@

+# Read the article corpus from disk.
+df = pd.read_csv('data_larazon_publico_v2.csv')
+# Spanish language resources: Snowball stemmer and the stopword set used for filtering.
+stemmer = SnowballStemmer('spanish')
+stop_words = set(stopwords.words('spanish'))
+# Normalize every article body in place: lowercase and tokenize it, drop
+# stopwords and non-alphabetic tokens, stem what is left, and write the
+# space-joined result back into the same 'cuerpo' cell.
+for i, row in df.iterrows():
+    words = word_tokenize(row['cuerpo'].lower(), language='spanish')
+    kept = []
+    for word in words:
+        if word in stop_words or not word.isalpha():
+            continue
+        kept.append(stemmer.stem(word))
+    df.at[i, 'cuerpo'] = ' '.join(kept)
+# Load the pretrained seq2seq checkpoint and turn the corpus into fixed-length
+# tensors: input_ids / attention_masks for the encoder, output_ids as targets.
+# NOTE(review): the checkpoint name suggests an Italian news model while the
+# text preprocessing in this script is Spanish -- confirm this is intended.
+tokenizer = AutoTokenizer.from_pretrained("it5/it5-base-news-summarization")
+model = AutoModelForSeq2SeqLM.from_pretrained("it5/it5-base-news-summarization")
+max_input_length = 512
+max_output_length = 128
+# NOTE(review): inputs and targets are both read from the 'cuerpo' column, so
+# the model is trained to reproduce the (truncated) article rather than a
+# summary. If the CSV has a summary column it should feed output_texts --
+# confirm against the dataset schema.
+input_texts = df['cuerpo'].tolist()
+output_texts = df['cuerpo'].tolist()
+# padding='max_length' replaces the deprecated pad_to_max_length=True, and
+# truncation=True is requested explicitly so every row is exactly max_length
+# tokens wide. Encoding the whole list in one call yields the stacked
+# (num_rows, max_length) tensors directly, with no per-row torch.cat.
+input_encoded = tokenizer(input_texts, add_special_tokens=True,
+                          max_length=max_input_length, padding='max_length',
+                          truncation=True, return_attention_mask=True,
+                          return_tensors='pt')
+output_encoded = tokenizer(output_texts, add_special_tokens=True,
+                           max_length=max_output_length, padding='max_length',
+                           truncation=True, return_attention_mask=True,
+                           return_tensors='pt')
+input_ids = input_encoded['input_ids']
+attention_masks = input_encoded['attention_mask']
+output_ids = output_encoded['input_ids']
+# Optimization hyperparameters.
+batch_size = 200
+learning_rate = 2e-5
+num_epochs = 1
+optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
+# The loop below performs ceil(N / batch_size) optimizer steps per epoch (the
+# last batch may be short), so the schedule is sized with ceiling division.
+# Floor division undercounts whenever N is not a multiple of batch_size, which
+# made the linear decay reach zero before the final batch was processed.
+steps_per_epoch = (len(input_ids) + batch_size - 1) // batch_size
+scheduler = trf.get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0,
+                                                num_training_steps=steps_per_epoch * num_epochs)
+# Train the model
+model.train()
+for epoch in range(num_epochs):
+    for i in range(0, len(input_ids), batch_size):
+        batch_input_ids = input_ids[i:i+batch_size]
+        batch_attention_masks = attention_masks[i:i+batch_size]
+        # Clone before masking so the shared output_ids tensor is not modified.
+        batch_labels = output_ids[i:i+batch_size].clone()
+        # -100 is the index the loss ignores; without this the padding
+        # positions of every target contribute to the loss.
+        batch_labels[batch_labels == tokenizer.pad_token_id] = -100
+        optimizer.zero_grad()
+        # Only labels are passed: the model builds decoder_input_ids itself by
+        # shifting the labels right and prepending the decoder start token,
+        # which the previous manual [:, :-1] / [:, 1:] slicing did not provide.
+        outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_masks,
+                        labels=batch_labels)
+        loss = outputs.loss
+        loss.backward()
+        # Clip the global gradient norm to 1.0 before the parameter update.
+        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
+        optimizer.step()
+        scheduler.step()
+        # i is a sample offset that advances by batch_size, so this logs
+        # whenever the offset is a multiple of 1000.
+        if i % 1000 == 0:
+            print(f"Epoch: {epoch+1}, Batch: {i+1}/{len(input_ids)}, Loss: {loss.item()}")
+# Save the trained model
+# NOTE(review): absolute, user-specific output path; only the model weights and
+# config are written here, the tokenizer is not saved alongside them.
+model.save_pretrained('/Users/Juanes/Downloads/summarization_model')