juanesbch committed on
Commit 308288b · 1 Parent(s): 0a74450

Create model.py

Files changed (1)
  1. model.py +81 -0
model.py ADDED
@@ -0,0 +1,81 @@
+ # Imports (the original file omitted these; they are required for the script to run)
+ import pandas as pd
+ import torch
+ import transformers as trf
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+ from nltk.corpus import stopwords
+ from nltk.tokenize import word_tokenize
+ from nltk.stem import SnowballStemmer
+ # nltk.download('stopwords') and nltk.download('punkt') may be needed on first use
+
+ # Load the dataset
+ df = pd.read_csv('data_larazon_publico_v2.csv')
+
+ # Define stopwords and stemmer for Spanish
+ stop_words = set(stopwords.words('spanish'))
+ stemmer = SnowballStemmer('spanish')
+
+ # Preprocess the text data
+ for i, row in df.iterrows():
+     # Tokenize the text
+     text = row['cuerpo']
+     tokens = word_tokenize(text.lower(), language='spanish')
+
+     # Remove stopwords and punctuation, and stem the remaining words
+     stemmed_tokens = [stemmer.stem(token) for token in tokens if token not in stop_words and token.isalpha()]
+
+     # Rejoin the stemmed tokens into a string and update the DataFrame
+     df.at[i, 'cuerpo'] = ' '.join(stemmed_tokens)
+
+ # Preprocess the data for summarization
+ tokenizer = AutoTokenizer.from_pretrained("it5/it5-base-news-summarization")
+ model = AutoModelForSeq2SeqLM.from_pretrained("it5/it5-base-news-summarization")
+ max_input_length = 512
+ max_output_length = 128
+
+ input_ids = []
+ attention_masks = []
+ output_ids = []
+
+ for i in range(len(df)):
+     input_text = df.iloc[i]['cuerpo']
+     output_text = df.iloc[i]['cuerpo']
+
+     input_encoded = tokenizer.encode_plus(input_text, add_special_tokens=True,
+                                           max_length=max_input_length, padding='max_length',
+                                           truncation=True, return_attention_mask=True,
+                                           return_tensors='pt')
+     output_encoded = tokenizer.encode_plus(output_text, add_special_tokens=True,
+                                            max_length=max_output_length, padding='max_length',
+                                            truncation=True, return_attention_mask=True,
+                                            return_tensors='pt')
+
+     input_ids.append(input_encoded['input_ids'])
+     attention_masks.append(input_encoded['attention_mask'])
+     output_ids.append(output_encoded['input_ids'])
+
+ input_ids = torch.cat(input_ids, dim=0)
+ attention_masks = torch.cat(attention_masks, dim=0)
+ output_ids = torch.cat(output_ids, dim=0)
+
+ # Training hyperparameters
+ batch_size = 200
+ learning_rate = 2e-5
+ num_epochs = 1
+
+ optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
+ scheduler = trf.get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0,
+                                                 num_training_steps=len(input_ids) // batch_size * num_epochs)
+
+ # Train the model
+ model.train()
+ for epoch in range(num_epochs):
+     for i in range(0, len(input_ids), batch_size):
+         batch_input_ids = input_ids[i:i+batch_size]
+         batch_attention_masks = attention_masks[i:i+batch_size]
+         batch_output_ids = output_ids[i:i+batch_size]
+
+         model.zero_grad()
+
+         # Teacher forcing: feed the target shifted right and score against the target shifted left
+         # Note: pad tokens in the labels are not masked to -100, so padding contributes to the loss
+         outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_masks,
+                         decoder_input_ids=batch_output_ids[:, :-1], labels=batch_output_ids[:, 1:])
+         loss = outputs[0]
+
+         loss.backward()
+         torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
+         optimizer.step()
+         scheduler.step()
+
+         if i % 1000 == 0:
+             print(f"Epoch: {epoch+1}, Batch: {i+1}/{len(input_ids)}, Loss: {loss.item()}")
+
+ # Save the trained model
+ model.save_pretrained('/Users/Juanes/Downloads/summarization_model')
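
Once training finishes, the saved model can be reloaded for inference. The sketch below is not part of the commit; it is a minimal example assuming the save path used by the script above, reusing the it5/it5-base-news-summarization tokenizer (the script does not save the tokenizer itself), and using a placeholder article string.

# Minimal inference sketch (assumes the directory written by model.save_pretrained above exists)
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("it5/it5-base-news-summarization")
model = AutoModelForSeq2SeqLM.from_pretrained('/Users/Juanes/Downloads/summarization_model')
model.eval()

article = "Text of the article to summarize..."  # placeholder input
inputs = tokenizer(article, return_tensors='pt', max_length=512, truncation=True)

with torch.no_grad():
    summary_ids = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'],
                                 max_length=128, num_beams=4, early_stopping=True)

print(tokenizer.decode(summary_ids[0], skip_special_tokens=True))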