juanesbch committed
Commit d37aee2 · 1 Parent(s): 8e12542

Delete model.py

Files changed (1)
  1. model.py +0 -81
model.py DELETED
@@ -1,81 +0,0 @@
-# Load the dataset
-df = pd.read_csv('data_larazon_publico_v2.csv')
-
-# Define stopwords and stemmer for Spanish
-stop_words = set(stopwords.words('spanish'))
-stemmer = SnowballStemmer('spanish')
-
-# Preprocess the text data
-for i, row in df.iterrows():
-    # Tokenize the text
-    text = row['cuerpo']
-    tokens = word_tokenize(text.lower(), language='spanish')
-
-    # Remove stopwords, punctuation and stem the remaining words
-    stemmed_tokens = [stemmer.stem(token) for token in tokens if token not in stop_words and token.isalpha()]
-
-    # Rejoin the stemmed tokens into a string and update the DataFrame
-    df.at[i, 'cuerpo'] = ' '.join(stemmed_tokens)
-
-# Preprocess the data for summarization
-tokenizer = AutoTokenizer.from_pretrained("it5/it5-base-news-summarization")
-model = AutoModelForSeq2SeqLM.from_pretrained("it5/it5-base-news-summarization")
-max_input_length = 512
-max_output_length = 128
-
-input_ids = []
-attention_masks = []
-output_ids = []
-
-for i in range(len(df)):
-    input_text = df.iloc[i]['cuerpo']
-    output_text = df.iloc[i]['cuerpo']
-
-    input_encoded = tokenizer.encode_plus(input_text, add_special_tokens=True,
-                                          max_length=max_input_length, pad_to_max_length=True,
-                                          return_attention_mask=True, return_tensors='pt')
-    output_encoded = tokenizer.encode_plus(output_text, add_special_tokens=True,
-                                           max_length=max_output_length, pad_to_max_length=True,
-                                           return_attention_mask=True, return_tensors='pt')
-
-    input_ids.append(input_encoded['input_ids'])
-    attention_masks.append(input_encoded['attention_mask'])
-    output_ids.append(output_encoded['input_ids'])
-
-input_ids = torch.cat(input_ids, dim=0)
-attention_masks = torch.cat(attention_masks, dim=0)
-output_ids = torch.cat(output_ids, dim=0)
-
-
-batch_size = 200
-learning_rate = 2e-5
-num_epochs = 1
-
-optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
-scheduler = trf.get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0,
-                                                num_training_steps=len(input_ids) // batch_size * num_epochs)
-
-# Train the model
-model.train()
-for epoch in range(num_epochs):
-    for i in range(0, len(input_ids), batch_size):
-        batch_input_ids = input_ids[i:i+batch_size]
-        batch_attention_masks = attention_masks[i:i+batch_size]
-        batch_output_ids = output_ids[i:i+batch_size]
-
-        model.zero_grad()
-
-        outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_masks,
-                        decoder_input_ids=batch_output_ids[:, :-1], labels=batch_output_ids[:, 1:].reshape(-1, 1))
-        loss = outputs[0]
-
-        loss.backward()
-        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
-        optimizer.step()
-        scheduler.step()
-
-        if i % 1000 == 0:
-            print(f"Epoch: {epoch+1}, Batch: {i+1}/{len(input_ids)}, Loss: {loss.item()}")
-
-# Save the trained model
-model.save_pretrained('/Users/Juanes/Downloads/summarization_model')
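
Note on the deleted script: as committed it would not run. It used pandas, NLTK, torch, and transformers without importing them, relied on the deprecated pad_to_max_length argument, fed the article body ('cuerpo') in as both input and target, and passed a labels tensor reshaped to (-1, 1), which is not the shape a seq2seq model expects; the it5/it5-base-news-summarization checkpoint is also an Italian-news model rather than a Spanish one. The sketch below is a minimal corrected version for anyone restoring this functionality, not the author's original implementation. It keeps the dataset path, column name, checkpoint, and output path from the deleted file; the smaller batch size and the padding-masked labels are assumptions of this sketch.

import pandas as pd
import nltk
import torch
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, get_linear_schedule_with_warmup

nltk.download('stopwords')
nltk.download('punkt')

# Load the dataset (same CSV and column as the deleted file)
df = pd.read_csv('data_larazon_publico_v2.csv')

# Spanish stopword removal and stemming, as in the deleted script
stop_words = set(stopwords.words('spanish'))
stemmer = SnowballStemmer('spanish')
df['cuerpo'] = df['cuerpo'].astype(str).apply(
    lambda text: ' '.join(
        stemmer.stem(tok) for tok in word_tokenize(text.lower(), language='spanish')
        if tok.isalpha() and tok not in stop_words
    )
)

tokenizer = AutoTokenizer.from_pretrained("it5/it5-base-news-summarization")
model = AutoModelForSeq2SeqLM.from_pretrained("it5/it5-base-news-summarization")

# The deleted script used the article body as both input and target;
# a real summarization set-up would use a separate summary column here.
inputs = tokenizer(df['cuerpo'].tolist(), max_length=512, padding='max_length',
                   truncation=True, return_tensors='pt')
targets = tokenizer(df['cuerpo'].tolist(), max_length=128, padding='max_length',
                    truncation=True, return_tensors='pt')
labels = targets['input_ids'].clone()
labels[labels == tokenizer.pad_token_id] = -100  # ignore padding positions in the loss

batch_size = 8  # assumption: 200 sequences of length 512 rarely fit in memory
num_epochs = 1
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
num_training_steps = (len(df) // batch_size) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0,
                                            num_training_steps=num_training_steps)

# Train the model
model.train()
for epoch in range(num_epochs):
    for i in range(0, len(df), batch_size):
        optimizer.zero_grad()
        # Passing labels lets the model build decoder_input_ids and compute the loss itself
        outputs = model(input_ids=inputs['input_ids'][i:i+batch_size],
                        attention_mask=inputs['attention_mask'][i:i+batch_size],
                        labels=labels[i:i+batch_size])
        outputs.loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

# Save the fine-tuned weights and the tokenizer together
model.save_pretrained('/Users/Juanes/Downloads/summarization_model')
tokenizer.save_pretrained('/Users/Juanes/Downloads/summarization_model')

For an actual summarization model, the second tokenizer call should encode a dedicated summary column rather than the body, and raw (unstemmed) text is usually a better input for a subword tokenizer than stemmed tokens.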