Ffftdtd5dtft committed
Commit 0885f1c (verified)
1 parent: 609fb80

Update app.py

Files changed (1): app.py (+71 -82)
app.py CHANGED
@@ -1,3 +1,4 @@
+!pip install torch==2.0.1 transformers==4.27.1 datasets==2.4.0 wget==3.2 huggingface-hub==0.14.1 beautifulsoup4==4.11.1 requests==2.28.1 matplotlib tqdm python-dotenv diffusers
 
 import os
 import torch
@@ -7,9 +8,8 @@ from torch.optim import AdamW
 import matplotlib.pyplot as plt
 import matplotlib.animation as animation
 import time
-import threading
 from tqdm import tqdm
-from transformers import AutoTokenizer, AutoModel, TrainingArguments, pipeline
+from transformers import AutoTokenizer, AutoModel, AutoModelForTextToWaveform, TrainingArguments
 from diffusers import DiffusionPipeline
 from huggingface_hub import login, HfApi, Repository
 from dotenv import load_dotenv
@@ -27,16 +27,15 @@ class UnifiedModel(nn.Module):
         hidden_states = []
         for model in self.models:
             if isinstance(model, nn.Module):
-                outputs = model(inputs)
+                outputs = model(**inputs)
                 hidden_states.append(outputs.last_hidden_state[:, 0, :])
-            elif isinstance(model, DiffusionPipeline) or isinstance(model, pipeline):
-                outputs = model(inputs)
-                hidden_states.append(torch.tensor(outputs))
+            elif isinstance(model, DiffusionPipeline):
+                outputs = model(**inputs)
+                hidden_states.append(torch.tensor(outputs).float())
         concatenated_hidden_states = torch.cat(hidden_states, dim=-1)
         logits = self.classifier(concatenated_hidden_states)
         return logits
 
-
 class SyntheticDataset(Dataset):
     def __init__(self, tokenizers, size=100):
         self.tokenizers = tokenizers
@@ -62,7 +61,6 @@ class SyntheticDataset(Dataset):
     def __getitem__(self, idx):
         return self.data[idx]
 
-
 def push_to_hub(local_dir, repo_name):
     try:
         repo_url = HfApi().create_repo(repo_name, exist_ok=True)
@@ -85,6 +83,10 @@ def push_to_hub(local_dir, repo_name):
     except Exception as e:
         print(f"Error pushing to Hugging Face Hub: {e}")
 
+def load_model(model_name):
+    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
+    model = AutoModel.from_pretrained(model_name)
+    return tokenizer, model
 
 def main():
     while True:
@@ -108,49 +110,25 @@ def main():
             "Falconsai/text_summarization",
             "microsoft/speecht5_tts",
             "Groq/Llama-3-Groq-70B-Tool-Use",
-            "Groq/Llama-3-Groq-8B-Tool-Use"
-        ]
-
-        # Initialize the pipelines
-        pipelines_to_unify = [
-            pipeline("text-to-audio", model="facebook/musicgen-melody"),
-            pipeline("text-to-audio", model="facebook/musicgen-large"),
-            pipeline("text-to-audio", model="facebook/musicgen-small"),
-            DiffusionPipeline.from_pretrained("stabilityai/stable-video-diffusion-img2vid-xt-1-1"),
-            pipeline("automatic-speech-recognition", model="openai/whisper-small"),
-            DiffusionPipeline.from_pretrained("black-forest-labs/FLUX.1-dev"),
-            DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1"),
-            DiffusionPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell"),
-            pipeline("text-generation", model="meta-llama/Meta-Llama-3.1-8B"),
-            pipeline("text-generation", model="openbmb/MiniCPM-V-2_6"),
-            pipeline("text-generation", model="bigcode/starcoder"),
-            pipeline("text-to-speech", model="microsoft/speecht5_tts"),
-            pipeline("text-generation", model="WizardLMTeam/WizardCoder-Python-34B-V1.0"),
-            pipeline("text-generation", model="Qwen/Qwen2-72B-Instruct"),
-            pipeline("text-generation", model="google/gemma-2-2b-it"),
-            pipeline("summarization", model="facebook/bart-large-cnn"),
-            pipeline("summarization", model="Falconsai/text_summarization"),
-            DiffusionPipeline.from_pretrained("black-forest-labs/FLUX.1-dev"),
-            pipeline("text-to-audio", model="facebook/musicgen-small"),
-            pipeline("text-generation", model="Groq/Llama-3-Groq-70B-Tool-Use"),
-            pipeline("text-generation", model="Groq/Llama-3-Groq-8B-Tool-Use")
+            "Groq/Llama-3-Groq-8B-Tool-Use",
+            "facebook/musicgen-large",
+            "facebook/musicgen-melody",
+            "black-forest-labs/FLUX.1-schnell",
+            "facebook/musicgen-small",
+            "stabilityai/stable-video-diffusion-img2vid-xt-1-1",
+            "openai/whisper-small",
+            "black-forest-labs/FLUX.1-dev",
+            "stabilityai/stable-diffusion-2-1"
         ]
 
+        # Initialize the models and tokenizers
         tokenizers = {}
         models = []
         for model_name in models_to_train:
-            tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
-
-            if tokenizer.pad_token is None:
-                tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
-
-            model = AutoModel.from_pretrained(model_name)
+            tokenizer, model = load_model(model_name)
             tokenizers[model_name] = tokenizer
             models.append(model)
 
-        # Add the pipelines as models
-        models.extend(pipelines_to_unify)
-
         # Create a synthetic dataset for training and evaluation
         synthetic_dataset = SyntheticDataset(tokenizers, size=100)
 
@@ -163,7 +141,7 @@ def main():
         train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
         eval_loader = DataLoader(val_dataset, batch_size=16)
 
-        # Unify the models and pipelines into a single one
+        # Unify the models into a single one
        unified_model = UnifiedModel(models)
        unified_model.to(torch.device("cpu"))
 
@@ -193,45 +171,41 @@ def main():
         def train(model, train_loader, eval_loader, args):
             model.train()
             epoch = 0
-            total_steps = args.num_train_epochs * len(train_loader)
-            progress_bar = tqdm(total=total_steps, desc="Training")
-
-            while epoch < args.num_train_epochs:
+            total_steps = len(train_loader)
+            for step, batch in enumerate(train_loader):
                 start_time = time.time()
-                for step, batch in enumerate(train_loader):
+                input_ids = [batch[f"input_ids_{name}"].to("cpu") for name in tokenizers.keys()]
+                attention_mask = [batch[f"attention_mask_{name}"].to("cpu") for name in tokenizers.keys()]
+                labels = batch["label"].to("cpu")
+                optimizer.zero_grad()
+                outputs = model(input_ids)
+                loss = nn.CrossEntropyLoss()(outputs, labels)
+                loss.backward()
+                optimizer.step()
+
+                elapsed_time = time.time() - start_time
+                estimated_total_time = total_steps * (elapsed_time / (step + 1))
+                estimated_remaining_time = estimated_total_time - elapsed_time
+
+                if step % args.logging_steps == 0:
+                    train_losses.append(loss.item())
+                    print(f"Step {step}/{total_steps}, Loss: {loss.item()}, Estimated remaining time: {estimated_remaining_time:.2f} seconds")
+
+            epoch += 1
+            model.eval()
+            eval_loss = 0
+            with torch.no_grad():
+                for batch in eval_loader:
                     input_ids = [batch[f"input_ids_{name}"].to("cpu") for name in tokenizers.keys()]
                     attention_mask = [batch[f"attention_mask_{name}"].to("cpu") for name in tokenizers.keys()]
                     labels = batch["label"].to("cpu")
-                    optimizer.zero_grad()
                     outputs = model(input_ids)
                     loss = nn.CrossEntropyLoss()(outputs, labels)
-                    loss.backward()
-                    optimizer.step()
-                    progress_bar.update(1)
-
-                    elapsed_time = time.time() - start_time
-                    estimated_total_time = total_steps * (elapsed_time / (step + 1))
-                    estimated_remaining_time = estimated_total_time - elapsed_time
-
-                    if step % args.logging_steps == 0:
-                        train_losses.append(loss.item())
-                        print(f"Step {step}/{total_steps}, Loss: {loss.item()}, Estimated remaining time: {estimated_remaining_time:.2f} seconds")
-
-                epoch += 1
-                model.eval()
-                eval_loss = 0
-                with torch.no_grad():
-                    for batch in eval_loader:
-                        input_ids = [batch[f"input_ids_{name}"].to("cpu") for name in tokenizers.keys()]
-                        attention_mask = [batch[f"attention_mask_{name}"].to("cpu") for name in tokenizers.keys()]
-                        labels = batch["label"].to("cpu")
-                        outputs = model(input_ids)
-                        loss = nn.CrossEntropyLoss()(outputs, labels)
-                        eval_loss += loss.item()
-
-                eval_loss /= len(eval_loader)
-                eval_losses.append(eval_loss)
-                print(f"Epoch {epoch}/{args.num_train_epochs}, Evaluation Loss: {eval_loss}")
+                    eval_loss += loss.item()
+
+            eval_loss /= len(eval_loader)
+            eval_losses.append(eval_loss)
+            print(f"Epoch {epoch}/{args.num_train_epochs}, Evaluation Loss: {eval_loss}")
 
         train(unified_model, train_loader, eval_loader, training_args)
 
@@ -239,19 +213,34 @@ def main():
         fig, ax = plt.subplots()
         ax.set_xlabel("Epochs")
         ax.set_ylabel("Loss")
+        ax.plot(train_losses, label="Training Loss")
+        ax.plot(eval_losses, label="Evaluation Loss")
         ax.legend()
 
         def animate(i):
             ax.clear()
-            ax.plot(train_losses[:i], label="Train Loss")
-            ax.plot(eval_losses[:i], label="Eval Loss")
+            ax.plot(train_losses, label="Training Loss")
+            ax.plot(eval_losses, label="Evaluation Loss")
+            ax.set_xlabel("Epochs")
+            ax.set_ylabel("Loss")
             ax.legend()
 
-        ani = animation.FuncAnimation(fig, animate, frames=len(train_losses), blit=False)
+        ani = animation.FuncAnimation(fig, animate, interval=1000)
        plt.show()
 
-        # Upload the unified model to the Hugging Face Hub
+        # Save the unified model and tokenizer
+        if not os.path.exists("./outputs/unified_model"):
+            os.makedirs("./outputs/unified_model")
+
+        # Save the unified model to a local directory
         local_dir = "./outputs/unified_model"
+        torch.save(unified_model.state_dict(), os.path.join(local_dir, "pytorch_model.bin"))
+
+        # Save the tokenizer to a local directory
+        for name, tokenizer in tokenizers.items():
+            tokenizer.save_pretrained(local_dir)
+
+        # Upload the model and tokenizer to Hugging Face
         push_to_hub(local_dir, repo_name="Ffftdtd5dtft/my_model")
 
         break
@@ -260,4 +249,4 @@ def main():
             time.sleep(2)
 
 if __name__ == "__main__":
-    main()
+    main()
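
For context, the sketch below (not part of the commit) shows the pattern the new code converges on: a load_model helper that returns a (tokenizer, model) pair, and a module that concatenates each encoder's [CLS] hidden state before a single linear classifier, as in UnifiedModel.forward. The encoder names, the TinyUnifiedModel class, and num_labels are illustrative placeholders, since UnifiedModel.__init__ does not appear in this diff.

    # Minimal, self-contained sketch of the load-and-unify pattern used above.
    # Model names and sizes are placeholders chosen so the example stays small.
    import torch
    import torch.nn as nn
    from transformers import AutoTokenizer, AutoModel


    def load_model(model_name):
        # Same shape as the helper added in this commit: one call returns both pieces.
        tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
        model = AutoModel.from_pretrained(model_name)
        return tokenizer, model


    class TinyUnifiedModel(nn.Module):
        """Concatenates each encoder's [CLS] hidden state and classifies the result."""

        def __init__(self, models, num_labels=2):
            super().__init__()
            self.models = nn.ModuleList(models)
            hidden = sum(m.config.hidden_size for m in models)
            self.classifier = nn.Linear(hidden, num_labels)

        def forward(self, inputs_per_model):
            # inputs_per_model: one tokenized batch (dict of tensors) per sub-model.
            states = []
            for model, inputs in zip(self.models, inputs_per_model):
                outputs = model(**inputs)
                states.append(outputs.last_hidden_state[:, 0, :])  # [CLS] vector
            return self.classifier(torch.cat(states, dim=-1))


    if __name__ == "__main__":
        names = ["distilbert-base-uncased", "bert-base-uncased"]  # placeholder encoders
        pairs = [load_model(n) for n in names]
        tokenizers = [tok for tok, _ in pairs]
        unified = TinyUnifiedModel([m for _, m in pairs])
        batch = ["hello world"]
        inputs = [tok(batch, return_tensors="pt", padding=True) for tok in tokenizers]
        print(unified(inputs).shape)  # torch.Size([1, 2])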