Ffftdtd5dtft committed
Commit 0f78a11 · verified · 1 Parent(s): 20125f3

Update app.py

Files changed (1)
  1. app.py +93 -99
app.py CHANGED
@@ -1,3 +1,5 @@
+!pip install torch==2.0.1 transformers==4.27.1 datasets==2.4.0 wget==3.2 huggingface-hub==0.14.1 beautifulsoup4==4.11.1 requests==2.28.1 matplotlib tqdm python-dotenv diffusers gradio
+
 import os
 import torch
 import torch.nn as nn
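Note: the "!pip install ..." line added at the top of app.py is IPython/Jupyter syntax and is a SyntaxError in a plain Python script. A minimal in-script alternative, assuming runtime installation is wanted at all (a requirements.txt would be the more conventional route; pip_install is a hypothetical helper, not part of the app):

import subprocess
import sys

def pip_install(*packages):
    # Install into the same environment this interpreter is running from.
    subprocess.check_call([sys.executable, "-m", "pip", "install", *packages])

pip_install("torch==2.0.1", "transformers==4.27.1", "gradio")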
@@ -6,12 +8,12 @@ from torch.optim import AdamW
 import matplotlib.pyplot as plt
 import matplotlib.animation as animation
 import time
-import threading
 from tqdm import tqdm
-from transformers import AutoTokenizer, AutoModel, AutoModelForTextToWaveform, TrainingArguments, pipeline
+from transformers import AutoTokenizer, AutoModel, AutoModelForTextToWaveform, TrainingArguments
 from diffusers import DiffusionPipeline
 from huggingface_hub import login, HfApi, Repository
 from dotenv import load_dotenv
+import gradio as gr
 
 # Load environment variables
 load_dotenv()
@@ -26,16 +28,15 @@ class UnifiedModel(nn.Module):
         hidden_states = []
         for model in self.models:
             if isinstance(model, nn.Module):
-                outputs = model(inputs)
+                outputs = model(**inputs)
                 hidden_states.append(outputs.last_hidden_state[:, 0, :])
-            elif isinstance(model, DiffusionPipeline) or isinstance(model, pipeline):
-                outputs = model(inputs)
-                hidden_states.append(torch.tensor(outputs))
+            elif isinstance(model, DiffusionPipeline):
+                outputs = model(**inputs)
+                hidden_states.append(torch.tensor(outputs).float())
         concatenated_hidden_states = torch.cat(hidden_states, dim=-1)
         logits = self.classifier(concatenated_hidden_states)
         return logits
 
-
 class SyntheticDataset(Dataset):
     def __init__(self, tokenizers, size=100):
         self.tokenizers = tokenizers
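Review note on forward(): torch.cat along dim=-1 requires the classifier's input width to equal the sum of the sub-models' hidden sizes, and torch.tensor(outputs) will fail for typical DiffusionPipeline outputs, which are output dataclasses rather than arrays. A self-contained sketch of the fusion pattern, with hypothetical toy encoders standing in for the Hugging Face models and a per-model projection so the concatenated width stays fixed:

import torch
import torch.nn as nn

class ToyEncoder(nn.Module):
    # Hypothetical stand-in for a pretrained encoder.
    def __init__(self, vocab=100, hidden=32):
        super().__init__()
        self.emb = nn.Embedding(vocab, hidden)

    def forward(self, input_ids):
        return self.emb(input_ids)  # (batch, seq, hidden)

class FusionClassifier(nn.Module):
    def __init__(self, encoders, shared=16, num_labels=2):
        super().__init__()
        self.encoders = nn.ModuleList(encoders)
        self.proj = nn.ModuleList(
            [nn.Linear(e.emb.embedding_dim, shared) for e in encoders]
        )
        self.classifier = nn.Linear(shared * len(encoders), num_labels)

    def forward(self, input_ids):
        # First-token vector from each encoder, projected, then concatenated.
        feats = [p(e(input_ids)[:, 0, :]) for e, p in zip(self.encoders, self.proj)]
        return self.classifier(torch.cat(feats, dim=-1))

model = FusionClassifier([ToyEncoder(hidden=32), ToyEncoder(hidden=48)])
logits = model(torch.randint(0, 100, (2, 5)))  # shape (2, 2)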
@@ -61,7 +62,6 @@ class SyntheticDataset(Dataset):
     def __getitem__(self, idx):
         return self.data[idx]
 
-
 def push_to_hub(local_dir, repo_name):
     try:
         repo_url = HfApi().create_repo(repo_name, exist_ok=True)
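For reference, Repository drives a local git clone; newer huggingface_hub releases expose HfApi.upload_folder, which pushes a directory without cloning. A hedged sketch of the same push (push_folder_to_hub is a hypothetical name; assumes an HF_TOKEN environment variable, mirroring the error handling above):

import os
from huggingface_hub import HfApi

def push_folder_to_hub(local_dir, repo_name):
    # Sketch only: create the repo if needed, then upload the directory.
    try:
        api = HfApi(token=os.getenv("HF_TOKEN"))
        api.create_repo(repo_name, exist_ok=True)
        api.upload_folder(folder_path=local_dir, repo_id=repo_name)
    except Exception as e:
        print(f"Error pushing to Hugging Face Hub: {e}")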
@@ -84,6 +84,55 @@ def push_to_hub(local_dir, repo_name):
     except Exception as e:
         print(f"Error pushing to Hugging Face Hub: {e}")
 
+def load_model(model_name):
+    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
+    model = AutoModel.from_pretrained(model_name)
+    return tokenizer, model
+
+def train(model, train_loader, eval_loader, args):
+    model.train()
+    epoch = 0
+    total_steps = len(train_loader)
+    for step, batch in enumerate(train_loader):
+        start_time = time.time()
+        input_ids = [batch[f"input_ids_{name}"].to("cpu") for name in tokenizers.keys()]
+        attention_mask = [batch[f"attention_mask_{name}"].to("cpu") for name in tokenizers.keys()]
+        labels = batch["label"].to("cpu")
+        optimizer.zero_grad()
+        outputs = model(input_ids)
+        loss = nn.CrossEntropyLoss()(outputs, labels)
+        loss.backward()
+        optimizer.step()
+
+        elapsed_time = time.time() - start_time
+        estimated_total_time = total_steps * (elapsed_time / (step + 1))
+        estimated_remaining_time = estimated_total_time - elapsed_time
+
+        if step % args.logging_steps == 0:
+            train_losses.append(loss.item())
+            print(f"Step {step}/{total_steps}, Loss: {loss.item()}, Estimated remaining time: {estimated_remaining_time:.2f} seconds")
+
+    epoch += 1
+    model.eval()
+    eval_loss = 0
+    with torch.no_grad():
+        for batch in eval_loader:
+            input_ids = [batch[f"input_ids_{name}"].to("cpu") for name in tokenizers.keys()]
+            attention_mask = [batch[f"attention_mask_{name}"].to("cpu") for name in tokenizers.keys()]
+            labels = batch["label"].to("cpu")
+            outputs = model(input_ids)
+            loss = nn.CrossEntropyLoss()(outputs, labels)
+            eval_loss += loss.item()
+
+    eval_loss /= len(eval_loader)
+    eval_losses.append(eval_loss)
+    print(f"Epoch {epoch}/{args.num_train_epochs}, Evaluation Loss: {eval_loss}")
+
+def gradio_interface(input_text):
+    # Define the Gradio interface function
+    tokenized_inputs = {name: tokenizer.encode(input_text, return_tensors="pt") for name, tokenizer in tokenizers.items()}
+    model_output = unified_model(tokenized_inputs)
+    return model_output
 
 def main():
     while True:
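Review note: the new module-level train() closes over tokenizers, optimizer, train_losses and eval_losses, which only exist inside main(), so it raises NameError unless called from exactly that scope; gradio_interface similarly depends on main()'s unified_model. A self-contained sketch of the same train/eval pattern with dependencies passed explicitly (run_epoch and the toy data are illustrative, not the app's API):

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

def run_epoch(model, loader, loss_fn, optimizer=None):
    # Train when an optimizer is given; otherwise evaluate without gradients.
    training = optimizer is not None
    model.train(training)
    total = 0.0
    with torch.set_grad_enabled(training):
        for inputs, labels in loader:
            outputs = model(inputs)
            loss = loss_fn(outputs, labels)
            if training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            total += loss.item()
    return total / len(loader)

model = nn.Linear(4, 2)
data = TensorDataset(torch.randn(32, 4), torch.randint(0, 2, (32,)))
loader = DataLoader(data, batch_size=8)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
for epoch in range(2):
    train_loss = run_epoch(model, loader, nn.CrossEntropyLoss(), optimizer)
    eval_loss = run_epoch(model, loader, nn.CrossEntropyLoss())
    print(f"epoch {epoch}: train {train_loss:.4f}, eval {eval_loss:.4f}")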
@@ -107,56 +156,22 @@ def main():
                 "Falconsai/text_summarization",
                 "microsoft/speecht5_tts",
                 "Groq/Llama-3-Groq-70B-Tool-Use",
-                "Groq/Llama-3-Groq-8B-Tool-Use"
-            ]
-
-            # Initialize the pipelines
-            pipelines_to_unify = [
-                pipeline("text-to-audio", model="facebook/musicgen-melody"),
-                pipeline("text-to-audio", model="facebook/musicgen-large"),
-                pipeline("text-to-audio", model="facebook/musicgen-small"),
-                DiffusionPipeline.from_pretrained("stabilityai/stable-video-diffusion-img2vid-xt-1-1"),
-                pipeline("automatic-speech-recognition", model="openai/whisper-small"),
-                DiffusionPipeline.from_pretrained("black-forest-labs/FLUX.1-dev"),
-                DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1"),
-                DiffusionPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell"),
-                pipeline("text-generation", model="meta-llama/Meta-Llama-3.1-8B"),
-                pipeline("text-generation", model="openbmb/MiniCPM-V-2_6"),
-                pipeline("text-generation", model="bigcode/starcoder"),
-                pipeline("text-to-speech", model="microsoft/speecht5_tts"),
-                pipeline("text-generation", model="WizardLMTeam/WizardCoder-Python-34B-V1.0"),
-                pipeline("text-generation", model="Qwen/Qwen2-72B-Instruct"),
-                pipeline("text-generation", model="google/gemma-2-2b-it"),
-                pipeline("summarization", model="facebook/bart-large-cnn"),
-                pipeline("summarization", model="Falconsai/text_summarization"),
-                DiffusionPipeline.from_pretrained("black-forest-labs/FLUX.1-dev"),
-                pipeline("text-to-audio", model="facebook/musicgen-small"),
-                pipeline("text-generation", model="Groq/Llama-3-Groq-70B-Tool-Use"),
-                pipeline("text-generation", model="Groq/Llama-3-Groq-8B-Tool-Use")
-            ]
-
-            # Add additional models
-            additional_models = [
+                "Groq/Llama-3-Groq-8B-Tool-Use",
                 "facebook/musicgen-large",
-                "facebook/musicgen-melody"
+                "facebook/musicgen-melody",
+                "black-forest-labs/FLUX.1-schnell",
+                "facebook/musicgen-small",
+                "stabilityai/stable-video-diffusion-img2vid-xt-1-1",
+                "openai/whisper-small",
+                "black-forest-labs/FLUX.1-dev",
+                "stabilityai/stable-diffusion-2-1"
             ]
 
-            # Initialize the tokenizers and models
+            # Initialize the models and tokenizers
            tokenizers = {}
            models = []
            for model_name in models_to_train:
-                tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
-
-                if tokenizer.pad_token is None:
-                    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
-
-                model = AutoModel.from_pretrained(model_name)
-                tokenizers[model_name] = tokenizer
-                models.append(model)
-
-            for model_name in additional_models:
-                tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
-                model = AutoModelForTextToWaveform.from_pretrained(model_name)
+                tokenizer, model = load_model(model_name)
                 tokenizers[model_name] = tokenizer
                 models.append(model)
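Worth flagging: several ids now in models_to_train (the FLUX.1, Stable Diffusion, and Stable Video Diffusion checkpoints) are diffusers repos, which AutoTokenizer/AutoModel cannot load, so load_model will raise on them. A hedged dispatching sketch; the set of diffusion repo ids below is an assumption read off the list above:

from transformers import AutoTokenizer, AutoModel
from diffusers import DiffusionPipeline

DIFFUSION_REPOS = {
    "black-forest-labs/FLUX.1-schnell",
    "black-forest-labs/FLUX.1-dev",
    "stabilityai/stable-diffusion-2-1",
    "stabilityai/stable-video-diffusion-img2vid-xt-1-1",
}

def load_model(model_name):
    if model_name in DIFFUSION_REPOS:
        # Diffusion checkpoints ship no tokenizer usable by AutoTokenizer.
        return None, DiffusionPipeline.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    model = AutoModel.from_pretrained(model_name)
    return tokenizer, model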
 
@@ -172,7 +187,7 @@ def main():
             train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
             eval_loader = DataLoader(val_dataset, batch_size=16)
 
-            # Unify the models and pipelines into one
+            # Unify the models into one
             unified_model = UnifiedModel(models)
             unified_model.to(torch.device("cpu"))
 
@@ -199,68 +214,47 @@ def main():
             train_losses = []
             eval_losses = []
 
-            def train(model, train_loader, eval_loader, args):
-                model.train()
-                epoch = 0
-                total_steps = len(train_loader)
-                for step, batch in enumerate(train_loader):
-                    start_time = time.time()
-                    input_ids = [batch[f"input_ids_{name}"].to("cpu") for name in tokenizers.keys()]
-                    attention_mask = [batch[f"attention_mask_{name}"].to("cpu") for name in tokenizers.keys()]
-                    labels = batch["label"].to("cpu")
-                    optimizer.zero_grad()
-                    outputs = model(input_ids)
-                    loss = nn.CrossEntropyLoss()(outputs, labels)
-                    loss.backward()
-                    optimizer.step()
-
-                    progress_bar.update(1)
-
-                    elapsed_time = time.time() - start_time
-                    estimated_total_time = total_steps * (elapsed_time / (step + 1))
-                    estimated_remaining_time = estimated_total_time - elapsed_time
-
-                    if step % args.logging_steps == 0:
-                        train_losses.append(loss.item())
-                        print(f"Step {step}/{total_steps}, Loss: {loss.item()}, Estimated remaining time: {estimated_remaining_time:.2f} seconds")
-
-                epoch += 1
-                model.eval()
-                eval_loss = 0
-                with torch.no_grad():
-                    for batch in eval_loader:
-                        input_ids = [batch[f"input_ids_{name}"].to("cpu") for name in tokenizers.keys()]
-                        attention_mask = [batch[f"attention_mask_{name}"].to("cpu") for name in tokenizers.keys()]
-                        labels = batch["label"].to("cpu")
-                        outputs = model(input_ids)
-                        loss = nn.CrossEntropyLoss()(outputs, labels)
-                        eval_loss += loss.item()
-
-                eval_loss /= len(eval_loader)
-                eval_losses.append(eval_loss)
-                print(f"Epoch {epoch}/{args.num_train_epochs}, Evaluation Loss: {eval_loss}")
-
+            # Train the model
             train(unified_model, train_loader, eval_loader, training_args)
 
-            # Visualize losses during training
+            # Visualize losses
             fig, ax = plt.subplots()
             ax.set_xlabel("Epochs")
             ax.set_ylabel("Loss")
+            ax.plot(train_losses, label="Training Loss")
+            ax.plot(eval_losses, label="Evaluation Loss")
             ax.legend()
 
             def animate(i):
                 ax.clear()
-                ax.plot(train_losses[:i], label="Train Loss")
-                ax.plot(eval_losses[:i], label="Eval Loss")
+                ax.plot(train_losses, label="Training Loss")
+                ax.plot(eval_losses, label="Evaluation Loss")
+                ax.set_xlabel("Epochs")
+                ax.set_ylabel("Loss")
                 ax.legend()
 
-            ani = animation.FuncAnimation(fig, animate, frames=len(train_losses), blit=False)
+            ani = animation.FuncAnimation(fig, animate, interval=1000)
             plt.show()
 
-            # Upload the unified model to the Hugging Face Hub
+            # Save the unified model and tokenizer
+            if not os.path.exists("./outputs/unified_model"):
+                os.makedirs("./outputs/unified_model")
+
+            # Save the unified model to a local directory
             local_dir = "./outputs/unified_model"
+            torch.save(unified_model.state_dict(), os.path.join(local_dir, "pytorch_model.bin"))
+
+            # Save the tokenizer to a local directory
+            for name, tokenizer in tokenizers.items():
+                tokenizer.save_pretrained(local_dir)
+
+            # Upload the model and tokenizer to Hugging Face
             push_to_hub(local_dir, repo_name="Ffftdtd5dtft/my_model")
 
+            # Configure and launch the Gradio interface
+            interface = gr.Interface(fn=gradio_interface, inputs="text", outputs="text")
+            interface.launch()
+
             break
         except Exception as e:
             print(f"Error: {e}")
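A short usage sketch for the saved artifact: the state dict only loads back into a UnifiedModel rebuilt with the same sub-models, and since gradio_interface returns a raw logits tensor, outputs="text" will only display its repr. Assuming UnifiedModel and the models list from app.py are in scope:

import torch

unified_model = UnifiedModel(models)  # same constituent models as at save time
state = torch.load("./outputs/unified_model/pytorch_model.bin", map_location="cpu")
unified_model.load_state_dict(state)
unified_model.eval()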
 