Spaces:

uruguayai
/

trainflux

Runtime error

App Files Community

uruguayai committed on Sep 8, 2024

Commit

5fadcb1

verified ·

1 Parent(s): 06b9137

Update app.py

Browse files

Files changed (1) hide show

app.py +13 -25

app.py CHANGED Viewed

@@ -55,13 +55,14 @@ def preprocess_images(examples):
         # Resize and convert to RGB
         image = image.convert("RGB").resize((512, 512))
         # Convert to numpy array and normalize
-        image = np.array(image).astype(np.float32) / 127.5 - 1.0
         # Ensure the image has the shape (3, height, width)
         return image.transpose(2, 0, 1)  # Change to channel-first format
     return {"pixel_values": [process_image(img) for img in examples["image"]]}
 # Load dataset from Hugging Face
 dataset_name = "uruguayai/montevideo"
 dataset_cache_file = os.path.join(cache_dir, "montevideo_dataset.pkl")
@@ -118,9 +119,13 @@ def train_step(state, batch, rng):
         pixel_values = jnp.array(batch["pixel_values"])
         batch_size = pixel_values.shape[0]
         # Generate random noise
         noise_rng, timestep_rng, latents_rng = jax.random.split(rng, 3)
-        noise = jax.random.normal(noise_rng, pixel_values.shape)
         # Sample random timesteps
         timesteps = jax.random.randint(
@@ -130,23 +135,23 @@ def train_step(state, batch, rng):
         # Create scheduler state
         scheduler_state = pipeline.scheduler.create_state()
-        # Add noise to images using the scheduler
-        noisy_images = pipeline.scheduler.add_noise(
             scheduler_state,
-            original_samples=pixel_values,
             noise=noise,
             timesteps=timesteps
         )
         # Generate random latents for text encoder
-        latents = jax.random.normal(latents_rng, (batch_size, pipeline.text_encoder.config.hidden_size))
         # Predict noise
         model_output = state.apply_fn.apply(
             {'params': params},
-            jnp.array(noisy_images),
             jnp.array(timesteps),
-            encoder_hidden_states=latents,
             train=True,
         )
@@ -172,22 +177,6 @@ num_epochs = 10
 batch_size = 4
 rng = jax.random.PRNGKey(0)
-# Debug print
-print("Processed dataset info:")
-print(processed_dataset)
-print("First batch:")
-first_batch = next(iter(processed_dataset.batch(batch_size)))
-print(f"Batch keys: {first_batch.keys()}")
-print(f"Type of pixel_values: {type(first_batch['pixel_values'])}")
-if isinstance(first_batch['pixel_values'], list):
-    print(f"Length of pixel_values list: {len(first_batch['pixel_values'])}")
-    if len(first_batch['pixel_values']) > 0:
-        print(f"Shape of first item in pixel_values: {np.array(first_batch['pixel_values'][0]).shape}")
-# Convert the list of pixel values to a numpy array
-first_batch['pixel_values'] = np.array(first_batch['pixel_values'])
-print(f"Pixel values shape after conversion: {first_batch['pixel_values'].shape}")
 for epoch in range(num_epochs):
     epoch_loss = 0
     num_batches = 0
@@ -202,7 +191,6 @@ for epoch in range(num_epochs):
     print(f"Epoch {epoch+1}/{num_epochs}, Average Loss: {avg_loss}")
 # Save the fine-tuned model
 output_dir = "/tmp/montevideo_fine_tuned_model"
 os.makedirs(output_dir, exist_ok=True)

         # Resize and convert to RGB
         image = image.convert("RGB").resize((512, 512))
         # Convert to numpy array and normalize
+        image = np.array(image).astype(np.float32) / 255.0
         # Ensure the image has the shape (3, height, width)
         return image.transpose(2, 0, 1)  # Change to channel-first format
     return {"pixel_values": [process_image(img) for img in examples["image"]]}
 # Load dataset from Hugging Face
 dataset_name = "uruguayai/montevideo"
 dataset_cache_file = os.path.join(cache_dir, "montevideo_dataset.pkl")
         pixel_values = jnp.array(batch["pixel_values"])
         batch_size = pixel_values.shape[0]
+        # Encode images to latent space
+        latents = pipeline.vae.encode(pixel_values).latent_dist.sample(rng)
+        latents = latents * pipeline.vae.config.scaling_factor
         # Generate random noise
         noise_rng, timestep_rng, latents_rng = jax.random.split(rng, 3)
+        noise = jax.random.normal(noise_rng, latents.shape)
         # Sample random timesteps
         timesteps = jax.random.randint(
         # Create scheduler state
         scheduler_state = pipeline.scheduler.create_state()
+        # Add noise to latents using the scheduler
+        noisy_latents = pipeline.scheduler.add_noise(
             scheduler_state,
+            original_samples=latents,
             noise=noise,
             timesteps=timesteps
         )
         # Generate random latents for text encoder
+        encoder_hidden_states = jax.random.normal(latents_rng, (batch_size, pipeline.text_encoder.config.hidden_size))
         # Predict noise
         model_output = state.apply_fn.apply(
             {'params': params},
+            jnp.array(noisy_latents),
             jnp.array(timesteps),
+            encoder_hidden_states=encoder_hidden_states,
             train=True,
         )
 batch_size = 4
 rng = jax.random.PRNGKey(0)
 for epoch in range(num_epochs):
     epoch_loss = 0
     num_batches = 0
     print(f"Epoch {epoch+1}/{num_epochs}, Average Loss: {avg_loss}")
 # Save the fine-tuned model
 output_dir = "/tmp/montevideo_fine_tuned_model"
 os.makedirs(output_dir, exist_ok=True)