Spaces:

tournas
/

storytelling_assistant

Running

App Files Files Community

tournas committed on Feb 15

Commit

483fc16

verified ·

1 Parent(s): 86ad5b0

Update app.py

Browse files

Files changed (1) hide show

app.py +28 -80

app.py CHANGED Viewed

@@ -1,5 +1,4 @@
 import os
-import uuid
 import gradio as gr
 import torch
 import nltk
@@ -9,125 +8,78 @@ from diffusers import StableDiffusionPipeline
 from ultralytics import YOLO
 from gtts import gTTS
 from PIL import Image
 from nltk.tokenize import sent_tokenize
 import spaces
-# Set device (use GPU if available, but don't initialize CUDA here)
 device = "cuda" if torch.cuda.is_available() else "cpu"
-# Load environment variables
 api_key = os.getenv("OPENAI_API_KEY")
 if not api_key:
-    raise ValueError("⚠️ OpenAI API Key is missing! Add it as a Secret in Hugging Face Spaces.")
-# Initialize OpenAI client
 client = OpenAI(api_key=api_key)
-# Download NLTK data
 nltk.download("punkt")
-@spaces.GPU
-# Lazy-load models to avoid initializing CUDA in the main process
-def load_yolo_model():
-    return YOLO("yolov8s.pt")
-def load_stable_diffusion():
-    return StableDiffusionPipeline.from_pretrained(
-        "runwayml/stable-diffusion-v1-5",
-        torch_dtype=torch.float16 if device == "cuda" else torch.float32
-    ).to(device)
-def load_summarizer():
-    return pipeline("summarization", model="facebook/bart-large-cnn", device=0 if device == "cuda" else -1)
-# Function to detect objects in an image
-def detect_objects(image_path, yolo_model):
     results = yolo_model(image_path)
     detected_objects = []
     for r in results:
         for box in r.boxes:
-            class_id = int(box.cls.item())
             label = yolo_model.names[class_id]
             detected_objects.append(label)
     return detected_objects
-# Function to generate a story based on detected objects
 def generate_story(detected_objects):
     story_prompt = f"Write a short story based on the following objects: {', '.join(detected_objects)}"
-    response = client.chat.completions.create(
-        model="gpt-4",  # Use GPT-4 or GPT-3.5-turbo
         messages=[{"role": "user", "content": story_prompt}],
         max_tokens=200
     )
-    return response.choices[0].message.content.strip()
-# Function to summarize the story into scenes
-def summarize_story(story, summarizer):
     summary = summarizer(story, max_length=100, do_sample=False)[0]['summary_text']
     scenes = sent_tokenize(summary)
     return scenes
-# Function to generate images for each scene
-def generate_images(story, stable_diffusion):
     scenes = summarize_story(story)
     prompts = [f"Highly detailed, cinematic scene: {scene}, digital art, 4K, realistic lighting" for scene in scenes]
     images = []
     for prompt in prompts:
-        image = stable_diffusion(prompt=prompt).images[0]
         images.append(image)
     return images
-# Function to convert text to speech
 def text_to_speech(story):
     tts = gTTS(text=story, lang="en", slow=False)
-    audio_file_path = f"story_audio_{uuid.uuid4().hex}.mp3"  # Unique filename
     tts.save(audio_file_path)
     return audio_file_path
-# Main pipeline function
 def full_pipeline(image):
-    try:
-        # Save the image with a unique filename
-        image_path = f"temp_{uuid.uuid4().hex}.jpg"
-        image.save(image_path)
-        # Lazy-load models
-        yolo_model = load_yolo_model()
-        stable_diffusion = load_stable_diffusion()
-        summarizer = load_summarizer()
-        # Detect objects in the image
-        detected_objects = detect_objects(image_path, yolo_model)
-        if not detected_objects:
-            return "No objects detected. Please upload a different image.", "", [], None
-        # Generate a story based on detected objects
-        story = generate_story(detected_objects)
-        if not story:
-            return "Failed to generate a story. Please try again.", "", [], None
-        # Summarize the story into scenes
-        scenes = summarize_story(story, summarizer)
-        if not scenes:
-            return story, "No scenes extracted.", [], None
-        # Generate images for each scene
-        images = generate_images(story, stable_diffusion)
-        if not images:
-            return story, "\n".join(scenes), [], None
-        # Convert the story to audio
-        audio = text_to_speech(story)
-        if not audio:
-            return story, "\n".join(scenes), images, None
-        # Return all outputs
-        return story, "\n".join(scenes), images, audio
-    except Exception as e:
-        return f"An error occurred: {str(e)}", "", [], None
-# Gradio UI with queue for long-running tasks
 demo = gr.Interface(
     fn=full_pipeline,
     inputs=gr.Image(type="pil"),
@@ -141,9 +93,5 @@ demo = gr.Interface(
     description="Upload an image, and the AI will detect objects, generate a story, create images, and narrate the story."
 )
-# Enable queue for long-running tasks
-demo.queue()
-# Launch the app
 if __name__ == "__main__":
-    demo.launch()

 import os
 import gradio as gr
 import torch
 import nltk
 from ultralytics import YOLO
 from gtts import gTTS
 from PIL import Image
+import numpy as np
 from nltk.tokenize import sent_tokenize
+from IPython.display import Audio
 import spaces
 # Run on the GPU when torch can see one, otherwise fall back to the CPU.
 device = "cuda" if torch.cuda.is_available() else "cpu"
 # The OpenAI key comes from the environment; fail at import time instead of
 # on the first request so a missing secret is obvious in the startup logs.
 api_key = os.getenv("OPENAI_API_KEY")
 if not api_key:
     raise ValueError("\u26a0\ufe0f OpenAI API Key is missing! Add it as a Secret in Hugging Face Spaces.")
 client = OpenAI(api_key=api_key)
 # Models are loaded once at import time and shared by every request.
 yolo_model = YOLO("yolov8s.pt")
 stable_diffusion = StableDiffusionPipeline.from_pretrained(
     "runwayml/stable-diffusion-v1-5",
     # Half precision on GPU roughly halves memory use; CPU inference needs
     # float32.
     torch_dtype=torch.float16 if device == "cuda" else torch.float32,
 )
 stable_diffusion.to(device)
 # sent_tokenize needs the Punkt sentence-tokenizer data. Newer NLTK releases
 # load it from the "punkt_tab" resource, older ones from "punkt"; request
 # both so either version finds what it needs.
 nltk.download("punkt")
 nltk.download("punkt_tab")
 summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=0 if device == "cuda" else -1)
@spaces.GPU
def detect_objects(image_path):
    """Run the YOLO model on an image and return the label of every box.

    Args:
        image_path: Image location handed straight to ``yolo_model``.

    Returns:
        A list with one label string per detected box, in detection order.
        Labels repeat when the same class is detected more than once.
    """
    return [
        # Each box carries its class id as a one-element tensor; map that id
        # to its name through the model's label table.
        yolo_model.names[int(box.cls.item())]
        for result in yolo_model(image_path)
        for box in result.boxes
    ]
 def generate_story(detected_objects):
     """Ask the OpenAI chat model for a short story about the given objects.

     Args:
         detected_objects: Iterable of object label strings; they are joined
             with ", " and embedded in the prompt.

     Returns:
         The generated story with surrounding whitespace stripped, or an
         empty string if the model returned no content.
     """
     story_prompt = f"Write a short story based on the following objects: {', '.join(detected_objects)}"
     # "gpt-4o-mini" is a chat model and the request carries `messages`, so it
     # has to go through the chat-completions endpoint. The legacy
     # `client.completions` endpoint expects `prompt` and exposes `.text`.
     response = client.chat.completions.create(
         model="gpt-4o-mini",
         messages=[{"role": "user", "content": story_prompt}],
         max_tokens=200,
     )
     content = response.choices[0].message.content
     # `content` can be None (e.g. a refused or empty reply); normalise to "".
     return (content or "").strip()
def summarize_story(story):
    """Condense a story and split the condensed text into scene sentences.

    Args:
        story: Full story text passed to the module-level ``summarizer``.

    Returns:
        The list of sentences produced by ``sent_tokenize`` on the summary.
    """
    # do_sample=False keeps decoding deterministic for a given story.
    summarizer_output = summarizer(story, max_length=100, do_sample=False)
    condensed = summarizer_output[0]['summary_text']
    return sent_tokenize(condensed)
# Stable Diffusion inference needs the GPU. On ZeroGPU Spaces a GPU is only
# attached while a @spaces.GPU-decorated function is running; elsewhere the
# decorator has no effect.
@spaces.GPU
def generate_images(story):
    """Generate one Stable Diffusion image per scene of the story.

    The story is summarised into scene sentences with ``summarize_story``
    and each sentence is wrapped in a fixed style prompt.

    Args:
        story: Full story text.

    Returns:
        A list of images, one per scene, in scene order (empty if the
        summary produced no sentences).
    """
    scenes = summarize_story(story)
    prompts = [f"Highly detailed, cinematic scene: {scene}, digital art, 4K, realistic lighting" for scene in scenes]
    # The pipeline returns a batch; with a single prompt the image is at [0].
    return [stable_diffusion(prompt=prompt).images[0] for prompt in prompts]
 def text_to_speech(story):
     """Narrate the story with gTTS and return the path of the MP3 file.

     Args:
         story: Text to be spoken (English).

     Returns:
         Path of a newly created ``.mp3`` file in the system temp directory.
         The file is not deleted here; the caller owns it.
     """
     import tempfile  # local import: only this function needs it

     tts = gTTS(text=story, lang="en", slow=False)
     # A fixed file name would be overwritten when two requests run at the
     # same time, so reserve a unique path per call. The handle is closed
     # before gTTS writes to the path.
     with tempfile.NamedTemporaryFile(prefix="story_audio_", suffix=".mp3", delete=False) as tmp:
         audio_file_path = tmp.name
     tts.save(audio_file_path)
     return audio_file_path
 def full_pipeline(image):
     """Run the whole image -> story -> scenes -> images -> audio pipeline.

     Args:
         image: The uploaded picture as a PIL image (the interface uses
             ``gr.Image(type="pil")``); ``None`` when nothing was uploaded.

     Returns:
         A tuple ``(story, scenes, images, audio)``: the story text, the list
         of scene sentences, the list of generated images and the path of
         the narration file.

     Raises:
         gr.Error: If no image was provided.
     """
     import tempfile  # local import: only this function needs it

     if image is None:
         raise gr.Error("Please upload an image first.")
     # Use a unique temp file per request; a fixed "input.jpg" would be
     # shared (and clobbered) by concurrent requests.
     with tempfile.NamedTemporaryFile(prefix="input_", suffix=".jpg", delete=False) as tmp:
         image_path = tmp.name
     try:
         # JPEG cannot store alpha or palette modes (common for PNG uploads),
         # so normalise to RGB before saving.
         image.convert("RGB").save(image_path)
         detected_objects = detect_objects(image_path)
     finally:
         # The file is only needed for detection; do not leave it behind.
         os.remove(image_path)
     story = generate_story(detected_objects)
     # NOTE(review): `scenes` is returned as a list; confirm the matching
     # output component (not visible here) accepts a list rather than text.
     scenes = summarize_story(story)
     images = generate_images(story)
     audio = text_to_speech(story)
     return story, scenes, images, audio
+# **Gradio UI**
 demo = gr.Interface(
     fn=full_pipeline,
     inputs=gr.Image(type="pil"),
     description="Upload an image, and the AI will detect objects, generate a story, create images, and narrate the story."
 )
 if __name__ == "__main__":
+    demo.launch()