Spaces:

tournas
/

storytelling_assistant

Running on Zero

App Files Files Community

tournas committed 23 days ago

Commit

80b7403

verified ·

1 Parent(s): 17a4149

Update app.py

Browse files

Files changed (1) hide show

app.py +61 -22

app.py CHANGED Viewed

@@ -1,59 +1,70 @@
 import os
 import gradio as gr
 import torch
 import nltk
 from openai import OpenAI
-from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
 from diffusers import StableDiffusionPipeline
 from ultralytics import YOLO
 from gtts import gTTS
 from PIL import Image
-import numpy as np
 from nltk.tokenize import sent_tokenize
-from IPython.display import Audio
 import spaces
 device = "cuda" if torch.cuda.is_available() else "cpu"
 api_key = os.getenv("OPENAI_API_KEY")
 if not api_key:
     raise ValueError("⚠️ OpenAI API Key is missing! Add it as a Secret in Hugging Face Spaces.")
 client = OpenAI(api_key=api_key)
 yolo_model = YOLO("yolov8s.pt")
-stable_diffusion = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
 stable_diffusion.to(device)
 nltk.download("punkt")
-summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device= 'cuda' if torch.cuda.is_available() else 'cpu')
 @spaces.GPU
 def detect_objects(image_path):
     results = yolo_model(image_path)
     detected_objects = []
     for r in results:
         for box in r.boxes:
-            class_id = int(box.cls[0])
             label = yolo_model.names[class_id]
             detected_objects.append(label)
     return detected_objects
 def generate_story(detected_objects):
     story_prompt = f"Write a short story based on the following objects: {', '.join(detected_objects)}"
     response = client.chat.completions.create(
-        model="gpt-4o-mini",
         messages=[{"role": "user", "content": story_prompt}],
         max_tokens=200
     )
     return response.choices[0].message.content.strip()
 def summarize_story(story):
     summary = summarizer(story, max_length=100, do_sample=False)[0]['summary_text']
     scenes = sent_tokenize(summary)
     return scenes
 def generate_images(story):
     scenes = summarize_story(story)
     prompts = [f"Highly detailed, cinematic scene: {scene}, digital art, 4K, realistic lighting" for scene in scenes]
@@ -63,24 +74,52 @@ def generate_images(story):
         images.append(image)
     return images
 def text_to_speech(story):
     tts = gTTS(text=story, lang="en", slow=False)
-    audio_file_path = "story_audio.mp3"
     tts.save(audio_file_path)
     return audio_file_path
 def full_pipeline(image):
-    image_path = "input.jpg"
-    image.save(image_path)
-    detected_objects = detect_objects(image_path)
-    story = generate_story(detected_objects)
-    scenes = summarize_story(story)
-    images = generate_images(story)
-    audio = text_to_speech(story)
-    return story, scenes, images, audio
-# **Gradio UI**
 demo = gr.Interface(
     fn=full_pipeline,
     inputs=gr.Image(type="pil"),
@@ -94,6 +133,6 @@ demo = gr.Interface(
     description="Upload an image, and the AI will detect objects, generate a story, create images, and narrate the story."
 )
 if __name__ == "__main__":
     demo.launch()

 import os
+import uuid
 import gradio as gr
 import torch
 import nltk
 from openai import OpenAI
+from transformers import pipeline
 from diffusers import StableDiffusionPipeline
 from ultralytics import YOLO
 from gtts import gTTS
 from PIL import Image
 from nltk.tokenize import sent_tokenize
 import spaces
# Set device (use GPU if available); every model below is placed on it.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load environment variables. The key is required by generate_story(), so
# fail at import time instead of on the first request.
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise ValueError("⚠️ OpenAI API Key is missing! Add it as a Secret in Hugging Face Spaces.")

# Initialize OpenAI client
client = OpenAI(api_key=api_key)

# Load YOLO model (used by detect_objects)
yolo_model = YOLO("yolov8s.pt")

# Load Stable Diffusion pipeline; half precision is only requested on GPU,
# the CPU fallback stays in float32.
stable_diffusion = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
)
stable_diffusion.to(device)

# Download NLTK sentence-tokenizer data for sent_tokenize().
# Older NLTK releases load "punkt"; newer ones look up "punkt_tab" instead
# and raise LookupError without it, so fetch both. nltk.download() reports an
# unknown package id by printing/returning False rather than raising, so the
# extra call is harmless on older installs.
nltk.download("punkt")
nltk.download("punkt_tab")

# Load summarization pipeline (transformers device index: 0 = first GPU, -1 = CPU)
summarizer = pipeline(
    "summarization",
    model="facebook/bart-large-cnn",
    device=0 if device == "cuda" else -1,
)
 @spaces.GPU
def detect_objects(image_path):
    """Detect objects in an image and return their class labels.

    Args:
        image_path: Image source handed unchanged to ``yolo_model``.

    Returns:
        A list with one label string per detected box, in the order the
        results and boxes are produced. Duplicates are kept; the list is
        empty when nothing is detected.
    """
    return [
        yolo_model.names[int(box.cls.item())]
        for result in yolo_model(image_path)
        for box in result.boxes
    ]
# Function to generate a story based on detected objects
def generate_story(detected_objects):
    """Ask the chat model for a short story built around the given objects.

    Args:
        detected_objects: Iterable of label strings; they are comma-joined
            into the prompt.

    Returns:
        The story text with surrounding whitespace stripped, or ``""`` when
        the response carries no choices or no text content.

    Raises:
        Whatever ``client.chat.completions.create`` raises; API errors are
        not handled here.
    """
    story_prompt = f"Write a short story based on the following objects: {', '.join(detected_objects)}"
    response = client.chat.completions.create(
        model="gpt-4",  # Use GPT-4 or GPT-3.5-turbo
        messages=[{"role": "user", "content": story_prompt}],
        max_tokens=200,
    )
    if not response.choices:
        return ""
    # message.content is optional in the response (e.g. refusals), so calling
    # .strip() on it directly can raise AttributeError on None.
    content = response.choices[0].message.content
    return content.strip() if content else ""
def summarize_story(story):
    """Summarize the story and split the summary into scenes.

    Args:
        story: Story text passed to the ``summarizer`` pipeline.

    Returns:
        The list produced by ``sent_tokenize`` on the summary text, i.e. one
        entry per sentence of the summary.
    """
    summarizer_output = summarizer(story, max_length=100, do_sample=False)
    summary_text = summarizer_output[0]['summary_text']
    return sent_tokenize(summary_text)
+# Function to generate images for each scene
 def generate_images(story):
     scenes = summarize_story(story)
     prompts = [f"Highly detailed, cinematic scene: {scene}, digital art, 4K, realistic lighting" for scene in scenes]
         images.append(image)
     return images
def text_to_speech(story):
    """Convert the story text to an MP3 file and return its path.

    Args:
        story: Text to be spoken (English, normal speed).

    Returns:
        Relative path of the newly written MP3 file. The name embeds a random
        UUID so separate calls do not overwrite each other's output.
    """
    speech = gTTS(text=story, lang="en", slow=False)
    unique_suffix = uuid.uuid4().hex
    audio_file_path = f"story_audio_{unique_suffix}.mp3"  # Unique filename
    speech.save(audio_file_path)
    return audio_file_path
# Main pipeline function
def full_pipeline(image):
    """Run detection, story, scenes, images and narration for one image.

    Args:
        image: Object exposing ``save(path)`` (the UI is configured with
            ``gr.Image(type="pil")``). ``None`` is accepted and reported as
            a user-facing message.

    Returns:
        A 4-tuple ``(story_or_message, scenes_text, images, audio_path)``.
        On any failure the first element carries a human-readable message
        and the remaining elements are ``""``, ``[]`` and ``None`` (or the
        partial results gathered so far).
    """
    # Guard explicitly: otherwise a missing image only surfaces as
    # "'NoneType' object has no attribute 'save'" from the handler below.
    if image is None:
        return "No image provided. Please upload an image.", "", [], None

    # Unique filename so concurrent requests do not overwrite each other.
    image_path = f"temp_{uuid.uuid4().hex}.jpg"
    try:
        image.save(image_path)

        # Detect objects in the image
        detected_objects = detect_objects(image_path)
        if not detected_objects:
            return "No objects detected. Please upload a different image.", "", [], None

        # Generate a story based on detected objects
        story = generate_story(detected_objects)
        if not story:
            return "Failed to generate a story. Please try again.", "", [], None

        # Summarize the story into scenes
        scenes = summarize_story(story)
        if not scenes:
            return story, "No scenes extracted.", [], None
        scenes_text = "\n".join(scenes)

        # Generate images for each scene
        images = generate_images(story)
        if not images:
            return story, scenes_text, [], None

        # Convert the story to audio; a falsy path is normalised to None
        audio = text_to_speech(story)
        return story, scenes_text, images, audio or None
    except Exception as e:
        # UI boundary: report the failure in the story field instead of crashing.
        return f"An error occurred: {str(e)}", "", [], None
    finally:
        # The temp image is only needed for detection; remove it on every
        # exit path so repeated requests do not fill the working directory.
        try:
            os.remove(image_path)
        except OSError:
            pass  # never created (save failed) or already gone
+# Gradio UI
 demo = gr.Interface(
     fn=full_pipeline,
     inputs=gr.Image(type="pil"),
     description="Upload an image, and the AI will detect objects, generate a story, create images, and narrate the story."
 )
# Launch the app only when this file is executed as a script
if __name__ == "__main__":
    demo.launch()