Spaces:

tournas
/

storytelling_assistant

Running

App Files Files Community

tournas committed on Mar 5

Commit

1aaa563

verified ·

1 Parent(s): 16a5d71

Update app.py

Browse files

Files changed (1) hide show

app.py +82 -49

app.py CHANGED Viewed

@@ -12,6 +12,7 @@ import numpy as np
 from nltk.tokenize import sent_tokenize
 import spaces
 device = "cuda" if torch.cuda.is_available() else "cpu"
 api_key = os.getenv("OPENAI_API_KEY")
@@ -20,17 +21,18 @@ if not api_key:
 client = OpenAI(api_key=api_key)
-yolo_model = YOLO("yolov8s.pt")
 stable_diffusion = StableDiffusionPipeline.from_pretrained(
     "runwayml/stable-diffusion-v1-5",
     torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
 ).to(device)
-stable_diffusion.vae.enable_tiling = False
-nltk.download("punkt")
 summarizer = pipeline(
     "summarization",
@@ -38,74 +40,105 @@ summarizer = pipeline(
 )
 def detect_objects(image):
-    yolo_model.to('cuda')
-    image_array = np.array(image)  # Μετατροπή PIL → NumPy
-    results = yolo_model(image_array)
-    detected_objects = []
-    for r in results:
-        for box in r.boxes:
-            class_id = int(box.cls.item())
-            label = yolo_model.names[class_id]
-            detected_objects.append(label)
-    return detected_objects
 def generate_story(detected_objects):
-    story_prompt = f"Write a short story based on the following objects: {', '.join(detected_objects)}"
-    response = client.chat.completions.create(
-        model="gpt-4o-mini",
-        messages=[{"role": "user", "content": story_prompt}],
-        max_tokens=200
-    )
-    return response.choices[0].message.content.strip()
 def summarize_story(story):
-    summary = summarizer(story, max_length=100, do_sample=False)[0]['summary_text']
-    scenes = sent_tokenize(summary)
-    return scenes
 def generate_images(story):
     scenes = summarize_story(story)
     images = []
-    # Περιορισμός σε μέγιστο 3 σκηνές για αποφυγή υπερφόρτωσης
-    scenes = scenes[:min(len(scenes), 3)]
     for prompt in scenes:
         try:
-            with torch.no_grad():  # Μειώνει τη χρήση μνήμης
-                prompt_text = f"Highly detailed, cinematic scene: {prompt}, digital art, 4K, realistic lighting"
-                # Προσθέτω παραμέτρους για καλύτερη διαχείριση μνήμης
                 image = stable_diffusion(
                     prompt_text,
-                    num_inference_steps=30,  # Μείωση από το προεπιλεγμένο 50
-                    guidance_scale=7.5
                 ).images[0]
                 images.append(image)
-                # Καθαρισμός μνήμης μετά από κάθε δημιουργία
                 if torch.cuda.is_available():
                     torch.cuda.empty_cache()
         except Exception as e:
-            print(f"Error generating image for scene: {e}")
-            # Συνέχισε με την επόμενη σκηνή σε περίπτωση σφάλματος
-            continue
-    return images
 def text_to_speech(story):
-    tts = gTTS(text=story, lang="en", slow=False)
-    audio_file_path = "story_audio.mp3"
-    tts.save(audio_file_path)
-    return audio_file_path
 @spaces.GPU
 def full_pipeline(image):
-    detected_objects = detect_objects(image)
-    story = generate_story(detected_objects)
-    scenes = summarize_story(story)
-    images = generate_images(story)
-    audio = text_to_speech(story)
-    return story, scenes, images, audio
 # **Gradio UI**
 demo = gr.Interface(

 from nltk.tokenize import sent_tokenize
 import spaces
+# Ensure minimal GPU usage
 device = "cuda" if torch.cuda.is_available() else "cpu"
 api_key = os.getenv("OPENAI_API_KEY")
 client = OpenAI(api_key=api_key)
 # Smallest YOLOv8 checkpoint, chosen to keep object detection cheap.
 yolo_model = YOLO("yolov8n.pt")
 # Stable Diffusion v1.5; half precision is used only when CUDA is available.
 stable_diffusion = StableDiffusionPipeline.from_pretrained(
     "runwayml/stable-diffusion-v1-5",
     torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
 ).to(device)
 # enable_tiling is a method of the VAE. Assigning True to the attribute only
 # shadowed that method and never switched tiled decoding on, so call it.
 stable_diffusion.vae.enable_tiling()
 # sent_tokenize needs the Punkt data. Newer NLTK releases load "punkt_tab"
 # rather than "punkt", so request both; an unavailable package makes
 # nltk.download report failure instead of raising.
 nltk.download("punkt", quiet=True)
 nltk.download("punkt_tab", quiet=True)
 summarizer = pipeline(
     "summarization",
 )
 def detect_objects(image):
     """Detect objects in an image and return their class labels.

     Args:
         image: Input picture. It is converted with ``np.array`` before being
             handed to the YOLO model (presumably a PIL image coming from the
             UI -- confirm against the interface definition).

     Returns:
         list[str]: Unique labels in first-detection order. The list is empty
         when nothing is detected, and ``["generic", "objects"]`` when
         detection raises.
     """
     try:
         # Keep the detector on the same device as the rest of the app.
         yolo_model.to(device)
         results = yolo_model(np.array(image))
         labels = [
             yolo_model.names[int(box.cls.item())]
             for result in results
             for box in result.boxes
         ]
         # dict.fromkeys removes duplicates while keeping order. The previous
         # list(set(...)) returned labels in arbitrary order, so the same
         # image could produce differently ordered story prompts per run.
         return list(dict.fromkeys(labels))
     except Exception as e:
         # Best effort: placeholder labels let the rest of the pipeline run.
         print(f"Object detection error: {e}")
         return ["generic", "objects"]
 def generate_story(detected_objects):
     """Request a short story about the given objects from the OpenAI chat API.

     Args:
         detected_objects: Iterable of object labels to build the story
             around. May be empty.

     Returns:
         str: The stripped story text, or a fixed one-sentence fallback story
         when the request (or prompt construction) fails.
     """
     try:
         objects = ", ".join(detected_objects)
         if objects:
             story_prompt = f"Write a concise, creative short story using these objects: {objects}"
         else:
             # Nothing was detected: do not send a prompt that ends in a
             # dangling "using these objects: ".
             story_prompt = "Write a concise, creative short story."
         response = client.chat.completions.create(
             model="gpt-3.5-turbo",
             messages=[{"role": "user", "content": story_prompt}],
             max_tokens=150  # Caps the length of the generated story.
         )
         return response.choices[0].message.content.strip()
     except Exception as e:
         # Best effort: a canned story keeps the downstream stages working.
         print(f"Story generation error: {e}")
         return "A mysterious tale of adventure and discovery."
 def summarize_story(story):
     """Condense a story into at most two scene sentences.

     Args:
         story: Story text passed to the summarization pipeline.

     Returns:
         list[str]: Up to two sentences taken from the summary, or two fixed
         placeholder scenes when summarization or sentence splitting raises.
     """
     try:
         outputs = summarizer(story, max_length=50, do_sample=False)
         sentences = sent_tokenize(outputs[0]['summary_text'])
     except Exception as e:
         print(f"Story summarization error: {e}")
         return ["A peaceful scene", "An exciting moment"]
     # Only the first two sentences are kept, bounding the images generated.
     return sentences[:2]
 def generate_images(story):
     """Render one small illustration per summarized scene of the story.

     Args:
         story: Full story text; it is reduced to scene sentences with
             ``summarize_story``.

     Returns:
         list: The generated images, or a single 256x256 light-gray
         placeholder image when no scene could be rendered.
     """
     scenes = summarize_story(story)
     images = []
     for prompt in scenes:
         try:
             with torch.no_grad():
                 prompt_text = f"Simple illustration: {prompt}, soft colors"
                 image = stable_diffusion(
                     prompt_text,
                     num_inference_steps=20,
                     guidance_scale=6.0,
                     height=256,
                     width=256
                 ).images[0]
             images.append(image)
         except Exception as e:
             # Best effort: report the failure and move on to the next scene.
             print(f"Image generation error: {e}")
         finally:
             # Release cached GPU memory after every attempt. Previously this
             # ran only on success, so a failed generation left the cache
             # untouched for the next scene.
             if torch.cuda.is_available():
                 torch.cuda.empty_cache()
     # Fall back to a placeholder so callers always receive at least one image.
     return images if images else [Image.new('RGB', (256, 256), color='lightgray')]
 def text_to_speech(story):
     """Synthesize English speech for the start of the story and save it as MP3.

     Args:
         story: Story text; only its first 500 characters are spoken.

     Returns:
         str | None: ``"story_audio.mp3"`` once the file has been written, or
         ``None`` when synthesis or saving raises.
     """
     audio_file_path = "story_audio.mp3"
     try:
         speech = gTTS(text=story[:500], lang="en", slow=False)
         speech.save(audio_file_path)
     except Exception as e:
         print(f"Text-to-speech error: {e}")
         return None
     return audio_file_path
 @spaces.GPU
 def full_pipeline(image):
     """Run detection, story writing, summarization, illustration and narration.

     Args:
         image: Input picture forwarded to ``detect_objects``.

     Returns:
         tuple: ``(story, scenes, images, audio)``. An empty story or scene
         list is replaced by placeholder text. If any stage raises, a fixed
         error tuple with a gray placeholder image and no audio is returned.
     """
     try:
         story = generate_story(detect_objects(image))
         scenes = summarize_story(story)
         images = generate_images(story)
         audio = text_to_speech(story)
     except Exception as e:
         # Last-resort guard so the caller always gets a well-formed tuple.
         print(f"Full pipeline error: {e}")
         placeholder = Image.new('RGB', (256, 256), color='lightgray')
         return (
             "An unexpected error occurred.",
             ["Something went wrong"],
             [placeholder],
             None
         )
     if not story:
         story = "A story could not be generated."
     if not scenes:
         scenes = ["Scene 1", "Scene 2"]
     return (story, scenes, images, audio)
 # **Gradio UI**
 demo = gr.Interface(