import base64
from io import BytesIO

import cv2
import numpy as np
import streamlit as st
import torch
from diffusers import AudioLDMPipeline, StableDiffusionPipeline
from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer

# Text-to-image pipeline, also reused frame-by-frame for the video section.
# NOTE: the original checkpoint id "runway/stable-diffusion-v2-1" does not exist on the
# Hugging Face Hub; "stabilityai/stable-diffusion-2-1" is assumed here instead.
pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1")

# Text-to-audio pipeline. Stable Diffusion cannot generate audio, so AudioLDM is assumed
# here as the intended diffusers audio model (a substitution for the original
# `pipe(text, audio=True)` call, which is not a valid API).
audio_pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm-s-full-v2")

# Load a BERT model for the chatbot section.
# NOTE: "bert-base-cased" has no fine-tuned classification head, so its predictions are
# essentially random until the model is trained on a downstream task.
config = AutoConfig.from_pretrained("bert-base-cased")
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", config=config)


def generate_audio(text):
    """Generate a mono waveform (float32, 16 kHz) from a text prompt."""
    audio = audio_pipe(text, num_inference_steps=10, audio_length_in_s=5.0).audios[0]
    audio = audio.astype(np.float32)
    audio = audio / np.max(np.abs(audio))  # normalize to [-1, 1]
    return audio


def generate_image(text):
    """Generate a single image from a text prompt.

    The pipeline already returns a PIL.Image, so no tensor conversion is needed.
    """
    return pipe(text).images[0]


def generate_video(text, frames=30, width=512, height=512):
    """Generate a short video by sampling one image per frame and writing them with OpenCV."""
    writer = cv2.VideoWriter(
        "output.avi", cv2.VideoWriter_fourcc(*"MJPG"), 10, (width, height)
    )
    with torch.no_grad():
        for _ in range(frames):
            img = pipe(text, height=height, width=width).images[0]
            # OpenCV expects BGR uint8 arrays.
            writer.write(cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR))
    writer.release()
    return "output.avi"


def main():
    st.title("Streamlit App with Diffusers and Transformers")
    st.header("Generate Audio, Video, and Images using Diffusers")
    st.header("Chatbot using BERT")

    # Audio generation section
    with st.form("audio_form"):
        text_input = st.text_input("Enter text for audio generation:")
        submit_button = st.form_submit_button("Generate Audio")
        if submit_button:
            audio_output = generate_audio(text_input)
            st.write("Generated Audio:")
            # st.audio plays a raw waveform directly when a sample rate is supplied
            # (AudioLDM outputs 16 kHz audio), so no manual WAV/base64 round-trip is needed.
            st.audio(audio_output, sample_rate=16000)

    # Image generation section
    with st.form("image_form"):
        text_input = st.text_input("Enter text for image generation:")
        submit_button = st.form_submit_button("Generate Image")
        if submit_button:
            image_output = generate_image(text_input)
            # Serialize the PIL image to PNG bytes before base64-encoding it.
            buffer = BytesIO()
            image_output.save(buffer, format="PNG")
            image_base64 = base64.b64encode(buffer.getvalue()).decode("utf-8")
            st.image(image_output, caption="Generated Image", use_column_width=True)
            st.write(f"Generated Image (base64): {image_base64}")

    # Video generation section
    with st.form("video_form"):
        text_input = st.text_input("Enter text for video generation:")
        frames = st.number_input("Number of frames:", value=30, step=1)
        width = st.number_input("Image width:", value=512, step=1)
        height = st.number_input("Image height:", value=512, step=1)
        submit_button = st.form_submit_button("Generate Video")
        if submit_button:
            video_path = generate_video(text_input, int(frames), int(width), int(height))
            st.write("Generated Video:")
            st.video(video_path)

    # Chatbot section
    with st.form("chat_form"):
        user_input = st.text_area("Enter your message:", height=100)
        submit_button = st.form_submit_button("Send Message")
        if submit_button:
            inputs = tokenizer(user_input, padding=True, return_tensors="pt").to("cpu")
            outputs = model(**inputs)
            prediction = torch.argmax(outputs.logits, dim=-1).item()
            # A sequence-classification head outputs a class index, not text, so the
            # original `tokenizer.decode(prediction)` call could not produce a reply;
            # the predicted label name is shown instead.
            response = model.config.id2label.get(prediction, str(prediction))
            st.write(f"Assistant Response (predicted label): {response}")

    st.write("Streamlit App with Diffusers and Transformers")
    st.write("Generated by FallnAI")
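
# The original script never invokes main(), so nothing would render. Streamlit executes
# the script top-to-bottom with __name__ set to "__main__", so the standard guard works here.
if __name__ == "__main__":
    main()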