from io import BytesIO

import torch
import torchaudio
from flask import Flask, request, jsonify
from flask_cors import CORS
from PIL import Image
from transformers import BlipForConditionalGeneration, BlipProcessor, pipeline, utils

# Migrate the Hugging Face cache to its current default location
utils.move_cache()

app = Flask(__name__)
CORS(app)

# Initialize TTS model from Hugging Face
tts_model_name = "suno/bark"
tts = pipeline(task="text-to-speech", model=tts_model_name)

# Initialize BLIP model for image captioning
model_id = "dblasko/blip-dalle3-img2prompt"
blip_model = BlipForConditionalGeneration.from_pretrained(model_id)
blip_processor = BlipProcessor.from_pretrained(model_id)


def generate_caption(file):
    # Decode the uploaded file into a PIL image; BlipProcessor expects
    # images via the `images` keyword, not a raw file handle
    image = Image.open(BytesIO(file.read())).convert("RGB")
    inputs = blip_processor(images=image, return_tensors="pt")

    # Sampling parameters (temperature/top_k/top_p) belong to generate(),
    # not to batch_decode()
    generated_ids = blip_model.generate(
        pixel_values=inputs.pixel_values,
        max_length=50,
        do_sample=True,
        temperature=0.8,
        top_k=40,
        top_p=0.9,
    )
    generated_caption = blip_processor.batch_decode(
        generated_ids, skip_special_tokens=True
    )[0]

    # Use the TTS pipeline to convert the generated caption to audio;
    # it returns a dict with an "audio" array and its "sampling_rate"
    audio_output = tts(generated_caption)
    waveform = torch.tensor(audio_output["audio"]).float()
    if waveform.dim() == 1:
        # torchaudio.save expects a 2D (channels, frames) tensor
        waveform = waveform.unsqueeze(0)

    audio_path = "generated_audio.wav"
    torchaudio.save(audio_path, waveform, audio_output["sampling_rate"])

    return generated_caption, audio_path


@app.route('/upload', methods=['POST'])
def upload_image():
    if 'file' not in request.files:
        return jsonify({'error': 'No image provided'}), 400
    image_file = request.files['file']
    generated_caption, audio_path = generate_caption(image_file)
    # Note: audio_path is a local file path; serving it as a real URL would
    # require an extra route or static file hosting
    return jsonify({'generated_caption': generated_caption, 'audio_url': audio_path}), 200


if __name__ == '__main__':
    app.run(port=5000, debug=True)
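
# --- Usage sketch (assumptions: the server is running locally on port 5000,
# and a test image exists at "example.jpg"; both are illustrative names, not
# part of the app itself) ---
#
# import requests
#
# with open("example.jpg", "rb") as f:
#     resp = requests.post("http://localhost:5000/upload", files={"file": f})
# print(resp.json())  # e.g. {'generated_caption': ..., 'audio_url': 'generated_audio.wav'}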