Natthathida committed
Commit 79d2f90 · verified · Parent: c78ca01

update python

Files changed (1): app.py +60 -14
app.py CHANGED
@@ -1,8 +1,54 @@
-from transformers import pipeline, BlipForConditionalGeneration, BlipProcessor, AutoTokenizer, AutoModelForSeq2SeqLM
+# from transformers import pipeline, BlipForConditionalGeneration, BlipProcessor, AutoTokenizer, AutoModelForSeq2SeqLM
+# import torchaudio
+# from torchaudio.transforms import Resample
+# import torch
+# from flask import Flask, request, jsonify
+# # from PIL import Image
+# # import pytesseract
+# # import gradio as gr
+
+# app = Flask(__name__)
+
+# # Initialize TTS model from Hugging Face
+# tts_model_name = "suno/bark"
+# tts = pipeline(task="text-to-speech", model=tts_model_name)
+
+# # Initialize Blip model for image captioning
+# model_id = "dblasko/blip-dalle3-img2prompt"
+# blip_model = BlipForConditionalGeneration.from_pretrained(model_id)
+# blip_processor = BlipProcessor.from_pretrained(model_id)
+
+# @app.route('/generate_caption_and_audio', methods=['POST'])
+# def generate_caption():
+#     try:
+#         # Get image file from the request
+#         image = request.files['image']
+
+#         # Generate caption from image using Blip model
+#         inputs = blip_processor(images=image, return_tensors="pt")
+#         pixel_values = inputs.pixel_values
+#         generated_ids = blip_model.generate(pixel_values=pixel_values, max_length=50)
+#         generated_caption = blip_processor.batch_decode(generated_ids, skip_special_tokens=True, temperature=0.8, top_k=40, top_p=0.9)[0]
+
+#         # Use TTS model to convert generated caption to audio
+#         audio_output = tts(generated_caption)
+#         audio_path = "generated_audio_resampled.wav"
+#         torchaudio.save(audio_path, torch.tensor(audio_output["audio"]), audio_output["sampling_rate"])
+
+#         return jsonify({'generated_caption': generated_caption, 'audio_path': audio_path})
+#     except Exception as e:
+#         return jsonify({'error': str(e)})
+
+# if __name__ == '__main__':
+#     app.run(debug=True)
+from flask import Flask, request, jsonify
+from transformers import pipeline, BlipForConditionalGeneration, BlipProcessor
 import torchaudio
 from torchaudio.transforms import Resample
 import torch
-import gradio as gr
+from io import BytesIO
+
+app = Flask(__name__)
 
 # Initialize TTS model from Hugging Face
 tts_model_name = "suno/bark"
@@ -27,15 +73,15 @@ def generate_caption(image):
 
     return generated_caption, audio_path
 
-# Create a Gradio interface with an image input, a textbox output, a button, and an audio player
-demo = gr.Interface(
-    fn=generate_caption,
-    inputs=gr.Image(),
-    outputs=[
-        gr.Textbox(label="Generated caption"),
-        gr.Button("Converts to Audio"),
-        gr.Audio(type="filepath", label="Generated Audio")
-    ],
-    live=True
-)
-demo.launch(share=True)
 
73
 
74
  return generated_caption, audio_path
75
 
76
+ @app.route('/upload', methods=['POST'])
77
+ def upload_image():
78
+ if 'image' not in request.files:
79
+ return jsonify({'error': 'No image provided'}), 400
80
+
81
+ image_file = request.files['image'].read()
82
+ generated_caption, audio_path = generate_caption(image_file)
83
+
84
+ return jsonify({'generated_caption': generated_caption, 'audio_url': audio_path}), 200
85
+
86
+ if __name__ == '__main__':
87
+ app.run(host='0.0.0.0', port=5000, debug=True)
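
For reference, a minimal client for the new /upload route might look like the sketch below. The route, the 'image' form field, the port from app.run(), and the response keys are taken from the diff above; the localhost host, the test.jpg filename, and the use of the requests library are assumptions for illustration.

import requests

# Hypothetical client for the Flask endpoint added in this commit.
# Assumes the server was started with app.run(host='0.0.0.0', port=5000).
url = "http://localhost:5000/upload"  # localhost is an assumption

# Send the image under the 'image' form field that upload_image() expects.
with open("test.jpg", "rb") as f:  # test.jpg is a placeholder filename
    response = requests.post(url, files={"image": f})

# On success the endpoint returns HTTP 200 with JSON like
# {'generated_caption': '...', 'audio_url': 'generated_audio_resampled.wav'}
print(response.status_code, response.json())

A request without an 'image' field exercises the 400 branch in upload_image() and returns {'error': 'No image provided'}.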