Natthathida committed
Commit 975b4c6 · verified · 1 Parent(s): a0f1951

Add python

Files changed (1):
app.py +40 -55
app.py CHANGED
@@ -1,56 +1,41 @@
- from fastapi import FastAPI
- from fastapi.middleware.cors import CORSMiddleware
- from fastapi.responses import JSONResponse
- from fastapi.staticfiles import StaticFiles
- import numpy as np
- import argparse
- import os
-
- HOST = os.environ.get("API_URL", "0.0.0.0")
- PORT = os.environ.get("PORT", 7860)
- parser = argparse.ArgumentParser()
- parser.add_argument("--host", default=HOST)
- parser.add_argument("--port", type=int, default=PORT)
- parser.add_argument("--reload", action="store_true", default=True)
- parser.add_argument("--ssl_certfile")
- parser.add_argument("--ssl_keyfile")
- args = parser.parse_args()
-
- app = FastAPI()
- app.add_middleware(
-     CORSMiddleware,
-     allow_origins=["*"],
-     allow_credentials=True,
-     allow_methods=["*"],
-     allow_headers=["*"],
- )
-
-
- @app.get("/invert")
- async def invert(text: str):
-     return {
-         "original": text,
-         "inverted": text[::-1],
-     }
-
-
- @app.get("/data")
- async def get_data():
-     data = {"data": np.random.rand(100).tolist()}
-     return JSONResponse(data)
-
-
- app.mount("/", StaticFiles(directory="static", html=True), name="static")
-
- if __name__ == "__main__":
-     import uvicorn
-
-     print(args)
-     uvicorn.run(
-         "app:app",
-         host=args.host,
-         port=args.port,
-         reload=args.reload,
-         ssl_certfile=args.ssl_certfile,
-         ssl_keyfile=args.ssl_keyfile,
-     )
+ from transformers import pipeline, BlipForConditionalGeneration, BlipProcessor
+ import torchaudio
+ import torch
+ import gradio as gr
+
+ # Initialize the text-to-speech pipeline from Hugging Face
+ tts_model_name = "suno/bark"
+ tts = pipeline(task="text-to-speech", model=tts_model_name)
+
+ # Initialize the BLIP model and processor for image captioning
+ model_id = "dblasko/blip-dalle3-img2prompt"
+ blip_model = BlipForConditionalGeneration.from_pretrained(model_id)
+ blip_processor = BlipProcessor.from_pretrained(model_id)
+
+ def generate_caption(image):
+     # Generate a caption for the image with the BLIP model; the sampling
+     # parameters belong on generate(), not on batch_decode()
+     inputs = blip_processor(images=image, return_tensors="pt")
+     generated_ids = blip_model.generate(
+         pixel_values=inputs.pixel_values,
+         max_length=50,
+         do_sample=True,
+         temperature=0.8,
+         top_k=40,
+         top_p=0.9,
+     )
+     generated_caption = blip_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+
+     # Convert the generated caption to audio with the TTS pipeline; its
+     # output is a dict with "audio" and "sampling_rate" keys
+     audio_output = tts(generated_caption)
+     waveform = torch.tensor(audio_output["audio"])
+     if waveform.dim() == 1:
+         waveform = waveform.unsqueeze(0)  # torchaudio.save expects (channels, frames)
+     audio_path = "generated_audio.wav"
+     torchaudio.save(audio_path, waveform, audio_output["sampling_rate"])
+
+     return generated_caption, audio_path
+
+ # Gradio interface: image input, caption textbox, and audio player.
+ # generate_caption returns two values, so the interface needs exactly two
+ # output components; gr.Interface provides its own submit controls
+ demo = gr.Interface(
+     fn=generate_caption,
+     inputs=gr.Image(),
+     outputs=[
+         gr.Textbox(label="Generated caption"),
+         gr.Audio(type="filepath", label="Generated Audio"),
+     ],
+     live=True,
+ )
+ demo.launch(share=True)
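
The captioning half can be sanity-checked on its own before bark is loaded. A minimal sketch, assuming a local test image; "example.jpg" is a placeholder file name, not part of the commit:

from PIL import Image
from transformers import BlipForConditionalGeneration, BlipProcessor

model_id = "dblasko/blip-dalle3-img2prompt"
processor = BlipProcessor.from_pretrained(model_id)
model = BlipForConditionalGeneration.from_pretrained(model_id)

# Caption a single local image; greedy decoding is enough for a smoke test
image = Image.open("example.jpg").convert("RGB")
inputs = processor(images=image, return_tensors="pt")
generated_ids = model.generate(pixel_values=inputs.pixel_values, max_length=50)
print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0])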
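
Once the app is running, the same flow can be driven programmatically with gradio_client. A sketch under stated assumptions: the URL and image path are placeholders, "/predict" is the endpoint name a gr.Interface typically registers (verify it on the app's "Use via API" page), and handle_file requires a recent gradio_client release:

from gradio_client import Client, handle_file

# Point at the local server or the public share URL printed by launch();
# both the URL and the image file below are placeholder values
client = Client("http://127.0.0.1:7860")
caption, audio_path = client.predict(
    handle_file("example.jpg"),  # hypothetical local test image
    api_name="/predict",
)
print(caption, audio_path)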