# import gradio as gr
# from transformers import pipeline, AutoModelForImageSegmentation 
# from gradio_imageslider import ImageSlider
# import torch
# from torchvision import transforms
# import spaces
# from PIL import Image

# import numpy as np
# import time

# birefnet = AutoModelForImageSegmentation.from_pretrained(
#     "ZhengPeng7/BiRefNet", trust_remote_code=True
# )
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# print("Using device:", device)

# birefnet.to(device)
# transform_image = transforms.Compose(
#     [
#         transforms.Resize((1024, 1024)),
#         transforms.ToTensor(),
#         transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
#     ]
# )

# # @spaces.GPU
# # def PreProcess(image):
# #     size = image.size
# #     image = transform_image(image).unsqueeze(0).to(device)

# #     with torch.no_grad():
# #         preds = birefnet(image)[-1].sigmoid().cpu()
# #     pred = preds[0].squeeze()
# #     pred = transforms.ToPILImage()(pred)
# #     mask = pred.resize(size)
# #     # image.putalpha(mask)
# #     return image

# @spaces.GPU
# def PreProcess(image):
#     size = image.size  # Save original size
#     image_tensor = transform_image(image).unsqueeze(0).to(device)  # Transform the image into a tensor

#     with torch.no_grad():
#         preds = birefnet(image_tensor)[-1].sigmoid().cpu()  # Get predictions
#     pred = preds[0].squeeze()

#     # Convert the prediction tensor to a PIL image
#     pred_pil = transforms.ToPILImage()(pred)

#     # Resize the mask to match the original image size
#     mask = pred_pil.resize(size)

#     # Convert the original image (passed as input) to a PIL image
#     image_pil = image.convert("RGBA")  # Ensure the image has an alpha channel

#     # Apply the alpha mask to the image
#     image_pil.putalpha(mask)

#     return image_pil

# def segment_image(image):
#     start = time.time()
#     image = Image.fromarray(image)
#     image = image.convert("RGB")
#     org = image.copy()
#     image = PreProcess(image)
#     time_taken = np.round((time.time() - start),2)
#     return (image, org), time_taken

# slider = ImageSlider(label='birefnet', type="pil")
# image = gr.Image(label="Upload an Image")

# butterfly = Image.open("butterfly.png")
# Dog = Image.open('Dog.jpg')

# time_taken = gr.Textbox(label="Time taken", type="text")

# demo = gr.Interface(
#     segment_image, inputs=image, outputs=[slider,time_taken], examples=[butterfly,Dog], api_name="BiRefNet")

# if __name__ == '__main__' :
#     demo.launch()

import gradio as gr
from transformers import pipeline
from huggingface_hub import InferenceClient

model_id = "openai/whisper-large-v3"
client = InferenceClient(model_id)  # hosted Inference API client used by transcribe()
pipe = pipeline("automatic-speech-recognition", model=model_id)  # local pipeline; only referenced by the commented-out transcribe and the fallback sketch below

# def transcribe(inputs, task):
#     if inputs is None:
#         raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

#     text = pipe(inputs, chunk_length_s=30)["text"]
#     return text

def transcribe(inputs, task):
    if inputs is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

    try:
        # Send the audio file to the hosted model. Note: `task` is not forwarded,
        # so this call always transcribes rather than translates.
        return client.automatic_speech_recognition(inputs).text
    except Exception as e:
        return f"Error: {e}"
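
# Hedged sketch (not wired into the UI): if the hosted Inference API is unavailable,
# the same request could be served by the local `pipe` defined above. The helper name
# `transcribe_local` and the use of generate_kwargs={"task": ...} to switch Whisper
# between transcription and translation are assumptions, not part of the original app.
def transcribe_local(inputs, task):
    if inputs is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
    # chunk_length_s=30 enables long-form audio; task is "transcribe" or "translate"
    return pipe(inputs, chunk_length_s=30, generate_kwargs={"task": task})["text"]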
        

demo = gr.Blocks()

mf_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources="microphone", type="filepath"),
        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
    ],
    outputs="text",
    title="Whisper Large V3: Transcribe Audio",
    description=(
        "Transcribe long-form microphone or audio inputs with the click of a button!"
    ),
    allow_flagging="never",
)

file_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources="upload", type="filepath", label="Audio file"),
        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
    ],
    outputs="text",
    title="Whisper Large V3: Transcribe Audio",
    description=(
        "Transcribe long-form microphone or audio inputs with the click of a button!"
    ),
    allow_flagging="never",
)

with demo:
    gr.TabbedInterface([mf_transcribe, file_transcribe], ["Microphone", "Audio file"])

if __name__ == "__main__":
    demo.queue().launch()
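
# Hedged usage sketch (commented out so it does not run with the app): once the demo
# is launched, the endpoints can be called programmatically with gradio_client. The
# URL, file name, and api_name below are assumptions; api.view_api() prints the
# endpoint names Gradio actually generated for the two interfaces.
#
# from gradio_client import Client, handle_file
#
# api = Client("http://127.0.0.1:7860")
# print(api.view_api())  # discover the auto-generated endpoint names
# text = api.predict(handle_file("sample.wav"), "transcribe", api_name="/predict")
# print(text)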