# ---------------------------------------------------------------------------
# Earlier BiRefNet background-removal demo, kept commented out for reference.
# ---------------------------------------------------------------------------
# import gradio as gr
# from transformers import pipeline, AutoModelForImageSegmentation
# from gradio_imageslider import ImageSlider
# import torch
# from torchvision import transforms
# import spaces
# from PIL import Image
# import numpy as np
# import time

# birefnet = AutoModelForImageSegmentation.from_pretrained(
#     "ZhengPeng7/BiRefNet", trust_remote_code=True
# )
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print("Using device:", device)
# birefnet.to(device)

# transform_image = transforms.Compose(
#     [
#         transforms.Resize((1024, 1024)),
#         transforms.ToTensor(),
#         transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
#     ]
# )

# # @spaces.GPU
# # def PreProcess(image):
# #     size = image.size
# #     image = transform_image(image).unsqueeze(0).to(device)
# #     with torch.no_grad():
# #         preds = birefnet(image)[-1].sigmoid().cpu()
# #     pred = preds[0].squeeze()
# #     pred = transforms.ToPILImage()(pred)
# #     mask = pred.resize(size)
# #     # image.putalpha(mask)
# #     return image

# @spaces.GPU
# def PreProcess(image):
#     size = image.size  # Save original size
#     image_tensor = transform_image(image).unsqueeze(0).to(device)  # Transform the image into a tensor
#     with torch.no_grad():
#         preds = birefnet(image_tensor)[-1].sigmoid().cpu()  # Get predictions
#     pred = preds[0].squeeze()
#     # Convert the prediction tensor to a PIL image
#     pred_pil = transforms.ToPILImage()(pred)
#     # Resize the mask to match the original image size
#     mask = pred_pil.resize(size)
#     # Ensure the original image has an alpha channel
#     image_pil = image.convert("RGBA")
#     # Apply the alpha mask to the image
#     image_pil.putalpha(mask)
#     return image_pil

# def segment_image(image):
#     start = time.time()
#     image = Image.fromarray(image)
#     image = image.convert("RGB")
#     org = image.copy()
#     image = PreProcess(image)
#     time_taken = np.round((time.time() - start), 2)
#     return (image, org), time_taken

# slider = ImageSlider(label="birefnet", type="pil")
# image = gr.Image(label="Upload an Image")
# butterfly = Image.open("butterfly.png")
# Dog = Image.open("Dog.jpg")
# time_taken = gr.Textbox(label="Time taken", type="text")

# demo = gr.Interface(
#     segment_image,
#     inputs=image,
#     outputs=[slider, time_taken],
#     examples=[butterfly, Dog],
#     api_name="BiRefNet",
# )

# if __name__ == "__main__":
#     demo.launch()

# ---------------------------------------------------------------------------
# Whisper Large V3 transcription demo
# ---------------------------------------------------------------------------
import requests
import gradio as gr
import tempfile
import os
from transformers import pipeline
from huggingface_hub import InferenceClient
import time
# import torch
# import numpy as np

# Ensure CUDA is available and set device accordingly
# device = 0 if torch.cuda.is_available() else -1

model_id = "openai/whisper-large-v3"

# Remote back end: serverless Inference API client for the same model
client = InferenceClient(model_id)
# Local back end: transformers ASR pipeline
pipe = pipeline("automatic-speech-recognition", model=model_id)  # , device=device)
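# Both back ends used by transcribe() below can also be exercised directly on a
# local audio file. A minimal sketch (assumption: "sample.wav" is a placeholder
# path, not a file shipped with this Space), kept commented out so it does not
# run at startup:
#
#     api_text = client.automatic_speech_recognition("sample.wav").text
#     local_text = pipe("sample.wav", chunk_length_s=30)["text"]
#     print(api_text)
#     print(local_text)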
def transcribe(inputs, use_api):
    start = time.time()
    API_STATUS = ''
    if inputs is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
    try:
        if use_api:
            print('Using API for transcription...')
            API_STATUS = 'Using API it took: '
            # Use InferenceClient (API) if the checkbox is checked
            res = client.automatic_speech_recognition(inputs).text
        else:
            print('Using local pipeline for transcription...')
            # Use the local pipeline if the checkbox is unchecked
            API_STATUS = 'Using local pipeline it took: '
            res = pipe(inputs, chunk_length_s=30)["text"]
        end = time.time() - start
        return res, API_STATUS + str(round(end, 2)) + ' seconds'
        # return res, end
    except Exception as e:
        return f'Error: {str(e)}', None


def calculate_time_taken(start_time):
    return time.time() - start_time


demo = gr.Blocks()

mf_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources="microphone", type="filepath"),
        # gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
        gr.Checkbox(label="Use API", value=False),
    ],
    outputs=["text", gr.Textbox(label="Time taken", type="text")],  # Placeholder for transcribed text and time taken
    title="Whisper Large V3: Transcribe Audio",
    description=(
        "Transcribe long-form microphone or audio inputs with the click of a button!"
    ),
    allow_flagging="never",
)

file_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources="upload", type="filepath", label="Audio file"),
        # gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
        gr.Checkbox(label="Use API", value=False),  # Checkbox for API usage
    ],
    outputs=["text", gr.Textbox(label="Time taken", type="text")],  # Placeholder for transcribed text and time taken
    title="Whisper Large V3: Transcribe Audio",
    description=(
        "Transcribe long-form microphone or audio inputs with the click of a button!"
    ),
    allow_flagging="never",
)

with demo:
    with gr.Row():
        # with gr.Column():
        # Group the tabs for microphone and file-based transcriptions
        tab = gr.TabbedInterface([mf_transcribe, file_transcribe], ["Microphone", "Audio file"])
        # with gr.Column():
        #     use_api_checkbox = gr.Checkbox(label="Use API", value=False)  # Checkbox outside
        #     # time_taken = gr.Textbox(label="Time taken", type="text")  # Time taken outside the interfaces

if __name__ == "__main__":
    demo.queue().launch()
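# A minimal sketch of calling the launched app programmatically with
# gradio_client (assumptions: the server runs on the default local port, and
# the endpoint name is the auto-generated "/predict" — check view_api() for
# the actual api_name; "sample.wav" is a placeholder file). Kept commented out
# since it is only an illustration, not part of the app itself:
#
#     from gradio_client import Client, handle_file
#     app_client = Client("http://127.0.0.1:7860/")
#     print(app_client.view_api())
#     text, time_taken = app_client.predict(handle_file("sample.wav"), False, api_name="/predict")
#     print(text, time_taken)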