not-lain committed on
Commit
be980dd
·
1 Parent(s): 2885a71

fixed audio interface

Browse files
Files changed (4) hide show
  1. 1.wav +0 -0
  2. app.py +25 -66
  3. requirements.txt +1 -0
  4. test.py +21 -46
1.wav ADDED
Binary file (317 kB). View file
 
app.py CHANGED
@@ -1,80 +1,47 @@
1
  # Welcome to Team Tonic's MultiMed
2
- from lang_list import (
3
- LANGUAGE_NAME_TO_CODE,
4
- S2ST_TARGET_LANGUAGE_NAMES,
5
- S2TT_TARGET_LANGUAGE_NAMES,
6
- T2TT_TARGET_LANGUAGE_NAMES,
7
- TEXT_SOURCE_LANGUAGE_NAMES,
8
- LANG_TO_SPKR_ID,
9
- )
10
  from gradio_client import Client
11
  import os
12
  import numpy as np
13
  import base64
14
- import torch
15
- import torchaudio
16
  import gradio as gr
17
  import requests
18
  import json
19
  import dotenv
20
- from transformers import AutoProcessor, SeamlessM4TModel
21
- import torchaudio
22
  import PIL
23
  dotenv.load_dotenv()
24
 
25
- client = Client("https://facebook-seamless-m4t.hf.space/--replicas/frq8b/")
26
-
27
-
28
- AUDIO_SAMPLE_RATE = 16000.0
29
- MAX_INPUT_AUDIO_LENGTH = 60 # in seconds
30
- DEFAULT_TARGET_LANGUAGE = "English"
31
 
32
- device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
33
 
34
 
35
- device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
36
 
37
- # processor = AutoProcessor.from_pretrained("ylacombe/hf-seamless-m4t-large")
38
- # model = SeamlessM4TModel.from_pretrained("ylacombe/hf-seamless-m4t-large").to(device)
39
-
40
-
41
- def process_speech(sound):
42
- """
43
- processing sound using seamless_m4t
44
- """
45
- # task_name = "T2TT"
46
- result = client.predict(task_name="S2TT",
47
- audio_source="microphone",
48
- input_audio_mic=sound,
49
- input_audio_file=None,
50
- input_text=None,
51
- source_language=None,
52
- target_language="English")
53
- print(result)
54
- return result[1]
55
-
56
-
57
- def process_speech_using_model(sound):
58
  """
59
  processing sound using seamless_m4t
60
  """
61
- # task_name = "T2TT"
62
- arr, org_sr = torchaudio.load(sound)
63
- target_language_code = LANGUAGE_NAME_TO_CODE[DEFAULT_TARGET_LANGUAGE]
64
- new_arr = torchaudio.functional.resample(
65
- arr, orig_freq=org_sr, new_freq=AUDIO_SAMPLE_RATE)
66
- max_length = int(MAX_INPUT_AUDIO_LENGTH * AUDIO_SAMPLE_RATE)
67
- if new_arr.shape[1] > max_length:
68
- new_arr = new_arr[:, :max_length]
69
- gr.Warning(
70
- f"Input audio is too long. Only the first {MAX_INPUT_AUDIO_LENGTH} seconds is used.")
71
- input_data = processor(
72
- audios=new_arr, sampling_rate=AUDIO_SAMPLE_RATE, return_tensors="pt").to(device)
73
- tokens_ids = model.generate(**input_data, generate_speech=False, tgt_lang=target_language_code,
74
- num_beams=5, do_sample=True)[0].cpu().squeeze().detach().tolist()
75
- text_out = processor.decode(tokens_ids, skip_special_tokens=True)
76
-
77
- return text_out
 
 
 
 
78
 
79
 
80
  def process_image(image) :
@@ -258,15 +225,7 @@ def process_and_query(text, image, audio):
258
  text = process_image(image)
259
 
260
  if audio is not None:
261
- # audio = audio[0].numpy()
262
- # audio = audio.astype(np.float32)
263
- # audio = audio / np.max(np.abs(audio))
264
- # audio = audio * 32768
265
- # audio = audio.astype(np.int16)
266
- # audio = audio.tobytes()
267
- # audio = base64.b64encode(audio).decode('utf-8')
268
  text = process_speech(audio)
269
- print(text)
270
 
271
  # Now, use the text (either provided by the user or obtained from OpenAI) to query Vectara
272
  vectara_response_json = query_vectara(text)
 
1
  # Welcome to Team Tonic's MultiMed
2
+
 
 
 
 
 
 
 
3
  from gradio_client import Client
4
  import os
5
  import numpy as np
6
  import base64
 
 
7
  import gradio as gr
8
  import requests
9
  import json
10
  import dotenv
11
+ from scipy.io.wavfile import write
 
12
  import PIL
13
  dotenv.load_dotenv()
14
 
15
+ client = Client("facebook/seamless_m4t")
 
 
 
 
 
16
 
 
17
 
18
 
 
19
 
20
def process_speech(audio):
    """
    Convert recorded speech to English text using the seamless_m4t client.

    Args:
        audio: a ``(sample_rate, data)`` pair; ``data`` must support
            ``.astype`` (a NumPy array).

    Returns:
        The text element (index 1) of the prediction result as a string,
        or the error message as a string if writing the audio or the
        remote call fails.
    """
    # Local imports keep this function self-contained.
    import os
    import tempfile

    # A unique temp file avoids name collisions between concurrent requests
    # (the old random "0..99.wav" names could clash) and lets us clean up.
    fd, audio_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    try:
        sr, data = audio
        # NOTE(review): a plain cast to int16 assumes the samples are already
        # in the int16 range; float data in [-1, 1] would collapse to zeros.
        # Confirm what the audio component actually delivers.
        write(audio_path, sr, data.astype(np.int16))

        # Positional arguments for the "/run" endpoint; presumably task name,
        # audio source, microphone input, file input, input text -- verify
        # against the endpoint's signature.
        out = client.predict(
            "S2TT",
            "file",
            None,
            audio_path,
            "",
            "French",  # source language
            "English",  # target language
            api_name="/run",
        )
        return f"{out[1]}"  # element 1 holds the text
    except Exception as e:
        # Everything that can fail is inside the try, so errors are reported
        # as text instead of propagating to the UI.
        return f"{e}"
    finally:
        # Best-effort cleanup; the file may already be gone.
        try:
            os.remove(audio_path)
        except OSError:
            pass
43
+
44
+
45
 
46
 
47
  def process_image(image) :
 
225
  text = process_image(image)
226
 
227
  if audio is not None:
 
 
 
 
 
 
 
228
  text = process_speech(audio)
 
229
 
230
  # Now, use the text (either provided by the user or obtained from OpenAI) to query Vectara
231
  vectara_response_json = query_vectara(text)
requirements.txt CHANGED
@@ -5,3 +5,4 @@ torchaudio==2.0.2
5
  sentencepiece
6
  python-dotenv
7
  Pillow
 
 
5
  sentencepiece
6
  python-dotenv
7
  Pillow
8
+ scipy
test.py CHANGED
@@ -5,57 +5,32 @@ import requests
5
  import gradio as gr
6
  import PIL
7
  import numpy as np
8
-
 
9
  dotenv.load_dotenv()
10
 
11
-
12
- def process_image(image) :
13
- # img_name = f"{np.random.randint(0, 100)}.jpg"
14
- img_name = f"{1}.jpg"
15
- PIL.Image.fromarray(image.astype('uint8'), 'RGB').save(img_name)
16
- image = open(img_name, "rb").read()
17
- base64_image = base64_image = base64.b64encode(image).decode('utf-8')
18
- openai_api_key = os.getenv('OPENAI_API_KEY')
19
- # oai_org = os.getenv('OAI_ORG')
20
-
21
- headers = {
22
- "Content-Type": "application/json",
23
- "Authorization": f"Bearer {openai_api_key}"
24
- }
25
-
26
- payload = {
27
- "model": "gpt-4-vision-preview",
28
- "messages": [
29
- {
30
- "role": "user",
31
- "content": [
32
- {
33
- "type": "text",
34
- "text": "What's in this image?"
35
- },
36
- {
37
- "type": "image_url",
38
- "image_url": {
39
- "url": f"data:image/jpeg;base64,{base64_image}"
40
- }
41
- }
42
- ]
43
- }
44
- ],
45
- "max_tokens": 300
46
- }
47
-
48
- response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
49
-
50
  try :
51
- out = response.json()
52
- out = out["choices"][0]["message"]["content"]
53
- print("out : ", out)
54
- print("type(out) : ", type(out))
55
-
56
  return f"{out}"
57
  except Exception as e :
58
  return f"{e}"
59
 
60
- iface = gr.Interface(fn=process_image, inputs="image", outputs="text")
61
  iface.launch()
 
5
  import gradio as gr
6
  import PIL
7
  import numpy as np
8
+ from scipy.io.wavfile import write
9
+ import gradio_client as grc
10
  dotenv.load_dotenv()
11
 
12
+ client = grc.Client("facebook/seamless_m4t")
13
def process_image(audio):
    """
    Convert recorded speech to English text using the seamless_m4t client.

    The name is kept for the existing ``gr.Interface(fn=process_image, ...)``
    wiring even though the input is audio.

    Args:
        audio: a ``(sample_rate, data)`` pair; ``data`` must support
            ``.astype`` (a NumPy array).

    Returns:
        The text element (index 1) of the prediction result as a string,
        or the error message as a string if writing the audio or the
        remote call fails.
    """
    # Fixed file name: this script overwrites the same WAV on every call.
    audio_name = "1.wav"
    try:
        sr, data = audio
        # NOTE(review): a plain cast to int16 assumes the samples are already
        # in the int16 range -- confirm what the audio input delivers.
        write(audio_name, sr, data.astype(np.int16))

        out = client.predict(
            "S2TT",
            "file",
            None,
            audio_name,
            "",
            "French",  # source language
            "English",  # target language
            api_name="/run",
        )
        return f"{out[1]}"  # element 1 holds the text
    except Exception as e:
        # The failing operations are inside the try so the error text is
        # actually returned instead of propagating.
        return f"{e}"
34
 
35
+ iface = gr.Interface(fn=process_image, inputs="audio", outputs="text")
36
  iface.launch()