Spaces commit 7c45a5f: "realtime translate"
Parent: f0cffbb
Space status: Runtime error

Changed files:
- app.py (+14, -74)
- convert.ipynb (+112, -0)
app.py CHANGED
@@ -181,40 +181,6 @@ def transcribe_en(audio, state_en="", state_vi=""):
     state_vi += vi_text + "+"
     return state_en, state_vi
 
-def transcribe_vi_1(audio, state_en=""):
-    ds = speech_file_to_array_fn(audio.name)
-    # infer model
-    input_values = processor(
-        ds["speech"],
-        sampling_rate=ds["sampling_rate"],
-        return_tensors="pt"
-    ).input_values
-    # decode ctc output
-    logits = vi_model(input_values).logits[0]
-    pred_ids = torch.argmax(logits, dim=-1)
-    greedy_search_output = processor.decode(pred_ids)
-    beam_search_output = ngram_lm_model.decode(logits.cpu().detach().numpy(), beam_width=500)
-    en_text = translate_vi2en(beam_search_output)
-    state_en += en_text + " "
-    return state_en, state_en
-
-def transcribe_en_1(audio, state_vi=""):
-    speech = load_data(audio)
-    # Tokenize
-    input_values = eng_tokenizer(speech, return_tensors="pt").input_values
-    # Take logits
-    logits = eng_model(input_values).logits
-    # Take argmax
-    predicted_ids = torch.argmax(logits, dim=-1)
-    # Get the words from predicted word ids
-    transcription = eng_tokenizer.decode(predicted_ids[0])
-    # Output is all upper case
-    transcription = correct_casing(transcription.lower())
-    vi_text = translate_en2vi(transcription)
-    state_vi += vi_text + "+"
-    return state_vi, state_vi
-
-
 """Gradio demo"""
 
 vi_example_text = ["Có phải bạn đang muốn tìm mua nhà ở ngoại ô thành phố Hồ Chí Minh không?",
@@ -255,26 +221,13 @@ with gr.Blocks() as demo:
                 inputs=[vi_audio_1])
 
         with gr.TabItem("Vi-En Realtime Translation"):
-            gr.
-
-
-
-
-
-
-                "text",
-                "state",
-
-            ],
-            live=True).launch()
-
-            # with gr.Row():
-            #     with gr.Column():
-            #         vi_audio_2 = gr.Audio(source="microphone", label="Input Vietnamese Audio", type="file", streaming=True)
-            #     with gr.Column():
-            #         speech2text_vi2 = gr.Textbox(label="Vietnamese Text")
-            #         english_out_3 = gr.Textbox(label="English Text")
-            #         vi_audio_2.change(transcribe_vi, [vi_audio_2, speech2text_vi2, english_out_3], [speech2text_vi2, english_out_3])
+            with gr.Row():
+                with gr.Column():
+                    vi_audio_2 = gr.Audio(source="microphone", label="Input Vietnamese Audio", type="file", streaming=True)
+                with gr.Column():
+                    speech2text_vi2 = gr.Textbox(label="Vietnamese Text")
+                    english_out_3 = gr.Textbox(label="English Text")
+            vi_audio_2.change(transcribe_vi, [vi_audio_2, speech2text_vi2, english_out_3], [speech2text_vi2, english_out_3])
 
 
     with gr.Tabs():
@@ -302,26 +255,13 @@ with gr.Blocks() as demo:
                 inputs=[en_audio_1])
 
         with gr.TabItem("En-Vi Realtime Translation"):
-            gr.
-
-
-
-
-
-
-                "text",
-                "state",
-
-            ],
-            live=True).launch()
-
-            # with gr.Row():
-            #     with gr.Column():
-            #         en_audio_2 = gr.Audio(source="microphone", label="Input English Audio", type="filepath", streaming=True)
-            #     with gr.Column():
-            #         speech2text_en2 = gr.Textbox(label="English Text")
-            #         vietnamese_out_3 = gr.Textbox(label="Vietnamese Text")
-            #         en_audio_2.change(transcribe_en, [en_audio_2, speech2text_en2, vietnamese_out_3], [speech2text_en2, vietnamese_out_3])
+            with gr.Row():
+                with gr.Column():
+                    en_audio_2 = gr.Audio(source="microphone", label="Input English Audio", type="filepath", streaming=True)
+                with gr.Column():
+                    speech2text_en2 = gr.Textbox(label="English Text")
+                    vietnamese_out_3 = gr.Textbox(label="Vietnamese Text")
+            en_audio_2.change(transcribe_en, [en_audio_2, speech2text_en2, vietnamese_out_3], [speech2text_en2, vietnamese_out_3])
 
 if __name__ == "__main__":
     demo.launch()
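The change removes the unused one-shot helpers (transcribe_vi_1, transcribe_en_1) and the unfinished gr. fragment, and wires streaming microphone tabs into gr.Blocks instead. The pattern: a gr.Audio with streaming=True fires its change event on each new chunk, and the callback threads the accumulated transcripts back through as state by listing the textboxes as both inputs and outputs. A minimal self-contained sketch of that wiring, with a placeholder callback standing in for the real ASR-plus-translation pipeline (the transcribe_vi signature follows app.py; the component names and placeholder strings here are illustrative):

import gradio as gr

def transcribe_vi(audio, state_vi="", state_en=""):
    # Placeholder: a real callback would transcribe the audio chunk and
    # translate the transcript, as transcribe_vi does in app.py.
    vi_text = "xin chào"   # hypothetical ASR output for this chunk
    en_text = "hello"      # hypothetical translation of vi_text
    state_vi += vi_text + " "
    state_en += en_text + " "
    return state_vi, state_en

with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            vi_audio = gr.Audio(source="microphone", type="filepath", streaming=True)
        with gr.Column():
            vi_box = gr.Textbox(label="Vietnamese Text")
            en_box = gr.Textbox(label="English Text")
    # Passing the textboxes as both inputs and outputs is what makes the
    # transcript accumulate across streamed chunks.
    vi_audio.change(transcribe_vi, [vi_audio, vi_box, en_box], [vi_box, en_box])

demo.launch()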
convert.ipynb ADDED
@@ -0,0 +1,112 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/json": {
+       "ascii": false,
+       "bar_format": null,
+       "colour": null,
+       "elapsed": 0.014345407485961914,
+       "initial": 0,
+       "n": 0,
+       "ncols": null,
+       "nrows": null,
+       "postfix": null,
+       "prefix": "Downloading",
+       "rate": null,
+       "total": 1596,
+       "unit": "B",
+       "unit_divisor": 1024,
+       "unit_scale": true
+      },
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "6c1e4c5c553c4150b92ef38251ec5ccd",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading:   0%|          | 0.00/1.56k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "ename": "ValueError",
+     "evalue": "Unrecognized configuration class <class 'transformers.models.wav2vec2.configuration_wav2vec2.Wav2Vec2Config'> for this kind of AutoModel: AutoModelForSequenceClassification.\nModel type should be one of AlbertConfig, BartConfig, BertConfig, BigBirdConfig, BigBirdPegasusConfig, BloomConfig, CamembertConfig, CanineConfig, ConvBertConfig, CTRLConfig, Data2VecTextConfig, DebertaConfig, DebertaV2Config, DistilBertConfig, ElectraConfig, FlaubertConfig, FNetConfig, FunnelConfig, GPT2Config, GPTNeoConfig, GPTJConfig, IBertConfig, LayoutLMConfig, LayoutLMv2Config, LayoutLMv3Config, LEDConfig, LongformerConfig, MBartConfig, MegatronBertConfig, MobileBertConfig, MPNetConfig, NystromformerConfig, OpenAIGPTConfig, PerceiverConfig, PLBartConfig, QDQBertConfig, ReformerConfig, RemBertConfig, RobertaConfig, RoFormerConfig, SqueezeBertConfig, TapasConfig, TransfoXLConfig, XLMConfig, XLMRobertaConfig, XLMRobertaXLConfig, XLNetConfig, YosoConfig.",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[1;31mValueError\u001b[0m                                Traceback (most recent call last)",
+      "\u001b[1;32m~\\AppData\\Local\\Temp\\ipykernel_29808\\3826148515.py\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[1;31m# load model and tokenizer\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[0mmodel_id\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;34m\"facebook/wav2vec2-base-960h\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 6\u001b[1;33m \u001b[0mmodel\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mAutoModelForSequenceClassification\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfrom_pretrained\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmodel_id\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 7\u001b[0m \u001b[0mtokenizer\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mAutoTokenizer\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfrom_pretrained\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmodel_id\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 8\u001b[0m \u001b[0mdummy_model_input\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtokenizer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"This is a sample\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mreturn_tensors\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m\"pt\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+      "\u001b[1;32mc:\\Python37\\lib\\site-packages\\transformers\\models\\auto\\auto_factory.py\u001b[0m in \u001b[0;36mfrom_pretrained\u001b[1;34m(cls, pretrained_model_name_or_path, *model_args, **kwargs)\u001b[0m\n\u001b[0;32m 446\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mmodel_class\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfrom_pretrained\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mpretrained_model_name_or_path\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m*\u001b[0m\u001b[0mmodel_args\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mconfig\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mconfig\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 447\u001b[0m raise ValueError(\n\u001b[1;32m--> 448\u001b[1;33m \u001b[1;34mf\"Unrecognized configuration class {config.__class__} for this kind of AutoModel: {cls.__name__}.\\n\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 449\u001b[0m \u001b[1;34mf\"Model type should be one of {', '.join(c.__name__ for c in cls._model_mapping.keys())}.\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 450\u001b[0m )\n",
+      "\u001b[1;31mValueError\u001b[0m: Unrecognized configuration class <class 'transformers.models.wav2vec2.configuration_wav2vec2.Wav2Vec2Config'> for this kind of AutoModel: AutoModelForSequenceClassification.\nModel type should be one of AlbertConfig, BartConfig, BertConfig, BigBirdConfig, BigBirdPegasusConfig, BloomConfig, CamembertConfig, CanineConfig, ConvBertConfig, CTRLConfig, Data2VecTextConfig, DebertaConfig, DebertaV2Config, DistilBertConfig, ElectraConfig, FlaubertConfig, FNetConfig, FunnelConfig, GPT2Config, GPTNeoConfig, GPTJConfig, IBertConfig, LayoutLMConfig, LayoutLMv2Config, LayoutLMv3Config, LEDConfig, LongformerConfig, MBartConfig, MegatronBertConfig, MobileBertConfig, MPNetConfig, NystromformerConfig, OpenAIGPTConfig, PerceiverConfig, PLBartConfig, QDQBertConfig, ReformerConfig, RemBertConfig, RobertaConfig, RoFormerConfig, SqueezeBertConfig, TapasConfig, TransfoXLConfig, XLMConfig, XLMRobertaConfig, XLMRobertaXLConfig, XLNetConfig, YosoConfig."
+     ]
+    }
+   ],
+   "source": [
+    "import torch\n",
+    "from transformers import AutoModelForSequenceClassification, AutoTokenizer, Wav2Vec2Tokenizer, Wav2Vec2ForCTC\n",
+    "\n",
+    "# load model and tokenizer\n",
+    "model_name = \"facebook/wav2vec2-base-960h\"\n",
+    "tokenizer = Wav2Vec2Tokenizer.from_pretrained(model_name)\n",
+    "model = Wav2Vec2ForCTC.from_pretrained(model_name)\n",
+    "dummy_model_input = tokenizer(\"This is a sample\", return_tensors=\"pt\")\n",
+    "\n",
+    "# export\n",
+    "torch.onnx.export(\n",
+    "    model, \n",
+    "    tuple(dummy_model_input.values()),\n",
+    "    f=\"torch-model.onnx\", \n",
+    "    input_names=['input_ids', 'attention_mask'], \n",
+    "    output_names=['logits'], \n",
+    "    dynamic_axes={'input_ids': {0: 'batch_size', 1: 'sequence'}, \n",
+    "                  'attention_mask': {0: 'batch_size', 1: 'sequence'}, \n",
+    "                  'logits': {0: 'batch_size', 1: 'sequence'}}, \n",
+    "    do_constant_folding=True, \n",
+    "    opset_version=13, \n",
+    ")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3.7.9 64-bit",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.9"
+  },
+  "orig_nbformat": 4,
+  "vscode": {
+   "interpreter": {
+    "hash": "d49c3f6d6dd49f9272b571d9fad348ab55b8c6c3f691520d74ed0af1f69c3dd8"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
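The notebook's stored error output comes from an earlier run: it loaded facebook/wav2vec2-base-960h with AutoModelForSequenceClassification, which has no mapping for Wav2Vec2Config, hence the ValueError; the committed source already switches to Wav2Vec2ForCTC. Note, though, that the committed export cell still tokenizes a text string and names the ONNX inputs input_ids/attention_mask, while Wav2Vec2ForCTC expects a raw waveform tensor (input_values). A hedged sketch of the same export driven by dummy audio instead (the output file name and axis labels are illustrative choices, not from the notebook):

import torch
from transformers import Wav2Vec2ForCTC

model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
model.eval()

# One second of 16 kHz audio as the dummy input: shape (batch, samples).
dummy_input_values = torch.randn(1, 16000)

torch.onnx.export(
    model,
    (dummy_input_values,),
    f="wav2vec2-ctc.onnx",
    input_names=["input_values"],
    output_names=["logits"],
    dynamic_axes={
        "input_values": {0: "batch_size", 1: "samples"},
        "logits": {0: "batch_size", 1: "sequence"},
    },
    do_constant_folding=True,
    opset_version=13,
)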