wwdok committed on
Commit
ea6d8b2
·
1 Parent(s): 489bea3

add chatgpt support

Browse files
Files changed (6) hide show
  1. README.md +14 -2
  2. app.py +26 -88
  3. docs/text_postprocess.png +0 -0
  4. requirements.txt +5 -3
  5. src/utils.py +152 -3
  6. src/vad.py +3 -3
README.md CHANGED
@@ -16,7 +16,18 @@ Fork from : https://huggingface.co/spaces/aadnk/faster-whisper-webui/tree/main
16
 
17
  我的更改:
18
 
19
- * 新添加了一个文本后处理的tab
 
 
 
 
 
 
 
 
 
 
 
20
 
21
  # Running Locally
22
 
@@ -194,4 +205,5 @@ registry.gitlab.com/aadnk/whisper-webui:latest
194
 
195
  - [ ] 如果是一个视频列表,只下载第一个视频
196
  - [ ] ~~如果已经转录完了再选翻译任务,则不重新转录~~
197
- - [ ] ~~目前翻译任务只能由任意语言翻译成英语,不能指定其他语言,要能支持翻译成其他语言,至少支持中文~~
 
 
16
 
17
  我的更改:
18
 
19
+ * 新添加了一个文本后处理的tab
20
+ ![Alt text](docs/text_postprocess.png)
21
+ * 支持使用ChatGPT或Paddle auto punc(二选一)对文本自动添加合适的标点符号
22
+ * 支持使用pycorrector对文本进行纠错
23
+ * 支持去掉指定的语气助词
24
+ * 支持对输出文本的指定字符进行替换
25
+
26
+ 该App同时部署在:
27
+
28
+ * HuggingFace Spaces: https://huggingface.co/spaces/wwdok/faster-whisper-webui-cn
29
+ * OpenXLab: https://openxlab.org.cn/apps/detail/wwdok/faster-whisper-webui
30
+ * ModelScope: https://modelscope.cn/studios/wwd123/faster-whisper-webui-cn
31
 
32
  # Running Locally
33
 
 
205
 
206
  - [ ] 如果是一个视频列表,只下载第一个视频
207
  - [ ] ~~如果已经转录完了再选翻译任务,则不重新转录~~
208
+ - [ ] ~~目前翻译任务只能由任意语言翻译成英语,不能指定其他语言,要能支持翻译成其他语言,至少支持中文~~
209
+ - [ ] 使用ChatGPT自动纠正错别字和添加标点符号
app.py CHANGED
@@ -10,7 +10,6 @@ import pathlib
10
  import tempfile
11
  import zipfile
12
  import numpy as np
13
- import pyperclip
14
  import torch
15
 
16
  from src.config import VAD_INITIAL_PROMPT_MODE_VALUES, ApplicationConfig, VadInitialPromptMode
@@ -32,6 +31,7 @@ import gradio as gr
32
 
33
  from src.download import ExceededMaximumDuration, download_url
34
  from src.utils import optional_int, slugify, write_srt, write_vtt
 
35
  from src.vad import AbstractTranscription, NonSpeechStrategy, PeriodicTranscriptionConfig, TranscriptionConfig, VadPeriodicTranscription, VadSileroTranscription
36
  from src.whisper.abstractWhisperContainer import AbstractWhisperContainer
37
  from src.whisper.whisperFactory import create_whisper_container
@@ -614,102 +614,36 @@ def create_ui(app_config: ApplicationConfig):
614
  gr.Text(label="Segments")
615
  ])
616
 
617
- def get_chunks(s, maxlength, separator=None):
618
- start = 0
619
- end = 0
620
- while start + maxlength < len(s) and end != -1:
621
- if separator is not None:
622
- end = s.rfind(separator, start, start + maxlength + 1)
623
- segment = s[start:end]
624
- yield segment.replace(separator, "")
625
- start = end +1
626
- else:
627
- end = start + maxlength
628
- yield s[start:end]
629
- start = end
630
-
631
- yield s[start:]
632
-
633
- def post_processing(text, apply_correction, auto_punc, separator, remove_words):
634
- print(f"==>> separator: {separator}")
635
- original_separator1 = " "
636
- original_separator2 = ","
637
- # 对于长文本需要先分段再推理,推理完再合并
638
- # auto_punc:自动添加合适的、不同的标点符号
639
- if auto_punc == True:
640
- # 自动分段文本之前先去除原有的标点符号
641
- text = text.replace(original_separator1, "")
642
- text = text.replace(original_separator2, "")
643
- import paddlehub as hub
644
- model = hub.Module(name='auto_punc', version='1.0.0')
645
- t3 = time.time()
646
- # split long text to short text less than max_length and store them in list
647
- max_length = 256
648
- chunks = list(get_chunks(text, max_length))
649
- results = []
650
- results = model.add_puncs(chunks, max_length=max_length)
651
- text = ",".join(results) # 分段处硬编码成使用中文逗号分割
652
- t4 = time.time()
653
- print("Auto punc finished. Cost time: {:.2f}s".format(t4-t3))
654
- # print(f"==>> text after auto punc: {text}")
655
- else:
656
- # 将空格全部统一替换成一种分隔符
657
- if separator == "\\n":
658
- # 直接使用separator会无法换行
659
- text = text.replace(original_separator1, "\n")
660
- text = text.replace(original_separator2, "\n")
661
- else:
662
- text = text.replace(original_separator1, separator)
663
- text = text.replace(original_separator2, separator)
664
-
665
- if apply_correction == True:
666
- import pycorrector
667
- print("Start correcting...")
668
- t1 = time.time()
669
- text, detail = pycorrector.correct(text)
670
- t2 = time.time()
671
- print("Correcting finished. Cost time: {:.2f}s".format(t2-t1))
672
- print(f"==>> detail: {detail}")
673
-
674
- # 去掉语气词
675
- t5 = time.time()
676
- remove_words = remove_words.split(",") + remove_words.split(",") + remove_words.split(" ")
677
- for word in remove_words:
678
- text = text.replace(word, "")
679
- t6 = time.time()
680
- print("Remove words finished. Cost time: {:.2f}s".format(t6-t5))
681
-
682
- return text
683
-
684
- def replace(text, src_word, target_word):
685
- text = text.replace(src_word, target_word)
686
- return text
687
-
688
- def switch_punc_method(auto_punc):
689
- if auto_punc == True:
690
- # gr.update里的参数是给output的
691
- return gr.update(visible=False) # 教程:https://www.gradio.app/guides/blocks-and-event-listeners#updating-component-configurations
692
  else:
693
- return gr.update(visible=True)
694
-
695
- def copy_text(text):
696
- pyperclip.copy(text)
697
-
698
  test_postprocess = gr.Blocks()
699
 
700
  with test_postprocess:
701
  gr.Markdown(
702
  """
703
- 后处理Simple或Full标签页输出的Transcription里的文本
704
  """
705
  )
706
  with gr.Row():
707
  with gr.Column():
708
  input_text = gr.TextArea(label="输入文本", placeholder="在此处粘贴你的待处理文本")
709
- apply_correction = gr.Checkbox(label="文本纠错", value=False)
710
- auto_punc = gr.Checkbox(label="自动添加标点符号", value=False)
711
- separator = gr.Text(label="分隔符(一般是逗号,或换行\\n)", value="")
712
- remove_words = gr.Text(label="去掉的语气词", value="呢,啊,哦,嗯,嘛,吧,呀,哈,哇,呐,噢,嘞,嘛")
 
 
 
 
 
713
  submit_btn = gr.Button("提交")
714
  with gr.Column():
715
  output_text = gr.TextArea(label="输出文本", interactive=True).style(show_copy_button=True)
@@ -718,10 +652,14 @@ def create_ui(app_config: ApplicationConfig):
718
  target_word = gr.Text(label="替换后的字符")
719
  replace_btn = gr.Button("替换")
720
  copy_btn = gr.Button("复制到剪贴板")
721
- auto_punc.change(switch_punc_method, inputs=[auto_punc], outputs=[separator])
722
- submit_btn.click(post_processing, inputs=[input_text, apply_correction, auto_punc, separator, remove_words], outputs=output_text)
 
 
 
723
  replace_btn.click(replace, inputs=[output_text, src_word, target_word], outputs=output_text)
724
  copy_btn.click(copy_text, inputs=output_text)
 
725
  demo = gr.TabbedInterface([simple_transcribe, full_transcribe, test_postprocess], tab_names=["Simple", "Full", "Text Postprocess"])
726
 
727
  # Queue up the demo
 
10
  import tempfile
11
  import zipfile
12
  import numpy as np
 
13
  import torch
14
 
15
  from src.config import VAD_INITIAL_PROMPT_MODE_VALUES, ApplicationConfig, VadInitialPromptMode
 
31
 
32
  from src.download import ExceededMaximumDuration, download_url
33
  from src.utils import optional_int, slugify, write_srt, write_vtt
34
+ from src.utils import post_processing, replace, copy_text, on_token_change, num_tokens_from_messages, chat_with_gpt
35
  from src.vad import AbstractTranscription, NonSpeechStrategy, PeriodicTranscriptionConfig, TranscriptionConfig, VadPeriodicTranscription, VadSileroTranscription
36
  from src.whisper.abstractWhisperContainer import AbstractWhisperContainer
37
  from src.whisper.whisperFactory import create_whisper_container
 
614
  gr.Text(label="Segments")
615
  ])
616
 
617
+ def switch_punc_method(use_chatgpt, auto_punc):
618
+ if use_chatgpt == True and auto_punc == True:
619
+ return gr.update(), gr.update(), gr.update()
620
+ elif use_chatgpt == True and auto_punc == False:
621
+ return gr.update(visible=True), gr.update(visible=True, interactive=True), gr.update(visible=False)
622
+ elif use_chatgpt == False and auto_punc == True:
623
+ return gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
624
  else:
625
+ return gr.update(visible=False), gr.update(visible=False), gr.update(visible=True)
626
+
 
 
 
627
  test_postprocess = gr.Blocks()
628
 
629
  with test_postprocess:
630
  gr.Markdown(
631
  """
632
+ 后处理Simple或Full标签页输出的Transcription里的文本,也可以单独使用
633
  """
634
  )
635
  with gr.Row():
636
  with gr.Column():
637
  input_text = gr.TextArea(label="输入文本", placeholder="在此处粘贴你的待处理文本")
638
+ tokens_count = gr.Markdown(label="Tokens 计数: 0", visible=False)
639
+ use_chatgpt = gr.Checkbox(label="使用ChatGPT自动纠正错别字和添加标点符号", value=False)
640
+ user_token = gr.Textbox(value='', placeholder="OpenAI API Key", type="password", visible=False,
641
+ label="输入你的 OpenAI API Key. 你可以从这里(https://platform.openai.com/account/api-keys)获取.\
642
+ \n⚠ 注意!使用ChatGPT来处理文本会消耗大量的tokens,免费版用户谨慎使用!")
643
+ apply_correction = gr.Checkbox(label="使用pycorrector纠正错别字", value=False)
644
+ auto_punc = gr.Checkbox(label="使用paddle auto punc自动添加标点符号", value=False)
645
+ separator = gr.Text(label="使用统一的标点符号(比如逗号,或换行\\n)", value=",")
646
+ remove_words = gr.Text(label="去掉的语气助词", value="呢,啊,哦,嗯,嘛,吧,呀,哈,哇,呐,噢,嘞,嘛")
647
  submit_btn = gr.Button("提交")
648
  with gr.Column():
649
  output_text = gr.TextArea(label="输出文本", interactive=True).style(show_copy_button=True)
 
652
  target_word = gr.Text(label="替换后的字符")
653
  replace_btn = gr.Button("替换")
654
  copy_btn = gr.Button("复制到剪贴板")
655
+ input_text.change(num_tokens_from_messages, inputs=[input_text], outputs=[tokens_count])
656
+ auto_punc.change(switch_punc_method, inputs=[use_chatgpt, auto_punc], outputs=[tokens_count, user_token, separator])
657
+ use_chatgpt.change(switch_punc_method, inputs=[use_chatgpt, auto_punc], outputs=[tokens_count, user_token, separator])
658
+ user_token.change(on_token_change, inputs=[user_token], outputs=[])
659
+ submit_btn.click(post_processing, inputs=[input_text, use_chatgpt, user_token, apply_correction, auto_punc, separator, remove_words], outputs=output_text)
660
  replace_btn.click(replace, inputs=[output_text, src_word, target_word], outputs=output_text)
661
  copy_btn.click(copy_text, inputs=output_text)
662
+
663
  demo = gr.TabbedInterface([simple_transcribe, full_transcribe, test_postprocess], tab_names=["Simple", "Full", "Text Postprocess"])
664
 
665
  # Queue up the demo
docs/text_postprocess.png ADDED
requirements.txt CHANGED
@@ -10,6 +10,8 @@ more_itertools
10
  pycorrector
11
  paddlepaddle == 2.4.0
12
  paddlehub
13
- aiobotocore
14
- botocore
15
- pyperclip
 
 
 
10
  pycorrector
11
  paddlepaddle == 2.4.0
12
  paddlehub
13
+ aiobotocore
14
+ botocore
15
+ pyperclip
16
+ openai
17
+ tiktoken
src/utils.py CHANGED
@@ -1,12 +1,14 @@
1
  import textwrap
2
  import unicodedata
3
  import re
4
-
5
  import zlib
6
  from typing import Iterator, TextIO, Union
7
  import tqdm
8
-
9
  import urllib3
 
 
10
 
11
 
12
  def exact_div(x, y):
@@ -242,4 +244,151 @@ def download_file(url: str, destination: str):
242
  break
243
 
244
  output.write(buffer)
245
- loop.update(len(buffer))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import textwrap
2
  import unicodedata
3
  import re
4
+ import time
5
  import zlib
6
  from typing import Iterator, TextIO, Union
7
  import tqdm
8
+ import pyperclip
9
  import urllib3
10
+ import openai
11
+ import tiktoken
12
 
13
 
14
  def exact_div(x, y):
 
244
  break
245
 
246
  output.write(buffer)
247
+ loop.update(len(buffer))
248
+
249
+ # -------------used for text post processing tab----------------
250
+ system_prompt = "You are a helpful assistant."
251
+ user_prompt = "请帮我把下面的文本纠正错别字并添加合适的标点符号,返回的消息只要处理后的文本:"
252
+
253
+ def get_chunks(s, maxlength, separator=None):
254
+ start = 0
255
+ end = 0
256
+ while start + maxlength < len(s) and end != -1:
257
+ if separator is not None:
258
+ end = s.rfind(separator, start, start + maxlength + 1)
259
+ segment = s[start:end]
260
+ yield segment.replace(separator, "")
261
+ start = end +1
262
+ else:
263
+ end = start + maxlength
264
+ yield s[start:end]
265
+ start = end
266
+
267
+ yield s[start:]
268
+
269
+ def post_processing(text, use_chatgpt, user_token, apply_correction, auto_punc, separator, remove_words):
270
+ # print(f"==>> separator: {separator}")
271
+ original_separator1 = " "
272
+ original_separator2 = ","
273
+
274
+ if use_chatgpt == True:
275
+ if user_token == "":
276
+ text = "请先设置你的OpenAI API Key,然后再重试"
277
+ return text
278
+ else:
279
+ text = chat_with_gpt(text, system_prompt, user_prompt)
280
+ return text
281
+ # 对于长文本需要先分段再推理,推理完再合并
282
+ elif auto_punc == True:
283
+ # 自动分段文本之前先去除原有的标点符号
284
+ text = text.replace(original_separator1, "")
285
+ text = text.replace(original_separator2, "")
286
+ import paddlehub as hub
287
+ model = hub.Module(name='auto_punc', version='1.0.0')
288
+ t3 = time.time()
289
+ # split long text to short text less than max_length and store them in list
290
+ max_length = 256
291
+ chunks = list(get_chunks(text, max_length))
292
+ results = []
293
+ results = model.add_puncs(chunks, max_length=max_length)
294
+ text = ",".join(results) # 分段处硬编码成使用中文逗号分割
295
+ t4 = time.time()
296
+ print("Auto punc finished. Cost time: {:.2f}s".format(t4-t3))
297
+ # print(f"==>> text after auto punc: {text}")
298
+ else:
299
+ # 将空格全部统一替换成一种分隔符
300
+ if separator == "\\n":
301
+ # 直接使用separator会无法换行
302
+ text = text.replace(original_separator1, "\n")
303
+ text = text.replace(original_separator2, "\n")
304
+ else:
305
+ text = text.replace(original_separator1, separator)
306
+ text = text.replace(original_separator2, separator)
307
+
308
+ if apply_correction == True:
309
+ import pycorrector
310
+ print("Start correcting...")
311
+ t1 = time.time()
312
+ text, detail = pycorrector.correct(text)
313
+ t2 = time.time()
314
+ print("Correcting finished. Cost time: {:.2f}s".format(t2-t1))
315
+ print(f"==>> detail: {detail}")
316
+
317
+ # 去掉语气词
318
+ t5 = time.time()
319
+ remove_words = remove_words.split(",") + remove_words.split(",") + remove_words.split(" ")
320
+ for word in remove_words:
321
+ text = text.replace(word, "")
322
+ t6 = time.time()
323
+ print("Remove words finished. Cost time: {:.2f}s".format(t6-t5))
324
+
325
+ return text
326
+
327
+ def replace(text, src_word, target_word):
328
+ text = text.replace(src_word, target_word)
329
+ return text
330
+
331
+ def copy_text(text):
332
+ pyperclip.copy(text)
333
+
334
def num_tokens_from_messages(message, model="gpt-3.5-turbo-0613"):
    """Return a display string "Tokens 计数: N" for the ChatGPT request built from *message*.

    Token accounting follows OpenAI's cookbook recipe for chat-format messages:
    a fixed per-message overhead, plus the encoded prompt (the module-level
    ``user_prompt`` prepended to *message*), plus the 3-token reply primer.

    Args:
        message: The raw user text to be sent to ChatGPT.
        model: Model name used to pick the tokenizer and per-message overhead.
            Defaulted so existing single-argument callers keep working; the
            fallback branches below override it on recursion.

    Returns:
        A markdown-friendly string "Tokens 计数: <count>".

    Raises:
        NotImplementedError: If *model* is not a recognized chat model family.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        print("Warning: model not found. Using cl100k_base encoding.")
        encoding = tiktoken.get_encoding("cl100k_base")
    if model in {
        "gpt-3.5-turbo-0613",
        "gpt-3.5-turbo-16k-0613",
        "gpt-4-0314",
        "gpt-4-32k-0314",
        "gpt-4-0613",
        "gpt-4-32k-0613",
    }:
        tokens_per_message = 3
        tokens_per_name = 1
    elif model == "gpt-3.5-turbo-0301":
        tokens_per_message = 4  # every message follows <|start|>{role/name}\n{content}<|end|>\n
        tokens_per_name = -1  # if there's a name, the role is omitted
    elif "gpt-3.5-turbo" in model:
        print("Warning: gpt-3.5-turbo may update over time. Returning num tokens assuming gpt-3.5-turbo-0613.")
        # BUGFIX: the original signature had no ``model`` parameter, so this
        # recursive call raised TypeError; ``model`` is now a keyword argument.
        return num_tokens_from_messages(message, model="gpt-3.5-turbo-0613")
    elif "gpt-4" in model:
        print("Warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-0613.")
        return num_tokens_from_messages(message, model="gpt-4-0613")
    else:
        raise NotImplementedError(
            f"""num_tokens_from_messages() is not implemented for model {model}. See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens."""
        )
    num_tokens = 0
    num_tokens += tokens_per_message
    # The actual request sends user_prompt + message as one user turn.
    message = user_prompt + message
    num_tokens += len(encoding.encode(message))
    num_tokens += 3  # every reply is primed with <|start|>assistant<|message|>
    return f"Tokens 计数: {num_tokens}"
371
+
372
+ def on_token_change(user_token):
373
+ openai.api_key = user_token
374
+
375
+ def chat_with_gpt(input_message, system_prompt, user_prompt, temperature=0, max_tokens=4096):
376
+ system_content = [{ "role": "system", "content": system_prompt }]
377
+ user_content = [{ "role": "user", "content": user_prompt + input_message }]
378
+ try:
379
+ completion = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=system_content + user_content, temperature=temperature, max_tokens=max_tokens)
380
+ response_msg = completion.choices[0].message['content']
381
+
382
+ prompt_tokens = completion['usage']['prompt_tokens']
383
+ completion_tokens = completion['usage']['completion_tokens']
384
+ total_tokens = completion['usage']['total_tokens']
385
+ print(f"==>> prompt_tokens: {prompt_tokens}")
386
+ print(f"==>> completion_tokens: {completion_tokens}")
387
+ print(f"==>> total_tokens: {total_tokens}")
388
+ return response_msg
389
+
390
+ except Exception as e:
391
+ return f"Error: {e}"
392
+
393
+
394
+ # -------------used for text post processing tab----------------
src/vad.py CHANGED
@@ -203,8 +203,8 @@ class AbstractTranscription(ABC):
203
  # Detected language
204
  detected_language = languageCounter.most_common(1)[0][0] if len(languageCounter) > 0 else None
205
 
206
- print("Running whisper from ", format_timestamp(segment_start), " to ", format_timestamp(segment_end), ", duration: ",
207
- segment_duration, "expanded: ", segment_expand_amount, "prompt: ", segment_prompt, "language: ", detected_language)
208
 
209
  perf_start_time = time.perf_counter()
210
 
@@ -213,7 +213,7 @@ class AbstractTranscription(ABC):
213
  segment_result = whisperCallable.invoke(segment_audio, segment_index, segment_prompt, detected_language, progress_listener=scaled_progress_listener)
214
 
215
  perf_end_time = time.perf_counter()
216
- print("Whisper took {} seconds".format(perf_end_time - perf_start_time))
217
 
218
  adjusted_segments = self.adjust_timestamp(segment_result["segments"], adjust_seconds=segment_start, max_source_time=segment_duration)
219
 
 
203
  # Detected language
204
  detected_language = languageCounter.most_common(1)[0][0] if len(languageCounter) > 0 else None
205
 
206
+ # print("Running whisper from ", format_timestamp(segment_start), " to ", format_timestamp(segment_end), ", duration: ",
207
+ # segment_duration, "expanded: ", segment_expand_amount, "prompt: ", segment_prompt, "language: ", detected_language)
208
 
209
  perf_start_time = time.perf_counter()
210
 
 
213
  segment_result = whisperCallable.invoke(segment_audio, segment_index, segment_prompt, detected_language, progress_listener=scaled_progress_listener)
214
 
215
  perf_end_time = time.perf_counter()
216
+ # print("Whisper took {} seconds".format(perf_end_time - perf_start_time))
217
 
218
  adjusted_segments = self.adjust_timestamp(segment_result["segments"], adjust_seconds=segment_start, max_source_time=segment_duration)
219