Spaces:

3syunojingi
/

Whisper_Youtube

Running

File size: 10,835 Bytes

import math
import os
import re
import shutil
from urllib.parse import parse_qs, urlparse

import gradio as gr
from openai import OpenAI
from pydub import AudioSegment
from yt_dlp import YoutubeDL

# 定数
MAX_LENGTH = 50000
# MODEL_SUMMARY = "gpt-3.5-turbo-0125"
gpt_model = ["gpt-4o", "gpt-3.5-turbo-0125"]
gpt_model_df = "gpt-4o"

# 分割時間設定（20分）
split_time = 20 * 60 * 1000

lang_code = {'Japanese': "ja", 'English': "en"}

def set_state(openai_key, lang, model_sum, state):
    """ 設定タブの情報をセッションに保存する関数 """

    state["openai_key"]= openai_key
    state["lang"] = lang_code[lang]
    state["model_sum"] = model_sum

    return state

def youtube_mp3(url):

  # ファイル名は動画のIDに
  mp3_name = url[url.rfind('/') + 1:].replace("watch?v=","")


  # フォーマット設定
  ydl_opts = {
      'outtmpl': mp3_name,
      "format": "mp3/bestaudio/best",
      "postprocessors": [
          {
              "key": "FFmpegExtractAudio",
              "preferredcodec": "mp3",
          }
      ],
  }

  try :
      # yt-dlpでmp3に
      with YoutubeDL(ydl_opts) as ydl:
          result = ydl.download([url])

      return mp3_name + ".mp3"

  except Exception as e:
      print(e)
      return None


def create_textfile(url, file, state):

    err_msg = ""

    # OpenAIキーチェック
    if state["openai_key"] == "":

        err_msg = "OpenAIキーを入力してください。（設定タブ）"

        return None, err_msg

    # URL入力チェック
    if url == "" and file is None:

        err_msg = "URLを入力するか、音声ファイルをアップして下さい。"

        return None, err_msg

    # OpenAIクライアント作成
    os.environ["OPENAI_API_KEY"] = state["openai_key"]

    client = OpenAI()

    # client作成後は消す
    os.environ["OPENAI_API_KEY"] = ""

    # 動画を音声ファイルにする
    if url != "":

        file_name = youtube_mp3(url)

        if file_name is None:

            err_msg = "音声ファイルにする作業でエラーが発生しました。URLを確認して下さい。"

            return None, err_msg

        # ファイルを絶対パスに
        file_path = os.getcwd() + "/" + file_name

    else:

        # ファイル名のみ取得
        file_name = os.path.basename(file)

        # ファイルを移動
        file_path = os.getcwd() + "/" + file_name
        os.replace(file, file_path)

    # 音声ファイルを分割
    audio_list = audio_seg(file_path)

    # whisperで文字に起こす
    text_list = whisper_audio(client, audio_list, state["lang"])

    if text_list is None:

        err_msg = "whisperでエラーが発生しました。OpenAI APIキーが正しいか、クレジット残高があるか確認して下さい。"

        return None, err_msg

    text_name = rename_audio(file_name)
    # text_name = file_name.replace(".mp3",".txt")

    # テキストファイルを結合
    if len(text_list) > 1:
        combin_text(text_list, text_name)

    return text_name, err_msg


def rename_audio(audio_name):

      text_name = ""

      if ".mp3" in audio_name:
          text_name = audio_name.replace(".mp3",".txt")
      elif ".MP3" in audio_name:
          text_name = audio_name.replace(".MP3",".txt")
      elif ".wav" in audio_name:
          text_name = audio_name.replace(".wav",".txt")
      elif ".WAV" in audio_name:
          text_name = audio_name.replace(".WAV",".txt")

      if text_name == "":
          text_name = audio_name

      return text_name

def whisper_audio(client, audio_list, lang):

    text_list = []

    try:

        for audio in audio_list:

              audio_file= open(audio, "rb")

              trans_text = client.audio.transcriptions.create(model="whisper-1", file=audio_file, language=lang, response_format="text")

              # print(audio)

              text_name = rename_audio(audio)

              # print(text_name)

              with open(text_name, mode="w") as f:

                  # テキストに書き出す
                  f.write(trans_text)

              text_list.append(text_name)

        return text_list

    except Exception as e:

        print(e)
        return None


def audio_seg(file_path):

    # 分割したリスト
    div_file = []

    # ファイル名取得
    file_name = os.path.basename(file_path)

    # ファイルから音声情報取得
    audio = AudioSegment.from_mp3(file_path)

    # 音声の長さ取得（ミリ秒）
    dur_mlseconds = audio.duration_seconds * 1000

    # 分割数を決める
    div_count = math.ceil(dur_mlseconds / split_time)

    if div_count == 1:

        # 分割なしの時は元ファイルのみ
        div_file.append(file_name)

        return div_file

    # 分割時間初期設定
    start = 0
    end = split_time

    for i in range(div_count):

        div_audio = audio[start:end]
        div_audio.export(str(i+1) + "_" + file_name, format="mp3")

        # ファイル名をセット
        div_file.append(str(i+1) + "_" + file_name)

        start = end
        end += split_time

    return div_file


def combin_text(text_list, text_name):

    # ファイルを一つにまとめる
    with open(text_name, "w") as of:
        for file in text_list:
            with open(file, "r") as f:
                of.write(f.read())

                # 必要であれば改行を加える
                # outfile.write(infile.read() + "\n")

    return

def create_mp3(url, state):

    err_msg = ""

    # URL入力チェック
    if url == "":

        err_msg = "URLを入力して下さい。"

        return None, err_msg

    # 動画を音声ファイルにする
    file_name = youtube_mp3(url)

    if file_name is None:

        err_msg = "音声ファイルにする作業でエラーが発生しました。URLを確認して下さい。"

        return None, err_msg

    return file_name, ""

def create_summary(file, state):

    err_msg = ""

    # OpenAIキーチェック
    if state["openai_key"] == "":

        err_msg = "OpenAIキーを入力してください。（設定タブ）"

        return None, err_msg

    # ファイルチェック
    if file is None:

        err_msg = "テキストファイルがありません。"

        return None, err_msg

    # OpenAIクライアント作成
    os.environ["OPENAI_API_KEY"] = state["openai_key"]

    client = OpenAI()

    # client作成後は消す
    os.environ["OPENAI_API_KEY"] = ""

    summary, err_msg = exec_summary(client, state["model_sum"], file)

    if err_msg != "":

        return None, err_msg

    sum_file = "summary_" + os.path.basename(file)

    with open(sum_file, mode="w") as f:

        f.write(summary)

    return sum_file, ""


def exec_summary(client, model_sum, file):

    try:

        with open(file, 'r') as f:

            text = f.read()

        if len(text) > MAX_LENGTH:

            err_msg = "要約の文字数上限を超えています。"

            return "", err_msg

        messages=[
              {"role": "system", "content": "あなたは優秀なアシスタントです。与えられた文章を要約して下さい。"},
              {"role": "user", "content": text},
          ]

        # GPTに問い合わせ
        response = client.chat.completions.create(
            model=model_sum,
            messages=messages,
            # max_tokens=MAX_TOKENS,
        )

        summary = response.choices[0].message.content

        return summary, ""

    except Exception as e:
        print(e)
        return "", "要約作成でエラーが発生しました。"


with gr.Blocks() as demo:

    title = "<h2>Whisperデモアプリ【応用版】</h2>"
    message = "<h3>最初に[設定]タブからOpenAIキーを入力してください。<br>"
    message += '※URLは"https://www.youtube.com/watch?v=XXXXXXX"の形式で指定して下さい。</h3>'

    gr.Markdown(title + message)

    # セッションの宣言
    state = gr.State({
        "openai_key" : ""
        ,"lang": ""
        ,"model_sum":""
    })

    with gr.Tab("whisperを利用する") as main:

      # 各コンポーネント定義
      url = gr.Text(label="YouTubeのURL")

      with gr.Accordion(label="音声ファイルをアップする", open=False):
          up_file = gr.File(file_types=[".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm"], label="音声ファイルアップロード")

      # ボタン類
      with gr.Row():
          btn_txt = gr.Button("テキスト作成")
          btn_mp3 = gr.Button("MP3作成")
          # clear = gr.ClearButton(value="リセット", components=[url, sys_msg])

      # 出力
      sys_msg = gr.Text(label="システムメッセージ")
      # text = gr.TextArea(label="文字起こし内容")
      out_file = gr.File(label="出力テキストファイル", interactive = False)

      with gr.Accordion(label="テキストを要約する", open=False):
          btn_sum = gr.Button("テキスト要約")
          sum_file = gr.File(label="要約テキストファイル", interactive = False)

      # 送信ボタンクリック時の処理
      btn_txt.click(create_textfile, inputs=[url, up_file, state], outputs=[out_file, sys_msg])
      btn_mp3.click(create_mp3, inputs=[url, state], outputs=[out_file, sys_msg])
      btn_sum.click(create_summary, inputs=[out_file, state], outputs=[sum_file, sys_msg])

    with gr.Tab("設定") as set:
      openai_key = gr.Textbox(label="OpenAI API Key", elem_id="openai_key",interactive = True)
      lang = gr.Dropdown(choices=["Japanese", "English"], value = "Japanese", label="言語", interactive = True)
      model_sum = gr.Dropdown(choices=gpt_model, value =gpt_model_df, label="要約モデル", interactive = True)

      # 設定変更時
      main.select(set_state, [openai_key, lang, model_sum, state], state)

    with gr.Tab("利用上の注意"):

      caution = '利用上の注意<br>・URLとファイルが両方ある場合はURLが優先されます。<br>'
      caution += "・Whisperの料金は1分あたり0.006ドル（約0.9円）です。MP3作成のみは無料です。<br>"
      caution += "・広告など一部の動画は文字に起こすことはできません。<br>"
      caution += "・要約で利用するGPTのモデルはGPT-4かGPT3.5を選べます。（gpt-4oかgpt-3.5-turbo）<br>"
      caution += "・要約は50000字までの上限があり、利用の際に料金が発生します。（1000文字あたりGPT-4は約0.2円、GPT3.5は約0.2円）<br>"
      caution += "<br>免責事項<br>本アプリはOpenAIのAPIで製作されており、利用で生じた損害について一切の責任を負えません。"
      gr.Markdown("<h3>" + caution + "</h3>")

# Enable the request queue on the app, then start it with debug mode off.
demo.queue()
demo.launch(debug=False)