import hashlib
import json
import os
import tempfile
from pathlib import Path

import feedparser
import openai
import requests
import streamlit as st
import wikipedia
from openai import OpenAI
from pydub import AudioSegment

client = OpenAI()

# Upload size above which the audio is split into chunks before transcription.
_WHISPER_MAX_BYTES = 25 * 1024 * 1024
# Length of each chunk when splitting oversized audio (PyDub works in ms).
_CHUNK_LENGTH_MS = 20 * 60 * 1000
# (connect, read) timeout in seconds for the episode download.
_DOWNLOAD_TIMEOUT = (10, 60)
_CHAT_MODEL = "gpt-4-turbo-preview"


def _file_digest(path):
    """Return the SHA-256 hex digest of the file at *path*."""
    digest = hashlib.sha256()
    with open(path, "rb") as fh:
        for block in iter(lambda: fh.read(1024 * 1024), b""):
            digest.update(block)
    return digest.hexdigest()


@st.cache_data
def _transcribe_cached(content_digest, _audio_path):
    """Transcribe the file at *_audio_path*, cached by *content_digest*.

    The leading underscore on ``_audio_path`` excludes it from Streamlit's
    cache key, so only the file contents decide whether a cached
    transcript is reused.
    """
    with open(_audio_path, "rb") as audio_fh:
        return client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_fh,
            response_format="text",
        )


def load_whisper_api(audio):
    """Transcribe an audio file to text using the OpenAI Whisper API.

    Results are cached on the file's *contents* rather than its path: the
    download and chunk files reuse the same names for every podcast, so a
    path-keyed cache would hand back a previous episode's transcript.

    Args:
        audio: Path (``str`` or ``os.PathLike``) of the audio file.

    Returns:
        The transcript as returned by the API with ``response_format="text"``.
    """
    return _transcribe_cached(_file_digest(audio), audio)


@st.cache_data
def get_transcribe_podcast(rss_url, local_path='/data/'):
    """Download a podcast episode and return its transcription.

    Despite the parameter name, ``rss_url`` is fetched directly and saved as
    the episode audio; no RSS feed parsing happens here.

    Args:
        rss_url: URL the episode audio is downloaded from.
        local_path: Directory in which ``podcast_episode.mp3`` is stored.
            It is created if it does not exist.

    Returns:
        The transcribed text. For files above the upload limit the audio is
        split into 20-minute chunks whose transcripts are joined with ``,``.

    Raises:
        requests.HTTPError: If the download responds with an error status.
    """
    st.info("Starting Podcast Transcription Function...")
    print("Feed URL: ", rss_url)
    print("Local Path:", local_path)

    download_dir = Path(local_path)
    download_dir.mkdir(parents=True, exist_ok=True)

    # Download the podcast episode.
    st.info("Downloading the podcast episode...")
    episode_path = download_dir / "podcast_episode.mp3"
    print(f'episode path {episode_path}')
    with requests.get(rss_url, stream=True, timeout=_DOWNLOAD_TIMEOUT) as response:
        response.raise_for_status()
        with open(episode_path, 'wb') as out:
            for piece in response.iter_content(chunk_size=8192):
                out.write(piece)
    st.info("Podcast Episode downloaded")

    # Perform the transcription.
    st.info("Starting podcast transcription")
    audio_bytes = os.path.getsize(episode_path)
    print(f'audio size: {round(audio_bytes / (1024 * 1024), 1)}')

    # Compare exact bytes: a rounded megabyte figure would let files that are
    # slightly over the limit through.
    if audio_bytes <= _WHISPER_MAX_BYTES:
        results = load_whisper_api(episode_path)
    else:
        st.info('File size larger than 25 MB, applying chunking and transcription')
        song = AudioSegment.from_file(episode_path, format='mp3')
        transcriptions = []
        # Chunks live in a temporary directory so they neither clutter the
        # working directory nor collide between runs.
        with tempfile.TemporaryDirectory() as chunk_dir:
            for i, chunk in enumerate(song[::_CHUNK_LENGTH_MS]):
                chunk_path = os.path.join(chunk_dir, f'chunk_{i}.mp3')
                # export() returns an open file handle; close it right away.
                chunk.export(chunk_path, format='mp3').close()
                transcriptions.append(load_whisper_api(chunk_path))
        results = ','.join(transcriptions)

    st.info("Podcast transcription completed, returning results...")
    return results


@st.cache_data
def get_podcast_summary(podcast_transcript):
    """Return a bullet-point summary of the podcast transcript.

    Args:
        podcast_transcript: Full transcript text of the episode.

    Returns:
        The content of the first chat completion choice.
    """
    instruct_prompt = """
    You are a podcast analyst and your main task is to summarize the key and important points of the podcast for a busy professional by highlighting the main and important points
    to ensure the professional has a sufficient summary of the podcast. Include any questions you consider important or any points that warrant further investigation.
    Please use bulletpoints.
    """
    request = instruct_prompt + podcast_transcript
    chat_output = client.chat.completions.create(
        model=_CHAT_MODEL,
        messages=[
            {"role": "system", "content": "You are a helpful podcast analyzer assistant"},
            {"role": "user", "content": request},
        ],
    )
    return chat_output.choices[0].message.content


@st.cache_data
def get_podcast_guest(podcast_transcript):
    """Extract the guest's name, organization and title from the transcript.

    The model is forced to call ``get_podcast_guest_information`` so that the
    answer arrives as JSON arguments.

    Args:
        podcast_transcript: Full transcript text of the episode.

    Returns:
        A ``(guest_name, guest_organization, guest_title)`` tuple. Any field
        that is missing, null, or cannot be parsed is an empty string.
    """
    completion = client.chat.completions.create(
        model=_CHAT_MODEL,
        messages=[{"role": "user", "content": podcast_transcript}],
        functions=[
            {
                "name": "get_podcast_guest_information",
                "description": "Get information on the podcast guest using their full name and the name of the organization they are part of to search for them on Wikipedia or Google",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "guest_name": {
                            "type": "string",
                            "description": "The full name of the guest who is being interviewed in the podcast",
                        },
                        "guest_organization": {
                            "type": "string",
                            "description": "The name or details of the organization that the podcast guest belongs to, works for or runs",
                        },
                        "guest_title": {
                            "type": "string",
                            "description": "The title, designation or role the podcast guest holds or type of work that the podcast guest in the organization does",
                        },
                    },
                    "required": ["guest_name"],
                },
            }
        ],
        function_call={"name": "get_podcast_guest_information"},
    )

    function_call = completion.choices[0].message.function_call
    print(f'func res: {function_call}')
    if not function_call:
        return ("", "", "")

    try:
        function_args = json.loads(function_call.arguments)
    except (json.JSONDecodeError, TypeError):
        # Malformed arguments from the model: fall back to the empty defaults
        # instead of failing the whole pipeline.
        return ("", "", "")

    # `or ""` also covers keys that are present but null.
    return (
        function_args.get("guest_name") or "",
        function_args.get("guest_organization") or "",
        function_args.get("guest_title") or "",
    )


@st.cache_data
def get_podcast_highlights(podcast_transcript):
    """Return key moments of the podcast extracted by the chat model.

    Args:
        podcast_transcript: Full transcript text of the episode.

    Returns:
        The content of the first chat completion choice.
    """
    instruct_prompt = """
    Extract some key moments in the podcast. These are typically interesting insights from the guest or critical questions that the host might have put forward.
    It could also be a discussion on a hot topic or controversial opinion
    """
    request = instruct_prompt + podcast_transcript
    chat_output = client.chat.completions.create(
        model=_CHAT_MODEL,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            # Send the instructions together with the transcript; sending the
            # bare transcript gives the model nothing to act on.
            {"role": "user", "content": request},
        ],
    )
    return chat_output.choices[0].message.content


@st.cache_data
def process_podcast(url, path='/data/'):
    """Transcribe a podcast and collect every derived artefact in one dict.

    Args:
        url: URL the episode audio is downloaded from.
        path: Directory used for the downloaded episode.

    Returns:
        A dict with the keys ``podcast_details`` (the transcript),
        ``podcast_summary``, ``podcast_guest``, ``podcast_guest_org``,
        ``podcast_guest_title`` and ``podcast_highlights``.
    """
    transcript = get_transcribe_podcast(url, path)
    summary = get_podcast_summary(transcript)
    guest, guest_org, guest_title = get_podcast_guest(transcript)
    highlights = get_podcast_highlights(transcript)
    return {
        'podcast_details': transcript,
        'podcast_summary': summary,
        'podcast_guest': guest,
        'podcast_guest_org': guest_org,
        'podcast_guest_title': guest_title,
        'podcast_highlights': highlights,
    }