nickmuchi committed
Commit cc8b68a · verified · 1 Parent(s): f616749

Create functions.py

Files changed (1)
  1. functions.py +204 -0
functions.py ADDED
@@ -0,0 +1,204 @@
+ import os
+ import json
+ import requests
+ import feedparser
+ import wikipedia
+ import streamlit as st
+ from pathlib import Path
+ from pydub import AudioSegment
+ from openai import OpenAI
+
+ # Single OpenAI client shared by the Whisper transcription and chat-completion helpers
+ client = OpenAI()
+
+ # Legacy version using the pre-1.0 OpenAI SDK, kept commented out:
+ # def load_whisper_api(audio):
+
+ #     '''Transcribe YT audio to text using Open AI API'''
+
+ #     import openai
+ #     file = open(audio, "rb")
+ #     transcript = openai.Audio.translate("whisper-1", file)
+
+ #     return transcript
+
+ @st.cache_data
+ def load_whisper_api(audio):
+
+     '''Transcribe podcast audio to text using the OpenAI Whisper API'''
+
+     # With response_format="text" the API returns the transcript as a plain string
+     with open(audio, "rb") as file:
+         transcript = client.audio.transcriptions.create(model="whisper-1", file=file, response_format="text")
+
+     return transcript
+
+ @st.cache_data
+ def get_transcribe_podcast(rss_url, local_path):
+
+     '''Download the latest episode from an RSS feed and transcribe it'''
+
+     st.info("Starting Podcast Transcription Function...")
+     print("Feed URL: ", rss_url)
+     print("Local Path:", local_path)
+
+     # Download the podcast episode by parsing the RSS feed
+     feed = feedparser.parse(rss_url)
+     episode_url = feed.entries[0].enclosures[0].href
+     # Assumes the enclosure is an MP3; it is saved under a fixed local filename
+     episode_name = "podcast_episode.mp3"
+
+     p = Path(local_path)
+     p.mkdir(exist_ok=True)
+     episode_path = p.joinpath(episode_name)
+
+     st.info("Downloading the podcast episode...")
+
+     with requests.get(episode_url, stream=True) as r:
+         r.raise_for_status()
+
+         with open(episode_path, 'wb') as f:
+             for chunk in r.iter_content(chunk_size=8192):
+                 f.write(chunk)
+
+     st.info("Podcast Episode downloaded")
+
+     # Perform the transcription
+     st.info("Starting podcast transcription")
+
+     audio_file = str(episode_path)
+
+     # Get size of the audio file in MB
+     audio_size = round(os.path.getsize(audio_file) / (1024 * 1024), 1)
+
+     # The Whisper API accepts files up to 25 MB; anything larger is chunked first
+     if audio_size <= 25:
+
+         # Use the Whisper API directly
+         results = load_whisper_api(audio_file)
+
+     else:
+
+         st.info('File size larger than 25 MB, applying chunking and transcription')
+
+         song = AudioSegment.from_file(audio_file, format='mp3')
+
+         # PyDub handles time in milliseconds
+         twenty_minutes = 20 * 60 * 1000
+
+         # Slice the audio into 20-minute chunks and transcribe each one
+         chunks = song[::twenty_minutes]
+
+         transcriptions = []
+
+         for i, chunk in enumerate(chunks):
+             chunk.export(f'chunk_{i}.mp3', format='mp3')
+             transcriptions.append(load_whisper_api(f'chunk_{i}.mp3'))
+
+         results = ' '.join(transcriptions)
+
+     # Return the transcribed text
+     st.info("Podcast transcription completed, returning results...")
+
+     return results
+
+ @st.cache_data
+ def get_podcast_summary(podcast_transcript):
+
+     '''Summarize the podcast transcript for a busy professional'''
+
+     instructPrompt = """
+     You are a podcast analyst. Summarize the key points of the podcast for a busy professional,
+     highlighting the most important takeaways so that they get a sufficient overview of the episode.
+     Include any questions you consider important or any points that warrant further investigation.
+
+     Please use bullet points.
+
+     """
+
+     request = instructPrompt + podcast_transcript
+
+     chatOutput = client.chat.completions.create(model="gpt-3.5-turbo-16k",
+                                                 messages=[{"role": "system", "content": "You are a helpful assistant."},
+                                                           {"role": "user", "content": request}
+                                                           ]
+                                                 )
+
+     podcastSummary = chatOutput.choices[0].message.content
+
+     return podcastSummary
+
+ @st.cache_data
+ def get_podcast_guest(podcast_transcript):
+
+     '''Get the guest's name, professional title and organization from the transcript'''
+
+     completion = client.chat.completions.create(
+         model="gpt-3.5-turbo-16k",
+         messages=[{"role": "user", "content": podcast_transcript}],
+         functions=[
+             {
+                 "name": "get_podcast_guest_information",
+                 "description": "Get information on the podcast guest using their full name and the name of the organization they are part of to search for them on Wikipedia or Google",
+                 "parameters": {
+                     "type": "object",
+                     "properties": {
+                         "guest_name": {
+                             "type": "string",
+                             "description": "The full name of the guest who is being interviewed in the podcast",
+                         },
+                         "guest_organization": {
+                             "type": "string",
+                             "description": "The name or details of the organization that the podcast guest belongs to, works for or runs",
+                         },
+                         "guest_title": {
+                             "type": "string",
+                             "description": "The title, designation or role that the podcast guest holds in the organization, or the type of work they do there",
+                         },
+                     },
+                     "required": ["guest_name"],
+                 },
+             }
+         ],
+         function_call={"name": "get_podcast_guest_information"}
+     )
+
+     podcast_guest = ""
+     podcast_guest_org = ""
+     podcast_guest_title = ""
+     response_message = completion.choices[0].message
+
+     # The model is forced to call the function, so parse the returned arguments
+     if response_message.function_call:
+         function_args = json.loads(response_message.function_call.arguments)
+         podcast_guest = function_args.get("guest_name")
+         podcast_guest_org = function_args.get("guest_organization")
+         podcast_guest_title = function_args.get("guest_title")
+
+     return (podcast_guest, podcast_guest_org, podcast_guest_title)
+
+ @st.cache_data
+ def get_podcast_highlights(podcast_transcript):
+
+     '''Extract key moments from the podcast transcript'''
+
+     instructPrompt = """
+     Extract some key moments in the podcast. These are typically interesting insights from the guest or critical questions that the host might have put forward. It could also be a discussion on a hot topic or a controversial opinion.
+     """
+     request = instructPrompt + podcast_transcript
+
+     chatOutput = client.chat.completions.create(model="gpt-3.5-turbo-16k",
+                                                 messages=[{"role": "system", "content": "You are a helpful assistant."},
+                                                           {"role": "user", "content": request}
+                                                           ]
+                                                 )
+
+     podcastHighlights = chatOutput.choices[0].message.content
+
+     return podcastHighlights
+
+ @st.cache_data
+ def process_podcast(url, path):
+
+     '''Run the full pipeline and collect the results into a dictionary'''
+
+     output = {}
+     podcast_details = get_transcribe_podcast(url, path)
+     podcast_summary = get_podcast_summary(podcast_details)
+     podcast_guest_details = get_podcast_guest(podcast_details)
+     podcast_highlights = get_podcast_highlights(podcast_details)
+     output['podcast_details'] = podcast_details
+     output['podcast_summary'] = podcast_summary
+     output['podcast_guest'] = podcast_guest_details[0]
+     output['podcast_guest_org'] = podcast_guest_details[1]
+     output['podcast_guest_title'] = podcast_guest_details[2]
+     output['podcast_highlights'] = podcast_highlights
+
+     return output
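
A minimal sketch of how a Streamlit front end might call process_podcast from this file (this caller is not part of the commit; the widget labels and the ./content/ download directory are assumptions):

    import streamlit as st
    from functions import process_podcast

    st.title("Podcast Summarizer")
    rss_url = st.text_input("Podcast RSS feed URL")

    if st.button("Process podcast") and rss_url:
        # Downloads, transcribes and analyses the latest episode of the feed
        info = process_podcast(rss_url, "./content/")
        st.subheader("Summary")
        st.write(info['podcast_summary'])
        st.subheader("Guest")
        st.write(f"{info['podcast_guest']}, {info['podcast_guest_title']}, {info['podcast_guest_org']}")
        st.subheader("Highlights")
        st.write(info['podcast_highlights'])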