File size: 8,420 Bytes
ee377d8 efa4923 0064167 efa4923 972a238 0064167 972a238 efa4923 972a238 ee377d8 972a238 ee377d8 972a238 ee377d8 972a238 efa4923 972a238 efa4923 972a238 0064167 efa4923 972a238 efa4923 972a238 0064167 972a238 0064167 972a238 0064167 972a238 0064167 972a238 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 |
import streamlit as st
import moviepy.editor as mp
import speech_recognition as sr
from pydub import AudioSegment
import tempfile
import os
import io
import requests
from transformers import pipeline
import matplotlib.pyplot as plt
import librosa
import numpy as np
# Function to download file from URL
def download_file(url):
    """Stream the resource at ``url`` into a temporary file.

    Args:
        url: HTTP(S) address of the file to fetch.

    Returns:
        The path of the downloaded temporary file, or ``None`` when the
        download failed (the failure is reported with ``st.error``). The
        caller is responsible for deleting the returned file.
    """
    # Function-scope import so this block does not depend on the file header.
    from urllib.parse import urlparse

    temp_path = None
    try:
        # Derive the suffix from the URL *path* only; otherwise a query string
        # or fragment ("clip.mp3?token=abc") ends up inside the extension and
        # later extension checks on the returned path fail.
        extension = os.path.splitext(urlparse(url).path)[1].lower()
        with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as temp_file:
            temp_path = temp_file.name
            # (connect, read) timeouts: without them a stalled server would
            # block the app forever. The read timeout applies per socket
            # read, so large downloads are not cut short.
            with requests.get(url, stream=True, timeout=(10, 60)) as r:
                r.raise_for_status()
                for chunk in r.iter_content(chunk_size=8192):
                    temp_file.write(chunk)
        return temp_path
    except Exception as e:
        # The file was created with delete=False, so a partial download
        # must be removed by hand.
        if temp_path is not None:
            try:
                os.remove(temp_path)
            except OSError:
                pass
        st.error(f"Failed to download file: {e}")
        return None
# Function to convert video to audio
def video_to_audio(video_file):
    """Extract the audio track of a video into a temporary MP3 file.

    Args:
        video_file: Path of the video file to read.

    Returns:
        Path of the newly written ``.mp3`` file. The caller is responsible
        for deleting it.

    Raises:
        ValueError: If the video has no audio track.
    """
    video = mp.VideoFileClip(video_file)
    try:
        audio = video.audio
        if audio is None:
            # Without this guard the failure would be an opaque
            # AttributeError on None.
            raise ValueError("The video file does not contain an audio track.")

        # mkstemp creates the file atomically; the deprecated mktemp only
        # returns a name, which another process could claim first.
        fd, temp_audio_path = tempfile.mkstemp(suffix=".mp3")
        os.close(fd)
        try:
            audio.write_audiofile(temp_audio_path)
        except Exception:
            # Do not leave a partial file behind when encoding fails.
            try:
                os.remove(temp_audio_path)
            except OSError:
                pass
            raise
    finally:
        # Release the reader resources held by the clip.
        video.close()
    return temp_audio_path
# Function to convert MP3 to WAV
def convert_mp3_to_wav(mp3_file):
    """Convert an MP3 file into a temporary WAV file.

    Args:
        mp3_file: Path of the MP3 file to convert.

    Returns:
        Path of the newly written ``.wav`` file. The caller is responsible
        for deleting it.
    """
    audio = AudioSegment.from_mp3(mp3_file)

    # mkstemp creates the file atomically; the deprecated mktemp only
    # returns a name, which another process could claim first.
    fd, temp_wav_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    try:
        # export() returns the open output file object; close it so the
        # handle is not leaked and the file can be deleted later.
        audio.export(temp_wav_path, format="wav").close()
    except Exception:
        # Do not leave a partial file behind when the export fails.
        try:
            os.remove(temp_wav_path)
        except OSError:
            pass
        raise
    return temp_wav_path
# Function to transcribe audio with chunking for large files
def transcribe_audio(audio_file):
    """Transcribe a WAV file with the Google Web Speech recognizer.

    Audio of at most 60 seconds is sent in a single request. Longer audio is
    split into 60-second chunks that are transcribed one by one and joined
    with spaces.

    Args:
        audio_file: Path of the WAV file to transcribe.

    Returns:
        The transcription text. Recognition failures are not raised; they
        are reported in the returned text (a plain message for short audio,
        a bracketed placeholder per failed chunk for long audio).
    """
    audio = AudioSegment.from_wav(audio_file)
    chunk_length = 60  # seconds per recognition request
    chunk_ms = chunk_length * 1000  # pydub lengths and slices are in ms
    recognizer = sr.Recognizer()

    if len(audio) <= chunk_ms:
        with sr.AudioFile(audio_file) as source:
            audio_data = recognizer.record(source)
        try:
            return recognizer.recognize_google(audio_data)
        except sr.UnknownValueError:
            return "Audio could not be understood."
        except sr.RequestError:
            return "Could not request results from Google Speech Recognition service."

    # sr.AudioData has no channel count and treats its bytes as mono PCM, so
    # interleaved stereo frames would be misread. Down-mix first; this also
    # matches the short path, where sr.AudioFile down-mixes to mono.
    mono = audio.set_channels(1)
    sample_rate = mono.frame_rate
    sample_width = mono.sample_width

    transcriptions = []
    # Stepping over start offsets never yields an empty trailing chunk, which
    # "duration // chunk_length + 1" did whenever the duration was an exact
    # multiple of the chunk length.
    for start_time in range(0, len(mono), chunk_ms):
        chunk = mono[start_time:start_time + chunk_ms]  # slice end is clamped
        audio_data = sr.AudioData(chunk.raw_data, sample_rate, sample_width)
        try:
            transcriptions.append(recognizer.recognize_google(audio_data))
        except sr.UnknownValueError:
            transcriptions.append("[Audio could not be understood.]")
        except sr.RequestError:
            transcriptions.append("[Could not request results.]")
    return " ".join(transcriptions)
# Function to detect emotions
def detect_emotion(text):
    """Score ``text`` against every emotion label of the classifier.

    Args:
        text: The text to classify (here, a transcription).

    Returns:
        A dict mapping each emotion label to its score.
    """
    # NOTE(review): the pipeline is rebuilt on every call; caching it (for
    # example with Streamlit's resource cache) would avoid reloading the model.
    emotion_pipeline = pipeline(
        "text-classification",
        model="j-hartmann/emotion-english-distilroberta-base",
        return_all_scores=True,
    )
    # Transcriptions of long recordings can exceed the model's maximum input
    # length; truncation=True makes the tokenizer clip the text instead of
    # letting the model raise on an over-long sequence.
    result = emotion_pipeline(text, truncation=True)
    # For a single input string, result[0] holds one {label, score} dict per
    # emotion.
    return {emotion['label']: emotion['score'] for emotion in result[0]}
# Function to plot audio waveform
def plot_waveform(audio_data, duration=10):
    """Render the waveform of the start of an audio stream in the app.

    Args:
        audio_data: Seekable binary file-like object holding the audio.
        duration: Number of seconds, counted from the start, to load and
            plot.
    """
    audio_data.seek(0)
    # Local names avoid shadowing the module-level ``sr`` alias that the
    # file uses for speech_recognition.
    samples, sample_rate = librosa.load(audio_data, sr=None, duration=duration)
    # Leave the stream rewound for whoever reads it next.
    audio_data.seek(0)

    # Draw on an explicit figure rather than pyplot's global state, so the
    # figure can be handed to Streamlit and then released.
    fig, ax = plt.subplots(figsize=(10, 4))
    try:
        time_axis = np.linspace(0, len(samples) / sample_rate, len(samples))
        ax.plot(time_axis, samples)
        ax.set_title(f"Audio Waveform (first {duration} seconds)")
        ax.set_xlabel("Time (s)")
        ax.set_ylabel("Amplitude")
        st.pyplot(fig)
    finally:
        # Streamlit reruns the script on every interaction; without closing,
        # one figure per run would stay alive in pyplot's registry.
        plt.close(fig)
# Streamlit app layout
# Streamlit re-executes this script from the top on every widget interaction.
# Anything that must survive a rerun (the converted audio and its
# transcription) is therefore kept in st.session_state, not in local names.


def _remove_temp_files(paths):
    """Delete every path in ``paths`` that still exists on disk."""
    for path in paths:
        if path and os.path.exists(path):
            os.remove(path)


st.title("Video and Audio to Text Transcription with Emotion Detection and Visualization")
st.write("Upload a video or audio file, or provide a URL to a large file (up to 1GB).")
st.write("**Note:** Direct file uploads are limited to 200MB. For larger files, please provide a URL.")

tab = st.selectbox("Select file type", ["Video", "Audio"])

if tab == "Video":
    method = st.radio("Choose how to provide the video file:", ["Upload file", "Provide URL"])
    # Define both inputs up front so neither name is ever unbound.
    uploaded_file = None
    url = ""
    if method == "Upload file":
        uploaded_file = st.file_uploader("Upload Video", type=["mp4", "mov", "avi"])
    elif method == "Provide URL":
        url = st.text_input("Enter video URL")

    if st.button("Analyze Video"):
        if method == "Upload file" and uploaded_file:
            with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as tmp_file:
                tmp_file.write(uploaded_file.read())
                file_path = tmp_file.name
        elif method == "Provide URL" and url:
            with st.spinner("Downloading video... This may take a while for large files."):
                file_path = download_file(url)
            if file_path is None:
                st.error("Failed to download the file. Please check the URL and try again.")
                st.stop()
        else:
            st.error("Please provide a file or URL.")
            st.stop()

        # Every temp file is recorded as soon as it exists so the finally
        # clause removes it even when a later step raises.
        temp_paths = [file_path]
        try:
            # Process the video file
            with st.spinner("Processing video..."):
                audio_file = video_to_audio(file_path)
                temp_paths.append(audio_file)
                wav_audio_file = convert_mp3_to_wav(audio_file)
                temp_paths.append(wav_audio_file)
                transcription = transcribe_audio(wav_audio_file)

            st.text_area("Transcription", transcription, height=300)
            emotions = detect_emotion(transcription)
            st.write(f"Detected Emotions: {emotions}")

            with open(wav_audio_file, "rb") as f:
                audio_data = io.BytesIO(f.read())
            st.session_state.wav_audio_file = audio_data
            # Stored next to the audio: the download button below is rendered
            # on later reruns too, when the local name no longer exists.
            st.session_state.transcription = transcription
            plot_waveform(st.session_state.wav_audio_file)
        finally:
            # Cleanup
            _remove_temp_files(temp_paths)

    if 'wav_audio_file' in st.session_state:
        st.audio(st.session_state.wav_audio_file, format='audio/wav')
        st.download_button(
            "Download Transcription",
            st.session_state.get("transcription", ""),
            "transcription.txt",
            "text/plain",
        )
        st.download_button("Download Audio", st.session_state.wav_audio_file, "converted_audio.wav", "audio/wav")

elif tab == "Audio":
    method = st.radio("Choose how to provide the audio file:", ["Upload file", "Provide URL"])
    # Define both inputs up front so neither name is ever unbound.
    uploaded_file = None
    url = ""
    if method == "Upload file":
        uploaded_file = st.file_uploader("Upload Audio", type=["wav", "mp3"])
    elif method == "Provide URL":
        url = st.text_input("Enter audio URL")

    if st.button("Analyze Audio"):
        if method == "Upload file" and uploaded_file:
            with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3' if uploaded_file.type == "audio/mpeg" else '.wav') as tmp_file:
                tmp_file.write(uploaded_file.read())
                file_path = tmp_file.name
        elif method == "Provide URL" and url:
            with st.spinner("Downloading audio... This may take a while for large files."):
                file_path = download_file(url)
            if file_path is None:
                st.error("Failed to download the file. Please check the URL and try again.")
                st.stop()
        else:
            st.error("Please provide a file or URL.")
            st.stop()

        # A set, because for WAV input the source file and the "converted"
        # file are the same path and must be removed only once.
        temp_paths = {file_path}
        try:
            # Process the audio file
            with st.spinner("Processing audio..."):
                # Case-insensitive so ".MP3" is converted as well.
                if file_path.lower().endswith('.mp3'):
                    wav_audio_file = convert_mp3_to_wav(file_path)
                    temp_paths.add(wav_audio_file)
                else:
                    wav_audio_file = file_path
                transcription = transcribe_audio(wav_audio_file)

            st.text_area("Transcription", transcription, height=300)
            emotions = detect_emotion(transcription)
            st.write(f"Detected Emotions: {emotions}")

            with open(wav_audio_file, "rb") as f:
                audio_data = io.BytesIO(f.read())
            st.session_state.wav_audio_file_audio = audio_data
            # Stored next to the audio: the download button below is rendered
            # on later reruns too, when the local name no longer exists.
            st.session_state.transcription_audio = transcription
            plot_waveform(st.session_state.wav_audio_file_audio)
        finally:
            # Cleanup
            _remove_temp_files(temp_paths)

    if 'wav_audio_file_audio' in st.session_state:
        st.audio(st.session_state.wav_audio_file_audio, format='audio/wav')
        st.download_button(
            "Download Transcription",
            st.session_state.get("transcription_audio", ""),
            "transcription_audio.txt",
            "text/plain",
        )
        st.download_button("Download Audio", st.session_state.wav_audio_file_audio, "converted_audio_audio.wav", "audio/wav")