HARISH20205 committed on
Commit
fe98768
·
1 Parent(s): e0e1c9c
Files changed (6) hide show
  1. Dockerfile +26 -0
  2. app.py +206 -0
  3. requirements.txt +9 -0
  4. static/css/styles.css +151 -0
  5. static/js/scripts.js +67 -0
  6. templates/index.html +42 -0
Dockerfile ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.10-slim

WORKDIR /app

# System packages: ffmpeg is required by the FFmpegExtractAudio
# post-processor configured in app.py; build-essential and libsndfile1
# support building/loading the Python audio dependencies.
RUN apt-get update && apt-get install -y --no-install-recommends \
    ffmpeg \
    build-essential \
    libsndfile1 \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Copy the requirements first so the dependency layer is cached
# independently of application code changes.
COPY requirements.txt .

RUN pip install --no-cache-dir -r requirements.txt

COPY . .

RUN mkdir -p /app/static/audio

EXPOSE 7860

ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    MODEL_NAME="google/pegasus-xsum"

# gunicorn restarts a worker that stays silent longer than --timeout
# (default 30 s). Loading the models and transcribing audio inside a single
# request takes far longer than that, so the limit is raised explicitly.
CMD ["gunicorn", "--bind", "0.0.0.0:7860", "--timeout", "600", "app:app"]
app.py ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from flask import Flask, request, jsonify, render_template
2
+ import whisper
3
+ from pydub import AudioSegment
4
+ import os
5
+ import io
6
+ import numpy as np
7
+ from transformers import PegasusForConditionalGeneration, PegasusTokenizer
8
+ import math
9
+ from yt_dlp import YoutubeDL
10
+ import logging
11
+ from functools import lru_cache
12
+ from dotenv import load_dotenv
13
+ import time
14
+ import re
15
+ import tempfile
16
+
17
# Load variables from a local .env file (if any) before configuration is read
# from the environment below.
load_dotenv()

app = Flask(__name__)

# Configure logging
logging.basicConfig(level=logging.INFO)

# Model setup: honour the MODEL_NAME environment variable (the Dockerfile
# exports it) and fall back to the original checkpoint when it is unset or
# empty.
MODEL_NAME = os.getenv("MODEL_NAME") or "google/pegasus-xsum"
26
+
27
+ # Function to convert audio bytes to MP3 format in memory
28
def convert_audio_to_mp3(audio_bytes, original_format=None):
    """Re-encode raw audio bytes as MP3 without touching the disk.

    Args:
        audio_bytes: The encoded audio payload.
        original_format: Format name handed to ``AudioSegment.from_file``
            (``None`` is passed through unchanged).

    Returns:
        An ``io.BytesIO`` holding the MP3 data, positioned at offset 0.

    Raises:
        ValueError: If decoding or encoding fails for any reason.
    """
    try:
        logging.info(f"Converting audio from {original_format} to MP3 in memory...")
        source_stream = io.BytesIO(audio_bytes)
        segment = AudioSegment.from_file(source_stream, format=original_format)
        mp3_stream = io.BytesIO()
        segment.export(mp3_stream, format="mp3")
        # Rewind so callers can read the result from the beginning.
        mp3_stream.seek(0)
    except Exception as e:
        logging.error(f"Error converting audio to MP3: {e}")
        raise ValueError(f"Error converting audio to MP3: {e}")
    else:
        logging.info("Conversion successful")
        return mp3_stream
40
+
41
+ # Function to load Whisper model
42
@lru_cache(maxsize=1)
def load_whisper_model():
    """Return the Whisper "base" model; the cache makes later calls reuse it."""
    whisper_model = whisper.load_model("base")
    return whisper_model
45
+
46
+ # Function to load Pegasus model
47
@lru_cache(maxsize=1)
def load_pegasus_model():
    """Return the ``(tokenizer, model)`` pair for ``MODEL_NAME``.

    The result is cached, so the checkpoint is only loaded on the first call.
    """
    return (
        PegasusTokenizer.from_pretrained(MODEL_NAME),
        PegasusForConditionalGeneration.from_pretrained(MODEL_NAME),
    )
52
+
53
+ # Function to transcribe audio using Whisper
54
def transcribe_audio_with_whisper(audio_data):
    """Transcribe audio with the cached Whisper model.

    Args:
        audio_data: Either an ``io.BytesIO`` or a bytes-like object holding
            the encoded audio.

    Returns:
        The ``"text"`` entry of the Whisper transcription result.

    Raises:
        ValueError: If anything fails while writing or transcribing the audio.
    """
    try:
        logging.info("Transcribing audio data")
        whisper_model = load_whisper_model()

        # Normalise the input to raw bytes up front.
        payload = audio_data.getvalue() if isinstance(audio_data, io.BytesIO) else audio_data

        # The model is given a file path, so spill the bytes to a temporary
        # file that is removed as soon as the block exits.
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=True) as scratch:
            scratch.write(payload)
            scratch.flush()  # make sure the data is on disk before it is read back
            outcome = whisper_model.transcribe(scratch.name)

        return outcome["text"]
    except Exception as e:
        logging.error(f"Error in audio transcription: {e}")
        raise ValueError(f"Error in audio transcription: {e}")
74
+
75
+ # Function to summarize text using Pegasus
76
def summarize_text_with_pegasus(text, tokenizer, model):
    """Summarize ``text`` with the given Pegasus tokenizer and model.

    The text is tokenized with truncation enabled, so only the part that fits
    the tokenizer's limit contributes to the summary. The generated summary
    is bounded to roughly a quarter to a third of the input token count, with
    floors of 75 and 200 tokens respectively; the lower bound is additionally
    capped at the input length so a short transcript is never forced into a
    summary longer than itself.

    Args:
        text: The transcript to summarize.
        tokenizer: Tokenizer used to encode ``text`` and decode the summary.
        model: Model whose ``generate`` method produces the summary ids.

    Returns:
        The decoded summary with repeated sentences removed.

    Raises:
        ValueError: If tokenization, generation or decoding fails.
    """
    try:
        inputs = tokenizer(text, truncation=True, padding="longest", return_tensors="pt")
        total_tokens = len(inputs["input_ids"][0])

        max_summary_length = max(math.ceil(total_tokens / 3), 200)
        # Cap the minimum at the input length: without the cap, inputs
        # shorter than 75 tokens would require the model to invent text.
        min_summary_length = min(max(math.ceil(total_tokens / 4), 75), total_tokens)

        # Defensive guard: generation needs min_length < max_length.
        if min_summary_length >= max_summary_length:
            min_summary_length = max_summary_length - 1

        summary_ids = model.generate(
            inputs.input_ids,
            num_beams=5,
            min_length=min_summary_length,
            max_length=max_summary_length,
            early_stopping=True
        )

        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        # Drop sentences that the model emitted more than once.
        return remove_repeated_sentences(summary)
    except Exception as e:
        logging.error(f"Error in text summarization: {e}")
        raise ValueError(f"Error in text summarization: {e}") from e
100
+
101
+ # Function to download audio from YouTube using yt_dlp (in memory)
102
# Domains the downloader is willing to contact; the URL comes straight from
# the request form, so everything else is rejected before yt-dlp sees it.
_YOUTUBE_DOMAINS = ("youtube.com", "youtu.be", "youtube-nocookie.com")


def _is_youtube_url(url):
    """Return True when ``url`` is an http(s) URL on a known YouTube domain."""
    from urllib.parse import urlparse  # local import keeps this block self-contained

    if not isinstance(url, str):
        return False
    try:
        parsed = urlparse(url.strip())
    except ValueError:
        return False
    if parsed.scheme not in ("http", "https"):
        return False
    host = (parsed.hostname or "").lower()
    return any(host == domain or host.endswith("." + domain) for domain in _YOUTUBE_DOMAINS)


def download_audio_from_youtube(url):
    """Download the audio track of a YouTube video and return it as MP3.

    The download and the MP3 conversion run inside a temporary directory that
    is removed afterwards, so no intermediate or output file is left behind.

    Args:
        url: http(s) URL of the video on a YouTube domain.

    Returns:
        An ``io.BytesIO`` with the MP3 data, positioned at offset 0.

    Raises:
        ValueError: If ``url`` is not a YouTube link or the download or
            conversion fails.
    """
    if not _is_youtube_url(url):
        raise ValueError(
            "Error downloading audio from YouTube: URL must be an http(s) link "
            "to youtube.com or youtu.be"
        )

    try:
        logging.info(f"Downloading audio from YouTube: {url}")
        with tempfile.TemporaryDirectory() as temp_dir:
            ydl_opts = {
                'format': 'bestaudio/best',
                'postprocessors': [{
                    'key': 'FFmpegExtractAudio',
                    'preferredcodec': 'mp3',
                    'preferredquality': '192',
                }],
                # yt-dlp fills in %(ext)s itself; the post-processor then
                # leaves the result next to it as "audio.mp3".
                'outtmpl': os.path.join(temp_dir, 'audio.%(ext)s'),
                # A watch URL that carries a playlist id must still yield
                # exactly one file.
                'noplaylist': True,
                'logtostderr': True,
                'quiet': True,
                'no_warnings': True,
            }

            with YoutubeDL(ydl_opts) as ydl:
                ydl.extract_info(url, download=True)

            # Fixed, known output name: no guessing based on the source
            # container's extension.
            audio_file_path = os.path.join(temp_dir, 'audio.mp3')
            with open(audio_file_path, 'rb') as audio_file:
                buffer = io.BytesIO(audio_file.read())

        buffer.seek(0)
        return buffer
    except Exception as e:
        logging.error(f"Unexpected error downloading audio: {e}")
        raise ValueError(f"Error downloading audio from YouTube: {e}") from e
143
+
144
+ # Function to check allowed file extensions
145
def allowed_file(filename):
    """Return True when ``filename`` ends in a supported audio extension.

    The comparison is case-insensitive and only looks at the text after the
    last dot; a name without any dot is rejected.
    """
    permitted = {'mp3', 'aac', 'flac', 'm4a'}
    _, dot, extension = filename.rpartition('.')
    return bool(dot) and extension.lower() in permitted
148
+
149
+ # Function to remove repeated sentences
150
def remove_repeated_sentences(text):
    """Drop sentences that repeat an earlier one, keeping first occurrences.

    Sentences are delimited by ``.``, ``!`` or ``?`` followed by one or more
    spaces. Two sentences count as the same when they match after
    lower-casing and trimming surrounding whitespace. The survivors are
    re-joined with single spaces in their original order.
    """
    # dicts keep insertion order, so setdefault records only the first
    # spelling seen for each normalised sentence.
    first_occurrence = {}
    for piece in re.split(r'(?<=[.!?]) +', text):
        first_occurrence.setdefault(piece.lower().strip(), piece)
    return ' '.join(first_occurrence.values())
162
+
163
+ # Route to render index.html template
164
@app.route('/')
def index():
    """Serve the single-page UI rendered from ``index.html``."""
    page = render_template('index.html')
    return page
167
+
168
+ # Route to handle transcription and summarization
169
@app.route('/transcribe', methods=['POST'])
def transcribe():
    """Transcribe and summarize audio from a YouTube URL or an uploaded file.

    Reads the ``url`` form field first; when it is empty, falls back to the
    uploaded ``file``. Uploaded audio is converted to MP3 before it is
    transcribed.

    Returns:
        A JSON response with ``transcription`` and ``summary`` on success, or
        a JSON ``error`` message with status 400 (bad input or a failure
        reported as ``ValueError`` by a helper) or 500 (empty transcription
        or an unexpected error).
    """
    try:
        # Strip the field so whitespace-only input is not mistaken for a URL
        # and cannot shadow an uploaded file.
        youtube_url = request.form.get('url', '').strip()

        if youtube_url:
            audio_data = download_audio_from_youtube(youtube_url)
        elif 'file' in request.files:
            audio_file = request.files['file']
            if not audio_file.filename:
                return jsonify({"error": "No file selected."}), 400
            if not allowed_file(audio_file.filename):
                return jsonify({"error": "Invalid file type. Please upload an audio file."}), 400

            # Read file data into memory
            audio_bytes = audio_file.read()
            file_format = audio_file.filename.rsplit('.', 1)[1].lower()
            audio_data = convert_audio_to_mp3(audio_bytes, original_format=file_format)
        else:
            return jsonify({"error": "No audio file or URL provided."}), 400

        transcription = transcribe_audio_with_whisper(audio_data)
        if not transcription:
            return jsonify({"error": "Transcription failed."}), 500

        tokenizer, model = load_pegasus_model()
        summary = summarize_text_with_pegasus(transcription, tokenizer, model)
        return jsonify({"transcription": transcription, "summary": summary})
    except ValueError as e:
        # The helpers wrap their failures in ValueError with a readable message.
        return jsonify({"error": str(e)}), 400
    except Exception as e:
        # logging.exception records the traceback as well as the message.
        logging.exception(f"An unexpected error occurred: {e}")
        return jsonify({"error": "An unexpected error occurred."}), 500
204
+
205
if __name__ == "__main__":
    # Local entry point only; the Dockerfile starts the app through gunicorn.
    # The interactive debugger lets anyone who can reach the server run
    # arbitrary code, so it is enabled only on explicit opt-in via FLASK_DEBUG.
    debug_enabled = os.getenv("FLASK_DEBUG", "").lower() in ("1", "true", "yes", "on")
    app.run(debug=debug_enabled, port=7860)
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ flask==2.3.3
2
+ openai-whisper==20231117
3
+ pydub==0.25.1
4
+ transformers==4.36.2
5
+ yt-dlp==2023.11.16
6
+ python-dotenv==1.0.0
7
+ torch==2.1.1
8
+ gunicorn==21.2.0
9
+ numpy==1.24.2
static/css/styles.css ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* General styling */
2
+ body {
3
+ font-family: 'Arial', sans-serif;
4
+ margin: 0;
5
+ padding: 0;
6
+ background-color: #121212;
7
+ color: #ffffff;
8
+ }
9
+
10
+ h1 {
11
+ color: #ffffff;
12
+ text-align: center;
13
+ margin-top: 20px;
14
+ font-size: 2.5em;
15
+ padding: 10px;
16
+ }
17
+
18
+ .container {
19
+ display: flex;
20
+ justify-content: space-between;
21
+ padding: 20px;
22
+ background-color: #1e1e1e;
23
+ margin: 20px auto;
24
+ max-width: 1200px;
25
+ box-shadow: 0 0 10px rgba(0, 0, 0, 0.5);
26
+ border-radius: 8px;
27
+ }
28
+
29
+ .left-container, .right-container {
30
+ flex: 1;
31
+ margin: 10px;
32
+ display: flex;
33
+ flex-direction: column;
34
+ justify-content: flex-start;
35
+ }
36
+
37
+ form {
38
+ display: flex;
39
+ flex-direction: column;
40
+ }
41
+
42
+ .input-container {
43
+ display: flex;
44
+ flex-direction: column;
45
+ margin-bottom: 20px;
46
+ }
47
+
48
+ .input-container input[type="text"],
49
+ .input-container input[type="file"] {
50
+ padding: 10px;
51
+ margin: 5px 0;
52
+ border: 1px solid #ccc;
53
+ border-radius: 5px;
54
+ box-sizing: border-box;
55
+ background-color: #2e2e2e;
56
+ color: #ffffff;
57
+ }
58
+
59
+ .input-container input[type="submit"],
60
+ .input-container button {
61
+ background-color: #007bff;
62
+ color: #fff;
63
+ border: none;
64
+ padding: 10px 20px;
65
+ cursor: pointer;
66
+ border-radius: 5px;
67
+ transition: background-color 0.3s ease;
68
+ margin-top: 10px;
69
+ }
70
+
71
+ .input-container input[type="submit"]:hover,
72
+ .input-container button:hover {
73
+ background-color: #0056b3;
74
+ }
75
+
76
+ #processing {
77
+ display: none;
78
+ color: #007bff;
79
+ font-size: 1.2em;
80
+ margin-top: 20px;
81
+ }
82
+
83
+ /* Summary and Transcription */
84
+ .response-box {
85
+ padding: 20px;
86
+ background-color: #2e2e2e;
87
+ border-radius: 8px;
88
+ margin-top: 20px;
89
+ box-shadow: 0 0 10px rgba(0, 0, 0, 0.5);
90
+ overflow-wrap: break-word;
91
+ }
92
+
93
+ .response-box h2 {
94
+ display: flex;
95
+ align-items: center;
96
+ justify-content: space-between;
97
+ margin-top: 0;
98
+ font-size: 1.5em;
99
+ color: #ffffff;
100
+ }
101
+
102
+ .response-box p {
103
+ margin: 10px 0 0;
104
+ font-size: 1em;
105
+ color: #b0b0b0;
106
+ }
107
+
108
+ #summary, #transcription {
109
+ margin-top: 10px;
110
+ }
111
+
112
+ .copy-btn {
113
+ background-color: #007bff;
114
+ color: #fff;
115
+ border: none;
116
+ padding: 5px 10px;
117
+ cursor: pointer;
118
+ border-radius: 5px;
119
+ font-size: 0.7em;
120
+ transition: background-color 0.3s ease;
121
+ }
122
+
123
+ .copy-btn:hover {
124
+ background-color: #0056b3;
125
+ }
126
+
127
+ /* Responsive styling */
128
+ @media (max-width: 768px) {
129
+ .container {
130
+ flex-direction: column;
131
+ align-items: center;
132
+ }
133
+
134
+ .left-container, .right-container {
135
+ width: 100%;
136
+ }
137
+ }
138
+
139
+ @media (max-width: 480px) {
140
+ h1 {
141
+ font-size: 2em;
142
+ }
143
+
144
+ .input-container input[type="text"],
145
+ .input-container input[type="file"],
146
+ .input-container input[type="submit"],
147
+ .input-container button {
148
+ width: 100%;
149
+ margin: 5px 0;
150
+ }
151
+ }
static/js/scripts.js ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
// Submit the form via XHR and render the summary/transcription (or the
// error message) without reloading the page.
document.getElementById("transcribeForm").addEventListener("submit", function(event) {
    event.preventDefault();

    var form = event.target;
    // Build the payload before the inputs are disabled below: disabled
    // controls are left out of FormData.
    var formData = new FormData(form);
    var xhr = new XMLHttpRequest();

    var processing = document.getElementById("processing");
    var urlInput = document.getElementById("youtube-url");
    var fileInput = document.getElementById("file-input");
    var summaryContent = document.getElementById("summary-content");
    var transcriptionContent = document.getElementById("transcription-content");

    // Display the processing message before sending the request
    processing.style.display = "block";
    processing.textContent = "Processing...";

    xhr.open("POST", form.action, true);

    xhr.onreadystatechange = function() {
        if (xhr.readyState !== XMLHttpRequest.DONE) {
            return;
        }

        // Re-enable the inputs first so no later failure can leave the form locked.
        urlInput.disabled = false;
        fileInput.disabled = false;

        // The body is not guaranteed to be JSON (network failure, proxy or
        // server error page), so a parse failure must not throw here.
        var response = null;
        try {
            response = JSON.parse(xhr.responseText);
        } catch (parseError) {
            response = null;
        }

        if (xhr.status === 200 && response) {
            processing.style.display = "none";
            summaryContent.textContent = response.summary;
            transcriptionContent.textContent = response.transcription;
        } else {
            processing.style.display = "block";
            processing.textContent = (response && response.error) || "Request failed. Please try again.";
        }
    };

    xhr.send(formData);

    urlInput.disabled = true;
    fileInput.disabled = true;
    summaryContent.textContent = "Summary content will appear here...";
    transcriptionContent.textContent = "Transcription content will appear here...";
});
40
+
41
// Reset both inputs, both result panes and the status line to their initial state.
document.getElementById('clear-btn').addEventListener('click', function() {
    var status = document.getElementById("processing");

    ['youtube-url', 'file-input'].forEach(function(inputId) {
        document.getElementById(inputId).value = '';
    });

    document.getElementById("summary-content").textContent = "Summary content will appear here...";
    document.getElementById("transcription-content").textContent = "Transcription content will appear here...";

    status.style.display = "none";
    status.textContent = "Processing...";
});
49
+
50
// Wire each copy button to the element whose text it copies.
[
    ['copy-summary-btn', 'summary-content'],
    ['copy-transcription-btn', 'transcription-content']
].forEach(function(pair) {
    var buttonId = pair[0];
    var targetId = pair[1];
    document.getElementById(buttonId).addEventListener('click', function() {
        copyToClipboard(targetId);
    });
});
57
+
58
// Copy the text content of the element with the given id to the clipboard
// and tell the user whether the copy actually succeeded.
function copyToClipboard(elementId) {
    var text = document.getElementById(elementId).textContent;

    // execCommand("copy") works on the current selection, so the text is
    // placed in a temporary textarea that gets selected.
    var textarea = document.createElement("textarea");
    textarea.value = text;
    document.body.appendChild(textarea);

    var copied = false;
    try {
        textarea.select();
        // Returns false when the command is unsupported or disabled.
        copied = document.execCommand("copy");
    } catch (copyError) {
        copied = false;
    } finally {
        // Always remove the helper element, even if copying threw.
        document.body.removeChild(textarea);
    }

    alert(copied ? "Copied to clipboard" : "Could not copy to clipboard");
}
templates/index.html ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>Speech-to-Text Summarization</title>
7
+ <link rel="stylesheet" type="text/css" href="{{ url_for('static', filename='css/styles.css') }}">
8
+ </head>
9
+ <body>
10
+ <h1>Speech to Text Summarization</h1>
11
+ <div class="container">
12
+ <div class="left-container">
13
+ <form id="transcribeForm" action="/transcribe" method="post" enctype="multipart/form-data">
14
+ <div class="input-container">
15
+ <input type="text" id="youtube-url" name="url" placeholder="Enter YouTube URL">
16
+ </div>
17
+ <div class="input-container">
18
+ <input type="file" id="file-input" name="file" accept="audio/*">
19
+ </div>
20
+ <div class="input-container">
21
+ <input type="submit" value="Transcribe and Summarize">
22
+ </div>
23
+ <div class="input-container">
24
+ <button type="button" id="clear-btn">Clear</button>
25
+ </div>
26
+ </form>
27
+ <div id="processing">Processing...</div>
28
+ </div>
29
+ <div class="right-container">
30
+ <div id="summary" class="response-box">
31
+ <h2>Summary <button id="copy-summary-btn" class="copy-btn">Copy</button></h2>
32
+ <p id="summary-content">Summary content will appear here...</p>
33
+ </div>
34
+ <div id="transcription" class="response-box">
35
+ <h2>Transcription <button id="copy-transcription-btn" class="copy-btn">Copy</button></h2>
36
+ <p id="transcription-content">Transcription content will appear here...</p>
37
+ </div>
38
+ </div>
39
+ </div>
40
+ <script src="{{ url_for('static', filename='js/scripts.js') }}"></script>
41
+ </body>
42
+ </html>