Spaces:

HARISH20205
/

Speech-Summarize

Sleeping

App Files Files Community

HARISH20205 committed on Mar 24

Commit

016fc9a

1 Parent(s): dfa9d54

cookies

Browse files

Files changed (3) hide show

Dockerfile +13 -5
app.py +355 -148
requirements.txt +3 -1

Dockerfile CHANGED Viewed

@@ -6,21 +6,29 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     ffmpeg \
     build-essential \
     libsndfile1 \
     && apt-get clean \
-    && rm -rf /var/lib/apt/lists/*
 COPY requirements.txt .
-RUN pip install --no-cache-dir -r requirements.txt
 COPY . .
-RUN mkdir -p /app/static/audio
 EXPOSE 7860
 ENV PYTHONDONTWRITEBYTECODE=1 \
     PYTHONUNBUFFERED=1 \
-    MODEL_NAME="google/pegasus-xsum"
-CMD ["gunicorn", "--bind", "0.0.0.0:7860", "app:app"]

     ffmpeg \
     build-essential \
     libsndfile1 \
+    ca-certificates \
+    openssl \
     && apt-get clean \
+    && rm -rf /var/lib/apt/lists/* \
+    && update-ca-certificates
 COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt \
+    && pip install --no-cache-dir pytube
 COPY . .
+RUN mkdir -p /app/static/audio /tmp/yt-dlp
+RUN chmod -R 777 /tmp
 EXPOSE 7860
 ENV PYTHONDONTWRITEBYTECODE=1 \
     PYTHONUNBUFFERED=1 \
+    MODEL_NAME="google/pegasus-xsum" \
+    PYTHONHTTPSVERIFY=0 \
+    SSL_CERT_DIR=/etc/ssl/certs \
+    REQUESTS_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt
+CMD ["gunicorn", "--bind", "0.0.0.0:7860", "--timeout", "300", "--workers", "1", "app:app"]

app.py CHANGED Viewed

@@ -16,17 +16,47 @@ import tempfile
 load_dotenv()
-# Create directories with proper permissions
-for directory in ["/tmp/transformers_cache", "/tmp/hf_home", "/tmp/cache"]:
-    os.makedirs(directory, exist_ok=True)
-    # Ensure the directory is writeable
-    os.chmod(directory, 0o777)
-# Set environment variables after creating directories
-os.environ["TRANSFORMERS_CACHE"] = "/tmp/transformers_cache"
-os.environ["HF_HOME"] = "/tmp/hf_home"
-os.environ["XDG_CACHE_HOME"] = "/tmp/cache"
-os.environ["PYTHONHTTPSVERIFY"] = "0"  # Added to help with SSL issues
 app = Flask(__name__)
@@ -61,25 +91,78 @@ def load_pegasus_model():
     return tokenizer, model
-def transcribe_audio_with_whisper(audio_data, timeout=180):  # 3 minute timeout
     try:
         logging.info("Transcribing audio data")
         start_time = time.time()
         model = load_whisper_model()
-        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=True) as temp_file:
             if isinstance(audio_data, io.BytesIO):
                 temp_file.write(audio_data.getvalue())
             else:
                 temp_file.write(audio_data)
             temp_file.flush()
-            # Add timeout monitoring
-            result = model.transcribe(temp_file.name)
-            elapsed = time.time() - start_time
-            logging.info(f"Transcription completed in {elapsed:.2f} seconds")
         return result["text"]
     except Exception as e:
         logging.error(f"Error in audio transcription: {e}")
         raise ValueError(f"Error in audio transcription: {e}")
@@ -113,146 +196,209 @@ def summarize_text_with_pegasus(text, tokenizer, model):
         raise ValueError(f"Error in text summarization: {e}")
-def download_audio_from_youtube(url):
-    buffer = io.BytesIO()
-    # Create a temp directory for cookies with correct permissions
-    cookie_dir = tempfile.mkdtemp()
-    cookies_path = os.path.join(cookie_dir, "cookies.txt")
-    # Create an empty cookies file with appropriate permissions
-    with open(cookies_path, "w") as f:
-        f.write("")
-    os.chmod(cookies_path, 0o666)
-    ydl_opts = {
-        "format": "bestaudio/best",
-        "postprocessors": [
-            {
-                "key": "FFmpegExtractAudio",
-                "preferredcodec": "mp3",
-                "preferredquality": "192",
-            }
-        ],
-        "outtmpl": "-",
-        "logtostderr": True,
-        "quiet": False,
-        "no_warnings": False,
-        "extract_audio": True,
-        "nocheckcertificate": True,
-        "ignoreerrors": True,
-        "no_color": True,
-        "geo_bypass": True,
-        "cookies": cookies_path,  # Use our temp cookies file
-        "socket_timeout": 30,
-        "retries": 3,
-    }
     try:
-        logging.info(f"Downloading audio from YouTube: {url}")
-        # Handle SSL certificate issues
-        cert_path = os.path.join(tempfile.gettempdir(), "cacert.pem")
-        import certifi
-        with open(cert_path, "wb") as f:
-            f.write(open(certifi.where(), "rb").read())
-        os.environ["SSL_CERT_FILE"] = cert_path
-        os.environ["REQUESTS_CA_BUNDLE"] = cert_path
-        # Create a proper temporary file for the download
-        with tempfile.NamedTemporaryFile(suffix=".%(ext)s", delete=False) as temp_file:
-            output_path = temp_file.name
-            ydl_opts["outtmpl"] = output_path
             try:
-                with YoutubeDL(ydl_opts) as ydl:
-                    # Add more info messages for debugging
-                    logging.info(f"Starting YouTube download with options: {ydl_opts}")
-                    info = ydl.extract_info(url, download=True)
-                    if not info:
-                        raise ValueError("Could not fetch video information")
-                    # Construct the output filename based on yt-dlp's naming pattern
-                    audio_file_path = ydl.prepare_filename(info)
-                    # Handle different possible extensions
-                    for ext in [".mp3", ".webm.mp3", ".m4a.mp3"]:
-                        possible_path = audio_file_path.replace(".webm", ext).replace(
-                            ".m4a", ext
-                        )
-                        if os.path.exists(possible_path):
-                            audio_file_path = possible_path
-                            break
-                    logging.info(f"Audio downloaded to: {audio_file_path}")
-                    if not os.path.exists(audio_file_path):
-                        raise ValueError(
-                            f"Downloaded file {audio_file_path} does not exist"
-                        )
-                    # Read the file and clean up
-                    with open(audio_file_path, "rb") as audio_file:
-                        buffer = io.BytesIO(audio_file.read())
-                        buffer.seek(0)
-                    # Remove the temporary file
-                    try:
-                        os.unlink(audio_file_path)
-                    except Exception as cleanup_err:
-                        logging.warning(f"Could not remove temp file: {cleanup_err}")
-            except Exception as ydl_err:
-                logging.error(f"YoutubeDL error: {ydl_err}")
-                # Try alternative download method - direct URL only, no fancy features
-                logging.info("Attempting alternative download method...")
-                simple_opts = {
-                    "format": "bestaudio",
-                    "outtmpl": output_path,
-                    "nocheckcertificate": True,
-                    "quiet": True,
-                    "no_warnings": True,
-                    "geo_bypass": True,
-                }
-                with YoutubeDL(simple_opts) as ydl:
-                    info = ydl.extract_info(url, download=True)
-                    if not info:
-                        raise ValueError("Could not fetch video information")
-                    audio_file_path = ydl.prepare_filename(info)
-                    if not os.path.exists(audio_file_path):
-                        raise ValueError(
-                            f"Downloaded file {audio_file_path} does not exist"
-                        )
-                    # Read the file
-                    with open(audio_file_path, "rb") as audio_file:
-                        buffer = io.BytesIO(audio_file.read())
-                        buffer.seek(0)
-                    # Convert to MP3 if needed
-                    if not audio_file_path.endswith(".mp3"):
-                        buffer = convert_audio_to_mp3(
-                            buffer.getvalue(),
-                            original_format=audio_file_path.split(".")[-1],
-                        )
-        # Clean up the temp directory
         try:
-            os.unlink(cookies_path)
-            os.rmdir(cookie_dir)
-        except:
-            pass
         return buffer
     except Exception as e:
-        logging.error(f"Unexpected error downloading audio: {e}", exc_info=True)
-        raise ValueError(f"Error downloading audio from YouTube: {e}")
 def allowed_file(filename):
@@ -281,56 +427,117 @@ def index():
 @app.route("/transcribe", methods=["POST"])
 def transcribe():
     try:
         audio_data = None
         if "url" in request.form and request.form["url"]:
-            youtube_url = request.form["url"]
             try:
                 audio_data = download_audio_from_youtube(youtube_url)
-            except ValueError as e:
-                if "bot" in str(e).lower() or "sign in" in str(e).lower():
                     return (
                         jsonify(
                             {
-                                "error": "YouTube bot protection is preventing download. Please try uploading an audio file directly instead."
                             }
                         ),
                         400,
                     )
                 else:
                     raise e
         elif "file" in request.files:
             audio_file = request.files["file"]
             if not audio_file.filename:
                 return jsonify({"error": "No file selected."}), 400
             if not allowed_file(audio_file.filename):
                 return (
                     jsonify(
-                        {"error": "Invalid file type. Please upload an audio file."}
                     ),
                     400,
                 )
             audio_bytes = audio_file.read()
             file_format = audio_file.filename.rsplit(".", 1)[1].lower()
             audio_data = convert_audio_to_mp3(audio_bytes, original_format=file_format)
         else:
             return jsonify({"error": "No audio file or URL provided."}), 400
         transcription = transcribe_audio_with_whisper(audio_data)
         if transcription:
             tokenizer, model = load_pegasus_model()
             summary = summarize_text_with_pegasus(transcription, tokenizer, model)
             return jsonify({"transcription": transcription, "summary": summary})
         else:
-            return jsonify({"error": "Transcription failed."}), 500
     except ValueError as e:
         return jsonify({"error": str(e)}), 400
     except Exception as e:
-        logging.error(f"An unexpected error occurred: {e}")
-        return jsonify({"error": "An unexpected error occurred."}), 500
 if __name__ == "__main__":

 load_dotenv()
+def setup_environment():
+    """Configure environment for Hugging Face Spaces
+
+    Creates the cache/work directories under /tmp, tries to make them
+    world-writable, and then overrides a set of process-wide environment
+    variables (cache locations, certificate paths, HOME, no_proxy).
+    Finally silences UserWarning warnings. Takes no arguments and returns
+    nothing; everything it does is a side effect on the current process.
+    """
+    # Create directories with proper permissions
+    for directory in [
+        "/tmp/transformers_cache",
+        "/tmp/hf_home",
+        "/tmp/cache",
+        "/tmp/yt-dlp",
+        "/tmp/certs",
+    ]:
+        os.makedirs(directory, exist_ok=True)
+        try:
+            # Ensure the directory is writeable
+            os.chmod(directory, 0o777)
+        except Exception as e:
+            # Best effort: a failed chmod is only logged, never fatal.
+            logging.warning(f"Could not set permissions on {directory}: {e}")
+    # Set environment variables
+    os.environ["TRANSFORMERS_CACHE"] = "/tmp/transformers_cache"
+    os.environ["HF_HOME"] = "/tmp/hf_home"
+    os.environ["XDG_CACHE_HOME"] = "/tmp/cache"
+    # Certificate handling
+    # NOTE(review): presumably meant to relax HTTPS certificate checks --
+    # confirm the interpreter in use honours this variable at all, and
+    # whether turning verification off is acceptable for this deployment.
+    os.environ["PYTHONHTTPSVERIFY"] = "0"
+    os.environ["REQUESTS_CA_BUNDLE"] = "/etc/ssl/certs/ca-certificates.crt"
+    os.environ["SSL_CERT_DIR"] = "/etc/ssl/certs"
+    # Set this to the temp directory to avoid permission issues
+    os.environ["HOME"] = "/tmp"
+    # For yt-dlp
+    # NOTE(review): "*" presumably bypasses any configured proxy for every
+    # host -- confirm this is intended for all HTTP clients in the process.
+    os.environ["no_proxy"] = "*"
+    # Disable warnings that might flood logs
+    import warnings
+    warnings.filterwarnings("ignore", category=UserWarning)
+setup_environment()
 app = Flask(__name__)
     return tokenizer, model
+def transcribe_audio_with_whisper(audio_data, timeout=300):  # 5 minute timeout
     try:
         logging.info("Transcribing audio data")
         start_time = time.time()
         model = load_whisper_model()
+        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_file:
             if isinstance(audio_data, io.BytesIO):
                 temp_file.write(audio_data.getvalue())
             else:
                 temp_file.write(audio_data)
             temp_file.flush()
+            temp_file_path = temp_file.name
+        try:
+            # Use multiprocessing to implement a timeout
+            from multiprocessing import Process, Queue
+            def transcribe_process(file_path, result_queue):
+                try:
+                    model = load_whisper_model()
+                    result = model.transcribe(file_path)
+                    result_queue.put(result)
+                except Exception as e:
+                    result_queue.put(e)
+            # Create a queue for the result
+            result_queue = Queue()
+            # Create and start the process
+            process = Process(
+                target=transcribe_process, args=(temp_file_path, result_queue)
+            )
+            process.start()
+            # Wait for the specified timeout
+            process.join(timeout)
+            # If process is still running after timeout, terminate it
+            if process.is_alive():
+                process.terminate()
+                process.join()
+                os.unlink(temp_file_path)  # Clean up
+                raise TimeoutError(f"Transcription timed out after {timeout} seconds")
+            # Get the result
+            if result_queue.empty():
+                os.unlink(temp_file_path)  # Clean up
+                raise ValueError("Transcription process failed")
+            result_or_error = result_queue.get()
+            if isinstance(result_or_error, Exception):
+                os.unlink(temp_file_path)  # Clean up
+                raise result_or_error
+            result = result_or_error
+        finally:
+            # Clean up temp file
+            try:
+                os.unlink(temp_file_path)
+            except:
+                pass
+        elapsed = time.time() - start_time
+        logging.info(f"Transcription completed in {elapsed:.2f} seconds")
         return result["text"]
+    except TimeoutError as e:
+        logging.error(f"Transcription timeout: {e}")
+        raise ValueError(
+            "Audio transcription took too long. Please try a shorter audio file."
+        )
     except Exception as e:
         logging.error(f"Error in audio transcription: {e}")
         raise ValueError(f"Error in audio transcription: {e}")
         raise ValueError(f"Error in text summarization: {e}")
+def download_youtube_with_cookies(url):
+    """Download YouTube audio using the project's cookies file"""
     try:
+        logging.info(f"Downloading YouTube with cookies: {url}")
+        # Use the cookies.txt from the project directory
+        cookies_path = os.path.join(os.getcwd(), "cookies.txt")
+        if not os.path.exists(cookies_path):
+            logging.warning("cookies.txt not found in project directory")
+            # Create an empty cookies file
+            with open(cookies_path, "w") as f:
+                f.write("# Netscape HTTP Cookie File\n")
+        logging.info(f"Using cookies from: {cookies_path}")
+        output_dir = "/tmp/yt_downloads"
+        os.makedirs(output_dir, exist_ok=True)
+        os.chmod(output_dir, 0o777)
+        output_path = os.path.join(output_dir, f"download_{int(time.time())}.%(ext)s")
+        ydl_opts = {
+            "format": "bestaudio/best",
+            "postprocessors": [
+                {
+                    "key": "FFmpegExtractAudio",
+                    "preferredcodec": "mp3",
+                    "preferredquality": "192",
+                }
+            ],
+            "outtmpl": output_path,
+            "cookies": cookies_path,
+            "nocheckcertificate": True,
+            "ignoreerrors": True,
+            "geo_bypass": True,
+            "logtostderr": True,
+            "quiet": False,
+            "no_warnings": False,
+            "socket_timeout": 30,
+            "retries": 5,
+        }
+        with YoutubeDL(ydl_opts) as ydl:
+            info = ydl.extract_info(url, download=True)
+            if not info:
+                raise ValueError("Could not fetch video information")
+            filename = ydl.prepare_filename(info)
+            # Handle potential mp3 extension
+            if not filename.endswith(".mp3"):
+                filename = filename.rsplit(".", 1)[0] + ".mp3"
+            if not os.path.exists(filename):
+                # Try alternative extensions
+                for ext in [".mp3", ".webm.mp3", ".m4a.mp3"]:
+                    alt_filename = filename.rsplit(".", 1)[0] + ext
+                    if os.path.exists(alt_filename):
+                        filename = alt_filename
+                        break
+            logging.info(f"Downloaded file: {filename}")
+            if not os.path.exists(filename):
+                raise FileNotFoundError(f"Could not find downloaded file: {filename}")
+            with open(filename, "rb") as f:
+                buffer = io.BytesIO(f.read())
+                buffer.seek(0)
+            # Clean up
+            try:
+                os.unlink(filename)
+            except Exception as e:
+                logging.warning(f"Could not remove temp file: {e}")
+            return buffer
+    except Exception as e:
+        logging.error(f"Error downloading with cookies: {e}", exc_info=True)
+        raise ValueError(f"Error downloading with cookies: {e}")
+def download_youtube_direct(url):
+    """Direct YouTube download without cookies, simplified options
+
+    Downloads the "bestaudio" stream with yt-dlp into /tmp/yt_direct, reads
+    the file into memory, passes the bytes through convert_audio_to_mp3()
+    unless the file name already ends in ".mp3", and then deletes the file.
+
+    Returns whatever convert_audio_to_mp3() returns, or an io.BytesIO over
+    the raw bytes when no conversion is needed. Every failure is logged
+    with its traceback and re-raised as ValueError.
+    """
+    try:
+        logging.info(f"Attempting direct YouTube download: {url}")
+        output_dir = "/tmp/yt_direct"
+        os.makedirs(output_dir, exist_ok=True)
+        os.chmod(output_dir, 0o777)
+        # File name is derived from the current second; yt-dlp fills in %(ext)s.
+        output_path = os.path.join(output_dir, f"direct_{int(time.time())}.%(ext)s")
+        ydl_opts = {
+            "format": "bestaudio",
+            "outtmpl": output_path,
+            "nocheckcertificate": True,
+            "ignoreerrors": False,
+            "geo_bypass": True,
+            "no_warnings": True,
+            "quiet": True,
+            "skip_download": False,
+            "noprogress": True,
+            "nooverwrites": False,
+            "socket_timeout": 30,
+        }
+        with YoutubeDL(ydl_opts) as ydl:
+            info = ydl.extract_info(url, download=True)
+            if not info:
+                raise ValueError("Could not fetch video information")
+            filename = ydl.prepare_filename(info)
+            if not os.path.exists(filename):
+                raise FileNotFoundError(f"Could not find downloaded file: {filename}")
+            with open(filename, "rb") as f:
+                data = f.read()
+            # Convert to mp3 if needed
+            if not filename.endswith(".mp3"):
+                # The file extension is passed on as the source format.
+                buffer = convert_audio_to_mp3(
+                    data, original_format=filename.split(".")[-1]
+                )
+            else:
+                buffer = io.BytesIO(data)
+                buffer.seek(0)
+            # Clean up
+            # NOTE(review): not reached if the conversion above raises, so
+            # the downloaded file then stays in /tmp/yt_direct.
             try:
+                os.unlink(filename)
+            except Exception as e:
+                logging.warning(f"Could not remove temp file: {e}")
+            return buffer
+    except Exception as e:
+        logging.error(f"Error in direct download: {e}", exc_info=True)
+        raise ValueError(f"Error in direct download: {e}")
+def download_audio_from_youtube(url):
+    """Main YouTube download function with multiple fallback methods
+
+    Tries, in order: download_youtube_with_cookies(), then
+    download_youtube_direct(), then pytube. The first method that succeeds
+    returns its audio buffer. Each failure is logged and its message is
+    collected; if all three fail, the collected messages are logged and a
+    ValueError with a generic, user-facing message is raised (the detailed
+    messages are not part of that exception).
+    """
+    logging.info(f"Starting YouTube download process for: {url}")
+    errors = []
+    # Method 1: Try with project cookies
+    try:
+        return download_youtube_with_cookies(url)
+    except Exception as e:
+        logging.warning(f"Cookie download failed: {e}")
+        errors.append(f"Cookie method: {str(e)}")
+    # Method 2: Try direct download
+    try:
+        return download_youtube_direct(url)
+    except Exception as e:
+        logging.warning(f"Direct download failed: {e}")
+        errors.append(f"Direct method: {str(e)}")
+    # Method 3: Try with pytube as last resort
+    try:
+        logging.info("Attempting download with pytube")
+        # Imported lazily, so pytube is only needed when this fallback runs.
+        from pytube import YouTube
+        yt = YouTube(url)
+        stream = yt.streams.filter(only_audio=True).first()
+        if not stream:
+            raise ValueError("No audio stream found")
+        output_dir = "/tmp/pytube_downloads"
+        os.makedirs(output_dir, exist_ok=True)
+        output_path = stream.download(output_path=output_dir)
+        logging.info(f"Downloaded to: {output_path}")
+        with open(output_path, "rb") as f:
+            data = f.read()
+        # Convert to mp3
+        buffer = convert_audio_to_mp3(data, original_format=output_path.split(".")[-1])
+        # Clean up
         try:
+            os.unlink(output_path)
+        except Exception as e:
+            logging.warning(f"Could not remove pytube temp file: {e}")
         return buffer
     except Exception as e:
+        logging.error(f"Pytube download failed: {e}")
+        errors.append(f"Pytube method: {str(e)}")
+    # All methods failed
+    error_message = "All download methods failed:\n" + "\n".join(errors)
+    logging.error(error_message)
+    raise ValueError(
+        "Could not download YouTube audio. Please try uploading an audio file directly or use a different URL."
+    )
 def allowed_file(filename):
 @app.route("/transcribe", methods=["POST"])
 def transcribe():
+    """Handle POST /transcribe: fetch audio, transcribe it, summarize it.
+
+    The audio comes either from the "url" form field (downloaded via
+    download_audio_from_youtube) or from the uploaded "file". On success
+    the JSON response carries "transcription" and "summary". Errors are
+    returned as JSON {"error": ...} with status 400 (bad input or any
+    ValueError) or 500 (empty transcription or unexpected exception).
+    """
+    # Record the start time
+    start_time = time.time()
+    logging.info("Starting new transcription request")
     try:
         audio_data = None
         if "url" in request.form and request.form["url"]:
+            youtube_url = request.form["url"].strip()
+            logging.info(f"Processing YouTube URL: {youtube_url}")
+            # Only the scheme is validated here, not the host.
+            if not youtube_url.startswith(("http://", "https://")):
+                return (
+                    jsonify(
+                        {"error": "Invalid URL format. Please provide a complete URL."}
+                    ),
+                    400,
+                )
             try:
                 audio_data = download_audio_from_youtube(youtube_url)
+                logging.info(
+                    f"YouTube download completed in {time.time() - start_time:.2f} seconds"
+                )
+            except Exception as e:
+                # Map known access problems to one friendly message by
+                # substring match on the lower-cased error text.
+                error_msg = str(e).lower()
+                if any(
+                    term in error_msg
+                    for term in [
+                        "bot",
+                        "sign in",
+                        "cookie",
+                        "certificate",
+                        "permission",
+                    ]
+                ):
                     return (
                         jsonify(
                             {
+                                "error": "YouTube access issue. Please try uploading an audio file directly or use a different YouTube URL."
                             }
                         ),
                         400,
                     )
                 else:
                     raise e
         elif "file" in request.files:
             audio_file = request.files["file"]
             if not audio_file.filename:
                 return jsonify({"error": "No file selected."}), 400
             if not allowed_file(audio_file.filename):
                 return (
                     jsonify(
+                        {
+                            "error": "Invalid file type. Please upload an audio file (mp3, aac, flac, or m4a)."
+                        }
                     ),
                     400,
                 )
             audio_bytes = audio_file.read()
             file_format = audio_file.filename.rsplit(".", 1)[1].lower()
+            logging.info(
+                f"Processing uploaded file: {audio_file.filename}, format: {file_format}, size: {len(audio_bytes)} bytes"
+            )
             audio_data = convert_audio_to_mp3(audio_bytes, original_format=file_format)
+            logging.info(
+                f"File conversion completed in {time.time() - start_time:.2f} seconds"
+            )
         else:
             return jsonify({"error": "No audio file or URL provided."}), 400
+        # Transcribe the audio
+        transcribe_start = time.time()
         transcription = transcribe_audio_with_whisper(audio_data)
+        transcribe_time = time.time() - transcribe_start
+        # NOTE(review): len() runs before the emptiness check below, so this
+        # assumes transcription is never None -- confirm.
+        logging.info(
+            f"Transcription completed in {transcribe_time:.2f} seconds. Text length: {len(transcription)}"
+        )
         if transcription:
+            # Summarize the transcription
+            summary_start = time.time()
             tokenizer, model = load_pegasus_model()
             summary = summarize_text_with_pegasus(transcription, tokenizer, model)
+            summary_time = time.time() - summary_start
+            logging.info(
+                f"Summarization completed in {summary_time:.2f} seconds. Summary length: {len(summary)}"
+            )
+            total_time = time.time() - start_time
+            logging.info(f"Total request completed in {total_time:.2f} seconds")
             return jsonify({"transcription": transcription, "summary": summary})
         else:
+            return jsonify({"error": "Transcription failed to produce any text."}), 500
     except ValueError as e:
+        # ValueError text is sent back to the client verbatim.
+        logging.error(f"ValueError: {str(e)}")
         return jsonify({"error": str(e)}), 400
     except Exception as e:
+        logging.error(f"An unexpected error occurred: {e}", exc_info=True)
+        return (
+            jsonify(
+                {"error": "An unexpected error occurred while processing your request."}
+            ),
+            500,
+        )
 if __name__ == "__main__":

requirements.txt CHANGED Viewed

@@ -7,4 +7,6 @@ python-dotenv==1.0.0
 torch==2.1.1
 gunicorn==21.2.0
 numpy==1.24.2
-certifi>=2023.7.22

 torch==2.1.1
 gunicorn==21.2.0
 numpy==1.24.2
+certifi>=2023.7.22
+pytube>=12.1.0
+werkzeug>=2.2.3