import os
import json
import subprocess
import tempfile
import asyncio
import aiohttp  # For making async HTTP requests
from quart import Quart, request, jsonify, render_template
from dotenv import load_dotenv
from deepgram import DeepgramClient, PrerecordedOptions
import warnings

warnings.filterwarnings("ignore", message="FP16 is not supported on CPU; using FP32 instead")

app = Quart(__name__)
print("APP IS RUNNING, ANIKET")

# Load the .env file
load_dotenv()
print("ENV LOADED, ANIKET")

# Fetch the API keys from the .env file
API_KEY = os.getenv("FIRST_API_KEY")
DEEPGRAM_API_KEY = os.getenv("SECOND_API_KEY")

# Ensure both API keys are loaded correctly
if not API_KEY:
    raise ValueError("API Key not found. Make sure it is set in the .env file.")
if not DEEPGRAM_API_KEY:
    raise ValueError("DEEPGRAM_API_KEY not found. Make sure it is set in the .env file.")

GEMINI_API_ENDPOINT = "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash-latest:generateContent"
GEMINI_API_KEY = API_KEY

# Load Whisper AI model at startup (disabled; Deepgram is used instead)
# print("Loading Whisper AI model..., ANIKET")
# whisper_model = whisper.load_model("base")  # Choose model size: tiny, base, small, medium, large
# print("Whisper AI model loaded successfully, ANIKET")


@app.route("/", methods=["GET"])
async def health_check():
    return jsonify({"status": "success", "message": "API is running successfully!"}), 200


@app.route("/mbsa")
async def mbsa():
    return await render_template("mbsa.html")


@app.route('/process-audio', methods=['POST'])
async def process_audio():
    print("GOT THE PROCESS AUDIO REQUEST, ANIKET")

    # In Quart, request.files is awaitable and must be awaited
    files = await request.files
    if 'audio' not in files:
        return jsonify({"error": "No audio file provided"}), 400

    audio_file = files['audio']
    print("AUDIO FILE NAME: ", audio_file.filename)

    video_path = None
    try:
        print("STARTING TRANSCRIPTION, ANIKET")

        # Persist the upload to a temporary file so FFmpeg can read it,
        # and derive a sibling path for the converted WAV output
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as temp_video:
            video_path = temp_video.name
        audio_file.save(video_path)
        wav_path = video_path.rsplit(".", 1)[0] + ".wav"

        # Step 1: Transcribe the uploaded file asynchronously
        result = await transcribe_audio(video_path, wav_path)
        print("BEFORE THE transcription FAILED ERROR, CHECKING IF I GOT THE TRANSCRIPTION", result)

        # transcribe_audio returns a status dict, not a bare string
        if result.get("status") != "success":
            return jsonify({"error": result.get("message", "Audio transcription failed")}), 500
        transcription = result["transcript"]
        print("GOT THE transcription")

        print("Starting the GEMINI REQUEST TO STRUCTURE IT")
        # Step 2: Generate structured recipe information using the Gemini API asynchronously
        structured_data = await query_gemini_api(transcription)
        print("GOT THE STRUCTURED DATA", structured_data)

        # Step 3: Return the structured data
        return jsonify(structured_data)
    except Exception as e:
        return jsonify({"error": str(e)}), 500
    finally:
        # Clean up the temporary upload
        if video_path and os.path.exists(video_path):
            os.remove(video_path)
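
# --- Illustrative client sketch (not part of the API surface) ---
# A minimal sketch of how a caller might exercise /process-audio, assuming the
# server runs locally on Quart's default port. The base URL and file name are
# placeholders; the form field must be named 'audio' to match the route above.
async def _example_client(audio_path, base_url="http://127.0.0.1:5000"):
    form = aiohttp.FormData()
    # Stream the local file under the field name the endpoint expects
    form.add_field("audio", open(audio_path, "rb"), filename=os.path.basename(audio_path))
    async with aiohttp.ClientSession() as session:
        async with session.post(f"{base_url}/process-audio", data=form) as resp:
            print(await resp.json())

# Usage (from a separate session): asyncio.run(_example_client("clip.mp4"))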

async def transcribe_audio(video_file_path, wav_file_path):
    """
    Transcribe audio from a video file using Deepgram (async function).

    Args:
        video_file_path (str): Path to the input video file.
        wav_file_path (str): Path to save the converted WAV file.

    Returns:
        dict: A dictionary containing status, transcript, or error message.
    """
    print("Entered the transcribe_audio function")
    try:
        # Initialize Deepgram client
        deepgram = DeepgramClient(DEEPGRAM_API_KEY)

        # Convert video to audio in WAV format using FFmpeg
        print("Converting video to audio (WAV format)...")
        ffmpeg_command = [
            "ffmpeg", "-i", video_file_path,
            "-q:a", "0", "-map", "a",
            wav_file_path
        ]
        subprocess.run(ffmpeg_command, check=True)
        print(f"Conversion successful! WAV file saved at: {wav_file_path}")

        # Open the converted WAV file
        with open(wav_file_path, 'rb') as buffer_data:
            payload = {'buffer': buffer_data}

            # Configure transcription options
            options = PrerecordedOptions(
                smart_format=True,
                model="nova-2",
                language="en-US"
            )

            # Transcribe the audio (note: this SDK call is synchronous and
            # will block the event loop while the request is in flight)
            response = deepgram.listen.prerecorded.v('1').transcribe_file(payload, options)

        # Check if the response is valid
        if response:
            print("Request successful! Processing response.")

            # Convert response to JSON string
            try:
                data_str = response.to_json(indent=4)
            except AttributeError as e:
                return {"status": "error", "message": f"Error converting response to JSON: {e}"}

            # Parse the JSON string to a Python dictionary
            try:
                data = json.loads(data_str)
            except json.JSONDecodeError as e:
                return {"status": "error", "message": f"Error parsing JSON string: {e}"}

            # Extract the transcript
            try:
                transcript = data["results"]["channels"][0]["alternatives"][0]["transcript"]
            except KeyError as e:
                return {"status": "error", "message": f"Error extracting transcript: {e}"}

            # Write the transcript to a text file
            output_text_file = "deepGramNovaTranscript.txt"
            with open(output_text_file, "w", encoding="utf-8") as file:
                file.write(transcript)
            print(f"Transcript saved to: {output_text_file}")

            return {"status": "success", "transcript": transcript, "file_path": output_text_file}
        else:
            return {"status": "error", "message": "Invalid response from Deepgram."}

    except FileNotFoundError:
        return {"status": "error", "message": f"Video file not found: {video_file_path}"}
    except subprocess.CalledProcessError as e:
        return {"status": "error", "message": f"Error during audio conversion: {e}"}
    except Exception as e:
        return {"status": "error", "message": f"Unexpected error: {e}"}
    finally:
        # Clean up the temporary WAV file
        if os.path.exists(wav_file_path):
            os.remove(wav_file_path)
            print(f"Temporary WAV file deleted: {wav_file_path}")
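
# For reference, the transcript extraction above follows Deepgram's standard
# pre-recorded response layout. A trimmed illustration (values invented):
#
#   {
#     "results": {
#       "channels": [
#         {"alternatives": [{"transcript": "Preheat the oven...", "confidence": 0.98}]}
#       ]
#     }
#   }
#
# which is why the code indexes data["results"]["channels"][0]["alternatives"][0]["transcript"].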

async def query_gemini_api(transcription):
    """
    Send transcription text to the Gemini API and fetch structured recipe
    information (async function).
    """
    try:
        # Define the structured prompt
        prompt = (
            "Analyze the provided cooking video transcription and extract the following structured information:\n"
            "1. Recipe Name: Identify the name of the dish being prepared.\n"
            "2. Ingredients List: Extract a detailed list of ingredients with their respective quantities (if mentioned).\n"
            "3. Steps for Preparation: Provide a step-by-step breakdown of the recipe's preparation process, organized and numbered sequentially.\n"
            "4. Cooking Techniques Used: Highlight the cooking techniques demonstrated in the video, such as searing, blitzing, wrapping, etc.\n"
            "5. Equipment Needed: List all tools, appliances, or utensils mentioned, e.g., blender, hot pan, cling film, etc.\n"
            "6. Nutritional Information (if inferred): Provide an approximate calorie count or nutritional breakdown based on the ingredients used.\n"
            "7. Serving size: In count of people or portion size.\n"
            "8. Special Notes or Variations: Include any specific tips, variations, or alternatives mentioned.\n"
            "9. Festive or Thematic Relevance: Note if the recipe has any special relevance to holidays, events, or seasons.\n"
            f"Text: {transcription}\n"
        )

        # Prepare the payload and headers
        payload = {
            "contents": [
                {
                    "parts": [
                        {"text": prompt}
                    ]
                }
            ]
        }
        headers = {"Content-Type": "application/json"}

        # Send the request to the Gemini API asynchronously
        async with aiohttp.ClientSession() as session:
            async with session.post(
                f"{GEMINI_API_ENDPOINT}?key={GEMINI_API_KEY}",
                json=payload,
                headers=headers,
                timeout=60  # 60-second timeout for the request
            ) as response:
                response.raise_for_status()  # Raise an error if the response code is not 200
                data = await response.json()

        # Extract and return the generated text from the first candidate
        return (
            data.get("candidates", [{}])[0]
            .get("content", {})
            .get("parts", [{}])[0]
            .get("text", "No result found")
        )
    except aiohttp.ClientError as e:
        print(f"Error querying Gemini API: {e}")
        return {"error": str(e)}


if __name__ == '__main__':
    app.run(debug=True)
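
# --- Usage notes (host/port assumed; file name is a placeholder) ---
# Expected .env layout, matching the keys read at startup:
#   FIRST_API_KEY=<gemini api key>
#   SECOND_API_KEY=<deepgram api key>
#
# Example request once the server is running:
#   curl -X POST -F "audio=@recipe_clip.mp4" http://127.0.0.1:5000/process-audio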
# """ # if 'audio' not in request.files: # return jsonify({"error": "No audio file provided"}), 400 # audio_file = request.files['audio'] # print("AUDIO FILE NAME: ", audio_file) # try: # print("STARTING TRANSCRIPTION, ANIKET") # # Step 1: Transcribe the uploaded audio file directly # audio_file = request.files['audio'] # transcription = transcribe_audio(audio_file) # print("BEFORE THE transcription FAILED ERROR, CHECKING IF I GOT THE TRANSCRIPTION", transcription) # if not transcription: # return jsonify({"error": "Audio transcription failed"}), 500 # print("GOT THE transcription") # print("Starting the GEMINI REQUEST TO STRUCTURE IT") # # Step 2: Generate structured recipe information using Gemini API # structured_data = query_gemini_api(transcription) # print("GOT THE STRUCTURED DATA", structured_data) # # Step 3: Return the structured data # return jsonify(structured_data) # except Exception as e: # return jsonify({"error": str(e)}), 500 # def transcribe_audio(audio_path): # """ # Transcribe audio using Whisper AI. # """ # print("CAME IN THE transcribe audio function") # try: # # Transcribe audio using Whisper AI # print("Transcribing audio...") # result = whisper_model.transcribe(audio_path) # print("THE RESULTS ARE", result) # return result.get("text", "").strip() # except Exception as e: # print(f"Error in transcription: {e}") # return None # def query_gemini_api(transcription): # """ # Send transcription text to Gemini API and fetch structured recipe information. # """ # try: # # Define the structured prompt # prompt = ( # "Analyze the provided cooking video transcription and extract the following structured information:\n" # "1. Recipe Name: Identify the name of the dish being prepared.\n" # "2. Ingredients List: Extract a detailed list of ingredients with their respective quantities (if mentioned).\n" # "3. Steps for Preparation: Provide a step-by-step breakdown of the recipe's preparation process, organized and numbered sequentially.\n" # "4. Cooking Techniques Used: Highlight the cooking techniques demonstrated in the video, such as searing, blitzing, wrapping, etc.\n" # "5. Equipment Needed: List all tools, appliances, or utensils mentioned, e.g., blender, hot pan, cling film, etc.\n" # "6. Nutritional Information (if inferred): Provide an approximate calorie count or nutritional breakdown based on the ingredients used.\n" # "7. Serving size: In count of people or portion size.\n" # "8. Special Notes or Variations: Include any specific tips, variations, or alternatives mentioned.\n" # "9. 
Festive or Thematic Relevance: Note if the recipe has any special relevance to holidays, events, or seasons.\n" # f"Text: {transcription}\n" # ) # # Prepare the payload and headers # payload = { # "contents": [ # { # "parts": [ # {"text": prompt} # ] # } # ] # } # headers = {"Content-Type": "application/json"} # # Send request to Gemini API and wait for the response # print("Querying Gemini API...") # response = requests.post( # f"{GEMINI_API_ENDPOINT}?key={GEMINI_API_KEY}", # json=payload, # headers=headers, # timeout=60 # 60 seconds timeout for the request # ) # response.raise_for_status() # # Extract and return the structured data # data = response.json() # return data.get("candidates", [{}])[0].get("content", {}).get("parts", [{}])[0].get("text", "No result found") # except requests.exceptions.RequestException as e: # print(f"Error querying Gemini API: {e}") # return {"error": str(e)} # if __name__ == '__main__': # app.run(debug=True) # import os # import subprocess # import whisper # import requests # import tempfile # import warnings # import threading # from flask import Flask, request, jsonify, send_file, render_template # from dotenv import load_dotenv # import requests # warnings.filterwarnings("ignore", category=UserWarning, module="whisper") # app = Flask(__name__) # # Gemini API settings # load_dotenv() # API_KEY = os.getenv("FIRST_API_KEY") # # Ensure the API key is loaded correctly # if not API_KEY: # raise ValueError("API Key not found. Make sure it is set in the .env file.") # GEMINI_API_ENDPOINT = "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash-latest:generateContent" # GEMINI_API_KEY = API_KEY # # Load Whisper AI model at startup # print("Loading Whisper AI model...") # whisper_model = whisper.load_model("base") # print("Whisper AI model loaded successfully.") # # Define the "/" endpoint for health check # @app.route("/", methods=["GET"]) # def health_check(): # return jsonify({"status": "success", "message": "API is running successfully!"}), 200 # def process_video_in_background(video_file, temp_video_file_name): # """ # This function is executed in a separate thread to handle the long-running # video processing tasks such as transcription and querying the Gemini API. # """ # try: # transcription = transcribe_audio(temp_video_file_name) # if not transcription: # print("Audio transcription failed") # return # structured_data = query_gemini_api(transcription) # # Send structured data back or store it in a database, depending on your use case # print("Processing complete. 
Structured data:", structured_data) # except Exception as e: # print(f"Error processing video: {e}") # finally: # # Clean up temporary files # if os.path.exists(temp_video_file_name): # os.remove(temp_video_file_name) # @app.route('/process-video', methods=['POST']) # def process_video(): # if 'video' not in request.files: # return jsonify({"error": "No video file provided"}), 400 # video_file = request.files['video'] # try: # # Save video to a temporary file # with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as temp_video_file: # video_file.save(temp_video_file.name) # print(f"Video file saved: {temp_video_file.name}") # # Start the video processing in a background thread # threading.Thread(target=process_video_in_background, args=(video_file, temp_video_file.name)).start() # return jsonify({"message": "Video is being processed in the background."}), 202 # except Exception as e: # return jsonify({"error": str(e)}), 500 # def transcribe_audio(video_path): # """ # Transcribe audio directly from a video file using Whisper AI. # """ # try: # print(f"Transcribing video: {video_path}") # result = whisper_model.transcribe(video_path) # return result['text'] # except Exception as e: # print(f"Error in transcription: {e}") # return None # def query_gemini_api(transcription): # """ # Send transcription text to Gemini API and fetch structured recipe information. # """ # try: # # Define the structured prompt # prompt = ( # "Analyze the provided cooking video transcription and extract the following structured information:\n" # "1. Recipe Name: Identify the name of the dish being prepared.\n" # "2. Ingredients List: Extract a detailed list of ingredients with their respective quantities (if mentioned).\n" # "3. Steps for Preparation: Provide a step-by-step breakdown of the recipe's preparation process, organized and numbered sequentially.\n" # "4. Cooking Techniques Used: Highlight the cooking techniques demonstrated in the video, such as searing, blitzing, wrapping, etc.\n" # "5. Equipment Needed: List all tools, appliances, or utensils mentioned, e.g., blender, hot pan, cling film, etc.\n" # "6. Nutritional Information (if inferred): Provide an approximate calorie count or nutritional breakdown based on the ingredients used.\n" # "7. Serving size: In count of people or portion size.\n" # "8. Special Notes or Variations: Include any specific tips, variations, or alternatives mentioned.\n" # "9. Festive or Thematic Relevance: Note if the recipe has any special relevance to holidays, events, or seasons.\n" # f"Text: {transcription}\n" # ) # payload = { # "contents": [ # {"parts": [{"text": prompt}]} # ] # } # headers = {"Content-Type": "application/json"} # # Send request to Gemini API # response = requests.post( # f"{GEMINI_API_ENDPOINT}?key={GEMINI_API_KEY}", # json=payload, # headers=headers # ) # response.raise_for_status() # # Extract and return the structured data # data = response.json() # return data.get("candidates", [{}])[0].get("content", {}).get("parts", [{}])[0].get("text", "No result found") # except requests.exceptions.RequestException as e: # print(f"Error querying Gemini API: {e}") # return {"error": str(e)} # if __name__ == '__main__': # app.run(debug=True)