Spaces:

GoodML
/

dishDecode

Running

File size: 20,165 Bytes

import os
import whisper
import requests
from flask import Flask, request, jsonify, render_template
from dotenv import load_dotenv
from deepgram import DeepgramClient, PrerecordedOptions
import tempfile
import json

import warnings
warnings.filterwarnings("ignore", message="FP16 is not supported on CPU; using FP32 instead")

# Flask application object; the route decorators below register on it.
app = Flask(__name__)
print("APP IS RUNNING, ANIKET")

# Load variables from a .env file (if present) into the process environment.
load_dotenv()

print("ENV LOADED, ANIKET")

# FIRST_API_KEY is used as the Gemini API key (see GEMINI_API_KEY below);
# SECOND_API_KEY is used as the Deepgram API key.
API_KEY = os.getenv("FIRST_API_KEY")
DEEPGRAM_API_KEY = os.getenv("SECOND_API_KEY")

# Fail fast at import time, naming the exact variable that is missing so the
# message matches what actually has to be set.
if not API_KEY:
    raise ValueError(
        "FIRST_API_KEY (Gemini API key) not found. Make sure it is set in the .env file."
    )

if not DEEPGRAM_API_KEY:
    raise ValueError(
        "SECOND_API_KEY (Deepgram API key) not found. Make sure it is set in the .env file."
    )

GEMINI_API_ENDPOINT = "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash-latest:generateContent"
GEMINI_API_KEY = API_KEY

# Load Whisper AI model at startup
# print("Loading Whisper AI model..., ANIKET")
# whisper_model = whisper.load_model("base")  # Choose model size: tiny, base, small, medium, large
# print("Whisper AI model loaded successfully, ANIKET")


@app.route("/", methods=["GET"])
def health_check():
    """Health-check endpoint: reply with a fixed success JSON body and HTTP 200."""
    body = {"status": "success", "message": "API is running successfully!"}
    return jsonify(body), 200


@app.route("/mbsa")
def mbsa():
    """Render the ``mbsa.html`` template and return it as the response."""
    page = render_template("mbsa.html")
    return page


@app.route('/process-audio', methods=['POST'])
def process_audio():
    """
    Transcribe an uploaded audio file and return recipe information for it.

    Expects a multipart upload with the file under the ``audio`` field. The
    upload is written to a temporary ``.wav`` file, passed to
    ``transcribe_audio`` and the resulting transcript is passed to
    ``query_gemini_api``.

    Returns:
        A Flask JSON response:
        * 400 ``{"error": ...}`` when no ``audio`` file part is present.
        * 500 ``{"error": ...}`` when transcription fails or an unexpected
          exception occurs.
        * 502 ``{"error": ...}`` when the Gemini request fails.
        * 200 with the Gemini output otherwise.
    """
    print("GOT THE PROCESS AUDIO REQUEST, ANIKET")

    if 'audio' not in request.files:
        return jsonify({"error": "No audio file provided"}), 400

    audio_file = request.files['audio']
    print("AUDIO FILE NAME: ", audio_file)

    temp_audio_path = None
    try:
        print("STARTING TRANSCRIPTION, ANIKET")

        # Step 1: Persist the upload so the transcriber can open it by path.
        # delete=False because the file must outlive this ``with`` block.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as temp_audio_file:
            temp_audio_path = temp_audio_file.name
            temp_audio_file.write(audio_file.read())

        print(f"Temporary audio file saved at: {temp_audio_path}")

        # Step 2: Transcribe the uploaded audio file synchronously
        transcription = transcribe_audio(temp_audio_path)

        print("BEFORE THE transcription FAILED ERROR, CHECKING IF I GOT THE TRANSCRIPTION", transcription)

        # transcribe_audio reports failures as a (truthy) dict, so a plain
        # falsiness check is not enough: without this branch the error dict
        # would be sent to Gemini as if it were the transcript.
        if isinstance(transcription, dict):
            return jsonify({
                "error": "Audio transcription failed",
                "details": transcription.get("message", ""),
            }), 500

        if not transcription:
            return jsonify({"error": "Audio transcription failed"}), 500

        print("GOT THE transcription")

        # Step 3: Generate structured recipe information using Gemini API synchronously
        print("Starting the GEMINI REQUEST TO STRUCTURE IT")
        structured_data = query_gemini_api(transcription)

        print("GOT THE STRUCTURED DATA", structured_data)

        # query_gemini_api signals a failed upstream call with {"error": ...};
        # surface that as a gateway error instead of HTTP 200.
        if isinstance(structured_data, dict) and "error" in structured_data:
            return jsonify(structured_data), 502

        # Step 4: Return the structured data
        return jsonify(structured_data)

    except Exception as e:
        # Top-level boundary of the request: report any failure as JSON.
        return jsonify({"error": str(e)}), 500

    finally:
        # The transcriber may already have removed the file; the existence
        # check makes this cleanup safe either way.
        if temp_audio_path and os.path.exists(temp_audio_path):
            os.remove(temp_audio_path)
            print(f"Temporary WAV file deleted: {temp_audio_path}")



def transcribe_audio(wav_file_path):
    """
    Transcribe an audio file with the Deepgram pre-recorded API (synchronously).

    Side effects: the file at ``wav_file_path`` is deleted before returning,
    whether or not transcription succeeded, and on success the transcript is
    also written (best effort) to ``transcript_from_transcribe_audio.txt`` in
    the current working directory.

    Args:
        wav_file_path (str): Path of the audio file to transcribe.

    Returns:
        str | dict: The transcript text on success. On any failure, a dict of
        the form ``{"status": "error", "message": <description>}``. The error
        dict is truthy, so callers must check the type rather than
        truthiness. This function does not raise for errors inside the
        ``try`` block; they are converted to the error dict.
    """
    print("Entered the transcribe_audio function")
    try:
        # Initialize Deepgram client
        deepgram = DeepgramClient(DEEPGRAM_API_KEY)

        # Read the audio into memory: the "buffer" source is given the raw
        # bytes rather than the open file handle.
        with open(wav_file_path, 'rb') as audio_stream:
            payload = {'buffer': audio_stream.read()}

        # Configure transcription options
        options = PrerecordedOptions(
            smart_format=True, model="nova-2", language="en-US"
        )

        # Transcribe the audio
        response = deepgram.listen.prerecorded.v('1').transcribe_file(payload, options)

        if not response:
            return {"status": "error", "message": "Invalid response from Deepgram."}

        print("Request successful! Processing response.")

        # Convert response to JSON string
        try:
            data_str = response.to_json(indent=4)
        except AttributeError as e:
            return {"status": "error", "message": f"Error converting response to JSON: {e}"}

        # Parse the JSON string to a Python dictionary
        try:
            data = json.loads(data_str)
        except json.JSONDecodeError as e:
            return {"status": "error", "message": f"Error parsing JSON string: {e}"}

        # Extract the transcript. Besides a missing key, an empty
        # "channels"/"alternatives" list (IndexError) or an unexpected value
        # type (TypeError) also means there is no transcript to extract.
        try:
            transcript = data["results"]["channels"][0]["alternatives"][0]["transcript"]
        except (KeyError, IndexError, TypeError) as e:
            return {"status": "error", "message": f"Error extracting transcript: {e}"}

        print(f"Transcript obtained: {transcript}")

        # Saving the transcript is a debugging aid only: a failed write must
        # not turn a successful transcription into an error.
        transcript_file_path = "transcript_from_transcribe_audio.txt"
        try:
            with open(transcript_file_path, "w", encoding="utf-8") as transcript_file:
                transcript_file.write(transcript)
            print(f"Transcript saved to file: {transcript_file_path}")
        except OSError as e:
            print(f"Could not save transcript to file {transcript_file_path}: {e}")

        return transcript

    except FileNotFoundError:
        return {"status": "error", "message": f"Audio file not found: {wav_file_path}"}
    except Exception as e:
        # Boundary with the external SDK: report anything else as an error dict.
        return {"status": "error", "message": f"Unexpected error: {e}"}
    finally:
        # Clean up the temporary audio file
        if os.path.exists(wav_file_path):
            os.remove(wav_file_path)
            print(f"Temporary WAV file deleted: {wav_file_path}")


def query_gemini_api(transcription):
    """
    Send transcription text to the Gemini API and return the generated recipe summary.

    The request is synchronous. The API key is sent in the ``x-goog-api-key``
    header rather than in the URL, so it does not appear in the exception
    text that is logged and returned on failure.

    Args:
        transcription (str): Transcript text to analyse.

    Returns:
        str | dict: The ``text`` of the first part of the first candidate, or
        ``"No result found"`` when the response contains no such text. When
        the HTTP request fails or the body is not valid JSON, a dict
        ``{"error": <description>}``.
    """
    # Define the structured prompt. Adjacent literals are concatenated, so
    # every sentence needs its own trailing separator.
    prompt = (
        "Print the transcription in the response as well.\n"
        "Analyze the provided cooking video transcription and extract the following structured information:\n"
        "1. Recipe Name: Identify the name of the dish being prepared.\n"
        "2. Ingredients List: Extract a detailed list of ingredients with their respective quantities (if mentioned).\n"
        "3. Steps for Preparation: Provide a step-by-step breakdown of the recipe's preparation process, organized and numbered sequentially.\n"
        "4. Cooking Techniques Used: Highlight the cooking techniques demonstrated in the video, such as searing, blitzing, wrapping, etc.\n"
        "5. Equipment Needed: List all tools, appliances, or utensils mentioned, e.g., blender, hot pan, cling film, etc.\n"
        "6. Nutritional Information (if inferred): Provide an approximate calorie count or nutritional breakdown based on the ingredients used.\n"
        "7. Serving size: In count of people or portion size.\n"
        "8. Special Notes or Variations: Include any specific tips, variations, or alternatives mentioned.\n"
        "9. Festive or Thematic Relevance: Note if the recipe has any special relevance to holidays, events, or seasons.\n"
        f"Text: {transcription}\n"
    )

    # Prepare the payload and headers
    payload = {
        "contents": [
            {
                "parts": [
                    {"text": prompt}
                ]
            }
        ]
    }
    headers = {
        "Content-Type": "application/json",
        "x-goog-api-key": GEMINI_API_KEY,
    }

    try:
        # Without a timeout an unresponsive upstream would block the worker
        # indefinitely.
        response = requests.post(
            GEMINI_API_ENDPOINT,
            json=payload,
            headers=headers,
            timeout=60,
        )

        # Raise error if response code is not 200
        response.raise_for_status()

        data = response.json()

    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a body that is not valid JSON.
        print(f"Error querying Gemini API: {e}")
        return {"error": str(e)}

    # A response can carry an empty "candidates" or "parts" list; treat any
    # missing level the same as a missing "text" field.
    try:
        candidates = data.get("candidates") or [{}]
        content = candidates[0].get("content") or {}
        parts = content.get("parts") or [{}]
        return parts[0].get("text", "No result found")
    except (AttributeError, IndexError, TypeError):
        return "No result found"


if __name__ == '__main__':
    # Debug mode enables the interactive debugger, which allows arbitrary
    # code execution, so it is opt-in via FLASK_DEBUG instead of always on.
    debug_enabled = os.getenv("FLASK_DEBUG", "").strip().lower() in {"1", "true", "yes", "on"}
    app.run(debug=debug_enabled)





# # Above code is without polling and sleep
# import os
# import whisper
# import requests
# from flask import Flask, request, jsonify, render_template
# import tempfile
# import warnings
# warnings.filterwarnings("ignore", message="FP16 is not supported on CPU; using FP32 instead")

# app = Flask(__name__)
# print("APP IS RUNNING, ANIKET")

# # Gemini API settings
# from dotenv import load_dotenv
# # Load the .env file
# load_dotenv()

# print("ENV LOADED, ANIKET")

# # Fetch the API key from the .env file
# API_KEY = os.getenv("FIRST_API_KEY")

# # Ensure the API key is loaded correctly
# if not API_KEY:
#     raise ValueError("API Key not found. Make sure it is set in the .env file.")

# GEMINI_API_ENDPOINT = "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash-latest:generateContent"
# GEMINI_API_KEY = API_KEY


# # Load Whisper AI model at startup
# print("Loading Whisper AI model..., ANIKET")
# whisper_model = whisper.load_model("base")  # Choose model size: tiny, base, small, medium, large
# print("Whisper AI model loaded successfully, ANIKET")


# # Define the "/" endpoint for health check
# @app.route("/", methods=["GET"])
# def health_check():
#     return jsonify({"status": "success", "message": "API is running successfully!"}), 200

# @app.route("/mbsa")
# def mbsa():
#     return render_template("mbsa.html")

# @app.route('/process-audio', methods=['POST'])
# def process_audio():
#     print("GOT THE PROCESS AUDIO REQUEST, ANIKET")
#     """
#     Flask endpoint to process audio:
#     1. Transcribe provided audio file using Whisper AI.
#     2. Send transcription to Gemini API for recipe information extraction.
#     3. Return structured data in the response.
#     """
    
#     if 'audio' not in request.files:
#         return jsonify({"error": "No audio file provided"}), 400

#     audio_file = request.files['audio']
#     print("AUDIO FILE NAME: ", audio_file)
    
#     try:
#         print("STARTING TRANSCRIPTION, ANIKET")
#         # Step 1: Transcribe the uploaded audio file directly
#         audio_file = request.files['audio']
#         transcription = transcribe_audio(audio_file)
    
#         print("BEFORE THE transcription FAILED ERROR, CHECKING IF I GOT THE TRANSCRIPTION", transcription)
    
#         if not transcription:
#             return jsonify({"error": "Audio transcription failed"}), 500
        
#         print("GOT THE transcription")
    
#         print("Starting the GEMINI REQUEST TO STRUCTURE IT")
#         # Step 2: Generate structured recipe information using Gemini API
#         structured_data = query_gemini_api(transcription)
        
#         print("GOT THE STRUCTURED DATA", structured_data)
#         # Step 3: Return the structured data
#         return jsonify(structured_data)
    
#     except Exception as e:
#         return jsonify({"error": str(e)}), 500

# def transcribe_audio(audio_path):
#     """
#     Transcribe audio using Whisper AI.
#     """
#     print("CAME IN THE transcribe audio function")
#     try:
#         # Transcribe audio using Whisper AI
#         print("Transcribing audio...")
#         result = whisper_model.transcribe(audio_path)
#         print("THE RESULTS ARE", result)
        
#         return result.get("text", "").strip()

#     except Exception as e:
#         print(f"Error in transcription: {e}")
#         return None


# def query_gemini_api(transcription):
#     """
#     Send transcription text to Gemini API and fetch structured recipe information.
#     """
#     try:
#         # Define the structured prompt
#         prompt = (
#             "Analyze the provided cooking video transcription and extract the following structured information:\n"
#             "1. Recipe Name: Identify the name of the dish being prepared.\n"
#             "2. Ingredients List: Extract a detailed list of ingredients with their respective quantities (if mentioned).\n"
#             "3. Steps for Preparation: Provide a step-by-step breakdown of the recipe's preparation process, organized and numbered sequentially.\n"
#             "4. Cooking Techniques Used: Highlight the cooking techniques demonstrated in the video, such as searing, blitzing, wrapping, etc.\n"
#             "5. Equipment Needed: List all tools, appliances, or utensils mentioned, e.g., blender, hot pan, cling film, etc.\n"
#             "6. Nutritional Information (if inferred): Provide an approximate calorie count or nutritional breakdown based on the ingredients used.\n"
#             "7. Serving size: In count of people or portion size.\n"
#             "8. Special Notes or Variations: Include any specific tips, variations, or alternatives mentioned.\n"
#             "9. Festive or Thematic Relevance: Note if the recipe has any special relevance to holidays, events, or seasons.\n"
#             f"Text: {transcription}\n"
#         )

#         # Prepare the payload and headers
#         payload = {
#             "contents": [
#                 {
#                     "parts": [
#                         {"text": prompt}
#                     ]
#                 }
#             ]
#         }
#         headers = {"Content-Type": "application/json"}

#         # Send request to Gemini API and wait for the response
#         print("Querying Gemini API...")
#         response = requests.post(
#             f"{GEMINI_API_ENDPOINT}?key={GEMINI_API_KEY}",
#             json=payload,
#             headers=headers,
#             timeout=60  # 60 seconds timeout for the request
#         )
#         response.raise_for_status()

#         # Extract and return the structured data
#         data = response.json()
#         return data.get("candidates", [{}])[0].get("content", {}).get("parts", [{}])[0].get("text", "No result found")

#     except requests.exceptions.RequestException as e:
#         print(f"Error querying Gemini API: {e}")
#         return {"error": str(e)}


# if __name__ == '__main__':
#     app.run(debug=True)







# import os
# import subprocess
# import whisper
# import requests
# import tempfile
# import warnings
# import threading
# from flask import Flask, request, jsonify, send_file, render_template

# from dotenv import load_dotenv
# import requests




# warnings.filterwarnings("ignore", category=UserWarning, module="whisper")


# app = Flask(__name__)


# # Gemini API settings
# load_dotenv()
# API_KEY = os.getenv("FIRST_API_KEY")

# # Ensure the API key is loaded correctly
# if not API_KEY:
#     raise ValueError("API Key not found. Make sure it is set in the .env file.")

# GEMINI_API_ENDPOINT = "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash-latest:generateContent"
# GEMINI_API_KEY = API_KEY

# # Load Whisper AI model at startup
# print("Loading Whisper AI model...")
# whisper_model = whisper.load_model("base")
# print("Whisper AI model loaded successfully.")

# # Define the "/" endpoint for health check
# @app.route("/", methods=["GET"])
# def health_check():
#     return jsonify({"status": "success", "message": "API is running successfully!"}), 200


# def process_video_in_background(video_file, temp_video_file_name):
#     """
#     This function is executed in a separate thread to handle the long-running
#     video processing tasks such as transcription and querying the Gemini API.
#     """
#     try:
#         transcription = transcribe_audio(temp_video_file_name)

#         if not transcription:
#             print("Audio transcription failed")
#             return

#         structured_data = query_gemini_api(transcription)

#         # Send structured data back or store it in a database, depending on your use case
#         print("Processing complete. Structured data:", structured_data)

#     except Exception as e:
#         print(f"Error processing video: {e}")

#     finally:
#         # Clean up temporary files
#         if os.path.exists(temp_video_file_name):
#             os.remove(temp_video_file_name)


# @app.route('/process-video', methods=['POST'])
# def process_video():
#     if 'video' not in request.files:
#         return jsonify({"error": "No video file provided"}), 400

#     video_file = request.files['video']

#     try:
#         # Save video to a temporary file
#         with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as temp_video_file:
#             video_file.save(temp_video_file.name)
#             print(f"Video file saved: {temp_video_file.name}")

#             # Start the video processing in a background thread
#             threading.Thread(target=process_video_in_background, args=(video_file, temp_video_file.name)).start()

#             return jsonify({"message": "Video is being processed in the background."}), 202

#     except Exception as e:
#         return jsonify({"error": str(e)}), 500


# def transcribe_audio(video_path):
#     """
#     Transcribe audio directly from a video file using Whisper AI.
#     """
#     try:
#         print(f"Transcribing video: {video_path}")
#         result = whisper_model.transcribe(video_path)
#         return result['text']
#     except Exception as e:
#         print(f"Error in transcription: {e}")
#         return None


# def query_gemini_api(transcription):
#     """
#     Send transcription text to Gemini API and fetch structured recipe information.
#     """
#     try:
#         # Define the structured prompt
#         prompt = (
#             "Analyze the provided cooking video transcription and extract the following structured information:\n"
#             "1. Recipe Name: Identify the name of the dish being prepared.\n"
#             "2. Ingredients List: Extract a detailed list of ingredients with their respective quantities (if mentioned).\n"
#             "3. Steps for Preparation: Provide a step-by-step breakdown of the recipe's preparation process, organized and numbered sequentially.\n"
#             "4. Cooking Techniques Used: Highlight the cooking techniques demonstrated in the video, such as searing, blitzing, wrapping, etc.\n"
#             "5. Equipment Needed: List all tools, appliances, or utensils mentioned, e.g., blender, hot pan, cling film, etc.\n"
#             "6. Nutritional Information (if inferred): Provide an approximate calorie count or nutritional breakdown based on the ingredients used.\n"
#             "7. Serving size: In count of people or portion size.\n"
#             "8. Special Notes or Variations: Include any specific tips, variations, or alternatives mentioned.\n"
#             "9. Festive or Thematic Relevance: Note if the recipe has any special relevance to holidays, events, or seasons.\n"
#             f"Text: {transcription}\n"
#         )

#         payload = {
#             "contents": [
#                 {"parts": [{"text": prompt}]}
#             ]
#         }
#         headers = {"Content-Type": "application/json"}

#         # Send request to Gemini API
#         response = requests.post(
#             f"{GEMINI_API_ENDPOINT}?key={GEMINI_API_KEY}",
#             json=payload,
#             headers=headers
#         )
#         response.raise_for_status()

#         # Extract and return the structured data
#         data = response.json()
#         return data.get("candidates", [{}])[0].get("content", {}).get("parts", [{}])[0].get("text", "No result found")

#     except requests.exceptions.RequestException as e:
#         print(f"Error querying Gemini API: {e}")
#         return {"error": str(e)}


# if __name__ == '__main__':
#     app.run(debug=True)