Spaces:

GoodML
/

Dish-Decode-2

Running

App Files Files Community

Dish-Decode-2 / app.py

GoodML

Update app.py

985ead6 verified 7 months ago

raw

history blame

21.4 kB

	# import os
	# import requests
	# import cv2
	# import re
	# from flask import Flask, request, jsonify, render_template
	# from deepgram import DeepgramClient, PrerecordedOptions
	# from dotenv import load_dotenv
	# import tempfile
	# import json
	# import subprocess


	# import warnings
	# warnings.filterwarnings("ignore", message="FP16 is not supported on CPU; using FP32 instead")

	# app = Flask(__name__)
	# print("APP IS RUNNING, ANIKET")

	# # Load the .env file
	# load_dotenv()

	# print("ENV LOADED, ANIKET")

	# # Fetch the API key from the .env file
	# API_KEY = os.getenv("FIRST_API_KEY")
	# DEEPGRAM_API_KEY = os.getenv("SECOND_API_KEY")

	# # Ensure the API key is loaded correctly
	# if not API_KEY:
	# raise ValueError("API Key not found. Make sure it is set in the .env file.")

	# if not DEEPGRAM_API_KEY:
	# raise ValueError("DEEPGRAM_API_KEY not found. Make sure it is set in the .env file.")

	# GEMINI_API_ENDPOINT = "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash-latest:generateContent"
	# GEMINI_API_KEY = API_KEY

	# @app.route("/", methods=["GET"])
	# def health_check():
	# return jsonify({"status": "success", "message": "API is running successfully!"}), 200


	# def transcribe_audio(wav_file_path):
	# """
	# Transcribe audio from a video file using Deepgram API synchronously.

	# Args:
	# wav_file_path (str): Path to save the converted WAV file.
	# Returns:
	# dict: A dictionary containing status, transcript, or error message.
	# """
	# print("Entered the transcribe_audio function")
	# try:
	# # Initialize Deepgram client
	# deepgram = DeepgramClient(DEEPGRAM_API_KEY)

	# # Open the converted WAV file
	# with open(wav_file_path, 'rb') as buffer_data:
	# payload = {'buffer': buffer_data}

	# # Configure transcription options
	# options = PrerecordedOptions(
	# smart_format=True, model="nova-2", language="en-US"
	# )

	# # Transcribe the audio
	# response = deepgram.listen.prerecorded.v('1').transcribe_file(payload, options)

	# # Check if the response is valid
	# if response:
	# # print("Request successful! Processing response.")

	# # Convert response to JSON string
	# try:
	# data_str = response.to_json(indent=4)
	# except AttributeError as e:
	# return {"status": "error", "message": f"Error converting response to JSON: {e}"}

	# # Parse the JSON string to a Python dictionary
	# try:
	# data = json.loads(data_str)
	# except json.JSONDecodeError as e:
	# return {"status": "error", "message": f"Error parsing JSON string: {e}"}

	# # Extract the transcript
	# try:
	# transcript = data["results"]["channels"][0]["alternatives"][0]["transcript"]
	# except KeyError as e:
	# return {"status": "error", "message": f"Error extracting transcript: {e}"}

	# print(f"Transcript obtained: {transcript}")
	# # Step: Save the transcript to a text file
	# transcript_file_path = "transcript_from_transcribe_audio.txt"
	# with open(transcript_file_path, "w", encoding="utf-8") as transcript_file:
	# transcript_file.write(transcript)
	# # print(f"Transcript saved to file: {transcript_file_path}")

	# return transcript
	# else:
	# return {"status": "error", "message": "Invalid response from Deepgram."}

	# except FileNotFoundError:
	# return {"status": "error", "message": f"Video file not found: {wav_file_path}"}
	# except Exception as e:
	# return {"status": "error", "message": f"Unexpected error: {e}"}
	# finally:
	# # Clean up the temporary WAV file
	# if os.path.exists(wav_file_path):
	# os.remove(wav_file_path)
	# print(f"Temporary WAV file deleted: {wav_file_path}")



	# def download_video(url, temp_video_path):
	# """Download video (MP4 format) from the given URL and save it to temp_video_path."""
	# response = requests.get(url, stream=True)
	# if response.status_code == 200:
	# with open(temp_video_path, 'wb') as f:
	# for chunk in response.iter_content(chunk_size=1024):
	# f.write(chunk)
	# print(f"Audio downloaded successfully to {temp_video_path}")
	# else:
	# raise Exception(f"Failed to download audio, status code: {response.status_code}")


	# def preprocess_frame(frame):
	# """Preprocess the frame for better OCR accuracy."""
	# gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
	# denoised = cv2.medianBlur(gray, 3)
	# _, thresh = cv2.threshold(denoised, 0, 255, cv2.THRESH_BINARY \| cv2.THRESH_OTSU)
	# return thresh

	# def clean_ocr_text(text):
	# """Clean the OCR output by removing noise and unwanted characters."""
	# cleaned_text = re.sub(r'[^A-Za-z0-9\s,.!?-]', '', text)
	# cleaned_text = '\n'.join([line.strip() for line in cleaned_text.splitlines() if len(line.strip()) > 2])
	# return cleaned_text

	# def get_information_from_video_using_OCR(video_path, interval=1):
	# """Extract text from video frames using OCR and return the combined text content."""
	# cap = cv2.VideoCapture(video_path)
	# fps = int(cap.get(cv2.CAP_PROP_FPS))
	# frame_interval = interval * fps
	# frame_count = 0
	# extracted_text = ""

	# print("Starting text extraction from video...")

	# while cap.isOpened():
	# ret, frame = cap.read()
	# if not ret:
	# break

	# if frame_count % frame_interval == 0:
	# preprocessed_frame = preprocess_frame(frame)
	# text = pytesseract.image_to_string(preprocessed_frame, lang='eng', config='--psm 6 --oem 3')
	# cleaned_text = clean_ocr_text(text)
	# if cleaned_text:
	# extracted_text += cleaned_text + "\n\n"
	# print(f"Text found at frame {frame_count}: {cleaned_text[:50]}...")

	# frame_count += 1

	# cap.release()
	# print("Text extraction completed.")
	# return extracted_text




	# @app.route('/process-video', methods=['POST'])
	# def process_video():
	# if 'videoUrl' not in request.json:
	# return jsonify({"error": "No video URL provided"}), 400

	# video_url = request.json['videoUrl']
	# temp_video_path = None

	# try:
	# # Step 1: Download the WAV file from the provided URL
	# with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as temp_video_file:
	# temp_video_path = temp_video_file.name
	# download_video(video_url, temp_video_path)
	# interval = 1
	# # Step 2: get the information from the downloaded MP4 file synchronously
	# video_info = get_information_from_video_using_OCR(temp_video_path, interval)

	# if not video_info:
	# video_info = ""



	# # Step 2: Convert the MP4 to WAV
	# with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_wav_file:
	# temp_wav_path = temp_wav_file.name
	# convert_mp4_to_wav(temp_video_path, temp_wav_path)

	# audio_info = transcribe_audio(temp_wav_path)

	# # If no transcription present, use an empty string
	# if not audio_info:
	# audio_info = ""



	# # Step 3: Generate structured recipe information using Gemini API synchronously
	# structured_data = query_gemini_api(video_info, audio_info)

	# return jsonify(structured_data)

	# except Exception as e:
	# return jsonify({"error": str(e)}), 500

	# finally:
	# # Clean up temporary audio file
	# if temp_video_path and os.path.exists(temp_video_path):
	# os.remove(temp_video_path)
	# print(f"Temporary audio file deleted: {temp_video_path}")






	# def query_gemini_api(video_transcription, audio_transcription):
	# """
	# Send transcription text to Gemini API and fetch structured recipe information synchronously.
	# """
	# try:
	# # Define the structured prompt
	# prompt = (
	# "Analyze the provided cooking video and audio transcription combined and based on the combined information extract the following structured information:\n"
	# "1. Recipe Name: Identify the name of the dish being prepared.\n"
	# "2. Ingredients List: Extract a detailed list of ingredients with their respective quantities (if mentioned).\n"
	# "3. Steps for Preparation: Provide a step-by-step breakdown of the recipe's preparation process, organized and numbered sequentially.\n"
	# "4. Cooking Techniques Used: Highlight the cooking techniques demonstrated in the video, such as searing, blitzing, wrapping, etc.\n"
	# "5. Equipment Needed: List all tools, appliances, or utensils mentioned, e.g., blender, hot pan, cling film, etc.\n"
	# "6. Nutritional Information (if inferred): Provide an approximate calorie count or nutritional breakdown based on the ingredients used.\n"
	# "7. Serving size: In count of people or portion size.\n"
	# "8. Special Notes or Variations: Include any specific tips, variations, or alternatives mentioned.\n"
	# "9. Festive or Thematic Relevance: Note if the recipe has any special relevance to holidays, events, or seasons.\n"
	# "Also, make sure not to provide anything else or any other information or warning or text apart from the above things mentioned."
	# f"Text: {audio_transcription}\n"
	# f"Text: {video_transcription}\n"

	# )

	# # Prepare the payload and headers
	# payload = {
	# "contents": [
	# {
	# "parts": [
	# {"text": prompt}
	# ]
	# }
	# ]
	# }
	# headers = {"Content-Type": "application/json"}

	# # Send request to Gemini API synchronously
	# response = requests.post(
	# f"{GEMINI_API_ENDPOINT}?key={GEMINI_API_KEY}",
	# json=payload,
	# headers=headers,
	# )

	# # Raise error if response code is not 200
	# response.raise_for_status()

	# data = response.json()

	# return data.get("candidates", [{}])[0].get("content", {}).get("parts", [{}])[0].get("text", "No result found")

	# except requests.exceptions.RequestException as e:
	# print(f"Error querying Gemini API: {e}")
	# return {"error": str(e)}


	# if __name__ == '__main__':
	# app.run(debug=True)



	import os
	import requests
	import cv2
	import re
	import pytesseract
	from flask import Flask, request, jsonify, render_template
	from deepgram import DeepgramClient, PrerecordedOptions
	from dotenv import load_dotenv
	import tempfile
	import json
	import subprocess


	import warnings

	# Suppress the one specific CPU/FP16 fallback warning so it does not flood the logs.
	warnings.filterwarnings("ignore", message="FP16 is not supported on CPU; using FP32 instead")

	# Application object that the route decorators further down attach to.
	app = Flask(__name__)
	print("APP IS RUNNING, ANIKET")

	# Make the variables from the .env file visible through the process environment.
	load_dotenv()
	print("ENV LOADED, ANIKET")

	# FIRST_API_KEY is the key later used for Gemini; SECOND_API_KEY is the Deepgram key.
	API_KEY = os.environ.get("FIRST_API_KEY")
	DEEPGRAM_API_KEY = os.environ.get("SECOND_API_KEY")

	# Abort at import time (instead of on the first request) when a key is missing or empty.
	if not API_KEY:
	    raise ValueError("API Key not found. Make sure it is set in the .env file.")
	if not DEEPGRAM_API_KEY:
	    raise ValueError("DEEPGRAM_API_KEY not found. Make sure it is set in the .env file.")

	# Gemini REST settings consumed by query_gemini_api().
	GEMINI_API_KEY = API_KEY
	GEMINI_API_ENDPOINT = "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash-latest:generateContent"

	@app.route("/", methods=["GET"])
	def health_check():
	    """Liveness endpoint: answer GET / with a fixed JSON success body and HTTP 200."""
	    body = {"status": "success", "message": "API is running successfully!"}
	    return jsonify(body), 200


	def transcribe_audio(wav_file_path):
	    """
	    Transcribe a WAV file with the Deepgram prerecorded API (synchronous).

	    The file at ``wav_file_path`` is deleted before this function returns,
	    whether transcription succeeded or not. This function never raises for
	    transcription problems; failures are reported through the return value.

	    Args:
	        wav_file_path (str): Path of the WAV file to read and upload.
	    Returns:
	        str | dict: The transcript text on success. On any failure, a dict of
	        the form ``{"status": "error", "message": "..."}``. Callers must
	        check the type before treating the result as text.
	    """
	    print("Entered the transcribe_audio function")
	    try:
	        # Initialize Deepgram client
	        deepgram = DeepgramClient(DEEPGRAM_API_KEY)

	        # Configure transcription options
	        options = PrerecordedOptions(
	            smart_format=True, model="nova-2", language="en-US"
	        )

	        # The upload has to happen while the file handle is still open.
	        with open(wav_file_path, 'rb') as buffer_data:
	            payload = {'buffer': buffer_data}
	            response = deepgram.listen.prerecorded.v('1').transcribe_file(payload, options)

	        if not response:
	            return {"status": "error", "message": "Invalid response from Deepgram."}

	        # Convert the SDK response object to a JSON string
	        try:
	            data_str = response.to_json(indent=4)
	        except AttributeError as e:
	            return {"status": "error", "message": f"Error converting response to JSON: {e}"}

	        # Parse the JSON string to a Python dictionary
	        try:
	            data = json.loads(data_str)
	        except json.JSONDecodeError as e:
	            return {"status": "error", "message": f"Error parsing JSON string: {e}"}

	        # Extract the transcript. Empty "channels"/"alternatives" lists raise
	        # IndexError and null intermediate values raise TypeError; both are
	        # extraction failures just like a missing key.
	        try:
	            transcript = data["results"]["channels"][0]["alternatives"][0]["transcript"]
	        except (KeyError, IndexError, TypeError) as e:
	            return {"status": "error", "message": f"Error extracting transcript: {e}"}

	        print(f"Transcript obtained: {transcript}")

	        # Debug dump of the transcript. This is best-effort: a read-only or
	        # full working directory must not discard a transcript we already have.
	        # NOTE(review): the fixed file name is shared by concurrent requests.
	        transcript_file_path = "transcript_from_transcribe_audio.txt"
	        try:
	            with open(transcript_file_path, "w", encoding="utf-8") as transcript_file:
	                transcript_file.write(transcript)
	        except OSError as e:
	            print(f"Could not save transcript to {transcript_file_path}: {e}")

	        return transcript

	    except FileNotFoundError:
	        return {"status": "error", "message": f"Video file not found: {wav_file_path}"}
	    except Exception as e:
	        # Top-level boundary: report any SDK/network failure to the caller as data.
	        return {"status": "error", "message": f"Unexpected error: {e}"}
	    finally:
	        # Clean up the temporary WAV file
	        if os.path.exists(wav_file_path):
	            os.remove(wav_file_path)
	            print(f"Temporary WAV file deleted: {wav_file_path}")



	def download_video(url, temp_video_path, timeout=30):
	    """Download the file at ``url`` (expected to be an MP4) into ``temp_video_path``.

	    Args:
	        url (str): HTTP(S) URL to fetch.
	        temp_video_path (str): Destination file path; overwritten if it exists.
	        timeout (float | tuple): Seconds passed to ``requests.get`` as its
	            connect/read timeout, so a stalled server cannot block forever.
	    Raises:
	        Exception: If the server answers with a status code other than 200.
	        requests.exceptions.RequestException: On connection errors or timeouts.
	    """
	    # NOTE(review): `url` is supplied by the HTTP client of /process-video, so this
	    # fetch can be pointed at internal addresses (SSRF). Consider an allow-list.
	    # Using the response as a context manager releases the streamed connection
	    # on every exit path.
	    with requests.get(url, stream=True, timeout=timeout) as response:
	        if response.status_code != 200:
	            raise Exception(f"Failed to download audio, status code: {response.status_code}")

	        with open(temp_video_path, 'wb') as f:
	            for chunk in response.iter_content(chunk_size=64 * 1024):
	                if chunk:  # skip empty keep-alive chunks
	                    f.write(chunk)

	    print(f"Audio downloaded successfully to {temp_video_path}")


	def preprocess_frame(frame):
	    """Prepare a BGR frame for OCR: grayscale, median blur, then Otsu binarization.

	    Args:
	        frame: Image array in BGR channel order, as returned by ``cv2.VideoCapture.read``.
	    Returns:
	        The thresholded single-channel image produced by ``cv2.threshold``.
	    """
	    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
	    # Median filter with aperture size 3 to suppress speckle noise before thresholding.
	    denoised = cv2.medianBlur(gray, 3)
	    # The two flags are combined with bitwise OR; the source had a stray backslash
	    # ("\|") here, which is a syntax error.
	    _, thresh = cv2.threshold(denoised, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
	    return thresh

	def clean_ocr_text(text):
	    """Clean raw OCR output by dropping noise characters and very short lines.

	    Every character other than ASCII letters, digits, whitespace and ``, . ! ? -``
	    is removed. Each remaining line is stripped, and lines of two characters or
	    fewer are discarded.

	    Args:
	        text (str | None): Raw OCR text; ``None`` or ``""`` yields ``""``.
	    Returns:
	        str: The surviving lines joined with ``"\\n"``.
	    """
	    if not text:
	        return ""
	    allowed_only = re.sub(r'[^A-Za-z0-9\s,.!?-]', '', text)
	    # Strip each line once, then filter on the stripped length.
	    stripped_lines = (line.strip() for line in allowed_only.splitlines())
	    return '\n'.join(line for line in stripped_lines if len(line) > 2)

	def get_information_from_video_using_OCR(video_path, interval=1):
	    """Run OCR on frames sampled from a video and return the combined cleaned text.

	    Args:
	        video_path (str): Path of the video file to open with OpenCV.
	        interval (int | float): Sampling period in seconds; one frame is OCR'd
	            every ``interval * fps`` frames.
	    Returns:
	        str: The cleaned text of each sampled frame that produced any, each
	        followed by a blank line; ``""`` when nothing was found or the video
	        could not be opened.
	    """
	    cap = cv2.VideoCapture(video_path)
	    text_chunks = []

	    try:
	        fps = int(cap.get(cv2.CAP_PROP_FPS))
	        # A reported FPS of 0 would make the modulo below divide by zero, so
	        # fall back to sampling every frame.
	        frame_interval = max(1, int(interval * fps))
	        frame_count = 0

	        print("Starting text extraction from video...")

	        while cap.isOpened():
	            ret, frame = cap.read()
	            if not ret:
	                break

	            if frame_count % frame_interval == 0:
	                preprocessed_frame = preprocess_frame(frame)
	                text = pytesseract.image_to_string(preprocessed_frame, lang='eng', config='--psm 6 --oem 3')
	                cleaned_text = clean_ocr_text(text)
	                if cleaned_text:
	                    text_chunks.append(cleaned_text)
	                    print(f"Text found at frame {frame_count}: {cleaned_text[:50]}...")

	            frame_count += 1
	    finally:
	        # Release the capture even when OCR or preprocessing raises.
	        cap.release()

	    print("Text extraction completed.")
	    return "".join(f"{chunk}\n\n" for chunk in text_chunks)


	def convert_mp4_to_wav(mp4_path, wav_path):
	    """Extract the audio track of an MP4 into a 16-bit PCM, 44.1 kHz, stereo WAV via ffmpeg.

	    Args:
	        mp4_path (str): Path of the input video file.
	        wav_path (str): Path of the WAV file to write; overwritten if it exists.
	    Raises:
	        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
	        FileNotFoundError: If the ``ffmpeg`` executable is not on PATH.
	    """
	    # An argument list (no shell) keeps paths with spaces or shell metacharacters intact.
	    command = [
	        "ffmpeg",
	        "-nostdin",  # never wait on stdin when running inside a server process
	        "-y",        # the caller may pre-create the output file; overwrite it instead of aborting
	        "-i", mp4_path,
	        "-vn",
	        "-acodec", "pcm_s16le",
	        "-ar", "44100",
	        "-ac", "2",
	        wav_path,
	    ]
	    subprocess.run(command, check=True)
	    print(f"MP4 file converted to WAV: {wav_path}")


	@app.route('/process-video', methods=['POST'])
	def process_video():
	    """Handle POST /process-video: download a video, OCR it, transcribe it, query Gemini.

	    Expects a JSON object body with a non-empty ``videoUrl`` string.

	    Returns:
	        * 400 with ``{"error": ...}`` when no usable ``videoUrl`` is supplied.
	        * 500 with ``{"error": ...}`` when any processing step raises.
	        * Otherwise the JSON-encoded result of ``query_gemini_api`` (HTTP 200).
	    """
	    body = request.json
	    # A JSON body of null, a list or a scalar would break the membership test below.
	    if not isinstance(body, dict) or not body.get('videoUrl'):
	        return jsonify({"error": "No video URL provided"}), 400

	    # NOTE(review): the URL is client-controlled and fetched server-side (SSRF risk).
	    video_url = body['videoUrl']

	    try:
	        # Both intermediate files live in one private directory that is removed on
	        # every exit path. The WAV path is deliberately not pre-created, so ffmpeg
	        # never hits its "file already exists" check.
	        with tempfile.TemporaryDirectory() as work_dir:
	            temp_video_path = os.path.join(work_dir, "video.mp4")
	            temp_wav_path = os.path.join(work_dir, "audio.wav")

	            # Step 1: Download the MP4 file from the provided URL
	            download_video(video_url, temp_video_path)

	            # Step 2: Extract on-screen text from the video frames
	            video_info = get_information_from_video_using_OCR(temp_video_path, interval=1) or ""

	            # Step 3: Convert the MP4 to WAV
	            convert_mp4_to_wav(temp_video_path, temp_wav_path)

	            # Step 4: Transcribe the audio
	            audio_info = transcribe_audio(temp_wav_path)

	            # transcribe_audio reports failures as a dict; keep that out of the
	            # prompt and continue with the OCR text alone.
	            if isinstance(audio_info, dict):
	                print(f"Transcription failed: {audio_info.get('message')}")
	                audio_info = ""
	            elif not audio_info:
	                audio_info = ""

	            # Step 5: Generate structured recipe information using Gemini API
	            # NOTE(review): an {"error": ...} dict from query_gemini_api is still
	            # returned with HTTP 200, as before.
	            structured_data = query_gemini_api(video_info, audio_info)

	            return jsonify(structured_data)

	    except Exception as e:
	        return jsonify({"error": str(e)}), 500


	def query_gemini_api(video_transcription, audio_transcription, timeout=120):
	    """
	    Send the OCR text and audio transcript to the Gemini API and return its answer.

	    Args:
	        video_transcription (str): Text extracted from the video frames.
	        audio_transcription (str): Transcript of the audio track.
	        timeout (float | tuple): Seconds passed to ``requests.post`` as its
	            connect/read timeout.
	    Returns:
	        str | dict: The text of the first part of the first candidate, or
	        ``"No result found"`` when the response carries none. On a transport,
	        HTTP or decoding failure, ``{"error": "<message>"}``.
	    """
	    # Define the structured prompt
	    prompt = (
	        "Analyze the provided cooking video and audio transcription combined and based on the combined information extract the following structured information:\n"
	        "1. Recipe Name: Identify the name of the dish being prepared.\n"
	        "2. Ingredients List: Extract a detailed list of ingredients with their respective quantities (if mentioned).\n"
	        "3. Steps for Preparation: Provide a step-by-step breakdown of the recipe's preparation process, organized and numbered sequentially.\n"
	        "4. Cooking Techniques Used: Highlight the cooking techniques demonstrated in the video, such as searing, blitzing, wrapping, etc.\n"
	        "5. Equipment Needed: List all tools, appliances, or utensils mentioned, e.g., blender, hot pan, cling film, etc.\n"
	        "6. Nutritional Information (if inferred): Provide an approximate calorie count or nutritional breakdown based on the ingredients used.\n"
	        "7. Serving size: In count of people or portion size.\n"
	        "8. Special Notes or Variations: Include any specific tips, variations, or alternatives mentioned.\n"
	        "9. Festive or Thematic Relevance: Note if the recipe has any special relevance to holidays, events, or seasons.\n"
	        f"Text: {audio_transcription}\n"
	        f"Text: {video_transcription}\n"
	    )

	    payload = {"contents": [{"parts": [{"text": prompt}]}]}
	    # The key travels in a header rather than the query string: request errors
	    # stringify the URL, and that string is returned to the HTTP client.
	    headers = {
	        "Content-Type": "application/json",
	        "x-goog-api-key": GEMINI_API_KEY,
	    }

	    try:
	        response = requests.post(
	            GEMINI_API_ENDPOINT,
	            json=payload,
	            headers=headers,
	            timeout=timeout,
	        )
	        # Raise error if response code is not 2xx
	        response.raise_for_status()
	        data = response.json()
	    except (requests.exceptions.RequestException, ValueError) as e:
	        print(f"Error querying Gemini API: {e}")
	        return {"error": str(e)}

	    # Walk candidates[0].content.parts[0].text, tolerating missing keys and
	    # empty lists; an empty "candidates" list would otherwise raise IndexError.
	    if not isinstance(data, dict):
	        return "No result found"
	    candidates = data.get("candidates") or [{}]
	    content = candidates[0].get("content") or {}
	    parts = content.get("parts") or [{}]
	    return parts[0].get("text", "No result found")


	if __name__ == '__main__':
	    # Flask's debug mode serves the Werkzeug interactive debugger, which executes
	    # arbitrary code from the browser. Enable it only on explicit opt-in
	    # (FLASK_DEBUG=1 or FLASK_DEBUG=true) instead of unconditionally.
	    debug_enabled = os.getenv("FLASK_DEBUG", "").strip().lower() in ("1", "true")
	    app.run(debug=debug_enabled)