import os
import time
import json
import uuid

import requests
import re

import boto3
import cv2
import pytesseract
import torch
import whisper
from moviepy.editor import VideoFileClip
from flask import Flask, request, jsonify
from flask_apscheduler import APScheduler
from flask_cors import CORS
from werkzeug.utils import secure_filename
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from sentence_transformers import SentenceTransformer
from chromadb import Client as ChromaClient
from chromadb.config import Settings
from chromadb.utils import embedding_functions

app = Flask(__name__)
CORS(app)

scheduler = APScheduler()
scheduler.init_app(app)
scheduler.start()

# Load Whisper model globally to avoid redundant loads per request
MODEL = whisper.load_model("base")

app.config['UPLOAD_FOLDER'] = "/home/ubuntu/classcut/data"

OCR_TEXT_SUFFIX = "_ocrtext.txt"
TRANSCRIPT_SUFFIX = "_transcript.txt"
DETAILS_SUFFIX = "_details.json"

ALLOWED_VIDEO_EXTENSIONS = {'mp4', 'avi', 'mov', 'mkv'}
ALLOWED_AUDIO_EXTENSIONS = {'wav', 'mp3', 'm4a', 'flac'}

# Initialize Mistral 7B model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("bombaygamercc/learnyfi-mistral", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    "bombaygamercc/learnyfi-mistral",
    torch_dtype="auto",
    device_map="auto",
    trust_remote_code=True
)

# Initialize SentenceTransformer for embeddings
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Initialize the Chroma vector store, persisted on disk
CHROMA_PATH = "chroma"
chroma_client = ChromaClient(Settings(persist_directory=CHROMA_PATH))
collection = chroma_client.get_or_create_collection(name="video_transcripts")

# AWS S3 configuration
S3_BUCKET = 'classcut-videos'
S3_REGION = 'ap-south-1'
AWS_ACCESS_KEY_ID = os.getenv('AWS_ACCESS_KEY_ID')
AWS_SECRET_ACCESS_KEY = os.getenv('AWS_SECRET_ACCESS_KEY')

s3 = boto3.client(
    's3',
    region_name=S3_REGION,
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY
)


def upload_to_s3(file_path):
    """
    Uploads a file to the configured S3 bucket.

    :param file_path: Path to the local file.
    :return: Public S3 URL of the uploaded file, or None on failure.
    """
    file_name = os.path.basename(file_path)
    try:
        s3.upload_file(file_path, S3_BUCKET, file_name, ExtraArgs={
            'ContentType': 'binary/octet-stream',
            'ContentDisposition': 'inline'
        })
        # Construct the S3 URL
        s3_url = f"https://{S3_BUCKET}.s3.{S3_REGION}.amazonaws.com/{file_name}"
        print(f"Uploaded {file_name} to S3 bucket: {S3_BUCKET}")
        return s3_url
    except Exception as e:
        print(f"Error uploading {file_name} to S3: {e}")
        return None


def extract_audio(video_path):
    """
    Extracts audio from a given video file and saves it as an mp3 file.

    :param video_path: Path to the video file.
    :return: Path to the extracted audio file.
    """
    with VideoFileClip(video_path) as video:
        audio_path = f"{video_path}.mp3"
        video.audio.write_audiofile(audio_path)
    return audio_path


def transcribe_with_timestamps(audio_path):
    """
    Transcribes the given audio file using the Whisper model, including timestamps.

    :param audio_path: Path to the audio file.
    :return: A list of transcribed segments with timestamps.
    """
    result = MODEL.transcribe(audio_path, verbose=True, language='hi')
    return [f"{seg['start']} - {seg['end']}: {seg['text']}" for seg in result["segments"]]


def format_transcript(transcript_segments):
    """
    Formats transcript segments into a single string.

    :param transcript_segments: List of transcript segments.
    :return: Formatted transcript.
    """
    return "\n".join(transcript_segments).replace('\\n', ' ').strip()


def extract_text_from_video(video_path, frame_interval=30):
    """
    Extracts text from video frames using Tesseract OCR and saves unique text.

    :param video_path: Path to the video file.
    :param frame_interval: Interval to capture frames for OCR (in seconds).
    :return: List of unique text found in the video.
    """
    print(f"Attempting to extract text from {video_path}")
    unique_texts = set()
    with VideoFileClip(video_path) as video:
        duration = int(video.duration)
        print(f"Duration of video: {duration} seconds.")
        print(f"Frame interval: {frame_interval} seconds.")
        for time_sec in range(0, duration, frame_interval):
            frame = video.get_frame(time_sec)
            # moviepy yields RGB frames, so convert from RGB (not BGR) to grayscale
            gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
            text = pytesseract.image_to_string(gray).strip()
            if text:
                unique_texts.add(text)
    with open(f"{video_path}{OCR_TEXT_SUFFIX}", 'w') as file:
        file.write("\n".join(unique_texts))
    return list(unique_texts)


def process_video(video_path):
    # Extract audio and transcribe
    audio_path = extract_audio(video_path)
    transcript_segments = transcribe_with_timestamps(audio_path)
    with open(f"{video_path}{TRANSCRIPT_SUFFIX}", 'w') as file:
        file.write("\n".join(transcript_segments))

    # Extract text from video frames
    extract_text_from_video(video_path)

    # Fine-tune the Mistral model on the new transcript
    fine_tune_model(transcript_segments)

    # Add the transcript to ChromaDB
    add_to_chromadb(' '.join(transcript_segments))

    print(f"Processing of {video_path} completed.")


def allowed_video_file(filename):
    return '.' in filename and \
        filename.rsplit('.', 1)[1].lower() in ALLOWED_VIDEO_EXTENSIONS


def add_to_chromadb(text):
    # Generate embeddings
    embeddings = embedding_model.encode([text])
    # Add to ChromaDB; Chroma requires a unique id per document
    collection.add(
        ids=[str(uuid.uuid4())],
        documents=[text],
        embeddings=embeddings.tolist(),
        metadatas=[{'source': 'video_transcript'}]
    )
    print("Text appended to ChromaDB.")


def fine_tune_model(transcript_segments):
    # Prepare data for fine-tuning
    print("Preparing data for fine-tuning...")
    # Mistral-style tokenizers ship without a pad token, so fall back to EOS
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    dataset = [{'input_ids': tokenizer.encode(text, return_tensors='pt')[0]}
               for text in transcript_segments]

    # Define training arguments
    training_args = TrainingArguments(
        output_dir='./fine_tuned_model',
        num_train_epochs=1,             # Adjust as needed
        per_device_train_batch_size=1,  # Adjust based on your hardware
        save_steps=10,
        save_total_limit=2,
        logging_steps=10,
        learning_rate=5e-5,             # Hyperparameter tuning can be done here
        fp16=True,                      # Enable if using a compatible GPU
    )

    # Define a data collator that pads variable-length sequences into one batch
    def data_collator(features):
        input_ids = torch.nn.utils.rnn.pad_sequence(
            [f['input_ids'] for f in features],
            batch_first=True,
            padding_value=tokenizer.pad_token_id,
        )
        labels = input_ids.clone()
        labels[labels == tokenizer.pad_token_id] = -100  # Ignore padding in the loss
        return {'input_ids': input_ids, 'labels': labels}

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        data_collator=data_collator,
    )

    # Fine-tune the model
    print("Starting fine-tuning...")
    trainer.train()
    print("Fine-tuning completed.")

    # Save the fine-tuned model
    model.save_pretrained('./fine_tuned_model')
    tokenizer.save_pretrained('./fine_tuned_model')
    print("Fine-tuned model saved.")
def query_chatbot(query_text):
    # Retrieve relevant documents from ChromaDB
    query_embedding = embedding_model.encode([query_text])
    results = collection.query(query_embeddings=query_embedding.tolist(), n_results=5)
    context_text = " ".join(results['documents'][0])

    # Prepare input for the model
    prompt = f"Context: {context_text}\n\nQuestion: {query_text}\n\nAnswer:"
    inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=150)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Extract the answer part
    answer = response.split("Answer:")[-1].strip()
    return answer


@app.route('/hello', methods=['GET'])
def hello():
    return jsonify({'message': 'Hello, World!'})


@app.route('/upload', methods=['POST'])
def upload_file():
    print("Request received.")
    if 'file' not in request.files:
        return jsonify({'error': 'No file part'}), 400
    file = request.files['file']
    if file.filename == '':
        return jsonify({'error': 'No selected file'}), 400
    if file and allowed_video_file(file.filename):
        filename = secure_filename(file.filename)
        file_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
        if not os.path.exists(file_path):
            print(f"Saving {file.filename} to {file_path}")
            try:
                file.save(file_path)
                # Use a per-file job id so a second upload does not collide
                scheduler.add_job(func=process_file, args=[file_path],
                                  trigger='date', id=f'process_{filename}')
                return jsonify({'filename': filename}), 200
            except Exception as e:
                return jsonify({'error': str(e)}), 502
        else:
            print(f"We have already processed this file - {filename}. Skipping processing.")
            return jsonify({'filename': filename}), 200
    else:
        return jsonify({'error': 'File type not allowed'}), 400


def process_file(file_path):
    # Your file processing logic here
    print(f'Processing file: {file_path}')
    process_video(file_path)
    # Simulate a long processing task
    time.sleep(10)
    print('File processed!')


@app.route('/details', methods=['POST'])
def get_details():
    data = request.get_json()
    filename = data.get('filename') if data else None
    if not filename:
        return jsonify({'error': 'No filename provided'}), 400
    print(f"Received request for details of filename: {filename}")
    details_json = os.path.join(app.config['UPLOAD_FOLDER'], f"{filename}{DETAILS_SUFFIX}")
    print(f"Details JSON path: {details_json}")
    if os.path.exists(details_json):
        with open(details_json, 'r') as file:
            details = json.load(file)
        return jsonify(details)
    else:
        return jsonify({'error': 'Details not found'}), 404


@app.route('/chat', methods=['POST'])
def chat():
    chat_msg = request.form.get('chat_msg')
    if chat_msg:
        print(f"Received chat message: {chat_msg}")
        resp = query_chatbot(chat_msg)
        return jsonify({"status": "success", "response": resp})
    else:
        return jsonify({"status": "error", "message": "No chat message received"}), 400


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)
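# Example requests for manual testing against the routes above (hypothetical
# host and file names, assuming the server runs locally on port 5000):
#
#   curl -F "file=@lecture.mp4" http://localhost:5000/upload
#   curl -X POST -H "Content-Type: application/json" \
#        -d '{"filename": "lecture.mp4"}' http://localhost:5000/details
#   curl -X POST -F "chat_msg=What topics does the lecture cover?" \
#        http://localhost:5000/chat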