|
import os |
|
import time |
|
import json |
|
import requests |
|
import whisper |
|
import cv2 |
|
import pytesseract |
|
import re |
|
import boto3 |
|
from moviepy.editor import VideoFileClip |
|
from flask import Flask, request, jsonify |
|
from flask_apscheduler import APScheduler |
|
from flask_cors import CORS |
|
from werkzeug.utils import secure_filename |
|
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments |
|
from sentence_transformers import SentenceTransformer |
|
from chromadb import Client as ChromaClient |
|
from chromadb.config import Settings |
|
from chromadb.utils import embedding_functions |
|
|
|
# --- Flask application and background-job scheduler -------------------------
app = Flask(__name__)
CORS(app)  # allow cross-origin calls from the web frontend

scheduler = APScheduler()
scheduler.init_app(app)
scheduler.start()

# Whisper ASR model, loaded once at startup and shared by all requests.
MODEL = whisper.load_model("base")

# Where uploaded videos and their derived artifacts are stored.
app.config['UPLOAD_FOLDER'] = "/home/ubuntu/classcut/data"

# Suffixes appended to the video path for each derived artifact file.
OCR_TEXT_SUFFIX = "_ocrtext.txt"
TRANSCRIPT_SUFFIX = "_transcript.txt"
DETAILS_SUFFIX = "_details.json"

ALLOWED_VIDEO_EXTENSIONS = {'mp4', 'avi', 'mov', 'mkv'}
ALLOWED_AUDIO_EXTENSIONS = {'wav', 'mp3', 'm4a', 'flac'}

# --- LLM used for chat answers and fine-tuning ------------------------------
# NOTE(review): a 7B model is loaded at import time; device_map="auto" spreads
# it across available devices — confirm the host has enough GPU/CPU memory.
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1",
    torch_dtype="auto",
    device_map="auto",
    trust_remote_code=True
)

# Sentence embedder used both for indexing transcripts and for queries.
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# --- ChromaDB vector store --------------------------------------------------
CHROMA_PATH = "chroma"

chroma_client = ChromaClient(Settings(persist_directory=CHROMA_PATH))
collection = chroma_client.get_or_create_collection(name="video_transcripts")

# --- S3 upload target -------------------------------------------------------
S3_BUCKET = 'classcut-videos'
S3_REGION = 'ap-south-1'
AWS_ACCESS_KEY_ID = os.getenv('AWS_ACCESS_KEY_ID')
AWS_SECRET_ACCESS_KEY = os.getenv('AWS_SECRET_ACCESS_KEY')

s3 = boto3.client('s3', region_name=S3_REGION,
                  aws_access_key_id=AWS_ACCESS_KEY_ID,
                  aws_secret_access_key=AWS_SECRET_ACCESS_KEY)
|
|
|
|
|
def upload_to_s3(file_path):
    """Upload a local file to the configured S3 bucket.

    :param file_path: Path to the local file to upload.
    :return: Public HTTPS URL of the uploaded object, or ``None`` when the
             upload failed (best-effort: the error is printed, not raised).
    """
    # os.path.basename is portable; split('/') breaks on Windows paths.
    file_name = os.path.basename(file_path)

    try:
        s3.upload_file(file_path, S3_BUCKET, file_name, ExtraArgs={
            'ContentType': 'binary/octet-stream',
            'ContentDisposition': 'inline'
        })
    except Exception as e:
        print(f"Error uploading {file_name} to S3: {e}")
        # Explicit None (the original fell off the end implicitly).
        return None

    s3_url = f"https://{S3_BUCKET}.s3.{S3_REGION}.amazonaws.com/{file_name}"
    print(f"Uploaded {file_name} to S3 bucket: {S3_BUCKET}")
    return s3_url
|
|
|
|
|
def extract_audio(video_path):
    """
    Extracts audio from a given video file and saves it as an mp3 file.

    :param video_path: Path to the video file.
    :return: Path to the extracted audio file.
    """
    audio_path = f"{video_path}.mp3"
    with VideoFileClip(video_path) as clip:
        clip.audio.write_audiofile(audio_path)
    return audio_path
|
|
|
|
|
def transcribe_with_timestamps(audio_path):
    """
    Transcribes the given audio file using the Whisper model, including timestamps.

    :param audio_path: Path to the audio file.
    :return: A list of transcribed segments with timestamps.
    """
    # language='hi' forces Hindi decoding; verbose prints progress to stdout.
    result = MODEL.transcribe(audio_path, verbose=True, language='hi')
    lines = []
    for seg in result["segments"]:
        lines.append(f"{seg['start']} - {seg['end']}: {seg['text']}")
    return lines
|
|
|
|
|
def format_transcript(transcript_segments):
    """
    Formats transcript segments into a single string.

    :param transcript_segments: List of transcript segments.
    :return: Formatted transcript.
    """
    joined = "\n".join(transcript_segments)
    # Replaces the two-character sequence backslash-n (escaped newlines
    # inside segment text), not the real newlines inserted by join.
    cleaned = joined.replace('\\n', ' ')
    return cleaned.strip()
|
|
|
|
|
def extract_text_from_video(video_path, frame_interval=30):
    """
    Extracts text from video frames using Tesseract OCR and saves unique text.

    :param video_path: Path to the video file.
    :param frame_interval: Interval to capture frames for OCR (in seconds).
    :return: List of unique text found in the video.
    """
    print(f"Attempting to extract text from {video_path}")

    unique_texts = set()
    # Context manager ensures the clip's readers are released (the original
    # never closed the VideoFileClip).
    with VideoFileClip(video_path) as video:
        duration = int(video.duration)

        print(f"Duration of video: {duration} seconds.")
        print(f"Frame interval: {frame_interval} seconds.")

        for time_sec in range(0, duration, frame_interval):
            frame = video.get_frame(time_sec)
            # moviepy frames are RGB, so convert with RGB2GRAY (the original
            # used BGR2GRAY, swapping the R/B luminance weights).
            gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
            # Strip once and test/store the same string (the original checked
            # the unstripped text but stored the stripped one, so near-dupes
            # differing only in whitespace slipped through).
            text = pytesseract.image_to_string(gray).strip()
            if text:
                unique_texts.add(text)

    # Separate entries with newlines; bare writelines adds no separators.
    with open(f"{video_path}{OCR_TEXT_SUFFIX}", 'w') as file:
        file.write("\n".join(unique_texts))

    return list(unique_texts)
|
|
|
|
|
def process_video(video_path):
    """Run the full processing pipeline for one uploaded video.

    Extracts audio, transcribes it with timestamps, writes the transcript
    file, OCRs the frames, fine-tunes the LLM on the transcript, and indexes
    the transcript text in ChromaDB.

    :param video_path: Path to the video file.
    """
    audio_path = extract_audio(video_path)
    transcript_segments = transcribe_with_timestamps(audio_path)

    # writelines() adds no separators, which produced a single unbroken
    # line; join with newlines so the transcript file is readable.
    with open(f"{video_path}{TRANSCRIPT_SUFFIX}", 'w') as file:
        file.write("\n".join(transcript_segments))

    extract_text_from_video(video_path)

    fine_tune_model(transcript_segments)

    add_to_chromadb(' '.join(transcript_segments))

    print(f"Processing of {video_path} completed.")
|
|
|
|
|
def allowed_video_file(filename):
    """Return True when *filename* has an extension in ALLOWED_VIDEO_EXTENSIONS."""
    if '.' not in filename:
        return False
    extension = filename.rsplit('.', 1)[1].lower()
    return extension in ALLOWED_VIDEO_EXTENSIONS
|
|
|
|
|
def add_to_chromadb(text):
    """Embed *text* and store it in the shared Chroma transcript collection.

    :param text: Transcript text to index for retrieval.
    """
    import hashlib  # local import: only needed to derive the document id

    embeddings = embedding_model.encode([text])

    # Chroma's Collection.add requires an explicit id per document; the
    # original call omitted `ids` and fails. A content hash also makes
    # re-adding identical text idempotent instead of duplicating it.
    doc_id = hashlib.sha1(text.encode('utf-8')).hexdigest()

    collection.add(
        ids=[doc_id],
        documents=[text],
        embeddings=embeddings.tolist(),
        metadatas=[{'source': 'video_transcript'}]
    )

    print("Text appended to ChromaDB.")
|
|
|
|
|
def fine_tune_model(transcript_segments):
    """Fine-tune the module-level Mistral model on the transcript segments.

    Runs one epoch of causal-LM training with each segment as its own
    example, then saves model and tokenizer to ./fine_tuned_model.

    :param transcript_segments: List of "start - end: text" transcript strings.

    NOTE(review): this runs synchronously inside the video-processing job and
    mutates the shared global ``model`` used by /chat — confirm that is intended.
    """
    print("Preparing data for fine-tuning...")
    # One example per segment; no truncation or padding is applied here.
    dataset = [{'input_ids': tokenizer.encode(text, return_tensors='pt')[0]} for text in transcript_segments]

    training_args = TrainingArguments(
        output_dir='./fine_tuned_model',
        num_train_epochs=1,
        per_device_train_batch_size=1,
        save_steps=10,
        save_total_limit=2,
        logging_steps=10,
        learning_rate=5e-5,
        fp16=True,  # NOTE(review): fp16 needs a CUDA device — confirm deployment target
    )

    def data_collator(features):
        # Labels mirror input_ids (standard causal-LM objective).
        # NOTE(review): returns plain lists of tensors rather than
        # stacked/padded batch tensors; this only works at batch size 1,
        # if at all — verify against the installed transformers version.
        return {'input_ids': [f['input_ids'] for f in features],
                'labels': [f['input_ids'] for f in features]}

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        data_collator=data_collator,
    )

    print("Starting fine-tuning...")
    trainer.train()
    print("Fine-tuning completed.")

    # Persist the updated weights alongside the tokenizer so the directory
    # is loadable with from_pretrained later.
    model.save_pretrained('./fine_tuned_model')
    tokenizer.save_pretrained('./fine_tuned_model')
    print("Fine-tuned model saved.")
|
|
|
|
|
def query_chatbot(query_text):
    """Answer *query_text* using retrieval-augmented generation.

    Retrieves the 5 most similar transcript documents from ChromaDB, builds
    a context prompt, and generates an answer with the Mistral model.

    :param query_text: The user's question.
    :return: The generated answer string.
    """
    query_embedding = embedding_model.encode([query_text])
    hits = collection.query(query_embeddings=query_embedding, n_results=5)
    context_text = " ".join(hits['documents'][0])

    prompt = f"Context: {context_text}\n\nQuestion: {query_text}\n\nAnswer:"

    encoded = tokenizer(prompt, return_tensors='pt').to(model.device)
    generated = model.generate(**encoded, max_new_tokens=150)
    decoded = tokenizer.decode(generated[0], skip_special_tokens=True)

    # Keep only the text after the final "Answer:" marker.
    return decoded.split("Answer:")[-1].strip()
|
|
|
|
|
@app.route('/hello', methods=['GET'])
def hello():
    """Trivial health-check endpoint."""
    payload = {'message': 'Hello, World!'}
    return jsonify(payload)
|
|
|
|
|
@app.route('/upload', methods=['POST'])
def upload_file():
    """Accept a video upload and schedule its background processing.

    Expects a multipart form field 'file'. Returns the stored filename on
    success (200), 400 for a missing or disallowed file, 502 when saving or
    scheduling fails.
    """
    print("Request received.")
    if 'file' not in request.files:
        return jsonify({'error': 'No file part'}), 400

    file = request.files['file']
    if file.filename == '':
        return jsonify({'error': 'No selected file'}), 400

    if not allowed_video_file(file.filename):
        return jsonify({'error': 'File type not allowed'}), 400

    filename = secure_filename(file.filename)
    file_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)

    if os.path.exists(file_path):
        # Same file uploaded before; its artifacts already exist.
        print(f"We have already processed this file - {filename}. Skipping processing.")
        return jsonify({'filename': filename}), 200

    print(f"Saving {file.filename} to {file_path}")
    try:
        file.save(file_path)
        # A fixed job id ('file_process_job') raises ConflictingIdError on
        # every upload after the first; key the job by filename instead.
        scheduler.add_job(func=process_file, args=[file_path],
                          trigger='date', id=f'file_process_job_{filename}')
        return jsonify({'filename': filename}), 200
    except Exception as e:
        return jsonify({'error': str(e)}), 502
|
|
|
def process_file(file_path):
    """Background job entry point: run the full pipeline on *file_path*.

    (The stray top-level ``x`` that preceded this def was a NameError at
    import time and has been removed.)
    """
    print(f'Processing file: {file_path}')
    process_video(file_path)

    time.sleep(10)
    print('File processed!')
|
|
|
|
|
@app.route('/details', methods=['POST'])
def get_details():
    """Return the stored details JSON for a processed video.

    Expects a JSON body ``{"filename": ...}``. Returns the parsed details
    file (200), 404 when it does not exist, 400 when no filename was given.
    """
    data = request.get_json()
    filename = data.get('filename') if data else None
    if not filename:
        # The original fell through and returned None (HTTP 500 in Flask);
        # answer with an explicit 400 instead.
        return jsonify({'error': 'No filename provided'}), 400

    print(f"Received request for details of filename: {filename}")

    # Build the path from the request's filename and the shared suffix
    # constant (the original hard-coded a placeholder and never used
    # the supplied filename).
    details_json = os.path.join(app.config['UPLOAD_FOLDER'], f"{filename}{DETAILS_SUFFIX}")
    print(f"Details JSON path: {details_json}")
    if not os.path.exists(details_json):
        return jsonify({'error': 'Details not found'}), 404

    with open(details_json, 'r') as file:
        details = json.load(file)
    return jsonify(details)
|
|
|
|
|
@app.route('/chat', methods=['POST'])
def chat():
    """Answer a chat message via the RAG chatbot.

    Expects a form field 'chat_msg'; returns the generated response, or
    400 when the field is missing/empty.
    """
    chat_msg = request.form.get('chat_msg')
    if not chat_msg:
        return jsonify({"status": "error", "message": "No chat message received"}), 400

    print(f"Received chat message: {chat_msg}")
    resp = query_chatbot(chat_msg)
    return jsonify({"status": "success", "response": f"{resp}"})
|
|
|
|
|
if __name__ == '__main__':
    # Development server; binds all interfaces so the app is reachable
    # from outside the host/VM — TODO confirm this is not exposed publicly.
    app.run(host='0.0.0.0', port=5000)
|
|