Spaces:

KoonJamesZ
/

ccib-qwen

Sleeping

App Files Files Community

ccib-qwen / app.py

KoonJamesZ

Update app.py

6f72ed8 verified 3 months ago

raw

history blame

11.4 kB

	from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
	from qwen_vl_utils import process_vision_info
	import torch
	import uuid
	from moviepy.editor import VideoFileClip
	import os
	import torch
	from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
	import cv2
	from ultralytics import YOLO
	from heapq import heappush, heappushpop
	import numpy as np
	import uuid
	import uuid
	from ultralytics import YOLO
	import gradio as gr

	# Alternative (disabled): load the model with automatic dtype and device placement.
	# model = Qwen2VLForConditionalGeneration.from_pretrained(
	# "Qwen/Qwen2-VL-7B-Instruct", torch_dtype="auto", device_map="auto"
	# )

	# Vision-language model used by response(). flash_attention_2 is recommended upstream
	# for better acceleration and memory saving, especially in multi-image and video scenarios.
	# Weights are loaded in bfloat16 with device_map="auto".
	model = Qwen2VLForConditionalGeneration.from_pretrained(
	"Qwen/Qwen2-VL-7B-Instruct",
	torch_dtype=torch.bfloat16,
	attn_implementation="flash_attention_2",
	device_map="auto",
	)

	# Default processor for Qwen2-VL (chat template, image/video preprocessing, decoding).
	processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")

	# The default range for the number of visual tokens per image in the model is 4-16384.
	# You can set min_pixels and max_pixels according to your needs, such as a token count
	# range of 256-1280, to balance speed and memory usage.
	# NOTE(review): the two example values below were garbled ("2562828"/"12802828");
	# they are restored here as <tokens>*28*28 to match the 256-1280 range above - confirm.
	# min_pixels = 256*28*28
	# max_pixels = 1280*28*28
	# processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)

	# Device and dtype for the Whisper speech model: first CUDA device with float16
	# when CUDA is available, otherwise CPU with float32.
	device = "cuda:0" if torch.cuda.is_available() else "cpu"
	torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

	# Speech-to-text checkpoint used to transcribe video audio.
	model_id = "openai/whisper-large-v3"

	model_whisper = AutoModelForSpeechSeq2Seq.from_pretrained(
	model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
	)
	model_whisper.to(device)

	# NOTE: rebinds nothing above; the Qwen processor stays in `processor`,
	# the Whisper tokenizer/feature extractor live in `processor_whisper`.
	processor_whisper = AutoProcessor.from_pretrained(model_id)

	# Speech-recognition pipeline called by extract_audio(); timestamps are requested,
	# although extract_audio() only reads the "text" field of the result.
	pipe = pipeline(
	"automatic-speech-recognition",
	model=model_whisper,
	tokenizer=processor_whisper.tokenizer,
	feature_extractor=processor_whisper.feature_extractor,
	torch_dtype=torch_dtype,
	device=device,
	return_timestamps=True
	)

	# Scratch directory for the temporary audio files written by extract_audio();
	# created at import time if it does not exist yet.
	output_directory = "temp" # Replace with your desired output directory
	os.makedirs(output_directory, exist_ok=True)

	def extract_audio(video_path):
	    """Transcribe the audio track of a video file.

	    The audio track is written to a uniquely named temporary MP3 inside
	    ``output_directory``, passed to the module-level speech-recognition
	    ``pipe``, and the temporary file is deleted again.

	    Args:
	        video_path: Path of the video file to read.

	    Returns:
	        The ``"text"`` field of the pipeline result, or ``""`` when the
	        video has no audio track or any step fails (the error is printed
	        instead of being raised).
	    """
	    video = None
	    audio_output_path = None
	    try:
	        # Load the video file and grab its audio track
	        video = VideoFileClip(video_path)
	        audio = video.audio
	        if audio is None:
	            # A clip without an audio track has nothing to transcribe.
	            print("Error: video has no audio track.")
	            return ""

	        # Unique filename so concurrent requests never share a temp file
	        unique_filename = f"{uuid.uuid4()}.mp3"
	        audio_output_path = f"{output_directory}/{unique_filename}"

	        audio.write_audiofile(audio_output_path)
	        result = pipe(audio_output_path)
	        return result["text"]

	    except Exception as e:
	        # Best effort by design: the caller treats "" as "no transcription".
	        print(f"Error: {str(e)}")
	        return ""

	    finally:
	        # Always release the clip's readers and the temp file, including on
	        # failure paths; cleanup problems are reported but never raised.
	        if video is not None:
	            try:
	                video.close()
	            except Exception as e:
	                print(f"Error: {str(e)}")
	        if audio_output_path is not None and os.path.exists(audio_output_path):
	            try:
	                os.remove(audio_output_path)
	            except OSError as e:
	                print(f"Error: {str(e)}")

	# Directory that receives the saved frames / annotated images; it is created on
	# demand by extract_top_weapon_frames() and detect_weapon_image().
	# NOTE(review): '/content/...' looks like a notebook-style path - confirm it is writable here.
	output_dir = '/content/images'
	# YOLO detector loaded from a local weights file; its class names are looked up
	# through model_yolo.names when filtering detections.
	model_yolo = YOLO('/model/best.pt')

	def extract_top_weapon_frames(video_path, threshold=30, top_k=2):
	    """Save the video frames with the most confident weapon detections.

	    Consecutive frames are compared in grayscale; only frames whose mean
	    absolute difference to the previous frame exceeds ``threshold`` are
	    passed to the YOLO model. For each such frame the highest confidence
	    among detections classified as ``'weapon'`` or ``'knife'`` is recorded,
	    and the ``top_k`` best frames are written to ``output_dir`` twice: once
	    untouched and once with the detection boxes drawn.

	    Args:
	        video_path: Path of the video file to scan.
	        threshold: Minimum mean grayscale difference to the previous frame
	            for a frame to be analysed.
	        top_k: Maximum number of frames to keep (defaults to 2, the
	            previously hard-coded value). Values <= 0 keep nothing.

	    Returns:
	        A dict with two parallel lists of file paths, ``'original'`` and
	        ``'boxed'``, ordered from highest to lowest confidence. Both lists
	        are empty when the video cannot be read or nothing was detected.
	    """
	    os.makedirs(output_dir, exist_ok=True)
	    saved_paths = {
	        'original': [],  # Paths for original frames
	        'boxed': [],  # Paths for frames with boxes
	    }

	    weapon_classes = {'weapon', 'knife'}
	    # Min-heap of (confidence, frame_number, original_frame, boxed_frame).
	    # frame_number is unique and sits directly after the confidence so that
	    # ties are settled there and tuple comparison never reaches the image
	    # arrays (comparing ndarrays raises ValueError).
	    top_frames = []

	    cap = cv2.VideoCapture(video_path)
	    try:
	        if not cap.isOpened():
	            print("Error: Could not open video.")
	            return saved_paths

	        ret, prev_frame = cap.read()
	        if not ret:
	            print("Error: Could not read the first frame.")
	            return saved_paths

	        prev_gray = cv2.cvtColor(prev_frame, cv2.COLOR_BGR2GRAY)
	        # Counts the frames read after the first one.
	        frame_number = 0

	        while True:
	            ret, frame = cap.read()
	            if not ret:
	                break

	            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
	            mean_diff = cv2.absdiff(gray, prev_gray).mean()

	            # Only run the detector when the picture changed noticeably.
	            if mean_diff > threshold:
	                print(f"Processing frame {frame_number}")
	                results = model_yolo.predict(source=frame, show=False)

	                frame_max_conf = 0
	                frame_with_boxes = frame.copy()

	                for result in results:
	                    for box in result.boxes:
	                        class_name = model_yolo.names[int(box.cls[0])]
	                        if class_name not in weapon_classes:
	                            continue

	                        confidence = float(box.conf[0])
	                        frame_max_conf = max(frame_max_conf, confidence)
	                        x1, y1, x2, y2 = map(int, box.xyxy[0])
	                        cv2.rectangle(frame_with_boxes, (x1, y1), (x2, y2), (0, 255, 0), 2)
	                        label = f"{class_name} ({confidence:.2f})"
	                        cv2.putText(frame_with_boxes, label, (x1, y1 - 10),
	                                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

	                if frame_max_conf > 0 and top_k > 0:
	                    if len(top_frames) < top_k:
	                        heappush(top_frames, (frame_max_conf, frame_number, frame.copy(), frame_with_boxes))
	                    elif frame_max_conf > top_frames[0][0]:
	                        # Replace the currently weakest kept frame.
	                        heappushpop(top_frames, (frame_max_conf, frame_number, frame.copy(), frame_with_boxes))

	            prev_gray = gray
	            frame_number += 1
	    finally:
	        # Release the capture on every exit path, including early returns
	        # and exceptions raised while decoding or predicting.
	        cap.release()

	    # Save the kept frames (both original and with boxes), best first
	    for confidence, _, original_frame, boxed_frame in sorted(top_frames, reverse=True):
	        original_path = os.path.join(output_dir, f"{uuid.uuid4()}.jpg")
	        cv2.imwrite(original_path, original_frame)
	        saved_paths['original'].append(original_path)

	        boxed_path = os.path.join(output_dir, f"{uuid.uuid4()}.jpg")
	        cv2.imwrite(boxed_path, boxed_frame)
	        saved_paths['boxed'].append(boxed_path)

	        print(f"Saved frame pair with confidence {confidence:.3f}")

	    return saved_paths

	def detect_weapon_image(source_image_path):
	    """Run the YOLO model on an image and save each annotated result.

	    Args:
	        source_image_path: Image source handed to ``model_yolo.predict``.

	    Returns:
	        List of paths (inside ``output_dir``) of the annotated JPEG files,
	        one per prediction result.
	    """
	    # The target folder may not exist yet
	    os.makedirs(output_dir, exist_ok=True)

	    predictions = model_yolo.predict(source=source_image_path, save=False, show=False)

	    annotated_paths = []
	    for prediction in predictions:
	        # Render the detections onto the image
	        rendered = prediction.plot()

	        # Random UUID-based name avoids collisions between requests
	        target_path = os.path.join(output_dir, f"{uuid.uuid4()}.jpg")

	        cv2.imwrite(target_path, rendered)
	        annotated_paths.append(target_path)

	    return annotated_paths
	def response(messages):
	    """Generate the Qwen2-VL answer for one chat conversation.

	    Args:
	        messages: Chat messages (role/content dicts) in the format accepted
	            by ``processor.apply_chat_template`` and ``process_vision_info``;
	            image and video entries are extracted from them.

	    Returns:
	        The decoded text of the newly generated tokens for the single
	        conversation (the prompt tokens are stripped before decoding).
	    """
	    # Preparation for inference
	    text = processor.apply_chat_template(
	        messages, tokenize=False, add_generation_prompt=True
	    )
	    image_inputs, video_inputs = process_vision_info(messages)
	    inputs = processor(
	        text=[text],
	        images=image_inputs,
	        videos=video_inputs,
	        padding=True,
	        return_tensors="pt",
	    )
	    # The model is placed with device_map="auto", so follow the device it
	    # actually landed on instead of assuming a hard-coded "cuda".
	    inputs = inputs.to(model.device)

	    # Inference: Generation of the output
	    generated_ids = model.generate(**inputs, max_new_tokens=1024)
	    # generate() returns prompt + completion; keep only the completion part.
	    generated_ids_trimmed = [
	        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
	    ]
	    output_text = processor.batch_decode(
	        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
	    )
	    return output_text[0]


	# System message placed first in every conversation built by process_inputs():
	# it asks the model for a Thai explanation plus a JSON summary of illegal items.
	system_prompt = """
	Analyze the image for illegal items or contraband. Detect and categorize objects like guns, knives, drugs, and hidden compartments. Highlight areas of interest and provide:

	1. A detailed explanation in Thai describing illegal items and their context.
	2. A JSON output summarizing the findings.

	Output Example:
	1. Explanation (Thai): (detailed explanation in Thai describing illegal items and their context.)
	2. JSON: [{"category": "weapon", "type": "gun"}]
	"""


	def is_mp4_file(file_path):
	    """Return True when *file_path* is an existing regular file ending in ``.mp4``.

	    The extension check is case-insensitive.

	    Args:
	        file_path: A path as ``str``, ``bytes`` or ``os.PathLike``; ``None``
	            or an empty value (e.g. no file uploaded) is accepted.

	    Returns:
	        ``False`` for a missing/empty path, a non-existent path, a directory
	        or any other extension; ``True`` otherwise.
	    """
	    # The upload widget hands over None when nothing was selected;
	    # os.path.isfile(None) would raise TypeError.
	    if not file_path:
	        return False
	    path = os.fsdecode(file_path)
	    return os.path.isfile(path) and path.lower().endswith(".mp4")

	def process_inputs(text_input, file_input):
	    """Analyse a social-media post (text plus optional image/video) with Qwen2-VL.

	    * ``.mp4`` upload: up to two frames with the strongest weapon detections
	      and the audio transcription are sent to the model together with the
	      text; the gallery shows the frames with detection boxes.
	    * Any other upload: treated as an image; the gallery shows the YOLO
	      annotated image.
	    * No upload: only the text is sent to the model.

	    Args:
	        text_input: Text of the post (may be empty or ``None``).
	        file_input: Path of the uploaded file, or ``None`` when nothing was
	            uploaded.

	    Returns:
	        A ``(text, image_paths)`` tuple for the textbox and gallery outputs.
	        Failures never raise; they are reported as ``("Error: ...", [])``.
	    """
	    # Nothing to analyse at all
	    if not text_input and not file_input:
	        return "Error: Please provide text and/or a file.", []

	    post_text = text_input or ""

	    try:
	        # Guard before is_mp4_file so a missing upload never reaches os.path
	        is_video = bool(file_input) and is_mp4_file(file_input)
	        user_content = []
	        gallery_images = []
	        transcription = ""

	        if is_video:
	            extract_images_from_video = extract_top_weapon_frames(file_input)
	            transcription = extract_audio(file_input)
	            # Attach at most the two best original frames, if any were found
	            user_content.extend(
	                {"type": "image", "image": f"file://{frame_path}"}
	                for frame_path in extract_images_from_video['original'][:2]
	            )
	            gallery_images = extract_images_from_video.get('boxed', [])
	        elif file_input:
	            user_content.append({"type": "image", "image": f"file://{file_input}"})

	        user_content.append(
	            {"type": "text", "text": f"Content From Social Media Post: {post_text}."}
	        )
	        if is_video:
	            user_content.append(
	                {"type": "text", "text": f"this is transcription from video:{transcription}"}
	            )

	        messages = [
	            {"role": "system", "content": system_prompt},
	            {"role": "user", "content": user_content},
	        ]
	        result = response(messages)

	        # For still images the annotated picture is produced after the answer
	        if file_input and not is_video:
	            gallery_images = detect_weapon_image(file_input)

	        return result, gallery_images

	    except Exception as e:
	        # UI boundary: surface the problem in the textbox instead of crashing
	        return f"Error: {str(e)}", []

	# Gradio UI: a text field and a single media upload go in; the model's
	# answer and a gallery of annotated images come out.
	_text_field = gr.Textbox(
	    label="Text Input",
	    placeholder="Enter your text here...",
	    lines=3,
	)
	_upload_field = gr.File(
	    label="File Upload",
	    file_types=[".mp4", ".png", ".jpeg", ".jpg"],
	    type="filepath",
	)
	_result_box = gr.Textbox(label="Process Results", lines=8)
	_result_gallery = gr.Gallery(
	    label="Generated images",
	    show_label=False,
	    elem_id="gallery",
	    columns=[2],
	    rows=[1],
	    object_fit="contain",
	    height="auto",
	)

	demo = gr.Interface(
	    fn=process_inputs,
	    inputs=[_text_field, _upload_field],
	    outputs=[_result_box, _result_gallery],
	    title="Text and File Input Processor Qwen2-VL-7B-Instruct",
	    description="Enter text and/or upload a file to process them together",
	)

	# Start the web server only when executed as a script
	if __name__ == "__main__":
	    demo.launch()