Spaces:

metek7
/

instagram-short-summarizing

Runtime error

App Files Files Community

instagram-short-summarizing / app.py

metek7

Update app.py

508d056 verified 4 months ago

raw

history blame

7.17 kB

	# NOTE: this file previously began with a bare shell command
	# ("pip install googletrans"). That is not Python and makes the whole module
	# fail with a SyntaxError; the pinned googletrans install below supersedes it.
	import importlib
	import os
	import sys
	import spaces
	import gradio as gr
	import subprocess


	# Install the required libraries at startup.
	# - The running interpreter's pip is used (sys.executable -m pip) so packages
	#   land in the environment this process actually imports from.
	# - The extra variable is merged into the current environment; passing only
	#   that one key as `env` would drop PATH, HOME, proxy settings, etc.
	# - Return codes are intentionally not checked (best effort, as before).
	subprocess.run(
	    [sys.executable, "-m", "pip", "install", "flash-attn", "--no-build-isolation"],
	    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
	)
	subprocess.run([sys.executable, "-m", "pip", "install", "googletrans==3.1.0a0"])

	# Make packages installed above visible to the import system, then import
	# googletrans only after its pinned version has been installed.
	importlib.invalidate_caches()
	from googletrans import Translator

	import torch
	from llava.model.builder import load_pretrained_model
	from llava.mm_utils import get_model_name_from_path, process_images, tokenizer_image_token
	from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, IGNORE_INDEX
	from llava.conversation import conv_templates, SeparatorStyle
	import copy
	import warnings
	from decord import VideoReader, cpu
	import numpy as np

	# Create the translator object.
	# A single module-level googletrans Translator; process_video uses it to
	# translate the user's question to English and the model's answer to Turkish.
	translator = Translator()

	# Markdown heading rendered at the top of the page (user-facing, Turkish).
	title = "# 🙋🏻‍♂️🌟Tonic'in 🌋📹LLaVA-Video'suna Hoş Geldiniz!"
	# First half of the model description (Markdown, user-facing, Turkish).
	description1 = """🌋📹LLaVA-Video-7B-Qwen2, 🌋📹LLaVA-Video-178K veri seti ve LLaVA-OneVision veri seti üzerinde eğitilmiş 7B parametreli bir modeldir. [Qwen2 dil modeline dayanmaktadır](https://huggingface.co/collections/Qwen/qwen2-6659360b33528ced941e557f) ve 32K tokene kadar bağlam penceresini destekler. Model, görüntüleri, çoklu görüntüleri ve videoları işleyebilir ve bunlarla etkileşime girebilir, video analizi için özel optimizasyonlara sahiptir.
	Bu model, görsel girdi için SO400M görüş omurgasını ve dil işleme için Qwen2'yi kullanır, bu da onu görsel ve video tabanlı görevler de dahil olmak üzere çoklu modal akıl yürütmede oldukça verimli kılar.
	🌋📹LLaVA-Video'nun [32B](https://huggingface.co/lmms-lab/LLaVA-NeXT-Video-32B-Qwen) ve [72B](https://huggingface.co/lmms-lab/LLaVA-Video-72B-Qwen2) daha büyük varyantları ve [sadece yeni sentetik veriler üzerinde eğitilmiş bir varyantı](https://huggingface.co/lmms-lab/LLaVA-Video-7B-Qwen2-Video-Only) bulunmaktadır.
	Daha fazla detay için lütfen [Proje Sayfasını](https://github.com/LLaVA-VL/LLaVA-NeXT) ziyaret edin veya ilgili [araştırma makalesine](https://arxiv.org/abs/2410.02713) göz atın.
	- Mimari: `LlavaQwenForCausalLM`
	- Dikkat Başlıkları: 28
	- Gizli Katmanlar: 28
	- Gizli Boyut: 3584
	"""
	# Second half of the model description (continues the bullet list above).
	description2 = """
	- Ara Boyut: 18944
	- Desteklenen Maksimum Kare Sayısı: 64
	- Desteklenen Diller: İngilizce, Çince
	- Görüntü En-Boy Oranı: `anyres_max_9`
	- Görüntü Çözünürlüğü: Çeşitli ızgara çözünürlükleri
	- Maksimum Konum Gömmeleri: 32,768
	- Kelime Dağarcığı Boyutu: 152,064
	- Model Hassasiyeti: bfloat16
	- Eğitim İçin Kullanılan Donanım: 256 * Nvidia Tesla A100 GPU'ları
	"""

	# Community / credits Markdown, rendered inside the "Bize Katılın" accordion.
	join_us = """
	## Bize Katılın:
	🌟TeamTonic🌟 her zaman harika demolar yapıyor! Aktif geliştirici 🛠️topluluğumuza 👻 katılın [![Discord'da bize katılın](https://img.shields.io/discord/1109943800132010065?label=Discord&logo=discord&style=flat-square)](https://discord.gg/qdfnvSPcqP) 🤗Huggingface'de:[MultiTransformer](https://huggingface.co/MultiTransformer) 🌐Github'da: [Tonic-AI](https://github.com/tonic-ai) & 🌟 [Build Tonic](https://git.tonic-ai.com/contribute)'e katkıda bulunun 🤗 Yuvi Sharma ve Huggingface'deki herkese topluluk hibesi için çok teşekkürler 🤗
	"""

	def load_video(video_path, max_frames_num, fps=1, force_sample=False):
	    """Decode a video with decord and return sampled frames plus timing info.

	    Args:
	        video_path: Path of the video file, passed to decord's ``VideoReader``.
	        max_frames_num: Upper bound on the number of returned frames. ``0``
	            returns a single all-zero 336x336 RGB placeholder frame without
	            opening the file.
	        fps: Target sampling rate in frames per second of video time.
	        force_sample: When true, always resample ``max_frames_num`` frames
	            uniformly over the whole video, even if fewer would be selected
	            at the requested ``fps``.

	    Returns:
	        A 3-tuple ``(frames, frame_time, video_time)``:
	        ``frames`` is the NumPy batch returned by ``get_batch(...).asnumpy()``
	        (presumably ``(num_frames, H, W, 3)`` - confirm against decord),
	        ``frame_time`` is a comma-separated string of timestamps such as
	        ``"0.00s,1.00s"``, and ``video_time`` is the duration in seconds.
	    """
	    if max_frames_num == 0:
	        # Return the same 3-tuple shape as the normal path so callers that
	        # unpack three values (e.g. process_video) do not fail.
	        return np.zeros((1, 336, 336, 3)), "0.00s", 0.0

	    vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
	    total_frame_num = len(vr)
	    avg_fps = vr.get_avg_fps()
	    video_time = total_frame_num / avg_fps

	    # Number of source frames between two samples. Clamp to 1: rounding can
	    # give 0 when the video's fps is below half the requested rate, and a
	    # zero step would make range() raise ValueError.
	    stride = max(1, round(avg_fps / fps))
	    frame_idx = list(range(0, total_frame_num, stride))

	    if len(frame_idx) > max_frames_num or force_sample:
	        # Too many frames (or sampling forced): spread max_frames_num indices
	        # uniformly from the first to the last frame.
	        frame_idx = np.linspace(0, total_frame_num - 1, max_frames_num, dtype=int).tolist()

	    # Timestamp of each frame in seconds = frame index / video fps. Dividing
	    # by the stride would only be approximately right for fps == 1.
	    frame_time = ",".join(f"{idx / avg_fps:.2f}s" for idx in frame_idx)
	    spare_frames = vr.get_batch(frame_idx).asnumpy()
	    return spare_frames, frame_time, video_time

	# Model loading: runs once at import time so every request reuses the same
	# tokenizer, model and image processor.
	pretrained = "lmms-lab/LLaVA-Video-7B-Qwen2"
	model_name = "llava_qwen"
	device_map = "auto"
	# Target device for input tensors created in process_video.
	if torch.cuda.is_available():
	    device = "cuda"
	else:
	    device = "cpu"

	print("Model yükleniyor...")
	# The second positional argument is None (presumably "no base model" -
	# confirm against llava.model.builder.load_pretrained_model).
	tokenizer, model, image_processor, max_length = load_pretrained_model(
	    pretrained,
	    None,
	    model_name,
	    torch_dtype="bfloat16",
	    device_map=device_map,
	)
	# Switch to evaluation mode; the model is only used for generation here.
	model.eval()
	print("Model başarıyla yüklendi!")

	@spaces.GPU
	def process_video(video_path, question):
	    """Answer a question about a video and return the answer in Turkish.

	    The video is sampled to 64 frames, the question is translated to English,
	    the model generates an answer, and that answer is translated to Turkish.

	    Args:
	        video_path: Path of the video file to analyse.
	        question: The user's question in any language googletrans can detect;
	            may be empty or ``None``.

	    Returns:
	        The model's answer translated to Turkish, or an empty string when the
	        model produced no text.
	    """
	    max_frames_num = 64
	    video, frame_time, video_time = load_video(video_path, max_frames_num, 1, force_sample=True)
	    video = image_processor.preprocess(video, return_tensors="pt")["pixel_values"].to(device).bfloat16()
	    video = [video]

	    conv_template = "qwen_1_5"
	    # The prompt is written in English on purpose: the question is translated
	    # to English below and the model card in this file lists only English and
	    # Chinese as supported languages, so a Turkish instruction would give the
	    # model a mixed-language prompt.
	    time_instruction = (
	        f"The video lasts for {video_time:.2f} seconds, and {len(video[0])} frames are uniformly sampled from it. "
	        f"These frames are located at {frame_time}. "
	        "Please answer the following questions related to this video."
	    )

	    # Translate the question to English; skip the translation service
	    # entirely when there is nothing to translate.
	    if question and question.strip():
	        question_en = translator.translate(question, dest='en').text
	    else:
	        question_en = ""
	    full_question = DEFAULT_IMAGE_TOKEN + f"{time_instruction}\n{question_en}"

	    # Work on a copy so the shared conversation template is never mutated.
	    conv = copy.deepcopy(conv_templates[conv_template])
	    conv.append_message(conv.roles[0], full_question)
	    conv.append_message(conv.roles[1], None)
	    prompt_question = conv.get_prompt()

	    input_ids = tokenizer_image_token(prompt_question, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(device)

	    with torch.no_grad():
	        output = model.generate(
	            input_ids,
	            images=video,
	            modalities=["video"],
	            do_sample=False,
	            temperature=0,
	            max_new_tokens=4096,
	        )

	    response = tokenizer.batch_decode(output, skip_special_tokens=True)[0].strip()
	    if not response:
	        # Nothing to translate; avoid sending empty text to the translator.
	        return response

	    # Translate the answer to Turkish.
	    response_tr = translator.translate(response, dest='tr').text
	    return response_tr

	def gradio_interface(video_file, question):
	    """Gradio click handler: validate the upload, then delegate to process_video.

	    Returns a Turkish "please upload a video" message when no file was
	    provided; otherwise returns whatever process_video returns.
	    """
	    return (
	        "Lütfen bir video dosyası yükleyin."
	        if video_file is None
	        else process_video(video_file, question)
	    )

	# Build the Gradio UI: heading, model description, community accordion, and
	# the video + question inputs wired to gradio_interface.
	# NOTE(review): this file carries no indentation for these lines, so the
	# nesting of the layout context managers below is reconstructed. In
	# particular, confirm that the answer box sits in the row beside the input
	# column rather than inside it.
	with gr.Blocks() as demo:
	    gr.Markdown(title)
	    # Model description, split across two groups in one row.
	    with gr.Row():
	        with gr.Group():
	            gr.Markdown(description1)
	        with gr.Group():
	            gr.Markdown(description2)
	    # Community links, collapsed by default (open=False).
	    with gr.Accordion("Bize Katılın", open=False):
	        gr.Markdown(join_us)
	    with gr.Row():
	        with gr.Column():
	            video_input = gr.Video()
	            question_input = gr.Textbox(label="🙋🏻‍♂️Kullanıcı Sorusu", placeholder="Video hakkında bir soru sorun...")
	            submit_button = gr.Button("🌋📹LLaVA-Video'ya Sor")
	        output = gr.Textbox(label="🌋📹LLaVA-Video")

	    # Clicking the button sends (video, question) to gradio_interface and
	    # writes its return value into the output textbox.
	    submit_button.click(
	        fn=gradio_interface,
	        inputs=[video_input, question_input],
	        outputs=output
	    )

	# Start the app only when run as a script; show_error=True is passed to launch().
	if __name__ == "__main__":
	    demo.launch(show_error=True)