# Doc-VLMs-OCR / app.py
import gradio as gr
from transformers import AutoProcessor, AutoModelForImageTextToText, TextIteratorStreamer
from threading import Thread
import re
import time
import torch
import spaces
import subprocess
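# Install runtime dependencies for the Space: flash-attn (skipping its CUDA build) and pyav.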
subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
subprocess.run("pip install av", shell=True) # Install pyav for video processing
from io import BytesIO
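# Load the SmolVLM2 processor and model (bfloat16, flash attention) onto the first GPU.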
processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM2-2.2B-Instruct")
model = AutoModelForImageTextToText.from_pretrained(
    "HuggingFaceTB/SmolVLM2-2.2B-Instruct",
    _attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
).to("cuda:0")
@spaces.GPU
def model_inference(input_dict, history, max_tokens):
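    # Gradio ChatInterface handler: builds a multimodal user message and streams the model's reply.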
    text = input_dict["text"]
    media_queue = []
    user_content = []
    for file in input_dict.get("files", []):
        if file.endswith((".png", ".jpg", ".jpeg", ".gif", ".bmp")):
            media_queue.append({"type": "image", "path": file})
        elif file.endswith((".mp4", ".mov", ".avi", ".mkv", ".flv")):
            media_queue.append({"type": "video", "path": file})
if "<image>" in text or "<video>" in text:
parts = re.split(r'(<image>|<video>)', text)
for part in parts:
if part == "<image>" and media_queue:
user_content.append(media_queue.pop(0))
elif part == "<video>" and media_queue:
user_content.append(media_queue.pop(0))
elif part.strip():
user_content.append({"type": "text", "text": part.strip()})
else:
user_content.append({"type": "text", "text": text})
user_content.extend(media_queue)
    resulting_messages = [{"role": "user", "content": user_content}]
    if not text and not media_queue:
        yield "Please provide text and/or media input."
        return
    inputs = processor.apply_chat_template(
        resulting_messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)
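    # Run generation in a background thread and stream tokens as they are produced.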
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
    generation_args = dict(inputs, streamer=streamer, max_new_tokens=max_tokens)
    thread = Thread(target=model.generate, kwargs=generation_args)
    thread.start()
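    # Show a placeholder, then yield the accumulating buffer as new text arrives.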
yield "Generating response..."
buffer = ""
for new_text in streamer:
buffer += new_text
time.sleep(0.01)
yield buffer
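# Multimodal chat UI: accepts text plus image/video uploads and a max-tokens slider.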
demo = gr.ChatInterface(
    fn=model_inference,
    title="SmolVLM2: The Smallest Video Model Ever 📺",
    description="Play with SmolVLM2-2.2B-Instruct. Upload an image or video and ask a question.",
    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", ".mp4"], file_count="multiple"),
    stop_btn="Stop Generation",
    multimodal=True,
    cache_examples=False,
    additional_inputs=[gr.Slider(minimum=100, maximum=500, step=50, value=200, label="Max Tokens")],
    type="messages",
)
demo.launch(share=True, debug=True)