# SpaceLLaVA / app.py
import io
import base64
import numpy as np
import torch
import matplotlib
import matplotlib.cm
import gradio as gr
from PIL import Image
from llama_cpp import Llama
from llama_cpp.llama_chat_format import Llava15ChatHandler
# Converts an image input (PIL Image or file path) into a base64 data URI
def image_to_base64_data_uri(image_input):
    if isinstance(image_input, str):
        with open(image_input, "rb") as img_file:
            base64_data = base64.b64encode(img_file.read()).decode('utf-8')
    elif isinstance(image_input, Image.Image):
        buffer = io.BytesIO()
        image_input.save(buffer, format="PNG")
        base64_data = base64.b64encode(buffer.getvalue()).decode('utf-8')
    else:
        raise ValueError("Unsupported input type. Input must be a file path or a PIL.Image.Image instance.")
    return f"data:image/png;base64,{base64_data}"
class Llava:
    def __init__(self, mmproj="model/mmproj-model-f16.gguf", model_path="model/ggml-model-q4_0.gguf", gpu=False):
        # The mmproj file is the CLIP projector LLaVA 1.5 uses to embed images.
        chat_handler = Llava15ChatHandler(clip_model_path=mmproj, verbose=True)
        # n_gpu_layers=-1 offloads all layers to the GPU; 0 keeps inference on the CPU.
        n_gpu_layers = -1 if gpu else 0
        self.llm = Llama(model_path=model_path, chat_handler=chat_handler, n_ctx=2048, logits_all=True, n_gpu_layers=n_gpu_layers)
    def run_inference(self, image, prompt):
        data_uri = image_to_base64_data_uri(image)
        res = self.llm.create_chat_completion(
            messages=[
                {"role": "system", "content": "You are an assistant who perfectly describes images."},
                {
                    "role": "user",
                    "content": [
                        {"type": "image_url", "image_url": {"url": data_uri}},
                        {"type": "text", "text": prompt}
                    ]
                }
            ]
        )
        return res["choices"][0]["message"]["content"]
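# Hedged usage sketch (not executed here; assumes the GGUF weights above are present):
#   model = Llava(gpu=False)
#   answer = model.run_inference(Image.open("examples/warehouse_1.jpg"),
#                                "Is the man wearing gray pants to the left of the pile of boxes on a pallet?")
#   print(answer)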
llm_model = Llava()

def predict(image, prompt):
    result = llm_model.run_inference(image, prompt)
    return result
title_and_links_markdown = """
# 🛸SpaceLLaVA🌋: A spatial reasoning multi-modal model
This space hosts our initial release of LLaVA 1.5, LoRA-tuned for spatial reasoning on data generated with [VQASynth](https://github.com/remyxai/VQASynth).
Upload an image and ask a question about spatial relationships in the scene.
[Model](https://huggingface.co/remyxai/SpaceLLaVA) | [Code](https://github.com/remyxai/VQASynth) | [Paper](https://spatial-vlm.github.io)
"""
examples = [
    ["examples/warehouse_1.jpg", "Is the man wearing gray pants to the left of the pile of boxes on a pallet?"],
    ["examples/warehouse_2.jpg", "Is the forklift taller than the shelves of boxes?"],
]
# Create the Gradio interface with the markdown description
iface = gr.Interface(
    fn=predict,
    inputs=[gr.Image(type="pil", label="Input Image"), gr.Textbox(label="Prompt")],
    outputs=gr.Textbox(),
    examples=examples,
    title="🛸SpaceLLaVA🌋: A spatial reasoning multi-modal model",
    description=title_and_links_markdown,  # description is rendered as markdown
)
# Launch the Gradio app
iface.launch()
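# Optional note (depends on the installed Gradio version): wrapping the launch in a
# request queue keeps long llama.cpp generations from timing out concurrent users, e.g.
#   iface.queue().launch()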