# Hugging Face Space (status: Running)
import logging
import os

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
# NOTE(review): this is assigned after llama_cpp has already been imported, so it can
# only affect code that reads the variable later — confirm it has the intended effect.
os.environ["LLAMA_CPP_USE_CUDA"] = "0"

# Title and description passed to the Gradio chat interface.
title = "SmolLM 2 - Bulgarian Joke Master - GGUF"
# NOTE(review): the leading "π" looks like a mis-decoded emoji; it is left untouched
# because the original character cannot be recovered from this file.
description = """
π [SmolLM 2](https://huggingface.co/unsloth/SmolLM2-135M-Instruct-bnb-4bit) fine-tuned for Bulgarian jokes, running on CPU in GGUF format.\n
This model is fine-tuned for generating humorous content in Bulgarian, utilizing the [Llama.cpp library](https://github.com/ggerganov/llama.cpp).\n
Running on CPU, it can still produce impressive results, although larger models may require more processing power.
"""

# Location of the GGUF weights on disk.
model_dir = "models"
model_name = "unsloth.Q4_K_M.gguf"
model_path = os.path.join(model_dir, model_name)

# Download the weights from the Hugging Face Hub into model_dir.
hf_hub_download(
    repo_id="vislupus/bulgarian-joke-master-SmolLM2-135M-Instruct-bnb-4bit-gguf",
    filename=model_name,
    local_dir=model_dir,
)

# Fail fast with a clear message if the file is not where the loader will look.
if not os.path.exists(model_path):
    raise FileNotFoundError(f"Model file not found at {model_path}")

# generate_response defaults to max_tokens=1280. NOTE(review): llama-cpp-python
# documents a default n_ctx of 512, which would cap prompt + completion below that
# default, so request a larger window explicitly — confirm 2048 suits this model
# and the memory available on the host.
context_window = 2048
llm = Llama(model_path=model_path, n_ctx=context_window)
def generate_response(message, history, temperature=0.7, top_p=1.0, max_tokens=1280):
    """Generate the model's reply to a single chat message.

    The message is sent to the module-level ``llm`` as a raw completion prompt.
    ``history`` is accepted because this function is handed to
    ``gr.ChatInterface`` as its callback, but it is not consulted, so every
    reply is produced from the latest message alone.

    Args:
        message: Prompt text for this turn.
        history: Earlier conversation turns; currently unused.
        temperature: Sampling temperature forwarded to the model.
        top_p: Top-p sampling value forwarded to the model.
        max_tokens: Upper bound on generated tokens forwarded to the model.

    Returns:
        The text of the first completion choice with surrounding whitespace
        stripped, or a string starting with ``"Error generating response: "``
        if anything fails while generating.
    """
    try:
        response = llm(message, max_tokens=max_tokens, temperature=temperature, top_p=top_p)
        return response["choices"][0]["text"].strip()
    except Exception as e:
        # Boundary handler: the chat UI gets a readable message instead of a
        # crash, while the full traceback is kept in the server log so the
        # failure can actually be diagnosed.
        logging.getLogger(__name__).exception("Text generation failed")
        return f"Error generating response: {e}"
if __name__ == "__main__":
    # Build the chat UI around generate_response, using the page title and
    # description defined at module level.
    interface_options = {"title": title, "description": description}
    gguf_demo = gr.ChatInterface(generate_response, **interface_options)

    # Start the app; share=True is passed through to Gradio's launch().
    gguf_demo.launch(share=True)