from vllm import LLM, SamplingParams
import gradio as gr
import os
from huggingface_hub import login


class TextCompletion:
    """Thin wrapper around a vLLM model for single-prompt text completion."""

    def __init__(self, model, sampling_params):
        self.model = model
        self.sampling_params = sampling_params

    def generate(self, prompt: str) -> str:
        # vLLM returns one RequestOutput per prompt; take the first
        # candidate completion of the single request.
        output = self.model.generate(prompt, self.sampling_params)
        return output[0].outputs[0].text


if __name__ == "__main__":
    # Authenticate with the Hugging Face Hub so the gated Llama 3 tokenizer
    # can be downloaded. Requires HF_TOKEN to be set in the environment.
    HF_TOKEN = os.getenv("HF_TOKEN")
    if HF_TOKEN is None:
        raise RuntimeError("Set the HF_TOKEN environment variable to a Hugging Face access token.")
    login(token=HF_TOKEN)

    # Load the EntiGraph-finetuned Llama 3 8B checkpoint, paired with the
    # base model's tokenizer.
    model = LLM(
        model="mep296/llama-3-8b-entigraph-quality",
        tokenizer="meta-llama/Meta-Llama-3-8B",
        device="cuda",
    )
    tokenizer = model.get_tokenizer()

    # Low temperature for near-deterministic completions; stop on EOS or on
    # the "## Example" / "##" markers used to delimit few-shot examples in
    # the prompt format.
    sampling_params = SamplingParams(
        temperature=0.1,
        max_tokens=500,
        stop=[tokenizer.eos_token, "## Example 7", "##"],
    )

    # Build the completer once and reuse it across Gradio calls.
    text_completer = TextCompletion(model, sampling_params)

    def text_completion_fn(prompt):
        return text_completer.generate(prompt)

    # Simple Gradio UI: one textbox in, one textbox out.
    demo = gr.Interface(fn=text_completion_fn, inputs="textbox", outputs="textbox")
    demo.launch()
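
# Example invocation (a sketch; assumes the script is saved as demo.py and
# that the token has access to the gated meta-llama/Meta-Llama-3-8B repo):
#
#   HF_TOKEN=hf_xxx python demo.py
#
# On launch, Gradio prints a local URL (http://127.0.0.1:7860 by default)
# where the completion interface can be opened in a browser.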