Update app.py
app.py CHANGED
@@ -5,7 +5,9 @@ from vllm import LLM, SamplingParams
 # Load the model and tokenizer from Hugging Face
 model_name = "facebook/opt-125m"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
-
+
+# Initialize vLLM with CPU-only configuration
+vllm_model = LLM(model=model_name, tensor_parallel_size=1, device="cpu")
 
 def generate_response(prompt, max_tokens, temperature, top_p):
     # Tokenize the prompt
@@ -27,8 +29,8 @@ def generate_response(prompt, max_tokens, temperature, top_p):
 
 # Gradio UI
 with gr.Blocks() as demo:
-    gr.Markdown("# 🚀 Hugging Face Integration with vLLM")
-    gr.Markdown("Generate text using the vLLM integration with Hugging Face models.")
+    gr.Markdown("# 🚀 Hugging Face Integration with vLLM (CPU)")
+    gr.Markdown("Generate text using the vLLM integration with Hugging Face models on CPU.")
 
     with gr.Row():
         with gr.Column():
@@ -74,4 +76,4 @@ with gr.Blocks() as demo:
     )
 
 # Launch the app
-demo.launch()
+demo.launch()
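The second hunk shows that generate_response takes max_tokens, temperature, and top_p, but the diff cuts the body off after the "# Tokenize the prompt" comment. Below is a minimal sketch of how such a body is typically written against vLLM's API, reusing the vllm_model defined in the first hunk; everything past the signature is an assumption for illustration, not this Space's actual code.

def generate_response(prompt, max_tokens, temperature, top_p):
    # Map the UI controls onto vLLM sampling settings
    sampling_params = SamplingParams(
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
    )
    # vLLM tokenizes internally, so the raw prompt string is enough here;
    # the AutoTokenizer loaded above is only needed for manual token handling
    outputs = vllm_model.generate([prompt], sampling_params)
    # generate() returns one RequestOutput per prompt; take its first completion
    return outputs[0].outputs[0].text

One caveat on the change itself: whether LLM() accepts a device keyword depends on the vLLM version, and CPU execution also requires a vLLM build compiled with CPU support, so this line may need adjusting on newer releases.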
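The hunks defining the input widgets and event wiring are not part of this diff; only the gr.Row/gr.Column context lines, a closing parenthesis, and demo.launch() are visible. The following sketch shows how that gr.Blocks layout might be wired to generate_response; the widget names, ranges, and defaults are hypothetical.

import gradio as gr

with gr.Blocks() as demo:
    gr.Markdown("# 🚀 Hugging Face Integration with vLLM (CPU)")
    gr.Markdown("Generate text using the vLLM integration with Hugging Face models on CPU.")

    with gr.Row():
        with gr.Column():
            # Hypothetical controls matching generate_response's parameters
            prompt = gr.Textbox(label="Prompt", lines=4)
            max_tokens = gr.Slider(1, 512, value=128, step=1, label="Max tokens")
            temperature = gr.Slider(0.0, 2.0, value=0.7, label="Temperature")
            top_p = gr.Slider(0.0, 1.0, value=0.9, label="Top-p")
            generate_btn = gr.Button("Generate")
        with gr.Column():
            output = gr.Textbox(label="Generated text")

    # Bind the button to the vLLM-backed generation function sketched above
    generate_btn.click(
        generate_response,
        inputs=[prompt, max_tokens, temperature, top_p],
        outputs=output,
    )

# Launch the app
demo.launch()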