from fastapi import FastAPI
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain_community.llms import HuggingFacePipeline  # formerly langchain.llms in older LangChain releases
import torch

app = FastAPI()
# --- LLM Initialization using Hugging Face ---
model_id = "Qwen/Qwen2.5-1.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",          # place the model on GPU(s) automatically if available
    torch_dtype=torch.float16,  # half precision to reduce memory use
)
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=256,  # cap generated tokens; max_length would also count the prompt
    do_sample=True,      # required, otherwise temperature is ignored and decoding is greedy
    temperature=0.3,
)
llm = HuggingFacePipeline(pipeline=generator)
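
Before wiring the wrapper into a route, a one-off smoke test confirms the model loads and generates; the prompt below is just an illustration (in current LangChain, .invoke() is the supported call):

# One-off smoke test of the wrapped pipeline; remove once the endpoint works.
print(llm.invoke("In one sentence, what does FastAPI do?"))
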
# Example endpoint using the new llm
@app.post("/query")
async def post_query(query: str):
    # Create a simple prompt structure
    prompt = f"Answer the following query:\n\n{query}\n"
    # Get the response from the LLM
    response = llm.invoke(prompt)
    return {"response": response}
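
Because `query` is a plain `str` parameter, FastAPI treats it as a query parameter rather than a request body, so a client call looks like the following sketch (the localhost URL assumes a default local run such as `uvicorn main:app --port 8000`):

import requests

# Assumes the server is running locally on port 8000.
r = requests.post(
    "http://localhost:8000/query",
    params={"query": "What is Qwen2.5?"},
)
print(r.json()["response"])
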
# (Keep your WebSocket endpoint and other code mostly unchanged)
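
The WebSocket endpoint itself isn't reproduced here; purely as a minimal sketch of how one could reuse the same `llm`, something like the following would work (the `/ws` path and plain-text message protocol are assumptions, not the original code):

from fastapi import WebSocket, WebSocketDisconnect

@app.websocket("/ws")  # hypothetical path; keep your existing route instead
async def websocket_endpoint(websocket: WebSocket):
    await websocket.accept()
    try:
        while True:
            query = await websocket.receive_text()
            prompt = f"Answer the following query:\n\n{query}\n"
            # Note: llm.invoke() blocks the event loop during generation;
            # for production, offload it with run_in_executor or similar.
            response = llm.invoke(prompt)
            await websocket.send_text(response)
    except WebSocketDisconnect:
        pass  # client disconnected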