Spaces:

JJ94
/

AI-API

Sleeping

File size: 1,370 Bytes

817f664
c138cbc
817f664
335f4a7
 
 
817f664
 
c138cbc
 
 
 
f79962c
c138cbc
0e6877a
 
 
 
 
817f664
 
 
 
335f4a7
817f664
 
2e1b7f6
8d8525b
817f664
2e1b7f6
817f664
8d8525b
817f664
 
 
 
 
8d8525b
817f664
335f4a7
 
817f664
f79962c
35666dc

from flask import Flask, render_template, request, Response, stream_with_context
from llama_cpp import Llama
import time

# WSGI application object; the route decorators below register views on it.
app = Flask(__name__)

# Load the Llama model once at import time so every request shares the same
# instance (`llm` is read by the /chat view).
# NOTE(review): from_pretrained presumably fetches the GGUF file named by
# `filename` from the `repo_id` repository on first use — confirm against the
# llama_cpp documentation.
print("🚀 Loading model...")
llm = Llama.from_pretrained(
    repo_id="bartowski/google_gemma-3-1b-it-GGUF",
    filename="google_gemma-3-1b-it-IQ4_XS.gguf",
)
print("✅ Model loaded!")

@app.route("/")
def home():
    """Render and return the ``index.html`` template for the root URL."""
    print("📢 Serving index.html")
    page = render_template("index.html")
    return page

@app.route("/chat", methods=["POST"])
def chat():
    """Stream a model reply to the posted chat message as plain text.

    Expects a JSON object body whose optional ``"message"`` key holds the
    user's prompt (defaults to an empty string when the key is absent).

    Returns:
        A streaming ``text/plain`` response that yields the generated tokens
        one at a time, or a 400 response when the body is not a JSON object
        or ``"message"`` is not a string.
    """
    # silent=True returns None for a missing or unparsable JSON body instead
    # of raising, so every malformed request goes through the same 400 branch.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict):
        return Response(
            "Request body must be a JSON object.",
            status=400,
            content_type="text/plain; charset=utf-8",
        )

    user_input = payload.get("message", "")
    if not isinstance(user_input, str):
        return Response(
            'The "message" field must be a string.',
            status=400,
            content_type="text/plain; charset=utf-8",
        )
    print(f"💬 Received message: {user_input}")

    def generate_response():
        """Yield the reply token by token as the model streams it."""
        print("🤖 Generating response...")
        response = llm.create_chat_completion(
            messages=[{"role": "user", "content": user_input}],
            stream=True,  # Enable streaming response
        )

        for chunk in response:
            # Treat a missing or empty "choices" list as "no token" instead
            # of raising IndexError in the middle of the stream.
            choices = chunk.get("choices") or [{}]
            token = choices[0].get("delta", {}).get("content", "")
            if token:
                print(f"📝 Token: {token}", end="", flush=True)
                yield token  # Send token to the client
                time.sleep(0.05)  # Simulate a more natural delay

    # Declare the charset explicitly: a bare content_type="text/plain" is sent
    # as-is, leaving clients to guess the encoding of non-ASCII tokens.
    return Response(
        stream_with_context(generate_response()),
        content_type="text/plain; charset=utf-8",
    )

if __name__ == "__main__":
    import os

    # The server listens on all interfaces, so the Werkzeug interactive
    # debugger (which allows arbitrary code execution) must stay off unless
    # explicitly requested for local development via FLASK_DEBUG=1.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").lower() in {"1", "true"}
    app.run(debug=debug_enabled, host="0.0.0.0", port=7860)