import torch
from flask import Flask, jsonify, request
from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer

app = Flask(__name__)

# Load Zephyr-7B once at startup. device_map="auto" (which requires the
# `accelerate` package) places the weights on GPU when one is available;
# fp16 halves memory on GPU, while CPU falls back to fp32.
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
model = AutoModelForCausalLM.from_pretrained(
    "HuggingFaceH4/zephyr-7b-beta",
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
)


@app.route("/api/chat", methods=["POST"])
def chat():
    data = request.get_json()
    question = data.get("question", "")

    # Spanish system-style prompt: "You are BITER, an expert business mentor.
    # You always answer in Spanish with brief, useful advice."
    prompt = (
        "Eres BITER, un mentor experto en negocios. "
        "Siempre respondes en español con consejos breves y útiles.\n"
        f"Usuario: {question}\nBITER:"
    )

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=200,
        pad_token_id=tokenizer.eos_token_id,  # silence the missing-pad-token warning
    )

    # The decoded text includes the prompt itself, so keep only what follows
    # the final "BITER:" marker.
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    respuesta_final = response.split("BITER:")[-1].strip()

    # Mimic the OpenAI chat-completions response shape.
    return jsonify({"choices": [{"message": {"content": respuesta_final}}]})


def run():
    # 0.0.0.0:7860 is the host/port convention for Hugging Face Spaces.
    app.run(host="0.0.0.0", port=7860)


# Serve from a background (non-daemon) thread so the importing process
# is not blocked and the server keeps running after import finishes.
Thread(target=run).start()
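
# A quick way to exercise the endpoint once the server is up (a minimal
# sketch, not part of the service itself: it assumes the server is
# reachable on localhost:7860 and that the `requests` package is
# installed -- neither is guaranteed by the script above):
#
#   import requests
#
#   r = requests.post(
#       "http://localhost:7860/api/chat",
#       json={"question": "¿Cómo valido mi idea de negocio?"},
#   )
#   print(r.json()["choices"][0]["message"]["content"])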