File size: 2,878 Bytes
f1fd41e
 
6f93dce
 
 
 
 
f1fd41e
 
 
 
6f93dce
 
 
 
 
f1fd41e
 
6f93dce
f1fd41e
6f93dce
f1fd41e
 
 
 
6f93dce
 
f1fd41e
6f93dce
f1fd41e
 
6f93dce
 
 
 
 
 
 
f1fd41e
 
 
 
 
 
 
 
 
 
 
 
 
6f93dce
 
 
 
 
 
f1fd41e
 
 
6f93dce
f1fd41e
6f93dce
f1fd41e
6f93dce
 
 
f1fd41e
 
6f93dce
 
 
 
 
 
f1fd41e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6f93dce
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import os
from pathlib import Path
from flask import Flask, request, jsonify
from flask_cors import CORS
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Resolve the model cache location (TRANSFORMERS_CACHE overrides the default)
# and create it, including missing parents, so later code can pass it along.
cache_dir = Path(os.environ.get('TRANSFORMERS_CACHE', '/app/cache'))
cache_dir.mkdir(exist_ok=True, parents=True)

# Flask application object; CORS is applied with flask_cors' default settings.
app = Flask(__name__)
CORS(app)

# Model configuration
MODEL_NAME = "deepseek-ai/deepseek-r1-6b-chat"
MAX_NEW_TOKENS = 256  # Reduced for free tier limits
DEVICE = "cpu"  # Force CPU for Hugging Face Spaces

# Initialize model
# The tokenizer and model are loaded once, at import time. Both globals are
# pre-bound to None so that a failed load leaves them defined: request
# handlers can then test for None instead of hitting a NameError.
tokenizer = None
model = None
try:
    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_NAME,
        cache_dir=str(cache_dir),
    )
    # NOTE(review): device_map="auto" leaves placement to the loader, while
    # generate_response() moves its inputs to DEVICE -- confirm the two agree
    # on hosts where a GPU is visible.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        cache_dir=str(cache_dir),
        device_map="auto",
        torch_dtype=torch.float32,
        low_cpu_mem_usage=True,
    )
    print("Model loaded successfully!")
except Exception as e:
    # Best-effort startup: report the failure and keep the service importable.
    # Reset both names so a half-finished load is never used.
    print(f"Model loading failed: {str(e)}")
    tokenizer = None
    model = None

def generate_response(prompt):
    """Run the module-level model on *prompt* and return the decoded text.

    The prompt is tokenized, moved to ``DEVICE`` and passed to
    ``model.generate`` with sampling enabled. The first returned sequence is
    decoded with special tokens skipped.

    Failures are not raised: any exception is converted into a string that
    starts with ``"Error generating response: "`` and returned to the caller.
    """
    try:
        encoded = tokenizer(prompt, return_tensors="pt")
        encoded = encoded.to(DEVICE)

        sampling_options = {
            "max_new_tokens": MAX_NEW_TOKENS,
            "temperature": 0.7,
            "top_p": 0.9,
            "do_sample": True,
            # The EOS token id doubles as the padding id.
            "pad_token_id": tokenizer.eos_token_id,
        }
        generated = model.generate(**encoded, **sampling_options)

        first_sequence = generated[0]
        return tokenizer.decode(first_sequence, skip_special_tokens=True)
    except Exception as exc:
        return f"Error generating response: {exc!s}"

@app.route('/chat', methods=['POST'])
def chat():
    """Handle ``POST /chat``: validate the JSON body and return a completion.

    Expects a JSON object with a string ``prompt`` field.

    Returns:
        * 200 ``{"response": <text>}`` on success,
        * 400 ``{"error": ...}`` when the body is missing, is not a JSON
          object, has no ``prompt``, or the prompt is not a non-empty string,
        * 500 ``{"error": ...}`` when the model is not loaded or generation
          raises.
    """
    if model is None:
        return jsonify({"error": "Model not loaded"}), 500

    # silent=True yields None for a missing/invalid JSON body so the client
    # gets the same JSON error shape as every other validation failure.
    data = request.get_json(silent=True)
    if not isinstance(data, dict) or 'prompt' not in data:
        return jsonify({"error": "No prompt provided"}), 400

    prompt = data['prompt']
    # Reject non-string prompts up front; calling .strip() on them would
    # raise outside the try block below and surface as an unhandled 500.
    if not isinstance(prompt, str):
        return jsonify({"error": "Prompt must be a string"}), 400

    prompt = prompt.strip()
    if not prompt:
        return jsonify({"error": "Empty prompt"}), 400

    try:
        # NOTE: generate_response() reports its own failures by returning an
        # error string, so those arrive here as ordinary response text.
        response = generate_response(prompt)
        # Clean up extra text after the final answer
        response = response.split("</s>")[0].strip()
        return jsonify({"response": response})
    except Exception as e:
        return jsonify({"error": str(e)}), 500

@app.route('/health', methods=['GET'])
def health_check():
    """Handle ``GET /health``: report model, device, cache and memory state.

    ``memory_usage`` is the CUDA memory allocated through torch, formatted in
    MB, when CUDA is available; otherwise it is the literal string ``"CPU"``.
    """
    if torch.cuda.is_available():
        allocated_mb = torch.cuda.memory_allocated() / 1024**2
        memory_usage = f"{allocated_mb:.2f}MB"
    else:
        memory_usage = "CPU"

    return jsonify({
        "model_loaded": bool(model),
        "device": DEVICE,
        "cache_dir": str(cache_dir),
        "memory_usage": memory_usage,
    })

@app.route('/')
def home():
    """Handle ``GET /``: describe the service, its endpoints and its config."""
    endpoints = {
        "POST /chat": "Process chat prompts",
        "GET /health": "Service health check",
    }
    config = {
        "max_tokens": MAX_NEW_TOKENS,
        "model": MODEL_NAME,
    }
    description = {
        "service": "DeepSeek Chat API",
        "endpoints": endpoints,
        "config": config,
    }
    return jsonify(description)

# Script entry point: when executed directly, start Flask's built-in server
# bound to all interfaces on port 5000.
if __name__ == "__main__":
    app.run(port=5000, host="0.0.0.0")