import streamlit as st
import torch
import os
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel, PeftConfig
from huggingface_hub import login

# Set page config for better display
st.set_page_config(page_title="LLaMA Chatbot", page_icon="πŸ¦™")
status_placeholder = st.empty()

# Check GPU
if torch.cuda.is_available():
    st.sidebar.success("βœ… CUDA is available")
    st.sidebar.info(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    st.sidebar.warning("⚠️ CUDA is not available. Using CPU.")

# Authentication with simplified token handling
try:
    # Look for a token in the environment first, then fall back to Streamlit secrets
    hf_token = os.environ.get("HF_TOKEN")
    if not hf_token:
        hf_token = st.secrets.get("HF_TOKEN")
    
    if hf_token:
        st.info(f"Token found! First 4 characters: {hf_token[:4]}...")
        login(token=hf_token)
        status_placeholder.success("πŸ”‘ Successfully logged in to Hugging Face!")
    else:
        st.error("No token found in any location")
        st.stop()
        
except Exception as e:
    st.error(f"🚫 Error with HF token: {str(e)}")
    st.stop()
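
# For reference: HF_TOKEN can be supplied either as an environment variable or via
# Streamlit secrets, e.g. in .streamlit/secrets.toml (placeholder value shown):
#   HF_TOKEN = "hf_xxxxxxxxxxxx"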

st.title("πŸ¦™ LLaMA Chatbot")

# Model loading with detailed status updates
@st.cache_resource
def load_model():
    try:
        model_path = "Alaaeldin/Llama-demo"
        
        with st.spinner("πŸ”„ Loading tokenizer..."):
            tokenizer = AutoTokenizer.from_pretrained(
                model_path,
                token=hf_token,
                trust_remote_code=True
            )
            st.success("βœ… Tokenizer loaded!")
        
        with st.spinner("πŸ”„ Loading model... This might take a few minutes..."):
            model = AutoModelForCausalLM.from_pretrained(
                model_path,
                torch_dtype=torch.float16,
                device_map="auto",
                token=hf_token,
                trust_remote_code=True
            )
            st.success("βœ… Model loaded!")
        
        return model, tokenizer
    except Exception as e:
        st.error(f"❌ Error loading model: {str(e)}")
        return None, None
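
# The peft imports above are not used by load_model(). If "Alaaeldin/Llama-demo"
# were published as a LoRA adapter rather than a merged model, it could be attached
# to its base model roughly as sketched below. This helper is never called here;
# base_model_id is a hypothetical parameter, not something defined by this app.
def load_peft_adapter(adapter_path, base_model_id=None):
    config = PeftConfig.from_pretrained(adapter_path, token=hf_token)
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_id or config.base_model_name_or_path,
        torch_dtype=torch.float16,
        device_map="auto",
        token=hf_token,
    )
    return PeftModel.from_pretrained(base_model, adapter_path, token=hf_token)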

# Initialize chat history
if "messages" not in st.session_state:
    st.session_state.messages = []

# Load model
model, tokenizer = load_model()

# Chat interface
if model and tokenizer:
    st.success("✨ Ready to chat! Enter your message below.")
    
    # Display chat history
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

    # Chat input
    if prompt := st.chat_input("Speak thy mind..."):
        # Add user message to chat history
        st.session_state.messages.append({"role": "user", "content": prompt})
        
        # Display user message
        with st.chat_message("user"):
            st.markdown(prompt)
            
        # Generate response
        with st.chat_message("assistant"):
            with st.spinner("πŸ€” Composing a verse..."):
                try:
                    # Prepare input
                    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
                    
                    # Generate a response (pass attention_mask along with input_ids)
                    with torch.no_grad():
                        outputs = model.generate(
                            **inputs,
                            max_length=200,
                            num_return_sequences=1,
                            temperature=0.7,
                            do_sample=True,
                            pad_token_id=tokenizer.eos_token_id
                        )

                    # Decode only the newly generated tokens so the prompt is not echoed back
                    prompt_length = inputs["input_ids"].shape[-1]
                    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
                    
                    # Display response
                    st.markdown(response)
                    
                    # Add assistant response to chat history
                    st.session_state.messages.append({"role": "assistant", "content": response})
                except Exception as e:
                    st.error(f"Error generating response: {str(e)}")

else:
    st.error("⚠️ Model loading failed. Please check the error messages above.")