Spaces:

AIdeaText
/

TestOneLlama

Paused

App Files Community

AIdeaText committed on Nov 26, 2024

Commit

fa0a856

verified ·

1 Parent(s): ce284a4

Update app.py

Browse files

Files changed (1) hide show

app.py +32 -12

app.py CHANGED Viewed

@@ -1,12 +1,13 @@
 import streamlit as st
-from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
 from typing import List, Dict
 import time
 class LlamaDemo:
     def __init__(self):
-        self.model_name = "meta-llama/Llama-2-7b-chat-hf"
         # Initialize in lazy loading fashion
         self._model = None
         self._tokenizer = None
@@ -17,24 +18,31 @@ class LlamaDemo:
             self._model = AutoModelForCausalLM.from_pretrained(
                 self.model_name,
                 torch_dtype=torch.float16,
-                device_map="auto"
             )
         return self._model
     @property
     def tokenizer(self):
         if self._tokenizer is None:
-            self._tokenizer = AutoTokenizer.from_pretrained(self.model_name)
         return self._tokenizer
     def generate_response(self, prompt: str, max_length: int = 512) -> str:
-        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
         # Generate response
         with torch.no_grad():
             outputs = self.model.generate(
                 **inputs,
-                max_length=max_length,
                 num_return_sequences=1,
                 temperature=0.7,
                 do_sample=True,
@@ -42,20 +50,23 @@ class LlamaDemo:
             )
         response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
-        return response.replace(prompt, "").strip()
 def main():
     st.set_page_config(
-        page_title="Llama 3.1 Demo",
         page_icon="🦙",
         layout="wide"
     )
-    st.title("🦙 Llama 3.1 Demo")
     # Initialize session state
     if 'llama' not in st.session_state:
-        st.session_state.llama = LlamaDemo()
     if 'chat_history' not in st.session_state:
         st.session_state.chat_history = []
@@ -85,7 +96,7 @@ def main():
             with st.chat_message("assistant"):
                 message_placeholder = st.empty()
-                with st.spinner("Generating response..."):
                     response = st.session_state.llama.generate_response(prompt)
                     message_placeholder.write(response)
@@ -95,11 +106,20 @@ def main():
                     "content": response
                 })
-    # Sidebar with settings
     with st.sidebar:
         st.header("Settings")
         max_length = st.slider("Maximum response length", 64, 1024, 512)
         if st.button("Clear Chat History"):
             st.session_state.chat_history = []
             st.experimental_rerun()

 import streamlit as st
+from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
 from typing import List, Dict
 import time
 class LlamaDemo:
     def __init__(self):
+        # Using TinyLlama, which is open source and doesn't require authentication
+        self.model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
         # Initialize in lazy loading fashion
         self._model = None
         self._tokenizer = None
             self._model = AutoModelForCausalLM.from_pretrained(
                 self.model_name,
                 torch_dtype=torch.float16,
+                device_map="auto",
+                trust_remote_code=True
             )
         return self._model
     @property
     def tokenizer(self):
         if self._tokenizer is None:
+            self._tokenizer = AutoTokenizer.from_pretrained(
+                self.model_name,
+                trust_remote_code=True
+            )
         return self._tokenizer
     def generate_response(self, prompt: str, max_length: int = 512) -> str:
+        # Format the prompt according to TinyLlama's chat template
+        chat_prompt = f"<|system|>You are a helpful AI assistant.</s><|user|>{prompt}</s><|assistant|>"
+        inputs = self.tokenizer(chat_prompt, return_tensors="pt").to(self.model.device)
         # Generate response
         with torch.no_grad():
             outputs = self.model.generate(
                 **inputs,
+                max_new_tokens=max_length,
                 num_return_sequences=1,
                 temperature=0.7,
                 do_sample=True,
             )
         response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+        # Remove the prompt from the response
+        response = response.split("<|assistant|>")[-1].strip()
+        return response
 def main():
     st.set_page_config(
+        page_title="Open Source Llama Demo",
         page_icon="🦙",
         layout="wide"
     )
+    st.title("🦙 Open Source Llama Demo")
     # Initialize session state
     if 'llama' not in st.session_state:
+        with st.spinner("Loading model... This might take a few minutes..."):
+            st.session_state.llama = LlamaDemo()
     if 'chat_history' not in st.session_state:
         st.session_state.chat_history = []
             with st.chat_message("assistant"):
                 message_placeholder = st.empty()
+                with st.spinner("Thinking..."):
                     response = st.session_state.llama.generate_response(prompt)
                     message_placeholder.write(response)
                     "content": response
                 })
+    # Sidebar with settings and info
     with st.sidebar:
         st.header("Settings")
         max_length = st.slider("Maximum response length", 64, 1024, 512)
+        st.markdown("---")
+        st.markdown("""
+        ### About
+        This demo uses TinyLlama, an open source language model that's smaller but
+        still capable. It's perfect for demonstrations and testing.
+        The model is loaded locally and doesn't require any authentication or API keys.
+        """)
         if st.button("Clear Chat History"):
             st.session_state.chat_history = []
             st.experimental_rerun()