Spaces: AugustLight / LLight-3.2-3b-Instruct

Sleeping

App Files Community

AugustLight committed on Oct 26, 2024

Commit

32f05f8

verified ·

1 Parent(s): f747916

Update app.py

Browse files

Files changed (1) hide show

app.py +35 -79

app.py CHANGED Viewed

@@ -3,7 +3,7 @@ from huggingface_hub import hf_hub_download
 from llama_cpp import Llama
 import os
-# Global model instance
 model = None
 def load_model():
@@ -21,11 +21,9 @@ def load_model():
         model = Llama(
             model_path=model_path,
-            n_ctx=512,
-            n_threads=os.cpu_count(),
-            n_batch=128,
-            n_gpu_layers=0,
-            embedding_cache_size=1024
         )
         print("Модель успешно инициализирована!")
@@ -39,58 +37,34 @@ def respond(message, history, system_message, max_new_tokens, temperature, top_p
     try:
         global model
         if model is None:
-            print("Загружаем модель...")
             model = load_model()
-            print("Модель загружена")
-        # Отладочная печать входных параметров
-        print(f"""
-        Входные параметры:
-        - message: {message}
-        - history length: {len(history)}
-        - system_message: {system_message}
-        - max_new_tokens: {max_new_tokens}
-        - temperature: {temperature}
-        - top_p: {top_p}
-        """)
-        # Ограничиваем историю последними 3 сообщениями
-        recent_history = history[-3:] if len(history) > 3 else history
         context = f"{system_message}\n\n"
-        for user_msg, assistant_msg in recent_history:
             context += f"User: {user_msg}\nAssistant: {assistant_msg}\n"
         context += f"User: {message}\nAssistant: "
         print(f"Генерируем ответ для контекста длиной {len(context)} символов")
-        try:
-            response = model(
-                prompt=context,
-                max_tokens=max_new_tokens,
-                temperature=temperature,
-                top_p=top_p,
-                stop=["User:", "\n\n", "<|endoftext|>"],
-                echo=False
-            )
-            generated_text = response['choices'][0]['text']
-            print(f"Ответ сгенерирован успешно, длина: {len(generated_text)}")
-            return generated_text.strip()
-        except Exception as inner_e:
-            print(f"Ошибка при генерации: {str(inner_e)}")
-            print(f"Тип ошибки: {type(inner_e).__name__}")
-            return f"Ошибка при генерации: {str(inner_e)}"
-    except KeyboardInterrupt:
-        return "Генерация прервана пользователем"
     except Exception as e:
-        print(f"Основная ошибка: {str(e)}")
-        print(f"Тип ошибки: {type(e).__name__}")
-        traceback.print_exc()  # Добавим полный стек ошибки
-        return f"Произошла ошибка: {str(e)}\nТип ошибки: {type(e).__name__}"
-# Создаем интерфейс с оптимизированными параметрами
 demo = gr.ChatInterface(
     respond,
     additional_inputs=[
@@ -100,8 +74,8 @@ demo = gr.ChatInterface(
         ),
         gr.Slider(
             minimum=1,
-            maximum=512,
-            value=128,
             step=1,
             label="Max new tokens"
         ),
@@ -120,41 +94,23 @@ demo = gr.ChatInterface(
             label="Top-p (nucleus sampling)"
         ),
     ],
-    title="LLight Chat Model (Optimized)",
-    description="Оптимизированный чат с LLight-3.2-3B",
-    examples = [
-        ["Привет! Как дела?",
-         "Ты дружелюбный и полезный ассистент. Отвечай обдуманно и по делу.",  # system_message
-         128,    # max_new_tokens
-         0.3,    # temperature
-         0.95    # top_p
-        ],
-        ["Расскажи мне о себе",
-         "Ты дружелюбный и полезный ассистент. Отвечай обдуманно и по делу.",
-         128,
-         0.3,
-         0.95
-        ],
-        ["Что ты умеешь делать?",
-         "Ты дружелюбный и полезный ассистент. Отвечай обдуманно и по делу.",
-         128,
-         0.3,
-         0.95
-        ]
     ],
-    cache_examples=True
 )
 # Запускаем приложение
 if __name__ == "__main__":
     try:
         print("Инициализация приложения...")
-        model = load_model()  # Предзагружаем модель
         print("Модель загружена успешно при старте")
     except Exception as e:
         print(f"Ошибка при инициализации: {str(e)}")
-    demo.launch(
-        show_error=True,  # Показывать подробности ошибок
-        debug=True        # Включить отладочный режим
-    )

 from llama_cpp import Llama
 import os
+# Так надо
 model = None
 def load_model():
         model = Llama(
             model_path=model_path,
+            n_ctx=2048,        # Размер контекста
+            n_threads=4,       # Количество потоков
+            n_batch=512        # Размер батча
         )
         print("Модель успешно инициализирована!")
     try:
         global model
         if model is None:
             model = load_model()
         context = f"{system_message}\n\n"
+        for user_msg, assistant_msg in history:
             context += f"User: {user_msg}\nAssistant: {assistant_msg}\n"
         context += f"User: {message}\nAssistant: "
         print(f"Генерируем ответ для контекста длиной {len(context)} символов")
+        response = model(
+            prompt=context,
+            max_tokens=max_new_tokens,
+            temperature=temperature,
+            top_p=top_p,
+            stop=["User:", "\n\n", "<|endoftext|>"],
+            echo=False  # Не возвращать промпт в ответе
+        )
+        generated_text = response['choices'][0]['text']
+        print(f"Ответ сгенерирован успешно, длина: {len(generated_text)}")
+        return generated_text.strip()
     except Exception as e:
+        error_msg = f"Произошла ошибка: {str(e)}"
+        print(error_msg)
+        return error_msg
 demo = gr.ChatInterface(
     respond,
     additional_inputs=[
         ),
         gr.Slider(
             minimum=1,
+            maximum=2048,
+            value=512,
             step=1,
             label="Max new tokens"
         ),
             label="Top-p (nucleus sampling)"
         ),
     ],
+    title="GGUF Chat Model",
+    description="Чат с GGUF моделью (LLight-3.2-3B-Instruct)",
+    examples=[
+        ["Привет! Как дела?"],
+        ["Расскажи мне о себе"],
+        ["Что ты умеешь делать?"]
     ],
+    cache_examples=False
 )
 # Запускаем приложение
 if __name__ == "__main__":
     try:
         print("Инициализация приложения...")
+        model = load_model()
         print("Модель загружена успешно при старте")
     except Exception as e:
         print(f"Ошибка при инициализации: {str(e)}")
+    demo.launch()