Rulga committed on
Commit
ea4670b
·
1 Parent(s): 373afaa

old version

Browse files
Files changed (6) hide show
  1. .gitignore +0 -2
  2. README.md +8 -8
  3. app.py +220 -384
  4. colab_request.py +39 -0
  5. requirements.txt +11 -8
  6. run.sh +4 -1
.gitignore CHANGED
@@ -4,5 +4,3 @@
4
  venv
5
  .streamlit/secrets.toml
6
 
7
- stop_space.py
8
- colab_request.py
 
4
  venv
5
  .streamlit/secrets.toml
6
 
 
 
README.md CHANGED
@@ -1,15 +1,15 @@
1
  ---
2
- title: 'Doc LS Chatbot '
3
- emoji: 🔥
4
- colorFrom: yellow
5
- colorTo: yellow
6
- sdk: docker
7
- sdk_version: 1.42.2
8
  app_file: app.py
9
  pinned: false
10
  short_description: It is a chat built with an AI model about www.Status.law
11
  ---
12
 
13
- # LS DOC Chatbot Log
14
 
15
- It is a chat app built using Hugging Face and Docker Space that allows users to interact with an AI model to communicate about www.Status.law
 
1
  ---
2
+ title: LS Chatbot Log
3
+ emoji: 🌍
4
+ colorFrom: blue
5
+ colorTo: blue
6
+ sdk: streamlit
7
+ sdk_version: 1.42.0
8
  app_file: app.py
9
  pinned: false
10
  short_description: It is a chat built with an AI model about www.Status.law
11
  ---
12
 
13
+ # LS Chatbot Log
14
 
15
+ A chat app built with Streamlit that lets users ask an AI model questions about www.Status.law.
app.py CHANGED
@@ -1,430 +1,266 @@
1
- 1/0
2
-
3
  import os
4
- import sys
5
- import json
6
- import traceback
7
- import warnings
8
- import asyncio
9
- import aiohttp
10
- from datetime import datetime
11
- from typing import Optional, List, Dict
12
- import logging
13
-
14
- # Настройка логгера
15
- logger = logging.getLogger(__name__)
16
- logging.basicConfig(
17
- level=logging.INFO,
18
- format='%(asctime)s - %(levelname)s - %(message)s'
19
- )
20
-
21
- from bs4 import BeautifulSoup
22
  from dotenv import load_dotenv
23
- from fastapi import FastAPI, HTTPException, BackgroundTasks
24
- from pydantic import BaseModel
25
  from langchain_groq import ChatGroq
26
  from langchain_huggingface import HuggingFaceEmbeddings
27
  from langchain_community.vectorstores import FAISS
28
  from langchain_text_splitters import RecursiveCharacterTextSplitter
 
29
  from langchain_core.prompts import PromptTemplate
30
  from langchain_core.output_parsers import StrOutputParser
31
- from langchain_core.tracers import ConsoleCallbackHandler
32
- from langchain_core.callbacks import CallbackManager
33
- from langchain_core.documents import Document
34
-
35
- # Ignore SSL warnings
36
- warnings.filterwarnings('ignore')
37
 
38
  # Initialize environment variables
39
  load_dotenv()
40
 
41
- # Проверяем наличие необходимых переменных окружения
42
- required_env_vars = ["GROQ_API_KEY"]
43
- missing_vars = [var for var in required_env_vars if not os.getenv(var)]
44
- if missing_vars:
45
- raise EnvironmentError(f"Missing required environment variables: {', '.join(missing_vars)}")
46
-
47
- # Проверяем наличие и права доступа к директориям кэша
48
- cache_dir = "/app/.cache"
49
- if not os.path.exists(cache_dir):
50
- os.makedirs(cache_dir, exist_ok=True)
51
- os.chmod(cache_dir, 0o777)
52
-
53
- hf_cache_dir = os.path.join(cache_dir, "huggingface")
54
- if not os.path.exists(hf_cache_dir):
55
- os.makedirs(hf_cache_dir, exist_ok=True)
56
- os.chmod(hf_cache_dir, 0o777)
57
-
58
- logger.info(f"Cache directories initialized: {cache_dir}, {hf_cache_dir}")
59
-
60
- # Initialize FastAPI app
61
- app = FastAPI(title="Status Law Assistant API")
62
-
63
- # Константы
64
- EMBEDDING_MODEL = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
65
- VECTOR_STORE_PATH = "vector_store"
66
- KB_CONFIG_PATH = "vector_store/kb_config.json"
67
- CACHE_DIR = "cache"
68
-
69
- # Создаем необходимые директории
70
- os.makedirs(VECTOR_STORE_PATH, exist_ok=True)
71
- os.makedirs(CACHE_DIR, exist_ok=True)
72
-
73
- def get_kb_config():
74
- if os.path.exists(KB_CONFIG_PATH):
75
- with open(KB_CONFIG_PATH, 'r') as f:
76
- return json.load(f)
77
- return {
78
- "version": 1,
79
- "processed_urls": [],
80
- "last_update": None
81
  }
 
 
 
 
82
 
83
- def save_kb_config(config):
84
- os.makedirs(os.path.dirname(KB_CONFIG_PATH), exist_ok=True)
85
- with open(KB_CONFIG_PATH, 'w') as f:
86
- json.dump(config, f)
87
-
88
- # Models for request/response
89
- class ChatRequest(BaseModel):
90
- message: str
91
-
92
- class ChatResponse(BaseModel):
93
- response: str
94
- context: Optional[str] = None
95
-
96
- # Global variables
97
- URLS = [
98
- "https://status.law",
99
- "https://status.law/about",
100
- "https://status.law/careers",
101
- "https://status.law/tariffs-for-services-against-extradition-en/",
102
- "https://status.law/challenging-sanctions",
103
- "https://status.law/law-firm-contact-legal-protection",
104
- "https://status.law/cross-border-banking-legal-issues",
105
- "https://status.law/extradition-defense",
106
- "https://status.law/international-prosecution-protection",
107
- "https://status.law/interpol-red-notice-removal",
108
- "https://status.law/practice-areas",
109
- "https://status.law/reputation-protection",
110
- "https://status.law/faq"
111
- ]
112
-
113
- # Enhanced logging
114
- class CustomCallbackHandler(ConsoleCallbackHandler):
115
- def on_chain_end(self, run):
116
  log_entry = {
117
  "timestamp": datetime.now().isoformat(),
118
- "run_id": str(run.id),
119
- "inputs": run.inputs,
120
- "outputs": run.outputs,
121
- "execution_time": run.end_time - run.start_time if run.end_time else None,
122
- "metadata": run.metadata
123
  }
124
 
125
  os.makedirs("chat_history", exist_ok=True)
126
- with open("chat_history/detailed_logs.json", "a", encoding="utf-8") as f:
127
- json.dump(log_entry, f, ensure_ascii=False)
128
- f.write("\n")
 
 
 
 
 
129
 
 
 
130
  def init_models():
131
- """Инициализация моделей с обработкой ошибок"""
132
  try:
133
- callback_handler = CustomCallbackHandler()
134
- callback_manager = CallbackManager([callback_handler])
135
-
136
- # Инициализация LLM
137
  llm = ChatGroq(
138
  model_name="llama-3.3-70b-versatile",
139
  temperature=0.6,
140
- api_key=os.getenv("GROQ_API_KEY"),
141
- callback_manager=callback_manager
142
  )
143
-
144
- # Инициализация embeddings с явным указанием кэша
145
  embeddings = HuggingFaceEmbeddings(
146
- model_name=EMBEDDING_MODEL,
147
- cache_folder=hf_cache_dir
148
  )
149
-
150
- logger.info("Models initialized successfully")
151
  return llm, embeddings
152
-
153
- except Exception as e:
154
- logger.error(f"Model initialization error: {str(e)}")
155
- logger.error(traceback.format_exc())
156
- raise Exception(f"Model initialization failed: {str(e)}")
157
-
158
- async def fetch_url(session, url):
159
- cache_file = os.path.join(CACHE_DIR, f"{url.replace('/', '_').replace(':', '_')}.html")
160
-
161
- # Проверяем кэш
162
- if os.path.exists(cache_file):
163
- with open(cache_file, 'r', encoding='utf-8') as f:
164
- return url, f.read()
165
-
166
- try:
167
- async with session.get(url, ssl=False, timeout=30) as response:
168
- if response.status == 200:
169
- content = await response.text()
170
- # Сохраняем в кэш
171
- with open(cache_file, 'w', encoding='utf-8') as f:
172
- f.write(content)
173
- return url, content
174
- else:
175
- logger.warning(f"Failed to load {url}, status code: {response.status}")
176
- return url, None
177
  except Exception as e:
178
- logger.error(f"Error fetching {url}: {str(e)}")
179
- return url, None
180
 
181
- def process_html_content(url, html_content):
182
- if not html_content:
183
- return None
184
-
185
- soup = BeautifulSoup(html_content, 'html.parser')
186
-
187
- # Remove script and style elements
188
- for script in soup(["script", "style"]):
189
- script.decompose()
190
-
191
- # Get text content
192
- text = soup.get_text()
193
-
194
- # Clean up text
195
- lines = (line.strip() for line in text.splitlines())
196
- chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
197
- text = ' '.join(chunk for chunk in chunks if chunk)
198
-
199
- if not text.strip():
200
- return None
201
-
202
- return Document(page_content=text, metadata={"source": url})
203
-
204
- async def load_all_urls(urls_to_process):
205
- documents = []
206
-
207
- async with aiohttp.ClientSession() as session:
208
- tasks = [fetch_url(session, url) for url in urls_to_process]
209
- results = await asyncio.gather(*tasks)
210
-
211
- for url, content in results:
212
- if content:
213
- doc = process_html_content(url, content)
214
- if doc:
215
- documents.append(doc)
216
- logger.info(f"Successfully processed content from {url}")
217
- else:
218
- logger.warning(f"No useful content extracted from {url}")
219
- else:
220
- logger.warning(f"Failed to load content from {url}")
221
-
222
- return documents
223
 
224
- async def build_knowledge_base_async(embeddings, force_rebuild=False):
225
- """
226
- Асинхронное построение базы знаний.
227
- Параметр force_rebuild позволяет принудительно обновить все URL.
228
- """
229
  try:
230
- logger.info("Starting knowledge base construction...")
231
- kb_config = get_kb_config()
232
 
233
- # Определяем URL для обработки
234
- if force_rebuild:
235
- urls_to_process = URLS
236
- kb_config["processed_urls"] = []
237
- logger.info("Forcing rebuild of entire knowledge base")
238
- else:
239
- urls_to_process = [url for url in URLS if url not in kb_config["processed_urls"]]
240
-
241
- if not urls_to_process:
242
- logger.info("No new URLs to process")
243
- return FAISS.load_local(VECTOR_STORE_PATH, embeddings, allow_dangerous_deserialization=True)
244
 
245
- logger.info(f"Processing {len(urls_to_process)} new URLs")
246
-
247
- documents = await load_all_urls(urls_to_process)
248
-
249
- if not documents:
250
- if kb_config["processed_urls"] and os.path.exists(os.path.join(VECTOR_STORE_PATH, "index.faiss")):
251
- logger.info("No new documents to add, loading existing vector store")
252
- return FAISS.load_local(VECTOR_STORE_PATH, embeddings, allow_dangerous_deserialization=True)
253
- raise Exception("No documents were successfully loaded!")
254
-
255
- logger.info(f"Total new documents loaded: {len(documents)}")
256
-
257
- # Увеличиваем размер чанков
258
- text_splitter = RecursiveCharacterTextSplitter(
259
- chunk_size=2500, # Увеличенный размер чанка
260
- chunk_overlap=100
261
- )
262
- logger.info("Splitting documents into chunks...")
263
- chunks = text_splitter.split_documents(documents)
264
- logger.info(f"Created {len(chunks)} chunks")
265
-
266
- # Если есть существующая база знаний и мы не выполняем полное обновление, добавляем к ней
267
- if not force_rebuild and os.path.exists(os.path.join(VECTOR_STORE_PATH, "index.faiss")):
268
- logger.info("Loading existing vector store...")
269
- vector_store = FAISS.load_local(VECTOR_STORE_PATH, embeddings, allow_dangerous_deserialization=True)
270
- logger.info("Adding new documents to existing vector store...")
271
- vector_store.add_documents(chunks)
272
- else:
273
- logger.info("Creating new vector store...")
274
- vector_store = FAISS.from_documents(chunks, embeddings)
275
-
276
- logger.info("Saving vector store...")
277
- vector_store.save_local(folder_path=VECTOR_STORE_PATH, index_name="index")
278
-
279
- # Обновляем конфигурацию
280
- for url in urls_to_process:
281
- if url not in kb_config["processed_urls"]:
282
- kb_config["processed_urls"].append(url)
283
 
284
- kb_config["version"] += 1
285
- kb_config["last_update"] = datetime.now().isoformat()
286
- save_kb_config(kb_config)
287
-
288
- logger.info(f"Knowledge base updated to version {kb_config['version']}")
289
- return vector_store
290
-
291
- except Exception as e:
292
- logger.error(f"Error in build_knowledge_base: {str(e)}")
293
- traceback.print_exc()
294
- raise Exception(f"Knowledge base creation failed: {str(e)}")
295
-
296
- # Initialize models and knowledge base on startup
297
- llm, embeddings = init_models()
298
- vector_store = None
299
-
300
- @app.on_event("startup")
301
- async def startup_event():
302
- global vector_store
303
- try:
304
- # Проверяем существование базы знаний
305
- if os.path.exists(os.path.join(VECTOR_STORE_PATH, "index.faiss")):
306
- vector_store = FAISS.load_local(
307
- VECTOR_STORE_PATH,
308
- embeddings,
309
- allow_dangerous_deserialization=True
310
  )
311
- logger.info("Existing knowledge base loaded successfully")
312
- else:
313
- logger.info("No existing knowledge base found. Use /rebuild-kb endpoint to create one")
314
- except Exception as e:
315
- logger.error(f"Error during startup: {str(e)}")
316
- vector_store = None
317
-
318
- # API endpoints
319
- @app.post("/chat", response_model=ChatResponse)
320
- async def chat_endpoint(request: ChatRequest):
321
- global vector_store
322
-
323
- # Проверяем, инициализирована ли база знаний
324
- if vector_store is None:
325
- raise HTTPException(
326
- status_code=503,
327
- detail="Knowledge base not initialized. Please use /rebuild-kb endpoint first."
328
- )
329
-
330
- try:
331
- # Retrieve context
332
- context_docs = vector_store.similarity_search(request.message, k=3) # Ограничиваем количество документов
333
- context_text = "\n".join([d.page_content for d in context_docs])
334
-
335
- # Generate response
336
- prompt_template = PromptTemplate.from_template('''
337
- You are a helpful and polite legal assistant at Status Law.
338
- You answer in the language in which the question was asked.
339
- Answer the question based on the context provided.
340
- If you cannot answer based on the context, say so politely and offer to contact Status Law directly via the following channels:
341
- - For all users: +32465594521 (landline phone).
342
- - For English and Swedish speakers only: +46728495129 (available on WhatsApp, Telegram, Signal, IMO).
343
- - Provide a link to the contact form: [Contact Form](https://status.law/law-firm-contact-legal-protection/).
344
-
345
- Context: {context}
346
- Question: {question}
347
 
348
- Response Guidelines:
349
- 1. Answer in the user's language
350
- 2. Cite sources when possible
351
- 3. Offer contact options if unsure
352
- ''')
353
-
354
- chain = prompt_template | llm | StrOutputParser()
355
- response = chain.invoke({
356
- "context": context_text,
357
- "question": request.message
358
- })
359
-
360
- # Log interaction
361
- log_interaction(request.message, response, context_text)
362
-
363
- return ChatResponse(response=response, context=context_text)
364
-
365
  except Exception as e:
366
- raise HTTPException(status_code=500, detail=str(e))
367
-
368
- @app.post("/rebuild-kb")
369
- async def rebuild_knowledge_base(background_tasks: BackgroundTasks, force: bool = False):
370
- """
371
- Rebuild knowledge base in the background
 
 
 
 
 
372
 
373
- - force: если True, перестраивает всю базу знаний с нуля
374
- """
375
- global vector_store
 
 
 
376
 
377
- try:
378
- # Запускаем в фоне
379
- background_tasks.add_task(_rebuild_kb_task, force)
380
- action = "rebuild" if force else "update"
381
- return {"status": "success", "message": f"Knowledge base {action} started in background"}
382
- except Exception as e:
383
- raise HTTPException(status_code=500, detail=str(e))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
384
 
385
- async def _rebuild_kb_task(force: bool = False):
386
- """Фоновая задача для обновления базы знаний"""
387
- global vector_store
388
- try:
389
- vector_store = await build_knowledge_base_async(embeddings, force_rebuild=force)
390
- logger.info("Knowledge base rebuild completed successfully")
391
- except Exception as e:
392
- logger.error(f"Knowledge base rebuild failed: {str(e)}")
393
 
394
- @app.get("/kb-status")
395
- async def get_kb_status():
396
- """Get current knowledge base status"""
397
- global vector_store
398
-
399
- kb_config = get_kb_config()
400
- return {
401
- "initialized": vector_store is not None,
402
- "version": kb_config["version"],
403
- "total_urls": len(URLS),
404
- "processed_urls": len(kb_config["processed_urls"]),
405
- "pending_urls": len([url for url in URLS if url not in kb_config["processed_urls"]]),
406
- "last_update": kb_config["last_update"]
407
- }
408
 
409
- def log_interaction(user_input: str, bot_response: str, context: str):
410
- try:
411
- kb_config = get_kb_config()
412
- log_entry = {
413
- "timestamp": datetime.now().isoformat(),
414
- "user_input": user_input,
415
- "bot_response": bot_response,
416
- "context": context[:500],
417
- "kb_version": kb_config["version"] # Используем актуальную версию
418
- }
419
-
420
- os.makedirs("chat_history", exist_ok=True)
421
- with open("chat_history/chat_logs.json", "a", encoding="utf-8") as f:
422
- f.write(json.dumps(log_entry, ensure_ascii=False) + "\n")
423
-
424
- except Exception as e:
425
- logger.error(f"Logging error: {str(e)}")
426
- logger.error(traceback.format_exc())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
427
 
428
  if __name__ == "__main__":
429
- import uvicorn
430
- uvicorn.run(app, host="0.0.0.0", port=8000)
 
 
 
1
  import os
2
+ import time
3
+ import streamlit as st
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  from dotenv import load_dotenv
 
 
5
  from langchain_groq import ChatGroq
6
  from langchain_huggingface import HuggingFaceEmbeddings
7
  from langchain_community.vectorstores import FAISS
8
  from langchain_text_splitters import RecursiveCharacterTextSplitter
9
+ from langchain_community.document_loaders import WebBaseLoader
10
  from langchain_core.prompts import PromptTemplate
11
  from langchain_core.output_parsers import StrOutputParser
12
+ from datetime import datetime
13
+ import json
14
+ import traceback
 
 
 
15
 
16
  # Initialize environment variables
17
  load_dotenv()
18
 
19
+ # --------------- Session State Initialization ---------------
20
def init_session_state():
    """Seed ``st.session_state`` with the keys the app relies on.

    Keys that are already present are left untouched; only missing keys
    receive their initial value.
    """
    initial_values = dict(
        kb_info=dict(build_time=None, size=None, version='1.1'),
        messages=[],
        vector_store=None,
        models_initialized=False,
    )

    # Fill in what is missing; never overwrite existing state.
    absent = [name for name in initial_values if name not in st.session_state]
    for name in absent:
        st.session_state[name] = initial_values[name]
36
 
37
+ # --------------- Enhanced Logging ---------------
38
def log_interaction(user_input: str, bot_response: str, context: str):
    """Append one chat exchange to ``chat_history/chat_logs.json``.

    Each call appends a single JSON object followed by a newline. Any
    exception is reported through ``st.error`` and its traceback is printed;
    nothing is re-raised.

    Args:
        user_input: Text the user sent.
        bot_response: Text shown as the reply (or the error message).
        context: Retrieved context; only the first 500 characters are stored.
    """
    try:
        record = {
            "timestamp": datetime.now().isoformat(),
            "user_input": user_input,
            "bot_response": bot_response,
            "context": context[:500],
            "kb_version": st.session_state.kb_info['version'],
        }

        history_dir = "chat_history"
        os.makedirs(history_dir, exist_ok=True)
        target = os.path.join(history_dir, "chat_logs.json")

        with open(target, "a", encoding="utf-8") as handle:
            handle.write(json.dumps(record, ensure_ascii=False) + "\n")

    except Exception as exc:
        st.error(f"Logging error: {str(exc)}")
        print(traceback.format_exc())
58
 
59
+ # --------------- Model Initialization ---------------
60
@st.cache_resource
def init_models():
    """Create the Groq chat model and the HuggingFace embedding model.

    The function is wrapped in ``st.cache_resource``. On success it sets
    ``st.session_state.models_initialized`` to True and returns the pair.
    On any exception it shows the message with ``st.error`` and calls
    ``st.stop()``.

    Returns:
        tuple: ``(llm, embeddings)``.
    """
    try:
        chat_model = ChatGroq(
            model_name="llama-3.3-70b-versatile",
            temperature=0.6,
            api_key=os.getenv("GROQ_API_KEY"),
        )
        embedder = HuggingFaceEmbeddings(
            model_name="intfloat/multilingual-e5-large-instruct",
        )
        # NOTE(review): this write only happens when the function body runs;
        # presumably a cached result skips it - confirm against Streamlit docs.
        st.session_state.models_initialized = True
        models = (chat_model, embedder)
        return models
    except Exception as failure:
        st.error(f"Model initialization failed: {str(failure)}")
        st.stop()
77
 
78
+ # --------------- Knowledge Base Management ---------------
79
# Folder where the FAISS index files are saved and loaded from.
VECTOR_STORE_PATH = "vector_store"

# Pages of status.law that are loaded into the knowledge base.
# Every entry needs its own trailing comma: Python silently concatenates
# adjacent string literals, which would merge two URLs into one bogus URL.
URLS = [
    "https://status.law",
    "https://status.law/about",
    "https://status.law/careers",
    "https://status.law/tariffs-for-services-of-protection-against-extradition",
    "https://status.law/challenging-sanctions",
    "https://status.law/law-firm-contact-legal-protection",
    "https://status.law/cross-border-banking-legal-issues",
    "https://status.law/extradition-defense",
    "https://status.law/international-prosecution-protection",
    "https://status.law/interpol-red-notice-removal",
    "https://status.law/practice-areas",
    "https://status.law/reputation-protection",
    "https://status.law/faq",
]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
 
96
def build_knowledge_base(_embeddings):
    """Download the pages in ``URLS`` and build a FAISS index from them.

    Pages that fail to load are reported with ``st.error`` and skipped. The
    loaded documents are split into 500-character chunks (100 overlap),
    embedded, and saved under ``VECTOR_STORE_PATH`` as ``index.faiss`` /
    ``index.pkl``. On success ``st.session_state.kb_info`` is updated with
    the build time (seconds), the on-disk size (MiB) and a timestamp version.

    The target folder is created only right before saving, so a build that
    loads no documents does not leave an empty folder behind.

    Args:
        _embeddings: Embedding model passed to ``FAISS.from_documents``.

    Returns:
        The new vector store, or ``None`` when no document could be loaded.
        On any other failure debug information is shown and ``st.stop()``
        is called.
    """
    # Defined before the try so the error handler can always report on them.
    documents = []
    chunks = []

    try:
        start_time = time.time()

        with st.status("Building knowledge base..."):
            # Load documents; a failing URL is reported and skipped.
            for url in URLS:
                try:
                    documents.extend(WebBaseLoader(url).load())
                    st.write(f" Loaded {url}")
                except Exception as e:
                    st.error(f"Failed to load {url}: {str(e)}")

            if not documents:
                st.error("No documents loaded!")
                return None

            # Split into chunks.
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=500,
                chunk_overlap=100,
            )
            chunks = text_splitter.split_documents(documents)

            # Embed first, then create the folder and save explicitly.
            vector_store = FAISS.from_documents(chunks, _embeddings)
            os.makedirs(VECTOR_STORE_PATH, exist_ok=True)
            vector_store.save_local(
                folder_path=VECTOR_STORE_PATH,
                index_name="index",
            )

            # Verify the index file was actually written.
            if not os.path.exists(os.path.join(VECTOR_STORE_PATH, "index.faiss")):
                raise RuntimeError("FAISS index file not created!")

            # Record build metadata.
            st.session_state.kb_info.update({
                'build_time': time.time() - start_time,
                'size': sum(
                    os.path.getsize(os.path.join(VECTOR_STORE_PATH, f))
                    for f in ["index.faiss", "index.pkl"]
                ) / (1024 ** 2),
                'version': datetime.now().strftime("%Y%m%d-%H%M%S"),
            })

            st.success("Knowledge base successfully created!")
            return vector_store

    except Exception as e:
        st.error(f"Knowledge base creation failed: {str(e)}")
        # Debug information
        st.write("Debug info:")
        st.write(f"Documents loaded: {len(documents)}")
        st.write(f"Chunks created: {len(chunks)}")
        st.write(f"Vector store path exists: {os.path.exists(VECTOR_STORE_PATH)}")
        st.stop()
160
+ # --------------- Main Application ---------------
161
def main():
    """Run the Streamlit chat application.

    Flow: initialize session state and page, load the models, make sure a
    knowledge base exists (offering a button to build one), replay the chat
    history, then answer a newly submitted question using context retrieved
    from the vector store. Each exchange, or the error text when generation
    fails, is passed to ``log_interaction``.
    """
    # Initialize session state first
    init_session_state()

    # Page configuration
    st.set_page_config(
        page_title="Status Law Assistant",
        page_icon="⚖️",
        layout="wide"
    )

    # Display header
    st.markdown('''
        <h1 style="border-bottom: 2px solid #444; padding-bottom: 10px;">
        ⚖️ <a href="https://status.law/" style="text-decoration: none; color: #2B5876;">Status.Law</a> Legal Assistant
        </h1>
    ''', unsafe_allow_html=True)

    # Initialize models
    llm, embeddings = init_models()

    # Knowledge base initialization. Test for the index file itself: the
    # folder alone may exist without a usable index inside it.
    index_file = os.path.join(VECTOR_STORE_PATH, "index.faiss")
    if not os.path.exists(index_file):
        st.warning("Knowledge base not initialized")
        if st.button("Create Knowledge Base"):
            built_store = build_knowledge_base(embeddings)
            # Rerun only after a successful build; otherwise keep the
            # error messages from the build visible on this run.
            if built_store is not None:
                st.session_state.vector_store = built_store
                st.rerun()
        return

    if st.session_state.vector_store is None:
        try:
            # NOTE(review): allow_dangerous_deserialization loads a pickle
            # file; keep it only while the index folder is written solely
            # by this app.
            st.session_state.vector_store = FAISS.load_local(
                VECTOR_STORE_PATH,
                embeddings,
                allow_dangerous_deserialization=True
            )
        except Exception as e:
            st.error(f"Failed to load knowledge base: {str(e)}")
            st.stop()

    # Chat interface: replay the stored conversation
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

    if prompt := st.chat_input("Ask your legal question"):
        # Add user message to chat history
        st.session_state.messages.append({"role": "user", "content": prompt})
        with st.chat_message("user"):
            st.markdown(prompt)

        # Generate response
        with st.chat_message("assistant"):
            try:
                # Retrieve context
                context_docs = st.session_state.vector_store.similarity_search(prompt)
                context_text = "\n".join([d.page_content for d in context_docs])

                # Generate response
                prompt_template = PromptTemplate.from_template('''
                You are a helpful and polite legal assistant at Status Law.
                You answer in the language in which the question was asked.
                Answer the question based on the context provided.
                If you cannot answer based on the context, say so politely and offer to contact Status Law directly via the following channels:
                - For all users: +32465594521 (landline phone).
                - For English and Swedish speakers only: +46728495129 (available on WhatsApp, Telegram, Signal, IMO).
                - Provide a link to the contact form: [Contact Form](https://status.law/law-firm-contact-legal-protection/).
                If the user has questions about specific services and their costs, suggest they visit the page https://status.law/tariffs-for-services-of-protection-against-extradition-and-international-prosecution/ for detailed information.

                Ask the user additional questions to understand which service to recommend and provide an estimated cost. For example, clarify their situation and needs to suggest the most appropriate options.

                Also, offer free consultations if they are available and suitable for the user's request.
                Answer professionally but in a friendly manner.

                Example:
                Q: How can I challenge the sanctions?
                A: To challenge the sanctions, you should consult with our legal team, who specialize in this area. Please contact us directly for detailed advice. You can fill out our contact form here: [Contact Form](https://status.law/law-firm-contact-legal-protection/).

                Context: {context}
                Question: {question}

                Response Guidelines:
                1. Answer in the user's language
                2. Cite sources when possible
                3. Offer contact options if unsure
                ''')

                chain = prompt_template | llm | StrOutputParser()
                response = chain.invoke({
                    "context": context_text,
                    "question": prompt
                })

                # Display and log
                st.markdown(response)
                log_interaction(prompt, response, context_text)
                st.session_state.messages.append({"role": "assistant", "content": response})

            except Exception as e:
                error_msg = f"Error generating response: {str(e)}"
                st.error(error_msg)
                log_interaction(prompt, error_msg, "")
                print(traceback.format_exc())


if __name__ == "__main__":
    main()
 
colab_request.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import time
3
+
4
+ base_url = "https://rulga-doc-chat.hf.space"
5
+ max_retries = 10 # Максимальное количество попыток
6
+ retry_delay = 30 # Задержка между попытками в секундах
7
+
8
def wait_for_service(timeout=10):
    """Poll ``base_url`` until the service answers or retries run out.

    The service counts as ready when a GET returns HTTP 200 and the body
    does not contain "Could not parse JSON". Request errors (including
    timeouts) count as a failed attempt.

    Args:
        timeout: Seconds to wait for each individual request, so a stalled
            connection cannot block the loop forever.

    Returns:
        True as soon as the service is ready, False after ``max_retries``
        unsuccessful attempts.
    """
    print("Waiting for the service to start...")
    for attempt in range(1, max_retries + 1):
        try:
            response = requests.get(base_url, timeout=timeout)
        except requests.exceptions.RequestException:
            response = None

        if (
            response is not None
            and response.status_code == 200
            and "Could not parse JSON" not in response.text
        ):
            print(f"Service is ready after {attempt} attempts!")
            return True

        if attempt < max_retries:
            print(f"Attempt {attempt}/{max_retries}. Service is still starting. Waiting {retry_delay} seconds...")
            time.sleep(retry_delay)
        else:
            # No point sleeping after the final attempt.
            print(f"Attempt {attempt}/{max_retries}. Service is still not ready.")

    return False
23
+
24
# Once the service is up, request a forced knowledge base rebuild and then
# print the knowledge base status. Every request has a timeout, and network
# errors are reported instead of ending the script with a raw traceback.
if wait_for_service():
    try:
        # Start the knowledge base build
        print("\nSending rebuild request...")
        rebuild_url = f"{base_url}/rebuild-kb"
        response = requests.post(rebuild_url, params={"force": True}, timeout=60)
        print(f"Status code: {response.status_code}")
        print(f"Response: {response.text}")

        # Check the status
        print("\nChecking status...")
        status_url = f"{base_url}/kb-status"
        status = requests.get(status_url, timeout=60)
        print(f"Status code: {status.status_code}")
        print(f"Status: {status.text}")
    except requests.exceptions.RequestException as exc:
        print(f"Request failed: {exc}")
else:
    print("Service failed to start after maximum retries")
requirements.txt CHANGED
@@ -1,20 +1,23 @@
1
- # Основные компоненты для работы с LLM и базой знаний
2
  langchain-community
3
  langchain-core
4
  langchain-huggingface
5
  langchain-groq
6
- sentence-transformers
7
  python-dotenv
 
8
  faiss-cpu
9
  requests
10
- beautifulsoup4
11
-
12
- # Для API и логирования
13
  fastapi
14
  uvicorn[standard]
15
  pydantic
 
16
  pandas
 
 
 
 
 
 
17
 
18
- # Для LangChain логирования
19
- langgraph
20
- langchain-core[tracing]
 
1
+ streamlit
2
  langchain-community
3
  langchain-core
4
  langchain-huggingface
5
  langchain-groq
 
6
  python-dotenv
7
+ beautifulsoup4
8
  faiss-cpu
9
  requests
10
+ langgraph
11
+ langchain-anthropic
 
12
  fastapi
13
  uvicorn[standard]
14
  pydantic
15
+ python-multipart
16
  pandas
17
+ langchain
18
+ plotly
19
+
20
+
21
+
22
+
23
 
 
 
 
run.sh CHANGED
@@ -1,2 +1,5 @@
1
  #!/bin/bash
2
- uvicorn app:app --host 0.0.0.0 --port 8000 --reload
 
 
 
 
1
  #!/bin/bash
2
+
3
+ # Запуск Streamlit и FastAPI параллельно
4
+ streamlit run app.py & # Запуск чат-бота
5
+ uvicorn api.main:app --reload # Запуск API для анализа логов