Rulga committed on
Commit
1fc15a8
·
1 Parent(s): e50357d

Update README and enhance app.py with Docker support, improved logging, and URL content loading

Browse files
Files changed (2) hide show
  1. README.md +3 -3
  2. app.py +88 -26
README.md CHANGED
@@ -3,13 +3,13 @@ title: 'Doc LS Chatbot '
3
  emoji: 🔥
4
  colorFrom: yellow
5
  colorTo: yellow
6
- sdk: streamlit
7
  sdk_version: 1.42.2
8
  app_file: app.py
9
  pinned: false
10
  short_description: It is a chat built with an AI model about www.Status.law
11
  ---
12
 
13
- # LS Chatbot Log
14
 
15
- It is a chat app built using Streamlit that allows users to interact with an AI model to communicate about www.Status.law
 
3
  emoji: 🔥
4
  colorFrom: yellow
5
  colorTo: yellow
6
+ sdk: docker
7
  sdk_version: 1.42.2
8
  app_file: app.py
9
  pinned: false
10
  short_description: It is a chat built with an AI model about www.Status.law
11
  ---
12
 
13
+ # LS DOC Chatbot Log
14
 
15
+ It is a chat app built as a Hugging Face Docker Space that allows users to interact with an AI model to communicate about www.Status.law
app.py CHANGED
@@ -1,5 +1,14 @@
1
  import os
2
  import time
 
 
 
 
 
 
 
 
 
3
  from dotenv import load_dotenv
4
  from fastapi import FastAPI, HTTPException
5
  from pydantic import BaseModel
@@ -7,15 +16,15 @@ from langchain_groq import ChatGroq
7
  from langchain_huggingface import HuggingFaceEmbeddings
8
  from langchain_community.vectorstores import FAISS
9
  from langchain_text_splitters import RecursiveCharacterTextSplitter
10
- from langchain_community.document_loaders import WebBaseLoader
11
  from langchain_core.prompts import PromptTemplate
12
  from langchain_core.output_parsers import StrOutputParser
13
- from datetime import datetime
14
- import json
15
- import traceback
16
- from typing import Optional, List, Dict
17
  from langchain_core.tracers import ConsoleCallbackHandler
18
  from langchain_core.callbacks import CallbackManager
 
 
 
 
19
 
20
  # Initialize environment variables
21
  load_dotenv()
@@ -49,6 +58,21 @@ URLS = [
49
  "https://status.law/faq"
50
  ]
51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  # Enhanced logging
53
  class CustomCallbackHandler(ConsoleCallbackHandler):
54
  def on_chain_end(self, run):
@@ -66,7 +90,6 @@ class CustomCallbackHandler(ConsoleCallbackHandler):
66
  json.dump(log_entry, f, ensure_ascii=False)
67
  f.write("\n")
68
 
69
- # Initialize models
70
  def init_models():
71
  try:
72
  callback_handler = CustomCallbackHandler()
@@ -85,50 +108,88 @@ def init_models():
85
  except Exception as e:
86
  raise Exception(f"Model initialization failed: {str(e)}")
87
 
88
- # Knowledge base management
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  def build_knowledge_base(embeddings):
90
  try:
91
  documents = []
92
  os.makedirs(VECTOR_STORE_PATH, exist_ok=True)
93
 
94
- print("Starting to load documents...") # Debug log
 
 
 
 
95
 
96
- for url in URLS:
 
97
  try:
98
- print(f"Attempting to load {url}") # Debug log
99
- loader = WebBaseLoader(url)
100
- docs = loader.load()
101
- documents.extend(docs)
102
- print(f"Successfully loaded {url}") # Debug log
 
 
103
  except Exception as e:
104
- print(f"Failed to load {url}: {str(e)}")
105
- traceback.print_exc() # Print full traceback
106
  continue
107
 
108
  if not documents:
109
- raise Exception("No documents loaded!")
110
-
111
- print(f"Total documents loaded: {len(documents)}") # Debug log
112
 
 
 
113
  text_splitter = RecursiveCharacterTextSplitter(
114
  chunk_size=500,
115
  chunk_overlap=100
116
  )
117
- print("Splitting documents into chunks...") # Debug log
118
  chunks = text_splitter.split_documents(documents)
119
- print(f"Created {len(chunks)} chunks") # Debug log
120
 
121
- print("Creating vector store...") # Debug log
122
  vector_store = FAISS.from_documents(chunks, embeddings)
123
 
124
- print("Saving vector store...") # Debug log
125
  vector_store.save_local(folder_path=VECTOR_STORE_PATH, index_name="index")
126
 
127
- print("Vector store successfully created and saved") # Debug log
128
  return vector_store
129
  except Exception as e:
130
- print("Error in build_knowledge_base:") # Debug log
131
- traceback.print_exc() # Print full traceback
132
  raise Exception(f"Knowledge base creation failed: {str(e)}")
133
 
134
  # Initialize models and knowledge base on startup
@@ -148,6 +209,7 @@ if os.path.exists(VECTOR_STORE_PATH):
148
  if vector_store is None:
149
  vector_store = build_knowledge_base(embeddings)
150
 
 
151
  # API endpoints
152
  @app.post("/chat", response_model=ChatResponse)
153
  async def chat_endpoint(request: ChatRequest):
 
1
  import os
2
  import time
3
+ import sys
4
+ import json
5
+ import traceback
6
+ import warnings
7
+ from datetime import datetime
8
+ from typing import Optional, List, Dict
9
+
10
+ import requests
11
+ from bs4 import BeautifulSoup
12
  from dotenv import load_dotenv
13
  from fastapi import FastAPI, HTTPException
14
  from pydantic import BaseModel
 
16
  from langchain_huggingface import HuggingFaceEmbeddings
17
  from langchain_community.vectorstores import FAISS
18
  from langchain_text_splitters import RecursiveCharacterTextSplitter
19
+ from langchain_community.document_loaders import WebBaseLoader, BSHTMLLoader
20
  from langchain_core.prompts import PromptTemplate
21
  from langchain_core.output_parsers import StrOutputParser
 
 
 
 
22
  from langchain_core.tracers import ConsoleCallbackHandler
23
  from langchain_core.callbacks import CallbackManager
24
+ from langchain_core.documents import Document
25
+
26
+ # Ignore SSL warnings
27
+ warnings.filterwarnings('ignore')
28
 
29
  # Initialize environment variables
30
  load_dotenv()
 
58
  "https://status.law/faq"
59
  ]
60
 
61
# Check write permissions on the vector-store directory at startup so the
# app fails fast (before serving any request) when the filesystem is read-only.
try:
    # exist_ok=True avoids the check-then-create race (TOCTOU) of the
    # original `if not os.path.exists(...)` guard and is a no-op when the
    # directory already exists.
    os.makedirs(VECTOR_STORE_PATH, exist_ok=True)
    test_file_path = os.path.join(VECTOR_STORE_PATH, 'test_write.txt')
    with open(test_file_path, 'w') as f:
        f.write('test')
    os.remove(test_file_path)
    print(f"Write permissions OK for {VECTOR_STORE_PATH}")
except Exception as e:
    # Best-effort diagnostics, then abort: the service cannot work without
    # a writable vector-store path.
    print(f"WARNING: No write permissions for {VECTOR_STORE_PATH}: {str(e)}")
    print("Current working directory:", os.getcwd())
    print("User:", os.getenv('USER'))
    sys.exit(1)
75
+
76
  # Enhanced logging
77
  class CustomCallbackHandler(ConsoleCallbackHandler):
78
  def on_chain_end(self, run):
 
90
  json.dump(log_entry, f, ensure_ascii=False)
91
  f.write("\n")
92
 
 
93
  def init_models():
94
  try:
95
  callback_handler = CustomCallbackHandler()
 
108
  except Exception as e:
109
  raise Exception(f"Model initialization failed: {str(e)}")
110
 
111
def check_url_availability(url: str) -> bool:
    """Return True if *url* answers an HTTP GET with status code 200.

    NOTE(review): verify=False disables TLS certificate validation; this
    exposes the crawl to MITM -- confirm it is intentional for these hosts.
    """
    try:
        response = requests.get(url, verify=False, timeout=10)
        return response.status_code == 200
    except requests.RequestException as e:
        # Narrowed from bare Exception: only network/HTTP failures are
        # expected here; programming errors should not be silently swallowed.
        print(f"Error checking {url}: {str(e)}")
        return False
118
+
119
def load_url_content(url: str) -> List[Document]:
    """Fetch *url*, strip script/style tags, and return its visible text.

    Returns a single-element list holding one Document whose
    metadata["source"] is the URL, or an empty list on any fetch/parse
    failure so the caller can treat page loading as best-effort.

    NOTE(review): verify=False disables TLS certificate validation --
    confirm this is acceptable for the target hosts.
    """
    try:
        response = requests.get(url, verify=False, timeout=30)
        if response.status_code != 200:
            print(f"Failed to load {url}, status code: {response.status_code}")
            return []

        soup = BeautifulSoup(response.text, 'html.parser')

        # Script and style bodies are code, not page content.
        for script in soup(["script", "style"]):
            script.decompose()

        # Collapse every whitespace run (spaces, tabs, newlines) to a single
        # space. Fixes the original single-space split, which left interior
        # tabs inside a line untouched.
        text = ' '.join(soup.get_text().split())

        return [Document(page_content=text, metadata={"source": url})]
    except Exception as e:
        # Best-effort contract: any failure yields an empty result, and the
        # caller decides whether the overall crawl produced enough content.
        print(f"Error processing {url}: {str(e)}")
        return []
144
+
145
def build_knowledge_base(embeddings):
    """Crawl URLS, split the pages into chunks, and build + persist a FAISS store.

    Args:
        embeddings: embedding model passed to FAISS.from_documents().

    Returns:
        The populated FAISS vector store (also saved to VECTOR_STORE_PATH
        under index name "index").

    Raises:
        Exception: when no page could be loaded or any build step fails.
    """
    try:
        documents = []
        os.makedirs(VECTOR_STORE_PATH, exist_ok=True)

        print("Starting to load documents...")

        # Probe first so full downloads are only attempted for reachable pages.
        available_urls = [url for url in URLS if check_url_availability(url)]
        print(f"\nAccessible URLs: {len(available_urls)} out of {len(URLS)}")

        # Load content from available URLs
        for url in available_urls:
            try:
                print(f"\nProcessing {url}")
                docs = load_url_content(url)
                if docs:
                    documents.extend(docs)
                    print(f"Successfully loaded content from {url}")
                else:
                    print(f"No content extracted from {url}")
            except Exception as e:
                # A single failed URL is non-fatal; keep crawling the rest.
                print(f"Failed to process {url}: {str(e)}")
                continue

        if not documents:
            raise Exception("No documents were successfully loaded!")

        print(f"\nTotal documents loaded: {len(documents)}")

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=500,
            chunk_overlap=100
        )
        print("Splitting documents into chunks...")
        chunks = text_splitter.split_documents(documents)
        print(f"Created {len(chunks)} chunks")

        print("Creating vector store...")
        vector_store = FAISS.from_documents(chunks, embeddings)

        print("Saving vector store...")
        vector_store.save_local(folder_path=VECTOR_STORE_PATH, index_name="index")

        return vector_store
    except Exception as e:
        print(f"Error in build_knowledge_base: {str(e)}")
        traceback.print_exc()
        # Chain the original exception so the real cause survives the re-wrap
        # instead of being reduced to its string form.
        raise Exception(f"Knowledge base creation failed: {str(e)}") from e
194
 
195
  # Initialize models and knowledge base on startup
 
209
  if vector_store is None:
210
  vector_store = build_knowledge_base(embeddings)
211
 
212
+ # API endpoints
213
  # API endpoints
214
  @app.post("/chat", response_model=ChatResponse)
215
  async def chat_endpoint(request: ChatRequest):