Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,268 +1,42 @@
|
|
1 |
-
|
2 |
-
import
|
3 |
-
|
4 |
-
|
5 |
-
import
|
6 |
-
from openai import OpenAI
|
7 |
-
import time
|
8 |
-
import copy
|
9 |
|
10 |
-
#
|
11 |
-
def is_valid_url(url, base_url):
    """Return True when *url* points at the same host as *base_url*.

    Used by the crawler to stay inside the target site: only the network
    location (domain) is compared, so paths and query strings are ignored.
    """
    return urlparse(url).netloc == urlparse(base_url).netloc
|
15 |
|
16 |
-
|
17 |
-
|
18 |
-
try:
|
19 |
-
response = requests.get(url, timeout=10)
|
20 |
-
if response.status_code == 200:
|
21 |
-
soup = BeautifulSoup(response.text, 'html.parser')
|
22 |
-
|
23 |
-
# Remove script, style elements and comments
|
24 |
-
for element in soup(['script', 'style', 'header', 'footer', 'nav']):
|
25 |
-
element.decompose()
|
26 |
-
|
27 |
-
# Get text content
|
28 |
-
text = soup.get_text(separator=' ', strip=True)
|
29 |
-
|
30 |
-
# Clean up whitespace
|
31 |
-
text = re.sub(r'\s+', ' ', text).strip()
|
32 |
-
|
33 |
-
return text
|
34 |
-
else:
|
35 |
-
return None
|
36 |
-
except Exception as e:
|
37 |
-
print(f"Error scraping {url}: {e}")
|
38 |
-
return None
|
39 |
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
site_content = {}
|
46 |
-
|
47 |
-
while urls_to_visit and len(visited_urls) < max_pages:
|
48 |
-
current_url = urls_to_visit.pop(0)
|
49 |
-
|
50 |
-
if current_url in visited_urls:
|
51 |
-
continue
|
52 |
-
|
53 |
-
print(f"Crawling: {current_url}")
|
54 |
-
visited_urls.add(current_url)
|
55 |
-
|
56 |
-
try:
|
57 |
-
response = requests.get(current_url, timeout=10)
|
58 |
-
if response.status_code == 200:
|
59 |
-
# Get content of the current page
|
60 |
-
content = scrape_page(current_url)
|
61 |
-
if content:
|
62 |
-
site_content[current_url] = content
|
63 |
-
|
64 |
-
# Find all links on the page
|
65 |
-
soup = BeautifulSoup(response.text, 'html.parser')
|
66 |
-
for link in soup.find_all('a', href=True):
|
67 |
-
href = link['href']
|
68 |
-
full_url = urljoin(current_url, href)
|
69 |
-
|
70 |
-
# Only follow links that are part of the same website
|
71 |
-
if is_valid_url(full_url, base_url) and full_url not in visited_urls:
|
72 |
-
urls_to_visit.append(full_url)
|
73 |
-
|
74 |
-
# Add a small delay to be respectful
|
75 |
-
time.sleep(0.5)
|
76 |
-
|
77 |
-
except Exception as e:
|
78 |
-
print(f"Error visiting {current_url}: {e}")
|
79 |
-
|
80 |
-
print(f"Crawled {len(visited_urls)} pages and collected content from {len(site_content)} pages.")
|
81 |
-
return site_content
|
82 |
|
83 |
-
|
84 |
-
def create_context(site_content, max_context_length=8000):
    """Build a single prompt-context string from scraped page contents.

    Each page contributes its URL plus the first 1000 characters of its
    text.  Pages are added in dict order and accumulation stops before the
    context would exceed *max_context_length*.
    """
    context = "Content from https://innovativeskillsbd.com website:\n\n"

    for page_url, page_text in site_content.items():
        # URL header + truncated body, kept short so the prompt stays manageable
        snippet = f"Page: {page_url}\n{page_text[:1000]}...\n\n"

        # Stop before the running context would grow past the limit
        if len(context) + len(snippet) > max_context_length:
            break

        context += snippet

    return context
|
98 |
|
99 |
-
|
100 |
-
|
101 |
-
# Look for URLs in the text
|
102 |
-
url_pattern = r'https?://[^\s/$.?#].[^\s]*'
|
103 |
-
urls = re.findall(url_pattern, text)
|
104 |
-
|
105 |
-
for url in urls:
|
106 |
-
# If the URL contains the wrong domain but appears to be an InnovativeSkills link
|
107 |
-
if ('innovative-skill.com' in url or 'innovativeskill.com' in url) and 'innovativeskillsbd.com' not in url:
|
108 |
-
# Create the correct URL by replacing the domain
|
109 |
-
path = urlparse(url).path
|
110 |
-
correct_url = f"https://innovativeskillsbd.com{path}"
|
111 |
-
# Replace in the text
|
112 |
-
text = text.replace(url, correct_url)
|
113 |
-
|
114 |
-
return text
|
115 |
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
client = OpenAI(
|
120 |
-
base_url="https://openrouter.ai/api/v1",
|
121 |
-
api_key=api_key,
|
122 |
-
)
|
123 |
-
|
124 |
-
completion = client.chat.completions.create(
|
125 |
-
extra_headers={
|
126 |
-
"HTTP-Referer": "https://innovativeskillsbd.com",
|
127 |
-
"X-Title": "InnovativeSkills ChatBot",
|
128 |
-
},
|
129 |
-
model="deepseek/deepseek-chat-v3-0324:free",
|
130 |
-
messages=messages
|
131 |
-
)
|
132 |
-
|
133 |
-
response = completion.choices[0].message.content
|
134 |
-
|
135 |
-
# Fix any incorrect URLs - ensure all links point to the correct domain
|
136 |
-
response = fix_urls_in_text(response)
|
137 |
-
|
138 |
-
return response
|
139 |
-
except Exception as e:
|
140 |
-
return f"Error querying the model: {str(e)}"
|
141 |
|
142 |
-
|
143 |
-
|
144 |
-
if not api_key:
|
145 |
-
return "Please enter your OpenRouter API key.", history
|
146 |
-
|
147 |
-
# Prepare the context from scraped content
|
148 |
-
context = create_context(site_content)
|
149 |
-
|
150 |
-
# Create system message with context
|
151 |
-
system_message = {
|
152 |
-
"role": "system",
|
153 |
-
"content": f"""You are a helpful AI assistant for InnovativeSkills Bangladesh, a website focused on helping people learn IT skills.
|
154 |
-
Use the following content from the website to answer user questions. If the question is not related to the website or the
|
155 |
-
information is not available in the content, politely say so and try to provide general guidance related to InnovativeSkills.
|
156 |
-
|
157 |
-
IMPORTANT: When referring to any URLs related to the website, ALWAYS use the domain 'innovativeskillsbd.com' (NOT 'innovative-skill.com' or 'innovativeskill.com').
|
158 |
-
For example, use 'https://innovativeskillsbd.com/student-job-success' instead of any other domain.
|
159 |
-
|
160 |
-
{context}"""
|
161 |
-
}
|
162 |
-
|
163 |
-
# Create user message
|
164 |
-
user_message = {"role": "user", "content": question}
|
165 |
-
|
166 |
-
# Create message history for the API call
|
167 |
-
messages = [system_message]
|
168 |
-
|
169 |
-
# Add conversation history
|
170 |
-
for user_msg, assistant_msg in history:
|
171 |
-
messages.append({"role": "user", "content": user_msg})
|
172 |
-
messages.append({"role": "assistant", "content": assistant_msg})
|
173 |
-
|
174 |
-
# Add current question
|
175 |
-
messages.append(user_message)
|
176 |
-
|
177 |
-
# Query the model
|
178 |
-
response = query_model(api_key, messages)
|
179 |
-
|
180 |
-
# Update history by adding the new exchange
|
181 |
-
new_history = copy.deepcopy(history)
|
182 |
-
new_history.append((question, response))
|
183 |
-
return response, new_history
|
184 |
|
185 |
-
|
186 |
-
def init_scraper(progress=gr.Progress()):
    """Crawl the InnovativeSkills site once and return its page contents."""
    base_url = "https://innovativeskillsbd.com/"
    progress(0, desc="Starting website crawler...")
    pages = crawl_website(base_url)
    progress(1, desc="Finished crawling website")
    return pages
|
192 |
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
with gr.Row():
|
200 |
-
api_key_input = gr.Textbox(
|
201 |
-
label="OpenRouter API Key",
|
202 |
-
placeholder="Enter your OpenRouter API key",
|
203 |
-
type="password"
|
204 |
-
)
|
205 |
-
|
206 |
-
chatbot = gr.Chatbot(height=500, show_copy_button=True)
|
207 |
-
msg = gr.Textbox(label="Ask a question about InnovativeSkills Bangladesh")
|
208 |
-
|
209 |
-
# Container for site content (hidden from UI)
|
210 |
-
site_content_state = gr.State(site_content)
|
211 |
-
|
212 |
-
# Container for chat history
|
213 |
-
chat_history = gr.State([])
|
214 |
-
|
215 |
-
# Button to start the conversation
|
216 |
-
clear = gr.Button("Clear conversation")
|
217 |
-
|
218 |
-
# Events
|
219 |
-
def user_input(api_key, message, site_content, history):
    """Handle a chat submission: query the model and refresh the display.

    Returns (cleared textbox value, chatbot display pairs, updated history).
    An empty message is a no-op that leaves the display and history alone.
    """
    if not message:
        return "", chatbot, history

    # Get the model's answer plus the history including this exchange
    bot_response, updated_history = answer_question(api_key, message, site_content, history)

    # Rebuild the [user, bot] pair list the Chatbot component renders
    chatbot_display = [[user_msg, bot_msg] for user_msg, bot_msg in updated_history]

    return "", chatbot_display, updated_history
|
232 |
-
|
233 |
-
msg.submit(
|
234 |
-
user_input,
|
235 |
-
inputs=[api_key_input, msg, site_content_state, chat_history],
|
236 |
-
outputs=[msg, chatbot, chat_history]
|
237 |
-
)
|
238 |
-
|
239 |
-
def clear_chat():
    """Reset the textbox, the chat display, and the stored history."""
    return ("", [], [])
|
241 |
-
|
242 |
-
clear.click(
|
243 |
-
clear_chat,
|
244 |
-
outputs=[msg, chatbot, chat_history]
|
245 |
-
)
|
246 |
-
|
247 |
-
return app
|
248 |
-
|
249 |
-
# Initialize and launch the app
|
250 |
-
def main():
    """Scrape the site, build the Gradio interface, and launch it."""
    print("Starting to initialize the InnovativeSkills chatbot...")

    # Best-effort crawl: the chatbot still works without initial content
    site_content = {}
    try:
        site_content = crawl_website("https://innovativeskillsbd.com/")
    except Exception as e:
        print(f"Error during initial website crawling: {e}")
        print("The chatbot will still work, but without initial website content.")

    # Build the Gradio interface around whatever content we collected, then serve it
    app = create_interface(site_content)
    app.launch()
|
266 |
|
267 |
if __name__ == "__main__":
|
268 |
-
|
|
|
|
|
|
|
|
1 |
+
import os
import subprocess
import sys
import tempfile

from Crypto.Cipher import AES
from Crypto.Protocol.KDF import PBKDF2
from dotenv import load_dotenv
|
|
|
|
|
|
|
6 |
|
7 |
+
load_dotenv() # Load all environment variables
|
|
|
|
|
|
|
|
|
8 |
|
9 |
+
def unpad(data):
    """Strip PKCS#7 padding from *data* and return the payload bytes.

    The original `data[:-data[-1]]` trusted the last byte blindly: a wrong
    password or corrupt ciphertext silently produced garbage, and a final
    byte of 0 returned b"" regardless of input.  Validate the padding
    instead.

    Raises:
        ValueError: if *data* is empty or the padding is malformed
            (typically a wrong password or a corrupt file).
    """
    if not data:
        raise ValueError("Cannot unpad empty data")
    pad_len = data[-1]
    # Valid PKCS#7 for AES: 1..16 trailing copies of the pad-length byte
    if pad_len < 1 or pad_len > 16 or data[-pad_len:] != bytes([pad_len]) * pad_len:
        raise ValueError("Invalid PKCS#7 padding (wrong password or corrupt data?)")
    return data[:-pad_len]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
|
12 |
+
def decrypt_and_run():
    """Decrypt ``code.enc`` with the PASSWORD secret and execute the result.

    File layout of ``code.enc``: 16-byte PBKDF2 salt | 16-byte AES-CBC IV |
    ciphertext.  The decrypted source is written to a temporary ``.py``
    file, executed with the current interpreter, and removed afterwards.

    Fixes over the original:
      * ``subprocess.run([sys.executable, ...])`` replaces
        ``os.system(f"python {tmp.name}")`` — no shell interpretation, and
        the child uses the same interpreter as this process.
      * The child is launched after the temp file is closed (the original
        ran it while the file was still open, which fails on Windows).
      * The decrypted plaintext file is deleted in a ``finally`` block
        instead of being leaked on disk.

    Raises:
        ValueError: if the PASSWORD environment variable is not set.
    """
    # Get password from Hugging Face Secrets environment variable
    password = os.getenv("PASSWORD")
    if not password:
        raise ValueError("PASSWORD secret not found in environment variables")

    password = password.encode()

    with open("code.enc", "rb") as f:
        encrypted = f.read()

    # Split the container: salt for the KDF, IV for CBC, then the payload
    salt = encrypted[:16]
    iv = encrypted[16:32]
    ciphertext = encrypted[32:]

    # Derive the AES-256 key; the high iteration count slows brute force
    key = PBKDF2(password, salt, dkLen=32, count=1000000)
    cipher = AES.new(key, AES.MODE_CBC, iv)

    plaintext = unpad(cipher.decrypt(ciphertext))

    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile(suffix=".py", delete=False, mode='wb') as tmp:
            tmp.write(plaintext)
            tmp.flush()
            tmp_path = tmp.name
        print(f"[INFO] Running decrypted code from {tmp_path}")
        # Argument list + no shell: the temp path is never shell-parsed
        subprocess.run([sys.executable, tmp_path], check=False)
    finally:
        # Don't leave the decrypted source lying around on disk
        if tmp_path and os.path.exists(tmp_path):
            os.remove(tmp_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
|
38 |
if __name__ == "__main__":
    decrypt_and_run()

# This script decrypts the encrypted code and runs it.
# Ensure you have the PASSWORD secret set in your Hugging Face Secrets
# (Space settings -> Variables and secrets) before starting the Space.
|