Update web_scraper.py

web_scraper.py (+14 -10)
```diff
@@ -11,8 +11,12 @@ from io import BytesIO
 from googlesearch import search
 import json
 import logging
+import queue
 
-
+# Create a logging filter to suppress socket warnings
+class SocketWarningFilter(logging.Filter):
+    def filter(self, record):
+        return not (record.levelname == 'WARNING' and 'socket.send()' in record.getMessage())
 
 # Create a queue for log messages
 log_queue = queue.Queue()
```
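The new `SocketWarningFilter` drops any `WARNING` record whose message contains `socket.send()`. A minimal standalone sketch of the mechanism (the demo logger and messages are illustrative, not from the commit):

```python
import logging

class SocketWarningFilter(logging.Filter):
    def filter(self, record):
        # Returning False drops the record; True lets it through
        return not (record.levelname == 'WARNING' and 'socket.send()' in record.getMessage())

logger = logging.getLogger("demo")
handler = logging.StreamHandler()
handler.addFilter(SocketWarningFilter())
logger.addHandler(handler)

logger.warning("socket.send() raised exception")  # suppressed
logger.warning("disk space low")                  # emitted
```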
```diff
@@ -23,7 +27,7 @@ class QueueHandler(logging.Handler):
         log_entry = self.format(record)
         log_queue.put(log_entry)
 
-
+# Set up logging with the custom handler
 logger = logging.getLogger()
 queue_handler = QueueHandler()
 queue_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
```
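Only the body of `QueueHandler.emit` is visible in the diff. Reassembled from the visible lines, the handler presumably looks like the sketch below; `emit(self, record)` is the standard `logging.Handler` hook, so only that signature line is inferred:

```python
import logging
import queue

log_queue = queue.Queue()

class QueueHandler(logging.Handler):
    def emit(self, record):
        # Format the record and push it onto the shared queue
        log_entry = self.format(record)
        log_queue.put(log_entry)

logger = logging.getLogger()
queue_handler = QueueHandler()
queue_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
logger.addHandler(queue_handler)
logger.setLevel(logging.INFO)

logger.info("hello")
print(log_queue.get_nowait())  # e.g. '2024-01-01 12:00:00,000 - INFO - hello'
```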
```diff
@@ -31,6 +35,11 @@ queue_handler.addFilter(SocketWarningFilter()) # Add the filter to the handler
 logger.addHandler(queue_handler)
 logger.setLevel(logging.INFO)
 
+# Also add the filter to the root logger to catch all socket warnings
+logging.getLogger().addFilter(SocketWarningFilter())
+
+app = Flask(__name__)
+
 def search_images(query, num_images=5):
     # Headers to mimic a browser request
     headers = {
```
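This hunk attaches the filter to the root logger as well and creates the Flask app only after logging is configured. The commit shows no routes, but a queue of formatted log lines is typically drained by an endpoint so the browser can poll scraping progress; a purely hypothetical sketch of that pattern:

```python
from flask import Flask, jsonify
import queue

app = Flask(__name__)
log_queue = queue.Queue()

# Hypothetical polling endpoint; not part of this commit
@app.route('/logs')
def logs():
    # Drain whatever has accumulated since the last poll
    entries = []
    while True:
        try:
            entries.append(log_queue.get_nowait())
        except queue.Empty:
            break
    return jsonify(entries)
```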
```diff
@@ -169,7 +178,7 @@ def scrape_site_content(query, num_sites=5):
     for attempt in range(retries):
         try:
             # Get the HTML content
-
+            logger.info(f"Trying {url} (attempt {attempt + 1}/{retries})")
             logger.info(f"Scraping URL: {url}")
             response = requests.get(
                 url,
```
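The added line logs the attempt counter before each fetch. Only fragments of the retry loop are visible in the diff; the surrounding pattern is presumably something like this sketch (the function name, timeout, and exception handling are assumptions):

```python
import logging
import requests

logger = logging.getLogger(__name__)

def fetch_with_retries(url, retries=3):
    # Assumed shape of the loop around the two logger.info calls shown above
    for attempt in range(retries):
        try:
            logger.info(f"Trying {url} (attempt {attempt + 1}/{retries})")
            logger.info(f"Scraping URL: {url}")
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            return response
        except requests.RequestException as exc:
            logger.warning(f"Attempt {attempt + 1} failed for {url}: {exc}")
    return None
```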
```diff
@@ -182,7 +191,7 @@ def scrape_site_content(query, num_sites=5):
             # Verify it's HTML content
             content_type = response.headers.get('Content-Type', '').lower()
             if 'text/html' not in content_type:
-
+                logger.info(f"Skipping {url} - not HTML content")
                 break
 
             # Parse the HTML content
```
```diff
@@ -197,7 +206,7 @@ def scrape_site_content(query, num_sites=5):
 
             # Skip if not enough content
             if len(text_content.split()) < 100: # Skip if less than 100 words
-
+                logger.info(f"Skipping {url} - not enough content")
                 break
 
             # Extract all links (limit to first 10)
```
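These two hunks make the skip paths visible in the log instead of breaking out silently. Pulled together, the guards presumably sit in the fetch loop roughly as below; the `BeautifulSoup` extraction is an assumption, since the diff shows only the `# Parse the HTML content` comment:

```python
import logging
from bs4 import BeautifulSoup  # assumed parser; not shown in the diff

logger = logging.getLogger(__name__)

def extract_text(url, response):
    # Guard 1: only parse HTML responses
    content_type = response.headers.get('Content-Type', '').lower()
    if 'text/html' not in content_type:
        logger.info(f"Skipping {url} - not HTML content")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    text_content = soup.get_text(separator=' ', strip=True)

    # Guard 2: skip pages with fewer than 100 words of text
    if len(text_content.split()) < 100:
        logger.info(f"Skipping {url} - not enough content")
        return None

    return text_content
```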
```diff
@@ -301,21 +310,16 @@ def analyze_with_gpt(scraped_content, research_query, openrouter_key):
 
     # Prepare the prompt
     prompt = f"""You are a research assistant analyzing web content to provide comprehensive research.
-
 Research Query: {research_query}
-
 Below is content scraped from various web sources. Analyze this content and provide a detailed, well-structured research response.
 Make sure to cite sources when making specific claims.
-
 Scraped Content:
 {json.dumps(scraped_content, indent=2)}
-
 Please provide:
 1. A comprehensive analysis of the topic
 2. Key findings and insights
 3. Supporting evidence from the sources
 4. Any additional considerations or caveats
-
 Format your response in markdown with proper headings and citations."""
 
     response = requests.post(
```
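This hunk only tightens the prompt by removing blank separator lines; the `requests.post(` call itself is cut off at the hunk boundary. Given the `openrouter_key` parameter, it presumably targets OpenRouter's OpenAI-compatible chat completions endpoint, roughly as in this sketch (the helper name and model are placeholders, not taken from the file):

```python
import requests

def call_openrouter(prompt, openrouter_key):
    # Sketch of the truncated call; endpoint shape follows OpenRouter's
    # OpenAI-compatible API. The model name is a placeholder.
    response = requests.post(
        "https://openrouter.ai/api/v1/chat/completions",
        headers={
            "Authorization": f"Bearer {openrouter_key}",
            "Content-Type": "application/json",
        },
        json={
            "model": "openai/gpt-4o",
            "messages": [{"role": "user", "content": prompt}],
        },
        timeout=60,
    )
    response.raise_for_status()
    return response.json()["choices"][0]["message"]["content"]
```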