Pamudu13 committed (verified)
Commit f5a443c · Parent(s): befa8d6

Update app.py

Files changed (1): app.py (+35 -19)
app.py CHANGED
@@ -127,33 +127,49 @@ def scrape_site_content(query, num_sites=5):
     scraped = 0
 
     try:
-        # First, search Google for the query
-        search_url = f"https://www.google.com/search?q={urllib.parse.quote(query)}"
+        # Use a more direct search URL format
+        search_url = f"https://www.google.com/search?q={urllib.parse.quote(query)}&num={num_sites}"
         search_response = requests.get(search_url, headers=headers, timeout=30)
         search_response.raise_for_status()
 
         # Parse the search results
         search_soup = BeautifulSoup(search_response.text, 'html.parser')
-        search_results = search_soup.find_all('div', class_='g')
 
-        # Extract URLs from search results
-        for result in search_results:
-            if scraped >= num_sites:
-                break
+        # Look for URLs in multiple possible locations
+        search_results = []
 
-            link = result.find('a')
-            if not link:
-                continue
+        # Method 1: Look for cite elements
+        for cite in search_soup.find_all('cite'):
+            url = cite.text.strip()
+            if url.startswith(('http://', 'https://')):
+                search_results.append(url)
 
-            url = link.get('href', '')
-            if not url.startswith(('http://', 'https://')):
-                continue
+        # Method 2: Look for links with specific attributes
+        for a in search_soup.find_all('a'):
+            href = a.get('href', '')
+            if 'url?q=' in href:
+                url = href.split('url?q=')[1].split('&')[0]
+                if url.startswith(('http://', 'https://')):
+                    search_results.append(urllib.parse.unquote(url))
+
+        # Remove duplicates while preserving order
+        search_results = list(dict.fromkeys(search_results))
+
+        # Process each found URL
+        for url in search_results:
+            if scraped >= num_sites:
+                break
 
             try:
                 # Get the HTML content
-                response = requests.get(url, headers=headers, timeout=30)
+                response = requests.get(url, headers=headers, timeout=10)
                 response.raise_for_status()
 
+                # Verify it's HTML content
+                content_type = response.headers.get('Content-Type', '').lower()
+                if 'text/html' not in content_type:
+                    continue
+
                 # Parse the HTML content
                 soup = BeautifulSoup(response.text, 'html.parser')
 
@@ -161,14 +177,14 @@ def scrape_site_content(query, num_sites=5):
                 for script in soup(["script", "style"]):
                     script.decompose()
 
-                # Extract text content
-                text_content = soup.get_text(separator='\n', strip=True)
+                # Extract text content (limit to first 1000 characters)
+                text_content = soup.get_text(separator='\n', strip=True)[:1000]
 
-                # Extract all links
+                # Extract all links (limit to first 10)
                 links = []
-                for link in soup.find_all('a', href=True):
+                for link in soup.find_all('a', href=True)[:10]:
                     href = link['href']
-                    if href.startswith('http'):  # Only include absolute URLs
+                    if href.startswith('http'):
                         links.append({
                             'text': link.get_text(strip=True),
                             'url': href
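
For readers who want to exercise the new lookup logic outside of app.py, below is a minimal standalone sketch of the two URL-harvesting methods and the order-preserving de-duplication this commit introduces. The extract_search_urls name and the headers value are illustrative assumptions, not part of the actual file, and Google's result markup changes frequently, so treat this as a demonstration rather than a guaranteed scraper.

import urllib.parse

import requests
from bs4 import BeautifulSoup

# Illustrative headers; app.py defines its own `headers` elsewhere.
headers = {"User-Agent": "Mozilla/5.0"}


def extract_search_urls(query, num_sites=5):
    """Hypothetical helper mirroring the commit's URL-harvesting logic."""
    search_url = f"https://www.google.com/search?q={urllib.parse.quote(query)}&num={num_sites}"
    resp = requests.get(search_url, headers=headers, timeout=30)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    results = []

    # Method 1: <cite> elements often display a result's URL as text.
    for cite in soup.find_all("cite"):
        url = cite.text.strip()
        if url.startswith(("http://", "https://")):
            results.append(url)

    # Method 2: redirect-style anchors of the form /url?q=<target>&...
    for a in soup.find_all("a"):
        href = a.get("href", "")
        if "url?q=" in href:
            url = href.split("url?q=")[1].split("&")[0]
            if url.startswith(("http://", "https://")):
                results.append(urllib.parse.unquote(url))

    # dict.fromkeys keeps insertion order (Python 3.7+), so this
    # de-duplicates without reshuffling the result ranking.
    return list(dict.fromkeys(results))


if __name__ == "__main__":
    for url in extract_search_urls("beautifulsoup tutorial")[:5]:
        print(url)

Beyond the lookup change, the commit also trims risk on the per-site fetches: the timeout drops from 30 to 10 seconds, a Content-Type check skips non-HTML responses, and the extracted text and link list are capped at 1000 characters and 10 entries respectively.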