joshuadunlop committed
Commit c14ff59 · verified · 1 Parent(s): 5ed1472

Update app.py

Files changed (1)
  1. app.py +45 -12
app.py CHANGED
@@ -8,6 +8,7 @@ import requests
 import time
 from bs4 import BeautifulSoup
 import re
+import concurrent.futures
 
 def get_seo_powersuite_data(domains, api_key):
     url_domain_inlink_rank = "https://api.seopowersuite.com/backlinks/v1.0/get-domain-inlink-rank"
@@ -89,11 +90,14 @@ def find_sitemap(url):
         if robots_response.status_code == 200:
             for line in robots_response.text.split("\n"):
                 if line.startswith("Sitemap:"):
-                    return line.split(":", 1)[1].strip()
+                    sitemap_url = line.split(":", 1)[1].strip()
+                    if "post" in sitemap_url.lower() or "blog" in sitemap_url.lower():
+                        return sitemap_url
     except requests.exceptions.RequestException:
         pass
 
     sitemap_urls = [
+        "/post-sitemap.xml", "/blog-sitemap.xml", "/sitemap-posts.xml",
        "/sitemap.xml", "/wp-sitemap.xml", "/?sitemap=1", "/sitemap_index/xml",
        "/sitemap-index.xml", "/sitemap.php", "/sitemap.txt", "/sitemap.xml.gz",
        "/sitemap/", "/sitemap/sitemap.xml", "/sitemapindex.xml", "/sitemap/index.xml", "/sitemap1.xml"
@@ -109,12 +113,14 @@ def find_sitemap(url):
 
     return None
 
-def crawl_posts(df, page_count):
+def crawl_posts(df, page_count, url, concurrent_scrapes):
     crawl_results = []
-    for i, row in df.head(page_count).iterrows():
-        url = row['loc']
+    crawl_status = st.empty()
+
+    def crawl_page(row):
+        page_url = row['loc']
         try:
-            response = requests.get(url)
+            response = requests.get(page_url)
             if response.status_code == 200:
                 html = response.text
                 soup = BeautifulSoup(html, 'html.parser')
@@ -126,14 +132,29 @@ def crawl_posts(df, page_count):
                     link_text = a.text.strip()
                     link_nofollow = 'nofollow' in a.get('rel', [])
                     links.append({'url': link_url, 'text': link_text, 'nofollow': link_nofollow})
-                crawl_results.append({
-                    'url': url,
+                return {
+                    'url': page_url,  # Use page_url instead of url
                     'title': title,
                     'meta_desc': meta_desc,
                     'links': links
-                })
+                }
         except requests.exceptions.RequestException:
-            pass
+            return None
+
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        futures = []
+        for i in range(0, page_count, concurrent_scrapes):
+            batch_df = df.iloc[i:i+concurrent_scrapes]
+            batch_futures = [executor.submit(crawl_page, row) for _, row in batch_df.iterrows()]
+            futures.extend(batch_futures)
+
+        for i, future in enumerate(concurrent.futures.as_completed(futures)):
+            result = future.result()
+            if result is not None:
+                crawl_results.append(result)
+            crawl_status.text(f"Crawling {url} - Page {i+1}/{page_count}")
+
+    crawl_status.empty()
     return pd.DataFrame(crawl_results)
 
 def download_csv(df, filename):
@@ -146,7 +167,8 @@ def main():
     st.title("Website Crawler")
 
     urls = st.text_area("Enter the website URLs (one per line):", value="")
-    page_count = st.number_input("Enter the number of pages to crawl:", value=2000, min_value=1, step=1)
+    page_count = st.number_input("Enter the number of pages to crawl:", value=1000, min_value=1, step=1)
+    concurrent_scrapes = st.number_input("Enter the number of concurrent scrapes:", value=20, min_value=1, step=1)
 
     col1, col2 = st.columns(2)
     with col1:
@@ -167,16 +189,21 @@ def main():
     all_unique_outbound_links_df = pd.DataFrame()
     all_final_df = pd.DataFrame()
     all_analysis_df = pd.DataFrame()
-
+    all_crawled_pages_df = pd.DataFrame()
+    #
     for url in url_list:
         with st.spinner(f"Finding sitemap for {url}..."):
             sitemap_url = find_sitemap(url)
         if sitemap_url:
             with st.spinner(f"Crawling {url}..."):
                 sitemap_df = adv.sitemap_to_df(sitemap_url)
-                crawl_results = crawl_posts(sitemap_df, page_count)
+                sitemap_df = sitemap_df.sort_values(by="lastmod", ascending=False)  # Sort by lastmod in descending order
+                crawl_results = crawl_posts(sitemap_df, page_count, url, concurrent_scrapes)
 
             if not crawl_results.empty:
+                crawled_pages_df = pd.DataFrame({'Originating Domain': url, 'Crawled Page': crawl_results['url']})
+                all_crawled_pages_df = pd.concat([all_crawled_pages_df, crawled_pages_df], ignore_index=True)
+
                link_df = pd.DataFrame(crawl_results['links'].explode().tolist())
                link_df = link_df[~link_df['url'].str.startswith(('/','#'))]
                link_df['internal'] = link_df['url'].apply(lambda x: extract_hostname(url) in extract_hostname(x))
@@ -276,6 +303,12 @@ def main():
        else:
            st.warning(f"Sitemap not found for {url}.")
 
+    st.subheader("Crawled Pages")
+    if download_links:
+        st.markdown(download_csv(all_crawled_pages_df, "Crawled Pages"), unsafe_allow_html=True)
+    else:
+        st.write(all_crawled_pages_df)
+
     st.subheader("Outbound Links")
     if download_links:
         st.markdown(download_csv(all_link_df, "Outbound Links"), unsafe_allow_html=True)
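
The substantive change in this commit is the switch from a sequential requests loop in crawl_posts to batched submission on a concurrent.futures.ThreadPoolExecutor. The standalone sketch below reproduces the same submit-in-batches / as_completed pattern outside of Streamlit; fetch_page and the example URLs are illustrative stand-ins, not part of app.py.

import concurrent.futures
import requests

def fetch_page(page_url):
    # Hypothetical stand-in for crawl_page: fetch one URL, return (url, status) or None on error.
    try:
        response = requests.get(page_url, timeout=10)
        return page_url, response.status_code
    except requests.exceptions.RequestException:
        return None

def crawl_concurrently(urls, page_count, concurrent_scrapes):
    results = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = []
        # Submit work in slices of `concurrent_scrapes`, mirroring the df.iloc batching in crawl_posts.
        for i in range(0, min(page_count, len(urls)), concurrent_scrapes):
            batch = urls[i:i + concurrent_scrapes]
            futures.extend(executor.submit(fetch_page, u) for u in batch)
        # Drain results as they finish; failed requests come back as None and are skipped.
        for future in concurrent.futures.as_completed(futures):
            result = future.result()
            if result is not None:
                results.append(result)
    return results

if __name__ == "__main__":
    pages = ["https://example.com/", "https://example.com/about"]
    print(crawl_concurrently(pages, page_count=2, concurrent_scrapes=20))

Because every batch is submitted before any result is consumed, effective parallelism is capped by the executor's default max_workers rather than by concurrent_scrapes; passing max_workers=concurrent_scrapes to ThreadPoolExecutor would make that limit explicit.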