oceansweep committed on
Commit 74c007c · verified · 1 Parent(s): 0cc941c

Delete App_Function_Libraries/Article_Extractor_Lib.py

App_Function_Libraries/Article_Extractor_Lib.py DELETED
@@ -1,377 +0,0 @@
- # Article_Extractor_Lib.py
- #########################################
- # Article Extraction Library
- # This library handles scraping and extraction of articles from web pages.
- #
- ####################
- # Function List
- #
- # 1. get_page_title(url)
- # 2. get_article_text(url)
- # 3. get_article_title(article_url_arg)
- #
- ####################
- #
- # Import necessary libraries
- import asyncio
- import logging
- import os
- import tempfile
- from datetime import datetime
- from typing import List, Dict
- from urllib.parse import urljoin, urlparse
- from xml.dom import minidom
- import xml.etree.ElementTree as ET
- # 3rd-Party Imports
- from bs4 import BeautifulSoup
- from playwright.async_api import async_playwright
- import requests
- import trafilatura
-
-
- # Import Local
- #
- #######################################################################################################################
- # Function Definitions
- #
-
- def get_page_title(url: str) -> str:
-     try:
-         response = requests.get(url)
-         response.raise_for_status()
-         soup = BeautifulSoup(response.text, 'html.parser')
-         title_tag = soup.find('title')
-         return title_tag.string.strip() if title_tag else "Untitled"
-     except requests.RequestException as e:
-         logging.error(f"Error fetching page title: {e}")
-         return "Untitled"
-
-
- def scrape_article(url):
-     async def fetch_html(url: str) -> str:
-         async with async_playwright() as p:
-             browser = await p.chromium.launch(headless=True)
-             context = await browser.new_context(
-                 user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3")
-             page = await context.new_page()
-             await page.goto(url)
-             await page.wait_for_load_state("networkidle")  # Wait for the network to be idle
-             content = await page.content()
-             await browser.close()
-             return content
-
-     # FIXME - Add option for extracting comments/tables/images
-     def extract_article_data(html: str, url: str) -> dict:
-         downloaded = trafilatura.extract(html, include_comments=False, include_tables=False, include_images=False)
-         metadata = trafilatura.extract_metadata(html)
-
-         result = {
-             'title': 'N/A',
-             'author': 'N/A',
-             'content': '',
-             'date': 'N/A',
-             'url': url,
-             'extraction_successful': False
-         }
-
-         if downloaded:
-             result['content'] = downloaded
-             result['extraction_successful'] = True
-
-         if metadata:
-             result.update({
-                 'title': metadata.title if metadata.title else 'N/A',
-                 'author': metadata.author if metadata.author else 'N/A',
-                 'date': metadata.date if metadata.date else 'N/A'
-             })
-         else:
-             logging.warning("Metadata extraction failed.")
-
-         if not downloaded:
-             logging.warning("Content extraction failed.")
-
-         return result
-
-     def convert_html_to_markdown(html: str) -> str:
-         soup = BeautifulSoup(html, 'html.parser')
-         for para in soup.find_all('p'):
-             # Add a newline at the end of each paragraph for markdown separation
-             para.append('\n')
-         # Use .get_text() with separator to keep paragraph separation
-         return soup.get_text(separator='\n\n')
-
-     async def fetch_and_extract_article(url: str):
-         html = await fetch_html(url)
-         article_data = extract_article_data(html, url)
-         if article_data['extraction_successful']:
-             article_data['content'] = convert_html_to_markdown(article_data['content'])
-         return article_data
-
-     return asyncio.run(fetch_and_extract_article(url))
-
-
- def collect_internal_links(base_url: str) -> set:
-     visited = set()
-     to_visit = {base_url}
-
-     while to_visit:
-         current_url = to_visit.pop()
-         if current_url in visited:
-             continue
-
-         try:
-             response = requests.get(current_url)
-             response.raise_for_status()
-             soup = BeautifulSoup(response.text, 'html.parser')
-
-             # Collect internal links
-             for link in soup.find_all('a', href=True):
-                 full_url = urljoin(base_url, link['href'])
-                 # Only process links within the same domain
-                 if urlparse(full_url).netloc == urlparse(base_url).netloc:
-                     if full_url not in visited:
-                         to_visit.add(full_url)
-
-             visited.add(current_url)
-         except requests.RequestException as e:
-             logging.error(f"Error visiting {current_url}: {e}")
-             continue
-
-     return visited
-
-
- def generate_temp_sitemap_from_links(links: set) -> str:
-     """
-     Generate a temporary sitemap file from collected links and return its path.
-
-     :param links: A set of URLs to include in the sitemap
-     :return: Path to the temporary sitemap file
-     """
-     # Create the root element
-     urlset = ET.Element("urlset")
-     urlset.set("xmlns", "http://www.sitemaps.org/schemas/sitemap/0.9")
-
-     # Add each link to the sitemap
-     for link in links:
-         url = ET.SubElement(urlset, "url")
-         loc = ET.SubElement(url, "loc")
-         loc.text = link
-         lastmod = ET.SubElement(url, "lastmod")
-         lastmod.text = datetime.now().strftime("%Y-%m-%d")
-         changefreq = ET.SubElement(url, "changefreq")
-         changefreq.text = "daily"
-         priority = ET.SubElement(url, "priority")
-         priority.text = "0.5"
-
-     # Create the tree and get it as a string
-     xml_string = ET.tostring(urlset, 'utf-8')
-
-     # Pretty print the XML
-     pretty_xml = minidom.parseString(xml_string).toprettyxml(indent=" ")
-
-     # Create a temporary file
-     with tempfile.NamedTemporaryFile(mode="w", suffix=".xml", delete=False) as temp_file:
-         temp_file.write(pretty_xml)
-         temp_file_path = temp_file.name
-
-     logging.info(f"Temporary sitemap created at: {temp_file_path}")
-     return temp_file_path
-
-
- def generate_sitemap_for_url(url: str) -> List[Dict[str, str]]:
-     """
-     Generate a sitemap for the given URL using the create_filtered_sitemap function.
-
-     Args:
-         url (str): The base URL to generate the sitemap for
-
-     Returns:
-         List[Dict[str, str]]: A list of dictionaries, each containing 'url' and 'title' keys
-     """
-     with tempfile.NamedTemporaryFile(mode="w+", suffix=".xml", delete=False) as temp_file:
-         create_filtered_sitemap(url, temp_file.name, is_content_page)
-         temp_file.seek(0)
-         tree = ET.parse(temp_file.name)
-         root = tree.getroot()
-
-         sitemap = []
-         for url_elem in root.findall(".//{http://www.sitemaps.org/schemas/sitemap/0.9}url"):
-             loc = url_elem.find("{http://www.sitemaps.org/schemas/sitemap/0.9}loc").text
-             sitemap.append({"url": loc, "title": loc.split("/")[-1] or url})  # Use the last part of the URL as a title
-
-     return sitemap
-
- def scrape_entire_site(base_url: str) -> List[Dict]:
-     """
-     Scrape the entire site by generating a temporary sitemap and extracting content from each page.
-
-     :param base_url: The base URL of the site to scrape
-     :return: A list of dictionaries containing scraped article data
-     """
-     # Step 1: Collect internal links from the site
-     links = collect_internal_links(base_url)
-     logging.info(f"Collected {len(links)} internal links.")
-
-     # Step 2: Generate the temporary sitemap
-     temp_sitemap_path = generate_temp_sitemap_from_links(links)
-
-     # Step 3: Scrape each URL in the sitemap
-     scraped_articles = []
-     try:
-         for link in links:
-             logging.info(f"Scraping {link} ...")
-             article_data = scrape_article(link)
-
-             if article_data:
-                 logging.info(f"Title: {article_data['title']}")
-                 logging.info(f"Author: {article_data['author']}")
-                 logging.info(f"Date: {article_data['date']}")
-                 logging.info(f"Content: {article_data['content'][:500]}...")
-
-                 scraped_articles.append(article_data)
-     finally:
-         # Clean up the temporary sitemap file
-         os.unlink(temp_sitemap_path)
-         logging.info("Temporary sitemap file deleted")
-
-     return scraped_articles
-
-
- def scrape_by_url_level(base_url: str, level: int) -> list:
-     """Scrape articles from URLs up to a certain level under the base URL."""
-
-     def get_url_level(url: str) -> int:
-         return len(urlparse(url).path.strip('/').split('/'))
-
-     links = collect_internal_links(base_url)
-     filtered_links = [link for link in links if get_url_level(link) <= level]
-
-     return [article for link in filtered_links if (article := scrape_article(link))]
-
-
- def scrape_from_sitemap(sitemap_url: str) -> list:
-     """Scrape articles from a sitemap URL."""
-     try:
-         response = requests.get(sitemap_url)
-         response.raise_for_status()
-         root = ET.fromstring(response.content)
-
-         return [article for url in root.findall('.//{http://www.sitemaps.org/schemas/sitemap/0.9}loc')
-                 if (article := scrape_article(url.text))]
-     except requests.RequestException as e:
-         logging.error(f"Error fetching sitemap: {e}")
-         return []
-
-
- def convert_to_markdown(articles: list) -> str:
-     """Convert a list of article data into a single markdown document."""
-     markdown = ""
-     for article in articles:
-         markdown += f"# {article['title']}\n\n"
-         markdown += f"Author: {article['author']}\n"
-         markdown += f"Date: {article['date']}\n\n"
-         markdown += f"{article['content']}\n\n"
-         markdown += "---\n\n"  # Separator between articles
-     return markdown
-
-
- def is_content_page(url: str) -> bool:
-     """
-     Determine if a URL is likely to be a content page.
-     This is a basic implementation and may need to be adjusted based on the specific website structure.
-
-     :param url: The URL to check
-     :return: True if the URL is likely a content page, False otherwise
-     """
-     # Add more specific checks here based on the website's structure
-     # Exclude common non-content pages
-     exclude_patterns = [
-         '/tag/', '/category/', '/author/', '/search/', '/page/',
-         'wp-content', 'wp-includes', 'wp-json', 'wp-admin',
-         'login', 'register', 'cart', 'checkout', 'account',
-         '.jpg', '.png', '.gif', '.pdf', '.zip'
-     ]
-     return not any(pattern in url.lower() for pattern in exclude_patterns)
-
-
- def create_filtered_sitemap(base_url: str, output_file: str, filter_function):
-     """
-     Create a sitemap from internal links and filter them based on a custom function.
-
-     :param base_url: The base URL of the website
-     :param output_file: The file to save the sitemap to
-     :param filter_function: A function that takes a URL and returns True if it should be included
-     """
-     links = collect_internal_links(base_url)
-     filtered_links = set(filter(filter_function, links))
-
-     root = ET.Element("urlset")
-     root.set("xmlns", "http://www.sitemaps.org/schemas/sitemap/0.9")
-
-     for link in filtered_links:
-         url = ET.SubElement(root, "url")
-         loc = ET.SubElement(url, "loc")
-         loc.text = link
-
-     tree = ET.ElementTree(root)
-     tree.write(output_file, encoding='utf-8', xml_declaration=True)
-     print(f"Filtered sitemap saved to {output_file}")
-
-
- def scrape_from_filtered_sitemap(sitemap_file: str, filter_function) -> list:
-     """
-     Scrape articles from a sitemap file, applying an additional filter function.
-
-     :param sitemap_file: Path to the sitemap file
-     :param filter_function: A function that takes a URL and returns True if it should be scraped
-     :return: List of scraped articles
-     """
-     try:
-         tree = ET.parse(sitemap_file)
-         root = tree.getroot()
-
-         articles = []
-         for url in root.findall('.//{http://www.sitemaps.org/schemas/sitemap/0.9}loc'):
-             if filter_function(url.text):
-                 article_data = scrape_article(url.text)
-                 if article_data:
-                     articles.append(article_data)
-
-         return articles
-     except ET.ParseError as e:
-         logging.error(f"Error parsing sitemap: {e}")
-         return []
-
-
- def scrape_and_convert_with_filter(source: str, output_file: str, filter_function=is_content_page, level: int = None):
-     """
-     Scrape articles from a sitemap or by URL level, apply filtering, and convert to a single markdown file.
-
-     :param source: URL of the sitemap, base URL for level-based scraping, or path to a local sitemap file
-     :param output_file: Path to save the output markdown file
-     :param filter_function: Function to filter URLs (default is is_content_page)
-     :param level: URL level for scraping (None if using sitemap)
-     """
-     if level is not None:
-         # Scraping by URL level
-         articles = scrape_by_url_level(source, level)
-         articles = [article for article in articles if filter_function(article['url'])]
-     elif source.startswith('http'):
-         # Scraping from online sitemap
-         articles = scrape_from_sitemap(source)
-         articles = [article for article in articles if filter_function(article['url'])]
-     else:
-         # Scraping from local sitemap file
-         articles = scrape_from_filtered_sitemap(source, filter_function)
-
-     articles = [article for article in articles if filter_function(article['url'])]
-     markdown_content = convert_to_markdown(articles)
-
-     with open(output_file, 'w', encoding='utf-8') as f:
-         f.write(markdown_content)
-
-     logging.info(f"Scraped and filtered content saved to {output_file}")
-
- #
- #
- #######################################################################################################################
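For reference, a minimal usage sketch of the library this commit removes. The import path assumes the file were still present at App_Function_Libraries/Article_Extractor_Lib.py, and the URLs and output filenames are placeholders, not taken from the repository. scrape_article drives a headless Playwright fetch plus trafilatura extraction for a single page, while scrape_and_convert_with_filter crawls a site (by URL depth or via a sitemap) and writes one combined markdown file.

# Hypothetical usage sketch; module path, URLs, and filenames are placeholders.
from App_Function_Libraries.Article_Extractor_Lib import (
    scrape_article,
    scrape_and_convert_with_filter,
)

# Single page: returns a dict with title/author/date/content and a success flag.
article = scrape_article("https://example.com/blog/some-post")
if article and article["extraction_successful"]:
    print(article["title"], article["author"], article["date"])

# Whole site: crawl up to two path levels below the base URL, write one markdown file.
scrape_and_convert_with_filter("https://example.com", "site_articles.md", level=2)

# Or read an existing online sitemap instead of crawling by level.
scrape_and_convert_with_filter("https://example.com/sitemap.xml", "site_from_sitemap.md")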