# -*- coding: utf-8 -*-
"""
Created on Fri May 29 00:38:13 2020

@author: ASUS

Scrape article listings (title, link, author, date) from a PTT board index
page, then walk backwards through older index pages via the "previous page"
button and collect every entry into ``RR``.
"""

from urllib.parse import urljoin, urlparse

import bs4  # BeautifulSoup: HTML parsing toolkit
import requests
from tqdm import tqdm

# Board index page the crawl starts from.
# (A single article URL looks like
#  https://www.ptt.cc/bbs/Gossiping/M.1590678355.A.246.html)
URL = """https://www.ptt.cc/bbs/Gossiping/index.html"""

# Seconds to wait for the server before giving up on a request, so a stalled
# connection cannot hang the crawl forever.
REQUEST_TIMEOUT = 10


def get_host(URL):
    """Return the ``scheme://netloc/`` prefix of *URL* (with trailing slash)."""
    parsed_uri = urlparse(URL)
    result = "{uri.scheme}://{uri.netloc}/".format(uri=parsed_uri)
    return result


def _text_of(tag, selector):
    """Return the text of the first match of *selector* in *tag*, or ''."""
    node = tag.select_one(selector)
    return node.getText() if node is not None else ''


def proc(ch, HOSTNAME):
    """Extract one article entry from an ``r-ent`` element.

    Parameters:
        ch: the bs4 tag of a single list entry.
        HOSTNAME: base URL (as returned by ``get_host``) that the entry's
            href is resolved against.

    Returns:
        A dict with keys ``title``, ``link``, ``author`` and ``date``, or
        ``None`` when the entry's ``.title`` element does not contain exactly
        one usable ``<a>`` link.
    """
    title_div = ch.select_one('.title')
    if title_div is None:
        return None

    anchors = title_div.select('a')
    if len(anchors) != 1:
        return None
    title_a = anchors[0]

    href = title_a.get('href')
    if not href:
        return None

    return dict(
        title=title_a.getText(),
        # urljoin avoids the doubled slashes that plain concatenation of
        # "https://host/" + "/" + "/bbs/..." used to produce.
        link=urljoin(HOSTNAME, href),
        author=_text_of(ch, '.author'),
        date=_text_of(ch, '.date'),
    )


def _page_link(button, HOSTNAME):
    """Return the absolute URL behind a paging button, or None if disabled."""
    if 'disabled' in (button.get('class') or []):
        return None
    href = button.get('href')
    return urljoin(HOSTNAME, href) if href else None


def getall(URL):
    """Fetch one board index page and parse its article list.

    Parameters:
        URL: address of the index page to download.

    Returns:
        A tuple ``(articles, prev_page, next_page)`` where ``articles`` is a
        list of dicts produced by ``proc`` and the two page values are
        absolute URLs, or ``None`` when the matching button is missing or
        disabled.

    Raises:
        requests.RequestException: on network failure, timeout or a non-2xx
            HTTP status.
        RuntimeError: when the page contains no article list container.
    """
    HOSTNAME = get_host(URL)

    # The over18 cookie is sent with the request (presumably the board's
    # age-confirmation flag -- confirm against the site if this changes).
    cookies = {'over18': '1'}

    # Send the GET request for the index page.
    response = requests.get(URL, cookies=cookies, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()

    # Hand the page HTML to bs4 for parsing.
    soup = bs4.BeautifulSoup(response.text, "html.parser")

    # Match on the one identifying class rather than the exact multi-class
    # string, so extra or reordered classes do not break the lookup.
    all_articles = soup.select_one("div.r-list-container")
    if all_articles is None:
        raise RuntimeError(
            "no article list container found in page: {}".format(URL)
        )

    mu = []
    for ch in all_articles.children:
        if not isinstance(ch, bs4.element.Tag):
            continue
        classes = ch.get('class') or []
        if classes == ['r-list-sep']:
            # Entries after the separator are deliberately not collected.
            break
        if classes == ['r-ent']:
            output = proc(ch, HOSTNAME)
            if output:
                mu.append(output)

    prev_page, next_page = None, None
    for button in soup.select('a.btn.wide'):
        label = button.getText()
        if '上頁' in label:
            prev_page = _page_link(button, HOSTNAME)
        if '下頁' in label:
            next_page = _page_link(button, HOSTNAME)

    return mu, prev_page, next_page


URL = """https://www.ptt.cc/bbs/Gossiping/index.html"""
RR = []

for iiii in tqdm(range(100)):
    res, prev_, next_ = getall(URL)
    if res:
        # Show the date span covered by this page as progress feedback.
        print(res[0]["date"], end='\t')
        print(res[-1]["date"])
    RR.extend(res)
    if prev_ is None:
        # No older page to visit; stop instead of requesting a None URL.
        break
    URL = prev_