Spaces:
Runtime error
Runtime error
# -*- coding: utf-8 -*- | |
""" | |
Created on Fri May 29 00:38:13 2020 | |
@author: ASUS | |
""" | |
# 導入 模組(module) | |
import requests | |
# 導入 BeautifulSoup 模組(module):解析HTML 語法工具 | |
import bs4 | |
# Board index page used as the default crawl target.
# A single article link looks like:
#   https://www.ptt.cc/bbs/Gossiping/M.1590678355.A.246.html
URL = "https://www.ptt.cc/bbs/Gossiping/index.html"
from urllib.parse import urlparse | |
def get_host(URL):
    """Return the ``scheme://host/`` root of *URL*.

    The result always ends with a single trailing slash; path, query and
    fragment of the input are dropped.
    """
    parts = urlparse(URL)
    return f"{parts.scheme}://{parts.netloc}/"
def proc(ch, HOSTNAME):
    """Turn one article row of a board index page into a plain dict.

    Args:
        ch: Parsed row element. It must support ``select`` with the CSS
            selectors ``.title``, ``.author`` and ``.date``.
        HOSTNAME: Base URL that the article's ``href`` is resolved against
            (with or without a trailing slash).

    Returns:
        A dict with the keys ``title``, ``link``, ``author`` and ``date``,
        or ``None`` when the title cell does not contain exactly one ``<a>``
        (presumably a deleted article -- confirm against live pages).
    """
    from urllib.parse import urljoin

    try:
        # Unpacking into a one-element list raises ValueError unless the
        # title cell holds exactly one link.
        [title_a] = ch.select('.title')[0].select('a')
    except ValueError:
        return None
    return dict(
        title=title_a.getText(),
        # urljoin avoids the "host///path" result that plain concatenation
        # produced when the base ends with "/" and the href starts with "/".
        link=urljoin(HOSTNAME, title_a.attrs['href']),
        author=ch.select('.author')[0].getText(),
        date=ch.select('.date')[0].getText(),
    )
def getall(URL):
    """Download one board index page and extract its articles and pager links.

    Args:
        URL: Address of the index page to fetch.

    Returns:
        A 3-tuple ``(articles, prev_page, next_page)``. ``articles`` is the
        list of dicts produced by ``proc`` for every ``r-ent`` row that comes
        before the first ``r-list-sep`` row; it is empty when the article
        container is not found. ``prev_page`` and ``next_page`` are absolute
        URLs, or ``None`` when the matching button is missing, disabled, or
        has no ``href``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an error status.
    """
    from urllib.parse import urljoin

    HOSTNAME = get_host(URL)
    # Cookie sent with the request; presumably it skips the board's
    # age-confirmation page -- confirm against the site.
    cookies = {'over18': '1'}
    # A timeout keeps a stalled connection from hanging the crawl forever.
    response = requests.get(URL, cookies=cookies, timeout=30)
    response.raise_for_status()
    # Hand the page HTML to BeautifulSoup for parsing.
    soup = bs4.BeautifulSoup(response.text, "html.parser")

    all_articles = soup.find("div", class_="r-list-container action-bar-margin bbs-screen")
    mu = []
    # find() returns None when the container is missing; treat that as a
    # page without articles instead of failing on ``None.children``.
    rows = all_articles.children if all_articles is not None else ()
    for ch in rows:
        if not isinstance(ch, bs4.element.Tag):
            continue  # text nodes between the rows carry no data
        classes = ch.get('class')  # None for tags without a class attribute
        if classes == ['r-list-sep']:
            # Rows after the separator are not collected (presumably pinned
            # posts -- confirm against live pages).
            break
        if classes == ['r-ent']:
            output = proc(ch, HOSTNAME)
            if output:
                mu.append(output)

    prev_page, next_page = None, None
    for button in soup.select('a.btn.wide'):
        href = button.get('href')
        if href is None or 'disabled' in button.get('class', []):
            continue
        label = button.getText()
        # Resolve against the current page so no duplicate slashes appear.
        if '上頁' in label:
            prev_page = urljoin(URL, href)
        if '下頁' in label:
            next_page = urljoin(URL, href)
    return mu, prev_page, next_page
# Crawl backwards from the board index page, following the "previous page"
# link for at most 100 pages.
URL = """https://www.ptt.cc/bbs/Gossiping/index.html"""
RR = []  # article dicts collected from every visited page
from tqdm import tqdm

for iiii in tqdm(range(100)):
    res, prev_, next_ = getall(URL)
    if res:
        # Show the date of the first and last article on this page; a page
        # with no parsed articles is skipped instead of failing on res[0].
        print(res[0]["date"], end='\t')
        print(res[-1]["date"])
    RR.extend(res)
    if prev_ is None:
        # No earlier page is available, so there is nothing left to fetch.
        break
    URL = prev_