# jeffeux's picture
# Add application file
# 21e639d
# -*- coding: utf-8 -*-
"""
Created on Fri May 29 00:38:13 2020
@author: ASUS
"""
# Import the requests module (HTTP client)
import requests
# Import the BeautifulSoup module: a tool for parsing HTML
import bs4
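# Page to crawl. The commented-out URL points at a single article;
# the active URL below is the board index page.
# URL = "https://www.ptt.cc/bbs/Gossiping/M.1590678355.A.246.html"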
URL = """https://www.ptt.cc/bbs/Gossiping/index.html"""
from urllib.parse import urlparse
def get_host(URL):
    """Return the ``scheme://netloc/`` root of *URL*.

    The result always ends with a trailing slash, e.g.
    ``https://www.ptt.cc/bbs/Gossiping/index.html`` -> ``https://www.ptt.cc/``.
    """
    parts = urlparse(URL)
    return f"{parts.scheme}://{parts.netloc}/"
def proc(ch, HOSTNAME):
    """Extract one article's metadata from a row of the board listing.

    Args:
        ch: The row element. Its descendants are looked up with the
            ``.title``, ``.author`` and ``.date`` CSS selectors.
        HOSTNAME: Base URL that the article's ``href`` is resolved against.

    Returns:
        A dict with the keys ``title``, ``link``, ``author`` and ``date``,
        or ``None`` when the row does not hold exactly one title link
        (presumably a deleted article -- TODO confirm).
    """
    # Imported locally: the module header only brings in ``urlparse``.
    from urllib.parse import urljoin

    title_cells = ch.select('.title')
    anchors = title_cells[0].select('a') if title_cells else []
    if len(anchors) != 1:
        # No (or ambiguous) title link: nothing usable in this row.
        return None
    title_a = anchors[0]

    return dict(
        title=title_a.getText(),
        # urljoin avoids the "host///path" result that plain concatenation
        # gave when HOSTNAME ends with "/" and the href starts with "/".
        link=urljoin(HOSTNAME, title_a.attrs['href']),
        author=ch.select('.author')[0].getText(),
        date=ch.select('.date')[0].getText(),
    )
def getall(URL, timeout=10):
    """Download one board index page and parse its article list.

    Args:
        URL: Address of the index page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``(articles, prev_page, next_page)`` tuple. ``articles`` is the
        list of dicts produced by ``proc`` for each ``r-ent`` row that
        precedes the first ``r-list-sep`` row. ``prev_page`` and
        ``next_page`` are absolute URLs, or ``None`` when the matching
        button is missing or disabled.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # Imported locally: the module header only brings in ``urlparse``.
    from urllib.parse import urljoin

    HOSTNAME = get_host(URL)

    # The over18 cookie presumably bypasses the board's age-confirmation
    # page -- TODO confirm.
    response = requests.get(URL, cookies={'over18': '1'}, timeout=timeout)
    response.raise_for_status()

    # Hand the page's HTML to bs4 for parsing.
    soup = bs4.BeautifulSoup(response.text, "html.parser")
    all_articles = soup.find(
        "div", class_="r-list-container action-bar-margin bbs-screen")

    mu = []
    # A page without the list container simply yields no articles.
    rows = all_articles.children if all_articles is not None else ()
    for ch in rows:
        if not isinstance(ch, bs4.element.Tag):
            continue  # skip text nodes between the rows
        classes = ch.get('class')  # None when the tag has no class at all
        if classes == ['r-list-sep']:
            # Rows after the separator are not collected.
            break
        if classes == ['r-ent']:
            output = proc(ch, HOSTNAME)
            if output:
                mu.append(output)

    prev_page, next_page = None, None
    for button in soup.select('a.btn.wide'):
        href = button.get('href')
        if 'disabled' in (button.get('class') or []) or not href:
            continue
        # urljoin avoids the "host///path" result of plain concatenation.
        target = urljoin(HOSTNAME, href)
        label = button.getText()
        if '上頁' in label:
            prev_page = target
        if '下頁' in label:
            next_page = target

    return mu, prev_page, next_page
# Crawl up to 100 index pages, starting from the newest one and walking
# backwards through the "previous page" links; all rows end up in RR.
URL = """https://www.ptt.cc/bbs/Gossiping/index.html"""
RR = []
from tqdm import tqdm
for _ in tqdm(range(100)):
    res, prev_, next_ = getall(URL)
    if res:
        # Progress aid: dates of the first and last article on this page.
        print(res[0]["date"], end='\t')
        print(res[-1]["date"])
    RR.extend(res)
    if prev_ is None:
        # No previous page to follow: stop instead of requesting None.
        break
    URL = prev_