# -*- coding: utf-8 -*-
"""
Created on Fri May 29 00:38:13 2020
@author: ASUS
"""


# Import the requests module, used to send HTTP requests
import requests
# Import the bs4 module: BeautifulSoup, an HTML parsing tool
import bs4
# urlparse is used below to pull the scheme and host out of a URL
from urllib.parse import urlparse

# Single-article link, kept for reference
# URL = "https://www.ptt.cc/bbs/Gossiping/M.1590678355.A.246.html"
# Index page of the PTT Gossiping board
URL = "https://www.ptt.cc/bbs/Gossiping/index.html"


def get_host(URL):
    """Return the scheme and host of URL, e.g. 'https://www.ptt.cc/'."""
    parsed_uri = urlparse(URL)
    result = "{uri.scheme}://{uri.netloc}/".format(uri=parsed_uri)
    return result
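
# Quick illustration of what get_host() produces for the board URL used above;
# this is purely string parsing, no network access involved.
assert get_host("https://www.ptt.cc/bbs/Gossiping/index.html") == "https://www.ptt.cc/"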
    

def proc(ch, HOSTNAME):
    """Extract title, link, author, and date from one .r-ent article entry."""
    try:
        # Entries without exactly one <a> tag (e.g. deleted articles) raise ValueError
        [title_a] = ch.select('.title')[0].select('a')
    except ValueError:
        return None

    return dict(
        title=title_a.getText(),
        # href already starts with '/', so drop the trailing '/' from HOSTNAME
        link=HOSTNAME.rstrip('/') + title_a.attrs['href'],
        author=ch.select('.author')[0].getText(),
        date=ch.select('.date')[0].getText(),
    )
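
# Each dict produced by proc() looks roughly like this (values are illustrative,
# not taken from a real page):
#   {'title': '[問卦] some title', 'author': 'someuser', 'date': ' 5/29',
#    'link': 'https://www.ptt.cc/bbs/Gossiping/M.1590678355.A.246.html'}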


def getall(URL):
    """Scrape one index page; return (articles, prev_page_url, next_page_url)."""
    HOSTNAME = get_host(URL)

    # Set the cookie that answers the "over 18?" prompt on the Gossiping board
    # my_headers = {'cookie': 'over18=1;'}
    cookies = {'over18': '1'}

    # Send a GET request to the PTT Gossiping board
    response = requests.get(URL, cookies=cookies)
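
    # Not in the original script: fail fast on an abnormal response so the
    # parsing below does not operate on an error page.
    if response.status_code != 200:
        return [], None, None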


    # Feed the page HTML into the bs4 parser
    soup = bs4.BeautifulSoup(response.text, "html.parser")

    # The container that wraps all article rows on the index page
    all_articles = soup.find("div", class_="r-list-container action-bar-margin bbs-screen")
    mu = []
    for ch in all_articles.children:
        if isinstance(ch, bs4.element.Tag):
            if ch.attrs['class'] == ['r-ent']:
                # A normal article row
                output = proc(ch, HOSTNAME)
                if output:
                    mu.append(output)
            elif ch.attrs['class'] == ['r-list-sep']:
                # Separator before the pinned posts; stop here
                break
    
    # Paging buttons: "上頁" points to the previous (older) page, "下頁" to the next (newer) one
    buttons = soup.select('a.btn.wide')
    prev_page, next_page = None, None
    for button in buttons:
        if '上頁' in button.getText() and 'disabled' not in button.attrs['class']:
            prev_page = HOSTNAME.rstrip('/') + button.attrs['href']
        if '下頁' in button.getText() and 'disabled' not in button.attrs['class']:
            next_page = HOSTNAME.rstrip('/') + button.attrs['href']

    return mu, prev_page, next_page


URL = """https://www.ptt.cc/bbs/Gossiping/index.html"""
RR = []
from tqdm import tqdm
for iiii in tqdm(range(100)):
    res, prev_, next_ = getall(URL)
    print(res[0]["date"], end='\t')
    print(res[-1]["date"])
    URL = prev_
    RR.extend(res)
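
# A small follow-up sketch (not part of the original script): persist the collected
# entries so a later run can reuse them. The file name "gossiping_articles.json"
# is an arbitrary choice.
import json

with open("gossiping_articles.json", "w", encoding="utf-8") as fp:
    json.dump(RR, fp, ensure_ascii=False, indent=2)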