Spaces:

jeffeux
/

assignment-1-jeffeuxMartin

Runtime error

App Files Files Community

assignment-1-jeffeuxMartin / crawler_jeff /jeff_crawler_ver1.py

jeffeux

Add application file

21e639d over 2 years ago

raw

history blame contribute delete

2.55 kB


	# -*- coding: utf-8 -*-
	"""
	Created on Fri May 29 00:38:13 2020
	@author: ASUS
	"""


	# 導入模組(module)
	import requests
	# 導入 BeautifulSoup 模組(module)：解析HTML 語法工具
	import bs4

	# Default page to crawl: the index of the PTT Gossiping board.
	# Example of a single-article URL, kept for reference:
	#   https://www.ptt.cc/bbs/Gossiping/M.1590678355.A.246.html
	URL = "https://www.ptt.cc/bbs/Gossiping/index.html"



	from urllib.parse import urlparse

	def get_host(URL):
	    """Return the ``scheme://host/`` root of *URL*.

	    Args:
	        URL: An absolute URL string.

	    Returns:
	        The scheme and network location of *URL* joined as
	        ``"<scheme>://<netloc>/"``. The result always ends with a slash;
	        path, query and fragment are dropped.
	    """
	    parsed_uri = urlparse(URL)
	    result = "{uri.scheme}://{uri.netloc}/".format(uri=parsed_uri)
	    return result


	def proc(ch, HOSTNAME):
	    """Extract title, link, author and date from one article row.

	    Args:
	        ch: A parsed row element exposing ``select()``; it is expected to
	            contain ``.title``, ``.author`` and ``.date`` descendants.
	        HOSTNAME: Site root used to turn the row's ``href`` into an
	            absolute link (with or without a trailing slash).

	    Returns:
	        A dict with the keys ``title``, ``link``, ``author`` and ``date``,
	        or ``None`` when the row has no ``.title`` element or its title
	        does not contain exactly one ``<a>`` (presumably a deleted post;
	        verify against the live markup).
	    """
	    # Local import so this block does not depend on the file-level imports.
	    from urllib.parse import urljoin

	    titles = ch.select('.title')
	    if not titles:
	        return None

	    anchors = titles[0].select('a')
	    if len(anchors) != 1:
	        return None
	    title_a = anchors[0]

	    return dict(
	        title=title_a.getText(),
	        # urljoin avoids the "host///path" produced by plain concatenation
	        # when HOSTNAME ends with "/" and the href starts with "/".
	        link=urljoin(HOSTNAME, title_a.attrs['href']),
	        author=ch.select('.author')[0].getText(),
	        date=ch.select('.date')[0].getText(),
	    )


	def getall(URL):
	    """Fetch one board index page and return its articles and page links.

	    Args:
	        URL: Absolute URL of the index page to download.

	    Returns:
	        A tuple ``(articles, prev_page, next_page)``:

	        * ``articles``: list of the dicts produced by ``proc`` for each
	          ``r-ent`` row that appears before the first ``r-list-sep`` row.
	          Empty when the list container is not found.
	        * ``prev_page`` / ``next_page``: absolute URLs taken from the
	          ``a.btn.wide`` buttons labelled '上頁' / '下頁', or ``None``
	          when the button is missing, disabled, or has no ``href``.

	    Raises:
	        requests.exceptions.RequestException: On connection failure,
	            timeout, or a non-2xx HTTP status.
	    """
	    # Local import so this block does not depend on the file-level imports.
	    from urllib.parse import urljoin

	    HOSTNAME = get_host(URL)

	    # Presumably the cookie that confirms the over-18 prompt for this
	    # board; verify against the site if the request starts failing.
	    cookies = {
	        'over18': '1'
	    }

	    # A timeout keeps the crawl from hanging forever on a stalled
	    # connection; raise_for_status surfaces HTTP errors instead of parsing
	    # an error page as if it were a board index.
	    response = requests.get(URL, cookies=cookies, timeout=10)
	    response.raise_for_status()

	    soup = bs4.BeautifulSoup(response.text, "html.parser")

	    all_articles = soup.find("div", class_="r-list-container action-bar-margin bbs-screen")
	    mu = []
	    rows = all_articles.children if all_articles is not None else ()
	    for ch in rows:
	        # Skip text nodes and other non-element children.
	        if not isinstance(ch, bs4.element.Tag):
	            continue
	        classes = ch.get('class')
	        if classes == ['r-list-sep']:
	            # Rows after the separator are ignored (presumably pinned
	            # posts rather than regular articles; confirm on the site).
	            break
	        if classes == ['r-ent']:
	            output = proc(ch, HOSTNAME)
	            if output:
	                mu.append(output)

	    prev_page, next_page = None, None
	    for button in soup.select('a.btn.wide'):
	        if 'disabled' in button.get('class', []) or not button.get('href'):
	            continue
	        # urljoin avoids the "host///path" produced by plain concatenation
	        # when HOSTNAME ends with "/" and the href starts with "/".
	        target = urljoin(HOSTNAME, button['href'])
	        label = button.getText()
	        if '上頁' in label:
	            prev_page = target
	        if '下頁' in label:
	            next_page = target

	    return mu, prev_page, next_page


	# Crawl up to 100 index pages, starting at the newest and following the
	# "previous page" link each time; every article found is collected in RR.
	URL = """https://www.ptt.cc/bbs/Gossiping/index.html"""
	RR = []
	from tqdm import tqdm
	for iiii in tqdm(range(100)):
	    res, prev_, next_ = getall(URL)
	    # Progress trace: dates of the first and last article on this page.
	    # A page may yield no articles, so guard the indexing.
	    if res:
	        print(res[0]["date"], end='\t')
	        print(res[-1]["date"])
	    RR.extend(res)
	    # No previous page means there is nothing further to fetch; passing
	    # None on to getall() would make the request fail.
	    if prev_ is None:
	        break
	    URL = prev_