|
|
|
|
|
import datetime
import os
import time

import helium as hm
import numpy as np
import pandas as pd

from scrap_util import getDriver, titleLocInfo, find_key_paragrap, extract_from_driver, table_record_doc
|
|
|
# Announce the run; these banners mirror the task hand-off convention
# used elsewhere in the project ("调用 web tool" = "call the web tool").
print("please input the url list to dealt with")
print("调用 web tool TO get Info and context")

# Recruitment-announcement pages to scrape in this batch.
urls = [
    'http://csglw.beijing.gov.cn/zwxx/rsgl/gwygl/202209/t20220909_2812285.html',
    'http://ghzrzyw.beijing.gov.cn/zhengwuxinxi/rsxx/sj/202304/t20230418_3058292.html',
    'http://mzj.beijing.gov.cn/art/2022/6/16/art_383_630732.html',
    'http://www.scrsw.net/zhaokao/2023/zk91559_1.html',
    'https://www.gyct.com.cn/info/1487/112938.htm',
    'http://www.scrsw.net/zhaokao/2023/zk92498_1.html',
    'http://www.hnrs.com.cn/shiye/5148.html',
    'http://www.hnrs.com.cn/shiye/5077.html',
    'http://www.hnrs.com.cn/shiye/5072.html',
    'http://hrss.gd.gov.cn/zwgk/xxgkml/content/post_4148656.html',
    'https://www.gdzz.gov.cn/tzgg/content/post_18310.html',
    'https://www.gdzz.gov.cn/tzgg/content/post_18035.html',
    'https://www.jsdzj.gov.cn/art/2023/2/24/art_28_15933.html',
    'http://www.js.msa.gov.cn/art/2023/2/24/art_11436_1391666.html',
    'http://www.jsrsks.com/index/article/content/id/3315.shtml',
]

# Task count is reported up front and again at the end of the crawl.
num_urls = len(urls)
print("需要处理的任务url个数%d" % num_urls)
|
|
|
|
|
# Wall-clock start of the whole crawl; compared against at the end of the run.
st = time.time()

# Project helper builds the Selenium WebDriver; helium then drives it
# (hm.go_to below navigates through this same driver instance).
driver = getDriver()

hm.set_driver(driver)
|
|
|
|
|
# Crawl every url: collect the structured doc plus the raw page text.
task_docs = []   # extracted doc dicts, one per url (kept for debugging/inspection)
contents = []    # flat CSV rows: [url, *doc fields, raw paragraph text]
for task_link in urls:
    hm.go_to(task_link)
    time.sleep(1)  # brief pause so JS-rendered content can settle — TODO confirm 1s suffices

    # All non-empty <p> texts become the raw context column for labeling.
    # NOTE(review): the *_by_xpath helpers were removed in Selenium 4;
    # find_elements("xpath", ...) works on both Selenium 3 and 4.
    paragraphs = driver.find_elements("xpath", "//p")
    texts = [p.text for p in paragraphs if p.text != ""]
    context_to_label = "\n".join(texts)

    # Structured extraction (title, dates, region fields, ...) by project helper.
    doc = extract_from_driver(driver)

    # Assemble one CSV row: url first, then the flattened doc, then raw text.
    doc_row = table_record_doc(doc)
    content = [task_link]
    content.extend(doc_row)
    content.append(context_to_label)
    contents.append(content)
    task_docs.append(doc)
    print(doc)
    # (dead code removed: a cleaned `title` was computed here but never used)

# quit() shuts down the whole session and the driver process;
# close() would only close the current window and leak the driver.
driver.quit()
print(f"当前任务爬取完成!,总共{num_urls}")
|
|
|
# Column layout must match the row shape built in the scrape loop:
# url + doc fields (from table_record_doc) + raw page text.
name = ["sub_url", "title", "zwk_year", "zwk_sheng", "zwk_diqu", "zwk_zip", "zwlx",
        "bm_sj", "fee_sj", "ks_sj", "zkz_sj",
        "tidy_bm_sj", "tidy_fee_sj", "tidy_ks_sj", "tidy_zkz_sj",
        "fn_list", "content"]

end_time = time.time()
cost = end_time - st
print(f"total cost time:{cost}")

# Timestamp like "2023-04-18_15:32" names the output file.
# NOTE(review): the ':' from HH:MM makes this filename invalid on Windows —
# confirm the target OS, or swap ':' for '-' if portability matters.
dt, tm = str(datetime.datetime.today()).split()
tm = tm[:5]
time_ckt = dt + "_" + tm
fn_name = "./scrap_data/" + time_ckt + ".csv"

df = pd.DataFrame(data=contents, columns=name)
# Blank out any missing cells in one pass. The previous per-column
# `df[i].fillna("", inplace=True)` is deprecated chained assignment in
# modern pandas and can silently do nothing under copy-on-write.
df = df.fillna("")

num_res = df.shape[0]
print(f"抓取结果数{num_res}")
os.makedirs("./scrap_data", exist_ok=True)  # to_csv fails if the dir is missing
df.to_csv(fn_name, index=False)

print(f"# 信息数据库保存! 在文件中:{fn_name}")
|
|