Spaces:
Build error
Build error
File size: 5,416 Bytes
3b2ecd2 494cdcf 9917a76 973d2bc ca11d6d baf3cbb aee7417 e37298f 973d2bc e37298f efebe44 aee7417 efebe44 aee7417 efebe44 aee7417 1497d6f 25b053a 1497d6f 25b053a 1497d6f 973d2bc 65230b0 3895bea ca9d760 1497d6f ca9d760 1497d6f 247ee08 34b052b 247ee08 65230b0 247ee08 77269d1 247ee08 02fa17e 3895bea 247ee08 65230b0 247ee08 65230b0 3895bea 247ee08 efebe44 c7f05f3 efebe44 c7f05f3 1497d6f efebe44 c7f05f3 efebe44 4196efc 34b052b 0d18c41 1bf1c75 0d8732a efebe44 0d18c41 3895bea efebe44 8996078 efebe44 8996078 7274bf1 efebe44 7274bf1 8996078 efebe44 8996078 efebe44 8996078 efebe44 8996078 efebe44 8996078 ca9d760 7b4d042 1497d6f 25b053a 1497d6f 7b4d042 ca9d760 1497d6f ca9d760 1497d6f 8996078 ca9d760 8996078 e37298f 973d2bc ca9d760 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 |
from seleniumwire import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import WebDriverException
from fastapi import FastAPI, Request
import uvicorn
import time
import json
from urllib.parse import unquote, urlparse
# ASGI application instance; the route decorators below register handlers on it.
app = FastAPI()
# Parse a Cookie header string into a dict.
def convert_cookies_to_dict(cookies):
    """Parse a ``Cookie`` header string into a ``{name: value}`` dict.

    Pairs are separated by ``;``. Whitespace around separators, names and
    values is ignored, so both ``"a=1; b=2"`` and ``"a=1;b=2"`` are accepted.
    Only the first ``=`` splits a pair, so values may themselves contain
    ``=``. A fragment without ``=`` maps to an empty string. Fragments with
    an empty name (empty input, trailing ``;``) are skipped, because a cookie
    without a name cannot be set. If a name repeats, the last value wins.

    Args:
        cookies: Raw cookie string, e.g. ``"sid=abc; theme=dark"``.

    Returns:
        Dict mapping cookie names to their values.
    """
    parsed_cookies = {}
    for item in cookies.split(";"):
        name, _, value = item.partition("=")
        name = name.strip()
        if not name:
            # Empty fragment (or "=value" with no name): nothing usable.
            continue
        parsed_cookies[name] = value.strip()
    return parsed_cookies
# Get the root domain of a URL.
def get_root_domain(url):
    """Return the last two dot-separated labels of the URL's host.

    The host is taken from ``urlparse(url).hostname``, so any port or
    ``user:password@`` prefix is excluded. IP-address hosts are returned
    unchanged rather than being cut down to their last two octets. A host
    with a single label (e.g. ``localhost``) is returned as is, and a URL
    with no host at all yields ``""``.

    NOTE: this is a plain "last two labels" heuristic; multi-part public
    suffixes such as ``co.uk`` are not recognised.

    Args:
        url: Absolute URL string, e.g. ``"https://www.example.com/path"``.

    Returns:
        The root domain string, e.g. ``"example.com"``.
    """
    import ipaddress  # local import: only this function needs it

    # hostname is None when the URL has no network location.
    host = urlparse(url).hostname or ""

    try:
        ipaddress.ip_address(host)
    except ValueError:
        pass  # not an IP literal; fall through to label handling
    else:
        return host

    parts = host.split('.')
    if len(parts) > 1:
        return '.'.join(parts[-2:])
    return host
def filter_type(_type: str):
    """Tell whether a response MIME type is worth inspecting.

    Static-asset types (scripts, stylesheets, images, opaque binary data)
    are considered noise.

    Args:
        _type: MIME type string of a response, e.g. ``"text/html"``.

    Returns:
        ``True`` if the type is NOT one of the static-asset types listed
        below (the response should be kept), ``False`` otherwise.
    """
    static_types = (
        'application/javascript', 'application/x-javascript', 'text/css',
        # 'webp' alone is not a MIME type and never matched real responses;
        # the proper 'image/webp' is added and the old entry kept for
        # backward compatibility.
        'webp', 'image/webp',
        'image/png', 'image/gif', 'image/jpeg', 'image/x-icon',
        'application/octet-stream',
    )
    return _type not in static_types
@app.get("/")
def main():
    """Root endpoint: always answers with a fixed success payload."""
    status = {"code": 200, "msg": "Success"}
    return status
@app.get("/chrome")
def chrome(url: str = None, wait: int = 5, header: str = None, cookie: str = None):
    """Load a page in headless Chrome and return its final state.

    The page is opened once so that cookies can be attached to its domain,
    local state is wiped, the caller's cookies and header overrides are
    applied, and the page is then loaded a second time.

    Args:
        url: URL-encoded target address. Required.
        wait: Seconds to sleep after the second load so that in-page
            scripts can finish; must be between 0 and 30 inclusive.
        header: URL-encoded JSON object of request headers to override.
        cookie: URL-encoded cookie string; takes precedence over a
            ``cookie`` key supplied through ``header``.

    Returns:
        ``{"code": 200, "data": {...}}`` on success, where ``data`` holds the
        final ``url``, ``page_source``, ``cookies`` and ``is_jump`` (whether
        the final URL differs from the requested one), or
        ``{"code": 500, "msg": ...}`` when an argument is invalid.
    """
    caps = {
        "browserName": "chrome",
        'goog:loggingPrefs': {'performance': 'ALL'}  # enable performance-log capture
    }

    # A target URL is mandatory.
    if not isinstance(url, str):
        return {"code": 500,"msg":"No target URL"}
    target_url = unquote(url)
    target_domain = get_root_domain(target_url)

    # The waiting time must be within [0, 30].
    if not 0 <= wait <= 30:
        return {"code": 500,"msg":"The waiting time must be between 0 and 30"}
    wait_time = wait

    # Headers may be overridden, but must be supplied as JSON.
    header_array = {}
    if isinstance(header, str):
        try:
            header_array.update(json.loads(unquote(header)))
        except Exception:
            # Any failure here means the caller's input was unusable:
            # malformed JSON, or JSON that is not a mapping.
            return {"code": 500,"msg":"The header field is not JSON"}

    # An explicit cookie argument wins over a cookie given via headers.
    if isinstance(cookie, str):
        header_array.update({"cookie":unquote(cookie)})

    # Configure a headless browser.
    options = Options()
    options.add_argument('--headless')
    for key, value in caps.items():
        options.set_capability(key, value)

    driver = webdriver.Chrome(options=options)
    try:
        # The page must be open before driver.add_cookie can attach cookies.
        driver.get(target_url)

        # Drop any cookies / sessionStorage / localStorage the first visit
        # may have stored.
        driver.delete_all_cookies()
        driver.execute_script("window.sessionStorage.clear();")
        driver.execute_script("window.localStorage.clear();")

        # Install the caller-supplied cookies on the target's root domain.
        if 'cookie' in header_array:
            cookie_array = convert_cookies_to_dict(header_array['cookie'])
            del header_array['cookie']
            for key, value in cookie_array.items():
                driver.add_cookie({"name": key, "value": value, "domain": f'.{target_domain}', "path": "/", "secure": False})

        # Override request headers for the next visit (headers not listed
        # keep their defaults).
        # NOTE(review): header_overrides looks like a legacy selenium-wire
        # attribute - confirm the installed version still honours it.
        driver.header_overrides = header_array

        # Visit the page again, now with cookies and headers in place.
        driver.get(target_url)
        # Dump the page source as it is right after the load.
        print(driver.page_source)

        # Give in-page scripts (async requests, delayed redirects) time to
        # finish.
        if wait_time > 0:
            time.sleep(wait_time)

        # Snapshot the final state of the page.
        current_url = driver.current_url
        page_source = driver.page_source
        cookies = driver.get_cookies()
        # True when the browser ended up somewhere other than the requested URL.
        is_jump = (target_url != current_url)

        _print_network_responses(driver)

        data = {
            "url": current_url,
            "page_source": page_source,
            "cookies": cookies,
            "is_jump": is_jump
        }
    finally:
        # Always shut the browser down, even if a step above raised;
        # otherwise a Chrome process is leaked per failed request.
        driver.quit()
    return {"code": 200,"data":data}


def _print_network_responses(driver):
    """Print the body of each non-static response found in the performance log."""
    for packet in driver.get_log('performance'):
        message = json.loads(packet.get('message')).get('message')
        # Only completed responses are of interest.
        if message.get('method') != 'Network.responseReceived':
            continue
        params = message.get('params')
        response = params.get('response')
        packet_type = response.get('mimeType')
        # Skip static assets.
        if not filter_type(_type=packet_type):
            continue
        request_id = params.get('requestId')  # identifier of this request
        response_url = response.get('url')
        try:
            resp = driver.execute_cdp_cmd('Network.getResponseBody', {'requestId': request_id})
        except WebDriverException:
            # Best effort: the body may not be retrievable for this request.
            continue
        print(f'type: {packet_type} url: {response_url}')
        print(f'response: {resp}')
        print()
if __name__ == '__main__':
    # Serve the application on all interfaces, port 7860.
    # NOTE(review): the 'app:app' import string assumes this file is app.py - confirm.
    uvicorn.run(app='app:app', host="0.0.0.0", port=7860)