Spaces:
Build error
Build error
from seleniumwire import webdriver | |
from selenium.webdriver.chrome.options import Options | |
from selenium.common.exceptions import WebDriverException | |
from fastapi import FastAPI, Request | |
import uvicorn | |
import time | |
import json | |
from urllib.parse import unquote, urlparse | |
app = FastAPI() | |
# Parse a cookie string into a dictionary.
def convert_cookies_to_dict(cookies):
    """Parse a ``Cookie`` header string into a ``{name: value}`` dictionary.

    Pairs are separated by ``;``, with or without surrounding whitespace.
    Each pair is split on its first ``=`` only, so a value that itself
    contains ``=`` (for example base64 padding) is kept intact. A pair with
    no ``=`` maps its name to an empty string. Segments with an empty name
    (empty input, a trailing ``;``, or ``=value``) are skipped, so the result
    never contains an empty cookie name.

    Args:
        cookies: Raw cookie string such as ``"a=1; b=2"``. ``None`` or an
            empty string yields an empty dictionary.

    Returns:
        A dict mapping cookie names to values. When a name occurs more than
        once, the last occurrence wins.
    """
    if not cookies:
        return {}

    parsed_cookies = {}
    for item in cookies.split(";"):
        name, _, value = item.partition("=")
        name = name.strip()
        if not name:
            # Nothing usable as a cookie name in this segment.
            continue
        parsed_cookies[name] = value.strip()
    return parsed_cookies
# Get the root domain of a URL string.
def get_root_domain(url):
    """Return the last two dot-separated labels of the host in *url*.

    The host is taken from ``urlparse(url).hostname``, so any port and any
    ``user:password@`` prefix are excluded and the result is lower-cased.
    IP address literals are returned whole rather than being truncated to
    their last two components.

    This is a plain "last two labels" heuristic: for hosts under a
    multi-label public suffix (``www.example.co.uk``) it returns the suffix
    (``co.uk``), not the registrable domain.

    Args:
        url: Absolute URL string. Without a ``scheme://`` prefix no host can
            be extracted.

    Returns:
        The root domain (``"example.com"``), the unchanged host when it has
        a single label or is an IP address, or ``""`` when *url* has no host.
    """
    # Function-local import keeps this block self-contained.
    import ipaddress

    host = urlparse(url).hostname or ""

    try:
        ipaddress.ip_address(host)
    except ValueError:
        pass  # Not an IP literal; fall through to label handling.
    else:
        return host

    labels = host.split(".")
    if len(labels) > 1:
        return ".".join(labels[-2:])
    return host
def filter_type(_type: str):
    """Tell whether a response of the given MIME type should be kept.

    Script, stylesheet, image and generic binary responses are treated as
    static assets and rejected; every other type (including ``None``) is
    accepted.

    Args:
        _type: MIME type string to check.

    Returns:
        ``False`` when *_type* is one of the excluded static-asset types,
        ``True`` otherwise.
    """
    return _type not in {
        'application/javascript',
        'application/x-javascript',
        'text/javascript',  # standard JavaScript media type
        'text/css',
        'webp',  # kept from the original list for backward compatibility
        'image/webp',  # the actual MIME type for WebP images
        'image/png',
        'image/gif',
        'image/jpeg',
        'image/x-icon',
        'application/octet-stream',
    }
def main():
    """Return the fixed success payload: code 200 with message "Success"."""
    payload = {}
    payload["code"] = 200
    payload["msg"] = "Success"
    return payload
def _print_response_bodies(driver):
    """Print the body of every non-static response in the performance log."""
    for packet in driver.get_log('performance'):
        message = json.loads(packet.get('message')).get('message')
        # Only "response received" events describe a response to inspect.
        if message.get('method') != 'Network.responseReceived':
            continue
        params = message.get('params')
        response = params.get('response')
        packet_type = response.get('mimeType')
        if not filter_type(_type=packet_type):
            continue
        # The request id is what the CDP command uses to identify the response.
        request_id = params.get('requestId')
        response_url = response.get('url')
        try:
            resp = driver.execute_cdp_cmd('Network.getResponseBody', {'requestId': request_id})
        except WebDriverException:
            # Best effort: a response whose body cannot be fetched is skipped.
            continue
        print(f'type: {packet_type} url: {response_url}')
        print(f'response: {resp}')
        print()


def chrome(url:str=None,wait:int=5,header:str=None,cookie:str=None):
    """Load a URL in headless Chrome and report the final page state.

    The page is opened once so that cookies can be attached to its domain,
    all cookies and web storage are cleared, the caller's cookies and header
    overrides are applied, and the page is loaded a second time. After an
    optional fixed delay the final URL, page source and cookies are
    collected. The browser is always shut down, even when a step fails.

    NOTE(review): ``url`` is navigated to as given. If this function is
    reachable by untrusted callers, confirm that fetching arbitrary
    (including internal) addresses is acceptable.

    Args:
        url: Percent-encoded target URL. Required.
        wait: Whole number of seconds, 0 to 30, to sleep after the second
            load before collecting results.
        header: Percent-encoded JSON object of request headers to override.
        cookie: Percent-encoded cookie string (``"a=1; b=2"``). When given it
            replaces any lowercase ``"cookie"`` entry from ``header``.

    Returns:
        ``{"code": 200, "data": {...}}`` on success, where ``data`` holds
        ``url``, ``page_source``, ``cookies`` and ``is_jump`` (``True`` when
        the final URL string differs from the requested one). Invalid input
        returns ``{"code": 500, "msg": ...}`` without starting a browser.
    """
    # A target URL is mandatory.
    if not isinstance(url, str):
        return {"code": 500,"msg":"No target URL"}
    target_url = unquote(url)

    # The waiting time must be a whole number between 0 and 30.
    if wait not in range(0, 31):
        return {"code": 500,"msg":"The waiting time must be between 0 and 30"}

    # Headers may be overridden, but must be supplied as JSON.
    header_array = {}
    if isinstance(header, str):
        try:
            header_array.update(json.loads(unquote(header)))
        except (ValueError, TypeError):
            # ValueError: malformed JSON. TypeError/ValueError: valid JSON
            # that cannot be merged into a dict of headers.
            return {"code": 500,"msg":"The header field is not JSON"}

    # An explicit cookie argument takes the "cookie" slot.
    if isinstance(cookie, str):
        header_array["cookie"] = unquote(cookie)

    target_domain = get_root_domain(target_url)

    caps = {
        "browserName": "chrome",
        'goog:loggingPrefs': {'performance': 'ALL'}  # enable the performance log
    }
    options = Options()
    options.add_argument('--headless')
    for key, value in caps.items():
        options.set_capability(key, value)

    driver = webdriver.Chrome(options=options)
    try:
        # The page has to be open before driver.add_cookie can be used.
        driver.get(target_url)

        # Drop whatever the first load stored locally.
        driver.delete_all_cookies()
        driver.execute_script("window.sessionStorage.clear();")
        driver.execute_script("window.localStorage.clear();")

        # Install the caller's cookies in the browser rather than sending
        # them as a raw header.
        if 'cookie' in header_array:
            cookie_array = convert_cookies_to_dict(header_array.pop('cookie'))
            for name, value in cookie_array.items():
                driver.add_cookie({"name": name, "value": value, "domain": f'.{target_domain}', "path": "/", "secure": False})

        # Remaining entries override request headers on the next load.
        driver.header_overrides = header_array

        driver.get(target_url)
        print(driver.page_source)

        # Fixed delay to let page scripts (late requests, delayed redirects) run.
        if wait > 0:
            time.sleep(wait)

        current_url = driver.current_url
        page_source = driver.page_source
        cookies = driver.get_cookies()
        # NOTE(review): a plain string comparison; presumably meant to detect
        # redirects, but any difference in the reported URL also counts.
        is_jump = (target_url != current_url)

        _print_response_bodies(driver)

        data = {
            "url": current_url,
            "page_source": page_source,
            "cookies": cookies,
            "is_jump": is_jump
        }
    finally:
        # Always release the browser process, including on errors.
        driver.quit()
    return {"code": 200,"data":data}
if __name__ == '__main__':
    # When run as a script, serve the "app:app" application with uvicorn
    # on all interfaces, port 7860.
    server_settings = {"app": 'app:app', "host": "0.0.0.0", "port": 7860}
    uvicorn.run(**server_settings)