url2txt / app.py
qiqi657s's picture
Update app.py
2a96f6e verified
import os
import time
import uvicorn
from fastapi import FastAPI, HTTPException
from starlette.responses import HTMLResponse
from pydantic import BaseModel
from DrissionPage import ChromiumPage
import threading
import subprocess
import os
# Supervisor for the Chrome launcher script.
def start_chrome():
    """Run ``init_chrome.sh`` and relaunch it every time it exits.

    Meant to be used as a thread target: the loop never returns.

    Each launch blocks until the shell command finishes.  The previous
    implementation used ``os.popen``, which returns immediately, so the loop
    spawned new shells back-to-back without ever waiting for one to end.
    """
    # Shell command that starts Chrome; all of its output is discarded.
    script_path = "cd /app && bash init_chrome.sh >> /dev/null 2>&1"
    print('run init_chrome.sh')
    while True:
        try:
            # Wait for the script to exit before considering a relaunch.
            # NOTE(review): presumably init_chrome.sh stays in the foreground
            # while Chrome is alive -- confirm; if it backgrounds Chrome and
            # exits, it will simply be re-run after the pause below.
            subprocess.run(script_path, shell=True)
        except Exception as e:
            print(e)
        # Short pause so a script that fails instantly is not re-run in a
        # tight loop.
        time.sleep(1)
# Create and start the background thread that keeps Chrome running.
# It is a daemon thread because start_chrome() loops forever; a non-daemon
# thread would keep the interpreter alive after the server has stopped.
chrome_thread = threading.Thread(target=start_chrome, daemon=True)
chrome_thread.start()

# Keep trying to attach to the browser's debugging address until it succeeds;
# each failed attempt is reported and followed by a 10 second wait.
while True:
    try:
        browser = ChromiumPage('127.0.0.1:9200')
        break
    except Exception as e:
        print("err", e)
        time.sleep(10)
# Pydantic model for the JSON body accepted by POST /fetch:
# a single string field, "url".
class FetchRequest(BaseModel):
    url: str


# Application instance that the route decorators in this file register on.
app = FastAPI()
@app.get("/", response_class=HTMLResponse)
async def read_root():
    """
    Home route (/): gives users basic information about the API and how to use it.
    Returns a styled, static HTML page describing the API and its usage.
    """
    # The sample response below shows HTML markup as text, so its angle
    # brackets are written as entities; raw tags would be parsed by the
    # browser instead of being displayed.
    html_content = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>DrissionPage API Documentation</title>
<style>
body {
font-family: Arial, sans-serif;
background-color: #f4f4f9;
color: #333;
margin: 0;
padding: 0;
}
.container {
width: 80%;
margin: 0 auto;
padding: 30px;
}
header {
background-color: #4CAF50;
color: white;
padding: 20px 0;
text-align: center;
}
h1 {
margin: 0;
font-size: 36px;
}
.section {
margin-top: 30px;
}
.section h2 {
color: #333;
font-size: 24px;
}
.section p {
font-size: 16px;
line-height: 1.6;
color: #555;
}
.code-block {
background-color: #f5f5f5;
padding: 15px;
border-radius: 5px;
margin-top: 10px;
font-family: monospace;
white-space: pre-wrap;
word-wrap: break-word;
}
.note {
background-color: #ffeb3b;
padding: 10px;
border-radius: 5px;
margin-top: 20px;
}
footer {
text-align: center;
margin-top: 50px;
font-size: 14px;
color: #777;
}
</style>
</head>
<body>
<div class="container">
<header>
<h1>DrissionPage API Documentation</h1>
</header>
<div class="section">
<h2>Welcome!</h2>
<p>Welcome to the DrissionPage API! This API allows you to fetch webpages, capture screenshots, and return the source HTML of the page. Below is the API usage and details.</p>
</div>
<div class="section">
<h2>Usage</h2>
<h3>POST /fetch</h3>
<p>This endpoint allows you to fetch a webpage and get a screenshot along with the raw HTML content.</p>
<h4>Request</h4>
<p>The request should be a JSON object containing the URL to fetch:</p>
<div class="code-block">
{
"url": "https://example.com"
}
</div>
<h4>Response</h4>
<p>The response will contain two fields: `jpg` (a base64-encoded image of the webpage screenshot) and `source` (the raw HTML of the page).</p>
<div class="code-block">
{
"jpg": "base64_encoded_image_string_here",
"source": "&lt;html&gt;...&lt;/html&gt;"
}
</div>
</div>
<div class="note">
<p><strong>Note:</strong> Make sure the browser is running and accessible at the specified address.</p>
</div>
<footer>
<p>API Documentation by DrissionPage</p>
</footer>
</div>
</body>
</html>
"""
    return HTMLResponse(content=html_content)
def _listen_target(url):
    """Return *url* without its query string, fragment and scheme prefix.

    Everything from the first ``?`` and the first ``#`` onward is dropped.
    A leading ``scheme://`` is then removed whatever its length, so
    ``http://`` and ``https://`` URLs are handled alike and a URL with no
    scheme is returned without losing any characters.
    """
    bare = url.split('?')[0].split('#')[0]
    scheme, sep, rest = bare.partition('://')
    # Only treat the text before "://" as a scheme when it has no slash in
    # it; otherwise the "://" belongs to the path of a scheme-less URL.
    if sep and '/' not in scheme:
        return rest
    return bare


@app.post("/fetch")
async def fetch_page(request: FetchRequest):
    """Load the requested URL in the shared browser and return what it shows.

    Args:
        request: Body of the POST request; ``request.url`` is the page to load.

    Returns:
        A dict with two keys: ``"jpg"``, the screenshot requested with
        ``as_base64=True``, and ``"source"``, the raw body of the captured
        response, or ``""`` when the listener returned nothing usable.

    Raises:
        HTTPException: With status 500 when any browser step fails.
    """
    # NOTE(review): the URL comes straight from the caller and is opened in
    # the browser without validation -- confirm this service is not exposed
    # to untrusted clients.
    url = request.url
    # Listen for the page's own request: the URL minus scheme, query and anchor.
    listen_url = _listen_target(url)
    try:
        browser.listen.start(listen_url)
        try:
            # NOTE(review): these browser calls look synchronous, so they
            # would hold the event loop while they run -- confirm.
            browser.get(url)
            # Wait up to 5 seconds for one matching packet.
            packet = browser.listen.wait(timeout=5, count=1)
            # Screenshot of the page, base64-encoded.
            screenshot = browser.get_screenshot(as_base64=True)
            # Raw body of the captured response; empty when no packet arrived.
            source = packet.response.raw_body if packet else ""
        finally:
            # Always stop listening, including when one of the steps above
            # raised, so the listener is not left running for later requests.
            browser.listen.stop()
        return {
            "jpg": screenshot,
            "source": source,
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error fetching the page: {e}") from e
# Script entry point: serve the app with uvicorn on every interface,
# port 8000, using a single worker.
if __name__ == "__main__":
    uvicorn.run(
        app,
        host='0.0.0.0',
        port=8000,
        workers=1,
    )