Spaces:

qiqi657s
/

url2txt

Runtime error

App Files Files Community

url2txt / app.py

qiqi657s

Update app.py

2a96f6e verified 7 months ago

raw

history blame contribute delete

5.64 kB

	import os
	import time
	import uvicorn
	from fastapi import FastAPI, HTTPException
	from starlette.responses import HTMLResponse

	from pydantic import BaseModel
	from DrissionPage import ChromiumPage


	import threading
	import subprocess
	import os

	# Supervisor that launches Chrome through the init script.
	def start_chrome():
	    """Run ``init_chrome.sh`` from ``/app`` in an endless supervise loop.

	    Each iteration blocks until the script exits and then starts it again
	    after a short pause.  The previous ``os.popen`` call returned
	    immediately without waiting for the child, so the loop spun at full
	    speed and kept spawning new shells.

	    The script's stdout and stderr are discarded.  Any error raised while
	    launching is printed and the loop continues (best effort).  This
	    function never returns; it is meant to be used as a thread target.
	    """
	    print('run init_chrome.sh')
	    while True:
	        try:
	            # Blocks for as long as the script runs.
	            subprocess.run(
	                ["bash", "init_chrome.sh"],
	                cwd="/app",
	                stdout=subprocess.DEVNULL,
	                stderr=subprocess.DEVNULL,
	                check=False,
	            )
	        except Exception as e:
	            print(e)
	        # Pause before relaunching so a script that exits (or fails)
	        # immediately cannot turn this loop into a busy spin.
	        time.sleep(1)








	# Create and start the background thread that keeps Chrome running.
	# The target loops forever, so the thread is marked as a daemon: a
	# non-daemon thread would keep the interpreter alive after the web
	# server has stopped.
	chrome_thread = threading.Thread(target=start_chrome, daemon=True)
	chrome_thread.start()






	# Keep trying to attach to the browser at 127.0.0.1:9200 until it
	# succeeds; after each failed attempt, report the error and wait 10
	# seconds before retrying.
	while True:
	    try:
	        browser = ChromiumPage('127.0.0.1:9200')
	    except Exception as exc:
	        print("err", exc)
	        time.sleep(10)
	    else:
	        break



	# Pydantic model for the JSON body accepted by POST /fetch.
	class FetchRequest(BaseModel):
	    # Address of the page to load.
	    url: str

	# Application object that the route decorators below register on.
	app = FastAPI()

	@app.get("/", response_class=HTMLResponse)
	async def read_root():
	    """
	    Serve the landing page (/) with basic information about the API.

	    Returns a styled, static HTML page that describes the service and
	    shows how to call POST /fetch.
	    """
	    # The sample response shows HTML source as text, so its angle brackets
	    # are written as entities; left raw, the browser would parse them as
	    # markup and the example would render without them.
	    html_content = """
	<!DOCTYPE html>
	<html lang="en">
	<head>
	<meta charset="UTF-8">
	<meta name="viewport" content="width=device-width, initial-scale=1.0">
	<title>DrissionPage API Documentation</title>
	<style>
	body {
	font-family: Arial, sans-serif;
	background-color: #f4f4f9;
	color: #333;
	margin: 0;
	padding: 0;
	}
	.container {
	width: 80%;
	margin: 0 auto;
	padding: 30px;
	}
	header {
	background-color: #4CAF50;
	color: white;
	padding: 20px 0;
	text-align: center;
	}
	h1 {
	margin: 0;
	font-size: 36px;
	}
	.section {
	margin-top: 30px;
	}
	.section h2 {
	color: #333;
	font-size: 24px;
	}
	.section p {
	font-size: 16px;
	line-height: 1.6;
	color: #555;
	}
	.code-block {
	background-color: #f5f5f5;
	padding: 15px;
	border-radius: 5px;
	margin-top: 10px;
	font-family: monospace;
	white-space: pre-wrap;
	word-wrap: break-word;
	}
	.note {
	background-color: #ffeb3b;
	padding: 10px;
	border-radius: 5px;
	margin-top: 20px;
	}
	footer {
	text-align: center;
	margin-top: 50px;
	font-size: 14px;
	color: #777;
	}
	</style>
	</head>
	<body>
	<div class="container">
	<header>
	<h1>DrissionPage API Documentation</h1>
	</header>

	<div class="section">
	<h2>Welcome!</h2>
	<p>Welcome to the DrissionPage API! This API allows you to fetch webpages, capture screenshots, and return the source HTML of the page. Below is the API usage and details.</p>
	</div>

	<div class="section">
	<h2>Usage</h2>
	<h3>POST /fetch</h3>
	<p>This endpoint allows you to fetch a webpage and get a screenshot along with the raw HTML content.</p>
	<h4>Request</h4>
	<p>The request should be a JSON object containing the URL to fetch:</p>
	<div class="code-block">
	{
	"url": "https://example.com"
	}
	</div>

	<h4>Response</h4>
	<p>The response will contain two fields: `jpg` (a base64-encoded image of the webpage screenshot) and `source` (the raw HTML of the page).</p>
	<div class="code-block">
	{
	"jpg": "base64_encoded_image_string_here",
	"source": "&lt;html&gt;...&lt;/html&gt;"
	}
	</div>
	</div>

	<div class="note">
	<p><strong>Note:</strong> Make sure the browser is running and accessible at the specified address.</p>
	</div>

	<footer>
	<p>API Documentation by DrissionPage</p>
	</footer>
	</div>
	</body>
	</html>
	"""
	    return HTMLResponse(content=html_content)



	def _listen_target(url: str) -> str:
	    """Return *url* without its query string, fragment and scheme prefix.

	    ``"https://example.com/a?x=1#top"`` becomes ``"example.com/a"``.  The
	    scheme is removed by splitting on ``"://"`` rather than by slicing a
	    fixed 8 characters, so ``http://`` and scheme-less URLs no longer lose
	    the start of the host name.
	    """
	    trimmed = url.split('?')[0].split('#')[0]
	    _scheme, separator, remainder = trimmed.partition('://')
	    return remainder if separator else trimmed


	@app.post("/fetch")
	async def fetch_page(request: FetchRequest):
	    """Load ``request.url`` in the shared browser and return what was captured.

	    Returns a dict with two keys: ``"jpg"``, the page screenshot requested
	    as base64, and ``"source"``, the raw body of the captured response
	    packet, or ``""`` when no packet was captured within the wait.

	    Raises:
	        HTTPException: status 500 if any step of loading or capturing fails.
	    """
	    url = request.url

	    # Listener target: the URL with query, fragment and scheme removed.
	    listen_url = _listen_target(url)

	    # NOTE(review): the browser calls below are blocking and run inside an
	    # async handler on one shared `browser` object; that ordering is left
	    # unchanged here.
	    try:
	        browser.listen.start(listen_url)
	        try:
	            browser.get(url)

	            # Wait for one matching packet.
	            packet = browser.listen.wait(timeout=5, count=1)

	            # Screenshot of the page, requested as base64.
	            screenshot = browser.get_screenshot(as_base64=True)

	            # Raw response body; empty when the wait returned nothing.
	            source = packet.response.raw_body if packet else ""
	        finally:
	            # Stop listening on failures too, so a started listener is
	            # never left running after this request.
	            browser.listen.stop()

	        return {
	            "jpg": screenshot,
	            "source": source,
	        }

	    except Exception as e:
	        raise HTTPException(status_code=500, detail=f"Error fetching the page: {e}") from e

	# When executed as a script, serve the app with uvicorn on all
	# interfaces, port 8000, using a single worker.
	if __name__ == "__main__":
	    uvicorn.run(
	        app,
	        host='0.0.0.0',
	        port=8000,
	        workers=1,
	    )