Spaces:

qiqi657s
/

url2txt

Runtime error

App Files Files Community

qiqi657s committed on Dec 4, 2024

Commit

315d050

verified ·

1 Parent(s): 284c083

Create app.py

Browse files

Files changed (1) hide show

app.py +214 -0

app.py ADDED Viewed

	@@ -0,0 +1,214 @@

+import os
+import time
+import uvicorn
+from fastapi import FastAPI, HTTPException
+from starlette.responses import HTMLResponse
+from pydantic import BaseModel
+from DrissionPage import ChromiumPage
+import threading
+import subprocess
+import os
+# 定义启动 Chrome 的函数
+def start_chrome():
+    # 设置脚本路径
+    script_path = "cd /app && bash init_chrome.sh >> /dev/null 2>&1"
+    print('run init_chrome.sh')
+    while 1:
+        try:
+            os.popen(script_path)
+        except Exception as e:
+            print(e)
+# 确保 /app/cache 目录存在
+cache_dir = "/app/cache"
+if not os.path.exists(cache_dir):
+    os.makedirs(cache_dir)  # 创建目录
+# 创建并启动线程
+chrome_thread = threading.Thread(target=start_chrome)
+chrome_thread.start()
+while 1:
+    try:
+        browser = ChromiumPage('127.0.0.1:9200')
+        break
+    except Exception as e:
+        print("err",e)
+        time.sleep(10)
+# Pydantic model definition
+class FetchRequest(BaseModel):
+    """Request body accepted by the POST /fetch route."""
+    # Address of the page to load in the browser.
+    url: str
+# FastAPI application instance; the @app.get / @app.post decorators below register routes on it.
+app = FastAPI()
+@app.get("/", response_class=HTMLResponse)
+async def read_root():
+    """
+    首页路由（/）的作用是为用户提供 API 相关的基本信息和使用说明。
+    该路由返回一个美化过的 HTML 页面，提供 API 的描述和如何使用。
+    """
+    html_content = """
+    <!DOCTYPE html>
+    <html lang="en">
+    <head>
+        <meta charset="UTF-8">
+        <meta name="viewport" content="width=device-width, initial-scale=1.0">
+        <title>DrissionPage API Documentation</title>
+        <style>
+            body {
+                font-family: Arial, sans-serif;
+                background-color: #f4f4f9;
+                color: #333;
+                margin: 0;
+                padding: 0;
+            }
+            .container {
+                width: 80%;
+                margin: 0 auto;
+                padding: 30px;
+            }
+            header {
+                background-color: #4CAF50;
+                color: white;
+                padding: 20px 0;
+                text-align: center;
+            }
+            h1 {
+                margin: 0;
+                font-size: 36px;
+            }
+            .section {
+                margin-top: 30px;
+            }
+            .section h2 {
+                color: #333;
+                font-size: 24px;
+            }
+            .section p {
+                font-size: 16px;
+                line-height: 1.6;
+                color: #555;
+            }
+            .code-block {
+                background-color: #f5f5f5;
+                padding: 15px;
+                border-radius: 5px;
+                margin-top: 10px;
+                font-family: monospace;
+                white-space: pre-wrap;
+                word-wrap: break-word;
+            }
+            .note {
+                background-color: #ffeb3b;
+                padding: 10px;
+                border-radius: 5px;
+                margin-top: 20px;
+            }
+            footer {
+                text-align: center;
+                margin-top: 50px;
+                font-size: 14px;
+                color: #777;
+            }
+        </style>
+    </head>
+    <body>
+        <div class="container">
+            <header>
+                <h1>DrissionPage API Documentation</h1>
+            </header>
+            <div class="section">
+                <h2>Welcome!</h2>
+                <p>Welcome to the DrissionPage API! This API allows you to fetch webpages, capture screenshots, and return the source HTML of the page. Below is the API usage and details.</p>
+            </div>
+            <div class="section">
+                <h2>Usage</h2>
+                <h3>POST /fetch</h3>
+                <p>This endpoint allows you to fetch a webpage and get a screenshot along with the raw HTML content.</p>
+                <h4>Request</h4>
+                <p>The request should be a JSON object containing the URL to fetch:</p>
+                <div class="code-block">
+                    {
+                        "url": "https://example.com"
+                    }
+                </div>
+                <h4>Response</h4>
+                <p>The response will contain two fields: `jpg` (a base64-encoded image of the webpage screenshot) and `source` (the raw HTML of the page).</p>
+                <div class="code-block">
+                    {
+                        "jpg": "base64_encoded_image_string_here",
+                        "source": "<html>...</html>"
+                    }
+                </div>
+            </div>
+            <div class="note">
+                <p><strong>Note:</strong> Make sure the browser is running and accessible at the specified address.</p>
+            </div>
+            <footer>
+                <p>API Documentation by DrissionPage</p>
+            </footer>
+        </div>
+    </body>
+    </html>
+    """
+    return HTMLResponse(content=html_content)
+@app.post("/fetch")
+async def fetch_page(request: FetchRequest):
+    url = request.url
+    # 去掉 URL 中的查询参数和锚点
+    listen_url = url.split('?')[0].split('#')[0][8:]
+    try:
+        browser.listen.start(listen_url)
+        browser.get(url)
+        # 等待数据包返回
+        packet = browser.listen.wait(timeout=5, count=1)
+        # 获取页面截图（base64 格式）
+        screenshot = browser.get_screenshot(as_base64=True)
+        # 获取页面原始内容
+        source = packet.response.raw_body if packet else ""
+        # 停止监听
+        browser.listen.stop()
+        return {
+            "jpg": screenshot,
+            "source": source,
+        }
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Error fetching the page: {e}")
+if __name__ == "__main__":
+    uvicorn.run(app, host='0.0.0.0', port=8000, workers=1)