Spaces:
Sleeping
Sleeping
File size: 4,861 Bytes
1e74012 4b0daa3 b29fd10 4b0daa3 1e74012 4b0daa3 1e74012 4b0daa3 1e74012 b29fd10 1e74012 f924c30 1e74012 b29fd10 7924068 b29fd10 7924068 b29fd10 7924068 b29fd10 7924068 b29fd10 7924068 b29fd10 7924068 b29fd10 7924068 b29fd10 7924068 b29fd10 7924068 0ce5866 4e129bc 8f70af7 2f3056d 92ee903 1e74012 b29fd10 1e74012 b29fd10 360e292 b29fd10 0ce5866 b29fd10 1e74012 b29fd10 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 |
# routers/scraping_router.py
from fastapi import APIRouter, HTTPException, Header, Depends
from fastapi.responses import JSONResponse
from fastapi.security import APIKeyHeader
from pydantic import BaseModel
from typing import Optional
import logging
import re
import os
from helpers.ai_client import AIClient
# Configure root logging at INFO level and create this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Every route registered on this router lives under /api/v1 and is tagged
# "Web Scraping".
router = APIRouter(prefix="/api/v1", tags=["Web Scraping"])

# Single AI client instance, created once when the module is imported.
ai_client = AIClient()

# Secret the X-API-Key header is compared against; None when the environment
# variable is not set.
AI_SCRAPER_API_KEY = os.environ.get("AI_SCRAPER_API_KEY")

# API key security scheme: reads the key from the X-API-Key request header.
api_key_header = APIKeyHeader(auto_error=True, name="X-API-Key")
async def verify_api_key(api_key: str = Depends(api_key_header)):
    """Check the X-API-Key header value against the configured secret.

    Args:
        api_key: Value supplied by the ``api_key_header`` security scheme.

    Returns:
        The supplied key, unchanged, when it matches ``AI_SCRAPER_API_KEY``.

    Raises:
        HTTPException: 401 with detail "Invalid API key" when the key does
            not match, or when no secret is configured.
    """
    # Function-scope import so this block does not depend on the file's
    # top-level import list.
    from secrets import compare_digest

    expected = AI_SCRAPER_API_KEY
    # Fail closed: an unset or empty secret (or an empty supplied key) must
    # never authenticate anyone.
    authorized = bool(expected) and bool(api_key)
    if authorized:
        # Constant-time comparison avoids leaking how many leading characters
        # matched. Compare bytes because compare_digest rejects non-ASCII str.
        authorized = compare_digest(
            api_key.encode("utf-8"), expected.encode("utf-8")
        )
    if not authorized:
        raise HTTPException(
            status_code=401,
            detail="Invalid API key",
        )
    return api_key
# Request body for the /generate-cheerio-script endpoint.
class CheerioScriptRequest(BaseModel):
    # HTML text to analyse; interpolated into the system prompt sent to the
    # AI client.
    html: str
    # Free-text description of what to extract; interpolated into both the
    # system prompt and the user prompt.
    user_input: str
# Response schema declared for the /generate-cheerio-script endpoint.
class CheerioScriptResponse(BaseModel):
    # Script text taken from inside the <cheerio_script> tags, or "" when no
    # script could be produced.
    cheerio_script: str
    # Outcome marker; the endpoint emits "success" or "error".
    status: str
    # Human-readable description of the outcome or of the failure.
    message: str
# Worked example embedded verbatim in the system prompt so the model sees the
# exact output format that is expected.
_CHEERIO_EXAMPLE = """
Input HTML:
<html>
<div class="product-card">
<h2 class="title">iPhone 14</h2>
<span class="price">$999</span>
</div>
</html>
Input Request: "extract product title and price"
Expected Output:
<cheerio_script>
function extract(input, cheerio) {
let result = {
success: false,
data: null,
error: null
};
try {
let $ = cheerio.load(input);
result.data = {
title: $('.product-card .title').text().trim() || null,
price: $('.product-card .price').text().trim() || null
};
result.success = true;
} catch (error) {
result.error = error.message;
}
return result;
}
</cheerio_script>
"""

# Non-greedy with DOTALL: captures the body of the first <cheerio_script>
# element even when it spans several lines. Compiled once at import time.
_CHEERIO_SCRIPT_RE = re.compile(r'<cheerio_script>(.*?)</cheerio_script>', re.DOTALL)


def _build_cheerio_prompts(html: str, user_input: str) -> tuple:
    """Return the ``(system_prompt, user_prompt)`` pair for one request."""
    system_prompt = f"""You are an expert at writing Cheerio.js web scraping scripts.
Task: Generate a Cheerio.js script to extract {user_input} from the provided HTML.
Requirements:
- Script must be wrapped in a function named 'extract' that takes (input, cheerio) parameters
- Return object must include: {{ success: boolean, data: object|null, error: string|null }}
- Use modern JavaScript syntax
- Include try-catch error handling
- Make the script reusable and efficient
- Enclose the entire script in <cheerio_script> tags
Here's an example of the expected format:
{_CHEERIO_EXAMPLE}
HTML to process:
{html}"""
    user_prompt = f"""Generate a Cheerio.js script to extract {user_input}.
The script must:
1. Be wrapped in a function named 'extract' that takes (input, cheerio) parameters
2. Return an object with success, data, and error fields
3. Handle missing elements by returning null
4. Use proper Cheerio selectors
5. Include error handling
6. Be enclosed in <cheerio_script> tags"""
    return system_prompt, user_prompt


def _cheerio_json(status_code: int, script: str, status: str, message: str) -> JSONResponse:
    """Build the JSON envelope shared by every outcome of the endpoint."""
    return JSONResponse(
        status_code=status_code,
        content={
            "cheerio_script": script,
            "status": status,
            "message": message,
        },
    )


@router.post("/generate-cheerio-script", response_model=CheerioScriptResponse)
async def generate_cheerio_script(
    request: CheerioScriptRequest,
    # NOTE(review): authentication is currently disabled for this endpoint.
    # Uncomment the dependency below to require a valid X-API-Key header.
    # api_key: str = Depends(verify_api_key)
):
    """Ask the AI client for a Cheerio.js extraction script and return it.

    Builds a system prompt (instructions, a worked example and the request's
    HTML) and a user prompt, sends both to ``ai_client.chat``, and extracts
    the text between the first pair of ``<cheerio_script>`` tags in the reply.

    Args:
        request: HTML to analyse and a description of what to extract.

    Returns:
        A ``JSONResponse`` with keys ``cheerio_script``, ``status`` and
        ``message``:
        - HTTP 200, status "success": a script was found in the reply.
        - HTTP 200, status "error": the reply contained no tagged script;
          the raw reply is included in ``message``.
        - HTTP 500, status "error": any exception raised while generating.
    """
    # Function-scope import so this block does not depend on the file's
    # top-level import list. fastapi is already a dependency of this module.
    from fastapi.concurrency import run_in_threadpool

    try:
        system_prompt, user_prompt = _build_cheerio_prompts(
            request.html, request.user_input
        )

        # ai_client.chat is called without await and its result is used
        # directly, i.e. it is a blocking call. Run it in a worker thread so
        # the event loop keeps serving other requests while waiting.
        # NOTE(review): assumes AIClient.chat may be called from a worker
        # thread - confirm against helpers.ai_client.
        response = await run_in_threadpool(
            ai_client.chat,
            prompt=user_prompt,
            system_message=system_prompt,
            # Alternatives noted in the original source:
            # "deepseek/deepseek-chat", "google/gemini-pro-1.5".
            model_id="openai/gpt-4o-mini",
        )

        found = _CHEERIO_SCRIPT_RE.search(response)
        if found is None:
            # Reported with HTTP 200: the request itself succeeded, but the
            # model reply did not contain a tagged script.
            return _cheerio_json(
                200,
                "",
                "error",
                f"No valid Cheerio script found in response: {response}",
            )

        return _cheerio_json(
            200,
            found.group(1).strip(),
            "success",
            "Cheerio script generated successfully",
        )
    except Exception as e:
        # Top-level boundary for this endpoint: log with traceback and return
        # the same JSON envelope instead of letting the exception propagate.
        logger.exception("Error generating Cheerio script: %s", e)
        return _cheerio_json(
            500,
            "",
            "error",
            f"Error generating script: {str(e)}",
        )