File size: 4,861 Bytes
1e74012
4b0daa3
b29fd10
4b0daa3
1e74012
 
 
 
4b0daa3
1e74012
 
 
 
 
 
 
 
 
 
 
4b0daa3
 
 
 
 
 
 
 
 
 
 
 
1e74012
 
 
 
 
b29fd10
 
 
 
 
 
1e74012
 
f924c30
1e74012
 
b29fd10
 
7924068
b29fd10
 
 
 
7924068
b29fd10
 
 
7924068
 
b29fd10
 
7924068
b29fd10
7924068
 
 
 
 
 
 
 
 
 
 
 
 
b29fd10
 
 
7924068
b29fd10
 
 
 
7924068
 
b29fd10
 
 
7924068
b29fd10
 
 
 
 
 
 
 
 
7924068
 
 
 
 
 
0ce5866
 
4e129bc
8f70af7
2f3056d
92ee903
1e74012
 
 
 
 
 
 
b29fd10
 
 
 
 
 
 
 
1e74012
b29fd10
360e292
b29fd10
 
 
0ce5866
b29fd10
1e74012
 
 
 
b29fd10
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
# routers/scraping_router.py
import logging
import os
import re
import secrets
from typing import Optional

from fastapi import APIRouter, HTTPException, Header, Depends
from fastapi.concurrency import run_in_threadpool
from fastapi.responses import JSONResponse
from fastapi.security import APIKeyHeader
from pydantic import BaseModel

from helpers.ai_client import AIClient

# Configure root logging at INFO and create this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Router for this module: prefix "/api/v1", tagged "Web Scraping".
router = APIRouter(prefix="/api/v1", tags=["Web Scraping"])

# Single AI client instance, created once at import time and shared by handlers.
ai_client = AIClient()

# Expected API key, read from the environment at import time (None when unset).
AI_SCRAPER_API_KEY = os.environ.get("AI_SCRAPER_API_KEY")

# Security scheme that reads the caller's key from the X-API-Key header.
api_key_header = APIKeyHeader(name="X-API-Key", auto_error=True)

async def verify_api_key(api_key: str = Depends(api_key_header)):
    """Validate the API key supplied in the ``X-API-Key`` header.

    Written as a FastAPI dependency: the key is obtained from
    ``api_key_header`` and compared against ``AI_SCRAPER_API_KEY``.

    Args:
        api_key: Key extracted from the request by ``api_key_header``.

    Returns:
        The supplied key, unchanged, when it matches the configured key.

    Raises:
        HTTPException: 401 when the key does not match, or when no expected
            key is configured (the check fails closed).
    """
    expected = AI_SCRAPER_API_KEY

    # Compare as bytes in constant time: a plain `!=` leaks how many leading
    # characters matched, and compare_digest on str rejects non-ASCII input
    # with a TypeError instead of a clean 401.
    key_is_valid = (
        bool(expected)
        and isinstance(api_key, str)
        and secrets.compare_digest(
            api_key.encode("utf-8"), expected.encode("utf-8")
        )
    )
    if not key_is_valid:
        raise HTTPException(
            status_code=401,
            detail="Invalid API key"
        )
    return api_key

# Request body accepted by the generate-cheerio-script endpoint.
class CheerioScriptRequest(BaseModel):
    # HTML document to scrape; it is embedded verbatim in the system prompt.
    html: str
    # Description of what to extract; interpolated into both the system and
    # user prompts.
    user_input: str

# Response payload declared as the response_model of the
# generate-cheerio-script endpoint.
class CheerioScriptResponse(BaseModel):
    # Generated script text; the handler sends "" on its error paths.
    cheerio_script: str
    # Outcome marker; the handler sends "success" or "error".
    status: str
    # Human-readable outcome or error description.
    message: str

# Worked example embedded verbatim in the system prompt. The indentation and
# trailing whitespace are part of the prompt text, so the literal is kept
# exactly as it was originally written.
_CHEERIO_EXAMPLE = """
        Input HTML:
        <html>
        <div class="product-card">
            <h2 class="title">iPhone 14</h2>
            <span class="price">$999</span>
        </div>
        </html>
        Input Request: "extract product title and price"
        Expected Output:
        <cheerio_script>
        function extract(input, cheerio) {
            let result = {
                success: false,
                data: null,
                error: null
            };
            
            try {
                let $ = cheerio.load(input);
                result.data = {
                    title: $('.product-card .title').text().trim() || null,
                    price: $('.product-card .price').text().trim() || null
                };
                result.success = true;
            } catch (error) {
                result.error = error.message;
            }
            
            return result;
        }
        </cheerio_script>
        """

# Compiled once at import; DOTALL lets the captured script span several lines.
_CHEERIO_SCRIPT_RE = re.compile(r'<cheerio_script>(.*?)</cheerio_script>', re.DOTALL)


def _build_system_prompt(html: str, user_input: str) -> str:
    """Return the system prompt: task, requirements, example, and the HTML."""
    return f"""You are an expert at writing Cheerio.js web scraping scripts.
        Task: Generate a Cheerio.js script to extract {user_input} from the provided HTML.
        
        Requirements:
        - Script must be wrapped in a function named 'extract' that takes (input, cheerio) parameters
        - Return object must include: {{ success: boolean, data: object|null, error: string|null }}
        - Use modern JavaScript syntax
        - Include try-catch error handling
        - Make the script reusable and efficient
        - Enclose the entire script in <cheerio_script> tags
        
        Here's an example of the expected format:
        {_CHEERIO_EXAMPLE}
        
        HTML to process:
        {html}"""


def _build_user_prompt(user_input: str) -> str:
    """Return the user prompt restating the extraction request and rules."""
    return f"""Generate a Cheerio.js script to extract {user_input}.
        The script must:
        1. Be wrapped in a function named 'extract' that takes (input, cheerio) parameters
        2. Return an object with success, data, and error fields
        3. Handle missing elements by returning null
        4. Use proper Cheerio selectors
        5. Include error handling
        6. Be enclosed in <cheerio_script> tags"""


def _extract_cheerio_script(response: str) -> Optional[str]:
    """Return the first <cheerio_script> payload, stripped, or None if absent."""
    found = _CHEERIO_SCRIPT_RE.search(response)
    return found.group(1).strip() if found else None


def _script_response(
    status_code: int, cheerio_script: str, status: str, message: str
) -> JSONResponse:
    """Build the JSON payload shared by every outcome of the endpoint."""
    return JSONResponse(
        status_code=status_code,
        content={
            "cheerio_script": cheerio_script,
            "status": status,
            "message": message,
        },
    )


@router.post("/generate-cheerio-script", response_model=CheerioScriptResponse)
async def generate_cheerio_script(
    request: CheerioScriptRequest,
    # NOTE(review): authentication is currently disabled for this endpoint.
    # Re-enable the dependency below to require the X-API-Key header.
    #api_key: str = Depends(verify_api_key)
):
    """Ask the AI client for a Cheerio.js extraction script for the given HTML.

    Builds a system prompt (requirements, worked example, the HTML) and a user
    prompt from the request, sends both to ``ai_client.chat``, and returns the
    text found between ``<cheerio_script>`` tags in the reply.

    Args:
        request: The HTML to scrape and a description of what to extract.

    Returns:
        A ``JSONResponse`` with ``cheerio_script``, ``status`` and ``message``:
        200 with status "success" when a script was found, 200 with status
        "error" when the reply contained no ``<cheerio_script>`` block, and
        500 with status "error" when any exception was raised.
    """
    try:
        system_prompt = _build_system_prompt(request.html, request.user_input)
        user_prompt = _build_user_prompt(request.user_input)

        # ai_client.chat is called synchronously; running it directly inside
        # this coroutine would block the event loop for the whole model
        # round-trip, so it is handed to the worker thread pool instead.
        # NOTE(review): assumes AIClient.chat may be called from a worker
        # thread on the shared instance - confirm.
        response = await run_in_threadpool(
            ai_client.chat,
            prompt=user_prompt,
            system_message=system_prompt,
            # Alternatives noted previously: deepseek/deepseek-chat,
            # google/gemini-pro-1.5.
            model_id="openai/gpt-4o-mini",
        )

        cheerio_script = _extract_cheerio_script(response)

        # `is None` rather than truthiness: an empty <cheerio_script></cheerio_script>
        # pair is still reported as a (blank) success, as before.
        if cheerio_script is None:
            return _script_response(
                200,
                "",
                "error",
                f"No valid Cheerio script found in response: {response}",
            )
        return _script_response(
            200,
            cheerio_script,
            "success",
            "Cheerio script generated successfully",
        )

    except Exception as e:
        # logger.exception records the traceback along with the message.
        logger.exception("Error generating Cheerio script: %s", e)
        return _script_response(
            500,
            "",
            "error",
            f"Error generating script: {str(e)}",
        )