Spaces:
Sleeping
Sleeping
Update routers/scraping_router.py
Browse files- routers/scraping_router.py +85 -13
routers/scraping_router.py
CHANGED
@@ -1,13 +1,12 @@
|
|
1 |
# routers/scraping_router.py
|
2 |
from fastapi import APIRouter, HTTPException, Header
|
|
|
3 |
from pydantic import BaseModel
|
4 |
-
import os
|
5 |
from typing import Optional
|
6 |
import logging
|
7 |
import re
|
8 |
from helpers.ai_client import AIClient
|
9 |
|
10 |
-
# Set up logging
|
11 |
logging.basicConfig(level=logging.INFO)
|
12 |
logger = logging.getLogger(__name__)
|
13 |
|
@@ -16,7 +15,6 @@ router = APIRouter(
|
|
16 |
tags=["Web Scraping"]
|
17 |
)
|
18 |
|
19 |
-
# Initialize AI client
|
20 |
ai_client = AIClient()
|
21 |
|
22 |
class CheerioScriptRequest(BaseModel):
|
@@ -24,16 +22,73 @@ class CheerioScriptRequest(BaseModel):
|
|
24 |
user_input: str
|
25 |
api_key: Optional[str] = None
|
26 |
|
27 |
-
|
|
|
|
|
|
|
|
|
|
|
28 |
async def generate_cheerio_script(
|
29 |
request: CheerioScriptRequest,
|
30 |
x_api_key: str = Header(None)
|
31 |
):
|
32 |
try:
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
response = ai_client.chat(
|
38 |
prompt=user_prompt,
|
39 |
system_message=system_prompt,
|
@@ -41,19 +96,36 @@ async def generate_cheerio_script(
|
|
41 |
api_key=x_api_key
|
42 |
)
|
43 |
|
44 |
-
# Extract Cheerio script using regex
|
45 |
cheerio_pattern = r'<cheerio_script>(.*?)</cheerio_script>'
|
46 |
matches = re.search(cheerio_pattern, response, re.DOTALL)
|
47 |
|
48 |
if matches:
|
49 |
cheerio_script = matches.group(1).strip()
|
50 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
else:
|
52 |
-
|
53 |
status_code=422,
|
54 |
-
|
|
|
|
|
|
|
|
|
55 |
)
|
56 |
|
57 |
except Exception as e:
|
58 |
logger.error(f"Error generating Cheerio script: {e}")
|
59 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
# routers/scraping_router.py
|
2 |
from fastapi import APIRouter, HTTPException, Header
|
3 |
+
from fastapi.responses import JSONResponse
|
4 |
from pydantic import BaseModel
|
|
|
5 |
from typing import Optional
|
6 |
import logging
|
7 |
import re
|
8 |
from helpers.ai_client import AIClient
|
9 |
|
|
|
10 |
logging.basicConfig(level=logging.INFO)
|
11 |
logger = logging.getLogger(__name__)
|
12 |
|
|
|
15 |
tags=["Web Scraping"]
|
16 |
)
|
17 |
|
|
|
18 |
ai_client = AIClient()
|
19 |
|
20 |
class CheerioScriptRequest(BaseModel):
|
|
|
22 |
user_input: str
|
23 |
api_key: Optional[str] = None
|
24 |
|
25 |
+
class CheerioScriptResponse(BaseModel):
|
26 |
+
cheerio_script: str
|
27 |
+
status: str
|
28 |
+
message: str
|
29 |
+
|
30 |
+
@router.post("/generate-cheerio-script", response_model=CheerioScriptResponse)
|
31 |
async def generate_cheerio_script(
|
32 |
request: CheerioScriptRequest,
|
33 |
x_api_key: str = Header(None)
|
34 |
):
|
35 |
try:
|
36 |
+
example = """
|
37 |
+
Input HTML:
|
38 |
+
<div class="product-card">
|
39 |
+
<h2 class="title">iPhone 14</h2>
|
40 |
+
<span class="price">$999</span>
|
41 |
+
</div>
|
42 |
+
|
43 |
+
Input Request: "extract product title and price"
|
44 |
+
|
45 |
+
Expected Output:
|
46 |
+
<cheerio_script>
|
47 |
+
let result = {};
|
48 |
+
try {
|
49 |
+
const productCard = $('.product-card');
|
50 |
+
result = {
|
51 |
+
success: true,
|
52 |
+
data: {
|
53 |
+
title: productCard.find('.title').text().trim() || null,
|
54 |
+
price: productCard.find('.price').text().trim() || null
|
55 |
+
},
|
56 |
+
error: null
|
57 |
+
};
|
58 |
+
} catch (error) {
|
59 |
+
result = {
|
60 |
+
success: false,
|
61 |
+
data: null,
|
62 |
+
error: error.message
|
63 |
+
};
|
64 |
+
}
|
65 |
+
return result;
|
66 |
+
</cheerio_script>
|
67 |
+
"""
|
68 |
+
|
69 |
+
system_prompt = f"""You are an expert at writing Cheerio.js web scraping scripts.
|
70 |
+
Task: Generate a Cheerio.js script to extract {request.user_input} from the provided HTML.
|
71 |
+
|
72 |
+
Requirements:
|
73 |
+
- Return a dictionary/object with the structure: {{ success: boolean, data: object|null, error: string|null }}
|
74 |
+
- Use modern JavaScript syntax
|
75 |
+
- Include try-catch error handling
|
76 |
+
- Make the script reusable and efficient
|
77 |
+
- Enclose the script in <cheerio_script> tags
|
78 |
+
|
79 |
+
Here's an example of the expected format:
|
80 |
+
{example}
|
81 |
+
|
82 |
+
HTML to process:
|
83 |
+
{request.html}"""
|
84 |
+
|
85 |
+
user_prompt = f"""Generate a Cheerio.js script to extract {request.user_input}.
|
86 |
+
The script must:
|
87 |
+
1. Return a dictionary/object with success, data, and error fields
|
88 |
+
2. Handle missing elements gracefully
|
89 |
+
3. Use proper Cheerio selectors
|
90 |
+
4. Be enclosed in <cheerio_script> tags"""
|
91 |
+
|
92 |
response = ai_client.chat(
|
93 |
prompt=user_prompt,
|
94 |
system_message=system_prompt,
|
|
|
96 |
api_key=x_api_key
|
97 |
)
|
98 |
|
|
|
99 |
cheerio_pattern = r'<cheerio_script>(.*?)</cheerio_script>'
|
100 |
matches = re.search(cheerio_pattern, response, re.DOTALL)
|
101 |
|
102 |
if matches:
|
103 |
cheerio_script = matches.group(1).strip()
|
104 |
+
return JSONResponse(
|
105 |
+
status_code=200,
|
106 |
+
content={
|
107 |
+
"cheerio_script": cheerio_script,
|
108 |
+
"status": "success",
|
109 |
+
"message": "Cheerio script generated successfully"
|
110 |
+
}
|
111 |
+
)
|
112 |
else:
|
113 |
+
return JSONResponse(
|
114 |
status_code=422,
|
115 |
+
content={
|
116 |
+
"cheerio_script": "",
|
117 |
+
"status": "error",
|
118 |
+
"message": "No valid Cheerio script found in response"
|
119 |
+
}
|
120 |
)
|
121 |
|
122 |
except Exception as e:
|
123 |
logger.error(f"Error generating Cheerio script: {e}")
|
124 |
+
return JSONResponse(
|
125 |
+
status_code=500,
|
126 |
+
content={
|
127 |
+
"cheerio_script": "",
|
128 |
+
"status": "error",
|
129 |
+
"message": f"Error generating script: {str(e)}"
|
130 |
+
}
|
131 |
+
)
|