pvanand committed on
Commit
b29fd10
·
verified ·
1 Parent(s): 8f70af7

Update routers/scraping_router.py

Browse files
Files changed (1) hide show
  1. routers/scraping_router.py +85 -13
routers/scraping_router.py CHANGED
@@ -1,13 +1,12 @@
1
  # routers/scraping_router.py
2
  from fastapi import APIRouter, HTTPException, Header
 
3
  from pydantic import BaseModel
4
- import os
5
  from typing import Optional
6
  import logging
7
  import re
8
  from helpers.ai_client import AIClient
9
 
10
- # Set up logging
11
  logging.basicConfig(level=logging.INFO)
12
  logger = logging.getLogger(__name__)
13
 
@@ -16,7 +15,6 @@ router = APIRouter(
16
  tags=["Web Scraping"]
17
  )
18
 
19
- # Initialize AI client
20
  ai_client = AIClient()
21
 
22
  class CheerioScriptRequest(BaseModel):
@@ -24,16 +22,73 @@ class CheerioScriptRequest(BaseModel):
24
  user_input: str
25
  api_key: Optional[str] = None
26
 
27
- @router.post("/generate-cheerio-script")
 
 
 
 
 
28
  async def generate_cheerio_script(
29
  request: CheerioScriptRequest,
30
  x_api_key: str = Header(None)
31
  ):
32
  try:
33
- system_prompt = f"""return cheerio js script to extract {request.user_input} from the following html,
34
- enclose the cheerio script in <cheerio_script> .. </cheerio_script> format, {request.html}"""
35
- user_prompt = "return valid cherio js script enclosed in <cheerio_script> </cheerio_script> format"
36
- response = ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  response = ai_client.chat(
38
  prompt=user_prompt,
39
  system_message=system_prompt,
@@ -41,19 +96,36 @@ async def generate_cheerio_script(
41
  api_key=x_api_key
42
  )
43
 
44
- # Extract Cheerio script using regex
45
  cheerio_pattern = r'<cheerio_script>(.*?)</cheerio_script>'
46
  matches = re.search(cheerio_pattern, response, re.DOTALL)
47
 
48
  if matches:
49
  cheerio_script = matches.group(1).strip()
50
- return {"cheerio_script": cheerio_script}
 
 
 
 
 
 
 
51
  else:
52
- raise HTTPException(
53
  status_code=422,
54
- detail=f"No valid Cheerio script found in the response: {response}"
 
 
 
 
55
  )
56
 
57
  except Exception as e:
58
  logger.error(f"Error generating Cheerio script: {e}")
59
- raise HTTPException(status_code=500, detail=str(e))
 
 
 
 
 
 
 
 
1
  # routers/scraping_router.py
2
  from fastapi import APIRouter, HTTPException, Header
3
+ from fastapi.responses import JSONResponse
4
  from pydantic import BaseModel
 
5
  from typing import Optional
6
  import logging
7
  import re
8
  from helpers.ai_client import AIClient
9
 
 
10
  logging.basicConfig(level=logging.INFO)
11
  logger = logging.getLogger(__name__)
12
 
 
15
  tags=["Web Scraping"]
16
  )
17
 
 
18
  ai_client = AIClient()
19
 
20
  class CheerioScriptRequest(BaseModel):
 
22
  user_input: str
23
  api_key: Optional[str] = None
24
 
25
+ class CheerioScriptResponse(BaseModel):
26
+ cheerio_script: str
27
+ status: str
28
+ message: str
29
+
30
+ @router.post("/generate-cheerio-script", response_model=CheerioScriptResponse)
31
  async def generate_cheerio_script(
32
  request: CheerioScriptRequest,
33
  x_api_key: str = Header(None)
34
  ):
35
  try:
36
+ example = """
37
+ Input HTML:
38
+ <div class="product-card">
39
+ <h2 class="title">iPhone 14</h2>
40
+ <span class="price">$999</span>
41
+ </div>
42
+
43
+ Input Request: "extract product title and price"
44
+
45
+ Expected Output:
46
+ <cheerio_script>
47
+ let result = {};
48
+ try {
49
+ const productCard = $('.product-card');
50
+ result = {
51
+ success: true,
52
+ data: {
53
+ title: productCard.find('.title').text().trim() || null,
54
+ price: productCard.find('.price').text().trim() || null
55
+ },
56
+ error: null
57
+ };
58
+ } catch (error) {
59
+ result = {
60
+ success: false,
61
+ data: null,
62
+ error: error.message
63
+ };
64
+ }
65
+ return result;
66
+ </cheerio_script>
67
+ """
68
+
69
+ system_prompt = f"""You are an expert at writing Cheerio.js web scraping scripts.
70
+ Task: Generate a Cheerio.js script to extract {request.user_input} from the provided HTML.
71
+
72
+ Requirements:
73
+ - Return a dictionary/object with the structure: {{ success: boolean, data: object|null, error: string|null }}
74
+ - Use modern JavaScript syntax
75
+ - Include try-catch error handling
76
+ - Make the script reusable and efficient
77
+ - Enclose the script in <cheerio_script> tags
78
+
79
+ Here's an example of the expected format:
80
+ {example}
81
+
82
+ HTML to process:
83
+ {request.html}"""
84
+
85
+ user_prompt = f"""Generate a Cheerio.js script to extract {request.user_input}.
86
+ The script must:
87
+ 1. Return a dictionary/object with success, data, and error fields
88
+ 2. Handle missing elements gracefully
89
+ 3. Use proper Cheerio selectors
90
+ 4. Be enclosed in <cheerio_script> tags"""
91
+
92
  response = ai_client.chat(
93
  prompt=user_prompt,
94
  system_message=system_prompt,
 
96
  api_key=x_api_key
97
  )
98
 
 
99
  cheerio_pattern = r'<cheerio_script>(.*?)</cheerio_script>'
100
  matches = re.search(cheerio_pattern, response, re.DOTALL)
101
 
102
  if matches:
103
  cheerio_script = matches.group(1).strip()
104
+ return JSONResponse(
105
+ status_code=200,
106
+ content={
107
+ "cheerio_script": cheerio_script,
108
+ "status": "success",
109
+ "message": "Cheerio script generated successfully"
110
+ }
111
+ )
112
  else:
113
+ return JSONResponse(
114
  status_code=422,
115
+ content={
116
+ "cheerio_script": "",
117
+ "status": "error",
118
+ "message": "No valid Cheerio script found in response"
119
+ }
120
  )
121
 
122
  except Exception as e:
123
  logger.error(f"Error generating Cheerio script: {e}")
124
+ return JSONResponse(
125
+ status_code=500,
126
+ content={
127
+ "cheerio_script": "",
128
+ "status": "error",
129
+ "message": f"Error generating script: {str(e)}"
130
+ }
131
+ )