pvanand committed on
Commit
1e74012
·
verified ·
1 Parent(s): 71c73b5

Create scraping_router.py

Browse files
Files changed (1) hide show
  1. routers/scraping_router.py +58 -0
routers/scraping_router.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # routers/scraping_router.py
2
+ from fastapi import APIRouter, HTTPException, Header
3
+ from pydantic import BaseModel
4
+ import os
5
+ from typing import Optional
6
+ import logging
7
+ import re
8
+ from helpers.ai_client import AIClient
9
+
10
# Logging for this module.
# NOTE(review): basicConfig configures the root logger as an import-time side
# effect; confirm that this is intended for a router module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Routes registered on this router share the /api/v1 prefix and the
# "Web Scraping" tag.
router = APIRouter(prefix="/api/v1", tags=["Web Scraping"])

# Single AI client instance shared by the handlers in this module.
ai_client = AIClient()
21
+
22
class CheerioScriptRequest(BaseModel):
    """Request body accepted by the Cheerio script generation endpoint."""

    # HTML document the generated script should extract data from.
    html: str
    # Free-text description of what should be extracted from the HTML.
    user_input: str
    # Optional API key supplied in the body; None when omitted.
    api_key: Optional[str] = None
26
+
27
@router.post("/generate-cheerio-script")
async def generate_cheerio_script(
    request: CheerioScriptRequest,
    x_api_key: Optional[str] = Header(None)
):
    """Generate a Cheerio script that extracts the requested data from HTML.

    Builds a system prompt from the request's ``user_input`` and ``html``,
    sends it through the module-level ``ai_client``, and returns the text
    found between ``<cheerio_script>`` and ``</cheerio_script>`` in the reply.

    Args:
        request: Body carrying the HTML, the extraction description, and an
            optional API key.
        x_api_key: API key from the ``X-API-Key`` header. When absent, the
            ``api_key`` field of the request body is used instead.

    Returns:
        ``{"cheerio_script": <script text>}`` with surrounding whitespace
        stripped from the script.

    Raises:
        HTTPException: 422 when the reply contains no
            ``<cheerio_script>`` block; 500 when calling the AI client or
            parsing its reply fails.
    """
    # The header takes precedence; the body field is the fallback so that a
    # key supplied in the request body is not silently ignored.
    api_key = x_api_key or request.api_key

    system_prompt = f"""return cheerio script to extract {request.user_input} from the following html,
        enclose the cheerio script in <cheerio_script> .. </cheerio_script> format, {request.html}"""

    # DOTALL lets the script span multiple lines; the non-greedy group stops
    # at the first closing tag.
    cheerio_pattern = r'<cheerio_script>(.*?)</cheerio_script>'

    try:
        # NOTE(review): send_prompt is not awaited, so it appears to be a
        # synchronous call; if it performs network I/O it blocks the event
        # loop for the duration of the request -- confirm.
        # NOTE(review): "openai/gpt-4-mini" may be a typo for
        # "openai/gpt-4o-mini" -- verify against the models AIClient accepts.
        response = ai_client.send_prompt(
            prompt=request.user_input,
            system_message=system_prompt,
            model_id="openai/gpt-4-mini",
            api_key=api_key
        )
        matches = re.search(cheerio_pattern, response, re.DOTALL)
    except Exception as e:
        logger.exception("Error generating Cheerio script: %s", e)
        raise HTTPException(status_code=500, detail=str(e)) from e

    # Raised outside the try block so the 422 reaches the client as-is
    # instead of being caught above and re-reported as a 500.
    if matches is None:
        raise HTTPException(
            status_code=422,
            detail="No valid Cheerio script found in the response"
        )

    return {"cheerio_script": matches.group(1).strip()}