Spaces:
Running
Running
File size: 1,763 Bytes
1e74012 4e129bc 1e74012 f475beb 1e74012 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 |
# routers/scraping_router.py
from fastapi import APIRouter, HTTPException, Header
from pydantic import BaseModel
import os
from typing import Optional
import logging
import re
from helpers.ai_client import AIClient
# Module-wide logging: configure the root logger at INFO and create a logger
# named after this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Every route registered on this router is served under /api/v1 and tagged
# "Web Scraping".
router = APIRouter(prefix="/api/v1", tags=["Web Scraping"])

# One AI client instance, created at import time and used by the handler
# defined in this module.
ai_client = AIClient()
class CheerioScriptRequest(BaseModel):
    """Request body for the Cheerio script generation endpoint."""

    # Raw HTML that is embedded into the prompt sent to the AI client.
    html: str
    # Description of what should be extracted from the HTML; also sent as the
    # user prompt.
    user_input: str
    # Optional API key carried in the body.
    # NOTE(review): the handler in this module forwards the header value and
    # never reads this field; confirm the intended use.
    api_key: Optional[str] = None
@router.post("/generate-cheerio-script")
async def generate_cheerio_script(
    request: CheerioScriptRequest,
    x_api_key: Optional[str] = Header(None),
):
    """Ask the AI client for a Cheerio script that extracts data from HTML.

    A prompt is built from ``request.user_input`` and ``request.html``, sent
    to ``ai_client.chat`` with the ``openai/gpt-4o-mini`` model, and the text
    found between ``<cheerio_script>`` and ``</cheerio_script>`` in the reply
    is returned.

    Args:
        request: Body holding the HTML and the description of what to extract.
        x_api_key: API key taken from the request header and forwarded to the
            AI client; ``None`` when the header is absent.

    Returns:
        ``{"cheerio_script": <script>}`` where the script is the first tagged
        section of the reply with surrounding whitespace stripped.

    Raises:
        HTTPException: 422 when the reply contains no ``<cheerio_script>``
            section; 500 when the AI call or the extraction itself fails.
    """
    # NOTE(review): request.api_key is not consulted here; only the header
    # value is forwarded. Confirm whether a body-level fallback is intended.
    system_prompt = f"""return cheerio script to extract {request.user_input} from the following html,
enclose the cheerio script in <cheerio_script> .. </cheerio_script> format, {request.html}"""

    # Non-greedy capture of the first tagged section; DOTALL lets the script
    # span multiple lines.
    cheerio_pattern = r'<cheerio_script>(.*?)</cheerio_script>'

    try:
        # NOTE(review): this looks like a synchronous call inside an async
        # handler; confirm it does not block the event loop.
        response = ai_client.chat(
            prompt=request.user_input,
            system_message=system_prompt,
            model_id="openai/gpt-4o-mini",
            api_key=x_api_key,
        )
        matches = re.search(cheerio_pattern, response, re.DOTALL)
    except Exception as e:
        logger.exception("Error generating Cheerio script: %s", e)
        raise HTTPException(status_code=500, detail=str(e)) from e

    # Raised outside the try block so the 422 reaches the client instead of
    # being caught by the generic handler above and converted into a 500.
    if matches is None:
        raise HTTPException(
            status_code=422,
            detail="No valid Cheerio script found in the response",
        )

    return {"cheerio_script": matches.group(1).strip()}