Spaces:
Running
Running
# routers/scraping_router.py
from fastapi import APIRouter, HTTPException, Header | |
from pydantic import BaseModel | |
import os | |
from typing import Optional | |
import logging | |
import re | |
from helpers.ai_client import AIClient | |
# Configure logging for the process and grab this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Router for the web-scraping endpoints; every route declared on it is
# served under the /api/v1 prefix and grouped under the "Web Scraping" tag.
router = APIRouter(prefix="/api/v1", tags=["Web Scraping"])

# Single AI client instance, created once at import time and shared by the
# handlers in this module.
ai_client = AIClient()
class CheerioScriptRequest(BaseModel):
    """Request body for the Cheerio script generation handler."""

    # HTML document that is embedded into the prompt sent to the model.
    html: str

    # Description of what should be extracted from the HTML.
    user_input: str

    # Optional API key carried in the request body; defaults to None.
    api_key: Optional[str] = None
async def generate_cheerio_script(
    request: CheerioScriptRequest,
    x_api_key: Optional[str] = Header(None),
):
    """Generate a Cheerio script that extracts the requested data from HTML.

    Builds a prompt from ``request.user_input`` and ``request.html``, sends it
    to the ``openai/gpt-4o-mini`` model through the module-level ``ai_client``
    and returns the text found between ``<cheerio_script>`` tags in the reply.

    Args:
        request: Payload carrying the HTML and the extraction description.
        x_api_key: API key taken from the request headers (``None`` when the
            header is absent); forwarded to ``ai_client.chat`` unchanged.

    Returns:
        ``{"cheerio_script": <script>}`` with surrounding whitespace stripped.

    Raises:
        HTTPException: 422 when the model reply contains no
            ``<cheerio_script>...</cheerio_script>`` section; 500 when the
            model call or the extraction fails with any other exception.
    """
    # NOTE(review): no route decorator is visible on this handler in the
    # reviewed source - confirm it is registered on ``router`` elsewhere.
    # NOTE(review): ``request.api_key`` is accepted but never used; only the
    # header value is forwarded. Confirm whether a fallback is intended.
    system_prompt = (
        f"return cheerio script to extract {request.user_input} from the following html,\n"
        f"enclose the cheerio script in <cheerio_script> .. </cheerio_script> format, {request.html}"
    )

    try:
        # NOTE(review): ``chat`` is not awaited, so it presumably is a
        # synchronous call that blocks the event loop while the model
        # responds - verify against ``helpers.ai_client``.
        response = ai_client.chat(
            prompt=request.user_input,
            system_message=system_prompt,
            model_id="openai/gpt-4o-mini",
            api_key=x_api_key,
        )
        # DOTALL lets the script span multiple lines; the non-greedy group
        # stops at the first closing tag.
        found = re.search(
            r'<cheerio_script>(.*?)</cheerio_script>', response, re.DOTALL
        )
    except HTTPException:
        # Preserve the status code of HTTP errors instead of masking them
        # as a generic 500.
        raise
    except Exception as exc:
        logger.exception("Error generating Cheerio script: %s", exc)
        raise HTTPException(status_code=500, detail=str(exc)) from exc

    # Raised outside the ``try`` so the 422 reaches the client as-is rather
    # than being caught by the broad handler and rewritten to a 500.
    if found is None:
        raise HTTPException(
            status_code=422,
            detail="No valid Cheerio script found in the response",
        )

    return {"cheerio_script": found.group(1).strip()}