Spaces:
Runtime error
Runtime error
import os | |
import time | |
import json | |
from crawl4ai.web_crawler import WebCrawler | |
from crawl4ai.chunking_strategy import * | |
from crawl4ai.extraction_strategy import * | |
from crawl4ai.crawler_strategy import * | |
# Page to crawl and summarize.
url = "https://marketplace.visualstudio.com/items?itemName=Unclecode.groqopilot"

# Build the crawler once and warm it up before the first run() call.
crawler = WebCrawler()
crawler.warmup()
from pydantic import BaseModel, Field | |
# Shape of the structured summary requested from the LLM. The JSON schema
# generated from this model is handed to the extraction strategy, and each
# field's description travels with it. None of the fields declares a default,
# so all four are required.
class PageSummary(BaseModel):
    title: str = Field(description="Title of the page.")
    summary: str = Field(description="Summary of the page.")
    brief_summary: str = Field(description="Brief summary of the page.")
    keywords: list = Field(description="Keywords assigned to the page.")
# Crawl the page and let the LLM condense it into a JSON document that follows
# the PageSummary schema. The cache is bypassed so every run fetches the page
# again.
result = crawler.run(
    url=url,
    word_count_threshold=1,
    extraction_strategy=LLMExtractionStrategy(
        provider="openai/gpt-4o",
        api_token=os.getenv("OPENAI_API_KEY"),
        schema=PageSummary.model_json_schema(),
        extraction_type="schema",
        apply_chunking=False,
        instruction=(
            "From the crawled content, extract the following details: "
            "1. Title of the page "
            "2. Summary of the page, which is a detailed summary "
            "3. Brief summary of the page, which is a paragraph text "
            "4. Keywords assigned to the page, which is a list of keywords. "
            'The extracted JSON format should look like this: '
            '{ "title": "Page Title", "summary": "Detailed summary of the page.", "brief_summary": "Brief summary in a paragraph.", "keywords": ["keyword1", "keyword2", "keyword3"] }'
        ),
    ),
    bypass_cache=True,
)

# Fail with an explicit message when nothing was extracted, instead of letting
# json.loads() raise an opaque TypeError / JSONDecodeError on None or "".
extracted = result.extracted_content
if not extracted:
    raise RuntimeError(
        "Crawl finished without extracted content; cannot build the page summary."
    )

page_summary = json.loads(extracted)
print(page_summary)

# open(..., "w") does not create missing parent directories, so make sure the
# output directory exists before writing the raw JSON text.
os.makedirs(".data", exist_ok=True)
with open(".data/page_summary.json", "w", encoding="utf-8") as f:
    f.write(extracted)