import os
import time
import json
import base64
from functools import lru_cache

from crawl4ai.web_crawler import WebCrawler
from crawl4ai.chunking_strategy import *
from crawl4ai.extraction_strategy import *
from crawl4ai.crawler_strategy import *
from rich import print
from rich.console import Console

console = Console()

@lru_cache()
def create_crawler():
    crawler = WebCrawler(verbose=True)
    crawler.warmup()
    return crawler

def print_result(result):
    # Print each key on one line, showing just the first 20 characters of its value followed by an ellipsis
    console.print(f"\t[bold]Result:[/bold]")
    for key, value in result.model_dump().items():
        if isinstance(value, str) and value:
            console.print(f"\t{key}: [green]{value[:20]}...[/green]")
    if result.extracted_content:
        items = json.loads(result.extracted_content)
        print(f"\t[bold]{len(items)} blocks extracted![/bold]")

def cprint(message, press_any_key=False):
    console.print(message)
    if press_any_key:
        console.print("Press any key to continue...", style="")
        input()

def basic_usage(crawler):
    cprint("🛠️ [bold cyan]Basic Usage: Simply provide a URL and let Crawl4ai do the magic![/bold cyan]")
    result = crawler.run(url="https://www.nbcnews.com/business", only_text=True)
    cprint("[LOG] 📦 [bold yellow]Basic crawl result:[/bold yellow]")
    print_result(result)

def basic_usage_some_params(crawler):
    cprint("🛠️ [bold cyan]Basic Usage with parameters: Provide a URL plus a word_count_threshold and let Crawl4ai do the magic![/bold cyan]")
    result = crawler.run(url="https://www.nbcnews.com/business", word_count_threshold=1, only_text=True)
    cprint("[LOG] 📦 [bold yellow]Basic crawl result:[/bold yellow]")
    print_result(result)
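
# A minimal sketch of inspecting the returned result beyond print_result(). Assumption: the result
# object may expose `markdown` and `cleaned_html` string fields in addition to `extracted_content`
# (only `extracted_content` and `screenshot` are used elsewhere in this script), so getattr() is
# used defensively; adjust the field names to whatever your installed crawl4ai version provides.
def inspect_result_fields(result):
    for field in ("markdown", "cleaned_html", "extracted_content"):
        value = getattr(result, field, None)  # hypothetical field names; fall back to None if absent
        if isinstance(value, str) and value:
            console.print(f"\t{field}: {len(value)} characters")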

def screenshot_usage(crawler):
    cprint("\n📸 [bold cyan]Let's take a screenshot of the page![/bold cyan]")
    result = crawler.run(url="https://www.nbcnews.com/business", screenshot=True)
    cprint("[LOG] 📦 [bold yellow]Screenshot result:[/bold yellow]")
    # Save the screenshot to a file
    with open("screenshot.png", "wb") as f:
        f.write(base64.b64decode(result.screenshot))
    cprint("Screenshot saved to 'screenshot.png'!")
    print_result(result)

def understanding_parameters(crawler):
    cprint("\n🧠 [bold cyan]Understanding 'bypass_cache' and 'include_raw_html' parameters:[/bold cyan]")
    cprint("By default, Crawl4ai caches the results of your crawls. This means that subsequent crawls of the same URL will be much faster! Let's see this in action.")

    # First crawl (caches the result)
    cprint("1️⃣ First crawl (caches the result):", True)
    start_time = time.time()
    result = crawler.run(url="https://www.nbcnews.com/business")
    end_time = time.time()
    cprint(f"[LOG] 📦 [bold yellow]First crawl took {end_time - start_time} seconds; result (now cached):[/bold yellow]")
    print_result(result)

    # Second crawl (force a fresh crawl, bypassing the cache)
    cprint("2️⃣ Second crawl (force a fresh crawl):", True)
    start_time = time.time()
    result = crawler.run(url="https://www.nbcnews.com/business", bypass_cache=True)
    end_time = time.time()
    cprint(f"[LOG] 📦 [bold yellow]Second crawl took {end_time - start_time} seconds; result (cache bypassed):[/bold yellow]")
    print_result(result)
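
# A small helper sketch for timing any crawl call. It uses nothing beyond what the demo above
# already relies on (time.time and crawler.run), so it should work with the same crawler instance.
def timed_run(crawler, url="https://www.nbcnews.com/business", **kwargs):
    start_time = time.time()
    result = crawler.run(url=url, **kwargs)
    console.print(f"Crawl of {url} took {time.time() - start_time:.2f} seconds (kwargs={kwargs})")
    return result
# Example usage: compare a cached crawl with a forced one.
# timed_run(crawler)                     # served from cache when available
# timed_run(crawler, bypass_cache=True)  # always fetches fresh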

def add_chunking_strategy(crawler):
    # Adding a chunking strategy: RegexChunking
    cprint("\n🧩 [bold cyan]Let's add a chunking strategy: RegexChunking![/bold cyan]", True)
    cprint("RegexChunking is a simple chunking strategy that splits the text based on a given regex pattern. Let's see it in action!")
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        chunking_strategy=RegexChunking(patterns=["\n\n"])
    )
    cprint("[LOG] 📦 [bold yellow]RegexChunking result:[/bold yellow]")
    print_result(result)

    # Adding another chunking strategy: NlpSentenceChunking
    cprint("\n📚 [bold cyan]Time to explore another chunking strategy: NlpSentenceChunking![/bold cyan]", True)
    cprint("NlpSentenceChunking uses NLP techniques to split the text into sentences. Let's see how it performs!")
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        chunking_strategy=NlpSentenceChunking()
    )
    cprint("[LOG] 📦 [bold yellow]NlpSentenceChunking result:[/bold yellow]")
    print_result(result)
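
# To build intuition for what regex-based chunking does, here is a plain-stdlib sketch that splits
# a sample string on the same "\n\n" pattern used above. This is only an analogy for the splitting
# step, not the RegexChunking implementation itself.
def regex_chunk_demo(text, pattern=r"\n\n"):
    import re  # local import to keep this sketch self-contained
    chunks = [chunk.strip() for chunk in re.split(pattern, text) if chunk.strip()]
    console.print(f"Split into {len(chunks)} chunks: {chunks}")
    return chunks
# regex_chunk_demo("First paragraph.\n\nSecond paragraph.\n\nThird paragraph.")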

def add_extraction_strategy(crawler):
    # Adding an extraction strategy: CosineStrategy
    cprint("\n🧠 [bold cyan]Let's get smarter with an extraction strategy: CosineStrategy![/bold cyan]", True)
    cprint("CosineStrategy uses cosine similarity to extract semantically similar blocks of text. Let's see it in action!")
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        extraction_strategy=CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method="ward", top_k=3, sim_threshold=0.3, verbose=True)
    )
    cprint("[LOG] 📦 [bold yellow]CosineStrategy result:[/bold yellow]")
    print_result(result)

    # Using semantic_filter with CosineStrategy
    cprint("You can pass other parameters like 'semantic_filter' to the CosineStrategy to extract semantically similar blocks of text. Let's see it in action!")
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        extraction_strategy=CosineStrategy(
            semantic_filter="inflation rent prices",
        )
    )
    cprint("[LOG] 📦 [bold yellow]CosineStrategy result with semantic filter:[/bold yellow]")
    print_result(result)
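
# For intuition about the similarity measure CosineStrategy relies on, here is a tiny pure-Python
# cosine-similarity sketch over word-count vectors. It is only a conceptual illustration; the real
# strategy uses its own embeddings and clustering internally.
def cosine_similarity_demo(text_a, text_b):
    from collections import Counter
    from math import sqrt
    a, b = Counter(text_a.lower().split()), Counter(text_b.lower().split())
    dot = sum(a[w] * b[w] for w in set(a) & set(b))
    norm = sqrt(sum(v * v for v in a.values())) * sqrt(sum(v * v for v in b.values()))
    return dot / norm if norm else 0.0
# cosine_similarity_demo("rent prices keep rising", "inflation pushes rent prices up")  # ≈ 0.45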

def add_llm_extraction_strategy(crawler):
    # Adding an LLM extraction strategy without instructions
    cprint("\n🤖 [bold cyan]Time to bring in the big guns: LLMExtractionStrategy without instructions![/bold cyan]", True)
    cprint("LLMExtractionStrategy uses a large language model to extract relevant information from the web page. Let's see it in action!")
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        extraction_strategy=LLMExtractionStrategy(provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY'))
    )
    cprint("[LOG] 📦 [bold yellow]LLMExtractionStrategy (no instructions) result:[/bold yellow]")
    print_result(result)

    # Adding an LLM extraction strategy with instructions
    cprint("\n📜 [bold cyan]Let's make it even more interesting: LLMExtractionStrategy with instructions![/bold cyan]", True)
    cprint("Let's say we are only interested in financial news. Let's see how LLMExtractionStrategy performs with instructions!")
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        extraction_strategy=LLMExtractionStrategy(
            provider="openai/gpt-4o",
            api_token=os.getenv('OPENAI_API_KEY'),
            instruction="I am interested in only financial news"
        )
    )
    cprint("[LOG] 📦 [bold yellow]LLMExtractionStrategy (with instructions) result:[/bold yellow]")
    print_result(result)

    # A second instruction, this time targeting technology content
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        extraction_strategy=LLMExtractionStrategy(
            provider="openai/gpt-4o",
            api_token=os.getenv('OPENAI_API_KEY'),
            instruction="Extract only content related to technology"
        )
    )
    cprint("[LOG] 📦 [bold yellow]LLMExtractionStrategy (with technology instruction) result:[/bold yellow]")
    print_result(result)
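
# The extracted content comes back as a JSON string (print_result above already decodes it with
# json.loads). Here is a minimal sketch of working with those blocks directly; the exact keys
# inside each block depend on the strategy and crawl4ai version, so we just print what is there.
def show_extracted_blocks(result, limit=3):
    if not result.extracted_content:
        console.print("No extracted content on this result.")
        return
    blocks = json.loads(result.extracted_content)
    for block in blocks[:limit]:
        console.print(block)  # each block is typically a dict; its keys vary by strategy/version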

def targeted_extraction(crawler):
    # Using a CSS selector to extract only H2 tags
    cprint("\n🎯 [bold cyan]Targeted extraction: Let's use a CSS selector to extract only H2 tags![/bold cyan]", True)
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        css_selector="h2"
    )
    cprint("[LOG] 📦 [bold yellow]CSS Selector (H2 tags) result:[/bold yellow]")
    print_result(result)

def interactive_extraction(crawler):
    # Passing JavaScript code to interact with the page
    cprint("\n🖱️ [bold cyan]Let's get interactive: Passing JavaScript code to click the 'Load More' button![/bold cyan]", True)
    cprint("In this example we try to click the 'Load More' button on the page using JavaScript code.")
    js_code = """
    const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
    loadMoreButton && loadMoreButton.click();
    """
    # crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
    # crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        js=js_code
    )
    cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
    print_result(result)

def multiple_scripts(crawler):
    # Passing a list of JavaScript snippets to interact with the page
    cprint("\n🖱️ [bold cyan]Let's get interactive: Passing multiple JavaScript snippets to click the 'Load More' button twice![/bold cyan]", True)
    cprint("In this example we click the 'Load More' button twice by passing a list of JavaScript snippets.")
    js_code = ["""
    const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
    loadMoreButton && loadMoreButton.click();
    """] * 2
    # crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
    # crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        js=js_code
    )
    cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button, twice) result:[/bold yellow]")
    print_result(result)

def using_crawler_hooks(crawler):
    # Example usage of the hooks for authentication and setting a cookie
    def on_driver_created(driver):
        print("[HOOK] on_driver_created")
        # Example customization: maximize the window
        driver.maximize_window()

        # Example customization: logging in to a hypothetical website
        driver.get('https://example.com/login')
        from selenium.webdriver.support.ui import WebDriverWait
        from selenium.webdriver.common.by import By
        from selenium.webdriver.support import expected_conditions as EC
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.NAME, 'username'))
        )
        driver.find_element(By.NAME, 'username').send_keys('testuser')
        driver.find_element(By.NAME, 'password').send_keys('password123')
        driver.find_element(By.NAME, 'login').click()
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, 'welcome'))
        )
        # Add a custom cookie
        driver.add_cookie({'name': 'test_cookie', 'value': 'cookie_value'})
        return driver

    def before_get_url(driver):
        print("[HOOK] before_get_url")
        # Example customization: add a custom header
        # Enable the Network domain so extra headers can be sent
        driver.execute_cdp_cmd('Network.enable', {})
        # Add a custom header
        driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', {'headers': {'X-Test-Header': 'test'}})
        return driver

    def after_get_url(driver):
        print("[HOOK] after_get_url")
        # Example customization: log the current URL
        print(driver.current_url)
        return driver

    def before_return_html(driver, html):
        print("[HOOK] before_return_html")
        # Example customization: log the length of the HTML
        print(len(html))
        return driver

    cprint("\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]", True)
    crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
    crawler_strategy.set_hook('on_driver_created', on_driver_created)
    crawler_strategy.set_hook('before_get_url', before_get_url)
    crawler_strategy.set_hook('after_get_url', after_get_url)
    crawler_strategy.set_hook('before_return_html', before_return_html)
    crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
    crawler.warmup()
    result = crawler.run(url="https://example.com")
    cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
    print_result(result=result)
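
# Another small hook sketch in the same style as above: overriding the user agent before navigation
# via the Chrome DevTools Protocol. The set_hook name comes from the example above; the CDP command
# is a standard Chromium one, but verify it against your Selenium/Chrome setup before relying on it.
def before_get_url_set_user_agent(driver):
    driver.execute_cdp_cmd('Network.setUserAgentOverride', {'userAgent': 'Crawl4ai-Quickstart/1.0'})
    return driver
# crawler_strategy.set_hook('before_get_url', before_get_url_set_user_agent)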

def using_crawler_hooks_delay_example(crawler):
    def delay(driver):
        print("Delaying for 5 seconds...")
        time.sleep(5)
        print("Resuming...")

    def create_crawler():
        crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
        crawler_strategy.set_hook('after_get_url', delay)
        crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
        crawler.warmup()
        return crawler

    cprint("\n🔗 [bold cyan]Using Crawler Hooks: Let's add a delay after fetching the URL to make sure the entire page is fetched.[/bold cyan]")
    crawler = create_crawler()
    result = crawler.run(url="https://google.com", bypass_cache=True)
    cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
    print_result(result)

def main():
    cprint("🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]")
    cprint("⛳️ [bold cyan]First Step: Create an instance of WebCrawler and call the `warmup()` function.[/bold cyan]")
    cprint("If this is the first time you're running Crawl4ai, this might take a few seconds to load required model files.")
    crawler = create_crawler()
    crawler.always_by_pass_cache = True

    basic_usage(crawler)
    # basic_usage_some_params(crawler)
    understanding_parameters(crawler)
    crawler.always_by_pass_cache = True
    screenshot_usage(crawler)
    add_chunking_strategy(crawler)
    add_extraction_strategy(crawler)
    add_llm_extraction_strategy(crawler)
    targeted_extraction(crawler)
    interactive_extraction(crawler)
    multiple_scripts(crawler)
    cprint("\n🎉 [bold green]Congratulations! You've made it through the Crawl4ai Quickstart Guide! Now go forth and crawl the web like a pro! 🕸️[/bold green]")

if __name__ == "__main__":
    main()