Spaces:
Runtime error
Runtime error
import os | |
from pathlib import Path | |
import sqlite3 | |
from typing import Optional, Tuple | |
DB_PATH = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai") | |
os.makedirs(DB_PATH, exist_ok=True) | |
DB_PATH = os.path.join(DB_PATH, "crawl4ai.db") | |
def init_db(): | |
global DB_PATH | |
conn = sqlite3.connect(DB_PATH) | |
cursor = conn.cursor() | |
cursor.execute(''' | |
CREATE TABLE IF NOT EXISTS crawled_data ( | |
url TEXT PRIMARY KEY, | |
html TEXT, | |
cleaned_html TEXT, | |
markdown TEXT, | |
extracted_content TEXT, | |
success BOOLEAN, | |
media TEXT DEFAULT "{}", | |
links TEXT DEFAULT "{}", | |
metadata TEXT DEFAULT "{}", | |
screenshot TEXT DEFAULT "" | |
) | |
''') | |
conn.commit() | |
conn.close() | |
def alter_db_add_screenshot(new_column: str = "media"): | |
check_db_path() | |
try: | |
conn = sqlite3.connect(DB_PATH) | |
cursor = conn.cursor() | |
cursor.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""') | |
conn.commit() | |
conn.close() | |
except Exception as e: | |
print(f"Error altering database to add screenshot column: {e}") | |
def check_db_path(): | |
if not DB_PATH: | |
raise ValueError("Database path is not set or is empty.") | |
def get_cached_url(url: str) -> Optional[Tuple[str, str, str, str, str, str, str, bool, str]]: | |
check_db_path() | |
try: | |
conn = sqlite3.connect(DB_PATH) | |
cursor = conn.cursor() | |
cursor.execute('SELECT url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot FROM crawled_data WHERE url = ?', (url,)) | |
result = cursor.fetchone() | |
conn.close() | |
return result | |
except Exception as e: | |
print(f"Error retrieving cached URL: {e}") | |
return None | |
def cache_url(url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool, media : str = "{}", links : str = "{}", metadata : str = "{}", screenshot: str = ""): | |
check_db_path() | |
try: | |
conn = sqlite3.connect(DB_PATH) | |
cursor = conn.cursor() | |
cursor.execute(''' | |
INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot) | |
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) | |
ON CONFLICT(url) DO UPDATE SET | |
html = excluded.html, | |
cleaned_html = excluded.cleaned_html, | |
markdown = excluded.markdown, | |
extracted_content = excluded.extracted_content, | |
success = excluded.success, | |
media = excluded.media, | |
links = excluded.links, | |
metadata = excluded.metadata, | |
screenshot = excluded.screenshot | |
''', (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot)) | |
conn.commit() | |
conn.close() | |
except Exception as e: | |
print(f"Error caching URL: {e}") | |
def get_total_count() -> int: | |
check_db_path() | |
try: | |
conn = sqlite3.connect(DB_PATH) | |
cursor = conn.cursor() | |
cursor.execute('SELECT COUNT(*) FROM crawled_data') | |
result = cursor.fetchone() | |
conn.close() | |
return result[0] | |
except Exception as e: | |
print(f"Error getting total count: {e}") | |
return 0 | |
def clear_db(): | |
check_db_path() | |
try: | |
conn = sqlite3.connect(DB_PATH) | |
cursor = conn.cursor() | |
cursor.execute('DELETE FROM crawled_data') | |
conn.commit() | |
conn.close() | |
except Exception as e: | |
print(f"Error clearing database: {e}") | |
def flush_db(): | |
check_db_path() | |
try: | |
conn = sqlite3.connect(DB_PATH) | |
cursor = conn.cursor() | |
cursor.execute('DROP TABLE crawled_data') | |
conn.commit() | |
conn.close() | |
except Exception as e: | |
print(f"Error flushing database: {e}") | |
def update_existing_records(new_column: str = "media", default_value: str = "{}"): | |
check_db_path() | |
try: | |
conn = sqlite3.connect(DB_PATH) | |
cursor = conn.cursor() | |
cursor.execute(f'UPDATE crawled_data SET {new_column} = "{default_value}" WHERE screenshot IS NULL') | |
conn.commit() | |
conn.close() | |
except Exception as e: | |
print(f"Error updating existing records: {e}") | |
if __name__ == "__main__": | |
# Delete the existing database file | |
if os.path.exists(DB_PATH): | |
os.remove(DB_PATH) | |
init_db() | |
# alter_db_add_screenshot("COL_NAME") | |