# Spaces: Runtime error  (appears to be a page-status banner captured with this listing; not part of the module)
import os

from dotenv import load_dotenv

# Load environment variables from a .env file before any constant below
# reads them through os.getenv.
load_dotenv()
# Default provider, ONLY used when the extraction strategy is LLMExtractionStrategy.
DEFAULT_PROVIDER = "openai/gpt-4o-mini"
# NOTE(review): presumably the branch of the model repository to fetch from;
# confirm against the code that reads this constant.
MODEL_REPO_BRANCH = "new-release-0.0.2"
# Provider-model dictionary, ONLY used when the extraction strategy is LLMExtractionStrategy.
# Each "<provider>/<model>" key maps to the API token read from the environment
# when this module is imported; os.getenv yields None if the variable is unset.
PROVIDER_MODELS = {
    "ollama/llama3": "no-token-needed",  # Any model from Ollama needs no API token
    "groq/llama3-70b-8192": os.getenv("GROQ_API_KEY"),
    "groq/llama3-8b-8192": os.getenv("GROQ_API_KEY"),
    "openai/gpt-4o-mini": os.getenv("OPENAI_API_KEY"),
    "openai/gpt-4o": os.getenv("OPENAI_API_KEY"),
    "openai/o1-mini": os.getenv("OPENAI_API_KEY"),
    "openai/o1-preview": os.getenv("OPENAI_API_KEY"),
    "anthropic/claude-3-haiku-20240307": os.getenv("ANTHROPIC_API_KEY"),
    "anthropic/claude-3-opus-20240229": os.getenv("ANTHROPIC_API_KEY"),
    "anthropic/claude-3-sonnet-20240229": os.getenv("ANTHROPIC_API_KEY"),
    "anthropic/claude-3-5-sonnet-20240620": os.getenv("ANTHROPIC_API_KEY"),
}
# Chunk token threshold
CHUNK_TOKEN_THRESHOLD = 2 ** 11  # 2048 tokens
# NOTE(review): presumably the fraction of overlap between consecutive chunks;
# confirm against the chunking code.
OVERLAP_RATE = 0.1
# NOTE(review): presumably the estimated tokens-per-word ratio; confirm against
# the code that converts word counts to token counts.
WORD_TOKEN_RATE = 1.3
# Threshold for the minimum number of words in an HTML tag for it to be considered
MIN_WORD_THRESHOLD = 1
IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD = 1
IMPORTANT_ATTRS = ['src', 'href', 'alt', 'title', 'width', 'height']
ONLY_TEXT_ELIGIBLE_TAGS = [
    'b', 'i', 'u', 'span', 'del', 'ins', 'sub', 'sup', 'strong', 'em',
    'code', 'kbd', 'var', 's', 'q', 'abbr', 'cite', 'dfn', 'time', 'small',
    'mark',
]
# Domain names of social media sites.
SOCIAL_MEDIA_DOMAINS = [
    'facebook.com',
    'twitter.com',
    'x.com',
    'linkedin.com',
    'instagram.com',
    'pinterest.com',
    'tiktok.com',
    'snapchat.com',
    'reddit.com',
]
# Threshold for image extraction - range is 1 to 6.
# Images are scored with a point-based system so they can be filtered by
# usefulness. Points are assigned to each image for the following aspects:
#   - either height or width exceeds 150px
#   - image size is greater than 10Kb
#   - the alt property is set
#   - image format is jpg, png or webp
#   - image is in the first half of the total images extracted from the page
# NOTE(review): five aspects are listed but the stated range tops out at 6;
# confirm the maximum score against the scoring code.
IMAGE_SCORE_THRESHOLD = 2
# Miscellaneous settings.
MAX_METRICS_HISTORY = 1000
NEED_MIGRATION = True
URL_LOG_SHORTEN_LENGTH = 30
SHOW_DEPRECATION_WARNINGS = True
# NOTE: "TRESHOLD" is misspelled, but the name is kept as-is because other
# modules may import it under this spelling.
SCREENSHOT_HEIGHT_TRESHOLD = 10000
# NOTE(review): timeout values are presumably in milliseconds; confirm against
# the code that consumes them.
PAGE_TIMEOUT = 60000
DOWNLOAD_PAGE_TIMEOUT = 60000