{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "5223b1b7", "metadata": {}, "outputs": [], "source": [ "# `os` is imported explicitly: later cells call `os.getenv`, which would otherwise\n", "# only resolve if one of the wildcard imports below happened to re-export it.\n", "import os\n", "\n", "from web2json.preprocessor import *\n", "from web2json.ai_extractor import *\n", "from web2json.postprocessor import *\n", "from web2json.pipeline import *" ] }, { "cell_type": "code", "execution_count": 2, "id": "ae4e7f03", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Load environment variables (e.g. the API keys read via os.getenv below) from a .env file.\n", "import dotenv\n", "dotenv.load_dotenv()" ] }, { "cell_type": "code", "execution_count": 3, "id": "9e6b0eb9", "metadata": {}, "outputs": [], "source": [ "# LLM and reranker clients; both take their key from the NVIDIA_API_KEY environment variable.\n", "llm = NvidiaLLMClient(config={'api_key': os.getenv('NVIDIA_API_KEY'),'model_name': 'qwen/qwen2.5-7b-instruct'})\n", "reranker = NvidiaRerankerClient(config={'api_key': os.getenv('NVIDIA_API_KEY'),'model_name': 'nv-rerank-qa-mistral-4b:1'})\n", "# reranker = HFRerankerClient()" ] }, { "cell_type": "code", "execution_count": 4, "id": "3bc223d0", "metadata": {}, "outputs": [], "source": [ "# Extraction prompt with {content} and {schema} placeholders\n", "# (presumably filled in by the extractor - confirm in web2json.ai_extractor).\n", "prompt_template = \"\"\"\n", "You are a helpful assistant that extracts structured data from web pages.\n", "You will be given a web page and you need to extract the following information:\n", "{content}\n", "\n", "schema: {schema}\n", "Please provide the extracted data in JSON format.\n", "WITH ONLY THE FIELDS THAT ARE IN THE SCHEMA.\n", "\"\"\"" ] }, { "cell_type": "code", "execution_count": 5, "id": "475fccd2", "metadata": {}, "outputs": [], "source": [ "# NOTE(review): despite its name, this holds a field schema rather than a classification prompt;\n", "# confirm that this is what the `classifier_prompt` argument expects.\n", "classification_prompt_template = \"\"\"\n", "{\n", " \"title\": {\"type\": \"string\", \"description\": \"Page title\"},\n", " \"price\": {\"type\": \"number\", \"description\": \"Product price\"},\n", " \"description\": {\"type\": \"string\", \"description\": \"Product description\"}\n", "}\n", "\"\"\"" ] }, { "cell_type": "code", "execution_count": 6, "id": "974417de", "metadata": {}, "outputs": [], "source": [ "# classification_prompt_template = \"\"\"\n", "# # HTML Chunk Relevance 
Classification Prompt\n", "\n", "# You are an HTML content classifier. Your task is to analyze an HTML chunk against a given schema and determine if the content is relevant.\n", "\n", "# ## Instructions:\n", "# 1. Carefully examine the provided HTML chunk\n", "# 2. Compare it against the given schema/criteria\n", "# 3. Determine if the HTML chunk contains content that matches or is relevant to the schema\n", "# 4. Respond with ONLY a JSON object containing a single field \"relevant\" with value 1 (relevant) or 0 (not relevant)\n", "\n", "# ## Input Format:\n", "# **Schema/Criteria:**\n", "# {schema}\n", "\n", "# **HTML Chunk:**\n", "# ```html\n", "# {content}\n", "# ```\n", "\n", "# ## Output Format:\n", "# Your response must be ONLY a valid JSON object with no additional text:\n", "\n", "# ```json\n", "# {{\n", "# \"relevant\": 1\n", "# }}\n", "# ```\n", "\n", "# OR\n", "\n", "# ```json\n", "# {{\n", "# \"relevant\": 0\n", "# }}\n", "# ```\n", "\n", "# ## Classification Rules:\n", "# - Output 1 if the HTML chunk contains content that matches the schema criteria\n", "# - Output 0 if the HTML chunk does not contain relevant content\n", "# - Consider semantic meaning, not just exact keyword matches\n", "# - Look at text content, attributes, structure, and context\n", "# - Ignore purely structural HTML elements (like divs, spans) unless they contain relevant content\n", "# - Be STRICT in your evaluation - only mark as relevant (1) if there is clear, meaningful content that directly relates to the schema\n", "# - Empty elements, placeholder text, navigation menus, headers/footers, and generic UI components should typically be marked as not relevant (0)\n", "# - The HTML chunk does not need to contain ALL schema information, but it must contain SUBSTANTIAL and SPECIFIC content related to the schema\n", "\n", "# CRITICAL: Your entire response MUST be exactly one JSON object. DO NOT include any explanations, reasoning, markdown formatting, code blocks, or additional text. 
Output ONLY the raw JSON object.\n", "# \"\"\"" ] }, { "cell_type": "code", "execution_count": 7, "id": "58436d65", "metadata": {}, "outputs": [], "source": [ "# Instantiate the preprocessor, the extractor and the postprocessor.\n", "pre = BasicPreprocessor(config={'keep_tags': True})\n", "\n", "# Alternative LLM/extractor setup, kept for reference:\n", "# llm = GeminiLLMClient(config={'api_key': os.getenv('GEMINI_API_KEY'),})\n", "# ai = AIExtractor(llm_client=llm ,prompt_template=prompt_template)\n", "ai = LLMClassifierExtractor(\n", "    reranker=reranker,\n", "    llm_client=llm,\n", "    prompt_template=prompt_template,\n", "    classifier_prompt=classification_prompt_template,\n", ")\n", "\n", "post = PostProcessor()" ] }, { "cell_type": "code", "execution_count": 41, "id": "c4e75e63", "metadata": {}, "outputs": [], "source": [ "html_chunks = [\n", " \"\"\"\n", "
Experience immersive sound with active noise cancellation and long battery life.
\n", " $299.99\n", " \n", "Artificial intelligence continues to evolve. Here are five tools you should explore in 2025:
\n", "Artificial intelligence continues to evolve. Here are five tools you should explore in 2025:
\n", "Experience immersive sound with active noise cancellation and long battery life.
\\n $299.99\\n \\nArtificial intelligence continues to evolve. Here are five tools you should explore in 2025:
\\nArtificial intelligence continues to evolve. Here are five tools you should explore in 2025:
\\nI’ve been using this laptop for a few months and it’s blazing fast. Great for deep learning workloads!
\\n \\n – Sarah M.\\nExperience immersive sound with active noise cancellation and long battery life.
\\n $299.99\\n \\nArtificial intelligence continues to evolve. Here are five tools you should explore in 2025:
\\nArtificial intelligence continues to evolve. Here are five tools you should explore in 2025:
\\nI’ve been using this laptop for a few months and it’s blazing fast. Great for deep learning workloads!
\\n \\n – Sarah M.\\nExperience immersive sound with active noise cancellation and long battery life.
\\n $299.99\\n \\nArtificial intelligence continues to evolve. Here are five tools you should explore in 2025:
\\nArtificial intelligence continues to evolve. Here are five tools you should explore in 2025:
\\nI’ve been using this laptop for a few months and it’s blazing fast. Great for deep learning workloads!
\\n \\n – Sarah M.\\nExperience immersive sound with active noise cancellation and long battery life.
\\n $299.99\\n \\nArtificial intelligence continues to evolve. Here are five tools you should explore in 2025:
\\nArtificial intelligence continues to evolve. Here are five tools you should explore in 2025:
\\nI’ve been using this laptop for a few months and it’s blazing fast. Great for deep learning workloads!
\\n \\n – Sarah M.\\nExperience immersive sound with active noise cancellation and long battery life.
\n", " $299.99\n", " \n", "Artificial intelligence continues to evolve. Here are five tools you should explore in 2025:
\n", "Artificial intelligence continues to evolve. Here are five tools you should explore in 2025:
\n", "I’ve been using this laptop for a few months and it’s blazing fast. Great for deep learning workloads!
\n", " \n", " – Sarah M.\n", "\\n \\n \\n\\n \\n \\n \\n \\n \\n \\n \\nAlert Me \\n \\n \\n \\n\\n \\n\\n Color: Silver \\n\\n\\n\\n \\n |
\\n \\n \\n After viewing product detail pages or search results, look here to find an easy way to navigate back to pages you are interested in. \\n | \\n
\\n \\n › \\n View and edit your browsing history\\n \\n |
\\n \\n \\nAfter viewing product detail pages or search results, look here to find an easy way to navigate back to pages you are interested in. \\n | \\n
\\n \\n › \\nView and edit your browsing history\\n \\n |
\\n \\n \\nAfter viewing product detail pages or search results, look here to find an easy way to navigate back to pages you are interested in. \\n | \\n
\\n \\n › \\nView and edit your browsing history\\n \\n |