{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 101,
   "id": "5223b1b7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Project modules (wildcard imports kept as in the original notebook).\n",
    "from web2json.preprocessor import *\n",
    "from web2json.ai_extractor import *\n",
    "from web2json.postprocessor import *\n",
    "from web2json.pipeline import *\n",
    "\n",
    "# Explicit imports for every non-project name used below, so the notebook\n",
    "# does not depend on what the wildcard imports happen to re-export.\n",
    "# They come after the wildcard imports so these bindings always win.\n",
    "import os\n",
    "import random\n",
    "\n",
    "import dotenv\n",
    "import numpy as np\n",
    "import torch\n",
    "from pydantic import BaseModel, Field, constr, condecimal"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "id": "5ccc96b7",
   "metadata": {},
   "outputs": [],
   "source": [
    "def seed_everything(seed=42):\n",
    "    \"\"\"Seed the Python, NumPy and PyTorch RNGs and make cuDNN deterministic.\n",
    "\n",
    "    Args:\n",
    "        seed: Integer seed applied to every random number generator.\n",
    "    \"\"\"\n",
    "    random.seed(seed)\n",
    "    np.random.seed(seed)\n",
    "    torch.manual_seed(seed)\n",
    "\n",
    "    if torch.cuda.is_available():\n",
    "        torch.cuda.manual_seed(seed)\n",
    "        torch.cuda.manual_seed_all(seed)  # if using multi-GPU\n",
    "\n",
    "    # Disable cuDNN autotuning and request deterministic kernels so repeated\n",
    "    # runs with the same seed behave the same.\n",
    "    torch.backends.cudnn.deterministic = True\n",
    "    torch.backends.cudnn.benchmark = False\n",
    "\n",
    "seed_everything(44)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "id": "ae4e7f03",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 103,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load variables from a local .env file into the process environment;\n",
    "# the next cell reads NVIDIA_API_KEY from there.\n",
    "dotenv.load_dotenv()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "id": "9e6b0eb9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the key once and fail fast with a clear message instead of silently\n",
    "# handing api_key=None to both clients.\n",
    "nvidia_api_key = os.getenv('NVIDIA_API_KEY')\n",
    "if not nvidia_api_key:\n",
    "    raise RuntimeError('NVIDIA_API_KEY is not set; define it in the environment or in a .env file')\n",
    "\n",
    "llm = NvidiaLLMClient(config={'api_key': nvidia_api_key, 'model_name': 'qwen/qwen2.5-7b-instruct'})\n",
    "reranker = NvidiaRerankerClient(config={'api_key': nvidia_api_key, 'model_name': \"nv-rerank-qa-mistral-4b:1\"})\n",
    "# reranker = HFRerankerClient()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "id": "114ce917",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Target schema for the extraction. Deliberately documented with comments\n",
    "# rather than a class docstring: pydantic copies a model docstring into the\n",
    "# generated JSON schema, which would change the schema text used later.\n",
    "class ProductModel(BaseModel):\n",
    "    # Required, 1-200 characters.\n",
    "    productTitle: constr(min_length=1, max_length=200) = Field(\n",
    "        ...,\n",
    "        title=\"Product Title\",\n",
    "        description=\"The full title of the product\"\n",
    "    )\n",
    "    # Required, strictly positive decimal with at most two decimal places.\n",
    "    price: condecimal(gt=0, decimal_places=2) = Field(\n",
    "        ...,\n",
    "        title=\"Product Price\",\n",
    "        description=\"Unit price (must be > 0, two decimal places).\"\n",
    "    )\n",
    "    # Required, 1-1000 characters.\n",
    "    manufacturer: constr(min_length=1, max_length=1000) = Field(\n",
    "        ...,\n",
    "        title=\"Manufacturer\",\n",
    "        description=\"Name of the product manufacturer.\"\n",
    "    )\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "id": "2d6c1215",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Page to extract from and the pydantic model describing the wanted fields.\n",
    "url = \"https://www.amazon.com/Instant-Pot-Multi-Use-Programmable-Pressure/dp/B00FLYWNYQ?_encoding=UTF8&content-id=amzn1.sym.2f889ce0-246f-467a-a086-d9a721167240&dib=eyJ2IjoiMSJ9.2EzBddTDEktLY8ckTsraM_cZ6pzKuNkA6y_gLR0-Uz1ekttQU6tuQEcjb8PThy0PfhvxLqeYWh3N7pQrGgRxAWzapVklC_aU6xBzD-3Wwqx3qyQRHsmOhPRsSpeCOIIZqS3SKDowZEPYrGnCbRMt5vxnsYMW-fD-zBbgeoeGYmbsN2U6_HNhLjrpePKCbQPmnZBJ9UhgYE4fE3DVuYm8xlJe9l5GixDLVFtZUq4m5FE.Ol-jiuu9P6mQie0yXLJj-Ht5-TXmIXuRPije85p_YVo&dib_tag=se&keywords=cooker&pd_rd_r=2cede598-f3ae-49ca-8a46-e5945a9c2631&pd_rd_w=2HLSC&pd_rd_wg=ZyUUn&qid=1749508157&sr=8-3\"\n",
    "schema = ProductModel  # pydantic class\n",
    "\n",
    "# read html file \n",
    "# with open(r'C:\\Users\\abdfa\\Desktop\\UNI STUFFING\\GRADUATION PROJECT\\Group Work\\MCP_WEB2JSON\\0000.htm', 'r', encoding='utf-8') as file:\n",
    "#     content = file.read()\n",
    "\n",
    "# with open(r'C:\\Users\\abdfa\\Desktop\\UNI STUFFING\\GRADUATION PROJECT\\Group Work\\MCP_WEB2JSON\\Amazon.com_ Instant Pot Duo 7-in-1 Electric Pressure Cooker, Slow Cooker, Rice Cooker, Steamer, Sauté, Yogurt Maker, Warmer & Sterilizer, Includes App With Over 800 Recipes, Stainless Steel, 6 Quart.htm', 'r', encoding='utf-8') as file:\n",
    "#     content = file.read()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "id": "3bc223d0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Extraction prompt; {content} and {schema} are placeholders left for the\n",
    "# extractor to fill in.\n",
    "prompt_template = \"\"\"\n",
    "You are a helpful assistant that extracts structured data from web pages.\n",
    "You will be given a web page and you need to extract the following information:\n",
    "{content}\n",
    "\n",
    "schema: {schema}\n",
    "Please provide the extracted data in JSON format.\n",
    "WITH ONLY THE FIELDS THAT ARE IN THE SCHEMA.\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
"execution_count": 108, "id": "475fccd2", "metadata": {}, "outputs": [], "source": [ "classification_prompt_template = schema.model_json_schema()" ] }, { "cell_type": "code", "execution_count": 109, "id": "974417de", "metadata": {}, "outputs": [], "source": [ "# classification_prompt_template = \"\"\"\n", "# # HTML Chunk Relevance Classification Prompt\n", "\n", "# You are an HTML content classifier. Your task is to analyze an HTML chunk against a given schema and determine if the content is relevant.\n", "\n", "# ## Instructions:\n", "# 1. Carefully examine the provided HTML chunk\n", "# 2. Compare it against the given schema/criteria\n", "# 3. Determine if the HTML chunk contains content that matches or is relevant to the schema\n", "# 4. Respond with ONLY a JSON object containing a single field \"relevant\" with value 1 (relevant) or 0 (not relevant)\n", "\n", "# ## Input Format:\n", "# **Schema/Criteria:**\n", "# {schema}\n", "\n", "# **HTML Chunk:**\n", "# ```html\n", "# {content}\n", "# ```\n", "\n", "# ## Output Format:\n", "# Your response must be ONLY a valid JSON object with no additional text:\n", "\n", "# ```json\n", "# {{\n", "# \"relevant\": 1\n", "# }}\n", "# ```\n", "\n", "# OR\n", "\n", "# ```json\n", "# {{\n", "# \"relevant\": 0\n", "# }}\n", "# ```\n", "\n", "# ## Classification Rules:\n", "# - Output 1 if the HTML chunk contains content that matches the schema criteria\n", "# - Output 0 if the HTML chunk does not contain relevant content\n", "# - Consider semantic meaning, not just exact keyword matches\n", "# - Look at text content, attributes, structure, and context\n", "# - Ignore purely structural HTML elements (like divs, spans) unless they contain relevant content\n", "# - Be STRICT in your evaluation - only mark as relevant (1) if there is clear, meaningful content that directly relates to the schema\n", "# - Empty elements, placeholder text, navigation menus, headers/footers, and generic UI components should typically be marked as not 
relevant (0)\n", "# - The HTML chunk does not need to contain ALL schema information, but it must contain SUBSTANTIAL and SPECIFIC content related to the schema\n", "\n", "# CRITICAL: Your entire response MUST be exactly one JSON object. DO NOT include any explanations, reasoning, markdown formatting, code blocks, or additional text. Output ONLY the raw JSON object.\n", "# \"\"\"" ] }, { "cell_type": "code", "execution_count": 110, "id": "58436d65", "metadata": {}, "outputs": [], "source": [ "pre = BasicPreprocessor(config={'keep_tags':True})\n", "# llm = GeminiLLMClient(config={'api_key': os.getenv('GEMINI_API_KEY'),})\n", "# ai = AIExtractor(llm_client=llm ,prompt_template=prompt_template)\n", "ai = LLMClassifierExtractor(reranker=reranker, llm_client=llm, prompt_template=prompt_template, classifier_prompt=classification_prompt_template)\n", "post = PostProcessor()" ] }, { "cell_type": "code", "execution_count": 111, "id": "c4e75e63", "metadata": {}, "outputs": [], "source": [ "html_chunks = [\n", " \"\"\"\n", "
Experience immersive sound with active noise cancellation and long battery life.
\n", " $299.99\n", " \n", "Artificial intelligence continues to evolve. Here are five tools you should explore in 2025:
\n", "Artificial intelligence continues to evolve. Here are five tools you should explore in 2025:
\n", "I’ve been using this laptop for a few months and it’s blazing fast. Great for deep learning workloads!
\n", " \n", " – Sarah M.\n", "Experience immersive sound with active noise cancellation and long battery life.
\\n $299.99\\n \\nExperience immersive sound with active noise cancellation and long battery life.
\n", " $299.99\n", " \n", "This product is compatible with outlets that support 120 volts and might require a converter when used outside of the United States.
This product is compatible with outlets that support 120 volts and might require a converter when used outside of the United States.