In [101]:
from web2json.preprocessor import *
from web2json.ai_extractor import *
from web2json.postprocessor import *
from web2json.pipeline import *

In [102]:
def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's legacy global RNG and PyTorch
    (CPU, plus every CUDA device when CUDA is available), then puts cuDNN in
    deterministic mode with benchmark autotuning switched off.

    Args:
        seed: Integer seed applied to every generator. Defaults to 42.
    """
    # Imported locally so the helper does not depend on `random`, `np` and
    # `torch` leaking into the namespace through the wildcard imports above.
    import random

    import numpy as np
    import torch

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    if torch.cuda.is_available():
        # Seeds all GPUs, which includes the current device.
        torch.cuda.manual_seed_all(seed)

    # Prefer run-to-run reproducibility over cuDNN autotuned speed.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


seed_everything(44)

In [103]:
import dotenv
# Load variables from a local .env file into the process environment.
# Presumably this is where NVIDIA_API_KEY (read via os.getenv in the next
# cell) comes from when it is not already exported — confirm.
dotenv.load_dotenv()

True

In [104]:
# LLM client used for extraction; the API key is read from the
# NVIDIA_API_KEY environment variable.
llm = NvidiaLLMClient(
    config={
        'api_key': os.getenv('NVIDIA_API_KEY'),
        'model_name': 'qwen/qwen2.5-7b-instruct',
    }
)

# Reranker client, built with the same API key and its own model name.
reranker = NvidiaRerankerClient(
    config={
        'api_key': os.getenv('NVIDIA_API_KEY'),
        'model_name': "nv-rerank-qa-mistral-4b:1",
    }
)
# reranker = HFRerankerClient()

In [105]:
from pydantic import BaseModel, Field, constr, condecimal

# Target schema for the extraction demo: the three fields the pipeline is asked
# to pull from a product page. Its JSON schema (`model_json_schema()`) is reused
# further down as the classifier prompt and as the reranker query.
# NOTE(review): documented with `#` comments rather than a class docstring on
# purpose — pydantic copies a model docstring into the generated schema's
# "description", which would alter that query text. Confirm against the
# installed pydantic version before adding one.
class ProductModel(BaseModel):
    # Required field (`...` default): string of 1 to 200 characters.
    productTitle: constr(min_length=1, max_length=200) = Field(
        ...,
        title="Product Title",
        description="The full title of the product"
    )
    # Required field: decimal strictly greater than 0, limited to 2 decimal places.
    price: condecimal(gt=0, decimal_places=2) = Field(
        ...,
        title="Product Price",
        description="Unit price (must be > 0, two decimal places)."
    )
    # Required field: string of 1 to 1000 characters.
    manufacturer: constr(min_length=1, max_length=1000) = Field(
        ...,
        title="Manufacturer",
        description="Name of the product manufacturer."
    )


In [106]:
# Product page fed to the pipeline in the final cell. The URL is kept exactly as
# copied from the browser, including its search/tracking query parameters.
url = "https://www.amazon.com/Instant-Pot-Multi-Use-Programmable-Pressure/dp/B00FLYWNYQ?_encoding=UTF8&content-id=amzn1.sym.2f889ce0-246f-467a-a086-d9a721167240&dib=eyJ2IjoiMSJ9.2EzBddTDEktLY8ckTsraM_cZ6pzKuNkA6y_gLR0-Uz1ekttQU6tuQEcjb8PThy0PfhvxLqeYWh3N7pQrGgRxAWzapVklC_aU6xBzD-3Wwqx3qyQRHsmOhPRsSpeCOIIZqS3SKDowZEPYrGnCbRMt5vxnsYMW-fD-zBbgeoeGYmbsN2U6_HNhLjrpePKCbQPmnZBJ9UhgYE4fE3DVuYm8xlJe9l5GixDLVFtZUq4m5FE.Ol-jiuu9P6mQie0yXLJj-Ht5-TXmIXuRPije85p_YVo&dib_tag=se&keywords=cooker&pd_rd_r=2cede598-f3ae-49ca-8a46-e5945a9c2631&pd_rd_w=2HLSC&pd_rd_wg=ZyUUn&qid=1749508157&sr=8-3"
# The pydantic model class itself (not an instance) is what gets passed around as the schema.
schema = ProductModel  # pydantic class

# read html file 
# with open(r'C:\Users\abdfa\Desktop\UNI STUFFING\GRADUATION PROJECT\Group Work\MCP_WEB2JSON\0000.htm', 'r', encoding='utf-8') as file:
#     content = file.read()

# with open(r'C:\Users\abdfa\Desktop\UNI STUFFING\GRADUATION PROJECT\Group Work\MCP_WEB2JSON\Amazon.com_ Instant Pot Duo 7-in-1 Electric Pressure Cooker, Slow Cooker, Rice Cooker, Steamer, Sauté, Yogurt Maker, Warmer & Sterilizer, Includes App With Over 800 Recipes, Stainless Steel, 6 Quart.htm', 'r', encoding='utf-8') as file:
#     content = file.read()


In [107]:
# Extraction prompt handed to the extractor as `prompt_template`. It exposes two
# placeholders, {content} and {schema}; how they are filled is decided inside
# web2json — presumably {content} receives the page text and {schema} the
# target schema (TODO confirm).
# NOTE(review): the sentence right before {content} reads as if a list of fields
# follows, while the fields actually come from {schema}; consider rewording.
prompt_template = """
You are a helpful assistant that extracts structured data from web pages.
You will be given a web page and you need to extract the following information:
{content}

schema: {schema}
Please provide the extracted data in JSON format.
WITH ONLY THE FIELDS THAT ARE IN THE SCHEMA.
"""

In [108]:
# The model's JSON schema doubles as the classifier prompt (extractor cell) and
# as the reranker query (rerank cell) further down.
# NOTE(review): despite the `_template` name this is whatever
# `model_json_schema()` returns, not a format string with placeholders —
# confirm the reranker/classifier accept that type.
classification_prompt_template = schema.model_json_schema()

In [109]:
# classification_prompt_template = """
#     # HTML Chunk Relevance Classification Prompt

#     You are an HTML content classifier. Your task is to analyze an HTML chunk against a given schema and determine if the content is relevant.

#     ## Instructions:
#     1. Carefully examine the provided HTML chunk
#     2. Compare it against the given schema/criteria
#     3. Determine if the HTML chunk contains content that matches or is relevant to the schema
#     4. Respond with ONLY a JSON object containing a single field "relevant" with value 1 (relevant) or 0 (not relevant)

#     ## Input Format:
#     **Schema/Criteria:**
#     {schema}

#     **HTML Chunk:**
#     ```html
#     {content}
#     ```

#     ## Output Format:
#     Your response must be ONLY a valid JSON object with no additional text:

#     ```json
#     {{
#     "relevant": 1
#     }}
#     ```

#     OR

#     ```json
#     {{
#     "relevant": 0
#     }}
#     ```

#     ## Classification Rules:
#     - Output 1 if the HTML chunk contains content that matches the schema criteria
#     - Output 0 if the HTML chunk does not contain relevant content
#     - Consider semantic meaning, not just exact keyword matches
#     - Look at text content, attributes, structure, and context
#     - Ignore purely structural HTML elements (like divs, spans) unless they contain relevant content
#     - Be STRICT in your evaluation - only mark as relevant (1) if there is clear, meaningful content that directly relates to the schema
#     - Empty elements, placeholder text, navigation menus, headers/footers, and generic UI components should typically be marked as not relevant (0)
#     - The HTML chunk does not need to contain ALL schema information, but it must contain SUBSTANTIAL and SPECIFIC content related to the schema

#     CRITICAL: Your entire response MUST be exactly one JSON object. DO NOT include any explanations, reasoning, markdown formatting, code blocks, or additional text. Output ONLY the raw JSON object.
#     """

In [110]:
# Build the three stages that are later wired into `Pipeline`.
pre = BasicPreprocessor(
    config={'keep_tags': True},
)
# llm = GeminiLLMClient(config={'api_key': os.getenv('GEMINI_API_KEY'),})
# ai = AIExtractor(llm_client=llm ,prompt_template=prompt_template)
ai = LLMClassifierExtractor(
    reranker=reranker,
    llm_client=llm,
    prompt_template=prompt_template,
    classifier_prompt=classification_prompt_template,
)
post = PostProcessor()

In [111]:
# Hand-written HTML fragments used to exercise the reranker in isolation:
# one product card, one blog post (listed twice on purpose, matching the
# original four-element list) and one user review.
_product_card_html = """
    <div class="product-card">
        <h2 class="product-title">Wireless Noise Cancelling Headphones</h2>
        <p class="product-description">Experience immersive sound with active noise cancellation and long battery life.</p>
        <span class="price">$299.99</span>
        <button>Add to Cart</button>
    </div>
    """

_blog_post_html = """
    <section class="blog-post">
        <h1>Top 5 AI Tools to Try in 2025</h1>
        <p>Artificial intelligence continues to evolve. Here are five tools you should explore in 2025:</p>
        <ul>
            <li>LangChain</li>
            <li>AutoGen</li>
            <li>OpenDevin</li>
            <li>FastRAG</li>
            <li>GPTScript</li>
        </ul>
        <footer>Published by <strong>TechToday</strong> on June 30, 2025</footer>
    </section>
    """

_review_html = """
    <div class="review">
        <h3>User Review: Amazing Performance!</h3>
        <p>I’ve been using this laptop for a few months and it’s blazing fast. Great for deep learning workloads!</p>
        <div class="rating">Rating: ⭐⭐⭐⭐⭐</div>
        <span class="user">– Sarah M.</span>
    </div>
    """

# Order matters: the product card comes first, then the duplicated blog post,
# then the review.
html_chunks = [
    _product_card_html,
    _blog_post_html,
    _blog_post_html,
    _review_html,
]


In [112]:
# Sanity check: number of sample chunks handed to the reranker in the next cell.
len(html_chunks)

4

In [113]:
# Rank the sample chunks against the schema-derived query. The call prints its
# raw, sigmoid and normalized scores plus the filtered pairs as a side effect
# (see the cell output).
output = reranker.rerank(
    query=classification_prompt_template,
    passages=html_chunks,
)

raw scores [-11.34375   -13.8984375 -14.7578125 -14.7578125]
Sigmoid scores: [1.18431390e-05 9.20417541e-07 3.89729515e-07 3.89729515e-07]
Normalized scores: [1.        0.0463345 0.        0.       ]
Filtered pairs:
[(Document(metadata={'relevance_score': -11.34375}, page_content='\n    <div class="product-card">\n        <h2 class="product-title">Wireless Noise Cancelling Headphones</h2>\n        <p class="product-description">Experience immersive sound with active noise cancellation and long battery life.</p>\n        <span class="price">$299.99</span>\n        <button>Add to Cart</button>\n    </div>\n    '), 1.0)]


In [114]:
# Show each chunk returned by the reranker, followed by an 80-character rule.
for passage in output:
    print(passage)
    print('-' * 80)


    <div class="product-card">
        <h2 class="product-title">Wireless Noise Cancelling Headphones</h2>
        <p class="product-description">Experience immersive sound with active noise cancellation and long battery life.</p>
        <span class="price">$299.99</span>
        <button>Add to Cart</button>
    </div>
    
--------------------------------------------------------------------------------


In [115]:
# Number of chunks the reranker returned after its filtering step.
len(output)

1

In [116]:
# ai.extract(chunks=["the price is $1000", "the title is 'NVIDIA H100 SXM'"])

In [117]:
# Wire the preprocessor, extractor and post-processor into a single pipeline.
pipe = Pipeline(
    preprocessor=pre,
    ai_extractor=ai,
    postprocessor=post,
)

In [118]:
# Same option that was passed inline to BasicPreprocessor earlier.
# NOTE(review): no later cell visible in this notebook reads `config`.
config = dict(keep_tags=True)

In [119]:
# import os

# content = """<html>
# <body>
# """

# from web2json.ai_extractor import HFRerankerClient, LLMClassifierExtractor, NvidiaLLMClient

# hf_reranker = HFRerankerClient()
# llm_client = NvidiaLLMClient(config={"api_key": os.environ.get('NVIDIA_API_KEY')})
# extractor = LLMClassifierExtractor(
#     reranker=hf_reranker,
#     llm_client=llm_client,
#     prompt_template="Extract from: {content} using schema: {schema}",
#     classifier_prompt="What is the price?"
# )

# # Run using HuggingFace reranker
# result = extractor.extract(content=content, schema=schema, hf=True)


In [120]:
# End-to-end run: `content` is a URL (is_url=True) and the result is displayed
# as the cell output.
# NOTE(review): hf=True is passed although `reranker` was built as an
# NvidiaRerankerClient; the commented-out example in this notebook pairs
# hf=True with HFRerankerClient — confirm the flag is intended here.
pipe.run(
    content=url,
    is_url=True,
    schema=schema,
    hf=True,
)

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
raw scores [-12.40625   -12.453125  -12.46875   -12.5546875 -12.6875   ]
Sigmoid scores: [4.09291105e-06 3.90548374e-06 3.84493506e-06 3.52831186e-06
 3.08949445e-06]
Normalized scores: [1.         0.81321087 0.75286836 0.43732325 0.        ]
Filtered pairs:
[(Document(metadata={'relevance_score': -12.40625}, page_content='<html><body><div><div><div><div><div><div>\n<div> <h1> About this item </h1> <ul> <li><span> 7-IN-1 FUNCTIONALITY: Pressure cook, slow cook, rice cooker, yogurt maker, steamer, sauté pan and food warmer.  </span></li> <li><span> QUICK ONE-TOUCH COOKING: 13 customizable Smart Programs for pressure cooking ribs, soups, beans, rice, poultry, yogurt, desserts and more.  </span></li> <li><span> COOK FAST OR SLOW: Pressure cook delicious one-pot meals up to 70% faster than traditional cooking methods or slow cook your favorite traditional recipes – just like grandma used to make.  </span></li>

{'productTitle': 'Instant Pot RIO, 7-in-1 Electric Multi-Cooker, PressureCooker, SlowCooker, RiceCooker, Steamer, Sauté, Yogurt Maker, & Warmer, Includes App With Over 800 Recipes, 6 Quart',
 'price': 94.95,
 'manufacturer': 'Instant Pot'}