diff --git "a/test.ipynb" "b/test.ipynb"
deleted file mode 100644--- "a/test.ipynb"
+++ /dev/null
@@ -1,821 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 1,
- "id": "5223b1b7",
- "metadata": {},
- "outputs": [],
- "source": [
- "from web2json.preprocessor import *\n",
- "from web2json.ai_extractor import *\n",
- "from web2json.postprocessor import *\n",
- "from web2json.pipeline import *"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "id": "ae4e7f03",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "True"
- ]
- },
- "execution_count": 2,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "import dotenv\n",
- "dotenv.load_dotenv()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "id": "9e6b0eb9",
- "metadata": {},
- "outputs": [],
- "source": [
- "import os  # explicit import: do not rely on `os` leaking in through the web2json star-imports\n",
- "\n",
- "# Look the key up once so both NVIDIA clients are guaranteed to use the same credential.\n",
- "nvidia_api_key = os.getenv('NVIDIA_API_KEY')\n",
- "\n",
- "# LLM used for the extraction step and reranker used to score HTML chunks.\n",
- "llm = NvidiaLLMClient(config={'api_key': nvidia_api_key,'model_name': 'qwen/qwen2.5-7b-instruct'})\n",
- "reranker = NvidiaRerankerClient(config={'api_key': nvidia_api_key,'model_name': 'nv-rerank-qa-mistral-4b:1'})\n",
- "# reranker = HFRerankerClient()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "id": "3bc223d0",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Extraction prompt; it is passed to the extractor below as `prompt_template`.\n",
- "# The text carries two named placeholders, {content} and {schema}. How they are filled in is\n",
- "# decided inside the extractor, which is not shown here (presumably str.format -- TODO confirm).\n",
- "prompt_template = \"\"\"\n",
- "You are a helpful assistant that extracts structured data from web pages.\n",
- "You will be given a web page and you need to extract the following information:\n",
- "{content}\n",
- "\n",
- "schema: {schema}\n",
- "Please provide the extracted data in JSON format.\n",
- "WITH ONLY THE FIELDS THAT ARE IN THE SCHEMA.\n",
- "\"\"\""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "id": "475fccd2",
- "metadata": {},
- "outputs": [],
- "source": [
- "# NOTE: despite its name, this variable currently holds a JSON-schema-like description of the\n",
- "# wanted fields (title, price, description), not a prompt with placeholders.\n",
- "# It is passed as `classifier_prompt` to the extractor and as the reranker `query` further down.\n",
- "classification_prompt_template = \"\"\"\n",
- "{\n",
- " \"title\": {\"type\": \"string\", \"description\": \"Page title\"},\n",
- " \"price\": {\"type\": \"number\", \"description\": \"Product price\"},\n",
- " \"description\": {\"type\": \"string\", \"description\": \"Product description\"}\n",
- "}\n",
- "\"\"\""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "id": "974417de",
- "metadata": {},
- "outputs": [],
- "source": [
- "# classification_prompt_template = \"\"\"\n",
- "# # HTML Chunk Relevance Classification Prompt\n",
- "\n",
- "# You are an HTML content classifier. Your task is to analyze an HTML chunk against a given schema and determine if the content is relevant.\n",
- "\n",
- "# ## Instructions:\n",
- "# 1. Carefully examine the provided HTML chunk\n",
- "# 2. Compare it against the given schema/criteria\n",
- "# 3. Determine if the HTML chunk contains content that matches or is relevant to the schema\n",
- "# 4. Respond with ONLY a JSON object containing a single field \"relevant\" with value 1 (relevant) or 0 (not relevant)\n",
- "\n",
- "# ## Input Format:\n",
- "# **Schema/Criteria:**\n",
- "# {schema}\n",
- "\n",
- "# **HTML Chunk:**\n",
- "# ```html\n",
- "# {content}\n",
- "# ```\n",
- "\n",
- "# ## Output Format:\n",
- "# Your response must be ONLY a valid JSON object with no additional text:\n",
- "\n",
- "# ```json\n",
- "# {{\n",
- "# \"relevant\": 1\n",
- "# }}\n",
- "# ```\n",
- "\n",
- "# OR\n",
- "\n",
- "# ```json\n",
- "# {{\n",
- "# \"relevant\": 0\n",
- "# }}\n",
- "# ```\n",
- "\n",
- "# ## Classification Rules:\n",
- "# - Output 1 if the HTML chunk contains content that matches the schema criteria\n",
- "# - Output 0 if the HTML chunk does not contain relevant content\n",
- "# - Consider semantic meaning, not just exact keyword matches\n",
- "# - Look at text content, attributes, structure, and context\n",
- "# - Ignore purely structural HTML elements (like divs, spans) unless they contain relevant content\n",
- "# - Be STRICT in your evaluation - only mark as relevant (1) if there is clear, meaningful content that directly relates to the schema\n",
- "# - Empty elements, placeholder text, navigation menus, headers/footers, and generic UI components should typically be marked as not relevant (0)\n",
- "# - The HTML chunk does not need to contain ALL schema information, but it must contain SUBSTANTIAL and SPECIFIC content related to the schema\n",
- "\n",
- "# CRITICAL: Your entire response MUST be exactly one JSON object. DO NOT include any explanations, reasoning, markdown formatting, code blocks, or additional text. Output ONLY the raw JSON object.\n",
- "# \"\"\""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "id": "58436d65",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Build the three stages that are later handed to Pipeline: preprocessor, AI extractor, post-processor.\n",
- "pre = BasicPreprocessor(config={'keep_tags': True})\n",
- "\n",
- "# Alternative extractor setup, kept for reference:\n",
- "# llm = GeminiLLMClient(config={'api_key': os.getenv('GEMINI_API_KEY'),})\n",
- "# ai = AIExtractor(llm_client=llm ,prompt_template=prompt_template)\n",
- "ai = LLMClassifierExtractor(\n",
- "    reranker=reranker,\n",
- "    llm_client=llm,\n",
- "    prompt_template=prompt_template,\n",
- "    classifier_prompt=classification_prompt_template,\n",
- ")\n",
- "\n",
- "post = PostProcessor()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 41,
- "id": "c4e75e63",
- "metadata": {},
- "outputs": [],
- "source": [
- "html_chunks = [\n",
- " \"\"\"\n",
- "
\n",
- "
Wireless Noise Cancelling Headphones \n",
- "
Experience immersive sound with active noise cancellation and long battery life.
\n",
- "
$299.99 \n",
- "
Add to Cart \n",
- "
\n",
- " \"\"\",\n",
- "\n",
- " \"\"\"\n",
- " \n",
- " Top 5 AI Tools to Try in 2025 \n",
- " Artificial intelligence continues to evolve. Here are five tools you should explore in 2025:
\n",
- " \n",
- " LangChain \n",
- " AutoGen \n",
- " OpenDevin \n",
- " FastRAG \n",
- " GPTScript \n",
- " \n",
- " Published by TechToday on June 30, 2025 \n",
- " \n",
- " \"\"\",\n",
- "\n",
- " \"\"\"\n",
- " \n",
- " Top 5 AI Tools to Try in 2025 \n",
- " Artificial intelligence continues to evolve. Here are five tools you should explore in 2025:
\n",
- " \n",
- " LangChain \n",
- " AutoGen \n",
- " OpenDevin \n",
- " FastRAG \n",
- " GPTScript \n",
- " \n",
- " Published by TechToday on June 30, 2025 \n",
- " \n",
- " \"\"\",\n",
- "\n",
- " \"\"\"\n",
- " شسيمنبتشسينبتشسكنميبتكشسمينتبك\n",
- " \"\"\"\n",
- "]\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 42,
- "id": "bb4edecf",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "4"
- ]
- },
- "execution_count": 42,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "len(html_chunks)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "id": "9927a78e",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Bonjour\n",
- " [Document(metadata={}, page_content='\\n \\n
Wireless Noise Cancelling Headphones \\n
Experience immersive sound with active noise cancellation and long battery life.
\\n
$299.99 \\n
Add to Cart \\n
\\n '), Document(metadata={}, page_content='\\n \\n Top 5 AI Tools to Try in 2025 \\n Artificial intelligence continues to evolve. Here are five tools you should explore in 2025:
\\n \\n LangChain \\n AutoGen \\n OpenDevin \\n FastRAG \\n GPTScript \\n \\n Published by TechToday on June 30, 2025 \\n \\n '), Document(metadata={}, page_content='\\n \\n Top 5 AI Tools to Try in 2025 \\n Artificial intelligence continues to evolve. Here are five tools you should explore in 2025:
\\n \\n LangChain \\n AutoGen \\n OpenDevin \\n FastRAG \\n GPTScript \\n \\n Published by TechToday on June 30, 2025 \\n \\n '), Document(metadata={}, page_content='\\n \\n
User Review: Amazing Performance! \\n
I’ve been using this laptop for a few months and it’s blazing fast. Great for deep learning workloads!
\\n
Rating: ⭐⭐⭐⭐⭐
\\n
– Sarah M. \\n
\\n ')]\n",
- " \n",
- "{\n",
- " \"title\": {\"type\": \"string\", \"description\": \"Page title\"},\n",
- " \"price\": {\"type\": \"number\", \"description\": \"Product price\"},\n",
- " \"description\": {\"type\": \"string\", \"description\": \"Product description\"}\n",
- "}\n",
- "\n",
- "Scored Docs [Document(metadata={'relevance_score': -11.25}, page_content='\\n \\n
Wireless Noise Cancelling Headphones \\n
Experience immersive sound with active noise cancellation and long battery life.
\\n
$299.99 \\n
Add to Cart \\n
\\n '), Document(metadata={'relevance_score': -15.2265625}, page_content='\\n \\n Top 5 AI Tools to Try in 2025 \\n Artificial intelligence continues to evolve. Here are five tools you should explore in 2025:
\\n \\n LangChain \\n AutoGen \\n OpenDevin \\n FastRAG \\n GPTScript \\n \\n Published by TechToday on June 30, 2025 \\n \\n '), Document(metadata={'relevance_score': -15.2265625}, page_content='\\n \\n Top 5 AI Tools to Try in 2025 \\n Artificial intelligence continues to evolve. Here are five tools you should explore in 2025:
\\n \\n LangChain \\n AutoGen \\n OpenDevin \\n FastRAG \\n GPTScript \\n \\n Published by TechToday on June 30, 2025 \\n \\n '), Document(metadata={'relevance_score': -15.859375}, page_content='\\n \\n
User Review: Amazing Performance! \\n
I’ve been using this laptop for a few months and it’s blazing fast. Great for deep learning workloads!
\\n
Rating: ⭐⭐⭐⭐⭐
\\n
– Sarah M. \\n
\\n ')]\n",
- "Ayeeeee\n",
- "Docs Value: [Document(metadata={'relevance_score': -11.25, 'softmax_score': 0.9546922134634852, 'minmax_score': 1.0}, page_content='\\n \\n
Wireless Noise Cancelling Headphones \\n
Experience immersive sound with active noise cancellation and long battery life.
\\n
$299.99 \\n
Add to Cart \\n
\\n '), Document(metadata={'relevance_score': -15.2265625, 'softmax_score': 0.017900461577508887, 'minmax_score': 0.13728813559322034}, page_content='\\n \\n Top 5 AI Tools to Try in 2025 \\n Artificial intelligence continues to evolve. Here are five tools you should explore in 2025:
\\n \\n LangChain \\n AutoGen \\n OpenDevin \\n FastRAG \\n GPTScript \\n \\n Published by TechToday on June 30, 2025 \\n \\n '), Document(metadata={'relevance_score': -15.2265625, 'softmax_score': 0.017900461577508887, 'minmax_score': 0.13728813559322034}, page_content='\\n \\n Top 5 AI Tools to Try in 2025 \\n Artificial intelligence continues to evolve. Here are five tools you should explore in 2025:
\\n \\n LangChain \\n AutoGen \\n OpenDevin \\n FastRAG \\n GPTScript \\n \\n Published by TechToday on June 30, 2025 \\n \\n '), Document(metadata={'relevance_score': -15.859375, 'softmax_score': 0.009506863381497203, 'minmax_score': 0.0}, page_content='\\n \\n
User Review: Amazing Performance! \\n
I’ve been using this laptop for a few months and it’s blazing fast. Great for deep learning workloads!
\\n
Rating: ⭐⭐⭐⭐⭐
\\n
– Sarah M. \\n
\\n ')]\n",
- "Final [Document(metadata={'relevance_score': -11.25, 'softmax_score': 0.9546922134634852, 'minmax_score': 1.0}, page_content='\\n \\n
Wireless Noise Cancelling Headphones \\n
Experience immersive sound with active noise cancellation and long battery life.
\\n
$299.99 \\n
Add to Cart \\n
\\n '), Document(metadata={'relevance_score': -15.2265625, 'softmax_score': 0.017900461577508887, 'minmax_score': 0.13728813559322034}, page_content='\\n \\n Top 5 AI Tools to Try in 2025 \\n Artificial intelligence continues to evolve. Here are five tools you should explore in 2025:
\\n \\n LangChain \\n AutoGen \\n OpenDevin \\n FastRAG \\n GPTScript \\n \\n Published by TechToday on June 30, 2025 \\n \\n '), Document(metadata={'relevance_score': -15.2265625, 'softmax_score': 0.017900461577508887, 'minmax_score': 0.13728813559322034}, page_content='\\n \\n Top 5 AI Tools to Try in 2025 \\n Artificial intelligence continues to evolve. Here are five tools you should explore in 2025:
\\n \\n LangChain \\n AutoGen \\n OpenDevin \\n FastRAG \\n GPTScript \\n \\n Published by TechToday on June 30, 2025 \\n \\n '), Document(metadata={'relevance_score': -15.859375, 'softmax_score': 0.009506863381497203, 'minmax_score': 0.0}, page_content='\\n \\n
User Review: Amazing Performance! \\n
I’ve been using this laptop for a few months and it’s blazing fast. Great for deep learning workloads!
\\n
Rating: ⭐⭐⭐⭐⭐
\\n
– Sarah M. \\n
\\n ')]\n"
- ]
- }
- ],
- "source": [
- "output = reranker.rerank(query=classification_prompt_template,passages=html_chunks, threshold=0)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "b77015f3",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "page_content='\n",
- " \n",
- "
Wireless Noise Cancelling Headphones \n",
- "
Experience immersive sound with active noise cancellation and long battery life.
\n",
- "
$299.99 \n",
- "
Add to Cart \n",
- "
\n",
- " ' metadata={'relevance_score': -11.25, 'softmax_score': 0.9546922134634852, 'minmax_score': 1.0}\n",
- "--------------------------------------------------------------------------------\n",
- "page_content='\n",
- " \n",
- " Top 5 AI Tools to Try in 2025 \n",
- " Artificial intelligence continues to evolve. Here are five tools you should explore in 2025:
\n",
- " \n",
- " LangChain \n",
- " AutoGen \n",
- " OpenDevin \n",
- " FastRAG \n",
- " GPTScript \n",
- " \n",
- " Published by TechToday on June 30, 2025 \n",
- " \n",
- " ' metadata={'relevance_score': -15.2265625, 'softmax_score': 0.017900461577508887, 'minmax_score': 0.13728813559322034}\n",
- "--------------------------------------------------------------------------------\n",
- "page_content='\n",
- " \n",
- " Top 5 AI Tools to Try in 2025 \n",
- " Artificial intelligence continues to evolve. Here are five tools you should explore in 2025:
\n",
- " \n",
- " LangChain \n",
- " AutoGen \n",
- " OpenDevin \n",
- " FastRAG \n",
- " GPTScript \n",
- " \n",
- " Published by TechToday on June 30, 2025 \n",
- " \n",
- " ' metadata={'relevance_score': -15.2265625, 'softmax_score': 0.017900461577508887, 'minmax_score': 0.13728813559322034}\n",
- "--------------------------------------------------------------------------------\n",
- "page_content='\n",
- " \n",
- "
User Review: Amazing Performance! \n",
- "
I’ve been using this laptop for a few months and it’s blazing fast. Great for deep learning workloads!
\n",
- "
Rating: ⭐⭐⭐⭐⭐
\n",
- "
– Sarah M. \n",
- "
\n",
- " ' metadata={'relevance_score': -15.859375, 'softmax_score': 0.009506863381497203, 'minmax_score': 0.0}\n",
- "--------------------------------------------------------------------------------\n"
- ]
- }
- ],
- "source": [
- "# Show each reranked document followed by an 80-dash separator line.\n",
- "for doc in output:\n",
- "    print(doc, '-'*80, sep='\\n')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "id": "bb3fa1b0",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "4"
- ]
- },
- "execution_count": 14,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "len(output)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "id": "c1c43f7c",
- "metadata": {},
- "outputs": [],
- "source": [
- "# ai.extract(chunks=[\"the price is $1000\", \"the title is 'NVIDIA H100 SXM'\"])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 17,
- "id": "9c78eec9",
- "metadata": {},
- "outputs": [],
- "source": [
- "pipe = Pipeline(preprocessor=pre, ai_extractor=ai, postprocessor=post)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 18,
- "id": "0b324a01",
- "metadata": {},
- "outputs": [],
- "source": [
- "from pydantic import BaseModel, Field, constr, condecimal\n",
- "\n",
- "# Pydantic model describing the fields to extract; it is assigned to `schema` further down.\n",
- "# Every field is declared with Field(...), i.e. without a default value.\n",
- "# Comments are used instead of a class docstring on purpose, so nothing about the model itself changes.\n",
- "class ProductModel(BaseModel):\n",
- " # Product title, constrained to 1-200 characters.\n",
- " productTitle: constr(min_length=1, max_length=200) = Field(\n",
- " ...,\n",
- " title=\"Product Title\",\n",
- " description=\"The full title of the product\"\n",
- " )\n",
- " # Price as a decimal constrained with gt=0 and decimal_places=2.\n",
- " price: condecimal(gt=0, decimal_places=2) = Field(\n",
- " ...,\n",
- " title=\"Product Price\",\n",
- " description=\"Unit price (must be > 0, two decimal places).\"\n",
- " )\n",
- " # Manufacturer name, constrained to 1-1000 characters.\n",
- " manufacturer: constr(min_length=1, max_length=1000) = Field(\n",
- " ...,\n",
- " title=\"Manufacturer\",\n",
- " description=\"Name of the product manufacturer.\"\n",
- " )\n",
- "\n",
- " "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 19,
- "id": "92a5fc23",
- "metadata": {},
- "outputs": [],
- "source": [
- "config = {\n",
- " 'keep_tags': True,\n",
- "}"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 20,
- "id": "d2cfb033",
- "metadata": {},
- "outputs": [],
- "source": [
- "# url = \"https://www.amazon.com/Instant-Pot-Multi-Use-Programmable-Pressure/dp/B00FLYWNYQ?_encoding=UTF8&content-id=amzn1.sym.2f889ce0-246f-467a-a086-d9a721167240&dib=eyJ2IjoiMSJ9.2EzBddTDEktLY8ckTsraM_cZ6pzKuNkA6y_gLR0-Uz1ekttQU6tuQEcjb8PThy0PfhvxLqeYWh3N7pQrGgRxAWzapVklC_aU6xBzD-3Wwqx3qyQRHsmOhPRsSpeCOIIZqS3SKDowZEPYrGnCbRMt5vxnsYMW-fD-zBbgeoeGYmbsN2U6_HNhLjrpePKCbQPmnZBJ9UhgYE4fE3DVuYm8xlJe9l5GixDLVFtZUq4m5FE.Ol-jiuu9P6mQie0yXLJj-Ht5-TXmIXuRPije85p_YVo&dib_tag=se&keywords=cooker&pd_rd_r=2cede598-f3ae-49ca-8a46-e5945a9c2631&pd_rd_w=2HLSC&pd_rd_wg=ZyUUn&qid=1749508157&sr=8-3\"\n",
- "schema = ProductModel # pydantic class\n",
- "\n",
- "\n",
- "# read html file\n",
- "# The sample page lives outside the repository. Set the WEB2JSON_SAMPLE_HTML environment\n",
- "# variable to point at your own copy; the default below is the original author's local path\n",
- "# and is kept only so existing setups keep working.\n",
- "import os\n",
- "from pathlib import Path\n",
- "\n",
- "sample_html_path = Path(os.environ.get(\n",
- "    'WEB2JSON_SAMPLE_HTML',\n",
- "    r'C:\Users\abdfa\Desktop\UNI STUFFING\GRADUATION PROJECT\Group Work\Experimental\data\tmp\SWDE-Camera\files\camera-amazon(1767)\0000.htm',\n",
- "))\n",
- "# Same decoding as before (text mode, UTF-8); the file handle is closed automatically.\n",
- "content = sample_html_path.read_text(encoding='utf-8')\n",
- "\n",
- "# with open(r'C:\\Users\\abdfa\\Desktop\\UNI STUFFING\\GRADUATION PROJECT\\Group Work\\MCP_WEB2JSON\\Amazon.com_ Instant Pot Duo 7-in-1 Electric Pressure Cooker, Slow Cooker, Rice Cooker, Steamer, Sauté, Yogurt Maker, Warmer & Sterilizer, Includes App With Over 800 Recipes, Stainless Steel, 6 Quart.htm', 'r', encoding='utf-8') as file:\n",
- "# content = file.read()\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 18,
- "id": "f07e1aca",
- "metadata": {},
- "outputs": [],
- "source": [
- "# import os\n",
- "\n",
- "# content = \"\"\"\n",
- "# \n",
- "# \"\"\"\n",
- "\n",
- "# from web2json.ai_extractor import HFRerankerClient, LLMClassifierExtractor, NvidiaLLMClient\n",
- "\n",
- "# hf_reranker = HFRerankerClient()\n",
- "# llm_client = NvidiaLLMClient(config={\"api_key\": os.environ.get('NVIDIA_API_KEY')})\n",
- "# extractor = LLMClassifierExtractor(\n",
- "# reranker=hf_reranker,\n",
- "# llm_client=llm_client,\n",
- "# prompt_template=\"Extract from: {content} using schema: {schema}\",\n",
- "# classifier_prompt=\"What is the price?\"\n",
- "# )\n",
- "\n",
- "# # Run using HuggingFace reranker\n",
- "# result = extractor.extract(content=content, schema=schema, hf=True)\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 21,
- "id": "79cf2321",
- "metadata": {},
- "outputs": [],
- "source": [
- "# pipe.run(content=url,is_url=True, schema=schema, hf=False)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 22,
- "id": "d5f9ae67",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "'\\ufeff \\n\\n \\n\\n \\n \\n \\n\\n \\n \\n \\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n \\n \\n \\n\\n\\n\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n \\n \\n \\n \\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\t\\n\\t\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n \\n\\n\\n\\n \\n \\n\\n \\n\\n\\n\\n\\n \\n\\n\\n\\n\\t\\n\\t\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\n \\n\\n\\n\\n \\n\\n\\n\\n\\n \\n\\n \\n\\n\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n \\t \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n \\n \\n\\n\\n\\n \\n \\n \\n\\n \\n\\n\\n\\n\\n\\n\\n\\n \\n \\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n \\n\\n\\n\\n \\n \\n \\n\\n \\nAmazon.com: Canon PowerShot SD1300 IS 12.1 MP Digital Camera with 4x Wide Angle Optical Image Stabilized Zoom and 2.7-Inch LCD (Silver): Camera & Photo \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n
\\n
\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n
\\n\\n\\n
\\n\\n\\n \\n amazon.com \\n \\n \\n \\n \\n \\n
Hello. \\n
Sign in to get personalized recommendations.\\n
New customer? Start here . \\n
\\n \\n \\n \\n
\\n \\n \\n\\n\\n \\n\\n \\n \\n\\n\\n \\n \\n \\n\\n\\n \\n \\n \\n \\n \\n \\n \\n \\n\\n
\\n\\n\\n\\n
\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n \\n \\n \\n \\n\\n\\n\\n \\n \\n\\n \\n \\n \\n\\n \\n \\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n \\n \\n \\n \\n \\n\\n\\n \\n\\n\\n \\n\\n \\n\\n\\n\\n\\n\\n\\n
\\n
\\n
\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n \\n \\n\\n\\n\\n\\n\\n
\\n
\\n
\\n
\\n
\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n
\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n
\\n\\n\\n\\n\\n\\n\\n\\n
\\n\\n \\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n
\\n
\\n\\n\\n\\n\\n\\n
\\n \\n\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n \\n\\n\\n\\n\\n
\\n\\n\\n\\n
\\n\\n \\n \\n \\n \\n \\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n
\\n
\\n
Product Features and Technical Details \\n \\n\\n \\n\\n
Color: Silver
\\n\\n\\n\\n \\n
\\n \\n
\\n Product Features\\n \\n
\\n
\\n \\n 28mm wide-angle lens; 4x optical zoom and Optical Image Stabilizer \\n Smart AUTO mode intelligently selects from 18 predefined settings \\n 12.1-megapixel resolution allows you to print large size images with clarity and detail \\n Clear 2.7-inch PureColor System LCD \\n Shoot in Low Light mode for dimly-lit situations \\n \\n \\n
\\n \\n
\\n \\n
\\n Technical Details\\n \\n
\\n \\n
\\n Brand Name : Canon \\nModel : SD1300IS Silver \\nOptical Sensor Resolution : 12.1 MP \\nOptical Sensor Technology : CCD \\nOptical zoom : 4 x \\nMaximum Aperture Range : F/2.8-5.9 \\nMinimum focal length : 5 millimeters \\nMaximum focal length : 20 millimeters \\nLens Type : Zoom lens \\nOptical Sensor Size : 1/2.3\" \\nIncluded Flash Type : Built-in flash \\nDisplay Size : 2.700 inches \\nLight Sensitivity : ISO 100, ISO 800, ISO 400, ISO 200, ISO 80, ISO auto, ISO 1600 \\nImage types : JPEG \\nShooting Modes : Frame movie mode \\nExposure Control Type : Beach, Kids & pets, Snow, Portrait mode, Slow shutter, Night mode, Fireworks, Low light, Snap shot, Underwater, Indoor, Foliage \\nViewfinder Type : None \\nWidth : 3.6 inches \\nHeight : 2.2 inches \\nWeight : 4.9 pounds \\n\\n \\n \\n
\\n \\n \\n
\\n
\\n \\n\\n\\n\\n\\n\\n\\n
\\n\\n\\n\\n
\\n
\\n\\n\\n\\n\\n\\n\\n\\n
\\n\\n
\\n
\\n
\\n
\\n
\\n
\\n
\\n
Alert Me
\\n
\\n
\\n\\n \\n\\n
Color: Silver
\\n\\n\\n\\n
\\n
Want us to e-mail you when this item becomes available? \\n
\\n
\\n
\\n
\\n
\\n\\n
\\n\\n
\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n
\\n
\\n \\n
\\n
\\n
\\n \\n
\\n\\n\\n\\n\\n\\n\\n
\\n\\n\\n\\n\\n\\n\\n\\n
\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n
\\n\\n
\\n
\\n
\\n
\\n
\\n
\\n
\\n \\n
\\n\\n\\n\\n\\n\\n
\\n \\n \\n \\n
After viewing product detail pages or search results, look here to find an easy way to navigate back to pages you are interested in.
\\n
\\n \\n \\n \\n \\n \\n
\\n
\\n
\\n\\n\\n
\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n \\n\\n\\n
\\n\\n\\n\\n \\n \\n \\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n'"
- ]
- },
- "execution_count": 22,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "content"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "371f7a17",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "tensor([0., 0., 1.])"
- ]
- },
- "execution_count": 5,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "import torch\n",
- "\n",
- "# Toy scores used to sanity-check the two normalisation steps below.\n",
- "raw_scores = torch.tensor([0.6, 0.3, 0.06, 0.01])\n",
- "\n",
- "# Step 1: rescale so the scores sum to 1 (plain sum-normalisation, not an actual softmax).\n",
- "sum_normalized = raw_scores / raw_scores.sum()\n",
- "\n",
- "# Step 2: min-max normalise to [0, 1].\n",
- "# If every score is identical the range is 0 and the plain formula would produce NaN,\n",
- "# so fall back to all zeros in that case.\n",
- "score_range = sum_normalized.max() - sum_normalized.min()\n",
- "if score_range > 0:\n",
- "    softmax_scores = (sum_normalized - sum_normalized.min()) / score_range\n",
- "else:\n",
- "    softmax_scores = torch.zeros_like(sum_normalized)\n",
- "\n",
- "# Final name kept as `softmax_scores` so anything reading it afterwards still works.\n",
- "softmax_scores"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 25,
- "id": "393134a1",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "ERROR\tProperty: Invalid value for \"CSS Level 2.1\" property: expression(this.scrollHeight > 183?\"184px\": \"auto\") [260:7: height]\n",
- "ERROR\tProperty: Invalid value for \"CSS3 Basic User Interface Module\" property: hand [445:3: cursor]\n",
- "ERROR\tSelector: Unexpected IDENT. [461:2: html]\n",
- "ERROR\tSelectorList: Invalid Selector: *html div.mbcTradeIn\n",
- "WARNING\tProperty: Unknown Property name. [489:9: -moz-border-radius-topright]\n",
- "WARNING\tProperty: Unknown Property name. [490:9: -moz-border-radius-bottomright]\n",
- "WARNING\tProperty: Unknown Property name. [491:9: -webkit-border-top-right-radius]\n",
- "WARNING\tProperty: Unknown Property name. [492:9: -webkit-border-bottom-right-radius]\n",
- "WARNING\tProperty: Unknown Property name. [506:9: -moz-box-shadow]\n",
- "WARNING\tProperty: Unknown Property name. [516:9: text-size]\n",
- "ERROR\tPropertyValue: No match: ('CHAR', ':', 518, 15)\n",
- "ERROR\tPropertyValue: Unknown syntax or no value: center\n",
- " cursor: pointer\n",
- "ERROR\tCSSStyleDeclaration: Syntax Error in Property: text-align: center\n",
- " cursor: pointer\n",
- "WARNING\tProperty: Unknown Property name. [577:9: -moz-border-radius-topright]\n",
- "WARNING\tProperty: Unknown Property name. [578:9: -moz-border-radius-bottomright]\n",
- "WARNING\tProperty: Unknown Property name. [579:9: -webkit-border-top-right-radius]\n",
- "WARNING\tProperty: Unknown Property name. [580:9: -webkit-border-bottom-right-radius]\n",
- "WARNING\tProperty: Unknown Property name. [606:9: filter]\n",
- "WARNING\tProperty: Unknown Property name. [615:9: filter]\n",
- "ERROR\tProperty: Invalid value for \"CSS Level 2.1\" property: 999999 [639:3: color]\n",
- "ERROR\tProperty: Invalid value for \"CSS Level 2.1\" property: 999999 [646:3: color]\n",
- "WARNING\tProperty: Unknown Property name. [685:3: _background-image]\n",
- "ERROR\tProperty: Invalid value for \"CSS3 Basic User Interface Module\" property: hand [954:2: cursor]\n",
- "ERROR\tProperty: Invalid value for \"CSS Level 2.1\" property: -moz-inline-box [1169:5: display]\n",
- "ERROR\tPropertyValue: Missing token for production Choice(ColorValue, Dimension, URIValue, Value, variable, MSValue, CSSCalc, function): ('CHAR', '?', 1787, 57)\n",
- "ERROR\tPropertyValue: Unknown syntax or no value: expression((document.body.clientWidth < 936) ? \"895px\" : \"100%\" )\n",
- "ERROR\tCSSStyleDeclaration: Syntax Error in Property: width: expression((document.body.clientWidth < 936) ? \"895px\" : \"100%\" )\n",
- "ERROR\tProperty: Invalid value for \"CSS3 Basic User Interface Module\" property: hand [1972:2: cursor]\n",
- "ERROR\tProperty: Invalid value for \"CSS Level 2.1\" property: block\t [2082:16: display]\n",
- "ERROR\tProperty: Invalid value for \"CSS3 Basic User Interface Module\" property: hand [106:5: cursor]\n",
- "ERROR\tProperty: Invalid value for \"CSS3 Basic User Interface Module\" property: hand [116:5: cursor]\n",
- "ERROR\tProperty: Invalid value for \"CSS3 Basic User Interface Module\" property: hand [126:5: cursor]\n",
- "ERROR\tProperty: Invalid value for \"CSS3 Basic User Interface Module\" property: hand [136:5: cursor]\n",
- "ERROR\tProperty: Invalid value for \"CSS3 Basic User Interface Module\" property: hand [146:5: cursor]\n",
- "ERROR\tProperty: Invalid value for \"CSS3 Basic User Interface Module\" property: hand [156:5: cursor]\n",
- "WARNING\tProperty: Unknown Property name. [160:5: hight]\n",
- "ERROR\tProperty: Invalid value for \"CSS3 Basic User Interface Module\" property: hand [162:5: cursor]\n",
- "ERROR\tProperty: Invalid value for \"CSS3 Basic User Interface Module\" property: hand [166:5: cursor]\n",
- "ERROR\tProperty: Invalid value for \"CSS Level 2.1\" property: -moz-inline-box [173:5: display]\n",
- "ERROR\tProperty: Invalid value for \"CSS Level 2.1\" property: -moz-inline-box [190:28: display]\n",
- "ERROR\tProperty: Invalid value for \"CSS Level 2.1\" property: 0 5 [237:2: background-position]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "2\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "ERROR\tProperty: Invalid value for \"CSS Level 2.1\" property: expression(this.scrollHeight > 183?\"184px\": \"auto\") [260:7: height]\n",
- "ERROR\tProperty: Invalid value for \"CSS3 Basic User Interface Module\" property: hand [445:3: cursor]\n",
- "ERROR\tSelector: Unexpected IDENT. [461:2: html]\n",
- "ERROR\tSelectorList: Invalid Selector: *html div.mbcTradeIn\n",
- "WARNING\tProperty: Unknown Property name. [489:9: -moz-border-radius-topright]\n",
- "WARNING\tProperty: Unknown Property name. [490:9: -moz-border-radius-bottomright]\n",
- "WARNING\tProperty: Unknown Property name. [491:9: -webkit-border-top-right-radius]\n",
- "WARNING\tProperty: Unknown Property name. [492:9: -webkit-border-bottom-right-radius]\n",
- "WARNING\tProperty: Unknown Property name. [506:9: -moz-box-shadow]\n",
- "WARNING\tProperty: Unknown Property name. [516:9: text-size]\n",
- "ERROR\tPropertyValue: No match: ('CHAR', ':', 518, 15)\n",
- "ERROR\tPropertyValue: Unknown syntax or no value: center\n",
- " cursor: pointer\n",
- "ERROR\tCSSStyleDeclaration: Syntax Error in Property: text-align: center\n",
- " cursor: pointer\n",
- "WARNING\tProperty: Unknown Property name. [577:9: -moz-border-radius-topright]\n",
- "WARNING\tProperty: Unknown Property name. [578:9: -moz-border-radius-bottomright]\n",
- "WARNING\tProperty: Unknown Property name. [579:9: -webkit-border-top-right-radius]\n",
- "WARNING\tProperty: Unknown Property name. [580:9: -webkit-border-bottom-right-radius]\n",
- "WARNING\tProperty: Unknown Property name. [606:9: filter]\n",
- "WARNING\tProperty: Unknown Property name. [615:9: filter]\n",
- "ERROR\tProperty: Invalid value for \"CSS Level 2.1\" property: 999999 [639:3: color]\n",
- "ERROR\tProperty: Invalid value for \"CSS Level 2.1\" property: 999999 [646:3: color]\n",
- "WARNING\tProperty: Unknown Property name. [685:3: _background-image]\n",
- "ERROR\tProperty: Invalid value for \"CSS3 Basic User Interface Module\" property: hand [954:2: cursor]\n",
- "ERROR\tProperty: Invalid value for \"CSS Level 2.1\" property: -moz-inline-box [1169:5: display]\n",
- "ERROR\tPropertyValue: Missing token for production Choice(ColorValue, Dimension, URIValue, Value, variable, MSValue, CSSCalc, function): ('CHAR', '?', 1787, 57)\n",
- "ERROR\tPropertyValue: Unknown syntax or no value: expression((document.body.clientWidth < 936) ? \"895px\" : \"100%\" )\n",
- "ERROR\tCSSStyleDeclaration: Syntax Error in Property: width: expression((document.body.clientWidth < 936) ? \"895px\" : \"100%\" )\n",
- "ERROR\tProperty: Invalid value for \"CSS3 Basic User Interface Module\" property: hand [1972:2: cursor]\n",
- "ERROR\tProperty: Invalid value for \"CSS Level 2.1\" property: block\t [2082:16: display]\n",
- "ERROR\tProperty: Invalid value for \"CSS3 Basic User Interface Module\" property: hand [106:5: cursor]\n",
- "ERROR\tProperty: Invalid value for \"CSS3 Basic User Interface Module\" property: hand [116:5: cursor]\n",
- "ERROR\tProperty: Invalid value for \"CSS3 Basic User Interface Module\" property: hand [126:5: cursor]\n",
- "ERROR\tProperty: Invalid value for \"CSS3 Basic User Interface Module\" property: hand [136:5: cursor]\n",
- "ERROR\tProperty: Invalid value for \"CSS3 Basic User Interface Module\" property: hand [146:5: cursor]\n",
- "ERROR\tProperty: Invalid value for \"CSS3 Basic User Interface Module\" property: hand [156:5: cursor]\n",
- "WARNING\tProperty: Unknown Property name. [160:5: hight]\n",
- "ERROR\tProperty: Invalid value for \"CSS3 Basic User Interface Module\" property: hand [162:5: cursor]\n",
- "ERROR\tProperty: Invalid value for \"CSS3 Basic User Interface Module\" property: hand [166:5: cursor]\n",
- "ERROR\tProperty: Invalid value for \"CSS Level 2.1\" property: -moz-inline-box [173:5: display]\n",
- "ERROR\tProperty: Invalid value for \"CSS Level 2.1\" property: -moz-inline-box [190:28: display]\n",
- "ERROR\tProperty: Invalid value for \"CSS Level 2.1\" property: 0 5 [237:2: background-position]\n"
- ]
- }
- ],
- "source": [
- "chunks = ai.chunk_content(content=content)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 26,
- "id": "a51a598a",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "41"
- ]
- },
- "execution_count": 26,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "len(chunks)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 29,
- "id": "fa31e2ff",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "['Amazon.com: Canon PowerShot SD1300 IS 12.1 MP Digital Camera with 4x Wide Angle Optical Image Stabilized Zoom and 2.7-Inch LCD (Silver): Camera & Photo ',\n",
- " '\\n\\n\\n
Hello. \\n
Sign in to get personalized recommendations.\\n
New customer? Start here . \\n
\\n\\xa0 \\n\\n \\n
\\n
',\n",
- " '',\n",
- " '',\n",
- " 'Baby Beauty Books Cell Phones & Accessories Clothing & Accessories Electronics Grocery & Gourmet Food Health & Personal Care Home, Garden & Pets Industrial & Scientific Jewelry Kindle Store
',\n",
- " 'Magazine Subscriptions Movies & TV MP3 Downloads Music Musical Instruments Office Products & Supplies Shoes Software Sports & Outdoors Tools & Home Improvement Toys & Games VHS Video Games
',\n",
- " '',\n",
- " '',\n",
- " '',\n",
- " '',\n",
- " '',\n",
- " '',\n",
- " '\\n
\\n
\\n
\\n
\\n
\\n
\\n
Quantity: \\xa01 2 3
\\n\\n
\\n
\\n
or \\n
\\n
Sign in to turn on 1-Click ordering.\\n
\\n
\\n
\\n
\\n
',\n",
- " '',\n",
- " '',\n",
- " '\\n\\n\\n\\nBestPriceAu... \\n\\nAdd to Cart \\n \\n \\n\\n$129.00\\xa0\\n\\n\\n\\n\\n& this item ships for FREE with Super Saver Shipping .\\n\\n\\n\\nDetails \\n \\n \\n
\\n \\n
',\n",
- " '\\n\\n\\n\\nDBROTH \\n\\nAdd to Cart \\n \\n \\n\\n$129.00\\xa0\\n\\n\\n\\n\\n& this item ships for FREE with Super Saver Shipping .\\n\\n\\n\\nDetails \\n \\n \\n
\\n \\n
',\n",
- " '\\n\\n\\n\\nAce Photo Digital \\n\\nAdd to Cart \\n \\n \\n\\n$129.00\\xa0\\n\\n\\n\\n\\n& this item ships for FREE with Super Saver Shipping .\\n\\n\\n\\nDetails \\n \\n \\n
\\n \\n
',\n",
- " '',\n",
- " '',\n",
- " '',\n",
- " '\\n
Canon PowerShot SD1300 IS 12.1 MP Digital Camera with 4x Wide Angle Optical Image Stabilized Zoom and 2.7-Inch LCD (Silver) \\n
\\nby\\xa0Canon \\n \\n
\\n
',\n",
- " '',\n",
- " '\\n
\\n
\\n
\\n\\nList Price: \\n$179.99 \\n \\n\\nPrice: \\n$129.00 \\n \\n\\n \\n\\n\\n\\n\\n \\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n& this item ships for FREE with Super Saver Shipping .\\n\\n\\n\\nDetails \\n \\n \\n\\nYou Save: \\n$50.99\\n (28%)\\n \\n \\n\\n\\xa0 \\nSpecial Offers Available \\n \\n \\n
\\n
\\n
\\n
',\n",
- " '',\n",
- " '',\n",
- " '',\n",
- " '',\n",
- " '',\n",
- " '',\n",
- " '',\n",
- " '\\n
\\n28mm wide-angle lens; 4x optical zoom and Optical Image Stabilizer \\nSmart AUTO mode intelligently selects from 18 predefined settings \\n12.1-megapixel resolution allows you to print large size images with clarity and detail \\nClear 2.7-inch PureColor System LCD \\nShoot in Low Light mode for dimly-lit situations \\n \\n
\\n Technical Details\\n ',\n",
- " '\\n
\\nBrand Name : Canon \\nModel : SD1300IS Silver \\nOptical Sensor Resolution : 12.1 MP \\nOptical Sensor Technology : CCD \\nOptical zoom : 4 x \\nMaximum Aperture Range : F/2.8-5.9 \\nMinimum focal length : 5 millimeters \\nMaximum focal length : 20 millimeters \\nLens Type : Zoom lens \\nOptical Sensor Size : 1/2.3\" \\nIncluded Flash Type : Built-in flash \\nDisplay Size : 2.700 inches \\nLight Sensitivity : ISO 100, ISO 800, ISO 400, ISO 200, ISO 80, ISO auto, ISO 1600 \\nImage types : JPEG \\nShooting Modes : Frame movie mode \\nExposure Control Type : Beach, Kids & pets, Snow, Portrait mode, Slow shutter, Night mode, Fireworks, Low light, Snap shot, Underwater, Indoor, Foliage \\nViewfinder Type : None \\nWidth : 3.6 inches \\nHeight : 2.2 inches \\nWeight : 4.9 pounds \\n \\n
',\n",
- " '',\n",
- " '',\n",
- " '',\n",
- " '',\n",
- " '\\n
\\n
\\n
\\n
\\n
\\n
\\n
\\n
\\n
\\n\\n\\n\\n
After viewing product detail pages or search results, look here to find an easy way to navigate back to pages you are interested in.
\\n
\\n \\n \\n\\n\\n \\n
\\n
\\n
\\n
',\n",
- " '
',\n",
- " '
',\n",
- " '
']"
- ]
- },
- "execution_count": 29,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "chunks"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 39,
- "id": "0fc45d52",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Bonjour\n",
- " [Document(metadata={}, page_content='Amazon.com: Canon PowerShot SD1300 IS 12.1 MP Digital Camera with 4x Wide Angle Optical Image Stabilized Zoom and 2.7-Inch LCD (Silver): Camera & Photo '), Document(metadata={}, page_content='\\n\\n\\n
Hello. \\n
Sign in to get personalized recommendations.\\n
New customer? Start here . \\n
\\n\\xa0 \\n\\n \\n
\\n
'), Document(metadata={}, page_content=''), Document(metadata={}, page_content=''), Document(metadata={}, page_content='Baby Beauty Books Cell Phones & Accessories Clothing & Accessories Electronics Grocery & Gourmet Food Health & Personal Care Home, Garden & Pets Industrial & Scientific Jewelry Kindle Store
'), Document(metadata={}, page_content='Magazine Subscriptions Movies & TV MP3 Downloads Music Musical Instruments Office Products & Supplies Shoes Software Sports & Outdoors Tools & Home Improvement Toys & Games VHS Video Games
'), Document(metadata={}, page_content=''), Document(metadata={}, page_content=''), Document(metadata={}, page_content=''), Document(metadata={}, page_content=''), Document(metadata={}, page_content=''), Document(metadata={}, page_content=''), Document(metadata={}, page_content='\\n
\\n
\\n
\\n
\\n
\\n
\\n
Quantity: \\xa01 2 3
\\n\\n
\\n
\\n
or \\n
\\n
Sign in to turn on 1-Click ordering.\\n
\\n
\\n
\\n
\\n
'), Document(metadata={}, page_content=''), Document(metadata={}, page_content=''), Document(metadata={}, page_content='\\n\\n\\n\\nBestPriceAu... \\n\\nAdd to Cart \\n \\n \\n\\n$129.00\\xa0\\n\\n\\n\\n\\n& this item ships for FREE with Super Saver Shipping .\\n\\n\\n\\nDetails \\n \\n \\n
\\n \\n
'), Document(metadata={}, page_content='\\n\\n\\n\\nDBROTH \\n\\nAdd to Cart \\n \\n \\n\\n$129.00\\xa0\\n\\n\\n\\n\\n& this item ships for FREE with Super Saver Shipping .\\n\\n\\n\\nDetails \\n \\n \\n
\\n \\n
'), Document(metadata={}, page_content='\\n\\n\\n\\nAce Photo Digital \\n\\nAdd to Cart \\n \\n \\n\\n$129.00\\xa0\\n\\n\\n\\n\\n& this item ships for FREE with Super Saver Shipping .\\n\\n\\n\\nDetails \\n \\n \\n
\\n \\n
'), Document(metadata={}, page_content=''), Document(metadata={}, page_content=''), Document(metadata={}, page_content=''), Document(metadata={}, page_content='\\n
Canon PowerShot SD1300 IS 12.1 MP Digital Camera with 4x Wide Angle Optical Image Stabilized Zoom and 2.7-Inch LCD (Silver) \\n
\\nby\\xa0Canon \\n \\n
\\n
'), Document(metadata={}, page_content=''), Document(metadata={}, page_content='\\n
\\n
\\n
\\n\\nList Price: \\n$179.99 \\n \\n\\nPrice: \\n$129.00 \\n \\n\\n \\n\\n\\n\\n\\n \\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n& this item ships for FREE with Super Saver Shipping .\\n\\n\\n\\nDetails \\n \\n \\n\\nYou Save: \\n$50.99\\n (28%)\\n \\n \\n\\n\\xa0 \\nSpecial Offers Available \\n \\n \\n
\\n
\\n
\\n
'), Document(metadata={}, page_content=''), Document(metadata={}, page_content=''), Document(metadata={}, page_content=''), Document(metadata={}, page_content=''), Document(metadata={}, page_content=''), Document(metadata={}, page_content=''), Document(metadata={}, page_content=''), Document(metadata={}, page_content='\\n
\\n28mm wide-angle lens; 4x optical zoom and Optical Image Stabilizer \\nSmart AUTO mode intelligently selects from 18 predefined settings \\n12.1-megapixel resolution allows you to print large size images with clarity and detail \\nClear 2.7-inch PureColor System LCD \\nShoot in Low Light mode for dimly-lit situations \\n \\n
\\n Technical Details\\n '), Document(metadata={}, page_content='\\n
\\nBrand Name : Canon \\nModel : SD1300IS Silver \\nOptical Sensor Resolution : 12.1 MP \\nOptical Sensor Technology : CCD \\nOptical zoom : 4 x \\nMaximum Aperture Range : F/2.8-5.9 \\nMinimum focal length : 5 millimeters \\nMaximum focal length : 20 millimeters \\nLens Type : Zoom lens \\nOptical Sensor Size : 1/2.3\" \\nIncluded Flash Type : Built-in flash \\nDisplay Size : 2.700 inches \\nLight Sensitivity : ISO 100, ISO 800, ISO 400, ISO 200, ISO 80, ISO auto, ISO 1600 \\nImage types : JPEG \\nShooting Modes : Frame movie mode \\nExposure Control Type : Beach, Kids & pets, Snow, Portrait mode, Slow shutter, Night mode, Fireworks, Low light, Snap shot, Underwater, Indoor, Foliage \\nViewfinder Type : None \\nWidth : 3.6 inches \\nHeight : 2.2 inches \\nWeight : 4.9 pounds \\n \\n
'), Document(metadata={}, page_content=''), Document(metadata={}, page_content=''), Document(metadata={}, page_content=''), Document(metadata={}, page_content=''), Document(metadata={}, page_content='\\n
\\n
\\n
\\n
\\n
\\n
\\n
\\n
\\n
\\n\\n\\n\\n
After viewing product detail pages or search results, look here to find an easy way to navigate back to pages you are interested in.
\\n
\\n \\n \\n\\n\\n \\n
\\n
\\n
\\n
'), Document(metadata={}, page_content='
'), Document(metadata={}, page_content='
'), Document(metadata={}, page_content='
')]\n",
- " \n",
- "{\n",
- " \"title\": {\"type\": \"string\", \"description\": \"Page title\"},\n",
- " \"price\": {\"type\": \"number\", \"description\": \"Product price\"},\n",
- " \"description\": {\"type\": \"string\", \"description\": \"Product description\"}\n",
- "}\n",
- "\n",
- "Scored Docs [Document(metadata={'relevance_score': -14.578125}, page_content='\\n
\\n
\\n
\\n\\nList Price: \\n$179.99 \\n \\n\\nPrice: \\n$129.00 \\n \\n\\n \\n\\n\\n\\n\\n \\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n& this item ships for FREE with Super Saver Shipping .\\n\\n\\n\\nDetails \\n \\n \\n\\nYou Save: \\n$50.99\\n (28%)\\n \\n \\n\\n\\xa0 \\nSpecial Offers Available \\n \\n \\n
\\n
\\n
\\n
'), Document(metadata={'relevance_score': -14.9140625}, page_content=''), Document(metadata={'relevance_score': -15.1015625}, page_content='\\n\\n\\n\\nBestPriceAu... \\n\\nAdd to Cart \\n \\n \\n\\n$129.00\\xa0\\n\\n\\n\\n\\n& this item ships for FREE with Super Saver Shipping .\\n\\n\\n\\nDetails \\n \\n \\n
\\n \\n
'), Document(metadata={'relevance_score': -15.125}, page_content='\\n\\n\\n\\nDBROTH \\n\\nAdd to Cart \\n \\n \\n\\n$129.00\\xa0\\n\\n\\n\\n\\n& this item ships for FREE with Super Saver Shipping .\\n\\n\\n\\nDetails \\n \\n \\n
\\n \\n
'), Document(metadata={'relevance_score': -15.171875}, page_content='')]\n",
- "Ayeeeee\n",
- "Docs Value: [Document(metadata={'relevance_score': -14.578125, 'softmax_score': 0.29085356753038766, 'minmax_score': 1.0}, page_content='\\n
\\n
\\n
\\n\\nList Price: \\n$179.99 \\n \\n\\nPrice: \\n$129.00 \\n \\n\\n \\n\\n\\n\\n\\n \\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n& this item ships for FREE with Super Saver Shipping .\\n\\n\\n\\nDetails \\n \\n \\n\\nYou Save: \\n$50.99\\n (28%)\\n \\n \\n\\n\\xa0 \\nSpecial Offers Available \\n \\n \\n
\\n
\\n
\\n
'), Document(metadata={'relevance_score': -14.9140625, 'softmax_score': 0.2078636708396982, 'minmax_score': 0.4342105263157895}, page_content=''), Document(metadata={'relevance_score': -15.1015625, 'softmax_score': 0.172325035737976, 'minmax_score': 0.11842105263157894}, page_content='\\n\\n\\n\\nBestPriceAu... \\n\\nAdd to Cart \\n \\n \\n\\n$129.00\\xa0\\n\\n\\n\\n\\n& this item ships for FREE with Super Saver Shipping .\\n\\n\\n\\nDetails \\n \\n \\n
\\n \\n
'), Document(metadata={'relevance_score': -15.125, 'softmax_score': 0.16833313058462612, 'minmax_score': 0.07894736842105263}, page_content='\\n\\n\\n\\nDBROTH \\n\\nAdd to Cart \\n \\n \\n\\n$129.00\\xa0\\n\\n\\n\\n\\n& this item ships for FREE with Super Saver Shipping .\\n\\n\\n\\nDetails \\n \\n \\n
\\n \\n
'), Document(metadata={'relevance_score': -15.171875, 'softmax_score': 0.16062459530731207, 'minmax_score': 0.0}, page_content='')]\n",
- "Final [Document(metadata={'relevance_score': -14.578125, 'softmax_score': 0.29085356753038766, 'minmax_score': 1.0}, page_content='\\n
\\n
\\n
\\n\\nList Price: \\n$179.99 \\n \\n\\nPrice: \\n$129.00 \\n \\n\\n \\n\\n\\n\\n\\n \\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n& this item ships for FREE with Super Saver Shipping .\\n\\n\\n\\nDetails \\n \\n \\n\\nYou Save: \\n$50.99\\n (28%)\\n \\n \\n\\n\\xa0 \\nSpecial Offers Available \\n \\n \\n
\\n
\\n
\\n
'), Document(metadata={'relevance_score': -14.9140625, 'softmax_score': 0.2078636708396982, 'minmax_score': 0.4342105263157895}, page_content=''), Document(metadata={'relevance_score': -15.1015625, 'softmax_score': 0.172325035737976, 'minmax_score': 0.11842105263157894}, page_content='\\n\\n\\n\\nBestPriceAu... \\n\\nAdd to Cart \\n \\n \\n\\n$129.00\\xa0\\n\\n\\n\\n\\n& this item ships for FREE with Super Saver Shipping .\\n\\n\\n\\nDetails \\n \\n \\n
\\n \\n
'), Document(metadata={'relevance_score': -15.125, 'softmax_score': 0.16833313058462612, 'minmax_score': 0.07894736842105263}, page_content='\\n\\n\\n\\nDBROTH \\n\\nAdd to Cart \\n \\n \\n\\n$129.00\\xa0\\n\\n\\n\\n\\n& this item ships for FREE with Super Saver Shipping .\\n\\n\\n\\nDetails \\n \\n \\n
\\n \\n
'), Document(metadata={'relevance_score': -15.171875, 'softmax_score': 0.16062459530731207, 'minmax_score': 0.0}, page_content='')]\n"
- ]
- }
- ],
- "source": [
- "output = reranker.rerank(query=classification_prompt_template,passages=chunks, threshold=-1)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 40,
- "id": "966586dc",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[Document(metadata={'relevance_score': -14.578125, 'softmax_score': 0.29085356753038766, 'minmax_score': 1.0}, page_content='\\n
\\n
\\n
\\n\\nList Price: \\n$179.99 \\n \\n\\nPrice: \\n$129.00 \\n \\n\\n \\n\\n\\n\\n\\n \\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n& this item ships for FREE with Super Saver Shipping .\\n\\n\\n\\nDetails \\n \\n \\n\\nYou Save: \\n$50.99\\n (28%)\\n \\n \\n\\n\\xa0 \\nSpecial Offers Available \\n \\n \\n
\\n
\\n
\\n
'),\n",
- " Document(metadata={'relevance_score': -14.9140625, 'softmax_score': 0.2078636708396982, 'minmax_score': 0.4342105263157895}, page_content=''),\n",
- " Document(metadata={'relevance_score': -15.1015625, 'softmax_score': 0.172325035737976, 'minmax_score': 0.11842105263157894}, page_content='\\n\\n\\n\\nBestPriceAu... \\n\\nAdd to Cart \\n \\n \\n\\n$129.00\\xa0\\n\\n\\n\\n\\n& this item ships for FREE with Super Saver Shipping .\\n\\n\\n\\nDetails \\n \\n \\n
\\n \\n
'),\n",
- " Document(metadata={'relevance_score': -15.125, 'softmax_score': 0.16833313058462612, 'minmax_score': 0.07894736842105263}, page_content='\\n\\n\\n\\nDBROTH \\n\\nAdd to Cart \\n \\n \\n\\n$129.00\\xa0\\n\\n\\n\\n\\n& this item ships for FREE with Super Saver Shipping .\\n\\n\\n\\nDetails \\n \\n \\n
\\n \\n
'),\n",
- " Document(metadata={'relevance_score': -15.171875, 'softmax_score': 0.16062459530731207, 'minmax_score': 0.0}, page_content='')]"
- ]
- },
- "execution_count": 40,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "output"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 31,
- "id": "ca8ee231",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[Document(metadata={'relevance_score': -16.578125, 'softmax_score': 0.9005297861178929, 'minmax_score': 1.0}, page_content='Amazon.com: Canon PowerShot SD1300 IS 12.1 MP Digital Camera with 4x Wide Angle Optical Image Stabilized Zoom and 2.7-Inch LCD (Silver): Camera & Photo '),\n",
- " Document(metadata={'relevance_score': -18.78125, 'softmax_score': 0.09947021388210707, 'minmax_score': 0.0}, page_content='\\n\\n\\n
Hello. \\n
Sign in to get personalized recommendations.\\n
New customer? Start here . \\n
\\n\\xa0 \\n\\n \\n
\\n
')]"
- ]
- },
- "execution_count": 31,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "output"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "1a9e2c78",
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.11.9"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}