Spaces:

garage-lab
/

MCP_HTML2JSON

Building

File size: 12,798 Bytes

import json
import pandas as pd
import gradio as gr
from typing import Dict, Any, Type
from web2json.preprocessor import BasicPreprocessor
from web2json.ai_extractor import AIExtractor,LLMClassifierExtractor,NvidiaLLMClient
from web2json.postprocessor import PostProcessor
from web2json.pipeline import Pipeline
from pydantic import BaseModel, Field, create_model
import os
import dotenv

dotenv.load_dotenv()

def parse_schema_input(schema_input: str) -> Type[BaseModel]:
    """
    Convert user schema input to a Pydantic BaseModel.

    The format is detected from the text itself, checked in this order:
    1. JSON schema format        - the text starts with ``{``
    2. Python class definition   - the text contains ``class `` and ``BaseModel``
    3. Simple field definitions  - anything else

    Args:
        schema_input: Raw schema text supplied by the user. ``None``, empty or
            whitespace-only input selects a default ``title``/``content`` schema.

    Returns:
        A ``BaseModel`` subclass describing the fields to extract.

    Raises:
        ValueError: If the text cannot be converted by the parser selected for
            it. The underlying exception is attached as ``__cause__``.
    """
    # Treat a missing value like an empty one instead of failing on None.strip().
    schema_input = (schema_input or "").strip()

    if not schema_input:
        # Default schema if none provided
        return create_model('DefaultSchema',
                          title=(str, Field(description="Title of the content")),
                          content=(str, Field(description="Main content")))

    try:
        # Try parsing as JSON schema
        if schema_input.startswith('{'):
            return json_schema_to_basemodel(json.loads(schema_input))

        # Try parsing as Python class definition
        if 'class ' in schema_input and 'BaseModel' in schema_input:
            return python_class_to_basemodel(schema_input)

        # Fall back to simple "name: type = description" lines
        return simple_fields_to_basemodel(schema_input)

    except Exception as e:
        # Single user-facing error type for every parser failure; keep the
        # original exception chained for debugging.
        raise ValueError(f"Could not parse schema: {str(e)}. Please check your schema format.") from e

def json_schema_to_basemodel(schema_dict: Dict) -> Type[BaseModel]:
    """Convert JSON schema to BaseModel.

    Only the top-level ``properties`` and ``required`` keys are read; for each
    property only ``type`` (default ``'string'``) and ``description`` are used.

    Args:
        schema_dict: Parsed JSON schema object.

    Returns:
        A model named ``DynamicSchema`` with one field per property. Properties
        listed in ``required`` are mandatory; every other property is nullable
        and defaults to ``None``.

    Raises:
        ValueError: If the schema declares no properties.
    """
    # Local import: the module header only imports Dict, Any and Type.
    from typing import Optional

    properties = schema_dict.get('properties') or {}
    if not properties:
        # Mirror simple_fields_to_basemodel, which also refuses an empty schema
        # rather than silently producing a model with no fields.
        raise ValueError("No properties found in JSON schema")

    required = set(schema_dict.get('required') or [])

    fields = {}
    for field_name, field_info in properties.items():
        field_type = get_python_type(field_info.get('type', 'string'))
        field_description = field_info.get('description', '')

        if field_name in required:
            fields[field_name] = (field_type, Field(description=field_description))
        else:
            # A None default needs a nullable annotation, otherwise an explicit
            # null for an optional field fails validation.
            fields[field_name] = (
                Optional[field_type],
                Field(default=None, description=field_description),
            )

    return create_model('DynamicSchema', **fields)

def python_class_to_basemodel(class_definition: str) -> Type[BaseModel]:
    """Convert Python class definition to BaseModel.

    The text is executed with ``exec`` and the first ``BaseModel`` subclass
    found in the resulting namespace is returned.

    SECURITY(review): ``class_definition`` is user-supplied text and ``exec``
    runs it as arbitrary Python. The namespace below only pre-populates a few
    names; it is NOT a sandbox (``exec`` adds ``__builtins__`` itself, so
    ``__import__``, ``open`` etc. remain reachable). Replace with a real parser
    or an isolated runtime before exposing this to untrusted users.

    Args:
        class_definition: Source code containing a class that derives from
            ``BaseModel``.

    Returns:
        The first ``BaseModel`` subclass defined by the executed code.

    Raises:
        ValueError: If executing the code fails or it defines no ``BaseModel``
            subclass.
    """
    # Convenience names available to the snippet (not a security boundary).
    namespace = {'BaseModel': BaseModel, 'Field': Field, 'str': str, 'int': int,
                'float': float, 'bool': bool, 'list': list, 'dict': dict}

    try:
        exec(class_definition, namespace)  # SECURITY: executes untrusted input, see docstring
    except Exception as e:
        raise ValueError(f"Invalid Python class definition: {str(e)}") from e

    # Find the class that inherits from BaseModel (BaseModel itself is in the
    # namespace too, so it must be skipped).
    for obj in namespace.values():
        if isinstance(obj, type) and issubclass(obj, BaseModel) and obj is not BaseModel:
            return obj

    # Same message the caller saw before, when this error was re-wrapped by the
    # surrounding try/except.
    raise ValueError("Invalid Python class definition: No BaseModel class found in definition")

def simple_fields_to_basemodel(fields_text: str) -> Type[BaseModel]:
    """Build a BaseModel from line-oriented field definitions.

    Each non-empty line that does not start with ``#`` declares one field:

    * ``name: type = description`` - typed field with a description
    * ``name: type``               - typed field, empty description
    * ``name``                     - ``str`` field, empty description

    Type names are resolved with ``get_python_type``. Surrounding single or
    double quotes are stripped from the description.

    Raises:
        ValueError: If no line yields a field.
    """
    model_fields = {}

    for raw_line in fields_text.strip().split('\n'):
        entry = raw_line.strip()

        # Skip blank lines and comments.
        if entry == '' or entry.startswith('#'):
            continue

        name_text, colon, remainder = entry.partition(':')

        if not colon:
            # Bare field name: defaults to a string field.
            model_fields[entry] = (str, Field(description=""))
            continue

        # Everything after the first ':' is "<type>" or "<type> = <description>".
        type_text, equals, desc_text = remainder.strip().partition('=')
        resolved_type = get_python_type(type_text.strip())
        if equals:
            description = desc_text.strip().strip('"\'')
        else:
            description = ""

        model_fields[name_text.strip()] = (resolved_type, Field(description=description))

    if not model_fields:
        raise ValueError("No valid fields found in schema definition")

    return create_model('DynamicSchema', **model_fields)

def get_python_type(type_str: str):
    """Convert type string to Python type.

    Accepts both JSON Schema names (``string``, ``integer``, ``number``,
    ``boolean``, ``array``, ``object``) and Python names (``str``, ``int``,
    ``float``, ``bool``, ``list``, ``dict``), case-insensitively and ignoring
    surrounding whitespace.

    Args:
        type_str: The type name. A list/tuple of names (a JSON Schema type
            union such as ``["string", "null"]``) is also accepted; the first
            entry other than ``"null"`` is used.

    Returns:
        The matching Python type, or ``str`` for unknown, missing or
        non-string input.
    """
    # JSON Schema allows "type" to be a list; pick the first concrete member.
    if isinstance(type_str, (list, tuple)):
        type_str = next(
            (
                candidate
                for candidate in type_str
                if isinstance(candidate, str) and candidate.strip().lower() != 'null'
            ),
            None,
        )

    # Anything that is not a string (None, numbers, nested objects) cannot be
    # looked up; fall back to the same default as an unknown name.
    if not isinstance(type_str, str):
        return str

    type_mapping = {
        'string': str, 'str': str,
        'integer': int, 'int': int,
        'number': float, 'float': float,
        'boolean': bool, 'bool': bool,
        'array': list, 'list': list,
        'object': dict, 'dict': dict
    }
    return type_mapping.get(type_str.lower().strip(), str)

def webpage_to_json_wrapper(content: str, is_url: bool, schema_input: str) -> Dict[str, Any]:
    """Extract structured JSON from a web page or raw text using a user-defined schema.

    This is the function passed as ``fn`` to the Gradio interface in this file:
    it turns the schema text into a Pydantic model and then runs the extraction.

    Args:
        content: A URL, or raw HTML/text to analyze.
        is_url: True if ``content`` is a URL, False if it is raw text.
        schema_input: Schema definition as simple field lines, a JSON schema,
            or a Python ``BaseModel`` class definition. Empty input uses a
            default ``title``/``content`` schema.

    Returns:
        The extraction result from ``webpage_to_json``, or a dictionary with a
        single ``"error"`` key describing what failed.
    """
    # Phase 1: schema parsing. Kept in its own try so that only genuine schema
    # problems are reported as "Schema parsing error".
    try:
        schema_model = parse_schema_input(schema_input)
    except Exception as e:
        return {"error": f"Schema parsing error: {str(e)}"}

    # Phase 2: extraction. Never let an exception escape to the UI; report it
    # with the same prefix webpage_to_json uses for run-time failures.
    try:
        return webpage_to_json(content, is_url, schema_model)
    except Exception as e:
        return {"error": f"Processing error: {str(e)}"}

def webpage_to_json(content: str, is_url: bool, schema: Type[BaseModel]) -> Dict[str, Any]:
    """
    Extracts structured JSON information from a given content based on a specified schema.
    This function sets up a processing pipeline that includes:
    - Preprocessing the input content.
    - Utilizing an AI language model to extract information according to the provided schema.
    - Postprocessing the extracted output to match the exact schema requirements.
    Parameters:
        content (str): The input content to be analyzed. This can be direct text or a URL content.
        is_url (bool): A flag indicating whether the provided content is a URL (True) or raw text (False).
        schema (Type[BaseModel]): A Pydantic BaseModel class defining the expected structure and data types for the output.
    Returns:
        Dict[str, Any]: A dictionary containing the extracted data matching the schema. In case of errors during initialization
                        or processing, the dictionary will include an "error" key with a descriptive message.
    """
    # NOTE: both templates are runtime text sent to the model; {content} and
    # {schema} are placeholders (presumably filled in by the extractor - confirm
    # in web2json.ai_extractor), and {{ }} are escaped literal braces.
    prompt_template = """Extract the following information from the provided content according to the specified schema.
    
    Content to analyze:
    {content}
    
    Schema requirements:
    {schema}
    
    Instructions:
    - Extract only information that is explicitly present in the content
    - Follow the exact structure and data types specified in the schema
    - If a required field cannot be found, indicate this clearly
    - Preserve the original formatting and context where relevant
    - Return the extracted data in the format specified by the schema"""

    classification_prompt_template = """
    # HTML Chunk Relevance Classification Prompt

    You are an HTML content classifier. Your task is to analyze an HTML chunk against a given schema and determine if the content is relevant.

    ## Instructions:
    1. Carefully examine the provided HTML chunk
    2. Compare it against the given schema/criteria
    3. Determine if the HTML chunk contains content that matches or is relevant to the schema
    4. Respond with ONLY a JSON object containing a single field "relevant" with value 1 (relevant) or 0 (not relevant)

    ## Input Format:
    **Schema/Criteria:**
    {schema}

    **HTML Chunk:**
    ```html
    {content}
    ```

    ## Output Format:
    Your response must be ONLY a valid JSON object with no additional text:

    ```json
    {{
    "relevant": 1
    }}
    ```

    OR

    ```json
    {{
    "relevant": 0
    }}
    ```

    ## Classification Rules:
    - Output 1 if the HTML chunk contains content that matches the schema criteria
    - Output 0 if the HTML chunk does not contain relevant content
    - Consider semantic meaning, not just exact keyword matches
    - Look at text content, attributes, structure, and context
    - Ignore purely structural HTML elements (like divs, spans) unless they contain relevant content
    - Be STRICT in your evaluation - only mark as relevant (1) if there is clear, meaningful content that directly relates to the schema
    - Empty elements, placeholder text, navigation menus, headers/footers, and generic UI components should typically be marked as not relevant (0)
    - The HTML chunk does not need to contain ALL schema information, but it must contain SUBSTANTIAL and SPECIFIC content related to the schema

    CRITICAL: Your entire response MUST be exactly one JSON object. DO NOT include any explanations, reasoning, markdown formatting, code blocks, or additional text. Output ONLY the raw JSON object.
    """
    # Initialize pipeline components
    # TODO: improve the RAG system and optimize (don't instantiate every time)
    # Every constructor is guarded so that an initialization failure is returned
    # as an {"error": ...} dictionary, as the docstring promises, instead of
    # escaping as an exception. Construction order is unchanged.
    try:
        preprocessor = BasicPreprocessor(config={'keep_tags': True})
    except Exception as e:
        return {"error": f"Failed to initialize pipeline: {str(e)}"}

    try:
        # llm = GeminiLLMClient(config={'api_key': os.getenv('GEMINI_API_KEY')})
        llm = NvidiaLLMClient(config={'api_key': os.getenv('NVIDIA_API_KEY'),'model_name': 'qwen/qwen2.5-7b-instruct'})
    except Exception as e:
        return {"error": f"Failed to initialize LLM client: {str(e)}"}

    try:
        # ai_extractor = RAGExtractor(llm_client=llm, prompt_template=prompt_template)
        ai_extractor = LLMClassifierExtractor(llm_client=llm, prompt_template=prompt_template, classifier_prompt=classification_prompt_template)
        postprocessor = PostProcessor()
        pipeline = Pipeline(preprocessor, ai_extractor, postprocessor)
    except Exception as e:
        return {"error": f"Failed to initialize pipeline: {str(e)}"}

    try:
        result = pipeline.run(content, is_url, schema)
        # Debug trace of the final result on stdout.
        print("-"*80)
        print(f"Processed result: {result}")
        return result
    except Exception as e:
        return {"error": f"Processing error: {str(e)}"}

# Example schemas for the user.
# Markdown help text showing the three schema formats accepted by
# parse_schema_input (simple field lines, JSON schema, Python class). It is
# passed as the `info` text of the "Schema Definition" textbox in the Gradio
# interface defined in this file, so it is user-facing runtime text.
example_schemas = """
**Example Schema Formats:**

1. **Simple field definitions:**
```
title: str = Page title
price: float = Product price
description: str = Product description
available: bool = Is available
```

2. **JSON Schema:**
```json
{
  "properties": {
    "title": {"type": "string", "description": "Page title"},
    "price": {"type": "number", "description": "Product price"},
    "description": {"type": "string", "description": "Product description"}
  },
  "required": ["title"]
}
```

3. **Python Class Definition:**
```python
class ProductSchema(BaseModel):
    title: str = Field(description="Product title")
    price: float = Field(description="Product price")
    description: str = Field(description="Product description")
    available: bool = Field(default=False, description="Availability status")
```
"""

# Build Gradio Interface
# Input components, created in the positional order of
# webpage_to_json_wrapper(content, is_url, schema_input).
_content_box = gr.Textbox(
    label="Content (URL or Raw Text)",
    lines=10,
    placeholder="Enter URL or paste raw HTML/text here."
)
_is_url_box = gr.Checkbox(label="Content is URL?", value=False)
_schema_box = gr.Textbox(
    label="Schema Definition",
    lines=15,
    placeholder="Define your extraction schema (see examples below)",
    info=example_schemas
)

# Output component: the result dictionary rendered as JSON.
_json_output = gr.JSON(label="Output JSON")

# Example rows; each row is [content, is_url, schema_input].
_url_example = [
    "https://example.com",
    True,
    "title: str = Page title\nprice: float = Product price\ndescription: str = Description"
]
_raw_html_example = [
    "<h1>Sample Product</h1><p>Price: $29.99</p><p>Great quality item</p>",
    False,
    '''{
            "type": "object",
            "properties": {
                "title": {
                "type": "string",
                "description": "Name of the product"
                },
                "price": {
                "type": "number",
                "description": "Price of the product"
                },
                "description": {
                "type": "string",
                "description": "Detailed description of the product"
                },
                "availability": {
                "type": "boolean",
                "description": "Whether the product is in stock (true) or not (false)"
                }
            },
            "required": ["title", "price"]
            }'''
]

demo = gr.Interface(
    fn=webpage_to_json_wrapper,
    inputs=[_content_box, _is_url_box, _schema_box],
    outputs=_json_output,
    title="Webpage to JSON Converter",
    description="Convert web pages or raw text into structured JSON using customizable schemas. Define your schema using simple field definitions, JSON schema, or Python class syntax.",
    examples=[_url_example, _raw_html_example]
)

# Script entry point: only start the server when this file is run directly,
# not when it is imported.
if __name__ == "__main__":
    # mcp_server=True is a Gradio launch option; presumably it also serves the
    # interface's function as an MCP tool - confirm against the Gradio MCP guide.
    demo.launch(mcp_server=True)