# Page banner captured along with the file (not program code): "Spaces: Building"
import json | |
import pandas as pd | |
import gradio as gr | |
from typing import Dict, Any, Type | |
from web2json.preprocessor import BasicPreprocessor | |
from web2json.ai_extractor import AIExtractor,LLMClassifierExtractor,NvidiaLLMClient | |
from web2json.postprocessor import PostProcessor | |
from web2json.pipeline import Pipeline | |
from pydantic import BaseModel, Field, create_model | |
import os | |
import dotenv | |
dotenv.load_dotenv() | |
def parse_schema_input(schema_input: str) -> Type[BaseModel]:
    """
    Convert user schema input to a Pydantic BaseModel.

    The input format is detected from the text itself:
    1. JSON schema format - text starting with ``{``
    2. Python class definition - text containing ``class `` and ``BaseModel``
    3. Simple field definitions - anything else

    Empty or missing input yields a default model with two string fields,
    ``title`` and ``content``.

    Parameters:
        schema_input (str): The raw schema text entered by the user.

    Returns:
        Type[BaseModel]: A dynamically created (or user-defined) model class.

    Raises:
        ValueError: If the text cannot be converted by the selected parser.
            The underlying exception is chained as ``__cause__``.
    """
    # Treat a missing value (None) exactly like an empty textbox instead of
    # failing on ``None.strip()``.
    schema_input = (schema_input or "").strip()
    if not schema_input:
        # Default schema if none provided
        return create_model(
            'DefaultSchema',
            title=(str, Field(description="Title of the content")),
            content=(str, Field(description="Main content")),
        )

    try:
        # Try parsing as JSON schema
        if schema_input.startswith('{'):
            schema_dict = json.loads(schema_input)
            return json_schema_to_basemodel(schema_dict)
        # Try parsing as Python class definition
        if 'class ' in schema_input and 'BaseModel' in schema_input:
            return python_class_to_basemodel(schema_input)
        # Fall back to simple field definitions
        return simple_fields_to_basemodel(schema_input)
    except Exception as e:
        # Chain the cause so the original traceback is not lost.
        raise ValueError(
            f"Could not parse schema: {str(e)}. Please check your schema format."
        ) from e
def json_schema_to_basemodel(schema_dict: Dict) -> Type[BaseModel]:
    """Convert JSON schema to BaseModel.

    Only the top-level ``properties`` and ``required`` keys are read. For each
    property, ``type`` (default ``'string'``) is mapped through
    ``get_python_type`` and ``description`` (default ``''``) becomes the field
    description. Nested schemas are not expanded.

    Parameters:
        schema_dict (Dict): Parsed JSON schema object.

    Returns:
        Type[BaseModel]: A model named ``DynamicSchema``. Properties listed in
        ``required`` are required fields; all others are optional and default
        to ``None``.
    """
    # Local import: the module header imports Dict/Any/Type but not Optional.
    from typing import Optional

    fields = {}
    # ``or`` guards against an explicit JSON ``null`` for either key.
    properties = schema_dict.get('properties') or {}
    required = schema_dict.get('required') or []
    for field_name, field_info in properties.items():
        field_type = get_python_type(field_info.get('type', 'string'))
        field_description = field_info.get('description', '')
        if field_name in required:
            fields[field_name] = (field_type, Field(description=field_description))
        else:
            # The default is None, so the annotation must admit None as well;
            # otherwise an explicit null for this field fails validation.
            fields[field_name] = (
                Optional[field_type],
                Field(default=None, description=field_description),
            )
    return create_model('DynamicSchema', **fields)
def python_class_to_basemodel(class_definition: str) -> Type[BaseModel]:
    """Convert Python class definition to BaseModel.

    The text is executed and the first ``BaseModel`` subclass found in the
    resulting namespace (in insertion order) is returned.

    Parameters:
        class_definition (str): Python source defining a ``BaseModel`` subclass.

    Returns:
        Type[BaseModel]: The first model class defined by the source.

    Raises:
        ValueError: If the source fails to execute or defines no ``BaseModel``
            subclass. The underlying exception is chained as ``__cause__``.
    """
    # Local import: exposes common typing names to the executed text.
    import typing

    try:
        # SECURITY: exec() runs arbitrary user-supplied code. The namespace
        # below only pre-seeds convenient names; exec() still injects the full
        # ``__builtins__``, so this is NOT a sandbox. Do not expose this path
        # to untrusted users without real isolation.
        namespace = {
            'BaseModel': BaseModel, 'Field': Field,
            'str': str, 'int': int, 'float': float, 'bool': bool,
            'list': list, 'dict': dict,
            # Typing helpers so annotations such as Optional[str] resolve.
            'Optional': typing.Optional, 'List': typing.List,
            'Dict': typing.Dict, 'Any': typing.Any, 'Union': typing.Union,
        }
        exec(class_definition, namespace)
        # Find the class that inherits from BaseModel
        for obj in namespace.values():
            if (isinstance(obj, type)
                    and issubclass(obj, BaseModel)
                    and obj is not BaseModel):
                return obj
        raise ValueError("No BaseModel class found in definition")
    except Exception as e:
        raise ValueError(f"Invalid Python class definition: {str(e)}") from e
def simple_fields_to_basemodel(fields_text: str) -> Type[BaseModel]:
    """Build a BaseModel from line-oriented field definitions.

    Each non-blank line that does not start with ``#`` declares one field:

    - ``name: type = description`` - typed field with a description
      (surrounding quotes on the description are stripped)
    - ``name: type`` - typed field with an empty description
    - ``name`` - string field with an empty description

    Type names are resolved through ``get_python_type``.

    Raises:
        ValueError: If no field definitions are found.
    """
    collected = {}
    for raw_line in fields_text.strip().split('\n'):
        entry = raw_line.strip()
        if entry == '' or entry.startswith('#'):
            continue

        name_part, colon, remainder = entry.partition(':')
        if not colon:
            # Bare field name: defaults to a string field.
            collected[entry] = (str, Field(description=""))
            continue

        # Without '=', partition leaves the whole remainder in type_part and
        # desc_part empty, so a single code path covers both forms.
        type_part, equals, desc_part = remainder.strip().partition('=')
        resolved_type = get_python_type(type_part.strip())
        description = desc_part.strip().strip('"\'') if equals else ""
        collected[name_part.strip()] = (resolved_type, Field(description=description))

    if not collected:
        raise ValueError("No valid fields found in schema definition")
    return create_model('DynamicSchema', **collected)
def get_python_type(type_str: str):
    """Convert type string to Python type.

    Accepts JSON Schema names (``string``, ``integer``, ``number``,
    ``boolean``, ``array``, ``object``) and Python names (``str``, ``int``,
    ``float``, ``bool``, ``list``, ``dict``), case-insensitively and ignoring
    surrounding whitespace.

    Parameters:
        type_str: A type name, or a list/tuple of type names as JSON Schema
            allows (e.g. ``["string", "null"]``); the first entry that is not
            ``"null"`` is used.

    Returns:
        The matching Python type, or ``str`` for unknown, empty, or
        non-string input.
    """
    type_mapping = {
        'string': str, 'str': str,
        'integer': int, 'int': int,
        'number': float, 'float': float,
        'boolean': bool, 'bool': bool,
        'array': list, 'list': list,
        'object': dict, 'dict': dict,
    }
    # JSON Schema permits "type" to be an array of names; pick the first
    # concrete (non-null) one rather than failing on ``list.lower()``.
    if isinstance(type_str, (list, tuple)):
        type_str = next(
            (t for t in type_str
             if isinstance(t, str) and t.strip().lower() != 'null'),
            '',
        )
    if not isinstance(type_str, str):
        return str
    return type_mapping.get(type_str.lower().strip(), str)
def webpage_to_json_wrapper(content: str, is_url: bool, schema_input: str) -> Dict[str, Any]:
    """Wrapper function that converts schema input to BaseModel and runs the extraction.

    Parameters:
        content (str): URL or raw text/HTML to analyze.
        is_url (bool): Whether ``content`` is a URL.
        schema_input (str): Schema text in any format accepted by
            ``parse_schema_input``.

    Returns:
        Dict[str, Any]: The result of ``webpage_to_json``, or a dictionary with
        an ``"error"`` key. Schema failures are reported as
        "Schema parsing error"; any exception escaping the extraction call is
        reported as "Processing error".
    """
    # Keep the two failure modes apart so a pipeline failure is not
    # mislabelled as a schema problem.
    try:
        # Parse the schema input into a BaseModel
        schema_model = parse_schema_input(schema_input)
    except Exception as e:
        return {"error": f"Schema parsing error: {str(e)}"}

    try:
        # Call the original function
        return webpage_to_json(content, is_url, schema_model)
    except Exception as e:
        return {"error": f"Processing error: {str(e)}"}
def webpage_to_json(content: str, is_url: bool, schema: Type[BaseModel]) -> Dict[str, Any]:
    """
    Extracts structured JSON information from a given content based on a specified schema.

    This function sets up a processing pipeline that includes:
    - Preprocessing the input content.
    - Utilizing an AI language model to extract information according to the provided schema.
    - Postprocessing the extracted output to match the exact schema requirements.

    Parameters:
        content (str): The input content to be analyzed. This can be direct text or a URL content.
        is_url (bool): A flag indicating whether the provided content is a URL (True) or raw text (False).
        schema (Type[BaseModel]): A Pydantic BaseModel class defining the expected structure and data types for the output.

    Returns:
        Dict[str, Any]: A dictionary containing the extracted data matching the schema. In case of errors during initialization
        or processing, the dictionary will include an "error" key with a descriptive message.
    """
    prompt_template = """Extract the following information from the provided content according to the specified schema.
Content to analyze:
{content}
Schema requirements:
{schema}
Instructions:
- Extract only information that is explicitly present in the content
- Follow the exact structure and data types specified in the schema
- If a required field cannot be found, indicate this clearly
- Preserve the original formatting and context where relevant
- Return the extracted data in the format specified by the schema"""

    # Literal braces in the JSON samples below are doubled ({{ }}); only
    # {schema} and {content} are left as single-brace placeholders.
    classification_prompt_template = """
# HTML Chunk Relevance Classification Prompt
You are an HTML content classifier. Your task is to analyze an HTML chunk against a given schema and determine if the content is relevant.
## Instructions:
1. Carefully examine the provided HTML chunk
2. Compare it against the given schema/criteria
3. Determine if the HTML chunk contains content that matches or is relevant to the schema
4. Respond with ONLY a JSON object containing a single field "relevant" with value 1 (relevant) or 0 (not relevant)
## Input Format:
**Schema/Criteria:**
{schema}
**HTML Chunk:**
```html
{content}
```
## Output Format:
Your response must be ONLY a valid JSON object with no additional text:
```json
{{
"relevant": 1
}}
```
OR
```json
{{
"relevant": 0
}}
```
## Classification Rules:
- Output 1 if the HTML chunk contains content that matches the schema criteria
- Output 0 if the HTML chunk does not contain relevant content
- Consider semantic meaning, not just exact keyword matches
- Look at text content, attributes, structure, and context
- Ignore purely structural HTML elements (like divs, spans) unless they contain relevant content
- Be STRICT in your evaluation - only mark as relevant (1) if there is clear, meaningful content that directly relates to the schema
- Empty elements, placeholder text, navigation menus, headers/footers, and generic UI components should typically be marked as not relevant (0)
- The HTML chunk does not need to contain ALL schema information, but it must contain SUBSTANTIAL and SPECIFIC content related to the schema
CRITICAL: Your entire response MUST be exactly one JSON object. DO NOT include any explanations, reasoning, markdown formatting, code blocks, or additional text. Output ONLY the raw JSON object.
"""

    # Initialize pipeline components
    # TODO: improve the RAG system and optimize (don't instantiate every time)
    try:
        # llm = GeminiLLMClient(config={'api_key': os.getenv('GEMINI_API_KEY')})
        llm = NvidiaLLMClient(config={'api_key': os.getenv('NVIDIA_API_KEY'), 'model_name': 'qwen/qwen2.5-7b-instruct'})
    except Exception as e:
        return {"error": f"Failed to initialize LLM client: {str(e)}"}

    # The remaining components are built inside a try as well, so that every
    # initialization failure is returned as an "error" dict as documented.
    try:
        preprocessor = BasicPreprocessor(config={'keep_tags': True})
        # ai_extractor = RAGExtractor(llm_client=llm, prompt_template=prompt_template)
        ai_extractor = LLMClassifierExtractor(
            llm_client=llm,
            prompt_template=prompt_template,
            classifier_prompt=classification_prompt_template,
        )
        postprocessor = PostProcessor()
        pipeline = Pipeline(preprocessor, ai_extractor, postprocessor)
    except Exception as e:
        return {"error": f"Failed to initialize pipeline: {str(e)}"}

    try:
        result = pipeline.run(content, is_url, schema)
        print("-" * 80)
        print(f"Processed result: {result}")
        return result
    except Exception as e:
        return {"error": f"Processing error: {str(e)}"}
# Example schemas for the user, shown as help text under the schema textbox.
# The class body in example 3 is indented so the snippet is valid Python when
# pasted back into the schema box.
example_schemas = """
**Example Schema Formats:**
1. **Simple field definitions:**
```
title: str = Page title
price: float = Product price
description: str = Product description
available: bool = Is available
```
2. **JSON Schema:**
```json
{
"properties": {
"title": {"type": "string", "description": "Page title"},
"price": {"type": "number", "description": "Product price"},
"description": {"type": "string", "description": "Product description"}
},
"required": ["title"]
}
```
3. **Python Class Definition:**
```python
class ProductSchema(BaseModel):
    title: str = Field(description="Product title")
    price: float = Field(description="Product price")
    description: str = Field(description="Product description")
    available: bool = Field(default=False, description="Availability status")
```
"""
# Build Gradio Interface

# Example rows for the UI; each row is [content, is_url, schema_input].
_url_example = [
    "https://example.com",
    True,
    "title: str = Page title\nprice: float = Product price\ndescription: str = Description",
]
_html_example = [
    "<h1>Sample Product</h1><p>Price: $29.99</p><p>Great quality item</p>",
    False,
    '''{
"type": "object",
"properties": {
"title": {
"type": "string",
"description": "Name of the product"
},
"price": {
"type": "number",
"description": "Price of the product"
},
"description": {
"type": "string",
"description": "Detailed description of the product"
},
"availability": {
"type": "boolean",
"description": "Whether the product is in stock (true) or not (false)"
}
},
"required": ["title", "price"]
}''',
]

# Three inputs (content, URL flag, schema text) feed webpage_to_json_wrapper;
# its dictionary result is rendered as JSON.
demo = gr.Interface(
    fn=webpage_to_json_wrapper,
    title="Webpage to JSON Converter",
    description="Convert web pages or raw text into structured JSON using customizable schemas. Define your schema using simple field definitions, JSON schema, or Python class syntax.",
    inputs=[
        gr.Textbox(
            label="Content (URL or Raw Text)",
            lines=10,
            placeholder="Enter URL or paste raw HTML/text here.",
        ),
        gr.Checkbox(label="Content is URL?", value=False),
        gr.Textbox(
            label="Schema Definition",
            lines=15,
            placeholder="Define your extraction schema (see examples below)",
            info=example_schemas,
        ),
    ],
    outputs=gr.JSON(label="Output JSON"),
    examples=[_url_example, _html_example],
)

if __name__ == "__main__":
    demo.launch(mcp_server=True)