# Spaces:
# Running
# Running
import json
import os
import random
from typing import Any, Dict, List, Optional, Type

import dotenv
import gradio as gr
import numpy as np
import pandas as pd
import torch
from pydantic import BaseModel, Field, create_model

from web2json.ai_extractor import (
    AIExtractor,
    LLMClassifierExtractor,
    ModalRerankerClient,
    NvidiaLLMClient,
    NvidiaRerankerClient,
)
from web2json.pipeline import Pipeline
from web2json.postprocessor import PostProcessor
from web2json.preprocessor import BasicPreprocessor

dotenv.load_dotenv()
def seed_everything(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Also puts cuDNN into deterministic mode and turns off its benchmark
    mode.

    Args:
        seed: Value handed to every generator (defaults to 42).
    """
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed):
        apply_seed(seed)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        # Covers the multi-GPU case as well.
        torch.cuda.manual_seed_all(seed)

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
# Seed all random number generators once, at import time, with a fixed value.
seed_everything(22)
def parse_schema_input(schema_input: str) -> Type[BaseModel]:
    """Convert user-supplied schema text into a Pydantic model class.

    The format is chosen from the text itself:

    1. Text starting with ``{`` is parsed as a JSON schema.
    2. Text containing both ``class `` and ``BaseModel`` is treated as a
       Python class definition.
    3. Anything else is read as simple ``name: type = description`` lines.

    Args:
        schema_input: Raw schema text. Empty, whitespace-only or ``None``
            input selects a default schema with ``title`` and ``content``
            string fields.

    Returns:
        A dynamically created ``BaseModel`` subclass.

    Raises:
        ValueError: If the text cannot be converted; the underlying error
            is attached as ``__cause__``.
    """
    # Treat None like an empty box so the default schema is used.
    schema_input = (schema_input or "").strip()

    if not schema_input:
        # Default schema if none provided
        return create_model(
            'DefaultSchema',
            title=(str, Field(description="Title of the content")),
            content=(str, Field(description="Main content")),
        )

    try:
        if schema_input.startswith('{'):
            return json_schema_to_basemodel(json.loads(schema_input))
        if 'class ' in schema_input and 'BaseModel' in schema_input:
            return python_class_to_basemodel(schema_input)
        return simple_fields_to_basemodel(schema_input)
    except Exception as e:
        raise ValueError(
            f"Could not parse schema: {str(e)}. Please check your schema format."
        ) from e
def json_schema_to_basemodel(schema_dict: Dict) -> Type[BaseModel]:
    """Convert a JSON-schema-style dict into a Pydantic model class.

    Only the top-level ``properties`` and ``required`` keys are read. Each
    property's ``type`` is mapped through ``get_python_type`` and its
    ``description`` is copied onto the field.

    Args:
        schema_dict: Parsed JSON schema.

    Returns:
        A model class named ``DynamicSchema``. Properties listed in
        ``required`` are mandatory; all others are ``Optional`` with a
        default of ``None``.
    """
    # "or" guards against explicit nulls for these keys in the input JSON.
    properties = schema_dict.get('properties') or {}
    required = set(schema_dict.get('required') or [])

    fields = {}
    for field_name, field_info in properties.items():
        declared_type = field_info.get('type', 'string')
        nullable = False
        if isinstance(declared_type, list):
            # JSON Schema permits a list of types, e.g. ["string", "null"];
            # use the first non-null entry and remember nullability.
            nullable = 'null' in declared_type
            concrete = [t for t in declared_type if t != 'null']
            declared_type = concrete[0] if concrete else 'string'

        field_type = get_python_type(declared_type)
        if nullable:
            field_type = Optional[field_type]
        field_description = field_info.get('description', '')

        if field_name in required:
            fields[field_name] = (field_type, Field(description=field_description))
        else:
            # A None default needs an Optional annotation, otherwise an
            # explicit null in the extracted data fails validation.
            fields[field_name] = (
                Optional[field_type],
                Field(default=None, description=field_description),
            )

    return create_model('DynamicSchema', **fields)
def python_class_to_basemodel(class_definition: str) -> Type[BaseModel]:
    """Build a model class by executing a Python class definition.

    Args:
        class_definition: Source text that defines at least one class
            inheriting from ``BaseModel``.

    Returns:
        The first ``BaseModel`` subclass found in the executed namespace.

    Raises:
        ValueError: If execution fails or no ``BaseModel`` subclass is
            defined; the underlying error is attached as ``__cause__``.
    """
    # SECURITY(review): exec() runs the submitted text as arbitrary Python.
    # Supplying a namespace dict does NOT sandbox it -- builtins (including
    # __import__) remain reachable. Do not expose this path to untrusted
    # users without real isolation or a non-executing parser.
    namespace = {
        'BaseModel': BaseModel, 'Field': Field,
        'str': str, 'int': int, 'float': float, 'bool': bool,
        'list': list, 'dict': dict,
        # Common typing names so annotations such as Optional[str] resolve.
        'Optional': Optional, 'List': List, 'Dict': Dict, 'Any': Any,
    }
    try:
        exec(class_definition, namespace)

        # Find the class that inherits from BaseModel
        for obj in namespace.values():
            if (isinstance(obj, type)
                    and issubclass(obj, BaseModel)
                    and obj is not BaseModel):
                return obj

        raise ValueError("No BaseModel class found in definition")
    except Exception as e:
        raise ValueError(f"Invalid Python class definition: {str(e)}") from e
def simple_fields_to_basemodel(fields_text: str) -> Type[BaseModel]:
    """Build a model class from one-field-per-line text.

    Accepted line shapes are ``name: type = description``, ``name: type``
    and a bare ``name`` (typed as ``str``). Blank lines and lines starting
    with ``#`` are ignored.

    Args:
        fields_text: The field definitions, separated by newlines.

    Returns:
        A model class named ``DynamicSchema``.

    Raises:
        ValueError: If no field could be read from the text.
    """
    collected = {}

    for raw_line in fields_text.strip().split('\n'):
        entry = raw_line.strip()
        if not entry or entry.startswith('#'):
            continue

        name_text, colon, remainder = entry.partition(':')
        if not colon:
            # Bare field name: defaults to a string with no description.
            collected[entry] = (str, Field(description=""))
            continue

        # Everything after the first ':' is "type" or "type = description".
        type_text, equals, desc_text = remainder.strip().partition('=')
        py_type = get_python_type(type_text.strip())
        description = desc_text.strip().strip('"\'') if equals else ""
        collected[name_text.strip()] = (py_type, Field(description=description))

    if not collected:
        raise ValueError("No valid fields found in schema definition")

    return create_model('DynamicSchema', **collected)
def get_python_type(type_str: str):
    """Map a type name to the matching Python builtin type.

    Matching ignores case and surrounding whitespace. Both JSON-schema
    names (``string``, ``integer``, ``number``, ``boolean``, ``array``,
    ``object``) and Python names (``str``, ``int``, ``float``, ``bool``,
    ``list``, ``dict``) are recognised.

    Args:
        type_str: The type name to look up.

    Returns:
        The corresponding builtin type, or ``str`` for unknown names.
    """
    wanted = type_str.lower().strip()
    alias_groups = (
        (('string', 'str'), str),
        (('integer', 'int'), int),
        (('number', 'float'), float),
        (('boolean', 'bool'), bool),
        (('array', 'list'), list),
        (('object', 'dict'), dict),
    )
    for aliases, py_type in alias_groups:
        if wanted in aliases:
            return py_type
    # Unrecognised names fall back to string.
    return str
def webpage_to_json_wrapper(content: str, is_url: bool, schema_input: str) -> Dict[str, Any]:
    """Parse the schema text, then run the extraction.

    Args:
        content: URL or raw text/HTML to analyse.
        is_url: True when ``content`` is a URL.
        schema_input: Schema text in any format accepted by
            ``parse_schema_input``.

    Returns:
        The result of ``webpage_to_json``, or a dict with a single
        ``"error"`` key. This function does not raise.
    """
    # Keep the two stages in separate try blocks so an extraction failure
    # is not reported as a schema parsing error.
    try:
        schema_model = parse_schema_input(schema_input)
    except Exception as e:
        return {"error": f"Schema parsing error: {str(e)}"}

    try:
        return webpage_to_json(content, is_url, schema_model)
    except Exception as e:
        return {"error": f"Processing error: {str(e)}"}
def webpage_to_json(content: str, is_url: bool, schema: Type[BaseModel]) -> Dict[str, Any]:
    """Extract structured JSON from content according to a schema.

    Builds a three-stage pipeline (preprocessor, AI extractor,
    postprocessor) and runs it on the content.

    Parameters:
        content (str): The input to analyse; either raw text/HTML or a URL.
        is_url (bool): True when ``content`` is a URL, False for raw text.
        schema (Type[BaseModel]): The Pydantic model *class* describing the
            expected output. Its JSON schema is also passed to the
            extractor as the classifier prompt.

    Returns:
        Dict[str, Any]: The pipeline result, or a dict with an ``"error"``
        key when initialization or processing fails.
    """
    # The prompt contains task instructions only.
    prompt_template = """Extract the following information from the provided content according to the specified schema.
Content to analyze:
{content}
Schema requirements:
{schema}
Instructions:
- Extract only information that is explicitly present in the content
- Follow the exact structure and data types specified in the schema
- If a required field cannot be found, indicate this clearly
- Preserve the original formatting and context where relevant
- Return the extracted data in the format specified by the schema
- STICK TO THE SCHEMA DON'T EVEN THINK OF DOING SOMETHING ELSE
- IF THE SCHEMA ASKS FOR AN ARRAY THEN YOU MAY TRY TO EXTRACT ONE IF THERE IS"""

    # Initialize pipeline components
    # TODO: improve the RAG system and optimize (don't instantiate every time)
    try:
        llm = NvidiaLLMClient(config={
            'api_key': os.getenv('NVIDIA_API_KEY'),
            'model_name': 'google/gemma-3n-e2b-it',
        })
        reranker = ModalRerankerClient("https://abdulrahmanmfam2003--qwen3-reranker-rerank.modal.run")
    except Exception as e:
        return {"error": f"Failed to initialize LLM client: {str(e)}"}

    # Everything else that can fail before the run also yields an error
    # dict, as the docstring promises, instead of raising.
    try:
        classification_prompt_template = schema.model_json_schema()
        preprocessor = BasicPreprocessor(config={'keep_tags': True})
        ai_extractor = LLMClassifierExtractor(
            reranker=reranker,
            llm_client=llm,
            prompt_template=prompt_template,
            classifier_prompt=classification_prompt_template,
        )
        postprocessor = PostProcessor()
        pipeline = Pipeline(preprocessor, ai_extractor, postprocessor)
    except Exception as e:
        return {"error": f"Failed to initialize pipeline: {str(e)}"}

    try:
        result = pipeline.run(content, is_url, schema)
        print("-"*80)
        print(f"Processed result: {result}")
        return result
    except Exception as e:
        return {"error": f"Processing error: {str(e)}"}
# Markdown help text listing the three schema formats the app accepts.
# The fenced snippets are indented so they are valid when copied verbatim.
example_schemas = """
**Example Schema Formats:**
1. **Simple field definitions:**
```
title: str = Page title
price: float = Product price
description: str = Product description
available: bool = Is available
```
2. **JSON Schema:**
```json
{
  "properties": {
    "title": {"type": "string", "description": "Page title"},
    "price": {"type": "number", "description": "Product price"},
    "description": {"type": "string", "description": "Product description"}
  },
  "required": ["title"]
}
```
3. **Python Class Definition:**
```python
class ProductSchema(BaseModel):
    title: str = Field(description="Product title")
    price: float = Field(description="Product price")
    description: str = Field(description="Product description")
    available: bool = Field(default=False, description="Availability status")
```
"""
# Build Gradio Interface
# The three input components below are passed, in order, to
# webpage_to_json_wrapper as (content, is_url, schema_input); the dict it
# returns is shown in the JSON output component.
demo = gr.Interface(
    fn=webpage_to_json_wrapper,
    inputs=[
        # content: a URL or pasted text/HTML
        gr.Textbox(
            label="Content (URL or Raw Text)",
            lines=10,
            placeholder="Enter URL or paste raw HTML/text here."
        ),
        # is_url: unchecked by default, i.e. content is treated as raw text
        gr.Checkbox(label="Content is URL?", value=False),
        # schema_input: free-form schema text; example_schemas is attached
        # as the field's help text
        gr.Textbox(
            label="Schema Definition",
            lines=15,
            placeholder="Define your extraction schema (see examples below)",
            info=example_schemas
        )
    ],
    outputs=gr.JSON(label="Output JSON"),
    title="Webpage to JSON Converter",
    description="Convert web pages or raw text into structured JSON using customizable schemas. Define your schema using simple field definitions, JSON schema, or Python class syntax.",
    # Each example row supplies values for the three inputs, in order.
    examples=[
        # URL input with a simple field-definition schema
        [
            "https://example.com",
            True,
            "title: str = Page title\nprice: float = Product price\ndescription: str = Description"
        ],
        # Raw HTML input with a JSON schema
        [
            "<h1>Sample Product</h1><p>Price: $29.99</p><p>Great quality item</p>",
            False,
            '''{
"type": "object",
"properties": {
"title": {
"type": "string",
"description": "Name of the product"
},
"price": {
"type": "number",
"description": "Price of the product"
},
"description": {
"type": "string",
"description": "Detailed description of the product"
},
"availability": {
"type": "boolean",
"description": "Whether the product is in stock (true) or not (false)"
}
},
"required": ["title", "price"]
}'''
        ]
    ]
)
# Start the app only when this file is run as a script, not on import.
# NOTE(review): mcp_server=True presumably also exposes the interface
# function through Gradio's MCP server -- confirm against the Gradio docs
# for the installed version.
if __name__ == "__main__":
    demo.launch(mcp_server=True)