Spaces:

garage-lab
/

MCP_HTML2JSON

Running

App Files Community

abdo-Mansour committed on Jul 4

Commit

f6427da

1 Parent(s): 02778e5

please drop

Browse files

Files changed (2) hide show

.gitattributes +0 -35
app.py +0 -308

.gitattributes DELETED Viewed

@@ -1,35 +0,0 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text

app.py DELETED Viewed

@@ -1,308 +0,0 @@
-import json
-import pandas as pd
-import gradio as gr
-from typing import Dict, Any, Type
-from web2json.preprocessor import BasicPreprocessor
-from web2json.ai_extractor import AIExtractor,LLMClassifierExtractor,NvidiaLLMClient, NvidiaRerankerClient , ModalRerankerClient
-from web2json.postprocessor import PostProcessor
-from web2json.pipeline import Pipeline
-from pydantic import BaseModel, Field, create_model
-import os
-import dotenv
-import random
-import numpy as np
-import torch
-dotenv.load_dotenv()
-def seed_everything(seed=42):
-    random.seed(seed)
-    np.random.seed(seed)
-    torch.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed(seed)
-        torch.cuda.manual_seed_all(seed)  # if using multi-GPU
-    torch.backends.cudnn.deterministic = True
-    torch.backends.cudnn.benchmark = False
-seed_everything(22)
-def parse_schema_input(schema_input: str) -> Type[BaseModel]:
-    """
-    Convert user schema input to a Pydantic BaseModel.
-    Supports multiple input formats:
-    1. JSON schema format
-    2. Python class definition
-    3. Simple field definitions
-    """
-    schema_input = schema_input.strip()
-    if not schema_input:
-        # Default schema if none provided
-        return create_model('DefaultSchema',
-                          title=(str, Field(description="Title of the content")),
-                          content=(str, Field(description="Main content")))
-    try:
-        # Try parsing as JSON schema
-        if schema_input.startswith('{'):
-            schema_dict = json.loads(schema_input)
-            return json_schema_to_basemodel(schema_dict)
-        # Try parsing as Python class definition
-        elif 'class ' in schema_input and 'BaseModel' in schema_input:
-            return python_class_to_basemodel(schema_input)
-        # Try parsing as simple field definitions
-        else:
-            return simple_fields_to_basemodel(schema_input)
-    except Exception as e:
-        raise ValueError(f"Could not parse schema: {str(e)}. Please check your schema format.")
-def json_schema_to_basemodel(schema_dict: Dict) -> Type[BaseModel]:
-    """Convert JSON schema to BaseModel"""
-    fields = {}
-    properties = schema_dict.get('properties', {})
-    required = schema_dict.get('required', [])
-    for field_name, field_info in properties.items():
-        field_type = get_python_type(field_info.get('type', 'string'))
-        field_description = field_info.get('description', '')
-        if field_name in required:
-            fields[field_name] = (field_type, Field(description=field_description))
-        else:
-            fields[field_name] = (field_type, Field(default=None, description=field_description))
-    return create_model('DynamicSchema', **fields)
-def python_class_to_basemodel(class_definition: str) -> Type[BaseModel]:
-    """Convert Python class definition to BaseModel"""
-    try:
-        # Execute the class definition in a safe namespace
-        namespace = {'BaseModel': BaseModel, 'Field': Field, 'str': str, 'int': int,
-                    'float': float, 'bool': bool, 'list': list, 'dict': dict}
-        exec(class_definition, namespace)
-        # Find the class that inherits from BaseModel
-        for name, obj in namespace.items():
-            if (isinstance(obj, type) and
-                issubclass(obj, BaseModel) and
-                obj != BaseModel):
-                return obj
-        raise ValueError("No BaseModel class found in definition")
-    except Exception as e:
-        raise ValueError(f"Invalid Python class definition: {str(e)}")
-def simple_fields_to_basemodel(fields_text: str) -> Type[BaseModel]:
-    """Convert simple field definitions to BaseModel"""
-    fields = {}
-    for line in fields_text.strip().split('\n'):
-        line = line.strip()
-        if not line or line.startswith('#'):
-            continue
-        # Parse field definition (e.g., "name: str = description")
-        if ':' in line:
-            parts = line.split(':', 1)
-            field_name = parts[0].strip()
-            type_and_desc = parts[1].strip()
-            if '=' in type_and_desc:
-                type_part, desc_part = type_and_desc.split('=', 1)
-                field_type = get_python_type(type_part.strip())
-                description = desc_part.strip().strip('"\'')
-            else:
-                field_type = get_python_type(type_and_desc.strip())
-                description = ""
-            fields[field_name] = (field_type, Field(description=description))
-        else:
-            # Simple field name only
-            field_name = line.strip()
-            fields[field_name] = (str, Field(description=""))
-    if not fields:
-        raise ValueError("No valid fields found in schema definition")
-    return create_model('DynamicSchema', **fields)
-def get_python_type(type_str: str):
-    """Convert type string to Python type"""
-    type_str = type_str.lower().strip()
-    type_mapping = {
-        'string': str, 'str': str,
-        'integer': int, 'int': int,
-        'number': float, 'float': float,
-        'boolean': bool, 'bool': bool,
-        'array': list, 'list': list,
-        'object': dict, 'dict': dict
-    }
-    return type_mapping.get(type_str, str)
-def webpage_to_json_wrapper(content: str, is_url: bool, schema_input: str) -> Dict[str, Any]:
-    """Wrapper function that converts schema input to BaseModel"""
-    try:
-        # Parse the schema input into a BaseModel
-        schema_model = parse_schema_input(schema_input)
-        # Call the original function
-        return webpage_to_json(content, is_url, schema_model)
-    except Exception as e:
-        return {"error": f"Schema parsing error: {str(e)}"}
-def webpage_to_json(content: str, is_url: bool, schema: BaseModel) -> Dict[str, Any]:
-    """
-    Extracts structured JSON information from a given content based on a specified schema.
-    This function sets up a processing pipeline that includes:
-    - Preprocessing the input content.
-    - Utilizing an AI language model to extract information according to the provided schema.
-    - Postprocessing the extracted output to match the exact schema requirements.
-    Parameters:
-        content (str): The input content to be analyzed. This can be direct text or a URL content.
-        is_url (bool): A flag indicating whether the provided content is a URL (True) or raw text (False).
-        schema (BaseModel): A Pydantic BaseModel defining the expected structure and data types for the output.
-    Returns:
-        Dict[str, Any]: A dictionary containing the extracted data matching the schema. In case of errors during initialization
-                        or processing, the dictionary will include an "error" key with a descriptive message.
-    """
-    prompt_template = """Extract the following information from the provided content according to the specified schema.
-    Content to analyze:
-    {content}
-    Schema requirements:
-    {schema}
-    Instructions:
-    - Extract only information that is explicitly present in the content
-    - Follow the exact structure and data types specified in the schema
-    - If a required field cannot be found, indicate this clearly
-    - Preserve the original formatting and context where relevant
-    - Return the extracted data in the format specified by the schema
-    - STICK TO THE SCHEMA DON'T EVEN THINK OF DOING SOMETHING ELSE
-    - IF THE SCHEMA ASKS FOR AN ARRAY THEN YOU MAY TRY TO EXTRACT ONE IF THERE IS
-    - OR I WILL KILL AND KIDNAP YOUR FAMILY AND TORTURE THEM """
-    classification_prompt_template = schema.model_json_schema()
-    # Initialize pipeline components
-    # TODO: improve the RAG system and optimize (don't instantiate every time)
-    preprocessor = BasicPreprocessor(config={'keep_tags': True})
-    try:
-        # llm = GeminiLLMClient(config={'api_key': os.getenv('GEMINI_API_KEY')})
-        llm = NvidiaLLMClient(config={'api_key': os.getenv('NVIDIA_API_KEY'),'model_name': 'google/gemma-3n-e2b-it'})
-        # reranker = NvidiaRerankerClient(config={'api_key': os.getenv('NVIDIA_API_KEY'),'model_name': 'nv-rerank-qa-mistral-4b:1'})\
-        reranker = ModalRerankerClient("https://abdulrahmanmfam2003--qwen3-reranker-html-rerank.modal.run")
-    except Exception as e:
-        return {"error": f"Failed to initialize LLM client: {str(e)}"}
-    # ai_extractor = RAGExtractor(llm_client=llm, prompt_template=prompt_template)
-    ai_extractor = LLMClassifierExtractor(reranker=reranker, llm_client=llm, prompt_template=prompt_template, classifier_prompt=classification_prompt_template)
-    postprocessor = PostProcessor()
-    pipeline = Pipeline(preprocessor, ai_extractor, postprocessor)
-    try:
-        result = pipeline.run(content, is_url, schema)
-        print("-"*80)
-        print(f"Processed result: {result}")
-        return result
-    except Exception as e:
-        return {"error": f"Processing error: {str(e)}"}
-# Example schemas for the user
-example_schemas = """
-**Example Schema Formats:**
-1. **Simple field definitions:**
-```
-title: str = Page title
-price: float = Product price
-description: str = Product description
-available: bool = Is available
-```
-2. **JSON Schema:**
-```json
-{
-  "properties": {
-    "title": {"type": "string", "description": "Page title"},
-    "price": {"type": "number", "description": "Product price"},
-    "description": {"type": "string", "description": "Product description"}
-  },
-  "required": ["title"]
-}
-```
-3. **Python Class Definition:**
-```python
-class ProductSchema(BaseModel):
-    title: str = Field(description="Product title")
-    price: float = Field(description="Product price")
-    description: str = Field(description="Product description")
-    available: bool = Field(default=False, description="Availability status")
-```
-"""
-# Build Gradio Interface
-demo = gr.Interface(
-    fn=webpage_to_json_wrapper,
-    inputs=[
-        gr.Textbox(
-            label="Content (URL or Raw Text)",
-            lines=10,
-            placeholder="Enter URL or paste raw HTML/text here."
-        ),
-        gr.Checkbox(label="Content is URL?", value=False),
-        gr.Textbox(
-            label="Schema Definition",
-            lines=15,
-            placeholder="Define your extraction schema (see examples below)",
-            info=example_schemas
-        )
-    ],
-    outputs=gr.JSON(label="Output JSON"),
-    title="Webpage to JSON Converter",
-    description="Convert web pages or raw text into structured JSON using customizable schemas. Define your schema using simple field definitions, JSON schema, or Python class syntax.",
-    examples=[
-        [
-            "https://example.com",
-            True,
-            "title: str = Page title\nprice: float = Product price\ndescription: str = Description"
-        ],
-        [
-            "<h1>Sample Product</h1><p>Price: $29.99</p><p>Great quality item</p>",
-            False,
-            '''{
-            "type": "object",
-            "properties": {
-                "title": {
-                "type": "string",
-                "description": "Name of the product"
-                },
-                "price": {
-                "type": "number",
-                "description": "Price of the product"
-                },
-                "description": {
-                "type": "string",
-                "description": "Detailed description of the product"
-                },
-                "availability": {
-                "type": "boolean",
-                "description": "Whether the product is in stock (true) or not (false)"
-                }
-            },
-            "required": ["title", "price"]
-            }'''
-        ]
-    ]
-)
-if __name__ == "__main__":
-    demo.launch(mcp_server=True)