Spaces:
Building
Building
Commit
·
f6427da
1
Parent(s):
02778e5
please drop
Browse files- .gitattributes +0 -35
- app.py +0 -308
.gitattributes
DELETED
@@ -1,35 +0,0 @@
|
|
1 |
-
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app.py
DELETED
@@ -1,308 +0,0 @@
|
|
1 |
-
import json
|
2 |
-
import pandas as pd
|
3 |
-
import gradio as gr
|
4 |
-
from typing import Dict, Any, Type
|
5 |
-
from web2json.preprocessor import BasicPreprocessor
|
6 |
-
from web2json.ai_extractor import AIExtractor,LLMClassifierExtractor,NvidiaLLMClient, NvidiaRerankerClient , ModalRerankerClient
|
7 |
-
from web2json.postprocessor import PostProcessor
|
8 |
-
from web2json.pipeline import Pipeline
|
9 |
-
from pydantic import BaseModel, Field, create_model
|
10 |
-
import os
|
11 |
-
import dotenv
|
12 |
-
import random
|
13 |
-
import numpy as np
|
14 |
-
import torch
|
15 |
-
|
16 |
-
# Populate os.environ from a local .env file (python-dotenv) so that later
# os.getenv() lookups, e.g. for NVIDIA_API_KEY, can see those values.
dotenv.load_dotenv()
|
17 |
-
|
18 |
-
def seed_everything(seed: int = 42) -> None:
    """Seed every random number generator used by this module.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch (plus all
    CUDA devices when CUDA is available), then puts cuDNN into deterministic
    mode with benchmarking disabled.

    Args:
        seed: Value applied to every generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    if torch.cuda.is_available():
        # manual_seed_all covers every GPU, including the current one, so a
        # separate torch.cuda.manual_seed(seed) call is redundant.
        torch.cuda.manual_seed_all(seed)

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


# Seed once at import time.
seed_everything(22)
|
31 |
-
|
32 |
-
def parse_schema_input(schema_input: str) -> Type[BaseModel]:
    """Convert user-supplied schema text into a Pydantic model class.

    The format is detected from the text itself:

    1. JSON schema: the stripped text starts with ``{``.
    2. Python class definition: the text contains both ``class `` and
       ``BaseModel``.
    3. Simple field definitions: anything else.

    Blank (or ``None``) input yields a default model with required ``title``
    and ``content`` string fields.

    Args:
        schema_input: Raw schema text.

    Returns:
        A Pydantic model class describing the requested fields.

    Raises:
        ValueError: If the text cannot be parsed in the detected format. The
            underlying exception is attached as ``__cause__``.
    """
    # Be defensive: treat None like an empty string instead of crashing on .strip().
    schema_input = (schema_input or "").strip()

    if not schema_input:
        # Default schema if none provided
        return create_model(
            'DefaultSchema',
            title=(str, Field(description="Title of the content")),
            content=(str, Field(description="Main content")),
        )

    try:
        if schema_input.startswith('{'):
            return json_schema_to_basemodel(json.loads(schema_input))
        if 'class ' in schema_input and 'BaseModel' in schema_input:
            return python_class_to_basemodel(schema_input)
        return simple_fields_to_basemodel(schema_input)
    except Exception as e:
        # Chain the original exception so the real cause stays in the traceback.
        raise ValueError(
            f"Could not parse schema: {str(e)}. Please check your schema format."
        ) from e
|
64 |
-
|
65 |
-
def json_schema_to_basemodel(schema_dict: Dict) -> Type[BaseModel]:
    """Build a ``DynamicSchema`` model from a JSON-schema-style dict.

    Only the top-level ``properties`` and ``required`` keys are read. Each
    property's ``type`` (default ``"string"``) is mapped to a Python type with
    ``get_python_type`` and its ``description`` is copied onto the field.
    Nested structure (``items``, nested ``properties``) is not expanded.

    Args:
        schema_dict: Parsed JSON schema.

    Returns:
        A model class where properties listed in ``required`` are required
        fields and every other property is ``Optional`` with default ``None``.
    """
    # Local import: the file-level typing import does not include Optional.
    from typing import Optional

    properties = schema_dict.get('properties', {})
    # `or []` tolerates an explicit `"required": null`.
    required = schema_dict.get('required') or []

    fields = {}
    for field_name, field_info in properties.items():
        field_type = get_python_type(field_info.get('type', 'string'))
        field_description = field_info.get('description', '')

        if field_name in required:
            fields[field_name] = (field_type, Field(description=field_description))
        else:
            # The default is None, so the annotation must admit None as well;
            # otherwise an explicit null for this field fails validation.
            fields[field_name] = (
                Optional[field_type],
                Field(default=None, description=field_description),
            )

    return create_model('DynamicSchema', **fields)
|
81 |
-
|
82 |
-
def python_class_to_basemodel(class_definition: str) -> Type[BaseModel]:
    """Execute a Python class definition and return the model class it defines.

    SECURITY: ``class_definition`` is run with ``exec``. The namespace below
    only pre-populates convenient names; it is NOT a sandbox, because ``exec``
    still makes the full ``builtins`` module available to the executed code.
    Anyone who can reach this function can run arbitrary Python in this
    process, so isolate the process or replace this path before exposing it
    to untrusted users.

    Args:
        class_definition: Source text containing a class deriving from
            ``BaseModel``.

    Returns:
        The first ``BaseModel`` subclass found in the namespace after
        execution (definition order).

    Raises:
        ValueError: If execution fails or no ``BaseModel`` subclass is
            defined. The underlying exception is attached as ``__cause__``.
    """
    # Local import: the file-level typing import does not include these names.
    from typing import List, Optional

    # Names the user's class body may reference without importing anything.
    namespace = {
        'BaseModel': BaseModel, 'Field': Field,
        'str': str, 'int': int, 'float': float, 'bool': bool,
        'list': list, 'dict': dict,
        'Optional': Optional, 'List': List, 'Dict': Dict, 'Any': Any,
    }

    try:
        exec(class_definition, namespace)  # see SECURITY note in the docstring

        # Iterate over a snapshot of the namespace populated by exec.
        for obj in list(namespace.values()):
            if (isinstance(obj, type)
                    and issubclass(obj, BaseModel)
                    and obj is not BaseModel):
                return obj

        raise ValueError("No BaseModel class found in definition")
    except Exception as e:
        raise ValueError(f"Invalid Python class definition: {str(e)}") from e
|
100 |
-
|
101 |
-
def simple_fields_to_basemodel(fields_text: str) -> Type[BaseModel]:
    """Build a ``DynamicSchema`` model from one-field-per-line text.

    Accepted line shapes (blank lines and lines starting with ``#`` are
    skipped):

    * ``name: type = description``
    * ``name: type``
    * ``name`` (typed as ``str``)

    The type text is mapped with ``get_python_type``; surrounding quotes are
    stripped from the description. Every field is required.

    Args:
        fields_text: The raw field definitions.

    Returns:
        The generated model class.

    Raises:
        ValueError: If a line has no field name before ``:``, or if no field
            definitions are found at all.
    """
    fields = {}

    for raw_line in fields_text.strip().split('\n'):
        line = raw_line.strip()
        if not line or line.startswith('#'):
            continue

        name_part, has_colon, type_and_desc = line.partition(':')
        field_name = name_part.strip()
        if not field_name:
            # e.g. ": str" would otherwise create a field with an empty name.
            raise ValueError(f"Missing field name in line: {line!r}")

        if has_colon:
            # Split on the first '=' only, so descriptions may contain '='.
            type_part, _, desc_part = type_and_desc.partition('=')
            field_type = get_python_type(type_part.strip())
            description = desc_part.strip().strip('"\'')
        else:
            # Simple field name only
            field_type = str
            description = ""

        fields[field_name] = (field_type, Field(description=description))

    if not fields:
        raise ValueError("No valid fields found in schema definition")

    return create_model('DynamicSchema', **fields)
|
134 |
-
|
135 |
-
# Recognised type names (JSON-schema and Python spellings) -> Python types.
_TYPE_MAPPING = {
    'string': str, 'str': str,
    'integer': int, 'int': int,
    'number': float, 'float': float,
    'boolean': bool, 'bool': bool,
    'array': list, 'list': list,
    'object': dict, 'dict': dict,
}


def get_python_type(type_str: str):
    """Map a type name to a Python type, falling back to ``str``.

    Matching is case-insensitive and ignores surrounding whitespace.

    Args:
        type_str: A type name such as ``"integer"`` or ``"str"``. A list or
            tuple of names (the JSON Schema form ``["string", "null"]``) is
            also accepted; its first non-``"null"`` string entry is used.

    Returns:
        The matching Python type, or ``str`` for unknown names and for values
        that are not strings.
    """
    if isinstance(type_str, (list, tuple)):
        type_str = next(
            (t for t in type_str
             if isinstance(t, str) and t.lower().strip() != 'null'),
            '',
        )
    if not isinstance(type_str, str):
        # Previously raised AttributeError on .lower(); unknown means str.
        return str
    return _TYPE_MAPPING.get(type_str.lower().strip(), str)
|
147 |
-
|
148 |
-
def webpage_to_json_wrapper(content: str, is_url: bool, schema_input: str) -> Dict[str, Any]:
    """Parse the schema text into a model class, then run the extraction.

    Args:
        content: URL or raw HTML/text to analyse.
        is_url: True when ``content`` is a URL.
        schema_input: Schema text in any format ``parse_schema_input`` accepts.

    Returns:
        Whatever ``webpage_to_json`` returns, or a dict with a single
        ``"error"`` key when schema parsing or extraction raises.
    """
    try:
        # Parse the schema input into a BaseModel
        schema_model = parse_schema_input(schema_input)
    except Exception as e:
        return {"error": f"Schema parsing error: {str(e)}"}

    try:
        return webpage_to_json(content, is_url, schema_model)
    except Exception as e:
        # Not a schema problem: report it as a processing failure instead of
        # mislabelling it as a schema parsing error.
        return {"error": f"Processing error: {str(e)}"}
|
159 |
-
|
160 |
-
def webpage_to_json(content: str, is_url: bool, schema: Type[BaseModel]) -> Dict[str, Any]:
    """
    Extracts structured JSON information from a given content based on a specified schema.

    This function builds a processing pipeline on every call, made of:
    - a BasicPreprocessor for the input content,
    - an LLMClassifierExtractor combining an LLM client and a reranker client,
    - a PostProcessor for the extracted output.

    Parameters:
        content (str): The input content to be analyzed. This can be direct text or a URL.
        is_url (bool): A flag indicating whether the provided content is a URL (True) or raw text (False).
        schema (Type[BaseModel]): The Pydantic model class defining the expected output structure;
            its JSON schema is handed to the extractor as the classifier prompt.

    Returns:
        Dict[str, Any]: The result of running the pipeline. If client initialization or
        processing raises, a dictionary with an "error" key and a descriptive message.
    """
    prompt_template = """Extract the following information from the provided content according to the specified schema.

Content to analyze:
{content}

Schema requirements:
{schema}

Instructions:
- Extract only information that is explicitly present in the content
- Follow the exact structure and data types specified in the schema
- If a required field cannot be found, indicate this clearly
- Preserve the original formatting and context where relevant
- Return the extracted data in the format specified by the schema
- STICK TO THE SCHEMA DON'T EVEN THINK OF DOING SOMETHING ELSE
- IF THE SCHEMA ASKS FOR AN ARRAY THEN YOU MAY TRY TO EXTRACT ONE IF THERE IS
- Do not add keys, commentary, or formatting that the schema does not define"""

    classification_prompt_template = schema.model_json_schema()
    # Initialize pipeline components
    # TODO: improve the RAG system and optimize (don't instantiate every time)
    preprocessor = BasicPreprocessor(config={'keep_tags': True})
    try:
        # llm = GeminiLLMClient(config={'api_key': os.getenv('GEMINI_API_KEY')})
        llm = NvidiaLLMClient(config={'api_key': os.getenv('NVIDIA_API_KEY'), 'model_name': 'google/gemma-3n-e2b-it'})
        # reranker = NvidiaRerankerClient(config={'api_key': os.getenv('NVIDIA_API_KEY'),'model_name': 'nv-rerank-qa-mistral-4b:1'})
        reranker = ModalRerankerClient("https://abdulrahmanmfam2003--qwen3-reranker-html-rerank.modal.run")
    except Exception as e:
        # Covers failures of either client constructor above.
        return {"error": f"Failed to initialize LLM client: {str(e)}"}

    # ai_extractor = RAGExtractor(llm_client=llm, prompt_template=prompt_template)
    ai_extractor = LLMClassifierExtractor(
        reranker=reranker,
        llm_client=llm,
        prompt_template=prompt_template,
        classifier_prompt=classification_prompt_template,
    )
    postprocessor = PostProcessor()
    pipeline = Pipeline(preprocessor, ai_extractor, postprocessor)

    try:
        result = pipeline.run(content, is_url, schema)
        print("-" * 80)
        print(f"Processed result: {result}")
        return result
    except Exception as e:
        return {"error": f"Processing error: {str(e)}"}
|
217 |
-
|
218 |
-
# Help text showing the three accepted schema formats; passed as `info=` to
# the schema textbox. Code samples keep their indentation so they stay valid
# when copied into the textbox.
example_schemas = """
**Example Schema Formats:**

1. **Simple field definitions:**
```
title: str = Page title
price: float = Product price
description: str = Product description
available: bool = Is available
```

2. **JSON Schema:**
```json
{
  "properties": {
    "title": {"type": "string", "description": "Page title"},
    "price": {"type": "number", "description": "Product price"},
    "description": {"type": "string", "description": "Product description"}
  },
  "required": ["title"]
}
```

3. **Python Class Definition:**
```python
class ProductSchema(BaseModel):
    title: str = Field(description="Product title")
    price: float = Field(description="Product price")
    description: str = Field(description="Product description")
    available: bool = Field(default=False, description="Availability status")
```
"""
|
251 |
-
|
252 |
-
# Build Gradio Interface: three inputs (content, URL flag, schema text) are
# passed to webpage_to_json_wrapper and its dict result is shown as JSON.
demo = gr.Interface(
    fn=webpage_to_json_wrapper,
    inputs=[
        gr.Textbox(
            label="Content (URL or Raw Text)",
            lines=10,
            placeholder="Enter URL or paste raw HTML/text here.",
        ),
        gr.Checkbox(label="Content is URL?", value=False),
        gr.Textbox(
            label="Schema Definition",
            lines=15,
            placeholder="Define your extraction schema (see examples below)",
            info=example_schemas,
        ),
    ],
    outputs=gr.JSON(label="Output JSON"),
    title="Webpage to JSON Converter",
    description="Convert web pages or raw text into structured JSON using customizable schemas. Define your schema using simple field definitions, JSON schema, or Python class syntax.",
    examples=[
        # Example 1: URL input with simple field definitions.
        [
            "https://example.com",
            True,
            "title: str = Page title\nprice: float = Product price\ndescription: str = Description",
        ],
        # Example 2: raw HTML input with a JSON schema (indented for display).
        [
            "<h1>Sample Product</h1><p>Price: $29.99</p><p>Great quality item</p>",
            False,
            '''{
    "type": "object",
    "properties": {
        "title": {
            "type": "string",
            "description": "Name of the product"
        },
        "price": {
            "type": "number",
            "description": "Price of the product"
        },
        "description": {
            "type": "string",
            "description": "Detailed description of the product"
        },
        "availability": {
            "type": "boolean",
            "description": "Whether the product is in stock (true) or not (false)"
        }
    },
    "required": ["title", "price"]
}''',
        ],
    ],
)

if __name__ == "__main__":
    # mcp_server=True is forwarded to Gradio's launch().
    demo.launch(mcp_server=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|