abdo-Mansour committed on
Commit
f6427da
·
1 Parent(s): 02778e5

please drop

Browse files
Files changed (2) hide show
  1. .gitattributes +0 -35
  2. app.py +0 -308
.gitattributes DELETED
@@ -1,35 +0,0 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py DELETED
@@ -1,308 +0,0 @@
1
- import json
2
- import pandas as pd
3
- import gradio as gr
4
- from typing import Dict, Any, Type
5
- from web2json.preprocessor import BasicPreprocessor
6
- from web2json.ai_extractor import AIExtractor,LLMClassifierExtractor,NvidiaLLMClient, NvidiaRerankerClient , ModalRerankerClient
7
- from web2json.postprocessor import PostProcessor
8
- from web2json.pipeline import Pipeline
9
- from pydantic import BaseModel, Field, create_model
10
- import os
11
- import dotenv
12
- import random
13
- import numpy as np
14
- import torch
15
-
16
- dotenv.load_dotenv()
17
-
18
def seed_everything(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    When CUDA is available the CUDA generators are seeded as well. cuDNN is
    switched to deterministic mode with benchmarking turned off.

    Args:
        seed: Integer seed applied to every generator.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    if torch.cuda.is_available():
        # manual_seed targets the current device, manual_seed_all covers the
        # multi-GPU case.
        for cuda_seed_fn in (torch.cuda.manual_seed, torch.cuda.manual_seed_all):
            cuda_seed_fn(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


# Seed once at import time with a fixed value.
seed_everything(22)
31
-
32
def parse_schema_input(schema_input: str) -> Type[BaseModel]:
    """Convert the user's schema text into a Pydantic model class.

    The input format is detected from the text itself:

    1. Text starting with ``{`` is parsed as a JSON schema.
    2. Text containing both ``class `` and ``BaseModel`` is treated as a
       Python class definition.
    3. Anything else is read as simple field definitions.

    Blank (or missing) input yields a default model with ``title`` and
    ``content`` string fields.

    Args:
        schema_input: Raw schema text in one of the formats above.

    Returns:
        A dynamically created ``BaseModel`` subclass.

    Raises:
        ValueError: If the text cannot be converted. The underlying error is
            chained as ``__cause__`` so the original traceback is kept.
    """
    # Tolerate None so an unset input falls through to the default schema
    # instead of failing on .strip().
    schema_input = (schema_input or "").strip()

    if not schema_input:
        # Default schema if none provided
        return create_model(
            'DefaultSchema',
            title=(str, Field(description="Title of the content")),
            content=(str, Field(description="Main content")),
        )

    try:
        if schema_input.startswith('{'):
            return json_schema_to_basemodel(json.loads(schema_input))
        if 'class ' in schema_input and 'BaseModel' in schema_input:
            return python_class_to_basemodel(schema_input)
        return simple_fields_to_basemodel(schema_input)
    except Exception as e:
        raise ValueError(
            f"Could not parse schema: {str(e)}. Please check your schema format."
        ) from e
64
-
65
def json_schema_to_basemodel(schema_dict: Dict) -> Type[BaseModel]:
    """Build a Pydantic model class from a JSON-schema-like dictionary.

    Only the top-level ``properties`` and ``required`` keys are read. For each
    property its ``type`` (default ``'string'``) is mapped through
    ``get_python_type`` and its ``description`` (default ``''``) is attached
    to the field.

    Args:
        schema_dict: Parsed JSON schema.

    Returns:
        A ``BaseModel`` subclass named ``DynamicSchema``. Properties listed in
        ``required`` become required fields; all others are nullable and
        default to ``None``.
    """
    # Local import keeps this block self-contained; the module header only
    # imports Dict, Any and Type from typing.
    from typing import Optional

    # ``or`` also covers an explicit JSON null for either key.
    properties = schema_dict.get('properties') or {}
    required = set(schema_dict.get('required') or [])

    fields = {}
    for field_name, field_info in properties.items():
        field_type = get_python_type(field_info.get('type', 'string'))
        field_description = field_info.get('description', '')

        if field_name in required:
            fields[field_name] = (field_type, Field(description=field_description))
        else:
            # A None default alone does not make the field nullable: an
            # explicit null in the extracted data would fail validation
            # unless the annotation itself is Optional.
            fields[field_name] = (
                Optional[field_type],
                Field(default=None, description=field_description),
            )

    return create_model('DynamicSchema', **fields)
81
-
82
def python_class_to_basemodel(class_definition: str) -> Type[BaseModel]:
    """Execute a Python class definition and return the model class it defines.

    Args:
        class_definition: Python source expected to define a ``BaseModel``
            subclass.

    Returns:
        The first ``BaseModel`` subclass found in the executed namespace.

    Raises:
        ValueError: If the source fails to execute or defines no ``BaseModel``
            subclass. The underlying error is chained as ``__cause__``.
    """
    namespace = {
        'BaseModel': BaseModel, 'Field': Field,
        'str': str, 'int': int, 'float': float, 'bool': bool,
        'list': list, 'dict': dict,
    }
    try:
        # SECURITY: exec() runs arbitrary code. Supplying a custom globals
        # dict does NOT sandbox it -- Python adds __builtins__ automatically,
        # so the text can import modules, read the environment, etc. This
        # function is reachable from user-supplied schema text; restrict or
        # replace this path (e.g. parse with ast) before exposing the app to
        # untrusted users.
        exec(class_definition, namespace)

        # Find the class that inherits from BaseModel
        for obj in namespace.values():
            if isinstance(obj, type) and issubclass(obj, BaseModel) and obj is not BaseModel:
                return obj

        raise ValueError("No BaseModel class found in definition")
    except Exception as e:
        raise ValueError(f"Invalid Python class definition: {str(e)}") from e
100
-
101
def simple_fields_to_basemodel(fields_text: str) -> Type[BaseModel]:
    """Build a model class from line-based field definitions.

    Each non-empty line that does not start with ``#`` declares one field:

    * ``name: type = description`` -- typed field with a description
    * ``name: type``               -- typed field, empty description
    * ``name``                     -- ``str`` field, empty description

    Args:
        fields_text: Newline-separated field definitions.

    Returns:
        A ``BaseModel`` subclass named ``DynamicSchema``.

    Raises:
        ValueError: If no field definitions are found.
    """
    model_fields = {}

    for raw_line in fields_text.strip().split('\n'):
        entry = raw_line.strip()
        if not entry or entry.startswith('#'):
            continue

        name_part, colon, remainder = entry.partition(':')
        if not colon:
            # Bare field name: defaults to a string field.
            model_fields[entry] = (str, Field(description=""))
            continue

        # Everything after the first ':' is "type" or "type = description".
        type_text, equals, desc_text = remainder.strip().partition('=')
        resolved_type = get_python_type(type_text.strip())
        description = desc_text.strip().strip('"\'') if equals else ""
        model_fields[name_part.strip()] = (resolved_type, Field(description=description))

    if not model_fields:
        raise ValueError("No valid fields found in schema definition")

    return create_model('DynamicSchema', **model_fields)
134
-
135
def get_python_type(type_str: str):
    """Map a type name to the corresponding Python type.

    Accepts both JSON Schema names (``string``, ``integer``, ``number``,
    ``boolean``, ``array``, ``object``) and Python names (``str``, ``int``,
    ``float``, ``bool``, ``list``, ``dict``). Matching ignores case and
    surrounding whitespace.

    Args:
        type_str: Type name. A list/tuple of names (the JSON Schema
            multi-type form such as ``["string", "null"]``) is also accepted;
            the first entry that is not ``"null"`` is used.

    Returns:
        The matching Python type, or ``str`` for unknown, empty or non-string
        input.
    """
    type_mapping = {
        'string': str, 'str': str,
        'integer': int, 'int': int,
        'number': float, 'float': float,
        'boolean': bool, 'bool': bool,
        'array': list, 'list': list,
        'object': dict, 'dict': dict,
    }

    if isinstance(type_str, (list, tuple)):
        # JSON Schema allows "type" to be a list; "null" only marks the value
        # as nullable, so skip it when picking the base type.
        candidates = [
            name for name in type_str
            if isinstance(name, str) and name.strip().lower() != 'null'
        ]
        type_str = candidates[0] if candidates else 'string'

    if not isinstance(type_str, str):
        # Anything else (None, numbers, nested objects) falls back to str,
        # the same default used for unknown names.
        return str

    return type_mapping.get(type_str.strip().lower(), str)
147
-
148
def webpage_to_json_wrapper(content: str, is_url: bool, schema_input: str) -> Dict[str, Any]:
    """Parse the schema text, then run the extraction with the resulting model.

    Args:
        content: URL or raw text/HTML to analyse.
        is_url: Whether ``content`` is a URL.
        schema_input: Schema text accepted by ``parse_schema_input``.

    Returns:
        The result of ``webpage_to_json``. On failure, a dictionary with a
        single ``"error"`` key; the message prefix tells whether the schema
        could not be parsed or the extraction itself failed.
    """
    # Keep the two stages in separate try blocks so a failure during
    # extraction is not reported as a schema problem.
    try:
        schema_model = parse_schema_input(schema_input)
    except Exception as e:
        return {"error": f"Schema parsing error: {str(e)}"}

    try:
        return webpage_to_json(content, is_url, schema_model)
    except Exception as e:
        return {"error": f"Processing error: {str(e)}"}
159
-
160
def webpage_to_json(content: str, is_url: bool, schema: Type[BaseModel]) -> Dict[str, Any]:
    """Extract structured JSON from content according to a schema.

    A pipeline is assembled on every call from:

    - a ``BasicPreprocessor`` configured to keep tags,
    - an ``LLMClassifierExtractor`` combining an LLM client, a reranker client,
      the extraction prompt and the schema's JSON schema,
    - a ``PostProcessor``.

    Args:
        content: The input to analyse; either raw text/HTML or a URL.
        is_url: True when ``content`` is a URL, False for raw text.
        schema: Pydantic model class describing the expected output.

    Returns:
        Whatever ``Pipeline.run`` returns for the input. If initialisation or
        processing raises, a dictionary with a single ``"error"`` key and a
        descriptive message is returned instead.
    """
    # The prompt contains task instructions only; {content} and {schema} are
    # placeholders for the extractor to fill in.
    prompt_template = """Extract the following information from the provided content according to the specified schema.

Content to analyze:
{content}

Schema requirements:
{schema}

Instructions:
- Extract only information that is explicitly present in the content
- Follow the exact structure and data types specified in the schema
- If a required field cannot be found, indicate this clearly
- Preserve the original formatting and context where relevant
- Return the extracted data in the format specified by the schema
- STICK TO THE SCHEMA DON'T EVEN THINK OF DOING SOMETHING ELSE
- IF THE SCHEMA ASKS FOR AN ARRAY THEN YOU MAY TRY TO EXTRACT ONE IF THERE IS"""

    # Initialize pipeline components
    # TODO: improve the RAG system and optimize (don't instantiate every time)
    try:
        llm = NvidiaLLMClient(config={
            'api_key': os.getenv('NVIDIA_API_KEY'),
            'model_name': 'google/gemma-3n-e2b-it',
        })
        reranker = ModalRerankerClient("https://abdulrahmanmfam2003--qwen3-reranker-html-rerank.modal.run")
    except Exception as e:
        return {"error": f"Failed to initialize LLM client: {str(e)}"}

    # Guard the remaining setup too, so every initialisation failure is
    # reported through the "error" key rather than raised to the caller.
    try:
        preprocessor = BasicPreprocessor(config={'keep_tags': True})
        ai_extractor = LLMClassifierExtractor(
            reranker=reranker,
            llm_client=llm,
            prompt_template=prompt_template,
            # The model's JSON schema is what the classifier is prompted with.
            classifier_prompt=schema.model_json_schema(),
        )
        postprocessor = PostProcessor()
        pipeline = Pipeline(preprocessor, ai_extractor, postprocessor)
    except Exception as e:
        return {"error": f"Failed to initialize pipeline: {str(e)}"}

    try:
        result = pipeline.run(content, is_url, schema)
        print("-"*80)
        print(f"Processed result: {result}")
        return result
    except Exception as e:
        return {"error": f"Processing error: {str(e)}"}
217
-
218
# Markdown help text shown under the schema input box.
example_schemas = """
**Example Schema Formats:**

1. **Simple field definitions:**
```
title: str = Page title
price: float = Product price
description: str = Product description
available: bool = Is available
```

2. **JSON Schema:**
```json
{
    "properties": {
        "title": {"type": "string", "description": "Page title"},
        "price": {"type": "number", "description": "Product price"},
        "description": {"type": "string", "description": "Product description"}
    },
    "required": ["title"]
}
```

3. **Python Class Definition:**
```python
class ProductSchema(BaseModel):
    title: str = Field(description="Product title")
    price: float = Field(description="Product price")
    description: str = Field(description="Product description")
    available: bool = Field(default=False, description="Availability status")
```
"""

# Schema text for the two clickable examples.
_simple_fields_example = "\n".join([
    "title: str = Page title",
    "price: float = Product price",
    "description: str = Description",
])

_json_schema_example = '''{
    "type": "object",
    "properties": {
        "title": {
            "type": "string",
            "description": "Name of the product"
        },
        "price": {
            "type": "number",
            "description": "Price of the product"
        },
        "description": {
            "type": "string",
            "description": "Detailed description of the product"
        },
        "availability": {
            "type": "boolean",
            "description": "Whether the product is in stock (true) or not (false)"
        }
    },
    "required": ["title", "price"]
}'''

# Input widgets, in the positional order expected by webpage_to_json_wrapper.
_content_box = gr.Textbox(
    label="Content (URL or Raw Text)",
    lines=10,
    placeholder="Enter URL or paste raw HTML/text here.",
)
_is_url_box = gr.Checkbox(label="Content is URL?", value=False)
_schema_box = gr.Textbox(
    label="Schema Definition",
    lines=15,
    placeholder="Define your extraction schema (see examples below)",
    info=example_schemas,
)

# Build Gradio Interface
demo = gr.Interface(
    fn=webpage_to_json_wrapper,
    inputs=[_content_box, _is_url_box, _schema_box],
    outputs=gr.JSON(label="Output JSON"),
    title="Webpage to JSON Converter",
    description="Convert web pages or raw text into structured JSON using customizable schemas. Define your schema using simple field definitions, JSON schema, or Python class syntax.",
    examples=[
        ["https://example.com", True, _simple_fields_example],
        [
            "<h1>Sample Product</h1><p>Price: $29.99</p><p>Great quality item</p>",
            False,
            _json_schema_example,
        ],
    ],
)

if __name__ == "__main__":
    demo.launch(mcp_server=True)