Spaces:

garage-lab
/

MCP_HTML2JSON

Building

MCP_HTML2JSON / web2json /pipeline.py

done with version 2

c67f04e 8 days ago

1.6 kB

	from web2json.ai_extractor import *
	from web2json.postprocessor import *
	from web2json.preprocessor import *
	from pydantic import BaseModel

	class Pipeline:
	    """Run content through three stages: preprocess, AI-extract, postprocess.

	    The three collaborators are injected through the constructor and are used
	    only through one method each: ``preprocessor.preprocess(content, is_url)``,
	    ``ai_extractor.extract(preprocessed_content, schema)`` and
	    ``postprocessor.process(extracted_data)``.
	    """

	    # Maximum number of characters of the extraction result echoed to stdout.
	    _PREVIEW_CHARS = 100
	    # Divider line printed after each stage's debug output.
	    _DIVIDER = '+' * 80

	    def __init__(self,
	                 preprocessor: Preprocessor,
	                 ai_extractor: AIExtractor,
	                 postprocessor: PostProcessor):
	        """Store the three stage objects used by :meth:`run`.

	        Args:
	            preprocessor: Object whose ``preprocess(content, is_url)`` method
	                is called first.
	            ai_extractor: Object whose ``extract(content, schema)`` method is
	                called on the preprocessed content.
	            postprocessor: Object whose ``process(data)`` method is called on
	                the extraction result.
	        """
	        self.preprocessor = preprocessor
	        self.ai_extractor = ai_extractor
	        self.postprocessor = postprocessor

	    def run(self, content: str, is_url: bool, schema: BaseModel) -> dict:
	        """Run the entire pipeline: preprocess, extract, and postprocess.

	        Progress is echoed to stdout after every stage, each followed by a
	        divider line.

	        Args:
	            content: The raw content to process.
	            is_url: Whether ``content`` is a URL or raw text; forwarded
	                unchanged to the preprocessor.
	            schema: The schema defining the structure of the expected output;
	                forwarded unchanged to the AI extractor.
	                NOTE(review): callers may actually pass a ``BaseModel``
	                subclass rather than an instance -- confirm against the
	                extractor before tightening the annotation.

	        Returns:
	            Whatever ``postprocessor.process`` returns for the extracted data
	            (annotated as a dict).
	        """
	        # Step 1: Preprocess the content
	        preprocessed_content = self.preprocessor.preprocess(content, is_url)
	        print(f"Preprocessed content: {preprocessed_content}...")
	        print(self._DIVIDER)

	        # Step 2: Extract structured information using AI
	        extracted_data = self.ai_extractor.extract(preprocessed_content, schema)
	        # Build the preview from str(...) rather than slicing the object itself:
	        # slicing a non-sequence result (e.g. a dict) would raise and abort the
	        # pipeline just for a debug message. For plain strings the printed text
	        # is unchanged.
	        preview = str(extracted_data)[:self._PREVIEW_CHARS]
	        print(f"Extracted data: {preview}...")
	        print(self._DIVIDER)

	        # Step 3: Post-process the extracted data
	        final_output = self.postprocessor.process(extracted_data)
	        print(f"Final output: {final_output}")
	        print(self._DIVIDER)

	        return final_output