"""Gradio app that scrapes a web page with ScrapeGraphAI and a Hugging Face LLM.

The user supplies a natural-language prompt and a source URL; the scraper's
answer is normalised to JSON-compatible data and shown in a ``gr.JSON`` panel.
"""

import json
import os
import subprocess
from typing import Any

import gradio as gr
from dotenv import load_dotenv
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain_community.llms import HuggingFaceEndpoint
from scrapegraphai.graphs import SmartScraperGraph
from scrapegraphai.utils import prettify_exec_info

# Ensure Playwright installs required browsers and dependencies.
# Best effort: the exit status is deliberately not checked.
subprocess.run(["playwright", "install"], check=False)
# subprocess.run(["playwright", "install-deps"])

# Load environment variables
load_dotenv()
# None when the variable is not set in the environment or the .env file.
HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')

# Initialize the model instances
repo_id = "mistralai/Mistral-7B-Instruct-v0.2"

# NOTE(review): the HuggingFaceEndpoint reference documents
# `huggingfacehub_api_token` / `max_new_tokens`; confirm that `token` and
# `max_length` are honoured by the installed langchain_community version.
llm_model_instance = HuggingFaceEndpoint(
    repo_id=repo_id,
    max_length=128,
    temperature=0.3,
    token=HUGGINGFACEHUB_API_TOKEN,
)

embedder_model_instance = HuggingFaceInferenceAPIEmbeddings(
    api_key=HUGGINGFACEHUB_API_TOKEN,
    model_name="sentence-transformers/all-MiniLM-l6-v2",
)

graph_config = {
    "llm": {"model_instance": llm_model_instance},
    "embeddings": {"model_instance": embedder_model_instance},
}


def _find_embedded_json(text: str) -> Any:
    """Return the first JSON array or object embedded in *text*, or None.

    Every ``[`` or ``{`` is tried as a possible start, in order of appearance.
    ``raw_decode`` stops at the end of the JSON value, so prose before or after
    the value is ignored. The result is always a list or a dict, which makes
    ``None`` an unambiguous "nothing found" marker.
    """
    decoder = json.JSONDecoder()
    for position, char in enumerate(text):
        if char not in "[{":
            continue
        try:
            value, _end = decoder.raw_decode(text, position)
        except json.JSONDecodeError:
            continue
        return value
    return None


def _coerce_to_json(result: Any) -> Any:
    """Normalise a scraper result into JSON-compatible data.

    Args:
        result: A dict or list (returned unchanged), or JSON text, possibly
            surrounded by non-JSON commentary.

    Returns:
        The dict/list itself, or the value decoded from the text.

    Raises:
        ValueError: If no JSON can be decoded from *result*.
    """
    if isinstance(result, (dict, list)):
        return result
    try:
        return json.loads(result)
    except (TypeError, json.JSONDecodeError) as err:
        # TypeError: the result is not text at all (for example None).
        # JSONDecodeError: the text is not pure JSON, so try to salvage an
        # array or object wrapped in surrounding commentary.
        if isinstance(result, str):
            embedded = _find_embedded_json(result)
            if embedded is not None:
                return embedded
        raise ValueError(f"Invalid JSON output: {result}") from err


def scrape_and_summarize(prompt, source):
    """Run a SmartScraperGraph for *prompt* on *source* and return JSON data.

    Args:
        prompt: Natural-language description of what to extract.
        source: The source passed to the scraper (the UI supplies a URL).

    Returns:
        The scraper's answer as JSON-compatible data (see ``_coerce_to_json``).

    Raises:
        ValueError: If the scraper's output cannot be interpreted as JSON.
    """
    smart_scraper_graph = SmartScraperGraph(
        prompt=prompt,
        source=source,
        config=graph_config,
    )
    result = smart_scraper_graph.run()
    # Ensure the result is properly formatted as JSON
    return _coerce_to_json(result)


# User-facing description, built from adjacent literals so the runtime text is
# exactly the original Markdown string.
APP_DESCRIPTION = (
    "This is a no code ML app for scraping <br> 1. \n"
    "Just provide the Prompt, ie., the items you wanna Scrap from the website <br> "
    "2. Provide the url for the site you wanna Scrap, click Generate<br> "
    "And BOOM 💥 you can copy the result and view the execution details "
    "in the right side pannel "
)

# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("<h1>Websites Scraper using Mistral AI</h1>")
    gr.Markdown(APP_DESCRIPTION)

    with gr.Row():
        with gr.Column():
            prompt_input = gr.Textbox(
                label="Prompt",
                value="List me all the hospital or clinic names and their opening closing time, if the mobile number is present provide it too.",
            )
            source_input = gr.Textbox(
                label="Source URL",
                value="https://www.yelp.com/biz/all-smiles-dental-san-francisco-5?osq=dentist",
            )
            scrape_button = gr.Button("Generate")

        with gr.Column():
            result_output = gr.JSON(label="Result")

    # Event listeners must be registered inside the Blocks context.
    scrape_button.click(
        scrape_and_summarize,
        inputs=[prompt_input, source_input],
        outputs=[result_output],
    )

# Launch the Gradio app
if __name__ == "__main__":
    demo.launch()