""" Web Scraper and Summarizer using Mistral AI. This module provides a Gradio-based web application for scraping websites and summarizing content using the Mistral AI language model. It allows users to input a prompt and a source URL, then generates a JSON output of the scraped and summarized information. Developer: Vicky_111 LinkedIn: https://www.linkedin.com/in/itz-me-vicky111/ """ import os import json import subprocess from typing import Dict, Any import gradio as gr from dotenv import load_dotenv from scrapegraphai.graphs import SmartScraperGraph from scrapegraphai.utils import prettify_exec_info from langchain_community.llms import HuggingFaceEndpoint from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings # Ensure Playwright installs required browsers and dependencies subprocess.run(["playwright", "install"], check=True) # subprocess.run(["playwright", "install-deps"]) # Load environment variables load_dotenv() HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN') # Initialize the model instances REPO_ID = "mistralai/Mistral-7B-Instruct-v0.2" llm_model_instance = HuggingFaceEndpoint( repo_id=REPO_ID, max_length=128, temperature=0.3, token=HUGGINGFACEHUB_API_TOKEN ) # Embed using Hugging face interferance embedding embedder_model_instance = HuggingFaceInferenceAPIEmbeddings( api_key=HUGGINGFACEHUB_API_TOKEN, model_name="sentence-transformers/all-MiniLM-l6-v2" ) graph_config = { "llm": {"model_instance": llm_model_instance}, "embeddings": {"model_instance": embedder_model_instance} } # Using smart scraper graph the content is scrapped and summarised def scrape_and_summarize(prompt: str, source: str) -> Dict[str, Any]: """ Scrape a website and summarize its content based on a given prompt. This function uses the SmartScraperGraph to scrape the provided URL and generate a summary based on the given prompt. It ensures the output is in a valid JSON format. Args: prompt (str): The prompt to guide the scraping and summarization. source (str): The URL of the website to scrape. Returns: Dict[str, Any]: A JSON-formatted dictionary containing the scraped and summarized information. Raises: ValueError: If the output cannot be parsed as valid JSON. """ smart_scraper_graph = SmartScraperGraph( prompt=prompt, source=source, config=graph_config ) result = smart_scraper_graph.run() # Ensure the result is properly formatted as JSON if isinstance(result, dict): return result try: return json.loads(result) except json.JSONDecodeError as e: # Attempt to extract JSON from the result start_index = result.find("[") end_index = result.rfind("]") if start_index != -1 and end_index != -1: json_str = result[start_index:end_index+1] try: return json.loads(json_str) except json.JSONDecodeError as inner_e: raise ValueError(f"Invalid JSON output: {result}") from inner_e else: raise ValueError(f"Invalid JSON output: {result}") from e # Gradio User interface with gr.Blocks() as demo: gr.Markdown("

Websites Scraper using Mistral AI

") gr.Markdown(""" This is a no code ML app for scraping
1. Just provide the Prompt, i.e., the items you want to scrape from the website
2. Provide the URL for the site you want to scrape, click Generate
And BOOM 💥 you can copy the result and view the execution details in the right side panel """) with gr.Row(): with gr.Column(): prompt_input = gr.Textbox( label="Prompt", value="List me all the hospital or clinic names and their opening closing time, if the mobile number is present provide it too." ) source_input = gr.Textbox( label="Source URL", value="https://www.yelp.com/biz/all-smiles-dental-san-francisco-5?osq=dentist" ) scrape_button = gr.Button("Generate") with gr.Column(): result_output = gr.JSON(label="Result") scrape_button.click( scrape_and_summarize, inputs=[prompt_input, source_input], outputs=[result_output] ) if __name__ == "__main__": demo.launch()