""" Web Scraper and Summarizer using Mistral AI. This module provides a Gradio-based web application for scraping websites and summarizing content using the Mistral AI language model. It allows users to input a prompt and a source URL, then generates a JSON output of the scraped and summarized information. Developer: Vicky_111 LinkedIn: https://www.linkedin.com/in/itz-me-vicky111/ """ import os import json import subprocess from typing import Dict, Any import gradio as gr from dotenv import load_dotenv from scrapegraphai.graphs import SmartScraperGraph from scrapegraphai.utils import prettify_exec_info from langchain_community.llms import HuggingFaceEndpoint from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings # Ensure Playwright installs required browsers and dependencies subprocess.run(["playwright", "install"], check=True) # subprocess.run(["playwright", "install-deps"]) # Load environment variables load_dotenv() HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN') # Initialize the model instances REPO_ID = "mistralai/Mistral-7B-Instruct-v0.2" llm_model_instance = HuggingFaceEndpoint( repo_id=REPO_ID, max_length=128, temperature=0.3, token=HUGGINGFACEHUB_API_TOKEN ) # Embed using Hugging face interferance embedding embedder_model_instance = HuggingFaceInferenceAPIEmbeddings( api_key=HUGGINGFACEHUB_API_TOKEN, model_name="sentence-transformers/all-MiniLM-l6-v2" ) graph_config = { "llm": {"model_instance": llm_model_instance}, "embeddings": {"model_instance": embedder_model_instance} } # Using smart scraper graph the content is scrapped and summarised def scrape_and_summarize(prompt: str, source: str) -> Dict[str, Any]: """ Scrape a website and summarize its content based on a given prompt. This function uses the SmartScraperGraph to scrape the provided URL and generate a summary based on the given prompt. It ensures the output is in a valid JSON format. Args: prompt (str): The prompt to guide the scraping and summarization. source (str): The URL of the website to scrape. Returns: Dict[str, Any]: A JSON-formatted dictionary containing the scraped and summarized information. Raises: ValueError: If the output cannot be parsed as valid JSON. """ smart_scraper_graph = SmartScraperGraph( prompt=prompt, source=source, config=graph_config ) result = smart_scraper_graph.run() # Ensure the result is properly formatted as JSON if isinstance(result, dict): return result try: return json.loads(result) except json.JSONDecodeError as e: # Attempt to extract JSON from the result start_index = result.find("[") end_index = result.rfind("]") if start_index != -1 and end_index != -1: json_str = result[start_index:end_index+1] try: return json.loads(json_str) except json.JSONDecodeError as inner_e: raise ValueError(f"Invalid JSON output: {result}") from inner_e else: raise ValueError(f"Invalid JSON output: {result}") from e # Gradio User interface with gr.Blocks() as demo: gr.Markdown("