# Hugging Face Space: web scraper and summarizer app (status page residue removed)
""" | |
Web Scraper and Summarizer using Mistral AI. | |
This module provides a Gradio-based web application for scraping websites | |
and summarizing content using the Mistral AI language model. It allows users | |
to input a prompt and a source URL, then generates a JSON output of the | |
scraped and summarized information. | |
Developer: Vicky_111 | |
LinkedIn: https://www.linkedin.com/in/itz-me-vicky111/ | |
""" | |
import os
import json
import subprocess
from typing import Dict, Any

import gradio as gr
from dotenv import load_dotenv
from scrapegraphai.graphs import SmartScraperGraph
from scrapegraphai.utils import prettify_exec_info
from langchain_community.llms import HuggingFaceEndpoint
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings

# Ensure Playwright installs the browsers it needs for scraping.
# check=True aborts startup on a failed install instead of failing later
# with an obscure browser-launch error.
subprocess.run(["playwright", "install"], check=True)
# subprocess.run(["playwright", "install-deps"])

# Load environment variables from a local .env file, if present.
load_dotenv()
HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
if not HUGGINGFACEHUB_API_TOKEN:
    # Fail fast with an actionable message rather than an opaque
    # authentication error on the first inference call.
    raise RuntimeError(
        "HUGGINGFACEHUB_API_TOKEN is not set; add it to your environment "
        "or a .env file."
    )

# LLM used for scraping guidance and summarization.
REPO_ID = "mistralai/Mistral-7B-Instruct-v0.2"
llm_model_instance = HuggingFaceEndpoint(
    repo_id=REPO_ID,
    # NOTE(review): HF text-generation endpoints usually take
    # `max_new_tokens`; confirm `max_length` is honored here.
    max_length=128,
    temperature=0.3,  # low temperature -> more deterministic summaries
    # NOTE(review): the documented field is `huggingfacehub_api_token`;
    # confirm `token` is forwarded correctly by this langchain version.
    token=HUGGINGFACEHUB_API_TOKEN
)

# Embeddings via the Hugging Face Inference API.
embedder_model_instance = HuggingFaceInferenceAPIEmbeddings(
    api_key=HUGGINGFACEHUB_API_TOKEN,
    model_name="sentence-transformers/all-MiniLM-l6-v2"
)

# Configuration consumed by SmartScraperGraph: it pulls the LLM and the
# embedder from these pre-built model instances.
graph_config = {
    "llm": {"model_instance": llm_model_instance},
    "embeddings": {"model_instance": embedder_model_instance}
}
# Using the smart scraper graph, the content is scraped and summarised.
def scrape_and_summarize(prompt: str, source: str) -> Dict[str, Any]:
    """
    Scrape a website and summarize its content based on a given prompt.

    Runs SmartScraperGraph over ``source`` guided by ``prompt`` and coerces
    the model output into a JSON-compatible structure.

    Args:
        prompt (str): The prompt to guide the scraping and summarization.
        source (str): The URL of the website to scrape.

    Returns:
        Dict[str, Any]: A JSON-formatted dictionary containing the scraped
        and summarized information.

    Raises:
        ValueError: If the output cannot be parsed as valid JSON.
    """
    smart_scraper_graph = SmartScraperGraph(
        prompt=prompt,
        source=source,
        config=graph_config
    )
    result = smart_scraper_graph.run()
    return _coerce_json(result)


def _coerce_json(result: Any) -> Dict[str, Any]:
    """Return ``result`` parsed as JSON, salvaging an embedded payload.

    A dict is returned as-is. A string is parsed directly; if that fails,
    the outermost JSON array or object embedded in surrounding prose is
    extracted and parsed (LLMs often wrap JSON in explanatory text).

    Raises:
        ValueError: If no valid JSON can be recovered.
    """
    if isinstance(result, dict):
        return result
    try:
        return json.loads(result)
    except json.JSONDecodeError as e:
        # Try both delimiter pairs: the original code only salvaged
        # arrays, but models frequently emit a top-level object.
        for opener, closer in (("[", "]"), ("{", "}")):
            start_index = result.find(opener)
            end_index = result.rfind(closer)
            if start_index != -1 and end_index > start_index:
                try:
                    return json.loads(result[start_index:end_index + 1])
                except json.JSONDecodeError:
                    continue
        raise ValueError(f"Invalid JSON output: {result}") from e
# Gradio user interface: prompt + URL inputs on the left, JSON result on
# the right; the Generate button wires both inputs into the scraper.
with gr.Blocks() as demo:
    gr.Markdown("<h1>Websites Scraper using Mistral AI</h1>")
    gr.Markdown("""
    This is a no code ML app for scraping <br>
    1. Just provide the Prompt, i.e., the items you want to scrape from the website <br>
    2. Provide the URL for the site you want to scrape, click Generate<br>
    And BOOM π₯ you can copy the result and view the execution details in the right side panel
    """)

    with gr.Row():
        with gr.Column():
            # Left column: the two user inputs and the trigger button.
            prompt_box = gr.Textbox(
                label="Prompt",
                value="List me all the hospital or clinic names and their opening closing time, if the mobile number is present provide it too."
            )
            url_box = gr.Textbox(
                label="Source URL",
                value="https://www.yelp.com/biz/all-smiles-dental-san-francisco-5?osq=dentist"
            )
            generate_btn = gr.Button("Generate")
        with gr.Column():
            # Right column: the scraped/summarized result as JSON.
            json_out = gr.JSON(label="Result")

    generate_btn.click(
        scrape_and_summarize,
        inputs=[prompt_box, url_box],
        outputs=[json_out]
    )

if __name__ == "__main__":
    demo.launch()