# MistralScrapy / app.py
# Hugging Face Space by IamVicky111 (commit d2eaed3)
"""
Web Scraper and Summarizer using Mistral AI.
This module provides a Gradio-based web application for scraping websites
and summarizing content using the Mistral AI language model. It allows users
to input a prompt and a source URL, then generates a JSON output of the
scraped and summarized information.
Developer: Vicky_111
LinkedIn: https://www.linkedin.com/in/itz-me-vicky111/
"""
import os
import json
import subprocess
from typing import Dict, Any
import gradio as gr
from dotenv import load_dotenv
from scrapegraphai.graphs import SmartScraperGraph
from scrapegraphai.utils import prettify_exec_info
from langchain_community.llms import HuggingFaceEndpoint
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
# Ensure Playwright downloads the browsers SmartScraperGraph drives.
# NOTE(review): this runs on every import of the module, not only on first
# deploy — confirm that is intended for the target host.
subprocess.run(["playwright", "install"], check=True)
# subprocess.run(["playwright", "install-deps"])  # system deps; enable if the host lacks them

# Load environment variables from a local .env file, if one exists.
load_dotenv()
# Token for the Hugging Face Inference API; None when the variable is unset
# (requests would then fail with an auth error at call time).
HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')

# Initialize the model instances
# LLM endpoint used to summarize the scraped page content.
REPO_ID = "mistralai/Mistral-7B-Instruct-v0.2"
llm_model_instance = HuggingFaceEndpoint(
    repo_id=REPO_ID,
    max_length=128,  # cap on generated tokens — TODO confirm param name is current
    temperature=0.3,  # low temperature for mostly-deterministic summaries
    token=HUGGINGFACEHUB_API_TOKEN
)
# Embedder served via the Hugging Face Inference API (used by the graph to
# embed page chunks for retrieval).
embedder_model_instance = HuggingFaceInferenceAPIEmbeddings(
    api_key=HUGGINGFACEHUB_API_TOKEN,
    model_name="sentence-transformers/all-MiniLM-l6-v2"
)
# Graph configuration: hand SmartScraperGraph pre-built model instances
# instead of provider/model-name strings.
graph_config = {
    "llm": {"model_instance": llm_model_instance},
    "embeddings": {"model_instance": embedder_model_instance}
}
# Using smart scraper graph the content is scrapped and summarised
def scrape_and_summarize(prompt: str, source: str) -> Dict[str, Any]:
"""
Scrape a website and summarize its content based on a given prompt.
This function uses the SmartScraperGraph to scrape the provided URL
and generate a summary based on the given prompt. It ensures the output
is in a valid JSON format.
Args:
prompt (str): The prompt to guide the scraping and summarization.
source (str): The URL of the website to scrape.
Returns:
Dict[str, Any]: A JSON-formatted dictionary containing the scraped
and summarized information.
Raises:
ValueError: If the output cannot be parsed as valid JSON.
"""
smart_scraper_graph = SmartScraperGraph(
prompt=prompt,
source=source,
config=graph_config
)
result = smart_scraper_graph.run()
# Ensure the result is properly formatted as JSON
if isinstance(result, dict):
return result
try:
return json.loads(result)
except json.JSONDecodeError as e:
# Attempt to extract JSON from the result
start_index = result.find("[")
end_index = result.rfind("]")
if start_index != -1 and end_index != -1:
json_str = result[start_index:end_index+1]
try:
return json.loads(json_str)
except json.JSONDecodeError as inner_e:
raise ValueError(f"Invalid JSON output: {result}") from inner_e
else:
raise ValueError(f"Invalid JSON output: {result}") from e
# Gradio user interface: prompt + URL inputs on the left, JSON result on the
# right. Indentation restored here; widget order matches the original.
with gr.Blocks() as demo:
    gr.Markdown("<h1>Websites Scraper using Mistral AI</h1>")
    gr.Markdown("""
This is a no code ML app for scraping <br>
1. Just provide the Prompt, i.e., the items you want to scrape from the website <br>
2. Provide the URL for the site you want to scrape, click Generate<br>
And BOOM πŸ’₯ you can copy the result and view the execution details in the right side panel
""")
    with gr.Row():
        with gr.Column():
            # Left column: user inputs, pre-filled with a worked example.
            prompt_input = gr.Textbox(
                label="Prompt",
                value="List me all the hospital or clinic names and their opening closing time, if the mobile number is present provide it too."
            )
            source_input = gr.Textbox(
                label="Source URL",
                value="https://www.yelp.com/biz/all-smiles-dental-san-francisco-5?osq=dentist"
            )
            scrape_button = gr.Button("Generate")
        with gr.Column():
            # Right column: renders the returned dict as formatted JSON.
            result_output = gr.JSON(label="Result")
    # Wire the button: Gradio calls scrape_and_summarize(prompt, url) and
    # feeds the return value into the JSON viewer.
    scrape_button.click(
        scrape_and_summarize,
        inputs=[prompt_input, source_input],
        outputs=[result_output]
    )

# Launch the app only when run as a script (not on import).
if __name__ == "__main__":
    demo.launch()