# Hugging Face Space: web scraper and summarizer app (status page residue removed)
""" | |
Web Scraper and Summarizer using Mistral AI. | |
This module provides a Gradio-based web application for scraping websites | |
and summarizing content using the Mistral AI language model. It allows users | |
to input a prompt and a source URL, then generates a JSON output of the | |
scraped and summarized information. | |
Developer: Vicky_111 | |
LinkedIn: https://www.linkedin.com/in/itz-me-vicky111/ | |
""" | |
import os
import json
import subprocess
from typing import Dict, Any

import gradio as gr
from dotenv import load_dotenv
from scrapegraphai.graphs import SmartScraperGraph
from scrapegraphai.utils import prettify_exec_info
from langchain_community.llms import HuggingFaceEndpoint
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings

# Ensure Playwright installs the browsers it needs for scraping.
# check=True aborts startup on a failed install instead of failing later
# with an obscure browser-launch error.
subprocess.run(["playwright", "install"], check=True)
# subprocess.run(["playwright", "install-deps"])

# Load environment variables from a local .env file, if present.
load_dotenv()
HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
if not HUGGINGFACEHUB_API_TOKEN:
    # Fail fast with an actionable message rather than an opaque
    # authentication error on the first inference call.
    raise RuntimeError(
        "HUGGINGFACEHUB_API_TOKEN is not set; add it to your environment "
        "or a .env file."
    )

# LLM used for scraping guidance and summarization.
REPO_ID = "mistralai/Mistral-7B-Instruct-v0.2"
llm_model_instance = HuggingFaceEndpoint(
    repo_id=REPO_ID,
    # NOTE(review): HF text-generation endpoints usually take
    # `max_new_tokens`; confirm `max_length` is honored here.
    max_length=128,
    temperature=0.3,  # low temperature -> more deterministic summaries
    # NOTE(review): the documented field is `huggingfacehub_api_token`;
    # confirm `token` is forwarded correctly by this langchain version.
    token=HUGGINGFACEHUB_API_TOKEN
)

# Embeddings via the Hugging Face Inference API.
embedder_model_instance = HuggingFaceInferenceAPIEmbeddings(
    api_key=HUGGINGFACEHUB_API_TOKEN,
    model_name="sentence-transformers/all-MiniLM-l6-v2"
)

# Configuration consumed by SmartScraperGraph: it pulls the LLM and the
# embedder from these pre-built model instances.
graph_config = {
    "llm": {"model_instance": llm_model_instance},
    "embeddings": {"model_instance": embedder_model_instance}
}
# Using the smart scraper graph, the content is scraped and summarised.
def scrape_and_summarize(prompt: str, source: str) -> Dict[str, Any]:
    """
    Scrape a website and summarize its content based on a given prompt.

    Runs SmartScraperGraph over ``source`` guided by ``prompt`` and coerces
    the model output into a JSON-compatible structure.

    Args:
        prompt (str): The prompt to guide the scraping and summarization.
        source (str): The URL of the website to scrape.

    Returns:
        Dict[str, Any]: A JSON-formatted dictionary containing the scraped
        and summarized information.

    Raises:
        ValueError: If the output cannot be parsed as valid JSON.
    """
    smart_scraper_graph = SmartScraperGraph(
        prompt=prompt,
        source=source,
        config=graph_config
    )
    result = smart_scraper_graph.run()
    return _coerce_json(result)


def _coerce_json(result: Any) -> Dict[str, Any]:
    """Return ``result`` parsed as JSON, salvaging an embedded payload.

    A dict is returned as-is. A string is parsed directly; if that fails,
    the outermost JSON array or object embedded in surrounding prose is
    extracted and parsed (LLMs often wrap JSON in explanatory text).

    Raises:
        ValueError: If no valid JSON can be recovered.
    """
    if isinstance(result, dict):
        return result
    try:
        return json.loads(result)
    except json.JSONDecodeError as e:
        # Try both delimiter pairs: the original code only salvaged
        # arrays, but models frequently emit a top-level object.
        for opener, closer in (("[", "]"), ("{", "}")):
            start_index = result.find(opener)
            end_index = result.rfind(closer)
            if start_index != -1 and end_index > start_index:
                try:
                    return json.loads(result[start_index:end_index + 1])
                except json.JSONDecodeError:
                    continue
        raise ValueError(f"Invalid JSON output: {result}") from e
# Gradio user interface: prompt + URL inputs on the left, JSON result on
# the right; the Generate button wires both inputs into the scraper.
with gr.Blocks() as demo:
    gr.Markdown("<h1>Websites Scraper using Mistral AI</h1>")
    gr.Markdown("""
    This is a no code ML app for scraping <br>
    1. Just provide the Prompt, i.e., the items you want to scrape from the website <br>
    2. Provide the URL for the site you want to scrape, click Generate<br>
    And BOOM π₯ you can copy the result and view the execution details in the right side panel
    """)

    with gr.Row():
        with gr.Column():
            # Left column: the two user inputs and the trigger button.
            prompt_box = gr.Textbox(
                label="Prompt",
                value="List me all the hospital or clinic names and their opening closing time, if the mobile number is present provide it too."
            )
            url_box = gr.Textbox(
                label="Source URL",
                value="https://www.yelp.com/biz/all-smiles-dental-san-francisco-5?osq=dentist"
            )
            generate_btn = gr.Button("Generate")
        with gr.Column():
            # Right column: the scraped/summarized result as JSON.
            json_out = gr.JSON(label="Result")

    generate_btn.click(
        scrape_and_summarize,
        inputs=[prompt_box, url_box],
        outputs=[json_out]
    )

if __name__ == "__main__":
    demo.launch()