# Spaces: IamVicky111 / MistralScrapy (Sleeping)
# File size: 3,115 Bytes

import os
from dotenv import load_dotenv
from scrapegraphai.graphs import SmartScraperGraph
from scrapegraphai.utils import prettify_exec_info
from langchain_community.llms import HuggingFaceEndpoint
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
import gradio as gr
import subprocess
import redis
from langchain_community.vectorstores.redis import RedisVectorStore

# Using the Mistral model.
# Ensure Playwright has installed the browsers it needs before any scraping.
playwright_install = subprocess.run(["playwright", "install"])
if playwright_install.returncode != 0:
    # Startup still continues (as it did before), but the failure is now
    # reported instead of being silently discarded.
    print(
        "Warning: `playwright install` exited with code "
        f"{playwright_install.returncode}; scraping may not work."
    )
# The system-dependency step below is left disabled.
#subprocess.run(["playwright", "install-deps"])

# Load variables from the environment file, then read the Hugging Face token.
load_dotenv()
HUGGINGFACEHUB_API_TOKEN = os.environ.get("HUGGINGFACEHUB_API_TOKEN")

# Language model: a Hugging Face endpoint for the Mistral instruct repo.
repo_id = "mistralai/Mistral-7B-Instruct-v0.2"
llm_model_instance = HuggingFaceEndpoint(
    repo_id=repo_id,
    max_length=128,
    temperature=0.5,
    token=HUGGINGFACEHUB_API_TOKEN,
)

# Embeddings: a sentence-transformers model via the Hugging Face Inference API.
embedder_model_instance = HuggingFaceInferenceAPIEmbeddings(
    api_key=HUGGINGFACEHUB_API_TOKEN,
    model_name="sentence-transformers/all-MiniLM-l6-v2",
)


# Redis client for the vector store; connects to localhost on port 6379.
r = redis.Redis(host="localhost", port=6379)
# NOTE(review): confirm that `langchain_community.vectorstores.redis` really
# exports `RedisVectorStore` and that it accepts a `redis=` client argument.
vector_store = RedisVectorStore(redis=r)

# Configuration passed to SmartScraperGraph as its `config` argument.
# NOTE(review): verify that the installed scrapegraphai version reads the
# "vector_store" key; the "llm" and "embeddings" keys carry model instances.
graph_config = {
    "llm": {"model_instance": llm_model_instance},
    "embeddings": {"model_instance": embedder_model_instance},
    "vector_store": {"model_instance": vector_store},
}
# Scrape the source and summarize it
def scrape_and_summarize(prompt, source):
    """Run a SmartScraperGraph over ``source`` and return its output.

    Args:
        prompt: Instruction handed to the graph as its ``prompt``.
        source: Value handed to the graph as its ``source`` (the UI
            supplies a URL here).

    Returns:
        A ``(result, info)`` tuple: the value returned by the graph's
        ``run()`` and the graph's execution info after it has been passed
        through ``prettify_exec_info``.
    """
    scraper = SmartScraperGraph(prompt=prompt, source=source, config=graph_config)
    answer = scraper.run()
    # Execution info is fetched only after the run has finished.
    pretty_info = prettify_exec_info(scraper.get_execution_info())
    return answer, pretty_info

# Gradio User interface
# Long description shown under the page title.
app_description = """Effortlessly extract and condense web content using cutting-edge AI models from the Hugging Face Hub—no coding required! Simply provide your desired prompt and source URL to begin. This no-code solution is inspired by the impressive library ScrapeGraphAI, and while it’s currently a basic demo, we encourage contributions to enhance its utility!"""

with gr.Blocks() as demo:
    # Header: title line followed by the description.
    gr.Markdown("A project on WEB-SCRAPING using Mistral model")
    gr.Markdown(app_description)
    # ScrapeGraphAI (https://github.com/VinciGit00/Scrapegraph-ai) was suggested by the tutor.

    with gr.Row():
        # Left column: inputs and the trigger button.
        with gr.Column():
            # Shown for information only; it is not passed to the click handler.
            model_dropdown = gr.Textbox(
                label="Model",
                value="Mistral-7B-Instruct-v0.2, As all-MiniLM-l6-v2",
            )
            prompt_input = gr.Textbox(
                label="Prompt",
                value="List me all the doctors name and their timing",
            )
            source_input = gr.Textbox(
                label="Source URL",
                value="https://www.yelp.com/search?find_desc=dentist&find_loc=San+Francisco%2C+CA",
            )
            scrape_button = gr.Button("Scrape the data")

        # Right column: scrape result and execution info.
        with gr.Column():
            result_output = gr.JSON(label="Result")
            exec_info_output = gr.Textbox(label="Output Info")

    # Wire the button: (prompt, source) in, (result, execution info) out.
    scrape_button.click(
        fn=scrape_and_summarize,
        inputs=[prompt_input, source_input],
        outputs=[result_output, exec_info_output],
    )

# Start the Gradio app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()