File size: 4,428 Bytes
d2eaed3
 
 
 
 
 
 
 
 
 
 
 
8b741f1
d2eaed3
 
 
 
 
8b741f1
 
 
 
 
6cdcd01
8b741f1
d2eaed3
 
8b741f1
 
 
 
 
 
d2eaed3
8b741f1
d2eaed3
 
 
 
8b741f1
d2eaed3
8b741f1
d2eaed3
 
8b741f1
 
 
 
4c71ef0
6cdcd01
b7ea448
d2eaed3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8b741f1
 
 
 
 
 
d2eaed3
4c71ef0
 
d2eaed3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8b741f1
be551d0
d2eaed3
 
 
 
 
 
4c71ef0
8b741f1
 
d2eaed3
 
 
 
 
 
 
 
6084635
d2eaed3
8b741f1
 
d2eaed3
8b741f1
 
 
028a1e6
8b741f1
 
d2eaed3
8b741f1
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
"""
Web Scraper and Summarizer using Mistral AI.

This module provides a Gradio-based web application for scraping websites
and summarizing content using the Mistral AI language model. It allows users
to input a prompt and a source URL, then generates a JSON output of the
scraped and summarized information.

Developer: Vicky_111
LinkedIn: https://www.linkedin.com/in/itz-me-vicky111/
"""

import os
import json
import subprocess
from typing import Dict, Any

import gradio as gr
from dotenv import load_dotenv
from scrapegraphai.graphs import SmartScraperGraph
from scrapegraphai.utils import prettify_exec_info
from langchain_community.llms import HuggingFaceEndpoint
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings

# Download the browsers Playwright needs before anything else runs;
# check=True aborts start-up if the install command fails.
# NOTE: `playwright install-deps` (system packages) is intentionally not run.
subprocess.run(["playwright", "install"], check=True)

# Read variables from a .env file (if present), then pick up the API token
load_dotenv()
HUGGINGFACEHUB_API_TOKEN = os.environ.get("HUGGINGFACEHUB_API_TOKEN")

# Text-generation model used by the scraper graph
REPO_ID = "mistralai/Mistral-7B-Instruct-v0.2"
llm_model_instance = HuggingFaceEndpoint(
    repo_id=REPO_ID, max_length=128, temperature=0.3, token=HUGGINGFACEHUB_API_TOKEN
)

# Embeddings are computed through the Hugging Face Inference API
embedder_model_instance = HuggingFaceInferenceAPIEmbeddings(
    api_key=HUGGINGFACEHUB_API_TOKEN,
    model_name="sentence-transformers/all-MiniLM-l6-v2",
)

# Configuration handed to SmartScraperGraph: pre-built model instances
# for both the LLM and the embedder.
graph_config = dict(
    llm=dict(model_instance=llm_model_instance),
    embeddings=dict(model_instance=embedder_model_instance),
)

def _parse_json_text(text: str) -> Any:
    """
    Decode JSON from model output that may be wrapped in extra prose.

    The whole string is tried first. If that fails, the outermost
    ``{...}`` and ``[...]`` spans are tried, starting with whichever
    opening delimiter appears first in the text, so that an object
    containing an array is not reduced to its inner array.

    Args:
        text (str): Raw text produced by the scraper graph.

    Returns:
        Any: The decoded JSON value.

    Raises:
        ValueError: If neither the text nor any bracketed span is valid JSON.
    """
    try:
        return json.loads(text)
    except json.JSONDecodeError as first_error:
        # Keep a reference: the `as` name is cleared when the clause ends.
        last_error = first_error

    # Collect (start offset, candidate substring) for each delimiter pair.
    candidates = []
    for opener, closer in (("{", "}"), ("[", "]")):
        start = text.find(opener)
        end = text.rfind(closer)
        if start != -1 and end > start:
            candidates.append((start, text[start:end + 1]))

    # Earliest opening delimiter first: it marks the outermost structure.
    for _, snippet in sorted(candidates):
        try:
            return json.loads(snippet)
        except json.JSONDecodeError as exc:
            last_error = exc

    raise ValueError(f"Invalid JSON output: {text}") from last_error


# The SmartScraperGraph scrapes the page and summarises it per the prompt
def scrape_and_summarize(prompt: str, source: str) -> Dict[str, Any]:
    """
    Scrape a website and summarize its content based on a given prompt.

    This function runs a SmartScraperGraph over the provided URL with the
    module-level ``graph_config`` and makes sure the output is
    JSON-compatible before returning it.

    Args:
        prompt (str): The prompt to guide the scraping and summarization.
        source (str): The URL of the website to scrape.

    Returns:
        Dict[str, Any]: The scraped and summarized information. When the
        graph (or the JSON it emits) yields a top-level array, that list
        is returned unchanged.

    Raises:
        ValueError: If the output cannot be parsed as valid JSON.
    """
    smart_scraper_graph = SmartScraperGraph(
        prompt=prompt,
        source=source,
        config=graph_config
    )
    result = smart_scraper_graph.run()

    # Already-structured results need no parsing.
    if isinstance(result, (dict, list)):
        return result

    # json.loads accepts bytes, but the substring fallback needs text.
    if isinstance(result, (bytes, bytearray)):
        result = result.decode("utf-8", errors="replace")

    # Anything else (None, numbers, ...) cannot hold a JSON document;
    # report it with the documented exception type instead of a TypeError.
    if not isinstance(result, str):
        raise ValueError(f"Invalid JSON output: {result}")

    return _parse_json_text(result)


# Gradio front end: inputs in the left column, JSON result in the right
with gr.Blocks() as demo:
    # Page title and short usage instructions
    gr.Markdown("<h1>Websites Scraper using Mistral AI</h1>")
    gr.Markdown("""
    This is a no code ML app for scraping <br>
    1. Just provide the Prompt, i.e., the items you want to scrape from the website <br>
    2. Provide the URL for the site you want to scrape, click Generate<br>
    And BOOM 💥 you can copy the result and view the execution details in the right side panel
    """)

    with gr.Row():
        # Left column: what to extract, where to extract it from, and the trigger
        with gr.Column():
            prompt_input = gr.Textbox(
                value="List me all the hospital or clinic names and their opening closing time, if the mobile number is present provide it too.",
                label="Prompt",
            )
            source_input = gr.Textbox(
                value="https://www.yelp.com/biz/all-smiles-dental-san-francisco-5?osq=dentist",
                label="Source URL",
            )
            scrape_button = gr.Button(value="Generate")

        # Right column: the parsed result rendered as JSON
        with gr.Column():
            result_output = gr.JSON(label="Result")

    # Clicking "Generate" sends both textbox values to scrape_and_summarize
    # and shows its return value in the JSON viewer.
    scrape_button.click(
        fn=scrape_and_summarize,
        inputs=[prompt_input, source_input],
        outputs=[result_output],
    )


# Start the web server only when executed as a script
if __name__ == "__main__":
    demo.launch()