File size: 3,521 Bytes
8b741f1
 
 
 
 
 
 
 
4c71ef0
6cdcd01
8b741f1
 
 
 
 
 
 
 
 
 
 
 
 
4c71ef0
8b741f1
 
 
 
 
 
4c71ef0
6cdcd01
b7ea448
8b741f1
 
 
 
 
 
 
 
4c71ef0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8b741f1
4c71ef0
 
 
8b741f1
4c71ef0
 
 
 
 
8b741f1
 
4c71ef0
 
 
 
8b741f1
 
 
4c71ef0
8b741f1
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import os
from dotenv import load_dotenv
from scrapegraphai.graphs import SmartScraperGraph
from scrapegraphai.utils import prettify_exec_info
from langchain_community.llms import HuggingFaceEndpoint
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
import gradio as gr
import subprocess
import json

# Run the Playwright CLI's "install" command when this module is imported.
# The exit status is not checked, so a failed install does not stop startup.
subprocess.run(["playwright", "install"], check=False)
#subprocess.run(["playwright", "install-deps"])

# Load environment variables, then read the Hugging Face token from them.
# os.getenv yields None when the variable is not set.
load_dotenv()
HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")

# Model instances shared by every scrape request.
repo_id = "mistralai/Mistral-7B-Instruct-v0.2"
llm_model_instance = HuggingFaceEndpoint(
    repo_id=repo_id,
    max_length=128,
    temperature=0.5,
    token=HUGGINGFACEHUB_API_TOKEN,
)

embedder_model_instance = HuggingFaceInferenceAPIEmbeddings(
    api_key=HUGGINGFACEHUB_API_TOKEN,
    model_name="sentence-transformers/all-MiniLM-l6-v2",
)

# Configuration handed to SmartScraperGraph: both models are passed as
# ready-made instances under the "model_instance" key.
graph_config = {
    "llm": {
        "model_instance": llm_model_instance,
    },
    "embeddings": {
        "model_instance": embedder_model_instance,
    },
}

def _parse_json_output(raw):
    """Coerce a scraper result into a JSON-compatible Python value.

    Already-decoded containers (dict or list) are returned unchanged. Text is
    parsed with ``json.loads``; if that fails, the outermost ``[...]`` or
    ``{...}`` span embedded in the text is parsed instead, so JSON wrapped in
    surrounding prose can still be recovered.

    Raises:
        ValueError: if no valid JSON can be obtained from ``raw``.
    """
    if isinstance(raw, (dict, list)):
        return raw
    if isinstance(raw, (bytes, bytearray)):
        # Work on text so the substring search below behaves uniformly.
        raw = raw.decode("utf-8", errors="replace")
    if not isinstance(raw, str):
        raise ValueError(f"Invalid JSON output: {raw}")

    try:
        return json.loads(raw)
    except json.JSONDecodeError as err:
        last_error = err

    # Collect candidate spans for both arrays and objects. The closing
    # delimiter must come after the opening one for the span to be usable.
    spans = []
    for opener, closer in (("[", "]"), ("{", "}")):
        start = raw.find(opener)
        end = raw.rfind(closer)
        if start != -1 and end > start:
            spans.append((start, end))

    # Try the span that starts earliest first so the outermost structure wins
    # over a fragment nested inside it; fall back to the other on failure.
    for start, end in sorted(spans):
        try:
            return json.loads(raw[start:end + 1])
        except json.JSONDecodeError as err:
            last_error = err

    raise ValueError(f"Invalid JSON output: {raw}") from last_error


def scrape_and_summarize(prompt, source):
    """Run a SmartScraperGraph over ``source`` and return its output.

    Args:
        prompt: Instruction passed to the graph as ``prompt``.
        source: Value passed to the graph as ``source`` (the UI supplies a URL).

    Returns:
        A ``(result_json, exec_info)`` tuple: the graph result coerced to a
        JSON-compatible value, and the graph's execution info passed through
        ``prettify_exec_info``.

    Raises:
        ValueError: if the graph result cannot be interpreted as JSON.
    """
    smart_scraper_graph = SmartScraperGraph(
        prompt=prompt,
        source=source,
        config=graph_config,
    )
    result = smart_scraper_graph.run()
    exec_info = smart_scraper_graph.get_execution_info()

    return _parse_json_output(result), prettify_exec_info(exec_info)

# Gradio interface: two columns, inputs on the left and outputs on the right.
with gr.Blocks() as demo:
    gr.Markdown("# Scrape websites, no-code version")
    gr.Markdown("""Easily scrape and summarize web content using advanced AI models on the Hugging Face Hub without writing any code. Input your desired prompt and source URL to get started.
                This is a no-code version of the excellent lib [ScrapeGraphAI](https://github.com/VinciGit00/Scrapegraph-ai).
                It's a basic demo and a work in progress. Please contribute to it to make it more useful!""")

    with gr.Row():
        with gr.Column():
            # This field is informational only: it is not listed in the click
            # handler's inputs, so it is read-only to avoid suggesting that
            # editing it changes the model in use.
            model_dropdown = gr.Textbox(
                label="Model",
                value="Mistral-7B-Instruct-v0.2",
                interactive=False,
            )
            prompt_input = gr.Textbox(label="Prompt", value="List me all the press releases with their headlines and urls.")
            source_input = gr.Textbox(label="Source URL", value="https://www.whitehouse.gov/")
            scrape_button = gr.Button("Scrape and Summarize")

        with gr.Column():
            result_output = gr.JSON(label="Result")
            exec_info_output = gr.Textbox(label="Execution Info")

    # The handler receives (prompt, source) and returns (result, exec info),
    # matching the order of the components listed here.
    scrape_button.click(
        scrape_and_summarize,
        inputs=[prompt_input, source_input],
        outputs=[result_output, exec_info_output],
    )

# Launch the Gradio app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()