IamVicky111 committed on
Commit
d2eaed3
·
verified ·
1 Parent(s): 028a1e6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +84 -37
app.py CHANGED
@@ -1,29 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
 
 
 
 
 
2
  from dotenv import load_dotenv
3
  from scrapegraphai.graphs import SmartScraperGraph
4
  from scrapegraphai.utils import prettify_exec_info
5
  from langchain_community.llms import HuggingFaceEndpoint
6
  from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
7
- import gradio as gr
8
- import subprocess
9
- import json
10
 
11
  # Ensure Playwright installs required browsers and dependencies
12
- subprocess.run(["playwright", "install"])
13
- #subprocess.run(["playwright", "install-deps"])
14
 
15
  # Load environment variables
16
  load_dotenv()
17
  HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
18
 
19
  # Initialize the model instances
20
- repo_id = "mistralai/Mistral-7B-Instruct-v0.2"
21
  llm_model_instance = HuggingFaceEndpoint(
22
- repo_id=repo_id, max_length=128, temperature=0.3, token=HUGGINGFACEHUB_API_TOKEN
 
 
 
23
  )
24
-
25
  embedder_model_instance = HuggingFaceInferenceAPIEmbeddings(
26
- api_key=HUGGINGFACEHUB_API_TOKEN, model_name="sentence-transformers/all-MiniLM-l6-v2"
 
27
  )
28
 
29
  graph_config = {
@@ -31,55 +49,84 @@ graph_config = {
31
  "embeddings": {"model_instance": embedder_model_instance}
32
  }
33
 
34
- def scrape_and_summarize(prompt, source):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  smart_scraper_graph = SmartScraperGraph(
36
  prompt=prompt,
37
  source=source,
38
  config=graph_config
39
  )
40
  result = smart_scraper_graph.run()
41
-
42
  # Ensure the result is properly formatted as JSON
43
  if isinstance(result, dict):
44
- result_json = result
45
- else:
46
- try:
47
- result_json = json.loads(result)
48
- except json.JSONDecodeError as e:
49
- # Attempt to extract JSON from the result
50
- start_index = result.find("[")
51
- end_index = result.rfind("]")
52
- if start_index != -1 and end_index != -1:
53
- json_str = result[start_index:end_index+1]
54
- try:
55
- result_json = json.loads(json_str)
56
- except json.JSONDecodeError as inner_e:
57
- raise ValueError(f"Invalid JSON output: {result}") from inner_e
58
- else:
59
- raise ValueError(f"Invalid JSON output: {result}") from e
60
-
61
- return result_json
62
-
63
- # Gradio interface
64
  with gr.Blocks() as demo:
65
  gr.Markdown("<h1>Websites Scraper using Mistral AI</h1>")
66
- gr.Markdown("""This is a no code ML app for scraping <br> 1. Just provide the Prompt, ie., the items you wanna Scrap from the website <br> 2. Provide the url for the site you wanna Scrap, click Generate<br> And BOOM 💥 you can copy the result and view the execution details in the right side pannel """)
 
 
 
 
 
67
 
68
  with gr.Row():
69
  with gr.Column():
70
- prompt_input = gr.Textbox(label="Prompt", value="List me all the hospital or clinic names and their opening closing time, if the mobile number is present provide it too.")
71
- source_input = gr.Textbox(label="Source URL", value="https://www.yelp.com/biz/all-smiles-dental-san-francisco-5?osq=dentist")
 
 
 
 
 
 
72
  scrape_button = gr.Button("Generate")
73
-
74
  with gr.Column():
75
  result_output = gr.JSON(label="Result")
76
-
77
  scrape_button.click(
78
  scrape_and_summarize,
79
  inputs=[prompt_input, source_input],
80
  outputs=[result_output]
81
  )
82
 
83
- # Launch the Gradio app
84
  if __name__ == "__main__":
85
  demo.launch()
 
1
+ """
2
+ Web Scraper and Summarizer using Mistral AI.
3
+
4
+ This module provides a Gradio-based web application for scraping websites
5
+ and summarizing content using the Mistral AI language model. It allows users
6
+ to input a prompt and a source URL, then generates a JSON output of the
7
+ scraped and summarized information.
8
+
9
+ Developer: Vicky_111
10
+ LinkedIn: https://www.linkedin.com/in/itz-me-vicky111/
11
+ """
12
+
13
  import os
14
+ import json
15
+ import subprocess
16
+ from typing import Dict, Any
17
+
18
+ import gradio as gr
19
  from dotenv import load_dotenv
20
  from scrapegraphai.graphs import SmartScraperGraph
21
  from scrapegraphai.utils import prettify_exec_info
22
  from langchain_community.llms import HuggingFaceEndpoint
23
  from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
 
 
 
24
 
25
  # Ensure Playwright installs required browsers and dependencies
26
+ subprocess.run(["playwright", "install"], check=True)
27
+ # subprocess.run(["playwright", "install-deps"])
28
 
29
  # Load environment variables
30
  load_dotenv()
31
  HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
32
 
33
  # Initialize the model instances
34
+ REPO_ID = "mistralai/Mistral-7B-Instruct-v0.2"
35
  llm_model_instance = HuggingFaceEndpoint(
36
+ repo_id=REPO_ID,
37
+ max_length=128,
38
+ temperature=0.3,
39
+ token=HUGGINGFACEHUB_API_TOKEN
40
  )
41
+ # Build the embedder from the Hugging Face Inference API embeddings
42
  embedder_model_instance = HuggingFaceInferenceAPIEmbeddings(
43
+ api_key=HUGGINGFACEHUB_API_TOKEN,
44
+ model_name="sentence-transformers/all-MiniLM-l6-v2"
45
  )
46
 
47
  graph_config = {
 
49
  "embeddings": {"model_instance": embedder_model_instance}
50
  }
51
 
52
+ # Scrape and summarise the content using SmartScraperGraph
53
+ def scrape_and_summarize(prompt: str, source: str) -> Dict[str, Any]:
54
+ """
55
+ Scrape a website and summarize its content based on a given prompt.
56
+
57
+ This function uses the SmartScraperGraph to scrape the provided URL
58
+ and generate a summary based on the given prompt. It ensures the output
59
+ is in a valid JSON format.
60
+
61
+ Args:
62
+ prompt (str): The prompt to guide the scraping and summarization.
63
+ source (str): The URL of the website to scrape.
64
+
65
+ Returns:
66
+ Dict[str, Any]: The parsed JSON result with the scraped and summarized
67
+ information (may be a list when JSON is recovered from a "[...]" span).
68
+
69
+ Raises:
70
+ ValueError: If the output cannot be parsed as valid JSON.
71
+ """
72
  smart_scraper_graph = SmartScraperGraph(
73
  prompt=prompt,
74
  source=source,
75
  config=graph_config
76
  )
77
  result = smart_scraper_graph.run()
78
+
79
  # Ensure the result is properly formatted as JSON
80
  if isinstance(result, dict):
81
+ return result
82
+
83
+ try:
84
+ return json.loads(result)
85
+ except json.JSONDecodeError as e:
86
+ # Attempt to extract JSON from the result
87
+ start_index = result.find("[")
88
+ end_index = result.rfind("]")
89
+ if start_index != -1 and end_index != -1:
90
+ json_str = result[start_index:end_index+1]
91
+ try:
92
+ return json.loads(json_str)
93
+ except json.JSONDecodeError as inner_e:
94
+ raise ValueError(f"Invalid JSON output: {result}") from inner_e
95
+ else:
96
+ raise ValueError(f"Invalid JSON output: {result}") from e
97
+
98
+
99
+ # Gradio User interface
 
100
  with gr.Blocks() as demo:
101
  gr.Markdown("<h1>Websites Scraper using Mistral AI</h1>")
102
+ gr.Markdown("""
103
+ This is a no code ML app for scraping <br>
104
+ 1. Just provide the Prompt, i.e., the items you want to scrape from the website <br>
105
+ 2. Provide the URL for the site you want to scrape, click Generate<br>
106
+ And BOOM 💥 you can copy the result and view the execution details in the right side panel
107
+ """)
108
 
109
  with gr.Row():
110
  with gr.Column():
111
+ prompt_input = gr.Textbox(
112
+ label="Prompt",
113
+ value="List me all the hospital or clinic names and their opening closing time, if the mobile number is present provide it too."
114
+ )
115
+ source_input = gr.Textbox(
116
+ label="Source URL",
117
+ value="https://www.yelp.com/biz/all-smiles-dental-san-francisco-5?osq=dentist"
118
+ )
119
  scrape_button = gr.Button("Generate")
120
+
121
  with gr.Column():
122
  result_output = gr.JSON(label="Result")
123
+
124
  scrape_button.click(
125
  scrape_and_summarize,
126
  inputs=[prompt_input, source_input],
127
  outputs=[result_output]
128
  )
129
 
130
+
131
  if __name__ == "__main__":
132
  demo.launch()