anshu-man853 committed on
Commit
9422afe
·
1 Parent(s): 7f15265

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +31 -0
  2. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import requests
3
+ from bs4 import BeautifulSoup
4
+ import re
5
+ import html
6
+
7
+ # Define the web scraping function
8
def scrape_website(url, timeout=10.0):
    """Download a web page and return its text content on a single line.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The text extracted from the page's HTML, with every run of
        whitespace collapsed to one space and the ends trimmed.

    Raises:
        requests.exceptions.RequestException: If the URL is invalid, the
            request fails or times out, or the server replies with an
            HTTP error status.
    """
    # requests waits indefinitely when no timeout is given, so an
    # unresponsive host would otherwise hang this call forever.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of returning the error page's text
    # as if it were the requested content.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, "html.parser")
    text = soup.get_text()

    # Collapse newlines, tabs and repeated spaces into single spaces.
    cleaned_text = re.sub(r"\s+", " ", text)
    # NOTE: the parser already decodes HTML entities; this second pass only
    # changes text that was double-escaped in the page source.
    cleaned_text = html.unescape(cleaned_text)
    return cleaned_text.strip()
20
+
21
+ # Create a Gradio interface
22
# One text box takes the URL; one text box shows the scraped text.
iface = gr.Interface(
    fn=scrape_website,
    inputs="text",
    outputs="text",
    title="Web Scraping",
    description="Enter a website URL to scrape its text",
    # The keyword is ``examples`` (plural) and expects a list of sample
    # inputs; the singular ``example`` is not an Interface parameter.
    examples=["https://www.example.com"],
    # Example caching runs the function on every example at startup (and is
    # the default on Hugging Face Spaces); disable it so launching the app
    # does not depend on a live network fetch.
    cache_examples=False,
)

iface.launch()
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ gradio
2
+ requests
3
+ beautifulsoup4