bsenst commited on
Commit
9bc8ed7
·
1 Parent(s): efb0e4e

add XPath and CSS selector options

Browse files
Files changed (3) hide show
  1. README.md +52 -0
  2. app.py +66 -30
  3. requirements.txt +1 -0
README.md CHANGED
@@ -11,3 +11,55 @@ short_description: URLs extrahieren
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
14
+
15
+ ## Installation and Setup
16
+
17
+ 1. **Clone the Repository**
18
+ Open a terminal and run the following command to clone the repository:
19
+
20
+ ```bash
21
+ git clone https://huggingface.co/spaces/datenwerkzeuge/Webseiten-URL-Extraktor
22
+ ```
23
+
24
+ 2. **Navigate to the Streamlit Application Directory**
25
+ Change your directory to the cloned repository folder, where the `app.py` and `requirements.txt` files are located:
26
+
27
+ ```bash
28
+ cd Webseiten-URL-Extraktor
29
+ ```
30
+
31
+ 3. **Create and Activate a Virtual Environment (Optional but Recommended)**
32
+ It's a good practice to use a virtual environment to manage dependencies. To create and activate a virtual environment, use the following commands:
33
+
34
+ ```bash
35
+ # Create a virtual environment
36
+ python -m venv venv
37
+
38
+ # Activate the virtual environment
39
+ # On Windows:
40
+ .\venv\Scripts\activate
41
+
42
+ # On macOS/Linux:
43
+ source venv/bin/activate
44
+ ```
45
+
46
+ 4. **Install the Required Packages**
47
+ Install the dependencies listed in `requirements.txt`:
48
+
49
+ ```bash
50
+ pip install -r requirements.txt
51
+ ```
52
+
53
+ 5. **Run the Streamlit Application**
54
+ To start the Streamlit app, use the following command:
55
+
56
+ ```bash
57
+ streamlit run app.py
58
+ ```
59
+
60
+ 6. **Access the Application**
61
+ Once the server starts, you can access the web application by visiting:
62
+
63
+ ```
64
+ http://localhost:8501
65
+ ```
app.py CHANGED
@@ -1,26 +1,36 @@
1
  import streamlit as st
2
  import requests
3
  from bs4 import BeautifulSoup
 
4
 
5
- # Function to extract all URLs from a webpage
6
- def extract_urls_from_url(url):
7
  try:
8
- response = requests.get(url)
9
- response.raise_for_status()
10
- soup = BeautifulSoup(response.text, 'html.parser')
11
- links = [a.get('href') for a in soup.find_all('a', href=True)]
12
  return links
13
  except Exception as e:
14
- return f"Error processing the URL: {e}"
 
15
 
16
- # Function to extract all URLs from uploaded HTML content
17
- def extract_urls_from_html(html_content):
18
  try:
19
- soup = BeautifulSoup(html_content, 'html.parser')
20
- links = [a.get('href') for a in soup.find_all('a', href=True)]
21
  return links
22
  except Exception as e:
23
- return f"Error processing the HTML file: {e}"
 
 
 
 
 
 
 
 
 
24
 
25
  # Streamlit App
26
  st.title("Webseiten-URL-Extraktor")
@@ -29,38 +39,64 @@ st.title("Webseiten-URL-Extraktor")
29
  input_option = st.radio("Wählen Sie die Eingabemethode:", ("URL", "HTML-Datei hochladen"))
30
 
31
  if input_option == "URL":
32
- # Input field for URL
33
- url_input = st.text_input("Gib die URL der Webseite ein:", placeholder="https://example.com")
 
 
 
 
 
 
 
34
 
35
- if st.button("URLs extrahieren"):
36
  if url_input:
37
- st.write(f"Extrahiere URLs von: {url_input}")
38
- urls = extract_urls_from_url(url_input)
39
- if isinstance(urls, list):
40
- for url in set(urls):
41
- if url.startswith("http"):
 
 
 
 
 
 
42
  st.write(url)
 
 
43
  else:
44
- st.error(urls)
45
  else:
46
  st.warning("Bitte geben Sie eine gültige URL ein.")
47
 
48
  elif input_option == "HTML-Datei hochladen":
49
- # File uploader for HTML files
50
  uploaded_file = st.file_uploader("Laden Sie eine HTML-Datei hoch:", type="html")
51
 
52
- if st.button("URLs extrahieren"):
 
 
 
 
 
 
 
 
53
  if uploaded_file:
54
  try:
55
  html_content = uploaded_file.read().decode("utf-8")
56
- st.write("Extrahiere URLs aus der hochgeladenen HTML-Datei...")
57
- urls = extract_urls_from_html(html_content)
58
- if isinstance(urls, list):
59
- for url in set(urls):
60
- if url.startswith("http"):
61
- st.write(url)
 
 
 
 
62
  else:
63
- st.error(urls)
64
  except Exception as e:
65
  st.error(f"Fehler beim Verarbeiten der HTML-Datei: {e}")
66
  else:
 
1
  import streamlit as st
2
  import requests
3
  from bs4 import BeautifulSoup
4
+ from lxml import html
5
 
6
# Extract link text (or attribute values) from HTML using an XPath expression.
def extract_links_with_xpath(content, xpath):
    """Return a list of strings selected by *xpath* from the HTML in *content*.

    Element results contribute their ``.text`` (which may be ``None`` for
    elements without direct text); string results — e.g. attribute values
    from ``//a/@href`` or ``text()`` nodes — are returned as-is instead of
    crashing on a missing ``.text`` attribute.

    On any parse/XPath error a human-readable error string is returned
    instead of a list; callers distinguish success from failure with
    ``isinstance(result, list)``.
    """
    try:
        tree = html.fromstring(content)
        results = tree.xpath(xpath)
        # lxml's xpath() can yield elements OR plain strings (smart strings
        # for attribute/text results) — handle both result kinds.
        links = [item if isinstance(item, str) else item.text for item in results]
        return links
    except Exception as e:
        return f"Error processing the XPath: {e}"
15
+
16
 
17
+ # Function to extract links using CSS selector
18
+ def extract_links_with_css(content, css_selector):
19
  try:
20
+ soup = BeautifulSoup(content, 'html.parser')
21
+ links = [a.text for a in soup.select(css_selector)]
22
  return links
23
  except Exception as e:
24
+ return f"Error processing the CSS selector: {e}"
25
+
26
# Fetch a web page and return its raw HTML text.
# (Name kept for backward compatibility — despite the name, this no longer
# extracts URLs itself; the XPath/CSS helpers do that on the returned text.)
def extract_urls_from_url(url, timeout=10):
    """Download *url* and return the response body as text.

    An explicit *timeout* (seconds, default 10) is applied because
    ``requests.get`` without one can block the app indefinitely on a
    stalled server.  On any network/HTTP error a human-readable error
    string is returned instead of the page content.

    NOTE(review): both success and failure return ``str``, so callers
    cannot distinguish them via ``isinstance`` — verify the caller's
    ``isinstance(page_content, str)`` check really does what is intended.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # surface 4xx/5xx as exceptions
        return response.text
    except Exception as e:
        return f"Error processing the URL: {e}"
34
 
35
  # Streamlit App
36
  st.title("Webseiten-URL-Extraktor")
 
39
  input_option = st.radio("Wählen Sie die Eingabemethode:", ("URL", "HTML-Datei hochladen"))
40
 
41
  if input_option == "URL":
42
+ url_input = st.text_input("Gib die URL der Webseite ein:", placeholder="https://bsenst.github.io/toscrape/", value="https://bsenst.github.io/toscrape/")
43
+
44
+ extraction_method = st.selectbox("Wählen Sie die Extraktionsmethode:", ("XPath", "CSS Selector"))
45
+ custom_input = None
46
+
47
+ if extraction_method == "XPath":
48
+ custom_input = st.text_input("Geben Sie den XPath-Ausdruck ein:", placeholder="//a[@href]", value="//a[@href]")
49
+ elif extraction_method == "CSS Selector":
50
+ custom_input = st.text_input("Geben Sie den CSS-Selektor ein:", placeholder="a[href]", value="a[href]")
51
 
52
+ if st.button("Extrahieren"):
53
  if url_input:
54
+ st.write(f"Extrahiere von: {url_input}")
55
+ page_content = extract_urls_from_url(url_input)
56
+
57
+ if isinstance(page_content, str):
58
+ if extraction_method == "XPath":
59
+ links = extract_links_with_xpath(page_content, custom_input)
60
+ elif extraction_method == "CSS Selector":
61
+ links = extract_links_with_css(page_content, custom_input)
62
+
63
+ if isinstance(links, list):
64
+ for url in set(links):
65
  st.write(url)
66
+ else:
67
+ st.error(links)
68
  else:
69
+ st.error(page_content)
70
  else:
71
  st.warning("Bitte geben Sie eine gültige URL ein.")
72
 
73
  elif input_option == "HTML-Datei hochladen":
 
74
  uploaded_file = st.file_uploader("Laden Sie eine HTML-Datei hoch:", type="html")
75
 
76
+ extraction_method = st.selectbox("Wählen Sie die Extraktionsmethode:", ("XPath", "CSS Selector"))
77
+ custom_input = None
78
+
79
+ if extraction_method == "XPath":
80
+ custom_input = st.text_input("Geben Sie den XPath-Ausdruck ein:", placeholder="//a[@href]", value="//a[@href]")
81
+ elif extraction_method == "CSS Selector":
82
+ custom_input = st.text_input("Geben Sie den CSS-Selektor ein:", placeholder="a[href]", value="a[href]")
83
+
84
+ if st.button("Extrahieren"):
85
  if uploaded_file:
86
  try:
87
  html_content = uploaded_file.read().decode("utf-8")
88
+ st.write("Extrahiere aus der hochgeladenen HTML-Datei...")
89
+
90
+ if extraction_method == "XPath":
91
+ links = extract_links_with_xpath(html_content, custom_input)
92
+ elif extraction_method == "CSS Selector":
93
+ links = extract_links_with_css(html_content, custom_input)
94
+
95
+ if isinstance(links, list):
96
+ for url in set(links):
97
+ st.write(url)
98
  else:
99
+ st.error(links)
100
  except Exception as e:
101
  st.error(f"Fehler beim Verarbeiten der HTML-Datei: {e}")
102
  else:
requirements.txt CHANGED
@@ -1,2 +1,3 @@
1
  streamlit
2
  beautifulsoup4
 
 
1
  streamlit
2
  beautifulsoup4
3
+ lxml