add XPath and CSS selector options
Browse files- README.md +52 -0
- app.py +66 -30
- requirements.txt +1 -0
README.md
CHANGED
@@ -11,3 +11,55 @@ short_description: URLs extrahieren
|
|
11 |
---
|
12 |
|
13 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
---
|
12 |
|
13 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
14 |
+
|
15 |
+
## Installation and Setup
|
16 |
+
|
17 |
+
1. **Clone the Repository**
|
18 |
+
Open a terminal and run the following command to clone the repository:
|
19 |
+
|
20 |
+
```bash
|
21 |
+
git clone https://huggingface.co/spaces/datenwerkzeuge/Webseiten-URL-Extraktor
|
22 |
+
```
|
23 |
+
|
24 |
+
2. **Navigate to the Streamlit Application Directory**
|
25 |
+
Change into the cloned repository folder, where the `app.py` and `requirements.txt` files are located:
|
26 |
+
|
27 |
+
```bash
|
28 |
+
cd Webseiten-URL-Extraktor
|
29 |
+
```
|
30 |
+
|
31 |
+
3. **Create and Activate a Virtual Environment (Optional but Recommended)**
|
32 |
+
It's good practice to use a virtual environment to manage dependencies. To create and activate one, use the following commands:
|
33 |
+
|
34 |
+
```bash
|
35 |
+
# Create a virtual environment
|
36 |
+
python -m venv venv
|
37 |
+
|
38 |
+
# Activate the virtual environment
|
39 |
+
# On Windows:
|
40 |
+
.\venv\Scripts\activate
|
41 |
+
|
42 |
+
# On macOS/Linux:
|
43 |
+
source venv/bin/activate
|
44 |
+
```
|
45 |
+
|
46 |
+
4. **Install the Required Packages**
|
47 |
+
Install the dependencies listed in `requirements.txt`:
|
48 |
+
|
49 |
+
```bash
|
50 |
+
pip install -r requirements.txt
|
51 |
+
```
|
52 |
+
|
53 |
+
5. **Run the Streamlit Application**
|
54 |
+
To start the Streamlit app, use the following command:
|
55 |
+
|
56 |
+
```bash
|
57 |
+
streamlit run app.py
|
58 |
+
```
|
59 |
+
|
60 |
+
6. **Access the Application**
|
61 |
+
Once the server starts, you can access the web application by visiting:
|
62 |
+
|
63 |
+
```
|
64 |
+
http://localhost:8501
|
65 |
+
```
|
app.py
CHANGED
@@ -1,26 +1,36 @@
|
|
1 |
import streamlit as st
|
2 |
import requests
|
3 |
from bs4 import BeautifulSoup
|
|
|
4 |
|
5 |
-
# Function to extract
|
6 |
-
def
|
7 |
try:
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
links = [a.get('href') for a in soup.find_all('a', href=True)]
|
12 |
return links
|
13 |
except Exception as e:
|
14 |
-
return f"Error processing the
|
|
|
15 |
|
16 |
-
# Function to extract
|
17 |
-
def
|
18 |
try:
|
19 |
-
soup = BeautifulSoup(
|
20 |
-
links = [a.
|
21 |
return links
|
22 |
except Exception as e:
|
23 |
-
return f"Error processing the
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
|
25 |
# Streamlit App
|
26 |
st.title("Webseiten-URL-Extraktor")
|
@@ -29,38 +39,64 @@ st.title("Webseiten-URL-Extraktor")
|
|
29 |
input_option = st.radio("Wählen Sie die Eingabemethode:", ("URL", "HTML-Datei hochladen"))
|
30 |
|
31 |
if input_option == "URL":
|
32 |
-
|
33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
|
35 |
-
if st.button("
|
36 |
if url_input:
|
37 |
-
st.write(f"Extrahiere
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
42 |
st.write(url)
|
|
|
|
|
43 |
else:
|
44 |
-
st.error(
|
45 |
else:
|
46 |
st.warning("Bitte geben Sie eine gültige URL ein.")
|
47 |
|
48 |
elif input_option == "HTML-Datei hochladen":
|
49 |
-
# File uploader for HTML files
|
50 |
uploaded_file = st.file_uploader("Laden Sie eine HTML-Datei hoch:", type="html")
|
51 |
|
52 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
53 |
if uploaded_file:
|
54 |
try:
|
55 |
html_content = uploaded_file.read().decode("utf-8")
|
56 |
-
st.write("Extrahiere
|
57 |
-
|
58 |
-
if
|
59 |
-
|
60 |
-
|
61 |
-
|
|
|
|
|
|
|
|
|
62 |
else:
|
63 |
-
st.error(
|
64 |
except Exception as e:
|
65 |
st.error(f"Fehler beim Verarbeiten der HTML-Datei: {e}")
|
66 |
else:
|
|
|
1 |
import streamlit as st
|
2 |
import requests
|
3 |
from bs4 import BeautifulSoup
|
4 |
+
from lxml import html
|
5 |
|
6 |
+
# Function to extract links using XPath
def extract_links_with_xpath(content, xpath):
    """Evaluate an XPath expression against HTML and return the matched values.

    Args:
        content: HTML markup to parse with ``lxml.html.fromstring``.
        xpath: XPath expression to evaluate on the parsed tree.

    Returns:
        On success, a list with one entry per XPath result:
        element matches contribute their ``.text`` (which may be ``None``),
        string results such as ``//a/@href`` or ``//a/text()`` are returned
        as plain strings, and any other value is converted with ``str``.
        On failure, a string starting with ``"Error processing the XPath:"``.
    """
    try:
        tree = html.fromstring(content)
        results = tree.xpath(xpath)

        # Expressions like count(...) or string(...) yield a single scalar
        # rather than a list; wrap it so the loop below handles both shapes.
        if not isinstance(results, list):
            results = [results]

        links = []
        for item in results:
            if isinstance(item, str):
                # Attribute and text() results are str subclasses without a
                # ``.text`` attribute; use the value itself.
                links.append(str(item))
            elif hasattr(item, "text"):
                links.append(item.text)
            else:
                links.append(str(item))
        return links
    except Exception as e:
        # Callers distinguish success from failure by the return type
        # (list vs. str), so errors are reported as a message string.
        return f"Error processing the XPath: {e}"
|
15 |
+
|
16 |
|
17 |
+
# Function to extract links using CSS selector
def extract_links_with_css(content, css_selector):
    """Return the text of every element matching a CSS selector.

    Args:
        content: HTML markup, parsed with BeautifulSoup's ``html.parser``.
        css_selector: CSS selector passed to ``soup.select``.

    Returns:
        A list holding the ``.text`` of each matched element, or a string
        starting with ``"Error processing the CSS selector:"`` if parsing
        or selection raised an exception.
    """
    try:
        matched = BeautifulSoup(content, 'html.parser').select(css_selector)
        texts = []
        for node in matched:
            texts.append(node.text)
    except Exception as err:
        # Failure is signalled by returning a str instead of a list.
        return f"Error processing the CSS selector: {err}"
    return texts
|
25 |
+
|
26 |
+
# Function to fetch the HTML of a webpage (links are extracted from it later)
def extract_urls_from_url(url):
    """Download a page and return its body text.

    Despite the name, this does not extract URLs itself; it returns the
    response text for the XPath/CSS helpers to process.

    Args:
        url: Address of the page to fetch with an HTTP GET request.

    Returns:
        The response body as a string, or a string starting with
        ``"Error processing the URL:"`` if the request failed, timed out,
        or returned an HTTP error status.

    NOTE(review): both outcomes are ``str``, so callers cannot tell success
    from failure by type alone; they would have to inspect the prefix.
    """
    try:
        # Without a timeout, requests can wait indefinitely on an
        # unresponsive server and stall the whole app run.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return response.text
    except Exception as e:
        return f"Error processing the URL: {e}"
|
34 |
|
35 |
# Streamlit App
|
36 |
st.title("Webseiten-URL-Extraktor")
|
|
|
39 |
input_option = st.radio("Wählen Sie die Eingabemethode:", ("URL", "HTML-Datei hochladen"))
|
40 |
|
41 |
if input_option == "URL":
|
42 |
+
url_input = st.text_input("Gib die URL der Webseite ein:", placeholder="https://bsenst.github.io/toscrape/", value="https://bsenst.github.io/toscrape/")
|
43 |
+
|
44 |
+
extraction_method = st.selectbox("Wählen Sie die Extraktionsmethode:", ("XPath", "CSS Selector"))
|
45 |
+
custom_input = None
|
46 |
+
|
47 |
+
if extraction_method == "XPath":
|
48 |
+
custom_input = st.text_input("Geben Sie den XPath-Ausdruck ein:", placeholder="//a[@href]", value="//a[@href]")
|
49 |
+
elif extraction_method == "CSS Selector":
|
50 |
+
custom_input = st.text_input("Geben Sie den CSS-Selektor ein:", placeholder="a[href]", value="a[href]")
|
51 |
|
52 |
+
if st.button("Extrahieren"):
|
53 |
if url_input:
|
54 |
+
st.write(f"Extrahiere von: {url_input}")
|
55 |
+
page_content = extract_urls_from_url(url_input)
|
56 |
+
|
57 |
+
if isinstance(page_content, str):
|
58 |
+
if extraction_method == "XPath":
|
59 |
+
links = extract_links_with_xpath(page_content, custom_input)
|
60 |
+
elif extraction_method == "CSS Selector":
|
61 |
+
links = extract_links_with_css(page_content, custom_input)
|
62 |
+
|
63 |
+
if isinstance(links, list):
|
64 |
+
for url in set(links):
|
65 |
st.write(url)
|
66 |
+
else:
|
67 |
+
st.error(links)
|
68 |
else:
|
69 |
+
st.error(page_content)
|
70 |
else:
|
71 |
st.warning("Bitte geben Sie eine gültige URL ein.")
|
72 |
|
73 |
elif input_option == "HTML-Datei hochladen":
|
|
|
74 |
uploaded_file = st.file_uploader("Laden Sie eine HTML-Datei hoch:", type="html")
|
75 |
|
76 |
+
extraction_method = st.selectbox("Wählen Sie die Extraktionsmethode:", ("XPath", "CSS Selector"))
|
77 |
+
custom_input = None
|
78 |
+
|
79 |
+
if extraction_method == "XPath":
|
80 |
+
custom_input = st.text_input("Geben Sie den XPath-Ausdruck ein:", placeholder="//a[@href]", value="//a[@href]")
|
81 |
+
elif extraction_method == "CSS Selector":
|
82 |
+
custom_input = st.text_input("Geben Sie den CSS-Selektor ein:", placeholder="a[href]", value="a[href]")
|
83 |
+
|
84 |
+
if st.button("Extrahieren"):
|
85 |
if uploaded_file:
|
86 |
try:
|
87 |
html_content = uploaded_file.read().decode("utf-8")
|
88 |
+
st.write("Extrahiere aus der hochgeladenen HTML-Datei...")
|
89 |
+
|
90 |
+
if extraction_method == "XPath":
|
91 |
+
links = extract_links_with_xpath(html_content, custom_input)
|
92 |
+
elif extraction_method == "CSS Selector":
|
93 |
+
links = extract_links_with_css(html_content, custom_input)
|
94 |
+
|
95 |
+
if isinstance(links, list):
|
96 |
+
for url in set(links):
|
97 |
+
st.write(url)
|
98 |
else:
|
99 |
+
st.error(links)
|
100 |
except Exception as e:
|
101 |
st.error(f"Fehler beim Verarbeiten der HTML-Datei: {e}")
|
102 |
else:
|
requirements.txt
CHANGED
@@ -1,2 +1,3 @@
|
|
1 |
streamlit
|
2 |
beautifulsoup4
|
|
|
|
1 |
streamlit
|
2 |
beautifulsoup4
|
3 |
+
lxml
|