File size: 4,051 Bytes
dced2cd
 
 
9bc8ed7
dced2cd
9bc8ed7
 
dced2cd
9bc8ed7
 
 
dced2cd
 
9bc8ed7
 
efb0e4e
9bc8ed7
 
efb0e4e
9bc8ed7
 
efb0e4e
 
9bc8ed7
 
 
 
 
 
 
 
 
 
dced2cd
 
 
 
efb0e4e
 
 
 
9bc8ed7
 
 
 
 
 
 
 
 
efb0e4e
9bc8ed7
efb0e4e
9bc8ed7
 
 
 
 
 
 
 
 
 
 
efb0e4e
9bc8ed7
 
efb0e4e
9bc8ed7
efb0e4e
 
 
 
 
 
9bc8ed7
 
 
 
 
 
 
 
 
efb0e4e
 
 
9bc8ed7
 
 
 
 
 
 
 
 
 
efb0e4e
9bc8ed7
efb0e4e
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import streamlit as st
import requests
from bs4 import BeautifulSoup
from lxml import html

# Function to extract link texts (or attribute/text values) using XPath
def extract_links_with_xpath(content, xpath):
    """Evaluate an XPath expression against HTML and return the matches.

    Args:
        content: HTML markup, parsed with ``lxml.html.fromstring``.
        xpath: XPath expression evaluated against the parsed document.

    Returns:
        A list with one entry per match. Matched elements contribute their
        ``.text`` (``None`` when the element has no leading text); string
        results such as those of ``//a/@href`` or ``//a/text()`` contribute
        the string itself; a scalar result (``count(...)``, ``string(...)``,
        a boolean) becomes a one-item list holding its ``str()`` form.
        If parsing or evaluation raises, a string starting with
        ``"Error processing the XPath: "`` is returned instead of a list.
    """
    try:
        tree = html.fromstring(content)
        result = tree.xpath(xpath)

        # XPath functions return a bare scalar rather than a node list.
        if not isinstance(result, list):
            result = [result]

        links = []
        for item in result:
            if isinstance(item, str):
                # Attribute and text() matches are str subclasses without a
                # `.text` attribute; convert them to plain strings.
                links.append(str(item))
            elif isinstance(item, (bool, int, float)):
                links.append(str(item))
            else:
                links.append(item.text)
        return links
    except Exception as e:
        # Callers distinguish success from failure by list vs. str.
        return f"Error processing the XPath: {e}"


# Function to collect the text of every node matching a CSS selector
def extract_links_with_css(content, css_selector):
    """Return the text of each element that matches a CSS selector.

    Args:
        content: HTML markup, parsed by BeautifulSoup with ``html.parser``.
        css_selector: Selector handed to ``select`` on the parsed document.

    Returns:
        A list holding the ``.text`` of every matched element, in the order
        ``select`` returned them. If anything raises, a string starting with
        ``"Error processing the CSS selector: "`` is returned instead.
    """
    try:
        parsed = BeautifulSoup(content, 'html.parser')
        matches = parsed.select(css_selector)
        texts = []
        for tag in matches:
            texts.append(tag.text)
        return texts
    except Exception as err:
        # Callers distinguish success from failure by list vs. str.
        return f"Error processing the CSS selector: {err}"

# Function to download the HTML source of a webpage
def extract_urls_from_url(url, timeout=10):
    """Fetch a page over HTTP and return its body as text.

    Despite its name, this function does not extract URLs; it only downloads
    the document so that one of the extractor functions can process it.

    Args:
        url: Address of the page to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on an
            unresponsive host.

    Returns:
        The response body (``response.text``) on success. If the request
        raises or the server answers with an HTTP error status, a string
        starting with ``"Error processing the URL: "`` is returned instead.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Turn 4xx/5xx answers into exceptions so they are reported as errors.
        response.raise_for_status()
        return response.text
    except Exception as e:
        return f"Error processing the URL: {e}"

# Streamlit App
st.title("Webseiten-URL-Extraktor")

# extract_urls_from_url reports failures as a string with this prefix. Both
# the page body and the error are str, so the prefix is the only way to tell
# them apart without changing that function's return type.
_FETCH_ERROR_PREFIX = "Error processing the URL:"


def _choose_extraction():
    """Render the method selector and its expression field; return both values."""
    method = st.selectbox("Wählen Sie die Extraktionsmethode:", ("XPath", "CSS Selector"))
    if method == "XPath":
        expression = st.text_input("Geben Sie den XPath-Ausdruck ein:", placeholder="//a[@href]", value="//a[@href]")
    else:
        expression = st.text_input("Geben Sie den CSS-Selektor ein:", placeholder="a[href]", value="a[href]")
    return method, expression


def _show_links(markup, method, expression):
    """Run the selected extractor on the markup and render results or the error."""
    if method == "XPath":
        links = extract_links_with_xpath(markup, expression)
    else:
        links = extract_links_with_css(markup, expression)

    if isinstance(links, list):
        # dict.fromkeys removes duplicates while keeping document order;
        # a set would display the results in arbitrary order.
        for link in dict.fromkeys(links):
            st.write(link)
    else:
        # The extractors return an error message string on failure.
        st.error(links)


# Input options: URL or HTML file
input_option = st.radio("Wählen Sie die Eingabemethode:", ("URL", "HTML-Datei hochladen"))

if input_option == "URL":
    url_input = st.text_input("Gib die URL der Webseite ein:", placeholder="https://bsenst.github.io/toscrape/", value="https://bsenst.github.io/toscrape/")

    extraction_method, custom_input = _choose_extraction()

    if st.button("Extrahieren"):
        # Treat whitespace-only input the same as an empty field.
        target_url = url_input.strip()
        if target_url:
            st.write(f"Extrahiere von: {target_url}")
            page_content = extract_urls_from_url(target_url)

            if page_content.startswith(_FETCH_ERROR_PREFIX):
                # Download failed: show the error instead of parsing it as HTML.
                st.error(page_content)
            else:
                _show_links(page_content, extraction_method, custom_input)
        else:
            st.warning("Bitte geben Sie eine gültige URL ein.")

elif input_option == "HTML-Datei hochladen":
    uploaded_file = st.file_uploader("Laden Sie eine HTML-Datei hoch:", type="html")

    extraction_method, custom_input = _choose_extraction()

    if st.button("Extrahieren"):
        if uploaded_file:
            try:
                # The upload is raw bytes; a file that is not UTF-8 raises
                # here and is reported through the handler below.
                html_content = uploaded_file.read().decode("utf-8")
                st.write("Extrahiere aus der hochgeladenen HTML-Datei...")
                _show_links(html_content, extraction_method, custom_input)
            except Exception as e:
                st.error(f"Fehler beim Verarbeiten der HTML-Datei: {e}")
        else:
            st.warning("Bitte laden Sie eine HTML-Datei hoch.")