bsenst committed on
Commit
efb0e4e
·
verified ·
1 Parent(s): 1a56b6b

add html file upload option

Browse files
Files changed (1) hide show
  1. app.py +52 -14
app.py CHANGED
@@ -2,8 +2,8 @@ import streamlit as st
2
  import requests
3
  from bs4 import BeautifulSoup
4
 
5
- # Funktion, um alle URLs von einer Webseite zu extrahieren
6
- def extract_urls(url):
7
  try:
8
  response = requests.get(url)
9
  response.raise_for_status()
@@ -11,19 +11,57 @@ def extract_urls(url):
11
  links = [a.get('href') for a in soup.find_all('a', href=True)]
12
  return links
13
  except Exception as e:
14
- return str(e)
 
 
 
 
 
 
 
 
 
15
 
16
  # Streamlit App
17
  st.title("Webseiten-URL-Extraktor")
18
 
19
- # Eingabefeld für die URL
20
- url_input = st.text_input("Gib die URL der Webseite ein:", placeholder="https://example.com")
21
-
22
- # Wenn der Nutzer eine URL eingibt und auf den Button klickt
23
- if st.button("URLs extrahieren"):
24
- if url_input:
25
- st.write(f"Extrahiere URLs von: {url_input}")
26
- urls = extract_urls(url_input)
27
- for url in set(urls):
28
- if url.startswith("http"):
29
- st.write(url)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import requests
3
  from bs4 import BeautifulSoup
4
 
5
# Function to extract all URLs from a webpage
def extract_urls_from_url(url):
    """Fetch *url* and return every hyperlink target found on the page.

    Parameters:
        url: Address of the page to fetch.

    Returns:
        list[str]: the ``href`` values of all ``<a href=...>`` tags on
        success, or a single error-message ``str`` on failure — callers
        distinguish the two cases with ``isinstance(result, list)``.
    """
    try:
        # Timeout keeps the app from hanging forever on unresponsive hosts.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        # Parse the fetched document; this line was missing in the visible
        # diff context but is required — `soup` is used just below.
        soup = BeautifulSoup(response.text, 'html.parser')
        links = [a.get('href') for a in soup.find_all('a', href=True)]
        return links
    except Exception as e:
        return f"Error processing the URL: {e}"
15
+
16
# Function to extract all URLs from uploaded HTML content
def extract_urls_from_html(html_content):
    """Parse *html_content* and return every hyperlink target it contains.

    Parameters:
        html_content: Raw HTML markup as a string.

    Returns:
        list[str]: the ``href`` values of all ``<a href=...>`` tags on
        success, or a single error-message ``str`` if parsing fails —
        callers distinguish the two cases with ``isinstance(result, list)``.
    """
    try:
        parsed = BeautifulSoup(html_content, 'html.parser')
        return [anchor.get('href') for anchor in parsed.find_all('a', href=True)]
    except Exception as exc:
        return f"Error processing the HTML file: {exc}"
24
 
25
# Streamlit App
st.title("Webseiten-URL-Extraktor")


def _show_extracted_links(result):
    # Render each unique absolute link; non-list results are error
    # messages produced by the extractor functions.
    if isinstance(result, list):
        for link in set(result):
            if link.startswith("http"):
                st.write(link)
    else:
        st.error(result)


# Input options: URL or HTML file
input_option = st.radio("Wählen Sie die Eingabemethode:", ("URL", "HTML-Datei hochladen"))

if input_option == "URL":
    # Input field for URL
    page_url = st.text_input("Gib die URL der Webseite ein:", placeholder="https://example.com")

    if st.button("URLs extrahieren"):
        # Guard clause: an empty input only produces a warning.
        if not page_url:
            st.warning("Bitte geben Sie eine gültige URL ein.")
        else:
            st.write(f"Extrahiere URLs von: {page_url}")
            _show_extracted_links(extract_urls_from_url(page_url))

elif input_option == "HTML-Datei hochladen":
    # File uploader for HTML files
    uploaded_file = st.file_uploader("Laden Sie eine HTML-Datei hoch:", type="html")

    if st.button("URLs extrahieren"):
        # Guard clause: no file uploaded only produces a warning.
        if not uploaded_file:
            st.warning("Bitte laden Sie eine HTML-Datei hoch.")
        else:
            try:
                # Uploaded file arrives as bytes; decode before parsing.
                markup = uploaded_file.read().decode("utf-8")
                st.write("Extrahiere URLs aus der hochgeladenen HTML-Datei...")
                _show_extracted_links(extract_urls_from_html(markup))
            except Exception as e:
                st.error(f"Fehler beim Verarbeiten der HTML-Datei: {e}")