bsenst commited on
Commit
9bc8ed7
·
1 Parent(s): efb0e4e

add XPath and CSS selector options

Browse files
Files changed (3) hide show
  1. README.md +52 -0
  2. app.py +66 -30
  3. requirements.txt +1 -0
README.md CHANGED
@@ -11,3 +11,55 @@ short_description: URLs extrahieren
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
14
+
15
+ ## Installation and Setup
16
+
17
+ 1. **Clone the Repository**
18
+ Open a terminal and run the following command to clone the repository:
19
+
20
+ ```bash
21
+ git clone https://huggingface.co/spaces/datenwerkzeuge/Webseiten-URL-Extraktor
22
+ ```
23
+
24
+ 2. **Navigate to the Streamlit Application Directory**
25
+ Change your directory to the cloned repository folder, where the `app.py` and `requirements.txt` files are located:
26
+
27
+ ```bash
28
+ cd Webseiten-URL-Extraktor
29
+ ```
30
+
31
+ 3. **Create and Activate a Virtual Environment (Optional but Recommended)**
32
+ It's a good practice to use a virtual environment to manage dependencies. To create and activate a virtual environment, use the following commands:
33
+
34
+ ```bash
35
+ # Create a virtual environment
36
+ python -m venv venv
37
+
38
+ # Activate the virtual environment
39
+ # On Windows:
40
+ .\venv\Scripts\activate
41
+
42
+ # On macOS/Linux:
43
+ source venv/bin/activate
44
+ ```
45
+
46
+ 4. **Install the Required Packages**
47
+ Install the dependencies listed in `requirements.txt`:
48
+
49
+ ```bash
50
+ pip install -r requirements.txt
51
+ ```
52
+
53
+ 5. **Run the Streamlit Application**
54
+ To start the Streamlit app, use the following command:
55
+
56
+ ```bash
57
+ streamlit run app.py
58
+ ```
59
+
60
+ 6. **Access the Application**
61
+ Once the server starts, you can access the web application by visiting:
62
+
63
+ ```
64
+ http://localhost:8501
65
+ ```
app.py CHANGED
@@ -1,26 +1,36 @@
1
  import streamlit as st
2
  import requests
3
  from bs4 import BeautifulSoup
 
4
 
5
- # Function to extract all URLs from a webpage
6
- def extract_urls_from_url(url):
7
  try:
8
- response = requests.get(url)
9
- response.raise_for_status()
10
- soup = BeautifulSoup(response.text, 'html.parser')
11
- links = [a.get('href') for a in soup.find_all('a', href=True)]
12
  return links
13
  except Exception as e:
14
- return f"Error processing the URL: {e}"
 
15
 
16
- # Function to extract all URLs from uploaded HTML content
17
- def extract_urls_from_html(html_content):
18
  try:
19
- soup = BeautifulSoup(html_content, 'html.parser')
20
- links = [a.get('href') for a in soup.find_all('a', href=True)]
21
  return links
22
  except Exception as e:
23
- return f"Error processing the HTML file: {e}"
 
 
 
 
 
 
 
 
 
24
 
25
  # Streamlit App
26
  st.title("Webseiten-URL-Extraktor")
@@ -29,38 +39,64 @@ st.title("Webseiten-URL-Extraktor")
29
  input_option = st.radio("Wählen Sie die Eingabemethode:", ("URL", "HTML-Datei hochladen"))
30
 
31
  if input_option == "URL":
32
- # Input field for URL
33
- url_input = st.text_input("Gib die URL der Webseite ein:", placeholder="https://example.com")
 
 
 
 
 
 
 
34
 
35
- if st.button("URLs extrahieren"):
36
  if url_input:
37
- st.write(f"Extrahiere URLs von: {url_input}")
38
- urls = extract_urls_from_url(url_input)
39
- if isinstance(urls, list):
40
- for url in set(urls):
41
- if url.startswith("http"):
 
 
 
 
 
 
42
  st.write(url)
 
 
43
  else:
44
- st.error(urls)
45
  else:
46
  st.warning("Bitte geben Sie eine gültige URL ein.")
47
 
48
  elif input_option == "HTML-Datei hochladen":
49
- # File uploader for HTML files
50
  uploaded_file = st.file_uploader("Laden Sie eine HTML-Datei hoch:", type="html")
51
 
52
- if st.button("URLs extrahieren"):
 
 
 
 
 
 
 
 
53
  if uploaded_file:
54
  try:
55
  html_content = uploaded_file.read().decode("utf-8")
56
- st.write("Extrahiere URLs aus der hochgeladenen HTML-Datei...")
57
- urls = extract_urls_from_html(html_content)
58
- if isinstance(urls, list):
59
- for url in set(urls):
60
- if url.startswith("http"):
61
- st.write(url)
 
 
 
 
62
  else:
63
- st.error(urls)
64
  except Exception as e:
65
  st.error(f"Fehler beim Verarbeiten der HTML-Datei: {e}")
66
  else:
 
1
  import streamlit as st
2
  import requests
3
  from bs4 import BeautifulSoup
4
+ from lxml import html
5
 
6
# Extract link text (or attribute values) from HTML using an XPath expression.
def extract_links_with_xpath(content, xpath):
    """Return a list of strings selected by *xpath* from the HTML in *content*.

    Element results contribute their ``.text`` (which may be ``None`` for
    elements without direct text); string results — e.g. attribute values
    from ``//a/@href`` or ``text()`` nodes — are returned as-is instead of
    crashing on a missing ``.text`` attribute.

    On any parse/XPath error a human-readable error string is returned
    instead of a list; callers distinguish success from failure with
    ``isinstance(result, list)``.
    """
    try:
        tree = html.fromstring(content)
        results = tree.xpath(xpath)
        # lxml's xpath() can yield elements OR plain strings (smart strings
        # for attribute/text results) — handle both result kinds.
        links = [item if isinstance(item, str) else item.text for item in results]
        return links
    except Exception as e:
        return f"Error processing the XPath: {e}"
15
+
16
 
17
+ # Function to extract links using CSS selector
18
+ def extract_links_with_css(content, css_selector):
19
  try:
20
+ soup = BeautifulSoup(content, 'html.parser')
21
+ links = [a.text for a in soup.select(css_selector)]
22
  return links
23
  except Exception as e:
24
+ return f"Error processing the CSS selector: {e}"
25
+
26
# Fetch a web page and return its raw HTML text.
# (Name kept for backward compatibility — despite the name, this no longer
# extracts URLs itself; the XPath/CSS helpers do that on the returned text.)
def extract_urls_from_url(url, timeout=10):
    """Download *url* and return the response body as text.

    An explicit *timeout* (seconds, default 10) is applied because
    ``requests.get`` without one can block the app indefinitely on a
    stalled server.  On any network/HTTP error a human-readable error
    string is returned instead of the page content.

    NOTE(review): both success and failure return ``str``, so callers
    cannot distinguish them via ``isinstance`` — verify the caller's
    ``isinstance(page_content, str)`` check really does what is intended.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # surface 4xx/5xx as exceptions
        return response.text
    except Exception as e:
        return f"Error processing the URL: {e}"
34
 
35
  # Streamlit App
36
  st.title("Webseiten-URL-Extraktor")
 
39
  input_option = st.radio("Wählen Sie die Eingabemethode:", ("URL", "HTML-Datei hochladen"))
40
 
41
  if input_option == "URL":
42
+ url_input = st.text_input("Gib die URL der Webseite ein:", placeholder="https://bsenst.github.io/toscrape/", value="https://bsenst.github.io/toscrape/")
43
+
44
+ extraction_method = st.selectbox("Wählen Sie die Extraktionsmethode:", ("XPath", "CSS Selector"))
45
+ custom_input = None
46
+
47
+ if extraction_method == "XPath":
48
+ custom_input = st.text_input("Geben Sie den XPath-Ausdruck ein:", placeholder="//a[@href]", value="//a[@href]")
49
+ elif extraction_method == "CSS Selector":
50
+ custom_input = st.text_input("Geben Sie den CSS-Selektor ein:", placeholder="a[href]", value="a[href]")
51
 
52
+ if st.button("Extrahieren"):
53
  if url_input:
54
+ st.write(f"Extrahiere von: {url_input}")
55
+ page_content = extract_urls_from_url(url_input)
56
+
57
+ if isinstance(page_content, str):
58
+ if extraction_method == "XPath":
59
+ links = extract_links_with_xpath(page_content, custom_input)
60
+ elif extraction_method == "CSS Selector":
61
+ links = extract_links_with_css(page_content, custom_input)
62
+
63
+ if isinstance(links, list):
64
+ for url in set(links):
65
  st.write(url)
66
+ else:
67
+ st.error(links)
68
  else:
69
+ st.error(page_content)
70
  else:
71
  st.warning("Bitte geben Sie eine gültige URL ein.")
72
 
73
  elif input_option == "HTML-Datei hochladen":
 
74
  uploaded_file = st.file_uploader("Laden Sie eine HTML-Datei hoch:", type="html")
75
 
76
+ extraction_method = st.selectbox("Wählen Sie die Extraktionsmethode:", ("XPath", "CSS Selector"))
77
+ custom_input = None
78
+
79
+ if extraction_method == "XPath":
80
+ custom_input = st.text_input("Geben Sie den XPath-Ausdruck ein:", placeholder="//a[@href]", value="//a[@href]")
81
+ elif extraction_method == "CSS Selector":
82
+ custom_input = st.text_input("Geben Sie den CSS-Selektor ein:", placeholder="a[href]", value="a[href]")
83
+
84
+ if st.button("Extrahieren"):
85
  if uploaded_file:
86
  try:
87
  html_content = uploaded_file.read().decode("utf-8")
88
+ st.write("Extrahiere aus der hochgeladenen HTML-Datei...")
89
+
90
+ if extraction_method == "XPath":
91
+ links = extract_links_with_xpath(html_content, custom_input)
92
+ elif extraction_method == "CSS Selector":
93
+ links = extract_links_with_css(html_content, custom_input)
94
+
95
+ if isinstance(links, list):
96
+ for url in set(links):
97
+ st.write(url)
98
  else:
99
+ st.error(links)
100
  except Exception as e:
101
  st.error(f"Fehler beim Verarbeiten der HTML-Datei: {e}")
102
  else:
requirements.txt CHANGED
@@ -1,2 +1,3 @@
1
  streamlit
2
  beautifulsoup4
 
 
1
  streamlit
2
  beautifulsoup4
3
+ lxml