File size: 4,051 Bytes
dced2cd 9bc8ed7 dced2cd 9bc8ed7 dced2cd 9bc8ed7 dced2cd 9bc8ed7 efb0e4e 9bc8ed7 efb0e4e 9bc8ed7 efb0e4e 9bc8ed7 dced2cd efb0e4e 9bc8ed7 efb0e4e 9bc8ed7 efb0e4e 9bc8ed7 efb0e4e 9bc8ed7 efb0e4e 9bc8ed7 efb0e4e 9bc8ed7 efb0e4e 9bc8ed7 efb0e4e 9bc8ed7 efb0e4e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 |
import streamlit as st
import requests
from bs4 import BeautifulSoup
from lxml import html
# Function to extract links using XPath
def extract_links_with_xpath(content, xpath):
    """Evaluate an XPath expression against HTML and return the matches as text.

    Args:
        content: HTML markup to parse.
        xpath: XPath expression to evaluate against the parsed document.

    Returns:
        A list with one entry per match. Element matches contribute their
        ``.text`` (which may be ``None`` when the element has no leading
        text); string matches such as ``//a/@href`` or ``//a/text()``
        contribute the string itself; a scalar result such as ``count(//a)``
        becomes a one-item list. If parsing or evaluation fails, a string
        starting with ``"Error processing the XPath:"`` is returned instead.
    """
    try:
        tree = html.fromstring(content)
        result = tree.xpath(xpath)
        # XPath functions (count(), string(), boolean tests) yield a bare
        # scalar instead of a node list; wrap it so it counts as one match
        # rather than being iterated (or failing to iterate).
        if not isinstance(result, list):
            result = [result]
        links = []
        for item in result:
            if isinstance(item, str):
                # Attribute and text() matches are already strings and have
                # no ``.text`` attribute.
                links.append(str(item))
            elif hasattr(item, "text"):
                links.append(item.text)
            else:
                links.append(str(item))
        return links
    except Exception as e:
        # Callers distinguish success from failure by list-vs-str, so every
        # parser or XPath error is reported as a message rather than raised.
        return f"Error processing the XPath: {e}"
# Function to extract links using CSS selector
def extract_links_with_css(content, css_selector):
    """Return the text of every element matched by a CSS selector.

    Args:
        content: HTML markup to parse.
        css_selector: CSS selector passed to BeautifulSoup's ``select``.

    Returns:
        A list holding the ``.text`` of each matched element, or a string
        starting with ``"Error processing the CSS selector:"`` if parsing or
        selection raises.
    """
    try:
        parsed = BeautifulSoup(content, "html.parser")
        texts = []
        for node in parsed.select(css_selector):
            texts.append(node.text)
    except Exception as err:
        return f"Error processing the CSS selector: {err}"
    return texts
# Function to download a webpage and return its HTML source
def extract_urls_from_url(url, timeout=10):
    """Download a web page and return its body as text.

    Despite its name, this function does not extract anything; it only
    performs the HTTP GET and hands back the response text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` waits indefinitely on a stalled server.

    Returns:
        The response body as a string. If the request fails or the server
        answers with an HTTP error status, a string starting with
        ``"Error processing the URL:"`` is returned instead.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        return response.text
    except Exception as e:
        # Failures are reported as a message rather than raised, matching the
        # other helpers in this file.
        return f"Error processing the URL: {e}"
# Streamlit App
# Prefix of the message extract_urls_from_url returns when a download fails.
# The fetcher returns a str on success AND on failure, so the prefix is the
# only way to tell the two apart.
URL_ERROR_PREFIX = "Error processing the URL:"


def _select_extraction():
    """Render the method picker and expression input; return (method, expression)."""
    extraction_method = st.selectbox(
        "Wählen Sie die Extraktionsmethode:", ("XPath", "CSS Selector")
    )
    custom_input = None
    if extraction_method == "XPath":
        custom_input = st.text_input(
            "Geben Sie den XPath-Ausdruck ein:",
            placeholder="//a[@href]",
            value="//a[@href]",
        )
    elif extraction_method == "CSS Selector":
        custom_input = st.text_input(
            "Geben Sie den CSS-Selektor ein:",
            placeholder="a[href]",
            value="a[href]",
        )
    return extraction_method, custom_input


def _run_extraction(content, extraction_method, custom_input):
    """Apply the chosen extractor to the HTML and return its result."""
    if extraction_method == "XPath":
        return extract_links_with_xpath(content, custom_input)
    return extract_links_with_css(content, custom_input)


def _show_results(links):
    """Write each distinct result once, or show the extractor's error message."""
    if isinstance(links, list):
        # dict.fromkeys removes duplicates while keeping document order;
        # iterating a set() would show the results in arbitrary order.
        for url in dict.fromkeys(links):
            st.write(url)
    else:
        st.error(links)


st.title("Webseiten-URL-Extraktor")

# Input options: URL or HTML file
input_option = st.radio(
    "Wählen Sie die Eingabemethode:", ("URL", "HTML-Datei hochladen")
)

if input_option == "URL":
    url_input = st.text_input(
        "Gib die URL der Webseite ein:",
        placeholder="https://bsenst.github.io/toscrape/",
        value="https://bsenst.github.io/toscrape/",
    )
    extraction_method, custom_input = _select_extraction()

    if st.button("Extrahieren"):
        if url_input:
            st.write(f"Extrahiere von: {url_input}")
            page_content = extract_urls_from_url(url_input)
            # A plain isinstance(..., str) check cannot detect a failed
            # download because the error message is a str too; without the
            # prefix test the message would be parsed as HTML and the user
            # would silently see no output.
            fetch_failed = (
                not isinstance(page_content, str)
                or page_content.startswith(URL_ERROR_PREFIX)
            )
            if fetch_failed:
                st.error(page_content)
            else:
                _show_results(
                    _run_extraction(page_content, extraction_method, custom_input)
                )
        else:
            st.warning("Bitte geben Sie eine gültige URL ein.")

elif input_option == "HTML-Datei hochladen":
    uploaded_file = st.file_uploader("Laden Sie eine HTML-Datei hoch:", type="html")
    extraction_method, custom_input = _select_extraction()

    if st.button("Extrahieren"):
        if uploaded_file:
            try:
                html_content = uploaded_file.read().decode("utf-8")
                st.write("Extrahiere aus der hochgeladenen HTML-Datei...")
                _show_results(
                    _run_extraction(html_content, extraction_method, custom_input)
                )
            except Exception as e:
                # Covers non-UTF-8 uploads and any unexpected failure while
                # processing the file.
                st.error(f"Fehler beim Verarbeiten der HTML-Datei: {e}")
        else:
            st.warning("Bitte laden Sie eine HTML-Datei hoch.")
|