mgokg committed
Commit 1328c27 · verified · 1 Parent(s): 1de7e12

Update app.py

Files changed (1)
  1. app.py +116 -1
app.py CHANGED
@@ -1,9 +1,125 @@
 import streamlit as st
+import requests
+from bs4 import BeautifulSoup
+from urllib.parse import urljoin
+import json
+import csv
+import pandas as pd
+import os
+from gradio_client import Client  # needed by scrape_links below
+
+#api_key = os.getenv('groq')
+
+def parse_links_and_content(ort):
+    base_url = "https://vereine-in-deutschland.net"
+    all_links = []
+    # Build the full URL for the initial listing page
+    initial_url = f"{base_url}/vereine/Bayern/{ort}/"
+
+    try:
+        # Send the request to the initial URL
+        response = requests.get(initial_url)
+        response.raise_for_status()  # Check that the request succeeded
+
+        # Parse the HTML content using BeautifulSoup
+        soup = BeautifulSoup(response.content, 'html.parser')
+
+        # Determine the last page of the pagination
+        link_element = soup.select_one('li.page-item:nth-child(8) > a:nth-child(1)')
+
+        if link_element and 'href' in link_element.attrs:
+            href = link_element['href']
+            # Take everything after the last '/' as the page number
+            # (a fixed two-character slice would fail for single-digit
+            # page counts)
+            last_page = int(href.rstrip('/').rsplit('/', 1)[-1])
+        else:
+            last_page = 1  # If no pagination link is found, assume a single page
+
+        # Loop over all pages and collect the links
+        for page_number in range(1, last_page + 1):
+            page_url = f"{base_url}/vereine/Bayern/{ort}/p/{page_number}"
+            response = requests.get(page_url)
+            response.raise_for_status()
+            soup = BeautifulSoup(response.content, 'html.parser')
+            target_div = soup.select_one('div.row-cols-1:nth-child(4)')
+
+            if target_div:
+                texts = [a.text for a in target_div.find_all('a', href=True)]
+                all_links.extend(texts)
+            else:
+                st.write(f"Target div not found on page {page_number}")
+
+    except Exception as e:
+        # Report the error and return an empty list so the caller
+        # can still iterate over the result
+        st.error(str(e))
+        return []
+
+    # Keep every second entry (the listing yields each link twice)
+    all_links = all_links[0::2]
+    return all_links
+
+def scrape_links(links):
+    contact_details = []
+    client = Client("mgokg/PerplexicaApi")
+    for verein in links:
+        result = client.predict(
+            prompt=f"{verein}",
+            api_name="/parse_links"
+        )
+        contact_details.append(result)
+
+    return contact_details
+
+# Save the JSON data to a CSV file
+def save_to_csv(data, filename):
+    if not data:
+        return  # Nothing to write for an empty result
+    keys = data[0].keys()
+    with open(filename, 'w', newline='', encoding='utf-8') as output_file:
+        dict_writer = csv.DictWriter(output_file, fieldnames=keys)
+        dict_writer.writeheader()
+        dict_writer.writerows(data)
79
+ # Streamlit App
80
+ st.title("Vereinsinformationen abrufen")
81
+
82
+ ort_input = st.text_input("Ort", placeholder="Gib den Namen des Ortes ein")
83
+
84
+ if st.button("Senden"):
85
+ links = parse_links_and_content(ort_input)
86
+ contact_details = scrape_links(links)
87
+ json_data = [json.loads(item) for item in contact_details]
88
+
89
+ # Zeige die Ergebnisse an
90
+ st.json(json_data)
91
+
92
+ # Speichere die Daten in einer CSV-Datei
93
+ save_to_csv(json_data, 'contact_details.csv')
94
+
95
+ # Bereitstellung des Download-Links
96
+ with open('contact_details.csv', 'rb') as file:
97
+ st.download_button(
98
+ label="CSV-Datei herunterladen",
99
+ data=file,
100
+ file_name='contact_details.csv',
101
+ mime='text/csv'
102
+ )
+
+
+'''
+import streamlit as st
 #import sounddevice as sd
 import numpy as np
 import wavio
 import speech_recognition as sr
 
+
 st.title("Audio Recorder und Transkription")
 
 # Aufnahmeparameter
@@ -40,7 +156,6 @@ st.write("Klicke auf 'Aufnahme starten', um die Aufnahme zu beginnen.")
 
 
 
-'''
 
 import streamlit as st
 import pydub
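
The page count in parse_links_and_content hangs on a hard-coded CSS selector (li.page-item:nth-child(8) > a:nth-child(1)) and silently falls back to a single page when that element is missing. A minimal standalone sketch for checking what the selector actually finds for a given town, using the same URL pattern and selector as app.py; the town name is only an example:

    import requests
    from bs4 import BeautifulSoup

    BASE_URL = "https://vereine-in-deutschland.net"

    def probe_pagination(ort):
        # Fetch the first listing page for the given town
        response = requests.get(f"{BASE_URL}/vereine/Bayern/{ort}/")
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Same selector that app.py uses to find the last pagination link
        link_element = soup.select_one('li.page-item:nth-child(8) > a:nth-child(1)')
        if link_element and 'href' in link_element.attrs:
            print("pagination link:", link_element['href'])
        else:
            print("no pagination link found; app.py would assume one page")

    probe_pagination("Regensburg")  # example town, chosen arbitrarily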
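
scrape_links sends each club name to the Gradio Space mgokg/PerplexicaApi. A minimal sketch of that call in isolation, useful for testing the Space before running the full Streamlit flow; the endpoint name and prompt parameter are taken from the code above, the club name is made up:

    from gradio_client import Client

    # Connect to the Space used by scrape_links
    client = Client("mgokg/PerplexicaApi")

    # One request per club name; api_name and the prompt parameter
    # mirror the call in app.py
    result = client.predict(
        prompt="Musikverein Musterstadt",  # hypothetical club name
        api_name="/parse_links"
    )
    print(result)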