mgokg committed on
Commit
79b0e5e
·
verified ·
1 Parent(s): 3b7e38f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -202
app.py CHANGED
@@ -1,227 +1,52 @@
1
  import gradio as gr
2
  import requests
3
- from bs4 import BeautifulSoup
4
- from urllib.parse import urljoin
5
  import os
6
  import json
7
- import pandas as pd
8
- from io import StringIO
9
  import google.generativeai as genai
10
 
11
  # Load environment variables
12
  genai.configure(api_key=os.environ["geminiapikey"])
13
- api_key = os.environ.get('GROQ_API_KEY')
14
  read_key = os.environ.get('HF_TOKEN', None)
15
 
16
- # Initialize Groq client
17
- if api_key:
18
- from groq import Client as GroqClient
19
- client = GroqClient(api_key=api_key)
20
- else:
21
- client = None
22
-
23
- # Use Llama 3 70B powered by Groq for answering
24
- def ask_llm(ort):
25
- if not client:
26
- return "Groq API key not set."
 
 
 
 
 
 
 
 
 
 
27
 
28
- try:
29
- completion = client.chat.completions.create(
30
- model="llama3-70b-8192",
31
- messages=[
32
- {"role": "system", "content": "You are a helpful assistant."},
33
- {"role": "user", "content": f"{ort}. \n instruction: antworte kurz und knapp. antworte immer auf deutsch"}
34
- ],
35
- )
36
- return completion.choices[0].message.content
37
- except Exception as e:
38
- return f"Error in response generation: {str(e)}"
39
-
40
- def parse_links_and_content(ort):
41
- base_url = "https://vereine-in-deutschland.net"
42
- all_links = []
43
- all_links_text = []
44
- initial_url = f"{base_url}/vereine/Bayern/{ort}"
45
-
46
- try:
47
- response = requests.get(initial_url)
48
- response.raise_for_status()
49
-
50
- soup = BeautifulSoup(response.content, 'html.parser')
51
-
52
- # Determine the last page
53
- link_element = soup.select_one('li.page-item:nth-child(8) > a:nth-child(1)')
54
- last_page = 10
55
- if link_element and 'href' in link_element.attrs:
56
- href = link_element['href']
57
- last_page = int(href.split('/')[-1])
58
-
59
- # Loop through all pages and collect links
60
- for page_number in range(1, last_page + 1):
61
- page_url = f"{base_url}/vereine/Bayern/{ort}/p/{page_number}"
62
- response = requests.get(page_url)
63
- response.raise_for_status()
64
- soup = BeautifulSoup(response.content, 'html.parser')
65
- target_div = soup.select_one('div.row-cols-1:nth-child(4)')
66
-
67
- if target_div:
68
- links = [urljoin(base_url, a['href']) for a in target_div.find_all('a', href=True)]
69
- texts = [a.text for a in target_div.find_all('a', href=True)]
70
- all_links.extend(links)
71
- all_links_text.extend(texts)
72
- else:
73
- print(f"Target div not found on page {page_number}")
74
-
75
- except Exception as e:
76
- return str(e), []
77
-
78
- all_links = all_links[0::2]
79
- all_links_text = all_links_text[0::2]
80
-
81
- return all_links_text, all_links
82
-
83
- def extract_vereinsname(url):
84
- parts = url.split('/')
85
- vereinsname = parts[-1]
86
- vereinsname = vereinsname.replace("-", " ")
87
- return vereinsname
88
-
89
- def scrape_links(links):
90
- details = []
91
- for link in links:
92
- try:
93
- response = requests.get(link)
94
- response.raise_for_status()
95
- soup = BeautifulSoup(response.content, 'html.parser')
96
- target_nav = soup.select_one('.nav')
97
- parts = link.split('/')
98
-
99
- # Log the URL and its parts for debugging
100
- print(f"Processing URL: {link}")
101
- print(f"URL parts: {parts}")
102
-
103
- # Extract the name of the Verein from the URL
104
- vereinsname = parts[-1] if parts[-1] else parts[-2] # Fallback to the second-to-last part if the last part is empty
105
- texte = target_nav.text.strip()
106
- texte = texte.replace("Amtsgericht: Schweinfurt", "")
107
- texte = texte.replace("Adresse folgt", "")
108
- texte = texte.replace("Adresse", "Adresse:")
109
- texte = texte.replace("Kontakt", "Email:")
110
- texte = texte.replace("Noch keine Daten vorhanden", "")
111
-
112
- if target_nav:
113
- details.append(f"Verein: {vereinsname} {texte}")
114
- else:
115
- details.append(f"Verein: {vereinsname} - No contact information found")
116
- except Exception as e:
117
- details.append(f"Error: {str(e)}")
118
-
119
- return details
120
-
121
- def save_to_csv(data, filename):
122
- keys = data[0].keys() if data else []
123
- with open(filename, 'w', newline='', encoding='utf-8') as output_file:
124
- dict_writer = csv.DictWriter(output_file, fieldnames=keys)
125
- dict_writer.writeheader()
126
- dict_writer.writerows(data)
127
-
128
- # Clear output
129
- def clear():
130
- return "", ""
131
-
132
- def load_data():
133
- return df
134
 
135
  # Create the Gradio interface
136
- with gr.Blocks() as demo:
137
- gr.Markdown("[Download](https://specialist-it.de/verein.csv)")
138
  with gr.Row():
139
- ort_input = gr.Textbox(label="Ort", placeholder="Gib den Namen des Ortes ein")
140
  with gr.Row():
141
- details_output = gr.Markdown(label="Ausgabe",)
142
  #details_output = gr.Textbox(label="Ausgabe", value = f"\n\n\n\n")
143
-
144
- def process_ort(ort):
145
- links_text, links = parse_links_and_content(ort)
146
- #return links_text
147
- kontakt = []
148
-
149
- # Create the model
150
- generation_config = {
151
- "temperature": 1,
152
- "top_p": 0.95,
153
- "top_k": 40,
154
- "max_output_tokens": 8192,
155
- "response_mime_type": "text/plain",
156
- }
157
-
158
- model = genai.GenerativeModel(
159
- model_name="gemini-2.0-flash-exp",
160
- generation_config=generation_config,
161
- )
162
-
163
- chat_session = model.start_chat(
164
- history=[
165
- ]
166
- )
167
-
168
-
169
- json_schema ='{"vereinsname": "", "email": "", "adresse": "","telefon": ""}'
170
- response = chat_session.send_message(f"return a valid json object with contact details for each verein \n {links_text}")
171
- #response.text = response.text.replace("```json", "")
172
- #response.text = response.text.replace("```", "")
173
- #kontakt.append(response.text)
174
-
175
- return response.text
176
-
177
- for verein in links_text:
178
- json_schema ='{"vereinsname": "", "email": "", "adresse": "","telefon": ""}'
179
- response = chat_session.send_message(f"return a valid json object with kontakdaten für {verein}. here is the json schema: {json_schema}")
180
- #response.text = response.text.replace("```json", "")
181
- #response.text = response.text.replace("```", "")
182
- kontakt.append(response.text)
183
-
184
- return kontakt
185
-
186
- contact_details = scrape_links(links)
187
 
188
- from gradio_client import Client
189
-
190
-
191
- qwen_client = Client("Qwen/Qwen2.5-72B-Instruct")
192
- result = qwen_client.predict(
193
- query=f"return a valid json objects with contact details foreach verein. return the generated json only \n {contact_details}",
194
- history=[],
195
- system="you are a expert for json data and your job is to extract information from text and return a valid json object only. no text no explanations. reverse all email adresses. example: reverse ed.nesuahsgnitle-vs@dnatsrov to [email protected]. replace all dashes from vereinsname with a whitespace",
196
- api_name="/model_chat"
197
- )
198
-
199
- json_data = result[1][0][1]
200
- json_data = json_data.replace("```json", "")
201
- json_data = json_data.replace("```", "")
202
-
203
- # Convert JSON string to Python dictionary
204
- data_dict = json.loads(json_data)
205
-
206
- # Convert dictionary to DataFrame
207
- df = pd.DataFrame(data_dict)
208
- # DataFrame in eine CSV-Datei konvertieren
209
- #df.to_csv('daten.csv', index=False)
210
- # DataFrame in eine CSV-Variable konvertieren
211
- csv_buffer = StringIO()
212
- df.to_csv(csv_buffer, index=False)
213
- csv_data = csv_buffer.getvalue()
214
-
215
- print(csv_data)
216
- #return csv_data
217
- return df
218
-
219
  with gr.Row():
220
  clearbutton = gr.Button("Clear")
221
  button = gr.Button("Senden")
222
 
223
  # Connect the button to the function
224
- button.click(fn=process_ort, inputs=ort_input, outputs=details_output)
225
  clearbutton.click(fn=clear, inputs=[], outputs=details_output)
226
 
227
  # Launch the Gradio application
 
1
  import gradio as gr
2
  import requests
 
 
3
  import os
4
  import json
 
 
5
  import google.generativeai as genai
6
 
7
  # Load environment variables
8
  genai.configure(api_key=os.environ["geminiapikey"])
 
9
  read_key = os.environ.get('HF_TOKEN', None)
10
 
11
+ custom_css = ".md{height:450px}"
12
+
13
+ def predict(prompt):
14
+ # Create the model
15
+ generation_config = {
16
+ "temperature": 1,
17
+ "top_p": 0.95,
18
+ "top_k": 40,
19
+ "max_output_tokens": 8192,
20
+ "response_mime_type": "text/plain",
21
+ }
22
+
23
+ model = genai.GenerativeModel(
24
+ model_name="gemini-2.0-flash-exp",
25
+ generation_config=generation_config,
26
+ )
27
+
28
+ chat_session = model.start_chat(
29
+ history=[
30
+ ]
31
+ )
32
 
33
+ response = chat_session.send_message(prompt)
34
+ return response.text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
  # Create the Gradio interface
37
+ with gr.Blocks(css=custom_css) as demo:
 
38
  with gr.Row():
39
+ ort_input = gr.Textbox(label="prompt", placeholder="Gib den Namen des Ortes ein")
40
  with gr.Row():
41
+ details_output = gr.Markdown(label="answer",elem_id="md")
42
  #details_output = gr.Textbox(label="Ausgabe", value = f"\n\n\n\n")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  with gr.Row():
45
  clearbutton = gr.Button("Clear")
46
  button = gr.Button("Senden")
47
 
48
  # Connect the button to the function
49
+ button.click(fn=predict, inputs=ort_input, outputs=details_output)
50
  clearbutton.click(fn=clear, inputs=[], outputs=details_output)
51
 
52
  # Launch the Gradio application