Spaces:
Running
Running
File size: 4,481 Bytes
ca5f84f 3e7cc7e 62040f2 b2ba4fb 9e7dfc2 b2ba4fb dc25526 b2ba4fb 09e15cd b764634 561c520 b2ba4fb 09e15cd b2ba4fb 1e4e902 b2ba4fb ca5f84f 6c2fae1 af406c2 6c2fae1 6b8933e 62040f2 6b8933e da8c651 af406c2 b2ba4fb 3e7cc7e 6c2fae1 cd0f9aa 99c69ca ba251cc 99c69ca 6718f93 28fb445 99c69ca 561c520 cd0f9aa 6c2fae1 3e7cc7e 24310b3 3201d2f 3821caa 4a6f2f0 ca5f84f 4a6f2f0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 |
import gradio as gr
from googlesearch import search
import requests
from bs4 import BeautifulSoup
from gradio_client import Client
from urllib.parse import urljoin
import pandas as pd
from io import StringIO
import json
import groq
import os
# Groq API key, read from the environment variable named "groq".
# os.getenv returns None when that variable is not set.
api_key = os.getenv('groq')
# Module-level Groq client; llm() sends its chat-completion requests through it.
client = groq.Client(api_key=api_key)
# Empty JSON template listing the contact fields: name, email, phone, ort.
# NOTE(review): nothing in the visible code references json_schema —
# presumably it was meant to be embedded in the llm() prompt; confirm.
json_schema = """
{
"name": "",
"email": "",
"phone": "",
"ort": ""
}
"""
def llm(message):
    """Ask the Groq chat model to extract contact details from *message*.

    The text is wrapped in a fixed instruction prompt and sent to the
    ``llama3-70b-8192`` model through the module-level ``client``.

    Returns the reply text of the first choice. If anything raises while
    talking to the API, a string starting with
    ``"Error in response generation: "`` is returned instead of raising.
    """
    prompt = f"return a json object with contact details \n fill in the avaiable values which can be found here:\n {message} \n return valid json only"
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]
    try:
        reply = client.chat.completions.create(
            model="llama3-70b-8192",
            messages=conversation,
        )
        first_choice = reply.choices[0]
        return first_choice.message.content
    except Exception as e:
        # Best-effort: callers receive the failure as text, never an exception.
        return f"Error in response generation: {str(e)}"
def list_of_clubs(ort, timeout=10):
    """Collect club names listed for a town on vereine-in-deutschland.net.

    Fetches ``/vereine/Bayern/<ort>``, reads the last page number from the
    pagination link, then walks every ``/p/<n>`` page and gathers the text
    of all anchors inside the result container.

    Args:
        ort: Town name as it appears in the site's URL path.
        timeout: Seconds to wait for each HTTP request.

    Returns:
        A list of anchor texts. Only every second collected text is kept
        (presumably each club is linked twice per entry — verify against
        the live markup). The result is ALWAYS a list: if a request or the
        parsing fails, the error is printed and whatever was collected up
        to that point is returned (possibly an empty list), so callers can
        iterate over the result unconditionally.
    """
    base_url = "https://vereine-in-deutschland.net"
    listing_url = f"{base_url}/vereine/Bayern/{ort}"
    all_links_text = []
    try:
        response = requests.get(listing_url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Determine the last page from the pagination link; fall back to 10
        # pages when that link is not present.
        last_page = 10
        link_element = soup.select_one('li.page-item:nth-child(8) > a:nth-child(1)')
        if link_element is not None and 'href' in link_element.attrs:
            last_page = int(link_element['href'].split('/')[-1])

        # Loop through all pages and collect the link texts.
        for page_number in range(1, last_page + 1):
            page_url = f"{base_url}/vereine/Bayern/{ort}/p/{page_number}"
            response = requests.get(page_url, timeout=timeout)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            target_div = soup.select_one('div.row-cols-1:nth-child(4)')
            if target_div is None:
                print(f"Target div not found on page {page_number}")
                continue
            all_links_text.extend(
                a.text for a in target_div.find_all('a', href=True)
            )
    except Exception as e:
        # Best-effort scraping: report the problem but keep the pages that
        # were already collected instead of handing back a non-list value.
        print(f"Error while collecting clubs for {ort!r}: {e}")
    return all_links_text[0::2]
def google_search(query, timeout=10):
    """Return the body text of the first search result for *query*.

    Args:
        query: Search terms passed to ``googlesearch.search``.
        timeout: Seconds to wait for the result page to respond.

    Returns:
        The stripped text of the result page's ``<body>``. An empty string
        is returned when the search yields no result or the page has no
        ``<body>`` element.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    }
    # Run the search and take the first result, if there is one.
    url = next(iter(search(query, num_results=1)), None)
    if url is None:
        return ""
    # Send the browser-like User-Agent that was prepared above.
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')
    body = soup.find('body')
    if body is None:
        return ""
    return body.text.strip()
def process_ort(ort):
    """Fetch the Google "impressum" result page text for every club in a town.

    Looks up the club names via ``list_of_clubs(ort)`` and, for each one,
    requests a Google search for ``impressum <club name>``.

    Args:
        ort: Town name, forwarded unchanged to ``list_of_clubs``.

    Returns:
        A list with one string per club: the text of the search page's
        ``<body>``, or an empty string when the page has no ``<body>``.
    """
    # Built once; the same browser-like headers are sent with every request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    }
    request_timeout = 10  # seconds; keeps one stalled request from hanging the UI
    vereine = []
    for verein in list_of_clubs(ort):
        # Pass the query via ``params`` so requests URL-encodes club names
        # containing spaces, '&', '#', '+' and similar characters.
        response = requests.get(
            "https://www.google.com/search",
            params={"q": f"impressum {verein}"},
            headers=headers,
            timeout=request_timeout,
        )
        soup = BeautifulSoup(response.content, 'html.parser')
        body = soup.find('body')
        # TODO: run the page text through llm() to get structured contact data.
        vereine.append(body.text if body is not None else "")
    return vereine
# Gradio UI: a single-line text box whose value is passed to process_ort;
# the returned value is rendered as plain text.
demo = gr.Interface(
    fn=process_ort,
    inputs=gr.Textbox(lines=1, placeholder="Geben Sie Ihre Suchanfrage ein..."),
    outputs="text",
    title="google websearch",
    description="Geben Sie eine Suchanfrage ein...",
)
# Start the web server for the interface.
demo.launch()