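# Gradio app: given a town in Bavaria, scrape the club directory on
# vereine-in-deutschland.net, google each club's Impressum page and ask a
# Groq-hosted LLM to extract the contact details as a small JSON object.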
import gradio as gr
from googlesearch import search
import requests
from bs4 import BeautifulSoup
from gradio_client import Client
from urllib.parse import urljoin
import pandas as pd
from io import StringIO
import json
import groq
import os
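# The Groq API key is read from the environment (e.g. a Space secret named "groq").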
api_key = os.getenv('groq')
client = groq.Client(api_key=api_key)
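# JSON schema the LLM is asked to fill in; "ort" is German for town/place.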
json_schema = """
{
"name": "",
"email": "",
"phone": "",
"ort": ""
}
"""
def llm(message):
    try:
        completion = client.chat.completions.create(
            model="llama3-70b-8192",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": f"return a json object. here is the schema \n {json_schema} \n fill in the available values which can be found here:\n {message} \n return valid json only"},
            ],
        )
        return completion.choices[0].message.content
    except Exception as e:
        return f"Error in response generation: {str(e)}"
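
# Collect the names of all clubs listed for the given Bavarian town,
# following the site's pagination.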
def list_of_clubs(ort):
    base_url = "https://vereine-in-deutschland.net"
    all_links_text = []
    initial_url = f"{base_url}/vereine/Bayern/{ort}"
    try:
        response = requests.get(initial_url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Determine the last page from the pagination widget; fall back to 10 pages
        link_element = soup.select_one('li.page-item:nth-child(8) > a:nth-child(1)')
        last_page = 10
        if link_element and 'href' in link_element.attrs:
            href = link_element['href']
            last_page = int(href.split('/')[-1])

        # Loop through all pages and collect the club names
        for page_number in range(1, last_page + 1):
            page_url = f"{base_url}/vereine/Bayern/{ort}/p/{page_number}"
            response = requests.get(page_url)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            target_div = soup.select_one('div.row-cols-1:nth-child(4)')
            if target_div:
                #links = [urljoin(base_url, a['href']) for a in target_div.find_all('a', href=True)]
                texts = [a.text for a in target_div.find_all('a', href=True)]
                all_links_text.extend(texts)
            else:
                print(f"Target div not found on page {page_number}")
    except Exception as e:
        print(f"Error while scraping club list: {e}")
        return []

    # Each club appears to be linked twice in the listing, so keep every second entry only
    all_links_text = all_links_text[0::2]
    return all_links_text
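
# Fetch the first Google hit for a query and return the visible text of its page body.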
def google_search(query):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    }
    # Run the search and take the first result
    url = None
    for result in search(query, num_results=1):
        url = result
        break
    if url is None:
        return ""

    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    #first_div = soup.find('div', class_='MjjYud')
    first_div = soup.find('body')
    return first_div.text.strip() if first_div else ""
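
# Main pipeline: look up the clubs of a town, google each club's Impressum
# and let the LLM extract the contact details.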
def process_ort(ort):
    links_text = list_of_clubs(ort)
    vereine = []
    for verein in links_text:
        query = f"impressum {verein}"
        contact_details = google_search(query)
        # Keep the raw JSON string returned by the LLM for each club
        json_object = llm(contact_details)
        vereine.append(json_object)
    return vereine
demo = gr.Interface(
    #fn=google_search,
    fn=process_ort,
    inputs=gr.Textbox(lines=1, placeholder="Geben Sie Ihre Suchanfrage ein..."),
    outputs="text",
    title="google websearch",
    description="Geben Sie eine Suchanfrage ein..."
)

demo.launch()