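# Gradio app: given a town in Bavaria, scrape the club directory on
# vereine-in-deutschland.net, google each club's Impressum page and ask a
# Groq-hosted LLM to extract the contact details as a small JSON object.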
import gradio as gr
from googlesearch import search
import requests
from bs4 import BeautifulSoup
from gradio_client import Client
from urllib.parse import urljoin
import pandas as pd
from io import StringIO
import json
import groq
import os
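# The Groq API key is read from the environment (e.g. a Space secret named "groq").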
api_key = os.getenv('groq')
client = groq.Client(api_key=api_key)
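# JSON schema the LLM is asked to fill in; "ort" is German for town/place.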
json_schema = """
{
"name": "",
"email": "",
"phone": "",
"ort": ""
}
"""
def llm(message):
    try:
        completion = client.chat.completions.create(
            model="llama3-70b-8192",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": f"return a json object. here is the schema \n {json_schema} \n fill in the available values which can be found here:\n {message} \n return valid json only"},
            ],
        )
        return completion.choices[0].message.content
    except Exception as e:
        return f"Error in response generation: {str(e)}"
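
# Collect the names of all clubs listed for the given Bavarian town,
# following the site's pagination.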
def list_of_clubs(ort):
    base_url = "https://vereine-in-deutschland.net"
    all_links_text = []
    initial_url = f"{base_url}/vereine/Bayern/{ort}"
    try:
        response = requests.get(initial_url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Determine the last page from the pagination widget; fall back to 10 pages
        link_element = soup.select_one('li.page-item:nth-child(8) > a:nth-child(1)')
        last_page = 10
        if link_element and 'href' in link_element.attrs:
            href = link_element['href']
            last_page = int(href.split('/')[-1])

        # Loop through all pages and collect the club names
        for page_number in range(1, last_page + 1):
            page_url = f"{base_url}/vereine/Bayern/{ort}/p/{page_number}"
            response = requests.get(page_url)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            target_div = soup.select_one('div.row-cols-1:nth-child(4)')
            if target_div:
                #links = [urljoin(base_url, a['href']) for a in target_div.find_all('a', href=True)]
                texts = [a.text for a in target_div.find_all('a', href=True)]
                all_links_text.extend(texts)
            else:
                print(f"Target div not found on page {page_number}")
    except Exception as e:
        print(f"Error while scraping club list: {e}")
        return []

    # Each club appears to be linked twice in the listing, so keep every second entry only
    all_links_text = all_links_text[0::2]
    return all_links_text
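
# Fetch the first Google hit for a query and return the visible text of its page body.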
def google_search(query):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    }
    # Run the search and take the first result
    url = None
    for result in search(query, num_results=1):
        url = result
        break
    if url is None:
        return ""

    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    #first_div = soup.find('div', class_='MjjYud')
    first_div = soup.find('body')
    return first_div.text.strip() if first_div else ""
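
# Main pipeline: look up the clubs of a town, google each club's Impressum
# and let the LLM extract the contact details.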
def process_ort(ort):
    links_text = list_of_clubs(ort)
    vereine = []
    for verein in links_text:
        query = f"impressum {verein}"
        contact_details = google_search(query)
        # Keep the raw JSON string returned by the LLM for each club
        json_object = llm(contact_details)
        vereine.append(json_object)
    return vereine
demo = gr.Interface(
    #fn=google_search,
    fn=process_ort,
    inputs=gr.Textbox(lines=1, placeholder="Geben Sie Ihre Suchanfrage ein..."),
    outputs="text",
    title="google websearch",
    description="Geben Sie eine Suchanfrage ein..."
)

demo.launch()