import gradio as gr
import requests
from bs4 import BeautifulSoup
from gradio_client import Client
import pandas as pd
import json
import groq
import os

custom_css = """
#md {
    height: 400px;  
    font-size: 30px;
    background: #202020;
    padding: 20px;
    color: white;
    border: 1px solid white;
}
"""

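# The Groq API key is read from an environment variable / Space secret that is
# literally named "groq"; this module-level client is the one used by llm() below.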
api_key = os.getenv('groq')
client = groq.Client(api_key=api_key)

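# qwen() sends the collected per-club JSON fragments to the hosted
# "Qwen/Qwen2.5-72B-Instruct" Gradio Space and asks it to merge them into one
# valid JSON object. Note (assumption): the Space's /model_chat endpoint may
# return a tuple (response text plus chat history / system prompt) rather than
# a bare string, so the caller may need to extract the assistant text from the
# result before passing it to json.loads().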
def qwen(jsondata):
    qwen_client = Client("Qwen/Qwen2.5-72B-Instruct")
    result = qwen_client.predict(
        query=f"return a valid json object \n {jsondata}",
        history=[],
        system="You are Qwen, created by Alibaba Cloud. You are a helpful assistant.",
        api_name="/model_chat"
    )
    return result

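# llm() asks a Groq-hosted Llama 3 70B model to extract name, email, phone and
# website from the supplied text (here, the body of a Google results page) and
# return them as a JSON object. Groq API errors are returned as a plain string.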
def llm(message):
    message = f"return a json object with the keys: name, email, phone, website \n the values can be found here, leave blank if a value is not available:\n {message} \n return a json object only. no text, no explanation"
    try:
        completion = client.chat.completions.create(
            model="llama3-70b-8192",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": message}
            ],
        )
        return completion.choices[0].message.content
    except Exception as e:
        return f"Error in response generation: {str(e)}"

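# list_of_clubs() scrapes vereine-in-deutschland.net for all club (Verein)
# names in a given Bavarian town. It reads the pagination control on the first
# page to find the last page number (falling back to 10 if the selector does
# not match) and then collects the anchor texts from the result list on every
# page. The CSS selectors are tied to the site's current markup and may break
# if the layout changes.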
def list_of_clubs(ort):
    base_url = "https://vereine-in-deutschland.net"
    all_links_text = []
    initial_url = f"{base_url}/vereine/Bayern/{ort}"

    try:
        response = requests.get(initial_url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Determine the last page from the pagination control; fall back to 10
        last_page = 10
        link_element = soup.select_one('li.page-item:nth-child(8) > a:nth-child(1)')
        if link_element and 'href' in link_element.attrs:
            href = link_element['href']
            last_page = int(href.split('/')[-1])

        # Loop through all pages and collect the club links
        for page_number in range(1, last_page + 1):
            page_url = f"{base_url}/vereine/Bayern/{ort}/p/{page_number}"
            response = requests.get(page_url)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            target_div = soup.select_one('div.row-cols-1:nth-child(4)')

            if target_div:
                texts = [a.text for a in target_div.find_all('a', href=True)]
                all_links_text.extend(texts)
            else:
                print(f"Target div not found on page {page_number}")

    except Exception as e:
        # Callers expect a list, so log the error and return what was collected so far
        print(f"Error while scraping {initial_url}: {e}")
        return all_links_text

    # Each club appears to be linked twice in the result list, so keep every second entry
    all_links_text = all_links_text[0::2]
    return all_links_text

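# process_ort() ties everything together: for every club found in the town it
# fetches the Google results page for "impressum <club name>", has the Groq
# model pull contact details out of the raw page text, then asks the Qwen Space
# to merge all fragments into one valid JSON object that is loaded into a
# pandas DataFrame for display. Note (assumption): Google may answer scripted
# requests with a consent or CAPTCHA page, in which case the extracted text
# will not contain usable contact details.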
def process_ort(ort):
    links_text = list_of_clubs(ort)
    vereine = []

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    }

    for verein in links_text:
        # Fetch the Google results page for the club's Impressum and hand the
        # page text to the LLM for contact extraction
        response = requests.get(
            "https://www.google.com/search",
            params={"q": f"impressum {verein}"},
            headers=headers
        )
        soup = BeautifulSoup(response.content, 'html.parser')
        body = soup.find('body')
        if body is None:
            continue
        contact_details = body.text
        json_object = llm(contact_details)
        vereine.append(json_object)

    # Merge the per-club fragments into one valid JSON object via the Qwen Space,
    # then load the result into a DataFrame
    valid_json = qwen(vereine)
    data_dict = json.loads(valid_json)
    df = pd.DataFrame(data_dict)
    return df

# Create the Gradio interface
with gr.Blocks(css=custom_css) as demo:
    with gr.Row():
        details_output = gr.DataFrame(label="Contact details", elem_id="md")
    with gr.Row():
        ort_input = gr.Textbox(label="Ort", placeholder="Name of a town in Bavaria...")
    with gr.Row():
        button = gr.Button("Senden")

    # Connect the button to the function
    button.click(fn=process_ort, inputs=ort_input, outputs=details_output)

# Launch the Gradio application
demo.launch()