Spaces:

KrishanRao
/

URL

Sleeping

URL

File size: 4,350 Bytes

5c6ed8c

#!/usr/bin/env python
# coding: utf-8

# In[ ]:


import gradio as gr
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from transformers import pipeline
import time

# Set up Selenium with headless Chrome
def setup_driver():
    """Create a Chrome WebDriver configured to run without a visible window.

    Returns:
        A ``webdriver.Chrome`` instance. The caller owns it and must call
        ``quit()`` when done.

    Chrome and a compatible chromedriver must be available on the host.
    """
    options = Options()
    # The ``options.headless = True`` setter was deprecated in Selenium 4.8 and
    # later removed; on releases without it the assignment just creates an
    # unused attribute and Chrome starts with a window, which fails on
    # display-less servers. The command-line switch works on every release.
    options.add_argument("--headless")
    return webdriver.Chrome(options=options)

# Function to extract text from the URL using Selenium
def extract_text(url):
    """Load *url* in a browser and return the text content of the page.

    Args:
        url: Address of the page to fetch.

    Returns:
        The page's text nodes, whitespace-stripped and joined with single
        spaces. On any failure the function does not raise; it returns a
        string beginning with ``"Error extracting text from URL: "`` followed
        by the exception message.
    """
    try:
        driver = setup_driver()
        try:
            driver.get(url)
            time.sleep(3)  # fixed pause to give the page time to finish loading
            page_source = driver.page_source
        finally:
            # Always close the browser. Without this, a failed navigation
            # skipped quit() and left a Chrome process running per bad request.
            driver.quit()

        soup = BeautifulSoup(page_source, "html.parser")
        return ' '.join(soup.stripped_strings)
    except Exception as e:
        return f"Error extracting text from URL: {str(e)}"

# Load Hugging Face model (for extracting named entities or QA)
# ner_model stays None when the pipeline cannot be constructed, so the app can
# still start and report the failure per request instead of crashing on import.
ner_model = None
try:
    ner_model = pipeline(
        "ner",
        model="dbmdz/bert-large-cased-finetuned-conll03-english",
    )
except Exception as load_error:
    print(f"Error loading model: {load_error!s}")

# Function to extract information using Hugging Face model
def _merge_ner_tokens(ner_results):
    """Collapse token-classification output into ``(label, text)`` pairs.

    Accepts both raw per-token results (``entity`` such as ``"I-PER"``, with
    ``##`` word pieces and an ``index`` token position) and aggregated results
    (``entity_group`` such as ``"PER"``). ``B-``/``I-`` prefixes are removed
    from the label.
    """
    groups = []  # each entry: [label, text so far, index of last token or None]
    for item in ner_results:
        tag = item.get("entity_group") or item.get("entity") or item.get("label") or ""
        prefix, dash, bare = tag.partition("-")
        has_iob = bool(dash) and prefix in ("B", "I")
        label = bare if has_iob else tag
        starts_new = has_iob and prefix == "B"

        word = item.get("word") or ""
        is_piece = word.startswith("##")
        index = item.get("index")

        previous = groups[-1] if groups else None
        if previous is not None and index is not None and previous[2] is not None:
            # Untagged tokens are absent from the results, so neighbours in
            # the list are only neighbours in the text if positions are
            # consecutive.
            contiguous = index == previous[2] + 1
        else:
            # Without positions, only word pieces are known continuations.
            contiguous = is_piece

        if previous is not None and previous[0] == label and contiguous and not starts_new:
            previous[1] += word[2:] if is_piece else " " + word
            previous[2] = index
        else:
            groups.append([label, word[2:] if is_piece else word, index])
    return [(label, text) for label, text, _ in groups]


def extract_info_with_model(text):
    """Extract organisation, person and location names from *text* via NER.

    Args:
        text: Plain text to analyse.

    Returns:
        A dict with the keys ``"Keytags"``, ``"Amenities"``, ``"Facilities"``,
        ``"Seller Name"`` and ``"Location Details"``, each mapped to a display
        string:

        * Keytags: the distinct organisation names, comma-separated.
        * Seller Name / Location Details: the last person / location found.
        * Amenities / Facilities: fixed placeholders (not extracted yet).

        If the model failed to load, every value is ``"Model loading
        failed."``; if processing raises, every value is ``"Error processing
        text: <message>"``. The function itself never raises for those cases.
    """
    fields = ("Keytags", "Amenities", "Facilities", "Seller Name", "Location Details")
    if ner_model is None:
        return dict.fromkeys(fields, "Model loading failed.")

    try:
        keytags = []
        seller_name = ""
        location_details = ""

        # The pipeline reports its tag under 'entity' / 'entity_group' (there
        # is no 'label' key), and CoNLL-03 models emit PER / LOC / ORG. The
        # spaCy-style names are accepted too in case the model is swapped.
        for label, name in _merge_ner_tokens(ner_model(text)):
            if label == "ORG":
                keytags.append(name)
            elif label in ("PER", "PERSON"):
                seller_name = name  # the last person mentioned wins
            elif label in ("LOC", "GPE"):
                location_details = name  # the last location mentioned wins

        # Drop repeated organisation mentions while keeping first-seen order.
        keytags = list(dict.fromkeys(keytags))

        return {
            "Keytags": ", ".join(keytags) if keytags else "No keytags found",
            # Amenities and facilities are not extracted yet; fixed placeholders.
            "Amenities": "No amenities found",
            "Facilities": "No facilities found",
            "Seller Name": seller_name if seller_name else "No seller name found",
            "Location Details": location_details if location_details else "No location details found",
        }
    except Exception as e:
        return dict.fromkeys(fields, f"Error processing text: {str(e)}")

# Function to combine the extraction process (from URL + model processing)
def get_info(url):
    """Fetch *url*, run entity extraction on its text and return the fields.

    Args:
        url: Address of the page to analyse.

    Returns:
        A 5-tuple of strings in the order Keytags, Amenities, Facilities,
        Seller Name, Location Details. If the page could not be fetched, all
        five entries hold the fetch error message.
    """
    text = extract_text(url)
    # extract_text signals failure through its return value, always with this
    # prefix. Matching the prefix, rather than looking for "Error" anywhere in
    # the text, stops pages that merely contain the word from being reported
    # as failed fetches.
    if text.startswith("Error extracting text from URL:"):
        return (text,) * 5

    extracted_info = extract_info_with_model(text)
    field_order = (
        "Keytags",
        "Amenities",
        "Facilities",
        "Seller Name",
        "Location Details",
    )
    return tuple(extracted_info[field] for field in field_order)

# Gradio Interface to allow user input and display output
demo = gr.Interface(
    title="Real Estate Info Extractor",
    description="Extract Keytags, Amenities, Facilities, Seller Name, and Location Details from a real estate article URL.",
    fn=get_info,
    # A single free-text box that receives the URL.
    inputs="text",
    # One text output per value in the 5-tuple that get_info returns.
    outputs=["text"] * 5,
)

# Start the web UI only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch(show_api=False)