Spaces:
Sleeping
Sleeping
#!/usr/bin/env python
# coding: utf-8
# In[ ]:
import gradio as gr | |
from selenium import webdriver | |
from selenium.webdriver.chrome.options import Options | |
from bs4 import BeautifulSoup | |
from transformers import pipeline | |
import time | |
# Set up Selenium with headless Chrome
def setup_driver():
    """Create a Chrome WebDriver that runs without a visible window.

    Returns:
        A new ``webdriver.Chrome`` instance. The caller owns it and must call
        ``quit()`` when finished.
    """
    options = Options()
    # The ``options.headless = True`` setter was deprecated and later removed
    # from Selenium 4; on recent releases the assignment silently does nothing
    # and Chrome tries to open a window. The command-line switch works on
    # every Selenium/Chrome combination.
    options.add_argument("--headless")
    # Make sure a Chrome browser (and a matching chromedriver) is available.
    return webdriver.Chrome(options=options)
# Function to extract text from the URL using Selenium
def extract_text(url):
    """Load *url* in headless Chrome and return the page's visible text.

    Args:
        url: Address of the page to fetch.

    Returns:
        All stripped text fragments of the rendered page joined by single
        spaces. On any failure, a message starting with
        ``"Error extracting text from URL: "`` is returned instead of raising.
    """
    try:
        driver = setup_driver()
        try:
            driver.get(url)
            time.sleep(3)  # Give client-side scripts time to render the page
            page_source = driver.page_source
        finally:
            # Always shut the browser down, even when navigation fails;
            # otherwise every failed request leaks a Chrome process.
            driver.quit()
        soup = BeautifulSoup(page_source, "html.parser")
        return ' '.join(soup.stripped_strings)
    except Exception as e:
        return f"Error extracting text from URL: {str(e)}"
# Load the Hugging Face NER pipeline once at import time. If loading fails for
# any reason the failure is printed and ner_model stays None.
ner_model = None
try:
    ner_model = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english")
except Exception as load_error:
    print(f"Error loading model: {str(load_error)}")
# Helper: collapse token-level NER output into (label, phrase) pairs
def _merge_ner_tokens(ner_results):
    """Merge word pieces and multi-token entities from NER pipeline output.

    Each item is expected to be a dict with a ``word`` and a tag stored under
    ``entity_group`` (aggregated output) or ``entity`` (token-level output);
    ``label`` is accepted as a fallback. ``B-``/``I-`` prefixes are stripped.
    """
    merged = []
    prev_index = None
    for item in ner_results:
        raw_label = item.get("entity_group") or item.get("entity") or item.get("label") or ""
        label = raw_label[2:] if raw_label.startswith(("B-", "I-")) else raw_label
        word = item.get("word", "")
        index = item.get("index")
        # Only glue tokens together when they are next to each other in the
        # text; when no token index is reported, assume they are.
        adjacent = bool(merged) and (
            index is None or prev_index is None or index == prev_index + 1
        )
        if word.startswith("##"):
            # Word-piece continuation: belongs to the previous token.
            piece = word[2:]
            if adjacent:
                merged[-1][1] += piece
            else:
                merged.append([label, piece])
        elif adjacent and raw_label.startswith("I-") and merged[-1][0] == label:
            # Next word of the same multi-word entity.
            merged[-1][1] += " " + word
        else:
            merged.append([label, word])
        prev_index = index
    return [tuple(entry) for entry in merged]


# Function to extract information using Hugging Face model
def extract_info_with_model(text, model=None):
    """Run named-entity recognition over *text* and fill the listing fields.

    Organisations become keytags, the last person mentioned becomes the
    seller name and the last location mentioned becomes the location details.
    Amenities and facilities are placeholders; NER cannot identify them.

    Args:
        text: Plain text to analyse.
        model: Optional callable taking the text and returning NER results.
            Defaults to the module-level ``ner_model``.

    Returns:
        A dict with the keys ``Keytags``, ``Amenities``, ``Facilities``,
        ``Seller Name`` and ``Location Details``. If the model is unavailable
        or processing fails, every value holds the corresponding message.
    """
    fields = ("Keytags", "Amenities", "Facilities", "Seller Name", "Location Details")
    if model is None:
        model = ner_model
    if not model:
        return dict.fromkeys(fields, "Model loading failed.")
    try:
        entities = _merge_ner_tokens(model(text))
    except Exception as e:
        return dict.fromkeys(fields, f"Error processing text: {str(e)}")

    keytags = []
    seller_name = ""
    location_details = ""
    for label, phrase in entities:
        if label == "ORG":
            if phrase not in keytags:  # avoid repeating the same organisation
                keytags.append(phrase)
        elif label in ("PER", "PERSON"):
            seller_name = phrase
        elif label in ("LOC", "GPE"):
            location_details = phrase

    return {
        "Keytags": ", ".join(keytags) if keytags else "No keytags found",
        "Amenities": "No amenities found",  # Placeholder for the amenities
        "Facilities": "No facilities found",  # Placeholder for the facilities
        "Seller Name": seller_name if seller_name else "No seller name found",
        "Location Details": location_details if location_details else "No location details found",
    }
# Function to combine the extraction process (from URL + model processing)
def get_info(url):
    """Fetch *url*, run entity extraction and return the five display fields.

    Args:
        url: Address of the real estate article to analyse.

    Returns:
        A 5-tuple ``(keytags, amenities, facilities, seller_name,
        location_details)`` of strings. If the page could not be fetched,
        all five entries hold the fetch error message.
    """
    text = extract_text(url)
    # extract_text reports failure with this exact prefix. Testing for the
    # bare word "Error" anywhere in the text would reject every page that
    # merely mentions it.
    if text.startswith("Error extracting text from URL:"):
        return text, text, text, text, text  # Return the error message for all outputs
    extracted_info = extract_info_with_model(text)
    return (
        extracted_info["Keytags"],
        extracted_info["Amenities"],
        extracted_info["Facilities"],
        extracted_info["Seller Name"],
        extracted_info["Location Details"],
    )
# Gradio front end: a single text input carrying the URL, and five text
# outputs in the order get_info returns them (Keytags, Amenities, Facilities,
# Seller Name, Location Details).
demo = gr.Interface(
    title="Real Estate Info Extractor",
    description="Extract Keytags, Amenities, Facilities, Seller Name, and Location Details from a real estate article URL.",
    fn=get_info,
    inputs="text",
    outputs=["text"] * 5,
)

# Start the web UI only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch(show_api=False)