#!/usr/bin/env python
# coding: utf-8
# In[ ]:
import os

import gradio as gr
import requests
from bs4 import BeautifulSoup
from transformers import pipeline
# Function to extract text from the URL using requests
def extract_text(url, timeout=15):
    """Download a web page and return its visible text as a single string.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so one unresponsive host
            would block the caller forever.

    Returns:
        The page's text nodes, stripped and joined with single spaces. When
        the request fails (connection problem, timeout, invalid URL, non-2xx
        status) a message starting with ``"Error extracting text from URL: "``
        is returned instead of raising.
    """
    # Browser-like headers: some sites reject the default requests User-Agent.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept-Language': 'en-US,en;q=0.9',
        # "br" is deliberately not advertised: requests can only decode Brotli
        # when an optional Brotli package is installed; otherwise the body
        # would come back as undecoded bytes and parse as garbage.
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
        'Referer': 'https://www.mansionglobal.com/',  # Adding referer might help bypass restrictions
        'Upgrade-Insecure-Requests': '1',  # Can sometimes help
        'Cache-Control': 'max-age=0',
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        # Turn 4xx/5xx responses into an exception handled below.
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        return f"Error extracting text from URL: {str(e)}"

    # Parse HTML and keep only the non-empty text nodes.
    soup = BeautifulSoup(response.text, "html.parser")
    return ' '.join(soup.stripped_strings)
# Load Hugging Face model (for extracting named entities or QA)
# This runs once at import time; the resulting pipeline is stored in the
# module-level name ``ner_model`` (or None when loading fails).
try:
    # aggregation_strategy="simple" asks the pipeline to merge word-piece
    # tokens into whole entities (reported under the "entity_group" key)
    # instead of returning one result per sub-word token such as "##son".
    ner_model = pipeline(
        "ner",
        model="dbmdz/bert-large-cased-finetuned-conll03-english",
        aggregation_strategy="simple",
    )
except Exception as e:
    # Deliberately broad: any loading failure leaves the module importable
    # with ner_model set to None, and the reason is printed.
    ner_model = None
    print(f"Error loading model: {str(e)}")
# Names of the result fields, in the order they are reported.
_INFO_FIELDS = ("Keytags", "Amenities", "Facilities", "Seller Name", "Location Details")


def _merge_ner_entities(ner_results):
    """Normalize raw NER pipeline output into ``(category, text)`` pairs.

    Accepts both aggregated results (label under ``"entity_group"``) and
    per-token results (label under ``"entity"``, e.g. ``"I-PER"``). For
    per-token results, word pieces (``"##..."``) and consecutive ``I-`` tokens
    of the same category are joined back into one entity.
    """
    merged = []  # [category, text] pairs; lists so text can be extended
    last_index = None
    for entity in ner_results:
        raw_label = (
            entity.get("entity_group")
            or entity.get("entity")
            or entity.get("label")
            or ""
        )
        word = entity.get("word") or ""
        index = entity.get("index")

        # "I-PER" -> prefix "I", category "PER"; "PER" -> prefix "", category "PER".
        prefix, _, category = raw_label.rpartition("-")
        is_piece = word.startswith("##")
        # Only join tokens that sit next to each other when positions are known.
        adjacent = index is None or last_index is None or index == last_index + 1
        same_span = (
            bool(merged)
            and merged[-1][0] == category
            and adjacent
            and (is_piece or prefix == "I")
        )

        if is_piece:
            word = word[2:]
        if same_span:
            merged[-1][1] += word if is_piece else f" {word}"
        else:
            merged.append([category, word])
        last_index = index

    return [(category, text) for category, text in merged if text]


# Function to extract information using Hugging Face model
def extract_info_with_model(text):
    """Extract key tags, seller name and location from ``text`` using NER.

    Organisations become key tags (duplicates removed, first-seen order), the
    last person mentioned is reported as the seller, and the last location
    mentioned as the location details. Amenities and facilities are
    placeholders: no extraction is implemented for them.

    Args:
        text: Plain text to analyse.

    Returns:
        A dict with the keys "Keytags", "Amenities", "Facilities",
        "Seller Name" and "Location Details", each mapping to a display
        string. If the model is unavailable or processing raises, every
        value carries the corresponding failure message instead.
    """
    if not ner_model:
        return {field: "Model loading failed." for field in _INFO_FIELDS}

    try:
        # Apply named entity recognition (NER) to extract entities from the text
        ner_results = ner_model(text)

        keytags = []
        seller_name = ""
        location_details = ""
        for category, entity_text in _merge_ner_entities(ner_results):
            if category == "ORG":
                # Example: Company or key term (this can be changed)
                if entity_text not in keytags:
                    keytags.append(entity_text)
            elif category in ("PER", "PERSON"):
                # If a person is mentioned, consider it the seller name
                seller_name = entity_text
            elif category in ("LOC", "GPE"):
                # CoNLL-03 models tag places as LOC; GPE is kept for other label sets
                location_details = entity_text

        # For amenities and facilities, you can modify the logic or use
        # additional models (e.g., question-answering models)
        amenities = "No amenities found"  # Placeholder for the amenities
        facilities = "No facilities found"  # Placeholder for the facilities

        return {
            "Keytags": ", ".join(keytags) if keytags else "No keytags found",
            "Amenities": amenities,
            "Facilities": facilities,
            "Seller Name": seller_name if seller_name else "No seller name found",
            "Location Details": location_details if location_details else "No location details found",
        }
    except Exception as e:
        # Boundary for the UI: report the failure in every field rather than crash.
        message = f"Error processing text: {str(e)}"
        return {field: message for field in _INFO_FIELDS}
# Function to combine the extraction process (from URL + model processing)
def get_info(url):
    """Fetch ``url`` and return the five extracted fields as a tuple.

    Args:
        url: Address of the article to analyse.

    Returns:
        A 5-tuple of strings: (Keytags, Amenities, Facilities, Seller Name,
        Location Details). If the page could not be downloaded, the download
        error message is repeated in all five positions.
    """
    text = extract_text(url)

    # extract_text reports failures as a message with this exact prefix.
    # Matching the prefix (rather than the word "Error" anywhere) keeps pages
    # that merely contain the word "Error" from being treated as failures.
    if text.startswith("Error extracting text from URL:"):
        return text, text, text, text, text  # Return the error message for all outputs

    extracted_info = extract_info_with_model(text)
    return (
        extracted_info["Keytags"],
        extracted_info["Amenities"],
        extracted_info["Facilities"],
        extracted_info["Seller Name"],
        extracted_info["Location Details"],
    )
# Gradio Interface to allow user input and display output
# Each output box is labelled so the five results can be told apart; the
# order must match the tuple returned by get_info.
demo = gr.Interface(
    fn=get_info,
    inputs=gr.Textbox(label="Article URL"),  # Input is a URL
    outputs=[
        gr.Textbox(label="Keytags"),
        gr.Textbox(label="Amenities"),
        gr.Textbox(label="Facilities"),
        gr.Textbox(label="Seller Name"),
        gr.Textbox(label="Location Details"),
    ],
    title="Real Estate Info Extractor",
    description="Extract Keytags, Amenities, Facilities, Seller Name, and Location Details from a real estate article URL.",
)

# Start the web UI only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch(show_api=False)