#!/usr/bin/env python
# coding: utf-8

# In[ ]:


import gradio as gr
import requests
from bs4 import BeautifulSoup
from transformers import pipeline


# Function to extract text from the URL using requests
def extract_text(url):
    try:
        # Enhanced headers to simulate a real browser request
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Referer': 'https://www.mansionglobal.com/',  # A referer can help bypass simple bot checks
            'Upgrade-Insecure-Requests': '1',
            'Cache-Control': 'max-age=0'
        }

        # Send the GET request with headers and fail fast on bad status codes
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()

        # Parse the HTML and collapse it to plain text
        soup = BeautifulSoup(response.text, "html.parser")
        text = ' '.join(soup.stripped_strings)
        return text
    except requests.exceptions.RequestException as e:
        return f"Error extracting text from URL: {str(e)}"


# Load the Hugging Face NER model. aggregation_strategy="simple" merges
# word-piece tokens back into whole entities and exposes an 'entity_group' key.
try:
    ner_model = pipeline(
        "ner",
        model="dbmdz/bert-large-cased-finetuned-conll03-english",
        aggregation_strategy="simple"
    )
except Exception as e:
    ner_model = None
    print(f"Error loading model: {str(e)}")


# Function to extract information using the Hugging Face model
def extract_info_with_model(text):
    if not ner_model:
        return {
            "Keytags": "Model loading failed.",
            "Amenities": "Model loading failed.",
            "Facilities": "Model loading failed.",
            "Seller Name": "Model loading failed.",
            "Location Details": "Model loading failed."
        }
    try:
        # BERT models are capped at 512 tokens, so truncate the page text
        # (a rough character-based cap) to keep the pipeline from erroring
        ner_results = ner_model(text[:2000])

        # Initialize result fields
        keytags = []
        seller_name = ""
        location_details = ""

        # Map CoNLL-03 entity groups onto the output fields. This model tags
        # PER/ORG/LOC/MISC; it has no 'PERSON' or 'GPE' label (those belong
        # to spaCy's scheme, not this checkpoint).
        for entity in ner_results:
            if entity['entity_group'] == 'ORG':
                keytags.append(entity['word'])  # Organizations double as keytags here
            elif entity['entity_group'] == 'PER':
                seller_name = entity['word']  # Treat any person mentioned as the seller
            elif entity['entity_group'] == 'LOC':
                location_details = entity['word']  # Locations map to location details

        # NER alone cannot recover amenities/facilities; a question-answering
        # model could (see the sketch below), so these stay as placeholders
        amenities = "No amenities found"
        facilities = "No facilities found"

        return {
            "Keytags": ", ".join(keytags) if keytags else "No keytags found",
            "Amenities": amenities,
            "Facilities": facilities,
            "Seller Name": seller_name if seller_name else "No seller name found",
            "Location Details": location_details if location_details else "No location details found"
        }
    except Exception as e:
        error = f"Error processing text: {str(e)}"
        return {
            "Keytags": error,
            "Amenities": error,
            "Facilities": error,
            "Seller Name": error,
            "Location Details": error
        }


# Combine the extraction steps: fetch the page text, then run the model
def get_info(url):
    text = extract_text(url)
    if text.startswith("Error"):
        return text, text, text, text, text  # Propagate the error to all outputs
    extracted_info = extract_info_with_model(text)
    return (
        extracted_info["Keytags"],
        extracted_info["Amenities"],
        extracted_info["Facilities"],
        extracted_info["Seller Name"],
        extracted_info["Location Details"]
    )
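
# A minimal sketch of how a question-answering model could fill in the
# amenities/facilities placeholders above, as the comment there suggests.
# The checkpoint and questions are assumptions, not part of the original
# script; the helper is defined but never called, so the app's behavior
# is unchanged unless you wire it in.
def extract_field_with_qa(question, text):
    # Build the QA pipeline lazily so the extra model only loads if used
    qa_model = pipeline("question-answering", model="deepset/roberta-base-squad2")
    # Extractive QA returns the span of `text` that best answers `question`,
    # plus a confidence score we can threshold on
    result = qa_model(question=question, context=text[:4000])
    return result["answer"] if result["score"] > 0.3 else "Not found"

# Hypothetical usage inside extract_info_with_model:
#   amenities = extract_field_with_qa("What amenities does the property have?", text)
#   facilities = extract_field_with_qa("What facilities are available?", text)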
# Gradio interface to allow user input and display output
demo = gr.Interface(
    fn=get_info,
    inputs="text",  # Input is a URL
    outputs=["text", "text", "text", "text", "text"],  # One output per field (Keytags, Amenities, etc.)
    title="Real Estate Info Extractor",
    description="Extract Keytags, Amenities, Facilities, Seller Name, and Location Details from a real estate article URL."
)

if __name__ == "__main__":
    demo.launch(show_api=False)
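
# Quick smoke test without launching the UI (a sketch; the URL below is a
# placeholder, not a known-working article):
#   keytags, amenities, facilities, seller, location = get_info(
#       "https://www.example.com/real-estate-article")
#   print(keytags, "|", seller, "|", location)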