Address-Lookup / app.py
OttoYu's picture
Update app.py
b98028a verified
raw
history blame
8.15 kB
import gradio as gr
import jieba
import jieba.analyse
import aiohttp
import asyncio
import ssl
from aiohttp import ClientSession
from functools import lru_cache
# Shared TLS context handed to the HTTP request in lookup_address().
# Hostname checking and certificate verification are both switched off below,
# so the server's identity is NOT validated for requests using this context.
# NOTE(review): this leaves the lookup open to man-in-the-middle tampering;
# presumably a workaround for a certificate-chain problem - confirm whether it
# is still required before keeping it.
ssl_context = ssl.create_default_context()
ssl_context.check_hostname = False
ssl_context.verify_mode = ssl.CERT_NONE
# Three-level gazetteer: region -> district -> list of place names.
# is_valid_for_lookup() consults the district keys and the place-name lists to
# reject keywords that are only an area name; the region keys themselves are
# not consulted there.
area_data = {
    '香港': {
        '中西區': ["西環", "堅尼地城", "石塘咀", "西營盤", "上環", "中環", "金鐘", "西半山", "中半山", "半山", "山頂"],
        '灣仔': ["灣仔", "銅鑼灣", "跑馬地", "大坑", "掃桿埔", "渣甸山"],
        '東區': ["天后", "寶馬山", "北角", "鰂魚涌", "西灣河", "筲箕灣", "柴灣", "小西灣"],
        '南區': ["薄扶林", "香港仔", "鴨脷洲", "黃竹坑", "壽臣山", "淺水灣", "舂磡角", "赤柱", "大潭", "石澳", "田灣"]
    },
    '九龍': {
        '油尖旺': ["尖沙咀", "油麻地", "西九龍", "京士柏", "旺角", "大角咀", "佐敦", "太子"],
        '深水埗': ["美孚", "荔枝角", "長沙灣", "深水埗", "石硤尾", "又一村", "大窩坪", "昂船洲"],
        '九龍城': ["紅磡", "土瓜灣", "馬頭角", "馬頭圍", "啟德", "九龍城", "何文田", "九龍塘", "筆架山"],
        '黃大仙': ["新蒲崗", "黃大仙", "東頭", "橫頭磡", "樂富", "鑽石山", "慈雲山", "牛池灣"],
        '觀塘': ["坪石", "九龍灣", "牛頭角", "佐敦谷", "觀塘", "秀茂坪", "藍田", "油塘", "鯉魚門"]
    },
    '新界': {
        '葵青': ["葵涌", "青衣", "葵芳"],
        '荃灣': ["荃灣", "梨木樹", "汀九", "深井", "青龍頭", "馬灣", "欣澳"],
        '屯門': ["大欖涌", "掃管笏", "屯門", "藍地"],
        '元朗': ["洪水橋", "廈村", "流浮山", "天水圍", "元朗", "新田", "落馬洲", "錦田", "石崗", "八鄉"],
        '北區': ["粉嶺", "聯和墟", "上水", "石湖墟", "沙頭角", "鹿頸", "烏蛟騰"],
        '大埔': ["大埔墟", "大埔", "大埔滘", "大尾篤", "船灣", "樟木頭", "企嶺下", "太和"],
        '沙田': ["大圍", "沙田", "火炭", "馬料水", "烏溪沙", "馬鞍山"],
        '西貢': ["清水灣", "西貢", "大網仔", "將軍澳", "坑口", "調景嶺", "馬游塘"],
        '離島': ["長洲", "坪洲", "大嶼山", "東涌", "南丫島"]
    }
}
@lru_cache(maxsize=None)
def load_user_dict_terms():
    """Return the set of leading terms found in the user dictionary files.

    Each non-blank line contributes its first whitespace-separated field.
    A file that is missing or unreadable is reported with ``print`` and
    skipped, so the result may be partial (or empty) rather than an error.

    The result is memoised, so the files are read once per process; callers
    share the same set object and should treat it as read-only.

    Returns:
        set[str]: every term collected from the readable dictionary files.
    """
    user_dict_terms = set()
    dictionaries = [
        'flag/RVT_AddressCh.txt',
        'flag/RVT_AddressEn.txt',
        'flag/RVT_Area.txt',
        'flag/RVT_BuildingCh.txt',
        'flag/RVT_BuildingEn.txt'
    ]
    for file_path in dictionaries:
        try:
            # 'utf-8-sig' reads plain UTF-8 unchanged but drops a leading BOM,
            # which would otherwise be glued onto the first term of the file.
            with open(file_path, 'r', encoding='utf-8-sig') as f:
                for line in f:
                    fields = line.split()
                    # A blank line has no first field. Indexing it used to
                    # raise IndexError, which the broad handler below caught,
                    # silently discarding the remainder of the file.
                    if fields:
                        user_dict_terms.add(fields[0])
        except FileNotFoundError:
            print(f'File not found: {file_path}')
        except Exception as e:
            print(f'Error reading file {file_path}: {e}')
    return user_dict_terms
@lru_cache(maxsize=None)
def setup_jieba_dictionaries():
    """Register the user dictionary files with jieba, once per process.

    process_text() calls this on every request; memoising the call avoids
    re-reading and re-registering the same five files each time.

    A missing file is reported with ``print`` and skipped, matching how
    load_user_dict_terms() treats the same files, instead of aborting the
    whole request.
    """
    dictionaries = [
        'flag/RVT_AddressCh.txt',
        'flag/RVT_AddressEn.txt',
        'flag/RVT_Area.txt',
        'flag/RVT_BuildingCh.txt',
        'flag/RVT_BuildingEn.txt'
    ]
    for file_path in dictionaries:
        try:
            jieba.load_userdict(file_path)
        except FileNotFoundError:
            print(f'File not found: {file_path}')
def process_text(text):
    """Extract dictionary-backed keywords from each line of *text*.

    Every input line (blank ones included) yields exactly one output string:
    the TextRank keywords of that line that also appear in the user
    dictionary terms, joined by single spaces. Lines with no surviving
    keyword yield an empty string.

    Returns:
        list[str]: one keyword string per input line, in input order.
    """
    setup_jieba_dictionaries()
    known_terms = load_user_dict_terms()
    extracted = []
    for raw_line in text.splitlines():
        # Ask for at most 20 unweighted keywords, limited to the listed
        # part-of-speech tags.
        candidates = jieba.analyse.textrank(
            raw_line.strip(),
            topK=20,
            withWeight=False,
            allowPOS=('ns', 'n', 'vn', 'v'),
        )
        kept = [term for term in candidates if term in known_terms]
        extracted.append(' '.join(kept))
    return extracted
def reformat_text(text):
    """Split *text* into lines, trim each one, and drop those left empty.

    Returns:
        list[str]: the stripped, non-empty lines in their original order.
    """
    cleaned = []
    for raw_line in text.splitlines():
        trimmed = raw_line.strip()
        if trimmed:
            cleaned.append(trimmed)
    return cleaned
def process_text_only(text, reformat):
    """Run keyword extraction on *text* and return the result as one string.

    Args:
        text: multi-line input; each line is processed separately.
        reformat: when truthy, empty result lines are removed via
            reformat_text(); otherwise one output line is kept per input line.

    Returns:
        str: the per-line keyword strings joined with newlines.
    """
    keyword_lines = process_text(text)
    if not reformat:
        return '\n'.join(keyword_lines)
    compacted = reformat_text('\n'.join(keyword_lines))
    return '\n'.join(compacted)
async def lookup_address(query, language='zh-Hant'):
    """POST *query* to the ALS lookup endpoint and return the decoded JSON.

    Args:
        query: the text sent as the ``q`` form field.
        language: value for the ``Accept-Language`` request header.

    Returns:
        The parsed JSON body on HTTP 200. On any other status, or when the
        request raises, a dict of the form ``{'error': <message>}``; the same
        message is also printed. Exceptions are not propagated to the caller.
    """
    endpoint = 'https://www.als.gov.hk/lookup'
    request_headers = {'Accept': 'application/json', 'Accept-Language': language}
    form_fields = {'q': query}

    # A fresh session is opened for every call and closed on exit.
    async with ClientSession() as session:
        try:
            async with session.post(
                endpoint,
                headers=request_headers,
                data=form_fields,
                ssl=ssl_context,
            ) as response:
                if response.status != 200:
                    failure = f'Error fetching data: Status Code {response.status}'
                    print(failure)
                    return {'error': failure}
                return await response.json()
        except aiohttp.ClientError as e:
            failure = f'Client Error: {e}'
            print(failure)
            return {'error': failure}
        except Exception as e:
            failure = f'General Error: {e}'
            print(failure)
            return {'error': failure}
def _summarise_first_match(keyword, lookup_result):
    """Build one result row from the first suggested address of a lookup.

    Returns a dict holding 'Keyword' plus whichever of 'Full Address',
    'Geo Address', 'Latitude' and 'Longitude' the response supplied, or
    ``None`` when the response carries nothing beyond the keyword itself.
    """
    if not isinstance(lookup_result, dict):
        return None
    suggestions = lookup_result.get('SuggestedAddress')
    # An empty suggestion list used to raise IndexError on [0].
    if not isinstance(suggestions, list) or not suggestions:
        return None

    first_match = suggestions[0]
    result = {'Keyword': keyword}
    if first_match:
        premises_address = first_match['Address']['PremisesAddress']

        raw_address = premises_address.get('ChiPremisesAddress', {})
        matched_building = raw_address.get('BuildingName', 'No Building Name')
        if matched_building != 'No Building Name':
            # The row key says 'Full Address' but the value stored is the
            # building name; kept as-is because consumers see this key.
            result['Full Address'] = matched_building

        geo_address = premises_address.get('GeoAddress')
        if geo_address and geo_address != 'N/A':
            result['Geo Address'] = geo_address

        geo_info = premises_address.get('GeospatialInformation', {})
        latitude = geo_info.get('Latitude')
        longitude = geo_info.get('Longitude')
        if latitude and latitude != 'N/A':
            result['Latitude'] = latitude
        if longitude and longitude != 'N/A':
            result['Longitude'] = longitude

    # Only keep rows that carry more than just the Keyword.
    return result if len(result) > 1 else None


async def get_address_lookup_results(keywords):
    """Look up every usable keyword concurrently and summarise the matches.

    Blank keywords and keywords rejected by is_valid_for_lookup() are not
    sent. Responses without a usable first suggestion (including the
    ``{'error': ...}`` dicts from lookup_address()) produce no row.

    Args:
        keywords: iterable of keyword strings; each is stripped before use.

    Returns:
        list[dict]: one row per keyword that produced address data, in the
        order the keywords were given.
    """
    # Remember exactly which keywords were sent. Pairing the responses with
    # the unfiltered input list attributed results to the wrong keyword
    # whenever an earlier entry had been skipped.
    queried = []
    for keyword in keywords:
        keyword = keyword.strip()
        if keyword and is_valid_for_lookup(keyword):
            queried.append(keyword)

    lookup_results = await asyncio.gather(
        *(lookup_address(keyword) for keyword in queried)
    )

    results = []
    for keyword, lookup_result in zip(queried, lookup_results):
        row = _summarise_first_match(keyword, lookup_result)
        if row is not None:
            results.append(row)
    return results
async def gradio_function(text, reformat, perform_lookup):
    """Extract keywords from *text* and, if requested, look them up.

    Args:
        text: multi-line input text.
        reformat: forwarded to process_text_only().
        perform_lookup: when truthy, each extracted keyword line is passed to
            get_address_lookup_results().

    Returns:
        tuple: ``(keyword_text, address_results)`` where ``keyword_text`` is
        the newline-joined keyword string and ``address_results`` is a list
        (empty when no lookup was requested).
    """
    keyword_text = process_text_only(text, reformat)
    if not perform_lookup:
        return keyword_text, []
    lookups = await get_address_lookup_results(keyword_text.splitlines())
    return keyword_text, lookups
def is_valid_for_lookup(keyword):
    """Tell whether *keyword* is specific enough to send to the lookup.

    Returns False when the keyword exactly equals a district name or one of
    the place names listed under any district in ``area_data``; True
    otherwise. Region names (the top-level keys) are not checked.
    """
    for districts in area_data.values():
        if keyword in districts:
            return False
        if any(keyword in places for places in districts.values()):
            return False
    return True
def gradio_interface(text, reformat, perform_lookup):
    """Synchronous entry point: run gradio_function() to completion.

    Starts an event loop with ``asyncio.run`` for each call and returns
    whatever the coroutine returns.
    """
    pipeline = gradio_function(text, reformat, perform_lookup)
    return asyncio.run(pipeline)
# Gradio UI wiring. The three inputs are passed positionally to
# gradio_interface(text, reformat, perform_lookup); the two outputs receive
# the (keyword text, lookup results) pair it returns.
interface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Textbox(lines=20, placeholder="Paste text here, each line will be processed separately..."),
        gr.Checkbox(label="Reformat text (remove empty lines)"),
        gr.Checkbox(label="Perform Address Lookup")
    ],
    outputs=[
        gr.Textbox(label="Extracted Address Keywords"),
        gr.JSON(label="Address Lookup Results")
    ],
    title="Address Extraction and Lookup with Natural Language Processing",
    description="Extract address keywords using NLP and optionally perform address lookup using ALS."
)
# Launched at import time: importing this module starts the app.
interface.launch()