import gradio as gr
import jieba
import jieba.analyse
import aiohttp
import asyncio
import ssl
from functools import lru_cache

# SSL context for the ALS requests. Hostname and certificate verification are
# disabled, so HTTPS connections to the lookup service are not authenticated.
ssl_context = ssl.create_default_context()
ssl_context.check_hostname = False
ssl_context.verify_mode = ssl.CERT_NONE

area_data = {
    '香港': {
        '中西區': ["西環", "堅尼地城", "石塘咀", "西營盤", "上環", "中環", "金鐘", "西半山", "中半山", "半山", "山頂"],
        '灣仔': ["灣仔", "銅鑼灣", "跑馬地", "大坑", "掃桿埔", "渣甸山"],
        '東區': ["天后", "寶馬山", "北角", "鰂魚涌", "西灣河", "筲箕灣", "柴灣", "小西灣"],
        '南區': ["薄扶林", "香港仔", "鴨脷洲", "黃竹坑", "壽臣山", "淺水灣", "舂磡角", "赤柱", "大潭", "石澳", "田灣"]
    },
    '九龍': {
        '油尖旺': ["尖沙咀", "油麻地", "西九龍", "京士柏", "旺角", "大角咀", "佐敦", "太子"],
        '深水埗': ["美孚", "荔枝角", "長沙灣", "深水埗", "石硤尾", "又一村", "大窩坪", "昂船洲"],
        '九龍城': ["紅磡", "土瓜灣", "馬頭角", "馬頭圍", "啟德", "九龍城", "何文田", "九龍塘", "筆架山"],
        '黃大仙': ["新蒲崗", "黃大仙", "東頭", "橫頭磡", "樂富", "鑽石山", "慈雲山", "牛池灣"],
        '觀塘': ["坪石", "九龍灣", "牛頭角", "佐敦谷", "觀塘", "秀茂坪", "藍田", "油塘", "鯉魚門"]
    },
    '新界': {
        '葵青': ["葵涌", "青衣", "葵芳"],
        '荃灣': ["荃灣", "梨木樹", "汀九", "深井", "青龍頭", "馬灣", "欣澳"],
        '屯門': ["大欖涌", "掃管笏", "屯門", "藍地"],
        '元朗': ["洪水橋", "廈村", "流浮山", "天水圍", "元朗", "新田", "落馬洲", "錦田", "石崗", "八鄉"],
        '北區': ["粉嶺", "聯和墟", "上水", "石湖墟", "沙頭角", "鹿頸", "烏蛟騰"],
        '大埔': ["大埔墟", "大埔", "大埔滘", "大尾篤", "船灣", "樟木頭", "企嶺下", "太和"],
        '沙田': ["大圍", "沙田", "火炭", "馬料水", "烏溪沙", "馬鞍山"],
        '西貢': ["清水灣", "西貢", "大網仔", "將軍澳", "坑口", "調景嶺", "馬游塘"],
        '離島': ["長洲", "坪洲", "大嶼山", "東涌", "南丫島"]
    }
}

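# area_data maps region -> district -> list of sub-district/place names. It is
# only consulted by is_valid_for_lookup() below to skip keywords that are bare
# district or sub-district names (too coarse for a meaningful ALS lookup).
# For example, '旺角' appears under 九龍 / 油尖旺 and is therefore skipped,
# while street or building names absent from these lists are looked up.
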
@lru_cache(maxsize=1)
def load_user_dict_terms():
    """Collect every term from the custom dictionaries into a set (cached after the first call)."""
    user_dict_terms = set()
    dictionaries = [
        'flag/RVT_AddressCh.txt', 'flag/RVT_AddressEn.txt', 'flag/RVT_Area.txt',
        'flag/RVT_BuildingCh.txt', 'flag/RVT_BuildingEn.txt', 'flag/ChiVillage.txt',
        'flag/ChiEstate.txt', 'flag/ChiStreet.txt', 'flag/ChiBuilding.txt'
    ]
    for file_path in dictionaries:
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                # Keep only the term itself (the first whitespace-separated field); skip blank lines.
                user_dict_terms.update(line.split()[0] for line in f if line.strip())
        except FileNotFoundError:
            print(f'File not found: {file_path}')
        except Exception as e:
            print(f'Error reading file {file_path}: {e}')
    return user_dict_terms

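# The files above are expected to follow jieba's userdict format: one entry per
# line, "word [frequency] [POS tag]", with frequency and POS optional. Only the
# word itself is kept here; the same files are handed to jieba.load_userdict()
# below so the segmenter treats each place name as a single token.
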
def setup_jieba_dictionaries():
    """Register the custom dictionaries with jieba so multi-character place names stay intact."""
    dictionaries = [
        'flag/RVT_AddressCh.txt', 'flag/RVT_AddressEn.txt', 'flag/RVT_Area.txt',
        'flag/RVT_BuildingCh.txt', 'flag/RVT_BuildingEn.txt', 'flag/ChiVillage.txt',
        'flag/ChiEstate.txt', 'flag/ChiStreet.txt', 'flag/ChiBuilding.txt'
    ]
    for file_path in dictionaries:
        try:
            jieba.load_userdict(file_path)
        except FileNotFoundError:
            print(f'File not found: {file_path}')

def process_text(text):
    setup_jieba_dictionaries()
    user_dict_terms = load_user_dict_terms()
    results = []
    for line in text.splitlines():
        line = line.strip()
        # Rank candidate keywords with TextRank (nouns, place names, verbs),
        # then keep only the terms that appear in the custom dictionaries.
        keywords = jieba.analyse.textrank(line, topK=20, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v'))
        keyword_text = ' '.join(keyword for keyword in keywords if keyword in user_dict_terms)
        results.append(keyword_text)
    return results

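# Note: process_text returns one entry per input line; a line yields an empty
# string when none of its TextRank keywords appear in the custom dictionaries.
# Illustrative example (assuming the street and sub-district exist in the flag/
# files): a line such as "旺角彌敦道100號" might come back as "旺角 彌敦道".
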
def reformat_text(text):
    return [line.strip() for line in text.splitlines() if line.strip()]

def process_text_only(text, reformat):
    extracted_keywords = process_text(text)
    if reformat:
        extracted_keywords = reformat_text('\n'.join(extracted_keywords))
    return '\n'.join(extracted_keywords)

async def lookup_address(query, language='zh-Hant'):
    """Query the Hong Kong Address Lookup Service (ALS) and return the parsed JSON response."""
    url = 'https://www.als.gov.hk/lookup'
    headers = {'Accept': 'application/json', 'Accept-Language': language}
    payload = {'q': query}
    async with aiohttp.ClientSession() as session:
        try:
            async with session.post(url, headers=headers, data=payload, ssl=ssl_context) as response:
                if response.status == 200:
                    return await response.json()
                else:
                    print(f'Error fetching data: Status Code {response.status}')
                    return {'error': f'Error fetching data: Status Code {response.status}'}
        except aiohttp.ClientError as e:
            print(f'Client Error: {e}')
            return {'error': f'Client Error: {e}'}
        except Exception as e:
            print(f'General Error: {e}')
            return {'error': f'General Error: {e}'}

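# Standalone usage sketch (illustrative query; requires network access to www.als.gov.hk):
#   result = asyncio.run(lookup_address('海港政府大樓'))
# A successful response carries a 'SuggestedAddress' list; failures come back as {'error': ...}.
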
async def get_address_lookup_results(keywords):
    results = []
    # Only look up keywords that are not bare district/sub-district names, and keep
    # the filtered list so each result can be paired with the keyword that produced it.
    lookup_keywords = [keyword.strip() for keyword in keywords
                       if keyword.strip() and is_valid_for_lookup(keyword.strip())]
    tasks = [lookup_address(keyword) for keyword in lookup_keywords]
    lookup_results = await asyncio.gather(*tasks)
    for keyword, lookup_result in zip(lookup_keywords, lookup_results):
        suggested = lookup_result.get('SuggestedAddress')
        if isinstance(suggested, list) and suggested:
            first_match = suggested[0]
            result = {'Keyword': keyword}
            premises_address = first_match['Address']['PremisesAddress']
            raw_address = premises_address.get('ChiPremisesAddress', {})
            matched_building = raw_address.get('BuildingName', 'No Building Name')
            if matched_building != 'No Building Name':
                result['Full Address'] = matched_building
            geo_address = premises_address.get('GeoAddress')
            if geo_address and geo_address != 'N/A':
                result['Geo Address'] = geo_address
            geo_info = premises_address.get('GeospatialInformation', {})
            latitude = geo_info.get('Latitude')
            longitude = geo_info.get('Longitude')
            if latitude and latitude != 'N/A':
                result['Latitude'] = latitude
            if longitude and longitude != 'N/A':
                result['Longitude'] = longitude
            if len(result) > 1:  # Only add if there's more than just the Keyword
                results.append(result)
    return results

def is_valid_for_lookup(keyword):
    """Return False when the keyword is just a district or sub-district name from area_data."""
    for districts in area_data.values():
        if keyword in districts:
            return False
        for subdistricts in districts.values():
            if keyword in subdistricts:
                return False
    return True

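# Examples based on area_data above:
#   is_valid_for_lookup('旺角')   -> False  (sub-district, skipped)
#   is_valid_for_lookup('中西區') -> False  (district, skipped)
#   any keyword not listed in area_data -> True (sent to the ALS lookup)
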
async def gradio_function(text, reformat, perform_lookup):
    extracted_keywords = process_text_only(text, reformat)
    keywords_list = extracted_keywords.splitlines()
    address_results = []
    if perform_lookup:
        address_results = await get_address_lookup_results(keywords_list)
    return extracted_keywords, address_results

def gradio_interface(text, reformat, perform_lookup):
    # Gradio calls this synchronously, so drive the async pipeline with asyncio.run.
    return asyncio.run(gradio_function(text, reformat, perform_lookup))

interface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Textbox(lines=20, placeholder="Paste text here; each line will be processed separately..."),
        gr.Checkbox(label="Reformat text (remove empty lines)"),
        gr.Checkbox(label="Perform Address Lookup")
    ],
    outputs=[
        gr.Textbox(label="Extracted Address Keywords"),
        gr.JSON(label="Address Lookup Results")
    ],
    title="Address Extraction and Lookup with Natural Language Processing",
    description="Extract address keywords using NLP and optionally perform an address lookup against the Hong Kong Address Lookup Service (ALS)."
)

interface.launch()
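
# interface.launch() serves the app locally by default; on a hosted Space the
# platform starts it for you. If the app needs to be reachable from other
# machines, options such as server_name="0.0.0.0" or share=True can be passed
# to launch().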