Spaces:
Sleeping
Sleeping
File size: 8,051 Bytes
d2594f1 b98028a d2594f1 6b1ec2d d2594f1 b98028a d2594f1 6b1ec2d d2594f1 b98028a 6b1ec2d b98028a d2594f1 b98028a d2594f1 6b1ec2d d2594f1 6b1ec2d d2594f1 6b1ec2d b98028a d2594f1 b98028a d2594f1 b98028a d2594f1 6b1ec2d d2594f1 b98028a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 |
import gradio as gr
import jieba
import jieba.analyse
import aiohttp
import asyncio
import ssl
from functools import lru_cache
# SSL context setup
# NOTE(review): hostname checking and certificate verification are both
# disabled below, so HTTPS requests made with this context do not
# authenticate the server. Confirm this is intentional before reusing it.
# check_hostname must be cleared before verify_mode can be set to CERT_NONE.
ssl_context = ssl.create_default_context()
ssl_context.check_hostname = False
ssl_context.verify_mode = ssl.CERT_NONE
# Hong Kong area names, nested as region -> district -> list of place names.
# is_valid_for_lookup() consults this table to keep bare area names out of
# the address lookup.
area_data = {
    '香港': {
        '中西區': ["西環", "堅尼地城", "石塘咀", "西營盤", "上環", "中環", "金鐘", "西半山", "中半山", "半山", "山頂"],
        '灣仔': ["灣仔", "銅鑼灣", "跑馬地", "大坑", "掃桿埔", "渣甸山"],
        '東區': ["天后", "寶馬山", "北角", "鰂魚涌", "西灣河", "筲箕灣", "柴灣", "小西灣"],
        '南區': ["薄扶林", "香港仔", "鴨脷洲", "黃竹坑", "壽臣山", "淺水灣", "舂磡角", "赤柱", "大潭", "石澳", "田灣"],
    },
    '九龍': {
        '油尖旺': ["尖沙咀", "油麻地", "西九龍", "京士柏", "旺角", "大角咀", "佐敦", "太子"],
        '深水埗': ["美孚", "荔枝角", "長沙灣", "深水埗", "石硤尾", "又一村", "大窩坪", "昂船洲"],
        '九龍城': ["紅磡", "土瓜灣", "馬頭角", "馬頭圍", "啟德", "九龍城", "何文田", "九龍塘", "筆架山"],
        '黃大仙': ["新蒲崗", "黃大仙", "東頭", "橫頭磡", "樂富", "鑽石山", "慈雲山", "牛池灣"],
        '觀塘': ["坪石", "九龍灣", "牛頭角", "佐敦谷", "觀塘", "秀茂坪", "藍田", "油塘", "鯉魚門"],
    },
    '新界': {
        '葵青': ["葵涌", "青衣", "葵芳"],
        '荃灣': ["荃灣", "梨木樹", "汀九", "深井", "青龍頭", "馬灣", "欣澳"],
        '屯門': ["大欖涌", "掃管笏", "屯門", "藍地"],
        '元朗': ["洪水橋", "廈村", "流浮山", "天水圍", "元朗", "新田", "落馬洲", "錦田", "石崗", "八鄉"],
        '北區': ["粉嶺", "聯和墟", "上水", "石湖墟", "沙頭角", "鹿頸", "烏蛟騰"],
        '大埔': ["大埔墟", "大埔", "大埔滘", "大尾篤", "船灣", "樟木頭", "企嶺下", "太和"],
        '沙田': ["大圍", "沙田", "火炭", "馬料水", "烏溪沙", "馬鞍山"],
        '西貢': ["清水灣", "西貢", "大網仔", "將軍澳", "坑口", "調景嶺", "馬游塘"],
        '離島': ["長洲", "坪洲", "大嶼山", "東涌", "南丫島"],
    },
}
@lru_cache(maxsize=None)
def load_user_dict_terms():
    """Collect the leading term of every entry in the user dictionary files.

    Each non-blank line contributes its first whitespace-separated token.
    Files that are missing or unreadable are reported on stdout and skipped,
    so the result is best-effort.

    Returns:
        A set of term strings. The result is cached by ``lru_cache``, so the
        same set object is returned on every call; callers should treat it as
        read-only.
    """
    user_dict_terms = set()
    dictionaries = [
        'flag/RVT_AddressCh.txt', 'flag/RVT_AddressEn.txt', 'flag/RVT_Area.txt',
        'flag/RVT_BuildingCh.txt', 'flag/RVT_BuildingEn.txt', 'flag/ChiVillage.txt',
        'flag/ChiEstate.txt', 'flag/ChiStreet.txt', 'flag/ChiBuilding.txt'
    ]
    for file_path in dictionaries:
        try:
            # utf-8-sig reads plain UTF-8 too, but drops a leading BOM so the
            # first term of a BOM-prefixed file is not polluted with '\ufeff'.
            with open(file_path, 'r', encoding='utf-8-sig') as f:
                for line in f:
                    fields = line.split()
                    # A blank line has no fields; indexing [0] on it used to
                    # raise and abort the rest of the file.
                    if fields:
                        user_dict_terms.add(fields[0])
        except FileNotFoundError:
            print(f'File not found: {file_path}')
        except (OSError, UnicodeError) as e:
            print(f'Error reading file {file_path}: {e}')
    return user_dict_terms
@lru_cache(maxsize=None)
def setup_jieba_dictionaries():
    """Register the project's user dictionaries with jieba, once per process.

    The function is wrapped in ``lru_cache`` so repeated calls (one is made
    for every ``process_text`` invocation) do not reload all nine files.
    A missing dictionary file is reported on stdout and skipped instead of
    aborting the whole request.
    """
    dictionaries = [
        'flag/RVT_AddressCh.txt', 'flag/RVT_AddressEn.txt', 'flag/RVT_Area.txt',
        'flag/RVT_BuildingCh.txt', 'flag/RVT_BuildingEn.txt', 'flag/ChiVillage.txt',
        'flag/ChiEstate.txt', 'flag/ChiStreet.txt', 'flag/ChiBuilding.txt'
    ]
    for file_path in dictionaries:
        try:
            jieba.load_userdict(file_path)
        except FileNotFoundError:
            print(f'File not found: {file_path}')
def process_text(text):
    """Extract dictionary-backed keywords from each line of *text*.

    Every line is stripped and passed to jieba's TextRank extractor (up to 20
    keywords, restricted to the POS tags ``ns``, ``n``, ``vn`` and ``v``).
    Only keywords that also appear in the user dictionary terms are kept.

    Args:
        text: Input text; each line is processed independently.

    Returns:
        A list with one entry per input line: the surviving keywords joined
        by single spaces (an empty string when none survive).
    """
    setup_jieba_dictionaries()
    known_terms = load_user_dict_terms()
    allowed_pos = ('ns', 'n', 'vn', 'v')

    def keywords_for(raw_line):
        # Rank the line's words, then keep only those backed by a dictionary entry.
        ranked = jieba.analyse.textrank(
            raw_line.strip(), topK=20, withWeight=False, allowPOS=allowed_pos
        )
        return ' '.join(word for word in ranked if word in known_terms)

    return [keywords_for(raw_line) for raw_line in text.splitlines()]
def reformat_text(text):
    """Return the non-blank lines of *text*, each stripped of surrounding whitespace."""
    stripped_lines = map(str.strip, text.splitlines())
    return [line for line in stripped_lines if line]
def process_text_only(text, reformat):
    """Run keyword extraction on *text* and return the result as one string.

    Args:
        text: Input text, processed line by line by ``process_text``.
        reformat: When truthy, the extracted lines are passed through
            ``reformat_text`` so blank lines are dropped.

    Returns:
        The keyword lines joined with newline characters.
    """
    keyword_lines = process_text(text)
    if not reformat:
        return '\n'.join(keyword_lines)
    return '\n'.join(reformat_text('\n'.join(keyword_lines)))
async def lookup_address(query, language='zh-Hant'):
    """POST *query* to the ALS lookup endpoint and return the decoded JSON.

    A new ``aiohttp.ClientSession`` is opened for each call, and the request
    uses the module-level ``ssl_context``.

    Args:
        query: Text sent as the ``q`` form field.
        language: Value of the ``Accept-Language`` request header.

    Returns:
        The parsed JSON body on HTTP 200. On any other status, or on any
        exception, a dict of the form ``{'error': <message>}``; the same
        message is also printed.
    """
    endpoint = 'https://www.als.gov.hk/lookup'
    request_headers = {'Accept': 'application/json', 'Accept-Language': language}
    form_data = {'q': query}
    async with aiohttp.ClientSession() as session:
        try:
            async with session.post(
                endpoint, headers=request_headers, data=form_data, ssl=ssl_context
            ) as response:
                if response.status != 200:
                    message = f'Error fetching data: Status Code {response.status}'
                    print(message)
                    return {'error': message}
                return await response.json()
        except aiohttp.ClientError as e:
            message = f'Client Error: {e}'
            print(message)
            return {'error': message}
        except Exception as e:
            message = f'General Error: {e}'
            print(message)
            return {'error': message}
async def get_address_lookup_results(keywords):
    """Look up every eligible keyword concurrently and summarise the first match.

    Keywords are stripped; blank ones and those rejected by
    ``is_valid_for_lookup`` are skipped. The remaining keywords are sent to
    ``lookup_address`` concurrently.

    Args:
        keywords: Iterable of keyword strings.

    Returns:
        A list of dicts, one per keyword whose first suggested address
        yielded at least one field. Each dict has ``'Keyword'`` plus any of
        ``'Full Address'``, ``'Geo Address'``, ``'Latitude'`` and
        ``'Longitude'`` that were present in the response.
    """
    # Filter up front so that each response is paired with the keyword that
    # actually produced it; zipping against the unfiltered input would shift
    # the labels as soon as one keyword is skipped.
    lookup_keywords = [
        stripped
        for stripped in (keyword.strip() for keyword in keywords)
        if stripped and is_valid_for_lookup(stripped)
    ]
    if not lookup_keywords:
        return []

    lookup_results = await asyncio.gather(
        *(lookup_address(keyword) for keyword in lookup_keywords)
    )

    results = []
    for keyword, lookup_result in zip(lookup_keywords, lookup_results):
        if not isinstance(lookup_result, dict):
            continue
        suggestions = lookup_result.get('SuggestedAddress')
        # Error payloads have no suggestions, and the list may be empty.
        if not isinstance(suggestions, list) or not suggestions:
            continue
        first_match = suggestions[0]
        if not first_match:
            continue

        # Tolerate a response that lacks the nested address objects.
        premises_address = (first_match.get('Address') or {}).get('PremisesAddress') or {}
        result = {'Keyword': keyword}

        raw_address = premises_address.get('ChiPremisesAddress', {})
        matched_building = raw_address.get('BuildingName', 'No Building Name')
        if matched_building != 'No Building Name':
            result['Full Address'] = matched_building

        geo_address = premises_address.get('GeoAddress')
        if geo_address and geo_address != 'N/A':
            result['Geo Address'] = geo_address

        geo_info = premises_address.get('GeospatialInformation', {})
        latitude = geo_info.get('Latitude')
        longitude = geo_info.get('Longitude')
        if latitude and latitude != 'N/A':
            result['Latitude'] = latitude
        if longitude and longitude != 'N/A':
            result['Longitude'] = longitude

        if len(result) > 1:  # Only add if there's more than just the Keyword
            results.append(result)
    return results
def is_valid_for_lookup(keyword):
    """Return False when *keyword* is a district name or a listed place name.

    The check walks ``area_data``: a keyword equal to any district key, or
    contained in any district's list of place names, is rejected.
    NOTE(review): the top-level region keys of ``area_data`` are not checked
    here; confirm whether they should be excluded as well.

    Args:
        keyword: Candidate keyword.

    Returns:
        True if the keyword matches no district or place name, else False.
    """
    for districts in area_data.values():
        if keyword in districts:
            return False
        for place_names in districts.values():
            if keyword in place_names:
                return False
    return True
async def gradio_function(text, reformat, perform_lookup):
    """Extract keywords from *text* and optionally look them up.

    Args:
        text: Raw input text.
        reformat: Passed through to ``process_text_only``.
        perform_lookup: When truthy, each extracted keyword line is sent to
            ``get_address_lookup_results``.

    Returns:
        A ``(keyword_text, lookup_rows)`` tuple; ``lookup_rows`` is an empty
        list when no lookup was requested.
    """
    keyword_text = process_text_only(text, reformat)
    if not perform_lookup:
        return keyword_text, []
    lookup_rows = await get_address_lookup_results(keyword_text.splitlines())
    return keyword_text, lookup_rows
def gradio_interface(text, reformat, perform_lookup):
    """Synchronous entry point: run ``gradio_function`` to completion via ``asyncio.run``."""
    pipeline = gradio_function(text, reformat, perform_lookup)
    return asyncio.run(pipeline)
# Gradio UI: one text box and two checkboxes in; the extracted keyword text
# and the JSON lookup results out. The inputs map positionally onto
# gradio_interface(text, reformat, perform_lookup).
interface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Textbox(lines=20, placeholder="Paste text here, each line will be processed separately..."),
        gr.Checkbox(label="Reformat text (remove empty lines)"),
        gr.Checkbox(label="Perform Address Lookup"),
    ],
    outputs=[
        gr.Textbox(label="Extracted Address Keywords"),
        gr.JSON(label="Address Lookup Results"),
    ],
    title="Address Extraction and Lookup with Natural Language Processing",
    description="Extract address keywords using NLP and optionally perform address lookup using ALS.",
)
# The stray trailing "|" after this call was not valid Python and is removed.
interface.launch()