File size: 8,051 Bytes
d2594f1
 
 
 
 
 
b98028a
d2594f1
6b1ec2d
d2594f1
 
 
 
b98028a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d2594f1
6b1ec2d
 
 
d2594f1
 
 
 
 
 
 
 
 
b98028a
 
 
 
6b1ec2d
 
 
b98028a
 
 
 
 
 
 
d2594f1
 
 
 
 
 
 
 
 
 
 
 
 
b98028a
d2594f1
 
 
 
 
 
 
 
 
6b1ec2d
 
d2594f1
6b1ec2d
d2594f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6b1ec2d
 
b98028a
 
 
 
 
 
 
d2594f1
 
 
 
 
b98028a
 
 
 
 
 
 
 
d2594f1
b98028a
 
 
 
 
 
 
 
 
 
d2594f1
 
 
6b1ec2d
 
 
 
 
d2594f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b98028a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
import gradio as gr
import jieba
import jieba.analyse
import aiohttp
import asyncio
import ssl
from functools import lru_cache

# SSL context setup
# NOTE(review): hostname checking and certificate verification are both
# disabled below, so any request made with this context will accept any
# server certificate. This context is passed to the ALS request in
# lookup_address(). Presumably a workaround for a certificate-chain problem
# in the deployment environment -- confirm whether it is still required.
ssl_context = ssl.create_default_context()
ssl_context.check_hostname = False  # must be disabled before verify_mode can be set to CERT_NONE
ssl_context.verify_mode = ssl.CERT_NONE

# Three-level table of area names: each top-level key (region) maps to a dict
# whose keys are district names and whose values are lists of place names
# inside that district. is_valid_for_lookup() consults this table to reject
# keywords that are only a district or place name.
area_data = {
    '香港': {
        '中西區': ["西環", "堅尼地城", "石塘咀", "西營盤", "上環", "中環", "金鐘", "西半山", "中半山", "半山", "山頂"],
        '灣仔': ["灣仔", "銅鑼灣", "跑馬地", "大坑", "掃桿埔", "渣甸山"],
        '東區': ["天后", "寶馬山", "北角", "鰂魚涌", "西灣河", "筲箕灣", "柴灣", "小西灣"],
        '南區': ["薄扶林", "香港仔", "鴨脷洲", "黃竹坑", "壽臣山", "淺水灣", "舂磡角", "赤柱", "大潭", "石澳", "田灣"]
    },
    '九龍': {
        '油尖旺': ["尖沙咀", "油麻地", "西九龍", "京士柏", "旺角", "大角咀", "佐敦", "太子"],
        '深水埗': ["美孚", "荔枝角", "長沙灣", "深水埗", "石硤尾", "又一村", "大窩坪", "昂船洲"],
        '九龍城': ["紅磡", "土瓜灣", "馬頭角", "馬頭圍", "啟德", "九龍城", "何文田", "九龍塘", "筆架山"],
        '黃大仙': ["新蒲崗", "黃大仙", "東頭", "橫頭磡", "樂富", "鑽石山", "慈雲山", "牛池灣"],
        '觀塘': ["坪石", "九龍灣", "牛頭角", "佐敦谷", "觀塘", "秀茂坪", "藍田", "油塘", "鯉魚門"]
    },
    '新界': {
        '葵青': ["葵涌", "青衣", "葵芳"],
        '荃灣': ["荃灣", "梨木樹", "汀九", "深井", "青龍頭", "馬灣", "欣澳"],
        '屯門': ["大欖涌", "掃管笏", "屯門", "藍地"],
        '元朗': ["洪水橋", "廈村", "流浮山", "天水圍", "元朗", "新田", "落馬洲", "錦田", "石崗", "八鄉"],
        '北區': ["粉嶺", "聯和墟", "上水", "石湖墟", "沙頭角", "鹿頸", "烏蛟騰"],
        '大埔': ["大埔墟", "大埔", "大埔滘", "大尾篤", "船灣", "樟木頭", "企嶺下", "太和"],
        '沙田': ["大圍", "沙田", "火炭", "馬料水", "烏溪沙", "馬鞍山"],
        '西貢': ["清水灣", "西貢", "大網仔", "將軍澳", "坑口", "調景嶺", "馬游塘"],
        '離島': ["長洲", "坪洲", "大嶼山", "東涌", "南丫島"]
    }
}

@lru_cache(maxsize=None)
def load_user_dict_terms():
    """Return the set of terms listed in the user dictionary files.

    Each dictionary file is read line by line and the first
    whitespace-separated token of every non-blank line is collected.
    Files that are missing or unreadable are reported on stdout and skipped,
    so the result is best-effort.

    The result is cached for the life of the process; the same set object is
    handed to every caller, so callers must treat it as read-only.

    Returns:
        set[str]: every term found across the readable dictionary files.
    """
    dictionaries = [
        'flag/RVT_AddressCh.txt', 'flag/RVT_AddressEn.txt', 'flag/RVT_Area.txt',
        'flag/RVT_BuildingCh.txt', 'flag/RVT_BuildingEn.txt', 'flag/ChiVillage.txt',
        'flag/ChiEstate.txt', 'flag/ChiStreet.txt', 'flag/ChiBuilding.txt'
    ]
    user_dict_terms = set()
    for file_path in dictionaries:
        try:
            # utf-8-sig reads plain UTF-8 unchanged but drops a leading BOM,
            # which would otherwise be glued onto the first term of the file.
            with open(file_path, 'r', encoding='utf-8-sig') as f:
                for line in f:
                    fields = line.split()
                    # A blank line has no term. Indexing [0] on it used to
                    # raise IndexError, which the broad handler below caught,
                    # silently discarding the rest of the file.
                    if fields:
                        user_dict_terms.add(fields[0])
        except FileNotFoundError:
            print(f'File not found: {file_path}')
        except Exception as e:
            # Deliberately best-effort: one unreadable dictionary must not
            # prevent the others from loading.
            print(f'Error reading file {file_path}: {e}')
    return user_dict_terms

@lru_cache(maxsize=None)
def setup_jieba_dictionaries():
    """Register the user dictionary files with jieba, once per process.

    process_text() calls this on every request. Without the cache each call
    re-read and re-registered all nine dictionary files. With it, the first
    successful call does the work and later calls return immediately.

    A missing dictionary file is reported on stdout and skipped instead of
    aborting the request, matching how load_user_dict_terms() treats the
    same files.
    """
    dictionaries = [
        'flag/RVT_AddressCh.txt', 'flag/RVT_AddressEn.txt', 'flag/RVT_Area.txt',
        'flag/RVT_BuildingCh.txt', 'flag/RVT_BuildingEn.txt', 'flag/ChiVillage.txt',
        'flag/ChiEstate.txt', 'flag/ChiStreet.txt', 'flag/ChiBuilding.txt'
    ]
    for file_path in dictionaries:
        try:
            jieba.load_userdict(file_path)
        except FileNotFoundError:
            print(f'File not found: {file_path}')

def process_text(text):
    """Extract dictionary-backed keywords from each line of *text*.

    Every line is stripped and passed to jieba's TextRank extractor (up to 20
    keywords, restricted to the POS tags 'ns', 'n', 'vn' and 'v'). Only
    keywords that also appear in the user dictionary terms are kept.

    Returns:
        list[str]: one entry per line of *text*, holding that line's
        surviving keywords joined by single spaces (possibly empty).
    """
    setup_jieba_dictionaries()
    known_terms = load_user_dict_terms()

    def keywords_for(raw_line):
        # Rank candidate keywords for one line, then keep the known ones.
        candidates = jieba.analyse.textrank(
            raw_line.strip(),
            topK=20,
            withWeight=False,
            allowPOS=('ns', 'n', 'vn', 'v'),
        )
        return ' '.join(word for word in candidates if word in known_terms)

    return [keywords_for(raw_line) for raw_line in text.splitlines()]

def reformat_text(text):
    """Return the non-blank lines of *text*, each stripped of surrounding whitespace."""
    kept_lines = []
    for raw_line in text.splitlines():
        trimmed = raw_line.strip()
        if trimmed:
            kept_lines.append(trimmed)
    return kept_lines

def process_text_only(text, reformat):
    """Run keyword extraction on *text* and return the result as one string.

    Args:
        text: input text; each line is processed separately.
        reformat: when truthy, lines that ended up with no keywords are
            dropped from the output.

    Returns:
        str: the per-line keyword strings joined by newlines.
    """
    per_line_keywords = process_text(text)
    if not reformat:
        return '\n'.join(per_line_keywords)
    compacted = reformat_text('\n'.join(per_line_keywords))
    return '\n'.join(compacted)

async def lookup_address(query, language='zh-Hant'):
    """POST *query* to the ALS lookup endpoint and return the decoded JSON.

    Args:
        query: search text, sent as the ``q`` form field.
        language: value for the ``Accept-Language`` request header.

    Returns:
        The parsed JSON body on HTTP 200. On any other status, or on any
        exception, a dict of the form ``{'error': <message>}``; the same
        message is also printed. This coroutine does not raise.
    """
    endpoint = 'https://www.als.gov.hk/lookup'
    request_headers = {'Accept': 'application/json', 'Accept-Language': language}
    form_fields = {'q': query}

    async with aiohttp.ClientSession() as session:
        try:
            async with session.post(
                endpoint, headers=request_headers, data=form_fields, ssl=ssl_context
            ) as response:
                if response.status != 200:
                    message = f'Error fetching data: Status Code {response.status}'
                    print(message)
                    return {'error': message}
                return await response.json()
        except aiohttp.ClientError as e:
            message = f'Client Error: {e}'
        except Exception as e:
            message = f'General Error: {e}'
        # Reached only from one of the two handlers above.
        print(message)
        return {'error': message}

def _summarize_first_match(keyword, lookup_result):
    """Build the result row for *keyword* from one lookup response.

    Returns a dict with 'Keyword' plus whichever of 'Full Address',
    'Geo Address', 'Latitude' and 'Longitude' the first suggested address
    provides, or None when the response carries nothing usable (error
    response, empty suggestion list, or no populated fields).
    """
    if not isinstance(lookup_result, dict):
        return None
    suggestions = lookup_result.get('SuggestedAddress')
    # An empty list used to raise IndexError on [0].
    if not isinstance(suggestions, list) or not suggestions:
        return None
    first_match = suggestions[0]
    if not isinstance(first_match, dict) or not first_match:
        return None

    # Tolerate missing intermediate levels instead of raising KeyError.
    premises_address = (first_match.get('Address') or {}).get('PremisesAddress') or {}
    result = {'Keyword': keyword}

    raw_address = premises_address.get('ChiPremisesAddress', {})
    matched_building = raw_address.get('BuildingName', 'No Building Name')
    if matched_building != 'No Building Name':
        result['Full Address'] = matched_building

    geo_address = premises_address.get('GeoAddress')
    if geo_address and geo_address != 'N/A':
        result['Geo Address'] = geo_address

    geo_info = premises_address.get('GeospatialInformation', {})
    latitude = geo_info.get('Latitude')
    longitude = geo_info.get('Longitude')
    if latitude and latitude != 'N/A':
        result['Latitude'] = latitude
    if longitude and longitude != 'N/A':
        result['Longitude'] = longitude

    # A row holding only the keyword carries no information.
    return result if len(result) > 1 else None


async def get_address_lookup_results(keywords):
    """Look up every eligible keyword concurrently and summarize the matches.

    A keyword is eligible when it is non-blank after stripping and
    is_valid_for_lookup() accepts it.

    Args:
        keywords: iterable of keyword strings.

    Returns:
        list[dict]: one row per eligible keyword whose first suggested
        address yielded at least one field, in input order. Each row has
        'Keyword' and any of 'Full Address', 'Geo Address', 'Latitude',
        'Longitude'.
    """
    # Keep each original keyword next to the cleaned query actually sent.
    # Results were previously zipped against the unfiltered list, so once any
    # keyword was skipped every later result was attributed to the wrong one.
    eligible = []
    for keyword in keywords:
        cleaned = keyword.strip()
        if cleaned and is_valid_for_lookup(cleaned):
            eligible.append((keyword, cleaned))

    if not eligible:
        return []

    lookup_results = await asyncio.gather(
        *(lookup_address(cleaned) for _, cleaned in eligible)
    )

    results = []
    for (keyword, _), lookup_result in zip(eligible, lookup_results):
        row = _summarize_first_match(keyword, lookup_result)
        if row is not None:
            results.append(row)
    return results

def is_valid_for_lookup(keyword):
    """Return False when *keyword* is just a district or place name in area_data.

    The keyword is compared against every district name (second-level key)
    and every place name (entries of the third-level lists). Top-level
    region names are not checked.
    """
    for region in area_data.values():
        for places in region.values():
            if keyword in region or keyword in places:
                return False
    return True

async def gradio_function(text, reformat, perform_lookup):
    """Extract keywords from *text* and optionally look them up.

    Args:
        text: input text, processed line by line.
        reformat: forwarded to process_text_only().
        perform_lookup: when truthy, each line of the extracted keyword text
            is passed to the address lookup.

    Returns:
        tuple: (extracted keyword text, list of lookup result rows). The list
        is empty when no lookup was requested.
    """
    extracted_keywords = process_text_only(text, reformat)
    if not perform_lookup:
        return extracted_keywords, []

    address_results = await get_address_lookup_results(extracted_keywords.splitlines())
    return extracted_keywords, address_results

def gradio_interface(text, reformat, perform_lookup):
    """Synchronous wrapper that runs gradio_function() to completion.

    Returns whatever gradio_function() returns for the same arguments.
    """
    pipeline = gradio_function(text, reformat, perform_lookup)
    return asyncio.run(pipeline)

# Gradio UI definition. The three inputs map positionally onto
# gradio_interface(text, reformat, perform_lookup), and the two outputs onto
# its returned (keyword text, lookup results) pair.
interface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Textbox(lines=20, placeholder="Paste text here, each line will be processed separately..."),
        gr.Checkbox(label="Reformat text (remove empty lines)"),
        gr.Checkbox(label="Perform Address Lookup")
    ],
    outputs=[
        gr.Textbox(label="Extracted Address Keywords"),
        gr.JSON(label="Address Lookup Results")
    ],
    title="Address Extraction and Lookup with Natural Language Processing",
    description="Extract address keywords using NLP and optionally perform address lookup using ALS."
)

# NOTE(review): launch() runs at module level, so the app starts when this
# module is executed or imported.
interface.launch()