OttoYu committed on
Commit
d2594f1
·
verified ·
1 Parent(s): c0c69b1

Upload 7 files

Browse files
flag/RVT_AddressCh.txt ADDED
The diff for this file is too large to render. See raw diff
 
flag/RVT_AddressEn.txt ADDED
The diff for this file is too large to render. See raw diff
 
flag/RVT_Area.txt ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 港島 ns
2
+ 中西區 ns
3
+ 堅尼地城 ns
4
+ 石塘咀 ns
5
+ 西營盤 ns
6
+ 上環 ns
7
+ 中環 ns
8
+ 金鐘 ns
9
+ 半山區 ns
10
+ 山頂 ns
11
+ 灣仔 ns
12
+ 灣仔 ns
13
+ 銅鑼灣 ns
14
+ 跑馬地 ns
15
+ 大坑 ns
16
+ 掃桿埔 ns
17
+ 渣甸山 ns
18
+ 東區 ns
19
+ 天后 ns
20
+ 寶馬山 ns
21
+ 北角 ns
22
+ 鰂魚涌 ns
23
+ 西灣河 ns
24
+ 筲箕灣 ns
25
+ 柴灣 ns
26
+ 小西灣 ns
27
+ 南區 ns
28
+ 薄扶林 ns
29
+ 香港仔 ns
30
+ 鴨脷洲 ns
31
+ 黃竹坑 ns
32
+ 壽臣山 ns
33
+ 淺水灣 ns
34
+ 舂磡角 ns
35
+ 赤柱 ns
36
+ 大潭 ns
37
+ 石澳 ns
38
+ 九龍 ns
39
+ 油尖旺 ns
40
+ 尖沙咀 ns
41
+ 油麻地 ns
42
+ 西九龍填海區 ns
43
+ 京士柏 ns
44
+ 旺角 ns
45
+ 大角咀 ns
46
+ 深水埗 ns
47
+ 美孚 ns
48
+ 荔枝角 ns
49
+ 長沙灣 ns
50
+ 深水埗 ns
51
+ 石硤尾 ns
52
+ 又一村 ns
53
+ 大窩坪 ns
54
+ 昂船洲 ns
55
+ 九龍城 ns
56
+ 紅磡 ns
57
+ 土瓜灣 ns
58
+ 馬頭角 ns
59
+ 馬頭圍 ns
60
+ 啟德 ns
61
+ 九龍城 ns
62
+ 何文田 ns
63
+ 九龍塘 ns
64
+ 筆架山 ns
65
+ 黃大仙 ns
66
+ 新蒲崗 ns
67
+ 黃大仙 ns
68
+ 東頭 ns
69
+ 橫頭磡 ns
70
+ 樂富 ns
71
+ 鑽石山 ns
72
+ 慈雲山 ns
73
+ 牛池灣 ns
74
+ 觀塘 ns
75
+ 坪石 ns
76
+ 九龍灣 ns
77
+ 牛頭角 ns
78
+ 佐敦谷 ns
79
+ 觀塘 ns
80
+ 秀茂坪 ns
81
+ 藍田 ns
82
+ 油塘 ns
83
+ 鯉魚門 ns
84
+ 葵青 ns
85
+ 葵涌 ns
86
+ 青衣 ns
87
+ 荃灣 ns
88
+ 荃灣 ns
89
+ 梨木樹 ns
90
+ 汀九 ns
91
+ 深井 ns
92
+ 青龍頭 ns
93
+ 馬灣 ns
94
+ 欣澳 ns
95
+ 新界 ns
96
+ 屯門 ns
97
+ 大欖涌 ns
98
+ 掃管笏 ns
99
+ 屯門 ns
100
+ 藍地 ns
101
+ 元朗 ns
102
+ 洪水橋 ns
103
+ 廈村 ns
104
+ 流浮山 ns
105
+ 天水圍 ns
106
+ 元朗 ns
107
+ 新田 ns
108
+ 落馬洲 ns
109
+ 錦田 ns
110
+ 石崗 ns
111
+ 八鄉 ns
112
+ 北區 ns
113
+ 粉嶺 ns
114
+ 聯和墟 ns
115
+ 上水 ns
116
+ 石湖墟 ns
117
+ 沙頭角 ns
118
+ 鹿頸 ns
119
+ 烏蛟騰 ns
120
+ 大埔 ns
121
+ 大埔墟 ns
122
+ 大埔 ns
123
+ 大埔滘 ns
124
+ 大尾篤 ns
125
+ 船灣 ns
126
+ 樟木頭 ns
127
+ 企嶺下 ns
128
+ 沙田 ns
129
+ 大圍 ns
130
+ 沙田 ns
131
+ 火炭 ns
132
+ 馬料水 ns
133
+ 烏溪沙 ns
134
+ 馬鞍山 ns
135
+ 西貢 ns
136
+ 清水灣 ns
137
+ 西貢 ns
138
+ 大網仔 ns
139
+ 將軍澳 ns
140
+ 坑口 ns
141
+ 調景嶺 ns
142
+ 馬游塘 ns
143
+ 離島 ns
144
+ 長洲 ns
145
+ 坪洲 ns
146
+ 大嶼山 ns
147
+ 東涌 ns
148
+ 南丫島 ns
flag/RVT_BuildingCh.txt ADDED
The diff for this file is too large to render. See raw diff
 
flag/RVT_BuildingEn.txt ADDED
The diff for this file is too large to render. See raw diff
 
main.py ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import jieba
3
+ import jieba.analyse
4
+ import aiohttp
5
+ import asyncio
6
+ import ssl
7
+ from aiohttp import ClientSession
8
+
9
# Shared TLS context for the outbound HTTPS lookups.
#
# ssl.create_default_context() already requires a valid certificate chain
# (CERT_REQUIRED) and a matching hostname.  The previous code switched both
# checks off, which left every lookup open to interception and to forged
# responses.  If a deployment genuinely needs a private CA bundle, pass it
# with ssl.create_default_context(cafile=...) instead of disabling
# verification.
ssl_context = ssl.create_default_context()
12
+
13
# Flipped to True once every user dictionary has been handed to jieba, so
# later calls can return immediately instead of re-parsing the files.
_JIEBA_DICTIONARIES_LOADED = False


def setup_jieba_dictionaries():
    """Register the bundled user dictionaries with jieba, at most once.

    Each file is passed to ``jieba.load_userdict``.  The work is skipped on
    every call after the first successful one: the dictionaries do not change
    while the process runs, and reloading them on each request is pure
    overhead.

    Raises:
        Whatever ``jieba.load_userdict`` raises for a missing or unreadable
        file.  In that case the "loaded" flag stays unset, so the next call
        tries again.
    """
    global _JIEBA_DICTIONARIES_LOADED
    if _JIEBA_DICTIONARIES_LOADED:
        return

    dictionaries = [
        'flag/RVT_AddressCh.txt',
        'flag/RVT_AddressEn.txt',
        'flag/RVT_Area.txt',
        'flag/RVT_BuildingCh.txt',
        'flag/RVT_BuildingEn.txt',
    ]
    for file_path in dictionaries:
        jieba.load_userdict(file_path)

    # Only mark as loaded after every file went through without an error.
    _JIEBA_DICTIONARIES_LOADED = True
23
+
24
# Terms collected from the user dictionary files.  Built on the first
# non-empty request and reused afterwards; None means "not loaded yet".
_USER_DICT_TERMS = None


def _load_user_dict_terms():
    """Load the jieba user dictionaries and return the set of their terms.

    The term of an entry is its first whitespace-separated field.  Blank
    lines are skipped; previously a blank line raised ``IndexError`` inside
    the read loop, which was swallowed and silently discarded every
    remaining term of that file.
    """
    setup_jieba_dictionaries()

    terms = set()
    for file_path in (
        'flag/RVT_AddressCh.txt',
        'flag/RVT_AddressEn.txt',
        'flag/RVT_Area.txt',
        'flag/RVT_BuildingCh.txt',
        'flag/RVT_BuildingEn.txt',
    ):
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                for entry in f:
                    fields = entry.split()
                    if fields:
                        terms.add(fields[0])
        except FileNotFoundError:
            # Best effort: a missing dictionary only narrows the filter.
            print(f'File not found: {file_path}')
        except (OSError, UnicodeError) as e:
            print(f'Error reading file {file_path}: {e}')
    return terms


def process_text(text):
    """Extract dictionary-backed keywords from each line of *text*.

    Every line is stripped and passed to ``jieba.analyse.textrank`` (top 20
    keywords, restricted to the ``ns``/``n``/``vn``/``v`` part-of-speech
    tags).  Only keywords that also appear as a term in one of the user
    dictionary files are kept.

    Args:
        text: Multi-line input; ``None`` or an empty string yields ``[]``.

    Returns:
        One string per input line, in order, holding that line's surviving
        keywords joined by single spaces.  A line without any surviving
        keyword contributes an empty string.

    Note:
        The dictionaries are loaded once per process and cached.  A file
        that was missing on first use is therefore not retried until the
        process restarts.
    """
    if not text:
        return []

    global _USER_DICT_TERMS
    if _USER_DICT_TERMS is None:
        _USER_DICT_TERMS = _load_user_dict_terms()
    user_dict_terms = _USER_DICT_TERMS

    results = []
    for line in text.splitlines():
        keywords = jieba.analyse.textrank(
            line.strip(),
            topK=20,
            withWeight=False,
            allowPOS=('ns', 'n', 'vn', 'v'),
        )
        results.append(' '.join(keyword for keyword in keywords if keyword in user_dict_terms))

    return results
53
+
54
def reformat_text(text):
    """Return the non-blank lines of *text*, each stripped of surrounding whitespace."""
    stripped_lines = (raw.strip() for raw in text.splitlines())
    return [entry for entry in stripped_lines if entry]
57
+
58
def process_text_only(text, reformat):
    """Run keyword extraction on *text* and return the result as one string.

    Args:
        text: Multi-line input handed to ``process_text``.
        reformat: When truthy, the joined keyword lines are passed through
            ``reformat_text`` before being joined again.

    Returns:
        The keyword lines joined with newline characters.
    """
    keyword_lines = process_text(text)
    if not reformat:
        return '\n'.join(keyword_lines)

    # Round-trip through a single string so reformat_text sees the same
    # newline-joined form as before.
    return '\n'.join(reformat_text('\n'.join(keyword_lines)))
64
+
65
async def lookup_address(query, language='zh-Hant'):
    """POST *query* to the address lookup endpoint and return the decoded JSON.

    Args:
        query: Search text, sent as the ``q`` form field.
        language: Value sent in the ``Accept-Language`` request header.

    Returns:
        The parsed JSON body when the response status is 200.  On any other
        status, or when the request raises, a dict of the form
        ``{'error': <message>}``; the same message is also printed.
    """
    def _failure(message):
        # Every failure path reports the identical text on stdout and in the
        # returned error dict.
        print(message)
        return {'error': message}

    endpoint = 'https://www.als.gov.hk/lookup'
    request_headers = {
        'Accept': 'application/json',
        'Accept-Language': language
    }
    form_fields = {
        'q': query
    }

    async with ClientSession() as session:
        try:
            async with session.post(
                endpoint,
                headers=request_headers,
                data=form_fields,
                ssl=ssl_context,
            ) as response:
                if response.status != 200:
                    return _failure(f'Error fetching data: Status Code {response.status}')
                return await response.json()
        except aiohttp.ClientError as e:
            return _failure(f'Client Error: {e}')
        except Exception as e:
            return _failure(f'General Error: {e}')
89
+
90
def _dict_or_empty(value):
    """Return *value* when it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _build_lookup_row(keyword, match=None):
    """Flatten one lookup suggestion into the flat row reported to the UI.

    With no usable *match* the row carries the "no match" placeholders.  Key
    names are the ones the original code read from the response; any level
    that is absent falls back to a placeholder instead of raising.
    """
    row = {
        'Keyword': keyword,
        'Full Address': 'No matches found',
        'Geo Address': 'N/A',
        'Latitude': 'N/A',
        'Longitude': 'N/A'
    }
    if not match or not isinstance(match, dict):
        return row

    premises_address = _dict_or_empty(
        _dict_or_empty(match.get('Address')).get('PremisesAddress')
    )
    chi_address = _dict_or_empty(premises_address.get('ChiPremisesAddress'))
    geo_info = _dict_or_empty(premises_address.get('GeospatialInformation'))

    # 'Full Address' has always carried the Chinese building name only.
    row['Full Address'] = chi_address.get('BuildingName', 'No Building Name')
    row['Geo Address'] = premises_address.get('GeoAddress', 'N/A')
    row['Latitude'] = geo_info.get('Latitude', 'N/A')
    row['Longitude'] = geo_info.get('Longitude', 'N/A')
    return row


async def get_address_lookup_results(keywords):
    """Look up each keyword and summarise the first suggestion returned.

    Keywords are stripped; blank ones are skipped without a request.  The
    lookups run one after another via ``lookup_address``.

    Args:
        keywords: Iterable of query strings.

    Returns:
        A list with one dict per non-blank keyword, holding the keys
        ``Keyword``, ``Full Address``, ``Geo Address``, ``Latitude`` and
        ``Longitude``.  When the response has no usable suggestion (error
        dict, missing or empty ``SuggestedAddress`` list) the row contains
        ``'No matches found'`` / ``'N/A'`` placeholders.  An empty
        suggestion list used to raise ``IndexError`` here.
    """
    results = []
    for keyword in keywords:
        keyword = keyword.strip()
        if not keyword:  # Skip empty keywords
            continue

        lookup_results = await lookup_address(keyword)

        suggestions = None
        if isinstance(lookup_results, dict):
            suggestions = lookup_results.get('SuggestedAddress')

        # Use the first match, but only when the list actually has one.
        first_match = suggestions[0] if isinstance(suggestions, list) and suggestions else None
        results.append(_build_lookup_row(keyword, first_match))

    return results
132
+
133
async def gradio_function(text, reformat, perform_lookup):
    """Extract keywords from *text* and optionally look each line up.

    Args:
        text: Raw multi-line input.
        reformat: Forwarded to ``process_text_only``.
        perform_lookup: When truthy, every line of the extracted keyword
            text is sent through ``get_address_lookup_results``.

    Returns:
        A ``(keyword_text, lookup_rows)`` tuple; ``lookup_rows`` is an empty
        list when no lookup was requested.
    """
    extracted_keywords = process_text_only(text, reformat)
    if not perform_lookup:
        return extracted_keywords, []

    lookup_rows = await get_address_lookup_results(extracted_keywords.splitlines())
    return extracted_keywords, lookup_rows
142
+
143
def gradio_interface(text, reformat, perform_lookup):
    """Synchronous wrapper that runs ``gradio_function`` to completion.

    The coroutine is driven with ``asyncio.run`` and its result is returned
    unchanged.
    """
    pipeline = gradio_function(text, reformat, perform_lookup)
    return asyncio.run(pipeline)
145
+
146
# Input widgets, in the positional order gradio_interface receives them:
# text, reformat, perform_lookup.
_input_components = [
    gr.Textbox(lines=20, placeholder="Paste text here, each line will be processed separately..."),
    gr.Checkbox(label="Reformat text (remove empty lines)"),
    gr.Checkbox(label="Perform Address Lookup"),
]

# Output widgets, matching the (keyword_text, lookup_rows) pair returned by
# gradio_interface.
_output_components = [
    gr.Textbox(label="Extracted Address Keywords"),
    gr.JSON(label="Address Lookup Results"),
]

interface = gr.Interface(
    fn=gradio_interface,
    inputs=_input_components,
    outputs=_output_components,
    title="Address Extraction and Lookup with Natural Language Processing",
    description="Extract address keywords using NLP and optionally perform address lookup using ALS.",
)

# Start the web UI as soon as the module is executed.
interface.launch()
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ aiohttp==3.10.3
2
+ gradio==4.41.0
3
+ jieba==0.42.1