File size: 5,654 Bytes
d2594f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
import gradio as gr
import jieba
import jieba.analyse
import aiohttp
import asyncio
import ssl
from aiohttp import ClientSession

# TLS context handed to the outgoing address-lookup request in this module.
# NOTE(review): hostname checking and certificate verification are both
# switched off below, so the remote server's identity is NOT validated and
# the connection is open to interception. Confirm this is intentional; if the
# endpoint's certificate chain validates in the deployment environment, drop
# the two overrides and use the default (verifying) context instead.
ssl_context = ssl.create_default_context()
ssl_context.check_hostname = False
ssl_context.verify_mode = ssl.CERT_NONE

def setup_jieba_dictionaries():
    """Register the RVT address/area/building word lists with jieba.

    Each file is passed to ``jieba.load_userdict`` in the order listed.
    Paths are relative to the current working directory; any exception
    raised by jieba while loading a file propagates to the caller.
    """
    for dictionary_path in (
        'flag/RVT_AddressCh.txt',
        'flag/RVT_AddressEn.txt',
        'flag/RVT_Area.txt',
        'flag/RVT_BuildingCh.txt',
        'flag/RVT_BuildingEn.txt',
    ):
        jieba.load_userdict(dictionary_path)

def process_text(text):
    """Extract user-dictionary keywords from each line of *text*.

    The jieba user dictionaries are (re)loaded, then every dictionary file is
    scanned and the first whitespace-separated token of each non-blank line
    is collected as a known term. Each input line is run through jieba's
    TextRank keyword extraction and only keywords that are known terms are
    kept.

    Args:
        text: Input text; it is split with ``str.splitlines`` and each line
            is processed independently.

    Returns:
        A list with one entry per input line, each entry being the matching
        keywords joined by single spaces (an empty string when none match).
    """
    setup_jieba_dictionaries()

    user_dict_terms = set()
    for file_path in (
        'flag/RVT_AddressCh.txt',
        'flag/RVT_AddressEn.txt',
        'flag/RVT_Area.txt',
        'flag/RVT_BuildingCh.txt',
        'flag/RVT_BuildingEn.txt',
    ):
        try:
            # 'utf-8-sig' reads plain UTF-8 unchanged but drops a leading BOM,
            # so the first term of a BOM-prefixed file is not corrupted.
            with open(file_path, 'r', encoding='utf-8-sig') as f:
                for dict_line in f:
                    fields = dict_line.split()
                    # Blank lines have no term; skipping them avoids the
                    # IndexError that used to abort the rest of the file.
                    if fields:
                        user_dict_terms.add(fields[0])
        except FileNotFoundError:
            print(f'File not found: {file_path}')
        except (OSError, UnicodeError) as e:
            # Best effort: report the unreadable file and carry on with
            # whatever terms were collected so far.
            print(f'Error reading file {file_path}: {e}')

    results = []
    for line in text.splitlines():
        keywords = jieba.analyse.textrank(
            line.strip(),
            topK=20,
            withWeight=False,
            allowPOS=('ns', 'n', 'vn', 'v'),
        )
        results.append(
            ' '.join(keyword for keyword in keywords if keyword in user_dict_terms)
        )

    return results

def reformat_text(text):
    """Return the lines of *text* with surrounding whitespace removed.

    Lines that are empty or whitespace-only are dropped; the order of the
    remaining lines is preserved.
    """
    trimmed_lines = (raw_line.strip() for raw_line in text.splitlines())
    return [entry for entry in trimmed_lines if entry]

def process_text_only(text, reformat):
    """Run keyword extraction on *text* and return it as one string.

    Args:
        text: Raw input text, processed line by line by ``process_text``.
        reformat: When truthy, blank result lines are removed (via
            ``reformat_text``) before the final join.

    Returns:
        The per-line keyword strings joined with newline characters.
    """
    joined_keywords = '\n'.join(process_text(text))
    if not reformat:
        return joined_keywords
    # Drop the empty rows produced by lines that yielded no keywords.
    return '\n'.join(reformat_text(joined_keywords))

async def lookup_address(query, language='zh-Hant'):
    """Query the ALS lookup endpoint for *query* and return the parsed JSON.

    A new ``ClientSession`` is opened for the call and the query is sent as
    the ``q`` field of a POST request, using the module-level ``ssl_context``.

    Args:
        query: Text to look up.
        language: Value sent in the ``Accept-Language`` header.

    Returns:
        The decoded JSON body on HTTP 200. On any other status, or when the
        request raises, a dict with a single ``'error'`` key describing the
        failure (the same message is also printed).
    """
    endpoint = 'https://www.als.gov.hk/lookup'
    request_headers = {'Accept': 'application/json', 'Accept-Language': language}
    form_fields = {'q': query}

    async with ClientSession() as session:
        try:
            async with session.post(
                endpoint,
                headers=request_headers,
                data=form_fields,
                ssl=ssl_context,
            ) as response:
                if response.status != 200:
                    message = f'Error fetching data: Status Code {response.status}'
                    print(message)
                    return {'error': message}
                return await response.json()
        except aiohttp.ClientError as e:
            message = f'Client Error: {e}'
        except Exception as e:
            message = f'General Error: {e}'
        # Only reached when one of the handlers above caught an exception.
        print(message)
        return {'error': message}

def _first_premises_address(lookup_results):
    """Return the first suggestion's ``PremisesAddress`` dict, or ``None``.

    ``None`` is returned whenever the payload does not have the expected
    nesting (not a dict, no/empty ``SuggestedAddress`` list, or a first entry
    without ``Address`` -> ``PremisesAddress``), so callers never index into
    a missing level.
    """
    if not isinstance(lookup_results, dict):
        return None
    suggestions = lookup_results.get('SuggestedAddress')
    if not isinstance(suggestions, list) or not suggestions:
        return None
    first_match = suggestions[0]  # Use the first match
    if not isinstance(first_match, dict):
        return None
    address = first_match.get('Address')
    if not isinstance(address, dict):
        return None
    premises_address = address.get('PremisesAddress')
    return premises_address if isinstance(premises_address, dict) else None


async def get_address_lookup_results(keywords):
    """Look up each non-blank keyword and summarise the first match.

    Keywords are stripped, blank ones are skipped, and the remaining ones are
    looked up one at a time with ``lookup_address``.

    Args:
        keywords: Iterable of keyword strings.

    Returns:
        A list of dicts, one per non-blank keyword, with the keys
        ``'Keyword'``, ``'Full Address'``, ``'Geo Address'``, ``'Latitude'``
        and ``'Longitude'``. When the lookup fails or yields no usable
        suggestion, ``'Full Address'`` is ``'No matches found'`` and the other
        fields are ``'N/A'``.
    """
    results = []
    for keyword in keywords:
        keyword = keyword.strip()
        if not keyword:  # Skip empty keywords
            continue

        lookup_results = await lookup_address(keyword)

        # Defaults describe the "no match" case; overwritten below on a hit.
        full_address = 'No matches found'
        geo_address = 'N/A'
        latitude = 'N/A'
        longitude = 'N/A'

        premises_address = _first_premises_address(lookup_results)
        if premises_address is not None:
            raw_address = premises_address.get('ChiPremisesAddress', {})
            # The building name is what gets reported as the "full address".
            full_address = raw_address.get('BuildingName', 'No Building Name')
            geo_address = premises_address.get('GeoAddress', 'N/A')
            # NOTE(review): assumes GeospatialInformation is a dict - confirm
            # against the service's response format.
            geo_info = premises_address.get('GeospatialInformation', {})
            latitude = geo_info.get('Latitude', 'N/A')
            longitude = geo_info.get('Longitude', 'N/A')

        results.append({
            'Keyword': keyword,
            'Full Address': full_address,
            'Geo Address': geo_address,
            'Latitude': latitude,
            'Longitude': longitude,
        })

    return results

async def gradio_function(text, reformat, perform_lookup):
    """Extract keywords from *text* and optionally look them up.

    Args:
        text: Raw input text.
        reformat: Forwarded to ``process_text_only``.
        perform_lookup: When truthy, each line of the extracted keyword text
            is passed to ``get_address_lookup_results``.

    Returns:
        A ``(keyword_text, lookup_results)`` tuple; ``lookup_results`` is an
        empty list when no lookup was requested.
    """
    keyword_text = process_text_only(text, reformat)
    if not perform_lookup:
        return keyword_text, []

    lookup_rows = await get_address_lookup_results(keyword_text.splitlines())
    return keyword_text, lookup_rows

def gradio_interface(text, reformat, perform_lookup):
    """Synchronous entry point: run ``gradio_function`` to completion.

    The coroutine is driven with ``asyncio.run`` and its result is returned
    unchanged.
    """
    pending = gradio_function(text, reformat, perform_lookup)
    return asyncio.run(pending)

# Gradio UI definition. The three inputs map positionally onto the
# (text, reformat, perform_lookup) parameters of gradio_interface, and the
# two outputs receive its (keyword text, lookup results) return pair.
interface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Textbox(lines=20, placeholder="Paste text here, each line will be processed separately..."),
        gr.Checkbox(label="Reformat text (remove empty lines)"),
        gr.Checkbox(label="Perform Address Lookup")
    ],
    outputs=[
        gr.Textbox(label="Extracted Address Keywords"),
        gr.JSON(label="Address Lookup Results")
    ],
    title="Address Extraction and Lookup with Natural Language Processing",
    description="Extract address keywords using NLP and optionally perform address lookup using ALS."
)

# NOTE(review): launch() runs at import time (no __main__ guard), so
# importing this module starts the app.
interface.launch()