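"""Toponym recognition with GeoLM, grounded against the GeoNames gazetteer.

Runs zekun-li/geolm-base-toponym-recognition over an example sentence,
extracts the predicted B-Topo/I-Topo spans, reassembles them into place
names, and looks those names up in the GeoNames cities5000 dump.
"""
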
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer
import pandas as pd

# Show all columns when printing DataFrames
pd.options.display.max_columns = None

# Model name from Hugging Face model hub
model_name = "zekun-li/geolm-base-toponym-recognition"

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
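
# Inference only: put the model in eval mode so dropout layers are disabled
model.eval()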

# Example input sentence
input_sentence = "Minneapolis, officially the City of Minneapolis, is a city in " \
                 "the state of Minnesota and the county seat of Hennepin County."
input_2 = "Los Angeles, often referred to by its initials L.A., is the most populous " \
          "city in California, the most populous U.S. state. It is the commercial, " \
          "financial, and cultural center of Southern California. Los Angeles is the " \
          "second-most populous city in the United States after New York City, with a " \
          "population of roughly 3.9 million residents within the city limits as of 2020."

# Uncomment to run the longer example instead:
# input_sentence = input_2

# Tokenize input sentence
tokens = tokenizer.encode(input_sentence, return_tensors="pt")
original_words = tokenizer.convert_ids_to_tokens(tokens[0])
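# Note: encode() adds the special [CLS]/[SEP] tokens, and rare words are
# split into WordPiece pieces (e.g. "Hen", "##ne", "##pin"), so the label
# indices below refer to subword tokens rather than whole words.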
# Run the tokens through the model; no gradients are needed for inference
with torch.no_grad():
    outputs = model(tokens)

# Retrieve predicted labels for each token
predicted_labels = torch.argmax(outputs.logits, dim=2)

predicted_labels = predicted_labels.detach().cpu().numpy()
# Map label ids to tag names ("B-Topo", "I-Topo", "O")
predicted_labels = [model.config.id2label[label] for label in predicted_labels[0]]

# Print predicted labels
print(predicted_labels)
# ['O', 'B-Topo', 'O', 'O', 'O', 'O', 'O', 'B-Topo', 'O', 'O', 'O', 'O', 'O', 'O',
# 'O', 'O', 'B-Topo', 'O', 'O', 'O', 'O', 'O', 'B-Topo', 'I-Topo', 'I-Topo', 'O', 'O', 'O']

# Indices of tokens labeled "B-Topo" (the start of each toponym span)
name_list = [place for place, label in enumerate(predicted_labels) if label == "B-Topo"]

# For each span start, count how many "I-Topo" continuation tokens follow;
# name_length_list[k] is then the token length of the k-th toponym span.
name_length_list = []
for i in name_list:
    j = 1
    while i + j < len(predicted_labels) and predicted_labels[i + j] == "I-Topo":
        j += 1
    name_length_list.append(j)

# Inspect the subword tokens, span starts, and span lengths
print(original_words)
print(name_list)
print(name_length_list)

# Merge each multi-token span into its starting slot: the B-Topo token
# absorbs the I-Topo pieces that follow it.
for which_word, length in enumerate(name_length_list):
    if length > 1:
        start = name_list[which_word]
        original_words[start] = "".join(original_words[start:start + length])
print(original_words)
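# A merged entry now looks like e.g. "Hen##ne##pin" (the exact pieces depend
# on the tokenizer); the '#' markers are stripped in the next step.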

# Clean each recognized toponym: drop the WordPiece '#' markers, then
# re-insert a space before an uppercase letter that directly follows another
# letter (e.g. "HennepinCounty" -> "Hennepin County").
all_words = []
for start in name_list:
    word = original_words[start].replace("#", "")
    j = 1
    while j < len(word):
        if word[j].isupper() and word[j - 1].isalpha():
            word = word[:j] + ' ' + word[j:]
            j += 1  # step over the inserted space
        j += 1
    all_words.append(word)
print(all_words)
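# For the Minneapolis sentence this yields something like
# ['Minneapolis', 'Minneapolis', 'Minnesota', 'Hennepin']
# (the exact strings depend on how the tokenizer splits words).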

# The "O" labels are simply skipped: only tokens inside B-Topo/I-Topo spans
# make it into all_words.

# Column schema of the GeoNames dump (kept for reference only; it is not
# passed to read_csv because some numeric columns may contain missing values)
dtypes_dict = {
    0: int,  # geonameid
    1: str,  # name
    2: str,  # asciiname
    3: str,  # alternatenames
    4: float,  # latitude
    5: float,  # longitude
    6: str,  # feature class
    7: str,  # feature code
    8: str,  # country code
    9: str,  # cc2
    10: str,  # admin1 code
    11: str,  # admin2 code
    12: str,  # admin3 code
    13: str,  # admin4 code
    14: int,  # population
    15: int,  # elevation
    16: int,  # dem (digital elevation model)
    17: str,  # timezone
    18: str  # modification date yyyy-MM-dd
}
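
# cities5000.txt is the GeoNames extract of cities with a population of
# roughly 5,000 or more (see https://download.geonames.org/export/dump/).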

# Load the Geonames dataset into a Pandas DataFrame
geonames_df = pd.read_csv('cities5000.txt', sep='\t', header=None,
                          names=['geonameid', 'name', 'asciiname', 'alternatenames',
                                 'latitude', 'longitude', 'feature class', 'feature code',
                                 'country code', 'cc2', 'admin1 code', 'admin2 code',
                                 'admin3 code', 'admin4 code', 'population', 'elevation',
                                 'dem', 'timezone', 'modification date'])

# print(geonames_df)

# Number of rows (places) in the GeoNames table
total_places = len(geonames_df)
print(total_places)

# Recognized toponyms to look up in GeoNames
string_array_to_compare = all_words

# Match rows whose 'name' or 'asciiname' equals a recognized toponym, or
# whose 'alternatenames' field contains one as a substring
filter_condition = (geonames_df['name'].isin(string_array_to_compare) |
                    geonames_df['asciiname'].isin(string_array_to_compare) |
                    geonames_df['alternatenames'].apply(
                        lambda x: any(substring in x for substring in string_array_to_compare)
                        if isinstance(x, str) else False))
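# Note: the alternatenames check is a plain substring match, so a short
# toponym can match inside longer, unrelated names. Splitting the
# comma-separated field and comparing exact entries would be stricter.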


# Apply the filter to the DataFrame
filtered_df = geonames_df[filter_condition]

# Print the filtered DataFrame
print(filtered_df)

# Dump the alternate names of the matched places as CSV text
print(filtered_df['alternatenames'].to_csv(index=False))
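
# Possible next step: geocode each recognized toponym by pulling coordinates
# from the matched rows.
print(filtered_df[['name', 'country code', 'latitude', 'longitude']])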