Files changed (6) hide show
  1. .gitattributes +3 -0
  2. allCountries.txt +3 -0
  3. allCountries_2.txt +3 -0
  4. app.py +4 -0
  5. cities5000.txt +3 -0
  6. main.py +158 -0
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ allCountries_2.txt filter=lfs diff=lfs merge=lfs -text
37
+ allCountries.txt filter=lfs diff=lfs merge=lfs -text
38
+ cities5000.txt filter=lfs diff=lfs merge=lfs -text
allCountries.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:31c3e287a7fe8bd50d89267d6e5db10df21fcec230bb7729c8b8fdd40de1ca20
3
+ size 1619260526
allCountries_2.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:31c3e287a7fe8bd50d89267d6e5db10df21fcec230bb7729c8b8fdd40de1ca20
3
+ size 1619260526
app.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
import streamlit as st

# Minimal Streamlit demo: a slider whose current value is echoed back squared.
slider_value = st.slider('Select a value')
st.write(slider_value, 'squared is', slider_value * slider_value)
cities5000.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aed4c67d2adcae014c91052ee4c56d8f164225f2d810174356e4d7a88029a128
3
+ size 10903184
main.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer
import pandas as pd

# Show up to 5 columns when DataFrames are printed.
pd.options.display.max_columns = 5

# Toponym-recognition checkpoint on the Hugging Face model hub.
model_name = "zekun-li/geolm-base-toponym-recognition"

# Tokenizer and token-classification head come from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
# Example sentences to run through the toponym recognizer.
input_sentence = (
    "Minneapolis, officially the City of Minneapolis, is a city in "
    "the state of Minnesota and the county seat of Hennepin County."
)
input_2 = (
    "Los Angeles, often referred to by its initials L.A., is the most populous "
    "city in California, the most populous U.S. state. It is the commercial, "
    "financial, and cultural center of Southern California. Los Angeles is the "
    "second-most populous city in the United States after New York City, with a population of "
    "roughly 3.9 million residents within the city limits as of 2020."
)

# Uncomment to analyse the longer example instead.
# input_sentence = input_2
# Encode the input sentence to a batch of token ids (1 x seq_len tensor).
tokens = tokenizer.encode(input_sentence, return_tensors="pt")
original_words = tokenizer.convert_ids_to_tokens(tokens[0])

# Run the token-classification model over the encoded tokens.
outputs = model(tokens)

# Highest-scoring label id per token, then map ids to label names.
predicted_labels = torch.argmax(outputs.logits, dim=2)
predicted_labels = predicted_labels.detach().cpu().numpy()
predicted_labels = [model.config.id2label[label] for label in predicted_labels[0]]

print(predicted_labels)
# e.g. ['O', 'B-Topo', 'O', 'O', ..., 'B-Topo', 'I-Topo', 'I-Topo', 'O', 'O', 'O']
# Positions of tokens labelled "B-Topo" (the first token of each toponym).
# The original maintained a manual `place` counter alongside the loop;
# enumerate() expresses the same scan directly.
name_list = [position for position, label in enumerate(predicted_labels)
             if label == "B-Topo"]
# For every toponym start, count the run of "I-Topo" continuation tokens
# that follows; the stored length includes the "B-Topo" token itself.
# Fix: the original scan (`while predicted_labels[i + j]:`) relied on hitting
# a non-"I-Topo" label to break and indexed past the end of the list —
# raising IndexError whenever a toponym ended the sequence. Bounding the
# scan by len(predicted_labels) removes that failure mode.
name_length_list = []
label_count = len(predicted_labels)
for start in name_list:
    span = 1
    while start + span < label_count and predicted_labels[start + span] == "I-Topo":
        span += 1
    name_length_list.append(span)

# Show the pieces used to reassemble the toponyms below.
print(original_words)
print(name_list)
print(name_length_list)
# Merge each multi-token toponym back into its leading token: the sub-word
# pieces at positions pos .. pos+length-1 are concatenated onto
# original_words[pos]. Single-token toponyms are left as they are.
for word_index, length in enumerate(name_length_list):
    if length > 1:
        pos = name_list[word_index]
        original_words[pos] = "".join(original_words[pos:pos + length])
print(original_words)
# Collect the merged toponym tokens, strip WordPiece '#' markers, and put a
# space back before each internal capital letter (e.g. "LosAngeles" ->
# "Los Angeles").
# Fixes two defects in the original loop: it cached len(word) BEFORE
# inserting spaces, so with several insertions the trailing characters of a
# token were never scanned and late capitals stayed unseparated; and it used
# bitwise '&' where logical 'and' was meant.
all_words = []
for position in name_list:
    word = original_words[position].replace("#", "")
    pieces = []
    for k, ch in enumerate(word):
        # Space before an uppercase letter that directly follows a letter.
        if k > 0 and ch.isupper() and word[k - 1].isalpha():
            pieces.append(' ')
        pieces.append(ch)
    all_words.append("".join(pieces))
print(all_words)
103
+ # ['O', 'B-Topo', 'O', 'O', 'O', 'O', 'O', 'B-Topo', 'O', 'O', 'O', 'O', 'O', 'O',
104
+ # 'O', 'O', 'B-Topo', 'O', 'O', 'O', 'O', 'O', 'B-Topo', 'I-Topo', 'I-Topo', 'O', 'O', 'O']
105
+ # what happened to other 0s #
106
+
# Column dtypes for the GeoNames dump, keyed by column position.
# NOTE(review): this dict is defined but never passed to read_csv below —
# confirm whether dtype enforcement was intended (the int columns would need
# a nullable type first, since elevation is often missing).
dtypes_dict = {
    0: int,     # geonameid
    1: str,     # name
    2: str,     # asciiname
    3: str,     # alternatenames
    4: float,   # latitude
    5: float,   # longitude
    6: str,     # feature class
    7: str,     # feature code
    8: str,     # country code
    9: str,     # cc2
    10: str,    # admin1 code
    11: str,    # admin2 code
    12: str,    # admin3 code
    13: str,    # admin4 code
    14: int,    # population
    15: int,    # elevation
    16: int,    # dem (digital elevation model)
    17: str,    # timezone
    18: str,    # modification date yyyy-MM-dd
}
# Column names for the GeoNames tab-separated dump (no header row in file).
geonames_columns = ['geonameid', 'name', 'asciiname', 'alternatenames',
                    'latitude', 'longitude', 'feature class', 'feature code',
                    'country code', 'cc2', 'admin1 code', 'admin2 code',
                    'admin3 code', 'admin4 code', 'population', 'elevation',
                    'dem', 'timezone', 'modification date']

# Load the GeoNames cities dataset into a DataFrame.
geonames_df = pd.read_csv('cities5000.txt', sep='\t', header=None,
                          names=geonames_columns)
# print(geonames_df)

# Number of gazetteer rows that were loaded.
total_words = len(geonames_df)
print(total_words)

# Toponyms extracted above, to be matched against the gazetteer.
string_array_to_compare = all_words
# Keep gazetteer rows whose 'name' or 'asciiname' equals one of the extracted
# toponyms, or whose 'alternatenames' field lists one of them.
# Fix: the original used a raw substring test on 'alternatenames', so e.g.
# "York" matched "New Yorkshire". GeoNames stores alternatenames as a
# comma-separated list, so split on ',' and match whole entries only.
# A set also gives O(1) membership tests instead of scanning the list.
target_names = set(string_array_to_compare)


def _alternatenames_match(field):
    """True if any comma-separated entry of *field* is an extracted toponym."""
    if not isinstance(field, str):
        return False  # NaN for rows without alternate names
    return any(entry in target_names for entry in field.split(','))


filter_condition = (geonames_df['name'].isin(target_names) |
                    geonames_df['asciiname'].isin(target_names) |
                    geonames_df['alternatenames'].apply(_alternatenames_match))

# Apply the filter to the DataFrame.
filtered_df = geonames_df[filter_condition]

# Print the filtered DataFrame.
print(filtered_df)

print(filtered_df['alternatenames'].to_csv(index=False))