Spaces:
Runtime error
Runtime error
Upload 5 files
Browse files- .gitattributes +3 -0
- allCountries.txt +3 -0
- allCountries_2.txt +3 -0
- app.py +4 -0
- cities5000.txt +3 -0
- main.py +158 -0
.gitattributes
CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
allCountries_2.txt filter=lfs diff=lfs merge=lfs -text
|
37 |
+
allCountries.txt filter=lfs diff=lfs merge=lfs -text
|
38 |
+
cities5000.txt filter=lfs diff=lfs merge=lfs -text
|
allCountries.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:31c3e287a7fe8bd50d89267d6e5db10df21fcec230bb7729c8b8fdd40de1ca20
|
3 |
+
size 1619260526
|
allCountries_2.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:31c3e287a7fe8bd50d89267d6e5db10df21fcec230bb7729c8b8fdd40de1ca20
|
3 |
+
size 1619260526
|
app.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Minimal Streamlit placeholder UI for the Space: shows a slider and
# echoes the selected value squared. (The real work lives in main.py.)
import streamlit as st

# Slider with Streamlit's defaults (0-100, starts at 0).
x = st.slider('Select a value')
st.write(x, 'squared is', x * x)
|
cities5000.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:aed4c67d2adcae014c91052ee4c56d8f164225f2d810174356e4d7a88029a128
|
3 |
+
size 10903184
|
main.py
ADDED
@@ -0,0 +1,158 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Toponym recognition demo: run a GeoLM token-classification model over an
# example sentence and print the per-token BIO labels.
# NOTE(review): the names bound here (tokenizer, model, original_words,
# predicted_labels, ...) are module-level and consumed by the rest of the
# script — do not rename them.
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer
import pandas as pd

# Show up to 5 columns when DataFrames are printed later in the script.
pd.options.display.max_columns = 5

# Model name from the Hugging Face model hub.
model_name = "zekun-li/geolm-base-toponym-recognition"

# Load tokenizer and model (downloads weights on first run).
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Example input sentence.
input_sentence = "Minneapolis, officially the City of Minneapolis, is a city in " \
                 "the state of Minnesota and the county seat of Hennepin County."
input_2 = "Los Angeles, often referred to by its initials L.A., is the most populous " \
          "city in California, the most populous U.S. state. It is the commercial, " \
          "financial, and cultural center of Southern California. Los Angeles is the " \
          "second-most populous city in the United States after New York City, with a population of " \
          "roughly 3.9 million residents within the city limits as of 2020."

# Uncomment to run the longer example instead.
# input_sentence = input_2

# Tokenize the input sentence into model input IDs (a tensor batch of one).
tokens = tokenizer.encode(input_sentence, return_tensors="pt")
# Keep the subword token strings so label spans can be mapped back to text.
original_words = tokenizer.convert_ids_to_tokens(tokens[0])
# tokenizer.to
# Pass tokens through the model.
outputs = model(tokens)

# Pick the highest-scoring label id for every token.
predicted_labels = torch.argmax(outputs.logits, dim=2)

predicted_labels = predicted_labels.detach().cpu().numpy()
# Map label ids to label names ("O", "B-Topo", "I-Topo") for the one sentence.
predicted_labels = [model.config.id2label[label] for label in predicted_labels[0]]

# Print predicted labels.
print(predicted_labels)
# Observed output for the Minneapolis sentence:
# ['O', 'B-Topo', 'O', 'O', 'O', 'O', 'O', 'B-Topo', 'O', 'O', 'O', 'O', 'O', 'O',
#  'O', 'O', 'B-Topo', 'O', 'O', 'O', 'O', 'O', 'B-Topo', 'I-Topo', 'I-Topo', 'O', 'O', 'O']
|
44 |
+
|
45 |
+
# Collect the token indices where a toponym starts (label "B-Topo").
name_list = []
for place, label in enumerate(predicted_labels):
    if label == "B-Topo":
        name_list.append(place)

# For each toponym start, measure the span length: 1 for the "B-Topo" token
# plus one for every consecutive "I-Topo" continuation token.
# FIX: the original did `while predicted_labels[i + j]:` with no bounds
# check, which raises IndexError (and never records the span length) when a
# toponym runs to the very last label. The loop below stops at the list end
# and always appends one length per start index.
name_length_list = []
for start in name_list:
    length = 1
    while (start + length < len(predicted_labels)
           and predicted_labels[start + length] == "I-Topo"):
        length += 1
    name_length_list.append(length)

# Show the raw tokens and the detected toponym spans.
print(original_words)
print(name_list)
print(name_length_list)
|
69 |
+
|
70 |
+
# Merge each toponym's "I-Topo" continuation tokens into its "B-Topo" token,
# so original_words holds the full surface form at the span's start index.
for which_word, length in enumerate(name_length_list):
    if length > 1:
        start = name_list[which_word]
        original_words[start] = "".join(original_words[start:start + length])
print(original_words)

# Build the final list of place names: strip WordPiece '#' markers and insert
# a space before any uppercase letter that directly follows another letter
# (e.g. "NewYork" -> "New York").
all_words = []
for start in name_list:
    word = original_words[start].replace("#", "")
    j = 1
    # FIX: the original cached `word_length = len(word)` before the loop, so
    # characters pushed past the cached length by inserted spaces were never
    # examined; re-evaluating len(word) each pass covers the whole string.
    # (Also `&` -> `and`: same result on bools, but idiomatic short-circuit.)
    while j < len(word):
        if word[j].isupper() and word[j - 1].isalpha():
            word = word[:j] + ' ' + word[j:]
        j += 1
    all_words.append(word)
print(all_words)
|
102 |
+
|
103 |
+
# Observed labels from the Minneapolis example, kept for reference:
# ['O', 'B-Topo', 'O', 'O', 'O', 'O', 'O', 'B-Topo', 'O', 'O', 'O', 'O', 'O', 'O',
#  'O', 'O', 'B-Topo', 'O', 'O', 'O', 'O', 'O', 'B-Topo', 'I-Topo', 'I-Topo', 'O', 'O', 'O']
# NOTE(review): open question left by the author — what happened to the other O's?

# Column index -> intended dtype for the GeoNames dump.
# NOTE(review): this dict is never used — read_csv below does not pass
# dtype=dtypes_dict, so pandas infers all column types itself.
dtypes_dict = {
    0: int,  # geonameid
    1: str,  # name
    2: str,  # asciiname
    3: str,  # alternatenames
    4: float,  # latitude
    5: float,  # longitude
    6: str,  # feature class
    7: str,  # feature code
    8: str,  # country code
    9: str,  # cc2
    10: str,  # admin1 code
    11: str,  # admin2 code
    12: str,  # admin3 code
    13: str,  # admin4 code
    14: int,  # population
    15: int,  # elevation
    16: int,  # dem (digital elevation model)
    17: str,  # timezone
    18: str  # modification date yyyy-MM-dd
}

# Load the GeoNames dataset (tab-separated, no header row) into a DataFrame.
# Assumes cities5000.txt is present in the current working directory.
geonames_df = pd.read_csv('cities5000.txt', sep='\t', header=None,
                          names=['geonameid', 'name', 'asciiname', 'alternatenames',
                                 'latitude', 'longitude', 'feature class', 'feature code',
                                 'country code', 'cc2', 'admin1 code', 'admin2 code',
                                 'admin3 code', 'admin4 code', 'population', 'elevation',
                                 'dem', 'timezone', 'modification date'])

# print(geonames_df)

# Number of rows in the gazetteer.
total_words = len(geonames_df)
print(total_words)

# Place names extracted by the model above; used to filter the gazetteer.
string_array_to_compare = all_words

# Keep rows whose 'name' or 'asciiname' exactly matches an extracted toponym,
# or whose 'alternatenames' string contains one as a substring (non-strings,
# e.g. NaN, are treated as no match). NOTE(review): the substring test can
# over-select — "York" would also match entries containing "New York".
filter_condition = (geonames_df['name'].isin(string_array_to_compare) |
                    geonames_df['asciiname'].isin(string_array_to_compare) |
                    geonames_df['alternatenames'].apply(lambda x: any(substring in x for substring in string_array_to_compare) if isinstance(x, str) else False))


# Apply the filter to the DataFrame.
filtered_df = geonames_df[filter_condition]

# Print the filtered DataFrame.
print(filtered_df)

# Dump only the matching alternate-name strings as CSV text.
print(filtered_df['alternatenames'].to_csv(index=False))
|