import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer
import pandas as pd
# Show at most five columns when printing DataFrames
pd.options.display.max_columns = 5
# Model name from Hugging Face model hub
model_name = "zekun-li/geolm-base-toponym-recognition"
# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
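# Optional sketch (not in the original script): move the model to a GPU when
# one is available; the `tokens` tensor below would then need the same
# .to(device) call.
# device = "cuda" if torch.cuda.is_available() else "cpu"
# model = model.to(device).eval()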
# Example input sentence
input_sentence = "Minneapolis, officially the City of Minneapolis, is a city in " \
"the state of Minnesota and the county seat of Hennepin County."
input_2 = "Los Angeles, often referred to by its initials L.A., is the most populous " \
"city in California, the most populous U.S. state. It is the commercial, " \
"financial, and cultural center of Southern California. Los Angeles is the " \
"second-most populous city in the United States after New York City, with a population of " \
"roughly 3.9 million residents within the city limits as of 2020."
# input_sentence = input_2
# Tokenize input sentence
tokens = tokenizer.encode(input_sentence, return_tensors="pt")
original_words = tokenizer.convert_ids_to_tokens(tokens[0])
# Pass tokens through the model (inference only, so no gradient tracking)
with torch.no_grad():
    outputs = model(tokens)
# Retrieve predicted labels for each token
predicted_labels = torch.argmax(outputs.logits, dim=2)
predicted_labels = predicted_labels.detach().cpu().numpy()
# Decode predicted labels
predicted_labels = [model.config.id2label[label] for label in predicted_labels[0]]
# Print predicted labels
print(predicted_labels)
# ['O', 'B-Topo', 'O', 'O', 'O', 'O', 'O', 'B-Topo', 'O', 'O', 'O', 'O', 'O', 'O',
# 'O', 'O', 'B-Topo', 'O', 'O', 'O', 'O', 'O', 'B-Topo', 'I-Topo', 'I-Topo', 'O', 'O', 'O']
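# Sketch of an alternative (not used here): the transformers pipeline API can
# perform the B-/I- span grouping done by hand below, assuming it recognizes
# this model's B-Topo/I-Topo scheme:
# from transformers import pipeline
# ner = pipeline("token-classification", model=model, tokenizer=tokenizer,
#                aggregation_strategy="simple")
# print(ner(input_sentence))  # entity spans with 'word', 'start', 'end', 'score'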
name_list = []  # token indices where a B-Topo label appears
place = 0
# Record the index of every token labeled B-Topo.
for label in predicted_labels:
    if label == "B-Topo":
        name_list.append(place)
    place += 1
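# Equivalent one-liner (sketch): enumerate() yields the index directly:
# name_list = [idx for idx, lab in enumerate(predicted_labels) if lab == "B-Topo"]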
# For each B-Topo token, count how many consecutive I-Topo tokens follow it.
name_length_list = []
for start in name_list:
    length = 1  # the B-Topo token itself
    while start + length < len(predicted_labels) and predicted_labels[start + length] == "I-Topo":
        length += 1
    name_length_list.append(length)
# Debug output: the tokens plus the recovered span starts and lengths.
print(original_words)
print(name_list)
print(name_length_list)
# Merge each I-Topo subword into its B-Topo token in place.
for start, length in zip(name_list, name_length_list):
    if length == 1:
        continue
    original_words[start] = "".join(original_words[start:start + length])
print(original_words)
# Collect the merged toponyms, strip the '##' subword markers, and re-insert
# a space before each interior uppercase letter (e.g. "LosAngeles" -> "Los Angeles").
all_words = []
for start in name_list:
    word = original_words[start].replace("#", "")
    j = 1
    while j < len(word):  # len(word) grows as spaces are inserted
        if word[j].isupper() and word[j - 1].isalpha():
            word = word[:j] + ' ' + word[j:]
        j += 1
    all_words.append(word)
print(all_words)
# Tokens labeled 'O' are non-toponyms and are simply skipped above.
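# Sketch (assumes BERT-style '##' subwords): tokenizer.convert_tokens_to_string
# can rebuild surface text from an unmerged token span, e.g. the hypothetical
# split ['Minnea', '##polis'] -> 'Minneapolis', which would replace the manual
# merging and '#' stripping above.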
dtypes_dict = {
0: int, # geonameid
1: str, # name
2: str, # asciiname
3: str, # alternatenames
4: float, # latitude
5: float, # longitude
6: str, # feature class
7: str, # feature code
8: str, # country code
9: str, # cc2
10: str, # admin1 code
11: str, # admin2 code
12: str, # admin3 code
13: str, # admin4 code
14: int, # population
15: int, # elevation
16: int, # dem (digital elevation model)
17: str, # timezone
18: str # modification date yyyy-MM-dd
}
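# Note: dtypes_dict above is never passed to read_csv below; once `names=` is
# supplied, dtype keys must be column names, and integer columns containing
# blanks (elevation, often) need pandas' nullable 'Int64'. A hedged sketch:
# dtype={'population': 'Int64', 'elevation': 'Int64', 'dem': 'Int64'}
# could be added to the pd.read_csv call.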
# Load the Geonames dataset into a Pandas DataFrame
geonames_df = pd.read_csv('cities5000.txt', sep='\t', header=None,
names=['geonameid', 'name', 'asciiname', 'alternatenames',
'latitude', 'longitude', 'feature class', 'feature code',
'country code', 'cc2', 'admin1 code', 'admin2 code',
'admin3 code', 'admin4 code', 'population', 'elevation',
'dem', 'timezone', 'modification date'])
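# (cities5000.txt is the GeoNames dump of cities with population > 5000,
#  available from https://download.geonames.org/export/dump/)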
# print(geonames_df)
# Number of rows in the GeoNames table.
total_rows = len(geonames_df)
print(total_rows)
# Toponym strings to look up in GeoNames
string_array_to_compare = all_words
# Match any extracted toponym against 'name', 'asciiname', or (as a substring)
# the comma-separated 'alternatenames' field.
filter_condition = (
    geonames_df['name'].isin(string_array_to_compare)
    | geonames_df['asciiname'].isin(string_array_to_compare)
    | geonames_df['alternatenames'].apply(
        lambda x: any(sub in x for sub in string_array_to_compare)
        if isinstance(x, str) else False)
)
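# Caveat sketch: substring matching on 'alternatenames' can over-match
# ("York" also hits "New York"); a stricter check splits the comma-separated
# field into exact names:
# exact_alt = geonames_df['alternatenames'].apply(
#     lambda x: bool(set(x.split(',')) & set(string_array_to_compare))
#     if isinstance(x, str) else False)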
# Apply the filter to the DataFrame
filtered_df = geonames_df[filter_condition]
# Print the filtered DataFrame
print(filtered_df)
print(filtered_df['alternatenames'].to_csv(index=False))
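# Follow-up sketch: the matched rows already carry coordinates, so a simple
# geocoding result is available from the columns defined above:
# print(filtered_df[['name', 'latitude', 'longitude', 'country code']])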