# NOTE: this script was copied from a Hugging Face Space; the page's
# "Spaces: Runtime error" banner text was captured along with the code.
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer
import pandas as pd

# Show up to 5 columns when printing DataFrames.
# (The original comment claimed "all columns"; use None for that instead.)
pd.options.display.max_columns = 5

# Toponym-recognition (token classification) model on the Hugging Face Hub.
model_name = "zekun-li/geolm-base-toponym-recognition"

# Load tokenizer and model weights from the Hub.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
# Example input sentence | |
input_sentence = "Minneapolis, officially the City of Minneapolis, is a city in " \ | |
"the state of Minnesota and the county seat of Hennepin County." | |
input_2 = "Los Angeles, often referred to by its initials L.A., is the most populous " \ | |
"city in California, the most populous U.S. state. It is the commercial, " \ | |
"financial, and cultural center of Southern California. Los Angeles is the " \ | |
"second-most populous city in the United States after New York City, with a population of " \ | |
"roughly 3.9 million residents within the city limits as of 2020." | |
# input_sentence = input_2 | |
# Tokenize input sentence | |
tokens = tokenizer.encode(input_sentence, return_tensors="pt") | |
original_words = tokenizer.convert_ids_to_tokens(tokens[0]) | |
# tokenizer.to | |
# Pass tokens through the model | |
outputs = model(tokens) | |
# Retrieve predicted labels for each token | |
predicted_labels = torch.argmax(outputs.logits, dim=2) | |
predicted_labels = predicted_labels.detach().cpu().numpy() | |
# Decode predicted labels | |
predicted_labels = [model.config.id2label[label] for label in predicted_labels[0]] | |
# Print predicted labels | |
print(predicted_labels) | |
# Example output for the Minneapolis sentence:
# ['O', 'B-Topo', 'O', 'O', 'O', 'O', 'O', 'B-Topo', 'O', 'O', 'O', 'O', 'O', 'O',
#  'O', 'O', 'B-Topo', 'O', 'O', 'O', 'O', 'O', 'B-Topo', 'I-Topo', 'I-Topo', 'O', 'O', 'O']

# Token indices where a toponym begins (label "B-Topo").
name_list = [place for place, label in enumerate(predicted_labels)
             if label == "B-Topo"]

# For each B-Topo token, count the length of its span: the B-Topo token
# itself plus any immediately following "I-Topo" continuation tokens.
name_length_list = []
for start in name_list:
    length = 1
    # Bounds check: the original `while predicted_labels[i + j]:` relied on
    # string truthiness and raised IndexError (dropping the span) whenever a
    # toponym ran to the very last token.
    while (start + length < len(predicted_labels)
           and predicted_labels[start + length] == "I-Topo"):
        length += 1
    name_length_list.append(length)

print(original_words)
print(name_list)
print(name_length_list)
# Merge multi-token toponyms: concatenate each B-Topo token with its I-Topo
# continuation tokens and store the merged string back at the B-Topo index.
# Single-token toponyms (span length 1) are left untouched.
for span_idx, span_len in enumerate(name_length_list):
    if span_len == 1:
        continue
    start = name_list[span_idx]
    original_words[start] = "".join(original_words[start:start + span_len])
print(original_words)
# Collect the toponym strings: drop WordPiece '#' markers and insert a space
# before every intra-word capital letter (an artifact of merging sub-tokens,
# e.g. "HennepinCounty" -> "Hennepin County").
all_words = []
for start in name_list:
    word = original_words[start].replace("#", "")
    j = 1
    # Re-evaluate len(word) each pass: the original cached the length before
    # the loop, so characters past the pre-insertion length went unchecked
    # once spaces were inserted. Also use logical `and` instead of bitwise `&`.
    while j < len(word):
        if word[j].isupper() and word[j - 1].isalpha():
            word = word[:j] + ' ' + word[j:]
        j += 1
    all_words.append(word)
print(all_words)
# Column index -> dtype for the GeoNames "cities5000" table.
# NOTE(review): this dict is defined but never passed to read_csv below;
# wiring it in via dtype= would likely fail on columns with missing values
# (e.g. elevation), so it is kept here as documentation of the schema.
dtypes_dict = {
    0: int,     # geonameid
    1: str,     # name
    2: str,     # asciiname
    3: str,     # alternatenames
    4: float,   # latitude
    5: float,   # longitude
    6: str,     # feature class
    7: str,     # feature code
    8: str,     # country code
    9: str,     # cc2
    10: str,    # admin1 code
    11: str,    # admin2 code
    12: str,    # admin3 code
    13: str,    # admin4 code
    14: int,    # population
    15: int,    # elevation
    16: int,    # dem (digital elevation model)
    17: str,    # timezone
    18: str,    # modification date yyyy-MM-dd
}
# Load the GeoNames dataset (tab-separated, no header row) into a DataFrame.
geonames_df = pd.read_csv('cities5000.txt', sep='\t', header=None,
                          names=['geonameid', 'name', 'asciiname', 'alternatenames',
                                 'latitude', 'longitude', 'feature class', 'feature code',
                                 'country code', 'cc2', 'admin1 code', 'admin2 code',
                                 'admin3 code', 'admin4 code', 'population', 'elevation',
                                 'dem', 'timezone', 'modification date'])

# Number of rows (places) in the gazetteer.
total_words = len(geonames_df)
print(total_words)

# Toponym strings extracted from the model output, to match against the gazetteer.
string_array_to_compare = all_words

# A row matches when 'name' or 'asciiname' equals one of the toponyms, or when
# any toponym appears as a substring of 'alternatenames'.
# NOTE(review): the substring test can over-match (e.g. "York" matches
# "New York"); splitting alternatenames on ',' would give exact matching.
filter_condition = (geonames_df['name'].isin(string_array_to_compare) |
                    geonames_df['asciiname'].isin(string_array_to_compare) |
                    geonames_df['alternatenames'].apply(
                        lambda x: any(substring in x for substring in string_array_to_compare)
                        if isinstance(x, str) else False))

# Apply the filter and show the matching gazetteer entries.
filtered_df = geonames_df[filter_condition]
print(filtered_df)
print(filtered_df['alternatenames'].to_csv(index=False))