# geolm-linking / main.py
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer
import pandas as pd
# Limit printed DataFrames to at most 5 columns
pd.options.display.max_columns = 5
# Model name from Hugging Face model hub
model_name = "zekun-li/geolm-base-toponym-recognition"
# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
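# The model is a token classifier: every WordPiece token is assigned one of the labels
# "O" (not a toponym), "B-Topo" (first token of a toponym) or "I-Topo" (continuation).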
# Example input sentence
input_sentence = "Minneapolis, officially the City of Minneapolis, is a city in " \
"the state of Minnesota and the county seat of Hennepin County."
input_2 = "Los Angeles, often referred to by its initials L.A., is the most populous " \
"city in California, the most populous U.S. state. It is the commercial, " \
"financial, and cultural center of Southern California. Los Angeles is the " \
"second-most populous city in the United States after New York City, with a population of " \
"roughly 3.9 million residents within the city limits as of 2020."
# input_sentence = input_2
# Tokenize input sentence
tokens = tokenizer.encode(input_sentence, return_tensors="pt")
original_words = tokenizer.convert_ids_to_tokens(tokens[0])
# Pass tokens through the model
outputs = model(tokens)
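# Note: for inference only, this forward pass could be wrapped in torch.no_grad()
# to skip gradient tracking; the predicted labels are unchanged either way.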
# Retrieve predicted labels for each token
predicted_labels = torch.argmax(outputs.logits, dim=2)
predicted_labels = predicted_labels.detach().cpu().numpy()
# Decode predicted labels
predicted_labels = [model.config.id2label[label] for label in predicted_labels[0]]
# Print predicted labels
print(predicted_labels)
# ['O', 'B-Topo', 'O', 'O', 'O', 'O', 'O', 'B-Topo', 'O', 'O', 'O', 'O', 'O', 'O',
# 'O', 'O', 'B-Topo', 'O', 'O', 'O', 'O', 'O', 'B-Topo', 'I-Topo', 'I-Topo', 'O', 'O', 'O']
name_list = []  # token indices where a "B-Topo" label appears
place = 0
# This loop records the index of every token labelled "B-Topo".
for i in predicted_labels:
    if i == "B-Topo":
        name_list.append(place)
    place = place + 1
# For each "B-Topo" index, record the toponym's length in tokens:
# 1 plus the number of consecutive "I-Topo" labels that follow it.
name_length_list = []
for i in name_list:
    j = 1
    while i + j < len(predicted_labels) and predicted_labels[i + j] == "I-Topo":
        j = j + 1
    name_length_list.append(j)
# Look up the detected words using name_list and name_length_list.
print(original_words)
print(name_list)
print(name_length_list)
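# Given the example label sequence shown above, this gives
# name_list -> [1, 7, 16, 22] and name_length_list -> [1, 1, 1, 3]
# (three single-token toponyms and one toponym spanning three tokens).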
# This part merges each "I-Topo" token into the preceding "B-Topo" token.
which_word = 0
for length in name_length_list:
    if length == 1:
        which_word += 1
        continue
    else:
        start_topo = original_words[name_list[which_word]]
        i = 1
        while i < length:
            start_topo = start_topo + original_words[name_list[which_word] + i]
            i += 1
        original_words[name_list[which_word]] = start_topo
        which_word += 1
print(original_words)
# This part collects the detected toponyms and strips the WordPiece '#' markers.
all_words = []
i = 0
while i < len(name_list):
    word = original_words[name_list[i]]
    word = word.replace("#", "")
    # Re-insert a space before an uppercase letter that follows another letter,
    # e.g. a merged token like "NewYork" becomes "New York".
    j = 1
    while j < len(word):  # use len(word) directly, since the word grows as spaces are inserted
        if word[j].isupper() and word[j - 1].isalpha():
            word = word[:j] + ' ' + word[j:]
        j += 1
    all_words.append(word)
    i += 1
print(all_words)
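# For the example sentence this is expected to yield something like
# ['Minneapolis', 'Minneapolis', 'Minnesota', 'Hennepin County'],
# assuming the three-token span above corresponds to "Hennepin County" split into subwords.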
# What happened to the other 'O' labels?
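# Column types of the GeoNames dump, keyed by column position.
# (Kept for reference; the dict is not passed to read_csv below, in part because
# columns such as 'elevation' can contain missing values that an int dtype would reject.)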
dtypes_dict = {
    0: int,    # geonameid
    1: str,    # name
    2: str,    # asciiname
    3: str,    # alternatenames
    4: float,  # latitude
    5: float,  # longitude
    6: str,    # feature class
    7: str,    # feature code
    8: str,    # country code
    9: str,    # cc2
    10: str,   # admin1 code
    11: str,   # admin2 code
    12: str,   # admin3 code
    13: str,   # admin4 code
    14: int,   # population
    15: int,   # elevation
    16: int,   # dem (digital elevation model)
    17: str,   # timezone
    18: str    # modification date yyyy-MM-dd
}
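# Note: 'cities5000.txt' is the tab-separated GeoNames extract of populated places
# (roughly, those with population above 5000); it has no header row and is expected
# to sit next to this script. See the GeoNames download page for the file.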
# Load the Geonames dataset into a Pandas DataFrame
geonames_df = pd.read_csv('cities5000.txt', sep='\t', header=None,
                          names=['geonameid', 'name', 'asciiname', 'alternatenames',
                                 'latitude', 'longitude', 'feature class', 'feature code',
                                 'country code', 'cc2', 'admin1 code', 'admin2 code',
                                 'admin3 code', 'admin4 code', 'population', 'elevation',
                                 'dem', 'timezone', 'modification date'])
# print(geonames_df)
# Total number of rows (places) in the GeoNames table.
total_words = len(geonames_df)
print(total_words)
# String array to compare
string_array_to_compare = all_words
# Create a filter using isin() to check if 'name', 'asciiname', or 'alternatenames' match any string in the array
filter_condition = (geonames_df['name'].isin(string_array_to_compare) |
                    geonames_df['asciiname'].isin(string_array_to_compare) |
                    geonames_df['alternatenames'].apply(
                        lambda x: any(substring in x for substring in string_array_to_compare)
                        if isinstance(x, str) else False))
# Apply the filter to the DataFrame
filtered_df = geonames_df[filter_condition]
# Print the filtered DataFrame
print(filtered_df)
print(filtered_df['alternatenames'].to_csv(index=False))
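# Caveat: the substring test on 'alternatenames' can over-match, since a short toponym
# may appear inside a longer, unrelated alternate name. A stricter variant, if needed,
# is to split the comma-separated field and require an exact match. Sketched below,
# but not applied to filtered_df:
exact_alt_match = geonames_df['alternatenames'].apply(
    lambda x: any(alt in string_array_to_compare for alt in x.split(',')) if isinstance(x, str) else False)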