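"""Toponym recognition with GeoLM, grounded against the GeoNames gazetteer.

Runs zekun-li/geolm-base-toponym-recognition over an example sentence,
extracts the predicted B-Topo/I-Topo spans, reassembles them into place
names, and looks those names up in the GeoNames cities5000 dump.
"""
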
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer
import pandas as pd

# Show all columns when printing DataFrames
pd.options.display.max_columns = None

# Model name from Hugging Face model hub
model_name = "zekun-li/geolm-base-toponym-recognition"

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
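
# Inference only: put the model in eval mode so dropout layers are disabled
model.eval()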

# Example input sentence
input_sentence = "Minneapolis, officially the City of Minneapolis, is a city in " \
                 "the state of Minnesota and the county seat of Hennepin County."
input_2 = "Los Angeles, often referred to by its initials L.A., is the most populous " \
          "city in California, the most populous U.S. state. It is the commercial, " \
          "financial, and cultural center of Southern California. Los Angeles is the " \
          "second-most populous city in the United States after New York City, with a " \
          "population of roughly 3.9 million residents within the city limits as of 2020."

# Uncomment to run the longer example instead:
# input_sentence = input_2

# Tokenize input sentence
tokens = tokenizer.encode(input_sentence, return_tensors="pt")
original_words = tokenizer.convert_ids_to_tokens(tokens[0])
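# Note: encode() adds the special [CLS]/[SEP] tokens, and rare words are
# split into WordPiece pieces (e.g. "Hen", "##ne", "##pin"), so the label
# indices below refer to subword tokens rather than whole words.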
# Run the tokens through the model; no gradients are needed for inference
with torch.no_grad():
    outputs = model(tokens)

# Retrieve predicted labels for each token
predicted_labels = torch.argmax(outputs.logits, dim=2)

predicted_labels = predicted_labels.detach().cpu().numpy()
# Map label ids to tag names ("B-Topo", "I-Topo", "O")
predicted_labels = [model.config.id2label[label] for label in predicted_labels[0]]

# Print predicted labels
print(predicted_labels)
# ['O', 'B-Topo', 'O', 'O', 'O', 'O', 'O', 'B-Topo', 'O', 'O', 'O', 'O', 'O', 'O',
# 'O', 'O', 'B-Topo', 'O', 'O', 'O', 'O', 'O', 'B-Topo', 'I-Topo', 'I-Topo', 'O', 'O', 'O']

# Indices of tokens labeled "B-Topo" (the start of each toponym span)
name_list = [place for place, label in enumerate(predicted_labels) if label == "B-Topo"]

# For each span start, count how many "I-Topo" continuation tokens follow;
# name_length_list[k] is then the token length of the k-th toponym span.
name_length_list = []
for i in name_list:
    j = 1
    while i + j < len(predicted_labels) and predicted_labels[i + j] == "I-Topo":
        j += 1
    name_length_list.append(j)

# Inspect the subword tokens, span starts, and span lengths
print(original_words)
print(name_list)
print(name_length_list)

# Merge each multi-token span into its starting slot: the B-Topo token
# absorbs the I-Topo pieces that follow it.
for which_word, length in enumerate(name_length_list):
    if length > 1:
        start = name_list[which_word]
        original_words[start] = "".join(original_words[start:start + length])
print(original_words)
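# A merged entry now looks like e.g. "Hen##ne##pin" (the exact pieces depend
# on the tokenizer); the '#' markers are stripped in the next step.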

# Clean each recognized toponym: drop the WordPiece '#' markers, then
# re-insert a space before an uppercase letter that directly follows another
# letter (e.g. "HennepinCounty" -> "Hennepin County").
all_words = []
for start in name_list:
    word = original_words[start].replace("#", "")
    j = 1
    while j < len(word):
        if word[j].isupper() and word[j - 1].isalpha():
            word = word[:j] + ' ' + word[j:]
            j += 1  # step over the inserted space
        j += 1
    all_words.append(word)
print(all_words)
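# For the Minneapolis sentence this yields something like
# ['Minneapolis', 'Minneapolis', 'Minnesota', 'Hennepin']
# (the exact strings depend on how the tokenizer splits words).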

# The "O" labels are simply skipped: only tokens inside B-Topo/I-Topo spans
# make it into all_words.

# Column schema of the GeoNames dump (kept for reference only; it is not
# passed to read_csv because some numeric columns may contain missing values)
dtypes_dict = {
    0: int,  # geonameid
    1: str,  # name
    2: str,  # asciiname
    3: str,  # alternatenames
    4: float,  # latitude
    5: float,  # longitude
    6: str,  # feature class
    7: str,  # feature code
    8: str,  # country code
    9: str,  # cc2
    10: str,  # admin1 code
    11: str,  # admin2 code
    12: str,  # admin3 code
    13: str,  # admin4 code
    14: int,  # population
    15: int,  # elevation
    16: int,  # dem (digital elevation model)
    17: str,  # timezone
    18: str  # modification date yyyy-MM-dd
}
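
# cities5000.txt is the GeoNames extract of cities with a population of
# roughly 5,000 or more (see https://download.geonames.org/export/dump/).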

# Load the Geonames dataset into a Pandas DataFrame
geonames_df = pd.read_csv('cities5000.txt', sep='\t', header=None,
                          names=['geonameid', 'name', 'asciiname', 'alternatenames',
                                 'latitude', 'longitude', 'feature class', 'feature code',
                                 'country code', 'cc2', 'admin1 code', 'admin2 code',
                                 'admin3 code', 'admin4 code', 'population', 'elevation',
                                 'dem', 'timezone', 'modification date'])

# print(geonames_df)

# Number of rows (places) in the GeoNames table
total_places = len(geonames_df)
print(total_places)

# Recognized toponyms to look up in GeoNames
string_array_to_compare = all_words

# Match rows whose 'name' or 'asciiname' equals a recognized toponym, or
# whose 'alternatenames' field contains one as a substring
filter_condition = (geonames_df['name'].isin(string_array_to_compare) |
                    geonames_df['asciiname'].isin(string_array_to_compare) |
                    geonames_df['alternatenames'].apply(
                        lambda x: any(substring in x for substring in string_array_to_compare)
                        if isinstance(x, str) else False))
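# Note: the alternatenames check is a plain substring match, so a short
# toponym can match inside longer, unrelated names. Splitting the
# comma-separated field and comparing exact entries would be stricter.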


# Apply the filter to the DataFrame
filtered_df = geonames_df[filter_condition]

# Print the filtered DataFrame
print(filtered_df)

# Dump the alternate names of the matched places as CSV text
print(filtered_df['alternatenames'].to_csv(index=False))
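
# Possible next step: geocode each recognized toponym by pulling coordinates
# from the matched rows.
print(filtered_df[['name', 'country code', 'latitude', 'longitude']])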