"""Streamlit demo: find toponyms in a sentence with GeoLM and rank GeoNames candidates.

Pipeline, as implemented below:

1. A token-classification model labels each token (0 = O, 1 = B-Topo,
   2 = I-Topo) and contiguous non-O runs are treated as one toponym.
2. A GeoLM encoder embeds the sentence; each toponym gets the mean of the
   hidden states of its tokens.
3. For a selected toponym, GeoNames is queried for same-name candidates.
   Each candidate is embedded from a pseudo-sentence made of its own name
   followed by the names of its nearest neighbours in the local CSV.
4. Cosine similarity between the sentence embedding and every candidate
   embedding is squashed with a sigmoid and drawn on a folium map.
"""
import logging
import os
from html import escape as html_escape

import folium
import numpy as np
import pandas as pd
import requests
import scipy.spatial as sp
import streamlit as st
import torch
from haversine import haversine, Unit
from streamlit.components.v1 import html
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import GeoLMModel

logger = logging.getLogger(__name__)

GEONAMES_ENDPOINT = "http://api.geonames.org/searchJSON"
# The account name can be overridden through the environment; the default
# is the value that used to be hard-coded here.
GEONAMES_USERNAME = os.environ.get("GEONAMES_USERNAME", "zekun")
GEONAMES_MAX_ROWS = 10
GEONAMES_TIMEOUT_S = 10

TOPONYM_MODEL_NAME = "zekun-li/geolm-base-toponym-recognition"
ENCODER_MODEL_NAME = "zekun-li/geolm-base-cased"

# Hidden size used only to shape the empty result of search_geonames();
# everywhere else the size is read from the tensor at hand.
EMBEDDING_DIM = 768

# Gazetteer DataFrame, filled in by the ``__main__`` guard at the bottom.
dataset = None


@st.cache_data
def getCSV():
    """Load the gazetteer CSV (``geohash.csv``) as a DataFrame, cached by Streamlit."""
    return pd.read_csv('geohash.csv')


# Models are cached with ``cache_resource``: ``cache_data`` would serialise
# and copy the return value on every call, which is the wrong tool for large
# model objects.
@st.cache_resource
def getModel1():
    """Return ``(tokenizer, model)`` for toponym token classification."""
    tokenizer = AutoTokenizer.from_pretrained(TOPONYM_MODEL_NAME)
    model = AutoModelForTokenClassification.from_pretrained(TOPONYM_MODEL_NAME)
    return tokenizer, model


@st.cache_resource
def getModel1_0():
    """Return ``(tokenizer, model)`` with the toponym checkpoint loaded as a bare GeoLM encoder."""
    tokenizer = AutoTokenizer.from_pretrained(TOPONYM_MODEL_NAME)
    model = GeoLMModel.from_pretrained(TOPONYM_MODEL_NAME)
    return tokenizer, model


@st.cache_resource
def getModel2():
    """Return ``(tokenizer, model)`` for the base GeoLM encoder."""
    tokenizer = AutoTokenizer.from_pretrained(ENCODER_MODEL_NAME)
    model = GeoLMModel.from_pretrained(ENCODER_MODEL_NAME)
    return tokenizer, model


def generate_human_readable(tokens, labels):
    """Merge word-piece tokens into readable strings.

    Args:
        tokens: Token strings; a leading ``"##"`` marks a continuation piece.
        labels: One label per token; ``2`` (I-Topo) means "continues the
            previous toponym".

    Returns:
        A list of strings. Continuation pieces are glued to the previous
        entry, tokens labelled ``2`` are appended after a space, and
        anything else starts a new entry. ``'[SEP]'`` tokens are skipped.
    """
    ret = []
    for t, lab in zip(tokens, labels):
        if t == '[SEP]':
            continue
        is_piece = t.startswith("##")
        # Drop only the two-character prefix; str.strip('##') would also eat
        # every other leading or trailing '#'.
        text = t[2:] if is_piece else t
        if ret and is_piece:
            ret[-1] += text
        elif ret and lab == 2:
            ret[-1] += " " + text
        else:
            # Also reached when the very first token is a continuation:
            # start a new entry instead of failing an assertion.
            ret.append(text)
    return ret


def getSlice(tensor):
    """Group consecutive toponym positions of ``tensor[0]`` into index lists.

    Args:
        tensor: Indexable whose first element is a sequence of label ids
            (a ``(1, seq_len)`` tensor in this file).

    Returns:
        A list of lists; each inner list holds the consecutive positions
        whose label is 1 or 2. A run is closed by a 0 label or by the end
        of the sequence.
    """
    row = tensor[0]
    values = row.tolist() if hasattr(row, "tolist") else list(row)

    result = []
    curr = []
    for index, value in enumerate(values):
        if value in (1, 2):
            curr.append(index)
        elif value == 0 and curr:
            result.append(curr)
            curr = []
    if curr:
        # A toponym that runs up to the last position must not be dropped.
        result.append(curr)
    return result


def _predict_token_labels(text):
    """Run the toponym tagger once and return ``(tokenizer, tokens, labels)``.

    ``tokens`` and ``labels`` are ``(1, seq_len)`` tensors. Special tokens
    such as [CLS] and [SEP] are forced to label 0 so that they can never be
    part of a toponym span.
    """
    tokenizer, model = getModel1()
    tokens = tokenizer.encode(text, return_tensors="pt", truncation=True)
    with torch.no_grad():
        outputs = model(tokens)
    predicted_labels = torch.argmax(outputs.logits, dim=2)

    special_ids = set(tokenizer.all_special_ids)
    for position, token_id in enumerate(tokens[0].tolist()):
        if token_id in special_ids:
            predicted_labels[0, position] = 0

    logger.debug("predicted labels: %s", predicted_labels)
    return tokenizer, tokens, predicted_labels


def getIndex(input):
    """Return the token-index spans of every toponym found in ``input``.

    Args:
        input: The sentence to analyse.

    Returns:
        The result of :func:`getSlice` on the predicted labels: one list of
        consecutive token positions per toponym.
    """
    _, _, predicted_labels = _predict_token_labels(input)
    return getSlice(predicted_labels)


def cutSlices(tensor, slicesList):
    """Average the hidden states of each token span.

    Args:
        tensor: Tensor indexed as ``tensor[0][position]``; in this file the
            ``(1, seq_len, hidden)`` last hidden state of an encoder.
        slicesList: List of lists of consecutive token positions.

    Returns:
        A ``(1, len(slicesList), hidden)`` tensor with the same dtype and
        device as ``tensor``. Row ``i`` is the mean over positions
        ``slicesList[i][0] .. slicesList[i][-1]`` inclusive; a row whose
        span is empty stays all zeros.
    """
    locationTensor = tensor.new_zeros(1, len(slicesList), tensor.size(-1))
    for row, token_indices in enumerate(slicesList):
        if not token_indices:
            continue
        span = tensor[0, token_indices[0]:token_indices[-1] + 1]
        # For a one-token span the mean is that token's vector itself.
        locationTensor[0, row] = span.mean(dim=0)
    return locationTensor


def MLearningFormInput(input):
    """Embed every toponym of ``input`` with the base GeoLM encoder.

    Args:
        input: The sentence to analyse.

    Returns:
        A ``(1, num_toponyms, hidden)`` tensor, one row per span returned
        by :func:`getIndex`.
    """
    tokenizer, model = getModel2()
    tokens = tokenizer.encode(input, return_tensors="pt", truncation=True)
    with torch.no_grad():
        outputs = model(
            tokens,
            spatial_position_list_x=torch.zeros(tokens.shape),
            spatial_position_list_y=torch.zeros(tokens.shape),
        )
    # NOTE(review): the spans come from the tagger's tokenizer while the
    # hidden states come from the encoder's tokenizer; this assumes both
    # checkpoints tokenise identically -- confirm.
    slicesIndex = getIndex(input)
    return cutSlices(outputs.last_hidden_state, slicesIndex)


def getLocationName(input_sentence):
    """Return the readable name of every toponym found in ``input_sentence``.

    Exactly one name is produced per span of :func:`getIndex`, in the same
    order, so position ``i`` of the returned list corresponds to row ``i``
    of :func:`MLearningFormInput` for the same sentence.

    Args:
        input_sentence: The sentence to analyse.

    Returns:
        A list of strings.
    """
    tokenizer, tokens, predicted_labels = _predict_token_labels(input_sentence)
    token_ids = tokens[0].tolist()
    label_ids = predicted_labels[0].tolist()

    names = []
    for span in getSlice(predicted_labels):
        span_tokens = tokenizer.convert_ids_to_tokens([token_ids[i] for i in span])
        span_labels = [label_ids[i] for i in span]
        names.append(" ".join(generate_human_readable(span_tokens, span_labels)))
    return names


def search_geonames(toponym, df):
    """Query GeoNames for ``toponym`` and embed every candidate found locally.

    Args:
        toponym: Place name sent to the GeoNames search API.
        df: DataFrame filled in place with the columns ``lat``, ``lon``,
            ``name``, ``country``, ``fcodeName`` and ``population``, one
            row per embedded candidate.

    Returns:
        A NumPy array of shape ``(1, num_candidates, hidden)``.
        ``num_candidates`` is 0 when GeoNames returns nothing usable.

    Raises:
        requests.RequestException: If the HTTP request fails or returns an
            error status.
    """
    logger.debug("searching GeoNames for %r", toponym)
    params = {
        'q': toponym,
        'username': GEONAMES_USERNAME,
        'maxRows': GEONAMES_MAX_ROWS,
    }
    response = requests.get(GEONAMES_ENDPOINT, params=params, timeout=GEONAMES_TIMEOUT_S)
    response.raise_for_status()
    data = response.json()

    # Fall back to the cached loader when the module-level gazetteer was
    # never populated (for example when this module is imported).
    gazetteer = dataset if dataset is not None else getCSV()

    columns = {key: [] for key in ('lat', 'lon', 'name', 'country', 'fcodeName', 'population')}
    embeddings = []
    for place_info in data.get('geonames', []):
        geoname_id = place_info.get('geonameId', '')
        if not (gazetteer['GeonameID'] == geoname_id).any():
            # Skip the candidate so the rows of ``df`` stay aligned with
            # the embeddings that are returned.
            logger.warning("GeonameID %r is not in the local dataset; skipped", geoname_id)
            continue

        embeddings.append(get50Neigbors(geoname_id, gazetteer, k=50))
        columns['lat'].append(float(place_info.get('lat', 0.0)))
        columns['lon'].append(float(place_info.get('lng', 0.0)))
        columns['name'].append(place_info.get('name', ''))
        columns['country'].append(place_info.get('countryName', ''))
        columns['fcodeName'].append(place_info.get('fcodeName', ''))
        columns['population'].append(place_info.get('population', ''))

    for key, values in columns.items():
        df[key] = values

    if not embeddings:
        return np.empty((1, 0, EMBEDDING_DIM), dtype=np.float32)
    return torch.cat(embeddings, dim=1).detach().numpy()


def get50Neigbors(locationID, dataset, k=50):
    """Embed a gazetteer entry in the context of its nearest neighbours.

    The entry's name followed by the names of its nearest neighbours is
    joined into ``[CLS] name [SEP] neighbour [SEP] ...`` and encoded; the
    mean hidden state of the entry's own tokens is returned.

    Args:
        locationID: Value looked up in the ``GeonameID`` column.
        dataset: DataFrame with the columns ``GeonameID``, ``Latitude``,
            ``Longitude``, ``Geohash`` and ``Name``.
        k: Total number of names in the pseudo-sentence, the entry itself
            included.

    Returns:
        A ``(1, 1, hidden)`` tensor.

    Raises:
        IndexError: If ``locationID`` is not present in ``dataset``.
    """
    matches = dataset.loc[dataset['GeonameID'] == locationID]
    if matches.empty:
        raise IndexError(f"GeonameID {locationID!r} not found in dataset")
    input_row = matches.iloc[0]
    lat, lon = input_row['Latitude'], input_row['Longitude']
    geohash, name = input_row['Geohash'], input_row['Name']

    # Only rows sharing the first 7 geohash characters are considered.
    nearby = dataset.loc[dataset['Geohash'].str.startswith(geohash[:7])].copy()
    nearby['distance'] = nearby.apply(
        lambda row: haversine((lat, lon), (row['Latitude'], row['Longitude']), Unit.KILOMETERS),
        axis=1,
    )
    nearby = nearby.sort_values(by='distance', kind='stable')

    # The target is placed first explicitly: relying on it sorting first
    # breaks when another row sits at exactly the same coordinates.
    others = nearby.loc[nearby['GeonameID'] != locationID, 'Name']
    names = [name] + others.head(max(k - 1, 0)).tolist()

    tokenizer, model = getModel1_0()
    sep_token_id = tokenizer.convert_tokens_to_ids(tokenizer.sep_token)
    cls_token_id = tokenizer.convert_tokens_to_ids(tokenizer.cls_token)
    target_token = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(name))

    neighbor_token_list = [cls_token_id]
    for neighbor in names:
        neighbor_token_list.extend(tokenizer.convert_tokens_to_ids(tokenizer.tokenize(neighbor)))
        neighbor_token_list.append(sep_token_id)

    # Keep the sequence within the encoder's position limit when the
    # config exposes one; the target tokens are at the front so they survive.
    max_len = getattr(model.config, 'max_position_embeddings', None)
    if max_len and len(neighbor_token_list) > max_len:
        neighbor_token_list = neighbor_token_list[:max_len]

    tokens = torch.tensor([neighbor_token_list], dtype=torch.long)
    with torch.no_grad():
        outputs = model(
            tokens,
            spatial_position_list_x=torch.zeros(tokens.shape),
            spatial_position_list_y=torch.zeros(tokens.shape),
        )

    # Position 0 is [CLS]; the target's tokens follow immediately.
    targetIndex = list(range(1, len(target_token) + 1))
    return cutSlices(outputs.last_hidden_state, [targetIndex])


def cosine_similarity(target_feature, candidate_feature):
    """Return the cosine similarity of two vectors as a Python float.

    Both arguments are flattened first, so ``(1, hidden)`` inputs are
    accepted. A zero-norm input yields ``0.0`` rather than NaN.
    """
    target = target_feature.reshape(-1)
    candidate = candidate_feature.reshape(-1)
    return torch.nn.functional.cosine_similarity(target, candidate, dim=0).item()


def showing(df):
    """Draw every row of ``df`` as a circle on a folium map and embed it in the page.

    Args:
        df: DataFrame with the columns ``lat``, ``lon``, ``prob``,
            ``name``, ``country``, ``fcodeName`` and ``population``.
            Radius and red intensity both grow with ``prob ** 2``.

    The map is written to ``map.html`` in the working directory and read
    back before being handed to Streamlit.
    """
    m = folium.Map(location=[df['lat'].mean(), df['lon'].mean()], zoom_start=5)

    size_scale = 100
    color_scale = 255

    for row in df.itertuples(index=False):
        size = int(row.prob ** 2 * size_scale)
        color = int(row.prob ** 2 * color_scale)

        # The popup is rendered as HTML, so lines are separated with <br>
        # and the GeoNames-supplied values are escaped.
        popup_info = (
            f"Name:{html_escape(str(row.name))}<br>"
            f"Country: {html_escape(str(row.country))}<br>"
            f"fcodeName: {html_escape(str(row.fcodeName))}<br>"
            f"population:{html_escape(str(row.population))}"
        )

        folium.CircleMarker(
            location=[row.lat, row.lon],
            radius=size,
            color=f'#{color:02X}0000',
            fill=True,
            fill_color=f'#{color:02X}0000',
            popup=popup_info,
        ).add_to(m)

    m.save("map.html")
    with open("map.html", "r", encoding="utf-8") as f:
        map_html = f.read()
    st.components.v1.html(map_html, height=600)


def mapping(selected_place, locations, sentence_info):
    """Rank the GeoNames candidates for ``selected_place`` and plot them.

    Args:
        selected_place: The toponym chosen by the user; must be an element
            of ``locations``.
        locations: Toponym names, as returned by :func:`getLocationName`.
        sentence_info: ``(1, num_toponyms, hidden)`` tensor, as returned by
            :func:`MLearningFormInput` for the same sentence.
    """
    # list.index returns the first occurrence, so a toponym mentioned
    # several times is always scored with its first mention.
    location_index = locations.index(selected_place)
    if location_index >= sentence_info.size(1):
        st.warning(f"No embedding is available for '{selected_place}'.")
        return

    df = pd.DataFrame()
    try:
        same_name_embedding = search_geonames(selected_place, df)
    except requests.RequestException as err:
        st.error(f"GeoNames lookup failed: {err}")
        return

    same_name_embedding = torch.tensor(same_name_embedding)
    num_candidates = same_name_embedding.size(1)
    if num_candidates == 0:
        st.warning(f"No GeoNames candidates found for '{selected_place}'.")
        return

    target = sentence_info[:, location_index, :]
    sim_matrix = [
        cosine_similarity(target, same_name_embedding[:, i, :])
        for i in range(num_candidates)
    ]

    # Sigmoid of the cosine similarity, used as the per-candidate score.
    df['prob'] = 1.0 / (1.0 + np.exp(-np.array(sim_matrix)))
    logger.debug("candidates:\n%s", df)

    showing(df)


def show_on_map():
    """Render the page: sentence input, toponym selector and candidate map."""
    sentence = st.text_area("Enter a sentence:", height=200)
    # The button's return value is unused; clicking it only triggers a rerun.
    st.button("Submit")

    if sentence.strip():
        sentence_info = MLearningFormInput(sentence)
        locations = getLocationName(sentence)
        logger.debug("sentence info shape: %s", tuple(sentence_info.shape))
    else:
        # Do not run the models on empty input.
        sentence_info = None
        locations = []

    selected_place = st.selectbox("Select a location:", locations)
    if selected_place is not None:
        mapping(selected_place, locations, sentence_info)


if __name__ == "__main__":
    dataset = getCSV()
    show_on_map()