zekun-li jinwei12 committed on
Commit
2ead40b
·
1 Parent(s): 9dce8d7

Upload 2 files (#2)

Browse files

- Upload 2 files (04bf3e8a3105f8ae6015fddd446d207b7d0e5591)


Co-authored-by: Jinwei <[email protected]>

Files changed (3) hide show
  1. .gitattributes +1 -0
  2. app.py +579 -0
  3. geohash.csv +3 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ geohash.csv filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,579 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import AutoTokenizer,AutoModelForTokenClassification
3
+ from transformers import GeoLMModel
4
+ import requests
5
+ import numpy as np
6
+ import pandas as pd
7
+ import scipy.spatial as sp
8
+ import streamlit as st
9
+ import folium
10
+ from streamlit.components.v1 import html
11
+
12
+
13
+ from haversine import haversine, Unit
14
+
15
+
16
+ dataset=None
17
+
18
+
19
+
20
def generate_human_readable(tokens,labels):
    """Merge tagged WordPiece tokens back into readable place names.

    NOTE: this module defines ``generate_human_readable`` a second time
    further down; the later definition is the one bound at import time.

    Args:
        tokens: Token strings (e.g. from ``convert_ids_to_tokens``).
        labels: One label id per token, parallel to ``tokens``. Label ``2``
            marks a token that continues the previous name.

    Returns:
        A list of place-name strings in order of appearance. ``[SEP]``
        tokens are ignored.
    """
    names = []
    for token, label in zip(tokens, labels):
        if token == '[SEP]':
            continue

        if token.startswith("##"):
            # Drop only the two-character WordPiece marker. ``strip('##')``
            # would also eat any '#' at either end of the piece itself.
            piece = token[2:]
            joiner = ""
        elif label == 2:
            piece = token
            joiner = " "
        else:
            names.append(token)
            continue

        if names:
            names[-1] = names[-1] + joiner + piece
        else:
            # A continuation with nothing before it: start a new name
            # instead of crashing the app on an unexpected prediction.
            names.append(piece)

    return names
37
+
38
def getSlice(tensor):
    """Group consecutive tagged positions of the first sequence into spans.

    Args:
        tensor: A 2-D batch of label ids; only ``tensor[0]`` is read.
            Values ``1`` and ``2`` extend the current span, ``0`` closes it.
            Any other value neither extends nor closes a span.

    Returns:
        A list of spans, each a list of consecutive integer positions.
    """
    spans = []
    current = []
    for index, value in enumerate(tensor[0]):
        if value == 1 or value == 2:
            current.append(index)
        elif value == 0 and current:
            spans.append(current)
            current = []

    # Flush a span that runs to the end of the sequence; without this the
    # last entity is silently lost when no ``0`` label follows it.
    if current:
        spans.append(current)

    return spans
50
+
51
def getIndex(input):
    """Return the token-position spans that the tagger marks as toponyms.

    Args:
        input: The sentence to analyse.

    Returns:
        The result of ``getSlice`` on the per-token argmax labels: a list
        of lists of token positions, one inner list per tagged span.
    """
    tokenizer, model = getModel1()

    # Tokenize input sentence
    token_ids = tokenizer.encode(input, return_tensors="pt")

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(token_ids)

    # Label ids per the original note:
    # "id2label": { "0": "O", "1": "B-Topo", "2": "I-Topo" }
    predicted_labels = torch.argmax(outputs.logits, dim=2)

    print(predicted_labels)
    print(predicted_labels.shape)

    return getSlice(predicted_labels)
92
+
93
def cutSlices(tensor, slicesList):
    """Pool one embedding per span out of a sequence of hidden states.

    Args:
        tensor: Hidden states indexed as ``tensor[0][position]``; the last
            dimension is the embedding width.
        slicesList: List of spans, each a list of token positions. For every
            span the rows from its first to its last position (inclusive,
            as one contiguous range) are averaged.

    Returns:
        A ``(1, len(slicesList), width)`` tensor whose row ``i`` is the mean
        embedding of span ``i``. An empty span leaves its row at zero so the
        rows stay aligned with ``slicesList``.
    """
    # Take the width from the input instead of assuming 768.
    hidden_size = tensor.shape[-1]
    locationTensor = torch.zeros(1, len(slicesList), hidden_size)

    for row, span in enumerate(slicesList):
        if not span:
            continue
        window = tensor[0][span[0]:span[-1] + 1]
        # The mean of a single row is that row, so one path covers both the
        # single-token and the multi-token case.
        locationTensor[0][row] = window.mean(dim=0)

    return locationTensor
117
+
118
+
119
+
120
+
121
+
122
+
123
def MLearningFormInput(input):
    """Embed a sentence and pool one vector per detected toponym span.

    The sentence is encoded with the ``getModel2`` tokenizer/model using
    all-zero spatial position lists, and the spans come from ``getIndex``.
    NOTE(review): the span positions are computed with the ``getModel1``
    tokenizer; this presumes both tokenizers split text identically --
    confirm.

    Args:
        input: The sentence to embed.

    Returns:
        The tensor produced by ``cutSlices`` for the model's
        ``last_hidden_state`` and the detected spans.
    """
    tokenizer, model = getModel2()
    token_ids = tokenizer.encode(input, return_tensors="pt")

    # Example token layout from the original author:
    # ['[CLS]', 'Minneapolis','[SEP]','Saint','Paul','[SEP]','Du','##lut','##h','[SEP]']
    zero_x = torch.zeros(token_ids.shape)
    zero_y = torch.zeros(token_ids.shape)
    outputs = model(
        token_ids,
        spatial_position_list_x=zero_x,
        spatial_position_list_y=zero_y,
    )

    span_positions = getIndex(input)

    # tensor -> tensor
    return cutSlices(outputs.last_hidden_state, span_positions)
151
+
152
+
153
+
154
+
155
+
156
def generate_human_readable(tokens,labels):
    """Merge tagged WordPiece tokens back into readable place names.

    NOTE: this is the second definition of ``generate_human_readable`` in
    this module; it replaces the earlier one at import time.

    Args:
        tokens: Token strings (e.g. from ``convert_ids_to_tokens``).
        labels: One label id per token, parallel to ``tokens``. Label ``2``
            marks a token that continues the previous name.

    Returns:
        A list of place-name strings in order of appearance. ``[SEP]``
        tokens are ignored.
    """
    names = []
    for token, label in zip(tokens, labels):
        if token == '[SEP]':
            continue

        if token.startswith("##"):
            # Drop only the two-character WordPiece marker. ``strip('##')``
            # would also eat any '#' at either end of the piece itself.
            piece = token[2:]
            joiner = ""
        elif label == 2:
            piece = token
            joiner = " "
        else:
            names.append(token)
            continue

        if names:
            names[-1] = names[-1] + joiner + piece
        else:
            # A continuation with nothing before it: start a new name
            # instead of crashing the app on an unexpected prediction.
            names.append(piece)

    return names
173
+
174
+
175
def getLocationName(input_sentence):
    """Extract the place names that the tagger finds in a sentence.

    Args:
        input_sentence: The sentence to analyse.

    Returns:
        The list of names built by ``generate_human_readable`` from every
        token whose predicted label id is non-zero.
    """
    tokenizer, model = getModel1()

    # Tokenize input sentence
    token_ids = tokenizer.encode(input_sentence, return_tensors="pt")

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(token_ids)

    # Label ids per the original note:
    # "id2label": { "0": "O", "1": "B-Topo", "2": "I-Topo" }
    predicted_labels = torch.argmax(outputs.logits, dim=2)

    # Positions of every token with a non-zero label, computed once and
    # reused for both the token ids and their labels.
    tagged_positions = torch.where(predicted_labels[0] != 0)[0]
    query_tokens = token_ids[0][tagged_positions]
    query_labels = predicted_labels[0][tagged_positions]

    return generate_human_readable(
        tokenizer.convert_ids_to_tokens(query_tokens), query_labels
    )
207
+
208
+
209
+
210
def search_geonames(toponym, df):
    """Look a toponym up on GeoNames and embed each returned candidate.

    Sends ``toponym`` to the GeoNames ``searchJSON`` endpoint (at most 10
    rows) and calls ``get50Neigbors`` with the module-level ``dataset`` for
    every candidate returned.

    Args:
        toponym: The place name to search for.
        df: DataFrame that is mutated in place: its ``'lat'`` and ``'lon'``
            columns are set to the coordinates of the embedded candidates,
            in the same order as the returned embeddings.

    Returns:
        A numpy array with the candidate embeddings concatenated along
        axis 1. When no candidate could be embedded the array has shape
        ``(1, 0, 768)`` (768 matches the row width this file allocates
        elsewhere) instead of ``torch.cat`` failing on an empty list.
    """
    global dataset

    # GeoNames API endpoint
    api_endpoint = "http://api.geonames.org/searchJSON"

    username = "zekun"

    print(toponym)

    params = {
        'q': toponym,
        'username': username,
        'maxRows': 10,
    }

    # Bound the wait so a stalled GeoNames request cannot hang the app.
    response = requests.get(api_endpoint, params=params, timeout=10)
    data = response.json()

    candidates = data['geonames'] if 'geonames' in data else []

    embeddings = []
    lat = []
    lon = []

    for place_info in candidates:
        latitude = float(place_info.get('lat', 0.0))
        longitude = float(place_info.get('lng', 0.0))
        geoname_id = place_info.get('geonameId', '')

        print(latitude)
        print(longitude)
        print(geoname_id)

        try:
            embedding = get50Neigbors(geoname_id, dataset, k=50)
        except IndexError:
            # The candidate has no row in the local dataset. Skip it so one
            # missing id does not abort the whole lookup.
            print(f"skipping GeoNames id {geoname_id!r}: not in local dataset")
            continue

        # Append coordinates only after the embedding succeeded, so the
        # three lists always stay index-aligned.
        lat.append(latitude)
        lon.append(longitude)
        embeddings.append(embedding)
        print(embedding)

    df['lat'] = lat
    df['lon'] = lon

    if not embeddings:
        return np.zeros((1, 0, 768), dtype=np.float32)

    return torch.cat(embeddings, dim=1).detach().numpy()
266
+
267
+
268
+
269
def get50Neigbors(locationID, dataset, k=50):
    """Embed a gazetteer entry in the context of its nearest neighbours.

    Builds the token sequence ``[CLS] target [SEP] n1 [SEP] n2 [SEP] ...``
    from the target's name and the names of nearby rows, runs it through
    the ``getModel1_0`` model with all-zero spatial position lists, and
    pools the hidden states over the target's own tokens.

    Args:
        locationID: Value to match against ``dataset['GeonameID']``.
        dataset: DataFrame with ``GeonameID``, ``Latitude``, ``Longitude``,
            ``Geohash`` and ``Name`` columns.
        k: Maximum number of names in the sequence, target included.

    Returns:
        The ``cutSlices`` result for the single target span.

    Raises:
        IndexError: If no row of ``dataset`` has the given ``GeonameID``.
    """
    print("neighbor part----------------------------------------------------------------")

    matches = dataset.loc[dataset['GeonameID'] == locationID]
    if matches.empty:
        raise IndexError(f"GeonameID {locationID!r} not found in dataset")
    input_row = matches.iloc[0]
    target_label = matches.index[0]

    lat, lon, geohash, name = (
        input_row['Latitude'],
        input_row['Longitude'],
        input_row['Geohash'],
        input_row['Name'],
    )

    # Candidate neighbours share the first 7 geohash characters with the
    # target. ``na=False`` keeps rows with a missing geohash from turning
    # the mask into a non-boolean indexer.
    in_cell = dataset['Geohash'].str.startswith(geohash[:7], na=False)
    filtered_dataset = dataset.loc[in_cell].copy()

    filtered_dataset['distance'] = filtered_dataset.apply(
        lambda row: haversine((lat, lon), (row['Latitude'], row['Longitude']), Unit.KILOMETERS),
        axis=1,
    )

    print("neighbor search end----------------------------------------------------------------")

    filtered_dataset = filtered_dataset.sort_values(by='distance')

    # Put the target first explicitly and keep it out of the neighbour list.
    # The pooled positions below start right after [CLS], and relying on the
    # sort alone breaks when another row ties at distance 0.
    others = filtered_dataset.drop(index=target_label, errors='ignore')
    neighbor_names = others.head(max(k - 1, 0))['Name'].tolist()

    tokenizer, model = getModel1_0()

    sep_token_id = tokenizer.convert_tokens_to_ids(tokenizer.sep_token)
    cls_token_id = tokenizer.convert_tokens_to_ids(tokenizer.cls_token)

    target_token = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(name))

    neighbor_token_list = [cls_token_id]
    neighbor_token_list.extend(target_token)
    neighbor_token_list.append(sep_token_id)

    for neighbor_name in neighbor_names:
        neighbor_token = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(neighbor_name))
        neighbor_token_list.extend(neighbor_token)
        neighbor_token_list.append(sep_token_id)

    # NOTE(review): presumes the config exposes ``max_position_embeddings``
    # as BERT-style configs do; when absent no truncation is applied.
    max_len = getattr(model.config, "max_position_embeddings", None)
    if max_len and len(neighbor_token_list) > max_len:
        neighbor_token_list = neighbor_token_list[:max_len]

    # Build the id tensor as integers directly instead of via float32.
    tokens = torch.tensor(neighbor_token_list, dtype=torch.long).unsqueeze(0)

    # input "new neighbor sentence" -> model -> output (inference only)
    with torch.no_grad():
        outputs = model(
            tokens,
            spatial_position_list_x=torch.zeros(tokens.shape),
            spatial_position_list_y=torch.zeros(tokens.shape),
        )

    # The target's tokens occupy positions 1..len(target_token), right
    # after [CLS].
    targetIndex = list(range(1, len(target_token) + 1))

    # get (1, len(target_token), width) -> (1, 1, width)
    res = cutSlices(outputs.last_hidden_state, [targetIndex])

    print("neighbor end----------------------------------------------------------------")

    return res
352
+
353
+
354
+
355
def cosine_similarity(target_feature, candidate_feature):
    """Return the cosine similarity of two embedding tensors as a float.

    Both inputs are flattened to 1-D before comparison, so any shape is
    accepted as long as the two hold the same number of elements.

    Args:
        target_feature: First embedding tensor.
        candidate_feature: Second embedding tensor.

    Returns:
        ``dot(a, b) / (|a| * |b|)`` as a Python float, or ``0.0`` when
        either vector has zero norm (the ratio would otherwise be NaN).
    """
    # ``reshape(-1)`` rather than ``squeeze()``: squeeze collapses a
    # single-element tensor to 0-D, which ``torch.dot`` rejects.
    target_vec = target_feature.reshape(-1)
    candidate_vec = candidate_feature.reshape(-1)

    denominator = torch.norm(target_vec) * torch.norm(candidate_vec)
    if denominator.item() == 0:
        return 0.0

    similarity = torch.dot(target_vec, candidate_vec) / denominator
    return similarity.item()
368
+
369
+
370
@st.cache_data
def getCSV():
    """Load ``geohash.csv`` from the working directory as a DataFrame.

    The result is cached through Streamlit's ``st.cache_data``.
    """
    return pd.read_csv('geohash.csv')
375
+
376
@st.cache_resource
def getModel1():
    """Load the toponym-recognition tokenizer and token classifier.

    Cached with ``st.cache_resource`` so one loaded instance is shared.
    ``st.cache_data`` would serialize the model and hand back a fresh copy
    on every call.

    Returns:
        A ``(tokenizer, model)`` tuple for
        ``zekun-li/geolm-base-toponym-recognition``.
    """
    # Model name from Hugging Face model hub
    model_name = "zekun-li/geolm-base-toponym-recognition"

    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForTokenClassification.from_pretrained(model_name)

    return tokenizer, model
387
+
388
@st.cache_resource
def getModel1_0():
    """Load the toponym-recognition checkpoint as a plain ``GeoLMModel``.

    Cached with ``st.cache_resource``: ``get50Neigbors`` calls this once per
    GeoNames candidate, and without caching every call reloads the
    checkpoint.

    Returns:
        A ``(tokenizer, model)`` tuple for
        ``zekun-li/geolm-base-toponym-recognition``.
    """
    # Model name from Hugging Face model hub
    model_name = "zekun-li/geolm-base-toponym-recognition"

    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = GeoLMModel.from_pretrained(model_name)

    return tokenizer, model
396
+
397
+
398
+
399
@st.cache_resource
def getModel2():
    """Load the base GeoLM tokenizer and encoder.

    Cached with ``st.cache_resource`` so Streamlit reruns reuse the loaded
    model instead of reloading it.

    Returns:
        A ``(tokenizer, model)`` tuple for ``zekun-li/geolm-base-cased``.
    """
    model_name = "zekun-li/geolm-base-cased"

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = GeoLMModel.from_pretrained(model_name)

    return tokenizer, model
408
+
409
+
410
def showing(df):
    """Render the candidate locations as circle markers on a folium map.

    Args:
        df: DataFrame with ``'lat'``, ``'lon'`` and ``'prob'`` columns, one
            row per candidate. Marker radius and red intensity both scale
            with ``prob ** 2``.

    The map is embedded in the Streamlit page. The HTML is rendered in
    memory rather than through a shared ``map.html`` in the working
    directory, which concurrent sessions would overwrite for each other.
    """
    if df.empty:
        # The mean of an empty column is NaN, which is not a usable centre.
        st.warning("No candidate locations to display.")
        return

    m = folium.Map(location=[df['lat'].mean(), df['lon'].mean()], zoom_start=5)

    size_scale = 100
    color_scale = 255
    for lat, lon, prob in zip(df['lat'], df['lon'], df['prob']):
        size = int(prob**2 * size_scale)
        # Clamp to one byte so the value always formats as two hex digits.
        color = min(max(int(prob**2 * color_scale), 0), 255)
        shade = f'#{color:02X}0000'

        folium.CircleMarker(
            location=[lat, lon],
            radius=size,
            color=shade,
            fill=True,
            fill_color=shade,
        ).add_to(m)

    map_html = m.get_root().render()

    st.components.v1.html(map_html, height=600)
436
+
437
+
438
def mapping(selected_place,locations, sentence_info):
    """Score the GeoNames candidates for one place and draw them on a map.

    Args:
        selected_place: The place name chosen by the user; must be an
            element of ``locations``.
        locations: The list of detected place names.
        sentence_info: Tensor indexed as ``[:, location_index, :]`` to get
            the in-sentence embedding of the selected place.

    Each candidate embedding from ``search_geonames`` is compared with the
    sentence embedding via ``cosine_similarity``; the similarities go
    through a logistic function and are stored in the ``'prob'`` column
    before ``showing`` is called.
    """
    location_index = locations.index(selected_place)
    print(location_index)

    df = pd.DataFrame()

    # get same name for "Beijing" in geonames
    candidate_embeddings = search_geonames(selected_place, df)

    sim_matrix = []
    print(sim_matrix)

    print("calculate similarities-----------------------------------")

    candidate_embeddings = torch.tensor(candidate_embeddings)
    # loop each "Beijing"
    candidate_count = candidate_embeddings.size(1)
    for candidate_index in range(candidate_count):
        target_vec = sentence_info[:, location_index, :]
        print(target_vec.shape)
        candidate_vec = candidate_embeddings[:, candidate_index, :]
        print(candidate_vec.shape)

        sim_matrix.append(cosine_similarity(target_vec, candidate_vec))

    # Logistic squashing of the raw similarities.
    prob_matrix = 1 / (1 + np.exp(-np.array(sim_matrix)))

    print("calculate similarities end ----------------------------------")

    df['prob'] = prob_matrix

    print(df)

    showing(df)
483
+
484
+
485
+
486
def show_on_map():
    """Draw the Streamlit page: sentence input, place picker and map.

    Reads a sentence from a text area, detects the place names in it, lets
    the user pick one, and hands it to ``mapping``. Nothing is computed
    while the text area is blank, so the models are not run on an empty
    string at page load.
    """
    sentence = st.text_area("Enter a sentence:", height=200)

    # The button only triggers a rerun; its return value is deliberately
    # not used, because every later widget interaction must also redraw.
    st.button("Submit")

    if not sentence.strip():
        return

    sentence_info = MLearningFormInput(sentence)

    print("sentence info: ")
    print(sentence_info)
    print(sentence_info.shape)

    # input: a sentence -> output : locations
    locations = getLocationName(sentence)

    selected_place = st.selectbox("Select a location:", locations)

    if selected_place is not None:
        mapping(selected_place, locations, sentence_info)
509
+
510
+
511
+
512
+
513
if __name__ == "__main__":
    # Bind the gazetteer to the module-level ``dataset`` that
    # search_geonames() reads through ``global dataset``, then draw the page.
    dataset = getCSV()
    show_on_map()
519
+
520
+
521
+ # # just for testing, hidden.............................................................
522
+
523
+ # #len: 80
524
+ # input= 'Minneapolis, officially the City of Minneapolis, is a city in the state of Minnesota and the county seat of Hennepin County. making it the largest city in Minnesota and the 46th-most-populous in the United States. Nicknamed the "City of Lakes", Minneapolis is abundant in water, with thirteen lakes, wetlands, the Mississippi River, creeks, and waterfalls.'
525
+
526
+
527
+ # 1. input: a sentence -> output: tensor (1,num_locations,768)
528
+ # sentence_info= MLearningFormInput(input)
529
+
530
+ # print("sentence info: ")
531
+ # print(sentence_info)
532
+ # print(sentence_info.shape)
533
+
534
+
535
+
536
+ # # input: a sentence -> output : locations
537
+ # locations=getLocationName(input)
538
+
539
+ # print(locations)
540
+
541
+ # j=0
542
+
543
+
544
+ # k=0
545
+
546
+ # for location in locations:
547
+
548
+ # if k==0:
549
+ # # input: locations -> output: search in geoname(get top 10 items) -> loop each item -> num_location x 10 x (1,1,768)
550
+ # same_name_embedding=search_geonames(location)
551
+
552
+ # sim_matrix=[]
553
+ # print(sim_matrix)
554
+
555
+
556
+
557
+
558
+
559
+ # same_name_embedding=torch.tensor(same_name_embedding)
560
+ # # loop each "Beijing"
561
+ # for i in range(same_name_embedding.size(1)):
562
+ # # print((sentence_info[:, j, :]).shape)
563
+ # # print((same_name_embedding[:, i, :]).shape)
564
+
565
+ # similarities = cosine_similarity(sentence_info[:, j, :], same_name_embedding[:, i, :])
566
+ # sim_matrix.append(similarities)
567
+
568
+
569
+
570
+ # j=j+1
571
+
572
+
573
+ # print("Cosine Similarity Matrix:")
574
+ # print(sim_matrix)
575
+
576
+ # k=1
577
+
578
+ # else:
579
+ # break
geohash.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5a20fbc0326c65920428a298f1674f3b2046f3bafc0c38f3bb417ab15774aa0b
3
+ size 677244066