pizzagatakasugi committed on
Commit
565028a
·
1 Parent(s): 26ed1ea

Create tools.py

Browse files
Files changed (1) hide show
  1. tools.py +161 -0
tools.py ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
# Both tables list the 81 board squares in the same file-major order
# (file 1 with ranks 1-9 first, then file 2, ...), so the entry at index i
# of one table names the same square as index i of the other.

# Square labels written as an ASCII file digit followed by a kanji rank,
# e.g. '7六'.
KIFU_TO_SQUARE_NAMES = [
    file_digit + rank_kanji
    for file_digit in "123456789"
    for rank_kanji in "一二三四五六七八九"
]

# Square labels written as two ASCII digits (file, then rank), e.g. '76'.
KIFU_FROM_SQUARE_NAMES = [
    file_digit + rank_digit
    for file_digit in "123456789"
    for rank_digit in "123456789"
]
25
+
26
def nomalize_precedence_name(df):
    """Strip whitespace, dan ranks and title names from first-player names.

    Rewrites the ``precedence_name`` column of *df* in place and returns the
    same DataFrame.

    For every name, in this order:
      1. all whitespace is removed (ASCII space, ideographic space U+3000
         and any other Unicode whitespace);
      2. if the name ends with "段", its last two characters are dropped
         (e.g. a trailing "九段");
      3. each title / honorific listed below is removed wherever it occurs.

    Args:
        df: DataFrame with a string column ``precedence_name``.

    Returns:
        The same DataFrame object, with the column replaced.
    """
    # Order matters: the "N-th generation Meijin" honorifics must be removed
    # before the bare "名人", otherwise a stray "十七世" would be left behind.
    removable = (
        "十七世名人", "十八世名人", "十九世名人",
        "王将", "王座", "名人", "竜王", "棋聖", "叡王", "王位", "棋王",
        "・", "二冠", "三冠",
    )

    def _clean(name):
        name = "".join(name.split())
        if name.endswith("段"):
            name = name[:-2]
        for token in removable:
            name = name.replace(token, "")
        return name

    # Assign the whole column at once. The previous per-cell
    # ``df[col].iloc[i] = value`` is chained assignment, which pandas warns
    # about and which does not modify ``df`` when Copy-on-Write is enabled.
    df["precedence_name"] = [_clean(name) for name in df["precedence_name"]]
    return df
36
+
37
def nomalize_kif(df):
    """Collapse each game's move list into one cleaned string.

    Every cell of the ``kif`` column is expected to hold the textual form of
    a list of move strings. For each move string:
      * leading ASCII digits (the move number) are removed;
      * everything from the first "(" onwards is removed (the original
        comment says this drops the move count and the time spent);
      * ideographic spaces (U+3000) are removed.
    The cleaned moves are concatenated without a separator and written back
    to the ``kif`` column.

    Args:
        df: DataFrame with a ``kif`` column of stringified lists of strings.

    Returns:
        The same DataFrame object, with ``kif`` replaced by the joined text.
    """
    cleaned = []
    for raw in df["kif"]:
        # NOTE(security): eval() executes whatever expression the cell
        # contains. Only use this on trusted data; if the cells are plain
        # list literals, ast.literal_eval is the safe replacement.
        moves = eval(raw)
        cleaned.append(
            "".join(
                # lstrip() also copes with empty / all-digit entries, which
                # previously raised IndexError when indexing move[0].
                move.lstrip("0123456789").split("(", 1)[0].replace("\u3000", "")
                for move in moves
            )
        )
    # Whole-column assignment instead of chained ``df[col].iloc[i] = ...``,
    # which is not guaranteed to write through to ``df``.
    df["kif"] = cleaned
    return df
60
+
61
def nomalize_comment(df):
    """Drop sentences containing any filter keyword from the ``output`` column.

    Each cell is split on the Japanese full stop "。". A sentence is dropped
    when it is empty or contains any keyword in the list below; the
    remaining sentences are re-joined, each followed by "。" (so a trailing
    fragment without a full stop gains one).

    Args:
        df: DataFrame with a string column ``output``.

    Returns:
        The same DataFrame object, with ``output`` replaced.
    """
    drop_keywords = (
        "期", "出身", "優勝", "受賞", "回", "記録", "棋士番号", "勝", "敗", "名人",
        "時", "分", "成績", "棋戦", "段", "本日", "立会", "ABEMA", "第", "本局",
        "対局", "永世",
    )

    filtered = []
    for text in df["output"]:
        kept = [
            sentence
            for sentence in text.split("。")
            if sentence and not any(word in sentence for word in drop_keywords)
        ]
        filtered.append("".join(sentence + "。" for sentence in kept))

    # Whole-column assignment instead of chained ``df[col].iloc[i] = ...``,
    # which is not guaranteed to write through to ``df``.
    df["output"] = filtered
    return df
82
+
83
def accuracy_bestlist(df):
    """Print and return how often the played move matches a candidate move.

    For every row, the ``bestlist``, ``best2list`` and ``kif`` cells are
    evaluated into Python sequences. For move index ``x`` (starting at 1),
    a hit is counted when ``bestlist[x-1][0]`` or ``best2list[x-1][0]``
    occurs inside ``kif[x]``. The per-row accuracy is ``hits / len(kif)``
    and is printed; the mean over all rows is printed at the end.

    Args:
        df: DataFrame with stringified-sequence columns ``bestlist``,
            ``best2list`` and ``kif``.

    Returns:
        The mean per-row accuracy as a float (0.0 for an empty DataFrame).
        Earlier versions only printed this value and returned None.
    """
    total = 0.0
    games = 0
    columns = zip(df["bestlist"], df["best2list"], df["kif"])
    for z, (best_raw, best2_raw, kif_raw) in enumerate(columns):
        # NOTE(security): eval() executes whatever expression the cell
        # contains. Only use this on trusted data; if the cells are plain
        # literals, ast.literal_eval is the safe replacement.
        blist = eval(best_raw)
        b2list = eval(best2_raw)
        te = eval(kif_raw)

        hits = 0
        for x in range(1, len(te)):
            try:
                if blist[x - 1][0] in te[x] or b2list[x - 1][0] in te[x]:
                    hits += 1
            except (IndexError, KeyError, TypeError):
                # Candidate lists may be shorter than the game or contain
                # empty / non-string entries; such moves count as misses.
                continue

        if hits == 0:
            print("accuracy = 0", z)
        # An empty move list used to raise ZeroDivisionError.
        accuracy = hits / len(te) if len(te) else 0.0
        print("z = ", z, " accuracy = ", accuracy)
        total += accuracy
        games += 1

    # An empty DataFrame used to raise ZeroDivisionError here.
    mean = total / games if games else 0.0
    print("mean_acuuracy", mean)
    return mean
106
+
107
def nomalize_sfen(s):
    """Extract the list of moves from the lines of a game record.

    The first two entries of *s* are skipped, as are entries of 30 or more
    characters (presumably headers / comments -- confirm against the input
    format). For every remaining line the second whitespace-separated token
    is taken as the move:

      * if it is one of the game-ending tokens ("投了", "千日手", "持将棋",
        "反則勝ち") it is appended unchanged and processing stops;
      * otherwise, if its first two characters are a two-digit square found
        in ``KIFU_FROM_SQUARE_NAMES``, they are replaced by the matching
        entry of ``KIFU_TO_SQUARE_NAMES``;
      * promoted-piece names are then rewritten (竜→龍, 成銀→全, 成桂→圭,
        成香→杏) and the move is appended.

    Args:
        s: Sequence of text lines.

    Returns:
        List of move strings in order of appearance.
    """
    terminal_tokens = ("投了", "千日手", "持将棋", "反則勝ち")
    piece_aliases = (("竜", "龍"), ("成銀", "全"), ("成桂", "圭"), ("成香", "杏"))
    # Dict lookup replaces a linear scan of the 81-entry table for each move.
    square_names = dict(zip(KIFU_FROM_SQUARE_NAMES, KIFU_TO_SQUARE_NAMES))

    movelist = []
    for line in s[2:]:
        if len(line) >= 30:
            continue
        tokens = line.split()
        if len(tokens) < 2:
            # Blank or single-token lines carry no move; indexing tokens[1]
            # on them used to raise IndexError.
            continue
        move = tokens[1]
        if move in terminal_tokens:
            movelist.append(move)
            break  # nothing after the game-ending token is a move

        square = square_names.get(move[:2])
        # When the square is not in the table the move is kept as written.
        # Previously the value computed for the preceding move was appended
        # again (or NameError was raised on the first move).
        word = square + move[2:] if square is not None else move
        for old, new in piece_aliases:
            word = word.replace(old, new)
        movelist.append(word)
    return movelist
128
+
129
def make_triplets(df, column):
    """Build (anchor, positive, negative) triplets of ``kif`` texts.

    For every row (the anchor) one random *positive* row is drawn that has
    the same value in *column* but a different ``kif``, and one random
    *negative* row that has a different value in *column* and a different
    ``kif``. Rows for which no positive or no negative candidate exists are
    skipped (sampling from an empty selection used to raise ValueError).

    Args:
        df: DataFrame with a ``kif`` column and the grouping column.
        column: Name of the column whose value decides positive vs negative.

    Returns:
        List of tuples ``(anchor_kif, positive_kif, negative_kif,
        positive_value, negative_value)`` where the last two items are the
        *column* values of the sampled rows. Earlier versions built this
        list but never returned it.
    """
    triplets = []
    for pos in range(len(df)):
        anchor = df.iloc[pos]
        different_kif = df["kif"] != anchor["kif"]

        # Positive candidates: same label, different text than the anchor.
        positives = df.loc[(df[column] == anchor[column]) & different_kif]
        # Negative candidates: different label, different text.
        negatives = df.loc[(df[column] != anchor[column]) & different_kif]
        if positives.empty or negatives.empty:
            continue

        # Use the sampled rows directly rather than looking them up again by
        # index label, which picks the wrong row when labels are duplicated.
        positive = positives.sample(n=1).iloc[0]
        negative = negatives.sample(n=1).iloc[0]

        triplets.append(
            (
                anchor["kif"],
                positive["kif"],
                negative["kif"],
                positive[column],
                negative[column],
            )
        )
    return triplets
145
+
146
def add_symbol(df, column, *, reset_turn_per_row=False):
    """Prefix each move in *column* with the side-to-move symbol.

    Every cell is scanned element by element (character by character for a
    string). Whenever an element is an ASCII digit or "同", the current
    symbol is inserted before it and the symbol then toggles between "▲"
    and "△". All other elements are copied unchanged. The result replaces
    the cell.

    Args:
        df: DataFrame to modify in place.
        column: Name of the column holding the move text.
        reset_turn_per_row: When False (default, the historical behaviour)
            the side to move carries over from the end of one row to the
            start of the next, so a row may begin with "△". When True,
            every row starts again at "▲".
            NOTE(review): carrying over looks unintended if each row is an
            independent game -- confirm with callers before changing the
            default.

    Returns:
        The same DataFrame object, with *column* replaced.
    """
    marked = frozenset("0123456789同")
    turn = "▲"
    annotated = []
    for cell in df[column]:
        if reset_turn_per_row:
            turn = "▲"
        parts = []
        for ch in cell:
            if ch in marked:
                parts.append(turn + ch)
                turn = "△" if turn == "▲" else "▲"
            else:
                parts.append(ch)
        annotated.append("".join(parts))

    # Whole-column assignment instead of chained ``df[col].iloc[i] = ...``,
    # which is not guaranteed to write through to ``df``.
    df[column] = annotated
    return df