KoichiYasuoka committed
Commit 1a1a985
Parent(s): 03cac51
exclude pytextspan
README.md CHANGED
@@ -28,4 +28,4 @@ nlp=pipeline("universal-dependencies","KoichiYasuoka/deberta-base-japanese-juman
 print(nlp("全学年にわたって小学校の国語の教科書に挿し絵が用いられている"))
 ```
 
-[fugashi](https://pypi.org/project/fugashi)
+[fugashi](https://pypi.org/project/fugashi) is required.
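The added sentence makes the dependency explicit: `transformers`' `MecabTokenizer`, which the pre-tokenizer below subclasses, needs the fugashi MeCab wrapper at load time. A minimal sketch of a guard a caller might add before building the pipeline (the error message is illustrative, not from the repo):

```python
# Minimal sketch: fail fast if the fugashi dependency named in the README is missing.
try:
    import fugashi  # MeCab wrapper; transformers' MecabTokenizer requires it
except ImportError as e:
    raise SystemExit("fugashi is required: pip install fugashi") from e
```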
ud.py CHANGED
@@ -68,10 +68,17 @@ class UniversalDependenciesPipeline(TokenClassificationPipeline):
 
 class MecabPreTokenizer(MecabTokenizer):
   def mecab_split(self,i,normalized_string):
-    import textspan
     t=str(normalized_string)
-
-
+    z=[]
+    e=0
+    for c in self.tokenize(t):
+      s=t.find(c,e)
+      if s<0:
+        z.append((0,0))
+      else:
+        e=s+len(c)
+        z.append((s,e))
+    return [normalized_string[s:e] for s,e in z]
   def pre_tokenize(self,pretok):
     pretok.split(self.mecab_split)
 
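The added loop replaces pytextspan's span lookup with a forward-only `str.find` scan: for each MeCab token it searches from the end of the previous match, records the character span `(s, e)`, and falls back to an empty `(0, 0)` span if the token cannot be located (e.g. because normalization changed it). A minimal standalone sketch of the same logic, where a whitespace tokenizer stands in for MeCab and `find_spans` is a hypothetical name, not from ud.py:

```python
# Standalone sketch of the span-recovery logic added to mecab_split above.
def find_spans(tokens, text):
    spans = []
    e = 0  # search cursor: never look left of the previous match
    for c in tokens:
        s = text.find(c, e)
        if s < 0:
            spans.append((0, 0))  # token not found: empty placeholder span, as in ud.py
        else:
            e = s + len(c)
            spans.append((s, e))
    return spans

text = "a bb ccc"
print(find_spans(text.split(), text))  # [(0, 1), (2, 4), (5, 8)]
```

Advancing the cursor `e` past each match keeps repeated tokens from mapping to the same occurrence, which is the property the pytextspan call previously provided.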