dongxiaoqun committed
Commit 62ca995 · Parent(s): 647224d
Update README.md

README.md CHANGED
@@ -2,6 +2,7 @@
 language: zh
 tags:
 - summarization
+inference: False
 ---
 
 The IDEA-CCNL/Randeng_Pegasus_238M_Summary_Chinese model (Chinese) has 238M parameters and was pretrained on 180G of Chinese data with the GSG task, which stochastically samples important sentences at a gap-sentence ratio of 25%. The pretraining task is the same as described in the paper PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization.
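
To make the GSG description above concrete, here is a small, self-contained sketch of gap-sentence masking. It is illustrative only and not the repository's pretraining code: `make_gsg_example` and `MASK_TOKEN` are made-up names, sentences are assumed to be pre-split, and the paper's ROUGE-based importance scoring is replaced by a random placeholder.

```python
import random

# Toy sketch of gap-sentence generation (GSG): replace ~25% of a document's
# sentences with <mask_1> and train the model to generate the masked ones.
# NOT Fengshenbang-LM's pretraining code; PEGASUS picks "important" sentences
# by ROUGE against the rest of the document, which random sampling stands in
# for here.
MASK_TOKEN = "<mask_1>"

def make_gsg_example(sentences, gap_ratio=0.25, seed=0):
    """Return a (source, target) pretraining pair for a list of sentences."""
    rng = random.Random(seed)
    n_gaps = max(1, int(len(sentences) * gap_ratio))
    gap_idx = set(rng.sample(range(len(sentences)), n_gaps))
    source = "".join(MASK_TOKEN if i in gap_idx else s
                     for i, s in enumerate(sentences))
    target = "".join(s for i, s in enumerate(sentences) if i in gap_idx)
    return source, target

sentences = ["今天天气很好。", "我们去公园散步。", "公园里人很多。", "大家都玩得很开心。"]
src, tgt = make_gsg_example(sentences)
print(src)  # the document with one of its four sentences replaced by <mask_1>
print(tgt)  # the masked sentence the model must reconstruct
```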
@@ -14,9 +15,6 @@ After pre-training, we use 8 summary datasets which we collect on the internet t
 Task: Summarization
 ## Usage
 ```python
-from typing import List, Optional
-import jieba_fast as jieba
-jieba.initialize()
 from transformers import PegasusForConditionalGeneration, BertTokenizer
 # Need to download tokenizers_pegasus.py and the other Python scripts from the Fengshenbang-LM GitHub repo in advance,
 # or you can download tokenizers_pegasus.py and data_utils.py from https://huggingface.co/IDEA-CCNL/Randeng_Pegasus_523M/tree/main
@@ -25,42 +23,7 @@ from transformers import PegasusForConditionalGeneration, BertTokenizer
 # 2. cd Fengshenbang-LM/fengshen/examples/pegasus/
 # and then you will see the tokenizers_pegasus.py and data_utils.py which are needed by the pegasus model
 
-
-# from tokenizers_pegasus import PegasusTokenizer
-
-class PegasusTokenizer(BertTokenizer):
-    model_input_names = ["input_ids", "attention_mask"]
-    def __init__(self, **kwargs):
-        super().__init__(pre_tokenizer=lambda x: jieba.cut(x, HMM=False), **kwargs)
-        self.add_special_tokens({'additional_special_tokens': ["<mask_1>"]})
-
-    def build_inputs_with_special_tokens(
-            self,
-            token_ids_0: List[int],
-            token_ids_1: Optional[List[int]] = None) -> List[int]:
-
-        if token_ids_1 is None:
-            return token_ids_0 + [self.eos_token_id]
-        return token_ids_0 + token_ids_1 + [self.eos_token_id]
-
-    def _special_token_mask(self, seq):
-        all_special_ids = set(
-            self.all_special_ids)  # call it once instead of inside list comp
-        # all_special_ids.remove(self.unk_token_id)  # <unk> is only sometimes special
-        return [1 if x in all_special_ids else 0 for x in seq]
-
-    def get_special_tokens_mask(
-            self,
-            token_ids_0: List[int],
-            token_ids_1: Optional[List[int]] = None,
-            already_has_special_tokens: bool = False) -> List[int]:
-        if already_has_special_tokens:
-            return self._special_token_mask(token_ids_0)
-        elif token_ids_1 is None:
-            return self._special_token_mask(token_ids_0) + [self.eos_token_id]
-        else:
-            return self._special_token_mask(token_ids_0 +
-                                            token_ids_1) + [self.eos_token_id]
+from tokenizers_pegasus import PegasusTokenizer
 
 model = PegasusForConditionalGeneration.from_pretrained("IDEA-CCNL/Randeng-Pegasus-238M-Summary-Chinese")
 tokenizer = PegasusTokenizer.from_pretrained("IDEA-CCNL/Randeng-Pegasus-238M-Summary-Chinese")
|