Commit
·
9c6de98
1
Parent(s):
4bb7d18
Create README.md
Browse files
README.md
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# REBEL-ru
|
2 |
+
Based on the Russian part of Wikipedia (scraped with CROCODILE).
|
3 |
+
The model was trained for 3 epochs on top of the Russian ruT5-base model.
|
4 |
+
|
5 |
+
|
6 |
+
# How to use
|
7 |
+
The usage code is the same as for REBEL:
|
8 |
+
|
9 |
+
```
|
10 |
+
|
11 |
+
# `pipeline` comes from the Hugging Face Transformers library; without this
# import the snippet fails with a NameError.
from transformers import pipeline

# Example input sentence (Russian) to extract relations from.
text = '''За последние 9 месяцев инвесторы в азиатские долларовые долговые обязательства потеряли 155 миллиардов долларов, пострадав от слабости Китая в дополнение к глобальной распродаже фиксированного дохода, наблюдаемой во всем мире по мере роста процентных ставок. '''

# The same Hub repository id is used for both the model and the tokenizer.
model_path = r"memyprokotow/rut5-REBEL-base"
triplet_extractor = pipeline(
    'text2text-generation',
    model=model_path,
    tokenizer=model_path,
    # device=0
)

# We need to use the tokenizer manually since we need special tokens.
# The pipeline is asked for raw token ids (return_tensors=True,
# return_text=False) so that the decoding below keeps the marker tokens.
extracted_text = triplet_extractor.tokenizer.batch_decode(
    [
        triplet_extractor(
            text,
            return_tensors=True,
            return_text=False,
            max_length=500,
        )[0]["generated_token_ids"]
    ]
)

print(extracted_text[0])
|
23 |
+
# Function to parse the generated text and extract the triplets
|
24 |
+
def extract_triplets(text):
    """Parse linearized model output into relation triplets.

    The expected layout is ``<triplet> head <subj> tail <obj> relation``.
    Additional ``<subj> tail <obj> relation`` groups that follow reuse the
    most recent head until the next ``<triplet>`` marker.

    Args:
        text: Decoded model output. The ``<s>``, ``<pad>`` and ``</s>``
            tokens are removed before parsing.

    Returns:
        A list of dicts with the keys ``'head'``, ``'type'`` and ``'tail'``,
        in the order the triplets appear in ``text``. Empty if no complete
        triplet is found.
    """
    triplets = []
    subject, relation, object_ = '', '', ''
    # 'x' means no marker has been seen yet; words met in this state are dropped.
    current = 'x'

    cleaned = text.strip()
    for special in ("<s>", "<pad>", "</s>"):
        cleaned = cleaned.replace(special, "")
    # Pad the markers with spaces so they become separate tokens even when the
    # decoder glues them to a neighbouring word (e.g. "<triplet>Foo").
    for marker in ("<triplet>", "<subj>", "<obj>"):
        cleaned = cleaned.replace(marker, " " + marker + " ")

    for token in cleaned.split():
        if token == "<triplet>":
            current = 't'
            # A non-empty relation means the previous triplet is complete.
            if relation != '':
                triplets.append({'head': subject.strip(), 'type': relation.strip(), 'tail': object_.strip()})
                relation = ''
            subject = ''
        elif token == "<subj>":
            current = 's'
            # Same head, new tail: emit the finished triplet, keep the subject.
            if relation != '':
                triplets.append({'head': subject.strip(), 'type': relation.strip(), 'tail': object_.strip()})
            object_ = ''
        elif token == "<obj>":
            current = 'o'
            relation = ''
        else:
            # Ordinary word: append it to the field selected by the last marker.
            # Words after <triplet> form the head, after <subj> the tail,
            # and after <obj> the relation type.
            if current == 't':
                subject += ' ' + token
            elif current == 's':
                object_ += ' ' + token
            elif current == 'o':
                relation += ' ' + token

    # Emit the trailing triplet, if it is complete.
    if subject != '' and relation != '' and object_ != '':
        triplets.append({'head': subject.strip(), 'type': relation.strip(), 'tail': object_.strip()})
    return triplets
|
54 |
+
# Turn the decoded output of the first (and only) input into a list of
# head/type/tail dictionaries and print it.
extracted_triplets = extract_triplets(extracted_text[0])
print(extracted_triplets)
|
56 |
+
```
|