memyprokotow committed on
Commit
9c6de98
·
1 Parent(s): 4bb7d18

Create README.md

Browse files
Files changed (1) hide show
  1. README.md +56 -0
README.md ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # REBEL-ru
2
+ Based on the Russian part of Wikipedia (scraped with CROCODILE).
3
+ The model was trained for 3 epochs on top of the Russian ruT5-base model.
4
+
5
+
6
+ # How to use
7
+ Use the same code as for the original REBEL model:
8
+
9
+ ```
10
+
11
# The snippet calls `pipeline`, so it must be imported for the example to run.
from transformers import pipeline

# Example input: a Russian news sentence to extract relations from.
text = '''За последние 9 месяцев инвесторы в азиатские долларовые долговые обязательства потеряли 155 миллиардов долларов, пострадав от слабости Китая в дополнение к глобальной распродаже фиксированного дохода, наблюдаемой во всем мире по мере роста процентных ставок. '''

# The same Hub repository provides both the model weights and the tokenizer.
model_path = r"memyprokotow/rut5-REBEL-base"
triplet_extractor = pipeline(
    'text2text-generation',
    model=model_path,
    tokenizer=model_path,
    # device=0,  # NOTE(review): presumably selects GPU 0 when uncommented - confirm
)

# We need to use the tokenizer manually since we need special tokens:
# request raw token ids from the pipeline instead of already-decoded text,
# then decode them ourselves with the pipeline's tokenizer.
generated = triplet_extractor(
    text,
    return_tensors=True,
    return_text=False,
    max_length=500,
)
extracted_text = triplet_extractor.tokenizer.batch_decode(
    [generated[0]["generated_token_ids"]]
)

print(extracted_text[0])
23
+ # Function to parse the generated text and extract the triplets
24
def extract_triplets(text):
    """Parse linearized model output into a list of triplet dictionaries.

    The text is scanned token by token (whitespace-separated) after the
    ``<s>``, ``<pad>`` and ``</s>`` markers have been removed. Three marker
    tokens switch which field the following words are appended to:

    * ``<triplet>`` - following words build the head (``subject``);
    * ``<subj>``    - following words build the tail (``object_``);
    * ``<obj>``     - following words build the relation type.

    A triplet is emitted whenever a new ``<triplet>`` or ``<subj>`` marker
    is reached while a relation has been collected, and once more at the end
    of the text if head, relation and tail are all non-empty. Words that
    appear before the first marker are ignored.

    Args:
        text: Decoded model output containing the marker tokens.

    Returns:
        A list of dicts with the keys ``'head'``, ``'type'`` and ``'tail'``,
        in the order the triplets were completed.
    """
    triplets = []
    subject, relation, object_ = '', '', ''
    text = text.strip()
    # 'x' means "no marker seen yet": plain words are dropped in this state.
    current = 'x'
    for token in text.replace("<s>", "").replace("<pad>", "").replace("</s>", "").split():
        if token == "<triplet>":
            current = 't'
            # A pending relation means the previous triplet is complete.
            if relation != '':
                triplets.append({'head': subject.strip(), 'type': relation.strip(), 'tail': object_.strip()})
                relation = ''
            subject = ''
        elif token == "<subj>":
            current = 's'
            # Same head, new tail: flush the triplet collected so far.
            # The relation is not cleared here; the next "<obj>" resets it.
            if relation != '':
                triplets.append({'head': subject.strip(), 'type': relation.strip(), 'tail': object_.strip()})
            object_ = ''
        elif token == "<obj>":
            current = 'o'
            relation = ''
        else:
            # Ordinary word: append it to whichever field is being collected.
            if current == 't':
                subject += ' ' + token
            elif current == 's':
                object_ += ' ' + token
            elif current == 'o':
                relation += ' ' + token
    # Emit the final triplet only if every field was filled in.
    if subject != '' and relation != '' and object_ != '':
        triplets.append({'head': subject.strip(), 'type': relation.strip(), 'tail': object_.strip()})
    return triplets
54
# Turn the first decoded sequence into triplet dicts and show the result.
extracted_triplets = extract_triplets(extracted_text[0])
print(extracted_triplets)
56
+ ```