REBEL-ru

Based on the Russian part of Wikipedia (scraped with CROCODILE). The model was trained for 3 epochs on top of the Russian ruT5-base model.

How to use

The usage code is the same as for REBEL-large (https://huggingface.co/Babelscape/rebel-large):


# `pipeline` comes from Hugging Face Transformers; without this import the
# snippet fails with NameError on the first call below.
from transformers import pipeline

# Example input: a single Russian sentence to extract relations from.
text = '''За последние 9 месяцев инвесторы в азиатские долларовые долговые обязательства потеряли 155 миллиардов долларов, пострадав от слабости Китая в дополнение к глобальной распродаже фиксированного дохода, наблюдаемой во всем мире по мере роста процентных ставок. '''

# The same Hub repository provides both the model weights and the tokenizer.
model_path = r"memyprokotow/rut5-REBEL-base"
triplet_extractor = pipeline(
    'text2text-generation',
    model=model_path,
    tokenizer=model_path,
    # device=0,  # uncomment to run on the first GPU
)

# Ask the pipeline for raw token ids instead of text: the special marker
# tokens are needed later to split the output into triplets, so the ids are
# decoded manually with the tokenizer.
generated = triplet_extractor(
    text,
    return_tensors=True,
    return_text=False,
    max_length=500,
)
extracted_text = triplet_extractor.tokenizer.batch_decode(
    [generated[0]["generated_token_ids"]]
)

print(extracted_text[0])
# Function to parse the generated text and extract the triplets
def extract_triplets(text):
    """Parse linearized model output into a list of relation triplets.

    The text is read as a stream of whitespace-separated tokens in which
    ``<triplet>`` starts a new head entity, ``<subj>`` starts a tail entity
    and ``<obj>`` starts the relation name. Several ``<subj> ... <obj> ...``
    groups may follow one ``<triplet>``; they all share its head entity.
    The ``<s>``, ``<pad>`` and ``</s>`` tokens are removed before parsing.

    Args:
        text: Decoded generation output with the marker tokens still present.

    Returns:
        A list of dicts with the keys ``'head'``, ``'type'`` and ``'tail'``,
        in the order the triplets appear in ``text``.
    """
    import re  # local import: the surrounding snippet has no import section

    triplets = []
    subject, relation, object_ = '', '', ''
    current = 'x'  # 'x' = nothing started yet; 't'/'s'/'o' = head/tail/relation

    def flush():
        # Record the triplet accumulated so far.
        triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})

    cleaned = text.strip().replace("<s>", "").replace("<pad>", "").replace("</s>", "")
    # Surround every marker with spaces so it is recognised even when the
    # decoder emits it glued to the neighbouring word (e.g. "<triplet>Foo").
    cleaned = re.sub(r"(<triplet>|<subj>|<obj>)", r" \1 ", cleaned)

    for token in cleaned.split():
        if token == "<triplet>":
            current = 't'
            if relation != '':
                # A complete triplet precedes this new head entity.
                flush()
                relation = ''
            subject = ''
        elif token == "<subj>":
            current = 's'
            if relation != '':
                # Another tail for the same head: store the previous one first.
                flush()
            object_ = ''
        elif token == "<obj>":
            current = 'o'
            relation = ''
        elif current == 't':
            subject += ' ' + token
        elif current == 's':
            object_ += ' ' + token
        elif current == 'o':
            relation += ' ' + token

    # The last triplet has no following marker to trigger a flush.
    if subject != '' and relation != '' and object_ != '':
        flush()
    return triplets
# Turn the decoded generation into a list of head/type/tail dicts and show it.
extracted_triplets = extract_triplets(text=extracted_text[0])
print(extracted_triplets)
Downloads last month
16
Inference Examples
This model does not have enough activity to be deployed to Inference API (serverless) yet. Increase its social visibility and check back later, or deploy to Inference Endpoints (dedicated) instead.

Dataset used to train memyprokotow/rut5-REBEL-base