pedramyazdipoor committed on
Commit
646133d
·
1 Parent(s): addb83f

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +46 -4
README.md CHANGED
@@ -39,10 +39,52 @@ I trained for more than 1 epoch as well, but I get worse results.
39
  ## How to use
40
  ```python
41
  from transformers import AutoTokenizer, AutoModelForQuestionAnswering
42
- tokenizer = AutoTokenizer.from_pretrained('pedramyazdipoor/persian_xlm_roberta_large')
43
- model = AutoModel.from_pretrained('pedramyazdipoor/persian_xlm_roberta_large')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
45
  model.eval().to(device)
 
46
  text = 'سلام من پدرامم 26 سالمه'
47
  question = 'نامم چیست؟'
48
  print(tokenizer.tokenize(text + question))
@@ -54,10 +96,10 @@ encoding = tokenizer(text,question,add_special_tokens = True,
54
  truncation = 'only_first',
55
  max_length = 32)
56
  out = model(encoding['input_ids'].to(device),encoding['attention_mask'].to(device), encoding['token_type_ids'].to(device))
57
- #out.start_logits
58
- #out.end_logits
59
  >>> ['▁سلام', '▁من', '▁پدر', 'ام', 'م', '▁26', '▁سالم', 'ه', 'نام', 'م', '▁چیست', '؟']
60
  ```
 
61
  ## Acknowledgments
62
  We hereby express our gratitude to [Newsha Shahbodaghkhan](https://huggingface.co/datasets/newsha/PQuAD/tree/main) for facilitating dataset gathering.
63
  ## Contributors
 
39
  ## How to use
40
  ```python
41
  from transformers import AutoTokenizer, AutoModelForQuestionAnswering
42
+ path = 'pedramyazdipoor/persian_xlm_roberta_large'
43
+ tokenizer = AutoTokenizer.from_pretrained(path)
44
+ model = AutoModelForQuestionAnswering.from_pretrained(path)
45
+ ```
46
+
47
+ ```python
48
+ def generate_indexes(start_logits, end_logits, N, min_index_list):
49
+
50
+ output_start = start_logits
51
+ output_end = end_logits
52
+
53
+ start_indexes = np.arange(len(start_logits))
54
+ start_probs = output_start
55
+ list_start = dict(zip(start_indexes, start_probs.tolist()))
56
+ end_indexes = np.arange(len(end_logits))
57
+ end_probs = output_end
58
+ list_end = dict(zip(end_indexes, end_probs.tolist()))
59
+
60
+ sort_start_probs = sorted(list_start.items(), key=lambda x: x[1], reverse=True) #Descending sort by probability
61
+ sort_end_probs = sorted(list_end.items(), key=lambda x: x[1], reverse=True)
62
+ sorted_start_list = (sort_start_probs)
63
+ sorted_end_list = (sort_end_probs)
64
+
65
+ final_start_idx, final_end_idx = [[] for l in range(2)]
66
+
67
+ start_idx, end_idx, prob = 0, 0, (start_probs.tolist()[0] + end_probs.tolist()[0])
68
+ for a in range(0,N):
69
+ for b in range(0,N):
70
+ if (sorted_start_list[a][1] + sorted_end_list[b][1]) > prob :
71
+ if (sorted_start_list[a][0] <= sorted_end_list[b][0]) and (sorted_start_list[a][0] > min_index_list) :
72
+ prob = sorted_start_list[a][1] + sorted_end_list[b][1]
73
+ start_idx = sorted_start_list[a][0]
74
+ end_idx = sorted_end_list[b][0]
75
+ final_start_idx.append(start_idx)
76
+ final_end_idx.append(end_idx)
77
+
78
+ return final_start_idx, final_end_idx
79
+
80
+ print(generate_indexes(out['start_logits'][0][1:], out['end_logits'][0][1:], 5, 0))
81
+ print(tokenizer.tokenize(text+question))
82
+ ```
83
+
84
+ ```python
85
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
86
  model.eval().to(device)
87
+
88
  text = 'سلام من پدرامم 26 سالمه'
89
  question = 'نامم چیست؟'
90
  print(tokenizer.tokenize(text + question))
 
96
  truncation = 'only_first',
97
  max_length = 32)
98
  out = model(encoding['input_ids'].to(device),encoding['attention_mask'].to(device), encoding['token_type_ids'].to(device))
99
+
 
100
  >>> ['▁سلام', '▁من', '▁پدر', 'ام', 'م', '▁26', '▁سالم', 'ه', 'نام', 'م', '▁چیست', '؟']
101
  ```
102
+
103
  ## Acknowledgments
104
  We hereby express our gratitude to [Newsha Shahbodaghkhan](https://huggingface.co/datasets/newsha/PQuAD/tree/main) for facilitating dataset gathering.
105
  ## Contributors