pedramyazdipoor committed on
Commit
646133d
·
1 Parent(s): addb83f

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +46 -4
README.md CHANGED
@@ -39,10 +39,52 @@ I trained for more than 1 epoch as well, but I get worse results.
39
  ## How to use
40
  ```python
41
  from transformers import AutoTokenizer, AutoModelForQuestionAnswering
42
- tokenizer = AutoTokenizer.from_pretrained('pedramyazdipoor/persian_xlm_roberta_large')
43
- model = AutoModel.from_pretrained('pedramyazdipoor/persian_xlm_roberta_large')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
45
  model.eval().to(device)
 
46
  text = 'سلام من پدرامم 26 سالمه'
47
  question = 'نامم چیست؟'
48
  print(tokenizer.tokenize(text + question))
@@ -54,10 +96,10 @@ encoding = tokenizer(text,question,add_special_tokens = True,
54
  truncation = 'only_first',
55
  max_length = 32)
56
  out = model(encoding['input_ids'].to(device),encoding['attention_mask'].to(device), encoding['token_type_ids'].to(device))
57
- #out.start_logits
58
- #out.end_logits
59
  >>> ['▁سلام', '▁من', '▁پدر', 'ام', 'م', '▁26', '▁سالم', 'ه', 'نام', 'م', '▁چیست', '؟']
60
  ```
 
61
  ## Acknowledgments
62
  We hereby express our gratitude to [Newsha Shahbodaghkhan](https://huggingface.co/datasets/newsha/PQuAD/tree/main) for facilitating dataset gathering.
63
  ## Contributors
 
39
  ## How to use
40
  ```python
41
  from transformers import AutoTokenizer, AutoModelForQuestionAnswering
42
+ path = 'pedramyazdipoor/persian_xlm_roberta_large'
43
+ tokenizer = AutoTokenizer.from_pretrained(path)
44
+ model = AutoModelForQuestionAnswering.from_pretrained(path)
45
+ ```
46
+
47
+ ```python
48
+ def generate_indexes(start_logits, end_logits, N, min_index_list):
49
+
50
+ output_start = start_logits
51
+ output_end = end_logits
52
+
53
+ start_indexes = np.arange(len(start_logits))
54
+ start_probs = output_start
55
+ list_start = dict(zip(start_indexes, start_probs.tolist()))
56
+ end_indexes = np.arange(len(end_logits))
57
+ end_probs = output_end
58
+ list_end = dict(zip(end_indexes, end_probs.tolist()))
59
+
60
+ sort_start_probs = sorted(list_start.items(), key=lambda x: x[1], reverse=True) #Descending sort by probability
61
+ sort_end_probs = sorted(list_end.items(), key=lambda x: x[1], reverse=True)
62
+ sorted_start_list = (sort_start_probs)
63
+ sorted_end_list = (sort_end_probs)
64
+
65
+ final_start_idx, final_end_idx = [[] for l in range(2)]
66
+
67
+ start_idx, end_idx, prob = 0, 0, (start_probs.tolist()[0] + end_probs.tolist()[0])
68
+ for a in range(0,N):
69
+ for b in range(0,N):
70
+ if (sorted_start_list[a][1] + sorted_end_list[b][1]) > prob :
71
+ if (sorted_start_list[a][0] <= sorted_end_list[b][0]) and (sorted_start_list[a][0] > min_index_list) :
72
+ prob = sorted_start_list[a][1] + sorted_end_list[b][1]
73
+ start_idx = sorted_start_list[a][0]
74
+ end_idx = sorted_end_list[b][0]
75
+ final_start_idx.append(start_idx)
76
+ final_end_idx.append(end_idx)
77
+
78
+ return final_start_idx, final_end_idx
79
+
80
+ print(generate_indexes(out['start_logits'][0][1:], out['end_logits'][0][1:], 5, 0))
81
+ print(tokenizer.tokenize(text+question))
82
+ ```
83
+
84
+ ```python
85
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
86
  model.eval().to(device)
87
+
88
  text = 'سلام من پدرامم 26 سالمه'
89
  question = 'نامم چیست؟'
90
  print(tokenizer.tokenize(text + question))
 
96
  truncation = 'only_first',
97
  max_length = 32)
98
  out = model(encoding['input_ids'].to(device),encoding['attention_mask'].to(device), encoding['token_type_ids'].to(device))
99
+
 
100
  >>> ['▁سلام', '▁من', '▁پدر', 'ام', 'م', '▁26', '▁سالم', 'ه', 'نام', 'م', '▁چیست', '؟']
101
  ```
102
+
103
  ## Acknowledgments
104
  We hereby express our gratitude to [Newsha Shahbodaghkhan](https://huggingface.co/datasets/newsha/PQuAD/tree/main) for facilitating dataset gathering.
105
  ## Contributors