Commit 646133d · Parent(s): addb83f

Update README.md

README.md CHANGED
@@ -39,10 +39,52 @@ I trained for more than 1 epoch as well, but I get worse results.
## How to use
```python
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
-
-
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.eval().to(device)
text = 'سلام من پدرامم 26 سالمه'
question = 'نامم چیست؟'
print(tokenizer.tokenize(text + question))
@@ -54,10 +96,10 @@ encoding = tokenizer(text,question,add_special_tokens = True,
truncation = 'only_first',
max_length = 32)
out = model(encoding['input_ids'].to(device), encoding['attention_mask'].to(device), encoding['token_type_ids'].to(device))
-
- #out.end_logits
>>> ['▁سلام', '▁من', '▁پدر', 'ام', 'م', '▁26', '▁سالم', 'ه', 'نام', 'م', '▁چیست', '؟']
```
## Acknowledgments
We hereby express our gratitude to [Newsha Shahbodaghkhan](https://huggingface.co/datasets/newsha/PQuAD/tree/main) for facilitating dataset gathering.
## Contributors
## How to use
```python
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
+ path = 'pedramyazdipoor/persian_xlm_roberta_large'
+ tokenizer = AutoTokenizer.from_pretrained(path)
+ model = AutoModelForQuestionAnswering.from_pretrained(path)
+ ```
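The added snippets below also use `torch` and `np` (NumPy), whose import lines are not visible in these hunks; a minimal preamble, assuming the usual package aliases, would be:

```python
# Assumed preamble (not shown in this diff): the snippets below rely on torch and numpy.
import torch
import numpy as np
```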
46 |
+
|
47 |
+
```python
|
48 |
+
def generate_indexes(start_logits, end_logits, N, min_index_list):
|
49 |
+
|
50 |
+
output_start = start_logits
|
51 |
+
output_end = end_logits
|
52 |
+
|
53 |
+
start_indexes = np.arange(len(start_logits))
|
54 |
+
start_probs = output_start
|
55 |
+
list_start = dict(zip(start_indexes, start_probs.tolist()))
|
56 |
+
end_indexes = np.arange(len(end_logits))
|
57 |
+
end_probs = output_end
|
58 |
+
list_end = dict(zip(end_indexes, end_probs.tolist()))
|
59 |
+
|
60 |
+
sort_start_probs = sorted(list_start.items(), key=lambda x: x[1], reverse=True) #Descending sort by probability
|
61 |
+
sort_end_probs = sorted(list_end.items(), key=lambda x: x[1], reverse=True)
|
62 |
+
sorted_start_list = (sort_start_probs)
|
63 |
+
sorted_end_list = (sort_end_probs)
|
64 |
+
|
65 |
+
final_start_idx, final_end_idx = [[] for l in range(2)]
|
66 |
+
|
67 |
+
start_idx, end_idx, prob = 0, 0, (start_probs.tolist()[0] + end_probs.tolist()[0])
|
68 |
+
for a in range(0,N):
|
69 |
+
for b in range(0,N):
|
70 |
+
if (sorted_start_list[a][1] + sorted_end_list[b][1]) > prob :
|
71 |
+
if (sorted_start_list[a][0] <= sorted_end_list[b][0]) and (sorted_start_list[a][0] > min_index_list) :
|
72 |
+
prob = sorted_start_list[a][1] + sorted_end_list[b][1]
|
73 |
+
start_idx = sorted_start_list[a][0]
|
74 |
+
end_idx = sorted_end_list[b][0]
|
75 |
+
final_start_idx.append(start_idx)
|
76 |
+
final_end_idx.append(end_idx)
|
77 |
+
|
78 |
+
return final_start_idx, final_end_idx
|
79 |
+
|
80 |
+
print(generate_indexes(out['start_logits'][0][1:], out['end_logits'][0][1:], 5, 0))
|
81 |
+
print(tokenizer.tokenize(text+question))
|
82 |
+
```
|
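As a usage illustration that is not part of this commit, the index lists returned by `generate_indexes` could be mapped back to answer text roughly as follows. The helper name `answer_span`, the `+1` offset handling, and the use of `tokenizer.decode` are assumptions for this sketch; `encoding` and `out` refer to the objects built in the inference snippet further down.

```python
# Illustrative sketch only, not code from the README.
# Assumes `encoding`, `out`, and `tokenizer` exist as in the inference snippet below,
# and that generate_indexes was fed logits sliced with [1:] (dropping the first position).
def answer_span(encoding, start_idx, end_idx):
    # +1 undoes the [1:] slice applied to the logits before ranking
    token_ids = encoding['input_ids'][0][start_idx + 1 : end_idx + 2]
    return tokenizer.decode(token_ids, skip_special_tokens=True)

starts, ends = generate_indexes(out['start_logits'][0][1:], out['end_logits'][0][1:], 5, 0)
if starts:
    # the last appended pair is the highest-scoring one, since prob only ever increases
    print(answer_span(encoding, starts[-1], ends[-1]))
```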
+ ```python
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.eval().to(device)
+
text = 'سلام من پدرامم 26 سالمه'  # "Hi, I am Pedram and I am 26 years old."
question = 'نامم چیست؟'  # "What is my name?"
print(tokenizer.tokenize(text + question))
[lines 91–95 unchanged and not shown; they open the encoding = tokenizer(text, question, add_special_tokens = True, ... call whose remaining arguments follow]
truncation = 'only_first',
max_length = 32)
out = model(encoding['input_ids'].to(device), encoding['attention_mask'].to(device), encoding['token_type_ids'].to(device))
+
>>> ['▁سلام', '▁من', '▁پدر', 'ام', 'م', '▁26', '▁سالم', 'ه', 'نام', 'م', '▁چیست', '؟']
```
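One caveat, offered as an assumption rather than something stated in the README: depending on the tokenizer configuration, XLM-RoBERTa tokenizers may not return `token_type_ids` by default, which would make the `encoding['token_type_ids']` lookup above fail. Requesting them explicitly is a safe variant; the `return_tensors='pt'` argument is also assumed here, since the lines defining `encoding` fall outside the displayed hunks.

```python
# Hedged variant of the tokenizer call: ask for token_type_ids explicitly so the
# encoding['token_type_ids'] lookup used above cannot raise a KeyError.
encoding = tokenizer(text, question,
                     add_special_tokens=True,
                     return_token_type_ids=True,  # assumption: not stated in the README
                     return_tensors='pt',         # assumption: needed for the .to(device) calls
                     truncation='only_first',
                     max_length=32)
```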
## Acknowledgments
We hereby express our gratitude to [Newsha Shahbodaghkhan](https://huggingface.co/datasets/newsha/PQuAD/tree/main) for facilitating dataset gathering.
## Contributors