Walid Aissa committed
Commit 288a5de · 1 Parent(s): 17cc89d

tokeniser tweaking

Files changed (1)
  1. app.py +41 -24
app.py CHANGED
@@ -80,62 +80,79 @@ def answer_question(question):
    # Apply the tokenizer to the input text, treating them as a text-pair.
    input_ids = tokenizer.encode(question, context)

-    # Report how long the input sequence is. If longer than 512 tokens, make it shorter.
-    while(len(input_ids) > 512):
-        input_ids.pop()

-    print('Query has {:,} tokens.\n'.format(len(input_ids)))

    # ======== Set Segment IDs ========
    # Search the input_ids for the first instance of the `[SEP]` token.
-    sep_index = input_ids.index(tokenizer.sep_token_id)

    # The number of segment A tokens includes the [SEP] token itself.
-    num_seg_a = sep_index + 1

    # The remainder are segment B.
-    num_seg_b = len(input_ids) - num_seg_a

    # Construct the list of 0s and 1s.
-    segment_ids = [0]*num_seg_a + [1]*num_seg_b

    # There should be a segment_id for every input token.
-    assert len(segment_ids) == len(input_ids)

    # ======== Evaluate ========
    # Run our example through the model.
-    outputs = model(torch.tensor([input_ids]), # The tokens representing our input text.
                    token_type_ids=torch.tensor([segment_ids]), # The segment IDs to differentiate question from answer_text
                    return_dict=True)

-    start_scores = outputs.start_logits
-    end_scores = outputs.end_logits
-    print(start_scores)
-    print(end_scores)

    # ======== Reconstruct Answer ========
    # Find the tokens with the highest `start` and `end` scores.
-    answer_start = torch.argmax(start_scores)
-    answer_end = torch.argmax(end_scores)

    # Get the string versions of the input tokens.
-    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    # Start with the first token.
-    answer = tokens[answer_start]

    # Select the remaining answer tokens and join them with whitespace.
-    for i in range(answer_start + 1, answer_end + 1):

        # If it's a subword token, then recombine it with the previous token.
-        if tokens[i][0:2] == '##':
-            answer += tokens[i][2:]

        # Otherwise, add a space then the token.
-        else:
-            answer += ' ' + tokens[i]

-    return 'Answer: "' + answer + '"'

# =====[ DEFINE INTERFACE ]===== #'
title = "Azza Conversational Agent"
 
    # Apply the tokenizer to the input text, treating them as a text-pair.
    input_ids = tokenizer.encode(question, context)

+    # Report how long the input sequence is. If longer than 512 tokens, divide it into multiple sequences.

+    print(f"Query has {len(input_ids)} tokens, divided in {len(input_ids)//513 + 1}.\n")

+    input_ids_split = []
+    for group in range(len(input_ids)//513):
+        input_ids_split.append(input_ids[512*group:512*(group+1)-1])
+    input_ids_split.append(input_ids[512*(len(input_ids)//513):len(input_ids)-1])
+
+    scores = []
+    for input in input_ids_split:
    # ======== Set Segment IDs ========
    # Search the input_ids for the first instance of the `[SEP]` token.
+        sep_index = input.index(tokenizer.sep_token_id)

    # The number of segment A tokens includes the [SEP] token itself.
+        num_seg_a = sep_index + 1

    # The remainder are segment B.
+        num_seg_b = len(input) - num_seg_a

    # Construct the list of 0s and 1s.
+        segment_ids = [0]*num_seg_a + [1]*num_seg_b

    # There should be a segment_id for every input token.
+        assert len(segment_ids) == len(input)

    # ======== Evaluate ========
    # Run our example through the model.
+        outputs = model(torch.tensor([input]), # The tokens representing our input text.
                    token_type_ids=torch.tensor([segment_ids]), # The segment IDs to differentiate question from answer_text
                    return_dict=True)

+        start_scores = outputs.start_logits
+        end_scores = outputs.end_logits
+
+        max_start_score = torch.max(start_scores)
+        max_end_score = torch.max(end_scores)
+
+        print(max_start_score)
+        print(max_end_score)

    # ======== Reconstruct Answer ========
    # Find the tokens with the highest `start` and `end` scores.
+        answer_start = torch.argmax(start_scores)
+        answer_end = torch.argmax(end_scores)

    # Get the string versions of the input tokens.
+        tokens = tokenizer.convert_ids_to_tokens(input_ids)

    # Start with the first token.
+        answer = tokens[answer_start]

    # Select the remaining answer tokens and join them with whitespace.
+        for i in range(answer_start + 1, answer_end + 1):

        # If it's a subword token, then recombine it with the previous token.
+            if tokens[i][0:2] == '##':
+                answer += tokens[i][2:]

        # Otherwise, add a space then the token.
+            else:
+                answer += ' ' + tokens[i]
+
+        scores.append((max_start_score, max_end_score, answer))
+
+    # Compare the scores of the answers found for each sequence and pick the most relevant.

+    final_answer = max(scores, key=lambda x: x[0] + x[1])[2]

# =====[ DEFINE INTERFACE ]===== #'
title = "Azza Conversational Agent"
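
For reference, below is a minimal, self-contained sketch of the chunk-and-score idea this commit introduces: split the tokenized input into pieces of at most 512 tokens, run the QA model on each piece, and keep the answer whose combined start/end logits are highest. The checkpoint name and the answer_question(question, context) signature are assumptions for illustration, not necessarily what app.py loads.

# Hypothetical illustration of the chunking approach; not the committed app.py.
import torch
from transformers import BertForQuestionAnswering, BertTokenizer

# Assumed checkpoint; the Space may load a different model.
MODEL_NAME = "bert-large-uncased-whole-word-masking-finetuned-squad"
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertForQuestionAnswering.from_pretrained(MODEL_NAME)

def answer_question(question, context, max_len=512):
    input_ids = tokenizer.encode(question, context)

    # Split the token ids into chunks of at most max_len tokens.
    chunks = [input_ids[i:i + max_len] for i in range(0, len(input_ids), max_len)]

    best_score, best_answer = None, ""
    for chunk in chunks:
        # Segment A runs up to and including the first [SEP]; the rest is
        # segment B. A chunk with no [SEP] (pure context) is all segment B.
        sep_index = chunk.index(tokenizer.sep_token_id) if tokenizer.sep_token_id in chunk else -1
        num_seg_a = sep_index + 1
        segment_ids = [0] * num_seg_a + [1] * (len(chunk) - num_seg_a)

        outputs = model(torch.tensor([chunk]),
                        token_type_ids=torch.tensor([segment_ids]),
                        return_dict=True)

        # Score the chunk by its best start logit plus its best end logit.
        score = (outputs.start_logits.max() + outputs.end_logits.max()).item()
        if best_score is None or score > best_score:
            start = int(torch.argmax(outputs.start_logits))
            end = int(torch.argmax(outputs.end_logits))
            tokens = tokenizer.convert_ids_to_tokens(chunk)
            best_answer = tokenizer.convert_tokens_to_string(tokens[start:end + 1])
            best_score = score

    return f'Answer: "{best_answer}"'

Note that chunks after the first no longer contain the question, which limits this simple split (as it does the committed version); the tokenizer's built-in overflow handling (truncation with return_overflowing_tokens=True and a stride) is another way to produce windows that keep the question in every chunk.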