anamargarida committed (verified)
Commit 238fa61 · Parent: a2f5856

Rename app_23.py to app_24.py

Files changed (1)
  1. app_23.py → app_24.py (+48 -3)
app_23.py → app_24.py RENAMED
@@ -60,7 +60,8 @@ def extract_arguments(text, tokenizer, model, beam_search=True):
         self.pretrained_signal_detector = False
 
     args = Args()
-    inputs = tokenizer(text, return_tensors="pt")
+    inputs = tokenizer(text, return_offsets_mapping=True, return_tensors="pt")
+
     # Get tokenized words (for reconstruction later)
     word_ids = inputs.word_ids()
 
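Note: return_offsets_mapping=True makes the tokenizer return, for every token, the (char_start, char_end) span it covers in the original string; the new add_tags_offset helper further down relies on this. A minimal standalone sketch of the returned structure, assuming a fast (Rust-backed) tokenizer; bert-base-uncased is a stand-in, since the app's actual checkpoint is not shown in this diff:

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # stand-in checkpoint
    inputs = tokenizer("Heavy rain caused flooding.",
                       return_offsets_mapping=True, return_tensors="pt")

    # One (char_start, char_end) pair per token; special tokens such as
    # [CLS] and [SEP] map to (0, 0).
    print(inputs["offset_mapping"][0].tolist())
    # e.g. [[0, 0], [0, 5], [6, 10], [11, 17], [18, 26], [26, 27], [0, 0]]

Slow, pure-Python tokenizers do not support return_offsets_mapping and raise an error, so this change assumes the Space loads a fast tokenizer.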
 
@@ -133,9 +134,14 @@ def extract_arguments(text, tokenizer, model, beam_search=True):
 
     tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
     token_ids = inputs["input_ids"][0]
+    offset_mapping = inputs["offset_mapping"][0].tolist()
 
     for i, (token, word_id) in enumerate(zip(tokens, word_ids)):
         st.write(f"Token {i}: {token}, Word ID: {word_id}")
+
+    st.write("Token & offset:")
+    for i, (token, offset) in enumerate(zip(tokens, offset_mapping)):
+        st.write(f"Token {i}: {token}, Offset: {offset}")
 
 
     st.write("Token Positions, IDs, and Corresponding Tokens:")
@@ -250,11 +256,50 @@ def extract_arguments(text, tokenizer, model, beam_search=True):
         # Join tokens back into a string
         return ' '.join(this_space_splitted_tokens)
 
+    def add_tags_offset(text, start_cause, end_cause, start_effect, end_effect, start_signal, end_signal):
+        """
+        Inserts tags into the original text based on token offsets.
+
+        Args:
+            text (str): The original input text.
+            tokenizer: The tokenizer used for tokenization.
+            start_cause (int): Start token index of the cause span.
+            end_cause (int): End token index of the cause span.
+            start_effect (int): Start token index of the effect span.
+            end_effect (int): End token index of the effect span.
+            start_signal (int, optional): Start token index of the signal span.
+            end_signal (int, optional): End token index of the signal span.
+
+        Returns:
+            str: The modified text with annotated spans.
+        """
+
+
+
+        # Convert token-based indices to character-based indices
+        start_cause_char, end_cause_char = offset_mapping[start_cause][0], offset_mapping[end_cause][1]
+        start_effect_char, end_effect_char = offset_mapping[start_effect][0], offset_mapping[end_effect][1]
+
+        # Insert tags into the original text
+        annotated_text = text[:start_cause_char] + "<ARG0>" + text[start_cause_char:end_cause_char] + "</ARG0>" + text[end_cause_char:start_effect_char] + "<ARG1>" + text[start_effect_char:end_effect_char] + "</ARG1>" + text[end_effect_char:]
+
+        # If signal span exists, insert signal tags
+        if start_signal is not None and end_signal is not None:
+            start_signal_char, end_signal_char = offset_mapping[start_signal][0], offset_mapping[end_signal][1]
+            annotated_text = (
+                annotated_text[:start_signal_char]
+                + "<SIG0>" + annotated_text[start_signal_char:end_signal_char] + "</SIG0>"
+                + annotated_text[end_signal_char:]
+            )
+
+        return annotated_text
+
+
 
 
     # Apply the tags to the sentence tokens
-    tagged_sentence1 = add_tags_find(input_text, word_ids, start_cause1, end_cause1, start_effect1, end_effect1, start_signal, end_signal)
-    tagged_sentence2 = add_tags_find(input_text, word_ids, start_cause2, end_cause2, start_effect2, end_effect2, start_signal, end_signal)
+    tagged_sentence1 = add_tags_offset(input_text, start_cause1, end_cause1, start_effect1, end_effect1, start_signal, end_signal)
+    tagged_sentence2 = add_tags_offset(input_text, start_cause2, end_cause2, start_effect2, end_effect2, start_signal, end_signal)
     return tagged_sentence1, tagged_sentence2
 
 
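Note: add_tags_offset reads offset_mapping from the enclosing extract_arguments scope rather than taking it as a parameter, and the docstring's tokenizer entry is left over from the earlier word-id-based helper; the function takes no tokenizer argument. Two caveats: the concatenation assumes the cause span precedes the effect span, and the character offsets index the original text, so a signal span that does not precede the cause span lands a few characters off once the <ARG0>/<ARG1> tags have been inserted. A standalone sketch of the same offset-based tagging that sidesteps both points by passing offset_mapping explicitly and inserting tags from right to left; the names are illustrative, not part of the app, and spans are assumed not to overlap:

    def add_tags_by_offset(text, offset_mapping, spans):
        """Insert tags around token spans.

        spans: list of (open_tag, close_tag, start_token, end_token) tuples.
        """
        # Token indices -> character indices via the offset mapping.
        char_spans = [(open_t, close_t, offset_mapping[s][0], offset_mapping[e][1])
                      for open_t, close_t, s, e in spans]
        # Insert the rightmost span first so the character offsets of the
        # spans still to be inserted remain valid.
        for open_t, close_t, cs, ce in sorted(char_spans, key=lambda sp: sp[2], reverse=True):
            text = text[:cs] + open_t + text[cs:ce] + close_t + text[ce:]
        return text

    # Hypothetical usage mirroring the call sites in the hunk above; the
    # signal span is only tagged when the model actually predicts one.
    spans = [("<ARG0>", "</ARG0>", start_cause1, end_cause1),
             ("<ARG1>", "</ARG1>", start_effect1, end_effect1)]
    if start_signal is not None and end_signal is not None:
        spans.append(("<SIG0>", "</SIG0>", start_signal, end_signal))
    tagged_sentence1 = add_tags_by_offset(input_text, offset_mapping, spans)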
 
 