pmkhanh7890 committed
Commit 56cf7e3 · 1 Parent(s): a6b0abd

add entities to demo

app.py DELETED
@@ -1,8 +0,0 @@
-import gradio as gr
-path = "T://Projects//prj-nict-ai-content-detection//example_image_input.jpg"
-html_code = input_image = f"""<img src="file://{path}" width="200" height="150">"""
-
-with gr.Blocks() as demo:
-    gr.HTML(html_code)
-
-demo.launch(share=False)
application_2.py CHANGED
@@ -85,11 +85,16 @@ with gr.Blocks() as demo:
         with gr.Accordion("Input News"):
             news_title = gr.Textbox(label="Title", value="")
             news_image = gr.Image(label="Image", type="filepath")
-            news_content = gr.Textbox(label="Content", value="", lines=12)
+            news_content = gr.Textbox(label="Content", value="", lines=13)

         # NEWS ANALYSIS REPORT
+        explanation = """
+        - Green text marks the words matched between the input and the source news.<br>
+        - Each highlighted pair (marked with a number) shows a key difference between the input text and the source.
+        """
         with gr.Column(scale=2):
             with gr.Accordion("News Analysis"):
+                gr.HTML(explanation)
                 detection_button = gr.Button("Verify news")
                 detailed_analysis = gr.HTML("<br>"*40)

@@ -124,13 +129,16 @@ with gr.Blocks() as demo:
             text_llm_topic = file.read()
         with open('examples/example_text_LLM_modification.txt', 'r', encoding='utf-8') as file:
             text_llm_modification = file.read()
+        with open('examples/example_text_LLM_entities.txt', 'r', encoding='utf-8') as file:
+            text_llm_entities = file.read()
     except FileNotFoundError:
         print("File not found.")
     except Exception as e:
         print(f"An error occurred: {e}")

-    title_1 = "Southampton news: Leeds target striker Cameron Archer"
-    title_2 = "Southampton news: Leeds target striker Cameron Archer"
+    title_1 = "Southampton news: Leeds target striker Cameron Archer."
+    title_2 = "Southampton news: Leeds target striker Cameron Archer."
+    title_4 = "Japan pledges support for Ukraine with 100-year pact."

     image_1 = "examples/example_image_real_1.jpg.webp"
     image_2 = "examples/example_image_real_2.jpg.webp"
@@ -141,6 +149,7 @@ with gr.Blocks() as demo:
             [title_1, image_1, text_real_1 + '\n\n' + text_real_2],
             [title_1, image_2, text_real_1 + '\n\n' + text_llm_modification],
             [title_1, image_3, text_real_1 + '\n\n' + text_llm_topic],
+            [title_4, image_3, text_llm_entities],
         ],
         inputs=[news_title, news_image, news_content],
         label="Examples",
@@ -148,7 +157,8 @@ with gr.Blocks() as demo:
             "2 real news",
             "1 real news + 1 LLM modification-based news",
             "1 real news + 1 LLM topic-based news",
+            "1 LLM changed-entities news",
         ],
     )

-demo.launch(share=True)
+demo.launch(share=False)
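A minimal sketch (not part of the commit) of how the new entity example surfaces in the UI. The parameter carrying the caption strings sits in unchanged context above, so `example_labels` is an assumption, as is the reuse of image_1's path in place of the unshown image_3:

import gradio as gr

with gr.Blocks() as demo:
    news_title = gr.Textbox(label="Title", value="")
    news_image = gr.Image(label="Image", type="filepath")
    news_content = gr.Textbox(label="Content", value="", lines=13)
    gr.Examples(
        examples=[
            # corresponds to [title_4, image_3, text_llm_entities] in the diff
            ["Japan pledges support for Ukraine with 100-year pact.",
             "examples/example_image_real_1.jpg.webp",  # stand-in for image_3
             "stand-in for examples/example_text_LLM_entities.txt"],
        ],
        inputs=[news_title, news_image, news_content],
        label="Examples",
        example_labels=["1 LLM changed-entities news"],  # assumed parameter name
    )

demo.launch(share=False)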
example_image_input.jpg DELETED
Binary file (25 kB)
 
examples/example_text_LLM_entities.txt ADDED
@@ -0,0 +1 @@
+Japan's Prime Minister has pledged to put Ukraine in the "strongest possible position" on a trip to Kyiv where he signed a "landmark" 100-year pact with the war-stricken country. The prime minister's visit on Sunday was at one point marked by loud blasts and air raid sirens after a reported Russian drone attack was intercepted by Ukraine's defence systems. Acknowledging the "hello" from Russia, Volodymyr Zelensky said Ukraine would send its own "hello back". An estimated two million people have been killed or wounded in the war so far. As the invasion reaches the end of its third year, Ukraine is losing territory in the north. Zelensky praised Japan's commitment on Sunday, amid wider concerns that US President-elect Donald Trump, who is set to take office on Monday, could potentially reduce aid.
src/application/content_detection.py CHANGED
@@ -1,7 +1,8 @@
1
  from difflib import SequenceMatcher
2
  import difflib
3
- from src.application.highlight_text import generate_color
4
  from src.application.image.image_detection import detect_image_by_ai_model, detect_image_by_reverse_search, detect_image_from_news_image
 
5
  from src.application.text.model_detection import detect_text_by_ai_model
6
  from src.application.text.preprocessing import split_into_sentences
7
  from src.application.text.search_detection import check_human, detect_text_by_relative_search
@@ -60,13 +61,22 @@ class NewsVerification():
60
  "paraphrase": False,
61
  "url": "",
62
  }
 
63
  for index, sentence in enumerate(input_sentences):
64
- if current_index >= index:
65
- continue
66
  print(f"-------index = {index}-------")
67
- paraphrase, text_url, searched_sentences, img_urls, current_index = detect_text_by_relative_search(input_sentences, index)
 
 
 
 
 
 
 
 
68
  if paraphrase is False:
69
  # add sentence to ai_sentence
 
 
70
  ai_sentence["input_sentence"] += sentence
71
  if index == len(input_sentences) - 1:
72
  # add ai_sentences to align_sentences
@@ -104,7 +114,6 @@ class NewsVerification():
104
  self.aligned_sentences.append(searched_sentences)
105
 
106
  previous_paraphrase = paraphrase
107
- #self.found_img_url = list(set(self.found_img_url))
108
 
109
  def detect_image_origin(self):
110
  print("CHECK IMAGE:")
@@ -173,22 +182,31 @@ class NewsVerification():
173
 
174
  def analyze_details(self):
175
  self.analyzed_table = []
176
- # IMAGES:
177
-
178
-
179
- # TEXT
180
- for pair in self.aligned_sentences:
181
- print(f"pair: {pair}")
182
- if "input_sentence" not in pair:
183
  continue
184
- input_words, source_words, input_indexes, source_indexes = (
185
- self.highlight_overlap_by_word_to_list(
186
- pair["input_sentence"],
187
- pair["matched_sentence"],
 
188
  )
189
- )
 
 
 
 
 
 
190
  self.analyzed_table.append(
191
- (input_words, source_words, input_indexes, source_indexes),
 
 
 
 
 
 
192
  )
193
 
194
  if len(self.analyzed_table) != 0:
@@ -197,92 +215,34 @@ class NewsVerification():
197
  html_table = ""
198
  return html_table
199
 
200
- def highlight_overlap_by_word_to_list(self, text1, text2):
201
- """
202
- Return
203
- - list of words in text1
204
- - list of words in text2
205
- - list of index of highlight words in text 1
206
- - list of index of highlight words in text 2
207
- """
208
- # TΓ‘ch chuα»—i thΓ nh cΓ‘c tα»« (word) dα»±a vΓ o khoαΊ£ng trαΊ―ng
209
- words1 = text1.split()
210
- words2 = text2.split()
211
-
212
- index1 = []
213
- index2 = []
214
-
215
- # Sα»­ dα»₯ng SequenceMatcher để tΓ¬m cΓ‘c Δ‘oαΊ‘n trΓΉng lαΊ·p giα»―a danh sΓ‘ch cΓ‘c tα»«
216
- matcher = SequenceMatcher(None, words1, words2)
217
 
218
- highlighted_text1 = []
219
- highlighted_text2 = []
220
-
221
- # Theo dΓ΅i vα»‹ trΓ­ hiện tαΊ‘i trong words1 vΓ  words2
222
- current_pos1 = 0
223
- current_pos2 = 0
224
-
225
- # LαΊ·p qua cΓ‘c Δ‘oαΊ‘n so khα»›p
226
- for match in matcher.get_matching_blocks():
227
- start1, start2, length = match
228
- print(start1, start2, length)
229
-
230
- # ThΓͺm cΓ‘c tα»« khΓ΄ng trΓΉng lαΊ·p vΓ o (giα»― nguyΓͺn)
231
- highlighted_text1.extend(words1[current_pos1:start1])
232
- highlighted_text2.extend(words2[current_pos2:start2])
233
-
234
- if length > 0:
235
- for i in range(start1, start1 + length):
236
- index1.append(i)
237
- for i in range(start2, start2 + length):
238
- index2.append(i)
239
-
240
- # CαΊ­p nhαΊ­t vα»‹ trΓ­ hiện tαΊ‘i
241
- current_pos1 = start1 + length
242
- current_pos2 = start2 + length
243
 
244
- return words1, words2, index1, index2
245
-
 
 
 
 
 
 
 
 
 
 
 
 
246
 
247
  def get_text_urls(self):
248
  return set(self.text_referent_url)
249
-
250
- def generate_colors_list(self, set_urls):
251
- color_dict = {}
252
- num_urls = len(set_urls)
253
- for i in range(num_urls):
254
- color_dict[i] = generate_color(i, num_urls)
255
-
256
- return color_dict
257
 
258
- def analyze_details_2(self):
259
- html_text = ""
260
-
261
- self.analyzed_table = []
262
- # TEXT
263
- # Assign unique colors to each index
264
- set_urls = self.get_text_urls()
265
- color_dict = self.generate_colors_list(set_urls)
266
-
267
- # position of the color in the input contents
268
- position = 0
269
- for pair in self.aligned_sentences:
270
- if "input_sentence" not in pair:
271
- continue
272
- common_phrases, position = self.compare_sentences(
273
- pair["input_sentence"],
274
- pair["matched_sentence"],
275
- position,
276
- color_dict["0"], # TODO: set color
277
- )
278
-
279
-
280
- if len(self.analyzed_table) != 0:
281
- html_table = self.create_table()
282
- else:
283
- html_table = ""
284
- return html_text, html_table
285
-
286
  def compare_sentences(self, sentence_1, sentence_2, position, color):
287
  """
288
  Compares two sentences and identifies common phrases, outputting their start and end positions.
@@ -358,22 +318,39 @@ class NewsVerification():
358
  """
359
 
360
  def format_text_row(self, row, index = 0, max_length=30):
361
- input_sentence = self.highlight_text(row[0], row[2]) # text, index of highlight words
362
- source_sentence = self.highlight_text(row[1], row[3]) # text, index of highlight words
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
363
 
364
  url = self.aligned_sentences[index]["url"] #
365
  short_url = self.shorten_url(url, max_length)
366
  source_text_url = f"""<a href="{url}">{short_url}</a>"""
367
 
368
- # short_url = self.shorten_url(self.text_referent_url[index], max_length)
369
- # source_text_url = f"""<a href="{self.text_referent_url[index]}">{short_url}</a>"""
370
- #label = self.aligned_sentences[index]["label"]
371
- print(self.aligned_sentences)
372
- print(index)
373
- label = self.aligned_sentences[index]["label"]
374
- score = self.aligned_sentences[index]["similarity"]
375
- return f"""<tr><td>{input_sentence}</td><td>{source_sentence}</td><td>{label}<br>({score*100:.2f}%)</td><td>{source_text_url}</td></tr>"""
376
-
377
  def format_image_row(self, max_length=30):
378
  # input_image = f"""<img src="example_image_input.jpg" width="200" height="150">"""
379
 
@@ -397,11 +374,118 @@ class NewsVerification():
397
  short_url = url
398
  return short_url
399
 
400
- def highlight_text(self, words, indexes):
401
- final_words = words
402
- for index in indexes:
403
- final_words[index] = (
404
- f"<span style='color:#00FF00; font-weight:bold;'>{words[index]}</span>"
405
- )
406
- return " ".join(final_words)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
407
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from difflib import SequenceMatcher
2
  import difflib
3
+ import string
4
  from src.application.image.image_detection import detect_image_by_ai_model, detect_image_by_reverse_search, detect_image_from_news_image
5
+ from src.application.text.entity import apply_highlight, highlight_entities
6
  from src.application.text.model_detection import detect_text_by_ai_model
7
  from src.application.text.preprocessing import split_into_sentences
8
  from src.application.text.search_detection import check_human, detect_text_by_relative_search
 
61
  "paraphrase": False,
62
  "url": "",
63
  }
64
+
65
  for index, sentence in enumerate(input_sentences):
 
 
66
  print(f"-------index = {index}-------")
67
+ print(f"current_sentence = {input_sentences[index]}")
68
+
69
+ if current_index >= len(input_sentences):
70
+ break
71
+ if current_index >= index and index != 0 and index != len(input_sentences) - 1:
72
+ continue
73
+
74
+ paraphrase, text_url, searched_sentences, img_urls, current_index = detect_text_by_relative_search(input_sentences, index)
75
+
76
  if paraphrase is False:
77
  # add sentence to ai_sentence
78
+ if ai_sentence["input_sentence"] != "":
79
+ ai_sentence["input_sentence"] += "<br>"
80
  ai_sentence["input_sentence"] += sentence
81
  if index == len(input_sentences) - 1:
82
  # add ai_sentences to align_sentences
 
114
  self.aligned_sentences.append(searched_sentences)
115
 
116
  previous_paraphrase = paraphrase
 
117
 
118
  def detect_image_origin(self):
119
  print("CHECK IMAGE:")
 
182
 
183
  def analyze_details(self):
184
  self.analyzed_table = []
185
+
186
+ for aligned_sentence in self.aligned_sentences:
187
+ if "input_sentence" not in aligned_sentence:
 
 
 
 
188
  continue
189
+
190
+ # Get index of equal phrases in input and source sentences
191
+ equal_idx_1, equal_idx_2 = self.extract_equal_text(
192
+ aligned_sentence["input_sentence"],
193
+ aligned_sentence["matched_sentence"],
194
  )
195
+
196
+ # Get entity-words (in pair) with colors
197
+ entities_with_colors = highlight_entities(
198
+ aligned_sentence["input_sentence"],
199
+ aligned_sentence["matched_sentence"],
200
+ )
201
+
202
  self.analyzed_table.append(
203
+ [
204
+ aligned_sentence["input_sentence"],
205
+ aligned_sentence["matched_sentence"],
206
+ equal_idx_1,
207
+ equal_idx_2,
208
+ entities_with_colors,
209
+ ]
210
  )
211
 
212
  if len(self.analyzed_table) != 0:
 
215
  html_table = ""
216
  return html_table
217
 
218
+ def extract_equal_text(self, text1, text2):
219
+ def cleanup(text):
220
+ text = text.lower()
221
+ text = text.translate(str.maketrans('', '', string.punctuation))
222
+ return text
 
 
 
 
 
 
 
 
 
 
 
 
223
 
224
+ splited_text1 = cleanup(text1).split()
225
+ splited_text2 = cleanup(text2).split()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
226
 
227
+ s = SequenceMatcher(None, splited_text1, splited_text2)
228
+
229
+ equal_idx_1 = []
230
+ equal_idx_2 = []
231
+ text1 = text1.split()
232
+ text2 = text2.split()
233
+ for tag, i1, i2, j1, j2 in s.get_opcodes():
234
+ if tag == 'equal':
235
+ equal_idx_1.append({"start": i1, "end": i2})
236
+ equal_idx_2.append({"start": j1, "end": j2})
237
+ # subtext_1 = " ".join(text1[i1:i2])
238
+ # subtext_2 = " ".join(text2[j1:j2])
239
+ # print(f'{tag:7} a[{i1:2}:{i2:2}] --> b[{j1:2}:{j1:2}] {subtext_1!r:>55} --> {subtext_2!r}')
240
+ return equal_idx_1, equal_idx_2
241
 
242
  def get_text_urls(self):
243
  return set(self.text_referent_url)
 
 
 
 
 
 
 
 
244
 
245
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
246
  def compare_sentences(self, sentence_1, sentence_2, position, color):
247
  """
248
  Compares two sentences and identifies common phrases, outputting their start and end positions.
 
318
  """
319
 
320
  def format_text_row(self, row, index = 0, max_length=30):
321
+ if row[1] != "": # source is not empty
322
+ # highlight entities
323
+ input_sentence, highlight_idx_input = apply_highlight(row[0], row[4], "input")
324
+ source_sentence, highlight_idx_source = apply_highlight(row[1], row[4], "source")
325
+ print(f"highlighted_input: {input_sentence}")
326
+
327
+ # Color overlapping words
328
+ input_sentence = self.color_text(input_sentence, row[2], highlight_idx_input) # text, index of highlight words
329
+ source_sentence = self.color_text(source_sentence, row[3], highlight_idx_source) # text, index of highlight words
330
+ print(f"input_sentence: {input_sentence}")
331
+
332
+ input_sentence = input_sentence.replace("span_style", "span style").replace("1px_4px", "1px 4px")
333
+ source_sentence = source_sentence.replace("span_style", "span style").replace("1px_4px", "1px 4px")
334
+ else:
335
+ input_sentence = row[0]
336
+ source_sentence = row[1]
337
+
338
+ label = self.aligned_sentences[index]["label"]
339
+ score = self.aligned_sentences[index]["similarity"]
340
 
341
  url = self.aligned_sentences[index]["url"] #
342
  short_url = self.shorten_url(url, max_length)
343
  source_text_url = f"""<a href="{url}">{short_url}</a>"""
344
 
345
+ return f"""
346
+ <tr>
347
+ <td>{input_sentence}</td>
348
+ <td>{source_sentence}</td>
349
+ <td>{label}<br>({score*100:.2f}%)</td>
350
+ <td>{source_text_url}</td>
351
+ </tr>
352
+ """
353
+
354
  def format_image_row(self, max_length=30):
355
  # input_image = f"""<img src="example_image_input.jpg" width="200" height="150">"""
356
 
 
374
  short_url = url
375
  return short_url
376
 
377
+ def color_text(self, text, colored_idx, highlighted_idx):
378
+ paragraph = ""
379
+ words = text.split()
380
+
381
+ starts, ends = self.extract_starts_ends(colored_idx)
382
+ starts, ends = self.filter_indices(starts, ends, highlighted_idx)
383
+ print(f"highlighted_idx: {highlighted_idx}")
384
+ print(f"starts_2: {starts}")
385
+ print(f"ends_2: {ends}")
386
+ previous_end = 0
387
+ for start, end in zip(starts, ends):
388
+ paragraph += " ".join(words[previous_end:start])
389
+
390
+ equal_words = " ".join(words[start:end])
391
+ print(f"starts_2: {start}")
392
+ print(f"ends_2: {end}")
393
+ print(f"equal_words: {words[start:end]}")
394
+ paragraph += f" <span style='color:#00FF00;'>{equal_words}</span> "
395
+
396
+ previous_end = end
397
+
398
+ # Some left words due to the punctuation separated from
399
+ # the highlighting text
400
+ equal_words = " ".join(words[previous_end:])
401
+ print(f"starts_2: {previous_end}")
402
+ print(f"ends_2: {len(words)-1}")
403
+ print(f"equal_words: {words[previous_end:]}")
404
+ paragraph += f" <span style='color:#00FF00;'>{equal_words}</span> "
405
+
406
+ return paragraph
407
+
408
+ def extract_starts_ends(self, colored_idx):
409
+ starts = []
410
+ ends = []
411
+ for index in colored_idx:
412
+ starts.append(index['start'])
413
+ ends.append(index['end'])
414
+ return starts, ends
415
+
416
 
417
+ def filter_indices(self, starts, ends, ignore_indices):
418
+ """
419
+ Filters start and end indices to exclude any indices present in the ignore_indices list.
420
+
421
+ Args:
422
+ starts: A list of starting indices.
423
+ ends: A list of ending indices. Must be the same length as starts.
424
+ ignore_indices: A list of indices to exclude.
425
+
426
+ Returns:
427
+ A tuple containing two new lists: filtered_starts and filtered_ends.
428
+ Returns empty lists if the input is invalid or if all ranges are filtered out.
429
+ Prints error messages for invalid input.
430
+
431
+ Examples:
432
+ starts = [0, 5, 10]
433
+ ends = [3, 7, 12]
434
+ ignore_indices = [1, 2, 11, 17]
435
+
436
+ # Output:
437
+ starts = [0, 3, 5, 10, 12]
438
+ ends = [0, 3, 7, 10, 12]
439
+
440
+ """
441
+
442
+ if len(starts) != len(ends):
443
+ print("Error: The 'starts' and 'ends' lists must have the same length.")
444
+ return [], []
445
+
446
+ filtered_starts = []
447
+ filtered_ends = []
448
+
449
+ for i in range(len(starts)):
450
+ start = starts[i]
451
+ end = ends[i]
452
+
453
+ if end < start:
454
+ print(f"Error: End index {end} is less than start index {start} at position {i}.")
455
+ return [], []
456
+
457
+
458
+ start_end = list(range(start, end + 1, 1))
459
+ start_end = list(set(start_end) - set(ignore_indices))
460
+ new_start, new_end = self.extract_sequences(start_end)
461
+ filtered_starts.extend(new_start)
462
+ filtered_ends.extend(new_end)
463
+
464
+ return filtered_starts, filtered_ends
465
+
466
+ def extract_sequences(self, numbers):
467
+ if len(numbers) == 1:
468
+ return [numbers[0]], [numbers[0]]
469
+
470
+ numbers.sort()
471
+ starts = []
472
+ ends = []
473
+ for i, number in enumerate(numbers):
474
+ if i == 0:
475
+ start = number
476
+ end = number
477
+ continue
478
+
479
+ if number - 1 == numbers[i-1]:
480
+ end = number
481
+ else:
482
+ starts.append(start)
483
+ ends.append(end + 1)
484
+ start = number
485
+ end = number
486
+
487
+ if i == len(numbers) - 1:
488
+ starts.append(start)
489
+ ends.append(end + 1)
490
+
491
+ return starts, ends
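For reference, a standalone sketch (not part of the commit) of the span format that extract_equal_text hands to color_text: equal runs of words become {"start", "end"} dicts with exclusive ends, computed on lowercased, punctuation-stripped word lists.

from difflib import SequenceMatcher
import string

def cleanup(text):
    return text.lower().translate(str.maketrans('', '', string.punctuation))

words1 = cleanup("Today is a good day.").split()  # ['today', 'is', 'a', 'good', 'day']
words2 = cleanup("Today is a bad day.").split()   # ['today', 'is', 'a', 'bad', 'day']

equal_idx = [
    {"start": i1, "end": i2}
    for tag, i1, i2, _, _ in SequenceMatcher(None, words1, words2).get_opcodes()
    if tag == 'equal'
]
print(equal_idx)  # [{'start': 0, 'end': 3}, {'start': 4, 'end': 5}] -- 'good' (index 3) is excluded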
src/application/text/entity.py ADDED
@@ -0,0 +1,320 @@
+import colorsys
+import json
+import re
+import openai
+from dotenv import load_dotenv
+import os
+from transformers import pipeline
+import gradio as gr
+
+ner_pipeline = pipeline("ner")
+
+load_dotenv()
+AZURE_OPENAI_API_KEY = os.getenv('AZURE_OPENAI_API_KEY')
+AZURE_OPENAI_ENDPOINT = os.getenv('AZURE_OPENAI_ENDPOINT')
+AZURE_OPENAI_API_VERSION = os.getenv('AZURE_OPENAI_API_VERSION')
+
+client = openai.AzureOpenAI(
+    api_version=AZURE_OPENAI_API_VERSION,
+    api_key=AZURE_OPENAI_API_KEY,
+    azure_endpoint=AZURE_OPENAI_ENDPOINT,
+)
+
+
+def extract_entities_gpt(original_text, compared_text, text_generation_model="gpt-4o-mini"):
+    # alternative: "o1-mini-2024-09-12"
+    prompt = f"""
+    Compare the ORIGINAL TEXT and the COMPARED TEXT.
+    Identify and extract pairs of corresponding entities where the paraphrasing has resulted in a *significant* change in meaning.
+    Focus *only* on entities where the paraphrasing has resulted in a *significant* change in meaning. This includes, but is not limited to:
+        * **Numerical changes:** e.g., "five" changed to "ten," "10%" changed to "50%"
+        * **Name changes:** e.g., "Tokyo" changed to "New York," "Japan" changed to "Japanese"
+        * **Opposite meanings:** e.g., "increase" changed to "decrease," "good" changed to "bad"
+        * **Semantically different words:** e.g., "car" changed to "truck," "walk" changed to "run"
+
+    Exclude entities where the meaning remains essentially the same, even if the wording is different (e.g., "big" changed to "large," "house" changed to "residence"). Also exclude purely stylistic changes that don't affect the core meaning.
+
+    Output the extracted entity pairs, one pair per line, in the following JSON-like list format:
+    [
+        ["ORIGINAL_TEXT_entity_1", "COMPARED_TEXT_entity_1"],
+        ["ORIGINAL_TEXT_entity_2", "COMPARED_TEXT_entity_2"]
+    ]
+
+    If no entity pairs satisfy the above conditions, output an empty list: []
+    ---
+    # ORIGINAL TEXT:
+    {original_text}
+    ---
+    # COMPARED TEXT:
+    {compared_text}
+    """
+
+    # Generate the entity pairs with the selected model
+    try:
+        response = client.chat.completions.create(
+            model=text_generation_model,
+            messages=[{"role": "system", "content": prompt}],
+        )
+        res = response.choices[0].message.content
+    except openai.OpenAIError as e:
+        print(f"Error interacting with OpenAI API: {e}")
+        res = ""
+
+    return res
+
+
+def read_json(json_string) -> list[list[str]]:
+    try:
+        entities = json.loads(json_string)
+        return entities
+    except json.JSONDecodeError as e:
+        print(f"Error decoding JSON: {e}")
+        return []
+
+
+def lighten_color(hex_color, factor=1.8):
+    """Lightens a HEX color by increasing its brightness in HSV space."""
+    hex_color = hex_color.lstrip("#")
+    r, g, b = int(hex_color[0:2], 16), int(hex_color[2:4], 16), int(hex_color[4:6], 16)
+
+    # Convert to HSV
+    h, s, v = colorsys.rgb_to_hsv(r / 255.0, g / 255.0, b / 255.0)
+    v = min(1.0, v * factor)  # Increase brightness
+
+    # Convert back to HEX
+    r, g, b = [int(c * 255) for c in colorsys.hsv_to_rgb(h, s, v)]
+    return f'#{r:02x}{g:02x}{b:02x}'
+
+
+def darken_color(hex_color, factor=0.7):
+    """Darkens a HEX color by reducing its brightness in HSV space."""
+    hex_color = hex_color.lstrip("#")
+    r, g, b = int(hex_color[0:2], 16), int(hex_color[2:4], 16), int(hex_color[4:6], 16)
+
+    # Convert to HSV to adjust brightness
+    h, s, v = colorsys.rgb_to_hsv(r / 255.0, g / 255.0, b / 255.0)
+    v = max(0, v * factor)  # Reduce brightness
+
+    # Convert back to HEX
+    r, g, b = [int(c * 255) for c in colorsys.hsv_to_rgb(h, s, v)]
+    return f'#{r:02x}{g:02x}{b:02x}'
+
+
+def generate_color(index, total_colors=20):
+    """Generates a unique, evenly spaced color for each index using HSL."""
+    hue = index / total_colors  # Spread hues in range [0,1]
+    saturation = 0.65  # Keep colors vivid
+    lightness = 0.75  # Balanced brightness
+
+    # Convert HSL to RGB
+    r, g, b = colorsys.hls_to_rgb(hue, lightness, saturation)
+    r, g, b = int(r * 255), int(g * 255), int(b * 255)
+
+    return f'#{r:02x}{g:02x}{b:02x}'  # Convert to hex
+
+
+def assign_colors_to_entities(entities):
+    total_colors = len(entities)
+    # Assign a distinct color to each entity pair
+    entities_colors = []
+    for index, entity in enumerate(entities):
+        color = generate_color(index, total_colors)
+        entities_colors.append({"color": color, "input": entity[0], "source": entity[1]})
+
+    return entities_colors
+
+
+def highlight_entities(text1, text2):
+    if text1 == "" or text2 == "":
+        return []
+
+    print(f"text1: {text1}")
+    print(f"text2: {text2}")
+    entities_text = extract_entities_gpt(text1, text2)
+    print(f"entities_text: {entities_text}")
+
+    entities = read_json(entities_text)
+
+    # Assign colors to entities
+    entities_with_colors = assign_colors_to_entities(entities)
+    print(f"entities_colors: {entities_with_colors}")
+
+    # Apply highlighting to entities
+    # highlighted_text_1 = apply_highlight(text1, entities_with_colors, "input")
+    # highlighted_text_2 = apply_highlight(text2, entities_with_colors, "source")
+
+    return entities_with_colors
+
+
+def apply_highlight(text, entities_with_colors, key="input"):
+    if entities_with_colors == []:
+        return text, []
+
+    all_starts = []
+    all_ends = []
+    highlighted_text = ""
+    temp_text = text
+    for index, entity in enumerate(entities_with_colors):
+        highlighted_text = ""
+
+        # Find the start and end of every occurrence of the entity in the text
+        starts = [m.start() for m in re.finditer(entity[key], temp_text)]
+        ends = [m.end() for m in re.finditer(entity[key], temp_text)]
+
+        all_starts.extend(starts)
+        all_ends.extend(ends)
+
+        color = entities_with_colors[index]["color"]
+        entity_color = lighten_color(color, factor=2.2)  # Lightened color for the text background
+        label_color = darken_color(entity_color, factor=0.7)  # Darker color for the index-label background
+
+        # Apply highlighting to each occurrence.
+        # Note: "span_style" and "1px_4px" deliberately avoid spaces so that each
+        # tag survives str.split(); the spaces are restored in format_text_row.
+        prev_end = 0
+        for start, end in zip(starts, ends):
+            # Append non-highlighted text
+            highlighted_text += temp_text[prev_end:start]
+
+            # Style the index as a label
+            index_label = (f'<span_style="background-color:{label_color};color:white;'
+                           f'padding:1px_4px;border-radius:4px;font-size:12px;'
+                           f'font-weight:bold;display:inline-block;margin-right:4px;">{index + 1}</span>')
+
+            # Append highlighted text with the index label
+            highlighted_text += (f'\n<span_style="background-color:{entity_color};color:black;'
+                                 f'border-radius:3px;font-size:14px;display:inline-block;">'
+                                 f'{index_label}{temp_text[start:end]}</span>\n')
+            prev_end = end
+        highlighted_text += temp_text[prev_end:]
+        temp_text = highlighted_text
+
+    if highlighted_text == "":
+        return text, []
+    highlight_idx_list = get_index_list(highlighted_text)
+    return highlighted_text, highlight_idx_list
+
+
+def get_index_list(highlighted_text):
+    """
+    Returns the word indices covered by the highlight spans in the
+    highlighted text, so that these positions can later be excluded
+    when the overlapping (equal) words are colored.
+    """
+    highlighted_index = []
+    words = highlighted_text.split()
+    for index, word in enumerate(words):
+        if word.startswith("<span_style"):
+            start_index = index
+        if word.endswith("</span>"):
+            end_index = index
+
+            highlighted_index.extend(list(range(start_index, end_index + 1)))
+
+    return highlighted_index
+
+
+def extract_entities(text):
+    output = ner_pipeline(text)
+    words = extract_words(output)
+    words = combine_subwords(words)
+
+    # Keep each entity word once, preserving order
+    entities = []
+    for entity in words:
+        if entity not in entities:
+            entities.append(entity)
+
+    return entities
+
+
+def extract_words(entities):
+    """
+    Extracts the words from a list of entities.
+
+    Args:
+        entities: A list of entities.
+
+    Returns:
+        A list of words extracted from the entities.
+    """
+    words = []
+    for entity in entities:
+        words.append(entity["word"])
+    return words
+
+
+def combine_subwords(word_list):
+    """
+    Combines subwords (indicated by "##") with the preceding word in a list.
+
+    Args:
+        word_list: A list of words, where subwords are prefixed with "##".
+
+    Returns:
+        A new list with subwords combined with their preceding words.
+    """
+    result = []
+    i = 0
+    while i < len(word_list):
+        if word_list[i].startswith("##"):
+            result[-1] += word_list[i][2:]  # Remove "##" and append to the previous word
+        elif i < len(word_list) - 2 and word_list[i + 1] == "-":  # Combine hyphenated words
+            result.append(word_list[i] + word_list[i + 1] + word_list[i + 2])
+            i += 2  # Skip the next two words
+        else:
+            result.append(word_list[i])
+        i += 1
+    return result
+
+
+original_text = """
+Title: UK pledges support for Ukraine with 100-year pact
+Content: Sir Keir Starmer has pledged to put Ukraine in the "strongest possible position" on a trip to Kyiv where he signed a "landmark" 100-year pact with the war-stricken country. The prime minister's visit on Thursday was at one point marked by loud blasts and air raid sirens after a reported Russian drone attack was intercepted by Ukraine's defence systems. Acknowledging the "hello" from Russia, Volodymyr Zelensky said Ukraine would send its own "hello back". An estimated one million people have been killed or wounded in the war so far. As the invasion reaches the end of its third year, Ukraine is losing territory in the east. Zelensky praised the UK's commitment on Thursday, amid wider concerns that the US President-elect Donald Trump, who is set to take office on Monday, could potentially reduce aid.
+"""
+compared_text = """
+Title: Japan pledges support for Ukraine with 100-year pact
+Content: A leading Japanese figure has pledged to put Ukraine in the "strongest possible position" on a trip to Kyiv where they signed a "landmark" 100-year pact with the war-stricken country. The visit on Thursday was at one point marked by loud blasts and air raid sirens after a reported Russian drone attack was intercepted by Ukraine's defence systems. Acknowledging the "hello" from Russia, Volodymyr Zelensky said Ukraine would send its own "hello back". An estimated one million people have been killed or wounded in the war so far. As the invasion reaches the end of its third year, Ukraine is losing territory in the east. Zelensky praised Japan's commitment on Thursday, amid wider concerns that the next US President, who is set to take office on Monday, could potentially reduce aid.
+"""
+
+if __name__ == "__main__":
+    # text = "The Saudi authorities, I am told, are currently working flat out" \
+    #     "to collate everything they have on the Magdeburg market suspect," \
+    #     "Taleb al-Abdulmohsen, and to share it with Germany's ongoing" \
+    #     "investigation"
+    # print(extract_entities(text))
+
+    with gr.Blocks() as demo:
+        gr.Markdown("### Highlight Matching Parts Between Two Paragraphs")
+        text1_input = gr.Textbox(
+            label="Paragraph 1",
+            lines=5,
+            value=original_text,
+        )
+        text2_input = gr.Textbox(
+            label="Paragraph 2",
+            lines=5,
+            value=compared_text,
+        )
+        submit_button = gr.Button("Highlight Matches")
+        output1 = gr.HTML("<br>" * 10)
+        output2 = gr.HTML("<br>" * 10)
+
+        submit_button.click(
+            fn=highlight_entities,
+            inputs=[text1_input, text2_input],
+            outputs=[output1, output2],
+        )
+
+    # Launch the Gradio app
+    demo.launch()
src/application/{highlight_text.py β†’ text/highlight_text.py} RENAMED
File without changes
src/application/text/identity.py DELETED
@@ -1,63 +0,0 @@
-from transformers import pipeline
-
-ner_pipeline = pipeline("ner")
-
-def extract_entities(text):
-    output = ner_pipeline(text)
-    words = extract_words(output)
-    words = combine_subwords(words)
-
-    # Keep each entity word once, preserving order
-    entities = []
-    for entity in words:
-        if entity not in entities:
-            entities.append(entity)
-
-    return entities
-
-
-def extract_words(entities):
-    """
-    Extracts the words from a list of entities.
-
-    Args:
-        entities: A list of entities.
-
-    Returns:
-        A list of words extracted from the entities.
-    """
-    words = []
-    for entity in entities:
-        words.append(entity["word"])
-    return words
-
-
-def combine_subwords(word_list):
-    """
-    Combines subwords (indicated by "##") with the preceding word in a list.
-
-    Args:
-        word_list: A list of words, where subwords are prefixed with "##".
-
-    Returns:
-        A new list with subwords combined with their preceding words.
-    """
-    result = []
-    i = 0
-    while i < len(word_list):
-        if word_list[i].startswith("##"):
-            result[-1] += word_list[i][2:]  # Remove "##" and append to the previous word
-        elif i < len(word_list) - 2 and word_list[i + 1] == "-":  # Combine hyphenated words
-            result.append(word_list[i] + word_list[i + 1] + word_list[i + 2])
-            i += 2  # Skip the next two words
-        else:
-            result.append(word_list[i])
-        i += 1
-    return result
-
-if __name__ == "__main__":
-    text = "The Saudi authorities, I am told, are currently working flat out" \
-        "to collate everything they have on the Magdeburg market suspect," \
-        "Taleb al-Abdulmohsen, and to share it with Germany's ongoing" \
-        "investigation"
-    print(extract_entities(text))
src/application/text/model_detection.py CHANGED
@@ -35,6 +35,7 @@ def detect_text_by_ai_model(
         truncation=True,
         device_map="auto",  # good for GPU usage
     )
+    input_text = input_text.replace("<br>", " ")
    result = pipe(input_text)[0]
    confidence_score = result["score"]
    if result["label"] == MODEL_HUMAN_LABEL[model]:
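The new line pairs with the "<br>" joins introduced in search_detection.py and content_detection.py in this same commit, so the classifier sees plain text rather than literal HTML tokens:

input_text = "First matched sentence.<br>Second matched sentence."
print(input_text.replace("<br>", " "))
# First matched sentence. Second matched sentence.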
src/application/text/preprocessing.py CHANGED
@@ -13,10 +13,10 @@ def split_into_sentences(input_text):
     if not isinstance(input_text, str):
         return []

-    paragraphs = input_text.splitlines()
+    paragraphs = input_text.splitlines(keepends=True)
     sentences = []
     for paragraph in paragraphs:
         paragraph = paragraph.strip()
-        if paragraph:
+        if paragraph and paragraph != '\n':
             sentences.extend(sent_tokenize(paragraph))
     return sentences
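A minimal check of the splitter's behavior, assuming NLTK's punkt data is available. Note that paragraph.strip() already removes newlines, so blank lines fail the `if paragraph` test before the new `!= '\n'` comparison is reached:

from nltk.tokenize import sent_tokenize  # split_into_sentences wraps this

text = "First sentence. Second sentence.\n\nNew paragraph."
sentences = []
for paragraph in text.splitlines(keepends=True):
    paragraph = paragraph.strip()
    if paragraph and paragraph != '\n':
        sentences.extend(sent_tokenize(paragraph))
print(sentences)  # ['First sentence.', 'Second sentence.', 'New paragraph.']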
src/application/text/search.py CHANGED
@@ -7,7 +7,7 @@ from nltk.corpus import stopwords
 from nltk.tokenize import word_tokenize
 from sklearn.feature_extraction.text import TfidfVectorizer

-from src.application.text.identity import extract_entities
+from src.application.text.entity import extract_entities

 load_dotenv()
 GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
src/application/text/search_detection.py CHANGED
@@ -41,7 +41,7 @@ def detect_text_by_relative_search(input_text, index, is_support_opposite = False):
     search_results = search_by_google(candidate)
     urls = [item['link'] for item in search_results.get("items", [])]

-    for url in urls[:3]:
+    for url in urls[:10]:
         if url in checked_urls:  # visited url
             continue
         if "bbc.com" not in url:
@@ -58,11 +58,10 @@ def detect_text_by_relative_search(input_text, index, is_support_opposite = False):
             continue

         page_text = content.title + "\n" + content.text
-        print(f"page_text: {page_text}")
         if len(page_text) > MAX_CHAR_SIZE:
             print(f"\t\t\t↑↑↑ More than {MAX_CHAR_SIZE} characters")
             continue
-
+        print(f"\t\t\t↑↑↑ Title: {content.title}")
         paraphrase, aligned_first_sentences = check_paraphrase(input_text[index], page_text, url)

         if paraphrase is False:
@@ -71,15 +70,17 @@ def detect_text_by_relative_search(input_text, index, is_support_opposite = False):
             sub_paraphrase = True
             while sub_paraphrase == True:
                 index += 1
-                print(f"----search {index}----")
+                print(f"----search {index} < {len(input_text)}----")
                 if index >= len(input_text):
+                    print(f"input_text_last: {input_text[-1]}")
                     break
+                print(f"input_text: {input_text[index]}")
                 sub_paraphrase, sub_sentences = check_paraphrase(input_text[index], page_text, url)
                 print(f"sub_paraphrase: {sub_paraphrase}")
                 print(f"sub_sentences: {sub_sentences}")
-                if sub_paraphrase == True:
-                    aligned_first_sentences["input_sentence"] += sub_sentences["input_sentence"]
-                    aligned_first_sentences["matched_sentence"] += sub_sentences["matched_sentence"]
+                if sub_paraphrase == True:
+                    aligned_first_sentences["input_sentence"] += "<br>" + sub_sentences["input_sentence"]
+                    aligned_first_sentences["matched_sentence"] += "<br>" + sub_sentences["matched_sentence"]
                     aligned_first_sentences["similarity"] += sub_sentences["similarity"]
                     aligned_first_sentences["similarity"] /= 2

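One detail of the merge loop above: `similarity += ...; similarity /= 2` is a running average, so the most recent sentence carries as much weight as all earlier ones combined. A tiny trace with exactly representable values:

similarity = 1.0
for sub_similarity in (0.5, 0.25):
    similarity += sub_similarity
    similarity /= 2
print(similarity)  # 0.5 -> weights: 1/4, 1/4, 1/2 for the three sentences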
test.py ADDED
@@ -0,0 +1,46 @@
+from difflib import SequenceMatcher
+import string
+
+def extract_equal_text(text1, text2):
+    def cleanup(text):
+        text = text.lower()
+        text = text.translate(str.maketrans('', '', string.punctuation))
+        return text
+
+    splited_text1 = cleanup(text1).split()
+    splited_text2 = cleanup(text2).split()
+
+    s = SequenceMatcher(None, splited_text1, splited_text2)
+
+    equal_idx_1 = []
+    equal_idx_2 = []
+    text1 = text1.split()
+    text2 = text2.split()
+    for tag, i1, i2, j1, j2 in s.get_opcodes():
+        if tag == 'equal':
+            equal_idx_1.append({"start": i1, "end": i2})
+            equal_idx_2.append({"start": j1, "end": j2})
+            subtext_1 = " ".join(text1[i1:i2])
+            subtext_2 = " ".join(text2[j1:j2])
+            print(f'{tag:7} a[{i1:2}:{i2:2}] --> b[{j1:2}:{j2:2}] {subtext_1!r:>55} --> {subtext_2!r}')
+
+    return equal_idx_1, equal_idx_2
+
+text1 = """
+Miguel Almiron has permanently rejoined Atlanta United from Newcastle United for £8m.
+Almiron made 223 appearances for Newcastle, scoring 30 goals, but recently struggled for a starting place under Eddie Howe.
+He made a substitute appearance and waved farewell to fans in Newcastle's recent win against Southampton.
+Almiron played a key role in Newcastle reaching the Carabao Cup final and their Premier League top-four finish in 2022-23, and scored against Paris St-Germain in the Champions League.
+"""
+text2 = """
+Newcastle United winger Miguel Almiron has rejoined Atlanta United on a permanent deal for £8m.
+Almiron has made 223 appearances for Newcastle, scoring 30 goals, but has struggled recently to gain a place in manager Eddie Howe's starting line-up.
+Last weekend he came on as a substitute in Newcastle's 3-1 win against Southampton and waved farewell to the travelling supporters.
+Almiron played a significant role in Newcastle reaching the Carabao Cup final and finishing fourth in the Premier League in 2022-23.
+"""
+
+idx_1, idx_2 = extract_equal_text(text1, text2)
+
+# text1_split = text1.split()
+# for idx in idx_1:
+#     print(text1_split[idx["start"]:idx["end"]])