pmkhanh7890 commited on
Commit
504f37b
Β·
1 Parent(s): 8617832

solve bugs, update combination score and label, add method for better searching.

Browse files
application.py CHANGED
@@ -8,10 +8,6 @@ from src.application.content_detection import NewsVerification
8
  from src.application.url_reader import URLReader
9
  from src.application.content_generation import generate_fake_image, generate_fake_text, replace_text
10
 
11
-
12
- GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
13
- SEARCH_ENGINE_ID = os.getenv('SEARCH_ENGINE_ID')
14
-
15
  AZURE_TEXT_MODEL = ["gpt-4o-mini", "gpt-4o"]
16
  AZURE_IMAGE_MODEL = ["dall-e-3", "Stable Diffusion (not supported)"]
17
 
@@ -43,14 +39,6 @@ def load_url(url):
43
 
44
 
45
  def generate_analysis_report(news_title:str, news_content: str, news_image: Image):
46
- if news_image is not None:
47
- # Convert to PIL Image for easier saving
48
- img = Image.open(news_image)
49
-
50
- # Save the image (you can customize the filename)
51
- filepath = "example_image_input.jpg" # Or use a dynamic filename
52
- img.save(filepath)
53
-
54
  news_analysis = NewsVerification()
55
  news_analysis.load_news(news_title, news_content, news_image)
56
  news_analysis.generate_analysis_report()
@@ -60,7 +48,7 @@ def generate_analysis_report(news_title:str, news_content: str, news_image: Imag
60
  # Define the GUI
61
  with gr.Blocks() as demo:
62
  gr.Markdown("# NEWS VERIFICATION")
63
-
64
  with gr.Row():
65
  # SETTINGS
66
  with gr.Column(scale=1):
@@ -93,13 +81,59 @@ with gr.Blocks() as demo:
93
  with gr.Accordion("Input News"):
94
  news_title = gr.Textbox(label="Title", value="")
95
  news_image = gr.Image(label="Image", type="filepath")
96
- news_content = gr.Textbox(label="Content", value="", lines=12)
97
 
98
  # NEWS ANALYSIS REPORT
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
  with gr.Column(scale=2):
100
- with gr.Accordion("News Analysis"):
101
- detection_button = gr.Button("Verify news")
102
- detailed_analysis = gr.HTML("<br>"*40)
 
 
 
 
 
 
 
 
103
 
104
  # Connect events
105
  load_button.click(
@@ -116,9 +150,9 @@ with gr.Blocks() as demo:
116
  generate_image_button.click(generate_fake_image,
117
  inputs=[image_generation_model, news_title],
118
  outputs=[news_image])
119
- detection_button.click(generate_analysis_report,
120
  inputs=[news_title, news_content, news_image],
121
- outputs=[detailed_analysis])
122
 
123
  # change Image
124
  #url_input.change(load_image, inputs=url_input, outputs=image_view)
@@ -132,23 +166,28 @@ with gr.Blocks() as demo:
132
  text_llm_topic = file.read()
133
  with open('examples/example_text_LLM_modification.txt','r', encoding='utf-8') as file:
134
  text_llm_modification = file.read()
 
 
135
  except FileNotFoundError:
136
  print("File not found.")
137
  except Exception as e:
138
  print(f"An error occurred: {e}")
139
 
140
- title_1 = "Southampton news: Leeds target striker Cameron Archer"
141
- title_2 = "Southampton news: Leeds target striker Cameron Archer"
 
142
 
143
  image_1 = "examples/example_image_real_1.jpg.webp"
144
  image_2 = "examples/example_image_real_2.jpg.webp"
145
  image_3 = "examples/example_image_real_3.jpg"
 
146
 
147
  gr.Examples(
148
  examples=[
149
  [title_1, image_1, text_real_1 + '\n\n' + text_real_2],
150
  [title_1, image_2, text_real_1 + '\n\n' + text_llm_modification],
151
  [title_1, image_3, text_real_1 + '\n\n' + text_llm_topic],
 
152
  ],
153
  inputs=[news_title, news_image, news_content],
154
  label="Examples",
@@ -156,7 +195,8 @@ with gr.Blocks() as demo:
156
  "2 real news",
157
  "1 real news + 1 LLM modification-based news",
158
  "1 real news + 1 LLM topic-based news",
 
159
  ],
160
  )
161
 
162
- demo.launch(share=False)
 
8
  from src.application.url_reader import URLReader
9
  from src.application.content_generation import generate_fake_image, generate_fake_text, replace_text
10
 
 
 
 
 
11
  AZURE_TEXT_MODEL = ["gpt-4o-mini", "gpt-4o"]
12
  AZURE_IMAGE_MODEL = ["dall-e-3", "Stable Diffusion (not supported)"]
13
 
 
39
 
40
 
41
  def generate_analysis_report(news_title:str, news_content: str, news_image: Image):
 
 
 
 
 
 
 
 
42
  news_analysis = NewsVerification()
43
  news_analysis.load_news(news_title, news_content, news_image)
44
  news_analysis.generate_analysis_report()
 
48
  # Define the GUI
49
  with gr.Blocks() as demo:
50
  gr.Markdown("# NEWS VERIFICATION")
51
+
52
  with gr.Row():
53
  # SETTINGS
54
  with gr.Column(scale=1):
 
81
  with gr.Accordion("Input News"):
82
  news_title = gr.Textbox(label="Title", value="")
83
  news_image = gr.Image(label="Image", type="filepath")
84
+ news_content = gr.Textbox(label="Content", value="", lines=13)
85
 
86
  # NEWS ANALYSIS REPORT
87
+ ordinary_user_explanation = """
88
+ FOR ORDINARY USER<br>
89
+ - Green texts are the matched words in the input and source news.<br>
90
+ - Each highlighted pair (marked with a number) shows the key differences between the input text and the source.
91
+ """
92
+ fact_checker_explanation = """
93
+ FOR FACT CHECKER<br>
94
+ - Green texts are the matched words in the input and source news.<br>
95
+ - Each highlighted pair (marked with a number) shows the key differences between the input text and the source.
96
+ """
97
+ governor_explanation = """
98
+ FOR GOVERNOR<br>
99
+ - Green texts are the matched words in the input and source news.<br>
100
+ - Each highlighted pair (marked with a number) shows the key differences between the input text and the source.
101
+ """
102
+ table = """
103
+ <h5>Comparison between input news and source news:</h5>
104
+ <table border="1" style="width:100%; text-align:left; border-collapse:collapse;">
105
+ <col style="width: 170px;"> <col style="width: 170px;"> <col style="width: 30px;"> <col style="width: 75px;">
106
+ <thead>
107
+ <tr>
108
+ <th>Input news</th>
109
+ <th>Source (URL provided in Originality column correspondingly)</th>
110
+ <th>Forensic</th>
111
+ <th>Originality</th>
112
+ </tr>
113
+ </thead>
114
+ <tbody>
115
+ <tr>
116
+ <th>TBD</th>
117
+ <th>TBD</th>
118
+ <th>TBD</th>
119
+ <th>TBD</th>
120
+ </tr>
121
+ </tbody>
122
+ </table>
123
+
124
+ <style>"""
125
  with gr.Column(scale=2):
126
+ with gr.Accordion("NEWS ANALYSIS"):
127
+ verification_button = gr.Button("Verify news")
128
+ with gr.Tab("Orinary User"):
129
+ gr.HTML(ordinary_user_explanation)
130
+ ordinary_user_result = gr.HTML(table)
131
+ with gr.Tab("Fact Checker"):
132
+ gr.HTML(fact_checker_explanation)
133
+ fact_checker_result = gr.HTML(table)
134
+ with gr.Tab("Governor"):
135
+ gr.HTML(governor_explanation)
136
+ governor_result = gr.HTML(table)
137
 
138
  # Connect events
139
  load_button.click(
 
150
  generate_image_button.click(generate_fake_image,
151
  inputs=[image_generation_model, news_title],
152
  outputs=[news_image])
153
+ verification_button.click(generate_analysis_report,
154
  inputs=[news_title, news_content, news_image],
155
+ outputs=[ordinary_user_result, fact_checker_result, governor_result])
156
 
157
  # change Image
158
  #url_input.change(load_image, inputs=url_input, outputs=image_view)
 
166
  text_llm_topic = file.read()
167
  with open('examples/example_text_LLM_modification.txt','r', encoding='utf-8') as file:
168
  text_llm_modification = file.read()
169
+ with open('examples/example_text_LLM_entities.txt','r', encoding='utf-8') as file:
170
+ text_llm_entities = file.read()
171
  except FileNotFoundError:
172
  print("File not found.")
173
  except Exception as e:
174
  print(f"An error occurred: {e}")
175
 
176
+ title_1 = "Southampton news: Leeds target striker Cameron Archer."
177
+ title_2 = "Southampton news: Leeds target striker Cameron Archer."
178
+ title_4 = "Japan pledges support for Ukraine with 100-year pact."
179
 
180
  image_1 = "examples/example_image_real_1.jpg.webp"
181
  image_2 = "examples/example_image_real_2.jpg.webp"
182
  image_3 = "examples/example_image_real_3.jpg"
183
+ image_4 = "examples/example_image_real_4.jpg.webp"
184
 
185
  gr.Examples(
186
  examples=[
187
  [title_1, image_1, text_real_1 + '\n\n' + text_real_2],
188
  [title_1, image_2, text_real_1 + '\n\n' + text_llm_modification],
189
  [title_1, image_3, text_real_1 + '\n\n' + text_llm_topic],
190
+ [title_4, image_4, text_llm_entities],
191
  ],
192
  inputs=[news_title, news_image, news_content],
193
  label="Examples",
 
195
  "2 real news",
196
  "1 real news + 1 LLM modification-based news",
197
  "1 real news + 1 LLM topic-based news",
198
+ "1 LLM changed-entities news",
199
  ],
200
  )
201
 
202
+ demo.launch(share=True)
application_2.py CHANGED
@@ -100,7 +100,7 @@ with gr.Blocks() as demo:
100
  - Each highlighted pair (marked with a number) shows the key differences between the input text and the source.
101
  """
102
  table = """
103
- <h5>Comparison between input news and source news</h5>
104
  <table border="1" style="width:100%; text-align:left; border-collapse:collapse;">
105
  <col style="width: 170px;"> <col style="width: 170px;"> <col style="width: 30px;"> <col style="width: 75px;">
106
  <thead>
@@ -132,7 +132,7 @@ with gr.Blocks() as demo:
132
  gr.HTML(fact_checker_explanation)
133
  fact_checker_result = gr.HTML(table)
134
  with gr.Tab("Governor"):
135
- gr.HTML(fact_checker_explanation)
136
  governor_result = gr.HTML(table)
137
 
138
  # Connect events
@@ -180,13 +180,14 @@ with gr.Blocks() as demo:
180
  image_1 = "examples/example_image_real_1.jpg.webp"
181
  image_2 = "examples/example_image_real_2.jpg.webp"
182
  image_3 = "examples/example_image_real_3.jpg"
 
183
 
184
  gr.Examples(
185
  examples=[
186
  [title_1, image_1, text_real_1 + '\n\n' + text_real_2],
187
  [title_1, image_2, text_real_1 + '\n\n' + text_llm_modification],
188
  [title_1, image_3, text_real_1 + '\n\n' + text_llm_topic],
189
- [title_4, image_3, text_llm_entities],
190
  ],
191
  inputs=[news_title, news_image, news_content],
192
  label="Examples",
@@ -198,4 +199,4 @@ with gr.Blocks() as demo:
198
  ],
199
  )
200
 
201
- demo.launch(share=False)
 
100
  - Each highlighted pair (marked with a number) shows the key differences between the input text and the source.
101
  """
102
  table = """
103
+ <h5>Comparison between input news and source news:</h5>
104
  <table border="1" style="width:100%; text-align:left; border-collapse:collapse;">
105
  <col style="width: 170px;"> <col style="width: 170px;"> <col style="width: 30px;"> <col style="width: 75px;">
106
  <thead>
 
132
  gr.HTML(fact_checker_explanation)
133
  fact_checker_result = gr.HTML(table)
134
  with gr.Tab("Governor"):
135
+ gr.HTML(governor_explanation)
136
  governor_result = gr.HTML(table)
137
 
138
  # Connect events
 
180
  image_1 = "examples/example_image_real_1.jpg.webp"
181
  image_2 = "examples/example_image_real_2.jpg.webp"
182
  image_3 = "examples/example_image_real_3.jpg"
183
+ image_4 = "examples/example_image_real_4.jpg.webp"
184
 
185
  gr.Examples(
186
  examples=[
187
  [title_1, image_1, text_real_1 + '\n\n' + text_real_2],
188
  [title_1, image_2, text_real_1 + '\n\n' + text_llm_modification],
189
  [title_1, image_3, text_real_1 + '\n\n' + text_llm_topic],
190
+ [title_4, image_4, text_llm_entities],
191
  ],
192
  inputs=[news_title, news_image, news_content],
193
  label="Examples",
 
199
  ],
200
  )
201
 
202
+ demo.launch(share=True)
examples/example_image_real_4.jpg.webp ADDED
examples/example_text_LLM_entities.txt CHANGED
@@ -1 +1 @@
1
- Japan Prime Minister has pledged to put Ukraine in the "strongest possible position" on a trip to Kyiv where he signed a "landmark" 100-year pact with the war-stricken country. The prime minister's visit on Sunday was at one point marked by loud blasts and air raid sirens after a reported Russian drone attack was intercepted by Ukraine's defence systems. Acknowledging the "hello" from Russia, Volodymyr Zelensky said Ukraine would send its own "hello back". An estimated two millions people have been killed or wounded in the war so far. As the invasion reaches the end of its third year, Ukraine is losing territory in the north. Zelensky praised the Japan's commitment on Sunday, amid wider concerns that the US President-elect Donald Trump, who is set to take office on Monday, could potentially reduce aid.
 
1
+ Shigeru Ishiba has pledged to put Ukraine in the "strongest possible position" on a trip to Kyiv where he signed a "landmark" 100-year pact with the war-stricken country. The prime minister's visit on Sunday was at one point marked by loud blasts and air raid sirens after a reported Russian drone attack was intercepted by Ukraine's defence systems. Acknowledging the "hello" from Russia, Volodymyr Zelensky said Ukraine would send its own "hello back". An estimated two millions people have been killed or wounded in the war so far. As the invasion reaches the end of its third year, Ukraine is losing territory in the north. Zelensky praised the Japan's commitment on Sunday, amid wider concerns that the US President-elect Donald Trump, who is set to take office on Monday, could potentially reduce aid.
gpt_test.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dotenv import load_dotenv
3
+ from openai import AzureOpenAI
4
+ load_dotenv()
5
+ AZURE_OPENAI_API_KEY = os.getenv('AZURE_OPENAI_API_KEY')
6
+ AZURE_OPENAI_ENDPOINT = os.getenv('AZURE_OPENAI_ENDPOINT')
7
+ AZURE_OPENAI_API_VERSION = os.getenv('AZURE_OPENAI_API_VERSION')
8
+
9
+ azure_client = AzureOpenAI(
10
+ azure_endpoint = "https://quoc-nguyen.openai.azure.com/",
11
+ api_key=AZURE_OPENAI_API_KEY,
12
+ api_version="2024-05-01-preview"
13
+ )
14
+
15
+ deplopment_name = "o1-mini" # or "gpt-4o"
16
+ TEXT_PROMPT = """
17
+ replace Ukraine with Denmark:
18
+
19
+ "Sir Keir Starmer has pledged to put Ukraine in the "strongest possible position" on a trip to Kyiv where he signed a "landmark" 100-year pact with the war-stricken country.
20
+
21
+ The prime minister's visit on Thursday was at one point marked by loud blasts and air raid sirens after a reported Russian drone attack was intercepted by Ukraine's defence systems.
22
+
23
+ Acknowledging the "hello" from Russia, Volodymyr Zelensky said Ukraine would send its own "hello back".
24
+
25
+ An estimated one million people have been killed or wounded in the war so far. As the invasion reaches the end of its third year, Ukraine is losing territory in the east.
26
+
27
+ Zelensky praised the UK's commitment on Thursday, amid wider concerns that the US President-elect Donald Trump, who is set to take office on Monday, could potentially reduce aid."
28
+ """
29
+
30
+ response = azure_client.chat.completions.create(
31
+ model=deplopment_name, # model = "deployment_name".
32
+ messages=[
33
+ # {"role": "system", "content": "You are a helpful assistant."},
34
+ {"role": "user", "content": TEXT_PROMPT},
35
+ ],
36
+ # max_tokens=512,
37
+ # temperature=0,
38
+ )
39
+ print(response.choices[0].message.content)
src/application/content_detection.py CHANGED
@@ -1,10 +1,12 @@
1
  from difflib import SequenceMatcher
 
 
2
  from src.application.image.image_detection import detect_image_by_ai_model, detect_image_by_reverse_search, detect_image_from_news_image
3
  from src.application.text.entity import apply_highlight, highlight_entities
4
  from src.application.text.helper import extract_equal_text
5
  from src.application.text.model_detection import detect_text_by_ai_model
6
  from src.application.text.preprocessing import split_into_paragraphs
7
- from src.application.text.search_detection import check_human, detect_text_by_relative_search
8
 
9
 
10
  class NewsVerification():
@@ -25,11 +27,22 @@ class NewsVerification():
25
 
26
  self.found_img_url:list[str] = ["https://ichef.bbci.co.uk/ace/standard/819/cpsprodpb/8acc/live/86282470-defb-11ef-ba00-65100a906e68.jpg"]
27
  self.aligned_sentences:list[dict] = []
 
 
 
 
 
 
 
 
 
 
28
  self.is_paraphrased:list[bool] = []
29
 
30
  self.ordinary_user_table:list = []
31
  self.fact_checker_table:list = []
32
  self.governor_table:list = []
 
33
 
34
  def load_news(self, news_title, news_content, news_image):
35
  self.news_text = news_title + "\n\n" + news_content
@@ -70,7 +83,7 @@ class NewsVerification():
70
 
71
  if current_index >= len(input_sentences):
72
  break
73
- if current_index >= index and index != 0 and index != len(input_sentences) - 1:
74
  continue
75
 
76
  paraphrase, text_url, searched_sentences, img_urls, current_index = detect_text_by_relative_search(input_sentences, index)
@@ -117,6 +130,40 @@ class NewsVerification():
117
 
118
  previous_paraphrase = paraphrase
119
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  def detect_image_origin(self):
121
  print("CHECK IMAGE:")
122
  if self.news_image is None:
@@ -183,6 +230,15 @@ class NewsVerification():
183
  self.detect_image_origin()
184
 
185
  def analyze_details(self):
 
 
 
 
 
 
 
 
 
186
  ordinary_user_table = self.create_ordinary_user_table()
187
  fact_checker_table = self.create_fact_checker_table()
188
  governor_table = self.create_governor_table()
@@ -253,17 +309,17 @@ class NewsVerification():
253
  )
254
 
255
  # Get entity-words (in pair) with colors
256
- entities_with_colors = highlight_entities(
257
- aligned_sentence["input_sentence"],
258
- aligned_sentence["matched_sentence"],
259
- )
260
-
261
  self.fact_checker_table.append(
262
  [
263
  aligned_sentence,
264
  equal_idx_1,
265
  equal_idx_2,
266
- entities_with_colors,
267
  ]
268
  )
269
 
@@ -273,7 +329,7 @@ class NewsVerification():
273
 
274
  table = "\n".join(rows)
275
  return f"""
276
- <h5>Comparison between input news and source news</h5>
277
  <table border="1" style="width:100%; text-align:left; border-collapse:collapse;">
278
  <col style="width: 170px;"> <col style="width: 170px;"> <col style="width: 30px;"> <col style="width: 75px;">
279
  <thead>
@@ -293,12 +349,14 @@ class NewsVerification():
293
  """
294
 
295
  def format_text_fact_checker_row(self, row, max_length=30):
 
296
  if row[0]["input_sentence"] == "":
297
  return ""
298
  if row[0]["matched_sentence"] != "": # source is not empty
299
  # highlight entities
300
  input_sentence, highlight_idx_input = apply_highlight(row[0]["input_sentence"], row[3], "input")
301
  source_sentence, highlight_idx_source = apply_highlight(row[0]["matched_sentence"], row[3], "source")
 
302
 
303
  # Color overlapping words
304
  input_sentence = self.color_text(input_sentence, row[1], highlight_idx_input) # text, index of highlight words
@@ -317,11 +375,13 @@ class NewsVerification():
317
  short_url = self.shorten_url(url, max_length)
318
  source_text_url = f"""<a href="{url}">{short_url}</a>"""
319
 
 
 
320
  return f"""
321
  <tr>
322
  <td>{input_sentence}</td>
323
  <td>{source_sentence}</td>
324
- <td>{label}<br>({score*100:.2f}%)</td>
325
  <td>{source_text_url}</td>
326
  </tr>
327
  """
@@ -347,7 +407,7 @@ class NewsVerification():
347
  table = "\n".join(rows)
348
 
349
  return f"""
350
- <h5>Comparison between input news and source news</h5>
351
  <table border="1" style="width:100%; text-align:left; border-collapse:collapse;">
352
  <col style="width: 170px;"> <col style="width: 170px;"> <col style="width: 30px;"> <col style="width: 75px;">
353
  <thead>
@@ -374,28 +434,15 @@ class NewsVerification():
374
  for index, row in enumerate(self.aligned_sentences):
375
  if row["input_sentence"] == "":
376
  continue
377
- input_sentences += row["input_sentence"]
378
  label = self.aligned_sentences[index]["label"]
379
- if label == "HUMAN":
380
- score = self.aligned_sentences[index]["similarity"]
381
- if label == "MACHINE":
382
- score = 1 - self.aligned_sentences[index]["similarity"]
383
- scores += score
384
 
385
  url = self.aligned_sentences[index]["url"] #
386
  short_url = self.shorten_url(url, max_length)
387
  source_text_urls += f"""<a href="{url}">{short_url}</a><br>"""
388
  sentence_count += 1
389
 
390
- if scores == 0:
391
- label = "UNKNOWN"
392
- else:
393
- scores /= sentence_count
394
- if scores > 0.5:
395
- label = "HUMAN"
396
- else:
397
- label = "MACHINE"
398
- scores = 1 - scores
399
 
400
  return f"""
401
  <tr>
@@ -408,14 +455,14 @@ class NewsVerification():
408
  def format_image_ordinary_user_row(self, max_length=30):
409
 
410
  if self.image_referent_url is not None or self.image_referent_url != "":
411
- source_image = f"""<img src="{self.image_referent_url}" width="200" height="150">"""
412
  short_url = self.shorten_url(self.image_referent_url, max_length)
413
  source_image_url = f"""<a href="{self.image_referent_url}">{short_url}</a>"""
414
  else:
415
- source_image = "Image not found"
416
  source_image_url = ""
417
 
418
- return f"""<tr><td>input image</td><td>{source_image}</td><td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td><td>{source_image_url}</td></tr>"""
419
 
420
 
421
  def create_governor_table(self):
@@ -434,17 +481,17 @@ class NewsVerification():
434
  )
435
 
436
  # Get entity-words (in pair) with colors
437
- entities_with_colors = highlight_entities(
438
- aligned_sentence["input_sentence"],
439
- aligned_sentence["matched_sentence"],
440
- )
441
 
442
  self.governor_table.append(
443
  [
444
  aligned_sentence,
445
  equal_idx_1,
446
  equal_idx_2,
447
- entities_with_colors,
448
  ]
449
  )
450
 
@@ -453,7 +500,7 @@ class NewsVerification():
453
 
454
  table = "\n".join(rows)
455
  return f"""
456
- <h5>Comparison between input news and source news</h5>
457
  <table border="1" style="width:100%; text-align:left; border-collapse:collapse;">
458
  <col style="width: 170px;"> <col style="width: 170px;"> <col style="width: 30px;"> <col style="width: 75px;">
459
  <thead>
@@ -502,36 +549,24 @@ class NewsVerification():
502
  input_sentence = row[0]["input_sentence"]
503
  source_sentence = row[0]["matched_sentence"]
504
 
505
- input_sentences += input_sentence
506
- source_sentences += source_sentence
507
- score = row[0]["similarity"]
508
- label = row[0]["label"]
509
- if label == "HUMAN":
510
- score = row[0]["similarity"]
511
- if label == "MACHINE":
512
- score = 1 - row[0]["similarity"]
513
- scores += score
514
 
515
  url = row[0]["url"]
516
  short_url = self.shorten_url(url, max_length)
517
  source_text_urls += f"""<a href="{url}">{short_url}</a><br>"""
518
  sentence_count += 1
519
 
520
- if scores == 0:
521
- label = "UNKNOWN"
522
- else:
523
- scores /= sentence_count
524
- if scores > 0.5:
525
- label = "HUMAN"
526
- else:
527
- label = "MACHINE"
528
- scores = 1 - scores
529
-
530
  return f"""
531
  <tr>
532
  <td>{input_sentences}</td>
533
  <td>{source_sentences}</td>
534
- <td>{label}<br>({score*100:.2f}%)</td>
535
  <td>{source_text_urls}</td>
536
  </tr>
537
  """
@@ -548,6 +583,15 @@ class NewsVerification():
548
  return f"""<tr><td>input image</td><td>{source_image}</td><td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td><td>{source_image_url}</td></tr>"""
549
 
550
 
 
 
 
 
 
 
 
 
 
551
  def shorten_url(self, url, max_length=30):
552
  if url is None:
553
  return ""
@@ -668,4 +712,32 @@ class NewsVerification():
668
  starts.append(start)
669
  ends.append(end + 1)
670
 
671
- return starts, ends
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from difflib import SequenceMatcher
2
+
3
+ import pandas as pd
4
  from src.application.image.image_detection import detect_image_by_ai_model, detect_image_by_reverse_search, detect_image_from_news_image
5
  from src.application.text.entity import apply_highlight, highlight_entities
6
  from src.application.text.helper import extract_equal_text
7
  from src.application.text.model_detection import detect_text_by_ai_model
8
  from src.application.text.preprocessing import split_into_paragraphs
9
+ from src.application.text.search_detection import check_human, detect_text_by_relative_search, find_text_source
10
 
11
 
12
  class NewsVerification():
 
27
 
28
  self.found_img_url:list[str] = ["https://ichef.bbci.co.uk/ace/standard/819/cpsprodpb/8acc/live/86282470-defb-11ef-ba00-65100a906e68.jpg"]
29
  self.aligned_sentences:list[dict] = []
30
+ self.aligned_sentences_df:pd.DataFrame = pd.DataFrame(columns=[
31
+ "input_sentence",
32
+ "matched_sentence",
33
+ "label",
34
+ "similarity",
35
+ "paraphrase",
36
+ "url",
37
+ "group",
38
+ "entities",
39
+ ])
40
  self.is_paraphrased:list[bool] = []
41
 
42
  self.ordinary_user_table:list = []
43
  self.fact_checker_table:list = []
44
  self.governor_table:list = []
45
+ self.entities_with_colors = []
46
 
47
  def load_news(self, news_title, news_content, news_image):
48
  self.news_text = news_title + "\n\n" + news_content
 
83
 
84
  if current_index >= len(input_sentences):
85
  break
86
+ if current_index > index and index != 0 and index != len(input_sentences) - 1:
87
  continue
88
 
89
  paraphrase, text_url, searched_sentences, img_urls, current_index = detect_text_by_relative_search(input_sentences, index)
 
130
 
131
  previous_paraphrase = paraphrase
132
 
133
+ def determine_text_origin_2(self):
134
+ """
135
+ Determines the origin of the given text based on paraphrasing detection and human authorship analysis.
136
+
137
+ Args:
138
+ text: The input text to be analyzed.
139
+
140
+ Returns:
141
+ str: The predicted origin of the text:
142
+ - "HUMAN": If the text is likely written by a human.
143
+ - "MACHINE": If the text is likely generated by a machine.
144
+ """
145
+ print("CHECK TEXT:")
146
+ print("\tFrom search engine:")
147
+ # Classify by search engine
148
+ input_sentences = split_into_paragraphs(self.news_text)
149
+ for _ in range(5):
150
+ self.aligned_sentences_df = pd.concat(
151
+ [self.aligned_sentences_df, pd.DataFrame([{}])],
152
+ ignore_index=False,
153
+ )
154
+
155
+ for index, sentence in enumerate(input_sentences):
156
+ print(f"-------index = {index}-------")
157
+ print(f"current_sentence = {input_sentences[index]}")
158
+
159
+ if self.aligned_sentences_df["url"] is not None:
160
+ continue
161
+
162
+ self.aligned_sentences_df, img_urls = find_text_source(
163
+ input_sentences[index],
164
+ self.aligned_sentences_df,
165
+ )
166
+
167
  def detect_image_origin(self):
168
  print("CHECK IMAGE:")
169
  if self.news_image is None:
 
230
  self.detect_image_origin()
231
 
232
  def analyze_details(self):
233
+ entities_with_colors = []
234
+ for index, aligned_sentence in enumerate(self.aligned_sentences):
235
+ # Get entity-words (in pair) with colors
236
+ entities_with_colors = highlight_entities(
237
+ aligned_sentence["input_sentence"],
238
+ aligned_sentence["matched_sentence"],
239
+ )
240
+ self.aligned_sentences[index]["entities"] = entities_with_colors
241
+
242
  ordinary_user_table = self.create_ordinary_user_table()
243
  fact_checker_table = self.create_fact_checker_table()
244
  governor_table = self.create_governor_table()
 
309
  )
310
 
311
  # Get entity-words (in pair) with colors
312
+ # entities_with_colors = highlight_entities(
313
+ # aligned_sentence["input_sentence"],
314
+ # aligned_sentence["matched_sentence"],
315
+ # )
316
+
317
  self.fact_checker_table.append(
318
  [
319
  aligned_sentence,
320
  equal_idx_1,
321
  equal_idx_2,
322
+ aligned_sentence["entities"],
323
  ]
324
  )
325
 
 
329
 
330
  table = "\n".join(rows)
331
  return f"""
332
+ <h5>Comparison between input news and source news:</h5>
333
  <table border="1" style="width:100%; text-align:left; border-collapse:collapse;">
334
  <col style="width: 170px;"> <col style="width: 170px;"> <col style="width: 30px;"> <col style="width: 75px;">
335
  <thead>
 
349
  """
350
 
351
  def format_text_fact_checker_row(self, row, max_length=30):
352
+ entity_count = 0
353
  if row[0]["input_sentence"] == "":
354
  return ""
355
  if row[0]["matched_sentence"] != "": # source is not empty
356
  # highlight entities
357
  input_sentence, highlight_idx_input = apply_highlight(row[0]["input_sentence"], row[3], "input")
358
  source_sentence, highlight_idx_source = apply_highlight(row[0]["matched_sentence"], row[3], "source")
359
+ entity_count = len(row[3])
360
 
361
  # Color overlapping words
362
  input_sentence = self.color_text(input_sentence, row[1], highlight_idx_input) # text, index of highlight words
 
375
  short_url = self.shorten_url(url, max_length)
376
  source_text_url = f"""<a href="{url}">{short_url}</a>"""
377
 
378
+ entity_count_text = self.get_entity_count_text(entity_count)
379
+
380
  return f"""
381
  <tr>
382
  <td>{input_sentence}</td>
383
  <td>{source_sentence}</td>
384
+ <td>{label}<br>({score*100:.2f}%)<br><br>{entity_count_text}</td>
385
  <td>{source_text_url}</td>
386
  </tr>
387
  """
 
407
  table = "\n".join(rows)
408
 
409
  return f"""
410
+ <h5>Comparison between input news and source news:</h5>
411
  <table border="1" style="width:100%; text-align:left; border-collapse:collapse;">
412
  <col style="width: 170px;"> <col style="width: 170px;"> <col style="width: 30px;"> <col style="width: 75px;">
413
  <thead>
 
434
  for index, row in enumerate(self.aligned_sentences):
435
  if row["input_sentence"] == "":
436
  continue
437
+ input_sentences += row["input_sentence"] + "<br><br>"
438
  label = self.aligned_sentences[index]["label"]
 
 
 
 
 
439
 
440
  url = self.aligned_sentences[index]["url"] #
441
  short_url = self.shorten_url(url, max_length)
442
  source_text_urls += f"""<a href="{url}">{short_url}</a><br>"""
443
  sentence_count += 1
444
 
445
+ scores, label = self.calculate_score_label()
 
 
 
 
 
 
 
 
446
 
447
  return f"""
448
  <tr>
 
455
  def format_image_ordinary_user_row(self, max_length=30):
456
 
457
  if self.image_referent_url is not None or self.image_referent_url != "":
458
+ # source_image = f"""<img src="{self.image_referent_url}" width="200" height="150">"""
459
  short_url = self.shorten_url(self.image_referent_url, max_length)
460
  source_image_url = f"""<a href="{self.image_referent_url}">{short_url}</a>"""
461
  else:
462
+ # source_image = "Image not found"
463
  source_image_url = ""
464
 
465
+ return f"""<tr><td>input image</td><td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td><td>{source_image_url}</td></tr>"""
466
 
467
 
468
  def create_governor_table(self):
 
481
  )
482
 
483
  # Get entity-words (in pair) with colors
484
+ # entities_with_colors = highlight_entities(
485
+ # aligned_sentence["input_sentence"],
486
+ # aligned_sentence["matched_sentence"],
487
+ # )
488
 
489
  self.governor_table.append(
490
  [
491
  aligned_sentence,
492
  equal_idx_1,
493
  equal_idx_2,
494
+ aligned_sentence["entities"],
495
  ]
496
  )
497
 
 
500
 
501
  table = "\n".join(rows)
502
  return f"""
503
+ <h5>Comparison between input news and source news:</h5>
504
  <table border="1" style="width:100%; text-align:left; border-collapse:collapse;">
505
  <col style="width: 170px;"> <col style="width: 170px;"> <col style="width: 30px;"> <col style="width: 75px;">
506
  <thead>
 
549
  input_sentence = row[0]["input_sentence"]
550
  source_sentence = row[0]["matched_sentence"]
551
 
552
+ # convert score to HUMAN-based score:
553
+ input_sentences += input_sentence + "<br><br>"
554
+ source_sentences += source_sentence + "<br><br>"
555
+
 
 
 
 
 
556
 
557
  url = row[0]["url"]
558
  short_url = self.shorten_url(url, max_length)
559
  source_text_urls += f"""<a href="{url}">{short_url}</a><br>"""
560
  sentence_count += 1
561
 
562
+ score, label = self.calculate_score_label()
563
+ entity_count_text = self.get_entity_count_text(entity_count)
564
+
 
 
 
 
 
 
 
565
  return f"""
566
  <tr>
567
  <td>{input_sentences}</td>
568
  <td>{source_sentences}</td>
569
+ <td>{label}<br>({score*100:.2f}%)<br><br>{entity_count_text}</td>
570
  <td>{source_text_urls}</td>
571
  </tr>
572
  """
 
583
  return f"""<tr><td>input image</td><td>{source_image}</td><td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td><td>{source_image_url}</td></tr>"""
584
 
585
 
586
+ def get_entity_count_text(self, entity_count):
587
+ if entity_count <= 0:
588
+ entity_count_text = ""
589
+ elif entity_count == 1:
590
+ entity_count_text = "with altered entity"
591
+ else:
592
+ entity_count_text = "with altered entities"
593
+ return entity_count_text
594
+
595
  def shorten_url(self, url, max_length=30):
596
  if url is None:
597
  return ""
 
712
  starts.append(start)
713
  ends.append(end + 1)
714
 
715
+ return starts, ends
716
+
717
+ def calculate_score_label(self):
718
+ human_score = []
719
+ machine_score = []
720
+ machine_flag = False
721
+ for sentence in self.aligned_sentences:
722
+ if sentence["input_sentence"] == "":
723
+ continue
724
+ if sentence["label"] == "HUMAN":
725
+ human_score.append(sentence["similarity"])
726
+ elif sentence["label"] == "MACHINE":
727
+ machine_score.append(1 - sentence["similarity"])
728
+ machine_flag = True
729
+
730
+ if machine_flag is True and len(machine_score) > 0:
731
+ # average value of machine_score
732
+ machine_score_avg = sum(machine_score) / len(machine_score)
733
+ if machine_score_avg < 0.5:
734
+ machine_score_avg = 1 - machine_score_avg
735
+ return machine_score_avg, "MACHINE"
736
+ elif machine_flag is False and len(human_score) > 0:
737
+ # average value of human_score
738
+ human_score_avg = sum(human_score) / len(human_score)
739
+ return human_score_avg, "HUMAN"
740
+ else:
741
+ return 0, "UNKNOWN"
742
+
743
+
src/application/image/image_detection.py CHANGED
@@ -12,6 +12,10 @@ def compare_list_of_images(news_image_path, img_urls):
12
  matched_url = ""
13
  max_similarity = 0
14
  for url in img_urls:
 
 
 
 
15
  print(f"\t{url}")
16
  referred_image = get_image_from_url(url)
17
  if referred_image is None:
@@ -29,8 +33,6 @@ def compare_list_of_images(news_image_path, img_urls):
29
 
30
  def detect_image_from_news_image(news_image_path, image_urls):
31
  print("\tFrom news:")
32
- for url in image_urls:
33
- print(f"\t{url}")
34
  return compare_list_of_images(news_image_path, image_urls)
35
 
36
  def detect_image_by_reverse_search(news_image_path):
 
12
  matched_url = ""
13
  max_similarity = 0
14
  for url in img_urls:
15
+ if "ichef.bbci.co.uk" in url and " " in url:
16
+ url_list = url.split(",")
17
+ if len(url_list) > 0:
18
+ url = url_list[0].split(" ")[0]
19
  print(f"\t{url}")
20
  referred_image = get_image_from_url(url)
21
  if referred_image is None:
 
33
 
34
def detect_image_from_news_image(news_image_path, image_urls):
    # Compare the input news image against the images referenced by the
    # candidate news pages; delegates entirely to compare_list_of_images,
    # which returns the best-matching URL and similarity.
    print("\tFrom news:")
    return compare_list_of_images(news_image_path, image_urls)
37
 
38
  def detect_image_by_reverse_search(news_image_path):
src/application/text/entity.py CHANGED
@@ -166,8 +166,14 @@ def apply_highlight(text, entities_with_colors, key="input", count = 0):
166
  highlighted_text = ""
167
 
168
  # find a list of starts and ends of entity in text:
169
- starts = [m.start() for m in re.finditer(entity[key], temp_text)]
170
- ends = [m.end() for m in re.finditer(entity[key], temp_text)]
 
 
 
 
 
 
171
 
172
  all_starts.extend(starts)
173
  all_ends.extend(ends)
 
166
  highlighted_text = ""
167
 
168
  # find a list of starts and ends of entity in text:
169
+ # starts = [m.start() for m in re.finditer(entity[key], temp_text)]
170
+ # ends = [m.end() for m in re.finditer(entity[key], temp_text)]
171
+ starts =[]
172
+ ends = []
173
+ # "\b" is for bound a word
174
+ for m in re.finditer(r"\b" + re.escape(entity[key]) + r"\b", temp_text):
175
+ starts.append(m.start())
176
+ ends.append(m.end())
177
 
178
  all_starts.extend(starts)
179
  all_ends.extend(ends)
src/application/text/search.py CHANGED
@@ -160,12 +160,28 @@ def generate_search_phrases(input_text):
160
  search_phrases.append(input_text)
161
 
162
  # Method 3: Split text by chunks
163
- # search_phrases.extend(get_chunk(input_text)) # TODO: for demo purposes
164
 
165
  # Method 4: Get most identities and key words
166
  entities = extract_entities(input_text)
167
- keywords = get_keywords(input_text, 16)
168
- search_phrase = " ".join(entities) + " " + " ".join(keywords)
 
 
 
169
  # search_phrases.append(search_phrase) # TODO: for demo purposes
170
 
171
  return search_phrases
 
 
 
 
 
 
 
 
 
 
 
 
 
 
160
  search_phrases.append(input_text)
161
 
162
  # Method 3: Split text by chunks
163
+ search_phrases.extend(get_chunk(input_text)) # TODO: for demo purposes
164
 
165
  # Method 4: Get most identities and key words
166
  entities = extract_entities(input_text)
167
+ text_without_entities = remove_identities_from_text(input_text, entities)
168
+ print(f"text_without_entities: {text_without_entities}")
169
+ search_phrases.append(text_without_entities)
170
+ #keywords = get_keywords(input_text, 16)
171
+ #search_phrase = " ".join(entities) + " " + " ".join(keywords)
172
  # search_phrases.append(search_phrase) # TODO: for demo purposes
173
 
174
  return search_phrases
175
+
176
def remove_identities_from_text(input_text, entities):
    """
    Removes whole-word occurrences of the given entities from the input text.

    Args:
        input_text: The input text as a string.
        entities: A list of entity strings to be removed.

    Returns:
        The text with every whole-word entity occurrence removed.
    """
    import re  # local import keeps this fix self-contained

    for entity in entities:
        if not entity:
            # An empty pattern would match at every position.
            continue
        # \b-anchored so e.g. "win" is not stripped out of "winger"
        # (the same word-boundary rule entity highlighting uses).
        input_text = re.sub(r"\b" + re.escape(entity) + r"\b", "", input_text)

    return input_text
src/application/text/search_detection.py CHANGED
@@ -92,6 +92,76 @@ def detect_text_by_relative_search(input_text, index, is_support_opposite = Fals
92
 
93
  return False, None, [], [], index
94
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
  def longest_common_subsequence(arr1, arr2):
96
  """
97
  Finds the length of the longest common subsequence (contiguous) between
 
92
 
93
  return False, None, [], [], index
94
 
95
def find_text_source(text, text_index, sentences_df):
    """
    Search the web for the source of sentence ``text[text_index]`` and
    record the best alignments into ``sentences_df``.

    Args:
        text: List of input sentences.
        text_index: Index of the sentence whose source is being traced.
        sentences_df: pandas DataFrame with one row per input sentence and
            columns input_sentence / matched_sentence / label / similarity /
            paraphrase / url / group.

    Returns:
        (sentences_df, images) once a candidate source page was processed,
        otherwise (sentence, []) where ``sentence`` is an empty placeholder
        record for the unmatched sentence.
        NOTE(review): one early exit returns (paraphrase, sentences_df)
        instead — kept as in the original, but callers should be checked.
    """
    sentence = {
        "input_sentence": text[text_index],
        "matched_sentence": "",
        "label": "",
        "similarity": None,
        "paraphrase": None,
        "url": "",
        "group": None,
    }
    checked_urls = set()
    searched_phrases = generate_search_phrases(text[text_index])

    for candidate in searched_phrases:
        search_results = search_by_google(candidate)
        urls = [item['link'] for item in search_results.get("items", [])]

        for url in urls[:3]:  # only the top search hits per phrase
            if url in checked_urls:  # visited url
                continue
            if "bbc.com" not in url:
                # NOTE(review): hard-coded to BBC — confirm this is intended.
                continue

            checked_urls.add(url)
            print(f"\t\tChecking URL: {url}")

            content = URLReader(url)

            if content.is_extracted is not True:
                continue
            if content.title is None or content.text is None:
                print(f"\t\t\t↑↑↑ Title or text not found")
                continue

            page_text = content.title + "\n" + content.text
            if len(page_text) > MAX_CHAR_SIZE:
                print(f"\t\t\t↑↑↑ More than {MAX_CHAR_SIZE} characters")
                continue
            print(f"\t\t\t↑↑↑ Title: {content.title}")

            # NOTE(review): passes the whole sentence list here but a single
            # sentence in the loop below — confirm check_paraphrase's contract.
            paraphrase, aligned_sentence = check_paraphrase(text, page_text, url)

            # BUG FIX: the original wrote to sentences_df.loc[index, ...]
            # before ``index`` existed (NameError). The traced sentence's
            # row is ``text_index``.
            for column in ("input_sentence", "matched_sentence", "label",
                           "similarity", "url"):
                sentences_df.loc[text_index, column] = aligned_sentence[column]

            if aligned_sentence["paraphrase"] is False:
                return paraphrase, sentences_df

            # Try to source the remaining unmatched sentences from this page.
            # BUG FIX: the original iterated enumerate(sentences_df) (which
            # walks column names) and indexed rows as sentences_df[index]
            # (column access); iterate the row index instead.
            for row_index in sentences_df.index:
                if sentences_df.loc[row_index, "url"] is not None:
                    continue  # this sentence already has a source

                _, other_aligned = check_paraphrase(text[row_index], page_text, url)

                # BUG FIX: the original skipped rows whose alignment *had* a
                # URL and then stored the empty ones; keep only real matches.
                if other_aligned["url"] is None:
                    continue

                for column in ("input_sentence", "matched_sentence", "label",
                               "similarity", "url"):
                    sentences_df.loc[row_index, column] = other_aligned[column]

            return sentences_df, content.images

    return sentence, []
164
+
165
  def longest_common_subsequence(arr1, arr2):
166
  """
167
  Finds the length of the longest common subsequence (contiguous) between
test.py CHANGED
@@ -1,27 +1,50 @@
1
- import json
2
-
3
- text = """```json
4
- [
5
- ["Sunday", "Thursday"],
6
- ["two millions", "one million"],
7
- ["north", "east"],
8
- ["Japan", "UK"],
9
- ["Sunday", "Thursday"]
10
- ]
11
- ```
12
- """
13
- def read_json(json_string) -> list[list[str]]:
14
- try:
15
- entities = json.loads(json_string)
16
- # Remove duplicates pair of entities
17
- unique_data = []
18
- for inner_list in entities:
19
- if inner_list not in unique_data:
20
- unique_data.append(inner_list)
21
-
22
- return unique_data
23
-
24
- except json.JSONDecodeError as e:
25
- print(f"Error decoding JSON: {e}")
26
- return []
27
- print(read_json(text.replace("```json", "").replace("```", "")))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
def find_entity_spans(entity, text):
    """
    Locate every whole-word occurrence of *entity* inside *text*.

    Args:
        entity: The entity string to search for.
        text: The text to search within.

    Returns:
        A list of (start, end) index tuples, one per match; an empty list
        when the entity never occurs as a whole word.
    """
    # \b anchors restrict matches to whole words, so "win" is not found
    # inside "winger"; re.escape protects entities containing punctuation.
    pattern = re.compile(r"\b" + re.escape(entity) + r"\b")
    return [match.span() for match in pattern.finditer(text)]
19
+
20
# Example usage: each case pairs a text with the entity to locate.
# (Expected outputs below are corrected — the whole-word matcher never
# fires inside larger words such as "foxes" or "winger".)
_examples = [
    ("win winger winning", "win"),                                 # [(0, 3)]
    ("The quick brown fox jumps over the lazy dog.", "fox"),       # [(16, 19)]
    ("foxes fox foxing", "fox"),                                   # [(6, 9)]
    ("winger win winning", "win"),                                 # [(7, 10)]
    ("winger win winning", "winger"),                              # [(0, 6)]
    ("winger win winning", "winning"),                             # [(11, 18)]
]

for temp_text, key in _examples:
    entity = {"key": key}  # mirrors the dict shape used by the real pipeline
    spans = find_entity_spans(entity["key"], temp_text)
    print(spans)