Commit 504f37b
Parent(s): 8617832
solve bugs, update combination score and label, add method for better searching.
Files changed:
- application.py +62 -22
- application_2.py +5 -4
- examples/example_image_real_4.jpg.webp +0 -0
- examples/example_text_LLM_entities.txt +1 -1
- gpt_test.py +39 -0
- src/application/content_detection.py +128 -56
- src/application/image/image_detection.py +4 -2
- src/application/text/entity.py +8 -2
- src/application/text/search.py +19 -3
- src/application/text/search_detection.py +70 -0
- test.py +50 -27
application.py
CHANGED
@@ -8,10 +8,6 @@ from src.application.content_detection import NewsVerification
 from src.application.url_reader import URLReader
 from src.application.content_generation import generate_fake_image, generate_fake_text, replace_text
 
-
-GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
-SEARCH_ENGINE_ID = os.getenv('SEARCH_ENGINE_ID')
-
 AZURE_TEXT_MODEL = ["gpt-4o-mini", "gpt-4o"]
 AZURE_IMAGE_MODEL = ["dall-e-3", "Stable Diffusion (not supported)"]
 
@@ -43,14 +39,6 @@ def load_url(url):
 
 
 def generate_analysis_report(news_title:str, news_content: str, news_image: Image):
-    if news_image is not None:
-        # Convert to PIL Image for easier saving
-        img = Image.open(news_image)
-
-        # Save the image (you can customize the filename)
-        filepath = "example_image_input.jpg"  # Or use a dynamic filename
-        img.save(filepath)
-
     news_analysis = NewsVerification()
     news_analysis.load_news(news_title, news_content, news_image)
     news_analysis.generate_analysis_report()
@@ -60,7 +48,7 @@ def generate_analysis_report(news_title:str, news_content: str, news_image: Image):
 # Define the GUI
 with gr.Blocks() as demo:
     gr.Markdown("# NEWS VERIFICATION")
-
+
     with gr.Row():
         # SETTINGS
         with gr.Column(scale=1):
@@ -93,13 +81,59 @@ with gr.Blocks() as demo:
         with gr.Accordion("Input News"):
             news_title = gr.Textbox(label="Title", value="")
             news_image = gr.Image(label="Image", type="filepath")
-            news_content = gr.Textbox(label="Content", value="", lines=
+            news_content = gr.Textbox(label="Content", value="", lines=13)
 
         # NEWS ANALYSIS REPORT
+        ordinary_user_explanation = """
+        FOR ORDINARY USER<br>
+        - Green texts are the matched words in the input and source news.<br>
+        - Each highlighted pair (marked with a number) shows the key differences between the input text and the source.
+        """
+        fact_checker_explanation = """
+        FOR FACT CHECKER<br>
+        - Green texts are the matched words in the input and source news.<br>
+        - Each highlighted pair (marked with a number) shows the key differences between the input text and the source.
+        """
+        governor_explanation = """
+        FOR GOVERNOR<br>
+        - Green texts are the matched words in the input and source news.<br>
+        - Each highlighted pair (marked with a number) shows the key differences between the input text and the source.
+        """
+        table = """
+        <h5>Comparison between input news and source news:</h5>
+        <table border="1" style="width:100%; text-align:left; border-collapse:collapse;">
+        <col style="width: 170px;"> <col style="width: 170px;"> <col style="width: 30px;"> <col style="width: 75px;">
+        <thead>
+          <tr>
+            <th>Input news</th>
+            <th>Source (URL provided in Originality column correspondingly)</th>
+            <th>Forensic</th>
+            <th>Originality</th>
+          </tr>
+        </thead>
+        <tbody>
+          <tr>
+            <th>TBD</th>
+            <th>TBD</th>
+            <th>TBD</th>
+            <th>TBD</th>
+          </tr>
+        </tbody>
+        </table>
+
+        <style>"""
         with gr.Column(scale=2):
-            with gr.Accordion("
-
-
+            with gr.Accordion("NEWS ANALYSIS"):
+                verification_button = gr.Button("Verify news")
+                with gr.Tab("Ordinary User"):
+                    gr.HTML(ordinary_user_explanation)
+                    ordinary_user_result = gr.HTML(table)
+                with gr.Tab("Fact Checker"):
+                    gr.HTML(fact_checker_explanation)
+                    fact_checker_result = gr.HTML(table)
+                with gr.Tab("Governor"):
+                    gr.HTML(governor_explanation)
+                    governor_result = gr.HTML(table)
 
     # Connect events
     load_button.click(
@@ -116,9 +150,9 @@ with gr.Blocks() as demo:
     generate_image_button.click(generate_fake_image,
                                 inputs=[image_generation_model, news_title],
                                 outputs=[news_image])
-
+    verification_button.click(generate_analysis_report,
                               inputs=[news_title, news_content, news_image],
-                              outputs=[
+                              outputs=[ordinary_user_result, fact_checker_result, governor_result])
 
     # change Image
     #url_input.change(load_image, inputs=url_input, outputs=image_view)
@@ -132,23 +166,28 @@ with gr.Blocks() as demo:
         text_llm_topic = file.read()
     with open('examples/example_text_LLM_modification.txt','r', encoding='utf-8') as file:
         text_llm_modification = file.read()
+    with open('examples/example_text_LLM_entities.txt','r', encoding='utf-8') as file:
+        text_llm_entities = file.read()
 except FileNotFoundError:
     print("File not found.")
 except Exception as e:
     print(f"An error occurred: {e}")
 
-title_1 = "Southampton news: Leeds target striker Cameron Archer"
-title_2 = "Southampton news: Leeds target striker Cameron Archer"
+title_1 = "Southampton news: Leeds target striker Cameron Archer."
+title_2 = "Southampton news: Leeds target striker Cameron Archer."
+title_4 = "Japan pledges support for Ukraine with 100-year pact."
 
 image_1 = "examples/example_image_real_1.jpg.webp"
 image_2 = "examples/example_image_real_2.jpg.webp"
 image_3 = "examples/example_image_real_3.jpg"
+image_4 = "examples/example_image_real_4.jpg.webp"
 
 gr.Examples(
     examples=[
         [title_1, image_1, text_real_1 + '\n\n' + text_real_2],
         [title_1, image_2, text_real_1 + '\n\n' + text_llm_modification],
         [title_1, image_3, text_real_1 + '\n\n' + text_llm_topic],
+        [title_4, image_4, text_llm_entities],
    ],
    inputs=[news_title, news_image, news_content],
    label="Examples",
@@ -156,7 +195,8 @@ with gr.Blocks() as demo:
         "2 real news",
         "1 real news + 1 LLM modification-based news",
         "1 real news + 1 LLM topic-based news",
+        "1 LLM changed-entities news",
     ],
 )
 
-demo.launch(share=
+demo.launch(share=True)
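Note: the new `verification_button.click(...)` wiring assumes `generate_analysis_report` returns three HTML strings, one per tab, in the order `[ordinary_user_result, fact_checker_result, governor_result]`. A minimal sketch of that contract (the return statement is an assumption; the diff does not show it):

def generate_analysis_report(news_title: str, news_content: str, news_image: Image):
    news_analysis = NewsVerification()
    news_analysis.load_news(news_title, news_content, news_image)
    news_analysis.generate_analysis_report()
    # assumed: analyze_details() renders and returns the three per-audience tables
    return news_analysis.analyze_details()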
application_2.py
CHANGED
@@ -100,7 +100,7 @@ with gr.Blocks() as demo:
     - Each highlighted pair (marked with a number) shows the key differences between the input text and the source.
     """
     table = """
-    <h5>Comparison between input news and source news
+    <h5>Comparison between input news and source news:</h5>
     <table border="1" style="width:100%; text-align:left; border-collapse:collapse;">
     <col style="width: 170px;"> <col style="width: 170px;"> <col style="width: 30px;"> <col style="width: 75px;">
     <thead>
@@ -132,7 +132,7 @@ with gr.Blocks() as demo:
                 gr.HTML(fact_checker_explanation)
                 fact_checker_result = gr.HTML(table)
             with gr.Tab("Governor"):
-                gr.HTML(
+                gr.HTML(governor_explanation)
                 governor_result = gr.HTML(table)
 
     # Connect events
@@ -180,13 +180,14 @@ with gr.Blocks() as demo:
 image_1 = "examples/example_image_real_1.jpg.webp"
 image_2 = "examples/example_image_real_2.jpg.webp"
 image_3 = "examples/example_image_real_3.jpg"
+image_4 = "examples/example_image_real_4.jpg.webp"
 
 gr.Examples(
     examples=[
         [title_1, image_1, text_real_1 + '\n\n' + text_real_2],
         [title_1, image_2, text_real_1 + '\n\n' + text_llm_modification],
         [title_1, image_3, text_real_1 + '\n\n' + text_llm_topic],
-        [title_4,
+        [title_4, image_4, text_llm_entities],
     ],
     inputs=[news_title, news_image, news_content],
     label="Examples",
@@ -198,4 +199,4 @@ with gr.Blocks() as demo:
     ],
 )
 
-demo.launch(share=
+demo.launch(share=True)
examples/example_image_real_4.jpg.webp
ADDED
examples/example_text_LLM_entities.txt
CHANGED
@@ -1 +1 @@
-
+Shigeru Ishiba has pledged to put Ukraine in the "strongest possible position" on a trip to Kyiv where he signed a "landmark" 100-year pact with the war-stricken country. The prime minister's visit on Sunday was at one point marked by loud blasts and air raid sirens after a reported Russian drone attack was intercepted by Ukraine's defence systems. Acknowledging the "hello" from Russia, Volodymyr Zelensky said Ukraine would send its own "hello back". An estimated two millions people have been killed or wounded in the war so far. As the invasion reaches the end of its third year, Ukraine is losing territory in the north. Zelensky praised the Japan's commitment on Sunday, amid wider concerns that the US President-elect Donald Trump, who is set to take office on Monday, could potentially reduce aid.
gpt_test.py
ADDED
@@ -0,0 +1,39 @@
+import os
+from dotenv import load_dotenv
+from openai import AzureOpenAI
+load_dotenv()
+AZURE_OPENAI_API_KEY = os.getenv('AZURE_OPENAI_API_KEY')
+AZURE_OPENAI_ENDPOINT = os.getenv('AZURE_OPENAI_ENDPOINT')
+AZURE_OPENAI_API_VERSION = os.getenv('AZURE_OPENAI_API_VERSION')
+
+azure_client = AzureOpenAI(
+    azure_endpoint="https://quoc-nguyen.openai.azure.com/",
+    api_key=AZURE_OPENAI_API_KEY,
+    api_version="2024-05-01-preview",
+)
+
+deployment_name = "o1-mini"  # or "gpt-4o"
+TEXT_PROMPT = """
+replace Ukraine with Denmark:
+
+"Sir Keir Starmer has pledged to put Ukraine in the "strongest possible position" on a trip to Kyiv where he signed a "landmark" 100-year pact with the war-stricken country.
+
+The prime minister's visit on Thursday was at one point marked by loud blasts and air raid sirens after a reported Russian drone attack was intercepted by Ukraine's defence systems.
+
+Acknowledging the "hello" from Russia, Volodymyr Zelensky said Ukraine would send its own "hello back".
+
+An estimated one million people have been killed or wounded in the war so far. As the invasion reaches the end of its third year, Ukraine is losing territory in the east.
+
+Zelensky praised the UK's commitment on Thursday, amid wider concerns that the US President-elect Donald Trump, who is set to take office on Monday, could potentially reduce aid."
+"""
+
+response = azure_client.chat.completions.create(
+    model=deployment_name,  # model = "deployment_name".
+    messages=[
+        # {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": TEXT_PROMPT},
+    ],
+    # max_tokens=512,
+    # temperature=0,
+)
+print(response.choices[0].message.content)
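Note: gpt_test.py hard-codes a single entity-replacement prompt. For repeated experiments, the same structure can be parameterized; a small hypothetical helper (not part of this commit):

def build_replacement_prompt(source_text: str, old_entity: str, new_entity: str) -> str:
    # Mirrors the structure of TEXT_PROMPT above: an instruction line,
    # a blank line, then the quoted article text.
    return f'replace {old_entity} with {new_entity}:\n\n"{source_text}"'

prompt = build_replacement_prompt("Ukraine signed a pact.", "Ukraine", "Denmark")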
src/application/content_detection.py
CHANGED
@@ -1,10 +1,12 @@
 from difflib import SequenceMatcher
+
+import pandas as pd
 from src.application.image.image_detection import detect_image_by_ai_model, detect_image_by_reverse_search, detect_image_from_news_image
 from src.application.text.entity import apply_highlight, highlight_entities
 from src.application.text.helper import extract_equal_text
 from src.application.text.model_detection import detect_text_by_ai_model
 from src.application.text.preprocessing import split_into_paragraphs
-from src.application.text.search_detection import check_human, detect_text_by_relative_search
+from src.application.text.search_detection import check_human, detect_text_by_relative_search, find_text_source
 
 
 class NewsVerification():
@@ -25,11 +27,22 @@ class NewsVerification():
 
         self.found_img_url:list[str] = ["https://ichef.bbci.co.uk/ace/standard/819/cpsprodpb/8acc/live/86282470-defb-11ef-ba00-65100a906e68.jpg"]
         self.aligned_sentences:list[dict] = []
+        self.aligned_sentences_df:pd.DataFrame = pd.DataFrame(columns=[
+            "input_sentence",
+            "matched_sentence",
+            "label",
+            "similarity",
+            "paraphrase",
+            "url",
+            "group",
+            "entities",
+        ])
         self.is_paraphrased:list[bool] = []
 
         self.ordinary_user_table:list = []
         self.fact_checker_table:list = []
         self.governor_table:list = []
+        self.entities_with_colors = []
 
     def load_news(self, news_title, news_content, news_image):
         self.news_text = news_title + "\n\n" + news_content
@@ -70,7 +83,7 @@ class NewsVerification():
 
             if current_index >= len(input_sentences):
                 break
-            if current_index
+            if current_index > index and index != 0 and index != len(input_sentences) - 1:
                 continue
 
             paraphrase, text_url, searched_sentences, img_urls, current_index = detect_text_by_relative_search(input_sentences, index)
@@ -117,6 +130,40 @@ class NewsVerification():
 
             previous_paraphrase = paraphrase
 
+    def determine_text_origin_2(self):
+        """
+        Determines the origin of the given text based on paraphrasing detection and human authorship analysis.
+
+        Args:
+            text: The input text to be analyzed.
+
+        Returns:
+            str: The predicted origin of the text:
+                - "HUMAN": If the text is likely written by a human.
+                - "MACHINE": If the text is likely generated by a machine.
+        """
+        print("CHECK TEXT:")
+        print("\tFrom search engine:")
+        # Classify by search engine
+        input_sentences = split_into_paragraphs(self.news_text)
+        for _ in range(5):
+            self.aligned_sentences_df = pd.concat(
+                [self.aligned_sentences_df, pd.DataFrame([{}])],
+                ignore_index=False,
+            )
+
+        for index, sentence in enumerate(input_sentences):
+            print(f"-------index = {index}-------")
+            print(f"current_sentence = {input_sentences[index]}")
+
+            if self.aligned_sentences_df["url"] is not None:
+                continue
+
+            self.aligned_sentences_df, img_urls = find_text_source(
+                input_sentences[index],
+                self.aligned_sentences_df,
+            )
+
     def detect_image_origin(self):
         print("CHECK IMAGE:")
         if self.news_image is None:
@@ -183,6 +230,15 @@ class NewsVerification():
         self.detect_image_origin()
 
     def analyze_details(self):
+        entities_with_colors = []
+        for index, aligned_sentence in enumerate(self.aligned_sentences):
+            # Get entity-words (in pair) with colors
+            entities_with_colors = highlight_entities(
+                aligned_sentence["input_sentence"],
+                aligned_sentence["matched_sentence"],
+            )
+            self.aligned_sentences[index]["entities"] = entities_with_colors
+
         ordinary_user_table = self.create_ordinary_user_table()
         fact_checker_table = self.create_fact_checker_table()
         governor_table = self.create_governor_table()
@@ -253,17 +309,17 @@ class NewsVerification():
             )
 
             # Get entity-words (in pair) with colors
-            entities_with_colors = highlight_entities(
-                aligned_sentence["input_sentence"],
-                aligned_sentence["matched_sentence"],
-            )
-
+            # entities_with_colors = highlight_entities(
+            #     aligned_sentence["input_sentence"],
+            #     aligned_sentence["matched_sentence"],
+            # )
+
             self.fact_checker_table.append(
                 [
                     aligned_sentence,
                     equal_idx_1,
                     equal_idx_2,
-                    entities_with_colors,
+                    aligned_sentence["entities"],
                 ]
             )
@@ -273,7 +329,7 @@ class NewsVerification():
 
         table = "\n".join(rows)
         return f"""
-        <h5>Comparison between input news and source news
+        <h5>Comparison between input news and source news:</h5>
         <table border="1" style="width:100%; text-align:left; border-collapse:collapse;">
         <col style="width: 170px;"> <col style="width: 170px;"> <col style="width: 30px;"> <col style="width: 75px;">
         <thead>
@@ -293,12 +349,14 @@ class NewsVerification():
         """
 
     def format_text_fact_checker_row(self, row, max_length=30):
+        entity_count = 0
         if row[0]["input_sentence"] == "":
             return ""
         if row[0]["matched_sentence"] != "":  # source is not empty
             # highlight entities
             input_sentence, highlight_idx_input = apply_highlight(row[0]["input_sentence"], row[3], "input")
             source_sentence, highlight_idx_source = apply_highlight(row[0]["matched_sentence"], row[3], "source")
+            entity_count = len(row[3])
 
             # Color overlapping words
             input_sentence = self.color_text(input_sentence, row[1], highlight_idx_input)  # text, index of highlight words
@@ -317,11 +375,13 @@ class NewsVerification():
             short_url = self.shorten_url(url, max_length)
             source_text_url = f"""<a href="{url}">{short_url}</a>"""
 
+        entity_count_text = self.get_entity_count_text(entity_count)
+
         return f"""
             <tr>
                 <td>{input_sentence}</td>
                 <td>{source_sentence}</td>
-                <td>{label}<br>({score*100:.2f}%)</td>
+                <td>{label}<br>({score*100:.2f}%)<br><br>{entity_count_text}</td>
                 <td>{source_text_url}</td>
             </tr>
         """
@@ -347,7 +407,7 @@ class NewsVerification():
         table = "\n".join(rows)
 
         return f"""
-        <h5>Comparison between input news and source news
+        <h5>Comparison between input news and source news:</h5>
         <table border="1" style="width:100%; text-align:left; border-collapse:collapse;">
         <col style="width: 170px;"> <col style="width: 170px;"> <col style="width: 30px;"> <col style="width: 75px;">
         <thead>
@@ -374,28 +434,15 @@ class NewsVerification():
         for index, row in enumerate(self.aligned_sentences):
             if row["input_sentence"] == "":
                 continue
-            input_sentences += row["input_sentence"]
+            input_sentences += row["input_sentence"] + "<br><br>"
             label = self.aligned_sentences[index]["label"]
-            if label == "HUMAN":
-                score = self.aligned_sentences[index]["similarity"]
-            if label == "MACHINE":
-                score = 1 - self.aligned_sentences[index]["similarity"]
-            scores += score
 
             url = self.aligned_sentences[index]["url"] #
             short_url = self.shorten_url(url, max_length)
             source_text_urls += f"""<a href="{url}">{short_url}</a><br>"""
             sentence_count += 1
 
-        if sentence_count == 0:
-            label = "UNKNOWN"
-        else:
-            scores /= sentence_count
-            if scores > 0.5:
-                label = "HUMAN"
-            else:
-                label = "MACHINE"
-                scores = 1 - scores
+        scores, label = self.calculate_score_label()
 
         return f"""
             <tr>
@@ -408,14 +455,14 @@ class NewsVerification():
     def format_image_ordinary_user_row(self, max_length=30):
 
         if self.image_referent_url is not None or self.image_referent_url != "":
-            source_image = f"""<img src="{self.image_referent_url}" width="200" height="150">"""
+            # source_image = f"""<img src="{self.image_referent_url}" width="200" height="150">"""
             short_url = self.shorten_url(self.image_referent_url, max_length)
            source_image_url = f"""<a href="{self.image_referent_url}">{short_url}</a>"""
         else:
-            source_image = "Image not found"
+            # source_image = "Image not found"
             source_image_url = ""
 
-        return f"""<tr><td>input image</td><td>{
+        return f"""<tr><td>input image</td><td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td><td>{source_image_url}</td></tr>"""
 
 
     def create_governor_table(self):
@@ -434,17 +481,17 @@ class NewsVerification():
             )
 
             # Get entity-words (in pair) with colors
-            entities_with_colors = highlight_entities(
-                aligned_sentence["input_sentence"],
-                aligned_sentence["matched_sentence"],
-            )
+            # entities_with_colors = highlight_entities(
+            #     aligned_sentence["input_sentence"],
+            #     aligned_sentence["matched_sentence"],
+            # )
 
             self.governor_table.append(
                 [
                     aligned_sentence,
                     equal_idx_1,
                     equal_idx_2,
-                    entities_with_colors,
+                    aligned_sentence["entities"],
                 ]
             )
@@ -453,7 +500,7 @@ class NewsVerification():
 
         table = "\n".join(rows)
         return f"""
-        <h5>Comparison between input news and source news
+        <h5>Comparison between input news and source news:</h5>
         <table border="1" style="width:100%; text-align:left; border-collapse:collapse;">
         <col style="width: 170px;"> <col style="width: 170px;"> <col style="width: 30px;"> <col style="width: 75px;">
         <thead>
@@ -502,36 +549,24 @@ class NewsVerification():
             input_sentence = row[0]["input_sentence"]
             source_sentence = row[0]["matched_sentence"]
 
-            input_sentences += input_sentence
-            source_sentences += source_sentence
-
-            label = row[0]["label"]
-            if label == "HUMAN":
-                score = row[0]["similarity"]
-            if label == "MACHINE":
-                score = 1 - row[0]["similarity"]
-            scores += score
+            # convert score to HUMAN-based score:
+            input_sentences += input_sentence + "<br><br>"
+            source_sentences += source_sentence + "<br><br>"
+
 
             url = row[0]["url"]
             short_url = self.shorten_url(url, max_length)
             source_text_urls += f"""<a href="{url}">{short_url}</a><br>"""
             sentence_count += 1
 
-        if sentence_count == 0:
-            label = "UNKNOWN"
-        else:
-            scores /= sentence_count
-            if scores > 0.5:
-                label = "HUMAN"
-            else:
-                label = "MACHINE"
-                scores = 1 - scores
-
+        score, label = self.calculate_score_label()
+        entity_count_text = self.get_entity_count_text(entity_count)
+
         return f"""
             <tr>
                 <td>{input_sentences}</td>
                 <td>{source_sentences}</td>
-                <td>{label}<br>({score*100:.2f}%)</td>
+                <td>{label}<br>({score*100:.2f}%)<br><br>{entity_count_text}</td>
                 <td>{source_text_urls}</td>
             </tr>
         """
@@ -548,6 +583,15 @@ class NewsVerification():
         return f"""<tr><td>input image</td><td>{source_image}</td><td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td><td>{source_image_url}</td></tr>"""
 
 
+    def get_entity_count_text(self, entity_count):
+        if entity_count <= 0:
+            entity_count_text = ""
+        elif entity_count == 1:
+            entity_count_text = "with altered entity"
+        else:
+            entity_count_text = "with altered entities"
+        return entity_count_text
+
     def shorten_url(self, url, max_length=30):
         if url is None:
             return ""
@@ -668,4 +712,32 @@ class NewsVerification():
             starts.append(start)
             ends.append(end + 1)
 
-        return starts, ends
+        return starts, ends
+
+    def calculate_score_label(self):
+        human_score = []
+        machine_score = []
+        machine_flag = False
+        for sentence in self.aligned_sentences:
+            if sentence["input_sentence"] == "":
+                continue
+            if sentence["label"] == "HUMAN":
+                human_score.append(sentence["similarity"])
+            elif sentence["label"] == "MACHINE":
+                machine_score.append(1 - sentence["similarity"])
+                machine_flag = True
+
+        if machine_flag is True and len(machine_score) > 0:
+            # average value of machine_score
+            machine_score_avg = sum(machine_score) / len(machine_score)
+            if machine_score_avg < 0.5:
+                machine_score_avg = 1 - machine_score_avg
+            return machine_score_avg, "MACHINE"
+        elif machine_flag is False and len(human_score) > 0:
+            # average value of human_score
+            human_score_avg = sum(human_score) / len(human_score)
+            return human_score_avg, "HUMAN"
+        else:
+            return 0, "UNKNOWN"
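Note: the new `calculate_score_label` lets a single MACHINE-labelled sentence dominate the combined verdict; otherwise the HUMAN similarities are averaged. A self-contained check of that rule, assuming the same sentence dicts the class builds (keys `input_sentence`, `label`, `similarity`):

def calculate_score_label(aligned_sentences):
    human_score, machine_score = [], []
    for s in aligned_sentences:
        if s["input_sentence"] == "":
            continue
        if s["label"] == "HUMAN":
            human_score.append(s["similarity"])
        elif s["label"] == "MACHINE":
            machine_score.append(1 - s["similarity"])
    if machine_score:
        avg = sum(machine_score) / len(machine_score)
        return max(avg, 1 - avg), "MACHINE"  # same effect as the avg < 0.5 flip above
    if human_score:
        return sum(human_score) / len(human_score), "HUMAN"
    return 0, "UNKNOWN"

sentences = [
    {"input_sentence": "a", "label": "HUMAN", "similarity": 0.9},
    {"input_sentence": "b", "label": "MACHINE", "similarity": 0.8},
]
print(calculate_score_label(sentences))  # (0.8, 'MACHINE')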
src/application/image/image_detection.py
CHANGED
@@ -12,6 +12,10 @@ def compare_list_of_images(news_image_path, img_urls):
     matched_url = ""
     max_similarity = 0
     for url in img_urls:
+        if "ichef.bbci.co.uk" in url and " " in url:
+            url_list = url.split(",")
+            if len(url_list) > 0:
+                url = url_list[0].split(" ")[0]
         print(f"\t{url}")
         referred_image = get_image_from_url(url)
         if referred_image is None:
@@ -29,8 +33,6 @@ def compare_list_of_images(news_image_path, img_urls):
 
 def detect_image_from_news_image(news_image_path, image_urls):
     print("\tFrom news:")
-    for url in image_urls:
-        print(f"\t{url}")
     return compare_list_of_images(news_image_path, image_urls)
 
 def detect_image_by_reverse_search(news_image_path):
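Note: the new guard in `compare_list_of_images` deals with BBC image URLs that arrive as srcset-style strings ("url1 240w, url2 480w") and keeps only the first URL. A quick reproduction with a made-up URL:

url = ("https://ichef.bbci.co.uk/ace/standard/240/x.jpg 240w, "
       "https://ichef.bbci.co.uk/ace/standard/480/x.jpg 480w")
if "ichef.bbci.co.uk" in url and " " in url:
    url = url.split(",")[0].split(" ")[0]
print(url)  # https://ichef.bbci.co.uk/ace/standard/240/x.jpg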
src/application/text/entity.py
CHANGED
@@ -166,8 +166,14 @@ def apply_highlight(text, entities_with_colors, key="input", count = 0):
         highlighted_text = ""
 
         # find a list of starts and ends of entity in text:
-        starts = [m.start() for m in re.finditer(entity[key], temp_text)]
-        ends = [m.end() for m in re.finditer(entity[key], temp_text)]
+        # starts = [m.start() for m in re.finditer(entity[key], temp_text)]
+        # ends = [m.end() for m in re.finditer(entity[key], temp_text)]
+        starts = []
+        ends = []
+        # "\b" anchors the match at word boundaries
+        for m in re.finditer(r"\b" + re.escape(entity[key]) + r"\b", temp_text):
+            starts.append(m.start())
+            ends.append(m.end())
 
         all_starts.extend(starts)
         all_ends.extend(ends)
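Note: the effect of the `\b`-anchored pattern, previewed here and exercised more fully in test.py below. The old `re.finditer(entity[key], temp_text)` also matched inside longer words:

import re

temp_text = "win winger winning"
old = [m.start() for m in re.finditer("win", temp_text)]
new = [m.start() for m in re.finditer(r"\b" + re.escape("win") + r"\b", temp_text)]
print(old)  # [0, 4, 11] - also hits "winger" and "winning"
print(new)  # [0] - whole word only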
src/application/text/search.py
CHANGED
@@ -160,12 +160,28 @@ def generate_search_phrases(input_text):
     search_phrases.append(input_text)
 
     # Method 3: Split text by chunks
-
+    search_phrases.extend(get_chunk(input_text))  # TODO: for demo purposes
 
     # Method 4: Get most identities and key words
     entities = extract_entities(input_text)
-
-
+    text_without_entities = remove_identities_from_text(input_text, entities)
+    print(f"text_without_entities: {text_without_entities}")
+    search_phrases.append(text_without_entities)
+    #keywords = get_keywords(input_text, 16)
+    #search_phrase = " ".join(entities) + " " + " ".join(keywords)
     # search_phrases.append(search_phrase) # TODO: for demo purposes
 
     return search_phrases
+
+def remove_identities_from_text(input_text, entities):
+    """
+    Removes entities from the input text.
+
+    Args:
+        input_text: The input text as a string.
+        entities: A list of entities to be removed.
+    """
+    for entity in entities:
+        input_text = input_text.replace(entity, "")
+
+    return input_text
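Note: `remove_identities_from_text` strips each entity verbatim, so doubled or leftover spaces can remain in the search phrase; that is harmless for a search query. Example:

def remove_identities_from_text(input_text, entities):
    for entity in entities:
        input_text = input_text.replace(entity, "")
    return input_text

print(remove_identities_from_text(
    "Leeds target striker Cameron Archer", ["Leeds", "Cameron Archer"]
))  # " target striker "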
src/application/text/search_detection.py
CHANGED
@@ -92,6 +92,76 @@ def detect_text_by_relative_search(input_text, index, is_support_opposite = False):
 
     return False, None, [], [], index
 
+def find_text_source(text, text_index, sentences_df):
+    sentence = {
+        "input_sentence": text[text_index],
+        "matched_sentence": "",
+        "label": "",
+        "similarity": None,
+        "paraphrase": None,
+        "url": "",
+        "group": None,
+    }
+    checked_urls = set()
+    searched_phrases = generate_search_phrases(text[text_index])
+
+    for candidate in searched_phrases:
+        search_results = search_by_google(candidate)
+        urls = [item['link'] for item in search_results.get("items", [])]
+
+        for url in urls[:3]:
+            if url in checked_urls:  # visited url
+                continue
+            if "bbc.com" not in url:
+                continue
+
+            checked_urls.add(url)
+            print(f"\t\tChecking URL: {url}")
+
+            content = URLReader(url)
+
+            if content.is_extracted is True:
+                if content.title is None or content.text is None:
+                    print(f"\t\t\tβββ Title or text not found")
+                    continue
+
+                page_text = content.title + "\n" + content.text
+                if len(page_text) > MAX_CHAR_SIZE:
+                    print(f"\t\t\tβββ More than {MAX_CHAR_SIZE} characters")
+                    continue
+                print(f"\t\t\tβββ Title: {content.title}")
+                paraphrase, aligned_sentence = check_paraphrase(text, page_text, url)
+
+                # add one more key "group" into aligned_sentence
+                sentences_df.loc[index, "input_sentence"] = aligned_sentence["input_sentence"]
+                sentences_df.loc[index, "matched_sentence"] = aligned_sentence["matched_sentence"]
+                sentences_df.loc[index, "label"] = aligned_sentence["label"]
+                sentences_df.loc[index, "similarity"] = aligned_sentence["similarity"]
+                sentences_df.loc[index, "url"] = aligned_sentence["url"]
+
+                if aligned_sentence["paraphrase"] is False:
+                    return paraphrase, sentences_df
+
+                for index, _ in enumerate(sentences_df):
+                    if sentences_df[index]["url"] is not None:
+                        continue
+
+                    # find content in new url
+                    _, aligned_sentence = check_paraphrase(text[index], page_text, url)
+
+                    if aligned_sentence["url"] is not None:
+                        continue
+
+                    sentences_df.loc[index, "input_sentence"] = aligned_sentence["input_sentence"]
+                    sentences_df.loc[index, "matched_sentence"] = aligned_sentence["matched_sentence"]
+                    sentences_df.loc[index, "label"] = aligned_sentence["label"]
+                    sentences_df.loc[index, "similarity"] = aligned_sentence["similarity"]
+                    sentences_df.loc[index, "url"] = aligned_sentence["url"]
+
+                return sentences_df, content.images
+
+    return sentence, []
+
 def longest_common_subsequence(arr1, arr2):
     """
     Finds the length of the longest common subsequence (contiguous) between
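Note: `find_text_source` fills `aligned_sentences_df` row by row with `.loc`. A toy sketch of that pattern (static values standing in for real search results):

import pandas as pd

df = pd.DataFrame(columns=["input_sentence", "matched_sentence", "label", "similarity", "url"])
df = pd.concat([df, pd.DataFrame([{} for _ in range(3)])], ignore_index=True)  # placeholder rows

df.loc[0, "input_sentence"] = "Japan pledges support for Ukraine."
df.loc[0, "matched_sentence"] = "Japan pledged support for Ukraine"
df.loc[0, "label"] = "HUMAN"
df.loc[0, "similarity"] = 0.93
df.loc[0, "url"] = "https://www.bbc.com/news"
print(df.head())

One caveat worth flagging: as committed, `find_text_source` indexes with `sentences_df.loc[index, ...]` while its parameters are `text` and `text_index`, so callers should check which index variable is actually in scope.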
test.py
CHANGED
@@ -1,27 +1,50 @@
-import
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+import re
+
+def find_entity_spans(entity, text):
+    """
+    Finds the start and end indices of whole-word entities in text.
+
+    Args:
+        entity: The entity string to search for.
+        text: The text to search within.
+
+    Returns:
+        A list of tuples, where each tuple contains the start and end indices
+        of a found entity. Returns an empty list if no entities are found.
+    """
+    spans = []
+    for m in re.finditer(r"\b" + re.escape(entity) + r"\b", text):  # the crucial change
+        spans.append((m.start(), m.end()))
+    return spans
+
+# Example usage:
+temp_text = "win winger winning"
+entity = {"key": "win"}  # Example dictionary (adjust as needed)
+
+spans = find_entity_spans(entity["key"], temp_text)
+print(spans)  # Output: [(0, 3)] (only the standalone "win" matches)
+
+temp_text = "The quick brown fox jumps over the lazy dog."
+entity = {"key": "fox"}
+spans = find_entity_spans(entity["key"], temp_text)
+print(spans)  # Output: [(16, 19)]
+
+temp_text = "foxes fox foxing"
+entity = {"key": "fox"}
+spans = find_entity_spans(entity["key"], temp_text)
+print(spans)  # Output: [(6, 9)]
+
+temp_text = "winger win winning"
+entity = {"key": "win"}
+spans = find_entity_spans(entity["key"], temp_text)
+print(spans)  # Output: [(7, 10)]
+
+temp_text = "winger win winning"
+entity = {"key": "winger"}
+spans = find_entity_spans(entity["key"], temp_text)
+print(spans)  # Output: [(0, 6)]
+
+temp_text = "winger win winning"
+entity = {"key": "winning"}
+spans = find_entity_spans(entity["key"], temp_text)
+print(spans)  # Output: [(11, 18)]