Spaces:
Running
Running
Commit
Β·
38fd181
1
Parent(s):
504f37b
run pre-commit
Browse files- .sample-env +1 -1
- README.md +7 -7
- application.py +155 -103
- application_2.py +155 -103
- application_3.py +254 -0
- examples/example_text_LLM_entities.txt +1 -1
- examples/example_text_LLM_modification.txt +3 -3
- examples/example_text_LLM_topic.txt +6 -6
- examples/example_text_real.txt +1 -1
- examples/example_text_real_2.txt +1 -1
- gpt_test.py +16 -20
- requirements.txt +1 -1
- src/application/content_detection.py +342 -244
- src/application/content_generation.py +40 -32
- src/application/image/image_comparison.py +23 -16
- src/application/image/image_detection.py +21 -10
- src/application/image/model_detection.py +70 -43
- src/application/image/search_yandex.py +91 -47
- src/application/text/entity.py +152 -95
- src/application/text/helper.py +59 -68
- src/application/text/highlight_text.py +127 -50
- src/application/text/model_detection.py +1 -1
- src/application/text/preprocessing.py +3 -2
- src/application/text/search.py +43 -33
- src/application/text/search_detection.py +195 -101
- src/application/url_reader.py +49 -35
- test.py +11 -6
.sample-env
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
[API_KEY]
|
2 |
OPENAI_API_KEY=your_api_key # Replace with your actual OpenAI API key
|
3 |
GEMINI_API_KEY=your_api_key
|
4 |
-
TOGETHER_API_KEY=your_api_key
|
|
|
1 |
[API_KEY]
|
2 |
OPENAI_API_KEY=your_api_key # Replace with your actual OpenAI API key
|
3 |
GEMINI_API_KEY=your_api_key
|
4 |
+
TOGETHER_API_KEY=your_api_key
|
README.md
CHANGED
@@ -1,11 +1,11 @@
|
|
1 |
---
|
2 |
-
title: "FAKE NEWS DETECTION"
|
3 |
-
emoji: "π"
|
4 |
-
colorFrom: "green"
|
5 |
-
colorTo: "blue"
|
6 |
-
sdk: "gradio"
|
7 |
-
sdk_version: "5.13.1"
|
8 |
-
app_file: "application.py"
|
9 |
pinned: false
|
10 |
---
|
11 |
|
|
|
1 |
---
|
2 |
+
title: "FAKE NEWS DETECTION"
|
3 |
+
emoji: "π"
|
4 |
+
colorFrom: "green"
|
5 |
+
colorTo: "blue"
|
6 |
+
sdk: "gradio"
|
7 |
+
sdk_version: "5.13.1"
|
8 |
+
app_file: "application.py"
|
9 |
pinned: false
|
10 |
---
|
11 |
|
application.py
CHANGED
@@ -1,44 +1,53 @@
|
|
1 |
-
import os
|
2 |
-
|
3 |
import gradio as gr
|
4 |
import requests
|
5 |
from PIL import Image
|
6 |
|
7 |
from src.application.content_detection import NewsVerification
|
|
|
|
|
|
|
|
|
|
|
8 |
from src.application.url_reader import URLReader
|
9 |
-
from src.application.content_generation import generate_fake_image, generate_fake_text, replace_text
|
10 |
|
11 |
AZURE_TEXT_MODEL = ["gpt-4o-mini", "gpt-4o"]
|
12 |
AZURE_IMAGE_MODEL = ["dall-e-3", "Stable Diffusion (not supported)"]
|
13 |
|
|
|
14 |
def load_url(url):
|
15 |
"""
|
16 |
Load content from the given URL.
|
17 |
"""
|
18 |
content = URLReader(url)
|
19 |
image = None
|
20 |
-
header = {
|
|
|
|
|
21 |
try:
|
22 |
response = requests.get(
|
23 |
-
url,
|
24 |
-
headers
|
25 |
-
stream
|
26 |
)
|
27 |
response.raise_for_status() # Raise an exception for bad status codes
|
28 |
-
|
29 |
image_response = requests.get(content.top_image, stream=True)
|
30 |
try:
|
31 |
image = Image.open(image_response.raw)
|
32 |
-
except:
|
33 |
-
print(f"Error loading image from {content.top_image}")
|
34 |
-
|
35 |
except (requests.exceptions.RequestException, FileNotFoundError) as e:
|
36 |
print(f"Error fetching image: {e}")
|
37 |
|
38 |
return content.title, content.text, image
|
39 |
|
40 |
|
41 |
-
def generate_analysis_report(
|
|
|
|
|
|
|
|
|
42 |
news_analysis = NewsVerification()
|
43 |
news_analysis.load_news(news_title, news_content, news_image)
|
44 |
news_analysis.generate_analysis_report()
|
@@ -48,80 +57,100 @@ def generate_analysis_report(news_title:str, news_content: str, news_image: Imag
|
|
48 |
# Define the GUI
|
49 |
with gr.Blocks() as demo:
|
50 |
gr.Markdown("# NEWS VERIFICATION")
|
51 |
-
|
52 |
with gr.Row():
|
53 |
-
# SETTINGS
|
54 |
with gr.Column(scale=1):
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
interactive=True
|
77 |
)
|
78 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
79 |
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
|
86 |
# NEWS ANALYSIS REPORT
|
87 |
ordinary_user_explanation = """
|
88 |
-
|
89 |
-
|
90 |
-
|
|
|
91 |
"""
|
92 |
fact_checker_explanation = """
|
93 |
-
|
94 |
-
|
95 |
-
|
|
|
96 |
"""
|
97 |
governor_explanation = """
|
98 |
-
|
99 |
-
|
100 |
-
|
|
|
101 |
"""
|
102 |
table = """
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
</
|
114 |
-
<
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
</
|
122 |
-
|
123 |
-
|
124 |
-
|
|
|
|
|
|
|
125 |
with gr.Column(scale=2):
|
126 |
with gr.Accordion("NEWS ANALYSIS"):
|
127 |
verification_button = gr.Button("Verify news")
|
@@ -137,56 +166,79 @@ with gr.Blocks() as demo:
|
|
137 |
|
138 |
# Connect events
|
139 |
load_button.click(
|
140 |
-
load_url,
|
141 |
-
inputs=url_input,
|
142 |
-
outputs=[news_title, news_content, news_image]
|
143 |
-
|
144 |
-
replace_button.click(
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
156 |
|
157 |
# change Image
|
158 |
-
#url_input.change(load_image, inputs=url_input, outputs=image_view)
|
159 |
-
|
160 |
try:
|
161 |
-
with open(
|
|
|
|
|
|
|
162 |
text_real_1 = file.read()
|
163 |
-
with open(
|
|
|
|
|
|
|
164 |
text_real_2 = file.read()
|
165 |
-
with open(
|
|
|
|
|
|
|
166 |
text_llm_topic = file.read()
|
167 |
-
with open(
|
|
|
|
|
|
|
168 |
text_llm_modification = file.read()
|
169 |
-
with open(
|
|
|
|
|
|
|
170 |
text_llm_entities = file.read()
|
171 |
except FileNotFoundError:
|
172 |
print("File not found.")
|
173 |
except Exception as e:
|
174 |
print(f"An error occurred: {e}")
|
175 |
-
|
176 |
title_1 = "Southampton news: Leeds target striker Cameron Archer."
|
177 |
title_2 = "Southampton news: Leeds target striker Cameron Archer."
|
178 |
title_4 = "Japan pledges support for Ukraine with 100-year pact."
|
179 |
-
|
180 |
image_1 = "examples/example_image_real_1.jpg.webp"
|
181 |
image_2 = "examples/example_image_real_2.jpg.webp"
|
182 |
image_3 = "examples/example_image_real_3.jpg"
|
183 |
image_4 = "examples/example_image_real_4.jpg.webp"
|
184 |
-
|
185 |
gr.Examples(
|
186 |
examples=[
|
187 |
-
[title_1, image_1, text_real_1 +
|
188 |
-
[title_1, image_2, text_real_1 +
|
189 |
-
[title_1, image_3, text_real_1 +
|
190 |
[title_4, image_4, text_llm_entities],
|
191 |
],
|
192 |
inputs=[news_title, news_image, news_content],
|
@@ -199,4 +251,4 @@ with gr.Blocks() as demo:
|
|
199 |
],
|
200 |
)
|
201 |
|
202 |
-
demo.launch(share=True)
|
|
|
|
|
|
|
1 |
import gradio as gr
|
2 |
import requests
|
3 |
from PIL import Image
|
4 |
|
5 |
from src.application.content_detection import NewsVerification
|
6 |
+
from src.application.content_generation import (
|
7 |
+
generate_fake_image,
|
8 |
+
generate_fake_text,
|
9 |
+
replace_text,
|
10 |
+
)
|
11 |
from src.application.url_reader import URLReader
|
|
|
12 |
|
13 |
AZURE_TEXT_MODEL = ["gpt-4o-mini", "gpt-4o"]
|
14 |
AZURE_IMAGE_MODEL = ["dall-e-3", "Stable Diffusion (not supported)"]
|
15 |
|
16 |
+
|
17 |
def load_url(url):
|
18 |
"""
|
19 |
Load content from the given URL.
|
20 |
"""
|
21 |
content = URLReader(url)
|
22 |
image = None
|
23 |
+
header = {
|
24 |
+
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36", # noqa: E501
|
25 |
+
}
|
26 |
try:
|
27 |
response = requests.get(
|
28 |
+
url,
|
29 |
+
headers=header,
|
30 |
+
stream=True,
|
31 |
)
|
32 |
response.raise_for_status() # Raise an exception for bad status codes
|
33 |
+
|
34 |
image_response = requests.get(content.top_image, stream=True)
|
35 |
try:
|
36 |
image = Image.open(image_response.raw)
|
37 |
+
except OSError as e:
|
38 |
+
print(f"Error loading image from {content.top_image}: {e}")
|
39 |
+
|
40 |
except (requests.exceptions.RequestException, FileNotFoundError) as e:
|
41 |
print(f"Error fetching image: {e}")
|
42 |
|
43 |
return content.title, content.text, image
|
44 |
|
45 |
|
46 |
+
def generate_analysis_report(
|
47 |
+
news_title: str,
|
48 |
+
news_content: str,
|
49 |
+
news_image: Image,
|
50 |
+
):
|
51 |
news_analysis = NewsVerification()
|
52 |
news_analysis.load_news(news_title, news_content, news_image)
|
53 |
news_analysis.generate_analysis_report()
|
|
|
57 |
# Define the GUI
|
58 |
with gr.Blocks() as demo:
|
59 |
gr.Markdown("# NEWS VERIFICATION")
|
60 |
+
|
61 |
with gr.Row():
|
62 |
+
# SETTINGS
|
63 |
with gr.Column(scale=1):
|
64 |
+
with gr.Accordion("1. Enter a URL"):
|
65 |
+
url_input = gr.Textbox(
|
66 |
+
label="",
|
67 |
+
show_label=False,
|
68 |
+
value="",
|
69 |
+
)
|
70 |
+
load_button = gr.Button("Load URL")
|
71 |
+
|
72 |
+
with gr.Accordion(
|
73 |
+
"2. Select content-generation models",
|
74 |
+
open=True,
|
75 |
+
visible=False,
|
76 |
+
):
|
77 |
+
with gr.Row():
|
78 |
+
text_generation_model = gr.Dropdown(
|
79 |
+
choices=AZURE_TEXT_MODEL,
|
80 |
+
label="Text-generation model",
|
81 |
+
)
|
82 |
+
image_generation_model = gr.Dropdown(
|
83 |
+
choices=AZURE_IMAGE_MODEL,
|
84 |
+
label="Image-generation model",
|
|
|
85 |
)
|
86 |
+
generate_text_button = gr.Button("Generate text")
|
87 |
+
generate_image_button = gr.Button("Generate image")
|
88 |
+
|
89 |
+
with gr.Accordion(
|
90 |
+
"3. Replace any terms",
|
91 |
+
open=True,
|
92 |
+
visible=False,
|
93 |
+
):
|
94 |
+
replace_df = gr.Dataframe(
|
95 |
+
headers=["Find what:", "Replace with:"],
|
96 |
+
datatype=["str", "str"],
|
97 |
+
row_count=(1, "dynamic"),
|
98 |
+
col_count=(2, "fixed"),
|
99 |
+
interactive=True,
|
100 |
+
)
|
101 |
+
replace_button = gr.Button("Replace all")
|
102 |
|
103 |
+
# GENERATED CONTENT
|
104 |
+
with gr.Accordion("Input News"):
|
105 |
+
news_title = gr.Textbox(label="Title", value="")
|
106 |
+
news_image = gr.Image(label="Image", type="filepath")
|
107 |
+
news_content = gr.Textbox(label="Content", value="", lines=13)
|
108 |
|
109 |
# NEWS ANALYSIS REPORT
|
110 |
ordinary_user_explanation = """
|
111 |
+
FOR ORDINARY USER<br>
|
112 |
+
- Green texts are the matched words in the input and source news.<br>
|
113 |
+
- Each highlighted pair (marked with a number) shows the key differences
|
114 |
+
between the input text and the source.
|
115 |
"""
|
116 |
fact_checker_explanation = """
|
117 |
+
FOR FACT CHECKER<br>
|
118 |
+
- Green texts are the matched words in the input and source news.<br>
|
119 |
+
- Each highlighted pair (marked with a number) shows the key differences
|
120 |
+
between the input text and the source.
|
121 |
"""
|
122 |
governor_explanation = """
|
123 |
+
FOR GOVERNOR<br>
|
124 |
+
- Green texts are the matched words in the input and source news.<br>
|
125 |
+
- Each highlighted pair (marked with a number) shows the key differences
|
126 |
+
between the input text and the source.
|
127 |
"""
|
128 |
table = """
|
129 |
+
<h5>Comparison between input news and source news:</h5>
|
130 |
+
<table border="1" style="width:100%; text-align:left;">
|
131 |
+
<col style="width: 170px;">
|
132 |
+
<col style="width: 170px;">
|
133 |
+
<col style="width: 30px;">
|
134 |
+
<col style="width: 75px;">
|
135 |
+
<thead>
|
136 |
+
<tr>
|
137 |
+
<th>Input news</th>
|
138 |
+
<th>Source (corresponding URL provided in Originality)</th>
|
139 |
+
<th>Forensic</th>
|
140 |
+
<th>Originality</th>
|
141 |
+
</tr>
|
142 |
+
</thead>
|
143 |
+
<tbody>
|
144 |
+
<tr>
|
145 |
+
<th>TBD</th>
|
146 |
+
<th>TBD</th>
|
147 |
+
<th>TBD</th>
|
148 |
+
<th>TBD</th>
|
149 |
+
</tr>
|
150 |
+
</tbody>
|
151 |
+
</table>
|
152 |
+
|
153 |
+
<style>"""
|
154 |
with gr.Column(scale=2):
|
155 |
with gr.Accordion("NEWS ANALYSIS"):
|
156 |
verification_button = gr.Button("Verify news")
|
|
|
166 |
|
167 |
# Connect events
|
168 |
load_button.click(
|
169 |
+
load_url,
|
170 |
+
inputs=url_input,
|
171 |
+
outputs=[news_title, news_content, news_image],
|
172 |
+
)
|
173 |
+
replace_button.click(
|
174 |
+
replace_text,
|
175 |
+
inputs=[news_title, news_content, replace_df],
|
176 |
+
outputs=[news_title, news_content],
|
177 |
+
)
|
178 |
+
generate_text_button.click(
|
179 |
+
generate_fake_text,
|
180 |
+
inputs=[text_generation_model, news_title, news_content],
|
181 |
+
outputs=[news_title, news_content],
|
182 |
+
)
|
183 |
+
generate_image_button.click(
|
184 |
+
generate_fake_image,
|
185 |
+
inputs=[image_generation_model, news_title],
|
186 |
+
outputs=[news_image],
|
187 |
+
)
|
188 |
+
verification_button.click(
|
189 |
+
generate_analysis_report,
|
190 |
+
inputs=[news_title, news_content, news_image],
|
191 |
+
outputs=[ordinary_user_result, fact_checker_result, governor_result],
|
192 |
+
)
|
193 |
|
194 |
# change Image
|
195 |
+
# url_input.change(load_image, inputs=url_input, outputs=image_view)
|
196 |
+
|
197 |
try:
|
198 |
+
with open(
|
199 |
+
"examples/example_text_real.txt",
|
200 |
+
encoding="utf-8",
|
201 |
+
) as file:
|
202 |
text_real_1 = file.read()
|
203 |
+
with open(
|
204 |
+
"examples/example_text_real_2.txt",
|
205 |
+
encoding="utf-8",
|
206 |
+
) as file:
|
207 |
text_real_2 = file.read()
|
208 |
+
with open(
|
209 |
+
"examples/example_text_LLM_topic.txt",
|
210 |
+
encoding="utf-8",
|
211 |
+
) as file:
|
212 |
text_llm_topic = file.read()
|
213 |
+
with open(
|
214 |
+
"examples/example_text_LLM_modification.txt",
|
215 |
+
encoding="utf-8",
|
216 |
+
) as file:
|
217 |
text_llm_modification = file.read()
|
218 |
+
with open(
|
219 |
+
"examples/example_text_LLM_entities.txt",
|
220 |
+
encoding="utf-8",
|
221 |
+
) as file:
|
222 |
text_llm_entities = file.read()
|
223 |
except FileNotFoundError:
|
224 |
print("File not found.")
|
225 |
except Exception as e:
|
226 |
print(f"An error occurred: {e}")
|
227 |
+
|
228 |
title_1 = "Southampton news: Leeds target striker Cameron Archer."
|
229 |
title_2 = "Southampton news: Leeds target striker Cameron Archer."
|
230 |
title_4 = "Japan pledges support for Ukraine with 100-year pact."
|
231 |
+
|
232 |
image_1 = "examples/example_image_real_1.jpg.webp"
|
233 |
image_2 = "examples/example_image_real_2.jpg.webp"
|
234 |
image_3 = "examples/example_image_real_3.jpg"
|
235 |
image_4 = "examples/example_image_real_4.jpg.webp"
|
236 |
+
|
237 |
gr.Examples(
|
238 |
examples=[
|
239 |
+
[title_1, image_1, text_real_1 + "\n\n" + text_real_2],
|
240 |
+
[title_1, image_2, text_real_1 + "\n\n" + text_llm_modification],
|
241 |
+
[title_1, image_3, text_real_1 + "\n\n" + text_llm_topic],
|
242 |
[title_4, image_4, text_llm_entities],
|
243 |
],
|
244 |
inputs=[news_title, news_image, news_content],
|
|
|
251 |
],
|
252 |
)
|
253 |
|
254 |
+
demo.launch(share=True)
|
application_2.py
CHANGED
@@ -1,44 +1,53 @@
|
|
1 |
-
import os
|
2 |
-
|
3 |
import gradio as gr
|
4 |
import requests
|
5 |
from PIL import Image
|
6 |
|
7 |
from src.application.content_detection import NewsVerification
|
|
|
|
|
|
|
|
|
|
|
8 |
from src.application.url_reader import URLReader
|
9 |
-
from src.application.content_generation import generate_fake_image, generate_fake_text, replace_text
|
10 |
|
11 |
AZURE_TEXT_MODEL = ["gpt-4o-mini", "gpt-4o"]
|
12 |
AZURE_IMAGE_MODEL = ["dall-e-3", "Stable Diffusion (not supported)"]
|
13 |
|
|
|
14 |
def load_url(url):
|
15 |
"""
|
16 |
Load content from the given URL.
|
17 |
"""
|
18 |
content = URLReader(url)
|
19 |
image = None
|
20 |
-
header = {
|
|
|
|
|
21 |
try:
|
22 |
response = requests.get(
|
23 |
-
url,
|
24 |
-
headers
|
25 |
-
stream
|
26 |
)
|
27 |
response.raise_for_status() # Raise an exception for bad status codes
|
28 |
-
|
29 |
image_response = requests.get(content.top_image, stream=True)
|
30 |
try:
|
31 |
image = Image.open(image_response.raw)
|
32 |
-
except:
|
33 |
-
print(f"Error loading image from {content.top_image}")
|
34 |
-
|
35 |
except (requests.exceptions.RequestException, FileNotFoundError) as e:
|
36 |
print(f"Error fetching image: {e}")
|
37 |
|
38 |
return content.title, content.text, image
|
39 |
|
40 |
|
41 |
-
def generate_analysis_report(
|
|
|
|
|
|
|
|
|
42 |
news_analysis = NewsVerification()
|
43 |
news_analysis.load_news(news_title, news_content, news_image)
|
44 |
news_analysis.generate_analysis_report()
|
@@ -48,80 +57,100 @@ def generate_analysis_report(news_title:str, news_content: str, news_image: Imag
|
|
48 |
# Define the GUI
|
49 |
with gr.Blocks() as demo:
|
50 |
gr.Markdown("# NEWS VERIFICATION")
|
51 |
-
|
52 |
with gr.Row():
|
53 |
-
# SETTINGS
|
54 |
with gr.Column(scale=1):
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
interactive=True
|
77 |
)
|
78 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
79 |
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
|
86 |
# NEWS ANALYSIS REPORT
|
87 |
ordinary_user_explanation = """
|
88 |
-
|
89 |
-
|
90 |
-
|
|
|
91 |
"""
|
92 |
fact_checker_explanation = """
|
93 |
-
|
94 |
-
|
95 |
-
|
|
|
96 |
"""
|
97 |
governor_explanation = """
|
98 |
-
|
99 |
-
|
100 |
-
|
|
|
101 |
"""
|
102 |
table = """
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
</
|
114 |
-
<
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
</
|
122 |
-
|
123 |
-
|
124 |
-
|
|
|
|
|
|
|
125 |
with gr.Column(scale=2):
|
126 |
with gr.Accordion("NEWS ANALYSIS"):
|
127 |
verification_button = gr.Button("Verify news")
|
@@ -137,56 +166,79 @@ with gr.Blocks() as demo:
|
|
137 |
|
138 |
# Connect events
|
139 |
load_button.click(
|
140 |
-
load_url,
|
141 |
-
inputs=url_input,
|
142 |
-
outputs=[news_title, news_content, news_image]
|
143 |
-
|
144 |
-
replace_button.click(
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
156 |
|
157 |
# change Image
|
158 |
-
#url_input.change(load_image, inputs=url_input, outputs=image_view)
|
159 |
-
|
160 |
try:
|
161 |
-
with open(
|
|
|
|
|
|
|
162 |
text_real_1 = file.read()
|
163 |
-
with open(
|
|
|
|
|
|
|
164 |
text_real_2 = file.read()
|
165 |
-
with open(
|
|
|
|
|
|
|
166 |
text_llm_topic = file.read()
|
167 |
-
with open(
|
|
|
|
|
|
|
168 |
text_llm_modification = file.read()
|
169 |
-
with open(
|
|
|
|
|
|
|
170 |
text_llm_entities = file.read()
|
171 |
except FileNotFoundError:
|
172 |
print("File not found.")
|
173 |
except Exception as e:
|
174 |
print(f"An error occurred: {e}")
|
175 |
-
|
176 |
title_1 = "Southampton news: Leeds target striker Cameron Archer."
|
177 |
title_2 = "Southampton news: Leeds target striker Cameron Archer."
|
178 |
title_4 = "Japan pledges support for Ukraine with 100-year pact."
|
179 |
-
|
180 |
image_1 = "examples/example_image_real_1.jpg.webp"
|
181 |
image_2 = "examples/example_image_real_2.jpg.webp"
|
182 |
image_3 = "examples/example_image_real_3.jpg"
|
183 |
image_4 = "examples/example_image_real_4.jpg.webp"
|
184 |
-
|
185 |
gr.Examples(
|
186 |
examples=[
|
187 |
-
[title_1, image_1, text_real_1 +
|
188 |
-
[title_1, image_2, text_real_1 +
|
189 |
-
[title_1, image_3, text_real_1 +
|
190 |
[title_4, image_4, text_llm_entities],
|
191 |
],
|
192 |
inputs=[news_title, news_image, news_content],
|
@@ -199,4 +251,4 @@ with gr.Blocks() as demo:
|
|
199 |
],
|
200 |
)
|
201 |
|
202 |
-
demo.launch(share=True)
|
|
|
|
|
|
|
1 |
import gradio as gr
|
2 |
import requests
|
3 |
from PIL import Image
|
4 |
|
5 |
from src.application.content_detection import NewsVerification
|
6 |
+
from src.application.content_generation import (
|
7 |
+
generate_fake_image,
|
8 |
+
generate_fake_text,
|
9 |
+
replace_text,
|
10 |
+
)
|
11 |
from src.application.url_reader import URLReader
|
|
|
12 |
|
13 |
AZURE_TEXT_MODEL = ["gpt-4o-mini", "gpt-4o"]
|
14 |
AZURE_IMAGE_MODEL = ["dall-e-3", "Stable Diffusion (not supported)"]
|
15 |
|
16 |
+
|
17 |
def load_url(url):
|
18 |
"""
|
19 |
Load content from the given URL.
|
20 |
"""
|
21 |
content = URLReader(url)
|
22 |
image = None
|
23 |
+
header = {
|
24 |
+
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36", # noqa: E501
|
25 |
+
}
|
26 |
try:
|
27 |
response = requests.get(
|
28 |
+
url,
|
29 |
+
headers=header,
|
30 |
+
stream=True,
|
31 |
)
|
32 |
response.raise_for_status() # Raise an exception for bad status codes
|
33 |
+
|
34 |
image_response = requests.get(content.top_image, stream=True)
|
35 |
try:
|
36 |
image = Image.open(image_response.raw)
|
37 |
+
except OSError as e:
|
38 |
+
print(f"Error loading image from {content.top_image}: {e}")
|
39 |
+
|
40 |
except (requests.exceptions.RequestException, FileNotFoundError) as e:
|
41 |
print(f"Error fetching image: {e}")
|
42 |
|
43 |
return content.title, content.text, image
|
44 |
|
45 |
|
46 |
+
def generate_analysis_report(
|
47 |
+
news_title: str,
|
48 |
+
news_content: str,
|
49 |
+
news_image: Image,
|
50 |
+
):
|
51 |
news_analysis = NewsVerification()
|
52 |
news_analysis.load_news(news_title, news_content, news_image)
|
53 |
news_analysis.generate_analysis_report()
|
|
|
57 |
# Define the GUI
|
58 |
with gr.Blocks() as demo:
|
59 |
gr.Markdown("# NEWS VERIFICATION")
|
60 |
+
|
61 |
with gr.Row():
|
62 |
+
# SETTINGS
|
63 |
with gr.Column(scale=1):
|
64 |
+
with gr.Accordion("1. Enter a URL"):
|
65 |
+
url_input = gr.Textbox(
|
66 |
+
label="",
|
67 |
+
show_label=False,
|
68 |
+
value="",
|
69 |
+
)
|
70 |
+
load_button = gr.Button("Load URL")
|
71 |
+
|
72 |
+
with gr.Accordion(
|
73 |
+
"2. Select content-generation models",
|
74 |
+
open=True,
|
75 |
+
visible=False,
|
76 |
+
):
|
77 |
+
with gr.Row():
|
78 |
+
text_generation_model = gr.Dropdown(
|
79 |
+
choices=AZURE_TEXT_MODEL,
|
80 |
+
label="Text-generation model",
|
81 |
+
)
|
82 |
+
image_generation_model = gr.Dropdown(
|
83 |
+
choices=AZURE_IMAGE_MODEL,
|
84 |
+
label="Image-generation model",
|
|
|
85 |
)
|
86 |
+
generate_text_button = gr.Button("Generate text")
|
87 |
+
generate_image_button = gr.Button("Generate image")
|
88 |
+
|
89 |
+
with gr.Accordion(
|
90 |
+
"3. Replace any terms",
|
91 |
+
open=True,
|
92 |
+
visible=False,
|
93 |
+
):
|
94 |
+
replace_df = gr.Dataframe(
|
95 |
+
headers=["Find what:", "Replace with:"],
|
96 |
+
datatype=["str", "str"],
|
97 |
+
row_count=(1, "dynamic"),
|
98 |
+
col_count=(2, "fixed"),
|
99 |
+
interactive=True,
|
100 |
+
)
|
101 |
+
replace_button = gr.Button("Replace all")
|
102 |
|
103 |
+
# GENERATED CONTENT
|
104 |
+
with gr.Accordion("Input News"):
|
105 |
+
news_title = gr.Textbox(label="Title", value="")
|
106 |
+
news_image = gr.Image(label="Image", type="filepath")
|
107 |
+
news_content = gr.Textbox(label="Content", value="", lines=13)
|
108 |
|
109 |
# NEWS ANALYSIS REPORT
|
110 |
ordinary_user_explanation = """
|
111 |
+
FOR ORDINARY USER<br>
|
112 |
+
- Green texts are the matched words in the input and source news.<br>
|
113 |
+
- Each highlighted pair (marked with a number) shows the key differences
|
114 |
+
between the input text and the source.
|
115 |
"""
|
116 |
fact_checker_explanation = """
|
117 |
+
FOR FACT CHECKER<br>
|
118 |
+
- Green texts are the matched words in the input and source news.<br>
|
119 |
+
- Each highlighted pair (marked with a number) shows the key differences
|
120 |
+
between the input text and the source.
|
121 |
"""
|
122 |
governor_explanation = """
|
123 |
+
FOR GOVERNOR<br>
|
124 |
+
- Green texts are the matched words in the input and source news.<br>
|
125 |
+
- Each highlighted pair (marked with a number) shows the key differences
|
126 |
+
between the input text and the source.
|
127 |
"""
|
128 |
table = """
|
129 |
+
<h5>Comparison between input news and source news:</h5>
|
130 |
+
<table border="1" style="width:100%; text-align:left;">
|
131 |
+
<col style="width: 170px;">
|
132 |
+
<col style="width: 170px;">
|
133 |
+
<col style="width: 30px;">
|
134 |
+
<col style="width: 75px;">
|
135 |
+
<thead>
|
136 |
+
<tr>
|
137 |
+
<th>Input news</th>
|
138 |
+
<th>Source (corresponding URL provided in Originality)</th>
|
139 |
+
<th>Forensic</th>
|
140 |
+
<th>Originality</th>
|
141 |
+
</tr>
|
142 |
+
</thead>
|
143 |
+
<tbody>
|
144 |
+
<tr>
|
145 |
+
<th>TBD</th>
|
146 |
+
<th>TBD</th>
|
147 |
+
<th>TBD</th>
|
148 |
+
<th>TBD</th>
|
149 |
+
</tr>
|
150 |
+
</tbody>
|
151 |
+
</table>
|
152 |
+
|
153 |
+
<style>"""
|
154 |
with gr.Column(scale=2):
|
155 |
with gr.Accordion("NEWS ANALYSIS"):
|
156 |
verification_button = gr.Button("Verify news")
|
|
|
166 |
|
167 |
# Connect events
|
168 |
load_button.click(
|
169 |
+
load_url,
|
170 |
+
inputs=url_input,
|
171 |
+
outputs=[news_title, news_content, news_image],
|
172 |
+
)
|
173 |
+
replace_button.click(
|
174 |
+
replace_text,
|
175 |
+
inputs=[news_title, news_content, replace_df],
|
176 |
+
outputs=[news_title, news_content],
|
177 |
+
)
|
178 |
+
generate_text_button.click(
|
179 |
+
generate_fake_text,
|
180 |
+
inputs=[text_generation_model, news_title, news_content],
|
181 |
+
outputs=[news_title, news_content],
|
182 |
+
)
|
183 |
+
generate_image_button.click(
|
184 |
+
generate_fake_image,
|
185 |
+
inputs=[image_generation_model, news_title],
|
186 |
+
outputs=[news_image],
|
187 |
+
)
|
188 |
+
verification_button.click(
|
189 |
+
generate_analysis_report,
|
190 |
+
inputs=[news_title, news_content, news_image],
|
191 |
+
outputs=[ordinary_user_result, fact_checker_result, governor_result],
|
192 |
+
)
|
193 |
|
194 |
# change Image
|
195 |
+
# url_input.change(load_image, inputs=url_input, outputs=image_view)
|
196 |
+
|
197 |
try:
|
198 |
+
with open(
|
199 |
+
"examples/example_text_real.txt",
|
200 |
+
encoding="utf-8",
|
201 |
+
) as file:
|
202 |
text_real_1 = file.read()
|
203 |
+
with open(
|
204 |
+
"examples/example_text_real_2.txt",
|
205 |
+
encoding="utf-8",
|
206 |
+
) as file:
|
207 |
text_real_2 = file.read()
|
208 |
+
with open(
|
209 |
+
"examples/example_text_LLM_topic.txt",
|
210 |
+
encoding="utf-8",
|
211 |
+
) as file:
|
212 |
text_llm_topic = file.read()
|
213 |
+
with open(
|
214 |
+
"examples/example_text_LLM_modification.txt",
|
215 |
+
encoding="utf-8",
|
216 |
+
) as file:
|
217 |
text_llm_modification = file.read()
|
218 |
+
with open(
|
219 |
+
"examples/example_text_LLM_entities.txt",
|
220 |
+
encoding="utf-8",
|
221 |
+
) as file:
|
222 |
text_llm_entities = file.read()
|
223 |
except FileNotFoundError:
|
224 |
print("File not found.")
|
225 |
except Exception as e:
|
226 |
print(f"An error occurred: {e}")
|
227 |
+
|
228 |
title_1 = "Southampton news: Leeds target striker Cameron Archer."
|
229 |
title_2 = "Southampton news: Leeds target striker Cameron Archer."
|
230 |
title_4 = "Japan pledges support for Ukraine with 100-year pact."
|
231 |
+
|
232 |
image_1 = "examples/example_image_real_1.jpg.webp"
|
233 |
image_2 = "examples/example_image_real_2.jpg.webp"
|
234 |
image_3 = "examples/example_image_real_3.jpg"
|
235 |
image_4 = "examples/example_image_real_4.jpg.webp"
|
236 |
+
|
237 |
gr.Examples(
|
238 |
examples=[
|
239 |
+
[title_1, image_1, text_real_1 + "\n\n" + text_real_2],
|
240 |
+
[title_1, image_2, text_real_1 + "\n\n" + text_llm_modification],
|
241 |
+
[title_1, image_3, text_real_1 + "\n\n" + text_llm_topic],
|
242 |
[title_4, image_4, text_llm_entities],
|
243 |
],
|
244 |
inputs=[news_title, news_image, news_content],
|
|
|
251 |
],
|
252 |
)
|
253 |
|
254 |
+
demo.launch(share=True)
|
application_3.py
ADDED
@@ -0,0 +1,254 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import requests
|
3 |
+
from PIL import Image
|
4 |
+
|
5 |
+
from src.application.content_detection import NewsVerification
|
6 |
+
from src.application.content_generation import (
|
7 |
+
generate_fake_image,
|
8 |
+
generate_fake_text,
|
9 |
+
replace_text,
|
10 |
+
)
|
11 |
+
from src.application.url_reader import URLReader
|
12 |
+
|
13 |
+
AZURE_TEXT_MODEL = ["gpt-4o-mini", "gpt-4o"]
|
14 |
+
AZURE_IMAGE_MODEL = ["dall-e-3", "Stable Diffusion (not supported)"]
|
15 |
+
|
16 |
+
|
17 |
+
def load_url(url):
|
18 |
+
"""
|
19 |
+
Load content from the given URL.
|
20 |
+
"""
|
21 |
+
content = URLReader(url)
|
22 |
+
image = None
|
23 |
+
header = {
|
24 |
+
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36", # noqa: E501
|
25 |
+
}
|
26 |
+
try:
|
27 |
+
response = requests.get(
|
28 |
+
url,
|
29 |
+
headers=header,
|
30 |
+
stream=True,
|
31 |
+
)
|
32 |
+
response.raise_for_status() # Raise an exception for bad status codes
|
33 |
+
|
34 |
+
image_response = requests.get(content.top_image, stream=True)
|
35 |
+
try:
|
36 |
+
image = Image.open(image_response.raw)
|
37 |
+
except OSError as e:
|
38 |
+
print(f"Error loading image from {content.top_image}: {e}")
|
39 |
+
|
40 |
+
except (requests.exceptions.RequestException, FileNotFoundError) as e:
|
41 |
+
print(f"Error fetching image: {e}")
|
42 |
+
|
43 |
+
return content.title, content.text, image
|
44 |
+
|
45 |
+
|
def generate_analysis_report(
    news_title: str,
    news_content: str,
    news_image: Image,
):
    """
    Run the full verification pipeline on one news item.

    Loads the title, body, and image into a fresh ``NewsVerification``
    instance, performs the analysis, and returns the three rendered
    report tables (ordinary user, fact checker, governor).
    """
    verifier = NewsVerification()
    verifier.load_news(news_title, news_content, news_image)
    verifier.generate_analysis_report()
    return verifier.analyze_details()
55 |
+
|
56 |
+
|
# Define the GUI
#
# Layout: a two-column row. The left column holds the inputs (URL loader,
# hidden content-generation settings, hidden term-replacement table, and the
# editable news fields); the right column holds the three per-audience
# analysis tabs. Event wiring and the example gallery follow the layout.
with gr.Blocks() as demo:
    gr.Markdown("# NEWS VERIFICATION")

    with gr.Row():
        # SETTINGS
        with gr.Column(scale=1):
            with gr.Accordion("1. Enter a URL"):
                url_input = gr.Textbox(
                    label="",
                    show_label=False,
                    value="",
                )
                load_button = gr.Button("Load URL")

            # Hidden by default: internal tooling for generating fake content.
            with gr.Accordion(
                "2. Select content-generation models",
                open=True,
                visible=False,
            ):
                with gr.Row():
                    text_generation_model = gr.Dropdown(
                        choices=AZURE_TEXT_MODEL,
                        label="Text-generation model",
                    )
                    image_generation_model = gr.Dropdown(
                        choices=AZURE_IMAGE_MODEL,
                        label="Image-generation model",
                    )
                generate_text_button = gr.Button("Generate text")
                generate_image_button = gr.Button("Generate image")

            # Hidden by default: bulk find/replace over title and content.
            with gr.Accordion(
                "3. Replace any terms",
                open=True,
                visible=False,
            ):
                replace_df = gr.Dataframe(
                    headers=["Find what:", "Replace with:"],
                    datatype=["str", "str"],
                    row_count=(1, "dynamic"),
                    col_count=(2, "fixed"),
                    interactive=True,
                )
                replace_button = gr.Button("Replace all")

            # GENERATED CONTENT
            with gr.Accordion("Input News"):
                news_title = gr.Textbox(label="Title", value="")
                news_image = gr.Image(label="Image", type="filepath")
                news_content = gr.Textbox(label="Content", value="", lines=13)

        # NEWS ANALYSIS REPORT
        ordinary_user_explanation = """
        FOR ORDINARY USER<br>
        - Green texts are the matched words in the input and source news.<br>
        - Each highlighted pair (marked with a number) shows the key differences
        between the input text and the source.
        """
        fact_checker_explanation = """
        FOR FACT CHECKER<br>
        - Green texts are the matched words in the input and source news.<br>
        - Each highlighted pair (marked with a number) shows the key differences
        between the input text and the source.
        """
        governor_explanation = """
        FOR GOVERNOR<br>
        - Green texts are the matched words in the input and source news.<br>
        - Each highlighted pair (marked with a number) shows the key differences
        between the input text and the source.
        """
        # Placeholder table shown before the first verification run.
        table = """
        <h5>Comparison between input news and source news:</h5>
        <table border="1" style="width:100%; text-align:left;">
            <col style="width: 170px;">
            <col style="width: 170px;">
            <col style="width: 30px;">
            <col style="width: 75px;">
        <thead>
            <tr>
                <th>Input news</th>
                <th>Source (corresponding URL provided in Originality)</th>
                <th>Forensic</th>
                <th>Originality</th>
            </tr>
        </thead>
        <tbody>
            <tr>
                <th>TBD</th>
                <th>TBD</th>
                <th>TBD</th>
                <th>TBD</th>
            </tr>
        </tbody>
        </table>

        <style>"""
        with gr.Column(scale=2):
            with gr.Accordion("NEWS ANALYSIS"):
                verification_button = gr.Button("Verify news")
                # Fixed typo: was "Orinary User".
                with gr.Tab("Ordinary User"):
                    gr.HTML(ordinary_user_explanation)
                    ordinary_user_result = gr.HTML(table)
                with gr.Tab("Fact Checker"):
                    gr.HTML(fact_checker_explanation)
                    fact_checker_result = gr.HTML(table)
                with gr.Tab("Governor"):
                    gr.HTML(governor_explanation)
                    governor_result = gr.HTML(table)

    # Connect events
    load_button.click(
        load_url,
        inputs=url_input,
        outputs=[news_title, news_content, news_image],
    )
    replace_button.click(
        replace_text,
        inputs=[news_title, news_content, replace_df],
        outputs=[news_title, news_content],
    )
    generate_text_button.click(
        generate_fake_text,
        inputs=[text_generation_model, news_title, news_content],
        outputs=[news_title, news_content],
    )
    generate_image_button.click(
        generate_fake_image,
        inputs=[image_generation_model, news_title],
        outputs=[news_image],
    )
    verification_button.click(
        generate_analysis_report,
        inputs=[news_title, news_content, news_image],
        outputs=[ordinary_user_result, fact_checker_result, governor_result],
    )

    # change Image
    # url_input.change(load_image, inputs=url_input, outputs=image_view)

    # Pre-initialize so a missing example file degrades to an empty example
    # instead of a NameError when gr.Examples is built below.
    text_real_1 = ""
    text_real_2 = ""
    text_llm_topic = ""
    text_llm_modification = ""
    text_llm_entities = ""
    try:
        with open(
            "examples/example_text_real.txt",
            encoding="utf-8",
        ) as file:
            text_real_1 = file.read()
        with open(
            "examples/example_text_real_2.txt",
            encoding="utf-8",
        ) as file:
            text_real_2 = file.read()
        with open(
            "examples/example_text_LLM_topic.txt",
            encoding="utf-8",
        ) as file:
            text_llm_topic = file.read()
        with open(
            "examples/example_text_LLM_modification.txt",
            encoding="utf-8",
        ) as file:
            text_llm_modification = file.read()
        with open(
            "examples/example_text_LLM_entities.txt",
            encoding="utf-8",
        ) as file:
            text_llm_entities = file.read()
    except FileNotFoundError:
        print("File not found.")
    except Exception as e:
        print(f"An error occurred: {e}")

    title_1 = "Southampton news: Leeds target striker Cameron Archer."
    title_4 = "Japan pledges support for Ukraine with 100-year pact."

    image_1 = "examples/example_image_real_1.jpg.webp"
    image_2 = "examples/example_image_real_2.jpg.webp"
    image_3 = "examples/example_image_real_3.jpg"
    image_4 = "examples/example_image_real_4.jpg.webp"

    gr.Examples(
        examples=[
            [title_1, image_1, text_real_1 + "\n\n" + text_real_2],
            [title_1, image_2, text_real_1 + "\n\n" + text_llm_modification],
            [title_1, image_3, text_real_1 + "\n\n" + text_llm_topic],
            [title_4, image_4, text_llm_entities],
        ],
        inputs=[news_title, news_image, news_content],
        label="Examples",
        example_labels=[
            "2 real news",
            "1 real news + 1 LLM modification-based news",
            "1 real news + 1 LLM topic-based news",
            "1 LLM changed-entities news",
        ],
    )

demo.launch(share=True)
examples/example_text_LLM_entities.txt
CHANGED
@@ -1 +1 @@
|
|
1 |
-
Shigeru Ishiba has pledged to put Ukraine in the "strongest possible position" on a trip to Kyiv where he signed a "landmark" 100-year pact with the war-stricken country. The prime minister's visit on Sunday was at one point marked by loud blasts and air raid sirens after a reported Russian drone attack was intercepted by Ukraine's defence systems. Acknowledging the "hello" from Russia, Volodymyr Zelensky said Ukraine would send its own "hello back". An estimated two millions people have been killed or wounded in the war so far. As the invasion reaches the end of its third year, Ukraine is losing territory in the north. Zelensky praised the Japan's commitment on Sunday, amid wider concerns that the US President-elect Donald Trump, who is set to take office on Monday, could potentially reduce aid.
|
|
|
1 |
+
Shigeru Ishiba has pledged to put Ukraine in the "strongest possible position" on a trip to Kyiv where he signed a "landmark" 100-year pact with the war-stricken country. The prime minister's visit on Sunday was at one point marked by loud blasts and air raid sirens after a reported Russian drone attack was intercepted by Ukraine's defence systems. Acknowledging the "hello" from Russia, Volodymyr Zelensky said Ukraine would send its own "hello back". An estimated two millions people have been killed or wounded in the war so far. As the invasion reaches the end of its third year, Ukraine is losing territory in the north. Zelensky praised the Japan's commitment on Sunday, amid wider concerns that the US President-elect Donald Trump, who is set to take office on Monday, could potentially reduce aid.
|
examples/example_text_LLM_modification.txt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
-
Miguel Almiron has permanently rejoined Atlanta United from Newcastle United for Β£8m. Almiron made 223 appearances for Newcastle, scoring 30 goals, but recently struggled for a starting place under Eddie Howe.
|
2 |
-
He made a substitute appearance and waved farewell to fans in Newcastle's recent loss against Southampton.
|
3 |
-
Almiron played a key role in Newcastle reaching the Carabao Cup final and their Premier League top-four finish in 2020-21, and scored against Paris St-Germain in the Champions League.
|
|
|
1 |
+
Miguel Almiron has permanently rejoined Atlanta United from Newcastle United for Β£8m. Almiron made 223 appearances for Newcastle, scoring 30 goals, but recently struggled for a starting place under Eddie Howe.
|
2 |
+
He made a substitute appearance and waved farewell to fans in Newcastle's recent loss against Southampton.
|
3 |
+
Almiron played a key role in Newcastle reaching the Carabao Cup final and their Premier League top-four finish in 2020-21, and scored against Paris St-Germain in the Champions League.
|
examples/example_text_LLM_topic.txt
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
-
The January transfer window is in full swing, with clubs across the globe scrambling to strengthen their squads for the remainder of the season.
|
2 |
-
Premier League giants Manchester City have reportedly made a substantial bid for highly-rated midfielder Enzo Fernandez.
|
3 |
-
Meanwhile, struggling Serie A side Sampdoria are looking to bolster their attack with the loan signing of veteran striker Fabio Quagliarella.
|
4 |
-
Rumors are swirling around a potential move for Brazilian wonderkid Endrick to Real Madrid.
|
5 |
-
The transfer window officially closes on January 31st, leaving clubs with limited time to finalize their deals.
|
6 |
-
Fans are eagerly awaiting to see which teams make the shrewdest moves in this crucial period.
|
|
|
1 |
+
The January transfer window is in full swing, with clubs across the globe scrambling to strengthen their squads for the remainder of the season.
|
2 |
+
Premier League giants Manchester City have reportedly made a substantial bid for highly-rated midfielder Enzo Fernandez.
|
3 |
+
Meanwhile, struggling Serie A side Sampdoria are looking to bolster their attack with the loan signing of veteran striker Fabio Quagliarella.
|
4 |
+
Rumors are swirling around a potential move for Brazilian wonderkid Endrick to Real Madrid.
|
5 |
+
The transfer window officially closes on January 31st, leaving clubs with limited time to finalize their deals.
|
6 |
+
Fans are eagerly awaiting to see which teams make the shrewdest moves in this crucial period.
|
examples/example_text_real.txt
CHANGED
@@ -2,4 +2,4 @@ Leeds are targeting a move for Southampton striker Cameron Archer with early tal
|
|
2 |
|
3 |
It is unclear whether a deal can be achieved but the 23-year-old is open to a move before deadline day.
|
4 |
|
5 |
-
Other options are believed to be on the table as Archer seeks a guaranteed starting role after increasingly finding himself on the bench under recently appointed Saints manager Ivan Juric.
|
|
|
2 |
|
3 |
It is unclear whether a deal can be achieved but the 23-year-old is open to a move before deadline day.
|
4 |
|
5 |
+
Other options are believed to be on the table as Archer seeks a guaranteed starting role after increasingly finding himself on the bench under recently appointed Saints manager Ivan Juric.
|
examples/example_text_real_2.txt
CHANGED
@@ -4,4 +4,4 @@ The resignation brings a long political chapter to an end. Trudeau has been in o
|
|
4 |
|
5 |
Trudeau said he will remain at the helm until a new Liberal leader is selected.
|
6 |
|
7 |
-
But many questions remain for the party, including who will take over and how they will manage a looming federal election. So what happens next?
|
|
|
4 |
|
5 |
Trudeau said he will remain at the helm until a new Liberal leader is selected.
|
6 |
|
7 |
+
But many questions remain for the party, including who will take over and how they will manage a looming federal election. So what happens next?
|
gpt_test.py
CHANGED
@@ -1,34 +1,30 @@
|
|
1 |
import os
|
|
|
2 |
from dotenv import load_dotenv
|
3 |
from openai import AzureOpenAI
|
|
|
4 |
load_dotenv()
|
5 |
-
AZURE_OPENAI_API_KEY = os.getenv(
|
6 |
-
AZURE_OPENAI_ENDPOINT = os.getenv(
|
7 |
-
AZURE_OPENAI_API_VERSION = os.getenv(
|
8 |
|
9 |
azure_client = AzureOpenAI(
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
)
|
14 |
-
|
15 |
-
deplopment_name = "o1-mini"
|
16 |
TEXT_PROMPT = """
|
17 |
replace Ukraine with Denmark:
|
18 |
|
19 |
-
"Sir Keir Starmer has pledged to put Ukraine in the "strongest
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
Acknowledging the "hello" from Russia, Volodymyr Zelensky said Ukraine would send its own "hello back".
|
24 |
-
|
25 |
-
An estimated one million people have been killed or wounded in the war so far. As the invasion reaches the end of its third year, Ukraine is losing territory in the east.
|
26 |
-
|
27 |
-
Zelensky praised the UK's commitment on Thursday, amid wider concerns that the US President-elect Donald Trump, who is set to take office on Monday, could potentially reduce aid."
|
28 |
"""
|
29 |
-
|
30 |
response = azure_client.chat.completions.create(
|
31 |
-
model=deplopment_name,
|
32 |
messages=[
|
33 |
# {"role": "system", "content": "You are a helpful assistant."},
|
34 |
{"role": "user", "content": TEXT_PROMPT},
|
@@ -36,4 +32,4 @@ response = azure_client.chat.completions.create(
|
|
36 |
# max_tokens=512,
|
37 |
# temperature=0,
|
38 |
)
|
39 |
-
print(response.choices[0].message.content)
|
|
|
1 |
import os
|
2 |
+
|
3 |
from dotenv import load_dotenv
|
4 |
from openai import AzureOpenAI
|
5 |
+
|
6 |
load_dotenv()
|
7 |
+
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
|
8 |
+
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
|
9 |
+
AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION")
|
10 |
|
11 |
azure_client = AzureOpenAI(
|
12 |
+
azure_endpoint="https://quoc-nguyen.openai.azure.com/",
|
13 |
+
api_key=AZURE_OPENAI_API_KEY,
|
14 |
+
api_version="2024-05-01-preview",
|
15 |
)
|
16 |
+
|
17 |
+
deplopment_name = "o1-mini" # or "gpt-4o"
|
18 |
TEXT_PROMPT = """
|
19 |
replace Ukraine with Denmark:
|
20 |
|
21 |
+
"Sir Keir Starmer has pledged to put Ukraine in the "strongest
|
22 |
+
possible position" on a trip to Kyiv where he signed a
|
23 |
+
"landmark" 100-year pact with the war-stricken country.
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
"""
|
25 |
+
|
26 |
response = azure_client.chat.completions.create(
|
27 |
+
model=deplopment_name, # model = "deployment_name".
|
28 |
messages=[
|
29 |
# {"role": "system", "content": "You are a helpful assistant."},
|
30 |
{"role": "user", "content": TEXT_PROMPT},
|
|
|
32 |
# max_tokens=512,
|
33 |
# temperature=0,
|
34 |
)
|
35 |
+
print(response.choices[0].message.content)
|
requirements.txt
CHANGED
@@ -28,4 +28,4 @@ pytorch_lightning
|
|
28 |
torchvision
|
29 |
torch
|
30 |
lightning
|
31 |
-
timm
|
|
|
28 |
torchvision
|
29 |
torch
|
30 |
lightning
|
31 |
+
timm
|
src/application/content_detection.py
CHANGED
@@ -1,49 +1,63 @@
|
|
1 |
from difflib import SequenceMatcher
|
2 |
|
3 |
import pandas as pd
|
4 |
-
|
5 |
-
from src.application.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
from src.application.text.helper import extract_equal_text
|
7 |
from src.application.text.model_detection import detect_text_by_ai_model
|
8 |
from src.application.text.preprocessing import split_into_paragraphs
|
9 |
-
from src.application.text.search_detection import
|
|
|
|
|
|
|
|
|
10 |
|
11 |
|
12 |
-
class NewsVerification
|
13 |
def __init__(self):
|
14 |
self.news_text = ""
|
15 |
self.news_title = ""
|
16 |
self.news_content = ""
|
17 |
self.news_image = ""
|
18 |
-
|
19 |
-
self.text_prediction_label:list[str] = []
|
20 |
-
self.text_prediction_score:list[float] = []
|
21 |
-
self.text_referent_url:list[str] = []
|
22 |
-
self.image_prediction_label:list[str] = []
|
23 |
-
self.image_prediction_score:list[str] = []
|
24 |
-
self.image_referent_url:list[str] = []
|
25 |
self.news_prediction_label = ""
|
26 |
self.news_prediction_score = -1
|
27 |
-
|
28 |
-
self.found_img_url:list[str] = [
|
29 |
-
self.aligned_sentences:list[dict] = []
|
30 |
-
self.aligned_sentences_df:pd.DataFrame = pd.DataFrame(
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
self.
|
43 |
-
|
44 |
-
self.
|
|
|
|
|
45 |
self.entities_with_colors = []
|
46 |
-
|
47 |
def load_news(self, news_title, news_content, news_image):
|
48 |
self.news_text = news_title + "\n\n" + news_content
|
49 |
self.news_title = news_title
|
@@ -52,13 +66,14 @@ class NewsVerification():
|
|
52 |
|
53 |
def determine_text_origin(self):
|
54 |
"""
|
55 |
-
Determines the origin of the given text based on paraphrasing detection
|
|
|
56 |
|
57 |
Args:
|
58 |
text: The input text to be analyzed.
|
59 |
|
60 |
Returns:
|
61 |
-
str: The predicted origin of the text:
|
62 |
- "HUMAN": If the text is likely written by a human.
|
63 |
- "MACHINE": If the text is likely generated by a machine.
|
64 |
"""
|
@@ -75,7 +90,7 @@ class NewsVerification():
|
|
75 |
"similarity": None,
|
76 |
"paraphrase": False,
|
77 |
"url": "",
|
78 |
-
|
79 |
|
80 |
for index, sentence in enumerate(input_sentences):
|
81 |
print(f"-------index = {index}-------")
|
@@ -83,10 +98,20 @@ class NewsVerification():
|
|
83 |
|
84 |
if current_index >= len(input_sentences):
|
85 |
break
|
86 |
-
if
|
|
|
|
|
|
|
|
|
87 |
continue
|
88 |
-
|
89 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
90 |
|
91 |
if paraphrase is False:
|
92 |
# add sentence to ai_sentence
|
@@ -95,19 +120,27 @@ class NewsVerification():
|
|
95 |
ai_sentence["input_sentence"] += sentence
|
96 |
if index == len(input_sentences) - 1:
|
97 |
# add ai_sentences to align_sentences
|
98 |
-
text_prediction_label, text_prediction_score =
|
|
|
|
|
99 |
ai_sentence["label"] = text_prediction_label
|
100 |
ai_sentence["similarity"] = text_prediction_score
|
101 |
self.aligned_sentences.append(ai_sentence)
|
102 |
else:
|
103 |
if previous_paraphrase is False or previous_paraphrase is None:
|
104 |
# add ai_sentences to align_sentences
|
105 |
-
if ai_sentence[
|
106 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
107 |
ai_sentence["label"] = text_prediction_label
|
108 |
ai_sentence["similarity"] = text_prediction_score
|
109 |
self.aligned_sentences.append(ai_sentence)
|
110 |
-
|
111 |
# reset
|
112 |
ai_sentence = {
|
113 |
"input_sentence": "",
|
@@ -116,7 +149,7 @@ class NewsVerification():
|
|
116 |
"similarity": None,
|
117 |
"paraphrase": False,
|
118 |
"url": "",
|
119 |
-
|
120 |
|
121 |
# add searched_sentences to align_sentences
|
122 |
if searched_sentences["input_sentence"] != "":
|
@@ -125,20 +158,21 @@ class NewsVerification():
|
|
125 |
searched_sentences["label"] = "HUMAN"
|
126 |
else:
|
127 |
searched_sentences["label"] = "MACHINE"
|
128 |
-
|
129 |
self.aligned_sentences.append(searched_sentences)
|
130 |
|
131 |
previous_paraphrase = paraphrase
|
132 |
|
133 |
def determine_text_origin_2(self):
|
134 |
"""
|
135 |
-
Determines the origin of the given text based on paraphrasing detection
|
|
|
136 |
|
137 |
Args:
|
138 |
text: The input text to be analyzed.
|
139 |
|
140 |
Returns:
|
141 |
-
str: The predicted origin of the text:
|
142 |
- "HUMAN": If the text is likely written by a human.
|
143 |
- "MACHINE": If the text is likely generated by a machine.
|
144 |
"""
|
@@ -150,17 +184,17 @@ class NewsVerification():
|
|
150 |
self.aligned_sentences_df = pd.concat(
|
151 |
[self.aligned_sentences_df, pd.DataFrame([{}])],
|
152 |
ignore_index=False,
|
153 |
-
|
154 |
|
155 |
for index, sentence in enumerate(input_sentences):
|
156 |
print(f"-------index = {index}-------")
|
157 |
print(f"current_sentence = {input_sentences[index]}")
|
158 |
-
|
159 |
if self.aligned_sentences_df["url"] is not None:
|
160 |
continue
|
161 |
|
162 |
self.aligned_sentences_df, img_urls = find_text_source(
|
163 |
-
input_sentences[index],
|
164 |
self.aligned_sentences_df,
|
165 |
)
|
166 |
|
@@ -171,25 +205,30 @@ class NewsVerification():
|
|
171 |
self.image_prediction_score = 0.0
|
172 |
self.image_referent_url = None
|
173 |
return
|
174 |
-
|
175 |
for image in self.found_img_url:
|
176 |
-
print(f"\tfound_img_url: {image}")
|
177 |
-
matched_url, similarity = detect_image_from_news_image(
|
|
|
|
|
|
|
178 |
if matched_url is not None:
|
179 |
print(f"matching image: {matched_url}\nsimilarity: {similarity}\n")
|
180 |
self.image_prediction_label = "HUMAN"
|
181 |
self.image_prediction_score = similarity
|
182 |
self.image_referent_url = matched_url
|
183 |
return
|
184 |
-
|
185 |
-
matched_url, similarity = detect_image_by_reverse_search(
|
|
|
|
|
186 |
if matched_url is not None:
|
187 |
print(f"matching image: {matched_url}\nsimilarity: {similarity}\n")
|
188 |
self.image_prediction_label = "HUMAN"
|
189 |
self.image_prediction_score = similarity
|
190 |
self.image_referent_url = matched_url
|
191 |
return
|
192 |
-
|
193 |
detected_label, score = detect_image_by_ai_model(self.news_image)
|
194 |
if detected_label:
|
195 |
print(f"detected_label: {detected_label} ({score})")
|
@@ -197,7 +236,7 @@ class NewsVerification():
|
|
197 |
self.image_prediction_score = score
|
198 |
self.image_referent_url = None
|
199 |
return
|
200 |
-
|
201 |
self.image_prediction_label = "UNKNOWN"
|
202 |
self.image_prediction_score = 50
|
203 |
self.image_referent_url = None
|
@@ -209,15 +248,17 @@ class NewsVerification():
|
|
209 |
text_prediction_score = 50
|
210 |
else:
|
211 |
text_prediction_score = self.text_prediction_score
|
212 |
-
|
213 |
if self.image_prediction_label == "MACHINE":
|
214 |
image_prediction_score = 100 - self.image_prediction_score
|
215 |
elif self.image_prediction_label == "UNKNOWN":
|
216 |
image_prediction_score = 50
|
217 |
else:
|
218 |
image_prediction_score = self.image_prediction_score
|
219 |
-
|
220 |
-
news_prediction_score = (
|
|
|
|
|
221 |
if news_prediction_score > 50:
|
222 |
self.news_prediction_score = news_prediction_score
|
223 |
self.news_prediction_label = "HUMAN"
|
@@ -234,37 +275,25 @@ class NewsVerification():
|
|
234 |
for index, aligned_sentence in enumerate(self.aligned_sentences):
|
235 |
# Get entity-words (in pair) with colors
|
236 |
entities_with_colors = highlight_entities(
|
237 |
-
|
238 |
-
|
239 |
-
|
240 |
self.aligned_sentences[index]["entities"] = entities_with_colors
|
241 |
-
|
242 |
ordinary_user_table = self.create_ordinary_user_table()
|
243 |
fact_checker_table = self.create_fact_checker_table()
|
244 |
governor_table = self.create_governor_table()
|
245 |
|
246 |
return ordinary_user_table, fact_checker_table, governor_table
|
247 |
-
|
248 |
def get_text_urls(self):
|
249 |
return set(self.text_referent_url)
|
250 |
|
251 |
-
|
252 |
def compare_sentences(self, sentence_1, sentence_2, position, color):
|
253 |
"""
|
254 |
-
Compares two sentences and identifies common phrases,
|
255 |
-
|
256 |
-
Args:
|
257 |
-
sentence_1: The first sentence (string).
|
258 |
-
sentence_2: The second sentence (string).
|
259 |
|
260 |
-
Returns:
|
261 |
-
A list of dictionaries, where each dictionary represents a common phrase and contains:
|
262 |
-
- "phrase": The common phrase (string).
|
263 |
-
- "start_1": The starting index of the phrase in sentence_1 (int).
|
264 |
-
- "end_1": The ending index of the phrase in sentence_1 (int).
|
265 |
-
- "start_2": The starting index of the phrase in sentence_2 (int).
|
266 |
-
- "end_2": The ending index of the phrase in sentence_2 (int).
|
267 |
-
Returns an empty list if no common phrases are found. Handles edge cases like empty strings.
|
268 |
"""
|
269 |
|
270 |
if not sentence_1 or not sentence_2: # Handle empty strings
|
@@ -280,16 +309,20 @@ class NewsVerification():
|
|
280 |
start_2 = block.b
|
281 |
end_2 = block.b + block.size
|
282 |
|
283 |
-
phrase = sentence_1[
|
284 |
-
|
285 |
-
|
286 |
-
|
287 |
-
|
288 |
-
|
289 |
-
|
290 |
-
|
291 |
-
|
292 |
-
|
|
|
|
|
|
|
|
|
293 |
position += len(sentence_1)
|
294 |
return common_phrases, position
|
295 |
|
@@ -297,17 +330,17 @@ class NewsVerification():
|
|
297 |
rows = []
|
298 |
max_length = 30 # TODO: put this in configuration
|
299 |
rows.append(self.format_image_fact_checker_row(max_length))
|
300 |
-
|
301 |
for aligned_sentence in self.aligned_sentences:
|
302 |
if "input_sentence" not in aligned_sentence:
|
303 |
continue
|
304 |
-
|
305 |
# Get index of equal phrases in input and source sentences
|
306 |
equal_idx_1, equal_idx_2 = extract_equal_text(
|
307 |
-
|
308 |
-
|
309 |
-
|
310 |
-
|
311 |
# Get entity-words (in pair) with colors
|
312 |
# entities_with_colors = highlight_entities(
|
313 |
# aligned_sentence["input_sentence"],
|
@@ -320,32 +353,35 @@ class NewsVerification():
|
|
320 |
equal_idx_1,
|
321 |
equal_idx_2,
|
322 |
aligned_sentence["entities"],
|
323 |
-
]
|
324 |
)
|
325 |
|
326 |
for row in self.fact_checker_table:
|
327 |
formatted_row = self.format_text_fact_checker_row(row, max_length)
|
328 |
rows.append(formatted_row)
|
329 |
-
|
330 |
table = "\n".join(rows)
|
331 |
return f"""
|
332 |
-
|
333 |
-
|
334 |
-
|
335 |
-
|
336 |
-
|
337 |
-
|
338 |
-
|
339 |
-
|
340 |
-
|
341 |
-
|
342 |
-
</
|
343 |
-
<
|
344 |
-
|
345 |
-
|
346 |
-
|
347 |
-
|
348 |
-
|
|
|
|
|
|
|
349 |
"""
|
350 |
|
351 |
def format_text_fact_checker_row(self, row, max_length=30):
|
@@ -354,50 +390,76 @@ class NewsVerification():
|
|
354 |
return ""
|
355 |
if row[0]["matched_sentence"] != "": # source is not empty
|
356 |
# highlight entities
|
357 |
-
input_sentence, highlight_idx_input = apply_highlight(
|
358 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
359 |
entity_count = len(row[3])
|
360 |
-
|
361 |
# Color overlapping words
|
362 |
-
input_sentence = self.color_text(
|
363 |
-
|
364 |
-
|
365 |
-
|
366 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
367 |
else:
|
368 |
input_sentence = row[0]["input_sentence"]
|
369 |
source_sentence = row[0]["matched_sentence"]
|
370 |
|
371 |
label = row[0]["label"]
|
372 |
score = row[0]["similarity"]
|
373 |
-
|
374 |
-
url = row[0]["url"]
|
375 |
short_url = self.shorten_url(url, max_length)
|
376 |
source_text_url = f"""<a href="{url}">{short_url}</a>"""
|
377 |
-
|
378 |
entity_count_text = self.get_entity_count_text(entity_count)
|
379 |
-
|
380 |
return f"""
|
381 |
<tr>
|
382 |
<td>{input_sentence}</td>
|
383 |
<td>{source_sentence}</td>
|
384 |
-
<td>{label}<br>({score*100:.2f}%)<br><br>{entity_count_text}</td>
|
385 |
<td>{source_text_url}</td>
|
386 |
</tr>
|
387 |
"""
|
388 |
|
389 |
-
def format_image_fact_checker_row(self, max_length=30):
|
390 |
-
|
391 |
-
if
|
392 |
-
|
|
|
|
|
|
|
393 |
short_url = self.shorten_url(self.image_referent_url, max_length)
|
394 |
-
source_image_url =
|
|
|
|
|
395 |
else:
|
396 |
source_image = "Image not found"
|
397 |
source_image_url = ""
|
398 |
|
399 |
-
return f"""<tr><td>input image</td><td>{source_image}</td><td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td><td>{source_image_url}</td></tr>"""
|
400 |
-
|
401 |
|
402 |
def create_ordinary_user_table(self):
|
403 |
rows = []
|
@@ -405,24 +467,27 @@ class NewsVerification():
|
|
405 |
rows.append(self.format_image_ordinary_user_row(max_length))
|
406 |
rows.append(self.format_text_ordinary_user_row(max_length))
|
407 |
table = "\n".join(rows)
|
408 |
-
|
409 |
return f"""
|
410 |
-
|
411 |
-
|
412 |
-
|
413 |
-
|
414 |
-
|
415 |
-
|
416 |
-
|
417 |
-
|
418 |
-
|
419 |
-
</
|
420 |
-
<
|
421 |
-
|
422 |
-
|
423 |
-
|
424 |
-
|
425 |
-
|
|
|
|
|
|
|
426 |
"""
|
427 |
|
428 |
def format_text_ordinary_user_row(self, max_length=30):
|
@@ -436,152 +501,184 @@ class NewsVerification():
|
|
436 |
continue
|
437 |
input_sentences += row["input_sentence"] + "<br><br>"
|
438 |
label = self.aligned_sentences[index]["label"]
|
439 |
-
|
440 |
-
url = self.aligned_sentences[index]["url"]
|
441 |
short_url = self.shorten_url(url, max_length)
|
442 |
source_text_urls += f"""<a href="{url}">{short_url}</a><br>"""
|
443 |
sentence_count += 1
|
444 |
-
|
445 |
scores, label = self.calculate_score_label()
|
446 |
-
|
447 |
return f"""
|
448 |
<tr>
|
449 |
<td>{input_sentences}</td>
|
450 |
-
<td>{label}<br>({scores*100:.2f}%)</td>
|
451 |
<td>{source_text_urls}</td>
|
452 |
</tr>
|
453 |
"""
|
454 |
|
455 |
-
def format_image_ordinary_user_row(self, max_length=30):
|
456 |
-
|
457 |
-
if
|
458 |
-
|
|
|
|
|
459 |
short_url = self.shorten_url(self.image_referent_url, max_length)
|
460 |
-
source_image_url =
|
|
|
|
|
461 |
else:
|
462 |
# source_image = "Image not found"
|
463 |
source_image_url = ""
|
464 |
|
465 |
-
return f"""<tr><td>input image</td><td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td><td>{source_image_url}</td></tr>"""
|
466 |
-
|
467 |
|
468 |
def create_governor_table(self):
|
469 |
rows = []
|
470 |
max_length = 30 # TODO: put this in configuration
|
471 |
rows.append(self.format_image_governor_row(max_length))
|
472 |
-
|
473 |
for aligned_sentence in self.aligned_sentences:
|
474 |
if "input_sentence" not in aligned_sentence:
|
475 |
continue
|
476 |
-
|
477 |
# Get index of equal phrases in input and source sentences
|
478 |
equal_idx_1, equal_idx_2 = extract_equal_text(
|
479 |
-
|
480 |
-
|
481 |
-
|
482 |
-
|
483 |
# Get entity-words (in pair) with colors
|
484 |
# entities_with_colors = highlight_entities(
|
485 |
# aligned_sentence["input_sentence"],
|
486 |
# aligned_sentence["matched_sentence"],
|
487 |
# )
|
488 |
-
|
489 |
self.governor_table.append(
|
490 |
[
|
491 |
aligned_sentence,
|
492 |
equal_idx_1,
|
493 |
equal_idx_2,
|
494 |
aligned_sentence["entities"],
|
495 |
-
]
|
496 |
)
|
497 |
|
498 |
formatted_row = self.format_text_governor_row(max_length)
|
499 |
rows.append(formatted_row)
|
500 |
-
|
501 |
table = "\n".join(rows)
|
502 |
return f"""
|
503 |
-
|
504 |
-
|
505 |
-
|
506 |
-
|
507 |
-
|
508 |
-
|
509 |
-
|
510 |
-
|
511 |
-
|
512 |
-
|
513 |
-
|
514 |
-
|
515 |
-
|
516 |
-
|
517 |
-
|
518 |
-
|
519 |
-
|
|
|
|
|
|
|
520 |
"""
|
521 |
|
522 |
-
def format_text_governor_row(self,
|
523 |
input_sentences = ""
|
524 |
source_sentences = ""
|
525 |
source_text_urls = ""
|
526 |
label = ""
|
527 |
-
scores = 0
|
528 |
sentence_count = 0
|
529 |
entity_count = 0
|
530 |
for row in self.governor_table:
|
531 |
print(f"governor_row: {row}")
|
532 |
if row[0]["input_sentence"] == "":
|
533 |
continue
|
534 |
-
|
535 |
if row[0]["matched_sentence"] != "": # source is not empty
|
536 |
# highlight entities
|
537 |
-
input_sentence, highlight_idx_input = apply_highlight(
|
538 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
539 |
entity_count += len(row[3])
|
540 |
-
|
541 |
# Color overlapping words
|
542 |
-
input_sentence = self.color_text(
|
543 |
-
|
544 |
-
|
545 |
-
|
546 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
547 |
|
548 |
else:
|
549 |
input_sentence = row[0]["input_sentence"]
|
550 |
source_sentence = row[0]["matched_sentence"]
|
551 |
-
|
552 |
-
# convert score to HUMAN-based score:
|
553 |
input_sentences += input_sentence + "<br><br>"
|
554 |
source_sentences += source_sentence + "<br><br>"
|
555 |
-
|
556 |
-
|
557 |
url = row[0]["url"]
|
558 |
short_url = self.shorten_url(url, max_length)
|
559 |
source_text_urls += f"""<a href="{url}">{short_url}</a><br>"""
|
560 |
sentence_count += 1
|
561 |
-
|
562 |
score, label = self.calculate_score_label()
|
563 |
entity_count_text = self.get_entity_count_text(entity_count)
|
564 |
|
565 |
return f"""
|
566 |
-
|
567 |
-
|
568 |
-
|
569 |
-
|
570 |
-
|
571 |
-
|
572 |
"""
|
573 |
|
574 |
def format_image_governor_row(self, max_length=30):
|
575 |
-
if
|
576 |
-
|
|
|
|
|
|
|
577 |
short_url = self.shorten_url(self.image_referent_url, max_length)
|
578 |
-
source_image_url =
|
|
|
|
|
579 |
else:
|
580 |
source_image = "Image not found"
|
581 |
source_image_url = ""
|
582 |
|
583 |
-
return f"""<tr><td>input image</td><td>{source_image}</td><td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td><td>{source_image_url}</td></tr>"""
|
584 |
-
|
585 |
|
586 |
def get_entity_count_text(self, entity_count):
|
587 |
if entity_count <= 0:
|
@@ -595,52 +692,51 @@ class NewsVerification():
|
|
595 |
def shorten_url(self, url, max_length=30):
|
596 |
if url is None:
|
597 |
return ""
|
598 |
-
|
599 |
if len(url) > max_length:
|
600 |
short_url = url[:max_length] + "..."
|
601 |
else:
|
602 |
short_url = url
|
603 |
return short_url
|
604 |
|
605 |
-
|
606 |
def color_text(self, text, colored_idx, highlighted_idx):
|
607 |
paragraph = ""
|
608 |
words = text.split()
|
609 |
-
|
610 |
starts, ends = self.extract_starts_ends(colored_idx)
|
611 |
starts, ends = self.filter_indices(starts, ends, highlighted_idx)
|
612 |
|
613 |
previous_end = 0
|
614 |
for start, end in zip(starts, ends):
|
615 |
paragraph += " ".join(words[previous_end:start])
|
616 |
-
|
617 |
equal_words = " ".join(words[start:end])
|
618 |
paragraph += f" <span style='color:#00FF00;'>{equal_words}</span> "
|
619 |
-
|
620 |
previous_end = end
|
621 |
-
|
622 |
-
# Some left words due to the punctuation separated from
|
623 |
# the highlighting text
|
624 |
equal_words = " ".join(words[previous_end:])
|
625 |
print(f"starts_2: {previous_end}")
|
626 |
-
print(f"ends_2: {len(words)-1}")
|
627 |
print(f"equal_words: {words[previous_end:]}")
|
628 |
paragraph += f" <span style='color:#00FF00;'>{equal_words}</span> "
|
629 |
|
630 |
return paragraph
|
631 |
-
|
632 |
def extract_starts_ends(self, colored_idx):
|
633 |
starts = []
|
634 |
ends = []
|
635 |
for index in colored_idx:
|
636 |
-
starts.append(index[
|
637 |
-
ends.append(index[
|
638 |
return starts, ends
|
639 |
-
|
640 |
-
|
641 |
def filter_indices(self, starts, ends, ignore_indices):
|
642 |
"""
|
643 |
-
Filters start and end indices to exclude any indices present in the
|
|
|
644 |
|
645 |
Args:
|
646 |
starts: A list of starting indices.
|
@@ -648,23 +744,26 @@ class NewsVerification():
|
|
648 |
ignore_indices: A list of indices to exclude.
|
649 |
|
650 |
Returns:
|
651 |
-
A tuple
|
652 |
-
Returns empty lists if the input is invalid
|
|
|
653 |
Prints error messages for invalid input.
|
654 |
-
|
655 |
Examples:
|
656 |
starts = [0, 5, 10]
|
657 |
ends = [3, 7, 12]
|
658 |
ignore_indices = [1, 2, 11, 17]
|
659 |
-
|
660 |
-
# Output:
|
661 |
starts = [0, 3, 5, 10, 12]
|
662 |
ends = [0, 3, 7, 10, 12]
|
663 |
|
664 |
"""
|
665 |
|
666 |
if len(starts) != len(ends):
|
667 |
-
print(
|
|
|
|
|
668 |
return [], []
|
669 |
|
670 |
filtered_starts = []
|
@@ -675,10 +774,11 @@ class NewsVerification():
|
|
675 |
end = ends[i]
|
676 |
|
677 |
if end < start:
|
678 |
-
print(
|
|
|
|
|
679 |
return [], []
|
680 |
|
681 |
-
|
682 |
start_end = list(range(start, end + 1, 1))
|
683 |
start_end = list(set(start_end) - set(ignore_indices))
|
684 |
new_start, new_end = self.extract_sequences(start_end)
|
@@ -690,7 +790,7 @@ class NewsVerification():
|
|
690 |
def extract_sequences(self, numbers):
|
691 |
if len(numbers) == 1:
|
692 |
return [numbers[0]], [numbers[0]]
|
693 |
-
|
694 |
numbers.sort()
|
695 |
starts = []
|
696 |
ends = []
|
@@ -699,21 +799,21 @@ class NewsVerification():
|
|
699 |
start = number
|
700 |
end = number
|
701 |
continue
|
702 |
-
|
703 |
-
if number - 1 == numbers[i-1]:
|
704 |
end = number
|
705 |
else:
|
706 |
starts.append(start)
|
707 |
ends.append(end + 1)
|
708 |
start = number
|
709 |
end = number
|
710 |
-
|
711 |
if i == len(numbers) - 1:
|
712 |
starts.append(start)
|
713 |
ends.append(end + 1)
|
714 |
-
|
715 |
return starts, ends
|
716 |
-
|
717 |
def calculate_score_label(self):
|
718 |
human_score = []
|
719 |
machine_score = []
|
@@ -726,7 +826,7 @@ class NewsVerification():
|
|
726 |
elif sentence["label"] == "MACHINE":
|
727 |
machine_score.append(1 - sentence["similarity"])
|
728 |
machine_flag = True
|
729 |
-
|
730 |
if machine_flag is True and len(machine_score) > 0:
|
731 |
# average value of machine_score
|
732 |
machine_score_avg = sum(machine_score) / len(machine_score)
|
@@ -739,5 +839,3 @@ class NewsVerification():
|
|
739 |
return human_score_avg, "HUMAN"
|
740 |
else:
|
741 |
return 0, "UNKNOWN"
|
742 |
-
|
743 |
-
|
|
|
1 |
from difflib import SequenceMatcher
|
2 |
|
3 |
import pandas as pd
|
4 |
+
|
5 |
+
from src.application.image.image_detection import (
|
6 |
+
detect_image_by_ai_model,
|
7 |
+
detect_image_by_reverse_search,
|
8 |
+
detect_image_from_news_image,
|
9 |
+
)
|
10 |
+
from src.application.text.entity import (
|
11 |
+
apply_highlight,
|
12 |
+
highlight_entities,
|
13 |
+
)
|
14 |
from src.application.text.helper import extract_equal_text
|
15 |
from src.application.text.model_detection import detect_text_by_ai_model
|
16 |
from src.application.text.preprocessing import split_into_paragraphs
|
17 |
+
from src.application.text.search_detection import (
|
18 |
+
check_human,
|
19 |
+
detect_text_by_relative_search,
|
20 |
+
find_text_source,
|
21 |
+
)
|
22 |
|
23 |
|
24 |
+
class NewsVerification:
|
25 |
def __init__(self):
|
26 |
self.news_text = ""
|
27 |
self.news_title = ""
|
28 |
self.news_content = ""
|
29 |
self.news_image = ""
|
30 |
+
|
31 |
+
self.text_prediction_label: list[str] = []
|
32 |
+
self.text_prediction_score: list[float] = []
|
33 |
+
self.text_referent_url: list[str] = []
|
34 |
+
self.image_prediction_label: list[str] = []
|
35 |
+
self.image_prediction_score: list[str] = []
|
36 |
+
self.image_referent_url: list[str] = []
|
37 |
self.news_prediction_label = ""
|
38 |
self.news_prediction_score = -1
|
39 |
+
|
40 |
+
self.found_img_url: list[str] = []
|
41 |
+
self.aligned_sentences: list[dict] = []
|
42 |
+
self.aligned_sentences_df: pd.DataFrame = pd.DataFrame(
|
43 |
+
columns=[
|
44 |
+
"input_sentence",
|
45 |
+
"matched_sentence",
|
46 |
+
"label",
|
47 |
+
"similarity",
|
48 |
+
"paraphrase",
|
49 |
+
"url",
|
50 |
+
"group",
|
51 |
+
"entities",
|
52 |
+
],
|
53 |
+
)
|
54 |
+
self.is_paraphrased: list[bool] = []
|
55 |
+
|
56 |
+
self.ordinary_user_table: list = []
|
57 |
+
self.fact_checker_table: list = []
|
58 |
+
self.governor_table: list = []
|
59 |
self.entities_with_colors = []
|
60 |
+
|
61 |
def load_news(self, news_title, news_content, news_image):
|
62 |
self.news_text = news_title + "\n\n" + news_content
|
63 |
self.news_title = news_title
|
|
|
66 |
|
67 |
def determine_text_origin(self):
|
68 |
"""
|
69 |
+
Determines the origin of the given text based on paraphrasing detection
|
70 |
+
and human authorship analysis.
|
71 |
|
72 |
Args:
|
73 |
text: The input text to be analyzed.
|
74 |
|
75 |
Returns:
|
76 |
+
str: The predicted origin of the text:
|
77 |
- "HUMAN": If the text is likely written by a human.
|
78 |
- "MACHINE": If the text is likely generated by a machine.
|
79 |
"""
|
|
|
90 |
"similarity": None,
|
91 |
"paraphrase": False,
|
92 |
"url": "",
|
93 |
+
}
|
94 |
|
95 |
for index, sentence in enumerate(input_sentences):
|
96 |
print(f"-------index = {index}-------")
|
|
|
98 |
|
99 |
if current_index >= len(input_sentences):
|
100 |
break
|
101 |
+
if (
|
102 |
+
current_index > index
|
103 |
+
and index != 0
|
104 |
+
and index != len(input_sentences) - 1
|
105 |
+
):
|
106 |
continue
|
107 |
+
|
108 |
+
(
|
109 |
+
paraphrase,
|
110 |
+
text_url,
|
111 |
+
searched_sentences,
|
112 |
+
img_urls,
|
113 |
+
current_index,
|
114 |
+
) = detect_text_by_relative_search(input_sentences, index)
|
115 |
|
116 |
if paraphrase is False:
|
117 |
# add sentence to ai_sentence
|
|
|
120 |
ai_sentence["input_sentence"] += sentence
|
121 |
if index == len(input_sentences) - 1:
|
122 |
# add ai_sentences to align_sentences
|
123 |
+
text_prediction_label, text_prediction_score = (
|
124 |
+
detect_text_by_ai_model(ai_sentence["input_sentence"])
|
125 |
+
)
|
126 |
ai_sentence["label"] = text_prediction_label
|
127 |
ai_sentence["similarity"] = text_prediction_score
|
128 |
self.aligned_sentences.append(ai_sentence)
|
129 |
else:
|
130 |
if previous_paraphrase is False or previous_paraphrase is None:
|
131 |
# add ai_sentences to align_sentences
|
132 |
+
if ai_sentence[
|
133 |
+
"input_sentence"
|
134 |
+
] != "" or current_index >= len(input_sentences):
|
135 |
+
text_prediction_label, text_prediction_score = (
|
136 |
+
detect_text_by_ai_model(
|
137 |
+
ai_sentence["input_sentence"],
|
138 |
+
)
|
139 |
+
)
|
140 |
ai_sentence["label"] = text_prediction_label
|
141 |
ai_sentence["similarity"] = text_prediction_score
|
142 |
self.aligned_sentences.append(ai_sentence)
|
143 |
+
|
144 |
# reset
|
145 |
ai_sentence = {
|
146 |
"input_sentence": "",
|
|
|
149 |
"similarity": None,
|
150 |
"paraphrase": False,
|
151 |
"url": "",
|
152 |
+
}
|
153 |
|
154 |
# add searched_sentences to align_sentences
|
155 |
if searched_sentences["input_sentence"] != "":
|
|
|
158 |
searched_sentences["label"] = "HUMAN"
|
159 |
else:
|
160 |
searched_sentences["label"] = "MACHINE"
|
161 |
+
|
162 |
self.aligned_sentences.append(searched_sentences)
|
163 |
|
164 |
previous_paraphrase = paraphrase
|
165 |
|
166 |
def determine_text_origin_2(self):
|
167 |
"""
|
168 |
+
Determines the origin of the given text based on paraphrasing detection
|
169 |
+
and human authorship analysis.
|
170 |
|
171 |
Args:
|
172 |
text: The input text to be analyzed.
|
173 |
|
174 |
Returns:
|
175 |
+
str: The predicted origin of the text:
|
176 |
- "HUMAN": If the text is likely written by a human.
|
177 |
- "MACHINE": If the text is likely generated by a machine.
|
178 |
"""
|
|
|
184 |
self.aligned_sentences_df = pd.concat(
|
185 |
[self.aligned_sentences_df, pd.DataFrame([{}])],
|
186 |
ignore_index=False,
|
187 |
+
)
|
188 |
|
189 |
for index, sentence in enumerate(input_sentences):
|
190 |
print(f"-------index = {index}-------")
|
191 |
print(f"current_sentence = {input_sentences[index]}")
|
192 |
+
|
193 |
if self.aligned_sentences_df["url"] is not None:
|
194 |
continue
|
195 |
|
196 |
self.aligned_sentences_df, img_urls = find_text_source(
|
197 |
+
input_sentences[index],
|
198 |
self.aligned_sentences_df,
|
199 |
)
|
200 |
|
|
|
205 |
self.image_prediction_score = 0.0
|
206 |
self.image_referent_url = None
|
207 |
return
|
208 |
+
|
209 |
for image in self.found_img_url:
|
210 |
+
print(f"\tfound_img_url: {image}")
|
211 |
+
matched_url, similarity = detect_image_from_news_image(
|
212 |
+
self.news_image,
|
213 |
+
self.found_img_url,
|
214 |
+
)
|
215 |
if matched_url is not None:
|
216 |
print(f"matching image: {matched_url}\nsimilarity: {similarity}\n")
|
217 |
self.image_prediction_label = "HUMAN"
|
218 |
self.image_prediction_score = similarity
|
219 |
self.image_referent_url = matched_url
|
220 |
return
|
221 |
+
|
222 |
+
matched_url, similarity = detect_image_by_reverse_search(
|
223 |
+
self.news_image,
|
224 |
+
)
|
225 |
if matched_url is not None:
|
226 |
print(f"matching image: {matched_url}\nsimilarity: {similarity}\n")
|
227 |
self.image_prediction_label = "HUMAN"
|
228 |
self.image_prediction_score = similarity
|
229 |
self.image_referent_url = matched_url
|
230 |
return
|
231 |
+
|
232 |
detected_label, score = detect_image_by_ai_model(self.news_image)
|
233 |
if detected_label:
|
234 |
print(f"detected_label: {detected_label} ({score})")
|
|
|
236 |
self.image_prediction_score = score
|
237 |
self.image_referent_url = None
|
238 |
return
|
239 |
+
|
240 |
self.image_prediction_label = "UNKNOWN"
|
241 |
self.image_prediction_score = 50
|
242 |
self.image_referent_url = None
|
|
|
248 |
text_prediction_score = 50
|
249 |
else:
|
250 |
text_prediction_score = self.text_prediction_score
|
251 |
+
|
252 |
if self.image_prediction_label == "MACHINE":
|
253 |
image_prediction_score = 100 - self.image_prediction_score
|
254 |
elif self.image_prediction_label == "UNKNOWN":
|
255 |
image_prediction_score = 50
|
256 |
else:
|
257 |
image_prediction_score = self.image_prediction_score
|
258 |
+
|
259 |
+
news_prediction_score = (
|
260 |
+
text_prediction_score + image_prediction_score
|
261 |
+
) / 2
|
262 |
if news_prediction_score > 50:
|
263 |
self.news_prediction_score = news_prediction_score
|
264 |
self.news_prediction_label = "HUMAN"
|
|
|
275 |
for index, aligned_sentence in enumerate(self.aligned_sentences):
|
276 |
# Get entity-words (in pair) with colors
|
277 |
entities_with_colors = highlight_entities(
|
278 |
+
aligned_sentence["input_sentence"],
|
279 |
+
aligned_sentence["matched_sentence"],
|
280 |
+
)
|
281 |
self.aligned_sentences[index]["entities"] = entities_with_colors
|
282 |
+
|
283 |
ordinary_user_table = self.create_ordinary_user_table()
|
284 |
fact_checker_table = self.create_fact_checker_table()
|
285 |
governor_table = self.create_governor_table()
|
286 |
|
287 |
return ordinary_user_table, fact_checker_table, governor_table
|
288 |
+
|
289 |
def get_text_urls(self):
|
290 |
return set(self.text_referent_url)
|
291 |
|
|
|
292 |
def compare_sentences(self, sentence_1, sentence_2, position, color):
|
293 |
"""
|
294 |
+
Compares two sentences and identifies common phrases,
|
295 |
+
outputting their start and end positions.
|
|
|
|
|
|
|
296 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
297 |
"""
|
298 |
|
299 |
if not sentence_1 or not sentence_2: # Handle empty strings
|
|
|
309 |
start_2 = block.b
|
310 |
end_2 = block.b + block.size
|
311 |
|
312 |
+
phrase = sentence_1[
|
313 |
+
start_1:end_1
|
314 |
+
] # Or sentence_2[start_2:end_2], they are the same
|
315 |
+
|
316 |
+
common_phrases.append(
|
317 |
+
{
|
318 |
+
"phrase": phrase,
|
319 |
+
"start_1": start_1 + position,
|
320 |
+
"end_1": end_1 + position,
|
321 |
+
"start_2": start_2,
|
322 |
+
"end_2": end_2,
|
323 |
+
"color": color,
|
324 |
+
},
|
325 |
+
)
|
326 |
position += len(sentence_1)
|
327 |
return common_phrases, position
|
328 |
|
|
|
330 |
rows = []
|
331 |
max_length = 30 # TODO: put this in configuration
|
332 |
rows.append(self.format_image_fact_checker_row(max_length))
|
333 |
+
|
334 |
for aligned_sentence in self.aligned_sentences:
|
335 |
if "input_sentence" not in aligned_sentence:
|
336 |
continue
|
337 |
+
|
338 |
# Get index of equal phrases in input and source sentences
|
339 |
equal_idx_1, equal_idx_2 = extract_equal_text(
|
340 |
+
aligned_sentence["input_sentence"],
|
341 |
+
aligned_sentence["matched_sentence"],
|
342 |
+
)
|
343 |
+
|
344 |
# Get entity-words (in pair) with colors
|
345 |
# entities_with_colors = highlight_entities(
|
346 |
# aligned_sentence["input_sentence"],
|
|
|
353 |
equal_idx_1,
|
354 |
equal_idx_2,
|
355 |
aligned_sentence["entities"],
|
356 |
+
],
|
357 |
)
|
358 |
|
359 |
for row in self.fact_checker_table:
|
360 |
formatted_row = self.format_text_fact_checker_row(row, max_length)
|
361 |
rows.append(formatted_row)
|
362 |
+
|
363 |
table = "\n".join(rows)
|
364 |
return f"""
|
365 |
+
<h5>Comparison between input news and source news:</h5>
|
366 |
+
<table border="1" style="width:100%; text-align:left;">
|
367 |
+
<col style="width: 170px;">
|
368 |
+
<col style="width: 170px;">
|
369 |
+
<col style="width: 30px;">
|
370 |
+
<col style="width: 75px;">
|
371 |
+
<thead>
|
372 |
+
<tr>
|
373 |
+
<th>Input news</th>
|
374 |
+
<th>Source (corresponding URL provided in Originality)</th>
|
375 |
+
<th>Forensic</th>
|
376 |
+
<th>Originality</th>
|
377 |
+
</tr>
|
378 |
+
</thead>
|
379 |
+
<tbody>
|
380 |
+
{table}
|
381 |
+
</tbody>
|
382 |
+
</table>
|
383 |
+
|
384 |
+
<style>
|
385 |
"""
|
386 |
|
387 |
def format_text_fact_checker_row(self, row, max_length=30):
|
|
|
390 |
return ""
|
391 |
if row[0]["matched_sentence"] != "": # source is not empty
|
392 |
# highlight entities
|
393 |
+
input_sentence, highlight_idx_input = apply_highlight(
|
394 |
+
row[0]["input_sentence"],
|
395 |
+
row[3],
|
396 |
+
"input",
|
397 |
+
)
|
398 |
+
source_sentence, highlight_idx_source = apply_highlight(
|
399 |
+
row[0]["matched_sentence"],
|
400 |
+
row[3],
|
401 |
+
"source",
|
402 |
+
)
|
403 |
entity_count = len(row[3])
|
404 |
+
|
405 |
# Color overlapping words
|
406 |
+
input_sentence = self.color_text(
|
407 |
+
input_sentence,
|
408 |
+
row[1],
|
409 |
+
highlight_idx_input,
|
410 |
+
) # text, index of highlight words
|
411 |
+
source_sentence = self.color_text(
|
412 |
+
source_sentence,
|
413 |
+
row[2],
|
414 |
+
highlight_idx_source,
|
415 |
+
) # text, index of highlight words
|
416 |
+
|
417 |
+
input_sentence = input_sentence.replace(
|
418 |
+
"span_style",
|
419 |
+
"span style",
|
420 |
+
).replace("1px_4px", "1px 4px")
|
421 |
+
source_sentence = source_sentence.replace(
|
422 |
+
"span_style",
|
423 |
+
"span style",
|
424 |
+
).replace("1px_4px", "1px 4px")
|
425 |
else:
|
426 |
input_sentence = row[0]["input_sentence"]
|
427 |
source_sentence = row[0]["matched_sentence"]
|
428 |
|
429 |
label = row[0]["label"]
|
430 |
score = row[0]["similarity"]
|
431 |
+
|
432 |
+
url = row[0]["url"] #
|
433 |
short_url = self.shorten_url(url, max_length)
|
434 |
source_text_url = f"""<a href="{url}">{short_url}</a>"""
|
435 |
+
|
436 |
entity_count_text = self.get_entity_count_text(entity_count)
|
437 |
+
|
438 |
return f"""
|
439 |
<tr>
|
440 |
<td>{input_sentence}</td>
|
441 |
<td>{source_sentence}</td>
|
442 |
+
<td>{label}<br>({score * 100:.2f}%)<br><br>{entity_count_text}</td> # noqa: E501
|
443 |
<td>{source_text_url}</td>
|
444 |
</tr>
|
445 |
"""
|
446 |
|
447 |
+
def format_image_fact_checker_row(self, max_length=30):
|
448 |
+
|
449 |
+
if (
|
450 |
+
self.image_referent_url is not None
|
451 |
+
or self.image_referent_url != ""
|
452 |
+
):
|
453 |
+
source_image = f"""<img src="{self.image_referent_url}" width="200" height="150">""" # noqa: E501
|
454 |
short_url = self.shorten_url(self.image_referent_url, max_length)
|
455 |
+
source_image_url = (
|
456 |
+
f"""<a href="{self.image_referent_url}">{short_url}</a>"""
|
457 |
+
)
|
458 |
else:
|
459 |
source_image = "Image not found"
|
460 |
source_image_url = ""
|
461 |
|
462 |
+
return f"""<tr><td>input image</td><td>{source_image}</td><td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td><td>{source_image_url}</td></tr>""" # noqa: E501
|
|
|
463 |
|
464 |
def create_ordinary_user_table(self):
|
465 |
rows = []
|
|
|
467 |
rows.append(self.format_image_ordinary_user_row(max_length))
|
468 |
rows.append(self.format_text_ordinary_user_row(max_length))
|
469 |
table = "\n".join(rows)
|
470 |
+
|
471 |
return f"""
|
472 |
+
<h5>Comparison between input news and source news:</h5>
|
473 |
+
<table border="1" style="width:100%; text-align:left; border-collapse:collapse;"> # noqa: E501
|
474 |
+
<col style="width: 170px;">
|
475 |
+
<col style="width: 170px;">
|
476 |
+
<col style="width: 30px;">
|
477 |
+
<col style="width: 75px;">
|
478 |
+
<thead>
|
479 |
+
<tr>
|
480 |
+
<th>Input news</th>
|
481 |
+
<th>Forensic</th>
|
482 |
+
<th>Originality</th>
|
483 |
+
</tr>
|
484 |
+
</thead>
|
485 |
+
<tbody>
|
486 |
+
{table}
|
487 |
+
</tbody>
|
488 |
+
</table>
|
489 |
+
|
490 |
+
<style>
|
491 |
"""
|
492 |
|
493 |
def format_text_ordinary_user_row(self, max_length=30):
|
|
|
501 |
continue
|
502 |
input_sentences += row["input_sentence"] + "<br><br>"
|
503 |
label = self.aligned_sentences[index]["label"]
|
504 |
+
|
505 |
+
url = self.aligned_sentences[index]["url"] #
|
506 |
short_url = self.shorten_url(url, max_length)
|
507 |
source_text_urls += f"""<a href="{url}">{short_url}</a><br>"""
|
508 |
sentence_count += 1
|
509 |
+
|
510 |
scores, label = self.calculate_score_label()
|
511 |
+
|
512 |
return f"""
|
513 |
<tr>
|
514 |
<td>{input_sentences}</td>
|
515 |
+
<td>{label}<br>({scores * 100:.2f}%)</td>
|
516 |
<td>{source_text_urls}</td>
|
517 |
</tr>
|
518 |
"""
|
519 |
|
520 |
+
def format_image_ordinary_user_row(self, max_length=30):
|
521 |
+
|
522 |
+
if (
|
523 |
+
self.image_referent_url is not None
|
524 |
+
or self.image_referent_url != ""
|
525 |
+
):
|
526 |
short_url = self.shorten_url(self.image_referent_url, max_length)
|
527 |
+
source_image_url = (
|
528 |
+
f"""<a href="{self.image_referent_url}">{short_url}</a>"""
|
529 |
+
)
|
530 |
else:
|
531 |
# source_image = "Image not found"
|
532 |
source_image_url = ""
|
533 |
|
534 |
+
return f"""<tr><td>input image</td><td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td><td>{source_image_url}</td></tr>""" # noqa: E501
|
|
|
535 |
|
536 |
def create_governor_table(self):
|
537 |
rows = []
|
538 |
max_length = 30 # TODO: put this in configuration
|
539 |
rows.append(self.format_image_governor_row(max_length))
|
540 |
+
|
541 |
for aligned_sentence in self.aligned_sentences:
|
542 |
if "input_sentence" not in aligned_sentence:
|
543 |
continue
|
544 |
+
|
545 |
# Get index of equal phrases in input and source sentences
|
546 |
equal_idx_1, equal_idx_2 = extract_equal_text(
|
547 |
+
aligned_sentence["input_sentence"],
|
548 |
+
aligned_sentence["matched_sentence"],
|
549 |
+
)
|
550 |
+
|
551 |
# Get entity-words (in pair) with colors
|
552 |
# entities_with_colors = highlight_entities(
|
553 |
# aligned_sentence["input_sentence"],
|
554 |
# aligned_sentence["matched_sentence"],
|
555 |
# )
|
556 |
+
|
557 |
self.governor_table.append(
|
558 |
[
|
559 |
aligned_sentence,
|
560 |
equal_idx_1,
|
561 |
equal_idx_2,
|
562 |
aligned_sentence["entities"],
|
563 |
+
],
|
564 |
)
|
565 |
|
566 |
formatted_row = self.format_text_governor_row(max_length)
|
567 |
rows.append(formatted_row)
|
568 |
+
|
569 |
table = "\n".join(rows)
|
570 |
return f"""
|
571 |
+
<h5>Comparison between input news and source news:</h5>
|
572 |
+
<table border="1" style="width:100%; text-align:left;">
|
573 |
+
<col style="width: 170px;">
|
574 |
+
<col style="width: 170px;">
|
575 |
+
<col style="width: 30px;">
|
576 |
+
<col style="width: 75px;">
|
577 |
+
<thead>
|
578 |
+
<tr>
|
579 |
+
<th>Input news</th>
|
580 |
+
<th>Source (corresponding URL provided in Originality)</th>
|
581 |
+
<th>Forensic</th>
|
582 |
+
<th>Originality</th>
|
583 |
+
</tr>
|
584 |
+
</thead>
|
585 |
+
<tbody>
|
586 |
+
{table}
|
587 |
+
</tbody>
|
588 |
+
</table>
|
589 |
+
|
590 |
+
<style>
|
591 |
"""
|
592 |
|
593 |
+
def format_text_governor_row(self, max_length=30):
|
594 |
input_sentences = ""
|
595 |
source_sentences = ""
|
596 |
source_text_urls = ""
|
597 |
label = ""
|
|
|
598 |
sentence_count = 0
|
599 |
entity_count = 0
|
600 |
for row in self.governor_table:
|
601 |
print(f"governor_row: {row}")
|
602 |
if row[0]["input_sentence"] == "":
|
603 |
continue
|
604 |
+
|
605 |
if row[0]["matched_sentence"] != "": # source is not empty
|
606 |
# highlight entities
|
607 |
+
input_sentence, highlight_idx_input = apply_highlight(
|
608 |
+
row[0]["input_sentence"],
|
609 |
+
row[3],
|
610 |
+
"input",
|
611 |
+
entity_count,
|
612 |
+
)
|
613 |
+
source_sentence, highlight_idx_source = apply_highlight(
|
614 |
+
row[0]["matched_sentence"],
|
615 |
+
row[3],
|
616 |
+
"source",
|
617 |
+
entity_count,
|
618 |
+
)
|
619 |
entity_count += len(row[3])
|
620 |
+
|
621 |
# Color overlapping words
|
622 |
+
input_sentence = self.color_text(
|
623 |
+
input_sentence,
|
624 |
+
row[1],
|
625 |
+
highlight_idx_input,
|
626 |
+
) # text, index of highlight words
|
627 |
+
source_sentence = self.color_text(
|
628 |
+
source_sentence,
|
629 |
+
row[2],
|
630 |
+
highlight_idx_source,
|
631 |
+
) # text, index of highlight words
|
632 |
+
|
633 |
+
input_sentence = input_sentence.replace(
|
634 |
+
"span_style",
|
635 |
+
"span style",
|
636 |
+
).replace("1px_4px", "1px 4px")
|
637 |
+
source_sentence = source_sentence.replace(
|
638 |
+
"span_style",
|
639 |
+
"span style",
|
640 |
+
).replace("1px_4px", "1px 4px")
|
641 |
|
642 |
else:
|
643 |
input_sentence = row[0]["input_sentence"]
|
644 |
source_sentence = row[0]["matched_sentence"]
|
645 |
+
|
646 |
+
# convert score to HUMAN-based score:
|
647 |
input_sentences += input_sentence + "<br><br>"
|
648 |
source_sentences += source_sentence + "<br><br>"
|
649 |
+
|
|
|
650 |
url = row[0]["url"]
|
651 |
short_url = self.shorten_url(url, max_length)
|
652 |
source_text_urls += f"""<a href="{url}">{short_url}</a><br>"""
|
653 |
sentence_count += 1
|
654 |
+
|
655 |
score, label = self.calculate_score_label()
|
656 |
entity_count_text = self.get_entity_count_text(entity_count)
|
657 |
|
658 |
return f"""
|
659 |
+
<tr>
|
660 |
+
<td>{input_sentences}</td>
|
661 |
+
<td>{source_sentences}</td>
|
662 |
+
<td>{label}<br>({score * 100:.2f}%)<br><br>{entity_count_text}</td>
|
663 |
+
<td>{source_text_urls}</td>
|
664 |
+
</tr>
|
665 |
"""
|
666 |
|
667 |
def format_image_governor_row(self, max_length=30):
|
668 |
+
if (
|
669 |
+
self.image_referent_url is not None
|
670 |
+
or self.image_referent_url != ""
|
671 |
+
):
|
672 |
+
source_image = f"""<img src="{self.image_referent_url}" width="200" height="150">""" # noqa: E501
|
673 |
short_url = self.shorten_url(self.image_referent_url, max_length)
|
674 |
+
source_image_url = (
|
675 |
+
f"""<a href="{self.image_referent_url}">{short_url}</a>"""
|
676 |
+
)
|
677 |
else:
|
678 |
source_image = "Image not found"
|
679 |
source_image_url = ""
|
680 |
|
681 |
+
return f"""<tr><td>input image</td><td>{source_image}</td><td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td><td>{source_image_url}</td></tr>""" # noqa: E501
|
|
|
682 |
|
683 |
def get_entity_count_text(self, entity_count):
|
684 |
if entity_count <= 0:
|
|
|
692 |
def shorten_url(self, url, max_length=30):
|
693 |
if url is None:
|
694 |
return ""
|
695 |
+
|
696 |
if len(url) > max_length:
|
697 |
short_url = url[:max_length] + "..."
|
698 |
else:
|
699 |
short_url = url
|
700 |
return short_url
|
701 |
|
|
|
702 |
def color_text(self, text, colored_idx, highlighted_idx):
|
703 |
paragraph = ""
|
704 |
words = text.split()
|
705 |
+
|
706 |
starts, ends = self.extract_starts_ends(colored_idx)
|
707 |
starts, ends = self.filter_indices(starts, ends, highlighted_idx)
|
708 |
|
709 |
previous_end = 0
|
710 |
for start, end in zip(starts, ends):
|
711 |
paragraph += " ".join(words[previous_end:start])
|
712 |
+
|
713 |
equal_words = " ".join(words[start:end])
|
714 |
paragraph += f" <span style='color:#00FF00;'>{equal_words}</span> "
|
715 |
+
|
716 |
previous_end = end
|
717 |
+
|
718 |
+
# Some left words due to the punctuation separated from
|
719 |
# the highlighting text
|
720 |
equal_words = " ".join(words[previous_end:])
|
721 |
print(f"starts_2: {previous_end}")
|
722 |
+
print(f"ends_2: {len(words) - 1}")
|
723 |
print(f"equal_words: {words[previous_end:]}")
|
724 |
paragraph += f" <span style='color:#00FF00;'>{equal_words}</span> "
|
725 |
|
726 |
return paragraph
|
727 |
+
|
728 |
def extract_starts_ends(self, colored_idx):
|
729 |
starts = []
|
730 |
ends = []
|
731 |
for index in colored_idx:
|
732 |
+
starts.append(index["start"])
|
733 |
+
ends.append(index["end"])
|
734 |
return starts, ends
|
735 |
+
|
|
|
736 |
def filter_indices(self, starts, ends, ignore_indices):
|
737 |
"""
|
738 |
+
Filters start and end indices to exclude any indices present in the
|
739 |
+
ignore_indices list.
|
740 |
|
741 |
Args:
|
742 |
starts: A list of starting indices.
|
|
|
744 |
ignore_indices: A list of indices to exclude.
|
745 |
|
746 |
Returns:
|
747 |
+
A tuple of two lists: filtered_starts and filtered_ends.
|
748 |
+
Returns empty lists if the input is invalid
|
749 |
+
or if all ranges are filtered out.
|
750 |
Prints error messages for invalid input.
|
751 |
+
|
752 |
Examples:
|
753 |
starts = [0, 5, 10]
|
754 |
ends = [3, 7, 12]
|
755 |
ignore_indices = [1, 2, 11, 17]
|
756 |
+
|
757 |
+
# Output:
|
758 |
starts = [0, 3, 5, 10, 12]
|
759 |
ends = [0, 3, 7, 10, 12]
|
760 |
|
761 |
"""
|
762 |
|
763 |
if len(starts) != len(ends):
|
764 |
+
print(
|
765 |
+
"Error: The 'starts' and 'ends' lists must have the same length.", # noqa: E501
|
766 |
+
)
|
767 |
return [], []
|
768 |
|
769 |
filtered_starts = []
|
|
|
774 |
end = ends[i]
|
775 |
|
776 |
if end < start:
|
777 |
+
print(
|
778 |
+
f"Error: End index {end} is less than start index {start} at position {i}.", # noqa: E501
|
779 |
+
)
|
780 |
return [], []
|
781 |
|
|
|
782 |
start_end = list(range(start, end + 1, 1))
|
783 |
start_end = list(set(start_end) - set(ignore_indices))
|
784 |
new_start, new_end = self.extract_sequences(start_end)
|
|
|
790 |
def extract_sequences(self, numbers):
|
791 |
if len(numbers) == 1:
|
792 |
return [numbers[0]], [numbers[0]]
|
793 |
+
|
794 |
numbers.sort()
|
795 |
starts = []
|
796 |
ends = []
|
|
|
799 |
start = number
|
800 |
end = number
|
801 |
continue
|
802 |
+
|
803 |
+
if number - 1 == numbers[i - 1]:
|
804 |
end = number
|
805 |
else:
|
806 |
starts.append(start)
|
807 |
ends.append(end + 1)
|
808 |
start = number
|
809 |
end = number
|
810 |
+
|
811 |
if i == len(numbers) - 1:
|
812 |
starts.append(start)
|
813 |
ends.append(end + 1)
|
814 |
+
|
815 |
return starts, ends
|
816 |
+
|
817 |
def calculate_score_label(self):
|
818 |
human_score = []
|
819 |
machine_score = []
|
|
|
826 |
elif sentence["label"] == "MACHINE":
|
827 |
machine_score.append(1 - sentence["similarity"])
|
828 |
machine_flag = True
|
829 |
+
|
830 |
if machine_flag is True and len(machine_score) > 0:
|
831 |
# average value of machine_score
|
832 |
machine_score_avg = sum(machine_score) / len(machine_score)
|
|
|
839 |
return human_score_avg, "HUMAN"
|
840 |
else:
|
841 |
return 0, "UNKNOWN"
|
|
|
|
src/application/content_generation.py
CHANGED
@@ -1,25 +1,27 @@
|
|
1 |
import json
|
|
|
|
|
2 |
import openai
|
3 |
from dotenv import load_dotenv
|
4 |
-
import os
|
5 |
|
6 |
load_dotenv()
|
7 |
-
AZURE_OPENAI_API_KEY = os.getenv(
|
8 |
-
AZURE_OPENAI_ENDPOINT = os.getenv(
|
9 |
-
AZURE_OPENAI_API_VERSION = os.getenv(
|
10 |
|
11 |
client = openai.AzureOpenAI(
|
12 |
-
api_version
|
13 |
-
api_key
|
14 |
-
azure_endpoint
|
15 |
-
|
|
|
16 |
|
17 |
def generate_fake_text(text_generation_model, title, content):
|
18 |
# Generate text using the selected models
|
19 |
-
prompt = """Generate a random fake news tittle in this format:
|
20 |
---
|
21 |
# Title: [Fake Title]
|
22 |
-
# Content:
|
23 |
[Fake Content]
|
24 |
---
|
25 |
"""
|
@@ -32,22 +34,25 @@ def generate_fake_text(text_generation_model, title, content):
|
|
32 |
elif content:
|
33 |
prompt += """base on the following context:
|
34 |
# Content: {news_content}"""
|
35 |
-
|
36 |
# Generate text using the text generation model
|
37 |
-
# Generate text using the selected model
|
38 |
try:
|
39 |
response = client.chat.completions.create(
|
40 |
-
model=text_generation_model,
|
41 |
-
messages
|
|
|
|
|
|
|
|
|
|
|
42 |
)
|
43 |
-
|
44 |
-
print("Response from OpenAI API: ", response.choices[0].message.content)
|
45 |
fake_text = response.choices[0].message.content
|
46 |
|
47 |
except openai.OpenAIError as e:
|
48 |
print(f"Error interacting with OpenAI API: {e}")
|
49 |
-
fake_text =
|
50 |
-
|
51 |
if fake_text != "":
|
52 |
fake_title, fake_content = extract_title_content(fake_text)
|
53 |
return fake_title, fake_content
|
@@ -57,12 +62,12 @@ def extract_title_content(fake_news):
|
|
57 |
"""
|
58 |
Extracts the title and content from the generated fake news string.
|
59 |
|
60 |
-
This function parses a string containing fake news, which is expected
|
61 |
-
a specific format with a title and content section marked by
|
62 |
-
'# Content:' respectively.
|
63 |
|
64 |
Args:
|
65 |
-
fake_news (str): A string containing the generated fake news
|
66 |
|
67 |
Returns:
|
68 |
tuple: A tuple containing two elements:
|
@@ -77,33 +82,36 @@ def extract_title_content(fake_news):
|
|
77 |
title_start_index = fake_news.find("# Title: ") + len("# Title: ")
|
78 |
title_end_index = fake_news.find("\n", title_start_index)
|
79 |
title = fake_news[title_start_index:title_end_index].strip()
|
80 |
-
|
81 |
-
content_start_index = fake_news.find("\n# Content: ") + len(
|
|
|
|
|
82 |
content = fake_news[content_start_index:].strip()
|
83 |
-
|
84 |
return title, content
|
85 |
|
|
|
86 |
def generate_fake_image(model, title):
|
87 |
if len(title) > 0:
|
88 |
IMAGE_PROMPT = f"Generate a random image about {title}"
|
89 |
else:
|
90 |
IMAGE_PROMPT = "Generate a random image"
|
91 |
result = client.images.generate(
|
92 |
-
model="dall-e-3",
|
93 |
prompt=IMAGE_PROMPT,
|
94 |
-
n=1
|
95 |
)
|
96 |
-
image_url = json.loads(result.model_dump_json())[
|
97 |
return image_url
|
98 |
-
|
99 |
-
|
100 |
def replace_text(news_title, news_content, replace_df):
|
101 |
"""
|
102 |
Replaces occurrences in the input text based on the provided DataFrame.
|
103 |
|
104 |
Args:
|
105 |
text: The input text.
|
106 |
-
replace_df: A
|
107 |
|
108 |
Returns:
|
109 |
The text after all replacements have been made.
|
@@ -113,4 +121,4 @@ def replace_text(news_title, news_content, replace_df):
|
|
113 |
replace_with = row["Replace with:"]
|
114 |
news_content = news_content.replace(find_what, replace_with)
|
115 |
news_title = news_title.replace(find_what, replace_with)
|
116 |
-
return news_title, news_content
|
|
|
1 |
import json
|
2 |
+
import os
|
3 |
+
|
4 |
import openai
|
5 |
from dotenv import load_dotenv
|
|
|
6 |
|
7 |
load_dotenv()
|
8 |
+
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
|
9 |
+
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
|
10 |
+
AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION")
|
11 |
|
12 |
client = openai.AzureOpenAI(
|
13 |
+
api_version=AZURE_OPENAI_API_VERSION,
|
14 |
+
api_key=AZURE_OPENAI_API_KEY,
|
15 |
+
azure_endpoint=AZURE_OPENAI_ENDPOINT,
|
16 |
+
)
|
17 |
+
|
18 |
|
19 |
def generate_fake_text(text_generation_model, title, content):
|
20 |
# Generate text using the selected models
|
21 |
+
prompt = """Generate a random fake news tittle in this format:
|
22 |
---
|
23 |
# Title: [Fake Title]
|
24 |
+
# Content:
|
25 |
[Fake Content]
|
26 |
---
|
27 |
"""
|
|
|
34 |
elif content:
|
35 |
prompt += """base on the following context:
|
36 |
# Content: {news_content}"""
|
37 |
+
|
38 |
# Generate text using the text generation model
|
39 |
+
# Generate text using the selected model
|
40 |
try:
|
41 |
response = client.chat.completions.create(
|
42 |
+
model=text_generation_model,
|
43 |
+
messages=[{"role": "system", "content": prompt}],
|
44 |
+
)
|
45 |
+
|
46 |
+
print(
|
47 |
+
"Response from OpenAI API: ",
|
48 |
+
response.choices[0].message.content,
|
49 |
)
|
|
|
|
|
50 |
fake_text = response.choices[0].message.content
|
51 |
|
52 |
except openai.OpenAIError as e:
|
53 |
print(f"Error interacting with OpenAI API: {e}")
|
54 |
+
fake_text = ""
|
55 |
+
|
56 |
if fake_text != "":
|
57 |
fake_title, fake_content = extract_title_content(fake_text)
|
58 |
return fake_title, fake_content
|
|
|
62 |
"""
|
63 |
Extracts the title and content from the generated fake news string.
|
64 |
|
65 |
+
This function parses a string containing fake news, which is expected
|
66 |
+
to have a specific format with a title and content section marked by
|
67 |
+
'# Title:' and '# Content:' respectively.
|
68 |
|
69 |
Args:
|
70 |
+
fake_news (str): A string containing the generated fake news.
|
71 |
|
72 |
Returns:
|
73 |
tuple: A tuple containing two elements:
|
|
|
82 |
title_start_index = fake_news.find("# Title: ") + len("# Title: ")
|
83 |
title_end_index = fake_news.find("\n", title_start_index)
|
84 |
title = fake_news[title_start_index:title_end_index].strip()
|
85 |
+
|
86 |
+
content_start_index = fake_news.find("\n# Content: ") + len(
|
87 |
+
"\n# Content: ",
|
88 |
+
)
|
89 |
content = fake_news[content_start_index:].strip()
|
90 |
+
|
91 |
return title, content
|
92 |
|
93 |
+
|
94 |
def generate_fake_image(model, title):
|
95 |
if len(title) > 0:
|
96 |
IMAGE_PROMPT = f"Generate a random image about {title}"
|
97 |
else:
|
98 |
IMAGE_PROMPT = "Generate a random image"
|
99 |
result = client.images.generate(
|
100 |
+
model="dall-e-3", # the name of your DALL-E 3 deployment
|
101 |
prompt=IMAGE_PROMPT,
|
102 |
+
n=1,
|
103 |
)
|
104 |
+
image_url = json.loads(result.model_dump_json())["data"][0]["url"]
|
105 |
return image_url
|
106 |
+
|
107 |
+
|
108 |
def replace_text(news_title, news_content, replace_df):
|
109 |
"""
|
110 |
Replaces occurrences in the input text based on the provided DataFrame.
|
111 |
|
112 |
Args:
|
113 |
text: The input text.
|
114 |
+
replace_df: A DF with 2 columns: "find_what" & "replace_with".
|
115 |
|
116 |
Returns:
|
117 |
The text after all replacements have been made.
|
|
|
121 |
replace_with = row["Replace with:"]
|
122 |
news_content = news_content.replace(find_what, replace_with)
|
123 |
news_title = news_title.replace(find_what, replace_with)
|
124 |
+
return news_title, news_content
|
src/application/image/image_comparison.py
CHANGED
@@ -1,9 +1,12 @@
|
|
1 |
-
import requests
|
2 |
from io import BytesIO
|
3 |
-
|
4 |
import imagehash
|
|
|
|
|
|
|
5 |
from src.application.image.search_yandex import YandexReverseImageSearcher
|
6 |
|
|
|
7 |
def get_image_from_url(url):
|
8 |
try:
|
9 |
response = requests.get(url)
|
@@ -12,6 +15,7 @@ def get_image_from_url(url):
|
|
12 |
print(f"Error opening image: {e}")
|
13 |
return None
|
14 |
|
|
|
15 |
def get_image_from_file(file_path):
|
16 |
try:
|
17 |
return Image.open(file_path)
|
@@ -19,33 +23,36 @@ def get_image_from_file(file_path):
|
|
19 |
print(f"Error occurred while opening image from file: {file_path}")
|
20 |
return None
|
21 |
|
|
|
22 |
def standardize_image(image):
|
23 |
# Convert to RGB if needed
|
24 |
-
if image.mode in (
|
25 |
-
background = Image.new(
|
26 |
background.paste(image, mask=image.split()[-1])
|
27 |
image = background
|
28 |
-
elif image.mode !=
|
29 |
-
image = image.convert(
|
30 |
-
|
31 |
# Resize to standard size (e.g. 256x256)
|
32 |
standard_size = (256, 256)
|
33 |
image = image.resize(standard_size)
|
34 |
-
|
35 |
return image
|
36 |
|
|
|
37 |
def compare_images(image1, image2):
|
38 |
# Standardize both images first
|
39 |
img1_std = standardize_image(image1)
|
40 |
img2_std = standardize_image(image2)
|
41 |
-
|
42 |
hash1 = imagehash.average_hash(img1_std)
|
43 |
hash2 = imagehash.average_hash(img2_std)
|
44 |
return hash1 - hash2 # Returns the Hamming distance between the hashes
|
45 |
|
46 |
-
|
47 |
-
|
48 |
-
|
|
|
49 |
# Get the image from URL
|
50 |
url_image = get_image_from_url(image_url)
|
51 |
|
@@ -54,13 +61,13 @@ if __name__ == '__main__':
|
|
54 |
res = rev_img_searcher.search(image_url)
|
55 |
|
56 |
for search_item in res:
|
57 |
-
print(f
|
58 |
# print(f'Site: {search_item.page_url}')
|
59 |
-
print(f
|
60 |
|
61 |
# Compare each search result image with the input image
|
62 |
result_image = get_image_from_url(search_item.image_url)
|
63 |
result_difference = compare_images(result_image, url_image)
|
64 |
print(f"Difference with search result: {result_difference}")
|
65 |
-
if result_difference == 0:
|
66 |
-
break
|
|
|
|
|
1 |
from io import BytesIO
|
2 |
+
|
3 |
import imagehash
|
4 |
+
import requests
|
5 |
+
from PIL import Image
|
6 |
+
|
7 |
from src.application.image.search_yandex import YandexReverseImageSearcher
|
8 |
|
9 |
+
|
10 |
def get_image_from_url(url):
|
11 |
try:
|
12 |
response = requests.get(url)
|
|
|
15 |
print(f"Error opening image: {e}")
|
16 |
return None
|
17 |
|
18 |
+
|
19 |
def get_image_from_file(file_path):
|
20 |
try:
|
21 |
return Image.open(file_path)
|
|
|
23 |
print(f"Error occurred while opening image from file: {file_path}")
|
24 |
return None
|
25 |
|
26 |
+
|
27 |
def standardize_image(image):
|
28 |
# Convert to RGB if needed
|
29 |
+
if image.mode in ("RGBA", "LA"):
|
30 |
+
background = Image.new("RGB", image.size, (255, 255, 255))
|
31 |
background.paste(image, mask=image.split()[-1])
|
32 |
image = background
|
33 |
+
elif image.mode != "RGB":
|
34 |
+
image = image.convert("RGB")
|
35 |
+
|
36 |
# Resize to standard size (e.g. 256x256)
|
37 |
standard_size = (256, 256)
|
38 |
image = image.resize(standard_size)
|
39 |
+
|
40 |
return image
|
41 |
|
42 |
+
|
43 |
def compare_images(image1, image2):
|
44 |
# Standardize both images first
|
45 |
img1_std = standardize_image(image1)
|
46 |
img2_std = standardize_image(image2)
|
47 |
+
|
48 |
hash1 = imagehash.average_hash(img1_std)
|
49 |
hash2 = imagehash.average_hash(img2_std)
|
50 |
return hash1 - hash2 # Returns the Hamming distance between the hashes
|
51 |
|
52 |
+
|
53 |
+
if __name__ == "__main__":
|
54 |
+
image_url = "https://i.pinimg.com/originals/c4/50/35/c450352ac6ea8645ead206721673e8fb.png" # noqa: E501
|
55 |
+
|
56 |
# Get the image from URL
|
57 |
url_image = get_image_from_url(image_url)
|
58 |
|
|
|
61 |
res = rev_img_searcher.search(image_url)
|
62 |
|
63 |
for search_item in res:
|
64 |
+
print(f"Title: {search_item.page_title}")
|
65 |
# print(f'Site: {search_item.page_url}')
|
66 |
+
print(f"Img: {search_item.image_url}\n")
|
67 |
|
68 |
# Compare each search result image with the input image
|
69 |
result_image = get_image_from_url(search_item.image_url)
|
70 |
result_difference = compare_images(result_image, url_image)
|
71 |
print(f"Difference with search result: {result_difference}")
|
72 |
+
if result_difference == 0:
|
73 |
+
break
|
src/application/image/image_detection.py
CHANGED
@@ -1,14 +1,19 @@
|
|
1 |
-
|
2 |
-
|
|
|
|
|
|
|
3 |
from src.application.image.model_detection import image_generation_detection
|
4 |
from src.application.image.search_yandex import yandex_reverse_image_search
|
5 |
|
6 |
|
7 |
def compare_list_of_images(news_image_path, img_urls):
|
8 |
-
news_image = get_image_from_file(
|
|
|
|
|
9 |
if news_image is None:
|
10 |
return None, -1
|
11 |
-
|
12 |
matched_url = ""
|
13 |
max_similarity = 0
|
14 |
for url in img_urls:
|
@@ -20,7 +25,10 @@ def compare_list_of_images(news_image_path, img_urls):
|
|
20 |
referred_image = get_image_from_url(url)
|
21 |
if referred_image is None:
|
22 |
continue
|
23 |
-
distance = compare_images(
|
|
|
|
|
|
|
24 |
similarity = max(100 - distance, 0)
|
25 |
if similarity > max_similarity:
|
26 |
max_similarity = similarity
|
@@ -29,14 +37,17 @@ def compare_list_of_images(news_image_path, img_urls):
|
|
29 |
if max_similarity > 90:
|
30 |
return matched_url, max_similarity
|
31 |
return None, -1
|
32 |
-
|
33 |
-
|
34 |
def detect_image_from_news_image(news_image_path, image_urls):
|
35 |
print("\tFrom news:")
|
36 |
return compare_list_of_images(news_image_path, image_urls)
|
37 |
|
|
|
38 |
def detect_image_by_reverse_search(news_image_path):
|
39 |
-
image_urls = yandex_reverse_image_search(
|
|
|
|
|
40 |
print("\tFrom search engine:")
|
41 |
for url in image_urls:
|
42 |
print(f"\t\t{url}")
|
@@ -47,5 +58,5 @@ def detect_image_by_ai_model(news_image_path):
|
|
47 |
print("\tFrom AI model:")
|
48 |
image_prediction_label, image_confidence = image_generation_detection(
|
49 |
news_image_path,
|
50 |
-
|
51 |
-
return image_prediction_label, image_confidence
|
|
|
1 |
+
from src.application.image.image_comparison import (
|
2 |
+
compare_images,
|
3 |
+
get_image_from_file,
|
4 |
+
get_image_from_url,
|
5 |
+
)
|
6 |
from src.application.image.model_detection import image_generation_detection
|
7 |
from src.application.image.search_yandex import yandex_reverse_image_search
|
8 |
|
9 |
|
10 |
def compare_list_of_images(news_image_path, img_urls):
|
11 |
+
news_image = get_image_from_file(
|
12 |
+
news_image_path,
|
13 |
+
) # TODO: news_image_path is arrays
|
14 |
if news_image is None:
|
15 |
return None, -1
|
16 |
+
|
17 |
matched_url = ""
|
18 |
max_similarity = 0
|
19 |
for url in img_urls:
|
|
|
25 |
referred_image = get_image_from_url(url)
|
26 |
if referred_image is None:
|
27 |
continue
|
28 |
+
distance = compare_images(
|
29 |
+
news_image,
|
30 |
+
referred_image,
|
31 |
+
) # Hamming algorithm
|
32 |
similarity = max(100 - distance, 0)
|
33 |
if similarity > max_similarity:
|
34 |
max_similarity = similarity
|
|
|
37 |
if max_similarity > 90:
|
38 |
return matched_url, max_similarity
|
39 |
return None, -1
|
40 |
+
|
41 |
+
|
42 |
def detect_image_from_news_image(news_image_path, image_urls):
|
43 |
print("\tFrom news:")
|
44 |
return compare_list_of_images(news_image_path, image_urls)
|
45 |
|
46 |
+
|
47 |
def detect_image_by_reverse_search(news_image_path):
|
48 |
+
image_urls = yandex_reverse_image_search(
|
49 |
+
news_image_path,
|
50 |
+
) # url or file_path
|
51 |
print("\tFrom search engine:")
|
52 |
for url in image_urls:
|
53 |
print(f"\t\t{url}")
|
|
|
58 |
print("\tFrom AI model:")
|
59 |
image_prediction_label, image_confidence = image_generation_detection(
|
60 |
news_image_path,
|
61 |
+
)
|
62 |
+
return image_prediction_label, image_confidence
|
src/application/image/model_detection.py
CHANGED
@@ -1,23 +1,39 @@
|
|
1 |
-
|
2 |
-
|
3 |
import pytorch_lightning as pl
|
4 |
-
import timm
|
5 |
import torch
|
6 |
import torch.nn.functional as F
|
7 |
-
import logging
|
8 |
-
from PIL import Image
|
9 |
import torchvision.transforms as transforms
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
from torchvision.transforms import v2
|
11 |
|
12 |
-
logging.basicConfig(
|
13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
|
15 |
class ImageClassifier(pl.LightningModule):
|
16 |
def __init__(self, lmd=0):
|
17 |
super().__init__()
|
18 |
-
self.model = timm.create_model(
|
19 |
-
|
20 |
-
|
|
|
|
|
|
|
|
|
21 |
self.validation_outputs = []
|
22 |
self.lmd = lmd
|
23 |
|
@@ -27,13 +43,13 @@ class ImageClassifier(pl.LightningModule):
|
|
27 |
def training_step(self, batch):
|
28 |
images, labels, _ = batch
|
29 |
outputs = self.forward(images).squeeze()
|
30 |
-
|
31 |
print(f"Shape of outputs (training): {outputs.shape}")
|
32 |
print(f"Shape of labels (training): {labels.shape}")
|
33 |
-
|
34 |
loss = F.binary_cross_entropy_with_logits(outputs, labels.float())
|
35 |
logging.info(f"Training Step - ERM loss: {loss.item()}")
|
36 |
-
loss += self.lmd * (outputs
|
37 |
logging.info(f"Training Step - SD loss: {loss.item()}")
|
38 |
return loss
|
39 |
|
@@ -43,20 +59,30 @@ class ImageClassifier(pl.LightningModule):
|
|
43 |
|
44 |
if outputs.shape == torch.Size([]):
|
45 |
return
|
46 |
-
|
47 |
print(f"Shape of outputs (validation): {outputs.shape}")
|
48 |
print(f"Shape of labels (validation): {labels.shape}")
|
49 |
|
50 |
loss = F.binary_cross_entropy_with_logits(outputs, labels.float())
|
51 |
preds = torch.sigmoid(outputs)
|
52 |
-
self.log(
|
53 |
-
self.log(
|
54 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
55 |
output = {"val_loss": loss, "preds": preds, "labels": labels}
|
56 |
self.validation_outputs.append(output)
|
57 |
logging.info(f"Validation Step - Batch loss: {loss.item()}")
|
58 |
return output
|
59 |
-
|
60 |
def predict_step(self, batch):
|
61 |
images, label, domain = batch
|
62 |
outputs = self.forward(images).squeeze()
|
@@ -67,13 +93,13 @@ class ImageClassifier(pl.LightningModule):
|
|
67 |
if not self.validation_outputs:
|
68 |
logging.warning("No outputs in validation step to process")
|
69 |
return
|
70 |
-
preds = torch.cat([x[
|
71 |
-
labels = torch.cat([x[
|
72 |
if labels.unique().size(0) == 1:
|
73 |
logging.warning("Only one class in validation step")
|
74 |
return
|
75 |
auc_score = roc_auc_score(labels.cpu(), preds.cpu())
|
76 |
-
self.log(
|
77 |
logging.info(f"Validation Epoch End - AUC score: {auc_score}")
|
78 |
self.validation_outputs = []
|
79 |
|
@@ -82,45 +108,46 @@ class ImageClassifier(pl.LightningModule):
|
|
82 |
return optimizer
|
83 |
|
84 |
|
85 |
-
|
86 |
def load_image(image_path, transform=None):
|
87 |
-
image = Image.open(image_path).convert(
|
88 |
-
|
89 |
if transform:
|
90 |
image = transform(image)
|
91 |
-
|
92 |
return image
|
93 |
|
94 |
|
95 |
def predict_single_image(image_path, model, transform=None):
|
96 |
-
image = load_image(image_path, transform)
|
97 |
-
|
98 |
-
device = torch.device(
|
99 |
-
|
100 |
model.to(device)
|
101 |
-
|
102 |
image = image.to(device)
|
103 |
|
104 |
model.eval()
|
105 |
-
|
106 |
with torch.no_grad():
|
107 |
-
image = image.unsqueeze(0)
|
108 |
-
output = model(image).squeeze()
|
109 |
-
prediction = torch.sigmoid(output).item()
|
110 |
-
|
111 |
return prediction
|
112 |
|
113 |
|
114 |
def image_generation_detection(image_path):
|
115 |
model = ImageClassifier.load_from_checkpoint(CHECKPOINT)
|
116 |
|
117 |
-
transform = v2.Compose(
|
118 |
-
|
119 |
-
|
120 |
-
|
|
|
|
|
|
|
|
|
121 |
|
122 |
-
prediction = predict_single_image(image_path, model, transform)
|
123 |
-
|
124 |
result = ""
|
125 |
if prediction <= 0.2:
|
126 |
result += "Most likely human"
|
@@ -134,8 +161,8 @@ def image_generation_detection(image_path):
|
|
134 |
return image_prediction_label, image_confidence
|
135 |
|
136 |
|
137 |
-
if __name__ == "__main__":
|
138 |
image_path = "path_to_your_image.jpg" # Replace with your image path
|
139 |
image_prediction_label, image_confidence = image_generation_detection(
|
140 |
image_path,
|
141 |
-
|
|
|
1 |
+
import logging
|
2 |
+
|
3 |
import pytorch_lightning as pl
|
4 |
+
import timm
|
5 |
import torch
|
6 |
import torch.nn.functional as F
|
|
|
|
|
7 |
import torchvision.transforms as transforms
|
8 |
+
from PIL import Image
|
9 |
+
from sklearn.metrics import roc_auc_score
|
10 |
+
from torchmetrics import (
|
11 |
+
Accuracy,
|
12 |
+
Recall,
|
13 |
+
)
|
14 |
from torchvision.transforms import v2
|
15 |
|
16 |
+
logging.basicConfig(
|
17 |
+
filename="training.log",
|
18 |
+
filemode="w",
|
19 |
+
level=logging.INFO,
|
20 |
+
force=True,
|
21 |
+
)
|
22 |
+
CHECKPOINT = (
|
23 |
+
"models/image_classifier/image-classifier-step=8008-val_loss=0.11.ckpt"
|
24 |
+
)
|
25 |
+
|
26 |
|
27 |
class ImageClassifier(pl.LightningModule):
|
28 |
def __init__(self, lmd=0):
|
29 |
super().__init__()
|
30 |
+
self.model = timm.create_model(
|
31 |
+
"resnet50",
|
32 |
+
pretrained=True,
|
33 |
+
num_classes=1,
|
34 |
+
)
|
35 |
+
self.accuracy = Accuracy(task="binary", threshold=0.5)
|
36 |
+
self.recall = Recall(task="binary", threshold=0.5)
|
37 |
self.validation_outputs = []
|
38 |
self.lmd = lmd
|
39 |
|
|
|
43 |
def training_step(self, batch):
|
44 |
images, labels, _ = batch
|
45 |
outputs = self.forward(images).squeeze()
|
46 |
+
|
47 |
print(f"Shape of outputs (training): {outputs.shape}")
|
48 |
print(f"Shape of labels (training): {labels.shape}")
|
49 |
+
|
50 |
loss = F.binary_cross_entropy_with_logits(outputs, labels.float())
|
51 |
logging.info(f"Training Step - ERM loss: {loss.item()}")
|
52 |
+
loss += self.lmd * (outputs**2).mean() # SD loss penalty
|
53 |
logging.info(f"Training Step - SD loss: {loss.item()}")
|
54 |
return loss
|
55 |
|
|
|
59 |
|
60 |
if outputs.shape == torch.Size([]):
|
61 |
return
|
62 |
+
|
63 |
print(f"Shape of outputs (validation): {outputs.shape}")
|
64 |
print(f"Shape of labels (validation): {labels.shape}")
|
65 |
|
66 |
loss = F.binary_cross_entropy_with_logits(outputs, labels.float())
|
67 |
preds = torch.sigmoid(outputs)
|
68 |
+
self.log("val_loss", loss, prog_bar=True, sync_dist=True)
|
69 |
+
self.log(
|
70 |
+
"val_acc",
|
71 |
+
self.accuracy(preds, labels.int()),
|
72 |
+
prog_bar=True,
|
73 |
+
sync_dist=True,
|
74 |
+
)
|
75 |
+
self.log(
|
76 |
+
"val_recall",
|
77 |
+
self.recall(preds, labels.int()),
|
78 |
+
prog_bar=True,
|
79 |
+
sync_dist=True,
|
80 |
+
)
|
81 |
output = {"val_loss": loss, "preds": preds, "labels": labels}
|
82 |
self.validation_outputs.append(output)
|
83 |
logging.info(f"Validation Step - Batch loss: {loss.item()}")
|
84 |
return output
|
85 |
+
|
86 |
def predict_step(self, batch):
|
87 |
images, label, domain = batch
|
88 |
outputs = self.forward(images).squeeze()
|
|
|
93 |
if not self.validation_outputs:
|
94 |
logging.warning("No outputs in validation step to process")
|
95 |
return
|
96 |
+
preds = torch.cat([x["preds"] for x in self.validation_outputs])
|
97 |
+
labels = torch.cat([x["labels"] for x in self.validation_outputs])
|
98 |
if labels.unique().size(0) == 1:
|
99 |
logging.warning("Only one class in validation step")
|
100 |
return
|
101 |
auc_score = roc_auc_score(labels.cpu(), preds.cpu())
|
102 |
+
self.log("val_auc", auc_score, prog_bar=True, sync_dist=True)
|
103 |
logging.info(f"Validation Epoch End - AUC score: {auc_score}")
|
104 |
self.validation_outputs = []
|
105 |
|
|
|
108 |
return optimizer
|
109 |
|
110 |
|
|
|
111 |
def load_image(image_path, transform=None):
|
112 |
+
image = Image.open(image_path).convert("RGB")
|
113 |
+
|
114 |
if transform:
|
115 |
image = transform(image)
|
116 |
+
|
117 |
return image
|
118 |
|
119 |
|
120 |
def predict_single_image(image_path, model, transform=None):
|
121 |
+
image = load_image(image_path, transform)
|
122 |
+
|
123 |
+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
124 |
+
|
125 |
model.to(device)
|
126 |
+
|
127 |
image = image.to(device)
|
128 |
|
129 |
model.eval()
|
130 |
+
|
131 |
with torch.no_grad():
|
132 |
+
image = image.unsqueeze(0)
|
133 |
+
output = model(image).squeeze()
|
134 |
+
prediction = torch.sigmoid(output).item()
|
135 |
+
|
136 |
return prediction
|
137 |
|
138 |
|
139 |
def image_generation_detection(image_path):
|
140 |
model = ImageClassifier.load_from_checkpoint(CHECKPOINT)
|
141 |
|
142 |
+
transform = v2.Compose(
|
143 |
+
[
|
144 |
+
transforms.ToTensor(),
|
145 |
+
v2.CenterCrop((256, 256)),
|
146 |
+
],
|
147 |
+
)
|
148 |
+
|
149 |
+
prediction = predict_single_image(image_path, model, transform)
|
150 |
|
|
|
|
|
151 |
result = ""
|
152 |
if prediction <= 0.2:
|
153 |
result += "Most likely human"
|
|
|
161 |
return image_prediction_label, image_confidence
|
162 |
|
163 |
|
164 |
+
if __name__ == "__main__":
|
165 |
image_path = "path_to_your_image.jpg" # Replace with your image path
|
166 |
image_prediction_label, image_confidence = image_generation_detection(
|
167 |
image_path,
|
168 |
+
)
|
src/application/image/search_yandex.py
CHANGED
@@ -1,17 +1,22 @@
|
|
1 |
-
import
|
2 |
import logging
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
import requests
|
4 |
-
import json
|
5 |
from bs4 import BeautifulSoup
|
6 |
-
from urllib.parse import quote, urlparse
|
7 |
|
8 |
logging.basicConfig(
|
9 |
-
filename=
|
10 |
level=logging.INFO,
|
11 |
-
format=
|
12 |
-
datefmt=
|
13 |
)
|
14 |
|
|
|
15 |
class SearchResults:
|
16 |
def __init__(self, results):
|
17 |
self.results = results
|
@@ -25,20 +30,29 @@ class SearchResults:
|
|
25 |
output += "---\n"
|
26 |
return output
|
27 |
|
|
|
28 |
class YandexReverseImageSearcher:
|
29 |
def __init__(self):
|
30 |
self.base_url = "https://yandex.ru/images/search"
|
31 |
-
self.headers = {
|
|
|
|
|
32 |
self.retry_count = 3
|
33 |
self.retry_delay = 1
|
34 |
|
35 |
-
def response(
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
self._validate_input(query, image_url)
|
37 |
-
|
38 |
encoded_query = quote(query)
|
39 |
encoded_image_url = quote(image_url)
|
40 |
|
41 |
-
url = f"{self.base_url}?q={encoded_query}&image_url={encoded_image_url}&sbisrc=cr_1_5_2"
|
42 |
|
43 |
all_results = []
|
44 |
start_index = 0
|
@@ -46,14 +60,16 @@ class YandexReverseImageSearcher:
|
|
46 |
while len(all_results) < max_results:
|
47 |
if start_index != 0:
|
48 |
time.sleep(delay)
|
49 |
-
|
50 |
paginated_url = f"{url}&start={start_index}"
|
51 |
|
52 |
response = self._make_request(paginated_url)
|
53 |
if response is None:
|
54 |
break
|
55 |
|
56 |
-
search_results, valid_content = self._parse_search_results(
|
|
|
|
|
57 |
if not valid_content:
|
58 |
logging.warning("Unexpected HTML structure encountered.")
|
59 |
break
|
@@ -65,34 +81,44 @@ class YandexReverseImageSearcher:
|
|
65 |
if data and data not in all_results:
|
66 |
all_results.append(data)
|
67 |
|
68 |
-
start_index +=
|
69 |
|
70 |
if len(all_results) == 0:
|
71 |
-
logging.warning(
|
72 |
-
|
|
|
|
|
73 |
else:
|
74 |
return SearchResults(all_results[:max_results])
|
75 |
-
|
76 |
def _validate_input(self, query: str, image_url: str):
|
77 |
if not query:
|
78 |
-
raise ValueError(
|
|
|
|
|
79 |
if not image_url:
|
80 |
-
raise ValueError(
|
|
|
|
|
81 |
if not self._validate_image_url(image_url):
|
82 |
-
raise ValueError(
|
83 |
-
|
|
|
|
|
84 |
def _validate_image_url(self, url: str) -> bool:
|
85 |
parsed_url = urlparse(url)
|
86 |
path = parsed_url.path.lower()
|
87 |
valid_extensions = (".jpg", ".jpeg", ".png", ".webp")
|
88 |
return any(path.endswith(ext) for ext in valid_extensions)
|
89 |
-
|
90 |
def _make_request(self, url: str):
|
91 |
attempts = 0
|
92 |
while attempts < self.retry_count:
|
93 |
try:
|
94 |
response = requests.get(url, headers=self.headers)
|
95 |
-
if response.headers.get(
|
|
|
|
|
96 |
response.raise_for_status()
|
97 |
return response
|
98 |
else:
|
@@ -110,14 +136,22 @@ class YandexReverseImageSearcher:
|
|
110 |
def _parse_search_results(self, html_content: str):
|
111 |
try:
|
112 |
soup = BeautifulSoup(html_content, "html.parser")
|
113 |
-
return soup.find_all(
|
114 |
except Exception as e:
|
115 |
logging.error(f"Error parsing HTML content: {e}")
|
116 |
return None, False
|
117 |
|
118 |
def _extract_result_data(self, result):
|
119 |
-
link =
|
120 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
121 |
return {"link": link, "title": title} if link and title else {}
|
122 |
|
123 |
|
@@ -131,24 +165,27 @@ def get_image_links(page):
|
|
131 |
Returns:
|
132 |
A list of image URLs.
|
133 |
"""
|
134 |
-
soup = BeautifulSoup(page,
|
135 |
-
|
136 |
# Find the specific section containing image links
|
137 |
-
gallery_data = soup.find(
|
|
|
|
|
|
|
138 |
if gallery_data is None:
|
139 |
return []
|
140 |
-
|
141 |
# Find the container of image links
|
142 |
-
image_links_container = gallery_data.find(
|
143 |
if image_links_container is None:
|
144 |
return []
|
145 |
-
|
146 |
-
data_state = json.loads(image_links_container[
|
147 |
|
148 |
# Extract URLs from each div
|
149 |
image_urls = []
|
150 |
-
for site in data_state[
|
151 |
-
original_image_url = site[
|
152 |
image_urls.append(original_image_url)
|
153 |
|
154 |
return image_urls
|
@@ -158,19 +195,19 @@ def yandex_reverse_image_search(file_path):
|
|
158 |
img_search_url = generate_images_search_links(file_path)
|
159 |
if img_search_url is None:
|
160 |
return []
|
161 |
-
|
162 |
# Simulate a user agent to avoid being blocked
|
163 |
headers = {
|
164 |
-
|
165 |
-
|
166 |
}
|
167 |
-
|
168 |
try:
|
169 |
response = requests.get(img_search_url, headers=headers)
|
170 |
response.raise_for_status() # Raise an exception for bad status codes
|
171 |
|
172 |
# Parse the HTML content
|
173 |
-
soup = BeautifulSoup(response.content,
|
174 |
image_urls = get_image_links(soup.prettify())
|
175 |
return image_urls
|
176 |
|
@@ -180,21 +217,28 @@ def yandex_reverse_image_search(file_path):
|
|
180 |
|
181 |
|
182 |
def generate_images_search_links(file_path):
|
183 |
-
search_url =
|
184 |
-
params = {
|
185 |
-
|
|
|
|
|
|
|
|
|
186 |
try:
|
187 |
-
files = {
|
188 |
response = requests.post(search_url, params=params, files=files)
|
189 |
-
query_string = json.loads(response.content)[
|
190 |
-
|
|
|
|
|
191 |
return img_search_url
|
192 |
-
except:
|
|
|
193 |
return None
|
194 |
|
195 |
|
196 |
if __name__ == "__main__":
|
197 |
-
file_path = "T:\\Projects\\prj-nict-ai-content-detection\\data\\test_data\\towels.jpg.webp"
|
198 |
image_urls = yandex_reverse_image_search(file_path)
|
199 |
for image_url in image_urls:
|
200 |
print(f"Image URL: {image_url}")
|
|
|
1 |
+
import json
|
2 |
import logging
|
3 |
+
import time
|
4 |
+
from urllib.parse import (
|
5 |
+
quote,
|
6 |
+
urlparse,
|
7 |
+
)
|
8 |
+
|
9 |
import requests
|
|
|
10 |
from bs4 import BeautifulSoup
|
|
|
11 |
|
12 |
logging.basicConfig(
|
13 |
+
filename="error.log",
|
14 |
level=logging.INFO,
|
15 |
+
format="%(asctime)s | [%(levelname)s]: %(message)s",
|
16 |
+
datefmt="%m-%d-%Y / %I:%M:%S %p",
|
17 |
)
|
18 |
|
19 |
+
|
20 |
class SearchResults:
|
21 |
def __init__(self, results):
|
22 |
self.results = results
|
|
|
30 |
output += "---\n"
|
31 |
return output
|
32 |
|
33 |
+
|
34 |
class YandexReverseImageSearcher:
|
35 |
def __init__(self):
|
36 |
self.base_url = "https://yandex.ru/images/search"
|
37 |
+
self.headers = {
|
38 |
+
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", # noqa: E501
|
39 |
+
}
|
40 |
self.retry_count = 3
|
41 |
self.retry_delay = 1
|
42 |
|
43 |
+
def response(
|
44 |
+
self,
|
45 |
+
query: str,
|
46 |
+
image_url: str,
|
47 |
+
max_results: int = 10,
|
48 |
+
delay: int = 1,
|
49 |
+
) -> SearchResults:
|
50 |
self._validate_input(query, image_url)
|
51 |
+
|
52 |
encoded_query = quote(query)
|
53 |
encoded_image_url = quote(image_url)
|
54 |
|
55 |
+
url = f"{self.base_url}?q={encoded_query}&image_url={encoded_image_url}&sbisrc=cr_1_5_2" # noqa: E501
|
56 |
|
57 |
all_results = []
|
58 |
start_index = 0
|
|
|
60 |
while len(all_results) < max_results:
|
61 |
if start_index != 0:
|
62 |
time.sleep(delay)
|
63 |
+
|
64 |
paginated_url = f"{url}&start={start_index}"
|
65 |
|
66 |
response = self._make_request(paginated_url)
|
67 |
if response is None:
|
68 |
break
|
69 |
|
70 |
+
search_results, valid_content = self._parse_search_results(
|
71 |
+
response.text,
|
72 |
+
)
|
73 |
if not valid_content:
|
74 |
logging.warning("Unexpected HTML structure encountered.")
|
75 |
break
|
|
|
81 |
if data and data not in all_results:
|
82 |
all_results.append(data)
|
83 |
|
84 |
+
start_index += len(all_results) - start_index
|
85 |
|
86 |
if len(all_results) == 0:
|
87 |
+
logging.warning(
|
88 |
+
f"No results were found for the given query: [{query}], and/or image URL: [{image_url}].", # noqa: E501
|
89 |
+
)
|
90 |
+
return "No results found. Please try again with a different query and/or image URL." # noqa: E501
|
91 |
else:
|
92 |
return SearchResults(all_results[:max_results])
|
93 |
+
|
94 |
def _validate_input(self, query: str, image_url: str):
|
95 |
if not query:
|
96 |
+
raise ValueError(
|
97 |
+
"Query not found. Enter a query and try again.",
|
98 |
+
)
|
99 |
if not image_url:
|
100 |
+
raise ValueError(
|
101 |
+
"Image URL not found. Enter an image URL and try again.",
|
102 |
+
)
|
103 |
if not self._validate_image_url(image_url):
|
104 |
+
raise ValueError(
|
105 |
+
"Invalid image URL. Enter a valid image URL and try again.",
|
106 |
+
)
|
107 |
+
|
108 |
def _validate_image_url(self, url: str) -> bool:
|
109 |
parsed_url = urlparse(url)
|
110 |
path = parsed_url.path.lower()
|
111 |
valid_extensions = (".jpg", ".jpeg", ".png", ".webp")
|
112 |
return any(path.endswith(ext) for ext in valid_extensions)
|
113 |
+
|
114 |
def _make_request(self, url: str):
|
115 |
attempts = 0
|
116 |
while attempts < self.retry_count:
|
117 |
try:
|
118 |
response = requests.get(url, headers=self.headers)
|
119 |
+
if response.headers.get("Content-Type", "").startswith(
|
120 |
+
"text/html",
|
121 |
+
):
|
122 |
response.raise_for_status()
|
123 |
return response
|
124 |
else:
|
|
|
136 |
def _parse_search_results(self, html_content: str):
|
137 |
try:
|
138 |
soup = BeautifulSoup(html_content, "html.parser")
|
139 |
+
return soup.find_all("div", class_="g"), True
|
140 |
except Exception as e:
|
141 |
logging.error(f"Error parsing HTML content: {e}")
|
142 |
return None, False
|
143 |
|
144 |
def _extract_result_data(self, result):
|
145 |
+
link = (
|
146 |
+
result.find("a", href=True)["href"]
|
147 |
+
if result.find("a", href=True)
|
148 |
+
else None
|
149 |
+
)
|
150 |
+
title = (
|
151 |
+
result.find("h3").get_text(strip=True)
|
152 |
+
if result.find("h3")
|
153 |
+
else None
|
154 |
+
)
|
155 |
return {"link": link, "title": title} if link and title else {}
|
156 |
|
157 |
|
|
|
165 |
Returns:
|
166 |
A list of image URLs.
|
167 |
"""
|
168 |
+
soup = BeautifulSoup(page, "html.parser")
|
169 |
+
|
170 |
# Find the specific section containing image links
|
171 |
+
gallery_data = soup.find(
|
172 |
+
"div",
|
173 |
+
{"class": "cbir-section cbir-section_name_sites"},
|
174 |
+
)
|
175 |
if gallery_data is None:
|
176 |
return []
|
177 |
+
|
178 |
# Find the container of image links
|
179 |
+
image_links_container = gallery_data.find("div", {"class": "Root"})
|
180 |
if image_links_container is None:
|
181 |
return []
|
182 |
+
|
183 |
+
data_state = json.loads(image_links_container["data-state"])
|
184 |
|
185 |
# Extract URLs from each div
|
186 |
image_urls = []
|
187 |
+
for site in data_state["sites"]:
|
188 |
+
original_image_url = site["originalImage"]["url"]
|
189 |
image_urls.append(original_image_url)
|
190 |
|
191 |
return image_urls
|
|
|
195 |
img_search_url = generate_images_search_links(file_path)
|
196 |
if img_search_url is None:
|
197 |
return []
|
198 |
+
|
199 |
# Simulate a user agent to avoid being blocked
|
200 |
headers = {
|
201 |
+
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36", # noqa: E501
|
202 |
+
"Content-Type": "application/json",
|
203 |
}
|
204 |
+
|
205 |
try:
|
206 |
response = requests.get(img_search_url, headers=headers)
|
207 |
response.raise_for_status() # Raise an exception for bad status codes
|
208 |
|
209 |
# Parse the HTML content
|
210 |
+
soup = BeautifulSoup(response.content, "html.parser")
|
211 |
image_urls = get_image_links(soup.prettify())
|
212 |
return image_urls
|
213 |
|
|
|
217 |
|
218 |
|
219 |
def generate_images_search_links(file_path):
|
220 |
+
search_url = "https://yandex.ru/images/search"
|
221 |
+
params = {
|
222 |
+
"rpt": "imageview",
|
223 |
+
"format": "json",
|
224 |
+
"request": '{"blocks":[{"block":"b-page_type_search-by-image__link"}]}', # noqa: E501
|
225 |
+
}
|
226 |
+
|
227 |
try:
|
228 |
+
files = {"upfile": ("blob", open(file_path, "rb"), "image/jpeg/webp")}
|
229 |
response = requests.post(search_url, params=params, files=files)
|
230 |
+
query_string = json.loads(response.content)["blocks"][0]["params"][
|
231 |
+
"url"
|
232 |
+
]
|
233 |
+
img_search_url = search_url + "?" + query_string
|
234 |
return img_search_url
|
235 |
+
except requests.exceptions as e:
|
236 |
+
print(f"Error generating search URL: {e}")
|
237 |
return None
|
238 |
|
239 |
|
240 |
if __name__ == "__main__":
|
241 |
+
file_path = "T:\\Projects\\prj-nict-ai-content-detection\\data\\test_data\\towels.jpg.webp" # noqa: E501
|
242 |
image_urls = yandex_reverse_image_search(file_path)
|
243 |
for image_url in image_urls:
|
244 |
print(f"Image URL: {image_url}")
|
src/application/text/entity.py
CHANGED
@@ -1,42 +1,51 @@
|
|
1 |
import colorsys
|
2 |
import json
|
|
|
3 |
import re
|
|
|
|
|
4 |
import openai
|
5 |
from dotenv import load_dotenv
|
6 |
-
import os
|
7 |
from transformers import pipeline
|
8 |
-
import gradio as gr
|
9 |
|
10 |
ner_pipeline = pipeline("ner")
|
11 |
|
12 |
load_dotenv()
|
13 |
-
AZURE_OPENAI_API_KEY = os.getenv(
|
14 |
-
AZURE_OPENAI_ENDPOINT = os.getenv(
|
15 |
-
AZURE_OPENAI_API_VERSION = os.getenv(
|
16 |
|
17 |
client = openai.AzureOpenAI(
|
18 |
-
api_version
|
19 |
-
api_key
|
20 |
-
azure_endpoint
|
21 |
-
|
22 |
|
23 |
|
24 |
-
def extract_entities_gpt(
|
|
|
|
|
|
|
|
|
25 |
# "gpt-4o-mini" or "o1-mini"
|
26 |
# Generate text using the selected models
|
27 |
prompt = f"""
|
28 |
-
Compare the ORIGINAL TEXT and the COMPARED TEXT.
|
29 |
-
|
30 |
-
Focus
|
31 |
-
* **Numerical changes:** e.g., "five"
|
32 |
-
* **Time changes:** e.g., "Monday"
|
33 |
-
* **Name changes:** e.g., "Tokyo"
|
34 |
-
* **Opposite meanings:** e.g., "increase"
|
35 |
-
* **Semantically different words:** e.g., "car"
|
36 |
-
|
37 |
-
Exclude entities where the meaning remains essentially the same,
|
38 |
-
|
39 |
-
|
|
|
|
|
|
|
|
|
40 |
[
|
41 |
["ORIGINAL_TEXT_entity_1", "COMPARED_TEXT_entity_1"],
|
42 |
["ORIGINAL_TEXT_entity_2", "COMPARED_TEXT_entity_2"]
|
@@ -50,23 +59,24 @@ If there are no entities that satisfy above condition, output empty list "[]".
|
|
50 |
# COMPARED TEXT:
|
51 |
{compared_text}
|
52 |
"""
|
53 |
-
|
54 |
# Generate text using the text generation model
|
55 |
-
# Generate text using the selected model
|
56 |
try:
|
57 |
response = client.chat.completions.create(
|
58 |
-
model=text_generation_model,
|
59 |
-
messages
|
60 |
)
|
61 |
-
|
62 |
res = response.choices[0].message.content
|
63 |
|
64 |
except openai.OpenAIError as e:
|
65 |
print(f"Error interacting with OpenAI API: {e}")
|
66 |
-
res =
|
67 |
|
68 |
return res
|
69 |
-
|
|
|
70 |
def read_json(json_string) -> list[list[str]]:
|
71 |
try:
|
72 |
entities = json.loads(json_string)
|
@@ -75,53 +85,64 @@ def read_json(json_string) -> list[list[str]]:
|
|
75 |
for inner_list in entities:
|
76 |
if inner_list not in unique_entities:
|
77 |
unique_entities.append(inner_list)
|
78 |
-
|
79 |
return unique_entities
|
80 |
|
81 |
except json.JSONDecodeError as e:
|
82 |
print(f"Error decoding JSON: {e}")
|
83 |
return []
|
84 |
|
|
|
85 |
def lighten_color(hex_color, factor=1.8):
|
86 |
"""Lightens a HEX color by increasing its brightness in HSV space."""
|
87 |
|
88 |
hex_color = hex_color.lstrip("#")
|
89 |
-
r, g, b =
|
90 |
-
|
|
|
|
|
|
|
|
|
91 |
# Convert to HSV
|
92 |
h, s, v = colorsys.rgb_to_hsv(r / 255.0, g / 255.0, b / 255.0)
|
93 |
v = min(1.0, v * factor) # Increase brightness
|
94 |
-
|
95 |
# Convert back to HEX
|
96 |
-
r, g, b =
|
97 |
-
return f
|
|
|
98 |
|
99 |
def darken_color(hex_color, factor=0.7):
|
100 |
"""Darkens a hex color by reducing its brightness in the HSV space."""
|
101 |
|
102 |
hex_color = hex_color.lstrip("#")
|
103 |
-
r, g, b =
|
104 |
-
|
|
|
|
|
|
|
|
|
105 |
# Convert to HSV to adjust brightness
|
106 |
h, s, v = colorsys.rgb_to_hsv(r / 255.0, g / 255.0, b / 255.0)
|
107 |
v = max(0, v * factor) # Reduce brightness
|
108 |
-
|
109 |
# Convert back to HEX
|
110 |
-
r, g, b =
|
111 |
-
return f
|
|
|
112 |
|
113 |
def generate_color(index, total_colors=20):
|
114 |
"""Generates a unique, evenly spaced color for each index using HSL."""
|
115 |
|
116 |
hue = index / total_colors # Spread hues in range [0,1]
|
117 |
saturation = 0.65 # Keep colors vivid
|
118 |
-
lightness = 0.75
|
119 |
-
|
120 |
# Convert HSL to RGB
|
121 |
r, g, b = colorsys.hls_to_rgb(hue, lightness, saturation)
|
122 |
r, g, b = int(r * 255), int(g * 255), int(b * 255)
|
123 |
-
|
124 |
-
return f
|
125 |
|
126 |
|
127 |
def assign_colors_to_entities(entities):
|
@@ -130,12 +151,15 @@ def assign_colors_to_entities(entities):
|
|
130 |
entities_colors = []
|
131 |
for index, entity in enumerate(entities):
|
132 |
color = generate_color(index, total_colors)
|
133 |
-
|
134 |
# append color and index to entities_colors
|
135 |
-
entities_colors.append(
|
136 |
-
|
|
|
|
|
137 |
return entities_colors
|
138 |
|
|
|
139 |
def highlight_entities(text1, text2):
|
140 |
if text1 == "" or text2 == "":
|
141 |
return []
|
@@ -154,49 +178,62 @@ def highlight_entities(text1, text2):
|
|
154 |
return entities_with_colors
|
155 |
|
156 |
|
157 |
-
def apply_highlight(text, entities_with_colors, key="input", count
|
158 |
if entities_with_colors == []:
|
159 |
return text, []
|
160 |
-
|
161 |
all_starts = []
|
162 |
all_ends = []
|
163 |
highlighted_text = ""
|
164 |
temp_text = text
|
165 |
for index, entity in enumerate(entities_with_colors):
|
166 |
highlighted_text = ""
|
167 |
-
|
168 |
-
# find a list of starts and ends of entity in text:
|
169 |
# starts = [m.start() for m in re.finditer(entity[key], temp_text)]
|
170 |
# ends = [m.end() for m in re.finditer(entity[key], temp_text)]
|
171 |
-
starts =[]
|
172 |
ends = []
|
173 |
# "\b" is for bound a word
|
174 |
-
for m in re.finditer(
|
|
|
|
|
|
|
175 |
starts.append(m.start())
|
176 |
ends.append(m.end())
|
177 |
-
|
178 |
all_starts.extend(starts)
|
179 |
all_ends.extend(ends)
|
180 |
-
|
181 |
color = entities_with_colors[index]["color"]
|
182 |
-
entity_color = lighten_color(
|
183 |
-
|
184 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
185 |
# Apply highlighting to each entity
|
186 |
prev_end = 0
|
187 |
for start, end in zip(starts, ends):
|
188 |
# Append non-highlighted text
|
189 |
highlighted_text += temp_text[prev_end:start]
|
190 |
-
|
191 |
# Style the index as a label
|
192 |
-
index_label = (
|
193 |
-
|
194 |
-
|
195 |
-
|
|
|
|
|
196 |
# Append highlighted text with index label
|
197 |
-
highlighted_text += (
|
198 |
-
|
199 |
-
|
|
|
|
|
200 |
prev_end = end
|
201 |
highlighted_text += temp_text[prev_end:]
|
202 |
temp_text = highlighted_text
|
@@ -206,6 +243,7 @@ def apply_highlight(text, entities_with_colors, key="input", count = 0):
|
|
206 |
highlight_idx_list = get_index_list(highlighted_text)
|
207 |
return highlighted_text, highlight_idx_list
|
208 |
|
|
|
209 |
def get_index_list(highlighted_text):
|
210 |
"""
|
211 |
Generates a list of indices between corresponding start and end indices.
|
@@ -216,7 +254,7 @@ def get_index_list(highlighted_text):
|
|
216 |
|
217 |
Returns:
|
218 |
A list containing all indices within the specified ranges.
|
219 |
-
Returns an empty list if the input is invalid (e.g., different lengths,
|
220 |
end < start, etc.).
|
221 |
"""
|
222 |
highlighted_index = []
|
@@ -226,22 +264,24 @@ def get_index_list(highlighted_text):
|
|
226 |
start_index = index
|
227 |
if word.endswith("</span>"):
|
228 |
end_index = index
|
229 |
-
|
230 |
highlighted_index.extend(list(range(start_index, end_index + 1)))
|
231 |
|
232 |
return highlighted_index
|
233 |
|
|
|
234 |
def extract_entities(text):
|
235 |
output = ner_pipeline(text)
|
236 |
words = extract_words(output)
|
237 |
words = combine_subwords(words)
|
238 |
-
|
239 |
-
# extract word in each entity and assign to a list of entities,
|
240 |
-
|
|
|
241 |
for entity in words:
|
242 |
if entity not in entities:
|
243 |
entities.append(entity)
|
244 |
-
|
245 |
return entities
|
246 |
|
247 |
|
@@ -275,8 +315,12 @@ def combine_subwords(word_list):
|
|
275 |
i = 0
|
276 |
while i < len(word_list):
|
277 |
if word_list[i].startswith("##"):
|
278 |
-
result[-1] += word_list[i][
|
279 |
-
|
|
|
|
|
|
|
|
|
280 |
result.append(word_list[i] + word_list[i + 1] + word_list[i + 2])
|
281 |
i += 2 # Skip the next two words
|
282 |
else:
|
@@ -286,44 +330,57 @@ def combine_subwords(word_list):
|
|
286 |
|
287 |
|
288 |
original_text = """
|
289 |
-
Title: UK pledges support for Ukraine with 100-year pact
|
290 |
-
Content: Sir Keir Starmer has pledged to put Ukraine in the "strongest
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
291 |
"""
|
292 |
compared_text = """
|
293 |
Title: Japan pledges support for Ukraine with 100-year pact
|
294 |
-
Content: A leading Japanese figure has pledged to put Ukraine
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
295 |
"""
|
296 |
if __name__ == "__main__":
|
297 |
-
# text = "The Saudi authorities, I am told, are currently working flat out" \
|
298 |
-
# "to collate everything they have on the Magdeburg market suspect," \
|
299 |
-
# "Taleb al-Abdulmohsen, and to share it with Germany's ongoing" \
|
300 |
-
# "investigation"
|
301 |
-
# print(extract_entities(text))
|
302 |
-
|
303 |
-
|
304 |
with gr.Blocks() as demo:
|
305 |
gr.Markdown("### Highlight Matching Parts Between Two Paragraphs")
|
306 |
text1_input = gr.Textbox(
|
307 |
-
label="Paragraph 1",
|
308 |
-
lines=5,
|
309 |
value=original_text,
|
310 |
)
|
311 |
text2_input = gr.Textbox(
|
312 |
-
label="Paragraph 2",
|
313 |
-
lines=5,
|
314 |
value=compared_text,
|
315 |
)
|
316 |
submit_button = gr.Button("Highlight Matches")
|
317 |
-
output1 = gr.HTML("<br>"*10)
|
318 |
-
output2 = gr.HTML("<br>"*10)
|
319 |
-
|
320 |
-
|
321 |
submit_button.click(
|
322 |
fn=highlight_entities,
|
323 |
inputs=[text1_input, text2_input],
|
324 |
-
outputs=[output1, output2]
|
325 |
)
|
326 |
-
|
327 |
# Launch the Gradio app
|
328 |
demo.launch()
|
329 |
-
|
|
|
1 |
import colorsys
|
2 |
import json
|
3 |
+
import os
|
4 |
import re
|
5 |
+
|
6 |
+
import gradio as gr
|
7 |
import openai
|
8 |
from dotenv import load_dotenv
|
|
|
9 |
from transformers import pipeline
|
|
|
10 |
|
11 |
ner_pipeline = pipeline("ner")
|
12 |
|
13 |
load_dotenv()
|
14 |
+
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
|
15 |
+
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
|
16 |
+
AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION")
|
17 |
|
18 |
client = openai.AzureOpenAI(
|
19 |
+
api_version="2024-05-01-preview", # AZURE_OPENAI_API_VERSION,
|
20 |
+
api_key=AZURE_OPENAI_API_KEY,
|
21 |
+
azure_endpoint=AZURE_OPENAI_ENDPOINT,
|
22 |
+
)
|
23 |
|
24 |
|
25 |
+
def extract_entities_gpt(
|
26 |
+
original_text,
|
27 |
+
compared_text,
|
28 |
+
text_generation_model="o1-mini",
|
29 |
+
):
|
30 |
# "gpt-4o-mini" or "o1-mini"
|
31 |
# Generate text using the selected models
|
32 |
prompt = f"""
|
33 |
+
Compare the ORIGINAL TEXT and the COMPARED TEXT.
|
34 |
+
Find entity pairs with significantly different meanings after paraphrasing.
|
35 |
+
Focus only on these significantly changed entities. These include:
|
36 |
+
* **Numerical changes:** e.g., "five" -> "ten," "10%" -> "50%"
|
37 |
+
* **Time changes:** e.g., "Monday" -> "Sunday," "10th" -> "21st"
|
38 |
+
* **Name changes:** e.g., "Tokyo" -> "New York," "Japan" -> "Japanese"
|
39 |
+
* **Opposite meanings:** e.g., "increase" -> "decrease," "good" -> "bad"
|
40 |
+
* **Semantically different words:** e.g., "car" -> "truck," "walk" -> "run"
|
41 |
+
|
42 |
+
Exclude entities where the meaning remains essentially the same,
|
43 |
+
even if the wording is different
|
44 |
+
(e.g., "big" changed to "large," "house" changed to "residence").
|
45 |
+
Also exclude purely stylistic changes that don't affect the core meaning.
|
46 |
+
|
47 |
+
Output the extracted entity pairs, one pair per line,
|
48 |
+
in the following JSON-like list format without wrapping characters:
|
49 |
[
|
50 |
["ORIGINAL_TEXT_entity_1", "COMPARED_TEXT_entity_1"],
|
51 |
["ORIGINAL_TEXT_entity_2", "COMPARED_TEXT_entity_2"]
|
|
|
59 |
# COMPARED TEXT:
|
60 |
{compared_text}
|
61 |
"""
|
62 |
+
|
63 |
# Generate text using the text generation model
|
64 |
+
# Generate text using the selected model
|
65 |
try:
|
66 |
response = client.chat.completions.create(
|
67 |
+
model=text_generation_model,
|
68 |
+
messages=[{"role": "user", "content": prompt}],
|
69 |
)
|
70 |
+
|
71 |
res = response.choices[0].message.content
|
72 |
|
73 |
except openai.OpenAIError as e:
|
74 |
print(f"Error interacting with OpenAI API: {e}")
|
75 |
+
res = ""
|
76 |
|
77 |
return res
|
78 |
+
|
79 |
+
|
80 |
def read_json(json_string) -> list[list[str]]:
|
81 |
try:
|
82 |
entities = json.loads(json_string)
|
|
|
85 |
for inner_list in entities:
|
86 |
if inner_list not in unique_entities:
|
87 |
unique_entities.append(inner_list)
|
88 |
+
|
89 |
return unique_entities
|
90 |
|
91 |
except json.JSONDecodeError as e:
|
92 |
print(f"Error decoding JSON: {e}")
|
93 |
return []
|
94 |
|
95 |
+
|
96 |
def lighten_color(hex_color, factor=1.8):
|
97 |
"""Lightens a HEX color by increasing its brightness in HSV space."""
|
98 |
|
99 |
hex_color = hex_color.lstrip("#")
|
100 |
+
r, g, b = (
|
101 |
+
int(hex_color[0:2], 16),
|
102 |
+
int(hex_color[2:4], 16),
|
103 |
+
int(hex_color[4:6], 16),
|
104 |
+
)
|
105 |
+
|
106 |
# Convert to HSV
|
107 |
h, s, v = colorsys.rgb_to_hsv(r / 255.0, g / 255.0, b / 255.0)
|
108 |
v = min(1.0, v * factor) # Increase brightness
|
109 |
+
|
110 |
# Convert back to HEX
|
111 |
+
r, g, b = (int(c * 255) for c in colorsys.hsv_to_rgb(h, s, v))
|
112 |
+
return f"#{r:02x}{g:02x}{b:02x}"
|
113 |
+
|
114 |
|
115 |
def darken_color(hex_color, factor=0.7):
|
116 |
"""Darkens a hex color by reducing its brightness in the HSV space."""
|
117 |
|
118 |
hex_color = hex_color.lstrip("#")
|
119 |
+
r, g, b = (
|
120 |
+
int(hex_color[0:2], 16),
|
121 |
+
int(hex_color[2:4], 16),
|
122 |
+
int(hex_color[4:6], 16),
|
123 |
+
)
|
124 |
+
|
125 |
# Convert to HSV to adjust brightness
|
126 |
h, s, v = colorsys.rgb_to_hsv(r / 255.0, g / 255.0, b / 255.0)
|
127 |
v = max(0, v * factor) # Reduce brightness
|
128 |
+
|
129 |
# Convert back to HEX
|
130 |
+
r, g, b = (int(c * 255) for c in colorsys.hsv_to_rgb(h, s, v))
|
131 |
+
return f"#{r:02x}{g:02x}{b:02x}"
|
132 |
+
|
133 |
|
134 |
def generate_color(index, total_colors=20):
|
135 |
"""Generates a unique, evenly spaced color for each index using HSL."""
|
136 |
|
137 |
hue = index / total_colors # Spread hues in range [0,1]
|
138 |
saturation = 0.65 # Keep colors vivid
|
139 |
+
lightness = 0.75 # Balanced brightness
|
140 |
+
|
141 |
# Convert HSL to RGB
|
142 |
r, g, b = colorsys.hls_to_rgb(hue, lightness, saturation)
|
143 |
r, g, b = int(r * 255), int(g * 255), int(b * 255)
|
144 |
+
|
145 |
+
return f"#{r:02x}{g:02x}{b:02x}" # Convert to hex
|
146 |
|
147 |
|
148 |
def assign_colors_to_entities(entities):
|
|
|
151 |
entities_colors = []
|
152 |
for index, entity in enumerate(entities):
|
153 |
color = generate_color(index, total_colors)
|
154 |
+
|
155 |
# append color and index to entities_colors
|
156 |
+
entities_colors.append(
|
157 |
+
{"color": color, "input": entity[0], "source": entity[1]},
|
158 |
+
)
|
159 |
+
|
160 |
return entities_colors
|
161 |
|
162 |
+
|
163 |
def highlight_entities(text1, text2):
|
164 |
if text1 == "" or text2 == "":
|
165 |
return []
|
|
|
178 |
return entities_with_colors
|
179 |
|
180 |
|
181 |
+
def apply_highlight(text, entities_with_colors, key="input", count=0):
|
182 |
if entities_with_colors == []:
|
183 |
return text, []
|
184 |
+
|
185 |
all_starts = []
|
186 |
all_ends = []
|
187 |
highlighted_text = ""
|
188 |
temp_text = text
|
189 |
for index, entity in enumerate(entities_with_colors):
|
190 |
highlighted_text = ""
|
191 |
+
|
192 |
+
# find a list of starts and ends of entity in text:
|
193 |
# starts = [m.start() for m in re.finditer(entity[key], temp_text)]
|
194 |
# ends = [m.end() for m in re.finditer(entity[key], temp_text)]
|
195 |
+
starts = []
|
196 |
ends = []
|
197 |
# "\b" is for bound a word
|
198 |
+
for m in re.finditer(
|
199 |
+
r"\b" + re.escape(entity[key]) + r"\b",
|
200 |
+
temp_text,
|
201 |
+
):
|
202 |
starts.append(m.start())
|
203 |
ends.append(m.end())
|
204 |
+
|
205 |
all_starts.extend(starts)
|
206 |
all_ends.extend(ends)
|
207 |
+
|
208 |
color = entities_with_colors[index]["color"]
|
209 |
+
entity_color = lighten_color(
|
210 |
+
color,
|
211 |
+
factor=2.2,
|
212 |
+
) # Lightened color for background text
|
213 |
+
label_color = darken_color(
|
214 |
+
entity_color,
|
215 |
+
factor=0.7,
|
216 |
+
) # Darker color for background label (index)
|
217 |
+
|
218 |
# Apply highlighting to each entity
|
219 |
prev_end = 0
|
220 |
for start, end in zip(starts, ends):
|
221 |
# Append non-highlighted text
|
222 |
highlighted_text += temp_text[prev_end:start]
|
223 |
+
|
224 |
# Style the index as a label
|
225 |
+
index_label = (
|
226 |
+
f'<span_style="background-color:{label_color};color:white;'
|
227 |
+
f"padding:1px_4px;border-radius:4px;font-size:12px;"
|
228 |
+
f'font-weight:bold;display:inline-block;margin-right:4px;">{index + 1 + count}</span>' # noqa: E501
|
229 |
+
)
|
230 |
+
|
231 |
# Append highlighted text with index label
|
232 |
+
highlighted_text += (
|
233 |
+
f'\n<span_style="background-color:{entity_color};color:black;'
|
234 |
+
f'border-radius:3px;font-size:14px;display:inline-block;">'
|
235 |
+
f"{index_label}{temp_text[start:end]}</span>\n"
|
236 |
+
)
|
237 |
prev_end = end
|
238 |
highlighted_text += temp_text[prev_end:]
|
239 |
temp_text = highlighted_text
|
|
|
243 |
highlight_idx_list = get_index_list(highlighted_text)
|
244 |
return highlighted_text, highlight_idx_list
|
245 |
|
246 |
+
|
247 |
def get_index_list(highlighted_text):
|
248 |
"""
|
249 |
Generates a list of indices between corresponding start and end indices.
|
|
|
254 |
|
255 |
Returns:
|
256 |
A list containing all indices within the specified ranges.
|
257 |
+
Returns an empty list if the input is invalid (e.g., different lengths,
|
258 |
end < start, etc.).
|
259 |
"""
|
260 |
highlighted_index = []
|
|
|
264 |
start_index = index
|
265 |
if word.endswith("</span>"):
|
266 |
end_index = index
|
267 |
+
|
268 |
highlighted_index.extend(list(range(start_index, end_index + 1)))
|
269 |
|
270 |
return highlighted_index
|
271 |
|
272 |
+
|
273 |
def extract_entities(text):
|
274 |
output = ner_pipeline(text)
|
275 |
words = extract_words(output)
|
276 |
words = combine_subwords(words)
|
277 |
+
|
278 |
+
# extract word in each entity and assign to a list of entities,
|
279 |
+
# connect words if there is no space between them
|
280 |
+
entities = []
|
281 |
for entity in words:
|
282 |
if entity not in entities:
|
283 |
entities.append(entity)
|
284 |
+
|
285 |
return entities
|
286 |
|
287 |
|
|
|
315 |
i = 0
|
316 |
while i < len(word_list):
|
317 |
if word_list[i].startswith("##"):
|
318 |
+
result[-1] += word_list[i][
|
319 |
+
2:
|
320 |
+
] # Remove "##" and append to the previous word
|
321 |
+
elif (
|
322 |
+
i < len(word_list) - 2 and word_list[i + 1] == "-"
|
323 |
+
): # Combine hyphenated words
|
324 |
result.append(word_list[i] + word_list[i + 1] + word_list[i + 2])
|
325 |
i += 2 # Skip the next two words
|
326 |
else:
|
|
|
330 |
|
331 |
|
332 |
original_text = """
|
333 |
+
Title: UK pledges support for Ukraine with 100-year pact
|
334 |
+
Content: Sir Keir Starmer has pledged to put Ukraine in the "strongest
|
335 |
+
possible position" on a trip to Kyiv where he signed a "landmark"
|
336 |
+
100-year pact with the war-stricken country. The prime minister's
|
337 |
+
visit on Thursday was at one point marked by loud blasts and air
|
338 |
+
raid sirens after a reported Russian drone attack was intercepted
|
339 |
+
by Ukraine's defence systems. Acknowledging the "hello" from Russia,
|
340 |
+
Volodymyr Zelensky said Ukraine would send its own "hello back".
|
341 |
+
An estimated one million people have been killed or wounded in the
|
342 |
+
war so far. As the invasion reaches the end of its third year, Ukraine
|
343 |
+
is losing territory in the east. Zelensky praised the UK's commitment
|
344 |
+
on Thursday, amid wider concerns that the US President-elect Donald
|
345 |
+
Trump, who is set to take office on Monday, could potentially reduce aid.
|
346 |
"""
|
347 |
compared_text = """
|
348 |
Title: Japan pledges support for Ukraine with 100-year pact
|
349 |
+
Content: A leading Japanese figure has pledged to put Ukraine
|
350 |
+
in the "strongest possible position" on a trip to Kyiv where
|
351 |
+
they signed a "landmark" 100-year pact with the war-stricken country.
|
352 |
+
The visit on Thursday was at one point marked by loud blasts and air
|
353 |
+
raid sirens after a reported Russian drone attack was intercepted by
|
354 |
+
Ukraine's defence systems. Acknowledging the "hello" from Russia,
|
355 |
+
Volodymyr Zelensky said Ukraine would send its own "hello back".
|
356 |
+
An estimated one million people have been killed or wounded in the
|
357 |
+
war so far. As the invasion reaches the end of its third year, Ukraine
|
358 |
+
is losing territory in the east. Zelensky praised Japan's commitment
|
359 |
+
on Thursday, amid wider concerns that the next US President, who is
|
360 |
+
set to take office on Monday, could potentially reduce aid.
|
361 |
"""
|
362 |
if __name__ == "__main__":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
363 |
with gr.Blocks() as demo:
|
364 |
gr.Markdown("### Highlight Matching Parts Between Two Paragraphs")
|
365 |
text1_input = gr.Textbox(
|
366 |
+
label="Paragraph 1",
|
367 |
+
lines=5,
|
368 |
value=original_text,
|
369 |
)
|
370 |
text2_input = gr.Textbox(
|
371 |
+
label="Paragraph 2",
|
372 |
+
lines=5,
|
373 |
value=compared_text,
|
374 |
)
|
375 |
submit_button = gr.Button("Highlight Matches")
|
376 |
+
output1 = gr.HTML("<br>" * 10)
|
377 |
+
output2 = gr.HTML("<br>" * 10)
|
378 |
+
|
|
|
379 |
submit_button.click(
|
380 |
fn=highlight_entities,
|
381 |
inputs=[text1_input, text2_input],
|
382 |
+
outputs=[output1, output2],
|
383 |
)
|
384 |
+
|
385 |
# Launch the Gradio app
|
386 |
demo.launch()
|
|
src/application/text/helper.py
CHANGED
@@ -1,73 +1,72 @@
|
|
1 |
-
from collections import Counter
|
2 |
-
from difflib import SequenceMatcher
|
3 |
import re
|
4 |
import string
|
5 |
-
from
|
|
|
|
|
6 |
from nltk.tokenize import word_tokenize
|
7 |
from nltk.util import ngrams
|
|
|
8 |
|
9 |
|
10 |
def clean_text(text):
|
11 |
"""Doc cleaning"""
|
12 |
-
|
|
|
|
|
13 |
# Lowering text
|
14 |
text = text.lower()
|
15 |
-
|
16 |
# Removing punctuation
|
17 |
text = "".join([c for c in text if c not in punctuations])
|
18 |
-
|
19 |
# Removing whitespace and newlines
|
20 |
-
text = re.sub(r
|
21 |
-
|
22 |
text.replace("Β£", " * ")
|
23 |
-
|
24 |
words = text.split()
|
25 |
-
text =
|
26 |
-
|
27 |
return text
|
28 |
|
|
|
29 |
def remove_punctuation(text):
|
30 |
"""Remove punctuation from a given text."""
|
31 |
punctuation_without_dot = string.punctuation.replace(".", "")
|
32 |
-
translator = str.maketrans(
|
33 |
return text.translate(translator)
|
34 |
|
|
|
35 |
def get_keywords(text, num_keywords=5):
|
36 |
"""Return top k keywords from a doc using TF-IDF method"""
|
37 |
-
|
38 |
# Create a TF-IDF Vectorizer
|
39 |
-
vectorizer = TfidfVectorizer(stop_words=
|
40 |
-
|
41 |
# Fit and transform the text
|
42 |
tfidf_matrix = vectorizer.fit_transform([text])
|
43 |
-
|
44 |
# Get feature names (words)
|
45 |
feature_names = vectorizer.get_feature_names_out()
|
46 |
-
|
47 |
# Get TF-IDF scores
|
48 |
tfidf_scores = tfidf_matrix.toarray()[0]
|
49 |
-
|
50 |
# Sort words by TF-IDF score
|
51 |
word_scores = list(zip(feature_names, tfidf_scores))
|
52 |
word_scores.sort(key=lambda x: x[1], reverse=True)
|
53 |
-
|
54 |
# Return top keywords
|
55 |
return [word for word, score in word_scores[:num_keywords]]
|
56 |
|
57 |
-
"""
|
58 |
-
# Example usage
|
59 |
-
text = "Artificial intelligence (AI) is intelligence demonstrated by machines, as opposed to natural intelligence displayed by animals including humans. Leading AI textbooks define the field as the study of "intelligent agents": any system that perceives its environment and takes actions that maximize its chance of achieving its goals. Some popular accounts use the term "artificial intelligence" to describe machines that mimic "cognitive" functions that humans associate with the human mind, such as "learning" and "problem solving", however this definition is rejected by major AI researchers."
|
60 |
-
print(f"\n# Input text:\n'{text}'")
|
61 |
-
print("\n----------------------\n")
|
62 |
-
|
63 |
-
keywords = get_keywords(text)
|
64 |
-
print("# Top keywords:", keywords)
|
65 |
-
print("\n----------------------\n")
|
66 |
-
"""
|
67 |
|
68 |
-
def get_important_sentences(
|
|
|
|
|
|
|
|
|
69 |
"""
|
70 |
-
Selects important sentences
|
71 |
|
72 |
Args:
|
73 |
paragraph (str): The input paragraph.
|
@@ -78,8 +77,10 @@ def get_important_sentences(paragraph: str, keywords: list[str], num_sentences:
|
|
78 |
list: A list of important sentences.
|
79 |
"""
|
80 |
# Clean and split the paragraph into sentences
|
81 |
-
sentences = [
|
82 |
-
|
|
|
|
|
83 |
# Calculate the importance score for each sentence
|
84 |
sentence_scores = []
|
85 |
for sentence in sentences:
|
@@ -87,54 +88,49 @@ def get_important_sentences(paragraph: str, keywords: list[str], num_sentences:
|
|
87 |
score = 0
|
88 |
words = processed_sentence.lower().split()
|
89 |
word_count = Counter(words)
|
90 |
-
|
91 |
for keyword in keywords:
|
92 |
if keyword.lower() in word_count:
|
93 |
score += word_count[keyword.lower()]
|
94 |
-
|
95 |
sentence_scores.append((sentence, score))
|
96 |
-
|
97 |
# Sort sentences by their scores in descending order
|
98 |
sentence_scores.sort(key=lambda x: x[1], reverse=True)
|
99 |
-
|
100 |
# Return the top N sentences
|
101 |
return [sentence for sentence, score in sentence_scores[:num_sentences]]
|
102 |
|
103 |
-
"""# Example usage
|
104 |
-
keywords = get_keywords(paragraph)
|
105 |
-
important_sentences = get_important_sentences(paragraph, keywords)
|
106 |
-
|
107 |
-
print("# Important sentences:")
|
108 |
-
for i, sentence in enumerate(important_sentences, 1):
|
109 |
-
print(f"{i}. {sentence}")
|
110 |
-
print("\n----------------------\n")
|
111 |
-
"""
|
112 |
|
113 |
-
def extract_important_phrases(
|
|
|
|
|
|
|
|
|
114 |
"""
|
115 |
-
Extracts important phrases
|
116 |
Phrase length is auto-determined, and overlapped parts are less than 20%.
|
117 |
|
118 |
Args:
|
119 |
paragraph (str): The input paragraph.
|
120 |
keywords (list[str]): List of important keywords.
|
121 |
-
phrase_length (int):
|
122 |
|
123 |
Returns:
|
124 |
list: A list of important phrases.
|
125 |
"""
|
126 |
# Tokenize the paragraph into words
|
127 |
words = word_tokenize(paragraph.lower())
|
128 |
-
|
129 |
# Determine phrase length (between 3 and 7 words)
|
130 |
phrase_length = min(max(len(words) // 10, 5), 7)
|
131 |
-
|
132 |
# Generate n-grams (phrases) from the paragraph
|
133 |
phrases = list(ngrams(words, phrase_length))
|
134 |
-
|
135 |
important_phrases = []
|
136 |
used_indices = set()
|
137 |
-
|
138 |
for i, phrase in enumerate(phrases):
|
139 |
# Check if the phrase contains any keyword
|
140 |
if any(keyword.lower() in phrase for keyword in keywords):
|
@@ -142,33 +138,36 @@ def extract_important_phrases(paragraph: str, keywords: list[str], phrase_length
|
|
142 |
if not any(abs(i - j) < phrase_length * 0.8 for j in used_indices):
|
143 |
important_phrases.append(clean_text(" ".join(phrase)))
|
144 |
used_indices.add(i)
|
145 |
-
|
146 |
return important_phrases
|
147 |
|
|
|
148 |
def extract_equal_text(text1, text2):
|
149 |
def cleanup(text):
|
150 |
text = text.lower()
|
151 |
-
text = text.translate(str.maketrans(
|
152 |
return text
|
153 |
-
|
154 |
splited_text1 = cleanup(text1).split()
|
155 |
splited_text2 = cleanup(text2).split()
|
156 |
-
|
157 |
s = SequenceMatcher(None, splited_text1, splited_text2)
|
158 |
-
|
159 |
equal_idx_1 = []
|
160 |
equal_idx_2 = []
|
161 |
text1 = text1.split()
|
162 |
text2 = text2.split()
|
163 |
for tag, i1, i2, j1, j2 in s.get_opcodes():
|
164 |
-
if tag ==
|
165 |
equal_idx_1.append({"start": i1, "end": i2})
|
166 |
equal_idx_2.append({"start": j1, "end": j2})
|
167 |
# subtext_1 = " ".join(text1[i1:i2])
|
168 |
# subtext_2 = " ".join(text2[j1:j2])
|
169 |
-
# print(f'{tag:7} a[{i1:2}:{i2:2}]
|
|
|
170 |
return equal_idx_1, equal_idx_2
|
171 |
|
|
|
172 |
def connect_consecutive_indexes(nums):
|
173 |
"""
|
174 |
Connects consecutive integers in a list.
|
@@ -197,11 +196,3 @@ def connect_consecutive_indexes(nums):
|
|
197 |
|
198 |
result.append([start, end]) # Add the last range
|
199 |
return result
|
200 |
-
|
201 |
-
"""# Example usage
|
202 |
-
keywords = get_keywords(paragraph)
|
203 |
-
important_phrases = extract_important_phrases(paragraph, keywords)
|
204 |
-
|
205 |
-
print("# Important phrases:")
|
206 |
-
for i, phrase in enumerate(important_phrases[:5], 1): # Print top 5 phrases
|
207 |
-
print(f"{i}. {phrase}")"""
|
|
|
|
|
|
|
1 |
import re
|
2 |
import string
|
3 |
+
from collections import Counter
|
4 |
+
from difflib import SequenceMatcher
|
5 |
+
|
6 |
from nltk.tokenize import word_tokenize
|
7 |
from nltk.util import ngrams
|
8 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
9 |
|
10 |
|
11 |
def clean_text(text):
|
12 |
"""Doc cleaning"""
|
13 |
+
# exclude , and . due to number
|
14 |
+
punctuations = r"""!"#$%&'()*+-/:;<=>?@[\]^_`{|}~"""
|
15 |
+
|
16 |
# Lowering text
|
17 |
text = text.lower()
|
18 |
+
|
19 |
# Removing punctuation
|
20 |
text = "".join([c for c in text if c not in punctuations])
|
21 |
+
|
22 |
# Removing whitespace and newlines
|
23 |
+
text = re.sub(r"\s+", " ", text)
|
24 |
+
|
25 |
text.replace("Β£", " * ")
|
26 |
+
|
27 |
words = text.split()
|
28 |
+
text = " ".join(words[:18]) # Join the first 18 words back into a string
|
29 |
+
|
30 |
return text
|
31 |
|
32 |
+
|
33 |
def remove_punctuation(text):
|
34 |
"""Remove punctuation from a given text."""
|
35 |
punctuation_without_dot = string.punctuation.replace(".", "")
|
36 |
+
translator = str.maketrans("", "", punctuation_without_dot)
|
37 |
return text.translate(translator)
|
38 |
|
39 |
+
|
40 |
def get_keywords(text, num_keywords=5):
|
41 |
"""Return top k keywords from a doc using TF-IDF method"""
|
42 |
+
|
43 |
# Create a TF-IDF Vectorizer
|
44 |
+
vectorizer = TfidfVectorizer(stop_words="english")
|
45 |
+
|
46 |
# Fit and transform the text
|
47 |
tfidf_matrix = vectorizer.fit_transform([text])
|
48 |
+
|
49 |
# Get feature names (words)
|
50 |
feature_names = vectorizer.get_feature_names_out()
|
51 |
+
|
52 |
# Get TF-IDF scores
|
53 |
tfidf_scores = tfidf_matrix.toarray()[0]
|
54 |
+
|
55 |
# Sort words by TF-IDF score
|
56 |
word_scores = list(zip(feature_names, tfidf_scores))
|
57 |
word_scores.sort(key=lambda x: x[1], reverse=True)
|
58 |
+
|
59 |
# Return top keywords
|
60 |
return [word for word, score in word_scores[:num_keywords]]
|
61 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
62 |
|
63 |
+
def get_important_sentences(
|
64 |
+
paragraph: str,
|
65 |
+
keywords: list[str],
|
66 |
+
num_sentences: int = 3,
|
67 |
+
) -> list[str]:
|
68 |
"""
|
69 |
+
Selects important sentences based on a list of keywords.
|
70 |
|
71 |
Args:
|
72 |
paragraph (str): The input paragraph.
|
|
|
77 |
list: A list of important sentences.
|
78 |
"""
|
79 |
# Clean and split the paragraph into sentences
|
80 |
+
sentences = [
|
81 |
+
s.strip() for s in re.split(r"(?<=[.!?])\s+", paragraph) if s.strip()
|
82 |
+
]
|
83 |
+
|
84 |
# Calculate the importance score for each sentence
|
85 |
sentence_scores = []
|
86 |
for sentence in sentences:
|
|
|
88 |
score = 0
|
89 |
words = processed_sentence.lower().split()
|
90 |
word_count = Counter(words)
|
91 |
+
|
92 |
for keyword in keywords:
|
93 |
if keyword.lower() in word_count:
|
94 |
score += word_count[keyword.lower()]
|
95 |
+
|
96 |
sentence_scores.append((sentence, score))
|
97 |
+
|
98 |
# Sort sentences by their scores in descending order
|
99 |
sentence_scores.sort(key=lambda x: x[1], reverse=True)
|
100 |
+
|
101 |
# Return the top N sentences
|
102 |
return [sentence for sentence, score in sentence_scores[:num_sentences]]
|
103 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
104 |
|
105 |
+
def extract_important_phrases(
|
106 |
+
paragraph: str,
|
107 |
+
keywords: list[str],
|
108 |
+
phrase_length: int = 5,
|
109 |
+
) -> list[str]:
|
110 |
"""
|
111 |
+
Extracts important phrases based on a list of keywords.
|
112 |
Phrase length is auto-determined, and overlapped parts are less than 20%.
|
113 |
|
114 |
Args:
|
115 |
paragraph (str): The input paragraph.
|
116 |
keywords (list[str]): List of important keywords.
|
117 |
+
phrase_length (int): Length of phrases to extract (default: 5 words).
|
118 |
|
119 |
Returns:
|
120 |
list: A list of important phrases.
|
121 |
"""
|
122 |
# Tokenize the paragraph into words
|
123 |
words = word_tokenize(paragraph.lower())
|
124 |
+
|
125 |
# Determine phrase length (between 3 and 7 words)
|
126 |
phrase_length = min(max(len(words) // 10, 5), 7)
|
127 |
+
|
128 |
# Generate n-grams (phrases) from the paragraph
|
129 |
phrases = list(ngrams(words, phrase_length))
|
130 |
+
|
131 |
important_phrases = []
|
132 |
used_indices = set()
|
133 |
+
|
134 |
for i, phrase in enumerate(phrases):
|
135 |
# Check if the phrase contains any keyword
|
136 |
if any(keyword.lower() in phrase for keyword in keywords):
|
|
|
138 |
if not any(abs(i - j) < phrase_length * 0.8 for j in used_indices):
|
139 |
important_phrases.append(clean_text(" ".join(phrase)))
|
140 |
used_indices.add(i)
|
141 |
+
|
142 |
return important_phrases
|
143 |
|
144 |
+
|
145 |
def extract_equal_text(text1, text2):
|
146 |
def cleanup(text):
|
147 |
text = text.lower()
|
148 |
+
text = text.translate(str.maketrans("", "", string.punctuation))
|
149 |
return text
|
150 |
+
|
151 |
splited_text1 = cleanup(text1).split()
|
152 |
splited_text2 = cleanup(text2).split()
|
153 |
+
|
154 |
s = SequenceMatcher(None, splited_text1, splited_text2)
|
155 |
+
|
156 |
equal_idx_1 = []
|
157 |
equal_idx_2 = []
|
158 |
text1 = text1.split()
|
159 |
text2 = text2.split()
|
160 |
for tag, i1, i2, j1, j2 in s.get_opcodes():
|
161 |
+
if tag == "equal":
|
162 |
equal_idx_1.append({"start": i1, "end": i2})
|
163 |
equal_idx_2.append({"start": j1, "end": j2})
|
164 |
# subtext_1 = " ".join(text1[i1:i2])
|
165 |
# subtext_2 = " ".join(text2[j1:j2])
|
166 |
+
# print(f'{tag:7} a[{i1:2}:{i2:2}]
|
167 |
+
# --> b[{j1:2}:{j1:2}] {subtext_1!r:>55} --> {subtext_2!r}')
|
168 |
return equal_idx_1, equal_idx_2
|
169 |
|
170 |
+
|
171 |
def connect_consecutive_indexes(nums):
|
172 |
"""
|
173 |
Connects consecutive integers in a list.
|
|
|
196 |
|
197 |
result.append([start, end]) # Add the last range
|
198 |
return result
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/application/text/highlight_text.py
CHANGED
@@ -1,36 +1,45 @@
|
|
1 |
-
import gradio as gr
|
2 |
import colorsys
|
3 |
-
|
4 |
-
import
|
5 |
|
6 |
|
7 |
def lighten_color(hex_color, factor=1.8):
|
8 |
"""Lightens a HEX color by increasing its brightness in HSV space."""
|
9 |
|
10 |
hex_color = hex_color.lstrip("#")
|
11 |
-
r, g, b =
|
12 |
-
|
|
|
|
|
|
|
|
|
13 |
# Convert to HSV
|
14 |
h, s, v = colorsys.rgb_to_hsv(r / 255.0, g / 255.0, b / 255.0)
|
15 |
v = min(1.0, v * factor) # Increase brightness
|
16 |
-
|
17 |
# Convert back to HEX
|
18 |
-
r, g, b =
|
19 |
-
return f
|
|
|
20 |
|
21 |
def darken_color(hex_color, factor=0.7):
|
22 |
"""Darkens a hex color by reducing its brightness in the HSV space."""
|
23 |
|
24 |
hex_color = hex_color.lstrip("#")
|
25 |
-
r, g, b =
|
26 |
-
|
|
|
|
|
|
|
|
|
27 |
# Convert to HSV to adjust brightness
|
28 |
h, s, v = colorsys.rgb_to_hsv(r / 255.0, g / 255.0, b / 255.0)
|
29 |
v = max(0, v * factor) # Reduce brightness
|
30 |
-
|
31 |
# Convert back to HEX
|
32 |
-
r, g, b =
|
33 |
-
return f
|
|
|
34 |
|
35 |
# Generate unique colors for pairs
|
36 |
def generate_color(index, total_colors=20):
|
@@ -38,51 +47,98 @@ def generate_color(index, total_colors=20):
|
|
38 |
|
39 |
hue = index / total_colors # Spread hues in range [0,1]
|
40 |
saturation = 0.65 # Keep colors vivid
|
41 |
-
lightness = 0.75
|
42 |
-
|
43 |
# Convert HSL to RGB
|
44 |
r, g, b = colorsys.hls_to_rgb(hue, lightness, saturation)
|
45 |
r, g, b = int(r * 255), int(g * 255), int(b * 255)
|
46 |
-
|
47 |
-
return f
|
|
|
48 |
|
49 |
def highlight_pairs(text1, text2):
|
50 |
"""Highlight matching pairs between two paragraphs"""
|
51 |
# Predefined matching pairs
|
52 |
match_pairs = [
|
53 |
-
{
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
59 |
]
|
60 |
|
61 |
# Assign unique colors to each index
|
62 |
-
pair_colors = {
|
63 |
-
|
64 |
-
|
65 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
66 |
highlighted_text = ""
|
67 |
prev_end = 0
|
68 |
-
|
69 |
for pair in sorted(pairs, key=lambda x: x[key_start]):
|
70 |
start, end, index = pair[key_start], pair[key_end], pair[key_index]
|
71 |
-
color = pair_colors.get(
|
72 |
-
|
73 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
74 |
|
75 |
# Style the index as a label
|
76 |
-
index_label = (
|
77 |
-
|
78 |
-
|
|
|
|
|
79 |
|
80 |
# Append non-highlighted text
|
81 |
highlighted_text += text[prev_end:start]
|
82 |
# Append highlighted text with index label
|
83 |
-
highlighted_text += (
|
84 |
-
|
85 |
-
|
|
|
|
|
86 |
prev_end = end
|
87 |
|
88 |
# Append remaining text
|
@@ -90,36 +146,57 @@ def highlight_pairs(text1, text2):
|
|
90 |
return highlighted_text
|
91 |
|
92 |
# Apply highlighting to both paragraphs using the global MATCH_PAIRS
|
93 |
-
highlighted_text1 = apply_highlight(
|
94 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
95 |
|
96 |
return highlighted_text1, highlighted_text2
|
97 |
|
98 |
-
|
|
|
99 |
# Create Gradio Interface
|
100 |
text1 = ""
|
101 |
-
|
102 |
with gr.Blocks() as demo:
|
103 |
gr.Markdown("### Highlight Matching Parts Between Two Paragraphs")
|
104 |
text1_input = gr.Textbox(
|
105 |
-
label="Paragraph 1",
|
106 |
-
lines=5,
|
107 |
-
value="
|
|
|
|
|
|
|
108 |
)
|
109 |
text2_input = gr.Textbox(
|
110 |
-
label="Paragraph 2",
|
111 |
-
lines=5,
|
112 |
-
value="
|
|
|
|
|
|
|
113 |
)
|
114 |
output1 = gr.HTML()
|
115 |
output2 = gr.HTML()
|
116 |
submit_button = gr.Button("Highlight Matches")
|
117 |
-
|
118 |
submit_button.click(
|
119 |
fn=highlight_pairs,
|
120 |
inputs=[text1_input, text2_input],
|
121 |
-
outputs=[output1, output2]
|
122 |
)
|
123 |
-
|
124 |
# Launch the Gradio app
|
125 |
demo.launch()
|
|
|
|
|
1 |
import colorsys
|
2 |
+
|
3 |
+
import gradio as gr
|
4 |
|
5 |
|
6 |
def lighten_color(hex_color, factor=1.8):
|
7 |
"""Lightens a HEX color by increasing its brightness in HSV space."""
|
8 |
|
9 |
hex_color = hex_color.lstrip("#")
|
10 |
+
r, g, b = (
|
11 |
+
int(hex_color[0:2], 16),
|
12 |
+
int(hex_color[2:4], 16),
|
13 |
+
int(hex_color[4:6], 16),
|
14 |
+
)
|
15 |
+
|
16 |
# Convert to HSV
|
17 |
h, s, v = colorsys.rgb_to_hsv(r / 255.0, g / 255.0, b / 255.0)
|
18 |
v = min(1.0, v * factor) # Increase brightness
|
19 |
+
|
20 |
# Convert back to HEX
|
21 |
+
r, g, b = (int(c * 255) for c in colorsys.hsv_to_rgb(h, s, v))
|
22 |
+
return f"#{r:02x}{g:02x}{b:02x}"
|
23 |
+
|
24 |
|
25 |
def darken_color(hex_color, factor=0.7):
|
26 |
"""Darkens a hex color by reducing its brightness in the HSV space."""
|
27 |
|
28 |
hex_color = hex_color.lstrip("#")
|
29 |
+
r, g, b = (
|
30 |
+
int(hex_color[0:2], 16),
|
31 |
+
int(hex_color[2:4], 16),
|
32 |
+
int(hex_color[4:6], 16),
|
33 |
+
)
|
34 |
+
|
35 |
# Convert to HSV to adjust brightness
|
36 |
h, s, v = colorsys.rgb_to_hsv(r / 255.0, g / 255.0, b / 255.0)
|
37 |
v = max(0, v * factor) # Reduce brightness
|
38 |
+
|
39 |
# Convert back to HEX
|
40 |
+
r, g, b = (int(c * 255) for c in colorsys.hsv_to_rgb(h, s, v))
|
41 |
+
return f"#{r:02x}{g:02x}{b:02x}"
|
42 |
+
|
43 |
|
44 |
# Generate unique colors for pairs
|
45 |
def generate_color(index, total_colors=20):
|
|
|
47 |
|
48 |
hue = index / total_colors # Spread hues in range [0,1]
|
49 |
saturation = 0.65 # Keep colors vivid
|
50 |
+
lightness = 0.75 # Balanced brightness
|
51 |
+
|
52 |
# Convert HSL to RGB
|
53 |
r, g, b = colorsys.hls_to_rgb(hue, lightness, saturation)
|
54 |
r, g, b = int(r * 255), int(g * 255), int(b * 255)
|
55 |
+
|
56 |
+
return f"#{r:02x}{g:02x}{b:02x}" # Convert to hex
|
57 |
+
|
58 |
|
59 |
def highlight_pairs(text1, text2):
|
60 |
"""Highlight matching pairs between two paragraphs"""
|
61 |
# Predefined matching pairs
|
62 |
match_pairs = [
|
63 |
+
{
|
64 |
+
"index": 1,
|
65 |
+
"text1": "deep learning",
|
66 |
+
"start1": 13,
|
67 |
+
"end1": 26,
|
68 |
+
"text2": "deep learning",
|
69 |
+
"start2": 12,
|
70 |
+
"end2": 25,
|
71 |
+
},
|
72 |
+
{
|
73 |
+
"index": 2,
|
74 |
+
"text1": "neural networks",
|
75 |
+
"start1": 56,
|
76 |
+
"end1": 71,
|
77 |
+
"text2": "neural networks",
|
78 |
+
"start2": 68,
|
79 |
+
"end2": 83,
|
80 |
+
},
|
81 |
+
{
|
82 |
+
"index": 3,
|
83 |
+
"text1": "AI research",
|
84 |
+
"start1": 86,
|
85 |
+
"end1": 97,
|
86 |
+
"text2": "AI research",
|
87 |
+
"start2": 55,
|
88 |
+
"end2": 66,
|
89 |
+
},
|
90 |
]
|
91 |
|
92 |
# Assign unique colors to each index
|
93 |
+
pair_colors = {
|
94 |
+
pair["index"]: generate_color(
|
95 |
+
pair["index"],
|
96 |
+
total_colors=len(match_pairs),
|
97 |
+
)
|
98 |
+
for pair in match_pairs
|
99 |
+
}
|
100 |
+
|
101 |
+
def apply_highlight(
|
102 |
+
text,
|
103 |
+
pairs,
|
104 |
+
key_start,
|
105 |
+
key_end,
|
106 |
+
key_index,
|
107 |
+
pair_colors,
|
108 |
+
):
|
109 |
highlighted_text = ""
|
110 |
prev_end = 0
|
111 |
+
|
112 |
for pair in sorted(pairs, key=lambda x: x[key_start]):
|
113 |
start, end, index = pair[key_start], pair[key_end], pair[key_index]
|
114 |
+
color = pair_colors.get(
|
115 |
+
index,
|
116 |
+
"#ddd",
|
117 |
+
) # Default color if not found
|
118 |
+
color = lighten_color(
|
119 |
+
color,
|
120 |
+
factor=2.2,
|
121 |
+
) # Lightened color for background text
|
122 |
+
label_color = darken_color(
|
123 |
+
color,
|
124 |
+
factor=0.7,
|
125 |
+
) # Make label color darker
|
126 |
|
127 |
# Style the index as a label
|
128 |
+
index_label = (
|
129 |
+
f'<span style="background-color:{label_color}; color:white; '
|
130 |
+
f"padding:1px 4px; border-radius:4px; font-size:12px; "
|
131 |
+
f'font-weight:bold; display:inline-block; margin-right:4px;">{index}</span>' # noqa: E501
|
132 |
+
)
|
133 |
|
134 |
# Append non-highlighted text
|
135 |
highlighted_text += text[prev_end:start]
|
136 |
# Append highlighted text with index label
|
137 |
+
highlighted_text += (
|
138 |
+
f'<span style="background-color:{color}; '
|
139 |
+
f'border-radius:3px; font-size:14px; display:inline-block;">'
|
140 |
+
f"{index_label} {text[start:end]}</span>"
|
141 |
+
)
|
142 |
prev_end = end
|
143 |
|
144 |
# Append remaining text
|
|
|
146 |
return highlighted_text
|
147 |
|
148 |
# Apply highlighting to both paragraphs using the global MATCH_PAIRS
|
149 |
+
highlighted_text1 = apply_highlight(
|
150 |
+
text1,
|
151 |
+
match_pairs,
|
152 |
+
"start1",
|
153 |
+
"end1",
|
154 |
+
"index",
|
155 |
+
pair_colors,
|
156 |
+
)
|
157 |
+
highlighted_text2 = apply_highlight(
|
158 |
+
text2,
|
159 |
+
match_pairs,
|
160 |
+
"start2",
|
161 |
+
"end2",
|
162 |
+
"index",
|
163 |
+
pair_colors,
|
164 |
+
)
|
165 |
|
166 |
return highlighted_text1, highlighted_text2
|
167 |
|
168 |
+
|
169 |
+
if __name__ == "__main__":
|
170 |
# Create Gradio Interface
|
171 |
text1 = ""
|
172 |
+
|
173 |
with gr.Blocks() as demo:
|
174 |
gr.Markdown("### Highlight Matching Parts Between Two Paragraphs")
|
175 |
text1_input = gr.Textbox(
|
176 |
+
label="Paragraph 1",
|
177 |
+
lines=5,
|
178 |
+
value="""
|
179 |
+
The field of deep learning is advancing rapidly.
|
180 |
+
Modern neural networks are improving AI research significantly.
|
181 |
+
""",
|
182 |
)
|
183 |
text2_input = gr.Textbox(
|
184 |
+
label="Paragraph 2",
|
185 |
+
lines=5,
|
186 |
+
value="""
|
187 |
+
Advances in deep learning have led to breakthroughs in AI research.
|
188 |
+
Neural networks are at the core of these innovations",
|
189 |
+
""",
|
190 |
)
|
191 |
output1 = gr.HTML()
|
192 |
output2 = gr.HTML()
|
193 |
submit_button = gr.Button("Highlight Matches")
|
194 |
+
|
195 |
submit_button.click(
|
196 |
fn=highlight_pairs,
|
197 |
inputs=[text1_input, text2_input],
|
198 |
+
outputs=[output1, output2],
|
199 |
)
|
200 |
+
|
201 |
# Launch the Gradio app
|
202 |
demo.launch()
|
src/application/text/model_detection.py
CHANGED
@@ -19,7 +19,7 @@ def detect_text_by_ai_model(
|
|
19 |
"""
|
20 |
Model: chatgpt_detector_roberta
|
21 |
Ref: https://huggingface.co/Hello-SimpleAI/chatgpt-detector-roberta
|
22 |
-
|
23 |
Detects if text is human or machine generated.
|
24 |
|
25 |
Returns:
|
|
|
19 |
"""
|
20 |
Model: chatgpt_detector_roberta
|
21 |
Ref: https://huggingface.co/Hello-SimpleAI/chatgpt-detector-roberta
|
22 |
+
|
23 |
Detects if text is human or machine generated.
|
24 |
|
25 |
Returns:
|
src/application/text/preprocessing.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
from nltk.tokenize import sent_tokenize
|
2 |
|
|
|
3 |
def split_into_paragraphs(input_text):
|
4 |
"""
|
5 |
Splits input text into sentences by newlines.
|
@@ -17,6 +18,6 @@ def split_into_paragraphs(input_text):
|
|
17 |
sentences = []
|
18 |
for paragraph in paragraphs:
|
19 |
paragraph = paragraph.strip()
|
20 |
-
if paragraph and paragraph !=
|
21 |
sentences.extend(sent_tokenize(paragraph))
|
22 |
-
return sentences
|
|
|
1 |
from nltk.tokenize import sent_tokenize
|
2 |
|
3 |
+
|
4 |
def split_into_paragraphs(input_text):
|
5 |
"""
|
6 |
Splits input text into sentences by newlines.
|
|
|
18 |
sentences = []
|
19 |
for paragraph in paragraphs:
|
20 |
paragraph = paragraph.strip()
|
21 |
+
if paragraph and paragraph != "\n":
|
22 |
sentences.extend(sent_tokenize(paragraph))
|
23 |
+
return sentences
|
src/application/text/search.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
-
from collections import Counter
|
2 |
import os
|
3 |
import string
|
|
|
|
|
4 |
import requests
|
5 |
from dotenv import load_dotenv
|
6 |
from nltk.corpus import stopwords
|
@@ -9,27 +10,28 @@ from sklearn.feature_extraction.text import TfidfVectorizer
|
|
9 |
|
10 |
from src.application.text.entity import extract_entities
|
11 |
|
12 |
-
load_dotenv()
|
13 |
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
|
14 |
SEARCH_ENGINE_ID = os.getenv("SEARCH_ENGINE_ID")
|
15 |
|
|
|
16 |
def search_by_google(
|
17 |
-
query,
|
18 |
num_results=10,
|
19 |
-
is_exact_terms
|
20 |
-
|
21 |
"""
|
22 |
Searches the Google Custom Search Engine for the given query.
|
23 |
|
24 |
Args:
|
25 |
query: The search query.
|
26 |
-
is_exact_terms: Whether to use exact terms search (True) or
|
27 |
num_results: The number of results to return (default: 10).
|
28 |
|
29 |
Returns:
|
30 |
-
A
|
31 |
"""
|
32 |
-
|
33 |
url = "https://www.googleapis.com/customsearch/v1"
|
34 |
params = {
|
35 |
"key": GOOGLE_API_KEY,
|
@@ -40,7 +42,7 @@ def search_by_google(
|
|
40 |
params["exactTerms"] = query
|
41 |
else:
|
42 |
params["q"] = query.replace('"', "")
|
43 |
-
|
44 |
response = requests.get(url, params=params)
|
45 |
if response.status_code == 200:
|
46 |
return response.json()
|
@@ -48,9 +50,11 @@ def search_by_google(
|
|
48 |
print(f"Error: {response.status_code}, {response.text}")
|
49 |
return None
|
50 |
|
|
|
51 |
def get_most_frequent_words(input_text, number_word=32):
|
52 |
"""
|
53 |
-
Gets the top words from the input text,
|
|
|
54 |
|
55 |
Args:
|
56 |
input_text: The input text as a string.
|
@@ -65,18 +69,21 @@ def get_most_frequent_words(input_text, number_word=32):
|
|
65 |
|
66 |
words = word_tokenize(input_text.lower()) # Tokenize and lowercase
|
67 |
|
68 |
-
stop_words = set(stopwords.words(
|
69 |
-
punctuation = set(string.punctuation)
|
70 |
filtered_words = [
|
71 |
-
word
|
72 |
-
|
|
|
|
|
|
|
73 |
]
|
74 |
word_frequencies = Counter(filtered_words)
|
75 |
top_words = word_frequencies.most_common(number_word)
|
76 |
-
|
77 |
for top_word in top_words:
|
78 |
words.append(top_word[0])
|
79 |
-
|
80 |
if len(words) > 32:
|
81 |
search_phrase = " ".join(words[:32])
|
82 |
else:
|
@@ -84,6 +91,7 @@ def get_most_frequent_words(input_text, number_word=32):
|
|
84 |
|
85 |
return search_phrase
|
86 |
|
|
|
87 |
def get_chunk(input_text, chunk_length=32, num_chunk=3):
|
88 |
"""
|
89 |
Splits the input text into chunks of a specified length.
|
@@ -94,7 +102,7 @@ def get_chunk(input_text, chunk_length=32, num_chunk=3):
|
|
94 |
chunk_length: The desired length of each chunk (in words).
|
95 |
|
96 |
Returns:
|
97 |
-
A list of string chunks.
|
98 |
Returns an empty list if input is invalid.
|
99 |
"""
|
100 |
if not isinstance(input_text, str):
|
@@ -112,25 +120,26 @@ def get_chunk(input_text, chunk_length=32, num_chunk=3):
|
|
112 |
|
113 |
return chunks
|
114 |
|
|
|
115 |
def get_keywords(text, num_keywords=5):
|
116 |
"""Return top k keywords from a doc using TF-IDF method"""
|
117 |
-
|
118 |
# Create a TF-IDF Vectorizer
|
119 |
-
vectorizer = TfidfVectorizer(stop_words=
|
120 |
-
|
121 |
# Fit and transform the text
|
122 |
tfidf_matrix = vectorizer.fit_transform([text])
|
123 |
-
|
124 |
# Get feature names (words)
|
125 |
feature_names = vectorizer.get_feature_names_out()
|
126 |
-
|
127 |
# Get TF-IDF scores
|
128 |
tfidf_scores = tfidf_matrix.toarray()[0]
|
129 |
-
|
130 |
# Sort words by TF-IDF score
|
131 |
word_scores = list(zip(feature_names, tfidf_scores))
|
132 |
word_scores.sort(key=lambda x: x[1], reverse=True)
|
133 |
-
|
134 |
# Return top keywords
|
135 |
return [word for word, score in word_scores[:num_keywords]]
|
136 |
|
@@ -150,29 +159,30 @@ def generate_search_phrases(input_text):
|
|
150 |
"""
|
151 |
if not isinstance(input_text, str):
|
152 |
return []
|
153 |
-
|
154 |
search_phrases = []
|
155 |
-
|
156 |
# Method 1: Get most frequent words
|
157 |
search_phrases.append(get_most_frequent_words(input_text))
|
158 |
-
|
159 |
# Method 2: Get the whole text
|
160 |
search_phrases.append(input_text)
|
161 |
-
|
162 |
# Method 3: Split text by chunks
|
163 |
search_phrases.extend(get_chunk(input_text)) # TODO: for demo purposes
|
164 |
-
|
165 |
# Method 4: Get most identities and key words
|
166 |
entities = extract_entities(input_text)
|
167 |
text_without_entities = remove_identities_from_text(input_text, entities)
|
168 |
print(f"text_without_entities: {text_without_entities}")
|
169 |
search_phrases.append(text_without_entities)
|
170 |
-
#keywords = get_keywords(input_text, 16)
|
171 |
-
#search_phrase = " ".join(entities) + " " + " ".join(keywords)
|
172 |
# search_phrases.append(search_phrase) # TODO: for demo purposes
|
173 |
-
|
174 |
return search_phrases
|
175 |
|
|
|
176 |
def remove_identities_from_text(input_text, entities):
|
177 |
"""
|
178 |
Removes entities from the input text.
|
@@ -183,5 +193,5 @@ def remove_identities_from_text(input_text, entities):
|
|
183 |
"""
|
184 |
for entity in entities:
|
185 |
input_text = input_text.replace(entity, "")
|
186 |
-
|
187 |
return input_text
|
|
|
|
|
1 |
import os
|
2 |
import string
|
3 |
+
from collections import Counter
|
4 |
+
|
5 |
import requests
|
6 |
from dotenv import load_dotenv
|
7 |
from nltk.corpus import stopwords
|
|
|
10 |
|
11 |
from src.application.text.entity import extract_entities
|
12 |
|
13 |
+
load_dotenv()
|
14 |
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
|
15 |
SEARCH_ENGINE_ID = os.getenv("SEARCH_ENGINE_ID")
|
16 |
|
17 |
+
|
18 |
def search_by_google(
|
19 |
+
query,
|
20 |
num_results=10,
|
21 |
+
is_exact_terms=False,
|
22 |
+
) -> dict:
|
23 |
"""
|
24 |
Searches the Google Custom Search Engine for the given query.
|
25 |
|
26 |
Args:
|
27 |
query: The search query.
|
28 |
+
is_exact_terms: Whether to use exact terms search (True) or not.
|
29 |
num_results: The number of results to return (default: 10).
|
30 |
|
31 |
Returns:
|
32 |
+
A dict containing the search results or None if there was an error.
|
33 |
"""
|
34 |
+
|
35 |
url = "https://www.googleapis.com/customsearch/v1"
|
36 |
params = {
|
37 |
"key": GOOGLE_API_KEY,
|
|
|
42 |
params["exactTerms"] = query
|
43 |
else:
|
44 |
params["q"] = query.replace('"', "")
|
45 |
+
|
46 |
response = requests.get(url, params=params)
|
47 |
if response.status_code == 200:
|
48 |
return response.json()
|
|
|
50 |
print(f"Error: {response.status_code}, {response.text}")
|
51 |
return None
|
52 |
|
53 |
+
|
54 |
def get_most_frequent_words(input_text, number_word=32):
|
55 |
"""
|
56 |
+
Gets the top words from the input text,
|
57 |
+
excluding stop words and punctuation.
|
58 |
|
59 |
Args:
|
60 |
input_text: The input text as a string.
|
|
|
69 |
|
70 |
words = word_tokenize(input_text.lower()) # Tokenize and lowercase
|
71 |
|
72 |
+
stop_words = set(stopwords.words("english"))
|
73 |
+
punctuation = set(string.punctuation) # get all punctuation
|
74 |
filtered_words = [
|
75 |
+
word
|
76 |
+
for word in words
|
77 |
+
if word.isalnum()
|
78 |
+
and word not in stop_words
|
79 |
+
and word not in punctuation
|
80 |
]
|
81 |
word_frequencies = Counter(filtered_words)
|
82 |
top_words = word_frequencies.most_common(number_word)
|
83 |
+
|
84 |
for top_word in top_words:
|
85 |
words.append(top_word[0])
|
86 |
+
|
87 |
if len(words) > 32:
|
88 |
search_phrase = " ".join(words[:32])
|
89 |
else:
|
|
|
91 |
|
92 |
return search_phrase
|
93 |
|
94 |
+
|
95 |
def get_chunk(input_text, chunk_length=32, num_chunk=3):
|
96 |
"""
|
97 |
Splits the input text into chunks of a specified length.
|
|
|
102 |
chunk_length: The desired length of each chunk (in words).
|
103 |
|
104 |
Returns:
|
105 |
+
A list of string chunks.
|
106 |
Returns an empty list if input is invalid.
|
107 |
"""
|
108 |
if not isinstance(input_text, str):
|
|
|
120 |
|
121 |
return chunks
|
122 |
|
123 |
+
|
124 |
def get_keywords(text, num_keywords=5):
|
125 |
"""Return top k keywords from a doc using TF-IDF method"""
|
126 |
+
|
127 |
# Create a TF-IDF Vectorizer
|
128 |
+
vectorizer = TfidfVectorizer(stop_words="english")
|
129 |
+
|
130 |
# Fit and transform the text
|
131 |
tfidf_matrix = vectorizer.fit_transform([text])
|
132 |
+
|
133 |
# Get feature names (words)
|
134 |
feature_names = vectorizer.get_feature_names_out()
|
135 |
+
|
136 |
# Get TF-IDF scores
|
137 |
tfidf_scores = tfidf_matrix.toarray()[0]
|
138 |
+
|
139 |
# Sort words by TF-IDF score
|
140 |
word_scores = list(zip(feature_names, tfidf_scores))
|
141 |
word_scores.sort(key=lambda x: x[1], reverse=True)
|
142 |
+
|
143 |
# Return top keywords
|
144 |
return [word for word, score in word_scores[:num_keywords]]
|
145 |
|
|
|
159 |
"""
|
160 |
if not isinstance(input_text, str):
|
161 |
return []
|
162 |
+
|
163 |
search_phrases = []
|
164 |
+
|
165 |
# Method 1: Get most frequent words
|
166 |
search_phrases.append(get_most_frequent_words(input_text))
|
167 |
+
|
168 |
# Method 2: Get the whole text
|
169 |
search_phrases.append(input_text)
|
170 |
+
|
171 |
# Method 3: Split text by chunks
|
172 |
search_phrases.extend(get_chunk(input_text)) # TODO: for demo purposes
|
173 |
+
|
174 |
# Method 4: Get most identities and key words
|
175 |
entities = extract_entities(input_text)
|
176 |
text_without_entities = remove_identities_from_text(input_text, entities)
|
177 |
print(f"text_without_entities: {text_without_entities}")
|
178 |
search_phrases.append(text_without_entities)
|
179 |
+
# keywords = get_keywords(input_text, 16)
|
180 |
+
# search_phrase = " ".join(entities) + " " + " ".join(keywords)
|
181 |
# search_phrases.append(search_phrase) # TODO: for demo purposes
|
182 |
+
|
183 |
return search_phrases
|
184 |
|
185 |
+
|
186 |
def remove_identities_from_text(input_text, entities):
|
187 |
"""
|
188 |
Removes entities from the input text.
|
|
|
193 |
"""
|
194 |
for entity in entities:
|
195 |
input_text = input_text.replace(entity, "")
|
196 |
+
|
197 |
return input_text
|
src/application/text/search_detection.py
CHANGED
@@ -1,28 +1,33 @@
|
|
1 |
import string
|
2 |
import warnings
|
3 |
-
|
4 |
|
5 |
-
from src.application.text.preprocessing import split_into_paragraphs
|
6 |
-
from src.application.text.search import generate_search_phrases, search_by_google
|
7 |
-
from src.application.url_reader import URLReader
|
8 |
-
from src.application.text.helper import extract_equal_text
|
9 |
-
import numpy as np
|
10 |
import nltk
|
|
|
11 |
import torch
|
12 |
-
from
|
13 |
-
|
14 |
-
|
|
|
15 |
|
16 |
-
from
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
|
18 |
# Download necessary NLTK data files
|
19 |
-
nltk.download(
|
20 |
-
nltk.download(
|
21 |
-
nltk.download(
|
22 |
|
23 |
# load the model
|
24 |
-
DEVICE = torch.device(
|
25 |
-
PARAPHASE_MODEL = SentenceTransformer(
|
26 |
PARAPHASE_MODEL.to(DEVICE)
|
27 |
|
28 |
BATCH_SIZE = 8
|
@@ -35,63 +40,94 @@ MIN_RATIO_PARAPHRASE_NUM = 0.7
|
|
35 |
MAX_CHAR_SIZE = 30000
|
36 |
|
37 |
|
38 |
-
def detect_text_by_relative_search(
|
|
|
|
|
|
|
|
|
39 |
checked_urls = set()
|
40 |
searched_phrases = generate_search_phrases(input_text[index])
|
41 |
|
42 |
for candidate in searched_phrases:
|
43 |
search_results = search_by_google(candidate)
|
44 |
-
urls = [item[
|
45 |
|
46 |
for url in urls[:3]:
|
47 |
-
if url in checked_urls:
|
48 |
continue
|
49 |
if "bbc.com" not in url:
|
50 |
continue
|
51 |
-
|
52 |
checked_urls.add(url)
|
53 |
print(f"\t\tChecking URL: {url}")
|
54 |
-
|
55 |
content = URLReader(url)
|
56 |
-
|
57 |
if content.is_extracted is True:
|
58 |
if content.title is None or content.text is None:
|
59 |
-
print(
|
60 |
continue
|
61 |
-
|
62 |
page_text = content.title + "\n" + content.text
|
63 |
if len(page_text) > MAX_CHAR_SIZE:
|
64 |
print(f"\t\t\tβββ More than {MAX_CHAR_SIZE} characters")
|
65 |
continue
|
66 |
-
print(f"\t\t\tβββ Title: {content.title}")
|
67 |
-
paraphrase, aligned_first_sentences = check_paraphrase(
|
68 |
-
|
|
|
|
|
|
|
|
|
69 |
if paraphrase is False:
|
70 |
-
return
|
71 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
72 |
sub_paraphrase = True
|
73 |
-
while sub_paraphrase
|
74 |
index += 1
|
75 |
print(f"----search {index} < {len(input_text)}----")
|
76 |
if index >= len(input_text):
|
77 |
print(f"input_text_last: {input_text[-1]}")
|
78 |
break
|
79 |
print(f"input_text: {input_text[index]}")
|
80 |
-
sub_paraphrase, sub_sentences = check_paraphrase(
|
|
|
|
|
|
|
|
|
81 |
print(f"sub_paraphrase: {sub_paraphrase}")
|
82 |
print(f"sub_sentences: {sub_sentences}")
|
83 |
-
if sub_paraphrase
|
84 |
-
aligned_first_sentences["input_sentence"] +=
|
85 |
-
|
86 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
87 |
aligned_first_sentences["similarity"] /= 2
|
88 |
-
|
89 |
print(f"paraphrase: {paraphrase}")
|
90 |
print(f"aligned_first_sentences: {aligned_first_sentences}")
|
91 |
-
return
|
92 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
93 |
return False, None, [], [], index
|
94 |
|
|
|
95 |
def find_text_source(text, text_index, sentences_df):
|
96 |
sentence = {
|
97 |
"input_sentence": text[text_index],
|
@@ -101,67 +137,94 @@ def find_text_source(text, text_index, sentences_df):
|
|
101 |
"paraphrase": None,
|
102 |
"url": "",
|
103 |
"group": None,
|
104 |
-
|
105 |
checked_urls = set()
|
106 |
searched_phrases = generate_search_phrases(text[text_index])
|
107 |
|
108 |
for candidate in searched_phrases:
|
109 |
search_results = search_by_google(candidate)
|
110 |
-
urls = [item[
|
111 |
|
112 |
for url in urls[:3]:
|
113 |
-
if url in checked_urls:
|
114 |
continue
|
115 |
if "bbc.com" not in url:
|
116 |
continue
|
117 |
-
|
118 |
checked_urls.add(url)
|
119 |
print(f"\t\tChecking URL: {url}")
|
120 |
-
|
121 |
content = URLReader(url)
|
122 |
-
|
123 |
if content.is_extracted is True:
|
124 |
if content.title is None or content.text is None:
|
125 |
-
print(
|
126 |
continue
|
127 |
-
|
128 |
page_text = content.title + "\n" + content.text
|
129 |
if len(page_text) > MAX_CHAR_SIZE:
|
130 |
print(f"\t\t\tβββ More than {MAX_CHAR_SIZE} characters")
|
131 |
continue
|
132 |
-
print(f"\t\t\tβββ Title: {content.title}")
|
133 |
-
paraphrase, aligned_sentence = check_paraphrase(
|
134 |
-
|
|
|
|
|
|
|
|
|
135 |
# add one more key "group" into aligned_sentence
|
136 |
-
sentences_df.loc[
|
137 |
-
|
138 |
-
|
139 |
-
sentences_df.loc[
|
140 |
-
|
141 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
142 |
if aligned_sentence["paraphrase"] is False:
|
143 |
return paraphrase, sentences_df
|
144 |
-
|
145 |
-
for
|
146 |
-
if sentences_df[
|
147 |
continue
|
148 |
-
|
149 |
# find content in new url
|
150 |
-
_, aligned_sentence = check_paraphrase(
|
151 |
-
|
|
|
|
|
|
|
|
|
152 |
if aligned_sentence["url"] is not None:
|
153 |
continue
|
154 |
-
|
155 |
-
sentences_df.loc[
|
156 |
-
|
157 |
-
|
158 |
-
sentences_df.loc[
|
159 |
-
|
160 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
161 |
return sentences_df, content.images
|
162 |
-
|
163 |
return sentence, []
|
164 |
|
|
|
165 |
def longest_common_subsequence(arr1, arr2):
|
166 |
"""
|
167 |
Finds the length of the longest common subsequence (contiguous) between
|
@@ -172,7 +235,7 @@ def longest_common_subsequence(arr1, arr2):
|
|
172 |
arr2: The second array.
|
173 |
|
174 |
Returns:
|
175 |
-
The length of the longest common subsequence.
|
176 |
Returns 0 if either input is invalid.
|
177 |
"""
|
178 |
|
@@ -182,7 +245,7 @@ def longest_common_subsequence(arr1, arr2):
|
|
182 |
n = len(arr1)
|
183 |
m = len(arr2)
|
184 |
|
185 |
-
if n == 0 or m == 0:
|
186 |
return 0
|
187 |
|
188 |
# Create table dp with size (n+1) x (m+1)
|
@@ -200,10 +263,15 @@ def longest_common_subsequence(arr1, arr2):
|
|
200 |
return max_length
|
201 |
|
202 |
|
203 |
-
def check_sentence(
|
204 |
-
|
|
|
|
|
|
|
|
|
|
|
205 |
"""
|
206 |
-
Checks if two sentences are similar based on exact match or
|
207 |
longest common subsequence.
|
208 |
|
209 |
Args:
|
@@ -218,7 +286,10 @@ def check_sentence(input_sentence, source_sentence, min_same_sentence_len,
|
|
218 |
Returns False if input is not valid.
|
219 |
"""
|
220 |
|
221 |
-
if not isinstance(input_sentence, str) or not isinstance(
|
|
|
|
|
|
|
222 |
return False
|
223 |
|
224 |
input_sentence = input_sentence.strip()
|
@@ -230,7 +301,10 @@ def check_sentence(input_sentence, source_sentence, min_same_sentence_len,
|
|
230 |
input_words = input_sentence.split() # split without arguments
|
231 |
source_words = source_sentence.split() # split without arguments
|
232 |
|
233 |
-
if
|
|
|
|
|
|
|
234 |
if verbose:
|
235 |
print("Exact match found.")
|
236 |
return True
|
@@ -251,29 +325,24 @@ def check_paraphrase(input_text, page_text, url):
|
|
251 |
Args:
|
252 |
input_text: The text to check for paraphrase.
|
253 |
page_text: The text of the web page to compare with.
|
254 |
-
|
255 |
|
256 |
Returns:
|
257 |
A tuple containing:
|
258 |
-
|
259 |
-
- paraphrase_results: A list of dictionaries, each containing:
|
260 |
-
- input_sentence: The sentence from the input text.
|
261 |
-
- matched_sentence: The corresponding sentence from the web page (if found).
|
262 |
-
- similarity: The cosine similarity score between the sentences.
|
263 |
-
- is_paraphrase_sentence: True if the individual sentence pair meets the paraphrase criteria, False otherwise.
|
264 |
"""
|
265 |
is_paraphrase_text = False
|
266 |
-
|
267 |
if not isinstance(input_text, str) or not isinstance(page_text, str):
|
268 |
return False, []
|
269 |
|
270 |
# Extract sentences from input text and web page
|
271 |
# input_sentences = split_into_paragraphs(input_text)
|
272 |
input_sentences = [input_text]
|
273 |
-
|
274 |
if not page_text:
|
275 |
return is_paraphrase_text, []
|
276 |
-
|
277 |
page_sentences = split_into_paragraphs(page_text)
|
278 |
if not input_sentences or not page_sentences:
|
279 |
return is_paraphrase_text, []
|
@@ -283,10 +352,18 @@ def check_paraphrase(input_text, page_text, url):
|
|
283 |
if ", external" in sentence:
|
284 |
additional_sentences.append(sentence.replace(", external", ""))
|
285 |
page_sentences.extend(additional_sentences)
|
286 |
-
|
287 |
# Encode sentences into embeddings
|
288 |
-
embeddings1 = PARAPHASE_MODEL.encode(
|
289 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
290 |
|
291 |
# Compute cosine similarity matrix
|
292 |
similarity_matrix = util.cos_sim(embeddings1, embeddings2).cpu().numpy()
|
@@ -298,7 +375,7 @@ def check_paraphrase(input_text, page_text, url):
|
|
298 |
for i, sentence1 in enumerate(input_sentences):
|
299 |
max_sim_index = np.argmax(similarity_matrix[i])
|
300 |
max_similarity = similarity_matrix[i][max_sim_index]
|
301 |
-
|
302 |
best_matched_sentence = page_sentences[max_sim_index]
|
303 |
is_paraphrase_sentence = max_similarity > PARAPHRASE_THRESHOLD
|
304 |
|
@@ -321,29 +398,40 @@ def check_paraphrase(input_text, page_text, url):
|
|
321 |
"url": url,
|
322 |
}
|
323 |
|
324 |
-
# Check for individual sentence paraphrase
|
|
|
325 |
if not is_paraphrase_text and check_sentence(
|
326 |
-
sentence1,
|
|
|
|
|
|
|
327 |
):
|
328 |
is_paraphrase_text = True
|
329 |
|
330 |
-
#alignment.append(item)
|
331 |
paraphrased_sentence_count += 1 if is_paraphrase_sentence else 0
|
332 |
|
333 |
# Check if enough sentences are paraphrases
|
334 |
-
|
335 |
-
is_paraphrase_text =
|
336 |
-
|
|
|
|
|
337 |
# Method 2: Check if overlapped words between sentences are more than 50%
|
338 |
-
equal_idx_1, _ = extract_equal_text(
|
|
|
|
|
|
|
339 |
matched_count = 0
|
340 |
for index in equal_idx_1:
|
341 |
matched_count += index["end"] - index["start"]
|
342 |
-
sent = input_sentences[0].translate(
|
|
|
|
|
343 |
num_words = len(sent.split())
|
344 |
if matched_count > num_words / 2:
|
345 |
is_paraphrase_text = True
|
346 |
-
|
347 |
return is_paraphrase_text, alignment
|
348 |
|
349 |
|
@@ -359,10 +447,16 @@ def similarity_ratio(a, b):
|
|
359 |
A float representing the similarity ratio between 0.0 and 1.0.
|
360 |
Returns 0.0 if either input is None or not a string.
|
361 |
"""
|
362 |
-
if
|
|
|
|
|
|
|
|
|
|
|
363 |
return 0.0 # Handle cases where inputs are not strings or None
|
364 |
return SequenceMatcher(None, a, b).ratio()
|
365 |
|
|
|
366 |
def check_human(alligned_sentences):
|
367 |
"""
|
368 |
Checks if a sufficient number of input sentences are found within
|
@@ -379,5 +473,5 @@ def check_human(alligned_sentences):
|
|
379 |
return False
|
380 |
|
381 |
|
382 |
-
if __name__ ==
|
383 |
-
pass
|
|
|
1 |
import string
|
2 |
import warnings
|
3 |
+
from difflib import SequenceMatcher
|
4 |
|
|
|
|
|
|
|
|
|
|
|
5 |
import nltk
|
6 |
+
import numpy as np
|
7 |
import torch
|
8 |
+
from sentence_transformers import (
|
9 |
+
SentenceTransformer,
|
10 |
+
util,
|
11 |
+
)
|
12 |
|
13 |
+
from src.application.text.helper import extract_equal_text
|
14 |
+
from src.application.text.preprocessing import split_into_paragraphs
|
15 |
+
from src.application.text.search import (
|
16 |
+
generate_search_phrases,
|
17 |
+
search_by_google,
|
18 |
+
)
|
19 |
+
from src.application.url_reader import URLReader
|
20 |
+
|
21 |
+
warnings.simplefilter(action="ignore", category=FutureWarning)
|
22 |
|
23 |
# Download necessary NLTK data files
|
24 |
+
nltk.download("punkt", quiet=True)
|
25 |
+
nltk.download("punkt_tab", quiet=True)
|
26 |
+
nltk.download("stopwords", quiet=True)
|
27 |
|
28 |
# load the model
|
29 |
+
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
30 |
+
PARAPHASE_MODEL = SentenceTransformer("paraphrase-MiniLM-L6-v2")
|
31 |
PARAPHASE_MODEL.to(DEVICE)
|
32 |
|
33 |
BATCH_SIZE = 8
|
|
|
40 |
MAX_CHAR_SIZE = 30000
|
41 |
|
42 |
|
43 |
+
def detect_text_by_relative_search(
    input_text,
    index,
    is_support_opposite=False,
):
    """Search the web for a source page that matches the input sentences.

    Generates search phrases from ``input_text[index]``, queries Google,
    and inspects up to three result URLs per phrase (BBC pages only).
    Once a candidate page is fetched, the sentence at ``index`` is checked
    for paraphrase against the page; consecutive following sentences that
    the same page also paraphrases are merged into one aligned record.

    Args:
        input_text: Sequence of sentences, indexable by ``index``.
        index: Position of the first sentence to search for.
        is_support_opposite: Unused here.  # NOTE(review): dead parameter — confirm callers.

    Returns:
        A 5-tuple ``(paraphrase, url, aligned_sentences, images, index)``;
        ``(False, None, [], [], index)`` when no suitable source is found.
    """
    checked_urls = set()  # URLs already fetched, to avoid duplicate downloads
    searched_phrases = generate_search_phrases(input_text[index])

    for candidate in searched_phrases:
        search_results = search_by_google(candidate)
        urls = [item["link"] for item in search_results.get("items", [])]

        # Only the top 3 hits per search phrase are inspected.
        for url in urls[:3]:
            if url in checked_urls:  # visited url
                continue
            if "bbc.com" not in url:  # sources are restricted to BBC
                continue

            checked_urls.add(url)
            print(f"\t\tChecking URL: {url}")

            content = URLReader(url)

            if content.is_extracted is True:
                if content.title is None or content.text is None:
                    print("\t\t\tβββ Title or text not found")
                    continue

                page_text = content.title + "\n" + content.text
                if len(page_text) > MAX_CHAR_SIZE:  # skip oversized pages
                    print(f"\t\t\tβββ More than {MAX_CHAR_SIZE} characters")
                    continue
                print(f"\t\t\tβββ Title: {content.title}")
                paraphrase, aligned_first_sentences = check_paraphrase(
                    input_text[index],
                    page_text,
                    url,
                )

                # Page does not paraphrase the sentence: report the
                # (negative) result for this URL immediately.
                if paraphrase is False:
                    return (
                        paraphrase,
                        url,
                        aligned_first_sentences,
                        content.images,
                        index,
                    )

                # Greedily absorb following sentences that the same page
                # also paraphrases, concatenating them with "<br>".
                sub_paraphrase = True
                while sub_paraphrase is True:
                    index += 1
                    print(f"----search {index} < {len(input_text)}----")
                    if index >= len(input_text):
                        print(f"input_text_last: {input_text[-1]}")
                        break
                    print(f"input_text: {input_text[index]}")
                    sub_paraphrase, sub_sentences = check_paraphrase(
                        input_text[index],
                        page_text,
                        url,
                    )
                    print(f"sub_paraphrase: {sub_paraphrase}")
                    print(f"sub_sentences: {sub_sentences}")
                    if sub_paraphrase is True:
                        aligned_first_sentences["input_sentence"] += (
                            "<br>" + sub_sentences["input_sentence"]
                        )
                        aligned_first_sentences["matched_sentence"] += (
                            "<br>" + sub_sentences["matched_sentence"]
                        )
                        aligned_first_sentences["similarity"] += sub_sentences[
                            "similarity"
                        ]
                        # Halves the running sum on every merge rather than
                        # averaging over all merged parts.
                        # NOTE(review): confirm this weighting is intended.
                        aligned_first_sentences["similarity"] /= 2

                print(f"paraphrase: {paraphrase}")
                print(f"aligned_first_sentences: {aligned_first_sentences}")
                return (
                    paraphrase,
                    url,
                    aligned_first_sentences,
                    content.images,
                    index,
                )

    # No suitable source page found for any generated search phrase.
    return False, None, [], [], index
|
129 |
|
130 |
+
|
131 |
def find_text_source(text, text_index, sentences_df):
|
132 |
sentence = {
|
133 |
"input_sentence": text[text_index],
|
|
|
137 |
"paraphrase": None,
|
138 |
"url": "",
|
139 |
"group": None,
|
140 |
+
}
|
141 |
checked_urls = set()
|
142 |
searched_phrases = generate_search_phrases(text[text_index])
|
143 |
|
144 |
for candidate in searched_phrases:
|
145 |
search_results = search_by_google(candidate)
|
146 |
+
urls = [item["link"] for item in search_results.get("items", [])]
|
147 |
|
148 |
for url in urls[:3]:
|
149 |
+
if url in checked_urls: # visited url
|
150 |
continue
|
151 |
if "bbc.com" not in url:
|
152 |
continue
|
153 |
+
|
154 |
checked_urls.add(url)
|
155 |
print(f"\t\tChecking URL: {url}")
|
156 |
+
|
157 |
content = URLReader(url)
|
158 |
+
|
159 |
if content.is_extracted is True:
|
160 |
if content.title is None or content.text is None:
|
161 |
+
print("\t\t\tβββ Title or text not found")
|
162 |
continue
|
163 |
+
|
164 |
page_text = content.title + "\n" + content.text
|
165 |
if len(page_text) > MAX_CHAR_SIZE:
|
166 |
print(f"\t\t\tβββ More than {MAX_CHAR_SIZE} characters")
|
167 |
continue
|
168 |
+
print(f"\t\t\tβββ Title: {content.title}")
|
169 |
+
paraphrase, aligned_sentence = check_paraphrase(
|
170 |
+
text,
|
171 |
+
page_text,
|
172 |
+
url,
|
173 |
+
)
|
174 |
+
|
175 |
# add one more key "group" into aligned_sentence
|
176 |
+
sentences_df.loc[text_index, "input_sentence"] = (
|
177 |
+
aligned_sentence["input_sentence"]
|
178 |
+
)
|
179 |
+
sentences_df.loc[text_index, "matched_sentence"] = (
|
180 |
+
aligned_sentence["matched_sentence"]
|
181 |
+
)
|
182 |
+
sentences_df.loc[text_index, "label"] = aligned_sentence[
|
183 |
+
"label"
|
184 |
+
]
|
185 |
+
sentences_df.loc[text_index, "similarity"] = aligned_sentence[
|
186 |
+
"similarity"
|
187 |
+
]
|
188 |
+
sentences_df.loc[text_index, "url"] = aligned_sentence["url"]
|
189 |
+
|
190 |
if aligned_sentence["paraphrase"] is False:
|
191 |
return paraphrase, sentences_df
|
192 |
+
|
193 |
+
for text_index, _ in enumerate(sentences_df):
|
194 |
+
if sentences_df[text_index]["url"] is not None:
|
195 |
continue
|
196 |
+
|
197 |
# find content in new url
|
198 |
+
_, aligned_sentence = check_paraphrase(
|
199 |
+
text[text_index],
|
200 |
+
page_text,
|
201 |
+
url,
|
202 |
+
)
|
203 |
+
|
204 |
if aligned_sentence["url"] is not None:
|
205 |
continue
|
206 |
+
|
207 |
+
sentences_df.loc[text_index, "input_sentence"] = (
|
208 |
+
aligned_sentence["input_sentence"]
|
209 |
+
)
|
210 |
+
sentences_df.loc[text_index, "matched_sentence"] = (
|
211 |
+
aligned_sentence["matched_sentence"]
|
212 |
+
)
|
213 |
+
sentences_df.loc[text_index, "label"] = aligned_sentence[
|
214 |
+
"label"
|
215 |
+
]
|
216 |
+
sentences_df.loc[text_index, "similarity"] = (
|
217 |
+
aligned_sentence["similarity"]
|
218 |
+
)
|
219 |
+
sentences_df.loc[text_index, "url"] = aligned_sentence[
|
220 |
+
"url"
|
221 |
+
]
|
222 |
+
|
223 |
return sentences_df, content.images
|
224 |
+
|
225 |
return sentence, []
|
226 |
|
227 |
+
|
228 |
def longest_common_subsequence(arr1, arr2):
|
229 |
"""
|
230 |
Finds the length of the longest common subsequence (contiguous) between
|
|
|
235 |
arr2: The second array.
|
236 |
|
237 |
Returns:
|
238 |
+
The length of the longest common subsequence.
|
239 |
Returns 0 if either input is invalid.
|
240 |
"""
|
241 |
|
|
|
245 |
n = len(arr1)
|
246 |
m = len(arr2)
|
247 |
|
248 |
+
if n == 0 or m == 0: # handle empty list
|
249 |
return 0
|
250 |
|
251 |
# Create table dp with size (n+1) x (m+1)
|
|
|
263 |
return max_length
|
264 |
|
265 |
|
266 |
+
def check_sentence(
|
267 |
+
input_sentence,
|
268 |
+
source_sentence,
|
269 |
+
min_same_sentence_len,
|
270 |
+
min_phrase_sentence_len,
|
271 |
+
verbose=False,
|
272 |
+
):
|
273 |
"""
|
274 |
+
Checks if two sentences are similar based on exact match or
|
275 |
longest common subsequence.
|
276 |
|
277 |
Args:
|
|
|
286 |
Returns False if input is not valid.
|
287 |
"""
|
288 |
|
289 |
+
if not isinstance(input_sentence, str) or not isinstance(
|
290 |
+
source_sentence,
|
291 |
+
str,
|
292 |
+
):
|
293 |
return False
|
294 |
|
295 |
input_sentence = input_sentence.strip()
|
|
|
301 |
input_words = input_sentence.split() # split without arguments
|
302 |
source_words = source_sentence.split() # split without arguments
|
303 |
|
304 |
+
if (
|
305 |
+
input_sentence == source_sentence
|
306 |
+
and len(input_words) >= min_same_sentence_len
|
307 |
+
):
|
308 |
if verbose:
|
309 |
print("Exact match found.")
|
310 |
return True
|
|
|
325 |
Args:
|
326 |
input_text: The text to check for paraphrase.
|
327 |
page_text: The text of the web page to compare with.
|
328 |
+
url
|
329 |
|
330 |
Returns:
|
331 |
A tuple containing:
|
332 |
+
|
|
|
|
|
|
|
|
|
|
|
333 |
"""
|
334 |
is_paraphrase_text = False
|
335 |
+
|
336 |
if not isinstance(input_text, str) or not isinstance(page_text, str):
|
337 |
return False, []
|
338 |
|
339 |
# Extract sentences from input text and web page
|
340 |
# input_sentences = split_into_paragraphs(input_text)
|
341 |
input_sentences = [input_text]
|
342 |
+
|
343 |
if not page_text:
|
344 |
return is_paraphrase_text, []
|
345 |
+
|
346 |
page_sentences = split_into_paragraphs(page_text)
|
347 |
if not input_sentences or not page_sentences:
|
348 |
return is_paraphrase_text, []
|
|
|
352 |
if ", external" in sentence:
|
353 |
additional_sentences.append(sentence.replace(", external", ""))
|
354 |
page_sentences.extend(additional_sentences)
|
355 |
+
|
356 |
# Encode sentences into embeddings
|
357 |
+
embeddings1 = PARAPHASE_MODEL.encode(
|
358 |
+
input_sentences,
|
359 |
+
convert_to_tensor=True,
|
360 |
+
device=DEVICE,
|
361 |
+
)
|
362 |
+
embeddings2 = PARAPHASE_MODEL.encode(
|
363 |
+
page_sentences,
|
364 |
+
convert_to_tensor=True,
|
365 |
+
device=DEVICE,
|
366 |
+
)
|
367 |
|
368 |
# Compute cosine similarity matrix
|
369 |
similarity_matrix = util.cos_sim(embeddings1, embeddings2).cpu().numpy()
|
|
|
375 |
for i, sentence1 in enumerate(input_sentences):
|
376 |
max_sim_index = np.argmax(similarity_matrix[i])
|
377 |
max_similarity = similarity_matrix[i][max_sim_index]
|
378 |
+
|
379 |
best_matched_sentence = page_sentences[max_sim_index]
|
380 |
is_paraphrase_sentence = max_similarity > PARAPHRASE_THRESHOLD
|
381 |
|
|
|
398 |
"url": url,
|
399 |
}
|
400 |
|
401 |
+
# Check for individual sentence paraphrase
|
402 |
+
# if overall paraphrase not yet found
|
403 |
if not is_paraphrase_text and check_sentence(
|
404 |
+
sentence1,
|
405 |
+
page_sentences[max_sim_index],
|
406 |
+
MIN_SAME_SENTENCE_LEN,
|
407 |
+
MIN_PHRASE_SENTENCE_LEN,
|
408 |
):
|
409 |
is_paraphrase_text = True
|
410 |
|
411 |
+
# alignment.append(item)
|
412 |
paraphrased_sentence_count += 1 if is_paraphrase_sentence else 0
|
413 |
|
414 |
# Check if enough sentences are paraphrases
|
415 |
+
|
416 |
+
is_paraphrase_text = (
|
417 |
+
paraphrased_sentence_count > 0
|
418 |
+
) # min_matching_sentences
|
419 |
+
|
420 |
# Method 2: Check if overlapped words between sentences are more than 50%
|
421 |
+
equal_idx_1, _ = extract_equal_text(
|
422 |
+
input_sentences[0],
|
423 |
+
best_matched_sentence,
|
424 |
+
)
|
425 |
matched_count = 0
|
426 |
for index in equal_idx_1:
|
427 |
matched_count += index["end"] - index["start"]
|
428 |
+
sent = input_sentences[0].translate(
|
429 |
+
str.maketrans("", "", string.punctuation),
|
430 |
+
)
|
431 |
num_words = len(sent.split())
|
432 |
if matched_count > num_words / 2:
|
433 |
is_paraphrase_text = True
|
434 |
+
|
435 |
return is_paraphrase_text, alignment
|
436 |
|
437 |
|
|
|
447 |
A float representing the similarity ratio between 0.0 and 1.0.
|
448 |
Returns 0.0 if either input is None or not a string.
|
449 |
"""
|
450 |
+
if (
|
451 |
+
not isinstance(a, str)
|
452 |
+
or not isinstance(b, str)
|
453 |
+
or a is None
|
454 |
+
or b is None
|
455 |
+
):
|
456 |
return 0.0 # Handle cases where inputs are not strings or None
|
457 |
return SequenceMatcher(None, a, b).ratio()
|
458 |
|
459 |
+
|
460 |
def check_human(alligned_sentences):
|
461 |
"""
|
462 |
Checks if a sufficient number of input sentences are found within
|
|
|
473 |
return False
|
474 |
|
475 |
|
476 |
+
if __name__ == "__main__":
    # This module is imported as a library; it has no standalone behavior.
    pass
|
src/application/url_reader.py
CHANGED
@@ -1,31 +1,40 @@
|
|
1 |
import string
|
2 |
-
|
3 |
-
from newspaper import article, ArticleException, ArticleBinaryDataException
|
4 |
import requests
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
|
6 |
# TODO: move this to a config file
|
7 |
-
MAX_URL_SIZE = 2000000
|
8 |
-
|
9 |
-
|
|
|
|
|
10 |
self.url = url
|
11 |
self.text = None # string
|
12 |
self.title = None # string
|
13 |
self.images = None # list of Image objects
|
14 |
self.top_image = None # Image object
|
15 |
self.is_extracted = False
|
16 |
-
|
17 |
url_size = self.get_size()
|
18 |
-
if url_size
|
19 |
return
|
20 |
-
else:
|
21 |
self.is_extracted = True
|
22 |
-
|
23 |
-
self.newspaper =
|
|
|
|
|
24 |
if self.newspaper is True:
|
25 |
self.extract_content_newspaper()
|
26 |
else:
|
27 |
self.extract_content_bs()
|
28 |
-
|
29 |
def extract_content_newspaper(self):
|
30 |
"""
|
31 |
Use newspaper4k to extracts content from a URL
|
@@ -36,20 +45,20 @@ class URLReader():
|
|
36 |
Returns:
|
37 |
The extracted content (title, text, images)
|
38 |
"""
|
39 |
-
|
40 |
try:
|
41 |
response = requests.get(self.url)
|
42 |
-
response.raise_for_status()
|
43 |
except requests.exceptions.RequestException as e:
|
44 |
print(f"Error fetching URL: {e}")
|
45 |
return None
|
46 |
-
|
47 |
try:
|
48 |
news = article(url=self.url, fetch_images=True)
|
49 |
except (ArticleException, ArticleBinaryDataException) as e:
|
50 |
print(f"\t\tβββ Error downloading article: {e}")
|
51 |
return None
|
52 |
-
|
53 |
self.title = news.title
|
54 |
self.text = news.text
|
55 |
self.images = list(set(news.images)) # Remove duplicates
|
@@ -61,30 +70,30 @@ class URLReader():
|
|
61 |
"""
|
62 |
response = requests.get(self.url)
|
63 |
response.raise_for_status()
|
64 |
-
|
65 |
response.encoding = response.apparent_encoding
|
66 |
-
|
67 |
try:
|
68 |
soup = BeautifulSoup(response.content, "html.parser")
|
69 |
-
except:
|
70 |
-
print(f"Error parsing HTML content from {self.url}")
|
71 |
return None
|
72 |
-
|
73 |
self.title = soup.title.string.strip() if soup.title else None
|
74 |
-
|
75 |
-
image_urls = [img[
|
76 |
self.images = image_urls
|
77 |
self.top_image = self.images[0]
|
78 |
-
|
79 |
# Exclude text within specific elements
|
80 |
for element in soup(["img", "figcaption", "table", "script", "style"]):
|
81 |
element.extract()
|
82 |
-
#text = soup.get_text(separator="\n")
|
83 |
-
paragraphs = soup.find_all(
|
84 |
-
text =
|
85 |
|
86 |
self.text = text
|
87 |
-
|
88 |
def get_size(self):
|
89 |
"""
|
90 |
Retrieves the size of a URL's content using a HEAD request.
|
@@ -93,27 +102,32 @@ class URLReader():
|
|
93 |
url: The URL to check.
|
94 |
|
95 |
Returns:
|
96 |
-
The size of the content in bytes,
|
|
|
97 |
(e.g., due to network errors or missing Content-Length header).
|
98 |
"""
|
99 |
try:
|
100 |
-
response = requests.head(
|
101 |
-
|
|
|
|
|
|
|
|
|
102 |
|
103 |
-
content_length = response.headers.get(
|
104 |
if content_length is not None:
|
105 |
return int(content_length)
|
106 |
else:
|
107 |
-
print(
|
108 |
return None
|
109 |
|
110 |
except requests.exceptions.RequestException as e:
|
111 |
print(f"\t\tβββ Error getting URL size: {e}")
|
112 |
return None
|
113 |
-
|
114 |
|
115 |
-
|
|
|
116 |
url = "https://www.bbc.com/sport/football/articles/c2d3rdy3673o"
|
117 |
reader = URLReader(url)
|
118 |
print(f"Title: {reader.title}")
|
119 |
-
print(f"Text: {reader.text}")
|
|
|
1 |
import string
|
2 |
+
|
|
|
3 |
import requests
|
4 |
+
from bs4 import BeautifulSoup
|
5 |
+
from newspaper import (
|
6 |
+
ArticleBinaryDataException,
|
7 |
+
ArticleException,
|
8 |
+
article,
|
9 |
+
)
|
10 |
|
11 |
# TODO: move this to a config file
|
12 |
+
MAX_URL_SIZE = 2000000 # ~2MB
|
13 |
+
|
14 |
+
|
15 |
+
class URLReader:
|
16 |
+
    def __init__(self, url: str, newspaper: bool = True):
        """Fetch a web page and extract its title, text, and images.

        Args:
            url: Address of the page to read.
            newspaper: When True, extract with newspaper4k; when False,
                fall back to the BeautifulSoup-based extractor.
        """
        self.url = url
        self.text = None  # string
        self.title = None  # string
        self.images = None  # list of Image objects
        self.top_image = None  # Image object
        self.is_extracted = False

        # Skip extraction entirely (is_extracted stays False) when the
        # content size is unknown or exceeds MAX_URL_SIZE.
        url_size = self.get_size()
        if url_size is None or url_size > MAX_URL_SIZE:
            return
        else:
            self.is_extracted = True

        self.newspaper = (
            newspaper  # True if using newspaper4k, False if using BS
        )
        if self.newspaper is True:
            self.extract_content_newspaper()
        else:
            self.extract_content_bs()
|
37 |
+
|
38 |
def extract_content_newspaper(self):
|
39 |
"""
|
40 |
Use newspaper4k to extracts content from a URL
|
|
|
45 |
Returns:
|
46 |
The extracted content (title, text, images)
|
47 |
"""
|
48 |
+
|
49 |
try:
|
50 |
response = requests.get(self.url)
|
51 |
+
response.raise_for_status()
|
52 |
except requests.exceptions.RequestException as e:
|
53 |
print(f"Error fetching URL: {e}")
|
54 |
return None
|
55 |
+
|
56 |
try:
|
57 |
news = article(url=self.url, fetch_images=True)
|
58 |
except (ArticleException, ArticleBinaryDataException) as e:
|
59 |
print(f"\t\tβββ Error downloading article: {e}")
|
60 |
return None
|
61 |
+
|
62 |
self.title = news.title
|
63 |
self.text = news.text
|
64 |
self.images = list(set(news.images)) # Remove duplicates
|
|
|
70 |
"""
|
71 |
response = requests.get(self.url)
|
72 |
response.raise_for_status()
|
73 |
+
|
74 |
response.encoding = response.apparent_encoding
|
75 |
+
|
76 |
try:
|
77 |
soup = BeautifulSoup(response.content, "html.parser")
|
78 |
+
except Exception as e:
|
79 |
+
print(f"Error parsing HTML content from {self.url}: {e}")
|
80 |
return None
|
81 |
+
|
82 |
self.title = soup.title.string.strip() if soup.title else None
|
83 |
+
|
84 |
+
image_urls = [img["src"] for img in soup.find_all("img")]
|
85 |
self.images = image_urls
|
86 |
self.top_image = self.images[0]
|
87 |
+
|
88 |
# Exclude text within specific elements
|
89 |
for element in soup(["img", "figcaption", "table", "script", "style"]):
|
90 |
element.extract()
|
91 |
+
# text = soup.get_text(separator="\n")
|
92 |
+
paragraphs = soup.find_all("p")
|
93 |
+
text = " ".join([p.get_text() for p in paragraphs])
|
94 |
|
95 |
self.text = text
|
96 |
+
|
97 |
def get_size(self):
|
98 |
"""
|
99 |
Retrieves the size of a URL's content using a HEAD request.
|
|
|
102 |
url: The URL to check.
|
103 |
|
104 |
Returns:
|
105 |
+
The size of the content in bytes,
|
106 |
+
or None if the size cannot be determined
|
107 |
(e.g., due to network errors or missing Content-Length header).
|
108 |
"""
|
109 |
try:
|
110 |
+
response = requests.head(
|
111 |
+
self.url,
|
112 |
+
allow_redirects=True,
|
113 |
+
timeout=5,
|
114 |
+
) # Add timeout
|
115 |
+
response.raise_for_status() # Raise HTTPError for bad responses
|
116 |
|
117 |
+
content_length = response.headers.get("Content-Length")
|
118 |
if content_length is not None:
|
119 |
return int(content_length)
|
120 |
else:
|
121 |
+
print("\t\tβββ Content-Length header not found")
|
122 |
return None
|
123 |
|
124 |
except requests.exceptions.RequestException as e:
|
125 |
print(f"\t\tβββ Error getting URL size: {e}")
|
126 |
return None
|
|
|
127 |
|
128 |
+
|
129 |
+
if __name__ == "__main__":
    # Manual smoke test: fetches a live BBC article (network access
    # required) and prints the extracted title and text.
    url = "https://www.bbc.com/sport/football/articles/c2d3rdy3673o"
    reader = URLReader(url)
    print(f"Title: {reader.title}")
    print(f"Text: {reader.text}")
|
test.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
import re
|
2 |
|
|
|
3 |
def find_entity_spans(entity, text):
|
4 |
"""
|
5 |
Finds the start and end indices of whole word entities in text.
|
@@ -13,10 +14,14 @@ def find_entity_spans(entity, text):
|
|
13 |
of a found entity. Returns an empty list if no entities are found.
|
14 |
"""
|
15 |
spans = []
|
16 |
-
for m in re.finditer(
|
|
|
|
|
|
|
17 |
spans.append((m.start(), m.end()))
|
18 |
return spans
|
19 |
|
|
|
20 |
# Example usage:
|
21 |
temp_text = "win winger winning"
|
22 |
entity = {"key": "win"} # Example dictionary (adjust as needed)
|
@@ -27,24 +32,24 @@ print(spans) # Output: [(0, 3)] (Only "win" at the beginning)
|
|
27 |
temp_text = "The quick brown fox jumps over the lazy dog."
|
28 |
entity = {"key": "fox"}
|
29 |
spans = find_entity_spans(entity["key"], temp_text)
|
30 |
-
print(spans)
|
31 |
|
32 |
temp_text = "foxes fox foxing"
|
33 |
entity = {"key": "fox"}
|
34 |
spans = find_entity_spans(entity["key"], temp_text)
|
35 |
-
print(spans)
|
36 |
|
37 |
temp_text = "winger win winning"
|
38 |
entity = {"key": "win"}
|
39 |
spans = find_entity_spans(entity["key"], temp_text)
|
40 |
-
print(spans)
|
41 |
|
42 |
temp_text = "winger win winning"
|
43 |
entity = {"key": "winger"}
|
44 |
spans = find_entity_spans(entity["key"], temp_text)
|
45 |
-
print(spans)
|
46 |
|
47 |
temp_text = "winger win winning"
|
48 |
entity = {"key": "winning"}
|
49 |
spans = find_entity_spans(entity["key"], temp_text)
|
50 |
-
print(spans)
|
|
|
1 |
import re
|
2 |
|
3 |
+
|
4 |
def find_entity_spans(entity, text):
    """
    Locate every whole-word occurrence of ``entity`` inside ``text``.

    The entity string is regex-escaped and wrapped in word-boundary
    anchors, so partial matches inside longer words (e.g. "win" inside
    "winger") are not reported.

    Args:
        entity: The literal string to search for.
        text: The text to scan.

    Returns:
        A list of (start, end) index tuples, one per whole-word match.
        Returns an empty list if no entities are found.
    """
    pattern = r"\b" + re.escape(entity) + r"\b"
    return [(match.start(), match.end()) for match in re.finditer(pattern, text)]
|
23 |
|
24 |
+
|
25 |
# Example usage:
|
26 |
temp_text = "win winger winning"
|
27 |
entity = {"key": "win"} # Example dictionary (adjust as needed)
|
|
|
32 |
temp_text = "The quick brown fox jumps over the lazy dog."
|
33 |
entity = {"key": "fox"}
|
34 |
spans = find_entity_spans(entity["key"], temp_text)
|
35 |
+
print(spans) # Output: [(16, 19)]
|
36 |
|
37 |
temp_text = "foxes fox foxing"
|
38 |
entity = {"key": "fox"}
|
39 |
spans = find_entity_spans(entity["key"], temp_text)
|
40 |
+
print(spans) # Output: [(0, 3), (6, 9)]
|
41 |
|
42 |
temp_text = "winger win winning"
|
43 |
entity = {"key": "win"}
|
44 |
spans = find_entity_spans(entity["key"], temp_text)
|
45 |
+
print(spans) # Output: [(8, 11)]
|
46 |
|
47 |
temp_text = "winger win winning"
|
48 |
entity = {"key": "winger"}
|
49 |
spans = find_entity_spans(entity["key"], temp_text)
|
50 |
+
print(spans) # Output: [(0, 6)]
|
51 |
|
52 |
temp_text = "winger win winning"
|
53 |
entity = {"key": "winning"}
|
54 |
spans = find_entity_spans(entity["key"], temp_text)
|
55 |
+
print(spans) # Output: [(12, 19)]
|