Spaces:
Running
Running
Commit
Β·
da7dbd0
1
Parent(s):
0542c93
complete the 1st version of GUI
Browse files- Yandexsample.html +0 -0
- app.py +20 -19
- application.py +49 -38
- src/application/content_detection.py +289 -101
- src/application/content_generation.py +68 -22
- src/application/image/image_comparison.py +66 -0
- src/application/image/image_detection.py +49 -0
- src/application/image/model_detection.py +141 -0
- src/application/image/search_yandex.py +200 -0
- src/application/text/model_detection.py +2 -2
- src/application/text/search_detection.py +15 -10
- src/application/url_reader.py +1 -1
Yandexsample.html
ADDED
The diff for this file is too large to render.
See raw diff
|
|
app.py
CHANGED
@@ -1,22 +1,23 @@
|
|
1 |
-
|
2 |
-
import
|
3 |
|
4 |
-
|
5 |
-
return "aaaa"
|
6 |
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
news_content = gr.Textbox(label="Content", value="", lines=12)
|
17 |
-
|
18 |
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
|
|
|
|
|
|
|
|
|
1 |
+
from bs4 import BeautifulSoup
|
2 |
+
import requests
|
3 |
|
4 |
+
from src.application.image.search_yandex import get_image_links
|
|
|
5 |
|
6 |
+
|
7 |
+
img_search_url = """https://yandex.ru/images/search?cbir_id=4481385%2Fw-xYJ246B9thwtVBmNcpkg9409&rpt=imageview&lr=10636"""
|
8 |
+
print(img_search_url)
|
9 |
+
|
10 |
+
|
11 |
+
headers = {
|
12 |
+
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
|
13 |
+
'Content-Type': 'application/json',
|
14 |
+
}
|
|
|
|
|
15 |
|
16 |
+
|
17 |
+
response = requests.get(img_search_url, headers=headers)
|
18 |
+
response.raise_for_status() # Raise an exception for bad status codes
|
19 |
+
|
20 |
+
# Parse the HTML content
|
21 |
+
soup = BeautifulSoup(response.content, 'html.parser')
|
22 |
+
image_urls = get_image_links(soup.prettify())
|
23 |
+
print(f"image_urls: {image_urls}")
|
application.py
CHANGED
@@ -1,32 +1,19 @@
|
|
1 |
import os
|
2 |
|
3 |
import gradio as gr
|
4 |
-
import openai
|
5 |
import requests
|
6 |
from PIL import Image
|
7 |
-
import re
|
8 |
|
9 |
-
from src.application.content_detection import
|
10 |
from src.application.url_reader import URLReader
|
11 |
-
from src.application.content_generation import
|
12 |
|
13 |
-
# from dotenv import load_dotenv
|
14 |
-
|
15 |
-
# load_dotenv()
|
16 |
-
# AZURE_OPENAI_API_KEY = os.getenv('AZURE_OPENAI_API_KEY')
|
17 |
-
# AZURE_OPENAI_ENDPOINT = os.getenv('AZURE_OPENAI_ENDPOINT')
|
18 |
-
# AZURE_OPENAI_API_VERSION = os.getenv('AZURE_OPENAI_API_VERSION')
|
19 |
-
|
20 |
-
# client = openai.AzureOpenAI(
|
21 |
-
# api_version = AZURE_OPENAI_API_VERSION,
|
22 |
-
# api_key = AZURE_OPENAI_API_KEY,
|
23 |
-
# azure_endpoint = AZURE_OPENAI_ENDPOINT,
|
24 |
-
# )
|
25 |
|
26 |
GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
|
27 |
SEARCH_ENGINE_ID = os.getenv('SEARCH_ENGINE_ID')
|
28 |
|
29 |
-
|
|
|
30 |
|
31 |
def load_url(url):
|
32 |
"""
|
@@ -54,9 +41,12 @@ def load_url(url):
|
|
54 |
|
55 |
return content.title, content.text, image
|
56 |
|
57 |
-
def show_detailed_analysis(title):
|
58 |
-
return f"More details of {title} will be shown here."
|
59 |
|
|
|
|
|
|
|
|
|
|
|
60 |
# Define the GUI
|
61 |
with gr.Blocks() as demo:
|
62 |
gr.Markdown("# FAKE NEWS DETECTION")
|
@@ -69,16 +59,18 @@ with gr.Blocks() as demo:
|
|
69 |
|
70 |
with gr.Accordion("1. Enter a URL"):
|
71 |
url_input = gr.Textbox(
|
72 |
-
label="
|
|
|
73 |
value="https://bbc.com/future/article/20250110-how-often-you-should-wash-your-towels-according-to-science",
|
74 |
)
|
75 |
load_button = gr.Button("Load URL")
|
76 |
|
77 |
-
with gr.Accordion("2. Select
|
78 |
with gr.Row():
|
79 |
-
|
80 |
-
|
81 |
-
|
|
|
82 |
|
83 |
with gr.Accordion("3. Replace any terms", open=True):
|
84 |
replace_df = gr.Dataframe(
|
@@ -93,16 +85,17 @@ with gr.Blocks() as demo:
|
|
93 |
# GENERATED CONTENT
|
94 |
with gr.Column(scale=1):
|
95 |
with gr.Accordion("Generated News Contents"):
|
96 |
-
detection_button = gr.Button("Check for fake news")
|
97 |
news_title = gr.Textbox(label="Title", value="")
|
98 |
-
news_image = gr.Image(label="Image")
|
99 |
news_content = gr.Textbox(label="Content", value="", lines=12)
|
100 |
|
101 |
# FAKE NEWS ANALYSIS REPORT
|
102 |
with gr.Column(scale=1):
|
103 |
with gr.Accordion("Fake News Analysis"):
|
104 |
-
|
105 |
-
|
|
|
|
|
106 |
|
107 |
# Connect events
|
108 |
load_button.click(
|
@@ -110,19 +103,37 @@ with gr.Blocks() as demo:
|
|
110 |
inputs=url_input,
|
111 |
outputs=[news_title, news_content, news_image]
|
112 |
)
|
113 |
-
replace_button.click(replace_text,
|
114 |
inputs=[news_title, news_content, replace_df],
|
115 |
outputs=[news_title, news_content])
|
116 |
-
|
117 |
-
inputs=[text_generation_model,
|
118 |
outputs=[news_title, news_content])
|
|
|
|
|
|
|
119 |
detection_button.click(generate_analysis_report,
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
inputs=[news_title],
|
124 |
-
outputs=[html_out])
|
125 |
# change Image
|
126 |
#url_input.change(load_image, inputs=url_input, outputs=image_view)
|
127 |
-
|
128 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import os
|
2 |
|
3 |
import gradio as gr
|
|
|
4 |
import requests
|
5 |
from PIL import Image
|
|
|
6 |
|
7 |
+
from src.application.content_detection import NewsAnalysis
|
8 |
from src.application.url_reader import URLReader
|
9 |
+
from src.application.content_generation import generate_fake_image, generate_fake_text, replace_text
|
10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
|
12 |
GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
|
13 |
SEARCH_ENGINE_ID = os.getenv('SEARCH_ENGINE_ID')
|
14 |
|
15 |
+
AZURE_TEXT_MODEL = ["gpt-4o-mini", "gpt-4o"]
|
16 |
+
AZURE_IMAGE_MODEL = ["dall-e-3", "Stable Diffusion (not supported)"]
|
17 |
|
18 |
def load_url(url):
|
19 |
"""
|
|
|
41 |
|
42 |
return content.title, content.text, image
|
43 |
|
|
|
|
|
44 |
|
45 |
+
def generate_analysis_report(news_title:str, news_content: str, news_image: Image):
|
46 |
+
news_analysis.load_news(news_title, news_content, news_image)
|
47 |
+
return news_analysis.generate_analysis_report(), news_analysis.analyze_details()
|
48 |
+
|
49 |
+
news_analysis = NewsAnalysis()
|
50 |
# Define the GUI
|
51 |
with gr.Blocks() as demo:
|
52 |
gr.Markdown("# FAKE NEWS DETECTION")
|
|
|
59 |
|
60 |
with gr.Accordion("1. Enter a URL"):
|
61 |
url_input = gr.Textbox(
|
62 |
+
label="",
|
63 |
+
show_label=False,
|
64 |
value="https://bbc.com/future/article/20250110-how-often-you-should-wash-your-towels-according-to-science",
|
65 |
)
|
66 |
load_button = gr.Button("Load URL")
|
67 |
|
68 |
+
with gr.Accordion("2. Select content-generation models", open=True):
|
69 |
with gr.Row():
|
70 |
+
text_generation_model = gr.Dropdown(choices=AZURE_TEXT_MODEL, label="Text-generation model")
|
71 |
+
image_generation_model = gr.Dropdown(choices=AZURE_IMAGE_MODEL, label="Image-generation model")
|
72 |
+
generate_text_button = gr.Button("Generate text")
|
73 |
+
generate_image_button = gr.Button("Generate image")
|
74 |
|
75 |
with gr.Accordion("3. Replace any terms", open=True):
|
76 |
replace_df = gr.Dataframe(
|
|
|
85 |
# GENERATED CONTENT
|
86 |
with gr.Column(scale=1):
|
87 |
with gr.Accordion("Generated News Contents"):
|
|
|
88 |
news_title = gr.Textbox(label="Title", value="")
|
89 |
+
news_image = gr.Image(label="Image", type="filepath")
|
90 |
news_content = gr.Textbox(label="Content", value="", lines=12)
|
91 |
|
92 |
# FAKE NEWS ANALYSIS REPORT
|
93 |
with gr.Column(scale=1):
|
94 |
with gr.Accordion("Fake News Analysis"):
|
95 |
+
detection_button = gr.Button("Check for fake news")
|
96 |
+
analyzed_information = gr.HTML()
|
97 |
+
with gr.Accordion("Detailed information"):
|
98 |
+
detailed_analysis = gr.HTML()
|
99 |
|
100 |
# Connect events
|
101 |
load_button.click(
|
|
|
103 |
inputs=url_input,
|
104 |
outputs=[news_title, news_content, news_image]
|
105 |
)
|
106 |
+
replace_button.click(replace_text,
|
107 |
inputs=[news_title, news_content, replace_df],
|
108 |
outputs=[news_title, news_content])
|
109 |
+
generate_text_button.click(generate_fake_text,
|
110 |
+
inputs=[text_generation_model, news_title, news_content],
|
111 |
outputs=[news_title, news_content])
|
112 |
+
generate_image_button.click(generate_fake_image,
|
113 |
+
inputs=[image_generation_model, news_title],
|
114 |
+
outputs=[news_image])
|
115 |
detection_button.click(generate_analysis_report,
|
116 |
+
inputs=[news_title, news_content, news_image],
|
117 |
+
outputs=[analyzed_information, detailed_analysis])
|
118 |
+
|
|
|
|
|
119 |
# change Image
|
120 |
#url_input.change(load_image, inputs=url_input, outputs=image_view)
|
121 |
+
|
122 |
+
gr.Examples(
|
123 |
+
examples=[
|
124 |
+
["https://www.bbc.com/travel/article/20250127-one-of-the-last-traders-on-the-silk-road"],
|
125 |
+
["https://bbc.com/future/article/20250110-how-often-you-should-wash-your-towels-according-to-science"],
|
126 |
+
],
|
127 |
+
inputs=[url_input],
|
128 |
+
label="Examples",
|
129 |
+
example_labels=[
|
130 |
+
"BBC news 1",
|
131 |
+
"BBC news 2",
|
132 |
+
],
|
133 |
+
)
|
134 |
+
|
135 |
+
demo.launch()
|
136 |
+
|
137 |
+
|
138 |
+
# https://www.bbc.com/travel/article/20250127-one-of-the-last-traders-on-the-silk-road
|
139 |
+
# https://bbc.com/future/article/20250110-how-often-you-should-wash-your-towels-according-to-science
|
src/application/content_detection.py
CHANGED
@@ -1,110 +1,298 @@
|
|
1 |
-
from
|
2 |
-
from src.application.
|
|
|
|
|
3 |
|
4 |
|
5 |
-
|
6 |
-
|
7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
|
9 |
-
|
10 |
-
|
|
|
11 |
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
if
|
26 |
-
|
27 |
else:
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
|
|
|
|
|
|
|
|
35 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
|
37 |
-
def
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
<
|
105 |
-
<
|
106 |
-
|
107 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
108 |
"""
|
|
|
|
|
|
|
|
|
|
|
109 |
|
110 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from difflib import SequenceMatcher
|
2 |
+
from src.application.image.image_detection import detect_image_by_ai_model, detect_image_by_reverse_search, detect_image_from_news_image
|
3 |
+
from src.application.text.model_detection import detect_text_by_ai_model
|
4 |
+
from src.application.text.search_detection import check_human, detect_text_by_relative_search
|
5 |
|
6 |
|
7 |
+
class NewsAnalysis():
|
8 |
+
def __init__(self):
|
9 |
+
self.news_text = ""
|
10 |
+
self.news_title = ""
|
11 |
+
self.news_content = ""
|
12 |
+
self.news_image = ""
|
13 |
+
|
14 |
+
self.text_prediction_label = ""
|
15 |
+
self.text_prediction_score = -1
|
16 |
+
self.text_referent_url = None
|
17 |
+
self.image_prediction_label = ""
|
18 |
+
self.image_prediction_score = -1
|
19 |
+
self.image_referent_url = None
|
20 |
+
self.news_prediction_label = ""
|
21 |
+
self.news_prediction_score = -1
|
22 |
+
|
23 |
+
self.found_img_url = []
|
24 |
+
self.aligned_sentences = []
|
25 |
+
self.is_paraphrased = False
|
26 |
+
|
27 |
+
def load_news(self, news_title, news_content, news_image):
|
28 |
+
self.news_text = news_title + "\n\n" + news_content
|
29 |
+
self.news_title = news_title
|
30 |
+
self.news_content = news_content
|
31 |
+
self.news_image = news_image
|
32 |
|
33 |
+
def determine_text_origin(self):
|
34 |
+
"""
|
35 |
+
Determines the origin of the given text based on paraphrasing detection and human authorship analysis.
|
36 |
|
37 |
+
Args:
|
38 |
+
text: The input text to be analyzed.
|
39 |
+
|
40 |
+
Returns:
|
41 |
+
str: The predicted origin of the text:
|
42 |
+
- "HUMAN": If the text is likely written by a human.
|
43 |
+
- "MACHINE": If the text is likely generated by a machine.
|
44 |
+
"""
|
45 |
+
print("CHECK TEXT:")
|
46 |
+
print("\tFrom search engine:")
|
47 |
+
# Classify by search engine
|
48 |
+
self.is_paraphrased, self.text_referent_url, self.aligned_sentences, self.found_img_url = detect_text_by_relative_search(self.news_text)
|
49 |
+
|
50 |
+
if self.is_paraphrased is False:
|
51 |
+
self.text_prediction_label = "UNKNOWN"
|
52 |
else:
|
53 |
+
self.text_prediction_score = 100
|
54 |
+
if check_human(self.aligned_sentences):
|
55 |
+
self.text_prediction_label = "HUMAN"
|
56 |
+
else:
|
57 |
+
self.text_prediction_label = "MACHINE"
|
58 |
+
|
59 |
+
# Classify text by AI model
|
60 |
+
print("\tFrom AI model:")
|
61 |
+
if self.text_prediction_label == "UNKNOWN":
|
62 |
+
self.text_prediction_label, self.text_prediction_score = detect_text_by_ai_model(self.news_text)
|
63 |
+
self.text_prediction_score *= 100
|
64 |
|
65 |
+
def detect_image_origin(self):
|
66 |
+
print("CHECK IMAGE:")
|
67 |
+
if self.news_image is None:
|
68 |
+
self.image_prediction_label = "UNKNOWN"
|
69 |
+
self.image_prediction_score = 0.0
|
70 |
+
self.image_referent_url = None
|
71 |
+
return
|
72 |
+
|
73 |
+
print(f"\t: Img path: {self.news_image}")
|
74 |
+
matched_url, similarity = detect_image_from_news_image(self.news_image, self.found_img_url)
|
75 |
+
if matched_url is not None:
|
76 |
+
print(f"matching image: {matched_url}\nsimilarity: {similarity}\n")
|
77 |
+
self.image_prediction_label = "HUMAN"
|
78 |
+
self.image_prediction_score = similarity
|
79 |
+
self.image_referent_url = matched_url
|
80 |
+
return
|
81 |
+
|
82 |
+
matched_url, similarity = detect_image_by_reverse_search(self.news_image)
|
83 |
+
if matched_url is not None:
|
84 |
+
print(f"matching image: {matched_url}\nsimilarity: {similarity}\n")
|
85 |
+
self.image_prediction_label = "HUMAN"
|
86 |
+
self.image_prediction_score = similarity
|
87 |
+
self.image_referent_url = matched_url
|
88 |
+
return
|
89 |
+
|
90 |
+
detected_label, score = detect_image_by_ai_model(self.news_image)
|
91 |
+
if detected_label:
|
92 |
+
self.image_prediction_label = detected_label
|
93 |
+
self.image_prediction_score = score
|
94 |
+
self.image_referent_url = None
|
95 |
+
return
|
96 |
+
|
97 |
+
self.image_prediction_label = "UNKNOWN"
|
98 |
+
self.image_prediction_score = 50
|
99 |
+
self.image_referent_url = None
|
100 |
|
101 |
+
def determine_news_origin(self):
|
102 |
+
if self.text_prediction_label == "MACHINE":
|
103 |
+
text_prediction_score = 100 - self.text_prediction_score
|
104 |
+
elif self.text_prediction_label == "UNKNOWN":
|
105 |
+
text_prediction_score = 50
|
106 |
+
else:
|
107 |
+
text_prediction_score = self.text_prediction_score
|
108 |
+
|
109 |
+
if self.image_prediction_label == "MACHINE":
|
110 |
+
image_prediction_score = 100 - self.image_prediction_score
|
111 |
+
elif self.image_prediction_label == "UNKNOWN":
|
112 |
+
image_prediction_score = 50
|
113 |
+
else:
|
114 |
+
image_prediction_score = self.image_prediction_score
|
115 |
+
|
116 |
+
news_prediction_score = (text_prediction_score + image_prediction_score) / 2
|
117 |
+
if news_prediction_score > 50:
|
118 |
+
self.news_prediction_score = news_prediction_score
|
119 |
+
self.news_prediction_label = "HUMAN"
|
120 |
+
else:
|
121 |
+
self.news_prediction_score = 100 - news_prediction_score
|
122 |
+
self.news_prediction_label = "MACHINE"
|
123 |
+
|
124 |
+
def generate_analysis_report(self):
|
125 |
+
self.determine_text_origin()
|
126 |
+
self.detect_image_origin()
|
127 |
+
self.determine_news_origin()
|
128 |
+
|
129 |
+
# Forensic analysis
|
130 |
+
if self.text_prediction_label == "MACHINE":
|
131 |
+
text_prediction_label = "The text is modified by GPT-4o (AI)"
|
132 |
+
else:
|
133 |
+
text_prediction_label = "The text is written by HUMAN"
|
134 |
+
|
135 |
+
if self.image_prediction_label == "MACHINE":
|
136 |
+
image_prediction_label = "The image is generated by Dall-e (AI)"
|
137 |
+
else:
|
138 |
+
image_prediction_label = "The image is generated by HUMAN"
|
139 |
+
|
140 |
+
if self.news_prediction_label == "MACHINE":
|
141 |
+
news_prediction_label = "The whole news generated by AI"
|
142 |
+
else:
|
143 |
+
news_prediction_label = "The whole news written by HUMAN"
|
144 |
+
|
145 |
+
# Misinformation analysis
|
146 |
+
out_of_context_results = "cohesive"
|
147 |
+
if out_of_context_results == "cohesive":
|
148 |
+
out_of_context_results = "The input news is cohesive (non-out-of-context)"
|
149 |
+
else:
|
150 |
+
out_of_context_results = "The input news is out-of-context"
|
151 |
+
out_of_context_prediction_score = 96.7
|
152 |
+
|
153 |
+
# Description
|
154 |
+
description = "The description should be concise, clear, and aimed at helping general readers understand the case."
|
155 |
+
|
156 |
+
if self.text_referent_url is None:
|
157 |
+
referred_news = "<li>No referent information</li>"
|
158 |
+
else:
|
159 |
+
print (f"self.text_referent_url: {self.text_referent_url}")
|
160 |
+
referred_news = f"""<li><a href="{self.text_referent_url}" target="_blank">"Referred news: " + {self.text_referent_url[:40] + "..."}</a></li>"""
|
161 |
+
|
162 |
+
if self.image_referent_url is None:
|
163 |
+
referred_image = "<li>No referent information</li>"
|
164 |
+
else:
|
165 |
+
referred_image = f"""<li><a href="{self.text_referent_url}" target="_blank">"Referred news: " + {self.text_referent_url[:40] + "..."}</a></li>"""
|
166 |
+
|
167 |
+
html_template = f"""
|
168 |
+
<div>
|
169 |
+
<h3>Originality:</h3>
|
170 |
+
<ul>
|
171 |
+
{referred_news}
|
172 |
+
{referred_image}
|
173 |
+
</ul>
|
174 |
+
</div>
|
175 |
+
|
176 |
+
<div>
|
177 |
+
<h3>Forensic:</h3>
|
178 |
+
<b>{news_prediction_label} (confidence = {self.news_prediction_score:.2f}%)</b>
|
179 |
+
<ul>
|
180 |
+
<li>{text_prediction_label} (confidence = {self.text_prediction_score:.2f}%)</li>
|
181 |
+
<li>{image_prediction_label} (confidence = {self.image_prediction_score:.2f}%)</li>
|
182 |
+
</ul>
|
183 |
+
</div>
|
184 |
+
|
185 |
+
<div>
|
186 |
+
<h3>Misinformation (placeholder):</h3>
|
187 |
+
<ul>
|
188 |
+
<li>The input news is {out_of_context_results} (confidence = {out_of_context_prediction_score:.2f}%)</li>
|
189 |
+
</ul>
|
190 |
+
</div>
|
191 |
+
|
192 |
+
<div>
|
193 |
+
<h3>Description (optional, placeholder):</h3>
|
194 |
+
<ul>
|
195 |
+
<li>{description}</li>
|
196 |
+
</ul>
|
197 |
+
</div>
|
198 |
+
"""
|
199 |
+
|
200 |
+
return html_template
|
201 |
+
|
202 |
+
|
203 |
+
def analyze_details(self):
|
204 |
+
self.aligned_sentences
|
205 |
+
final_table = []
|
206 |
+
|
207 |
+
for pair in self.aligned_sentences:
|
208 |
+
input_words, source_words, input_indexes, source_indexes = (
|
209 |
+
self.highlight_overlap_by_word_to_list(
|
210 |
+
pair["input_sentence"],
|
211 |
+
pair["matched_sentence"],
|
212 |
+
)
|
213 |
+
)
|
214 |
+
final_table.append(
|
215 |
+
(input_words, source_words, input_indexes, source_indexes),
|
216 |
+
)
|
217 |
+
|
218 |
+
if len(final_table) != 0:
|
219 |
+
html_table = self.create_table(final_table)
|
220 |
+
else:
|
221 |
+
html_table = ""
|
222 |
+
return html_table
|
223 |
+
|
224 |
+
def highlight_overlap_by_word_to_list(self, text1, text2):
|
225 |
+
"""
|
226 |
+
Return
|
227 |
+
- list of words in text1
|
228 |
+
- list of words in text2
|
229 |
+
- list of index of highlight words in text 1
|
230 |
+
- list of index of highlight words in text 2
|
231 |
+
"""
|
232 |
+
# TΓ‘ch chuα»i thΓ nh cΓ‘c tα»« (word) dα»±a vΓ o khoαΊ£ng trαΊ―ng
|
233 |
+
words1 = text1.split()
|
234 |
+
words2 = text2.split()
|
235 |
+
|
236 |
+
index1 = []
|
237 |
+
index2 = []
|
238 |
+
|
239 |
+
# Sα» dα»₯ng SequenceMatcher Δα» tΓ¬m cΓ‘c ΔoαΊ‘n trΓΉng lαΊ·p giα»―a danh sΓ‘ch cΓ‘c tα»«
|
240 |
+
matcher = SequenceMatcher(None, words1, words2)
|
241 |
+
|
242 |
+
highlighted_text1 = []
|
243 |
+
highlighted_text2 = []
|
244 |
+
|
245 |
+
# Theo dΓ΅i vα» trΓ hiα»n tαΊ‘i trong words1 vΓ words2
|
246 |
+
current_pos1 = 0
|
247 |
+
current_pos2 = 0
|
248 |
+
|
249 |
+
# LαΊ·p qua cΓ‘c ΔoαΊ‘n so khα»p
|
250 |
+
for match in matcher.get_matching_blocks():
|
251 |
+
start1, start2, length = match
|
252 |
+
|
253 |
+
# ThΓͺm cΓ‘c tα»« khΓ΄ng trΓΉng lαΊ·p vΓ o (giα»― nguyΓͺn)
|
254 |
+
highlighted_text1.extend(words1[current_pos1:start1])
|
255 |
+
highlighted_text2.extend(words2[current_pos2:start2])
|
256 |
+
|
257 |
+
if length > 0:
|
258 |
+
for i in range(start1, start1 + length):
|
259 |
+
index1.append(i)
|
260 |
+
for i in range(start2, start2 + length):
|
261 |
+
index2.append(i)
|
262 |
+
|
263 |
+
# CαΊp nhαΊt vα» trΓ hiα»n tαΊ‘i
|
264 |
+
current_pos1 = start1 + length
|
265 |
+
current_pos2 = start2 + length
|
266 |
+
|
267 |
+
return words1, words2, index1, index2
|
268 |
+
|
269 |
+
def create_table(self, data):
|
270 |
+
table_rows = "\n".join([self.format_pair(pair) for pair in data])
|
271 |
+
return f"""
|
272 |
+
<h5>Comparison between input news and <a href={self.text_referent_url} target="_blank">source news</a></h5>
|
273 |
+
<table border="1" style="width:100%; text-align:left; border-collapse:collapse;">
|
274 |
+
<thead>
|
275 |
+
<tr>
|
276 |
+
<th>Input sentence</th>
|
277 |
+
<th>Source sentence</th>
|
278 |
+
</tr>
|
279 |
+
</thead>
|
280 |
+
<tbody>
|
281 |
+
{table_rows}
|
282 |
+
</tbody>
|
283 |
+
</table>
|
284 |
"""
|
285 |
+
|
286 |
+
def format_pair(self, pair):
|
287 |
+
input_sentence = self.highlight_text(pair[0], pair[2])
|
288 |
+
source_sentence = self.highlight_text(pair[1], pair[3])
|
289 |
+
return f"<tr><td>{input_sentence}</td><td>{source_sentence}</td></tr>"
|
290 |
|
291 |
+
def highlight_text(self, words, indexes):
|
292 |
+
final_words = words
|
293 |
+
for index in indexes:
|
294 |
+
final_words[index] = (
|
295 |
+
f"<span style='color:#00FF00; font-weight:bold;'>{words[index]}</span>"
|
296 |
+
)
|
297 |
+
return " ".join(final_words)
|
298 |
+
|
src/application/content_generation.py
CHANGED
@@ -1,3 +1,4 @@
|
|
|
|
1 |
import openai
|
2 |
from dotenv import load_dotenv
|
3 |
import os
|
@@ -13,43 +14,88 @@ client = openai.AzureOpenAI(
|
|
13 |
azure_endpoint = AZURE_OPENAI_ENDPOINT,
|
14 |
)
|
15 |
|
16 |
-
def
|
17 |
# Generate text using the selected models
|
18 |
-
|
19 |
-
|
|
|
|
|
|
|
|
|
|
|
20 |
if title and content:
|
21 |
-
|
22 |
-
|
23 |
elif title:
|
24 |
-
|
25 |
-
|
26 |
elif content:
|
27 |
-
|
28 |
-
|
29 |
|
30 |
# Generate text using the text generation model
|
31 |
-
|
32 |
-
return title, generated_text
|
33 |
-
|
34 |
-
def generate_text(model, full_context, input_type):
|
35 |
-
# Generate text using the selected model
|
36 |
-
if input_type == "":
|
37 |
-
prompt = "Generate a random fake news article"
|
38 |
-
else:
|
39 |
-
prompt = f"Generate a fake news article (title and content) based on the following: # Title: {input_type}:\n\n# Content: {full_context}"
|
40 |
-
|
41 |
try:
|
42 |
response = client.chat.completions.create(
|
43 |
-
model=
|
44 |
messages = [{"role": "system", "content": prompt}],
|
45 |
)
|
46 |
|
47 |
print("Response from OpenAI API: ", response.choices[0].message.content)
|
48 |
-
|
49 |
|
50 |
except openai.OpenAIError as e:
|
51 |
print(f"Error interacting with OpenAI API: {e}")
|
52 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
53 |
|
54 |
def replace_text(news_title, news_content, replace_df):
|
55 |
"""
|
|
|
1 |
+
import json
|
2 |
import openai
|
3 |
from dotenv import load_dotenv
|
4 |
import os
|
|
|
14 |
azure_endpoint = AZURE_OPENAI_ENDPOINT,
|
15 |
)
|
16 |
|
17 |
+
def generate_fake_text(text_generation_model, title, content):
|
18 |
# Generate text using the selected models
|
19 |
+
prompt = """Generate a random fake news tittle in this format:
|
20 |
+
---
|
21 |
+
# Title: [Fake Title]
|
22 |
+
# Content:
|
23 |
+
[Fake Content]
|
24 |
+
---
|
25 |
+
"""
|
26 |
if title and content:
|
27 |
+
prompt += """base on the following context:
|
28 |
+
# Title: {news_title}:\n# Content: {news_content}"""
|
29 |
elif title:
|
30 |
+
prompt += """base on the following context:
|
31 |
+
# Title: {news_title}:\n"""
|
32 |
elif content:
|
33 |
+
prompt += """base on the following context:
|
34 |
+
# Content: {news_content}"""
|
35 |
|
36 |
# Generate text using the text generation model
|
37 |
+
# Generate text using the selected model
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
try:
|
39 |
response = client.chat.completions.create(
|
40 |
+
model=text_generation_model,
|
41 |
messages = [{"role": "system", "content": prompt}],
|
42 |
)
|
43 |
|
44 |
print("Response from OpenAI API: ", response.choices[0].message.content)
|
45 |
+
fake_text = response.choices[0].message.content
|
46 |
|
47 |
except openai.OpenAIError as e:
|
48 |
print(f"Error interacting with OpenAI API: {e}")
|
49 |
+
fake_text = ""
|
50 |
+
|
51 |
+
if fake_text != "":
|
52 |
+
fake_title, fake_content = extract_title_content(fake_text)
|
53 |
+
return fake_title, fake_content
|
54 |
+
|
55 |
+
|
56 |
+
def extract_title_content(fake_news):
|
57 |
+
"""
|
58 |
+
Extracts the title and content from the generated fake news string.
|
59 |
+
|
60 |
+
This function parses a string containing fake news, which is expected to have
|
61 |
+
a specific format with a title and content section marked by '# Title:' and
|
62 |
+
'# Content:' respectively.
|
63 |
+
|
64 |
+
Args:
|
65 |
+
fake_news (str): A string containing the generated fake news in the expected format.
|
66 |
+
|
67 |
+
Returns:
|
68 |
+
tuple: A tuple containing two elements:
|
69 |
+
- title (str): The extracted title of the fake news.
|
70 |
+
- content (str): The extracted content of the fake news.
|
71 |
+
|
72 |
+
Note:
|
73 |
+
The function assumes that the input string follows the expected format.
|
74 |
+
If the format is not as expected, it may return unexpected results.
|
75 |
+
"""
|
76 |
+
# Extract the title and content from the generated fake news
|
77 |
+
title_start_index = fake_news.find("# Title: ") + len("# Title: ")
|
78 |
+
title_end_index = fake_news.find("\n", title_start_index)
|
79 |
+
title = fake_news[title_start_index:title_end_index].strip()
|
80 |
+
|
81 |
+
content_start_index = fake_news.find("\n# Content: ") + len("\n# Content: ")
|
82 |
+
content = fake_news[content_start_index:].strip()
|
83 |
+
|
84 |
+
return title, content
|
85 |
+
|
86 |
+
def generate_fake_image(model, title):
|
87 |
+
if len(title) > 0:
|
88 |
+
IMAGE_PROMPT = f"Generate a random image about {title}"
|
89 |
+
else:
|
90 |
+
IMAGE_PROMPT = "Generate a random image"
|
91 |
+
result = client.images.generate(
|
92 |
+
model="dall-e-3", # the name of your DALL-E 3 deployment
|
93 |
+
prompt=IMAGE_PROMPT,
|
94 |
+
n=1
|
95 |
+
)
|
96 |
+
image_url = json.loads(result.model_dump_json())['data'][0]['url']
|
97 |
+
return image_url
|
98 |
+
|
99 |
|
100 |
def replace_text(news_title, news_content, replace_df):
|
101 |
"""
|
src/application/image/image_comparison.py
ADDED
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import requests
|
2 |
+
from io import BytesIO
|
3 |
+
from PIL import Image
|
4 |
+
import imagehash
|
5 |
+
from src.application.image.search_yandex import YandexReverseImageSearcher
|
6 |
+
|
7 |
+
def get_image_from_url(url):
|
8 |
+
try:
|
9 |
+
response = requests.get(url)
|
10 |
+
return Image.open(BytesIO(response.content))
|
11 |
+
except Exception as e:
|
12 |
+
print(f"Error opening image: {e}")
|
13 |
+
return None
|
14 |
+
|
15 |
+
def get_image_from_file(file_path):
|
16 |
+
try:
|
17 |
+
return Image.open(file_path)
|
18 |
+
except FileNotFoundError:
|
19 |
+
print(f"Error occurred while opening image from file: {file_path}")
|
20 |
+
return None
|
21 |
+
|
22 |
+
def standardize_image(image):
|
23 |
+
# Convert to RGB if needed
|
24 |
+
if image.mode in ('RGBA', 'LA'):
|
25 |
+
background = Image.new('RGB', image.size, (255, 255, 255))
|
26 |
+
background.paste(image, mask=image.split()[-1])
|
27 |
+
image = background
|
28 |
+
elif image.mode != 'RGB':
|
29 |
+
image = image.convert('RGB')
|
30 |
+
|
31 |
+
# Resize to standard size (e.g. 256x256)
|
32 |
+
standard_size = (256, 256)
|
33 |
+
image = image.resize(standard_size)
|
34 |
+
|
35 |
+
return image
|
36 |
+
|
37 |
+
def compare_images(image1, image2):
|
38 |
+
# Standardize both images first
|
39 |
+
img1_std = standardize_image(image1)
|
40 |
+
img2_std = standardize_image(image2)
|
41 |
+
|
42 |
+
hash1 = imagehash.average_hash(img1_std)
|
43 |
+
hash2 = imagehash.average_hash(img2_std)
|
44 |
+
return hash1 - hash2 # Returns the Hamming distance between the hashes
|
45 |
+
|
46 |
+
if __name__ == '__main__':
|
47 |
+
image_url = 'https://i.pinimg.com/originals/c4/50/35/c450352ac6ea8645ead206721673e8fb.png'
|
48 |
+
|
49 |
+
# Get the image from URL
|
50 |
+
url_image = get_image_from_url(image_url)
|
51 |
+
|
52 |
+
# Search image
|
53 |
+
rev_img_searcher = YandexReverseImageSearcher()
|
54 |
+
res = rev_img_searcher.search(image_url)
|
55 |
+
|
56 |
+
for search_item in res:
|
57 |
+
print(f'Title: {search_item.page_title}')
|
58 |
+
# print(f'Site: {search_item.page_url}')
|
59 |
+
print(f'Img: {search_item.image_url}\n')
|
60 |
+
|
61 |
+
# Compare each search result image with the input image
|
62 |
+
result_image = get_image_from_url(search_item.image_url)
|
63 |
+
result_difference = compare_images(result_image, url_image)
|
64 |
+
print(f"Difference with search result: {result_difference}")
|
65 |
+
if result_difference == 0:
|
66 |
+
break
|
src/application/image/image_detection.py
ADDED
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
from src.application.image.image_comparison import compare_images, get_image_from_file, get_image_from_url
|
3 |
+
from src.application.image.model_detection import image_generation_detection
|
4 |
+
from src.application.image.search_yandex import yandex_reverse_image_search
|
5 |
+
|
6 |
+
|
7 |
+
def compare_list_of_images(news_image_path, img_urls):
|
8 |
+
news_image = get_image_from_file(news_image_path) # TODO: news_image_path is arrays
|
9 |
+
if news_image is None:
|
10 |
+
return None, -1
|
11 |
+
|
12 |
+
matched_url = ""
|
13 |
+
max_similarity = 0
|
14 |
+
for url in img_urls:
|
15 |
+
print(f"\t{url}")
|
16 |
+
referred_image = get_image_from_url(url)
|
17 |
+
if referred_image is None:
|
18 |
+
continue
|
19 |
+
distance = compare_images(news_image, referred_image) # Hamming algorithm
|
20 |
+
similarity = max(100 - distance, 0)
|
21 |
+
if similarity > max_similarity:
|
22 |
+
max_similarity = similarity
|
23 |
+
matched_url = url
|
24 |
+
|
25 |
+
if max_similarity > 90:
|
26 |
+
return matched_url, max_similarity
|
27 |
+
return None, -1
|
28 |
+
|
29 |
+
|
30 |
+
def detect_image_from_news_image(news_image_path, image_urls):
    """Compare the news image against the images referenced by the article itself."""
    print("\tFrom news:")
    for candidate in image_urls:
        print(f"\t{candidate}")
    return compare_list_of_images(news_image_path, image_urls)
|
35 |
+
|
36 |
+
def detect_image_by_reverse_search(news_image_path):
    """Reverse-search the news image on Yandex and compare it against the hits."""
    candidate_urls = yandex_reverse_image_search(news_image_path)  # url or file_path
    print("\tFrom search engine:")
    for candidate in candidate_urls:
        print(f"\t\t{candidate}")
    return compare_list_of_images(news_image_path, candidate_urls)
|
42 |
+
|
43 |
+
|
44 |
+
def detect_image_by_ai_model(news_image_path):
    """Classify the news image as human- or machine-generated with the AI model.

    Returns:
        (label, confidence) as produced by image_generation_detection.
    """
    print("\tFrom AI model:")
    return image_generation_detection(news_image_path)
|
src/application/image/model_detection.py
ADDED
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from sklearn.metrics import roc_auc_score
|
2 |
+
from torchmetrics import Accuracy, Recall
|
3 |
+
import pytorch_lightning as pl
|
4 |
+
import timm
|
5 |
+
import torch
|
6 |
+
import torch.nn.functional as F
|
7 |
+
import logging
|
8 |
+
from PIL import Image
|
9 |
+
import torchvision.transforms as transforms
|
10 |
+
from torchvision.transforms import v2
|
11 |
+
|
12 |
+
# Log training/inference events to a file; overwritten on each run (filemode='w').
logging.basicConfig(filename='training.log',filemode='w',level=logging.INFO, force=True)
# Path to the trained image-classifier Lightning checkpoint loaded at inference time.
CHECKPOINT = "models/image_classifier/image-classifier-step=8008-val_loss=0.11.ckpt"
|
14 |
+
|
15 |
+
class ImageClassifier(pl.LightningModule):
    """Binary image classifier (ResNet-50 backbone, single logit output).

    Trained with BCE-with-logits plus an SD (squared-logit) penalty weighted
    by ``lmd``. ``sigmoid(logit)`` is interpreted as the probability of the
    positive ("machine-generated") class downstream — TODO confirm label
    polarity against the training data.
    """

    def __init__(self, lmd=0):
        super().__init__()
        # ResNet-50 with a single-logit head for binary classification.
        self.model = timm.create_model('resnet50', pretrained=True, num_classes=1)
        self.accuracy = Accuracy(task='binary', threshold=0.5)
        self.recall = Recall(task='binary', threshold=0.5)
        # Per-batch validation outputs; consumed and cleared in
        # on_validation_epoch_end to compute an epoch-level AUC.
        self.validation_outputs = []
        # Weight of the SD (squared-logit) regularization term in training loss.
        self.lmd = lmd

    def forward(self, x):
        """Return raw logits of shape (batch, 1) for input images ``x``."""
        return self.model(x)

    def training_step(self, batch):
        """Compute BCE-with-logits loss plus the SD penalty for one batch.

        The batch is a 3-tuple; the third element (unused here) appears to be
        a domain identifier, per predict_step — verify against the dataset.
        """
        images, labels, _ = batch
        outputs = self.forward(images).squeeze()

        print(f"Shape of outputs (training): {outputs.shape}")
        print(f"Shape of labels (training): {labels.shape}")

        loss = F.binary_cross_entropy_with_logits(outputs, labels.float())
        logging.info(f"Training Step - ERM loss: {loss.item()}")
        loss += self.lmd * (outputs ** 2).mean()  # SD loss penalty
        logging.info(f"Training Step - SD loss: {loss.item()}")
        return loss

    def validation_step(self, batch):
        """Log batch loss/accuracy/recall and stash outputs for epoch-end AUC."""
        images, labels, _ = batch
        outputs = self.forward(images).squeeze()

        # squeeze() collapses a batch of size 1 to a 0-dim tensor; skip such
        # degenerate batches rather than feeding a scalar to the loss.
        if outputs.shape == torch.Size([]):
            return

        print(f"Shape of outputs (validation): {outputs.shape}")
        print(f"Shape of labels (validation): {labels.shape}")

        loss = F.binary_cross_entropy_with_logits(outputs, labels.float())
        preds = torch.sigmoid(outputs)
        self.log('val_loss', loss, prog_bar=True, sync_dist=True)
        self.log('val_acc', self.accuracy(preds, labels.int()), prog_bar=True, sync_dist=True)
        self.log('val_recall', self.recall(preds, labels.int()), prog_bar=True, sync_dist=True)
        output = {"val_loss": loss, "preds": preds, "labels": labels}
        self.validation_outputs.append(output)
        logging.info(f"Validation Step - Batch loss: {loss.item()}")
        return output

    def predict_step(self, batch):
        """Return (sigmoid probabilities, labels, domain) for one batch."""
        images, label, domain = batch
        outputs = self.forward(images).squeeze()
        preds = torch.sigmoid(outputs)
        return preds, label, domain

    def on_validation_epoch_end(self):
        """Aggregate stashed validation outputs into an epoch ROC-AUC score."""
        if not self.validation_outputs:
            logging.warning("No outputs in validation step to process")
            return
        preds = torch.cat([x['preds'] for x in self.validation_outputs])
        labels = torch.cat([x['labels'] for x in self.validation_outputs])
        # ROC-AUC is undefined when only one class is present in the epoch.
        if labels.unique().size(0) == 1:
            logging.warning("Only one class in validation step")
            return
        auc_score = roc_auc_score(labels.cpu(), preds.cpu())
        self.log('val_auc', auc_score, prog_bar=True, sync_dist=True)
        logging.info(f"Validation Epoch End - AUC score: {auc_score}")
        self.validation_outputs = []

    def configure_optimizers(self):
        """Use Adam over all model parameters with a fixed learning rate."""
        optimizer = torch.optim.Adam(self.model.parameters(), lr=0.0005)
        return optimizer
|
83 |
+
|
84 |
+
|
85 |
+
|
86 |
+
def load_image(image_path, transform=None):
    """Open an image file as RGB and optionally apply a transform to it."""
    img = Image.open(image_path).convert('RGB')
    return transform(img) if transform else img
|
93 |
+
|
94 |
+
|
95 |
+
def predict_single_image(image_path, model, transform=None):
    """Run the classifier on one image file and return its sigmoid probability.

    Args:
        image_path: Path to the image file.
        model: A trained model producing a single logit per image.
        transform: Optional preprocessing applied by load_image.

    Returns:
        Sigmoid probability as a Python float.
    """
    sample = load_image(image_path, transform)

    # Prefer GPU when available, otherwise fall back to CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    sample = sample.to(device)

    model.eval()
    with torch.no_grad():
        batch = sample.unsqueeze(0)  # add a batch dimension
        logit = model(batch).squeeze()
        probability = torch.sigmoid(logit).item()

    return probability
|
112 |
+
|
113 |
+
|
114 |
+
def image_generation_detection(image_path):
    """
    Classify an image as human-captured or machine-generated.

    Loads the trained checkpoint, runs a single-image prediction, and maps
    the sigmoid probability onto a label using a 0.2 decision threshold.

    Args:
        image_path: Path to the image file to classify.

    Returns:
        Tuple (label, confidence): label is "HUMAN" or "MACHINE";
        confidence is a percentage rounded to 2 decimals.
    """
    # NOTE: the checkpoint is reloaded on every call; cache the model at
    # module level if this becomes a hot path.
    model = ImageClassifier.load_from_checkpoint(CHECKPOINT)

    transform = v2.Compose([
        transforms.ToTensor(),
        v2.CenterCrop((256, 256)),
    ])

    prediction = predict_single_image(image_path, model, transform)

    # Decision threshold of 0.2: low probabilities are treated as human-made.
    if prediction <= 0.2:
        description = "Most likely human"
        image_prediction_label = "HUMAN"
    else:
        description = "Most likely machine"
        image_prediction_label = "MACHINE"

    # Confidence grows with distance from the threshold, capped at 100%.
    image_confidence = min(1, 0.5 + abs(prediction - 0.2))
    image_confidence = round(image_confidence * 100, 2)
    # Fix: the summary string was previously built but never used; log it.
    logging.info(f"{description} with confidence = {image_confidence}%")
    return image_prediction_label, image_confidence
|
135 |
+
|
136 |
+
|
137 |
+
if __name__ == "__main__":
    # Demo: classify a single image as human- or machine-generated.
    image_path = "path_to_your_image.jpg"  # Replace with your image path
    image_prediction_label, image_confidence = image_generation_detection(image_path)
|
src/application/image/search_yandex.py
ADDED
@@ -0,0 +1,200 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import time
|
2 |
+
import logging
|
3 |
+
import requests
|
4 |
+
import json
|
5 |
+
from bs4 import BeautifulSoup
|
6 |
+
from urllib.parse import quote, urlparse
|
7 |
+
|
8 |
+
# Route this module's log output (INFO and above) to error.log with timestamps.
logging.basicConfig(
    filename='error.log',
    level=logging.INFO,
    format='%(asctime)s | [%(levelname)s]: %(message)s',
    datefmt='%m-%d-%Y / %I:%M:%S %p'
)
|
14 |
+
|
15 |
+
class SearchResults:
    """Container for reverse-image-search result dicts with a readable string form."""

    def __init__(self, results):
        # Each result is a dict that may carry 'title' and 'link' keys.
        self.results = results

    def __str__(self):
        """Render each result as a '---'-delimited Title/Link block."""
        parts = []
        for item in self.results:
            parts.append("---\n")
            parts.append(f"Title: {item.get('title', 'Title not found')}\n")
            parts.append(f"Link: {item.get('link', 'Link not found')}\n")
            parts.append("---\n")
        return "".join(parts)
|
27 |
+
|
28 |
+
class YandexReverseImageSearcher:
    """Paginated reverse-image search against Yandex Images by image URL.

    NOTE(review): ``response`` is annotated as returning SearchResults but
    returns a plain string when no results are found — callers must handle
    both; consider unifying the return type.
    """

    def __init__(self):
        self.base_url = "https://yandex.ru/images/search"
        # Browser-like User-Agent to reduce the chance of being blocked.
        self.headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"}
        # Retry policy for transient HTTP errors in _make_request.
        self.retry_count = 3
        self.retry_delay = 1

    def response(self, query: str, image_url: str, max_results: int = 10, delay: int = 1) -> SearchResults:
        """Run a paginated search and collect up to max_results unique hits.

        Args:
            query: Text query accompanying the image search.
            image_url: Publicly reachable URL of the image to search by.
            max_results: Maximum number of results to return.
            delay: Seconds to sleep between result pages.

        Returns:
            SearchResults on success; a human-readable string when nothing
            was found (see class NOTE).

        Raises:
            ValueError: If query/image_url are missing or the URL lacks a
                supported image extension (via _validate_input).
        """
        self._validate_input(query, image_url)

        encoded_query = quote(query)
        encoded_image_url = quote(image_url)

        url = f"{self.base_url}?q={encoded_query}&image_url={encoded_image_url}&sbisrc=cr_1_5_2"

        all_results = []
        start_index = 0

        while len(all_results) < max_results:
            # Throttle between pages, but not before the first request.
            if start_index != 0:
                time.sleep(delay)

            paginated_url = f"{url}&start={start_index}"

            response = self._make_request(paginated_url)
            if response is None:
                break

            search_results, valid_content = self._parse_search_results(response.text)
            if not valid_content:
                logging.warning("Unexpected HTML structure encountered.")
                break

            for result in search_results:
                if len(all_results) >= max_results:
                    break
                data = self._extract_result_data(result)
                # Deduplicate by full dict equality.
                if data and data not in all_results:
                    all_results.append(data)

            # Advance by however many new results this page contributed.
            # NOTE(review): if a page yields no new results this makes no
            # progress, relying on the parse/request guards above to break.
            start_index += (len(all_results)-start_index)

        if len(all_results) == 0:
            logging.warning(f"No results were found for the given query: [{query}], and/or image URL: [{image_url}].")
            return "No results found. Please try again with a different query and/or image URL."
        else:
            return SearchResults(all_results[:max_results])

    def _validate_input(self, query: str, image_url: str):
        """Raise ValueError for a missing query, missing URL, or bad extension."""
        if not query:
            raise ValueError("Query not found. Please enter a query and try again.")
        if not image_url:
            raise ValueError("Image URL not found. Please enter an image URL and try again.")
        if not self._validate_image_url(image_url):
            raise ValueError("Invalid image URL. Please enter a valid image URL and try again.")

    def _validate_image_url(self, url: str) -> bool:
        """Return True if the URL path ends in a supported image extension."""
        parsed_url = urlparse(url)
        path = parsed_url.path.lower()
        valid_extensions = (".jpg", ".jpeg", ".png", ".webp")
        return any(path.endswith(ext) for ext in valid_extensions)

    def _make_request(self, url: str):
        """GET the URL with retries on HTTP errors; None on failure/non-HTML."""
        attempts = 0
        while attempts < self.retry_count:
            try:
                response = requests.get(url, headers=self.headers)
                # Only HTML pages are parseable downstream.
                if response.headers.get('Content-Type', '').startswith('text/html'):
                    response.raise_for_status()
                    return response
                else:
                    logging.warning("Non-HTML content received.")
                    return None
            except requests.exceptions.HTTPError as http_err:
                logging.error(f"HTTP error occurred: {http_err}")
                attempts += 1
                time.sleep(self.retry_delay)
            except Exception as err:
                # Non-HTTP failures (connection, timeout, ...) are not retried.
                logging.error(f"An error occurred: {err}")
                return None
        return None

    def _parse_search_results(self, html_content: str):
        """Return (result divs, True) on parse success, (None, False) otherwise.

        NOTE(review): the 'div.g' selector matches Google's result markup,
        not Yandex's — verify this actually finds results on Yandex pages.
        """
        try:
            soup = BeautifulSoup(html_content, "html.parser")
            return soup.find_all('div', class_='g'), True
        except Exception as e:
            logging.error(f"Error parsing HTML content: {e}")
            return None, False

    def _extract_result_data(self, result):
        """Pull {'link', 'title'} from one result div; {} if either is missing."""
        link = result.find('a', href=True)['href'] if result.find('a', href=True) else None
        title = result.find('h3').get_text(strip=True) if result.find('h3') else None
        return {"link": link, "title": title} if link and title else {}
|
122 |
+
|
123 |
+
|
124 |
+
def get_image_links(page):
    """
    Extracts original-image URLs from a Yandex reverse-image-search page.

    Args:
        page: The HTML content as a string.

    Returns:
        A list of image URLs; empty if the expected markup is absent or
        the embedded JSON payload is malformed.
    """
    soup = BeautifulSoup(page, 'html.parser')

    # Find the specific section containing image links
    gallery_data = soup.find('div', {'class': 'cbir-section cbir-section_name_sites'})
    if gallery_data is None:
        return []

    # Find the container of image links
    image_links_container = gallery_data.find('div', {'class': 'Root'})
    if image_links_container is None:
        return []

    # The container embeds its payload as JSON in a 'data-state' attribute.
    # Fix: guard against a missing attribute / malformed JSON instead of
    # raising KeyError / JSONDecodeError.
    raw_state = image_links_container.get('data-state')
    if not raw_state:
        return []
    try:
        data_state = json.loads(raw_state)
    except json.JSONDecodeError:
        logging.error("Malformed data-state JSON in Yandex results page.")
        return []

    # Extract the original-image URL of each referenced site, skipping
    # entries without one (previously an unguarded KeyError).
    image_urls = []
    for site in data_state.get('sites', []):
        original_image_url = site.get('originalImage', {}).get('url')
        if original_image_url:
            image_urls.append(original_image_url)

    return image_urls
|
155 |
+
|
156 |
+
|
157 |
+
def yandex_reverse_image_search(file_path):
    """Upload a local image to Yandex and return the matched image URLs.

    Args:
        file_path: Path to the local image file.

    Returns:
        A list of image URLs found by the reverse search; [] on failure.
    """
    img_search_url = generate_images_search_links(file_path)
    if img_search_url is None:
        return []

    # Simulate a user agent to avoid being blocked
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
        'Content-Type': 'application/json',
    }

    try:
        response = requests.get(img_search_url, headers=headers)
        response.raise_for_status()  # Raise an exception for bad status codes
    except requests.exceptions.RequestException as e:
        print(f"Error fetching image: {e}")
        return []

    # Parse the HTML content and extract the linked image URLs.
    soup = BeautifulSoup(response.content, 'html.parser')
    return get_image_links(soup.prettify())
|
180 |
+
|
181 |
+
|
182 |
+
def generate_images_search_links(file_path):
    """
    Uploads an image to Yandex and builds the reverse-image-search URL.

    Args:
        file_path: Path to the local image file to search by.

    Returns:
        The search-results URL string, or None if the upload or response
        parsing fails.
    """
    search_url = 'https://yandex.ru/images/search'
    params = {'rpt': 'imageview', 'format': 'json', 'request': '{"blocks":[{"block":"b-page_type_search-by-image__link"}]}'}

    try:
        # Fix: use a context manager so the file handle is always closed
        # (previously the file was opened inline and leaked).
        with open(file_path, 'rb') as image_file:
            files = {'upfile': ('blob', image_file, 'image/jpeg/webp')}
            response = requests.post(search_url, params=params, files=files)
        query_string = json.loads(response.content)['blocks'][0]['params']['url']
        img_search_url = search_url + '?' + query_string
        return img_search_url
    except (OSError, requests.exceptions.RequestException,
            json.JSONDecodeError, KeyError, IndexError) as err:
        # Fix: a bare `except:` also swallowed KeyboardInterrupt/SystemExit;
        # catch only the failures this call can actually produce.
        logging.error(f"Failed to generate Yandex image search link: {err}")
        return None
|
194 |
+
|
195 |
+
|
196 |
+
if __name__ == "__main__":
    # Demo: run a reverse-image search for a local test image.
    file_path = "T:\\Projects\\prj-nict-ai-content-detection\\data\\test_data\\towels.jpg.webp"
    for image_url in yandex_reverse_image_search(file_path):
        print(f"Image URL: {image_url}")
|
src/application/text/model_detection.py
CHANGED
@@ -11,7 +11,7 @@ PARAPHRASE = "PARAPHRASE"
|
|
11 |
NON_PARAPHRASE = "NON_PARAPHRASE"
|
12 |
|
13 |
|
14 |
-
def
|
15 |
input_text: str,
|
16 |
model: str = DEFAULT_MODEL,
|
17 |
max_length: int = 512,
|
@@ -44,4 +44,4 @@ def detect_by_ai_model(
|
|
44 |
return label, confidence_score
|
45 |
except Exception as e: # Add exception handling
|
46 |
print(f"Error in Roberta model inference: {e}")
|
47 |
-
return UNKNOWN,
|
|
|
11 |
NON_PARAPHRASE = "NON_PARAPHRASE"
|
12 |
|
13 |
|
14 |
+
def detect_text_by_ai_model(
|
15 |
input_text: str,
|
16 |
model: str = DEFAULT_MODEL,
|
17 |
max_length: int = 512,
|
|
|
44 |
return label, confidence_score
|
45 |
except Exception as e: # Add exception handling
|
46 |
print(f"Error in Roberta model inference: {e}")
|
47 |
+
return UNKNOWN, 50 # Return UNKNOWN and 0.0 confidence if error
|
src/application/text/search_detection.py
CHANGED
@@ -33,7 +33,7 @@ MIN_RATIO_PARAPHRASE_NUM = 0.7
|
|
33 |
MAX_CHAR_SIZE = 30000
|
34 |
|
35 |
|
36 |
-
def
|
37 |
|
38 |
checked_urls = set()
|
39 |
searched_phrases = generate_search_phrases(input_text)
|
@@ -46,19 +46,24 @@ def detect_by_relative_search(input_text, is_support_opposite = False):
|
|
46 |
if url in checked_urls: # visited url
|
47 |
continue
|
48 |
checked_urls.add(url)
|
49 |
-
print(f"\tChecking URL: {url}")
|
50 |
|
51 |
content = URLReader(url)
|
52 |
|
53 |
if content.is_extracted is True:
|
|
|
|
|
|
|
|
|
54 |
page_text = content.title + "\n" + content.text
|
55 |
-
if
|
56 |
-
print(f"\t\tβββ More than {MAX_CHAR_SIZE} characters")
|
57 |
continue
|
|
|
58 |
is_paraphrase, aligned_sentences = check_paraphrase(input_text, page_text)
|
59 |
if is_paraphrase:
|
60 |
-
return is_paraphrase, url, aligned_sentences
|
61 |
-
return False, None, []
|
62 |
|
63 |
def longest_common_subsequence(arr1, arr2):
|
64 |
"""
|
@@ -256,7 +261,7 @@ def similarity_ratio(a, b):
|
|
256 |
return 0.0 # Handle cases where inputs are not strings or None
|
257 |
return SequenceMatcher(None, a, b).ratio()
|
258 |
|
259 |
-
def check_human(
|
260 |
"""
|
261 |
Checks if a sufficient number of input sentences are found within
|
262 |
source sentences.
|
@@ -264,14 +269,14 @@ def check_human(data, min_ratio=MIN_RATIO_PARAPHRASE_NUM):
|
|
264 |
Returns:
|
265 |
bool: True if the condition is met, False otherwise.
|
266 |
"""
|
267 |
-
if not
|
268 |
return False
|
269 |
-
min_matching = math.ceil(len(
|
270 |
|
271 |
count = 0
|
272 |
|
273 |
#for input_sentence, source_sentence, similiarity, is_paraprhase in data:
|
274 |
-
for sentence in
|
275 |
if sentence["similarity"] >= 0.99:
|
276 |
count += 1
|
277 |
print(f"\tmatching_sentence_count : {count}, min_matching: {min_matching}")
|
|
|
33 |
MAX_CHAR_SIZE = 30000
|
34 |
|
35 |
|
36 |
+
def detect_text_by_relative_search(input_text, is_support_opposite = False):
|
37 |
|
38 |
checked_urls = set()
|
39 |
searched_phrases = generate_search_phrases(input_text)
|
|
|
46 |
if url in checked_urls: # visited url
|
47 |
continue
|
48 |
checked_urls.add(url)
|
49 |
+
print(f"\t\tChecking URL: {url}")
|
50 |
|
51 |
content = URLReader(url)
|
52 |
|
53 |
if content.is_extracted is True:
|
54 |
+
if content.title is None or content.text is None:
|
55 |
+
print(f"\t\t\tβββ Title or text not found")
|
56 |
+
continue
|
57 |
+
|
58 |
page_text = content.title + "\n" + content.text
|
59 |
+
if len(page_text) > MAX_CHAR_SIZE:
|
60 |
+
print(f"\t\t\tβββ More than {MAX_CHAR_SIZE} characters")
|
61 |
continue
|
62 |
+
|
63 |
is_paraphrase, aligned_sentences = check_paraphrase(input_text, page_text)
|
64 |
if is_paraphrase:
|
65 |
+
return is_paraphrase, url, aligned_sentences, content.images
|
66 |
+
return False, None, [], []
|
67 |
|
68 |
def longest_common_subsequence(arr1, arr2):
|
69 |
"""
|
|
|
261 |
return 0.0 # Handle cases where inputs are not strings or None
|
262 |
return SequenceMatcher(None, a, b).ratio()
|
263 |
|
264 |
+
def check_human(alligned_sentences, min_ratio=MIN_RATIO_PARAPHRASE_NUM):
|
265 |
"""
|
266 |
Checks if a sufficient number of input sentences are found within
|
267 |
source sentences.
|
|
|
269 |
Returns:
|
270 |
bool: True if the condition is met, False otherwise.
|
271 |
"""
|
272 |
+
if not alligned_sentences: # Handle empty data case
|
273 |
return False
|
274 |
+
min_matching = math.ceil(len(alligned_sentences) * min_ratio)
|
275 |
|
276 |
count = 0
|
277 |
|
278 |
#for input_sentence, source_sentence, similiarity, is_paraprhase in data:
|
279 |
+
for sentence in alligned_sentences:
|
280 |
if sentence["similarity"] >= 0.99:
|
281 |
count += 1
|
282 |
print(f"\tmatching_sentence_count : {count}, min_matching: {min_matching}")
|
src/application/url_reader.py
CHANGED
@@ -52,7 +52,7 @@ class URLReader():
|
|
52 |
|
53 |
self.title = news.title
|
54 |
self.text = news.text
|
55 |
-
self.images = news.images
|
56 |
self.top_image = news.top_image
|
57 |
|
58 |
def extract_content_bs(self):
|
|
|
52 |
|
53 |
self.title = news.title
|
54 |
self.text = news.text
|
55 |
+
self.images = list(set(news.images)) # Remove duplicates
|
56 |
self.top_image = news.top_image
|
57 |
|
58 |
def extract_content_bs(self):
|