pmkhanh7890 commited on
Commit
da7dbd0
Β·
1 Parent(s): 0542c93

complete the 1st version of GUI

Browse files
Yandexsample.html ADDED
The diff for this file is too large to render. See raw diff
 
app.py CHANGED
@@ -1,22 +1,23 @@
1
- #display a data
2
- import gradio as gr
3
 
4
- def data_display(replace_df):
5
- return "aaaa"
6
 
7
- with gr.Blocks() as demo:
8
- replace_df = gr.Dataframe(
9
- # headers=["Find what:", "Replace with:"],
10
- # datatype=["str", "str"],
11
- # row_count=(1, "dynamic"),
12
- # col_count=(2, "fixed"),
13
- # interactive=True
14
- )
15
- replace_button = gr.Button("Replace all")
16
- news_content = gr.Textbox(label="Content", value="", lines=12)
17
-
18
 
19
- replace_button.click(data_display,
20
- inputs=[replace_df],
21
- outputs=[news_content])
22
- demo.launch()
 
 
 
 
 
1
+ from bs4 import BeautifulSoup
2
+ import requests
3
 
4
+ from src.application.image.search_yandex import get_image_links
 
5
 
6
+
7
+ img_search_url = """https://yandex.ru/images/search?cbir_id=4481385%2Fw-xYJ246B9thwtVBmNcpkg9409&rpt=imageview&lr=10636"""
8
+ print(img_search_url)
9
+
10
+
11
+ headers = {
12
+ 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
13
+ 'Content-Type': 'application/json',
14
+ }
 
 
15
 
16
+
17
+ response = requests.get(img_search_url, headers=headers)
18
+ response.raise_for_status() # Raise an exception for bad status codes
19
+
20
+ # Parse the HTML content
21
+ soup = BeautifulSoup(response.content, 'html.parser')
22
+ image_urls = get_image_links(soup.prettify())
23
+ print(f"image_urls: {image_urls}")
application.py CHANGED
@@ -1,32 +1,19 @@
1
  import os
2
 
3
  import gradio as gr
4
- import openai
5
  import requests
6
  from PIL import Image
7
- import re
8
 
9
- from src.application.content_detection import generate_analysis_report
10
  from src.application.url_reader import URLReader
11
- from src.application.content_generation import generate_content, replace_text
12
 
13
- # from dotenv import load_dotenv
14
-
15
- # load_dotenv()
16
- # AZURE_OPENAI_API_KEY = os.getenv('AZURE_OPENAI_API_KEY')
17
- # AZURE_OPENAI_ENDPOINT = os.getenv('AZURE_OPENAI_ENDPOINT')
18
- # AZURE_OPENAI_API_VERSION = os.getenv('AZURE_OPENAI_API_VERSION')
19
-
20
- # client = openai.AzureOpenAI(
21
- # api_version = AZURE_OPENAI_API_VERSION,
22
- # api_key = AZURE_OPENAI_API_KEY,
23
- # azure_endpoint = AZURE_OPENAI_ENDPOINT,
24
- # )
25
 
26
  GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
27
  SEARCH_ENGINE_ID = os.getenv('SEARCH_ENGINE_ID')
28
 
29
- AZURE_OPENAI_MODEL = ["gpt-4o-mini", "gpt-4o"]
 
30
 
31
  def load_url(url):
32
  """
@@ -54,9 +41,12 @@ def load_url(url):
54
 
55
  return content.title, content.text, image
56
 
57
- def show_detailed_analysis(title):
58
- return f"More details of {title} will be shown here."
59
 
 
 
 
 
 
60
  # Define the GUI
61
  with gr.Blocks() as demo:
62
  gr.Markdown("# FAKE NEWS DETECTION")
@@ -69,16 +59,18 @@ with gr.Blocks() as demo:
69
 
70
  with gr.Accordion("1. Enter a URL"):
71
  url_input = gr.Textbox(
72
- label="URL",
 
73
  value="https://bbc.com/future/article/20250110-how-often-you-should-wash-your-towels-according-to-science",
74
  )
75
  load_button = gr.Button("Load URL")
76
 
77
- with gr.Accordion("2. Select a content-generation model", open=True):
78
  with gr.Row():
79
- text_generation_model = gr.Dropdown(choices=AZURE_OPENAI_MODEL, label="Text-generation model")
80
- image_generation_model = gr.Dropdown(choices=["Dall-e", "Stable Diffusion"], label="Image-generation model")
81
- generate_button = gr.Button("Random generation")
 
82
 
83
  with gr.Accordion("3. Replace any terms", open=True):
84
  replace_df = gr.Dataframe(
@@ -93,16 +85,17 @@ with gr.Blocks() as demo:
93
  # GENERATED CONTENT
94
  with gr.Column(scale=1):
95
  with gr.Accordion("Generated News Contents"):
96
- detection_button = gr.Button("Check for fake news")
97
  news_title = gr.Textbox(label="Title", value="")
98
- news_image = gr.Image(label="Image")
99
  news_content = gr.Textbox(label="Content", value="", lines=12)
100
 
101
  # FAKE NEWS ANALYSIS REPORT
102
  with gr.Column(scale=1):
103
  with gr.Accordion("Fake News Analysis"):
104
- html_out = gr.HTML()
105
- detailed_analysis_button = gr.Button("Show detailed analysis...")
 
 
106
 
107
  # Connect events
108
  load_button.click(
@@ -110,19 +103,37 @@ with gr.Blocks() as demo:
110
  inputs=url_input,
111
  outputs=[news_title, news_content, news_image]
112
  )
113
- replace_button.click(replace_text,
114
  inputs=[news_title, news_content, replace_df],
115
  outputs=[news_title, news_content])
116
- generate_button.click(generate_content,
117
- inputs=[text_generation_model, image_generation_model, news_title, news_content],
118
  outputs=[news_title, news_content])
 
 
 
119
  detection_button.click(generate_analysis_report,
120
- inputs=[news_title, news_content, news_image],
121
- outputs=html_out)
122
- detailed_analysis_button.click(show_detailed_analysis,
123
- inputs=[news_title],
124
- outputs=[html_out])
125
  # change Image
126
  #url_input.change(load_image, inputs=url_input, outputs=image_view)
127
-
128
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
 
3
  import gradio as gr
 
4
  import requests
5
  from PIL import Image
 
6
 
7
+ from src.application.content_detection import NewsAnalysis
8
  from src.application.url_reader import URLReader
9
+ from src.application.content_generation import generate_fake_image, generate_fake_text, replace_text
10
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
  GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
13
  SEARCH_ENGINE_ID = os.getenv('SEARCH_ENGINE_ID')
14
 
15
+ AZURE_TEXT_MODEL = ["gpt-4o-mini", "gpt-4o"]
16
+ AZURE_IMAGE_MODEL = ["dall-e-3", "Stable Diffusion (not supported)"]
17
 
18
  def load_url(url):
19
  """
 
41
 
42
  return content.title, content.text, image
43
 
 
 
44
 
45
+ def generate_analysis_report(news_title:str, news_content: str, news_image: Image):
46
+ news_analysis.load_news(news_title, news_content, news_image)
47
+ return news_analysis.generate_analysis_report(), news_analysis.analyze_details()
48
+
49
+ news_analysis = NewsAnalysis()
50
  # Define the GUI
51
  with gr.Blocks() as demo:
52
  gr.Markdown("# FAKE NEWS DETECTION")
 
59
 
60
  with gr.Accordion("1. Enter a URL"):
61
  url_input = gr.Textbox(
62
+ label="",
63
+ show_label=False,
64
  value="https://bbc.com/future/article/20250110-how-often-you-should-wash-your-towels-according-to-science",
65
  )
66
  load_button = gr.Button("Load URL")
67
 
68
+ with gr.Accordion("2. Select content-generation models", open=True):
69
  with gr.Row():
70
+ text_generation_model = gr.Dropdown(choices=AZURE_TEXT_MODEL, label="Text-generation model")
71
+ image_generation_model = gr.Dropdown(choices=AZURE_IMAGE_MODEL, label="Image-generation model")
72
+ generate_text_button = gr.Button("Generate text")
73
+ generate_image_button = gr.Button("Generate image")
74
 
75
  with gr.Accordion("3. Replace any terms", open=True):
76
  replace_df = gr.Dataframe(
 
85
  # GENERATED CONTENT
86
  with gr.Column(scale=1):
87
  with gr.Accordion("Generated News Contents"):
 
88
  news_title = gr.Textbox(label="Title", value="")
89
+ news_image = gr.Image(label="Image", type="filepath")
90
  news_content = gr.Textbox(label="Content", value="", lines=12)
91
 
92
  # FAKE NEWS ANALYSIS REPORT
93
  with gr.Column(scale=1):
94
  with gr.Accordion("Fake News Analysis"):
95
+ detection_button = gr.Button("Check for fake news")
96
+ analyzed_information = gr.HTML()
97
+ with gr.Accordion("Detailed information"):
98
+ detailed_analysis = gr.HTML()
99
 
100
  # Connect events
101
  load_button.click(
 
103
  inputs=url_input,
104
  outputs=[news_title, news_content, news_image]
105
  )
106
+ replace_button.click(replace_text,
107
  inputs=[news_title, news_content, replace_df],
108
  outputs=[news_title, news_content])
109
+ generate_text_button.click(generate_fake_text,
110
+ inputs=[text_generation_model, news_title, news_content],
111
  outputs=[news_title, news_content])
112
+ generate_image_button.click(generate_fake_image,
113
+ inputs=[image_generation_model, news_title],
114
+ outputs=[news_image])
115
  detection_button.click(generate_analysis_report,
116
+ inputs=[news_title, news_content, news_image],
117
+ outputs=[analyzed_information, detailed_analysis])
118
+
 
 
119
  # change Image
120
  #url_input.change(load_image, inputs=url_input, outputs=image_view)
121
+
122
+ gr.Examples(
123
+ examples=[
124
+ ["https://www.bbc.com/travel/article/20250127-one-of-the-last-traders-on-the-silk-road"],
125
+ ["https://bbc.com/future/article/20250110-how-often-you-should-wash-your-towels-according-to-science"],
126
+ ],
127
+ inputs=[url_input],
128
+ label="Examples",
129
+ example_labels=[
130
+ "BBC news 1",
131
+ "BBC news 2",
132
+ ],
133
+ )
134
+
135
+ demo.launch()
136
+
137
+
138
+ # https://www.bbc.com/travel/article/20250127-one-of-the-last-traders-on-the-silk-road
139
+ # https://bbc.com/future/article/20250110-how-often-you-should-wash-your-towels-according-to-science
src/application/content_detection.py CHANGED
@@ -1,110 +1,298 @@
1
- from src.application.text.model_detection import detect_by_ai_model
2
- from src.application.text.search_detection import check_human, detect_by_relative_search
 
 
3
 
4
 
5
- def determine_text_origin(title, content):
6
- """
7
- Determines the origin of the given text based on paraphrasing detection and human authorship analysis.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
- Args:
10
- text: The input text to be analyzed.
 
11
 
12
- Returns:
13
- str: The predicted origin of the text:
14
- - "HUMAN": If the text is likely written by a human.
15
- - "MACHINE": If the text is likely generated by a machine.
16
- """
17
- # Classify by search engine
18
- text = title + "\n\n" + content
19
- is_paraphrased, referent_url, aligned_sentences = detect_by_relative_search(text)
20
- prediction_score = 0.0
21
- if not is_paraphrased:
22
- prediction_label = "UNKNOWN"
23
- else:
24
- prediction_score = 100.0
25
- if check_human(aligned_sentences):
26
- prediction_label = "HUMAN"
27
  else:
28
- prediction_label = "MACHINE"
29
-
30
- if prediction_label == "UNKNOWN":
31
- # Classify by SOTA model
32
- prediction_label, prediction_score = detect_by_ai_model(text)
33
-
34
- return prediction_label, prediction_score, referent_url
 
 
 
 
35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
 
37
- def generate_analysis_report(news_title, news_content, news_image):
38
-
39
- text_prediction_label, text_confidence_score, text_referent_url = determine_text_origin(news_title, news_content)
40
-
41
- # Analyze text content
42
- url1 = text_referent_url
43
- #url2 = "https://example.com/article2"
44
-
45
- # Forensic analysis
46
- if text_prediction_label == "MACHINE":
47
- text_prediction_label = "The text is modified by GPT-4o (AI)"
48
- else:
49
- text_prediction_label = "The text is written by HUMAN"
50
-
51
- image_detection_results = "MACHINE"
52
- if image_detection_results == "MACHINE":
53
- image_detection_results = "The image is generated by Dall-e (AI)"
54
- else:
55
- image_detection_results = "The image is generated by HUMAN"
56
- image_confidence_score = 90.5
57
-
58
- news_detection_results = "MACHINE"
59
- if news_detection_results == "MACHINE":
60
- news_detection_results = "The whole news generated by AI"
61
- else:
62
- news_detection_results = "The whole news written by HUMAN"
63
- news_confidence_score = 97.4
64
-
65
- # Misinformation analysis
66
- out_of_context_results = "cohesive"
67
- if out_of_context_results == "cohesive":
68
- out_of_context_results = "The input news is cohesive (non-out-of-context)"
69
- else:
70
- out_of_context_results = "The input news is out-of-context"
71
- out_of_context_confidence_score = 96.7
72
-
73
- # Description
74
- description = "The description should be concise, clear, and aimed at helping general readers understand the case."
75
-
76
- html_template = f"""
77
- <h2>Placeholder for results</h2>
78
-
79
- <div>
80
- <h3>Originality:</h3>
81
- <ul>
82
- <li><a href="{url1}" target="_blank">{url1[:40] + "..."}</a></li>
83
- </ul>
84
- </div>
85
-
86
- <div>
87
- <h3>Forensic:</h3>
88
- <b>{news_detection_results} (confidence = {news_confidence_score}%)</b>
89
- <ul>
90
- <li>{text_prediction_label} (confidence = {text_confidence_score}%)</li>
91
- <li>{image_detection_results} (confidence = {image_confidence_score}%)</li>
92
- </ul>
93
- </div>
94
-
95
- <div>
96
- <h3>Misinformation:</h3>
97
- <ul>
98
- <li>The input news is {out_of_context_results} (confidence = {out_of_context_confidence_score}%)</li>
99
- </ul>
100
- </div>
101
-
102
- <div>
103
- <h3>Description (optional):</h3>
104
- <ul>
105
- <li>{description}</li>
106
- </ul>
107
- </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
  """
 
 
 
 
 
109
 
110
- return html_template
 
 
 
 
 
 
 
 
1
+ from difflib import SequenceMatcher
2
+ from src.application.image.image_detection import detect_image_by_ai_model, detect_image_by_reverse_search, detect_image_from_news_image
3
+ from src.application.text.model_detection import detect_text_by_ai_model
4
+ from src.application.text.search_detection import check_human, detect_text_by_relative_search
5
 
6
 
7
+ class NewsAnalysis():
8
+ def __init__(self):
9
+ self.news_text = ""
10
+ self.news_title = ""
11
+ self.news_content = ""
12
+ self.news_image = ""
13
+
14
+ self.text_prediction_label = ""
15
+ self.text_prediction_score = -1
16
+ self.text_referent_url = None
17
+ self.image_prediction_label = ""
18
+ self.image_prediction_score = -1
19
+ self.image_referent_url = None
20
+ self.news_prediction_label = ""
21
+ self.news_prediction_score = -1
22
+
23
+ self.found_img_url = []
24
+ self.aligned_sentences = []
25
+ self.is_paraphrased = False
26
+
27
+ def load_news(self, news_title, news_content, news_image):
28
+ self.news_text = news_title + "\n\n" + news_content
29
+ self.news_title = news_title
30
+ self.news_content = news_content
31
+ self.news_image = news_image
32
 
33
+ def determine_text_origin(self):
34
+ """
35
+ Determines the origin of the given text based on paraphrasing detection and human authorship analysis.
36
 
37
+ Args:
38
+ text: The input text to be analyzed.
39
+
40
+ Returns:
41
+ str: The predicted origin of the text:
42
+ - "HUMAN": If the text is likely written by a human.
43
+ - "MACHINE": If the text is likely generated by a machine.
44
+ """
45
+ print("CHECK TEXT:")
46
+ print("\tFrom search engine:")
47
+ # Classify by search engine
48
+ self.is_paraphrased, self.text_referent_url, self.aligned_sentences, self.found_img_url = detect_text_by_relative_search(self.news_text)
49
+
50
+ if self.is_paraphrased is False:
51
+ self.text_prediction_label = "UNKNOWN"
52
  else:
53
+ self.text_prediction_score = 100
54
+ if check_human(self.aligned_sentences):
55
+ self.text_prediction_label = "HUMAN"
56
+ else:
57
+ self.text_prediction_label = "MACHINE"
58
+
59
+ # Classify text by AI model
60
+ print("\tFrom AI model:")
61
+ if self.text_prediction_label == "UNKNOWN":
62
+ self.text_prediction_label, self.text_prediction_score = detect_text_by_ai_model(self.news_text)
63
+ self.text_prediction_score *= 100
64
 
65
+ def detect_image_origin(self):
66
+ print("CHECK IMAGE:")
67
+ if self.news_image is None:
68
+ self.image_prediction_label = "UNKNOWN"
69
+ self.image_prediction_score = 0.0
70
+ self.image_referent_url = None
71
+ return
72
+
73
+ print(f"\t: Img path: {self.news_image}")
74
+ matched_url, similarity = detect_image_from_news_image(self.news_image, self.found_img_url)
75
+ if matched_url is not None:
76
+ print(f"matching image: {matched_url}\nsimilarity: {similarity}\n")
77
+ self.image_prediction_label = "HUMAN"
78
+ self.image_prediction_score = similarity
79
+ self.image_referent_url = matched_url
80
+ return
81
+
82
+ matched_url, similarity = detect_image_by_reverse_search(self.news_image)
83
+ if matched_url is not None:
84
+ print(f"matching image: {matched_url}\nsimilarity: {similarity}\n")
85
+ self.image_prediction_label = "HUMAN"
86
+ self.image_prediction_score = similarity
87
+ self.image_referent_url = matched_url
88
+ return
89
+
90
+ detected_label, score = detect_image_by_ai_model(self.news_image)
91
+ if detected_label:
92
+ self.image_prediction_label = detected_label
93
+ self.image_prediction_score = score
94
+ self.image_referent_url = None
95
+ return
96
+
97
+ self.image_prediction_label = "UNKNOWN"
98
+ self.image_prediction_score = 50
99
+ self.image_referent_url = None
100
 
101
+ def determine_news_origin(self):
102
+ if self.text_prediction_label == "MACHINE":
103
+ text_prediction_score = 100 - self.text_prediction_score
104
+ elif self.text_prediction_label == "UNKNOWN":
105
+ text_prediction_score = 50
106
+ else:
107
+ text_prediction_score = self.text_prediction_score
108
+
109
+ if self.image_prediction_label == "MACHINE":
110
+ image_prediction_score = 100 - self.image_prediction_score
111
+ elif self.image_prediction_label == "UNKNOWN":
112
+ image_prediction_score = 50
113
+ else:
114
+ image_prediction_score = self.image_prediction_score
115
+
116
+ news_prediction_score = (text_prediction_score + image_prediction_score) / 2
117
+ if news_prediction_score > 50:
118
+ self.news_prediction_score = news_prediction_score
119
+ self.news_prediction_label = "HUMAN"
120
+ else:
121
+ self.news_prediction_score = 100 - news_prediction_score
122
+ self.news_prediction_label = "MACHINE"
123
+
124
+ def generate_analysis_report(self):
125
+ self.determine_text_origin()
126
+ self.detect_image_origin()
127
+ self.determine_news_origin()
128
+
129
+ # Forensic analysis
130
+ if self.text_prediction_label == "MACHINE":
131
+ text_prediction_label = "The text is modified by GPT-4o (AI)"
132
+ else:
133
+ text_prediction_label = "The text is written by HUMAN"
134
+
135
+ if self.image_prediction_label == "MACHINE":
136
+ image_prediction_label = "The image is generated by Dall-e (AI)"
137
+ else:
138
+ image_prediction_label = "The image is generated by HUMAN"
139
+
140
+ if self.news_prediction_label == "MACHINE":
141
+ news_prediction_label = "The whole news generated by AI"
142
+ else:
143
+ news_prediction_label = "The whole news written by HUMAN"
144
+
145
+ # Misinformation analysis
146
+ out_of_context_results = "cohesive"
147
+ if out_of_context_results == "cohesive":
148
+ out_of_context_results = "The input news is cohesive (non-out-of-context)"
149
+ else:
150
+ out_of_context_results = "The input news is out-of-context"
151
+ out_of_context_prediction_score = 96.7
152
+
153
+ # Description
154
+ description = "The description should be concise, clear, and aimed at helping general readers understand the case."
155
+
156
+ if self.text_referent_url is None:
157
+ referred_news = "<li>No referent information</li>"
158
+ else:
159
+ print (f"self.text_referent_url: {self.text_referent_url}")
160
+ referred_news = f"""<li><a href="{self.text_referent_url}" target="_blank">"Referred news: " + {self.text_referent_url[:40] + "..."}</a></li>"""
161
+
162
+ if self.image_referent_url is None:
163
+ referred_image = "<li>No referent information</li>"
164
+ else:
165
+ referred_image = f"""<li><a href="{self.text_referent_url}" target="_blank">"Referred news: " + {self.text_referent_url[:40] + "..."}</a></li>"""
166
+
167
+ html_template = f"""
168
+ <div>
169
+ <h3>Originality:</h3>
170
+ <ul>
171
+ {referred_news}
172
+ {referred_image}
173
+ </ul>
174
+ </div>
175
+
176
+ <div>
177
+ <h3>Forensic:</h3>
178
+ <b>{news_prediction_label} (confidence = {self.news_prediction_score:.2f}%)</b>
179
+ <ul>
180
+ <li>{text_prediction_label} (confidence = {self.text_prediction_score:.2f}%)</li>
181
+ <li>{image_prediction_label} (confidence = {self.image_prediction_score:.2f}%)</li>
182
+ </ul>
183
+ </div>
184
+
185
+ <div>
186
+ <h3>Misinformation (placeholder):</h3>
187
+ <ul>
188
+ <li>The input news is {out_of_context_results} (confidence = {out_of_context_prediction_score:.2f}%)</li>
189
+ </ul>
190
+ </div>
191
+
192
+ <div>
193
+ <h3>Description (optional, placeholder):</h3>
194
+ <ul>
195
+ <li>{description}</li>
196
+ </ul>
197
+ </div>
198
+ """
199
+
200
+ return html_template
201
+
202
+
203
+ def analyze_details(self):
204
+ self.aligned_sentences
205
+ final_table = []
206
+
207
+ for pair in self.aligned_sentences:
208
+ input_words, source_words, input_indexes, source_indexes = (
209
+ self.highlight_overlap_by_word_to_list(
210
+ pair["input_sentence"],
211
+ pair["matched_sentence"],
212
+ )
213
+ )
214
+ final_table.append(
215
+ (input_words, source_words, input_indexes, source_indexes),
216
+ )
217
+
218
+ if len(final_table) != 0:
219
+ html_table = self.create_table(final_table)
220
+ else:
221
+ html_table = ""
222
+ return html_table
223
+
224
+ def highlight_overlap_by_word_to_list(self, text1, text2):
225
+ """
226
+ Return
227
+ - list of words in text1
228
+ - list of words in text2
229
+ - list of index of highlight words in text 1
230
+ - list of index of highlight words in text 2
231
+ """
232
+ # TΓ‘ch chuα»—i thΓ nh cΓ‘c tα»« (word) dα»±a vΓ o khoαΊ£ng trαΊ―ng
233
+ words1 = text1.split()
234
+ words2 = text2.split()
235
+
236
+ index1 = []
237
+ index2 = []
238
+
239
+ # Sα»­ dα»₯ng SequenceMatcher để tΓ¬m cΓ‘c Δ‘oαΊ‘n trΓΉng lαΊ·p giα»―a danh sΓ‘ch cΓ‘c tα»«
240
+ matcher = SequenceMatcher(None, words1, words2)
241
+
242
+ highlighted_text1 = []
243
+ highlighted_text2 = []
244
+
245
+ # Theo dΓ΅i vα»‹ trΓ­ hiện tαΊ‘i trong words1 vΓ  words2
246
+ current_pos1 = 0
247
+ current_pos2 = 0
248
+
249
+ # LαΊ·p qua cΓ‘c Δ‘oαΊ‘n so khα»›p
250
+ for match in matcher.get_matching_blocks():
251
+ start1, start2, length = match
252
+
253
+ # ThΓͺm cΓ‘c tα»« khΓ΄ng trΓΉng lαΊ·p vΓ o (giα»― nguyΓͺn)
254
+ highlighted_text1.extend(words1[current_pos1:start1])
255
+ highlighted_text2.extend(words2[current_pos2:start2])
256
+
257
+ if length > 0:
258
+ for i in range(start1, start1 + length):
259
+ index1.append(i)
260
+ for i in range(start2, start2 + length):
261
+ index2.append(i)
262
+
263
+ # CαΊ­p nhαΊ­t vα»‹ trΓ­ hiện tαΊ‘i
264
+ current_pos1 = start1 + length
265
+ current_pos2 = start2 + length
266
+
267
+ return words1, words2, index1, index2
268
+
269
+ def create_table(self, data):
270
+ table_rows = "\n".join([self.format_pair(pair) for pair in data])
271
+ return f"""
272
+ <h5>Comparison between input news and <a href={self.text_referent_url} target="_blank">source news</a></h5>
273
+ <table border="1" style="width:100%; text-align:left; border-collapse:collapse;">
274
+ <thead>
275
+ <tr>
276
+ <th>Input sentence</th>
277
+ <th>Source sentence</th>
278
+ </tr>
279
+ </thead>
280
+ <tbody>
281
+ {table_rows}
282
+ </tbody>
283
+ </table>
284
  """
285
+
286
+ def format_pair(self, pair):
287
+ input_sentence = self.highlight_text(pair[0], pair[2])
288
+ source_sentence = self.highlight_text(pair[1], pair[3])
289
+ return f"<tr><td>{input_sentence}</td><td>{source_sentence}</td></tr>"
290
 
291
+ def highlight_text(self, words, indexes):
292
+ final_words = words
293
+ for index in indexes:
294
+ final_words[index] = (
295
+ f"<span style='color:#00FF00; font-weight:bold;'>{words[index]}</span>"
296
+ )
297
+ return " ".join(final_words)
298
+
src/application/content_generation.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import openai
2
  from dotenv import load_dotenv
3
  import os
@@ -13,43 +14,88 @@ client = openai.AzureOpenAI(
13
  azure_endpoint = AZURE_OPENAI_ENDPOINT,
14
  )
15
 
16
- def generate_content(text_generation_model, image_generation_model, title, content):
17
  # Generate text using the selected models
18
- full_content = ""
19
- input_type = ""
 
 
 
 
 
20
  if title and content:
21
- full_content = title + "\n" + content
22
- input_type = "title and content"
23
  elif title:
24
- full_content = title
25
- input_type = "title"
26
  elif content:
27
- full_content = title
28
- input_type = "content"
29
 
30
  # Generate text using the text generation model
31
- generated_text = generate_text(text_generation_model, full_content, input_type)
32
- return title, generated_text
33
-
34
- def generate_text(model, full_context, input_type):
35
- # Generate text using the selected model
36
- if input_type == "":
37
- prompt = "Generate a random fake news article"
38
- else:
39
- prompt = f"Generate a fake news article (title and content) based on the following: # Title: {input_type}:\n\n# Content: {full_context}"
40
-
41
  try:
42
  response = client.chat.completions.create(
43
- model=model,
44
  messages = [{"role": "system", "content": prompt}],
45
  )
46
 
47
  print("Response from OpenAI API: ", response.choices[0].message.content)
48
- return response.choices[0].message.content
49
 
50
  except openai.OpenAIError as e:
51
  print(f"Error interacting with OpenAI API: {e}")
52
- return "An error occurred while processing your request."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
 
54
  def replace_text(news_title, news_content, replace_df):
55
  """
 
1
+ import json
2
  import openai
3
  from dotenv import load_dotenv
4
  import os
 
14
  azure_endpoint = AZURE_OPENAI_ENDPOINT,
15
  )
16
 
17
+ def generate_fake_text(text_generation_model, title, content):
18
  # Generate text using the selected models
19
+ prompt = """Generate a random fake news tittle in this format:
20
+ ---
21
+ # Title: [Fake Title]
22
+ # Content:
23
+ [Fake Content]
24
+ ---
25
+ """
26
  if title and content:
27
+ prompt += """base on the following context:
28
+ # Title: {news_title}:\n# Content: {news_content}"""
29
  elif title:
30
+ prompt += """base on the following context:
31
+ # Title: {news_title}:\n"""
32
  elif content:
33
+ prompt += """base on the following context:
34
+ # Content: {news_content}"""
35
 
36
  # Generate text using the text generation model
37
+ # Generate text using the selected model
 
 
 
 
 
 
 
 
 
38
  try:
39
  response = client.chat.completions.create(
40
+ model=text_generation_model,
41
  messages = [{"role": "system", "content": prompt}],
42
  )
43
 
44
  print("Response from OpenAI API: ", response.choices[0].message.content)
45
+ fake_text = response.choices[0].message.content
46
 
47
  except openai.OpenAIError as e:
48
  print(f"Error interacting with OpenAI API: {e}")
49
+ fake_text = ""
50
+
51
+ if fake_text != "":
52
+ fake_title, fake_content = extract_title_content(fake_text)
53
+ return fake_title, fake_content
54
+
55
+
56
+ def extract_title_content(fake_news):
57
+ """
58
+ Extracts the title and content from the generated fake news string.
59
+
60
+ This function parses a string containing fake news, which is expected to have
61
+ a specific format with a title and content section marked by '# Title:' and
62
+ '# Content:' respectively.
63
+
64
+ Args:
65
+ fake_news (str): A string containing the generated fake news in the expected format.
66
+
67
+ Returns:
68
+ tuple: A tuple containing two elements:
69
+ - title (str): The extracted title of the fake news.
70
+ - content (str): The extracted content of the fake news.
71
+
72
+ Note:
73
+ The function assumes that the input string follows the expected format.
74
+ If the format is not as expected, it may return unexpected results.
75
+ """
76
+ # Extract the title and content from the generated fake news
77
+ title_start_index = fake_news.find("# Title: ") + len("# Title: ")
78
+ title_end_index = fake_news.find("\n", title_start_index)
79
+ title = fake_news[title_start_index:title_end_index].strip()
80
+
81
+ content_start_index = fake_news.find("\n# Content: ") + len("\n# Content: ")
82
+ content = fake_news[content_start_index:].strip()
83
+
84
+ return title, content
85
+
86
+ def generate_fake_image(model, title):
87
+ if len(title) > 0:
88
+ IMAGE_PROMPT = f"Generate a random image about {title}"
89
+ else:
90
+ IMAGE_PROMPT = "Generate a random image"
91
+ result = client.images.generate(
92
+ model="dall-e-3", # the name of your DALL-E 3 deployment
93
+ prompt=IMAGE_PROMPT,
94
+ n=1
95
+ )
96
+ image_url = json.loads(result.model_dump_json())['data'][0]['url']
97
+ return image_url
98
+
99
 
100
  def replace_text(news_title, news_content, replace_df):
101
  """
src/application/image/image_comparison.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from io import BytesIO
3
+ from PIL import Image
4
+ import imagehash
5
+ from src.application.image.search_yandex import YandexReverseImageSearcher
6
+
7
+ def get_image_from_url(url):
8
+ try:
9
+ response = requests.get(url)
10
+ return Image.open(BytesIO(response.content))
11
+ except Exception as e:
12
+ print(f"Error opening image: {e}")
13
+ return None
14
+
15
+ def get_image_from_file(file_path):
16
+ try:
17
+ return Image.open(file_path)
18
+ except FileNotFoundError:
19
+ print(f"Error occurred while opening image from file: {file_path}")
20
+ return None
21
+
22
+ def standardize_image(image):
23
+ # Convert to RGB if needed
24
+ if image.mode in ('RGBA', 'LA'):
25
+ background = Image.new('RGB', image.size, (255, 255, 255))
26
+ background.paste(image, mask=image.split()[-1])
27
+ image = background
28
+ elif image.mode != 'RGB':
29
+ image = image.convert('RGB')
30
+
31
+ # Resize to standard size (e.g. 256x256)
32
+ standard_size = (256, 256)
33
+ image = image.resize(standard_size)
34
+
35
+ return image
36
+
37
+ def compare_images(image1, image2):
38
+ # Standardize both images first
39
+ img1_std = standardize_image(image1)
40
+ img2_std = standardize_image(image2)
41
+
42
+ hash1 = imagehash.average_hash(img1_std)
43
+ hash2 = imagehash.average_hash(img2_std)
44
+ return hash1 - hash2 # Returns the Hamming distance between the hashes
45
+
46
+ if __name__ == '__main__':
47
+ image_url = 'https://i.pinimg.com/originals/c4/50/35/c450352ac6ea8645ead206721673e8fb.png'
48
+
49
+ # Get the image from URL
50
+ url_image = get_image_from_url(image_url)
51
+
52
+ # Search image
53
+ rev_img_searcher = YandexReverseImageSearcher()
54
+ res = rev_img_searcher.search(image_url)
55
+
56
+ for search_item in res:
57
+ print(f'Title: {search_item.page_title}')
58
+ # print(f'Site: {search_item.page_url}')
59
+ print(f'Img: {search_item.image_url}\n')
60
+
61
+ # Compare each search result image with the input image
62
+ result_image = get_image_from_url(search_item.image_url)
63
+ result_difference = compare_images(result_image, url_image)
64
+ print(f"Difference with search result: {result_difference}")
65
+ if result_difference == 0:
66
+ break
src/application/image/image_detection.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from src.application.image.image_comparison import compare_images, get_image_from_file, get_image_from_url
3
+ from src.application.image.model_detection import image_generation_detection
4
+ from src.application.image.search_yandex import yandex_reverse_image_search
5
+
6
+
7
+ def compare_list_of_images(news_image_path, img_urls):
8
+ news_image = get_image_from_file(news_image_path) # TODO: news_image_path is arrays
9
+ if news_image is None:
10
+ return None, -1
11
+
12
+ matched_url = ""
13
+ max_similarity = 0
14
+ for url in img_urls:
15
+ print(f"\t{url}")
16
+ referred_image = get_image_from_url(url)
17
+ if referred_image is None:
18
+ continue
19
+ distance = compare_images(news_image, referred_image) # Hamming algorithm
20
+ similarity = max(100 - distance, 0)
21
+ if similarity > max_similarity:
22
+ max_similarity = similarity
23
+ matched_url = url
24
+
25
+ if max_similarity > 90:
26
+ return matched_url, max_similarity
27
+ return None, -1
28
+
29
+
30
+ def detect_image_from_news_image(news_image_path, image_urls):
31
+ print("\tFrom news:")
32
+ for url in image_urls:
33
+ print(f"\t{url}")
34
+ return compare_list_of_images(news_image_path, image_urls)
35
+
36
+ def detect_image_by_reverse_search(news_image_path):
37
+ image_urls = yandex_reverse_image_search(news_image_path) # url or file_path
38
+ print("\tFrom search engine:")
39
+ for url in image_urls:
40
+ print(f"\t\t{url}")
41
+ return compare_list_of_images(news_image_path, image_urls)
42
+
43
+
44
+ def detect_image_by_ai_model(news_image_path):
45
+ print("\tFrom AI model:")
46
+ image_prediction_label, image_confidence = image_generation_detection(
47
+ news_image_path,
48
+ )
49
+ return image_prediction_label, image_confidence
src/application/image/model_detection.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sklearn.metrics import roc_auc_score
2
+ from torchmetrics import Accuracy, Recall
3
+ import pytorch_lightning as pl
4
+ import timm
5
+ import torch
6
+ import torch.nn.functional as F
7
+ import logging
8
+ from PIL import Image
9
+ import torchvision.transforms as transforms
10
+ from torchvision.transforms import v2
11
+
12
+ logging.basicConfig(filename='training.log',filemode='w',level=logging.INFO, force=True)
13
+ CHECKPOINT = "models/image_classifier/image-classifier-step=8008-val_loss=0.11.ckpt"
14
+
15
class ImageClassifier(pl.LightningModule):
    """LightningModule wrapping a single-logit ResNet-50 for binary
    human-vs-machine image classification.

    sigmoid(logit) is taken as the probability of the positive class.
    Training uses BCE-with-logits plus an optional squared-logit (SD)
    penalty weighted by ``lmd``.
    """

    def __init__(self, lmd=0):
        super().__init__()
        # num_classes=1 -> one logit; predictions go through sigmoid later.
        self.model = timm.create_model('resnet50', pretrained=True, num_classes=1)
        self.accuracy = Accuracy(task='binary', threshold=0.5)
        self.recall = Recall(task='binary', threshold=0.5)
        # Per-batch validation outputs buffered for epoch-level AUC.
        self.validation_outputs = []
        # Weight of the SD (squared-logit) regularization term.
        self.lmd = lmd

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch):
        # Batch is (images, labels, domain); domain is unused in training.
        images, labels, _ = batch
        outputs = self.forward(images).squeeze()

        print(f"Shape of outputs (training): {outputs.shape}")
        print(f"Shape of labels (training): {labels.shape}")

        loss = F.binary_cross_entropy_with_logits(outputs, labels.float())
        logging.info(f"Training Step - ERM loss: {loss.item()}")
        loss += self.lmd * (outputs ** 2).mean()  # SD loss penalty
        logging.info(f"Training Step - SD loss: {loss.item()}")
        return loss

    def validation_step(self, batch):
        images, labels, _ = batch
        outputs = self.forward(images).squeeze()

        # A 0-dim tensor means squeeze() collapsed a single-sample batch;
        # skip it so metric shapes stay consistent.
        if outputs.shape == torch.Size([]):
            return

        print(f"Shape of outputs (validation): {outputs.shape}")
        print(f"Shape of labels (validation): {labels.shape}")

        loss = F.binary_cross_entropy_with_logits(outputs, labels.float())
        preds = torch.sigmoid(outputs)
        self.log('val_loss', loss, prog_bar=True, sync_dist=True)
        self.log('val_acc', self.accuracy(preds, labels.int()), prog_bar=True, sync_dist=True)
        self.log('val_recall', self.recall(preds, labels.int()), prog_bar=True, sync_dist=True)
        output = {"val_loss": loss, "preds": preds, "labels": labels}
        self.validation_outputs.append(output)
        logging.info(f"Validation Step - Batch loss: {loss.item()}")
        return output

    def predict_step(self, batch):
        # Returns per-sample probabilities plus pass-through label/domain.
        images, label, domain = batch
        outputs = self.forward(images).squeeze()
        preds = torch.sigmoid(outputs)
        return preds, label, domain

    def on_validation_epoch_end(self):
        # Compute epoch-level ROC-AUC over all buffered validation batches.
        if not self.validation_outputs:
            logging.warning("No outputs in validation step to process")
            return
        preds = torch.cat([x['preds'] for x in self.validation_outputs])
        labels = torch.cat([x['labels'] for x in self.validation_outputs])
        # ROC-AUC is undefined when only one class is present.
        if labels.unique().size(0) == 1:
            logging.warning("Only one class in validation step")
            return
        auc_score = roc_auc_score(labels.cpu(), preds.cpu())
        self.log('val_auc', auc_score, prog_bar=True, sync_dist=True)
        logging.info(f"Validation Epoch End - AUC score: {auc_score}")
        self.validation_outputs = []

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.model.parameters(), lr=0.0005)
        return optimizer
83
+
84
+
85
+
86
def load_image(image_path, transform=None):
    """Open an image file as RGB, optionally applying a transform.

    Args:
        image_path: Path to the image file.
        transform: Optional callable applied to the PIL image.

    Returns:
        The (possibly transformed) image.
    """
    image = Image.open(image_path).convert('RGB')
    return transform(image) if transform else image
93
+
94
+
95
def predict_single_image(image_path, model, transform=None):
    """Run the classifier on a single image file.

    Args:
        image_path: Path to the image file.
        model: The trained classifier module.
        transform: Optional preprocessing transform.

    Returns:
        float: sigmoid probability produced by the model.
    """
    image = load_image(image_path, transform)

    # Run on GPU when available.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    image = image.to(device)

    model.eval()
    with torch.no_grad():
        batch = image.unsqueeze(0)  # add the batch dimension
        logit = model(batch).squeeze()
        return torch.sigmoid(logit).item()
112
+
113
+
114
def image_generation_detection(image_path):
    """Classify an image file as human-made or machine-generated.

    Args:
        image_path: Path to the image file.

    Returns:
        tuple: (label in {"HUMAN", "MACHINE"}, confidence percentage).
    """
    model = ImageClassifier.load_from_checkpoint(CHECKPOINT)

    # Mirror the preprocessing used at training time.
    transform = v2.Compose([
        transforms.ToTensor(),
        v2.CenterCrop((256, 256)),
    ])

    prediction = predict_single_image(image_path, model, transform)

    # Scores at or below 0.2 are treated as human-made.
    if prediction <= 0.2:
        image_prediction_label = "HUMAN"
        result = "Most likely human"
    else:
        image_prediction_label = "MACHINE"
        result = "Most likely machine"

    image_confidence = min(1, 0.5 + abs(prediction - 0.2))
    result += f" with confidence = {round(image_confidence * 100, 2)}%"
    image_confidence = round(image_confidence * 100, 2)
    return image_prediction_label, image_confidence
135
+
136
+
137
+ if __name__ == "__main__":
138
+ image_path = "path_to_your_image.jpg" # Replace with your image path
139
+ image_prediction_label, image_confidence = image_generation_detection(
140
+ image_path,
141
+ )
src/application/image/search_yandex.py ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import logging
3
+ import requests
4
+ import json
5
+ from bs4 import BeautifulSoup
6
+ from urllib.parse import quote, urlparse
7
+
8
+ logging.basicConfig(
9
+ filename='error.log',
10
+ level=logging.INFO,
11
+ format='%(asctime)s | [%(levelname)s]: %(message)s',
12
+ datefmt='%m-%d-%Y / %I:%M:%S %p'
13
+ )
14
+
15
class SearchResults:
    """Container for reverse-image-search hits with readable printing."""

    def __init__(self, results):
        # Each result is a dict with optional 'title' and 'link' keys.
        self.results = results

    def __str__(self):
        pieces = []
        for item in self.results:
            pieces.append("---\n")
            pieces.append(f"Title: {item.get('title', 'Title not found')}\n")
            pieces.append(f"Link: {item.get('link', 'Link not found')}\n")
            pieces.append("---\n")
        return "".join(pieces)
27
+
28
class YandexReverseImageSearcher:
    """Reverse-image search client that scrapes Yandex result pages.

    NOTE(review): `_parse_search_results` selects `div.g`, which looks like
    Google-results markup; presumably this never matches real Yandex pages —
    verify against a live response.
    """

    def __init__(self):
        self.base_url = "https://yandex.ru/images/search"
        self.headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"}
        # Retry policy for transient HTTP errors per request.
        self.retry_count = 3
        self.retry_delay = 1

    def response(self, query: str, image_url: str, max_results: int = 10, delay: int = 1) -> SearchResults:
        """Search and paginate until up to `max_results` unique results are found.

        Args:
            query: Text query accompanying the image.
            image_url: Publicly reachable URL of the image.
            max_results: Maximum number of results to collect.
            delay: Seconds to sleep between result pages.

        Returns:
            SearchResults on success; a plain explanatory string when no
            results were found (NOTE(review): inconsistent return type —
            callers must handle both).

        Raises:
            ValueError: If query/image_url are missing or the URL lacks a
                supported image extension.
        """
        self._validate_input(query, image_url)

        encoded_query = quote(query)
        encoded_image_url = quote(image_url)

        url = f"{self.base_url}?q={encoded_query}&image_url={encoded_image_url}&sbisrc=cr_1_5_2"

        all_results = []
        start_index = 0

        while len(all_results) < max_results:
            if start_index != 0:
                time.sleep(delay)  # be polite between pages

            paginated_url = f"{url}&start={start_index}"

            response = self._make_request(paginated_url)
            if response is None:
                break

            search_results, valid_content = self._parse_search_results(response.text)
            if not valid_content:
                logging.warning("Unexpected HTML structure encountered.")
                break

            for result in search_results:
                if len(all_results) >= max_results:
                    break
                data = self._extract_result_data(result)
                if data and data not in all_results:  # de-duplicate by dict equality
                    all_results.append(data)

            # Advance pagination by however many new results this page added.
            start_index += (len(all_results)-start_index)

        if len(all_results) == 0:
            logging.warning(f"No results were found for the given query: [{query}], and/or image URL: [{image_url}].")
            return "No results found. Please try again with a different query and/or image URL."
        else:
            return SearchResults(all_results[:max_results])

    def _validate_input(self, query: str, image_url: str):
        """Raise ValueError unless both a query and a valid image URL are given."""
        if not query:
            raise ValueError("Query not found. Please enter a query and try again.")
        if not image_url:
            raise ValueError("Image URL not found. Please enter an image URL and try again.")
        if not self._validate_image_url(image_url):
            raise ValueError("Invalid image URL. Please enter a valid image URL and try again.")

    def _validate_image_url(self, url: str) -> bool:
        """Return True when the URL path ends in a supported image extension."""
        parsed_url = urlparse(url)
        path = parsed_url.path.lower()
        valid_extensions = (".jpg", ".jpeg", ".png", ".webp")
        return any(path.endswith(ext) for ext in valid_extensions)

    def _make_request(self, url: str):
        """GET `url`, retrying HTTP errors; return the Response or None."""
        attempts = 0
        while attempts < self.retry_count:
            try:
                response = requests.get(url, headers=self.headers)
                # Only HTML pages are parseable; anything else is a dead end.
                if response.headers.get('Content-Type', '').startswith('text/html'):
                    response.raise_for_status()
                    return response
                else:
                    logging.warning("Non-HTML content received.")
                    return None
            except requests.exceptions.HTTPError as http_err:
                logging.error(f"HTTP error occurred: {http_err}")
                attempts += 1
                time.sleep(self.retry_delay)
            except Exception as err:
                logging.error(f"An error occurred: {err}")
                return None
        return None

    def _parse_search_results(self, html_content: str):
        """Parse result containers out of the page; returns (results, ok_flag)."""
        try:
            soup = BeautifulSoup(html_content, "html.parser")
            # NOTE(review): 'div.g' is Google markup — confirm for Yandex.
            return soup.find_all('div', class_='g'), True
        except Exception as e:
            logging.error(f"Error parsing HTML content: {e}")
            return None, False

    def _extract_result_data(self, result):
        """Extract {'link', 'title'} from one result div; {} when incomplete."""
        link = result.find('a', href=True)['href'] if result.find('a', href=True) else None
        title = result.find('h3').get_text(strip=True) if result.find('h3') else None
        return {"link": link, "title": title} if link and title else {}
122
+
123
+
124
def get_image_links(page):
    """
    Extracts image URLs from the given HTML page.

    Args:
        page: The HTML content as a string.

    Returns:
        A list of image URLs (empty when the expected markup is absent
        or malformed).
    """
    soup = BeautifulSoup(page, 'html.parser')

    # Find the specific section containing image links
    gallery_data = soup.find('div', {'class': 'cbir-section cbir-section_name_sites'})
    if gallery_data is None:
        return []

    # Find the container of image links
    image_links_container = gallery_data.find('div', {'class': 'Root'})
    if image_links_container is None:
        return []

    # The gallery payload is a JSON blob in the `data-state` attribute.
    # Scraped markup drifts over time, so guard against a missing
    # attribute, invalid JSON, or an unexpected schema instead of
    # crashing the caller with KeyError/JSONDecodeError.
    try:
        data_state = json.loads(image_links_container['data-state'])
        return [site['originalImage']['url'] for site in data_state['sites']]
    except (KeyError, TypeError, ValueError):
        return []
155
+
156
+
157
def yandex_reverse_image_search(file_path):
    """Upload a local image to Yandex and return URLs of similar images.

    Args:
        file_path: Path to the local image file.

    Returns:
        A list of image URLs (empty on upload or fetch failure).
    """
    img_search_url = generate_images_search_links(file_path)
    if img_search_url is None:
        return []

    # Simulate a user agent to avoid being blocked
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
        'Content-Type': 'application/json',
    }

    try:
        response = requests.get(img_search_url, headers=headers)
        response.raise_for_status()  # Raise an exception for bad status codes
    except requests.exceptions.RequestException as e:
        print(f"Error fetching image: {e}")
        return []

    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')
    return get_image_links(soup.prettify())
180
+
181
+
182
def generate_images_search_links(file_path):
    """Upload an image file to Yandex and build the reverse-search results URL.

    Args:
        file_path: Path to the local image file to upload.

    Returns:
        The full Yandex image-search URL, or None when the upload or the
        response parsing fails.
    """
    search_url = 'https://yandex.ru/images/search'
    params = {'rpt': 'imageview', 'format': 'json', 'request': '{"blocks":[{"block":"b-page_type_search-by-image__link"}]}'}

    try:
        # `with` closes the upload handle deterministically; the original
        # bare open() leaked the file descriptor.
        with open(file_path, 'rb') as image_file:
            files = {'upfile': ('blob', image_file, 'image/jpeg/webp')}
            response = requests.post(search_url, params=params, files=files)
        query_string = json.loads(response.content)['blocks'][0]['params']['url']
        return search_url + '?' + query_string
    except (OSError, ValueError, KeyError, IndexError, requests.exceptions.RequestException):
        # Narrowed from a bare `except:` to the failure modes we expect:
        # file errors, network errors, or an unexpected response payload.
        return None
194
+
195
+
196
+ if __name__ == "__main__":
197
+ file_path = "T:\\Projects\\prj-nict-ai-content-detection\\data\\test_data\\towels.jpg.webp"
198
+ image_urls = yandex_reverse_image_search(file_path)
199
+ for image_url in image_urls:
200
+ print(f"Image URL: {image_url}")
src/application/text/model_detection.py CHANGED
@@ -11,7 +11,7 @@ PARAPHRASE = "PARAPHRASE"
11
  NON_PARAPHRASE = "NON_PARAPHRASE"
12
 
13
 
14
- def detect_by_ai_model(
15
  input_text: str,
16
  model: str = DEFAULT_MODEL,
17
  max_length: int = 512,
@@ -44,4 +44,4 @@ def detect_by_ai_model(
44
  return label, confidence_score
45
  except Exception as e: # Add exception handling
46
  print(f"Error in Roberta model inference: {e}")
47
- return UNKNOWN, 0.0 # Return UNKNOWN and 0.0 confidence if error
 
11
  NON_PARAPHRASE = "NON_PARAPHRASE"
12
 
13
 
14
+ def detect_text_by_ai_model(
15
  input_text: str,
16
  model: str = DEFAULT_MODEL,
17
  max_length: int = 512,
 
44
  return label, confidence_score
45
  except Exception as e: # Add exception handling
46
  print(f"Error in Roberta model inference: {e}")
47
+ return UNKNOWN, 50 # Return UNKNOWN and 0.0 confidence if error
src/application/text/search_detection.py CHANGED
@@ -33,7 +33,7 @@ MIN_RATIO_PARAPHRASE_NUM = 0.7
33
  MAX_CHAR_SIZE = 30000
34
 
35
 
36
- def detect_by_relative_search(input_text, is_support_opposite = False):
37
 
38
  checked_urls = set()
39
  searched_phrases = generate_search_phrases(input_text)
@@ -46,19 +46,24 @@ def detect_by_relative_search(input_text, is_support_opposite = False):
46
  if url in checked_urls: # visited url
47
  continue
48
  checked_urls.add(url)
49
- print(f"\tChecking URL: {url}")
50
 
51
  content = URLReader(url)
52
 
53
  if content.is_extracted is True:
 
 
 
 
54
  page_text = content.title + "\n" + content.text
55
- if page_text is None or len(page_text) > MAX_CHAR_SIZE:
56
- print(f"\t\t↑↑↑ More than {MAX_CHAR_SIZE} characters")
57
  continue
 
58
  is_paraphrase, aligned_sentences = check_paraphrase(input_text, page_text)
59
  if is_paraphrase:
60
- return is_paraphrase, url, aligned_sentences
61
- return False, None, []
62
 
63
  def longest_common_subsequence(arr1, arr2):
64
  """
@@ -256,7 +261,7 @@ def similarity_ratio(a, b):
256
  return 0.0 # Handle cases where inputs are not strings or None
257
  return SequenceMatcher(None, a, b).ratio()
258
 
259
- def check_human(data, min_ratio=MIN_RATIO_PARAPHRASE_NUM):
260
  """
261
  Checks if a sufficient number of input sentences are found within
262
  source sentences.
@@ -264,14 +269,14 @@ def check_human(data, min_ratio=MIN_RATIO_PARAPHRASE_NUM):
264
  Returns:
265
  bool: True if the condition is met, False otherwise.
266
  """
267
- if not data: # Handle empty data case
268
  return False
269
- min_matching = math.ceil(len(data) * min_ratio)
270
 
271
  count = 0
272
 
273
  #for input_sentence, source_sentence, similiarity, is_paraprhase in data:
274
- for sentence in data:
275
  if sentence["similarity"] >= 0.99:
276
  count += 1
277
  print(f"\tmatching_sentence_count : {count}, min_matching: {min_matching}")
 
33
  MAX_CHAR_SIZE = 30000
34
 
35
 
36
+ def detect_text_by_relative_search(input_text, is_support_opposite = False):
37
 
38
  checked_urls = set()
39
  searched_phrases = generate_search_phrases(input_text)
 
46
  if url in checked_urls: # visited url
47
  continue
48
  checked_urls.add(url)
49
+ print(f"\t\tChecking URL: {url}")
50
 
51
  content = URLReader(url)
52
 
53
  if content.is_extracted is True:
54
+ if content.title is None or content.text is None:
55
+ print(f"\t\t\t↑↑↑ Title or text not found")
56
+ continue
57
+
58
  page_text = content.title + "\n" + content.text
59
+ if len(page_text) > MAX_CHAR_SIZE:
60
+ print(f"\t\t\t↑↑↑ More than {MAX_CHAR_SIZE} characters")
61
  continue
62
+
63
  is_paraphrase, aligned_sentences = check_paraphrase(input_text, page_text)
64
  if is_paraphrase:
65
+ return is_paraphrase, url, aligned_sentences, content.images
66
+ return False, None, [], []
67
 
68
  def longest_common_subsequence(arr1, arr2):
69
  """
 
261
  return 0.0 # Handle cases where inputs are not strings or None
262
  return SequenceMatcher(None, a, b).ratio()
263
 
264
+ def check_human(alligned_sentences, min_ratio=MIN_RATIO_PARAPHRASE_NUM):
265
  """
266
  Checks if a sufficient number of input sentences are found within
267
  source sentences.
 
269
  Returns:
270
  bool: True if the condition is met, False otherwise.
271
  """
272
+ if not alligned_sentences: # Handle empty data case
273
  return False
274
+ min_matching = math.ceil(len(alligned_sentences) * min_ratio)
275
 
276
  count = 0
277
 
278
  #for input_sentence, source_sentence, similiarity, is_paraprhase in data:
279
+ for sentence in alligned_sentences:
280
  if sentence["similarity"] >= 0.99:
281
  count += 1
282
  print(f"\tmatching_sentence_count : {count}, min_matching: {min_matching}")
src/application/url_reader.py CHANGED
@@ -52,7 +52,7 @@ class URLReader():
52
 
53
  self.title = news.title
54
  self.text = news.text
55
- self.images = news.images
56
  self.top_image = news.top_image
57
 
58
  def extract_content_bs(self):
 
52
 
53
  self.title = news.title
54
  self.text = news.text
55
+ self.images = list(set(news.images)) # Remove duplicates
56
  self.top_image = news.top_image
57
 
58
  def extract_content_bs(self):