pmkhanh7890 commited on
Commit
da7dbd0
Β·
1 Parent(s): 0542c93

complete the 1st version of GUI

Browse files
Yandexsample.html ADDED
The diff for this file is too large to render. See raw diff
 
app.py CHANGED
@@ -1,22 +1,23 @@
1
- #display a data
2
- import gradio as gr
3
 
4
- def data_display(replace_df):
5
- return "aaaa"
6
 
7
- with gr.Blocks() as demo:
8
- replace_df = gr.Dataframe(
9
- # headers=["Find what:", "Replace with:"],
10
- # datatype=["str", "str"],
11
- # row_count=(1, "dynamic"),
12
- # col_count=(2, "fixed"),
13
- # interactive=True
14
- )
15
- replace_button = gr.Button("Replace all")
16
- news_content = gr.Textbox(label="Content", value="", lines=12)
17
-
18
 
19
- replace_button.click(data_display,
20
- inputs=[replace_df],
21
- outputs=[news_content])
22
- demo.launch()
 
 
 
 
 
1
+ from bs4 import BeautifulSoup
2
+ import requests
3
 
4
+ from src.application.image.search_yandex import get_image_links
 
5
 
6
+
7
+ img_search_url = """https://yandex.ru/images/search?cbir_id=4481385%2Fw-xYJ246B9thwtVBmNcpkg9409&rpt=imageview&lr=10636"""
8
+ print(img_search_url)
9
+
10
+
11
+ headers = {
12
+ 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
13
+ 'Content-Type': 'application/json',
14
+ }
 
 
15
 
16
+
17
+ response = requests.get(img_search_url, headers=headers)
18
+ response.raise_for_status() # Raise an exception for bad status codes
19
+
20
+ # Parse the HTML content
21
+ soup = BeautifulSoup(response.content, 'html.parser')
22
+ image_urls = get_image_links(soup.prettify())
23
+ print(f"image_urls: {image_urls}")
application.py CHANGED
@@ -1,32 +1,19 @@
1
  import os
2
 
3
  import gradio as gr
4
- import openai
5
  import requests
6
  from PIL import Image
7
- import re
8
 
9
- from src.application.content_detection import generate_analysis_report
10
  from src.application.url_reader import URLReader
11
- from src.application.content_generation import generate_content, replace_text
12
 
13
- # from dotenv import load_dotenv
14
-
15
- # load_dotenv()
16
- # AZURE_OPENAI_API_KEY = os.getenv('AZURE_OPENAI_API_KEY')
17
- # AZURE_OPENAI_ENDPOINT = os.getenv('AZURE_OPENAI_ENDPOINT')
18
- # AZURE_OPENAI_API_VERSION = os.getenv('AZURE_OPENAI_API_VERSION')
19
-
20
- # client = openai.AzureOpenAI(
21
- # api_version = AZURE_OPENAI_API_VERSION,
22
- # api_key = AZURE_OPENAI_API_KEY,
23
- # azure_endpoint = AZURE_OPENAI_ENDPOINT,
24
- # )
25
 
26
  GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
27
  SEARCH_ENGINE_ID = os.getenv('SEARCH_ENGINE_ID')
28
 
29
- AZURE_OPENAI_MODEL = ["gpt-4o-mini", "gpt-4o"]
 
30
 
31
  def load_url(url):
32
  """
@@ -54,9 +41,12 @@ def load_url(url):
54
 
55
  return content.title, content.text, image
56
 
57
- def show_detailed_analysis(title):
58
- return f"More details of {title} will be shown here."
59
 
 
 
 
 
 
60
  # Define the GUI
61
  with gr.Blocks() as demo:
62
  gr.Markdown("# FAKE NEWS DETECTION")
@@ -69,16 +59,18 @@ with gr.Blocks() as demo:
69
 
70
  with gr.Accordion("1. Enter a URL"):
71
  url_input = gr.Textbox(
72
- label="URL",
 
73
  value="https://bbc.com/future/article/20250110-how-often-you-should-wash-your-towels-according-to-science",
74
  )
75
  load_button = gr.Button("Load URL")
76
 
77
- with gr.Accordion("2. Select a content-generation model", open=True):
78
  with gr.Row():
79
- text_generation_model = gr.Dropdown(choices=AZURE_OPENAI_MODEL, label="Text-generation model")
80
- image_generation_model = gr.Dropdown(choices=["Dall-e", "Stable Diffusion"], label="Image-generation model")
81
- generate_button = gr.Button("Random generation")
 
82
 
83
  with gr.Accordion("3. Replace any terms", open=True):
84
  replace_df = gr.Dataframe(
@@ -93,16 +85,17 @@ with gr.Blocks() as demo:
93
  # GENERATED CONTENT
94
  with gr.Column(scale=1):
95
  with gr.Accordion("Generated News Contents"):
96
- detection_button = gr.Button("Check for fake news")
97
  news_title = gr.Textbox(label="Title", value="")
98
- news_image = gr.Image(label="Image")
99
  news_content = gr.Textbox(label="Content", value="", lines=12)
100
 
101
  # FAKE NEWS ANALYSIS REPORT
102
  with gr.Column(scale=1):
103
  with gr.Accordion("Fake News Analysis"):
104
- html_out = gr.HTML()
105
- detailed_analysis_button = gr.Button("Show detailed analysis...")
 
 
106
 
107
  # Connect events
108
  load_button.click(
@@ -110,19 +103,37 @@ with gr.Blocks() as demo:
110
  inputs=url_input,
111
  outputs=[news_title, news_content, news_image]
112
  )
113
- replace_button.click(replace_text,
114
  inputs=[news_title, news_content, replace_df],
115
  outputs=[news_title, news_content])
116
- generate_button.click(generate_content,
117
- inputs=[text_generation_model, image_generation_model, news_title, news_content],
118
  outputs=[news_title, news_content])
 
 
 
119
  detection_button.click(generate_analysis_report,
120
- inputs=[news_title, news_content, news_image],
121
- outputs=html_out)
122
- detailed_analysis_button.click(show_detailed_analysis,
123
- inputs=[news_title],
124
- outputs=[html_out])
125
  # change Image
126
  #url_input.change(load_image, inputs=url_input, outputs=image_view)
127
-
128
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
 
3
  import gradio as gr
 
4
  import requests
5
  from PIL import Image
 
6
 
7
+ from src.application.content_detection import NewsAnalysis
8
  from src.application.url_reader import URLReader
9
+ from src.application.content_generation import generate_fake_image, generate_fake_text, replace_text
10
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
  GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
13
  SEARCH_ENGINE_ID = os.getenv('SEARCH_ENGINE_ID')
14
 
15
+ AZURE_TEXT_MODEL = ["gpt-4o-mini", "gpt-4o"]
16
+ AZURE_IMAGE_MODEL = ["dall-e-3", "Stable Diffusion (not supported)"]
17
 
18
  def load_url(url):
19
  """
 
41
 
42
  return content.title, content.text, image
43
 
 
 
44
 
45
+ def generate_analysis_report(news_title:str, news_content: str, news_image: Image):
46
+ news_analysis.load_news(news_title, news_content, news_image)
47
+ return news_analysis.generate_analysis_report(), news_analysis.analyze_details()
48
+
49
+ news_analysis = NewsAnalysis()
50
  # Define the GUI
51
  with gr.Blocks() as demo:
52
  gr.Markdown("# FAKE NEWS DETECTION")
 
59
 
60
  with gr.Accordion("1. Enter a URL"):
61
  url_input = gr.Textbox(
62
+ label="",
63
+ show_label=False,
64
  value="https://bbc.com/future/article/20250110-how-often-you-should-wash-your-towels-according-to-science",
65
  )
66
  load_button = gr.Button("Load URL")
67
 
68
+ with gr.Accordion("2. Select content-generation models", open=True):
69
  with gr.Row():
70
+ text_generation_model = gr.Dropdown(choices=AZURE_TEXT_MODEL, label="Text-generation model")
71
+ image_generation_model = gr.Dropdown(choices=AZURE_IMAGE_MODEL, label="Image-generation model")
72
+ generate_text_button = gr.Button("Generate text")
73
+ generate_image_button = gr.Button("Generate image")
74
 
75
  with gr.Accordion("3. Replace any terms", open=True):
76
  replace_df = gr.Dataframe(
 
85
  # GENERATED CONTENT
86
  with gr.Column(scale=1):
87
  with gr.Accordion("Generated News Contents"):
 
88
  news_title = gr.Textbox(label="Title", value="")
89
+ news_image = gr.Image(label="Image", type="filepath")
90
  news_content = gr.Textbox(label="Content", value="", lines=12)
91
 
92
  # FAKE NEWS ANALYSIS REPORT
93
  with gr.Column(scale=1):
94
  with gr.Accordion("Fake News Analysis"):
95
+ detection_button = gr.Button("Check for fake news")
96
+ analyzed_information = gr.HTML()
97
+ with gr.Accordion("Detailed information"):
98
+ detailed_analysis = gr.HTML()
99
 
100
  # Connect events
101
  load_button.click(
 
103
  inputs=url_input,
104
  outputs=[news_title, news_content, news_image]
105
  )
106
+ replace_button.click(replace_text,
107
  inputs=[news_title, news_content, replace_df],
108
  outputs=[news_title, news_content])
109
+ generate_text_button.click(generate_fake_text,
110
+ inputs=[text_generation_model, news_title, news_content],
111
  outputs=[news_title, news_content])
112
+ generate_image_button.click(generate_fake_image,
113
+ inputs=[image_generation_model, news_title],
114
+ outputs=[news_image])
115
  detection_button.click(generate_analysis_report,
116
+ inputs=[news_title, news_content, news_image],
117
+ outputs=[analyzed_information, detailed_analysis])
118
+
 
 
119
  # change Image
120
  #url_input.change(load_image, inputs=url_input, outputs=image_view)
121
+
122
+ gr.Examples(
123
+ examples=[
124
+ ["https://www.bbc.com/travel/article/20250127-one-of-the-last-traders-on-the-silk-road"],
125
+ ["https://bbc.com/future/article/20250110-how-often-you-should-wash-your-towels-according-to-science"],
126
+ ],
127
+ inputs=[url_input],
128
+ label="Examples",
129
+ example_labels=[
130
+ "BBC news 1",
131
+ "BBC news 2",
132
+ ],
133
+ )
134
+
135
+ demo.launch()
136
+
137
+
138
+ # https://www.bbc.com/travel/article/20250127-one-of-the-last-traders-on-the-silk-road
139
+ # https://bbc.com/future/article/20250110-how-often-you-should-wash-your-towels-according-to-science
src/application/content_detection.py CHANGED
@@ -1,110 +1,298 @@
1
- from src.application.text.model_detection import detect_by_ai_model
2
- from src.application.text.search_detection import check_human, detect_by_relative_search
 
 
3
 
4
 
5
- def determine_text_origin(title, content):
6
- """
7
- Determines the origin of the given text based on paraphrasing detection and human authorship analysis.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
- Args:
10
- text: The input text to be analyzed.
 
11
 
12
- Returns:
13
- str: The predicted origin of the text:
14
- - "HUMAN": If the text is likely written by a human.
15
- - "MACHINE": If the text is likely generated by a machine.
16
- """
17
- # Classify by search engine
18
- text = title + "\n\n" + content
19
- is_paraphrased, referent_url, aligned_sentences = detect_by_relative_search(text)
20
- prediction_score = 0.0
21
- if not is_paraphrased:
22
- prediction_label = "UNKNOWN"
23
- else:
24
- prediction_score = 100.0
25
- if check_human(aligned_sentences):
26
- prediction_label = "HUMAN"
27
  else:
28
- prediction_label = "MACHINE"
29
-
30
- if prediction_label == "UNKNOWN":
31
- # Classify by SOTA model
32
- prediction_label, prediction_score = detect_by_ai_model(text)
33
-
34
- return prediction_label, prediction_score, referent_url
 
 
 
 
35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
 
37
- def generate_analysis_report(news_title, news_content, news_image):
38
-
39
- text_prediction_label, text_confidence_score, text_referent_url = determine_text_origin(news_title, news_content)
40
-
41
- # Analyze text content
42
- url1 = text_referent_url
43
- #url2 = "https://example.com/article2"
44
-
45
- # Forensic analysis
46
- if text_prediction_label == "MACHINE":
47
- text_prediction_label = "The text is modified by GPT-4o (AI)"
48
- else:
49
- text_prediction_label = "The text is written by HUMAN"
50
-
51
- image_detection_results = "MACHINE"
52
- if image_detection_results == "MACHINE":
53
- image_detection_results = "The image is generated by Dall-e (AI)"
54
- else:
55
- image_detection_results = "The image is generated by HUMAN"
56
- image_confidence_score = 90.5
57
-
58
- news_detection_results = "MACHINE"
59
- if news_detection_results == "MACHINE":
60
- news_detection_results = "The whole news generated by AI"
61
- else:
62
- news_detection_results = "The whole news written by HUMAN"
63
- news_confidence_score = 97.4
64
-
65
- # Misinformation analysis
66
- out_of_context_results = "cohesive"
67
- if out_of_context_results == "cohesive":
68
- out_of_context_results = "The input news is cohesive (non-out-of-context)"
69
- else:
70
- out_of_context_results = "The input news is out-of-context"
71
- out_of_context_confidence_score = 96.7
72
-
73
- # Description
74
- description = "The description should be concise, clear, and aimed at helping general readers understand the case."
75
-
76
- html_template = f"""
77
- <h2>Placeholder for results</h2>
78
-
79
- <div>
80
- <h3>Originality:</h3>
81
- <ul>
82
- <li><a href="{url1}" target="_blank">{url1[:40] + "..."}</a></li>
83
- </ul>
84
- </div>
85
-
86
- <div>
87
- <h3>Forensic:</h3>
88
- <b>{news_detection_results} (confidence = {news_confidence_score}%)</b>
89
- <ul>
90
- <li>{text_prediction_label} (confidence = {text_confidence_score}%)</li>
91
- <li>{image_detection_results} (confidence = {image_confidence_score}%)</li>
92
- </ul>
93
- </div>
94
-
95
- <div>
96
- <h3>Misinformation:</h3>
97
- <ul>
98
- <li>The input news is {out_of_context_results} (confidence = {out_of_context_confidence_score}%)</li>
99
- </ul>
100
- </div>
101
-
102
- <div>
103
- <h3>Description (optional):</h3>
104
- <ul>
105
- <li>{description}</li>
106
- </ul>
107
- </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
  """
 
 
 
 
 
109
 
110
- return html_template
 
 
 
 
 
 
 
 
1
+ from difflib import SequenceMatcher
2
+ from src.application.image.image_detection import detect_image_by_ai_model, detect_image_by_reverse_search, detect_image_from_news_image
3
+ from src.application.text.model_detection import detect_text_by_ai_model
4
+ from src.application.text.search_detection import check_human, detect_text_by_relative_search
5
 
6
 
7
+ class NewsAnalysis():
8
+ def __init__(self):
9
+ self.news_text = ""
10
+ self.news_title = ""
11
+ self.news_content = ""
12
+ self.news_image = ""
13
+
14
+ self.text_prediction_label = ""
15
+ self.text_prediction_score = -1
16
+ self.text_referent_url = None
17
+ self.image_prediction_label = ""
18
+ self.image_prediction_score = -1
19
+ self.image_referent_url = None
20
+ self.news_prediction_label = ""
21
+ self.news_prediction_score = -1
22
+
23
+ self.found_img_url = []
24
+ self.aligned_sentences = []
25
+ self.is_paraphrased = False
26
+
27
+ def load_news(self, news_title, news_content, news_image):
28
+ self.news_text = news_title + "\n\n" + news_content
29
+ self.news_title = news_title
30
+ self.news_content = news_content
31
+ self.news_image = news_image
32
 
33
+ def determine_text_origin(self):
34
+ """
35
+ Determines the origin of the given text based on paraphrasing detection and human authorship analysis.
36
 
37
+ Args:
38
+ text: The input text to be analyzed.
39
+
40
+ Returns:
41
+ str: The predicted origin of the text:
42
+ - "HUMAN": If the text is likely written by a human.
43
+ - "MACHINE": If the text is likely generated by a machine.
44
+ """
45
+ print("CHECK TEXT:")
46
+ print("\tFrom search engine:")
47
+ # Classify by search engine
48
+ self.is_paraphrased, self.text_referent_url, self.aligned_sentences, self.found_img_url = detect_text_by_relative_search(self.news_text)
49
+
50
+ if self.is_paraphrased is False:
51
+ self.text_prediction_label = "UNKNOWN"
52
  else:
53
+ self.text_prediction_score = 100
54
+ if check_human(self.aligned_sentences):
55
+ self.text_prediction_label = "HUMAN"
56
+ else:
57
+ self.text_prediction_label = "MACHINE"
58
+
59
+ # Classify text by AI model
60
+ print("\tFrom AI model:")
61
+ if self.text_prediction_label == "UNKNOWN":
62
+ self.text_prediction_label, self.text_prediction_score = detect_text_by_ai_model(self.news_text)
63
+ self.text_prediction_score *= 100
64
 
65
+ def detect_image_origin(self):
66
+ print("CHECK IMAGE:")
67
+ if self.news_image is None:
68
+ self.image_prediction_label = "UNKNOWN"
69
+ self.image_prediction_score = 0.0
70
+ self.image_referent_url = None
71
+ return
72
+
73
+ print(f"\t: Img path: {self.news_image}")
74
+ matched_url, similarity = detect_image_from_news_image(self.news_image, self.found_img_url)
75
+ if matched_url is not None:
76
+ print(f"matching image: {matched_url}\nsimilarity: {similarity}\n")
77
+ self.image_prediction_label = "HUMAN"
78
+ self.image_prediction_score = similarity
79
+ self.image_referent_url = matched_url
80
+ return
81
+
82
+ matched_url, similarity = detect_image_by_reverse_search(self.news_image)
83
+ if matched_url is not None:
84
+ print(f"matching image: {matched_url}\nsimilarity: {similarity}\n")
85
+ self.image_prediction_label = "HUMAN"
86
+ self.image_prediction_score = similarity
87
+ self.image_referent_url = matched_url
88
+ return
89
+
90
+ detected_label, score = detect_image_by_ai_model(self.news_image)
91
+ if detected_label:
92
+ self.image_prediction_label = detected_label
93
+ self.image_prediction_score = score
94
+ self.image_referent_url = None
95
+ return
96
+
97
+ self.image_prediction_label = "UNKNOWN"
98
+ self.image_prediction_score = 50
99
+ self.image_referent_url = None
100
 
101
+ def determine_news_origin(self):
102
+ if self.text_prediction_label == "MACHINE":
103
+ text_prediction_score = 100 - self.text_prediction_score
104
+ elif self.text_prediction_label == "UNKNOWN":
105
+ text_prediction_score = 50
106
+ else:
107
+ text_prediction_score = self.text_prediction_score
108
+
109
+ if self.image_prediction_label == "MACHINE":
110
+ image_prediction_score = 100 - self.image_prediction_score
111
+ elif self.image_prediction_label == "UNKNOWN":
112
+ image_prediction_score = 50
113
+ else:
114
+ image_prediction_score = self.image_prediction_score
115
+
116
+ news_prediction_score = (text_prediction_score + image_prediction_score) / 2
117
+ if news_prediction_score > 50:
118
+ self.news_prediction_score = news_prediction_score
119
+ self.news_prediction_label = "HUMAN"
120
+ else:
121
+ self.news_prediction_score = 100 - news_prediction_score
122
+ self.news_prediction_label = "MACHINE"
123
+
124
+ def generate_analysis_report(self):
125
+ self.determine_text_origin()
126
+ self.detect_image_origin()
127
+ self.determine_news_origin()
128
+
129
+ # Forensic analysis
130
+ if self.text_prediction_label == "MACHINE":
131
+ text_prediction_label = "The text is modified by GPT-4o (AI)"
132
+ else:
133
+ text_prediction_label = "The text is written by HUMAN"
134
+
135
+ if self.image_prediction_label == "MACHINE":
136
+ image_prediction_label = "The image is generated by Dall-e (AI)"
137
+ else:
138
+ image_prediction_label = "The image is generated by HUMAN"
139
+
140
+ if self.news_prediction_label == "MACHINE":
141
+ news_prediction_label = "The whole news generated by AI"
142
+ else:
143
+ news_prediction_label = "The whole news written by HUMAN"
144
+
145
+ # Misinformation analysis
146
+ out_of_context_results = "cohesive"
147
+ if out_of_context_results == "cohesive":
148
+ out_of_context_results = "The input news is cohesive (non-out-of-context)"
149
+ else:
150
+ out_of_context_results = "The input news is out-of-context"
151
+ out_of_context_prediction_score = 96.7
152
+
153
+ # Description
154
+ description = "The description should be concise, clear, and aimed at helping general readers understand the case."
155
+
156
+ if self.text_referent_url is None:
157
+ referred_news = "<li>No referent information</li>"
158
+ else:
159
+ print (f"self.text_referent_url: {self.text_referent_url}")
160
+ referred_news = f"""<li><a href="{self.text_referent_url}" target="_blank">"Referred news: " + {self.text_referent_url[:40] + "..."}</a></li>"""
161
+
162
+ if self.image_referent_url is None:
163
+ referred_image = "<li>No referent information</li>"
164
+ else:
165
+ referred_image = f"""<li><a href="{self.text_referent_url}" target="_blank">"Referred news: " + {self.text_referent_url[:40] + "..."}</a></li>"""
166
+
167
+ html_template = f"""
168
+ <div>
169
+ <h3>Originality:</h3>
170
+ <ul>
171
+ {referred_news}
172
+ {referred_image}
173
+ </ul>
174
+ </div>
175
+
176
+ <div>
177
+ <h3>Forensic:</h3>
178
+ <b>{news_prediction_label} (confidence = {self.news_prediction_score:.2f}%)</b>
179
+ <ul>
180
+ <li>{text_prediction_label} (confidence = {self.text_prediction_score:.2f}%)</li>
181
+ <li>{image_prediction_label} (confidence = {self.image_prediction_score:.2f}%)</li>
182
+ </ul>
183
+ </div>
184
+
185
+ <div>
186
+ <h3>Misinformation (placeholder):</h3>
187
+ <ul>
188
+ <li>The input news is {out_of_context_results} (confidence = {out_of_context_prediction_score:.2f}%)</li>
189
+ </ul>
190
+ </div>
191
+
192
+ <div>
193
+ <h3>Description (optional, placeholder):</h3>
194
+ <ul>
195
+ <li>{description}</li>
196
+ </ul>
197
+ </div>
198
+ """
199
+
200
+ return html_template
201
+
202
+
203
+ def analyze_details(self):
204
+ self.aligned_sentences
205
+ final_table = []
206
+
207
+ for pair in self.aligned_sentences:
208
+ input_words, source_words, input_indexes, source_indexes = (
209
+ self.highlight_overlap_by_word_to_list(
210
+ pair["input_sentence"],
211
+ pair["matched_sentence"],
212
+ )
213
+ )
214
+ final_table.append(
215
+ (input_words, source_words, input_indexes, source_indexes),
216
+ )
217
+
218
+ if len(final_table) != 0:
219
+ html_table = self.create_table(final_table)
220
+ else:
221
+ html_table = ""
222
+ return html_table
223
+
224
+ def highlight_overlap_by_word_to_list(self, text1, text2):
225
+ """
226
+ Return
227
+ - list of words in text1
228
+ - list of words in text2
229
+ - list of index of highlight words in text 1
230
+ - list of index of highlight words in text 2
231
+ """
232
+ # TΓ‘ch chuα»—i thΓ nh cΓ‘c tα»« (word) dα»±a vΓ o khoαΊ£ng trαΊ―ng
233
+ words1 = text1.split()
234
+ words2 = text2.split()
235
+
236
+ index1 = []
237
+ index2 = []
238
+
239
+ # Sα»­ dα»₯ng SequenceMatcher để tΓ¬m cΓ‘c Δ‘oαΊ‘n trΓΉng lαΊ·p giα»―a danh sΓ‘ch cΓ‘c tα»«
240
+ matcher = SequenceMatcher(None, words1, words2)
241
+
242
+ highlighted_text1 = []
243
+ highlighted_text2 = []
244
+
245
+ # Theo dΓ΅i vα»‹ trΓ­ hiện tαΊ‘i trong words1 vΓ  words2
246
+ current_pos1 = 0
247
+ current_pos2 = 0
248
+
249
+ # LαΊ·p qua cΓ‘c Δ‘oαΊ‘n so khα»›p
250
+ for match in matcher.get_matching_blocks():
251
+ start1, start2, length = match
252
+
253
+ # ThΓͺm cΓ‘c tα»« khΓ΄ng trΓΉng lαΊ·p vΓ o (giα»― nguyΓͺn)
254
+ highlighted_text1.extend(words1[current_pos1:start1])
255
+ highlighted_text2.extend(words2[current_pos2:start2])
256
+
257
+ if length > 0:
258
+ for i in range(start1, start1 + length):
259
+ index1.append(i)
260
+ for i in range(start2, start2 + length):
261
+ index2.append(i)
262
+
263
+ # CαΊ­p nhαΊ­t vα»‹ trΓ­ hiện tαΊ‘i
264
+ current_pos1 = start1 + length
265
+ current_pos2 = start2 + length
266
+
267
+ return words1, words2, index1, index2
268
+
269
+ def create_table(self, data):
270
+ table_rows = "\n".join([self.format_pair(pair) for pair in data])
271
+ return f"""
272
+ <h5>Comparison between input news and <a href={self.text_referent_url} target="_blank">source news</a></h5>
273
+ <table border="1" style="width:100%; text-align:left; border-collapse:collapse;">
274
+ <thead>
275
+ <tr>
276
+ <th>Input sentence</th>
277
+ <th>Source sentence</th>
278
+ </tr>
279
+ </thead>
280
+ <tbody>
281
+ {table_rows}
282
+ </tbody>
283
+ </table>
284
  """
285
+
286
+ def format_pair(self, pair):
287
+ input_sentence = self.highlight_text(pair[0], pair[2])
288
+ source_sentence = self.highlight_text(pair[1], pair[3])
289
+ return f"<tr><td>{input_sentence}</td><td>{source_sentence}</td></tr>"
290
 
291
+ def highlight_text(self, words, indexes):
292
+ final_words = words
293
+ for index in indexes:
294
+ final_words[index] = (
295
+ f"<span style='color:#00FF00; font-weight:bold;'>{words[index]}</span>"
296
+ )
297
+ return " ".join(final_words)
298
+
src/application/content_generation.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import openai
2
  from dotenv import load_dotenv
3
  import os
@@ -13,43 +14,88 @@ client = openai.AzureOpenAI(
13
  azure_endpoint = AZURE_OPENAI_ENDPOINT,
14
  )
15
 
16
- def generate_content(text_generation_model, image_generation_model, title, content):
17
  # Generate text using the selected models
18
- full_content = ""
19
- input_type = ""
 
 
 
 
 
20
  if title and content:
21
- full_content = title + "\n" + content
22
- input_type = "title and content"
23
  elif title:
24
- full_content = title
25
- input_type = "title"
26
  elif content:
27
- full_content = title
28
- input_type = "content"
29
 
30
  # Generate text using the text generation model
31
- generated_text = generate_text(text_generation_model, full_content, input_type)
32
- return title, generated_text
33
-
34
- def generate_text(model, full_context, input_type):
35
- # Generate text using the selected model
36
- if input_type == "":
37
- prompt = "Generate a random fake news article"
38
- else:
39
- prompt = f"Generate a fake news article (title and content) based on the following: # Title: {input_type}:\n\n# Content: {full_context}"
40
-
41
  try:
42
  response = client.chat.completions.create(
43
- model=model,
44
  messages = [{"role": "system", "content": prompt}],
45
  )
46
 
47
  print("Response from OpenAI API: ", response.choices[0].message.content)
48
- return response.choices[0].message.content
49
 
50
  except openai.OpenAIError as e:
51
  print(f"Error interacting with OpenAI API: {e}")
52
- return "An error occurred while processing your request."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
 
54
  def replace_text(news_title, news_content, replace_df):
55
  """
 
1
+ import json
2
  import openai
3
  from dotenv import load_dotenv
4
  import os
 
14
  azure_endpoint = AZURE_OPENAI_ENDPOINT,
15
  )
16
 
17
+ def generate_fake_text(text_generation_model, title, content):
18
  # Generate text using the selected models
19
+ prompt = """Generate a random fake news tittle in this format:
20
+ ---
21
+ # Title: [Fake Title]
22
+ # Content:
23
+ [Fake Content]
24
+ ---
25
+ """
26
  if title and content:
27
+ prompt += """base on the following context:
28
+ # Title: {news_title}:\n# Content: {news_content}"""
29
  elif title:
30
+ prompt += """base on the following context:
31
+ # Title: {news_title}:\n"""
32
  elif content:
33
+ prompt += """base on the following context:
34
+ # Content: {news_content}"""
35
 
36
  # Generate text using the text generation model
37
+ # Generate text using the selected model
 
 
 
 
 
 
 
 
 
38
  try:
39
  response = client.chat.completions.create(
40
+ model=text_generation_model,
41
  messages = [{"role": "system", "content": prompt}],
42
  )
43
 
44
  print("Response from OpenAI API: ", response.choices[0].message.content)
45
+ fake_text = response.choices[0].message.content
46
 
47
  except openai.OpenAIError as e:
48
  print(f"Error interacting with OpenAI API: {e}")
49
+ fake_text = ""
50
+
51
+ if fake_text != "":
52
+ fake_title, fake_content = extract_title_content(fake_text)
53
+ return fake_title, fake_content
54
+
55
+
56
+ def extract_title_content(fake_news):
57
+ """
58
+ Extracts the title and content from the generated fake news string.
59
+
60
+ This function parses a string containing fake news, which is expected to have
61
+ a specific format with a title and content section marked by '# Title:' and
62
+ '# Content:' respectively.
63
+
64
+ Args:
65
+ fake_news (str): A string containing the generated fake news in the expected format.
66
+
67
+ Returns:
68
+ tuple: A tuple containing two elements:
69
+ - title (str): The extracted title of the fake news.
70
+ - content (str): The extracted content of the fake news.
71
+
72
+ Note:
73
+ The function assumes that the input string follows the expected format.
74
+ If the format is not as expected, it may return unexpected results.
75
+ """
76
+ # Extract the title and content from the generated fake news
77
+ title_start_index = fake_news.find("# Title: ") + len("# Title: ")
78
+ title_end_index = fake_news.find("\n", title_start_index)
79
+ title = fake_news[title_start_index:title_end_index].strip()
80
+
81
+ content_start_index = fake_news.find("\n# Content: ") + len("\n# Content: ")
82
+ content = fake_news[content_start_index:].strip()
83
+
84
+ return title, content
85
+
86
+ def generate_fake_image(model, title):
87
+ if len(title) > 0:
88
+ IMAGE_PROMPT = f"Generate a random image about {title}"
89
+ else:
90
+ IMAGE_PROMPT = "Generate a random image"
91
+ result = client.images.generate(
92
+ model="dall-e-3", # the name of your DALL-E 3 deployment
93
+ prompt=IMAGE_PROMPT,
94
+ n=1
95
+ )
96
+ image_url = json.loads(result.model_dump_json())['data'][0]['url']
97
+ return image_url
98
+
99
 
100
  def replace_text(news_title, news_content, replace_df):
101
  """
src/application/image/image_comparison.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from io import BytesIO
3
+ from PIL import Image
4
+ import imagehash
5
+ from src.application.image.search_yandex import YandexReverseImageSearcher
6
+
7
+ def get_image_from_url(url):
8
+ try:
9
+ response = requests.get(url)
10
+ return Image.open(BytesIO(response.content))
11
+ except Exception as e:
12
+ print(f"Error opening image: {e}")
13
+ return None
14
+
15
+ def get_image_from_file(file_path):
16
+ try:
17
+ return Image.open(file_path)
18
+ except FileNotFoundError:
19
+ print(f"Error occurred while opening image from file: {file_path}")
20
+ return None
21
+
22
+ def standardize_image(image):
23
+ # Convert to RGB if needed
24
+ if image.mode in ('RGBA', 'LA'):
25
+ background = Image.new('RGB', image.size, (255, 255, 255))
26
+ background.paste(image, mask=image.split()[-1])
27
+ image = background
28
+ elif image.mode != 'RGB':
29
+ image = image.convert('RGB')
30
+
31
+ # Resize to standard size (e.g. 256x256)
32
+ standard_size = (256, 256)
33
+ image = image.resize(standard_size)
34
+
35
+ return image
36
+
37
+ def compare_images(image1, image2):
38
+ # Standardize both images first
39
+ img1_std = standardize_image(image1)
40
+ img2_std = standardize_image(image2)
41
+
42
+ hash1 = imagehash.average_hash(img1_std)
43
+ hash2 = imagehash.average_hash(img2_std)
44
+ return hash1 - hash2 # Returns the Hamming distance between the hashes
45
+
46
+ if __name__ == '__main__':
47
+ image_url = 'https://i.pinimg.com/originals/c4/50/35/c450352ac6ea8645ead206721673e8fb.png'
48
+
49
+ # Get the image from URL
50
+ url_image = get_image_from_url(image_url)
51
+
52
+ # Search image
53
+ rev_img_searcher = YandexReverseImageSearcher()
54
+ res = rev_img_searcher.search(image_url)
55
+
56
+ for search_item in res:
57
+ print(f'Title: {search_item.page_title}')
58
+ # print(f'Site: {search_item.page_url}')
59
+ print(f'Img: {search_item.image_url}\n')
60
+
61
+ # Compare each search result image with the input image
62
+ result_image = get_image_from_url(search_item.image_url)
63
+ result_difference = compare_images(result_image, url_image)
64
+ print(f"Difference with search result: {result_difference}")
65
+ if result_difference == 0:
66
+ break
src/application/image/image_detection.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from src.application.image.image_comparison import compare_images, get_image_from_file, get_image_from_url
3
+ from src.application.image.model_detection import image_generation_detection
4
+ from src.application.image.search_yandex import yandex_reverse_image_search
5
+
6
+
7
+ def compare_list_of_images(news_image_path, img_urls):
8
+ news_image = get_image_from_file(news_image_path) # TODO: news_image_path is arrays
9
+ if news_image is None:
10
+ return None, -1
11
+
12
+ matched_url = ""
13
+ max_similarity = 0
14
+ for url in img_urls:
15
+ print(f"\t{url}")
16
+ referred_image = get_image_from_url(url)
17
+ if referred_image is None:
18
+ continue
19
+ distance = compare_images(news_image, referred_image) # Hamming algorithm
20
+ similarity = max(100 - distance, 0)
21
+ if similarity > max_similarity:
22
+ max_similarity = similarity
23
+ matched_url = url
24
+
25
+ if max_similarity > 90:
26
+ return matched_url, max_similarity
27
+ return None, -1
28
+
29
+
30
+ def detect_image_from_news_image(news_image_path, image_urls):
31
+ print("\tFrom news:")
32
+ for url in image_urls:
33
+ print(f"\t{url}")
34
+ return compare_list_of_images(news_image_path, image_urls)
35
+
36
+ def detect_image_by_reverse_search(news_image_path):
37
+ image_urls = yandex_reverse_image_search(news_image_path) # url or file_path
38
+ print("\tFrom search engine:")
39
+ for url in image_urls:
40
+ print(f"\t\t{url}")
41
+ return compare_list_of_images(news_image_path, image_urls)
42
+
43
+
44
+ def detect_image_by_ai_model(news_image_path):
45
+ print("\tFrom AI model:")
46
+ image_prediction_label, image_confidence = image_generation_detection(
47
+ news_image_path,
48
+ )
49
+ return image_prediction_label, image_confidence
src/application/image/model_detection.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sklearn.metrics import roc_auc_score
2
+ from torchmetrics import Accuracy, Recall
3
+ import pytorch_lightning as pl
4
+ import timm
5
+ import torch
6
+ import torch.nn.functional as F
7
+ import logging
8
+ from PIL import Image
9
+ import torchvision.transforms as transforms
10
+ from torchvision.transforms import v2
11
+
12
+ logging.basicConfig(filename='training.log',filemode='w',level=logging.INFO, force=True)
13
+ CHECKPOINT = "models/image_classifier/image-classifier-step=8008-val_loss=0.11.ckpt"
14
+
15
class ImageClassifier(pl.LightningModule):
    """LightningModule wrapping a single-logit ResNet-50 for binary
    human-vs-machine image classification.

    sigmoid(logit) is taken as the probability of the positive class.
    Training uses BCE-with-logits plus an optional squared-logit (SD)
    penalty weighted by ``lmd``.
    """

    def __init__(self, lmd=0):
        super().__init__()
        # num_classes=1 -> one logit; predictions go through sigmoid later.
        self.model = timm.create_model('resnet50', pretrained=True, num_classes=1)
        self.accuracy = Accuracy(task='binary', threshold=0.5)
        self.recall = Recall(task='binary', threshold=0.5)
        # Per-batch validation outputs buffered for epoch-level AUC.
        self.validation_outputs = []
        # Weight of the SD (squared-logit) regularization term.
        self.lmd = lmd

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch):
        # Batch is (images, labels, domain); domain is unused in training.
        images, labels, _ = batch
        outputs = self.forward(images).squeeze()

        print(f"Shape of outputs (training): {outputs.shape}")
        print(f"Shape of labels (training): {labels.shape}")

        loss = F.binary_cross_entropy_with_logits(outputs, labels.float())
        logging.info(f"Training Step - ERM loss: {loss.item()}")
        loss += self.lmd * (outputs ** 2).mean()  # SD loss penalty
        logging.info(f"Training Step - SD loss: {loss.item()}")
        return loss

    def validation_step(self, batch):
        images, labels, _ = batch
        outputs = self.forward(images).squeeze()

        # A 0-dim tensor means squeeze() collapsed a single-sample batch;
        # skip it so metric shapes stay consistent.
        if outputs.shape == torch.Size([]):
            return

        print(f"Shape of outputs (validation): {outputs.shape}")
        print(f"Shape of labels (validation): {labels.shape}")

        loss = F.binary_cross_entropy_with_logits(outputs, labels.float())
        preds = torch.sigmoid(outputs)
        self.log('val_loss', loss, prog_bar=True, sync_dist=True)
        self.log('val_acc', self.accuracy(preds, labels.int()), prog_bar=True, sync_dist=True)
        self.log('val_recall', self.recall(preds, labels.int()), prog_bar=True, sync_dist=True)
        output = {"val_loss": loss, "preds": preds, "labels": labels}
        self.validation_outputs.append(output)
        logging.info(f"Validation Step - Batch loss: {loss.item()}")
        return output

    def predict_step(self, batch):
        # Returns per-sample probabilities plus pass-through label/domain.
        images, label, domain = batch
        outputs = self.forward(images).squeeze()
        preds = torch.sigmoid(outputs)
        return preds, label, domain

    def on_validation_epoch_end(self):
        # Compute epoch-level ROC-AUC over all buffered validation batches.
        if not self.validation_outputs:
            logging.warning("No outputs in validation step to process")
            return
        preds = torch.cat([x['preds'] for x in self.validation_outputs])
        labels = torch.cat([x['labels'] for x in self.validation_outputs])
        # ROC-AUC is undefined when only one class is present.
        if labels.unique().size(0) == 1:
            logging.warning("Only one class in validation step")
            return
        auc_score = roc_auc_score(labels.cpu(), preds.cpu())
        self.log('val_auc', auc_score, prog_bar=True, sync_dist=True)
        logging.info(f"Validation Epoch End - AUC score: {auc_score}")
        self.validation_outputs = []

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.model.parameters(), lr=0.0005)
        return optimizer
83
+
84
+
85
+
86
def load_image(image_path, transform=None):
    """Open an image file as RGB, optionally applying a transform.

    Args:
        image_path: Path to the image file.
        transform: Optional callable applied to the PIL image.

    Returns:
        The (possibly transformed) image.
    """
    image = Image.open(image_path).convert('RGB')
    return transform(image) if transform else image
93
+
94
+
95
def predict_single_image(image_path, model, transform=None):
    """Run the classifier on a single image file.

    Args:
        image_path: Path to the image file.
        model: The trained classifier module.
        transform: Optional preprocessing transform.

    Returns:
        float: sigmoid probability produced by the model.
    """
    image = load_image(image_path, transform)

    # Run on GPU when available.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    image = image.to(device)

    model.eval()
    with torch.no_grad():
        batch = image.unsqueeze(0)  # add the batch dimension
        logit = model(batch).squeeze()
        return torch.sigmoid(logit).item()
112
+
113
+
114
def image_generation_detection(image_path):
    """Classify an image file as human-made or machine-generated.

    Args:
        image_path: Path to the image file.

    Returns:
        tuple: (label in {"HUMAN", "MACHINE"}, confidence percentage).
    """
    model = ImageClassifier.load_from_checkpoint(CHECKPOINT)

    # Mirror the preprocessing used at training time.
    transform = v2.Compose([
        transforms.ToTensor(),
        v2.CenterCrop((256, 256)),
    ])

    prediction = predict_single_image(image_path, model, transform)

    # Scores at or below 0.2 are treated as human-made.
    if prediction <= 0.2:
        image_prediction_label = "HUMAN"
        result = "Most likely human"
    else:
        image_prediction_label = "MACHINE"
        result = "Most likely machine"

    image_confidence = min(1, 0.5 + abs(prediction - 0.2))
    result += f" with confidence = {round(image_confidence * 100, 2)}%"
    image_confidence = round(image_confidence * 100, 2)
    return image_prediction_label, image_confidence
135
+
136
+
137
+ if __name__ == "__main__":
138
+ image_path = "path_to_your_image.jpg" # Replace with your image path
139
+ image_prediction_label, image_confidence = image_generation_detection(
140
+ image_path,
141
+ )
src/application/image/search_yandex.py ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import logging
3
+ import requests
4
+ import json
5
+ from bs4 import BeautifulSoup
6
+ from urllib.parse import quote, urlparse
7
+
8
+ logging.basicConfig(
9
+ filename='error.log',
10
+ level=logging.INFO,
11
+ format='%(asctime)s | [%(levelname)s]: %(message)s',
12
+ datefmt='%m-%d-%Y / %I:%M:%S %p'
13
+ )
14
+
15
class SearchResults:
    """Container for reverse-image-search hits with readable printing."""

    def __init__(self, results):
        # Each result is a dict with optional 'title' and 'link' keys.
        self.results = results

    def __str__(self):
        pieces = []
        for item in self.results:
            pieces.append("---\n")
            pieces.append(f"Title: {item.get('title', 'Title not found')}\n")
            pieces.append(f"Link: {item.get('link', 'Link not found')}\n")
            pieces.append("---\n")
        return "".join(pieces)
27
+
28
class YandexReverseImageSearcher:
    """Reverse-image search client that scrapes Yandex result pages.

    NOTE(review): `_parse_search_results` selects `div.g`, which looks like
    Google-results markup; presumably this never matches real Yandex pages —
    verify against a live response.
    """

    def __init__(self):
        self.base_url = "https://yandex.ru/images/search"
        self.headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"}
        # Retry policy for transient HTTP errors per request.
        self.retry_count = 3
        self.retry_delay = 1

    def response(self, query: str, image_url: str, max_results: int = 10, delay: int = 1) -> SearchResults:
        """Search and paginate until up to `max_results` unique results are found.

        Args:
            query: Text query accompanying the image.
            image_url: Publicly reachable URL of the image.
            max_results: Maximum number of results to collect.
            delay: Seconds to sleep between result pages.

        Returns:
            SearchResults on success; a plain explanatory string when no
            results were found (NOTE(review): inconsistent return type —
            callers must handle both).

        Raises:
            ValueError: If query/image_url are missing or the URL lacks a
                supported image extension.
        """
        self._validate_input(query, image_url)

        encoded_query = quote(query)
        encoded_image_url = quote(image_url)

        url = f"{self.base_url}?q={encoded_query}&image_url={encoded_image_url}&sbisrc=cr_1_5_2"

        all_results = []
        start_index = 0

        while len(all_results) < max_results:
            if start_index != 0:
                time.sleep(delay)  # be polite between pages

            paginated_url = f"{url}&start={start_index}"

            response = self._make_request(paginated_url)
            if response is None:
                break

            search_results, valid_content = self._parse_search_results(response.text)
            if not valid_content:
                logging.warning("Unexpected HTML structure encountered.")
                break

            for result in search_results:
                if len(all_results) >= max_results:
                    break
                data = self._extract_result_data(result)
                if data and data not in all_results:  # de-duplicate by dict equality
                    all_results.append(data)

            # Advance pagination by however many new results this page added.
            start_index += (len(all_results)-start_index)

        if len(all_results) == 0:
            logging.warning(f"No results were found for the given query: [{query}], and/or image URL: [{image_url}].")
            return "No results found. Please try again with a different query and/or image URL."
        else:
            return SearchResults(all_results[:max_results])

    def _validate_input(self, query: str, image_url: str):
        """Raise ValueError unless both a query and a valid image URL are given."""
        if not query:
            raise ValueError("Query not found. Please enter a query and try again.")
        if not image_url:
            raise ValueError("Image URL not found. Please enter an image URL and try again.")
        if not self._validate_image_url(image_url):
            raise ValueError("Invalid image URL. Please enter a valid image URL and try again.")

    def _validate_image_url(self, url: str) -> bool:
        """Return True when the URL path ends in a supported image extension."""
        parsed_url = urlparse(url)
        path = parsed_url.path.lower()
        valid_extensions = (".jpg", ".jpeg", ".png", ".webp")
        return any(path.endswith(ext) for ext in valid_extensions)

    def _make_request(self, url: str):
        """GET `url`, retrying HTTP errors; return the Response or None."""
        attempts = 0
        while attempts < self.retry_count:
            try:
                response = requests.get(url, headers=self.headers)
                # Only HTML pages are parseable; anything else is a dead end.
                if response.headers.get('Content-Type', '').startswith('text/html'):
                    response.raise_for_status()
                    return response
                else:
                    logging.warning("Non-HTML content received.")
                    return None
            except requests.exceptions.HTTPError as http_err:
                logging.error(f"HTTP error occurred: {http_err}")
                attempts += 1
                time.sleep(self.retry_delay)
            except Exception as err:
                logging.error(f"An error occurred: {err}")
                return None
        return None

    def _parse_search_results(self, html_content: str):
        """Parse result containers out of the page; returns (results, ok_flag)."""
        try:
            soup = BeautifulSoup(html_content, "html.parser")
            # NOTE(review): 'div.g' is Google markup — confirm for Yandex.
            return soup.find_all('div', class_='g'), True
        except Exception as e:
            logging.error(f"Error parsing HTML content: {e}")
            return None, False

    def _extract_result_data(self, result):
        """Extract {'link', 'title'} from one result div; {} when incomplete."""
        link = result.find('a', href=True)['href'] if result.find('a', href=True) else None
        title = result.find('h3').get_text(strip=True) if result.find('h3') else None
        return {"link": link, "title": title} if link and title else {}
122
+
123
+
124
def get_image_links(page):
    """
    Extracts image URLs from the given HTML page.

    Args:
        page: The HTML content as a string.

    Returns:
        A list of image URLs (empty when the expected markup is absent
        or malformed).
    """
    soup = BeautifulSoup(page, 'html.parser')

    # Find the specific section containing image links
    gallery_data = soup.find('div', {'class': 'cbir-section cbir-section_name_sites'})
    if gallery_data is None:
        return []

    # Find the container of image links
    image_links_container = gallery_data.find('div', {'class': 'Root'})
    if image_links_container is None:
        return []

    # The gallery payload is a JSON blob in the `data-state` attribute.
    # Scraped markup drifts over time, so guard against a missing
    # attribute, invalid JSON, or an unexpected schema instead of
    # crashing the caller with KeyError/JSONDecodeError.
    try:
        data_state = json.loads(image_links_container['data-state'])
        return [site['originalImage']['url'] for site in data_state['sites']]
    except (KeyError, TypeError, ValueError):
        return []
155
+
156
+
157
def yandex_reverse_image_search(file_path):
    """Upload a local image to Yandex and return URLs of similar images.

    Args:
        file_path: Path to the local image file.

    Returns:
        A list of image URLs (empty on upload or fetch failure).
    """
    img_search_url = generate_images_search_links(file_path)
    if img_search_url is None:
        return []

    # Simulate a user agent to avoid being blocked
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
        'Content-Type': 'application/json',
    }

    try:
        response = requests.get(img_search_url, headers=headers)
        response.raise_for_status()  # Raise an exception for bad status codes
    except requests.exceptions.RequestException as e:
        print(f"Error fetching image: {e}")
        return []

    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')
    return get_image_links(soup.prettify())
180
+
181
+
182
def generate_images_search_links(file_path):
    """Upload an image file to Yandex and build the reverse-search results URL.

    Args:
        file_path: Path to the local image file to upload.

    Returns:
        The full Yandex image-search URL, or None when the upload or the
        response parsing fails.
    """
    search_url = 'https://yandex.ru/images/search'
    params = {'rpt': 'imageview', 'format': 'json', 'request': '{"blocks":[{"block":"b-page_type_search-by-image__link"}]}'}

    try:
        # `with` closes the upload handle deterministically; the original
        # bare open() leaked the file descriptor.
        with open(file_path, 'rb') as image_file:
            files = {'upfile': ('blob', image_file, 'image/jpeg/webp')}
            response = requests.post(search_url, params=params, files=files)
        query_string = json.loads(response.content)['blocks'][0]['params']['url']
        return search_url + '?' + query_string
    except (OSError, ValueError, KeyError, IndexError, requests.exceptions.RequestException):
        # Narrowed from a bare `except:` to the failure modes we expect:
        # file errors, network errors, or an unexpected response payload.
        return None
194
+
195
+
196
+ if __name__ == "__main__":
197
+ file_path = "T:\\Projects\\prj-nict-ai-content-detection\\data\\test_data\\towels.jpg.webp"
198
+ image_urls = yandex_reverse_image_search(file_path)
199
+ for image_url in image_urls:
200
+ print(f"Image URL: {image_url}")
src/application/text/model_detection.py CHANGED
@@ -11,7 +11,7 @@ PARAPHRASE = "PARAPHRASE"
11
  NON_PARAPHRASE = "NON_PARAPHRASE"
12
 
13
 
14
- def detect_by_ai_model(
15
  input_text: str,
16
  model: str = DEFAULT_MODEL,
17
  max_length: int = 512,
@@ -44,4 +44,4 @@ def detect_by_ai_model(
44
  return label, confidence_score
45
  except Exception as e: # Add exception handling
46
  print(f"Error in Roberta model inference: {e}")
47
- return UNKNOWN, 0.0 # Return UNKNOWN and 0.0 confidence if error
 
11
  NON_PARAPHRASE = "NON_PARAPHRASE"
12
 
13
 
14
+ def detect_text_by_ai_model(
15
  input_text: str,
16
  model: str = DEFAULT_MODEL,
17
  max_length: int = 512,
 
44
  return label, confidence_score
45
  except Exception as e: # Add exception handling
46
  print(f"Error in Roberta model inference: {e}")
47
+ return UNKNOWN, 50 # Return UNKNOWN and 0.0 confidence if error
src/application/text/search_detection.py CHANGED
@@ -33,7 +33,7 @@ MIN_RATIO_PARAPHRASE_NUM = 0.7
33
  MAX_CHAR_SIZE = 30000
34
 
35
 
36
- def detect_by_relative_search(input_text, is_support_opposite = False):
37
 
38
  checked_urls = set()
39
  searched_phrases = generate_search_phrases(input_text)
@@ -46,19 +46,24 @@ def detect_by_relative_search(input_text, is_support_opposite = False):
46
  if url in checked_urls: # visited url
47
  continue
48
  checked_urls.add(url)
49
- print(f"\tChecking URL: {url}")
50
 
51
  content = URLReader(url)
52
 
53
  if content.is_extracted is True:
 
 
 
 
54
  page_text = content.title + "\n" + content.text
55
- if page_text is None or len(page_text) > MAX_CHAR_SIZE:
56
- print(f"\t\t↑↑↑ More than {MAX_CHAR_SIZE} characters")
57
  continue
 
58
  is_paraphrase, aligned_sentences = check_paraphrase(input_text, page_text)
59
  if is_paraphrase:
60
- return is_paraphrase, url, aligned_sentences
61
- return False, None, []
62
 
63
  def longest_common_subsequence(arr1, arr2):
64
  """
@@ -256,7 +261,7 @@ def similarity_ratio(a, b):
256
  return 0.0 # Handle cases where inputs are not strings or None
257
  return SequenceMatcher(None, a, b).ratio()
258
 
259
- def check_human(data, min_ratio=MIN_RATIO_PARAPHRASE_NUM):
260
  """
261
  Checks if a sufficient number of input sentences are found within
262
  source sentences.
@@ -264,14 +269,14 @@ def check_human(data, min_ratio=MIN_RATIO_PARAPHRASE_NUM):
264
  Returns:
265
  bool: True if the condition is met, False otherwise.
266
  """
267
- if not data: # Handle empty data case
268
  return False
269
- min_matching = math.ceil(len(data) * min_ratio)
270
 
271
  count = 0
272
 
273
  #for input_sentence, source_sentence, similiarity, is_paraprhase in data:
274
- for sentence in data:
275
  if sentence["similarity"] >= 0.99:
276
  count += 1
277
  print(f"\tmatching_sentence_count : {count}, min_matching: {min_matching}")
 
33
  MAX_CHAR_SIZE = 30000
34
 
35
 
36
+ def detect_text_by_relative_search(input_text, is_support_opposite = False):
37
 
38
  checked_urls = set()
39
  searched_phrases = generate_search_phrases(input_text)
 
46
  if url in checked_urls: # visited url
47
  continue
48
  checked_urls.add(url)
49
+ print(f"\t\tChecking URL: {url}")
50
 
51
  content = URLReader(url)
52
 
53
  if content.is_extracted is True:
54
+ if content.title is None or content.text is None:
55
+ print(f"\t\t\t↑↑↑ Title or text not found")
56
+ continue
57
+
58
  page_text = content.title + "\n" + content.text
59
+ if len(page_text) > MAX_CHAR_SIZE:
60
+ print(f"\t\t\t↑↑↑ More than {MAX_CHAR_SIZE} characters")
61
  continue
62
+
63
  is_paraphrase, aligned_sentences = check_paraphrase(input_text, page_text)
64
  if is_paraphrase:
65
+ return is_paraphrase, url, aligned_sentences, content.images
66
+ return False, None, [], []
67
 
68
  def longest_common_subsequence(arr1, arr2):
69
  """
 
261
  return 0.0 # Handle cases where inputs are not strings or None
262
  return SequenceMatcher(None, a, b).ratio()
263
 
264
+ def check_human(alligned_sentences, min_ratio=MIN_RATIO_PARAPHRASE_NUM):
265
  """
266
  Checks if a sufficient number of input sentences are found within
267
  source sentences.
 
269
  Returns:
270
  bool: True if the condition is met, False otherwise.
271
  """
272
+ if not alligned_sentences: # Handle empty data case
273
  return False
274
+ min_matching = math.ceil(len(alligned_sentences) * min_ratio)
275
 
276
  count = 0
277
 
278
  #for input_sentence, source_sentence, similiarity, is_paraprhase in data:
279
+ for sentence in alligned_sentences:
280
  if sentence["similarity"] >= 0.99:
281
  count += 1
282
  print(f"\tmatching_sentence_count : {count}, min_matching: {min_matching}")
src/application/url_reader.py CHANGED
@@ -52,7 +52,7 @@ class URLReader():
52
 
53
  self.title = news.title
54
  self.text = news.text
55
- self.images = news.images
56
  self.top_image = news.top_image
57
 
58
  def extract_content_bs(self):
 
52
 
53
  self.title = news.title
54
  self.text = news.text
55
+ self.images = list(set(news.images)) # Remove duplicates
56
  self.top_image = news.top_image
57
 
58
  def extract_content_bs(self):