Spaces:
Running
Running
Commit
Β·
38fd181
1
Parent(s):
504f37b
run pre-commit
Browse files- .sample-env +1 -1
- README.md +7 -7
- application.py +155 -103
- application_2.py +155 -103
- application_3.py +254 -0
- examples/example_text_LLM_entities.txt +1 -1
- examples/example_text_LLM_modification.txt +3 -3
- examples/example_text_LLM_topic.txt +6 -6
- examples/example_text_real.txt +1 -1
- examples/example_text_real_2.txt +1 -1
- gpt_test.py +16 -20
- requirements.txt +1 -1
- src/application/content_detection.py +342 -244
- src/application/content_generation.py +40 -32
- src/application/image/image_comparison.py +23 -16
- src/application/image/image_detection.py +21 -10
- src/application/image/model_detection.py +70 -43
- src/application/image/search_yandex.py +91 -47
- src/application/text/entity.py +152 -95
- src/application/text/helper.py +59 -68
- src/application/text/highlight_text.py +127 -50
- src/application/text/model_detection.py +1 -1
- src/application/text/preprocessing.py +3 -2
- src/application/text/search.py +43 -33
- src/application/text/search_detection.py +195 -101
- src/application/url_reader.py +49 -35
- test.py +11 -6
.sample-env
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
[API_KEY]
|
2 |
OPENAI_API_KEY=your_api_key # Replace with your actual OpenAI API key
|
3 |
GEMINI_API_KEY=your_api_key
|
4 |
-
TOGETHER_API_KEY=your_api_key
|
|
|
1 |
[API_KEY]
|
2 |
OPENAI_API_KEY=your_api_key # Replace with your actual OpenAI API key
|
3 |
GEMINI_API_KEY=your_api_key
|
4 |
+
TOGETHER_API_KEY=your_api_key
|
README.md
CHANGED
@@ -1,11 +1,11 @@
|
|
1 |
---
|
2 |
-
title: "FAKE NEWS DETECTION"
|
3 |
-
emoji: "π"
|
4 |
-
colorFrom: "green"
|
5 |
-
colorTo: "blue"
|
6 |
-
sdk: "gradio"
|
7 |
-
sdk_version: "5.13.1"
|
8 |
-
app_file: "application.py"
|
9 |
pinned: false
|
10 |
---
|
11 |
|
|
|
1 |
---
|
2 |
+
title: "FAKE NEWS DETECTION"
|
3 |
+
emoji: "π"
|
4 |
+
colorFrom: "green"
|
5 |
+
colorTo: "blue"
|
6 |
+
sdk: "gradio"
|
7 |
+
sdk_version: "5.13.1"
|
8 |
+
app_file: "application.py"
|
9 |
pinned: false
|
10 |
---
|
11 |
|
application.py
CHANGED
@@ -1,44 +1,53 @@
|
|
1 |
-
import os
|
2 |
-
|
3 |
import gradio as gr
|
4 |
import requests
|
5 |
from PIL import Image
|
6 |
|
7 |
from src.application.content_detection import NewsVerification
|
|
|
|
|
|
|
|
|
|
|
8 |
from src.application.url_reader import URLReader
|
9 |
-
from src.application.content_generation import generate_fake_image, generate_fake_text, replace_text
|
10 |
|
11 |
AZURE_TEXT_MODEL = ["gpt-4o-mini", "gpt-4o"]
|
12 |
AZURE_IMAGE_MODEL = ["dall-e-3", "Stable Diffusion (not supported)"]
|
13 |
|
|
|
14 |
def load_url(url):
|
15 |
"""
|
16 |
Load content from the given URL.
|
17 |
"""
|
18 |
content = URLReader(url)
|
19 |
image = None
|
20 |
-
header = {
|
|
|
|
|
21 |
try:
|
22 |
response = requests.get(
|
23 |
-
url,
|
24 |
-
headers
|
25 |
-
stream
|
26 |
)
|
27 |
response.raise_for_status() # Raise an exception for bad status codes
|
28 |
-
|
29 |
image_response = requests.get(content.top_image, stream=True)
|
30 |
try:
|
31 |
image = Image.open(image_response.raw)
|
32 |
-
except:
|
33 |
-
print(f"Error loading image from {content.top_image}")
|
34 |
-
|
35 |
except (requests.exceptions.RequestException, FileNotFoundError) as e:
|
36 |
print(f"Error fetching image: {e}")
|
37 |
|
38 |
return content.title, content.text, image
|
39 |
|
40 |
|
41 |
-
def generate_analysis_report(
|
|
|
|
|
|
|
|
|
42 |
news_analysis = NewsVerification()
|
43 |
news_analysis.load_news(news_title, news_content, news_image)
|
44 |
news_analysis.generate_analysis_report()
|
@@ -48,80 +57,100 @@ def generate_analysis_report(news_title:str, news_content: str, news_image: Imag
|
|
48 |
# Define the GUI
|
49 |
with gr.Blocks() as demo:
|
50 |
gr.Markdown("# NEWS VERIFICATION")
|
51 |
-
|
52 |
with gr.Row():
|
53 |
-
# SETTINGS
|
54 |
with gr.Column(scale=1):
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
interactive=True
|
77 |
)
|
78 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
79 |
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
|
86 |
# NEWS ANALYSIS REPORT
|
87 |
ordinary_user_explanation = """
|
88 |
-
|
89 |
-
|
90 |
-
|
|
|
91 |
"""
|
92 |
fact_checker_explanation = """
|
93 |
-
|
94 |
-
|
95 |
-
|
|
|
96 |
"""
|
97 |
governor_explanation = """
|
98 |
-
|
99 |
-
|
100 |
-
|
|
|
101 |
"""
|
102 |
table = """
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
</
|
114 |
-
<
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
</
|
122 |
-
|
123 |
-
|
124 |
-
|
|
|
|
|
|
|
125 |
with gr.Column(scale=2):
|
126 |
with gr.Accordion("NEWS ANALYSIS"):
|
127 |
verification_button = gr.Button("Verify news")
|
@@ -137,56 +166,79 @@ with gr.Blocks() as demo:
|
|
137 |
|
138 |
# Connect events
|
139 |
load_button.click(
|
140 |
-
load_url,
|
141 |
-
inputs=url_input,
|
142 |
-
outputs=[news_title, news_content, news_image]
|
143 |
-
|
144 |
-
replace_button.click(
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
156 |
|
157 |
# change Image
|
158 |
-
#url_input.change(load_image, inputs=url_input, outputs=image_view)
|
159 |
-
|
160 |
try:
|
161 |
-
with open(
|
|
|
|
|
|
|
162 |
text_real_1 = file.read()
|
163 |
-
with open(
|
|
|
|
|
|
|
164 |
text_real_2 = file.read()
|
165 |
-
with open(
|
|
|
|
|
|
|
166 |
text_llm_topic = file.read()
|
167 |
-
with open(
|
|
|
|
|
|
|
168 |
text_llm_modification = file.read()
|
169 |
-
with open(
|
|
|
|
|
|
|
170 |
text_llm_entities = file.read()
|
171 |
except FileNotFoundError:
|
172 |
print("File not found.")
|
173 |
except Exception as e:
|
174 |
print(f"An error occurred: {e}")
|
175 |
-
|
176 |
title_1 = "Southampton news: Leeds target striker Cameron Archer."
|
177 |
title_2 = "Southampton news: Leeds target striker Cameron Archer."
|
178 |
title_4 = "Japan pledges support for Ukraine with 100-year pact."
|
179 |
-
|
180 |
image_1 = "examples/example_image_real_1.jpg.webp"
|
181 |
image_2 = "examples/example_image_real_2.jpg.webp"
|
182 |
image_3 = "examples/example_image_real_3.jpg"
|
183 |
image_4 = "examples/example_image_real_4.jpg.webp"
|
184 |
-
|
185 |
gr.Examples(
|
186 |
examples=[
|
187 |
-
[title_1, image_1, text_real_1 +
|
188 |
-
[title_1, image_2, text_real_1 +
|
189 |
-
[title_1, image_3, text_real_1 +
|
190 |
[title_4, image_4, text_llm_entities],
|
191 |
],
|
192 |
inputs=[news_title, news_image, news_content],
|
@@ -199,4 +251,4 @@ with gr.Blocks() as demo:
|
|
199 |
],
|
200 |
)
|
201 |
|
202 |
-
demo.launch(share=True)
|
|
|
|
|
|
|
1 |
import gradio as gr
|
2 |
import requests
|
3 |
from PIL import Image
|
4 |
|
5 |
from src.application.content_detection import NewsVerification
|
6 |
+
from src.application.content_generation import (
|
7 |
+
generate_fake_image,
|
8 |
+
generate_fake_text,
|
9 |
+
replace_text,
|
10 |
+
)
|
11 |
from src.application.url_reader import URLReader
|
|
|
12 |
|
13 |
AZURE_TEXT_MODEL = ["gpt-4o-mini", "gpt-4o"]
|
14 |
AZURE_IMAGE_MODEL = ["dall-e-3", "Stable Diffusion (not supported)"]
|
15 |
|
16 |
+
|
17 |
def load_url(url):
|
18 |
"""
|
19 |
Load content from the given URL.
|
20 |
"""
|
21 |
content = URLReader(url)
|
22 |
image = None
|
23 |
+
header = {
|
24 |
+
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36", # noqa: E501
|
25 |
+
}
|
26 |
try:
|
27 |
response = requests.get(
|
28 |
+
url,
|
29 |
+
headers=header,
|
30 |
+
stream=True,
|
31 |
)
|
32 |
response.raise_for_status() # Raise an exception for bad status codes
|
33 |
+
|
34 |
image_response = requests.get(content.top_image, stream=True)
|
35 |
try:
|
36 |
image = Image.open(image_response.raw)
|
37 |
+
except OSError as e:
|
38 |
+
print(f"Error loading image from {content.top_image}: {e}")
|
39 |
+
|
40 |
except (requests.exceptions.RequestException, FileNotFoundError) as e:
|
41 |
print(f"Error fetching image: {e}")
|
42 |
|
43 |
return content.title, content.text, image
|
44 |
|
45 |
|
46 |
+
def generate_analysis_report(
|
47 |
+
news_title: str,
|
48 |
+
news_content: str,
|
49 |
+
news_image: Image,
|
50 |
+
):
|
51 |
news_analysis = NewsVerification()
|
52 |
news_analysis.load_news(news_title, news_content, news_image)
|
53 |
news_analysis.generate_analysis_report()
|
|
|
57 |
# Define the GUI
|
58 |
with gr.Blocks() as demo:
|
59 |
gr.Markdown("# NEWS VERIFICATION")
|
60 |
+
|
61 |
with gr.Row():
|
62 |
+
# SETTINGS
|
63 |
with gr.Column(scale=1):
|
64 |
+
with gr.Accordion("1. Enter a URL"):
|
65 |
+
url_input = gr.Textbox(
|
66 |
+
label="",
|
67 |
+
show_label=False,
|
68 |
+
value="",
|
69 |
+
)
|
70 |
+
load_button = gr.Button("Load URL")
|
71 |
+
|
72 |
+
with gr.Accordion(
|
73 |
+
"2. Select content-generation models",
|
74 |
+
open=True,
|
75 |
+
visible=False,
|
76 |
+
):
|
77 |
+
with gr.Row():
|
78 |
+
text_generation_model = gr.Dropdown(
|
79 |
+
choices=AZURE_TEXT_MODEL,
|
80 |
+
label="Text-generation model",
|
81 |
+
)
|
82 |
+
image_generation_model = gr.Dropdown(
|
83 |
+
choices=AZURE_IMAGE_MODEL,
|
84 |
+
label="Image-generation model",
|
|
|
85 |
)
|
86 |
+
generate_text_button = gr.Button("Generate text")
|
87 |
+
generate_image_button = gr.Button("Generate image")
|
88 |
+
|
89 |
+
with gr.Accordion(
|
90 |
+
"3. Replace any terms",
|
91 |
+
open=True,
|
92 |
+
visible=False,
|
93 |
+
):
|
94 |
+
replace_df = gr.Dataframe(
|
95 |
+
headers=["Find what:", "Replace with:"],
|
96 |
+
datatype=["str", "str"],
|
97 |
+
row_count=(1, "dynamic"),
|
98 |
+
col_count=(2, "fixed"),
|
99 |
+
interactive=True,
|
100 |
+
)
|
101 |
+
replace_button = gr.Button("Replace all")
|
102 |
|
103 |
+
# GENERATED CONTENT
|
104 |
+
with gr.Accordion("Input News"):
|
105 |
+
news_title = gr.Textbox(label="Title", value="")
|
106 |
+
news_image = gr.Image(label="Image", type="filepath")
|
107 |
+
news_content = gr.Textbox(label="Content", value="", lines=13)
|
108 |
|
109 |
# NEWS ANALYSIS REPORT
|
110 |
ordinary_user_explanation = """
|
111 |
+
FOR ORDINARY USER<br>
|
112 |
+
- Green texts are the matched words in the input and source news.<br>
|
113 |
+
- Each highlighted pair (marked with a number) shows the key differences
|
114 |
+
between the input text and the source.
|
115 |
"""
|
116 |
fact_checker_explanation = """
|
117 |
+
FOR FACT CHECKER<br>
|
118 |
+
- Green texts are the matched words in the input and source news.<br>
|
119 |
+
- Each highlighted pair (marked with a number) shows the key differences
|
120 |
+
between the input text and the source.
|
121 |
"""
|
122 |
governor_explanation = """
|
123 |
+
FOR GOVERNOR<br>
|
124 |
+
- Green texts are the matched words in the input and source news.<br>
|
125 |
+
- Each highlighted pair (marked with a number) shows the key differences
|
126 |
+
between the input text and the source.
|
127 |
"""
|
128 |
table = """
|
129 |
+
<h5>Comparison between input news and source news:</h5>
|
130 |
+
<table border="1" style="width:100%; text-align:left;">
|
131 |
+
<col style="width: 170px;">
|
132 |
+
<col style="width: 170px;">
|
133 |
+
<col style="width: 30px;">
|
134 |
+
<col style="width: 75px;">
|
135 |
+
<thead>
|
136 |
+
<tr>
|
137 |
+
<th>Input news</th>
|
138 |
+
<th>Source (corresponding URL provided in Originality)</th>
|
139 |
+
<th>Forensic</th>
|
140 |
+
<th>Originality</th>
|
141 |
+
</tr>
|
142 |
+
</thead>
|
143 |
+
<tbody>
|
144 |
+
<tr>
|
145 |
+
<th>TBD</th>
|
146 |
+
<th>TBD</th>
|
147 |
+
<th>TBD</th>
|
148 |
+
<th>TBD</th>
|
149 |
+
</tr>
|
150 |
+
</tbody>
|
151 |
+
</table>
|
152 |
+
|
153 |
+
<style>"""
|
154 |
with gr.Column(scale=2):
|
155 |
with gr.Accordion("NEWS ANALYSIS"):
|
156 |
verification_button = gr.Button("Verify news")
|
|
|
166 |
|
167 |
# Connect events
|
168 |
load_button.click(
|
169 |
+
load_url,
|
170 |
+
inputs=url_input,
|
171 |
+
outputs=[news_title, news_content, news_image],
|
172 |
+
)
|
173 |
+
replace_button.click(
|
174 |
+
replace_text,
|
175 |
+
inputs=[news_title, news_content, replace_df],
|
176 |
+
outputs=[news_title, news_content],
|
177 |
+
)
|
178 |
+
generate_text_button.click(
|
179 |
+
generate_fake_text,
|
180 |
+
inputs=[text_generation_model, news_title, news_content],
|
181 |
+
outputs=[news_title, news_content],
|
182 |
+
)
|
183 |
+
generate_image_button.click(
|
184 |
+
generate_fake_image,
|
185 |
+
inputs=[image_generation_model, news_title],
|
186 |
+
outputs=[news_image],
|
187 |
+
)
|
188 |
+
verification_button.click(
|
189 |
+
generate_analysis_report,
|
190 |
+
inputs=[news_title, news_content, news_image],
|
191 |
+
outputs=[ordinary_user_result, fact_checker_result, governor_result],
|
192 |
+
)
|
193 |
|
194 |
# change Image
|
195 |
+
# url_input.change(load_image, inputs=url_input, outputs=image_view)
|
196 |
+
|
197 |
try:
|
198 |
+
with open(
|
199 |
+
"examples/example_text_real.txt",
|
200 |
+
encoding="utf-8",
|
201 |
+
) as file:
|
202 |
text_real_1 = file.read()
|
203 |
+
with open(
|
204 |
+
"examples/example_text_real_2.txt",
|
205 |
+
encoding="utf-8",
|
206 |
+
) as file:
|
207 |
text_real_2 = file.read()
|
208 |
+
with open(
|
209 |
+
"examples/example_text_LLM_topic.txt",
|
210 |
+
encoding="utf-8",
|
211 |
+
) as file:
|
212 |
text_llm_topic = file.read()
|
213 |
+
with open(
|
214 |
+
"examples/example_text_LLM_modification.txt",
|
215 |
+
encoding="utf-8",
|
216 |
+
) as file:
|
217 |
text_llm_modification = file.read()
|
218 |
+
with open(
|
219 |
+
"examples/example_text_LLM_entities.txt",
|
220 |
+
encoding="utf-8",
|
221 |
+
) as file:
|
222 |
text_llm_entities = file.read()
|
223 |
except FileNotFoundError:
|
224 |
print("File not found.")
|
225 |
except Exception as e:
|
226 |
print(f"An error occurred: {e}")
|
227 |
+
|
228 |
title_1 = "Southampton news: Leeds target striker Cameron Archer."
|
229 |
title_2 = "Southampton news: Leeds target striker Cameron Archer."
|
230 |
title_4 = "Japan pledges support for Ukraine with 100-year pact."
|
231 |
+
|
232 |
image_1 = "examples/example_image_real_1.jpg.webp"
|
233 |
image_2 = "examples/example_image_real_2.jpg.webp"
|
234 |
image_3 = "examples/example_image_real_3.jpg"
|
235 |
image_4 = "examples/example_image_real_4.jpg.webp"
|
236 |
+
|
237 |
gr.Examples(
|
238 |
examples=[
|
239 |
+
[title_1, image_1, text_real_1 + "\n\n" + text_real_2],
|
240 |
+
[title_1, image_2, text_real_1 + "\n\n" + text_llm_modification],
|
241 |
+
[title_1, image_3, text_real_1 + "\n\n" + text_llm_topic],
|
242 |
[title_4, image_4, text_llm_entities],
|
243 |
],
|
244 |
inputs=[news_title, news_image, news_content],
|
|
|
251 |
],
|
252 |
)
|
253 |
|
254 |
+
demo.launch(share=True)
|
application_2.py
CHANGED
@@ -1,44 +1,53 @@
|
|
1 |
-
import os
|
2 |
-
|
3 |
import gradio as gr
|
4 |
import requests
|
5 |
from PIL import Image
|
6 |
|
7 |
from src.application.content_detection import NewsVerification
|
|
|
|
|
|
|
|
|
|
|
8 |
from src.application.url_reader import URLReader
|
9 |
-
from src.application.content_generation import generate_fake_image, generate_fake_text, replace_text
|
10 |
|
11 |
AZURE_TEXT_MODEL = ["gpt-4o-mini", "gpt-4o"]
|
12 |
AZURE_IMAGE_MODEL = ["dall-e-3", "Stable Diffusion (not supported)"]
|
13 |
|
|
|
14 |
def load_url(url):
|
15 |
"""
|
16 |
Load content from the given URL.
|
17 |
"""
|
18 |
content = URLReader(url)
|
19 |
image = None
|
20 |
-
header = {
|
|
|
|
|
21 |
try:
|
22 |
response = requests.get(
|
23 |
-
url,
|
24 |
-
headers
|
25 |
-
stream
|
26 |
)
|
27 |
response.raise_for_status() # Raise an exception for bad status codes
|
28 |
-
|
29 |
image_response = requests.get(content.top_image, stream=True)
|
30 |
try:
|
31 |
image = Image.open(image_response.raw)
|
32 |
-
except:
|
33 |
-
print(f"Error loading image from {content.top_image}")
|
34 |
-
|
35 |
except (requests.exceptions.RequestException, FileNotFoundError) as e:
|
36 |
print(f"Error fetching image: {e}")
|
37 |
|
38 |
return content.title, content.text, image
|
39 |
|
40 |
|
41 |
-
def generate_analysis_report(
|
|
|
|
|
|
|
|
|
42 |
news_analysis = NewsVerification()
|
43 |
news_analysis.load_news(news_title, news_content, news_image)
|
44 |
news_analysis.generate_analysis_report()
|
@@ -48,80 +57,100 @@ def generate_analysis_report(news_title:str, news_content: str, news_image: Imag
|
|
48 |
# Define the GUI
|
49 |
with gr.Blocks() as demo:
|
50 |
gr.Markdown("# NEWS VERIFICATION")
|
51 |
-
|
52 |
with gr.Row():
|
53 |
-
# SETTINGS
|
54 |
with gr.Column(scale=1):
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
interactive=True
|
77 |
)
|
78 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
79 |
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
|
86 |
# NEWS ANALYSIS REPORT
|
87 |
ordinary_user_explanation = """
|
88 |
-
|
89 |
-
|
90 |
-
|
|
|
91 |
"""
|
92 |
fact_checker_explanation = """
|
93 |
-
|
94 |
-
|
95 |
-
|
|
|
96 |
"""
|
97 |
governor_explanation = """
|
98 |
-
|
99 |
-
|
100 |
-
|
|
|
101 |
"""
|
102 |
table = """
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
</
|
114 |
-
<
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
</
|
122 |
-
|
123 |
-
|
124 |
-
|
|
|
|
|
|
|
125 |
with gr.Column(scale=2):
|
126 |
with gr.Accordion("NEWS ANALYSIS"):
|
127 |
verification_button = gr.Button("Verify news")
|
@@ -137,56 +166,79 @@ with gr.Blocks() as demo:
|
|
137 |
|
138 |
# Connect events
|
139 |
load_button.click(
|
140 |
-
load_url,
|
141 |
-
inputs=url_input,
|
142 |
-
outputs=[news_title, news_content, news_image]
|
143 |
-
|
144 |
-
replace_button.click(
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
156 |
|
157 |
# change Image
|
158 |
-
#url_input.change(load_image, inputs=url_input, outputs=image_view)
|
159 |
-
|
160 |
try:
|
161 |
-
with open(
|
|
|
|
|
|
|
162 |
text_real_1 = file.read()
|
163 |
-
with open(
|
|
|
|
|
|
|
164 |
text_real_2 = file.read()
|
165 |
-
with open(
|
|
|
|
|
|
|
166 |
text_llm_topic = file.read()
|
167 |
-
with open(
|
|
|
|
|
|
|
168 |
text_llm_modification = file.read()
|
169 |
-
with open(
|
|
|
|
|
|
|
170 |
text_llm_entities = file.read()
|
171 |
except FileNotFoundError:
|
172 |
print("File not found.")
|
173 |
except Exception as e:
|
174 |
print(f"An error occurred: {e}")
|
175 |
-
|
176 |
title_1 = "Southampton news: Leeds target striker Cameron Archer."
|
177 |
title_2 = "Southampton news: Leeds target striker Cameron Archer."
|
178 |
title_4 = "Japan pledges support for Ukraine with 100-year pact."
|
179 |
-
|
180 |
image_1 = "examples/example_image_real_1.jpg.webp"
|
181 |
image_2 = "examples/example_image_real_2.jpg.webp"
|
182 |
image_3 = "examples/example_image_real_3.jpg"
|
183 |
image_4 = "examples/example_image_real_4.jpg.webp"
|
184 |
-
|
185 |
gr.Examples(
|
186 |
examples=[
|
187 |
-
[title_1, image_1, text_real_1 +
|
188 |
-
[title_1, image_2, text_real_1 +
|
189 |
-
[title_1, image_3, text_real_1 +
|
190 |
[title_4, image_4, text_llm_entities],
|
191 |
],
|
192 |
inputs=[news_title, news_image, news_content],
|
@@ -199,4 +251,4 @@ with gr.Blocks() as demo:
|
|
199 |
],
|
200 |
)
|
201 |
|
202 |
-
demo.launch(share=True)
|
|
|
|
|
|
|
1 |
import gradio as gr
|
2 |
import requests
|
3 |
from PIL import Image
|
4 |
|
5 |
from src.application.content_detection import NewsVerification
|
6 |
+
from src.application.content_generation import (
|
7 |
+
generate_fake_image,
|
8 |
+
generate_fake_text,
|
9 |
+
replace_text,
|
10 |
+
)
|
11 |
from src.application.url_reader import URLReader
|
|
|
12 |
|
13 |
AZURE_TEXT_MODEL = ["gpt-4o-mini", "gpt-4o"]
|
14 |
AZURE_IMAGE_MODEL = ["dall-e-3", "Stable Diffusion (not supported)"]
|
15 |
|
16 |
+
|
17 |
def load_url(url):
|
18 |
"""
|
19 |
Load content from the given URL.
|
20 |
"""
|
21 |
content = URLReader(url)
|
22 |
image = None
|
23 |
+
header = {
|
24 |
+
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36", # noqa: E501
|
25 |
+
}
|
26 |
try:
|
27 |
response = requests.get(
|
28 |
+
url,
|
29 |
+
headers=header,
|
30 |
+
stream=True,
|
31 |
)
|
32 |
response.raise_for_status() # Raise an exception for bad status codes
|
33 |
+
|
34 |
image_response = requests.get(content.top_image, stream=True)
|
35 |
try:
|
36 |
image = Image.open(image_response.raw)
|
37 |
+
except OSError as e:
|
38 |
+
print(f"Error loading image from {content.top_image}: {e}")
|
39 |
+
|
40 |
except (requests.exceptions.RequestException, FileNotFoundError) as e:
|
41 |
print(f"Error fetching image: {e}")
|
42 |
|
43 |
return content.title, content.text, image
|
44 |
|
45 |
|
46 |
+
def generate_analysis_report(
|
47 |
+
news_title: str,
|
48 |
+
news_content: str,
|
49 |
+
news_image: Image,
|
50 |
+
):
|
51 |
news_analysis = NewsVerification()
|
52 |
news_analysis.load_news(news_title, news_content, news_image)
|
53 |
news_analysis.generate_analysis_report()
|
|
|
57 |
# Define the GUI
|
58 |
with gr.Blocks() as demo:
|
59 |
gr.Markdown("# NEWS VERIFICATION")
|
60 |
+
|
61 |
with gr.Row():
|
62 |
+
# SETTINGS
|
63 |
with gr.Column(scale=1):
|
64 |
+
with gr.Accordion("1. Enter a URL"):
|
65 |
+
url_input = gr.Textbox(
|
66 |
+
label="",
|
67 |
+
show_label=False,
|
68 |
+
value="",
|
69 |
+
)
|
70 |
+
load_button = gr.Button("Load URL")
|
71 |
+
|
72 |
+
with gr.Accordion(
|
73 |
+
"2. Select content-generation models",
|
74 |
+
open=True,
|
75 |
+
visible=False,
|
76 |
+
):
|
77 |
+
with gr.Row():
|
78 |
+
text_generation_model = gr.Dropdown(
|
79 |
+
choices=AZURE_TEXT_MODEL,
|
80 |
+
label="Text-generation model",
|
81 |
+
)
|
82 |
+
image_generation_model = gr.Dropdown(
|
83 |
+
choices=AZURE_IMAGE_MODEL,
|
84 |
+
label="Image-generation model",
|
|
|
85 |
)
|
86 |
+
generate_text_button = gr.Button("Generate text")
|
87 |
+
generate_image_button = gr.Button("Generate image")
|
88 |
+
|
89 |
+
with gr.Accordion(
|
90 |
+
"3. Replace any terms",
|
91 |
+
open=True,
|
92 |
+
visible=False,
|
93 |
+
):
|
94 |
+
replace_df = gr.Dataframe(
|
95 |
+
headers=["Find what:", "Replace with:"],
|
96 |
+
datatype=["str", "str"],
|
97 |
+
row_count=(1, "dynamic"),
|
98 |
+
col_count=(2, "fixed"),
|
99 |
+
interactive=True,
|
100 |
+
)
|
101 |
+
replace_button = gr.Button("Replace all")
|
102 |
|
103 |
+
# GENERATED CONTENT
|
104 |
+
with gr.Accordion("Input News"):
|
105 |
+
news_title = gr.Textbox(label="Title", value="")
|
106 |
+
news_image = gr.Image(label="Image", type="filepath")
|
107 |
+
news_content = gr.Textbox(label="Content", value="", lines=13)
|
108 |
|
109 |
# NEWS ANALYSIS REPORT
|
110 |
ordinary_user_explanation = """
|
111 |
+
FOR ORDINARY USER<br>
|
112 |
+
- Green texts are the matched words in the input and source news.<br>
|
113 |
+
- Each highlighted pair (marked with a number) shows the key differences
|
114 |
+
between the input text and the source.
|
115 |
"""
|
116 |
fact_checker_explanation = """
|
117 |
+
FOR FACT CHECKER<br>
|
118 |
+
- Green texts are the matched words in the input and source news.<br>
|
119 |
+
- Each highlighted pair (marked with a number) shows the key differences
|
120 |
+
between the input text and the source.
|
121 |
"""
|
122 |
governor_explanation = """
|
123 |
+
FOR GOVERNOR<br>
|
124 |
+
- Green texts are the matched words in the input and source news.<br>
|
125 |
+
- Each highlighted pair (marked with a number) shows the key differences
|
126 |
+
between the input text and the source.
|
127 |
"""
|
128 |
table = """
|
129 |
+
<h5>Comparison between input news and source news:</h5>
|
130 |
+
<table border="1" style="width:100%; text-align:left;">
|
131 |
+
<col style="width: 170px;">
|
132 |
+
<col style="width: 170px;">
|
133 |
+
<col style="width: 30px;">
|
134 |
+
<col style="width: 75px;">
|
135 |
+
<thead>
|
136 |
+
<tr>
|
137 |
+
<th>Input news</th>
|
138 |
+
<th>Source (corresponding URL provided in Originality)</th>
|
139 |
+
<th>Forensic</th>
|
140 |
+
<th>Originality</th>
|
141 |
+
</tr>
|
142 |
+
</thead>
|
143 |
+
<tbody>
|
144 |
+
<tr>
|
145 |
+
<th>TBD</th>
|
146 |
+
<th>TBD</th>
|
147 |
+
<th>TBD</th>
|
148 |
+
<th>TBD</th>
|
149 |
+
</tr>
|
150 |
+
</tbody>
|
151 |
+
</table>
|
152 |
+
|
153 |
+
<style>"""
|
154 |
with gr.Column(scale=2):
|
155 |
with gr.Accordion("NEWS ANALYSIS"):
|
156 |
verification_button = gr.Button("Verify news")
|
|
|
166 |
|
167 |
# Connect events
|
168 |
load_button.click(
|
169 |
+
load_url,
|
170 |
+
inputs=url_input,
|
171 |
+
outputs=[news_title, news_content, news_image],
|
172 |
+
)
|
173 |
+
replace_button.click(
|
174 |
+
replace_text,
|
175 |
+
inputs=[news_title, news_content, replace_df],
|
176 |
+
outputs=[news_title, news_content],
|
177 |
+
)
|
178 |
+
generate_text_button.click(
|
179 |
+
generate_fake_text,
|
180 |
+
inputs=[text_generation_model, news_title, news_content],
|
181 |
+
outputs=[news_title, news_content],
|
182 |
+
)
|
183 |
+
generate_image_button.click(
|
184 |
+
generate_fake_image,
|
185 |
+
inputs=[image_generation_model, news_title],
|
186 |
+
outputs=[news_image],
|
187 |
+
)
|
188 |
+
verification_button.click(
|
189 |
+
generate_analysis_report,
|
190 |
+
inputs=[news_title, news_content, news_image],
|
191 |
+
outputs=[ordinary_user_result, fact_checker_result, governor_result],
|
192 |
+
)
|
193 |
|
194 |
# change Image
|
195 |
+
# url_input.change(load_image, inputs=url_input, outputs=image_view)
|
196 |
+
|
197 |
try:
|
198 |
+
with open(
|
199 |
+
"examples/example_text_real.txt",
|
200 |
+
encoding="utf-8",
|
201 |
+
) as file:
|
202 |
text_real_1 = file.read()
|
203 |
+
with open(
|
204 |
+
"examples/example_text_real_2.txt",
|
205 |
+
encoding="utf-8",
|
206 |
+
) as file:
|
207 |
text_real_2 = file.read()
|
208 |
+
with open(
|
209 |
+
"examples/example_text_LLM_topic.txt",
|
210 |
+
encoding="utf-8",
|
211 |
+
) as file:
|
212 |
text_llm_topic = file.read()
|
213 |
+
with open(
|
214 |
+
"examples/example_text_LLM_modification.txt",
|
215 |
+
encoding="utf-8",
|
216 |
+
) as file:
|
217 |
text_llm_modification = file.read()
|
218 |
+
with open(
|
219 |
+
"examples/example_text_LLM_entities.txt",
|
220 |
+
encoding="utf-8",
|
221 |
+
) as file:
|
222 |
text_llm_entities = file.read()
|
223 |
except FileNotFoundError:
|
224 |
print("File not found.")
|
225 |
except Exception as e:
|
226 |
print(f"An error occurred: {e}")
|
227 |
+
|
228 |
title_1 = "Southampton news: Leeds target striker Cameron Archer."
|
229 |
title_2 = "Southampton news: Leeds target striker Cameron Archer."
|
230 |
title_4 = "Japan pledges support for Ukraine with 100-year pact."
|
231 |
+
|
232 |
image_1 = "examples/example_image_real_1.jpg.webp"
|
233 |
image_2 = "examples/example_image_real_2.jpg.webp"
|
234 |
image_3 = "examples/example_image_real_3.jpg"
|
235 |
image_4 = "examples/example_image_real_4.jpg.webp"
|
236 |
+
|
237 |
gr.Examples(
|
238 |
examples=[
|
239 |
+
[title_1, image_1, text_real_1 + "\n\n" + text_real_2],
|
240 |
+
[title_1, image_2, text_real_1 + "\n\n" + text_llm_modification],
|
241 |
+
[title_1, image_3, text_real_1 + "\n\n" + text_llm_topic],
|
242 |
[title_4, image_4, text_llm_entities],
|
243 |
],
|
244 |
inputs=[news_title, news_image, news_content],
|
|
|
251 |
],
|
252 |
)
|
253 |
|
254 |
+
demo.launch(share=True)
|
application_3.py
ADDED
@@ -0,0 +1,254 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import requests
|
3 |
+
from PIL import Image
|
4 |
+
|
5 |
+
from src.application.content_detection import NewsVerification
|
6 |
+
from src.application.content_generation import (
|
7 |
+
generate_fake_image,
|
8 |
+
generate_fake_text,
|
9 |
+
replace_text,
|
10 |
+
)
|
11 |
+
from src.application.url_reader import URLReader
|
12 |
+
|
13 |
+
AZURE_TEXT_MODEL = ["gpt-4o-mini", "gpt-4o"]
|
14 |
+
AZURE_IMAGE_MODEL = ["dall-e-3", "Stable Diffusion (not supported)"]
|
15 |
+
|
16 |
+
|
17 |
+
def load_url(url):
|
18 |
+
"""
|
19 |
+
Load content from the given URL.
|
20 |
+
"""
|
21 |
+
content = URLReader(url)
|
22 |
+
image = None
|
23 |
+
header = {
|
24 |
+
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36", # noqa: E501
|
25 |
+
}
|
26 |
+
try:
|
27 |
+
response = requests.get(
|
28 |
+
url,
|
29 |
+
headers=header,
|
30 |
+
stream=True,
|
31 |
+
)
|
32 |
+
response.raise_for_status() # Raise an exception for bad status codes
|
33 |
+
|
34 |
+
image_response = requests.get(content.top_image, stream=True)
|
35 |
+
try:
|
36 |
+
image = Image.open(image_response.raw)
|
37 |
+
except OSError as e:
|
38 |
+
print(f"Error loading image from {content.top_image}: {e}")
|
39 |
+
|
40 |
+
except (requests.exceptions.RequestException, FileNotFoundError) as e:
|
41 |
+
print(f"Error fetching image: {e}")
|
42 |
+
|
43 |
+
return content.title, content.text, image
|
44 |
+
|
45 |
+
|
def generate_analysis_report(
    news_title: str,
    news_content: str,
    news_image: Image,
):
    """
    Run the full verification pipeline on one news item.

    Loads the title, body, and image into a fresh ``NewsVerification``
    instance, performs the analysis, and returns the three rendered
    report tables (ordinary user, fact checker, governor).
    """
    verifier = NewsVerification()
    verifier.load_news(news_title, news_content, news_image)
    verifier.generate_analysis_report()
    return verifier.analyze_details()
55 |
+
|
56 |
+
|
# Define the GUI
#
# Layout: a two-column row. The left column holds the inputs (URL loader,
# hidden content-generation settings, hidden term-replacement table, and the
# editable news fields); the right column holds the three per-audience
# analysis tabs. Event wiring and the example gallery follow the layout.
with gr.Blocks() as demo:
    gr.Markdown("# NEWS VERIFICATION")

    with gr.Row():
        # SETTINGS
        with gr.Column(scale=1):
            with gr.Accordion("1. Enter a URL"):
                url_input = gr.Textbox(
                    label="",
                    show_label=False,
                    value="",
                )
                load_button = gr.Button("Load URL")

            # Hidden by default: internal tooling for generating fake content.
            with gr.Accordion(
                "2. Select content-generation models",
                open=True,
                visible=False,
            ):
                with gr.Row():
                    text_generation_model = gr.Dropdown(
                        choices=AZURE_TEXT_MODEL,
                        label="Text-generation model",
                    )
                    image_generation_model = gr.Dropdown(
                        choices=AZURE_IMAGE_MODEL,
                        label="Image-generation model",
                    )
                generate_text_button = gr.Button("Generate text")
                generate_image_button = gr.Button("Generate image")

            # Hidden by default: bulk find/replace over title and content.
            with gr.Accordion(
                "3. Replace any terms",
                open=True,
                visible=False,
            ):
                replace_df = gr.Dataframe(
                    headers=["Find what:", "Replace with:"],
                    datatype=["str", "str"],
                    row_count=(1, "dynamic"),
                    col_count=(2, "fixed"),
                    interactive=True,
                )
                replace_button = gr.Button("Replace all")

            # GENERATED CONTENT
            with gr.Accordion("Input News"):
                news_title = gr.Textbox(label="Title", value="")
                news_image = gr.Image(label="Image", type="filepath")
                news_content = gr.Textbox(label="Content", value="", lines=13)

        # NEWS ANALYSIS REPORT
        ordinary_user_explanation = """
        FOR ORDINARY USER<br>
        - Green texts are the matched words in the input and source news.<br>
        - Each highlighted pair (marked with a number) shows the key differences
        between the input text and the source.
        """
        fact_checker_explanation = """
        FOR FACT CHECKER<br>
        - Green texts are the matched words in the input and source news.<br>
        - Each highlighted pair (marked with a number) shows the key differences
        between the input text and the source.
        """
        governor_explanation = """
        FOR GOVERNOR<br>
        - Green texts are the matched words in the input and source news.<br>
        - Each highlighted pair (marked with a number) shows the key differences
        between the input text and the source.
        """
        # Placeholder table shown before the first verification run.
        table = """
        <h5>Comparison between input news and source news:</h5>
        <table border="1" style="width:100%; text-align:left;">
            <col style="width: 170px;">
            <col style="width: 170px;">
            <col style="width: 30px;">
            <col style="width: 75px;">
        <thead>
            <tr>
                <th>Input news</th>
                <th>Source (corresponding URL provided in Originality)</th>
                <th>Forensic</th>
                <th>Originality</th>
            </tr>
        </thead>
        <tbody>
            <tr>
                <th>TBD</th>
                <th>TBD</th>
                <th>TBD</th>
                <th>TBD</th>
            </tr>
        </tbody>
        </table>

        <style>"""
        with gr.Column(scale=2):
            with gr.Accordion("NEWS ANALYSIS"):
                verification_button = gr.Button("Verify news")
                # Fixed typo: was "Orinary User".
                with gr.Tab("Ordinary User"):
                    gr.HTML(ordinary_user_explanation)
                    ordinary_user_result = gr.HTML(table)
                with gr.Tab("Fact Checker"):
                    gr.HTML(fact_checker_explanation)
                    fact_checker_result = gr.HTML(table)
                with gr.Tab("Governor"):
                    gr.HTML(governor_explanation)
                    governor_result = gr.HTML(table)

    # Connect events
    load_button.click(
        load_url,
        inputs=url_input,
        outputs=[news_title, news_content, news_image],
    )
    replace_button.click(
        replace_text,
        inputs=[news_title, news_content, replace_df],
        outputs=[news_title, news_content],
    )
    generate_text_button.click(
        generate_fake_text,
        inputs=[text_generation_model, news_title, news_content],
        outputs=[news_title, news_content],
    )
    generate_image_button.click(
        generate_fake_image,
        inputs=[image_generation_model, news_title],
        outputs=[news_image],
    )
    verification_button.click(
        generate_analysis_report,
        inputs=[news_title, news_content, news_image],
        outputs=[ordinary_user_result, fact_checker_result, governor_result],
    )

    # change Image
    # url_input.change(load_image, inputs=url_input, outputs=image_view)

    # Pre-initialize so a missing example file degrades to an empty example
    # instead of a NameError when gr.Examples is built below.
    text_real_1 = ""
    text_real_2 = ""
    text_llm_topic = ""
    text_llm_modification = ""
    text_llm_entities = ""
    try:
        with open(
            "examples/example_text_real.txt",
            encoding="utf-8",
        ) as file:
            text_real_1 = file.read()
        with open(
            "examples/example_text_real_2.txt",
            encoding="utf-8",
        ) as file:
            text_real_2 = file.read()
        with open(
            "examples/example_text_LLM_topic.txt",
            encoding="utf-8",
        ) as file:
            text_llm_topic = file.read()
        with open(
            "examples/example_text_LLM_modification.txt",
            encoding="utf-8",
        ) as file:
            text_llm_modification = file.read()
        with open(
            "examples/example_text_LLM_entities.txt",
            encoding="utf-8",
        ) as file:
            text_llm_entities = file.read()
    except FileNotFoundError:
        print("File not found.")
    except Exception as e:
        print(f"An error occurred: {e}")

    title_1 = "Southampton news: Leeds target striker Cameron Archer."
    title_4 = "Japan pledges support for Ukraine with 100-year pact."

    image_1 = "examples/example_image_real_1.jpg.webp"
    image_2 = "examples/example_image_real_2.jpg.webp"
    image_3 = "examples/example_image_real_3.jpg"
    image_4 = "examples/example_image_real_4.jpg.webp"

    gr.Examples(
        examples=[
            [title_1, image_1, text_real_1 + "\n\n" + text_real_2],
            [title_1, image_2, text_real_1 + "\n\n" + text_llm_modification],
            [title_1, image_3, text_real_1 + "\n\n" + text_llm_topic],
            [title_4, image_4, text_llm_entities],
        ],
        inputs=[news_title, news_image, news_content],
        label="Examples",
        example_labels=[
            "2 real news",
            "1 real news + 1 LLM modification-based news",
            "1 real news + 1 LLM topic-based news",
            "1 LLM changed-entities news",
        ],
    )

demo.launch(share=True)
examples/example_text_LLM_entities.txt
CHANGED
@@ -1 +1 @@
|
|
1 |
-
Shigeru Ishiba has pledged to put Ukraine in the "strongest possible position" on a trip to Kyiv where he signed a "landmark" 100-year pact with the war-stricken country. The prime minister's visit on Sunday was at one point marked by loud blasts and air raid sirens after a reported Russian drone attack was intercepted by Ukraine's defence systems. Acknowledging the "hello" from Russia, Volodymyr Zelensky said Ukraine would send its own "hello back". An estimated two millions people have been killed or wounded in the war so far. As the invasion reaches the end of its third year, Ukraine is losing territory in the north. Zelensky praised the Japan's commitment on Sunday, amid wider concerns that the US President-elect Donald Trump, who is set to take office on Monday, could potentially reduce aid.
|
|
|
1 |
+
Shigeru Ishiba has pledged to put Ukraine in the "strongest possible position" on a trip to Kyiv where he signed a "landmark" 100-year pact with the war-stricken country. The prime minister's visit on Sunday was at one point marked by loud blasts and air raid sirens after a reported Russian drone attack was intercepted by Ukraine's defence systems. Acknowledging the "hello" from Russia, Volodymyr Zelensky said Ukraine would send its own "hello back". An estimated two millions people have been killed or wounded in the war so far. As the invasion reaches the end of its third year, Ukraine is losing territory in the north. Zelensky praised the Japan's commitment on Sunday, amid wider concerns that the US President-elect Donald Trump, who is set to take office on Monday, could potentially reduce aid.
|
examples/example_text_LLM_modification.txt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
-
Miguel Almiron has permanently rejoined Atlanta United from Newcastle United for Β£8m. Almiron made 223 appearances for Newcastle, scoring 30 goals, but recently struggled for a starting place under Eddie Howe.
|
2 |
-
He made a substitute appearance and waved farewell to fans in Newcastle's recent loss against Southampton.
|
3 |
-
Almiron played a key role in Newcastle reaching the Carabao Cup final and their Premier League top-four finish in 2020-21, and scored against Paris St-Germain in the Champions League.
|
|
|
1 |
+
Miguel Almiron has permanently rejoined Atlanta United from Newcastle United for Β£8m. Almiron made 223 appearances for Newcastle, scoring 30 goals, but recently struggled for a starting place under Eddie Howe.
|
2 |
+
He made a substitute appearance and waved farewell to fans in Newcastle's recent loss against Southampton.
|
3 |
+
Almiron played a key role in Newcastle reaching the Carabao Cup final and their Premier League top-four finish in 2020-21, and scored against Paris St-Germain in the Champions League.
|
examples/example_text_LLM_topic.txt
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
-
The January transfer window is in full swing, with clubs across the globe scrambling to strengthen their squads for the remainder of the season.
|
2 |
-
Premier League giants Manchester City have reportedly made a substantial bid for highly-rated midfielder Enzo Fernandez.
|
3 |
-
Meanwhile, struggling Serie A side Sampdoria are looking to bolster their attack with the loan signing of veteran striker Fabio Quagliarella.
|
4 |
-
Rumors are swirling around a potential move for Brazilian wonderkid Endrick to Real Madrid.
|
5 |
-
The transfer window officially closes on January 31st, leaving clubs with limited time to finalize their deals.
|
6 |
-
Fans are eagerly awaiting to see which teams make the shrewdest moves in this crucial period.
|
|
|
1 |
+
The January transfer window is in full swing, with clubs across the globe scrambling to strengthen their squads for the remainder of the season.
|
2 |
+
Premier League giants Manchester City have reportedly made a substantial bid for highly-rated midfielder Enzo Fernandez.
|
3 |
+
Meanwhile, struggling Serie A side Sampdoria are looking to bolster their attack with the loan signing of veteran striker Fabio Quagliarella.
|
4 |
+
Rumors are swirling around a potential move for Brazilian wonderkid Endrick to Real Madrid.
|
5 |
+
The transfer window officially closes on January 31st, leaving clubs with limited time to finalize their deals.
|
6 |
+
Fans are eagerly awaiting to see which teams make the shrewdest moves in this crucial period.
|
examples/example_text_real.txt
CHANGED
@@ -2,4 +2,4 @@ Leeds are targeting a move for Southampton striker Cameron Archer with early tal
|
|
2 |
|
3 |
It is unclear whether a deal can be achieved but the 23-year-old is open to a move before deadline day.
|
4 |
|
5 |
-
Other options are believed to be on the table as Archer seeks a guaranteed starting role after increasingly finding himself on the bench under recently appointed Saints manager Ivan Juric.
|
|
|
2 |
|
3 |
It is unclear whether a deal can be achieved but the 23-year-old is open to a move before deadline day.
|
4 |
|
5 |
+
Other options are believed to be on the table as Archer seeks a guaranteed starting role after increasingly finding himself on the bench under recently appointed Saints manager Ivan Juric.
|
examples/example_text_real_2.txt
CHANGED
@@ -4,4 +4,4 @@ The resignation brings a long political chapter to an end. Trudeau has been in o
|
|
4 |
|
5 |
Trudeau said he will remain at the helm until a new Liberal leader is selected.
|
6 |
|
7 |
-
But many questions remain for the party, including who will take over and how they will manage a looming federal election. So what happens next?
|
|
|
4 |
|
5 |
Trudeau said he will remain at the helm until a new Liberal leader is selected.
|
6 |
|
7 |
+
But many questions remain for the party, including who will take over and how they will manage a looming federal election. So what happens next?
|
gpt_test.py
CHANGED
@@ -1,34 +1,30 @@
|
|
1 |
import os
|
|
|
2 |
from dotenv import load_dotenv
|
3 |
from openai import AzureOpenAI
|
|
|
4 |
load_dotenv()
|
5 |
-
AZURE_OPENAI_API_KEY = os.getenv(
|
6 |
-
AZURE_OPENAI_ENDPOINT = os.getenv(
|
7 |
-
AZURE_OPENAI_API_VERSION = os.getenv(
|
8 |
|
9 |
azure_client = AzureOpenAI(
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
)
|
14 |
-
|
15 |
-
deplopment_name = "o1-mini"
|
16 |
TEXT_PROMPT = """
|
17 |
replace Ukraine with Denmark:
|
18 |
|
19 |
-
"Sir Keir Starmer has pledged to put Ukraine in the "strongest
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
Acknowledging the "hello" from Russia, Volodymyr Zelensky said Ukraine would send its own "hello back".
|
24 |
-
|
25 |
-
An estimated one million people have been killed or wounded in the war so far. As the invasion reaches the end of its third year, Ukraine is losing territory in the east.
|
26 |
-
|
27 |
-
Zelensky praised the UK's commitment on Thursday, amid wider concerns that the US President-elect Donald Trump, who is set to take office on Monday, could potentially reduce aid."
|
28 |
"""
|
29 |
-
|
30 |
response = azure_client.chat.completions.create(
|
31 |
-
model=deplopment_name,
|
32 |
messages=[
|
33 |
# {"role": "system", "content": "You are a helpful assistant."},
|
34 |
{"role": "user", "content": TEXT_PROMPT},
|
@@ -36,4 +32,4 @@ response = azure_client.chat.completions.create(
|
|
36 |
# max_tokens=512,
|
37 |
# temperature=0,
|
38 |
)
|
39 |
-
print(response.choices[0].message.content)
|
|
|
1 |
import os
|
2 |
+
|
3 |
from dotenv import load_dotenv
|
4 |
from openai import AzureOpenAI
|
5 |
+
|
6 |
load_dotenv()
|
7 |
+
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
|
8 |
+
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
|
9 |
+
AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION")
|
10 |
|
11 |
azure_client = AzureOpenAI(
|
12 |
+
azure_endpoint="https://quoc-nguyen.openai.azure.com/",
|
13 |
+
api_key=AZURE_OPENAI_API_KEY,
|
14 |
+
api_version="2024-05-01-preview",
|
15 |
)
|
16 |
+
|
17 |
+
deplopment_name = "o1-mini" # or "gpt-4o"
|
18 |
TEXT_PROMPT = """
|
19 |
replace Ukraine with Denmark:
|
20 |
|
21 |
+
"Sir Keir Starmer has pledged to put Ukraine in the "strongest
|
22 |
+
possible position" on a trip to Kyiv where he signed a
|
23 |
+
"landmark" 100-year pact with the war-stricken country.
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
"""
|
25 |
+
|
26 |
response = azure_client.chat.completions.create(
|
27 |
+
model=deplopment_name, # model = "deployment_name".
|
28 |
messages=[
|
29 |
# {"role": "system", "content": "You are a helpful assistant."},
|
30 |
{"role": "user", "content": TEXT_PROMPT},
|
|
|
32 |
# max_tokens=512,
|
33 |
# temperature=0,
|
34 |
)
|
35 |
+
print(response.choices[0].message.content)
|
requirements.txt
CHANGED
@@ -28,4 +28,4 @@ pytorch_lightning
|
|
28 |
torchvision
|
29 |
torch
|
30 |
lightning
|
31 |
-
timm
|
|
|
28 |
torchvision
|
29 |
torch
|
30 |
lightning
|
31 |
+
timm
|
src/application/content_detection.py
CHANGED
@@ -1,49 +1,63 @@
|
|
1 |
from difflib import SequenceMatcher
|
2 |
|
3 |
import pandas as pd
|
4 |
-
|
5 |
-
from src.application.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
from src.application.text.helper import extract_equal_text
|
7 |
from src.application.text.model_detection import detect_text_by_ai_model
|
8 |
from src.application.text.preprocessing import split_into_paragraphs
|
9 |
-
from src.application.text.search_detection import
|
|
|
|
|
|
|
|
|
10 |
|
11 |
|
12 |
-
class NewsVerification
|
13 |
def __init__(self):
|
14 |
self.news_text = ""
|
15 |
self.news_title = ""
|
16 |
self.news_content = ""
|
17 |
self.news_image = ""
|
18 |
-
|
19 |
-
self.text_prediction_label:list[str] = []
|
20 |
-
self.text_prediction_score:list[float] = []
|
21 |
-
self.text_referent_url:list[str] = []
|
22 |
-
self.image_prediction_label:list[str] = []
|
23 |
-
self.image_prediction_score:list[str] = []
|
24 |
-
self.image_referent_url:list[str] = []
|
25 |
self.news_prediction_label = ""
|
26 |
self.news_prediction_score = -1
|
27 |
-
|
28 |
-
self.found_img_url:list[str] = [
|
29 |
-
self.aligned_sentences:list[dict] = []
|
30 |
-
self.aligned_sentences_df:pd.DataFrame = pd.DataFrame(
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
self.
|
43 |
-
|
44 |
-
self.
|
|
|
|
|
45 |
self.entities_with_colors = []
|
46 |
-
|
47 |
def load_news(self, news_title, news_content, news_image):
|
48 |
self.news_text = news_title + "\n\n" + news_content
|
49 |
self.news_title = news_title
|
@@ -52,13 +66,14 @@ class NewsVerification():
|
|
52 |
|
53 |
def determine_text_origin(self):
|
54 |
"""
|
55 |
-
Determines the origin of the given text based on paraphrasing detection
|
|
|
56 |
|
57 |
Args:
|
58 |
text: The input text to be analyzed.
|
59 |
|
60 |
Returns:
|
61 |
-
str: The predicted origin of the text:
|
62 |
- "HUMAN": If the text is likely written by a human.
|
63 |
- "MACHINE": If the text is likely generated by a machine.
|
64 |
"""
|
@@ -75,7 +90,7 @@ class NewsVerification():
|
|
75 |
"similarity": None,
|
76 |
"paraphrase": False,
|
77 |
"url": "",
|
78 |
-
|
79 |
|
80 |
for index, sentence in enumerate(input_sentences):
|
81 |
print(f"-------index = {index}-------")
|
@@ -83,10 +98,20 @@ class NewsVerification():
|
|
83 |
|
84 |
if current_index >= len(input_sentences):
|
85 |
break
|
86 |
-
if
|
|
|
|
|
|
|
|
|
87 |
continue
|
88 |
-
|
89 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
90 |
|
91 |
if paraphrase is False:
|
92 |
# add sentence to ai_sentence
|
@@ -95,19 +120,27 @@ class NewsVerification():
|
|
95 |
ai_sentence["input_sentence"] += sentence
|
96 |
if index == len(input_sentences) - 1:
|
97 |
# add ai_sentences to align_sentences
|
98 |
-
text_prediction_label, text_prediction_score =
|
|
|
|
|
99 |
ai_sentence["label"] = text_prediction_label
|
100 |
ai_sentence["similarity"] = text_prediction_score
|
101 |
self.aligned_sentences.append(ai_sentence)
|
102 |
else:
|
103 |
if previous_paraphrase is False or previous_paraphrase is None:
|
104 |
# add ai_sentences to align_sentences
|
105 |
-
if ai_sentence[
|
106 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
107 |
ai_sentence["label"] = text_prediction_label
|
108 |
ai_sentence["similarity"] = text_prediction_score
|
109 |
self.aligned_sentences.append(ai_sentence)
|
110 |
-
|
111 |
# reset
|
112 |
ai_sentence = {
|
113 |
"input_sentence": "",
|
@@ -116,7 +149,7 @@ class NewsVerification():
|
|
116 |
"similarity": None,
|
117 |
"paraphrase": False,
|
118 |
"url": "",
|
119 |
-
|
120 |
|
121 |
# add searched_sentences to align_sentences
|
122 |
if searched_sentences["input_sentence"] != "":
|
@@ -125,20 +158,21 @@ class NewsVerification():
|
|
125 |
searched_sentences["label"] = "HUMAN"
|
126 |
else:
|
127 |
searched_sentences["label"] = "MACHINE"
|
128 |
-
|
129 |
self.aligned_sentences.append(searched_sentences)
|
130 |
|
131 |
previous_paraphrase = paraphrase
|
132 |
|
133 |
def determine_text_origin_2(self):
|
134 |
"""
|
135 |
-
Determines the origin of the given text based on paraphrasing detection
|
|
|
136 |
|
137 |
Args:
|
138 |
text: The input text to be analyzed.
|
139 |
|
140 |
Returns:
|
141 |
-
str: The predicted origin of the text:
|
142 |
- "HUMAN": If the text is likely written by a human.
|
143 |
- "MACHINE": If the text is likely generated by a machine.
|
144 |
"""
|
@@ -150,17 +184,17 @@ class NewsVerification():
|
|
150 |
self.aligned_sentences_df = pd.concat(
|
151 |
[self.aligned_sentences_df, pd.DataFrame([{}])],
|
152 |
ignore_index=False,
|
153 |
-
|
154 |
|
155 |
for index, sentence in enumerate(input_sentences):
|
156 |
print(f"-------index = {index}-------")
|
157 |
print(f"current_sentence = {input_sentences[index]}")
|
158 |
-
|
159 |
if self.aligned_sentences_df["url"] is not None:
|
160 |
continue
|
161 |
|
162 |
self.aligned_sentences_df, img_urls = find_text_source(
|
163 |
-
input_sentences[index],
|
164 |
self.aligned_sentences_df,
|
165 |
)
|
166 |
|
@@ -171,25 +205,30 @@ class NewsVerification():
|
|
171 |
self.image_prediction_score = 0.0
|
172 |
self.image_referent_url = None
|
173 |
return
|
174 |
-
|
175 |
for image in self.found_img_url:
|
176 |
-
print(f"\tfound_img_url: {image}")
|
177 |
-
matched_url, similarity = detect_image_from_news_image(
|
|
|
|
|
|
|
178 |
if matched_url is not None:
|
179 |
print(f"matching image: {matched_url}\nsimilarity: {similarity}\n")
|
180 |
self.image_prediction_label = "HUMAN"
|
181 |
self.image_prediction_score = similarity
|
182 |
self.image_referent_url = matched_url
|
183 |
return
|
184 |
-
|
185 |
-
matched_url, similarity = detect_image_by_reverse_search(
|
|
|
|
|
186 |
if matched_url is not None:
|
187 |
print(f"matching image: {matched_url}\nsimilarity: {similarity}\n")
|
188 |
self.image_prediction_label = "HUMAN"
|
189 |
self.image_prediction_score = similarity
|
190 |
self.image_referent_url = matched_url
|
191 |
return
|
192 |
-
|
193 |
detected_label, score = detect_image_by_ai_model(self.news_image)
|
194 |
if detected_label:
|
195 |
print(f"detected_label: {detected_label} ({score})")
|
@@ -197,7 +236,7 @@ class NewsVerification():
|
|
197 |
self.image_prediction_score = score
|
198 |
self.image_referent_url = None
|
199 |
return
|
200 |
-
|
201 |
self.image_prediction_label = "UNKNOWN"
|
202 |
self.image_prediction_score = 50
|
203 |
self.image_referent_url = None
|
@@ -209,15 +248,17 @@ class NewsVerification():
|
|
209 |
text_prediction_score = 50
|
210 |
else:
|
211 |
text_prediction_score = self.text_prediction_score
|
212 |
-
|
213 |
if self.image_prediction_label == "MACHINE":
|
214 |
image_prediction_score = 100 - self.image_prediction_score
|
215 |
elif self.image_prediction_label == "UNKNOWN":
|
216 |
image_prediction_score = 50
|
217 |
else:
|
218 |
image_prediction_score = self.image_prediction_score
|
219 |
-
|
220 |
-
news_prediction_score = (
|
|
|
|
|
221 |
if news_prediction_score > 50:
|
222 |
self.news_prediction_score = news_prediction_score
|
223 |
self.news_prediction_label = "HUMAN"
|
@@ -234,37 +275,25 @@ class NewsVerification():
|
|
234 |
for index, aligned_sentence in enumerate(self.aligned_sentences):
|
235 |
# Get entity-words (in pair) with colors
|
236 |
entities_with_colors = highlight_entities(
|
237 |
-
|
238 |
-
|
239 |
-
|
240 |
self.aligned_sentences[index]["entities"] = entities_with_colors
|
241 |
-
|
242 |
ordinary_user_table = self.create_ordinary_user_table()
|
243 |
fact_checker_table = self.create_fact_checker_table()
|
244 |
governor_table = self.create_governor_table()
|
245 |
|
246 |
return ordinary_user_table, fact_checker_table, governor_table
|
247 |
-
|
248 |
def get_text_urls(self):
|
249 |
return set(self.text_referent_url)
|
250 |
|
251 |
-
|
252 |
def compare_sentences(self, sentence_1, sentence_2, position, color):
|
253 |
"""
|
254 |
-
Compares two sentences and identifies common phrases,
|
255 |
-
|
256 |
-
Args:
|
257 |
-
sentence_1: The first sentence (string).
|
258 |
-
sentence_2: The second sentence (string).
|
259 |
|
260 |
-
Returns:
|
261 |
-
A list of dictionaries, where each dictionary represents a common phrase and contains:
|
262 |
-
- "phrase": The common phrase (string).
|
263 |
-
- "start_1": The starting index of the phrase in sentence_1 (int).
|
264 |
-
- "end_1": The ending index of the phrase in sentence_1 (int).
|
265 |
-
- "start_2": The starting index of the phrase in sentence_2 (int).
|
266 |
-
- "end_2": The ending index of the phrase in sentence_2 (int).
|
267 |
-
Returns an empty list if no common phrases are found. Handles edge cases like empty strings.
|
268 |
"""
|
269 |
|
270 |
if not sentence_1 or not sentence_2: # Handle empty strings
|
@@ -280,16 +309,20 @@ class NewsVerification():
|
|
280 |
start_2 = block.b
|
281 |
end_2 = block.b + block.size
|
282 |
|
283 |
-
phrase = sentence_1[
|
284 |
-
|
285 |
-
|
286 |
-
|
287 |
-
|
288 |
-
|
289 |
-
|
290 |
-
|
291 |
-
|
292 |
-
|
|
|
|
|
|
|
|
|
293 |
position += len(sentence_1)
|
294 |
return common_phrases, position
|
295 |
|
@@ -297,17 +330,17 @@ class NewsVerification():
|
|
297 |
rows = []
|
298 |
max_length = 30 # TODO: put this in configuration
|
299 |
rows.append(self.format_image_fact_checker_row(max_length))
|
300 |
-
|
301 |
for aligned_sentence in self.aligned_sentences:
|
302 |
if "input_sentence" not in aligned_sentence:
|
303 |
continue
|
304 |
-
|
305 |
# Get index of equal phrases in input and source sentences
|
306 |
equal_idx_1, equal_idx_2 = extract_equal_text(
|
307 |
-
|
308 |
-
|
309 |
-
|
310 |
-
|
311 |
# Get entity-words (in pair) with colors
|
312 |
# entities_with_colors = highlight_entities(
|
313 |
# aligned_sentence["input_sentence"],
|
@@ -320,32 +353,35 @@ class NewsVerification():
|
|
320 |
equal_idx_1,
|
321 |
equal_idx_2,
|
322 |
aligned_sentence["entities"],
|
323 |
-
]
|
324 |
)
|
325 |
|
326 |
for row in self.fact_checker_table:
|
327 |
formatted_row = self.format_text_fact_checker_row(row, max_length)
|
328 |
rows.append(formatted_row)
|
329 |
-
|
330 |
table = "\n".join(rows)
|
331 |
return f"""
|
332 |
-
|
333 |
-
|
334 |
-
|
335 |
-
|
336 |
-
|
337 |
-
|
338 |
-
|
339 |
-
|
340 |
-
|
341 |
-
|
342 |
-
</
|
343 |
-
<
|
344 |
-
|
345 |
-
|
346 |
-
|
347 |
-
|
348 |
-
|
|
|
|
|
|
|
349 |
"""
|
350 |
|
351 |
def format_text_fact_checker_row(self, row, max_length=30):
|
@@ -354,50 +390,76 @@ class NewsVerification():
|
|
354 |
return ""
|
355 |
if row[0]["matched_sentence"] != "": # source is not empty
|
356 |
# highlight entities
|
357 |
-
input_sentence, highlight_idx_input = apply_highlight(
|
358 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
359 |
entity_count = len(row[3])
|
360 |
-
|
361 |
# Color overlapping words
|
362 |
-
input_sentence = self.color_text(
|
363 |
-
|
364 |
-
|
365 |
-
|
366 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
367 |
else:
|
368 |
input_sentence = row[0]["input_sentence"]
|
369 |
source_sentence = row[0]["matched_sentence"]
|
370 |
|
371 |
label = row[0]["label"]
|
372 |
score = row[0]["similarity"]
|
373 |
-
|
374 |
-
url = row[0]["url"]
|
375 |
short_url = self.shorten_url(url, max_length)
|
376 |
source_text_url = f"""<a href="{url}">{short_url}</a>"""
|
377 |
-
|
378 |
entity_count_text = self.get_entity_count_text(entity_count)
|
379 |
-
|
380 |
return f"""
|
381 |
<tr>
|
382 |
<td>{input_sentence}</td>
|
383 |
<td>{source_sentence}</td>
|
384 |
-
<td>{label}<br>({score*100:.2f}%)<br><br>{entity_count_text}</td>
|
385 |
<td>{source_text_url}</td>
|
386 |
</tr>
|
387 |
"""
|
388 |
|
389 |
-
def format_image_fact_checker_row(self, max_length=30):
|
390 |
-
|
391 |
-
if
|
392 |
-
|
|
|
|
|
|
|
393 |
short_url = self.shorten_url(self.image_referent_url, max_length)
|
394 |
-
source_image_url =
|
|
|
|
|
395 |
else:
|
396 |
source_image = "Image not found"
|
397 |
source_image_url = ""
|
398 |
|
399 |
-
return f"""<tr><td>input image</td><td>{source_image}</td><td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td><td>{source_image_url}</td></tr>"""
|
400 |
-
|
401 |
|
402 |
def create_ordinary_user_table(self):
|
403 |
rows = []
|
@@ -405,24 +467,27 @@ class NewsVerification():
|
|
405 |
rows.append(self.format_image_ordinary_user_row(max_length))
|
406 |
rows.append(self.format_text_ordinary_user_row(max_length))
|
407 |
table = "\n".join(rows)
|
408 |
-
|
409 |
return f"""
|
410 |
-
|
411 |
-
|
412 |
-
|
413 |
-
|
414 |
-
|
415 |
-
|
416 |
-
|
417 |
-
|
418 |
-
|
419 |
-
</
|
420 |
-
<
|
421 |
-
|
422 |
-
|
423 |
-
|
424 |
-
|
425 |
-
|
|
|
|
|
|
|
426 |
"""
|
427 |
|
428 |
def format_text_ordinary_user_row(self, max_length=30):
|
@@ -436,152 +501,184 @@ class NewsVerification():
|
|
436 |
continue
|
437 |
input_sentences += row["input_sentence"] + "<br><br>"
|
438 |
label = self.aligned_sentences[index]["label"]
|
439 |
-
|
440 |
-
url = self.aligned_sentences[index]["url"]
|
441 |
short_url = self.shorten_url(url, max_length)
|
442 |
source_text_urls += f"""<a href="{url}">{short_url}</a><br>"""
|
443 |
sentence_count += 1
|
444 |
-
|
445 |
scores, label = self.calculate_score_label()
|
446 |
-
|
447 |
return f"""
|
448 |
<tr>
|
449 |
<td>{input_sentences}</td>
|
450 |
-
<td>{label}<br>({scores*100:.2f}%)</td>
|
451 |
<td>{source_text_urls}</td>
|
452 |
</tr>
|
453 |
"""
|
454 |
|
455 |
-
def format_image_ordinary_user_row(self, max_length=30):
|
456 |
-
|
457 |
-
if
|
458 |
-
|
|
|
|
|
459 |
short_url = self.shorten_url(self.image_referent_url, max_length)
|
460 |
-
source_image_url =
|
|
|
|
|
461 |
else:
|
462 |
# source_image = "Image not found"
|
463 |
source_image_url = ""
|
464 |
|
465 |
-
return f"""<tr><td>input image</td><td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td><td>{source_image_url}</td></tr>"""
|
466 |
-
|
467 |
|
468 |
def create_governor_table(self):
|
469 |
rows = []
|
470 |
max_length = 30 # TODO: put this in configuration
|
471 |
rows.append(self.format_image_governor_row(max_length))
|
472 |
-
|
473 |
for aligned_sentence in self.aligned_sentences:
|
474 |
if "input_sentence" not in aligned_sentence:
|
475 |
continue
|
476 |
-
|
477 |
# Get index of equal phrases in input and source sentences
|
478 |
equal_idx_1, equal_idx_2 = extract_equal_text(
|
479 |
-
|
480 |
-
|
481 |
-
|
482 |
-
|
483 |
# Get entity-words (in pair) with colors
|
484 |
# entities_with_colors = highlight_entities(
|
485 |
# aligned_sentence["input_sentence"],
|
486 |
# aligned_sentence["matched_sentence"],
|
487 |
# )
|
488 |
-
|
489 |
self.governor_table.append(
|
490 |
[
|
491 |
aligned_sentence,
|
492 |
equal_idx_1,
|
493 |
equal_idx_2,
|
494 |
aligned_sentence["entities"],
|
495 |
-
]
|
496 |
)
|
497 |
|
498 |
formatted_row = self.format_text_governor_row(max_length)
|
499 |
rows.append(formatted_row)
|
500 |
-
|
501 |
table = "\n".join(rows)
|
502 |
return f"""
|
503 |
-
|
504 |
-
|
505 |
-
|
506 |
-
|
507 |
-
|
508 |
-
|
509 |
-
|
510 |
-
|
511 |
-
|
512 |
-
|
513 |
-
|
514 |
-
|
515 |
-
|
516 |
-
|
517 |
-
|
518 |
-
|
519 |
-
|
|
|
|
|
|
|
520 |
"""
|
521 |
|
522 |
-
def format_text_governor_row(self,
|
523 |
input_sentences = ""
|
524 |
source_sentences = ""
|
525 |
source_text_urls = ""
|
526 |
label = ""
|
527 |
-
scores = 0
|
528 |
sentence_count = 0
|
529 |
entity_count = 0
|
530 |
for row in self.governor_table:
|
531 |
print(f"governor_row: {row}")
|
532 |
if row[0]["input_sentence"] == "":
|
533 |
continue
|
534 |
-
|
535 |
if row[0]["matched_sentence"] != "": # source is not empty
|
536 |
# highlight entities
|
537 |
-
input_sentence, highlight_idx_input = apply_highlight(
|
538 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
539 |
entity_count += len(row[3])
|
540 |
-
|
541 |
# Color overlapping words
|
542 |
-
input_sentence = self.color_text(
|
543 |
-
|
544 |
-
|
545 |
-
|
546 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
547 |
|
548 |
else:
|
549 |
input_sentence = row[0]["input_sentence"]
|
550 |
source_sentence = row[0]["matched_sentence"]
|
551 |
-
|
552 |
-
# convert score to HUMAN-based score:
|
553 |
input_sentences += input_sentence + "<br><br>"
|
554 |
source_sentences += source_sentence + "<br><br>"
|
555 |
-
|
556 |
-
|
557 |
url = row[0]["url"]
|
558 |
short_url = self.shorten_url(url, max_length)
|
559 |
source_text_urls += f"""<a href="{url}">{short_url}</a><br>"""
|
560 |
sentence_count += 1
|
561 |
-
|
562 |
score, label = self.calculate_score_label()
|
563 |
entity_count_text = self.get_entity_count_text(entity_count)
|
564 |
|
565 |
return f"""
|
566 |
-
|
567 |
-
|
568 |
-
|
569 |
-
|
570 |
-
|
571 |
-
|
572 |
"""
|
573 |
|
574 |
def format_image_governor_row(self, max_length=30):
|
575 |
-
if
|
576 |
-
|
|
|
|
|
|
|
577 |
short_url = self.shorten_url(self.image_referent_url, max_length)
|
578 |
-
source_image_url =
|
|
|
|
|
579 |
else:
|
580 |
source_image = "Image not found"
|
581 |
source_image_url = ""
|
582 |
|
583 |
-
return f"""<tr><td>input image</td><td>{source_image}</td><td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td><td>{source_image_url}</td></tr>"""
|
584 |
-
|
585 |
|
586 |
def get_entity_count_text(self, entity_count):
|
587 |
if entity_count <= 0:
|
@@ -595,52 +692,51 @@ class NewsVerification():
|
|
595 |
def shorten_url(self, url, max_length=30):
|
596 |
if url is None:
|
597 |
return ""
|
598 |
-
|
599 |
if len(url) > max_length:
|
600 |
short_url = url[:max_length] + "..."
|
601 |
else:
|
602 |
short_url = url
|
603 |
return short_url
|
604 |
|
605 |
-
|
606 |
def color_text(self, text, colored_idx, highlighted_idx):
|
607 |
paragraph = ""
|
608 |
words = text.split()
|
609 |
-
|
610 |
starts, ends = self.extract_starts_ends(colored_idx)
|
611 |
starts, ends = self.filter_indices(starts, ends, highlighted_idx)
|
612 |
|
613 |
previous_end = 0
|
614 |
for start, end in zip(starts, ends):
|
615 |
paragraph += " ".join(words[previous_end:start])
|
616 |
-
|
617 |
equal_words = " ".join(words[start:end])
|
618 |
paragraph += f" <span style='color:#00FF00;'>{equal_words}</span> "
|
619 |
-
|
620 |
previous_end = end
|
621 |
-
|
622 |
-
# Some left words due to the punctuation separated from
|
623 |
# the highlighting text
|
624 |
equal_words = " ".join(words[previous_end:])
|
625 |
print(f"starts_2: {previous_end}")
|
626 |
-
print(f"ends_2: {len(words)-1}")
|
627 |
print(f"equal_words: {words[previous_end:]}")
|
628 |
paragraph += f" <span style='color:#00FF00;'>{equal_words}</span> "
|
629 |
|
630 |
return paragraph
|
631 |
-
|
632 |
def extract_starts_ends(self, colored_idx):
|
633 |
starts = []
|
634 |
ends = []
|
635 |
for index in colored_idx:
|
636 |
-
starts.append(index[
|
637 |
-
ends.append(index[
|
638 |
return starts, ends
|
639 |
-
|
640 |
-
|
641 |
def filter_indices(self, starts, ends, ignore_indices):
|
642 |
"""
|
643 |
-
Filters start and end indices to exclude any indices present in the
|
|
|
644 |
|
645 |
Args:
|
646 |
starts: A list of starting indices.
|
@@ -648,23 +744,26 @@ class NewsVerification():
|
|
648 |
ignore_indices: A list of indices to exclude.
|
649 |
|
650 |
Returns:
|
651 |
-
A tuple
|
652 |
-
Returns empty lists if the input is invalid
|
|
|
653 |
Prints error messages for invalid input.
|
654 |
-
|
655 |
Examples:
|
656 |
starts = [0, 5, 10]
|
657 |
ends = [3, 7, 12]
|
658 |
ignore_indices = [1, 2, 11, 17]
|
659 |
-
|
660 |
-
# Output:
|
661 |
starts = [0, 3, 5, 10, 12]
|
662 |
ends = [0, 3, 7, 10, 12]
|
663 |
|
664 |
"""
|
665 |
|
666 |
if len(starts) != len(ends):
|
667 |
-
print(
|
|
|
|
|
668 |
return [], []
|
669 |
|
670 |
filtered_starts = []
|
@@ -675,10 +774,11 @@ class NewsVerification():
|
|
675 |
end = ends[i]
|
676 |
|
677 |
if end < start:
|
678 |
-
print(
|
|
|
|
|
679 |
return [], []
|
680 |
|
681 |
-
|
682 |
start_end = list(range(start, end + 1, 1))
|
683 |
start_end = list(set(start_end) - set(ignore_indices))
|
684 |
new_start, new_end = self.extract_sequences(start_end)
|
@@ -690,7 +790,7 @@ class NewsVerification():
|
|
690 |
def extract_sequences(self, numbers):
|
691 |
if len(numbers) == 1:
|
692 |
return [numbers[0]], [numbers[0]]
|
693 |
-
|
694 |
numbers.sort()
|
695 |
starts = []
|
696 |
ends = []
|
@@ -699,21 +799,21 @@ class NewsVerification():
|
|
699 |
start = number
|
700 |
end = number
|
701 |
continue
|
702 |
-
|
703 |
-
if number - 1 == numbers[i-1]:
|
704 |
end = number
|
705 |
else:
|
706 |
starts.append(start)
|
707 |
ends.append(end + 1)
|
708 |
start = number
|
709 |
end = number
|
710 |
-
|
711 |
if i == len(numbers) - 1:
|
712 |
starts.append(start)
|
713 |
ends.append(end + 1)
|
714 |
-
|
715 |
return starts, ends
|
716 |
-
|
717 |
def calculate_score_label(self):
|
718 |
human_score = []
|
719 |
machine_score = []
|
@@ -726,7 +826,7 @@ class NewsVerification():
|
|
726 |
elif sentence["label"] == "MACHINE":
|
727 |
machine_score.append(1 - sentence["similarity"])
|
728 |
machine_flag = True
|
729 |
-
|
730 |
if machine_flag is True and len(machine_score) > 0:
|
731 |
# average value of machine_score
|
732 |
machine_score_avg = sum(machine_score) / len(machine_score)
|
@@ -739,5 +839,3 @@ class NewsVerification():
|
|
739 |
return human_score_avg, "HUMAN"
|
740 |
else:
|
741 |
return 0, "UNKNOWN"
|
742 |
-
|
743 |
-
|
|
|
1 |
from difflib import SequenceMatcher
|
2 |
|
3 |
import pandas as pd
|
4 |
+
|
5 |
+
from src.application.image.image_detection import (
|
6 |
+
detect_image_by_ai_model,
|
7 |
+
detect_image_by_reverse_search,
|
8 |
+
detect_image_from_news_image,
|
9 |
+
)
|
10 |
+
from src.application.text.entity import (
|
11 |
+
apply_highlight,
|
12 |
+
highlight_entities,
|
13 |
+
)
|
14 |
from src.application.text.helper import extract_equal_text
|
15 |
from src.application.text.model_detection import detect_text_by_ai_model
|
16 |
from src.application.text.preprocessing import split_into_paragraphs
|
17 |
+
from src.application.text.search_detection import (
|
18 |
+
check_human,
|
19 |
+
detect_text_by_relative_search,
|
20 |
+
find_text_source,
|
21 |
+
)
|
22 |
|
23 |
|
24 |
+
class NewsVerification:
|
25 |
def __init__(self):
|
26 |
self.news_text = ""
|
27 |
self.news_title = ""
|
28 |
self.news_content = ""
|
29 |
self.news_image = ""
|
30 |
+
|
31 |
+
self.text_prediction_label: list[str] = []
|
32 |
+
self.text_prediction_score: list[float] = []
|
33 |
+
self.text_referent_url: list[str] = []
|
34 |
+
self.image_prediction_label: list[str] = []
|
35 |
+
self.image_prediction_score: list[str] = []
|
36 |
+
self.image_referent_url: list[str] = []
|
37 |
self.news_prediction_label = ""
|
38 |
self.news_prediction_score = -1
|
39 |
+
|
40 |
+
self.found_img_url: list[str] = []
|
41 |
+
self.aligned_sentences: list[dict] = []
|
42 |
+
self.aligned_sentences_df: pd.DataFrame = pd.DataFrame(
|
43 |
+
columns=[
|
44 |
+
"input_sentence",
|
45 |
+
"matched_sentence",
|
46 |
+
"label",
|
47 |
+
"similarity",
|
48 |
+
"paraphrase",
|
49 |
+
"url",
|
50 |
+
"group",
|
51 |
+
"entities",
|
52 |
+
],
|
53 |
+
)
|
54 |
+
self.is_paraphrased: list[bool] = []
|
55 |
+
|
56 |
+
self.ordinary_user_table: list = []
|
57 |
+
self.fact_checker_table: list = []
|
58 |
+
self.governor_table: list = []
|
59 |
self.entities_with_colors = []
|
60 |
+
|
61 |
def load_news(self, news_title, news_content, news_image):
|
62 |
self.news_text = news_title + "\n\n" + news_content
|
63 |
self.news_title = news_title
|
|
|
66 |
|
67 |
def determine_text_origin(self):
|
68 |
"""
|
69 |
+
Determines the origin of the given text based on paraphrasing detection
|
70 |
+
and human authorship analysis.
|
71 |
|
72 |
Args:
|
73 |
text: The input text to be analyzed.
|
74 |
|
75 |
Returns:
|
76 |
+
str: The predicted origin of the text:
|
77 |
- "HUMAN": If the text is likely written by a human.
|
78 |
- "MACHINE": If the text is likely generated by a machine.
|
79 |
"""
|
|
|
90 |
"similarity": None,
|
91 |
"paraphrase": False,
|
92 |
"url": "",
|
93 |
+
}
|
94 |
|
95 |
for index, sentence in enumerate(input_sentences):
|
96 |
print(f"-------index = {index}-------")
|
|
|
98 |
|
99 |
if current_index >= len(input_sentences):
|
100 |
break
|
101 |
+
if (
|
102 |
+
current_index > index
|
103 |
+
and index != 0
|
104 |
+
and index != len(input_sentences) - 1
|
105 |
+
):
|
106 |
continue
|
107 |
+
|
108 |
+
(
|
109 |
+
paraphrase,
|
110 |
+
text_url,
|
111 |
+
searched_sentences,
|
112 |
+
img_urls,
|
113 |
+
current_index,
|
114 |
+
) = detect_text_by_relative_search(input_sentences, index)
|
115 |
|
116 |
if paraphrase is False:
|
117 |
# add sentence to ai_sentence
|
|
|
120 |
ai_sentence["input_sentence"] += sentence
|
121 |
if index == len(input_sentences) - 1:
|
122 |
# add ai_sentences to align_sentences
|
123 |
+
text_prediction_label, text_prediction_score = (
|
124 |
+
detect_text_by_ai_model(ai_sentence["input_sentence"])
|
125 |
+
)
|
126 |
ai_sentence["label"] = text_prediction_label
|
127 |
ai_sentence["similarity"] = text_prediction_score
|
128 |
self.aligned_sentences.append(ai_sentence)
|
129 |
else:
|
130 |
if previous_paraphrase is False or previous_paraphrase is None:
|
131 |
# add ai_sentences to align_sentences
|
132 |
+
if ai_sentence[
|
133 |
+
"input_sentence"
|
134 |
+
] != "" or current_index >= len(input_sentences):
|
135 |
+
text_prediction_label, text_prediction_score = (
|
136 |
+
detect_text_by_ai_model(
|
137 |
+
ai_sentence["input_sentence"],
|
138 |
+
)
|
139 |
+
)
|
140 |
ai_sentence["label"] = text_prediction_label
|
141 |
ai_sentence["similarity"] = text_prediction_score
|
142 |
self.aligned_sentences.append(ai_sentence)
|
143 |
+
|
144 |
# reset
|
145 |
ai_sentence = {
|
146 |
"input_sentence": "",
|
|
|
149 |
"similarity": None,
|
150 |
"paraphrase": False,
|
151 |
"url": "",
|
152 |
+
}
|
153 |
|
154 |
# add searched_sentences to align_sentences
|
155 |
if searched_sentences["input_sentence"] != "":
|
|
|
158 |
searched_sentences["label"] = "HUMAN"
|
159 |
else:
|
160 |
searched_sentences["label"] = "MACHINE"
|
161 |
+
|
162 |
self.aligned_sentences.append(searched_sentences)
|
163 |
|
164 |
previous_paraphrase = paraphrase
|
165 |
|
166 |
def determine_text_origin_2(self):
|
167 |
"""
|
168 |
+
Determines the origin of the given text based on paraphrasing detection
|
169 |
+
and human authorship analysis.
|
170 |
|
171 |
Args:
|
172 |
text: The input text to be analyzed.
|
173 |
|
174 |
Returns:
|
175 |
+
str: The predicted origin of the text:
|
176 |
- "HUMAN": If the text is likely written by a human.
|
177 |
- "MACHINE": If the text is likely generated by a machine.
|
178 |
"""
|
|
|
184 |
self.aligned_sentences_df = pd.concat(
|
185 |
[self.aligned_sentences_df, pd.DataFrame([{}])],
|
186 |
ignore_index=False,
|
187 |
+
)
|
188 |
|
189 |
for index, sentence in enumerate(input_sentences):
|
190 |
print(f"-------index = {index}-------")
|
191 |
print(f"current_sentence = {input_sentences[index]}")
|
192 |
+
|
193 |
if self.aligned_sentences_df["url"] is not None:
|
194 |
continue
|
195 |
|
196 |
self.aligned_sentences_df, img_urls = find_text_source(
|
197 |
+
input_sentences[index],
|
198 |
self.aligned_sentences_df,
|
199 |
)
|
200 |
|
|
|
205 |
self.image_prediction_score = 0.0
|
206 |
self.image_referent_url = None
|
207 |
return
|
208 |
+
|
209 |
for image in self.found_img_url:
|
210 |
+
print(f"\tfound_img_url: {image}")
|
211 |
+
matched_url, similarity = detect_image_from_news_image(
|
212 |
+
self.news_image,
|
213 |
+
self.found_img_url,
|
214 |
+
)
|
215 |
if matched_url is not None:
|
216 |
print(f"matching image: {matched_url}\nsimilarity: {similarity}\n")
|
217 |
self.image_prediction_label = "HUMAN"
|
218 |
self.image_prediction_score = similarity
|
219 |
self.image_referent_url = matched_url
|
220 |
return
|
221 |
+
|
222 |
+
matched_url, similarity = detect_image_by_reverse_search(
|
223 |
+
self.news_image,
|
224 |
+
)
|
225 |
if matched_url is not None:
|
226 |
print(f"matching image: {matched_url}\nsimilarity: {similarity}\n")
|
227 |
self.image_prediction_label = "HUMAN"
|
228 |
self.image_prediction_score = similarity
|
229 |
self.image_referent_url = matched_url
|
230 |
return
|
231 |
+
|
232 |
detected_label, score = detect_image_by_ai_model(self.news_image)
|
233 |
if detected_label:
|
234 |
print(f"detected_label: {detected_label} ({score})")
|
|
|
236 |
self.image_prediction_score = score
|
237 |
self.image_referent_url = None
|
238 |
return
|
239 |
+
|
240 |
self.image_prediction_label = "UNKNOWN"
|
241 |
self.image_prediction_score = 50
|
242 |
self.image_referent_url = None
|
|
|
248 |
text_prediction_score = 50
|
249 |
else:
|
250 |
text_prediction_score = self.text_prediction_score
|
251 |
+
|
252 |
if self.image_prediction_label == "MACHINE":
|
253 |
image_prediction_score = 100 - self.image_prediction_score
|
254 |
elif self.image_prediction_label == "UNKNOWN":
|
255 |
image_prediction_score = 50
|
256 |
else:
|
257 |
image_prediction_score = self.image_prediction_score
|
258 |
+
|
259 |
+
news_prediction_score = (
|
260 |
+
text_prediction_score + image_prediction_score
|
261 |
+
) / 2
|
262 |
if news_prediction_score > 50:
|
263 |
self.news_prediction_score = news_prediction_score
|
264 |
self.news_prediction_label = "HUMAN"
|
|
|
275 |
for index, aligned_sentence in enumerate(self.aligned_sentences):
|
276 |
# Get entity-words (in pair) with colors
|
277 |
entities_with_colors = highlight_entities(
|
278 |
+
aligned_sentence["input_sentence"],
|
279 |
+
aligned_sentence["matched_sentence"],
|
280 |
+
)
|
281 |
self.aligned_sentences[index]["entities"] = entities_with_colors
|
282 |
+
|
283 |
ordinary_user_table = self.create_ordinary_user_table()
|
284 |
fact_checker_table = self.create_fact_checker_table()
|
285 |
governor_table = self.create_governor_table()
|
286 |
|
287 |
return ordinary_user_table, fact_checker_table, governor_table
|
288 |
+
|
289 |
def get_text_urls(self):
|
290 |
return set(self.text_referent_url)
|
291 |
|
|
|
292 |
def compare_sentences(self, sentence_1, sentence_2, position, color):
|
293 |
"""
|
294 |
+
Compares two sentences and identifies common phrases,
|
295 |
+
outputting their start and end positions.
|
|
|
|
|
|
|
296 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
297 |
"""
|
298 |
|
299 |
if not sentence_1 or not sentence_2: # Handle empty strings
|
|
|
309 |
start_2 = block.b
|
310 |
end_2 = block.b + block.size
|
311 |
|
312 |
+
phrase = sentence_1[
|
313 |
+
start_1:end_1
|
314 |
+
] # Or sentence_2[start_2:end_2], they are the same
|
315 |
+
|
316 |
+
common_phrases.append(
|
317 |
+
{
|
318 |
+
"phrase": phrase,
|
319 |
+
"start_1": start_1 + position,
|
320 |
+
"end_1": end_1 + position,
|
321 |
+
"start_2": start_2,
|
322 |
+
"end_2": end_2,
|
323 |
+
"color": color,
|
324 |
+
},
|
325 |
+
)
|
326 |
position += len(sentence_1)
|
327 |
return common_phrases, position
|
328 |
|
|
|
330 |
rows = []
|
331 |
max_length = 30 # TODO: put this in configuration
|
332 |
rows.append(self.format_image_fact_checker_row(max_length))
|
333 |
+
|
334 |
for aligned_sentence in self.aligned_sentences:
|
335 |
if "input_sentence" not in aligned_sentence:
|
336 |
continue
|
337 |
+
|
338 |
# Get index of equal phrases in input and source sentences
|
339 |
equal_idx_1, equal_idx_2 = extract_equal_text(
|
340 |
+
aligned_sentence["input_sentence"],
|
341 |
+
aligned_sentence["matched_sentence"],
|
342 |
+
)
|
343 |
+
|
344 |
# Get entity-words (in pair) with colors
|
345 |
# entities_with_colors = highlight_entities(
|
346 |
# aligned_sentence["input_sentence"],
|
|
|
353 |
equal_idx_1,
|
354 |
equal_idx_2,
|
355 |
aligned_sentence["entities"],
|
356 |
+
],
|
357 |
)
|
358 |
|
359 |
for row in self.fact_checker_table:
|
360 |
formatted_row = self.format_text_fact_checker_row(row, max_length)
|
361 |
rows.append(formatted_row)
|
362 |
+
|
363 |
table = "\n".join(rows)
|
364 |
return f"""
|
365 |
+
<h5>Comparison between input news and source news:</h5>
|
366 |
+
<table border="1" style="width:100%; text-align:left;">
|
367 |
+
<col style="width: 170px;">
|
368 |
+
<col style="width: 170px;">
|
369 |
+
<col style="width: 30px;">
|
370 |
+
<col style="width: 75px;">
|
371 |
+
<thead>
|
372 |
+
<tr>
|
373 |
+
<th>Input news</th>
|
374 |
+
<th>Source (corresponding URL provided in Originality)</th>
|
375 |
+
<th>Forensic</th>
|
376 |
+
<th>Originality</th>
|
377 |
+
</tr>
|
378 |
+
</thead>
|
379 |
+
<tbody>
|
380 |
+
{table}
|
381 |
+
</tbody>
|
382 |
+
</table>
|
383 |
+
|
384 |
+
<style>
|
385 |
"""
|
386 |
|
387 |
def format_text_fact_checker_row(self, row, max_length=30):
|
|
|
390 |
return ""
|
391 |
if row[0]["matched_sentence"] != "": # source is not empty
|
392 |
# highlight entities
|
393 |
+
input_sentence, highlight_idx_input = apply_highlight(
|
394 |
+
row[0]["input_sentence"],
|
395 |
+
row[3],
|
396 |
+
"input",
|
397 |
+
)
|
398 |
+
source_sentence, highlight_idx_source = apply_highlight(
|
399 |
+
row[0]["matched_sentence"],
|
400 |
+
row[3],
|
401 |
+
"source",
|
402 |
+
)
|
403 |
entity_count = len(row[3])
|
404 |
+
|
405 |
# Color overlapping words
|
406 |
+
input_sentence = self.color_text(
|
407 |
+
input_sentence,
|
408 |
+
row[1],
|
409 |
+
highlight_idx_input,
|
410 |
+
) # text, index of highlight words
|
411 |
+
source_sentence = self.color_text(
|
412 |
+
source_sentence,
|
413 |
+
row[2],
|
414 |
+
highlight_idx_source,
|
415 |
+
) # text, index of highlight words
|
416 |
+
|
417 |
+
input_sentence = input_sentence.replace(
|
418 |
+
"span_style",
|
419 |
+
"span style",
|
420 |
+
).replace("1px_4px", "1px 4px")
|
421 |
+
source_sentence = source_sentence.replace(
|
422 |
+
"span_style",
|
423 |
+
"span style",
|
424 |
+
).replace("1px_4px", "1px 4px")
|
425 |
else:
|
426 |
input_sentence = row[0]["input_sentence"]
|
427 |
source_sentence = row[0]["matched_sentence"]
|
428 |
|
429 |
label = row[0]["label"]
|
430 |
score = row[0]["similarity"]
|
431 |
+
|
432 |
+
url = row[0]["url"] #
|
433 |
short_url = self.shorten_url(url, max_length)
|
434 |
source_text_url = f"""<a href="{url}">{short_url}</a>"""
|
435 |
+
|
436 |
entity_count_text = self.get_entity_count_text(entity_count)
|
437 |
+
|
438 |
return f"""
|
439 |
<tr>
|
440 |
<td>{input_sentence}</td>
|
441 |
<td>{source_sentence}</td>
|
442 |
+
<td>{label}<br>({score * 100:.2f}%)<br><br>{entity_count_text}</td> # noqa: E501
|
443 |
<td>{source_text_url}</td>
|
444 |
</tr>
|
445 |
"""
|
446 |
|
447 |
+
def format_image_fact_checker_row(self, max_length=30):
|
448 |
+
|
449 |
+
if (
|
450 |
+
self.image_referent_url is not None
|
451 |
+
or self.image_referent_url != ""
|
452 |
+
):
|
453 |
+
source_image = f"""<img src="{self.image_referent_url}" width="200" height="150">""" # noqa: E501
|
454 |
short_url = self.shorten_url(self.image_referent_url, max_length)
|
455 |
+
source_image_url = (
|
456 |
+
f"""<a href="{self.image_referent_url}">{short_url}</a>"""
|
457 |
+
)
|
458 |
else:
|
459 |
source_image = "Image not found"
|
460 |
source_image_url = ""
|
461 |
|
462 |
+
return f"""<tr><td>input image</td><td>{source_image}</td><td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td><td>{source_image_url}</td></tr>""" # noqa: E501
|
|
|
463 |
|
464 |
def create_ordinary_user_table(self):
|
465 |
rows = []
|
|
|
467 |
rows.append(self.format_image_ordinary_user_row(max_length))
|
468 |
rows.append(self.format_text_ordinary_user_row(max_length))
|
469 |
table = "\n".join(rows)
|
470 |
+
|
471 |
return f"""
|
472 |
+
<h5>Comparison between input news and source news:</h5>
|
473 |
+
<table border="1" style="width:100%; text-align:left; border-collapse:collapse;"> # noqa: E501
|
474 |
+
<col style="width: 170px;">
|
475 |
+
<col style="width: 170px;">
|
476 |
+
<col style="width: 30px;">
|
477 |
+
<col style="width: 75px;">
|
478 |
+
<thead>
|
479 |
+
<tr>
|
480 |
+
<th>Input news</th>
|
481 |
+
<th>Forensic</th>
|
482 |
+
<th>Originality</th>
|
483 |
+
</tr>
|
484 |
+
</thead>
|
485 |
+
<tbody>
|
486 |
+
{table}
|
487 |
+
</tbody>
|
488 |
+
</table>
|
489 |
+
|
490 |
+
<style>
|
491 |
"""
|
492 |
|
493 |
def format_text_ordinary_user_row(self, max_length=30):
|
|
|
501 |
continue
|
502 |
input_sentences += row["input_sentence"] + "<br><br>"
|
503 |
label = self.aligned_sentences[index]["label"]
|
504 |
+
|
505 |
+
url = self.aligned_sentences[index]["url"] #
|
506 |
short_url = self.shorten_url(url, max_length)
|
507 |
source_text_urls += f"""<a href="{url}">{short_url}</a><br>"""
|
508 |
sentence_count += 1
|
509 |
+
|
510 |
scores, label = self.calculate_score_label()
|
511 |
+
|
512 |
return f"""
|
513 |
<tr>
|
514 |
<td>{input_sentences}</td>
|
515 |
+
<td>{label}<br>({scores * 100:.2f}%)</td>
|
516 |
<td>{source_text_urls}</td>
|
517 |
</tr>
|
518 |
"""
|
519 |
|
520 |
+
def format_image_ordinary_user_row(self, max_length=30):
|
521 |
+
|
522 |
+
if (
|
523 |
+
self.image_referent_url is not None
|
524 |
+
or self.image_referent_url != ""
|
525 |
+
):
|
526 |
short_url = self.shorten_url(self.image_referent_url, max_length)
|
527 |
+
source_image_url = (
|
528 |
+
f"""<a href="{self.image_referent_url}">{short_url}</a>"""
|
529 |
+
)
|
530 |
else:
|
531 |
# source_image = "Image not found"
|
532 |
source_image_url = ""
|
533 |
|
534 |
+
return f"""<tr><td>input image</td><td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td><td>{source_image_url}</td></tr>""" # noqa: E501
|
|
|
535 |
|
536 |
def create_governor_table(self):
|
537 |
rows = []
|
538 |
max_length = 30 # TODO: put this in configuration
|
539 |
rows.append(self.format_image_governor_row(max_length))
|
540 |
+
|
541 |
for aligned_sentence in self.aligned_sentences:
|
542 |
if "input_sentence" not in aligned_sentence:
|
543 |
continue
|
544 |
+
|
545 |
# Get index of equal phrases in input and source sentences
|
546 |
equal_idx_1, equal_idx_2 = extract_equal_text(
|
547 |
+
aligned_sentence["input_sentence"],
|
548 |
+
aligned_sentence["matched_sentence"],
|
549 |
+
)
|
550 |
+
|
551 |
# Get entity-words (in pair) with colors
|
552 |
# entities_with_colors = highlight_entities(
|
553 |
# aligned_sentence["input_sentence"],
|
554 |
# aligned_sentence["matched_sentence"],
|
555 |
# )
|
556 |
+
|
557 |
self.governor_table.append(
|
558 |
[
|
559 |
aligned_sentence,
|
560 |
equal_idx_1,
|
561 |
equal_idx_2,
|
562 |
aligned_sentence["entities"],
|
563 |
+
],
|
564 |
)
|
565 |
|
566 |
formatted_row = self.format_text_governor_row(max_length)
|
567 |
rows.append(formatted_row)
|
568 |
+
|
569 |
table = "\n".join(rows)
|
570 |
return f"""
|
571 |
+
<h5>Comparison between input news and source news:</h5>
|
572 |
+
<table border="1" style="width:100%; text-align:left;">
|
573 |
+
<col style="width: 170px;">
|
574 |
+
<col style="width: 170px;">
|
575 |
+
<col style="width: 30px;">
|
576 |
+
<col style="width: 75px;">
|
577 |
+
<thead>
|
578 |
+
<tr>
|
579 |
+
<th>Input news</th>
|
580 |
+
<th>Source (corresponding URL provided in Originality)</th>
|
581 |
+
<th>Forensic</th>
|
582 |
+
<th>Originality</th>
|
583 |
+
</tr>
|
584 |
+
</thead>
|
585 |
+
<tbody>
|
586 |
+
{table}
|
587 |
+
</tbody>
|
588 |
+
</table>
|
589 |
+
|
590 |
+
<style>
|
591 |
"""
|
592 |
|
593 |
+
def format_text_governor_row(self, max_length=30):
|
594 |
input_sentences = ""
|
595 |
source_sentences = ""
|
596 |
source_text_urls = ""
|
597 |
label = ""
|
|
|
598 |
sentence_count = 0
|
599 |
entity_count = 0
|
600 |
for row in self.governor_table:
|
601 |
print(f"governor_row: {row}")
|
602 |
if row[0]["input_sentence"] == "":
|
603 |
continue
|
604 |
+
|
605 |
if row[0]["matched_sentence"] != "": # source is not empty
|
606 |
# highlight entities
|
607 |
+
input_sentence, highlight_idx_input = apply_highlight(
|
608 |
+
row[0]["input_sentence"],
|
609 |
+
row[3],
|
610 |
+
"input",
|
611 |
+
entity_count,
|
612 |
+
)
|
613 |
+
source_sentence, highlight_idx_source = apply_highlight(
|
614 |
+
row[0]["matched_sentence"],
|
615 |
+
row[3],
|
616 |
+
"source",
|
617 |
+
entity_count,
|
618 |
+
)
|
619 |
entity_count += len(row[3])
|
620 |
+
|
621 |
# Color overlapping words
|
622 |
+
input_sentence = self.color_text(
|
623 |
+
input_sentence,
|
624 |
+
row[1],
|
625 |
+
highlight_idx_input,
|
626 |
+
) # text, index of highlight words
|
627 |
+
source_sentence = self.color_text(
|
628 |
+
source_sentence,
|
629 |
+
row[2],
|
630 |
+
highlight_idx_source,
|
631 |
+
) # text, index of highlight words
|
632 |
+
|
633 |
+
input_sentence = input_sentence.replace(
|
634 |
+
"span_style",
|
635 |
+
"span style",
|
636 |
+
).replace("1px_4px", "1px 4px")
|
637 |
+
source_sentence = source_sentence.replace(
|
638 |
+
"span_style",
|
639 |
+
"span style",
|
640 |
+
).replace("1px_4px", "1px 4px")
|
641 |
|
642 |
else:
|
643 |
input_sentence = row[0]["input_sentence"]
|
644 |
source_sentence = row[0]["matched_sentence"]
|
645 |
+
|
646 |
+
# convert score to HUMAN-based score:
|
647 |
input_sentences += input_sentence + "<br><br>"
|
648 |
source_sentences += source_sentence + "<br><br>"
|
649 |
+
|
|
|
650 |
url = row[0]["url"]
|
651 |
short_url = self.shorten_url(url, max_length)
|
652 |
source_text_urls += f"""<a href="{url}">{short_url}</a><br>"""
|
653 |
sentence_count += 1
|
654 |
+
|
655 |
score, label = self.calculate_score_label()
|
656 |
entity_count_text = self.get_entity_count_text(entity_count)
|
657 |
|
658 |
return f"""
|
659 |
+
<tr>
|
660 |
+
<td>{input_sentences}</td>
|
661 |
+
<td>{source_sentences}</td>
|
662 |
+
<td>{label}<br>({score * 100:.2f}%)<br><br>{entity_count_text}</td>
|
663 |
+
<td>{source_text_urls}</td>
|
664 |
+
</tr>
|
665 |
"""
|
666 |
|
667 |
def format_image_governor_row(self, max_length=30):
|
668 |
+
if (
|
669 |
+
self.image_referent_url is not None
|
670 |
+
or self.image_referent_url != ""
|
671 |
+
):
|
672 |
+
source_image = f"""<img src="{self.image_referent_url}" width="200" height="150">""" # noqa: E501
|
673 |
short_url = self.shorten_url(self.image_referent_url, max_length)
|
674 |
+
source_image_url = (
|
675 |
+
f"""<a href="{self.image_referent_url}">{short_url}</a>"""
|
676 |
+
)
|
677 |
else:
|
678 |
source_image = "Image not found"
|
679 |
source_image_url = ""
|
680 |
|
681 |
+
return f"""<tr><td>input image</td><td>{source_image}</td><td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td><td>{source_image_url}</td></tr>""" # noqa: E501
|
|
|
682 |
|
683 |
def get_entity_count_text(self, entity_count):
|
684 |
if entity_count <= 0:
|
|
|
692 |
def shorten_url(self, url, max_length=30):
|
693 |
if url is None:
|
694 |
return ""
|
695 |
+
|
696 |
if len(url) > max_length:
|
697 |
short_url = url[:max_length] + "..."
|
698 |
else:
|
699 |
short_url = url
|
700 |
return short_url
|
701 |
|
|
|
702 |
def color_text(self, text, colored_idx, highlighted_idx):
|
703 |
paragraph = ""
|
704 |
words = text.split()
|
705 |
+
|
706 |
starts, ends = self.extract_starts_ends(colored_idx)
|
707 |
starts, ends = self.filter_indices(starts, ends, highlighted_idx)
|
708 |
|
709 |
previous_end = 0
|
710 |
for start, end in zip(starts, ends):
|
711 |
paragraph += " ".join(words[previous_end:start])
|
712 |
+
|
713 |
equal_words = " ".join(words[start:end])
|
714 |
paragraph += f" <span style='color:#00FF00;'>{equal_words}</span> "
|
715 |
+
|
716 |
previous_end = end
|
717 |
+
|
718 |
+
# Some left words due to the punctuation separated from
|
719 |
# the highlighting text
|
720 |
equal_words = " ".join(words[previous_end:])
|
721 |
print(f"starts_2: {previous_end}")
|
722 |
+
print(f"ends_2: {len(words) - 1}")
|
723 |
print(f"equal_words: {words[previous_end:]}")
|
724 |
paragraph += f" <span style='color:#00FF00;'>{equal_words}</span> "
|
725 |
|
726 |
return paragraph
|
727 |
+
|
728 |
def extract_starts_ends(self, colored_idx):
|
729 |
starts = []
|
730 |
ends = []
|
731 |
for index in colored_idx:
|
732 |
+
starts.append(index["start"])
|
733 |
+
ends.append(index["end"])
|
734 |
return starts, ends
|
735 |
+
|
|
|
736 |
def filter_indices(self, starts, ends, ignore_indices):
|
737 |
"""
|
738 |
+
Filters start and end indices to exclude any indices present in the
|
739 |
+
ignore_indices list.
|
740 |
|
741 |
Args:
|
742 |
starts: A list of starting indices.
|
|
|
744 |
ignore_indices: A list of indices to exclude.
|
745 |
|
746 |
Returns:
|
747 |
+
A tuple of two lists: filtered_starts and filtered_ends.
|
748 |
+
Returns empty lists if the input is invalid
|
749 |
+
or if all ranges are filtered out.
|
750 |
Prints error messages for invalid input.
|
751 |
+
|
752 |
Examples:
|
753 |
starts = [0, 5, 10]
|
754 |
ends = [3, 7, 12]
|
755 |
ignore_indices = [1, 2, 11, 17]
|
756 |
+
|
757 |
+
# Output:
|
758 |
starts = [0, 3, 5, 10, 12]
|
759 |
ends = [0, 3, 7, 10, 12]
|
760 |
|
761 |
"""
|
762 |
|
763 |
if len(starts) != len(ends):
|
764 |
+
print(
|
765 |
+
"Error: The 'starts' and 'ends' lists must have the same length.", # noqa: E501
|
766 |
+
)
|
767 |
return [], []
|
768 |
|
769 |
filtered_starts = []
|
|
|
774 |
end = ends[i]
|
775 |
|
776 |
if end < start:
|
777 |
+
print(
|
778 |
+
f"Error: End index {end} is less than start index {start} at position {i}.", # noqa: E501
|
779 |
+
)
|
780 |
return [], []
|
781 |
|
|
|
782 |
start_end = list(range(start, end + 1, 1))
|
783 |
start_end = list(set(start_end) - set(ignore_indices))
|
784 |
new_start, new_end = self.extract_sequences(start_end)
|
|
|
790 |
def extract_sequences(self, numbers):
|
791 |
if len(numbers) == 1:
|
792 |
return [numbers[0]], [numbers[0]]
|
793 |
+
|
794 |
numbers.sort()
|
795 |
starts = []
|
796 |
ends = []
|
|
|
799 |
start = number
|
800 |
end = number
|
801 |
continue
|
802 |
+
|
803 |
+
if number - 1 == numbers[i - 1]:
|
804 |
end = number
|
805 |
else:
|
806 |
starts.append(start)
|
807 |
ends.append(end + 1)
|
808 |
start = number
|
809 |
end = number
|
810 |
+
|
811 |
if i == len(numbers) - 1:
|
812 |
starts.append(start)
|
813 |
ends.append(end + 1)
|
814 |
+
|
815 |
return starts, ends
|
816 |
+
|
817 |
def calculate_score_label(self):
|
818 |
human_score = []
|
819 |
machine_score = []
|
|
|
826 |
elif sentence["label"] == "MACHINE":
|
827 |
machine_score.append(1 - sentence["similarity"])
|
828 |
machine_flag = True
|
829 |
+
|
830 |
if machine_flag is True and len(machine_score) > 0:
|
831 |
# average value of machine_score
|
832 |
machine_score_avg = sum(machine_score) / len(machine_score)
|
|
|
839 |
return human_score_avg, "HUMAN"
|
840 |
else:
|
841 |
return 0, "UNKNOWN"
|
|
|
|
src/application/content_generation.py
CHANGED
@@ -1,25 +1,27 @@
|
|
1 |
import json
|
|
|
|
|
2 |
import openai
|
3 |
from dotenv import load_dotenv
|
4 |
-
import os
|
5 |
|
6 |
load_dotenv()
|
7 |
-
AZURE_OPENAI_API_KEY = os.getenv(
|
8 |
-
AZURE_OPENAI_ENDPOINT = os.getenv(
|
9 |
-
AZURE_OPENAI_API_VERSION = os.getenv(
|
10 |
|
11 |
client = openai.AzureOpenAI(
|
12 |
-
api_version
|
13 |
-
api_key
|
14 |
-
azure_endpoint
|
15 |
-
|
|
|
16 |
|
17 |
def generate_fake_text(text_generation_model, title, content):
|
18 |
# Generate text using the selected models
|
19 |
-
prompt = """Generate a random fake news tittle in this format:
|
20 |
---
|
21 |
# Title: [Fake Title]
|
22 |
-
# Content:
|
23 |
[Fake Content]
|
24 |
---
|
25 |
"""
|
@@ -32,22 +34,25 @@ def generate_fake_text(text_generation_model, title, content):
|
|
32 |
elif content:
|
33 |
prompt += """base on the following context:
|
34 |
# Content: {news_content}"""
|
35 |
-
|
36 |
# Generate text using the text generation model
|
37 |
-
# Generate text using the selected model
|
38 |
try:
|
39 |
response = client.chat.completions.create(
|
40 |
-
model=text_generation_model,
|
41 |
-
messages
|
|
|
|
|
|
|
|
|
|
|
42 |
)
|
43 |
-
|
44 |
-
print("Response from OpenAI API: ", response.choices[0].message.content)
|
45 |
fake_text = response.choices[0].message.content
|
46 |
|
47 |
except openai.OpenAIError as e:
|
48 |
print(f"Error interacting with OpenAI API: {e}")
|
49 |
-
fake_text =
|
50 |
-
|
51 |
if fake_text != "":
|
52 |
fake_title, fake_content = extract_title_content(fake_text)
|
53 |
return fake_title, fake_content
|
@@ -57,12 +62,12 @@ def extract_title_content(fake_news):
|
|
57 |
"""
|
58 |
Extracts the title and content from the generated fake news string.
|
59 |
|
60 |
-
This function parses a string containing fake news, which is expected
|
61 |
-
a specific format with a title and content section marked by
|
62 |
-
'# Content:' respectively.
|
63 |
|
64 |
Args:
|
65 |
-
fake_news (str): A string containing the generated fake news
|
66 |
|
67 |
Returns:
|
68 |
tuple: A tuple containing two elements:
|
@@ -77,33 +82,36 @@ def extract_title_content(fake_news):
|
|
77 |
title_start_index = fake_news.find("# Title: ") + len("# Title: ")
|
78 |
title_end_index = fake_news.find("\n", title_start_index)
|
79 |
title = fake_news[title_start_index:title_end_index].strip()
|
80 |
-
|
81 |
-
content_start_index = fake_news.find("\n# Content: ") + len(
|
|
|
|
|
82 |
content = fake_news[content_start_index:].strip()
|
83 |
-
|
84 |
return title, content
|
85 |
|
|
|
86 |
def generate_fake_image(model, title):
|
87 |
if len(title) > 0:
|
88 |
IMAGE_PROMPT = f"Generate a random image about {title}"
|
89 |
else:
|
90 |
IMAGE_PROMPT = "Generate a random image"
|
91 |
result = client.images.generate(
|
92 |
-
model="dall-e-3",
|
93 |
prompt=IMAGE_PROMPT,
|
94 |
-
n=1
|
95 |
)
|
96 |
-
image_url = json.loads(result.model_dump_json())[
|
97 |
return image_url
|
98 |
-
|
99 |
-
|
100 |
def replace_text(news_title, news_content, replace_df):
|
101 |
"""
|
102 |
Replaces occurrences in the input text based on the provided DataFrame.
|
103 |
|
104 |
Args:
|
105 |
text: The input text.
|
106 |
-
replace_df: A
|
107 |
|
108 |
Returns:
|
109 |
The text after all replacements have been made.
|
@@ -113,4 +121,4 @@ def replace_text(news_title, news_content, replace_df):
|
|
113 |
replace_with = row["Replace with:"]
|
114 |
news_content = news_content.replace(find_what, replace_with)
|
115 |
news_title = news_title.replace(find_what, replace_with)
|
116 |
-
return news_title, news_content
|
|
|
1 |
import json
|
2 |
+
import os
|
3 |
+
|
4 |
import openai
|
5 |
from dotenv import load_dotenv
|
|
|
6 |
|
7 |
load_dotenv()
|
8 |
+
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
|
9 |
+
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
|
10 |
+
AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION")
|
11 |
|
12 |
client = openai.AzureOpenAI(
|
13 |
+
api_version=AZURE_OPENAI_API_VERSION,
|
14 |
+
api_key=AZURE_OPENAI_API_KEY,
|
15 |
+
azure_endpoint=AZURE_OPENAI_ENDPOINT,
|
16 |
+
)
|
17 |
+
|
18 |
|
19 |
def generate_fake_text(text_generation_model, title, content):
|
20 |
# Generate text using the selected models
|
21 |
+
prompt = """Generate a random fake news tittle in this format:
|
22 |
---
|
23 |
# Title: [Fake Title]
|
24 |
+
# Content:
|
25 |
[Fake Content]
|
26 |
---
|
27 |
"""
|
|
|
34 |
elif content:
|
35 |
prompt += """base on the following context:
|
36 |
# Content: {news_content}"""
|
37 |
+
|
38 |
# Generate text using the text generation model
|
39 |
+
# Generate text using the selected model
|
40 |
try:
|
41 |
response = client.chat.completions.create(
|
42 |
+
model=text_generation_model,
|
43 |
+
messages=[{"role": "system", "content": prompt}],
|
44 |
+
)
|
45 |
+
|
46 |
+
print(
|
47 |
+
"Response from OpenAI API: ",
|
48 |
+
response.choices[0].message.content,
|
49 |
)
|
|
|
|
|
50 |
fake_text = response.choices[0].message.content
|
51 |
|
52 |
except openai.OpenAIError as e:
|
53 |
print(f"Error interacting with OpenAI API: {e}")
|
54 |
+
fake_text = ""
|
55 |
+
|
56 |
if fake_text != "":
|
57 |
fake_title, fake_content = extract_title_content(fake_text)
|
58 |
return fake_title, fake_content
|
|
|
62 |
"""
|
63 |
Extracts the title and content from the generated fake news string.
|
64 |
|
65 |
+
This function parses a string containing fake news, which is expected
|
66 |
+
to have a specific format with a title and content section marked by
|
67 |
+
'# Title:' and '# Content:' respectively.
|
68 |
|
69 |
Args:
|
70 |
+
fake_news (str): A string containing the generated fake news.
|
71 |
|
72 |
Returns:
|
73 |
tuple: A tuple containing two elements:
|
|
|
82 |
title_start_index = fake_news.find("# Title: ") + len("# Title: ")
|
83 |
title_end_index = fake_news.find("\n", title_start_index)
|
84 |
title = fake_news[title_start_index:title_end_index].strip()
|
85 |
+
|
86 |
+
content_start_index = fake_news.find("\n# Content: ") + len(
|
87 |
+
"\n# Content: ",
|
88 |
+
)
|
89 |
content = fake_news[content_start_index:].strip()
|
90 |
+
|
91 |
return title, content
|
92 |
|
93 |
+
|
94 |
def generate_fake_image(model, title):
|
95 |
if len(title) > 0:
|
96 |
IMAGE_PROMPT = f"Generate a random image about {title}"
|
97 |
else:
|
98 |
IMAGE_PROMPT = "Generate a random image"
|
99 |
result = client.images.generate(
|
100 |
+
model="dall-e-3", # the name of your DALL-E 3 deployment
|
101 |
prompt=IMAGE_PROMPT,
|
102 |
+
n=1,
|
103 |
)
|
104 |
+
image_url = json.loads(result.model_dump_json())["data"][0]["url"]
|
105 |
return image_url
|
106 |
+
|
107 |
+
|
108 |
def replace_text(news_title, news_content, replace_df):
|
109 |
"""
|
110 |
Replaces occurrences in the input text based on the provided DataFrame.
|
111 |
|
112 |
Args:
|
113 |
text: The input text.
|
114 |
+
replace_df: A DF with 2 columns: "find_what" & "replace_with".
|
115 |
|
116 |
Returns:
|
117 |
The text after all replacements have been made.
|
|
|
121 |
replace_with = row["Replace with:"]
|
122 |
news_content = news_content.replace(find_what, replace_with)
|
123 |
news_title = news_title.replace(find_what, replace_with)
|
124 |
+
return news_title, news_content
|
src/application/image/image_comparison.py
CHANGED
@@ -1,9 +1,12 @@
|
|
1 |
-
import requests
|
2 |
from io import BytesIO
|
3 |
-
|
4 |
import imagehash
|
|
|
|
|
|
|
5 |
from src.application.image.search_yandex import YandexReverseImageSearcher
|
6 |
|
|
|
7 |
def get_image_from_url(url):
|
8 |
try:
|
9 |
response = requests.get(url)
|
@@ -12,6 +15,7 @@ def get_image_from_url(url):
|
|
12 |
print(f"Error opening image: {e}")
|
13 |
return None
|
14 |
|
|
|
15 |
def get_image_from_file(file_path):
|
16 |
try:
|
17 |
return Image.open(file_path)
|
@@ -19,33 +23,36 @@ def get_image_from_file(file_path):
|
|
19 |
print(f"Error occurred while opening image from file: {file_path}")
|
20 |
return None
|
21 |
|
|
|
22 |
def standardize_image(image):
|
23 |
# Convert to RGB if needed
|
24 |
-
if image.mode in (
|
25 |
-
background = Image.new(
|
26 |
background.paste(image, mask=image.split()[-1])
|
27 |
image = background
|
28 |
-
elif image.mode !=
|
29 |
-
image = image.convert(
|
30 |
-
|
31 |
# Resize to standard size (e.g. 256x256)
|
32 |
standard_size = (256, 256)
|
33 |
image = image.resize(standard_size)
|
34 |
-
|
35 |
return image
|
36 |
|
|
|
37 |
def compare_images(image1, image2):
|
38 |
# Standardize both images first
|
39 |
img1_std = standardize_image(image1)
|
40 |
img2_std = standardize_image(image2)
|
41 |
-
|
42 |
hash1 = imagehash.average_hash(img1_std)
|
43 |
hash2 = imagehash.average_hash(img2_std)
|
44 |
return hash1 - hash2 # Returns the Hamming distance between the hashes
|
45 |
|
46 |
-
|
47 |
-
|
48 |
-
|
|
|
49 |
# Get the image from URL
|
50 |
url_image = get_image_from_url(image_url)
|
51 |
|
@@ -54,13 +61,13 @@ if __name__ == '__main__':
|
|
54 |
res = rev_img_searcher.search(image_url)
|
55 |
|
56 |
for search_item in res:
|
57 |
-
print(f
|
58 |
# print(f'Site: {search_item.page_url}')
|
59 |
-
print(f
|
60 |
|
61 |
# Compare each search result image with the input image
|
62 |
result_image = get_image_from_url(search_item.image_url)
|
63 |
result_difference = compare_images(result_image, url_image)
|
64 |
print(f"Difference with search result: {result_difference}")
|
65 |
-
if result_difference == 0:
|
66 |
-
break
|
|
|
|
|
1 |
from io import BytesIO
|
2 |
+
|
3 |
import imagehash
|
4 |
+
import requests
|
5 |
+
from PIL import Image
|
6 |
+
|
7 |
from src.application.image.search_yandex import YandexReverseImageSearcher
|
8 |
|
9 |
+
|
10 |
def get_image_from_url(url):
|
11 |
try:
|
12 |
response = requests.get(url)
|
|
|
15 |
print(f"Error opening image: {e}")
|
16 |
return None
|
17 |
|
18 |
+
|
19 |
def get_image_from_file(file_path):
|
20 |
try:
|
21 |
return Image.open(file_path)
|
|
|
23 |
print(f"Error occurred while opening image from file: {file_path}")
|
24 |
return None
|
25 |
|
26 |
+
|
27 |
def standardize_image(image):
|
28 |
# Convert to RGB if needed
|
29 |
+
if image.mode in ("RGBA", "LA"):
|
30 |
+
background = Image.new("RGB", image.size, (255, 255, 255))
|
31 |
background.paste(image, mask=image.split()[-1])
|
32 |
image = background
|
33 |
+
elif image.mode != "RGB":
|
34 |
+
image = image.convert("RGB")
|
35 |
+
|
36 |
# Resize to standard size (e.g. 256x256)
|
37 |
standard_size = (256, 256)
|
38 |
image = image.resize(standard_size)
|
39 |
+
|
40 |
return image
|
41 |
|
42 |
+
|
43 |
def compare_images(image1, image2):
|
44 |
# Standardize both images first
|
45 |
img1_std = standardize_image(image1)
|
46 |
img2_std = standardize_image(image2)
|
47 |
+
|
48 |
hash1 = imagehash.average_hash(img1_std)
|
49 |
hash2 = imagehash.average_hash(img2_std)
|
50 |
return hash1 - hash2 # Returns the Hamming distance between the hashes
|
51 |
|
52 |
+
|
53 |
+
if __name__ == "__main__":
|
54 |
+
image_url = "https://i.pinimg.com/originals/c4/50/35/c450352ac6ea8645ead206721673e8fb.png" # noqa: E501
|
55 |
+
|
56 |
# Get the image from URL
|
57 |
url_image = get_image_from_url(image_url)
|
58 |
|
|
|
61 |
res = rev_img_searcher.search(image_url)
|
62 |
|
63 |
for search_item in res:
|
64 |
+
print(f"Title: {search_item.page_title}")
|
65 |
# print(f'Site: {search_item.page_url}')
|
66 |
+
print(f"Img: {search_item.image_url}\n")
|
67 |
|
68 |
# Compare each search result image with the input image
|
69 |
result_image = get_image_from_url(search_item.image_url)
|
70 |
result_difference = compare_images(result_image, url_image)
|
71 |
print(f"Difference with search result: {result_difference}")
|
72 |
+
if result_difference == 0:
|
73 |
+
break
|
src/application/image/image_detection.py
CHANGED
@@ -1,14 +1,19 @@
|
|
1 |
-
|
2 |
-
|
|
|
|
|
|
|
3 |
from src.application.image.model_detection import image_generation_detection
|
4 |
from src.application.image.search_yandex import yandex_reverse_image_search
|
5 |
|
6 |
|
7 |
def compare_list_of_images(news_image_path, img_urls):
|
8 |
-
news_image = get_image_from_file(
|
|
|
|
|
9 |
if news_image is None:
|
10 |
return None, -1
|
11 |
-
|
12 |
matched_url = ""
|
13 |
max_similarity = 0
|
14 |
for url in img_urls:
|
@@ -20,7 +25,10 @@ def compare_list_of_images(news_image_path, img_urls):
|
|
20 |
referred_image = get_image_from_url(url)
|
21 |
if referred_image is None:
|
22 |
continue
|
23 |
-
distance = compare_images(
|
|
|
|
|
|
|
24 |
similarity = max(100 - distance, 0)
|
25 |
if similarity > max_similarity:
|
26 |
max_similarity = similarity
|
@@ -29,14 +37,17 @@ def compare_list_of_images(news_image_path, img_urls):
|
|
29 |
if max_similarity > 90:
|
30 |
return matched_url, max_similarity
|
31 |
return None, -1
|
32 |
-
|
33 |
-
|
34 |
def detect_image_from_news_image(news_image_path, image_urls):
|
35 |
print("\tFrom news:")
|
36 |
return compare_list_of_images(news_image_path, image_urls)
|
37 |
|
|
|
38 |
def detect_image_by_reverse_search(news_image_path):
|
39 |
-
image_urls = yandex_reverse_image_search(
|
|
|
|
|
40 |
print("\tFrom search engine:")
|
41 |
for url in image_urls:
|
42 |
print(f"\t\t{url}")
|
@@ -47,5 +58,5 @@ def detect_image_by_ai_model(news_image_path):
|
|
47 |
print("\tFrom AI model:")
|
48 |
image_prediction_label, image_confidence = image_generation_detection(
|
49 |
news_image_path,
|
50 |
-
|
51 |
-
return image_prediction_label, image_confidence
|
|
|
1 |
+
from src.application.image.image_comparison import (
|
2 |
+
compare_images,
|
3 |
+
get_image_from_file,
|
4 |
+
get_image_from_url,
|
5 |
+
)
|
6 |
from src.application.image.model_detection import image_generation_detection
|
7 |
from src.application.image.search_yandex import yandex_reverse_image_search
|
8 |
|
9 |
|
10 |
def compare_list_of_images(news_image_path, img_urls):
|
11 |
+
news_image = get_image_from_file(
|
12 |
+
news_image_path,
|
13 |
+
) # TODO: news_image_path is arrays
|
14 |
if news_image is None:
|
15 |
return None, -1
|
16 |
+
|
17 |
matched_url = ""
|
18 |
max_similarity = 0
|
19 |
for url in img_urls:
|
|
|
25 |
referred_image = get_image_from_url(url)
|
26 |
if referred_image is None:
|
27 |
continue
|
28 |
+
distance = compare_images(
|
29 |
+
news_image,
|
30 |
+
referred_image,
|
31 |
+
) # Hamming algorithm
|
32 |
similarity = max(100 - distance, 0)
|
33 |
if similarity > max_similarity:
|
34 |
max_similarity = similarity
|
|
|
37 |
if max_similarity > 90:
|
38 |
return matched_url, max_similarity
|
39 |
return None, -1
|
40 |
+
|
41 |
+
|
42 |
def detect_image_from_news_image(news_image_path, image_urls):
|
43 |
print("\tFrom news:")
|
44 |
return compare_list_of_images(news_image_path, image_urls)
|
45 |
|
46 |
+
|
47 |
def detect_image_by_reverse_search(news_image_path):
|
48 |
+
image_urls = yandex_reverse_image_search(
|
49 |
+
news_image_path,
|
50 |
+
) # url or file_path
|
51 |
print("\tFrom search engine:")
|
52 |
for url in image_urls:
|
53 |
print(f"\t\t{url}")
|
|
|
58 |
print("\tFrom AI model:")
|
59 |
image_prediction_label, image_confidence = image_generation_detection(
|
60 |
news_image_path,
|
61 |
+
)
|
62 |
+
return image_prediction_label, image_confidence
|
src/application/image/model_detection.py
CHANGED
@@ -1,23 +1,39 @@
|
|
1 |
-
|
2 |
-
|
3 |
import pytorch_lightning as pl
|
4 |
-
import timm
|
5 |
import torch
|
6 |
import torch.nn.functional as F
|
7 |
-
import logging
|
8 |
-
from PIL import Image
|
9 |
import torchvision.transforms as transforms
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
from torchvision.transforms import v2
|
11 |
|
12 |
-
logging.basicConfig(
|
13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
|
15 |
class ImageClassifier(pl.LightningModule):
|
16 |
def __init__(self, lmd=0):
|
17 |
super().__init__()
|
18 |
-
self.model = timm.create_model(
|
19 |
-
|
20 |
-
|
|
|
|
|
|
|
|
|
21 |
self.validation_outputs = []
|
22 |
self.lmd = lmd
|
23 |
|
@@ -27,13 +43,13 @@ class ImageClassifier(pl.LightningModule):
|
|
27 |
def training_step(self, batch):
|
28 |
images, labels, _ = batch
|
29 |
outputs = self.forward(images).squeeze()
|
30 |
-
|
31 |
print(f"Shape of outputs (training): {outputs.shape}")
|
32 |
print(f"Shape of labels (training): {labels.shape}")
|
33 |
-
|
34 |
loss = F.binary_cross_entropy_with_logits(outputs, labels.float())
|
35 |
logging.info(f"Training Step - ERM loss: {loss.item()}")
|
36 |
-
loss += self.lmd * (outputs
|
37 |
logging.info(f"Training Step - SD loss: {loss.item()}")
|
38 |
return loss
|
39 |
|
@@ -43,20 +59,30 @@ class ImageClassifier(pl.LightningModule):
|
|
43 |
|
44 |
if outputs.shape == torch.Size([]):
|
45 |
return
|
46 |
-
|
47 |
print(f"Shape of outputs (validation): {outputs.shape}")
|
48 |
print(f"Shape of labels (validation): {labels.shape}")
|
49 |
|
50 |
loss = F.binary_cross_entropy_with_logits(outputs, labels.float())
|
51 |
preds = torch.sigmoid(outputs)
|
52 |
-
self.log(
|
53 |
-
self.log(
|
54 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
55 |
output = {"val_loss": loss, "preds": preds, "labels": labels}
|
56 |
self.validation_outputs.append(output)
|
57 |
logging.info(f"Validation Step - Batch loss: {loss.item()}")
|
58 |
return output
|
59 |
-
|
60 |
def predict_step(self, batch):
|
61 |
images, label, domain = batch
|
62 |
outputs = self.forward(images).squeeze()
|
@@ -67,13 +93,13 @@ class ImageClassifier(pl.LightningModule):
|
|
67 |
if not self.validation_outputs:
|
68 |
logging.warning("No outputs in validation step to process")
|
69 |
return
|
70 |
-
preds = torch.cat([x[
|
71 |
-
labels = torch.cat([x[
|
72 |
if labels.unique().size(0) == 1:
|
73 |
logging.warning("Only one class in validation step")
|
74 |
return
|
75 |
auc_score = roc_auc_score(labels.cpu(), preds.cpu())
|
76 |
-
self.log(
|
77 |
logging.info(f"Validation Epoch End - AUC score: {auc_score}")
|
78 |
self.validation_outputs = []
|
79 |
|
@@ -82,45 +108,46 @@ class ImageClassifier(pl.LightningModule):
|
|
82 |
return optimizer
|
83 |
|
84 |
|
85 |
-
|
86 |
def load_image(image_path, transform=None):
|
87 |
-
image = Image.open(image_path).convert(
|
88 |
-
|
89 |
if transform:
|
90 |
image = transform(image)
|
91 |
-
|
92 |
return image
|
93 |
|
94 |
|
95 |
def predict_single_image(image_path, model, transform=None):
|
96 |
-
image = load_image(image_path, transform)
|
97 |
-
|
98 |
-
device = torch.device(
|
99 |
-
|
100 |
model.to(device)
|
101 |
-
|
102 |
image = image.to(device)
|
103 |
|
104 |
model.eval()
|
105 |
-
|
106 |
with torch.no_grad():
|
107 |
-
image = image.unsqueeze(0)
|
108 |
-
output = model(image).squeeze()
|
109 |
-
prediction = torch.sigmoid(output).item()
|
110 |
-
|
111 |
return prediction
|
112 |
|
113 |
|
114 |
def image_generation_detection(image_path):
|
115 |
model = ImageClassifier.load_from_checkpoint(CHECKPOINT)
|
116 |
|
117 |
-
transform = v2.Compose(
|
118 |
-
|
119 |
-
|
120 |
-
|
|
|
|
|
|
|
|
|
121 |
|
122 |
-
prediction = predict_single_image(image_path, model, transform)
|
123 |
-
|
124 |
result = ""
|
125 |
if prediction <= 0.2:
|
126 |
result += "Most likely human"
|
@@ -134,8 +161,8 @@ def image_generation_detection(image_path):
|
|
134 |
return image_prediction_label, image_confidence
|
135 |
|
136 |
|
137 |
-
if __name__ == "__main__":
|
138 |
image_path = "path_to_your_image.jpg" # Replace with your image path
|
139 |
image_prediction_label, image_confidence = image_generation_detection(
|
140 |
image_path,
|
141 |
-
|
|
|
1 |
+
import logging
|
2 |
+
|
3 |
import pytorch_lightning as pl
|
4 |
+
import timm
|
5 |
import torch
|
6 |
import torch.nn.functional as F
|
|
|
|
|
7 |
import torchvision.transforms as transforms
|
8 |
+
from PIL import Image
|
9 |
+
from sklearn.metrics import roc_auc_score
|
10 |
+
from torchmetrics import (
|
11 |
+
Accuracy,
|
12 |
+
Recall,
|
13 |
+
)
|
14 |
from torchvision.transforms import v2
|
15 |
|
16 |
+
logging.basicConfig(
|
17 |
+
filename="training.log",
|
18 |
+
filemode="w",
|
19 |
+
level=logging.INFO,
|
20 |
+
force=True,
|
21 |
+
)
|
22 |
+
CHECKPOINT = (
|
23 |
+
"models/image_classifier/image-classifier-step=8008-val_loss=0.11.ckpt"
|
24 |
+
)
|
25 |
+
|
26 |
|
27 |
class ImageClassifier(pl.LightningModule):
|
28 |
def __init__(self, lmd=0):
|
29 |
super().__init__()
|
30 |
+
self.model = timm.create_model(
|
31 |
+
"resnet50",
|
32 |
+
pretrained=True,
|
33 |
+
num_classes=1,
|
34 |
+
)
|
35 |
+
self.accuracy = Accuracy(task="binary", threshold=0.5)
|
36 |
+
self.recall = Recall(task="binary", threshold=0.5)
|
37 |
self.validation_outputs = []
|
38 |
self.lmd = lmd
|
39 |
|
|
|
43 |
def training_step(self, batch):
|
44 |
images, labels, _ = batch
|
45 |
outputs = self.forward(images).squeeze()
|
46 |
+
|
47 |
print(f"Shape of outputs (training): {outputs.shape}")
|
48 |
print(f"Shape of labels (training): {labels.shape}")
|
49 |
+
|
50 |
loss = F.binary_cross_entropy_with_logits(outputs, labels.float())
|
51 |
logging.info(f"Training Step - ERM loss: {loss.item()}")
|
52 |
+
loss += self.lmd * (outputs**2).mean() # SD loss penalty
|
53 |
logging.info(f"Training Step - SD loss: {loss.item()}")
|
54 |
return loss
|
55 |
|
|
|
59 |
|
60 |
if outputs.shape == torch.Size([]):
|
61 |
return
|
62 |
+
|
63 |
print(f"Shape of outputs (validation): {outputs.shape}")
|
64 |
print(f"Shape of labels (validation): {labels.shape}")
|
65 |
|
66 |
loss = F.binary_cross_entropy_with_logits(outputs, labels.float())
|
67 |
preds = torch.sigmoid(outputs)
|
68 |
+
self.log("val_loss", loss, prog_bar=True, sync_dist=True)
|
69 |
+
self.log(
|
70 |
+
"val_acc",
|
71 |
+
self.accuracy(preds, labels.int()),
|
72 |
+
prog_bar=True,
|
73 |
+
sync_dist=True,
|
74 |
+
)
|
75 |
+
self.log(
|
76 |
+
"val_recall",
|
77 |
+
self.recall(preds, labels.int()),
|
78 |
+
prog_bar=True,
|
79 |
+
sync_dist=True,
|
80 |
+
)
|
81 |
output = {"val_loss": loss, "preds": preds, "labels": labels}
|
82 |
self.validation_outputs.append(output)
|
83 |
logging.info(f"Validation Step - Batch loss: {loss.item()}")
|
84 |
return output
|
85 |
+
|
86 |
def predict_step(self, batch):
|
87 |
images, label, domain = batch
|
88 |
outputs = self.forward(images).squeeze()
|
|
|
93 |
if not self.validation_outputs:
|
94 |
logging.warning("No outputs in validation step to process")
|
95 |
return
|
96 |
+
preds = torch.cat([x["preds"] for x in self.validation_outputs])
|
97 |
+
labels = torch.cat([x["labels"] for x in self.validation_outputs])
|
98 |
if labels.unique().size(0) == 1:
|
99 |
logging.warning("Only one class in validation step")
|
100 |
return
|
101 |
auc_score = roc_auc_score(labels.cpu(), preds.cpu())
|
102 |
+
self.log("val_auc", auc_score, prog_bar=True, sync_dist=True)
|
103 |
logging.info(f"Validation Epoch End - AUC score: {auc_score}")
|
104 |
self.validation_outputs = []
|
105 |
|
|
|
108 |
return optimizer
|
109 |
|
110 |
|
|
|
111 |
def load_image(image_path, transform=None):
|
112 |
+
image = Image.open(image_path).convert("RGB")
|
113 |
+
|
114 |
if transform:
|
115 |
image = transform(image)
|
116 |
+
|
117 |
return image
|
118 |
|
119 |
|
120 |
def predict_single_image(image_path, model, transform=None):
|
121 |
+
image = load_image(image_path, transform)
|
122 |
+
|
123 |
+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
124 |
+
|
125 |
model.to(device)
|
126 |
+
|
127 |
image = image.to(device)
|
128 |
|
129 |
model.eval()
|
130 |
+
|
131 |
with torch.no_grad():
|
132 |
+
image = image.unsqueeze(0)
|
133 |
+
output = model(image).squeeze()
|
134 |
+
prediction = torch.sigmoid(output).item()
|
135 |
+
|
136 |
return prediction
|
137 |
|
138 |
|
139 |
def image_generation_detection(image_path):
|
140 |
model = ImageClassifier.load_from_checkpoint(CHECKPOINT)
|
141 |
|
142 |
+
transform = v2.Compose(
|
143 |
+
[
|
144 |
+
transforms.ToTensor(),
|
145 |
+
v2.CenterCrop((256, 256)),
|
146 |
+
],
|
147 |
+
)
|
148 |
+
|
149 |
+
prediction = predict_single_image(image_path, model, transform)
|
150 |
|
|
|
|
|
151 |
result = ""
|
152 |
if prediction <= 0.2:
|
153 |
result += "Most likely human"
|
|
|
161 |
return image_prediction_label, image_confidence
|
162 |
|
163 |
|
164 |
+
if __name__ == "__main__":
|
165 |
image_path = "path_to_your_image.jpg" # Replace with your image path
|
166 |
image_prediction_label, image_confidence = image_generation_detection(
|
167 |
image_path,
|
168 |
+
)
|
src/application/image/search_yandex.py
CHANGED
@@ -1,17 +1,22 @@
|
|
1 |
-
import
|
2 |
import logging
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
import requests
|
4 |
-
import json
|
5 |
from bs4 import BeautifulSoup
|
6 |
-
from urllib.parse import quote, urlparse
|
7 |
|
8 |
logging.basicConfig(
|
9 |
-
filename=
|
10 |
level=logging.INFO,
|
11 |
-
format=
|
12 |
-
datefmt=
|
13 |
)
|
14 |
|
|
|
15 |
class SearchResults:
|
16 |
def __init__(self, results):
|
17 |
self.results = results
|
@@ -25,20 +30,29 @@ class SearchResults:
|
|
25 |
output += "---\n"
|
26 |
return output
|
27 |
|
|
|
28 |
class YandexReverseImageSearcher:
|
29 |
def __init__(self):
|
30 |
self.base_url = "https://yandex.ru/images/search"
|
31 |
-
self.headers = {
|
|
|
|
|
32 |
self.retry_count = 3
|
33 |
self.retry_delay = 1
|
34 |
|
35 |
-
def response(
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
self._validate_input(query, image_url)
|
37 |
-
|
38 |
encoded_query = quote(query)
|
39 |
encoded_image_url = quote(image_url)
|
40 |
|
41 |
-
url = f"{self.base_url}?q={encoded_query}&image_url={encoded_image_url}&sbisrc=cr_1_5_2"
|
42 |
|
43 |
all_results = []
|
44 |
start_index = 0
|
@@ -46,14 +60,16 @@ class YandexReverseImageSearcher:
|
|
46 |
while len(all_results) < max_results:
|
47 |
if start_index != 0:
|
48 |
time.sleep(delay)
|
49 |
-
|
50 |
paginated_url = f"{url}&start={start_index}"
|
51 |
|
52 |
response = self._make_request(paginated_url)
|
53 |
if response is None:
|
54 |
break
|
55 |
|
56 |
-
search_results, valid_content = self._parse_search_results(
|
|
|
|
|
57 |
if not valid_content:
|
58 |
logging.warning("Unexpected HTML structure encountered.")
|
59 |
break
|
@@ -65,34 +81,44 @@ class YandexReverseImageSearcher:
|
|
65 |
if data and data not in all_results:
|
66 |
all_results.append(data)
|
67 |
|
68 |
-
start_index +=
|
69 |
|
70 |
if len(all_results) == 0:
|
71 |
-
logging.warning(
|
72 |
-
|
|
|
|
|
73 |
else:
|
74 |
return SearchResults(all_results[:max_results])
|
75 |
-
|
76 |
def _validate_input(self, query: str, image_url: str):
|
77 |
if not query:
|
78 |
-
raise ValueError(
|
|
|
|
|
79 |
if not image_url:
|
80 |
-
raise ValueError(
|
|
|
|
|
81 |
if not self._validate_image_url(image_url):
|
82 |
-
raise ValueError(
|
83 |
-
|
|
|
|
|
84 |
def _validate_image_url(self, url: str) -> bool:
|
85 |
parsed_url = urlparse(url)
|
86 |
path = parsed_url.path.lower()
|
87 |
valid_extensions = (".jpg", ".jpeg", ".png", ".webp")
|
88 |
return any(path.endswith(ext) for ext in valid_extensions)
|
89 |
-
|
90 |
def _make_request(self, url: str):
|
91 |
attempts = 0
|
92 |
while attempts < self.retry_count:
|
93 |
try:
|
94 |
response = requests.get(url, headers=self.headers)
|
95 |
-
if response.headers.get(
|
|
|
|
|
96 |
response.raise_for_status()
|
97 |
return response
|
98 |
else:
|
@@ -110,14 +136,22 @@ class YandexReverseImageSearcher:
|
|
110 |
def _parse_search_results(self, html_content: str):
|
111 |
try:
|
112 |
soup = BeautifulSoup(html_content, "html.parser")
|
113 |
-
return soup.find_all(
|
114 |
except Exception as e:
|
115 |
logging.error(f"Error parsing HTML content: {e}")
|
116 |
return None, False
|
117 |
|
118 |
def _extract_result_data(self, result):
|
119 |
-
link =
|
120 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
121 |
return {"link": link, "title": title} if link and title else {}
|
122 |
|
123 |
|
@@ -131,24 +165,27 @@ def get_image_links(page):
|
|
131 |
Returns:
|
132 |
A list of image URLs.
|
133 |
"""
|
134 |
-
soup = BeautifulSoup(page,
|
135 |
-
|
136 |
# Find the specific section containing image links
|
137 |
-
gallery_data = soup.find(
|
|
|
|
|
|
|
138 |
if gallery_data is None:
|
139 |
return []
|
140 |
-
|
141 |
# Find the container of image links
|
142 |
-
image_links_container = gallery_data.find(
|
143 |
if image_links_container is None:
|
144 |
return []
|
145 |
-
|
146 |
-
data_state = json.loads(image_links_container[
|
147 |
|
148 |
# Extract URLs from each div
|
149 |
image_urls = []
|
150 |
-
for site in data_state[
|
151 |
-
original_image_url = site[
|
152 |
image_urls.append(original_image_url)
|
153 |
|
154 |
return image_urls
|
@@ -158,19 +195,19 @@ def yandex_reverse_image_search(file_path):
|
|
158 |
img_search_url = generate_images_search_links(file_path)
|
159 |
if img_search_url is None:
|
160 |
return []
|
161 |
-
|
162 |
# Simulate a user agent to avoid being blocked
|
163 |
headers = {
|
164 |
-
|
165 |
-
|
166 |
}
|
167 |
-
|
168 |
try:
|
169 |
response = requests.get(img_search_url, headers=headers)
|
170 |
response.raise_for_status() # Raise an exception for bad status codes
|
171 |
|
172 |
# Parse the HTML content
|
173 |
-
soup = BeautifulSoup(response.content,
|
174 |
image_urls = get_image_links(soup.prettify())
|
175 |
return image_urls
|
176 |
|
@@ -180,21 +217,28 @@ def yandex_reverse_image_search(file_path):
|
|
180 |
|
181 |
|
182 |
def generate_images_search_links(file_path):
|
183 |
-
search_url =
|
184 |
-
params = {
|
185 |
-
|
|
|
|
|
|
|
|
|
186 |
try:
|
187 |
-
files = {
|
188 |
response = requests.post(search_url, params=params, files=files)
|
189 |
-
query_string = json.loads(response.content)[
|
190 |
-
|
|
|
|
|
191 |
return img_search_url
|
192 |
-
except:
|
|
|
193 |
return None
|
194 |
|
195 |
|
196 |
if __name__ == "__main__":
|
197 |
-
file_path = "T:\\Projects\\prj-nict-ai-content-detection\\data\\test_data\\towels.jpg.webp"
|
198 |
image_urls = yandex_reverse_image_search(file_path)
|
199 |
for image_url in image_urls:
|
200 |
print(f"Image URL: {image_url}")
|
|
|
1 |
+
import json
|
2 |
import logging
|
3 |
+
import time
|
4 |
+
from urllib.parse import (
|
5 |
+
quote,
|
6 |
+
urlparse,
|
7 |
+
)
|
8 |
+
|
9 |
import requests
|
|
|
10 |
from bs4 import BeautifulSoup
|
|
|
11 |
|
12 |
logging.basicConfig(
|
13 |
+
filename="error.log",
|
14 |
level=logging.INFO,
|
15 |
+
format="%(asctime)s | [%(levelname)s]: %(message)s",
|
16 |
+
datefmt="%m-%d-%Y / %I:%M:%S %p",
|
17 |
)
|
18 |
|
19 |
+
|
20 |
class SearchResults:
|
21 |
def __init__(self, results):
|
22 |
self.results = results
|
|
|
30 |
output += "---\n"
|
31 |
return output
|
32 |
|
33 |
+
|
34 |
class YandexReverseImageSearcher:
|
35 |
def __init__(self):
|
36 |
self.base_url = "https://yandex.ru/images/search"
|
37 |
+
self.headers = {
|
38 |
+
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", # noqa: E501
|
39 |
+
}
|
40 |
self.retry_count = 3
|
41 |
self.retry_delay = 1
|
42 |
|
43 |
+
def response(
|
44 |
+
self,
|
45 |
+
query: str,
|
46 |
+
image_url: str,
|
47 |
+
max_results: int = 10,
|
48 |
+
delay: int = 1,
|
49 |
+
) -> SearchResults:
|
50 |
self._validate_input(query, image_url)
|
51 |
+
|
52 |
encoded_query = quote(query)
|
53 |
encoded_image_url = quote(image_url)
|
54 |
|
55 |
+
url = f"{self.base_url}?q={encoded_query}&image_url={encoded_image_url}&sbisrc=cr_1_5_2" # noqa: E501
|
56 |
|
57 |
all_results = []
|
58 |
start_index = 0
|
|
|
60 |
while len(all_results) < max_results:
|
61 |
if start_index != 0:
|
62 |
time.sleep(delay)
|
63 |
+
|
64 |
paginated_url = f"{url}&start={start_index}"
|
65 |
|
66 |
response = self._make_request(paginated_url)
|
67 |
if response is None:
|
68 |
break
|
69 |
|
70 |
+
search_results, valid_content = self._parse_search_results(
|
71 |
+
response.text,
|
72 |
+
)
|
73 |
if not valid_content:
|
74 |
logging.warning("Unexpected HTML structure encountered.")
|
75 |
break
|
|
|
81 |
if data and data not in all_results:
|
82 |
all_results.append(data)
|
83 |
|
84 |
+
start_index += len(all_results) - start_index
|
85 |
|
86 |
if len(all_results) == 0:
|
87 |
+
logging.warning(
|
88 |
+
f"No results were found for the given query: [{query}], and/or image URL: [{image_url}].", # noqa: E501
|
89 |
+
)
|
90 |
+
return "No results found. Please try again with a different query and/or image URL." # noqa: E501
|
91 |
else:
|
92 |
return SearchResults(all_results[:max_results])
|
93 |
+
|
94 |
def _validate_input(self, query: str, image_url: str):
|
95 |
if not query:
|
96 |
+
raise ValueError(
|
97 |
+
"Query not found. Enter a query and try again.",
|
98 |
+
)
|
99 |
if not image_url:
|
100 |
+
raise ValueError(
|
101 |
+
"Image URL not found. Enter an image URL and try again.",
|
102 |
+
)
|
103 |
if not self._validate_image_url(image_url):
|
104 |
+
raise ValueError(
|
105 |
+
"Invalid image URL. Enter a valid image URL and try again.",
|
106 |
+
)
|
107 |
+
|
108 |
def _validate_image_url(self, url: str) -> bool:
|
109 |
parsed_url = urlparse(url)
|
110 |
path = parsed_url.path.lower()
|
111 |
valid_extensions = (".jpg", ".jpeg", ".png", ".webp")
|
112 |
return any(path.endswith(ext) for ext in valid_extensions)
|
113 |
+
|
114 |
def _make_request(self, url: str):
|
115 |
attempts = 0
|
116 |
while attempts < self.retry_count:
|
117 |
try:
|
118 |
response = requests.get(url, headers=self.headers)
|
119 |
+
if response.headers.get("Content-Type", "").startswith(
|
120 |
+
"text/html",
|
121 |
+
):
|
122 |
response.raise_for_status()
|
123 |
return response
|
124 |
else:
|
|
|
136 |
def _parse_search_results(self, html_content: str):
|
137 |
try:
|
138 |
soup = BeautifulSoup(html_content, "html.parser")
|
139 |
+
return soup.find_all("div", class_="g"), True
|
140 |
except Exception as e:
|
141 |
logging.error(f"Error parsing HTML content: {e}")
|
142 |
return None, False
|
143 |
|
144 |
def _extract_result_data(self, result):
|
145 |
+
link = (
|
146 |
+
result.find("a", href=True)["href"]
|
147 |
+
if result.find("a", href=True)
|
148 |
+
else None
|
149 |
+
)
|
150 |
+
title = (
|
151 |
+
result.find("h3").get_text(strip=True)
|
152 |
+
if result.find("h3")
|
153 |
+
else None
|
154 |
+
)
|
155 |
return {"link": link, "title": title} if link and title else {}
|
156 |
|
157 |
|
|
|
165 |
Returns:
|
166 |
A list of image URLs.
|
167 |
"""
|
168 |
+
soup = BeautifulSoup(page, "html.parser")
|
169 |
+
|
170 |
# Find the specific section containing image links
|
171 |
+
gallery_data = soup.find(
|
172 |
+
"div",
|
173 |
+
{"class": "cbir-section cbir-section_name_sites"},
|
174 |
+
)
|
175 |
if gallery_data is None:
|
176 |
return []
|
177 |
+
|
178 |
# Find the container of image links
|
179 |
+
image_links_container = gallery_data.find("div", {"class": "Root"})
|
180 |
if image_links_container is None:
|
181 |
return []
|
182 |
+
|
183 |
+
data_state = json.loads(image_links_container["data-state"])
|
184 |
|
185 |
# Extract URLs from each div
|
186 |
image_urls = []
|
187 |
+
for site in data_state["sites"]:
|
188 |
+
original_image_url = site["originalImage"]["url"]
|
189 |
image_urls.append(original_image_url)
|
190 |
|
191 |
return image_urls
|
|
|
195 |
img_search_url = generate_images_search_links(file_path)
|
196 |
if img_search_url is None:
|
197 |
return []
|
198 |
+
|
199 |
# Simulate a user agent to avoid being blocked
|
200 |
headers = {
|
201 |
+
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36", # noqa: E501
|
202 |
+
"Content-Type": "application/json",
|
203 |
}
|
204 |
+
|
205 |
try:
|
206 |
response = requests.get(img_search_url, headers=headers)
|
207 |
response.raise_for_status() # Raise an exception for bad status codes
|
208 |
|
209 |
# Parse the HTML content
|
210 |
+
soup = BeautifulSoup(response.content, "html.parser")
|
211 |
image_urls = get_image_links(soup.prettify())
|
212 |
return image_urls
|
213 |
|
|
|
217 |
|
218 |
|
219 |
def generate_images_search_links(file_path):
|
220 |
+
search_url = "https://yandex.ru/images/search"
|
221 |
+
params = {
|
222 |
+
"rpt": "imageview",
|
223 |
+
"format": "json",
|
224 |
+
"request": '{"blocks":[{"block":"b-page_type_search-by-image__link"}]}', # noqa: E501
|
225 |
+
}
|
226 |
+
|
227 |
try:
|
228 |
+
files = {"upfile": ("blob", open(file_path, "rb"), "image/jpeg/webp")}
|
229 |
response = requests.post(search_url, params=params, files=files)
|
230 |
+
query_string = json.loads(response.content)["blocks"][0]["params"][
|
231 |
+
"url"
|
232 |
+
]
|
233 |
+
img_search_url = search_url + "?" + query_string
|
234 |
return img_search_url
|
235 |
+
except requests.exceptions as e:
|
236 |
+
print(f"Error generating search URL: {e}")
|
237 |
return None
|
238 |
|
239 |
|
240 |
if __name__ == "__main__":
|
241 |
+
file_path = "T:\\Projects\\prj-nict-ai-content-detection\\data\\test_data\\towels.jpg.webp" # noqa: E501
|
242 |
image_urls = yandex_reverse_image_search(file_path)
|
243 |
for image_url in image_urls:
|
244 |
print(f"Image URL: {image_url}")
|
src/application/text/entity.py
CHANGED
@@ -1,42 +1,51 @@
|
|
1 |
import colorsys
|
2 |
import json
|
|
|
3 |
import re
|
|
|
|
|
4 |
import openai
|
5 |
from dotenv import load_dotenv
|
6 |
-
import os
|
7 |
from transformers import pipeline
|
8 |
-
import gradio as gr
|
9 |
|
10 |
ner_pipeline = pipeline("ner")
|
11 |
|
12 |
load_dotenv()
|
13 |
-
AZURE_OPENAI_API_KEY = os.getenv(
|
14 |
-
AZURE_OPENAI_ENDPOINT = os.getenv(
|
15 |
-
AZURE_OPENAI_API_VERSION = os.getenv(
|
16 |
|
17 |
client = openai.AzureOpenAI(
|
18 |
-
api_version
|
19 |
-
api_key
|
20 |
-
azure_endpoint
|
21 |
-
|
22 |
|
23 |
|
24 |
-
def extract_entities_gpt(
|
|
|
|
|
|
|
|
|
25 |
# "gpt-4o-mini" or "o1-mini"
|
26 |
# Generate text using the selected models
|
27 |
prompt = f"""
|
28 |
-
Compare the ORIGINAL TEXT and the COMPARED TEXT.
|
29 |
-
|
30 |
-
Focus
|
31 |
-
* **Numerical changes:** e.g., "five"
|
32 |
-
* **Time changes:** e.g., "Monday"
|
33 |
-
* **Name changes:** e.g., "Tokyo"
|
34 |
-
* **Opposite meanings:** e.g., "increase"
|
35 |
-
* **Semantically different words:** e.g., "car"
|
36 |
-
|
37 |
-
Exclude entities where the meaning remains essentially the same,
|
38 |
-
|
39 |
-
|
|
|
|
|
|
|
|
|
40 |
[
|
41 |
["ORIGINAL_TEXT_entity_1", "COMPARED_TEXT_entity_1"],
|
42 |
["ORIGINAL_TEXT_entity_2", "COMPARED_TEXT_entity_2"]
|
@@ -50,23 +59,24 @@ If there are no entities that satisfy above condition, output empty list "[]".
|
|
50 |
# COMPARED TEXT:
|
51 |
{compared_text}
|
52 |
"""
|
53 |
-
|
54 |
# Generate text using the text generation model
|
55 |
-
# Generate text using the selected model
|
56 |
try:
|
57 |
response = client.chat.completions.create(
|
58 |
-
model=text_generation_model,
|
59 |
-
messages
|
60 |
)
|
61 |
-
|
62 |
res = response.choices[0].message.content
|
63 |
|
64 |
except openai.OpenAIError as e:
|
65 |
print(f"Error interacting with OpenAI API: {e}")
|
66 |
-
res =
|
67 |
|
68 |
return res
|
69 |
-
|
|
|
70 |
def read_json(json_string) -> list[list[str]]:
|
71 |
try:
|
72 |
entities = json.loads(json_string)
|
@@ -75,53 +85,64 @@ def read_json(json_string) -> list[list[str]]:
|
|
75 |
for inner_list in entities:
|
76 |
if inner_list not in unique_entities:
|
77 |
unique_entities.append(inner_list)
|
78 |
-
|
79 |
return unique_entities
|
80 |
|
81 |
except json.JSONDecodeError as e:
|
82 |
print(f"Error decoding JSON: {e}")
|
83 |
return []
|
84 |
|
|
|
85 |
def lighten_color(hex_color, factor=1.8):
|
86 |
"""Lightens a HEX color by increasing its brightness in HSV space."""
|
87 |
|
88 |
hex_color = hex_color.lstrip("#")
|
89 |
-
r, g, b =
|
90 |
-
|
|
|
|
|
|
|
|
|
91 |
# Convert to HSV
|
92 |
h, s, v = colorsys.rgb_to_hsv(r / 255.0, g / 255.0, b / 255.0)
|
93 |
v = min(1.0, v * factor) # Increase brightness
|
94 |
-
|
95 |
# Convert back to HEX
|
96 |
-
r, g, b =
|
97 |
-
return f
|
|
|
98 |
|
99 |
def darken_color(hex_color, factor=0.7):
|
100 |
"""Darkens a hex color by reducing its brightness in the HSV space."""
|
101 |
|
102 |
hex_color = hex_color.lstrip("#")
|
103 |
-
r, g, b =
|
104 |
-
|
|
|
|
|
|
|
|
|
105 |
# Convert to HSV to adjust brightness
|
106 |
h, s, v = colorsys.rgb_to_hsv(r / 255.0, g / 255.0, b / 255.0)
|
107 |
v = max(0, v * factor) # Reduce brightness
|
108 |
-
|
109 |
# Convert back to HEX
|
110 |
-
r, g, b =
|
111 |
-
return f
|
|
|
112 |
|
113 |
def generate_color(index, total_colors=20):
|
114 |
"""Generates a unique, evenly spaced color for each index using HSL."""
|
115 |
|
116 |
hue = index / total_colors # Spread hues in range [0,1]
|
117 |
saturation = 0.65 # Keep colors vivid
|
118 |
-
lightness = 0.75
|
119 |
-
|
120 |
# Convert HSL to RGB
|
121 |
r, g, b = colorsys.hls_to_rgb(hue, lightness, saturation)
|
122 |
r, g, b = int(r * 255), int(g * 255), int(b * 255)
|
123 |
-
|
124 |
-
return f
|
125 |
|
126 |
|
127 |
def assign_colors_to_entities(entities):
|
@@ -130,12 +151,15 @@ def assign_colors_to_entities(entities):
|
|
130 |
entities_colors = []
|
131 |
for index, entity in enumerate(entities):
|
132 |
color = generate_color(index, total_colors)
|
133 |
-
|
134 |
# append color and index to entities_colors
|
135 |
-
entities_colors.append(
|
136 |
-
|
|
|
|
|
137 |
return entities_colors
|
138 |
|
|
|
139 |
def highlight_entities(text1, text2):
|
140 |
if text1 == "" or text2 == "":
|
141 |
return []
|
@@ -154,49 +178,62 @@ def highlight_entities(text1, text2):
|
|
154 |
return entities_with_colors
|
155 |
|
156 |
|
157 |
-
def apply_highlight(text, entities_with_colors, key="input", count
|
158 |
if entities_with_colors == []:
|
159 |
return text, []
|
160 |
-
|
161 |
all_starts = []
|
162 |
all_ends = []
|
163 |
highlighted_text = ""
|
164 |
temp_text = text
|
165 |
for index, entity in enumerate(entities_with_colors):
|
166 |
highlighted_text = ""
|
167 |
-
|
168 |
-
# find a list of starts and ends of entity in text:
|
169 |
# starts = [m.start() for m in re.finditer(entity[key], temp_text)]
|
170 |
# ends = [m.end() for m in re.finditer(entity[key], temp_text)]
|
171 |
-
starts =[]
|
172 |
ends = []
|
173 |
# "\b" is for bound a word
|
174 |
-
for m in re.finditer(
|
|
|
|
|
|
|
175 |
starts.append(m.start())
|
176 |
ends.append(m.end())
|
177 |
-
|
178 |
all_starts.extend(starts)
|
179 |
all_ends.extend(ends)
|
180 |
-
|
181 |
color = entities_with_colors[index]["color"]
|
182 |
-
entity_color = lighten_color(
|
183 |
-
|
184 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
185 |
# Apply highlighting to each entity
|
186 |
prev_end = 0
|
187 |
for start, end in zip(starts, ends):
|
188 |
# Append non-highlighted text
|
189 |
highlighted_text += temp_text[prev_end:start]
|
190 |
-
|
191 |
# Style the index as a label
|
192 |
-
index_label = (
|
193 |
-
|
194 |
-
|
195 |
-
|
|
|
|
|
196 |
# Append highlighted text with index label
|
197 |
-
highlighted_text += (
|
198 |
-
|
199 |
-
|
|
|
|
|
200 |
prev_end = end
|
201 |
highlighted_text += temp_text[prev_end:]
|
202 |
temp_text = highlighted_text
|
@@ -206,6 +243,7 @@ def apply_highlight(text, entities_with_colors, key="input", count = 0):
|
|
206 |
highlight_idx_list = get_index_list(highlighted_text)
|
207 |
return highlighted_text, highlight_idx_list
|
208 |
|
|
|
209 |
def get_index_list(highlighted_text):
|
210 |
"""
|
211 |
Generates a list of indices between corresponding start and end indices.
|
@@ -216,7 +254,7 @@ def get_index_list(highlighted_text):
|
|
216 |
|
217 |
Returns:
|
218 |
A list containing all indices within the specified ranges.
|
219 |
-
Returns an empty list if the input is invalid (e.g., different lengths,
|
220 |
end < start, etc.).
|
221 |
"""
|
222 |
highlighted_index = []
|
@@ -226,22 +264,24 @@ def get_index_list(highlighted_text):
|
|
226 |
start_index = index
|
227 |
if word.endswith("</span>"):
|
228 |
end_index = index
|
229 |
-
|
230 |
highlighted_index.extend(list(range(start_index, end_index + 1)))
|
231 |
|
232 |
return highlighted_index
|
233 |
|
|
|
234 |
def extract_entities(text):
|
235 |
output = ner_pipeline(text)
|
236 |
words = extract_words(output)
|
237 |
words = combine_subwords(words)
|
238 |
-
|
239 |
-
# extract word in each entity and assign to a list of entities,
|
240 |
-
|
|
|
241 |
for entity in words:
|
242 |
if entity not in entities:
|
243 |
entities.append(entity)
|
244 |
-
|
245 |
return entities
|
246 |
|
247 |
|
@@ -275,8 +315,12 @@ def combine_subwords(word_list):
|
|
275 |
i = 0
|
276 |
while i < len(word_list):
|
277 |
if word_list[i].startswith("##"):
|
278 |
-
result[-1] += word_list[i][
|
279 |
-
|
|
|
|
|
|
|
|
|
280 |
result.append(word_list[i] + word_list[i + 1] + word_list[i + 2])
|
281 |
i += 2 # Skip the next two words
|
282 |
else:
|
@@ -286,44 +330,57 @@ def combine_subwords(word_list):
|
|
286 |
|
287 |
|
288 |
original_text = """
|
289 |
-
Title: UK pledges support for Ukraine with 100-year pact
|
290 |
-
Content: Sir Keir Starmer has pledged to put Ukraine in the "strongest
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
291 |
"""
|
292 |
compared_text = """
|
293 |
Title: Japan pledges support for Ukraine with 100-year pact
|
294 |
-
Content: A leading Japanese figure has pledged to put Ukraine
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
295 |
"""
|
296 |
if __name__ == "__main__":
|
297 |
-
# text = "The Saudi authorities, I am told, are currently working flat out" \
|
298 |
-
# "to collate everything they have on the Magdeburg market suspect," \
|
299 |
-
# "Taleb al-Abdulmohsen, and to share it with Germany's ongoing" \
|
300 |
-
# "investigation"
|
301 |
-
# print(extract_entities(text))
|
302 |
-
|
303 |
-
|
304 |
with gr.Blocks() as demo:
|
305 |
gr.Markdown("### Highlight Matching Parts Between Two Paragraphs")
|
306 |
text1_input = gr.Textbox(
|
307 |
-
label="Paragraph 1",
|
308 |
-
lines=5,
|
309 |
value=original_text,
|
310 |
)
|
311 |
text2_input = gr.Textbox(
|
312 |
-
label="Paragraph 2",
|
313 |
-
lines=5,
|
314 |
value=compared_text,
|
315 |
)
|
316 |
submit_button = gr.Button("Highlight Matches")
|
317 |
-
output1 = gr.HTML("<br>"*10)
|
318 |
-
output2 = gr.HTML("<br>"*10)
|
319 |
-
|
320 |
-
|
321 |
submit_button.click(
|
322 |
fn=highlight_entities,
|
323 |
inputs=[text1_input, text2_input],
|
324 |
-
outputs=[output1, output2]
|
325 |
)
|
326 |
-
|
327 |
# Launch the Gradio app
|
328 |
demo.launch()
|
329 |
-
|
|
|
1 |
import colorsys
|
2 |
import json
|
3 |
+
import os
|
4 |
import re
|
5 |
+
|
6 |
+
import gradio as gr
|
7 |
import openai
|
8 |
from dotenv import load_dotenv
|
|
|
9 |
from transformers import pipeline
|
|
|
10 |
|
11 |
ner_pipeline = pipeline("ner")
|
12 |
|
13 |
load_dotenv()
|
14 |
+
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
|
15 |
+
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
|
16 |
+
AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION")
|
17 |
|
18 |
client = openai.AzureOpenAI(
|
19 |
+
api_version="2024-05-01-preview", # AZURE_OPENAI_API_VERSION,
|
20 |
+
api_key=AZURE_OPENAI_API_KEY,
|
21 |
+
azure_endpoint=AZURE_OPENAI_ENDPOINT,
|
22 |
+
)
|
23 |
|
24 |
|
25 |
+
def extract_entities_gpt(
|
26 |
+
original_text,
|
27 |
+
compared_text,
|
28 |
+
text_generation_model="o1-mini",
|
29 |
+
):
|
30 |
# "gpt-4o-mini" or "o1-mini"
|
31 |
# Generate text using the selected models
|
32 |
prompt = f"""
|
33 |
+
Compare the ORIGINAL TEXT and the COMPARED TEXT.
|
34 |
+
Find entity pairs with significantly different meanings after paraphrasing.
|
35 |
+
Focus only on these significantly changed entities. These include:
|
36 |
+
* **Numerical changes:** e.g., "five" -> "ten," "10%" -> "50%"
|
37 |
+
* **Time changes:** e.g., "Monday" -> "Sunday," "10th" -> "21st"
|
38 |
+
* **Name changes:** e.g., "Tokyo" -> "New York," "Japan" -> "Japanese"
|
39 |
+
* **Opposite meanings:** e.g., "increase" -> "decrease," "good" -> "bad"
|
40 |
+
* **Semantically different words:** e.g., "car" -> "truck," "walk" -> "run"
|
41 |
+
|
42 |
+
Exclude entities where the meaning remains essentially the same,
|
43 |
+
even if the wording is different
|
44 |
+
(e.g., "big" changed to "large," "house" changed to "residence").
|
45 |
+
Also exclude purely stylistic changes that don't affect the core meaning.
|
46 |
+
|
47 |
+
Output the extracted entity pairs, one pair per line,
|
48 |
+
in the following JSON-like list format without wrapping characters:
|
49 |
[
|
50 |
["ORIGINAL_TEXT_entity_1", "COMPARED_TEXT_entity_1"],
|
51 |
["ORIGINAL_TEXT_entity_2", "COMPARED_TEXT_entity_2"]
|
|
|
59 |
# COMPARED TEXT:
|
60 |
{compared_text}
|
61 |
"""
|
62 |
+
|
63 |
# Generate text using the text generation model
|
64 |
+
# Generate text using the selected model
|
65 |
try:
|
66 |
response = client.chat.completions.create(
|
67 |
+
model=text_generation_model,
|
68 |
+
messages=[{"role": "user", "content": prompt}],
|
69 |
)
|
70 |
+
|
71 |
res = response.choices[0].message.content
|
72 |
|
73 |
except openai.OpenAIError as e:
|
74 |
print(f"Error interacting with OpenAI API: {e}")
|
75 |
+
res = ""
|
76 |
|
77 |
return res
|
78 |
+
|
79 |
+
|
80 |
def read_json(json_string) -> list[list[str]]:
|
81 |
try:
|
82 |
entities = json.loads(json_string)
|
|
|
85 |
for inner_list in entities:
|
86 |
if inner_list not in unique_entities:
|
87 |
unique_entities.append(inner_list)
|
88 |
+
|
89 |
return unique_entities
|
90 |
|
91 |
except json.JSONDecodeError as e:
|
92 |
print(f"Error decoding JSON: {e}")
|
93 |
return []
|
94 |
|
95 |
+
|
96 |
def lighten_color(hex_color, factor=1.8):
|
97 |
"""Lightens a HEX color by increasing its brightness in HSV space."""
|
98 |
|
99 |
hex_color = hex_color.lstrip("#")
|
100 |
+
r, g, b = (
|
101 |
+
int(hex_color[0:2], 16),
|
102 |
+
int(hex_color[2:4], 16),
|
103 |
+
int(hex_color[4:6], 16),
|
104 |
+
)
|
105 |
+
|
106 |
# Convert to HSV
|
107 |
h, s, v = colorsys.rgb_to_hsv(r / 255.0, g / 255.0, b / 255.0)
|
108 |
v = min(1.0, v * factor) # Increase brightness
|
109 |
+
|
110 |
# Convert back to HEX
|
111 |
+
r, g, b = (int(c * 255) for c in colorsys.hsv_to_rgb(h, s, v))
|
112 |
+
return f"#{r:02x}{g:02x}{b:02x}"
|
113 |
+
|
114 |
|
115 |
def darken_color(hex_color, factor=0.7):
|
116 |
"""Darkens a hex color by reducing its brightness in the HSV space."""
|
117 |
|
118 |
hex_color = hex_color.lstrip("#")
|
119 |
+
r, g, b = (
|
120 |
+
int(hex_color[0:2], 16),
|
121 |
+
int(hex_color[2:4], 16),
|
122 |
+
int(hex_color[4:6], 16),
|
123 |
+
)
|
124 |
+
|
125 |
# Convert to HSV to adjust brightness
|
126 |
h, s, v = colorsys.rgb_to_hsv(r / 255.0, g / 255.0, b / 255.0)
|
127 |
v = max(0, v * factor) # Reduce brightness
|
128 |
+
|
129 |
# Convert back to HEX
|
130 |
+
r, g, b = (int(c * 255) for c in colorsys.hsv_to_rgb(h, s, v))
|
131 |
+
return f"#{r:02x}{g:02x}{b:02x}"
|
132 |
+
|
133 |
|
134 |
def generate_color(index, total_colors=20):
|
135 |
"""Generates a unique, evenly spaced color for each index using HSL."""
|
136 |
|
137 |
hue = index / total_colors # Spread hues in range [0,1]
|
138 |
saturation = 0.65 # Keep colors vivid
|
139 |
+
lightness = 0.75 # Balanced brightness
|
140 |
+
|
141 |
# Convert HSL to RGB
|
142 |
r, g, b = colorsys.hls_to_rgb(hue, lightness, saturation)
|
143 |
r, g, b = int(r * 255), int(g * 255), int(b * 255)
|
144 |
+
|
145 |
+
return f"#{r:02x}{g:02x}{b:02x}" # Convert to hex
|
146 |
|
147 |
|
148 |
def assign_colors_to_entities(entities):
|
|
|
151 |
entities_colors = []
|
152 |
for index, entity in enumerate(entities):
|
153 |
color = generate_color(index, total_colors)
|
154 |
+
|
155 |
# append color and index to entities_colors
|
156 |
+
entities_colors.append(
|
157 |
+
{"color": color, "input": entity[0], "source": entity[1]},
|
158 |
+
)
|
159 |
+
|
160 |
return entities_colors
|
161 |
|
162 |
+
|
163 |
def highlight_entities(text1, text2):
|
164 |
if text1 == "" or text2 == "":
|
165 |
return []
|
|
|
178 |
return entities_with_colors
|
179 |
|
180 |
|
181 |
+
def apply_highlight(text, entities_with_colors, key="input", count=0):
|
182 |
if entities_with_colors == []:
|
183 |
return text, []
|
184 |
+
|
185 |
all_starts = []
|
186 |
all_ends = []
|
187 |
highlighted_text = ""
|
188 |
temp_text = text
|
189 |
for index, entity in enumerate(entities_with_colors):
|
190 |
highlighted_text = ""
|
191 |
+
|
192 |
+
# find a list of starts and ends of entity in text:
|
193 |
# starts = [m.start() for m in re.finditer(entity[key], temp_text)]
|
194 |
# ends = [m.end() for m in re.finditer(entity[key], temp_text)]
|
195 |
+
starts = []
|
196 |
ends = []
|
197 |
# "\b" is for bound a word
|
198 |
+
for m in re.finditer(
|
199 |
+
r"\b" + re.escape(entity[key]) + r"\b",
|
200 |
+
temp_text,
|
201 |
+
):
|
202 |
starts.append(m.start())
|
203 |
ends.append(m.end())
|
204 |
+
|
205 |
all_starts.extend(starts)
|
206 |
all_ends.extend(ends)
|
207 |
+
|
208 |
color = entities_with_colors[index]["color"]
|
209 |
+
entity_color = lighten_color(
|
210 |
+
color,
|
211 |
+
factor=2.2,
|
212 |
+
) # Lightened color for background text
|
213 |
+
label_color = darken_color(
|
214 |
+
entity_color,
|
215 |
+
factor=0.7,
|
216 |
+
) # Darker color for background label (index)
|
217 |
+
|
218 |
# Apply highlighting to each entity
|
219 |
prev_end = 0
|
220 |
for start, end in zip(starts, ends):
|
221 |
# Append non-highlighted text
|
222 |
highlighted_text += temp_text[prev_end:start]
|
223 |
+
|
224 |
# Style the index as a label
|
225 |
+
index_label = (
|
226 |
+
f'<span_style="background-color:{label_color};color:white;'
|
227 |
+
f"padding:1px_4px;border-radius:4px;font-size:12px;"
|
228 |
+
f'font-weight:bold;display:inline-block;margin-right:4px;">{index + 1 + count}</span>' # noqa: E501
|
229 |
+
)
|
230 |
+
|
231 |
# Append highlighted text with index label
|
232 |
+
highlighted_text += (
|
233 |
+
f'\n<span_style="background-color:{entity_color};color:black;'
|
234 |
+
f'border-radius:3px;font-size:14px;display:inline-block;">'
|
235 |
+
f"{index_label}{temp_text[start:end]}</span>\n"
|
236 |
+
)
|
237 |
prev_end = end
|
238 |
highlighted_text += temp_text[prev_end:]
|
239 |
temp_text = highlighted_text
|
|
|
243 |
highlight_idx_list = get_index_list(highlighted_text)
|
244 |
return highlighted_text, highlight_idx_list
|
245 |
|
246 |
+
|
247 |
def get_index_list(highlighted_text):
|
248 |
"""
|
249 |
Generates a list of indices between corresponding start and end indices.
|
|
|
254 |
|
255 |
Returns:
|
256 |
A list containing all indices within the specified ranges.
|
257 |
+
Returns an empty list if the input is invalid (e.g., different lengths,
|
258 |
end < start, etc.).
|
259 |
"""
|
260 |
highlighted_index = []
|
|
|
264 |
start_index = index
|
265 |
if word.endswith("</span>"):
|
266 |
end_index = index
|
267 |
+
|
268 |
highlighted_index.extend(list(range(start_index, end_index + 1)))
|
269 |
|
270 |
return highlighted_index
|
271 |
|
272 |
+
|
273 |
def extract_entities(text):
|
274 |
output = ner_pipeline(text)
|
275 |
words = extract_words(output)
|
276 |
words = combine_subwords(words)
|
277 |
+
|
278 |
+
# extract word in each entity and assign to a list of entities,
|
279 |
+
# connect words if there is no space between them
|
280 |
+
entities = []
|
281 |
for entity in words:
|
282 |
if entity not in entities:
|
283 |
entities.append(entity)
|
284 |
+
|
285 |
return entities
|
286 |
|
287 |
|
|
|
315 |
i = 0
|
316 |
while i < len(word_list):
|
317 |
if word_list[i].startswith("##"):
|
318 |
+
result[-1] += word_list[i][
|
319 |
+
2:
|
320 |
+
] # Remove "##" and append to the previous word
|
321 |
+
elif (
|
322 |
+
i < len(word_list) - 2 and word_list[i + 1] == "-"
|
323 |
+
): # Combine hyphenated words
|
324 |
result.append(word_list[i] + word_list[i + 1] + word_list[i + 2])
|
325 |
i += 2 # Skip the next two words
|
326 |
else:
|
|
|
330 |
|
331 |
|
332 |
original_text = """
|
333 |
+
Title: UK pledges support for Ukraine with 100-year pact
|
334 |
+
Content: Sir Keir Starmer has pledged to put Ukraine in the "strongest
|
335 |
+
possible position" on a trip to Kyiv where he signed a "landmark"
|
336 |
+
100-year pact with the war-stricken country. The prime minister's
|
337 |
+
visit on Thursday was at one point marked by loud blasts and air
|
338 |
+
raid sirens after a reported Russian drone attack was intercepted
|
339 |
+
by Ukraine's defence systems. Acknowledging the "hello" from Russia,
|
340 |
+
Volodymyr Zelensky said Ukraine would send its own "hello back".
|
341 |
+
An estimated one million people have been killed or wounded in the
|
342 |
+
war so far. As the invasion reaches the end of its third year, Ukraine
|
343 |
+
is losing territory in the east. Zelensky praised the UK's commitment
|
344 |
+
on Thursday, amid wider concerns that the US President-elect Donald
|
345 |
+
Trump, who is set to take office on Monday, could potentially reduce aid.
|
346 |
"""
|
347 |
compared_text = """
|
348 |
Title: Japan pledges support for Ukraine with 100-year pact
|
349 |
+
Content: A leading Japanese figure has pledged to put Ukraine
|
350 |
+
in the "strongest possible position" on a trip to Kyiv where
|
351 |
+
they signed a "landmark" 100-year pact with the war-stricken country.
|
352 |
+
The visit on Thursday was at one point marked by loud blasts and air
|
353 |
+
raid sirens after a reported Russian drone attack was intercepted by
|
354 |
+
Ukraine's defence systems. Acknowledging the "hello" from Russia,
|
355 |
+
Volodymyr Zelensky said Ukraine would send its own "hello back".
|
356 |
+
An estimated one million people have been killed or wounded in the
|
357 |
+
war so far. As the invasion reaches the end of its third year, Ukraine
|
358 |
+
is losing territory in the east. Zelensky praised Japan's commitment
|
359 |
+
on Thursday, amid wider concerns that the next US President, who is
|
360 |
+
set to take office on Monday, could potentially reduce aid.
|
361 |
"""
|
362 |
if __name__ == "__main__":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
363 |
with gr.Blocks() as demo:
|
364 |
gr.Markdown("### Highlight Matching Parts Between Two Paragraphs")
|
365 |
text1_input = gr.Textbox(
|
366 |
+
label="Paragraph 1",
|
367 |
+
lines=5,
|
368 |
value=original_text,
|
369 |
)
|
370 |
text2_input = gr.Textbox(
|
371 |
+
label="Paragraph 2",
|
372 |
+
lines=5,
|
373 |
value=compared_text,
|
374 |
)
|
375 |
submit_button = gr.Button("Highlight Matches")
|
376 |
+
output1 = gr.HTML("<br>" * 10)
|
377 |
+
output2 = gr.HTML("<br>" * 10)
|
378 |
+
|
|
|
379 |
submit_button.click(
|
380 |
fn=highlight_entities,
|
381 |
inputs=[text1_input, text2_input],
|
382 |
+
outputs=[output1, output2],
|
383 |
)
|
384 |
+
|
385 |
# Launch the Gradio app
|
386 |
demo.launch()
|
|
src/application/text/helper.py
CHANGED
@@ -1,73 +1,72 @@
|
|
1 |
-
from collections import Counter
|
2 |
-
from difflib import SequenceMatcher
|
3 |
import re
|
4 |
import string
|
5 |
-
from
|
|
|
|
|
6 |
from nltk.tokenize import word_tokenize
|
7 |
from nltk.util import ngrams
|
|
|
8 |
|
9 |
|
10 |
def clean_text(text):
|
11 |
"""Doc cleaning"""
|
12 |
-
|
|
|
|
|
13 |
# Lowering text
|
14 |
text = text.lower()
|
15 |
-
|
16 |
# Removing punctuation
|
17 |
text = "".join([c for c in text if c not in punctuations])
|
18 |
-
|
19 |
# Removing whitespace and newlines
|
20 |
-
text = re.sub(r
|
21 |
-
|
22 |
text.replace("Β£", " * ")
|
23 |
-
|
24 |
words = text.split()
|
25 |
-
text =
|
26 |
-
|
27 |
return text
|
28 |
|
|
|
29 |
def remove_punctuation(text):
|
30 |
"""Remove punctuation from a given text."""
|
31 |
punctuation_without_dot = string.punctuation.replace(".", "")
|
32 |
-
translator = str.maketrans(
|
33 |
return text.translate(translator)
|
34 |
|
|
|
35 |
def get_keywords(text, num_keywords=5):
|
36 |
"""Return top k keywords from a doc using TF-IDF method"""
|
37 |
-
|
38 |
# Create a TF-IDF Vectorizer
|
39 |
-
vectorizer = TfidfVectorizer(stop_words=
|
40 |
-
|
41 |
# Fit and transform the text
|
42 |
tfidf_matrix = vectorizer.fit_transform([text])
|
43 |
-
|
44 |
# Get feature names (words)
|
45 |
feature_names = vectorizer.get_feature_names_out()
|
46 |
-
|
47 |
# Get TF-IDF scores
|
48 |
tfidf_scores = tfidf_matrix.toarray()[0]
|
49 |
-
|
50 |
# Sort words by TF-IDF score
|
51 |
word_scores = list(zip(feature_names, tfidf_scores))
|
52 |
word_scores.sort(key=lambda x: x[1], reverse=True)
|
53 |
-
|
54 |
# Return top keywords
|
55 |
return [word for word, score in word_scores[:num_keywords]]
|
56 |
|
57 |
-
"""
|
58 |
-
# Example usage
|
59 |
-
text = "Artificial intelligence (AI) is intelligence demonstrated by machines, as opposed to natural intelligence displayed by animals including humans. Leading AI textbooks define the field as the study of "intelligent agents": any system that perceives its environment and takes actions that maximize its chance of achieving its goals. Some popular accounts use the term "artificial intelligence" to describe machines that mimic "cognitive" functions that humans associate with the human mind, such as "learning" and "problem solving", however this definition is rejected by major AI researchers."
|
60 |
-
print(f"\n# Input text:\n'{text}'")
|
61 |
-
print("\n----------------------\n")
|
62 |
-
|
63 |
-
keywords = get_keywords(text)
|
64 |
-
print("# Top keywords:", keywords)
|
65 |
-
print("\n----------------------\n")
|
66 |
-
"""
|
67 |
|
68 |
-
def get_important_sentences(
|
|
|
|
|
|
|
|
|
69 |
"""
|
70 |
-
Selects important sentences
|
71 |
|
72 |
Args:
|
73 |
paragraph (str): The input paragraph.
|
@@ -78,8 +77,10 @@ def get_important_sentences(paragraph: str, keywords: list[str], num_sentences:
|
|
78 |
list: A list of important sentences.
|
79 |
"""
|
80 |
# Clean and split the paragraph into sentences
|
81 |
-
sentences = [
|
82 |
-
|
|
|
|
|
83 |
# Calculate the importance score for each sentence
|
84 |
sentence_scores = []
|
85 |
for sentence in sentences:
|
@@ -87,54 +88,49 @@ def get_important_sentences(paragraph: str, keywords: list[str], num_sentences:
|
|
87 |
score = 0
|
88 |
words = processed_sentence.lower().split()
|
89 |
word_count = Counter(words)
|
90 |
-
|
91 |
for keyword in keywords:
|
92 |
if keyword.lower() in word_count:
|
93 |
score += word_count[keyword.lower()]
|
94 |
-
|
95 |
sentence_scores.append((sentence, score))
|
96 |
-
|
97 |
# Sort sentences by their scores in descending order
|
98 |
sentence_scores.sort(key=lambda x: x[1], reverse=True)
|
99 |
-
|
100 |
# Return the top N sentences
|
101 |
return [sentence for sentence, score in sentence_scores[:num_sentences]]
|
102 |
|
103 |
-
"""# Example usage
|
104 |
-
keywords = get_keywords(paragraph)
|
105 |
-
important_sentences = get_important_sentences(paragraph, keywords)
|
106 |
-
|
107 |
-
print("# Important sentences:")
|
108 |
-
for i, sentence in enumerate(important_sentences, 1):
|
109 |
-
print(f"{i}. {sentence}")
|
110 |
-
print("\n----------------------\n")
|
111 |
-
"""
|
112 |
|
113 |
-
def extract_important_phrases(
|
|
|
|
|
|
|
|
|
114 |
"""
|
115 |
-
Extracts important phrases
|
116 |
Phrase length is auto-determined, and overlapped parts are less than 20%.
|
117 |
|
118 |
Args:
|
119 |
paragraph (str): The input paragraph.
|
120 |
keywords (list[str]): List of important keywords.
|
121 |
-
phrase_length (int):
|
122 |
|
123 |
Returns:
|
124 |
list: A list of important phrases.
|
125 |
"""
|
126 |
# Tokenize the paragraph into words
|
127 |
words = word_tokenize(paragraph.lower())
|
128 |
-
|
129 |
# Determine phrase length (between 3 and 7 words)
|
130 |
phrase_length = min(max(len(words) // 10, 5), 7)
|
131 |
-
|
132 |
# Generate n-grams (phrases) from the paragraph
|
133 |
phrases = list(ngrams(words, phrase_length))
|
134 |
-
|
135 |
important_phrases = []
|
136 |
used_indices = set()
|
137 |
-
|
138 |
for i, phrase in enumerate(phrases):
|
139 |
# Check if the phrase contains any keyword
|
140 |
if any(keyword.lower() in phrase for keyword in keywords):
|
@@ -142,33 +138,36 @@ def extract_important_phrases(paragraph: str, keywords: list[str], phrase_length
|
|
142 |
if not any(abs(i - j) < phrase_length * 0.8 for j in used_indices):
|
143 |
important_phrases.append(clean_text(" ".join(phrase)))
|
144 |
used_indices.add(i)
|
145 |
-
|
146 |
return important_phrases
|
147 |
|
|
|
148 |
def extract_equal_text(text1, text2):
|
149 |
def cleanup(text):
|
150 |
text = text.lower()
|
151 |
-
text = text.translate(str.maketrans(
|
152 |
return text
|
153 |
-
|
154 |
splited_text1 = cleanup(text1).split()
|
155 |
splited_text2 = cleanup(text2).split()
|
156 |
-
|
157 |
s = SequenceMatcher(None, splited_text1, splited_text2)
|
158 |
-
|
159 |
equal_idx_1 = []
|
160 |
equal_idx_2 = []
|
161 |
text1 = text1.split()
|
162 |
text2 = text2.split()
|
163 |
for tag, i1, i2, j1, j2 in s.get_opcodes():
|
164 |
-
if tag ==
|
165 |
equal_idx_1.append({"start": i1, "end": i2})
|
166 |
equal_idx_2.append({"start": j1, "end": j2})
|
167 |
# subtext_1 = " ".join(text1[i1:i2])
|
168 |
# subtext_2 = " ".join(text2[j1:j2])
|
169 |
-
# print(f'{tag:7} a[{i1:2}:{i2:2}]
|
|
|
170 |
return equal_idx_1, equal_idx_2
|
171 |
|
|
|
172 |
def connect_consecutive_indexes(nums):
|
173 |
"""
|
174 |
Connects consecutive integers in a list.
|
@@ -197,11 +196,3 @@ def connect_consecutive_indexes(nums):
|
|
197 |
|
198 |
result.append([start, end]) # Add the last range
|
199 |
return result
|
200 |
-
|
201 |
-
"""# Example usage
|
202 |
-
keywords = get_keywords(paragraph)
|
203 |
-
important_phrases = extract_important_phrases(paragraph, keywords)
|
204 |
-
|
205 |
-
print("# Important phrases:")
|
206 |
-
for i, phrase in enumerate(important_phrases[:5], 1): # Print top 5 phrases
|
207 |
-
print(f"{i}. {phrase}")"""
|
|
|
|
|
|
|
1 |
import re
|
2 |
import string
|
3 |
+
from collections import Counter
|
4 |
+
from difflib import SequenceMatcher
|
5 |
+
|
6 |
from nltk.tokenize import word_tokenize
|
7 |
from nltk.util import ngrams
|
8 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
9 |
|
10 |
|
11 |
def clean_text(text):
|
12 |
"""Doc cleaning"""
|
13 |
+
# exclude , and . due to number
|
14 |
+
punctuations = r"""!"#$%&'()*+-/:;<=>?@[\]^_`{|}~"""
|
15 |
+
|
16 |
# Lowering text
|
17 |
text = text.lower()
|
18 |
+
|
19 |
# Removing punctuation
|
20 |
text = "".join([c for c in text if c not in punctuations])
|
21 |
+
|
22 |
# Removing whitespace and newlines
|
23 |
+
text = re.sub(r"\s+", " ", text)
|
24 |
+
|
25 |
text.replace("Β£", " * ")
|
26 |
+
|
27 |
words = text.split()
|
28 |
+
text = " ".join(words[:18]) # Join the first 18 words back into a string
|
29 |
+
|
30 |
return text
|
31 |
|
32 |
+
|
33 |
def remove_punctuation(text):
|
34 |
"""Remove punctuation from a given text."""
|
35 |
punctuation_without_dot = string.punctuation.replace(".", "")
|
36 |
+
translator = str.maketrans("", "", punctuation_without_dot)
|
37 |
return text.translate(translator)
|
38 |
|
39 |
+
|
40 |
def get_keywords(text, num_keywords=5):
|
41 |
"""Return top k keywords from a doc using TF-IDF method"""
|
42 |
+
|
43 |
# Create a TF-IDF Vectorizer
|
44 |
+
vectorizer = TfidfVectorizer(stop_words="english")
|
45 |
+
|
46 |
# Fit and transform the text
|
47 |
tfidf_matrix = vectorizer.fit_transform([text])
|
48 |
+
|
49 |
# Get feature names (words)
|
50 |
feature_names = vectorizer.get_feature_names_out()
|
51 |
+
|
52 |
# Get TF-IDF scores
|
53 |
tfidf_scores = tfidf_matrix.toarray()[0]
|
54 |
+
|
55 |
# Sort words by TF-IDF score
|
56 |
word_scores = list(zip(feature_names, tfidf_scores))
|
57 |
word_scores.sort(key=lambda x: x[1], reverse=True)
|
58 |
+
|
59 |
# Return top keywords
|
60 |
return [word for word, score in word_scores[:num_keywords]]
|
61 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
62 |
|
63 |
+
def get_important_sentences(
|
64 |
+
paragraph: str,
|
65 |
+
keywords: list[str],
|
66 |
+
num_sentences: int = 3,
|
67 |
+
) -> list[str]:
|
68 |
"""
|
69 |
+
Selects important sentences based on a list of keywords.
|
70 |
|
71 |
Args:
|
72 |
paragraph (str): The input paragraph.
|
|
|
77 |
list: A list of important sentences.
|
78 |
"""
|
79 |
# Clean and split the paragraph into sentences
|
80 |
+
sentences = [
|
81 |
+
s.strip() for s in re.split(r"(?<=[.!?])\s+", paragraph) if s.strip()
|
82 |
+
]
|
83 |
+
|
84 |
# Calculate the importance score for each sentence
|
85 |
sentence_scores = []
|
86 |
for sentence in sentences:
|
|
|
88 |
score = 0
|
89 |
words = processed_sentence.lower().split()
|
90 |
word_count = Counter(words)
|
91 |
+
|
92 |
for keyword in keywords:
|
93 |
if keyword.lower() in word_count:
|
94 |
score += word_count[keyword.lower()]
|
95 |
+
|
96 |
sentence_scores.append((sentence, score))
|
97 |
+
|
98 |
# Sort sentences by their scores in descending order
|
99 |
sentence_scores.sort(key=lambda x: x[1], reverse=True)
|
100 |
+
|
101 |
# Return the top N sentences
|
102 |
return [sentence for sentence, score in sentence_scores[:num_sentences]]
|
103 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
104 |
|
105 |
+
def extract_important_phrases(
|
106 |
+
paragraph: str,
|
107 |
+
keywords: list[str],
|
108 |
+
phrase_length: int = 5,
|
109 |
+
) -> list[str]:
|
110 |
"""
|
111 |
+
Extracts important phrases based on a list of keywords.
|
112 |
Phrase length is auto-determined, and overlapped parts are less than 20%.
|
113 |
|
114 |
Args:
|
115 |
paragraph (str): The input paragraph.
|
116 |
keywords (list[str]): List of important keywords.
|
117 |
+
phrase_length (int): Length of phrases to extract (default: 5 words).
|
118 |
|
119 |
Returns:
|
120 |
list: A list of important phrases.
|
121 |
"""
|
122 |
# Tokenize the paragraph into words
|
123 |
words = word_tokenize(paragraph.lower())
|
124 |
+
|
125 |
# Determine phrase length (between 3 and 7 words)
|
126 |
phrase_length = min(max(len(words) // 10, 5), 7)
|
127 |
+
|
128 |
# Generate n-grams (phrases) from the paragraph
|
129 |
phrases = list(ngrams(words, phrase_length))
|
130 |
+
|
131 |
important_phrases = []
|
132 |
used_indices = set()
|
133 |
+
|
134 |
for i, phrase in enumerate(phrases):
|
135 |
# Check if the phrase contains any keyword
|
136 |
if any(keyword.lower() in phrase for keyword in keywords):
|
|
|
138 |
if not any(abs(i - j) < phrase_length * 0.8 for j in used_indices):
|
139 |
important_phrases.append(clean_text(" ".join(phrase)))
|
140 |
used_indices.add(i)
|
141 |
+
|
142 |
return important_phrases
|
143 |
|
144 |
+
|
145 |
def extract_equal_text(text1, text2):
|
146 |
def cleanup(text):
|
147 |
text = text.lower()
|
148 |
+
text = text.translate(str.maketrans("", "", string.punctuation))
|
149 |
return text
|
150 |
+
|
151 |
splited_text1 = cleanup(text1).split()
|
152 |
splited_text2 = cleanup(text2).split()
|
153 |
+
|
154 |
s = SequenceMatcher(None, splited_text1, splited_text2)
|
155 |
+
|
156 |
equal_idx_1 = []
|
157 |
equal_idx_2 = []
|
158 |
text1 = text1.split()
|
159 |
text2 = text2.split()
|
160 |
for tag, i1, i2, j1, j2 in s.get_opcodes():
|
161 |
+
if tag == "equal":
|
162 |
equal_idx_1.append({"start": i1, "end": i2})
|
163 |
equal_idx_2.append({"start": j1, "end": j2})
|
164 |
# subtext_1 = " ".join(text1[i1:i2])
|
165 |
# subtext_2 = " ".join(text2[j1:j2])
|
166 |
+
# print(f'{tag:7} a[{i1:2}:{i2:2}]
|
167 |
+
# --> b[{j1:2}:{j1:2}] {subtext_1!r:>55} --> {subtext_2!r}')
|
168 |
return equal_idx_1, equal_idx_2
|
169 |
|
170 |
+
|
171 |
def connect_consecutive_indexes(nums):
|
172 |
"""
|
173 |
Connects consecutive integers in a list.
|
|
|
196 |
|
197 |
result.append([start, end]) # Add the last range
|
198 |
return result
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/application/text/highlight_text.py
CHANGED
@@ -1,36 +1,45 @@
|
|
1 |
-
import gradio as gr
|
2 |
import colorsys
|
3 |
-
|
4 |
-
import
|
5 |
|
6 |
|
7 |
def lighten_color(hex_color, factor=1.8):
|
8 |
"""Lightens a HEX color by increasing its brightness in HSV space."""
|
9 |
|
10 |
hex_color = hex_color.lstrip("#")
|
11 |
-
r, g, b =
|
12 |
-
|
|
|
|
|
|
|
|
|
13 |
# Convert to HSV
|
14 |
h, s, v = colorsys.rgb_to_hsv(r / 255.0, g / 255.0, b / 255.0)
|
15 |
v = min(1.0, v * factor) # Increase brightness
|
16 |
-
|
17 |
# Convert back to HEX
|
18 |
-
r, g, b =
|
19 |
-
return f
|
|
|
20 |
|
21 |
def darken_color(hex_color, factor=0.7):
|
22 |
"""Darkens a hex color by reducing its brightness in the HSV space."""
|
23 |
|
24 |
hex_color = hex_color.lstrip("#")
|
25 |
-
r, g, b =
|
26 |
-
|
|
|
|
|
|
|
|
|
27 |
# Convert to HSV to adjust brightness
|
28 |
h, s, v = colorsys.rgb_to_hsv(r / 255.0, g / 255.0, b / 255.0)
|
29 |
v = max(0, v * factor) # Reduce brightness
|
30 |
-
|
31 |
# Convert back to HEX
|
32 |
-
r, g, b =
|
33 |
-
return f
|
|
|
34 |
|
35 |
# Generate unique colors for pairs
|
36 |
def generate_color(index, total_colors=20):
|
@@ -38,51 +47,98 @@ def generate_color(index, total_colors=20):
|
|
38 |
|
39 |
hue = index / total_colors # Spread hues in range [0,1]
|
40 |
saturation = 0.65 # Keep colors vivid
|
41 |
-
lightness = 0.75
|
42 |
-
|
43 |
# Convert HSL to RGB
|
44 |
r, g, b = colorsys.hls_to_rgb(hue, lightness, saturation)
|
45 |
r, g, b = int(r * 255), int(g * 255), int(b * 255)
|
46 |
-
|
47 |
-
return f
|
|
|
48 |
|
49 |
def highlight_pairs(text1, text2):
|
50 |
"""Highlight matching pairs between two paragraphs"""
|
51 |
# Predefined matching pairs
|
52 |
match_pairs = [
|
53 |
-
{
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
59 |
]
|
60 |
|
61 |
# Assign unique colors to each index
|
62 |
-
pair_colors = {
|
63 |
-
|
64 |
-
|
65 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
66 |
highlighted_text = ""
|
67 |
prev_end = 0
|
68 |
-
|
69 |
for pair in sorted(pairs, key=lambda x: x[key_start]):
|
70 |
start, end, index = pair[key_start], pair[key_end], pair[key_index]
|
71 |
-
color = pair_colors.get(
|
72 |
-
|
73 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
74 |
|
75 |
# Style the index as a label
|
76 |
-
index_label = (
|
77 |
-
|
78 |
-
|
|
|
|
|
79 |
|
80 |
# Append non-highlighted text
|
81 |
highlighted_text += text[prev_end:start]
|
82 |
# Append highlighted text with index label
|
83 |
-
highlighted_text += (
|
84 |
-
|
85 |
-
|
|
|
|
|
86 |
prev_end = end
|
87 |
|
88 |
# Append remaining text
|
@@ -90,36 +146,57 @@ def highlight_pairs(text1, text2):
|
|
90 |
return highlighted_text
|
91 |
|
92 |
# Apply highlighting to both paragraphs using the global MATCH_PAIRS
|
93 |
-
highlighted_text1 = apply_highlight(
|
94 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
95 |
|
96 |
return highlighted_text1, highlighted_text2
|
97 |
|
98 |
-
|
|
|
99 |
# Create Gradio Interface
|
100 |
text1 = ""
|
101 |
-
|
102 |
with gr.Blocks() as demo:
|
103 |
gr.Markdown("### Highlight Matching Parts Between Two Paragraphs")
|
104 |
text1_input = gr.Textbox(
|
105 |
-
label="Paragraph 1",
|
106 |
-
lines=5,
|
107 |
-
value="
|
|
|
|
|
|
|
108 |
)
|
109 |
text2_input = gr.Textbox(
|
110 |
-
label="Paragraph 2",
|
111 |
-
lines=5,
|
112 |
-
value="
|
|
|
|
|
|
|
113 |
)
|
114 |
output1 = gr.HTML()
|
115 |
output2 = gr.HTML()
|
116 |
submit_button = gr.Button("Highlight Matches")
|
117 |
-
|
118 |
submit_button.click(
|
119 |
fn=highlight_pairs,
|
120 |
inputs=[text1_input, text2_input],
|
121 |
-
outputs=[output1, output2]
|
122 |
)
|
123 |
-
|
124 |
# Launch the Gradio app
|
125 |
demo.launch()
|
|
|
|
|
1 |
import colorsys
|
2 |
+
|
3 |
+
import gradio as gr
|
4 |
|
5 |
|
6 |
def lighten_color(hex_color, factor=1.8):
|
7 |
"""Lightens a HEX color by increasing its brightness in HSV space."""
|
8 |
|
9 |
hex_color = hex_color.lstrip("#")
|
10 |
+
r, g, b = (
|
11 |
+
int(hex_color[0:2], 16),
|
12 |
+
int(hex_color[2:4], 16),
|
13 |
+
int(hex_color[4:6], 16),
|
14 |
+
)
|
15 |
+
|
16 |
# Convert to HSV
|
17 |
h, s, v = colorsys.rgb_to_hsv(r / 255.0, g / 255.0, b / 255.0)
|
18 |
v = min(1.0, v * factor) # Increase brightness
|
19 |
+
|
20 |
# Convert back to HEX
|
21 |
+
r, g, b = (int(c * 255) for c in colorsys.hsv_to_rgb(h, s, v))
|
22 |
+
return f"#{r:02x}{g:02x}{b:02x}"
|
23 |
+
|
24 |
|
25 |
def darken_color(hex_color, factor=0.7):
|
26 |
"""Darkens a hex color by reducing its brightness in the HSV space."""
|
27 |
|
28 |
hex_color = hex_color.lstrip("#")
|
29 |
+
r, g, b = (
|
30 |
+
int(hex_color[0:2], 16),
|
31 |
+
int(hex_color[2:4], 16),
|
32 |
+
int(hex_color[4:6], 16),
|
33 |
+
)
|
34 |
+
|
35 |
# Convert to HSV to adjust brightness
|
36 |
h, s, v = colorsys.rgb_to_hsv(r / 255.0, g / 255.0, b / 255.0)
|
37 |
v = max(0, v * factor) # Reduce brightness
|
38 |
+
|
39 |
# Convert back to HEX
|
40 |
+
r, g, b = (int(c * 255) for c in colorsys.hsv_to_rgb(h, s, v))
|
41 |
+
return f"#{r:02x}{g:02x}{b:02x}"
|
42 |
+
|
43 |
|
44 |
# Generate unique colors for pairs
|
45 |
def generate_color(index, total_colors=20):
|
|
|
47 |
|
48 |
hue = index / total_colors # Spread hues in range [0,1]
|
49 |
saturation = 0.65 # Keep colors vivid
|
50 |
+
lightness = 0.75 # Balanced brightness
|
51 |
+
|
52 |
# Convert HSL to RGB
|
53 |
r, g, b = colorsys.hls_to_rgb(hue, lightness, saturation)
|
54 |
r, g, b = int(r * 255), int(g * 255), int(b * 255)
|
55 |
+
|
56 |
+
return f"#{r:02x}{g:02x}{b:02x}" # Convert to hex
|
57 |
+
|
58 |
|
59 |
def highlight_pairs(text1, text2):
|
60 |
"""Highlight matching pairs between two paragraphs"""
|
61 |
# Predefined matching pairs
|
62 |
match_pairs = [
|
63 |
+
{
|
64 |
+
"index": 1,
|
65 |
+
"text1": "deep learning",
|
66 |
+
"start1": 13,
|
67 |
+
"end1": 26,
|
68 |
+
"text2": "deep learning",
|
69 |
+
"start2": 12,
|
70 |
+
"end2": 25,
|
71 |
+
},
|
72 |
+
{
|
73 |
+
"index": 2,
|
74 |
+
"text1": "neural networks",
|
75 |
+
"start1": 56,
|
76 |
+
"end1": 71,
|
77 |
+
"text2": "neural networks",
|
78 |
+
"start2": 68,
|
79 |
+
"end2": 83,
|
80 |
+
},
|
81 |
+
{
|
82 |
+
"index": 3,
|
83 |
+
"text1": "AI research",
|
84 |
+
"start1": 86,
|
85 |
+
"end1": 97,
|
86 |
+
"text2": "AI research",
|
87 |
+
"start2": 55,
|
88 |
+
"end2": 66,
|
89 |
+
},
|
90 |
]
|
91 |
|
92 |
# Assign unique colors to each index
|
93 |
+
pair_colors = {
|
94 |
+
pair["index"]: generate_color(
|
95 |
+
pair["index"],
|
96 |
+
total_colors=len(match_pairs),
|
97 |
+
)
|
98 |
+
for pair in match_pairs
|
99 |
+
}
|
100 |
+
|
101 |
+
def apply_highlight(
|
102 |
+
text,
|
103 |
+
pairs,
|
104 |
+
key_start,
|
105 |
+
key_end,
|
106 |
+
key_index,
|
107 |
+
pair_colors,
|
108 |
+
):
|
109 |
highlighted_text = ""
|
110 |
prev_end = 0
|
111 |
+
|
112 |
for pair in sorted(pairs, key=lambda x: x[key_start]):
|
113 |
start, end, index = pair[key_start], pair[key_end], pair[key_index]
|
114 |
+
color = pair_colors.get(
|
115 |
+
index,
|
116 |
+
"#ddd",
|
117 |
+
) # Default color if not found
|
118 |
+
color = lighten_color(
|
119 |
+
color,
|
120 |
+
factor=2.2,
|
121 |
+
) # Lightened color for background text
|
122 |
+
label_color = darken_color(
|
123 |
+
color,
|
124 |
+
factor=0.7,
|
125 |
+
) # Make label color darker
|
126 |
|
127 |
# Style the index as a label
|
128 |
+
index_label = (
|
129 |
+
f'<span style="background-color:{label_color}; color:white; '
|
130 |
+
f"padding:1px 4px; border-radius:4px; font-size:12px; "
|
131 |
+
f'font-weight:bold; display:inline-block; margin-right:4px;">{index}</span>' # noqa: E501
|
132 |
+
)
|
133 |
|
134 |
# Append non-highlighted text
|
135 |
highlighted_text += text[prev_end:start]
|
136 |
# Append highlighted text with index label
|
137 |
+
highlighted_text += (
|
138 |
+
f'<span style="background-color:{color}; '
|
139 |
+
f'border-radius:3px; font-size:14px; display:inline-block;">'
|
140 |
+
f"{index_label} {text[start:end]}</span>"
|
141 |
+
)
|
142 |
prev_end = end
|
143 |
|
144 |
# Append remaining text
|
|
|
146 |
return highlighted_text
|
147 |
|
148 |
# Apply highlighting to both paragraphs using the global MATCH_PAIRS
|
149 |
+
highlighted_text1 = apply_highlight(
|
150 |
+
text1,
|
151 |
+
match_pairs,
|
152 |
+
"start1",
|
153 |
+
"end1",
|
154 |
+
"index",
|
155 |
+
pair_colors,
|
156 |
+
)
|
157 |
+
highlighted_text2 = apply_highlight(
|
158 |
+
text2,
|
159 |
+
match_pairs,
|
160 |
+
"start2",
|
161 |
+
"end2",
|
162 |
+
"index",
|
163 |
+
pair_colors,
|
164 |
+
)
|
165 |
|
166 |
return highlighted_text1, highlighted_text2
|
167 |
|
168 |
+
|
169 |
+
if __name__ == "__main__":
|
170 |
# Create Gradio Interface
|
171 |
text1 = ""
|
172 |
+
|
173 |
with gr.Blocks() as demo:
|
174 |
gr.Markdown("### Highlight Matching Parts Between Two Paragraphs")
|
175 |
text1_input = gr.Textbox(
|
176 |
+
label="Paragraph 1",
|
177 |
+
lines=5,
|
178 |
+
value="""
|
179 |
+
The field of deep learning is advancing rapidly.
|
180 |
+
Modern neural networks are improving AI research significantly.
|
181 |
+
""",
|
182 |
)
|
183 |
text2_input = gr.Textbox(
|
184 |
+
label="Paragraph 2",
|
185 |
+
lines=5,
|
186 |
+
value="""
|
187 |
+
Advances in deep learning have led to breakthroughs in AI research.
|
188 |
+
Neural networks are at the core of these innovations",
|
189 |
+
""",
|
190 |
)
|
191 |
output1 = gr.HTML()
|
192 |
output2 = gr.HTML()
|
193 |
submit_button = gr.Button("Highlight Matches")
|
194 |
+
|
195 |
submit_button.click(
|
196 |
fn=highlight_pairs,
|
197 |
inputs=[text1_input, text2_input],
|
198 |
+
outputs=[output1, output2],
|
199 |
)
|
200 |
+
|
201 |
# Launch the Gradio app
|
202 |
demo.launch()
|
src/application/text/model_detection.py
CHANGED
@@ -19,7 +19,7 @@ def detect_text_by_ai_model(
|
|
19 |
"""
|
20 |
Model: chatgpt_detector_roberta
|
21 |
Ref: https://huggingface.co/Hello-SimpleAI/chatgpt-detector-roberta
|
22 |
-
|
23 |
Detects if text is human or machine generated.
|
24 |
|
25 |
Returns:
|
|
|
19 |
"""
|
20 |
Model: chatgpt_detector_roberta
|
21 |
Ref: https://huggingface.co/Hello-SimpleAI/chatgpt-detector-roberta
|
22 |
+
|
23 |
Detects if text is human or machine generated.
|
24 |
|
25 |
Returns:
|
src/application/text/preprocessing.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
from nltk.tokenize import sent_tokenize
|
2 |
|
|
|
3 |
def split_into_paragraphs(input_text):
|
4 |
"""
|
5 |
Splits input text into sentences by newlines.
|
@@ -17,6 +18,6 @@ def split_into_paragraphs(input_text):
|
|
17 |
sentences = []
|
18 |
for paragraph in paragraphs:
|
19 |
paragraph = paragraph.strip()
|
20 |
-
if paragraph and paragraph !=
|
21 |
sentences.extend(sent_tokenize(paragraph))
|
22 |
-
return sentences
|
|
|
1 |
from nltk.tokenize import sent_tokenize
|
2 |
|
3 |
+
|
4 |
def split_into_paragraphs(input_text):
|
5 |
"""
|
6 |
Splits input text into sentences by newlines.
|
|
|
18 |
sentences = []
|
19 |
for paragraph in paragraphs:
|
20 |
paragraph = paragraph.strip()
|
21 |
+
if paragraph and paragraph != "\n":
|
22 |
sentences.extend(sent_tokenize(paragraph))
|
23 |
+
return sentences
|
src/application/text/search.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
-
from collections import Counter
|
2 |
import os
|
3 |
import string
|
|
|
|
|
4 |
import requests
|
5 |
from dotenv import load_dotenv
|
6 |
from nltk.corpus import stopwords
|
@@ -9,27 +10,28 @@ from sklearn.feature_extraction.text import TfidfVectorizer
|
|
9 |
|
10 |
from src.application.text.entity import extract_entities
|
11 |
|
12 |
-
load_dotenv()
|
13 |
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
|
14 |
SEARCH_ENGINE_ID = os.getenv("SEARCH_ENGINE_ID")
|
15 |
|
|
|
16 |
def search_by_google(
|
17 |
-
query,
|
18 |
num_results=10,
|
19 |
-
is_exact_terms
|
20 |
-
|
21 |
"""
|
22 |
Searches the Google Custom Search Engine for the given query.
|
23 |
|
24 |
Args:
|
25 |
query: The search query.
|
26 |
-
is_exact_terms: Whether to use exact terms search (True) or
|
27 |
num_results: The number of results to return (default: 10).
|
28 |
|
29 |
Returns:
|
30 |
-
A
|
31 |
"""
|
32 |
-
|
33 |
url = "https://www.googleapis.com/customsearch/v1"
|
34 |
params = {
|
35 |
"key": GOOGLE_API_KEY,
|
@@ -40,7 +42,7 @@ def search_by_google(
|
|
40 |
params["exactTerms"] = query
|
41 |
else:
|
42 |
params["q"] = query.replace('"', "")
|
43 |
-
|
44 |
response = requests.get(url, params=params)
|
45 |
if response.status_code == 200:
|
46 |
return response.json()
|
@@ -48,9 +50,11 @@ def search_by_google(
|
|
48 |
print(f"Error: {response.status_code}, {response.text}")
|
49 |
return None
|
50 |
|
|
|
51 |
def get_most_frequent_words(input_text, number_word=32):
|
52 |
"""
|
53 |
-
Gets the top words from the input text,
|
|
|
54 |
|
55 |
Args:
|
56 |
input_text: The input text as a string.
|
@@ -65,18 +69,21 @@ def get_most_frequent_words(input_text, number_word=32):
|
|
65 |
|
66 |
words = word_tokenize(input_text.lower()) # Tokenize and lowercase
|
67 |
|
68 |
-
stop_words = set(stopwords.words(
|
69 |
-
punctuation = set(string.punctuation)
|
70 |
filtered_words = [
|
71 |
-
word
|
72 |
-
|
|
|
|
|
|
|
73 |
]
|
74 |
word_frequencies = Counter(filtered_words)
|
75 |
top_words = word_frequencies.most_common(number_word)
|
76 |
-
|
77 |
for top_word in top_words:
|
78 |
words.append(top_word[0])
|
79 |
-
|
80 |
if len(words) > 32:
|
81 |
search_phrase = " ".join(words[:32])
|
82 |
else:
|
@@ -84,6 +91,7 @@ def get_most_frequent_words(input_text, number_word=32):
|
|
84 |
|
85 |
return search_phrase
|
86 |
|
|
|
87 |
def get_chunk(input_text, chunk_length=32, num_chunk=3):
|
88 |
"""
|
89 |
Splits the input text into chunks of a specified length.
|
@@ -94,7 +102,7 @@ def get_chunk(input_text, chunk_length=32, num_chunk=3):
|
|
94 |
chunk_length: The desired length of each chunk (in words).
|
95 |
|
96 |
Returns:
|
97 |
-
A list of string chunks.
|
98 |
Returns an empty list if input is invalid.
|
99 |
"""
|
100 |
if not isinstance(input_text, str):
|
@@ -112,25 +120,26 @@ def get_chunk(input_text, chunk_length=32, num_chunk=3):
|
|
112 |
|
113 |
return chunks
|
114 |
|
|
|
115 |
def get_keywords(text, num_keywords=5):
|
116 |
"""Return top k keywords from a doc using TF-IDF method"""
|
117 |
-
|
118 |
# Create a TF-IDF Vectorizer
|
119 |
-
vectorizer = TfidfVectorizer(stop_words=
|
120 |
-
|
121 |
# Fit and transform the text
|
122 |
tfidf_matrix = vectorizer.fit_transform([text])
|
123 |
-
|
124 |
# Get feature names (words)
|
125 |
feature_names = vectorizer.get_feature_names_out()
|
126 |
-
|
127 |
# Get TF-IDF scores
|
128 |
tfidf_scores = tfidf_matrix.toarray()[0]
|
129 |
-
|
130 |
# Sort words by TF-IDF score
|
131 |
word_scores = list(zip(feature_names, tfidf_scores))
|
132 |
word_scores.sort(key=lambda x: x[1], reverse=True)
|
133 |
-
|
134 |
# Return top keywords
|
135 |
return [word for word, score in word_scores[:num_keywords]]
|
136 |
|
@@ -150,29 +159,30 @@ def generate_search_phrases(input_text):
|
|
150 |
"""
|
151 |
if not isinstance(input_text, str):
|
152 |
return []
|
153 |
-
|
154 |
search_phrases = []
|
155 |
-
|
156 |
# Method 1: Get most frequent words
|
157 |
search_phrases.append(get_most_frequent_words(input_text))
|
158 |
-
|
159 |
# Method 2: Get the whole text
|
160 |
search_phrases.append(input_text)
|
161 |
-
|
162 |
# Method 3: Split text by chunks
|
163 |
search_phrases.extend(get_chunk(input_text)) # TODO: for demo purposes
|
164 |
-
|
165 |
# Method 4: Get most identities and key words
|
166 |
entities = extract_entities(input_text)
|
167 |
text_without_entities = remove_identities_from_text(input_text, entities)
|
168 |
print(f"text_without_entities: {text_without_entities}")
|
169 |
search_phrases.append(text_without_entities)
|
170 |
-
#keywords = get_keywords(input_text, 16)
|
171 |
-
#search_phrase = " ".join(entities) + " " + " ".join(keywords)
|
172 |
# search_phrases.append(search_phrase) # TODO: for demo purposes
|
173 |
-
|
174 |
return search_phrases
|
175 |
|
|
|
176 |
def remove_identities_from_text(input_text, entities):
|
177 |
"""
|
178 |
Removes entities from the input text.
|
@@ -183,5 +193,5 @@ def remove_identities_from_text(input_text, entities):
|
|
183 |
"""
|
184 |
for entity in entities:
|
185 |
input_text = input_text.replace(entity, "")
|
186 |
-
|
187 |
return input_text
|
|
|
|
|
1 |
import os
|
2 |
import string
|
3 |
+
from collections import Counter
|
4 |
+
|
5 |
import requests
|
6 |
from dotenv import load_dotenv
|
7 |
from nltk.corpus import stopwords
|
|
|
10 |
|
11 |
from src.application.text.entity import extract_entities
|
12 |
|
13 |
+
load_dotenv()
|
14 |
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
|
15 |
SEARCH_ENGINE_ID = os.getenv("SEARCH_ENGINE_ID")
|
16 |
|
17 |
+
|
18 |
def search_by_google(
|
19 |
+
query,
|
20 |
num_results=10,
|
21 |
+
is_exact_terms=False,
|
22 |
+
) -> dict:
|
23 |
"""
|
24 |
Searches the Google Custom Search Engine for the given query.
|
25 |
|
26 |
Args:
|
27 |
query: The search query.
|
28 |
+
is_exact_terms: Whether to use exact terms search (True) or not.
|
29 |
num_results: The number of results to return (default: 10).
|
30 |
|
31 |
Returns:
|
32 |
+
A dict containing the search results or None if there was an error.
|
33 |
"""
|
34 |
+
|
35 |
url = "https://www.googleapis.com/customsearch/v1"
|
36 |
params = {
|
37 |
"key": GOOGLE_API_KEY,
|
|
|
42 |
params["exactTerms"] = query
|
43 |
else:
|
44 |
params["q"] = query.replace('"', "")
|
45 |
+
|
46 |
response = requests.get(url, params=params)
|
47 |
if response.status_code == 200:
|
48 |
return response.json()
|
|
|
50 |
print(f"Error: {response.status_code}, {response.text}")
|
51 |
return None
|
52 |
|
53 |
+
|
54 |
def get_most_frequent_words(input_text, number_word=32):
|
55 |
"""
|
56 |
+
Gets the top words from the input text,
|
57 |
+
excluding stop words and punctuation.
|
58 |
|
59 |
Args:
|
60 |
input_text: The input text as a string.
|
|
|
69 |
|
70 |
words = word_tokenize(input_text.lower()) # Tokenize and lowercase
|
71 |
|
72 |
+
stop_words = set(stopwords.words("english"))
|
73 |
+
punctuation = set(string.punctuation) # get all punctuation
|
74 |
filtered_words = [
|
75 |
+
word
|
76 |
+
for word in words
|
77 |
+
if word.isalnum()
|
78 |
+
and word not in stop_words
|
79 |
+
and word not in punctuation
|
80 |
]
|
81 |
word_frequencies = Counter(filtered_words)
|
82 |
top_words = word_frequencies.most_common(number_word)
|
83 |
+
|
84 |
for top_word in top_words:
|
85 |
words.append(top_word[0])
|
86 |
+
|
87 |
if len(words) > 32:
|
88 |
search_phrase = " ".join(words[:32])
|
89 |
else:
|
|
|
91 |
|
92 |
return search_phrase
|
93 |
|
94 |
+
|
95 |
def get_chunk(input_text, chunk_length=32, num_chunk=3):
|
96 |
"""
|
97 |
Splits the input text into chunks of a specified length.
|
|
|
102 |
chunk_length: The desired length of each chunk (in words).
|
103 |
|
104 |
Returns:
|
105 |
+
A list of string chunks.
|
106 |
Returns an empty list if input is invalid.
|
107 |
"""
|
108 |
if not isinstance(input_text, str):
|
|
|
120 |
|
121 |
return chunks
|
122 |
|
123 |
+
|
124 |
def get_keywords(text, num_keywords=5):
|
125 |
"""Return top k keywords from a doc using TF-IDF method"""
|
126 |
+
|
127 |
# Create a TF-IDF Vectorizer
|
128 |
+
vectorizer = TfidfVectorizer(stop_words="english")
|
129 |
+
|
130 |
# Fit and transform the text
|
131 |
tfidf_matrix = vectorizer.fit_transform([text])
|
132 |
+
|
133 |
# Get feature names (words)
|
134 |
feature_names = vectorizer.get_feature_names_out()
|
135 |
+
|
136 |
# Get TF-IDF scores
|
137 |
tfidf_scores = tfidf_matrix.toarray()[0]
|
138 |
+
|
139 |
# Sort words by TF-IDF score
|
140 |
word_scores = list(zip(feature_names, tfidf_scores))
|
141 |
word_scores.sort(key=lambda x: x[1], reverse=True)
|
142 |
+
|
143 |
# Return top keywords
|
144 |
return [word for word, score in word_scores[:num_keywords]]
|
145 |
|
|
|
159 |
"""
|
160 |
if not isinstance(input_text, str):
|
161 |
return []
|
162 |
+
|
163 |
search_phrases = []
|
164 |
+
|
165 |
# Method 1: Get most frequent words
|
166 |
search_phrases.append(get_most_frequent_words(input_text))
|
167 |
+
|
168 |
# Method 2: Get the whole text
|
169 |
search_phrases.append(input_text)
|
170 |
+
|
171 |
# Method 3: Split text by chunks
|
172 |
search_phrases.extend(get_chunk(input_text)) # TODO: for demo purposes
|
173 |
+
|
174 |
# Method 4: Get most identities and key words
|
175 |
entities = extract_entities(input_text)
|
176 |
text_without_entities = remove_identities_from_text(input_text, entities)
|
177 |
print(f"text_without_entities: {text_without_entities}")
|
178 |
search_phrases.append(text_without_entities)
|
179 |
+
# keywords = get_keywords(input_text, 16)
|
180 |
+
# search_phrase = " ".join(entities) + " " + " ".join(keywords)
|
181 |
# search_phrases.append(search_phrase) # TODO: for demo purposes
|
182 |
+
|
183 |
return search_phrases
|
184 |
|
185 |
+
|
186 |
def remove_identities_from_text(input_text, entities):
|
187 |
"""
|
188 |
Removes entities from the input text.
|
|
|
193 |
"""
|
194 |
for entity in entities:
|
195 |
input_text = input_text.replace(entity, "")
|
196 |
+
|
197 |
return input_text
|
src/application/text/search_detection.py
CHANGED
@@ -1,28 +1,33 @@
|
|
1 |
import string
|
2 |
import warnings
|
3 |
-
|
4 |
|
5 |
-
from src.application.text.preprocessing import split_into_paragraphs
|
6 |
-
from src.application.text.search import generate_search_phrases, search_by_google
|
7 |
-
from src.application.url_reader import URLReader
|
8 |
-
from src.application.text.helper import extract_equal_text
|
9 |
-
import numpy as np
|
10 |
import nltk
|
|
|
11 |
import torch
|
12 |
-
from
|
13 |
-
|
14 |
-
|
|
|
15 |
|
16 |
-
from
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
|
18 |
# Download necessary NLTK data files
|
19 |
-
nltk.download(
|
20 |
-
nltk.download(
|
21 |
-
nltk.download(
|
22 |
|
23 |
# load the model
|
24 |
-
DEVICE = torch.device(
|
25 |
-
PARAPHASE_MODEL = SentenceTransformer(
|
26 |
PARAPHASE_MODEL.to(DEVICE)
|
27 |
|
28 |
BATCH_SIZE = 8
|
@@ -35,63 +40,94 @@ MIN_RATIO_PARAPHRASE_NUM = 0.7
|
|
35 |
MAX_CHAR_SIZE = 30000
|
36 |
|
37 |
|
38 |
-
def detect_text_by_relative_search(
|
|
|
|
|
|
|
|
|
39 |
checked_urls = set()
|
40 |
searched_phrases = generate_search_phrases(input_text[index])
|
41 |
|
42 |
for candidate in searched_phrases:
|
43 |
search_results = search_by_google(candidate)
|
44 |
-
urls = [item[
|
45 |
|
46 |
for url in urls[:3]:
|
47 |
-
if url in checked_urls:
|
48 |
continue
|
49 |
if "bbc.com" not in url:
|
50 |
continue
|
51 |
-
|
52 |
checked_urls.add(url)
|
53 |
print(f"\t\tChecking URL: {url}")
|
54 |
-
|
55 |
content = URLReader(url)
|
56 |
-
|
57 |
if content.is_extracted is True:
|
58 |
if content.title is None or content.text is None:
|
59 |
-
print(
|
60 |
continue
|
61 |
-
|
62 |
page_text = content.title + "\n" + content.text
|
63 |
if len(page_text) > MAX_CHAR_SIZE:
|
64 |
print(f"\t\t\tβββ More than {MAX_CHAR_SIZE} characters")
|
65 |
continue
|
66 |
-
print(f"\t\t\tβββ Title: {content.title}")
|
67 |
-
paraphrase, aligned_first_sentences = check_paraphrase(
|
68 |
-
|
|
|
|
|
|
|
|
|
69 |
if paraphrase is False:
|
70 |
-
return
|
71 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
72 |
sub_paraphrase = True
|
73 |
-
while sub_paraphrase
|
74 |
index += 1
|
75 |
print(f"----search {index} < {len(input_text)}----")
|
76 |
if index >= len(input_text):
|
77 |
print(f"input_text_last: {input_text[-1]}")
|
78 |
break
|
79 |
print(f"input_text: {input_text[index]}")
|
80 |
-
sub_paraphrase, sub_sentences = check_paraphrase(
|
|
|
|
|
|
|
|
|
81 |
print(f"sub_paraphrase: {sub_paraphrase}")
|
82 |
print(f"sub_sentences: {sub_sentences}")
|
83 |
-
if sub_paraphrase
|
84 |
-
aligned_first_sentences["input_sentence"] +=
|
85 |
-
|
86 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
87 |
aligned_first_sentences["similarity"] /= 2
|
88 |
-
|
89 |
print(f"paraphrase: {paraphrase}")
|
90 |
print(f"aligned_first_sentences: {aligned_first_sentences}")
|
91 |
-
return
|
92 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
93 |
return False, None, [], [], index
|
94 |
|
|
|
95 |
def find_text_source(text, text_index, sentences_df):
|
96 |
sentence = {
|
97 |
"input_sentence": text[text_index],
|
@@ -101,67 +137,94 @@ def find_text_source(text, text_index, sentences_df):
|
|
101 |
"paraphrase": None,
|
102 |
"url": "",
|
103 |
"group": None,
|
104 |
-
|
105 |
checked_urls = set()
|
106 |
searched_phrases = generate_search_phrases(text[text_index])
|
107 |
|
108 |
for candidate in searched_phrases:
|
109 |
search_results = search_by_google(candidate)
|
110 |
-
urls = [item[
|
111 |
|
112 |
for url in urls[:3]:
|
113 |
-
if url in checked_urls:
|
114 |
continue
|
115 |
if "bbc.com" not in url:
|
116 |
continue
|
117 |
-
|
118 |
checked_urls.add(url)
|
119 |
print(f"\t\tChecking URL: {url}")
|
120 |
-
|
121 |
content = URLReader(url)
|
122 |
-
|
123 |
if content.is_extracted is True:
|
124 |
if content.title is None or content.text is None:
|
125 |
-
print(
|
126 |
continue
|
127 |
-
|
128 |
page_text = content.title + "\n" + content.text
|
129 |
if len(page_text) > MAX_CHAR_SIZE:
|
130 |
print(f"\t\t\tβββ More than {MAX_CHAR_SIZE} characters")
|
131 |
continue
|
132 |
-
print(f"\t\t\tβββ Title: {content.title}")
|
133 |
-
paraphrase, aligned_sentence = check_paraphrase(
|
134 |
-
|
|
|
|
|
|
|
|
|
135 |
# add one more key "group" into aligned_sentence
|
136 |
-
sentences_df.loc[
|
137 |
-
|
138 |
-
|
139 |
-
sentences_df.loc[
|
140 |
-
|
141 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
142 |
if aligned_sentence["paraphrase"] is False:
|
143 |
return paraphrase, sentences_df
|
144 |
-
|
145 |
-
for
|
146 |
-
if sentences_df[
|
147 |
continue
|
148 |
-
|
149 |
# find content in new url
|
150 |
-
_, aligned_sentence = check_paraphrase(
|
151 |
-
|
|
|
|
|
|
|
|
|
152 |
if aligned_sentence["url"] is not None:
|
153 |
continue
|
154 |
-
|
155 |
-
sentences_df.loc[
|
156 |
-
|
157 |
-
|
158 |
-
sentences_df.loc[
|
159 |
-
|
160 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
161 |
return sentences_df, content.images
|
162 |
-
|
163 |
return sentence, []
|
164 |
|
|
|
165 |
def longest_common_subsequence(arr1, arr2):
|
166 |
"""
|
167 |
Finds the length of the longest common subsequence (contiguous) between
|
@@ -172,7 +235,7 @@ def longest_common_subsequence(arr1, arr2):
|
|
172 |
arr2: The second array.
|
173 |
|
174 |
Returns:
|
175 |
-
The length of the longest common subsequence.
|
176 |
Returns 0 if either input is invalid.
|
177 |
"""
|
178 |
|
@@ -182,7 +245,7 @@ def longest_common_subsequence(arr1, arr2):
|
|
182 |
n = len(arr1)
|
183 |
m = len(arr2)
|
184 |
|
185 |
-
if n == 0 or m == 0:
|
186 |
return 0
|
187 |
|
188 |
# Create table dp with size (n+1) x (m+1)
|
@@ -200,10 +263,15 @@ def longest_common_subsequence(arr1, arr2):
|
|
200 |
return max_length
|
201 |
|
202 |
|
203 |
-
def check_sentence(
|
204 |
-
|
|
|
|
|
|
|
|
|
|
|
205 |
"""
|
206 |
-
Checks if two sentences are similar based on exact match or
|
207 |
longest common subsequence.
|
208 |
|
209 |
Args:
|
@@ -218,7 +286,10 @@ def check_sentence(input_sentence, source_sentence, min_same_sentence_len,
|
|
218 |
Returns False if input is not valid.
|
219 |
"""
|
220 |
|
221 |
-
if not isinstance(input_sentence, str) or not isinstance(
|
|
|
|
|
|
|
222 |
return False
|
223 |
|
224 |
input_sentence = input_sentence.strip()
|
@@ -230,7 +301,10 @@ def check_sentence(input_sentence, source_sentence, min_same_sentence_len,
|
|
230 |
input_words = input_sentence.split() # split without arguments
|
231 |
source_words = source_sentence.split() # split without arguments
|
232 |
|
233 |
-
if
|
|
|
|
|
|
|
234 |
if verbose:
|
235 |
print("Exact match found.")
|
236 |
return True
|
@@ -251,29 +325,24 @@ def check_paraphrase(input_text, page_text, url):
|
|
251 |
Args:
|
252 |
input_text: The text to check for paraphrase.
|
253 |
page_text: The text of the web page to compare with.
|
254 |
-
|
255 |
|
256 |
Returns:
|
257 |
A tuple containing:
|
258 |
-
|
259 |
-
- paraphrase_results: A list of dictionaries, each containing:
|
260 |
-
- input_sentence: The sentence from the input text.
|
261 |
-
- matched_sentence: The corresponding sentence from the web page (if found).
|
262 |
-
- similarity: The cosine similarity score between the sentences.
|
263 |
-
- is_paraphrase_sentence: True if the individual sentence pair meets the paraphrase criteria, False otherwise.
|
264 |
"""
|
265 |
is_paraphrase_text = False
|
266 |
-
|
267 |
if not isinstance(input_text, str) or not isinstance(page_text, str):
|
268 |
return False, []
|
269 |
|
270 |
# Extract sentences from input text and web page
|
271 |
# input_sentences = split_into_paragraphs(input_text)
|
272 |
input_sentences = [input_text]
|
273 |
-
|
274 |
if not page_text:
|
275 |
return is_paraphrase_text, []
|
276 |
-
|
277 |
page_sentences = split_into_paragraphs(page_text)
|
278 |
if not input_sentences or not page_sentences:
|
279 |
return is_paraphrase_text, []
|
@@ -283,10 +352,18 @@ def check_paraphrase(input_text, page_text, url):
|
|
283 |
if ", external" in sentence:
|
284 |
additional_sentences.append(sentence.replace(", external", ""))
|
285 |
page_sentences.extend(additional_sentences)
|
286 |
-
|
287 |
# Encode sentences into embeddings
|
288 |
-
embeddings1 = PARAPHASE_MODEL.encode(
|
289 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
290 |
|
291 |
# Compute cosine similarity matrix
|
292 |
similarity_matrix = util.cos_sim(embeddings1, embeddings2).cpu().numpy()
|
@@ -298,7 +375,7 @@ def check_paraphrase(input_text, page_text, url):
|
|
298 |
for i, sentence1 in enumerate(input_sentences):
|
299 |
max_sim_index = np.argmax(similarity_matrix[i])
|
300 |
max_similarity = similarity_matrix[i][max_sim_index]
|
301 |
-
|
302 |
best_matched_sentence = page_sentences[max_sim_index]
|
303 |
is_paraphrase_sentence = max_similarity > PARAPHRASE_THRESHOLD
|
304 |
|
@@ -321,29 +398,40 @@ def check_paraphrase(input_text, page_text, url):
|
|
321 |
"url": url,
|
322 |
}
|
323 |
|
324 |
-
# Check for individual sentence paraphrase
|
|
|
325 |
if not is_paraphrase_text and check_sentence(
|
326 |
-
sentence1,
|
|
|
|
|
|
|
327 |
):
|
328 |
is_paraphrase_text = True
|
329 |
|
330 |
-
#alignment.append(item)
|
331 |
paraphrased_sentence_count += 1 if is_paraphrase_sentence else 0
|
332 |
|
333 |
# Check if enough sentences are paraphrases
|
334 |
-
|
335 |
-
is_paraphrase_text =
|
336 |
-
|
|
|
|
|
337 |
# Method 2: Check if overlapped words between sentences are more than 50%
|
338 |
-
equal_idx_1, _ = extract_equal_text(
|
|
|
|
|
|
|
339 |
matched_count = 0
|
340 |
for index in equal_idx_1:
|
341 |
matched_count += index["end"] - index["start"]
|
342 |
-
sent = input_sentences[0].translate(
|
|
|
|
|
343 |
num_words = len(sent.split())
|
344 |
if matched_count > num_words / 2:
|
345 |
is_paraphrase_text = True
|
346 |
-
|
347 |
return is_paraphrase_text, alignment
|
348 |
|
349 |
|
@@ -359,10 +447,16 @@ def similarity_ratio(a, b):
|
|
359 |
A float representing the similarity ratio between 0.0 and 1.0.
|
360 |
Returns 0.0 if either input is None or not a string.
|
361 |
"""
|
362 |
-
if
|
|
|
|
|
|
|
|
|
|
|
363 |
return 0.0 # Handle cases where inputs are not strings or None
|
364 |
return SequenceMatcher(None, a, b).ratio()
|
365 |
|
|
|
366 |
def check_human(alligned_sentences):
|
367 |
"""
|
368 |
Checks if a sufficient number of input sentences are found within
|
@@ -379,5 +473,5 @@ def check_human(alligned_sentences):
|
|
379 |
return False
|
380 |
|
381 |
|
382 |
-
if __name__ ==
|
383 |
-
pass
|
|
|
1 |
import string
|
2 |
import warnings
|
3 |
+
from difflib import SequenceMatcher
|
4 |
|
|
|
|
|
|
|
|
|
|
|
5 |
import nltk
|
6 |
+
import numpy as np
|
7 |
import torch
|
8 |
+
from sentence_transformers import (
|
9 |
+
SentenceTransformer,
|
10 |
+
util,
|
11 |
+
)
|
12 |
|
13 |
+
from src.application.text.helper import extract_equal_text
|
14 |
+
from src.application.text.preprocessing import split_into_paragraphs
|
15 |
+
from src.application.text.search import (
|
16 |
+
generate_search_phrases,
|
17 |
+
search_by_google,
|
18 |
+
)
|
19 |
+
from src.application.url_reader import URLReader
|
20 |
+
|
21 |
+
warnings.simplefilter(action="ignore", category=FutureWarning)
|
22 |
|
23 |
# Download necessary NLTK data files
|
24 |
+
nltk.download("punkt", quiet=True)
|
25 |
+
nltk.download("punkt_tab", quiet=True)
|
26 |
+
nltk.download("stopwords", quiet=True)
|
27 |
|
28 |
# load the model
|
29 |
+
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
30 |
+
PARAPHASE_MODEL = SentenceTransformer("paraphrase-MiniLM-L6-v2")
|
31 |
PARAPHASE_MODEL.to(DEVICE)
|
32 |
|
33 |
BATCH_SIZE = 8
|
|
|
40 |
MAX_CHAR_SIZE = 30000
|
41 |
|
42 |
|
43 |
+
def detect_text_by_relative_search(
    input_text,
    index,
    is_support_opposite=False,
):
    """Search the web for a source page that matches the input sentences.

    Generates search phrases from ``input_text[index]``, queries Google,
    and inspects up to three result URLs per phrase (BBC pages only).
    Once a candidate page is fetched, the sentence at ``index`` is checked
    for paraphrase against the page; consecutive following sentences that
    the same page also paraphrases are merged into one aligned record.

    Args:
        input_text: Sequence of sentences, indexable by ``index``.
        index: Position of the first sentence to search for.
        is_support_opposite: Unused here.  # NOTE(review): dead parameter — confirm callers.

    Returns:
        A 5-tuple ``(paraphrase, url, aligned_sentences, images, index)``;
        ``(False, None, [], [], index)`` when no suitable source is found.
    """
    checked_urls = set()  # URLs already fetched, to avoid duplicate downloads
    searched_phrases = generate_search_phrases(input_text[index])

    for candidate in searched_phrases:
        search_results = search_by_google(candidate)
        urls = [item["link"] for item in search_results.get("items", [])]

        # Only the top 3 hits per search phrase are inspected.
        for url in urls[:3]:
            if url in checked_urls:  # visited url
                continue
            if "bbc.com" not in url:  # sources are restricted to BBC
                continue

            checked_urls.add(url)
            print(f"\t\tChecking URL: {url}")

            content = URLReader(url)

            if content.is_extracted is True:
                if content.title is None or content.text is None:
                    print("\t\t\tβββ Title or text not found")
                    continue

                page_text = content.title + "\n" + content.text
                if len(page_text) > MAX_CHAR_SIZE:  # skip oversized pages
                    print(f"\t\t\tβββ More than {MAX_CHAR_SIZE} characters")
                    continue
                print(f"\t\t\tβββ Title: {content.title}")
                paraphrase, aligned_first_sentences = check_paraphrase(
                    input_text[index],
                    page_text,
                    url,
                )

                # Page does not paraphrase the sentence: report the
                # (negative) result for this URL immediately.
                if paraphrase is False:
                    return (
                        paraphrase,
                        url,
                        aligned_first_sentences,
                        content.images,
                        index,
                    )

                # Greedily absorb following sentences that the same page
                # also paraphrases, concatenating them with "<br>".
                sub_paraphrase = True
                while sub_paraphrase is True:
                    index += 1
                    print(f"----search {index} < {len(input_text)}----")
                    if index >= len(input_text):
                        print(f"input_text_last: {input_text[-1]}")
                        break
                    print(f"input_text: {input_text[index]}")
                    sub_paraphrase, sub_sentences = check_paraphrase(
                        input_text[index],
                        page_text,
                        url,
                    )
                    print(f"sub_paraphrase: {sub_paraphrase}")
                    print(f"sub_sentences: {sub_sentences}")
                    if sub_paraphrase is True:
                        aligned_first_sentences["input_sentence"] += (
                            "<br>" + sub_sentences["input_sentence"]
                        )
                        aligned_first_sentences["matched_sentence"] += (
                            "<br>" + sub_sentences["matched_sentence"]
                        )
                        aligned_first_sentences["similarity"] += sub_sentences[
                            "similarity"
                        ]
                        # Halves the running sum on every merge rather than
                        # averaging over all merged parts.
                        # NOTE(review): confirm this weighting is intended.
                        aligned_first_sentences["similarity"] /= 2

                print(f"paraphrase: {paraphrase}")
                print(f"aligned_first_sentences: {aligned_first_sentences}")
                return (
                    paraphrase,
                    url,
                    aligned_first_sentences,
                    content.images,
                    index,
                )

    # No suitable source page found for any generated search phrase.
    return False, None, [], [], index
|
129 |
|
130 |
+
|
131 |
def find_text_source(text, text_index, sentences_df):
|
132 |
sentence = {
|
133 |
"input_sentence": text[text_index],
|
|
|
137 |
"paraphrase": None,
|
138 |
"url": "",
|
139 |
"group": None,
|
140 |
+
}
|
141 |
checked_urls = set()
|
142 |
searched_phrases = generate_search_phrases(text[text_index])
|
143 |
|
144 |
for candidate in searched_phrases:
|
145 |
search_results = search_by_google(candidate)
|
146 |
+
urls = [item["link"] for item in search_results.get("items", [])]
|
147 |
|
148 |
for url in urls[:3]:
|
149 |
+
if url in checked_urls: # visited url
|
150 |
continue
|
151 |
if "bbc.com" not in url:
|
152 |
continue
|
153 |
+
|
154 |
checked_urls.add(url)
|
155 |
print(f"\t\tChecking URL: {url}")
|
156 |
+
|
157 |
content = URLReader(url)
|
158 |
+
|
159 |
if content.is_extracted is True:
|
160 |
if content.title is None or content.text is None:
|
161 |
+
print("\t\t\tβββ Title or text not found")
|
162 |
continue
|
163 |
+
|
164 |
page_text = content.title + "\n" + content.text
|
165 |
if len(page_text) > MAX_CHAR_SIZE:
|
166 |
print(f"\t\t\tβββ More than {MAX_CHAR_SIZE} characters")
|
167 |
continue
|
168 |
+
print(f"\t\t\tβββ Title: {content.title}")
|
169 |
+
paraphrase, aligned_sentence = check_paraphrase(
|
170 |
+
text,
|
171 |
+
page_text,
|
172 |
+
url,
|
173 |
+
)
|
174 |
+
|
175 |
# add one more key "group" into aligned_sentence
|
176 |
+
sentences_df.loc[text_index, "input_sentence"] = (
|
177 |
+
aligned_sentence["input_sentence"]
|
178 |
+
)
|
179 |
+
sentences_df.loc[text_index, "matched_sentence"] = (
|
180 |
+
aligned_sentence["matched_sentence"]
|
181 |
+
)
|
182 |
+
sentences_df.loc[text_index, "label"] = aligned_sentence[
|
183 |
+
"label"
|
184 |
+
]
|
185 |
+
sentences_df.loc[text_index, "similarity"] = aligned_sentence[
|
186 |
+
"similarity"
|
187 |
+
]
|
188 |
+
sentences_df.loc[text_index, "url"] = aligned_sentence["url"]
|
189 |
+
|
190 |
if aligned_sentence["paraphrase"] is False:
|
191 |
return paraphrase, sentences_df
|
192 |
+
|
193 |
+
for text_index, _ in enumerate(sentences_df):
|
194 |
+
if sentences_df[text_index]["url"] is not None:
|
195 |
continue
|
196 |
+
|
197 |
# find content in new url
|
198 |
+
_, aligned_sentence = check_paraphrase(
|
199 |
+
text[text_index],
|
200 |
+
page_text,
|
201 |
+
url,
|
202 |
+
)
|
203 |
+
|
204 |
if aligned_sentence["url"] is not None:
|
205 |
continue
|
206 |
+
|
207 |
+
sentences_df.loc[text_index, "input_sentence"] = (
|
208 |
+
aligned_sentence["input_sentence"]
|
209 |
+
)
|
210 |
+
sentences_df.loc[text_index, "matched_sentence"] = (
|
211 |
+
aligned_sentence["matched_sentence"]
|
212 |
+
)
|
213 |
+
sentences_df.loc[text_index, "label"] = aligned_sentence[
|
214 |
+
"label"
|
215 |
+
]
|
216 |
+
sentences_df.loc[text_index, "similarity"] = (
|
217 |
+
aligned_sentence["similarity"]
|
218 |
+
)
|
219 |
+
sentences_df.loc[text_index, "url"] = aligned_sentence[
|
220 |
+
"url"
|
221 |
+
]
|
222 |
+
|
223 |
return sentences_df, content.images
|
224 |
+
|
225 |
return sentence, []
|
226 |
|
227 |
+
|
228 |
def longest_common_subsequence(arr1, arr2):
|
229 |
"""
|
230 |
Finds the length of the longest common subsequence (contiguous) between
|
|
|
235 |
arr2: The second array.
|
236 |
|
237 |
Returns:
|
238 |
+
The length of the longest common subsequence.
|
239 |
Returns 0 if either input is invalid.
|
240 |
"""
|
241 |
|
|
|
245 |
n = len(arr1)
|
246 |
m = len(arr2)
|
247 |
|
248 |
+
if n == 0 or m == 0: # handle empty list
|
249 |
return 0
|
250 |
|
251 |
# Create table dp with size (n+1) x (m+1)
|
|
|
263 |
return max_length
|
264 |
|
265 |
|
266 |
+
def check_sentence(
|
267 |
+
input_sentence,
|
268 |
+
source_sentence,
|
269 |
+
min_same_sentence_len,
|
270 |
+
min_phrase_sentence_len,
|
271 |
+
verbose=False,
|
272 |
+
):
|
273 |
"""
|
274 |
+
Checks if two sentences are similar based on exact match or
|
275 |
longest common subsequence.
|
276 |
|
277 |
Args:
|
|
|
286 |
Returns False if input is not valid.
|
287 |
"""
|
288 |
|
289 |
+
if not isinstance(input_sentence, str) or not isinstance(
|
290 |
+
source_sentence,
|
291 |
+
str,
|
292 |
+
):
|
293 |
return False
|
294 |
|
295 |
input_sentence = input_sentence.strip()
|
|
|
301 |
input_words = input_sentence.split() # split without arguments
|
302 |
source_words = source_sentence.split() # split without arguments
|
303 |
|
304 |
+
if (
|
305 |
+
input_sentence == source_sentence
|
306 |
+
and len(input_words) >= min_same_sentence_len
|
307 |
+
):
|
308 |
if verbose:
|
309 |
print("Exact match found.")
|
310 |
return True
|
|
|
325 |
Args:
|
326 |
input_text: The text to check for paraphrase.
|
327 |
page_text: The text of the web page to compare with.
|
328 |
+
url
|
329 |
|
330 |
Returns:
|
331 |
A tuple containing:
|
332 |
+
|
|
|
|
|
|
|
|
|
|
|
333 |
"""
|
334 |
is_paraphrase_text = False
|
335 |
+
|
336 |
if not isinstance(input_text, str) or not isinstance(page_text, str):
|
337 |
return False, []
|
338 |
|
339 |
# Extract sentences from input text and web page
|
340 |
# input_sentences = split_into_paragraphs(input_text)
|
341 |
input_sentences = [input_text]
|
342 |
+
|
343 |
if not page_text:
|
344 |
return is_paraphrase_text, []
|
345 |
+
|
346 |
page_sentences = split_into_paragraphs(page_text)
|
347 |
if not input_sentences or not page_sentences:
|
348 |
return is_paraphrase_text, []
|
|
|
352 |
if ", external" in sentence:
|
353 |
additional_sentences.append(sentence.replace(", external", ""))
|
354 |
page_sentences.extend(additional_sentences)
|
355 |
+
|
356 |
# Encode sentences into embeddings
|
357 |
+
embeddings1 = PARAPHASE_MODEL.encode(
|
358 |
+
input_sentences,
|
359 |
+
convert_to_tensor=True,
|
360 |
+
device=DEVICE,
|
361 |
+
)
|
362 |
+
embeddings2 = PARAPHASE_MODEL.encode(
|
363 |
+
page_sentences,
|
364 |
+
convert_to_tensor=True,
|
365 |
+
device=DEVICE,
|
366 |
+
)
|
367 |
|
368 |
# Compute cosine similarity matrix
|
369 |
similarity_matrix = util.cos_sim(embeddings1, embeddings2).cpu().numpy()
|
|
|
375 |
for i, sentence1 in enumerate(input_sentences):
|
376 |
max_sim_index = np.argmax(similarity_matrix[i])
|
377 |
max_similarity = similarity_matrix[i][max_sim_index]
|
378 |
+
|
379 |
best_matched_sentence = page_sentences[max_sim_index]
|
380 |
is_paraphrase_sentence = max_similarity > PARAPHRASE_THRESHOLD
|
381 |
|
|
|
398 |
"url": url,
|
399 |
}
|
400 |
|
401 |
+
# Check for individual sentence paraphrase
|
402 |
+
# if overall paraphrase not yet found
|
403 |
if not is_paraphrase_text and check_sentence(
|
404 |
+
sentence1,
|
405 |
+
page_sentences[max_sim_index],
|
406 |
+
MIN_SAME_SENTENCE_LEN,
|
407 |
+
MIN_PHRASE_SENTENCE_LEN,
|
408 |
):
|
409 |
is_paraphrase_text = True
|
410 |
|
411 |
+
# alignment.append(item)
|
412 |
paraphrased_sentence_count += 1 if is_paraphrase_sentence else 0
|
413 |
|
414 |
# Check if enough sentences are paraphrases
|
415 |
+
|
416 |
+
is_paraphrase_text = (
|
417 |
+
paraphrased_sentence_count > 0
|
418 |
+
) # min_matching_sentences
|
419 |
+
|
420 |
# Method 2: Check if overlapped words between sentences are more than 50%
|
421 |
+
equal_idx_1, _ = extract_equal_text(
|
422 |
+
input_sentences[0],
|
423 |
+
best_matched_sentence,
|
424 |
+
)
|
425 |
matched_count = 0
|
426 |
for index in equal_idx_1:
|
427 |
matched_count += index["end"] - index["start"]
|
428 |
+
sent = input_sentences[0].translate(
|
429 |
+
str.maketrans("", "", string.punctuation),
|
430 |
+
)
|
431 |
num_words = len(sent.split())
|
432 |
if matched_count > num_words / 2:
|
433 |
is_paraphrase_text = True
|
434 |
+
|
435 |
return is_paraphrase_text, alignment
|
436 |
|
437 |
|
|
|
447 |
A float representing the similarity ratio between 0.0 and 1.0.
|
448 |
Returns 0.0 if either input is None or not a string.
|
449 |
"""
|
450 |
+
if (
|
451 |
+
not isinstance(a, str)
|
452 |
+
or not isinstance(b, str)
|
453 |
+
or a is None
|
454 |
+
or b is None
|
455 |
+
):
|
456 |
return 0.0 # Handle cases where inputs are not strings or None
|
457 |
return SequenceMatcher(None, a, b).ratio()
|
458 |
|
459 |
+
|
460 |
def check_human(alligned_sentences):
|
461 |
"""
|
462 |
Checks if a sufficient number of input sentences are found within
|
|
|
473 |
return False
|
474 |
|
475 |
|
476 |
+
if __name__ == "__main__":
    # This module is imported as a library; it has no standalone behavior.
    pass
|
src/application/url_reader.py
CHANGED
@@ -1,31 +1,40 @@
|
|
1 |
import string
|
2 |
-
|
3 |
-
from newspaper import article, ArticleException, ArticleBinaryDataException
|
4 |
import requests
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
|
6 |
# TODO: move this to a config file
|
7 |
-
MAX_URL_SIZE = 2000000
|
8 |
-
|
9 |
-
|
|
|
|
|
10 |
self.url = url
|
11 |
self.text = None # string
|
12 |
self.title = None # string
|
13 |
self.images = None # list of Image objects
|
14 |
self.top_image = None # Image object
|
15 |
self.is_extracted = False
|
16 |
-
|
17 |
url_size = self.get_size()
|
18 |
-
if url_size
|
19 |
return
|
20 |
-
else:
|
21 |
self.is_extracted = True
|
22 |
-
|
23 |
-
self.newspaper =
|
|
|
|
|
24 |
if self.newspaper is True:
|
25 |
self.extract_content_newspaper()
|
26 |
else:
|
27 |
self.extract_content_bs()
|
28 |
-
|
29 |
def extract_content_newspaper(self):
|
30 |
"""
|
31 |
Use newspaper4k to extracts content from a URL
|
@@ -36,20 +45,20 @@ class URLReader():
|
|
36 |
Returns:
|
37 |
The extracted content (title, text, images)
|
38 |
"""
|
39 |
-
|
40 |
try:
|
41 |
response = requests.get(self.url)
|
42 |
-
response.raise_for_status()
|
43 |
except requests.exceptions.RequestException as e:
|
44 |
print(f"Error fetching URL: {e}")
|
45 |
return None
|
46 |
-
|
47 |
try:
|
48 |
news = article(url=self.url, fetch_images=True)
|
49 |
except (ArticleException, ArticleBinaryDataException) as e:
|
50 |
print(f"\t\tβββ Error downloading article: {e}")
|
51 |
return None
|
52 |
-
|
53 |
self.title = news.title
|
54 |
self.text = news.text
|
55 |
self.images = list(set(news.images)) # Remove duplicates
|
@@ -61,30 +70,30 @@ class URLReader():
|
|
61 |
"""
|
62 |
response = requests.get(self.url)
|
63 |
response.raise_for_status()
|
64 |
-
|
65 |
response.encoding = response.apparent_encoding
|
66 |
-
|
67 |
try:
|
68 |
soup = BeautifulSoup(response.content, "html.parser")
|
69 |
-
except:
|
70 |
-
print(f"Error parsing HTML content from {self.url}")
|
71 |
return None
|
72 |
-
|
73 |
self.title = soup.title.string.strip() if soup.title else None
|
74 |
-
|
75 |
-
image_urls = [img[
|
76 |
self.images = image_urls
|
77 |
self.top_image = self.images[0]
|
78 |
-
|
79 |
# Exclude text within specific elements
|
80 |
for element in soup(["img", "figcaption", "table", "script", "style"]):
|
81 |
element.extract()
|
82 |
-
#text = soup.get_text(separator="\n")
|
83 |
-
paragraphs = soup.find_all(
|
84 |
-
text =
|
85 |
|
86 |
self.text = text
|
87 |
-
|
88 |
def get_size(self):
|
89 |
"""
|
90 |
Retrieves the size of a URL's content using a HEAD request.
|
@@ -93,27 +102,32 @@ class URLReader():
|
|
93 |
url: The URL to check.
|
94 |
|
95 |
Returns:
|
96 |
-
The size of the content in bytes,
|
|
|
97 |
(e.g., due to network errors or missing Content-Length header).
|
98 |
"""
|
99 |
try:
|
100 |
-
response = requests.head(
|
101 |
-
|
|
|
|
|
|
|
|
|
102 |
|
103 |
-
content_length = response.headers.get(
|
104 |
if content_length is not None:
|
105 |
return int(content_length)
|
106 |
else:
|
107 |
-
print(
|
108 |
return None
|
109 |
|
110 |
except requests.exceptions.RequestException as e:
|
111 |
print(f"\t\tβββ Error getting URL size: {e}")
|
112 |
return None
|
113 |
-
|
114 |
|
115 |
-
|
|
|
116 |
url = "https://www.bbc.com/sport/football/articles/c2d3rdy3673o"
|
117 |
reader = URLReader(url)
|
118 |
print(f"Title: {reader.title}")
|
119 |
-
print(f"Text: {reader.text}")
|
|
|
1 |
import string
|
2 |
+
|
|
|
3 |
import requests
|
4 |
+
from bs4 import BeautifulSoup
|
5 |
+
from newspaper import (
|
6 |
+
ArticleBinaryDataException,
|
7 |
+
ArticleException,
|
8 |
+
article,
|
9 |
+
)
|
10 |
|
11 |
# TODO: move this to a config file
|
12 |
+
MAX_URL_SIZE = 2000000 # ~2MB
|
13 |
+
|
14 |
+
|
15 |
+
class URLReader:
|
16 |
+
    def __init__(self, url: str, newspaper: bool = True):
        """Fetch a web page and extract its title, text, and images.

        Args:
            url: Address of the page to read.
            newspaper: When True, extract with newspaper4k; when False,
                fall back to the BeautifulSoup-based extractor.
        """
        self.url = url
        self.text = None  # string
        self.title = None  # string
        self.images = None  # list of Image objects
        self.top_image = None  # Image object
        self.is_extracted = False

        # Skip extraction entirely (is_extracted stays False) when the
        # content size is unknown or exceeds MAX_URL_SIZE.
        url_size = self.get_size()
        if url_size is None or url_size > MAX_URL_SIZE:
            return
        else:
            self.is_extracted = True

        self.newspaper = (
            newspaper  # True if using newspaper4k, False if using BS
        )
        if self.newspaper is True:
            self.extract_content_newspaper()
        else:
            self.extract_content_bs()
|
37 |
+
|
38 |
def extract_content_newspaper(self):
|
39 |
"""
|
40 |
Use newspaper4k to extracts content from a URL
|
|
|
45 |
Returns:
|
46 |
The extracted content (title, text, images)
|
47 |
"""
|
48 |
+
|
49 |
try:
|
50 |
response = requests.get(self.url)
|
51 |
+
response.raise_for_status()
|
52 |
except requests.exceptions.RequestException as e:
|
53 |
print(f"Error fetching URL: {e}")
|
54 |
return None
|
55 |
+
|
56 |
try:
|
57 |
news = article(url=self.url, fetch_images=True)
|
58 |
except (ArticleException, ArticleBinaryDataException) as e:
|
59 |
print(f"\t\tβββ Error downloading article: {e}")
|
60 |
return None
|
61 |
+
|
62 |
self.title = news.title
|
63 |
self.text = news.text
|
64 |
self.images = list(set(news.images)) # Remove duplicates
|
|
|
70 |
"""
|
71 |
response = requests.get(self.url)
|
72 |
response.raise_for_status()
|
73 |
+
|
74 |
response.encoding = response.apparent_encoding
|
75 |
+
|
76 |
try:
|
77 |
soup = BeautifulSoup(response.content, "html.parser")
|
78 |
+
except Exception as e:
|
79 |
+
print(f"Error parsing HTML content from {self.url}: {e}")
|
80 |
return None
|
81 |
+
|
82 |
self.title = soup.title.string.strip() if soup.title else None
|
83 |
+
|
84 |
+
image_urls = [img["src"] for img in soup.find_all("img")]
|
85 |
self.images = image_urls
|
86 |
self.top_image = self.images[0]
|
87 |
+
|
88 |
# Exclude text within specific elements
|
89 |
for element in soup(["img", "figcaption", "table", "script", "style"]):
|
90 |
element.extract()
|
91 |
+
# text = soup.get_text(separator="\n")
|
92 |
+
paragraphs = soup.find_all("p")
|
93 |
+
text = " ".join([p.get_text() for p in paragraphs])
|
94 |
|
95 |
self.text = text
|
96 |
+
|
97 |
def get_size(self):
|
98 |
"""
|
99 |
Retrieves the size of a URL's content using a HEAD request.
|
|
|
102 |
url: The URL to check.
|
103 |
|
104 |
Returns:
|
105 |
+
The size of the content in bytes,
|
106 |
+
or None if the size cannot be determined
|
107 |
(e.g., due to network errors or missing Content-Length header).
|
108 |
"""
|
109 |
try:
|
110 |
+
response = requests.head(
|
111 |
+
self.url,
|
112 |
+
allow_redirects=True,
|
113 |
+
timeout=5,
|
114 |
+
) # Add timeout
|
115 |
+
response.raise_for_status() # Raise HTTPError for bad responses
|
116 |
|
117 |
+
content_length = response.headers.get("Content-Length")
|
118 |
if content_length is not None:
|
119 |
return int(content_length)
|
120 |
else:
|
121 |
+
print("\t\tβββ Content-Length header not found")
|
122 |
return None
|
123 |
|
124 |
except requests.exceptions.RequestException as e:
|
125 |
print(f"\t\tβββ Error getting URL size: {e}")
|
126 |
return None
|
|
|
127 |
|
128 |
+
|
129 |
+
if __name__ == "__main__":
    # Manual smoke test: fetches a live BBC article (network access
    # required) and prints the extracted title and text.
    url = "https://www.bbc.com/sport/football/articles/c2d3rdy3673o"
    reader = URLReader(url)
    print(f"Title: {reader.title}")
    print(f"Text: {reader.text}")
|
test.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
import re
|
2 |
|
|
|
3 |
def find_entity_spans(entity, text):
|
4 |
"""
|
5 |
Finds the start and end indices of whole word entities in text.
|
@@ -13,10 +14,14 @@ def find_entity_spans(entity, text):
|
|
13 |
of a found entity. Returns an empty list if no entities are found.
|
14 |
"""
|
15 |
spans = []
|
16 |
-
for m in re.finditer(
|
|
|
|
|
|
|
17 |
spans.append((m.start(), m.end()))
|
18 |
return spans
|
19 |
|
|
|
20 |
# Example usage:
|
21 |
temp_text = "win winger winning"
|
22 |
entity = {"key": "win"} # Example dictionary (adjust as needed)
|
@@ -27,24 +32,24 @@ print(spans) # Output: [(0, 3)] (Only "win" at the beginning)
|
|
27 |
temp_text = "The quick brown fox jumps over the lazy dog."
|
28 |
entity = {"key": "fox"}
|
29 |
spans = find_entity_spans(entity["key"], temp_text)
|
30 |
-
print(spans)
|
31 |
|
32 |
temp_text = "foxes fox foxing"
|
33 |
entity = {"key": "fox"}
|
34 |
spans = find_entity_spans(entity["key"], temp_text)
|
35 |
-
print(spans)
|
36 |
|
37 |
temp_text = "winger win winning"
|
38 |
entity = {"key": "win"}
|
39 |
spans = find_entity_spans(entity["key"], temp_text)
|
40 |
-
print(spans)
|
41 |
|
42 |
temp_text = "winger win winning"
|
43 |
entity = {"key": "winger"}
|
44 |
spans = find_entity_spans(entity["key"], temp_text)
|
45 |
-
print(spans)
|
46 |
|
47 |
temp_text = "winger win winning"
|
48 |
entity = {"key": "winning"}
|
49 |
spans = find_entity_spans(entity["key"], temp_text)
|
50 |
-
print(spans)
|
|
|
1 |
import re
|
2 |
|
3 |
+
|
4 |
def find_entity_spans(entity, text):
    """
    Locate every whole-word occurrence of ``entity`` inside ``text``.

    The entity string is regex-escaped and wrapped in word-boundary
    anchors, so partial matches inside longer words (e.g. "win" inside
    "winger") are not reported.

    Args:
        entity: The literal string to search for.
        text: The text to scan.

    Returns:
        A list of (start, end) index tuples, one per whole-word match.
        Returns an empty list if no entities are found.
    """
    pattern = r"\b" + re.escape(entity) + r"\b"
    return [(match.start(), match.end()) for match in re.finditer(pattern, text)]
|
23 |
|
24 |
+
|
25 |
# Example usage:
|
26 |
temp_text = "win winger winning"
|
27 |
entity = {"key": "win"} # Example dictionary (adjust as needed)
|
|
|
32 |
temp_text = "The quick brown fox jumps over the lazy dog."
|
33 |
entity = {"key": "fox"}
|
34 |
spans = find_entity_spans(entity["key"], temp_text)
|
35 |
+
print(spans) # Output: [(16, 19)]
|
36 |
|
37 |
temp_text = "foxes fox foxing"
|
38 |
entity = {"key": "fox"}
|
39 |
spans = find_entity_spans(entity["key"], temp_text)
|
40 |
+
print(spans) # Output: [(0, 3), (6, 9)]
|
41 |
|
42 |
temp_text = "winger win winning"
|
43 |
entity = {"key": "win"}
|
44 |
spans = find_entity_spans(entity["key"], temp_text)
|
45 |
+
print(spans) # Output: [(8, 11)]
|
46 |
|
47 |
temp_text = "winger win winning"
|
48 |
entity = {"key": "winger"}
|
49 |
spans = find_entity_spans(entity["key"], temp_text)
|
50 |
+
print(spans) # Output: [(0, 6)]
|
51 |
|
52 |
temp_text = "winger win winning"
|
53 |
entity = {"key": "winning"}
|
54 |
spans = find_entity_spans(entity["key"], temp_text)
|
55 |
+
print(spans) # Output: [(12, 19)]
|