Commit 504f37b
Parent(s): 8617832
solve bugs, update combination score and label, add method for better searching.
Files changed:
- application.py +62 -22
- application_2.py +5 -4
- examples/example_image_real_4.jpg.webp +0 -0
- examples/example_text_LLM_entities.txt +1 -1
- gpt_test.py +39 -0
- src/application/content_detection.py +128 -56
- src/application/image/image_detection.py +4 -2
- src/application/text/entity.py +8 -2
- src/application/text/search.py +19 -3
- src/application/text/search_detection.py +70 -0
- test.py +50 -27
application.py
CHANGED
@@ -8,10 +8,6 @@ from src.application.content_detection import NewsVerification
 from src.application.url_reader import URLReader
 from src.application.content_generation import generate_fake_image, generate_fake_text, replace_text
 
-
-GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
-SEARCH_ENGINE_ID = os.getenv('SEARCH_ENGINE_ID')
-
 AZURE_TEXT_MODEL = ["gpt-4o-mini", "gpt-4o"]
 AZURE_IMAGE_MODEL = ["dall-e-3", "Stable Diffusion (not supported)"]
 
@@ -43,14 +39,6 @@ def load_url(url):
 
 
 def generate_analysis_report(news_title:str, news_content: str, news_image: Image):
-    if news_image is not None:
-        # Convert to PIL Image for easier saving
-        img = Image.open(news_image)
-
-        # Save the image (you can customize the filename)
-        filepath = "example_image_input.jpg"  # Or use a dynamic filename
-        img.save(filepath)
-
     news_analysis = NewsVerification()
     news_analysis.load_news(news_title, news_content, news_image)
     news_analysis.generate_analysis_report()
@@ -60,7 +48,7 @@ def generate_analysis_report(news_title:str, news_content: str, news_image: Image):
 # Define the GUI
 with gr.Blocks() as demo:
     gr.Markdown("# NEWS VERIFICATION")
-
+
     with gr.Row():
         # SETTINGS
         with gr.Column(scale=1):
@@ -93,13 +81,59 @@ with gr.Blocks() as demo:
         with gr.Accordion("Input News"):
             news_title = gr.Textbox(label="Title", value="")
             news_image = gr.Image(label="Image", type="filepath")
-            news_content = gr.Textbox(label="Content", value="", lines=
+            news_content = gr.Textbox(label="Content", value="", lines=13)
 
         # NEWS ANALYSIS REPORT
+        ordinary_user_explanation = """
+        FOR ORDINARY USER<br>
+        - Green texts are the matched words in the input and source news.<br>
+        - Each highlighted pair (marked with a number) shows the key differences between the input text and the source.
+        """
+        fact_checker_explanation = """
+        FOR FACT CHECKER<br>
+        - Green texts are the matched words in the input and source news.<br>
+        - Each highlighted pair (marked with a number) shows the key differences between the input text and the source.
+        """
+        governor_explanation = """
+        FOR GOVERNOR<br>
+        - Green texts are the matched words in the input and source news.<br>
+        - Each highlighted pair (marked with a number) shows the key differences between the input text and the source.
+        """
+        table = """
+        <h5>Comparison between input news and source news:</h5>
+        <table border="1" style="width:100%; text-align:left; border-collapse:collapse;">
+        <col style="width: 170px;"> <col style="width: 170px;"> <col style="width: 30px;"> <col style="width: 75px;">
+        <thead>
+          <tr>
+            <th>Input news</th>
+            <th>Source (URL provided in Originality column correspondingly)</th>
+            <th>Forensic</th>
+            <th>Originality</th>
+          </tr>
+        </thead>
+        <tbody>
+          <tr>
+            <th>TBD</th>
+            <th>TBD</th>
+            <th>TBD</th>
+            <th>TBD</th>
+          </tr>
+        </tbody>
+        </table>
+
+        <style>"""
         with gr.Column(scale=2):
-            with gr.Accordion("
-
-
+            with gr.Accordion("NEWS ANALYSIS"):
+                verification_button = gr.Button("Verify news")
+                with gr.Tab("Ordinary User"):
+                    gr.HTML(ordinary_user_explanation)
+                    ordinary_user_result = gr.HTML(table)
+                with gr.Tab("Fact Checker"):
+                    gr.HTML(fact_checker_explanation)
+                    fact_checker_result = gr.HTML(table)
+                with gr.Tab("Governor"):
+                    gr.HTML(governor_explanation)
+                    governor_result = gr.HTML(table)
 
     # Connect events
     load_button.click(
@@ -116,9 +150,9 @@ with gr.Blocks() as demo:
     generate_image_button.click(generate_fake_image,
                                 inputs=[image_generation_model, news_title],
                                 outputs=[news_image])
-
+    verification_button.click(generate_analysis_report,
                               inputs=[news_title, news_content, news_image],
-                              outputs=[
+                              outputs=[ordinary_user_result, fact_checker_result, governor_result])
 
     # change Image
     #url_input.change(load_image, inputs=url_input, outputs=image_view)
@@ -132,23 +166,28 @@ with gr.Blocks() as demo:
         text_llm_topic = file.read()
     with open('examples/example_text_LLM_modification.txt','r', encoding='utf-8') as file:
         text_llm_modification = file.read()
+    with open('examples/example_text_LLM_entities.txt','r', encoding='utf-8') as file:
+        text_llm_entities = file.read()
 except FileNotFoundError:
     print("File not found.")
 except Exception as e:
     print(f"An error occurred: {e}")
 
-title_1 = "Southampton news: Leeds target striker Cameron Archer"
-title_2 = "Southampton news: Leeds target striker Cameron Archer"
+title_1 = "Southampton news: Leeds target striker Cameron Archer."
+title_2 = "Southampton news: Leeds target striker Cameron Archer."
+title_4 = "Japan pledges support for Ukraine with 100-year pact."
 
 image_1 = "examples/example_image_real_1.jpg.webp"
 image_2 = "examples/example_image_real_2.jpg.webp"
 image_3 = "examples/example_image_real_3.jpg"
+image_4 = "examples/example_image_real_4.jpg.webp"
 
 gr.Examples(
     examples=[
         [title_1, image_1, text_real_1 + '\n\n' + text_real_2],
         [title_1, image_2, text_real_1 + '\n\n' + text_llm_modification],
         [title_1, image_3, text_real_1 + '\n\n' + text_llm_topic],
+        [title_4, image_4, text_llm_entities],
    ],
    inputs=[news_title, news_image, news_content],
    label="Examples",
@@ -156,7 +195,8 @@ with gr.Blocks() as demo:
         "2 real news",
         "1 real news + 1 LLM modification-based news",
         "1 real news + 1 LLM topic-based news",
+        "1 LLM changed-entities news",
     ],
 )
 
-demo.launch(share=
+demo.launch(share=True)
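Note: the new `verification_button.click(...)` wiring assumes `generate_analysis_report` returns three HTML strings, one per tab, in the order `[ordinary_user_result, fact_checker_result, governor_result]`. A minimal sketch of that contract (the return statement is an assumption; the diff does not show it):

def generate_analysis_report(news_title: str, news_content: str, news_image: Image):
    news_analysis = NewsVerification()
    news_analysis.load_news(news_title, news_content, news_image)
    news_analysis.generate_analysis_report()
    # assumed: analyze_details() renders and returns the three per-audience tables
    return news_analysis.analyze_details()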
application_2.py
CHANGED
@@ -100,7 +100,7 @@ with gr.Blocks() as demo:
     - Each highlighted pair (marked with a number) shows the key differences between the input text and the source.
     """
     table = """
-    <h5>Comparison between input news and source news
+    <h5>Comparison between input news and source news:</h5>
     <table border="1" style="width:100%; text-align:left; border-collapse:collapse;">
     <col style="width: 170px;"> <col style="width: 170px;"> <col style="width: 30px;"> <col style="width: 75px;">
     <thead>
@@ -132,7 +132,7 @@ with gr.Blocks() as demo:
                 gr.HTML(fact_checker_explanation)
                 fact_checker_result = gr.HTML(table)
             with gr.Tab("Governor"):
-                gr.HTML(
+                gr.HTML(governor_explanation)
                 governor_result = gr.HTML(table)
 
     # Connect events
@@ -180,13 +180,14 @@ with gr.Blocks() as demo:
 image_1 = "examples/example_image_real_1.jpg.webp"
 image_2 = "examples/example_image_real_2.jpg.webp"
 image_3 = "examples/example_image_real_3.jpg"
+image_4 = "examples/example_image_real_4.jpg.webp"
 
 gr.Examples(
     examples=[
         [title_1, image_1, text_real_1 + '\n\n' + text_real_2],
         [title_1, image_2, text_real_1 + '\n\n' + text_llm_modification],
         [title_1, image_3, text_real_1 + '\n\n' + text_llm_topic],
-        [title_4,
+        [title_4, image_4, text_llm_entities],
     ],
     inputs=[news_title, news_image, news_content],
     label="Examples",
@@ -198,4 +199,4 @@ with gr.Blocks() as demo:
     ],
 )
 
-demo.launch(share=
+demo.launch(share=True)
examples/example_image_real_4.jpg.webp
ADDED
examples/example_text_LLM_entities.txt
CHANGED
@@ -1 +1 @@
-
+Shigeru Ishiba has pledged to put Ukraine in the "strongest possible position" on a trip to Kyiv where he signed a "landmark" 100-year pact with the war-stricken country. The prime minister's visit on Sunday was at one point marked by loud blasts and air raid sirens after a reported Russian drone attack was intercepted by Ukraine's defence systems. Acknowledging the "hello" from Russia, Volodymyr Zelensky said Ukraine would send its own "hello back". An estimated two millions people have been killed or wounded in the war so far. As the invasion reaches the end of its third year, Ukraine is losing territory in the north. Zelensky praised the Japan's commitment on Sunday, amid wider concerns that the US President-elect Donald Trump, who is set to take office on Monday, could potentially reduce aid.
gpt_test.py
ADDED
@@ -0,0 +1,39 @@
+import os
+from dotenv import load_dotenv
+from openai import AzureOpenAI
+load_dotenv()
+AZURE_OPENAI_API_KEY = os.getenv('AZURE_OPENAI_API_KEY')
+AZURE_OPENAI_ENDPOINT = os.getenv('AZURE_OPENAI_ENDPOINT')
+AZURE_OPENAI_API_VERSION = os.getenv('AZURE_OPENAI_API_VERSION')
+
+azure_client = AzureOpenAI(
+    azure_endpoint="https://quoc-nguyen.openai.azure.com/",
+    api_key=AZURE_OPENAI_API_KEY,
+    api_version="2024-05-01-preview",
+)
+
+deployment_name = "o1-mini"  # or "gpt-4o"
+TEXT_PROMPT = """
+replace Ukraine with Denmark:
+
+"Sir Keir Starmer has pledged to put Ukraine in the "strongest possible position" on a trip to Kyiv where he signed a "landmark" 100-year pact with the war-stricken country.
+
+The prime minister's visit on Thursday was at one point marked by loud blasts and air raid sirens after a reported Russian drone attack was intercepted by Ukraine's defence systems.
+
+Acknowledging the "hello" from Russia, Volodymyr Zelensky said Ukraine would send its own "hello back".
+
+An estimated one million people have been killed or wounded in the war so far. As the invasion reaches the end of its third year, Ukraine is losing territory in the east.
+
+Zelensky praised the UK's commitment on Thursday, amid wider concerns that the US President-elect Donald Trump, who is set to take office on Monday, could potentially reduce aid."
+"""
+
+response = azure_client.chat.completions.create(
+    model=deployment_name,  # model = "deployment_name".
+    messages=[
+        # {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": TEXT_PROMPT},
+    ],
+    # max_tokens=512,
+    # temperature=0,
+)
+print(response.choices[0].message.content)
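Note: gpt_test.py hard-codes a single entity-replacement prompt. For repeated experiments, the same structure can be parameterized; a small hypothetical helper (not part of this commit):

def build_replacement_prompt(source_text: str, old_entity: str, new_entity: str) -> str:
    # Mirrors the structure of TEXT_PROMPT above: an instruction line,
    # a blank line, then the quoted article text.
    return f'replace {old_entity} with {new_entity}:\n\n"{source_text}"'

prompt = build_replacement_prompt("Ukraine signed a pact.", "Ukraine", "Denmark")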
src/application/content_detection.py
CHANGED
@@ -1,10 +1,12 @@
 from difflib import SequenceMatcher
+
+import pandas as pd
 from src.application.image.image_detection import detect_image_by_ai_model, detect_image_by_reverse_search, detect_image_from_news_image
 from src.application.text.entity import apply_highlight, highlight_entities
 from src.application.text.helper import extract_equal_text
 from src.application.text.model_detection import detect_text_by_ai_model
 from src.application.text.preprocessing import split_into_paragraphs
-from src.application.text.search_detection import check_human, detect_text_by_relative_search
+from src.application.text.search_detection import check_human, detect_text_by_relative_search, find_text_source
 
 
 class NewsVerification():
@@ -25,11 +27,22 @@ class NewsVerification():
 
         self.found_img_url:list[str] = ["https://ichef.bbci.co.uk/ace/standard/819/cpsprodpb/8acc/live/86282470-defb-11ef-ba00-65100a906e68.jpg"]
         self.aligned_sentences:list[dict] = []
+        self.aligned_sentences_df:pd.DataFrame = pd.DataFrame(columns=[
+            "input_sentence",
+            "matched_sentence",
+            "label",
+            "similarity",
+            "paraphrase",
+            "url",
+            "group",
+            "entities",
+        ])
         self.is_paraphrased:list[bool] = []
 
         self.ordinary_user_table:list = []
         self.fact_checker_table:list = []
         self.governor_table:list = []
+        self.entities_with_colors = []
 
     def load_news(self, news_title, news_content, news_image):
         self.news_text = news_title + "\n\n" + news_content
@@ -70,7 +83,7 @@ class NewsVerification():
 
             if current_index >= len(input_sentences):
                 break
-            if current_index
+            if current_index > index and index != 0 and index != len(input_sentences) - 1:
                 continue
 
             paraphrase, text_url, searched_sentences, img_urls, current_index = detect_text_by_relative_search(input_sentences, index)
@@ -117,6 +130,40 @@ class NewsVerification():
 
             previous_paraphrase = paraphrase
 
+    def determine_text_origin_2(self):
+        """
+        Determines the origin of the given text based on paraphrasing detection and human authorship analysis.
+
+        Args:
+            text: The input text to be analyzed.
+
+        Returns:
+            str: The predicted origin of the text:
+                - "HUMAN": If the text is likely written by a human.
+                - "MACHINE": If the text is likely generated by a machine.
+        """
+        print("CHECK TEXT:")
+        print("\tFrom search engine:")
+        # Classify by search engine
+        input_sentences = split_into_paragraphs(self.news_text)
+        for _ in range(5):
+            self.aligned_sentences_df = pd.concat(
+                [self.aligned_sentences_df, pd.DataFrame([{}])],
+                ignore_index=False,
+            )
+
+        for index, sentence in enumerate(input_sentences):
+            print(f"-------index = {index}-------")
+            print(f"current_sentence = {input_sentences[index]}")
+
+            if self.aligned_sentences_df["url"] is not None:
+                continue
+
+            self.aligned_sentences_df, img_urls = find_text_source(
+                input_sentences[index],
+                self.aligned_sentences_df,
+            )
+
     def detect_image_origin(self):
         print("CHECK IMAGE:")
         if self.news_image is None:
@@ -183,6 +230,15 @@ class NewsVerification():
         self.detect_image_origin()
 
     def analyze_details(self):
+        entities_with_colors = []
+        for index, aligned_sentence in enumerate(self.aligned_sentences):
+            # Get entity-words (in pair) with colors
+            entities_with_colors = highlight_entities(
+                aligned_sentence["input_sentence"],
+                aligned_sentence["matched_sentence"],
+            )
+            self.aligned_sentences[index]["entities"] = entities_with_colors
+
         ordinary_user_table = self.create_ordinary_user_table()
         fact_checker_table = self.create_fact_checker_table()
         governor_table = self.create_governor_table()
@@ -253,17 +309,17 @@ class NewsVerification():
             )
 
             # Get entity-words (in pair) with colors
-            entities_with_colors = highlight_entities(
-                aligned_sentence["input_sentence"],
-                aligned_sentence["matched_sentence"],
-            )
-
+            # entities_with_colors = highlight_entities(
+            #     aligned_sentence["input_sentence"],
+            #     aligned_sentence["matched_sentence"],
+            # )
+
             self.fact_checker_table.append(
                 [
                     aligned_sentence,
                     equal_idx_1,
                     equal_idx_2,
-                    entities_with_colors,
+                    aligned_sentence["entities"],
                 ]
             )
@@ -273,7 +329,7 @@ class NewsVerification():
 
         table = "\n".join(rows)
         return f"""
-        <h5>Comparison between input news and source news
+        <h5>Comparison between input news and source news:</h5>
         <table border="1" style="width:100%; text-align:left; border-collapse:collapse;">
         <col style="width: 170px;"> <col style="width: 170px;"> <col style="width: 30px;"> <col style="width: 75px;">
         <thead>
@@ -293,12 +349,14 @@ class NewsVerification():
         """
 
     def format_text_fact_checker_row(self, row, max_length=30):
+        entity_count = 0
         if row[0]["input_sentence"] == "":
             return ""
         if row[0]["matched_sentence"] != "":  # source is not empty
             # highlight entities
             input_sentence, highlight_idx_input = apply_highlight(row[0]["input_sentence"], row[3], "input")
             source_sentence, highlight_idx_source = apply_highlight(row[0]["matched_sentence"], row[3], "source")
+            entity_count = len(row[3])
 
             # Color overlapping words
             input_sentence = self.color_text(input_sentence, row[1], highlight_idx_input)  # text, index of highlight words
@@ -317,11 +375,13 @@ class NewsVerification():
             short_url = self.shorten_url(url, max_length)
             source_text_url = f"""<a href="{url}">{short_url}</a>"""
 
+        entity_count_text = self.get_entity_count_text(entity_count)
+
         return f"""
             <tr>
                 <td>{input_sentence}</td>
                 <td>{source_sentence}</td>
-                <td>{label}<br>({score*100:.2f}%)</td>
+                <td>{label}<br>({score*100:.2f}%)<br><br>{entity_count_text}</td>
                 <td>{source_text_url}</td>
             </tr>
         """
@@ -347,7 +407,7 @@ class NewsVerification():
         table = "\n".join(rows)
 
         return f"""
-        <h5>Comparison between input news and source news
+        <h5>Comparison between input news and source news:</h5>
         <table border="1" style="width:100%; text-align:left; border-collapse:collapse;">
         <col style="width: 170px;"> <col style="width: 170px;"> <col style="width: 30px;"> <col style="width: 75px;">
         <thead>
@@ -374,28 +434,15 @@ class NewsVerification():
         for index, row in enumerate(self.aligned_sentences):
             if row["input_sentence"] == "":
                 continue
-            input_sentences += row["input_sentence"]
+            input_sentences += row["input_sentence"] + "<br><br>"
             label = self.aligned_sentences[index]["label"]
-            if label == "HUMAN":
-                score = self.aligned_sentences[index]["similarity"]
-            if label == "MACHINE":
-                score = 1 - self.aligned_sentences[index]["similarity"]
-            scores += score
 
             url = self.aligned_sentences[index]["url"] #
             short_url = self.shorten_url(url, max_length)
             source_text_urls += f"""<a href="{url}">{short_url}</a><br>"""
             sentence_count += 1
 
-        if sentence_count == 0:
-            label = "UNKNOWN"
-        else:
-            scores /= sentence_count
-            if scores > 0.5:
-                label = "HUMAN"
-            else:
-                label = "MACHINE"
-                scores = 1 - scores
+        scores, label = self.calculate_score_label()
 
         return f"""
             <tr>
@@ -408,14 +455,14 @@ class NewsVerification():
     def format_image_ordinary_user_row(self, max_length=30):
 
         if self.image_referent_url is not None or self.image_referent_url != "":
-            source_image = f"""<img src="{self.image_referent_url}" width="200" height="150">"""
+            # source_image = f"""<img src="{self.image_referent_url}" width="200" height="150">"""
             short_url = self.shorten_url(self.image_referent_url, max_length)
            source_image_url = f"""<a href="{self.image_referent_url}">{short_url}</a>"""
         else:
-            source_image = "Image not found"
+            # source_image = "Image not found"
             source_image_url = ""
 
-        return f"""<tr><td>input image</td><td>{
+        return f"""<tr><td>input image</td><td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td><td>{source_image_url}</td></tr>"""
 
 
     def create_governor_table(self):
@@ -434,17 +481,17 @@ class NewsVerification():
             )
 
             # Get entity-words (in pair) with colors
-            entities_with_colors = highlight_entities(
-                aligned_sentence["input_sentence"],
-                aligned_sentence["matched_sentence"],
-            )
+            # entities_with_colors = highlight_entities(
+            #     aligned_sentence["input_sentence"],
+            #     aligned_sentence["matched_sentence"],
+            # )
 
             self.governor_table.append(
                 [
                     aligned_sentence,
                     equal_idx_1,
                     equal_idx_2,
-                    entities_with_colors,
+                    aligned_sentence["entities"],
                 ]
             )
@@ -453,7 +500,7 @@ class NewsVerification():
 
         table = "\n".join(rows)
         return f"""
-        <h5>Comparison between input news and source news
+        <h5>Comparison between input news and source news:</h5>
         <table border="1" style="width:100%; text-align:left; border-collapse:collapse;">
         <col style="width: 170px;"> <col style="width: 170px;"> <col style="width: 30px;"> <col style="width: 75px;">
         <thead>
@@ -502,36 +549,24 @@ class NewsVerification():
             input_sentence = row[0]["input_sentence"]
             source_sentence = row[0]["matched_sentence"]
 
-            input_sentences += input_sentence
-            source_sentences += source_sentence
-
-            label = row[0]["label"]
-            if label == "HUMAN":
-                score = row[0]["similarity"]
-            if label == "MACHINE":
-                score = 1 - row[0]["similarity"]
-            scores += score
+            # convert score to HUMAN-based score:
+            input_sentences += input_sentence + "<br><br>"
+            source_sentences += source_sentence + "<br><br>"
+
 
             url = row[0]["url"]
             short_url = self.shorten_url(url, max_length)
             source_text_urls += f"""<a href="{url}">{short_url}</a><br>"""
             sentence_count += 1
 
-        if sentence_count == 0:
-            label = "UNKNOWN"
-        else:
-            scores /= sentence_count
-            if scores > 0.5:
-                label = "HUMAN"
-            else:
-                label = "MACHINE"
-                scores = 1 - scores
-
+        score, label = self.calculate_score_label()
+        entity_count_text = self.get_entity_count_text(entity_count)
+
         return f"""
             <tr>
                 <td>{input_sentences}</td>
                 <td>{source_sentences}</td>
-                <td>{label}<br>({score*100:.2f}%)</td>
+                <td>{label}<br>({score*100:.2f}%)<br><br>{entity_count_text}</td>
                 <td>{source_text_urls}</td>
             </tr>
         """
@@ -548,6 +583,15 @@ class NewsVerification():
         return f"""<tr><td>input image</td><td>{source_image}</td><td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td><td>{source_image_url}</td></tr>"""
 
 
+    def get_entity_count_text(self, entity_count):
+        if entity_count <= 0:
+            entity_count_text = ""
+        elif entity_count == 1:
+            entity_count_text = "with altered entity"
+        else:
+            entity_count_text = "with altered entities"
+        return entity_count_text
+
     def shorten_url(self, url, max_length=30):
         if url is None:
             return ""
@@ -668,4 +712,32 @@ class NewsVerification():
             starts.append(start)
             ends.append(end + 1)
 
-        return starts, ends
+        return starts, ends
+
+    def calculate_score_label(self):
+        human_score = []
+        machine_score = []
+        machine_flag = False
+        for sentence in self.aligned_sentences:
+            if sentence["input_sentence"] == "":
+                continue
+            if sentence["label"] == "HUMAN":
+                human_score.append(sentence["similarity"])
+            elif sentence["label"] == "MACHINE":
+                machine_score.append(1 - sentence["similarity"])
+                machine_flag = True
+
+        if machine_flag is True and len(machine_score) > 0:
+            # average value of machine_score
+            machine_score_avg = sum(machine_score) / len(machine_score)
+            if machine_score_avg < 0.5:
+                machine_score_avg = 1 - machine_score_avg
+            return machine_score_avg, "MACHINE"
+        elif machine_flag is False and len(human_score) > 0:
+            # average value of human_score
+            human_score_avg = sum(human_score) / len(human_score)
+            return human_score_avg, "HUMAN"
+        else:
+            return 0, "UNKNOWN"
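Note: the new `calculate_score_label` lets a single MACHINE-labelled sentence dominate the combined verdict; otherwise the HUMAN similarities are averaged. A self-contained check of that rule, assuming the same sentence dicts the class builds (keys `input_sentence`, `label`, `similarity`):

def calculate_score_label(aligned_sentences):
    human_score, machine_score = [], []
    for s in aligned_sentences:
        if s["input_sentence"] == "":
            continue
        if s["label"] == "HUMAN":
            human_score.append(s["similarity"])
        elif s["label"] == "MACHINE":
            machine_score.append(1 - s["similarity"])
    if machine_score:
        avg = sum(machine_score) / len(machine_score)
        return max(avg, 1 - avg), "MACHINE"  # same effect as the avg < 0.5 flip above
    if human_score:
        return sum(human_score) / len(human_score), "HUMAN"
    return 0, "UNKNOWN"

sentences = [
    {"input_sentence": "a", "label": "HUMAN", "similarity": 0.9},
    {"input_sentence": "b", "label": "MACHINE", "similarity": 0.8},
]
print(calculate_score_label(sentences))  # (0.8, 'MACHINE')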
src/application/image/image_detection.py
CHANGED
@@ -12,6 +12,10 @@ def compare_list_of_images(news_image_path, img_urls):
     matched_url = ""
     max_similarity = 0
     for url in img_urls:
+        if "ichef.bbci.co.uk" in url and " " in url:
+            url_list = url.split(",")
+            if len(url_list) > 0:
+                url = url_list[0].split(" ")[0]
         print(f"\t{url}")
         referred_image = get_image_from_url(url)
         if referred_image is None:
@@ -29,8 +33,6 @@ def compare_list_of_images(news_image_path, img_urls):
 
 def detect_image_from_news_image(news_image_path, image_urls):
     print("\tFrom news:")
-    for url in image_urls:
-        print(f"\t{url}")
     return compare_list_of_images(news_image_path, image_urls)
 
 def detect_image_by_reverse_search(news_image_path):
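Note: the new guard in `compare_list_of_images` deals with BBC image URLs that arrive as srcset-style strings ("url1 240w, url2 480w") and keeps only the first URL. A quick reproduction with a made-up URL:

url = ("https://ichef.bbci.co.uk/ace/standard/240/x.jpg 240w, "
       "https://ichef.bbci.co.uk/ace/standard/480/x.jpg 480w")
if "ichef.bbci.co.uk" in url and " " in url:
    url = url.split(",")[0].split(" ")[0]
print(url)  # https://ichef.bbci.co.uk/ace/standard/240/x.jpg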
src/application/text/entity.py
CHANGED
@@ -166,8 +166,14 @@ def apply_highlight(text, entities_with_colors, key="input", count = 0):
         highlighted_text = ""
 
         # find a list of starts and ends of entity in text:
-        starts = [m.start() for m in re.finditer(entity[key], temp_text)]
-        ends = [m.end() for m in re.finditer(entity[key], temp_text)]
+        # starts = [m.start() for m in re.finditer(entity[key], temp_text)]
+        # ends = [m.end() for m in re.finditer(entity[key], temp_text)]
+        starts = []
+        ends = []
+        # "\b" anchors the match at word boundaries
+        for m in re.finditer(r"\b" + re.escape(entity[key]) + r"\b", temp_text):
+            starts.append(m.start())
+            ends.append(m.end())
 
         all_starts.extend(starts)
         all_ends.extend(ends)
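Note: the effect of the `\b`-anchored pattern, previewed here and exercised more fully in test.py below. The old `re.finditer(entity[key], temp_text)` also matched inside longer words:

import re

temp_text = "win winger winning"
old = [m.start() for m in re.finditer("win", temp_text)]
new = [m.start() for m in re.finditer(r"\b" + re.escape("win") + r"\b", temp_text)]
print(old)  # [0, 4, 11] - also hits "winger" and "winning"
print(new)  # [0] - whole word only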
src/application/text/search.py
CHANGED
@@ -160,12 +160,28 @@ def generate_search_phrases(input_text):
     search_phrases.append(input_text)
 
     # Method 3: Split text by chunks
-
+    search_phrases.extend(get_chunk(input_text))  # TODO: for demo purposes
 
     # Method 4: Get most identities and key words
     entities = extract_entities(input_text)
-
-
+    text_without_entities = remove_identities_from_text(input_text, entities)
+    print(f"text_without_entities: {text_without_entities}")
+    search_phrases.append(text_without_entities)
+    #keywords = get_keywords(input_text, 16)
+    #search_phrase = " ".join(entities) + " " + " ".join(keywords)
     # search_phrases.append(search_phrase) # TODO: for demo purposes
 
     return search_phrases
+
+def remove_identities_from_text(input_text, entities):
+    """
+    Removes entities from the input text.
+
+    Args:
+        input_text: The input text as a string.
+        entities: A list of entities to be removed.
+    """
+    for entity in entities:
+        input_text = input_text.replace(entity, "")
+
+    return input_text
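Note: `remove_identities_from_text` strips each entity verbatim, so doubled or leftover spaces can remain in the search phrase; that is harmless for a search query. Example:

def remove_identities_from_text(input_text, entities):
    for entity in entities:
        input_text = input_text.replace(entity, "")
    return input_text

print(remove_identities_from_text(
    "Leeds target striker Cameron Archer", ["Leeds", "Cameron Archer"]
))  # " target striker "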
src/application/text/search_detection.py
CHANGED
@@ -92,6 +92,76 @@ def detect_text_by_relative_search(input_text, index, is_support_opposite = False):
 
     return False, None, [], [], index
 
+def find_text_source(text, text_index, sentences_df):
+    sentence = {
+        "input_sentence": text[text_index],
+        "matched_sentence": "",
+        "label": "",
+        "similarity": None,
+        "paraphrase": None,
+        "url": "",
+        "group": None,
+    }
+    checked_urls = set()
+    searched_phrases = generate_search_phrases(text[text_index])
+
+    for candidate in searched_phrases:
+        search_results = search_by_google(candidate)
+        urls = [item['link'] for item in search_results.get("items", [])]
+
+        for url in urls[:3]:
+            if url in checked_urls:  # visited url
+                continue
+            if "bbc.com" not in url:
+                continue
+
+            checked_urls.add(url)
+            print(f"\t\tChecking URL: {url}")
+
+            content = URLReader(url)
+
+            if content.is_extracted is True:
+                if content.title is None or content.text is None:
+                    print(f"\t\t\tβββ Title or text not found")
+                    continue
+
+                page_text = content.title + "\n" + content.text
+                if len(page_text) > MAX_CHAR_SIZE:
+                    print(f"\t\t\tβββ More than {MAX_CHAR_SIZE} characters")
+                    continue
+                print(f"\t\t\tβββ Title: {content.title}")
+                paraphrase, aligned_sentence = check_paraphrase(text, page_text, url)
+
+                # add one more key "group" into aligned_sentence
+                sentences_df.loc[index, "input_sentence"] = aligned_sentence["input_sentence"]
+                sentences_df.loc[index, "matched_sentence"] = aligned_sentence["matched_sentence"]
+                sentences_df.loc[index, "label"] = aligned_sentence["label"]
+                sentences_df.loc[index, "similarity"] = aligned_sentence["similarity"]
+                sentences_df.loc[index, "url"] = aligned_sentence["url"]
+
+                if aligned_sentence["paraphrase"] is False:
+                    return paraphrase, sentences_df
+
+                for index, _ in enumerate(sentences_df):
+                    if sentences_df[index]["url"] is not None:
+                        continue
+
+                    # find content in new url
+                    _, aligned_sentence = check_paraphrase(text[index], page_text, url)
+
+                    if aligned_sentence["url"] is not None:
+                        continue
+
+                    sentences_df.loc[index, "input_sentence"] = aligned_sentence["input_sentence"]
+                    sentences_df.loc[index, "matched_sentence"] = aligned_sentence["matched_sentence"]
+                    sentences_df.loc[index, "label"] = aligned_sentence["label"]
+                    sentences_df.loc[index, "similarity"] = aligned_sentence["similarity"]
+                    sentences_df.loc[index, "url"] = aligned_sentence["url"]
+
+                return sentences_df, content.images
+
+    return sentence, []
+
 def longest_common_subsequence(arr1, arr2):
     """
     Finds the length of the longest common subsequence (contiguous) between
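Note: `find_text_source` fills `aligned_sentences_df` row by row with `.loc`. A toy sketch of that pattern (static values standing in for real search results):

import pandas as pd

df = pd.DataFrame(columns=["input_sentence", "matched_sentence", "label", "similarity", "url"])
df = pd.concat([df, pd.DataFrame([{} for _ in range(3)])], ignore_index=True)  # placeholder rows

df.loc[0, "input_sentence"] = "Japan pledges support for Ukraine."
df.loc[0, "matched_sentence"] = "Japan pledged support for Ukraine"
df.loc[0, "label"] = "HUMAN"
df.loc[0, "similarity"] = 0.93
df.loc[0, "url"] = "https://www.bbc.com/news"
print(df.head())

One caveat worth flagging: as committed, `find_text_source` indexes with `sentences_df.loc[index, ...]` while its parameters are `text` and `text_index`, so callers should check which index variable is actually in scope.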
test.py
CHANGED
@@ -1,27 +1,50 @@
-import
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+import re
+
+def find_entity_spans(entity, text):
+    """
+    Finds the start and end indices of whole-word entities in text.
+
+    Args:
+        entity: The entity string to search for.
+        text: The text to search within.
+
+    Returns:
+        A list of tuples, where each tuple contains the start and end indices
+        of a found entity. Returns an empty list if no entities are found.
+    """
+    spans = []
+    for m in re.finditer(r"\b" + re.escape(entity) + r"\b", text):  # the crucial change
+        spans.append((m.start(), m.end()))
+    return spans
+
+# Example usage:
+temp_text = "win winger winning"
+entity = {"key": "win"}  # Example dictionary (adjust as needed)
+
+spans = find_entity_spans(entity["key"], temp_text)
+print(spans)  # Output: [(0, 3)] (only the standalone "win" matches)
+
+temp_text = "The quick brown fox jumps over the lazy dog."
+entity = {"key": "fox"}
+spans = find_entity_spans(entity["key"], temp_text)
+print(spans)  # Output: [(16, 19)]
+
+temp_text = "foxes fox foxing"
+entity = {"key": "fox"}
+spans = find_entity_spans(entity["key"], temp_text)
+print(spans)  # Output: [(6, 9)]
+
+temp_text = "winger win winning"
+entity = {"key": "win"}
+spans = find_entity_spans(entity["key"], temp_text)
+print(spans)  # Output: [(7, 10)]
+
+temp_text = "winger win winning"
+entity = {"key": "winger"}
+spans = find_entity_spans(entity["key"], temp_text)
+print(spans)  # Output: [(0, 6)]
+
+temp_text = "winger win winning"
+entity = {"key": "winning"}
+spans = find_entity_spans(entity["key"], temp_text)
+print(spans)  # Output: [(11, 18)]