pmkhanh7890 committed on
Commit
d952fbe
Β·
1 Parent(s): badcb49

revise demo

Browse files
Yandexsample.html DELETED
The diff for this file is too large to render. See raw diff
 
app.py CHANGED
@@ -1,23 +1,62 @@
1
- from bs4 import BeautifulSoup
2
- import requests
3
 
4
- from src.application.image.search_yandex import get_image_links
 
 
5
 
 
 
 
6
 
7
- img_search_url = """https://yandex.ru/images/search?cbir_id=4481385%2Fw-xYJ246B9thwtVBmNcpkg9409&rpt=imageview&lr=10636"""
8
- print(img_search_url)
 
 
 
 
 
 
 
9
 
 
 
10
 
11
- headers = {
12
- 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
13
- 'Content-Type': 'application/json',
14
- }
15
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
- response = requests.get(img_search_url, headers=headers)
18
- response.raise_for_status() # Raise an exception for bad status codes
19
 
20
- # Parse the HTML content
21
- soup = BeautifulSoup(response.content, 'html.parser')
22
- image_urls = get_image_links(soup.prettify())
23
- print(f"image_urls: {image_urls}")
 
1
+ import difflib
 
2
 
3
def compare_sentences(sentence_1, sentence_2):
    """
    Identify all character-level common phrases between two sentences.

    Uses difflib.SequenceMatcher to locate matching blocks and reports,
    for each non-empty match, the phrase text plus its start/end offsets
    in both inputs.

    Args:
        sentence_1: The first sentence (string).
        sentence_2: The second sentence (string).

    Returns:
        A list of dicts with keys "phrase", "start_1", "end_1",
        "start_2", "end_2". Empty list when either input is empty
        or no common phrase exists.
    """
    # Guard: nothing can match against an empty string.
    if not (sentence_1 and sentence_2):
        return []

    matcher = difflib.SequenceMatcher(None, sentence_1, sentence_2)

    # get_matching_blocks() ends with a zero-length sentinel; `if size`
    # drops it along with any other empty match.
    return [
        {
            "phrase": sentence_1[a:a + size],
            "start_1": a,
            "end_1": a + size,
            "start_2": b,
            "end_2": b + size,
        }
        for a, b, size in matcher.get_matching_blocks()
        if size
    ]
45
+
46
+
47
+
48
# Example usage:
# FIX: the original opened sentence_1 with a single double-quote and let the
# text run across several physical lines — a SyntaxError in Python. A
# triple-quoted literal preserves the exact same text legally.
sentence_1 = """
Muzzamil Hussain was in 3rd-grade school when the first bombs fell on the playground outside of his classroom in Kargil, a mountain city in India. While the violent onset of the 1998 Kargil war between Pakistan and India unfolded around him, Hussain and his family escaped south to the remote Suru Valley.
After India claimed victory later that year and displaced families returned home, Hussain listened as his bedridden grandfather asked the family to visit an old property, initially built by Hussain's great-grandfather, near Kargil's bazaar to make sure it had survived the war. When Hussain's uncles cracked through an old rusty latch and peered through the hand-carved wooden doors, they discovered wooden crates stamped with names of cities worldwide. Making space on the dusty floor, the family began to lay out silks from China, silver cookware from Afghanistan, rugs from Persia, turquoise from Tibet, saddles from Mongolia, and luxury soaps and salves from London, New York, and Munich. """
sentence_2 = "A quick brown fox jumps over a lazy cat."

common_phrases = compare_sentences(sentence_1, sentence_2)

# Print one dict per common phrase, or a notice when nothing overlaps.
if common_phrases:
    for phrase_data in common_phrases:
        print(phrase_data)
else:
    print("No common phrases found.")
61
 
 
 
62
 
 
 
 
 
application.py CHANGED
@@ -43,10 +43,12 @@ def load_url(url):
43
 
44
 
45
  def generate_analysis_report(news_title:str, news_content: str, news_image: Image):
 
46
  news_analysis.load_news(news_title, news_content, news_image)
47
- return news_analysis.generate_analysis_report(), news_analysis.analyze_details()
 
 
48
 
49
- news_analysis = NewsVerification()
50
  # Define the GUI
51
  with gr.Blocks() as demo:
52
  gr.Markdown("# NEWS VERIFICATION")
@@ -54,14 +56,11 @@ with gr.Blocks() as demo:
54
  with gr.Row():
55
  # SETTINGS
56
  with gr.Column(scale=1):
57
- with gr.Accordion("Settings"):
58
- gr.Markdown("Give an URL or fill in news by yourself")
59
-
60
  with gr.Accordion("1. Enter a URL"):
61
  url_input = gr.Textbox(
62
  label="",
63
  show_label=False,
64
- value="https://bbc.com/future/article/20250110-how-often-you-should-wash-your-towels-according-to-science",
65
  )
66
  load_button = gr.Button("Load URL")
67
 
@@ -82,20 +81,17 @@ with gr.Blocks() as demo:
82
  )
83
  replace_button = gr.Button("Replace all")
84
 
85
- # GENERATED CONTENT
86
- with gr.Column(scale=1):
87
- with gr.Accordion("Input News"):
88
- news_title = gr.Textbox(label="Title", value="")
89
- news_image = gr.Image(label="Image", type="filepath")
90
- news_content = gr.Textbox(label="Content", value="", lines=12)
91
 
92
  # NEWS ANALYSIS REPORT
93
- with gr.Column(scale=1):
94
  with gr.Accordion("News Analysis"):
95
  detection_button = gr.Button("Verify news")
96
- analyzed_information = gr.HTML()
97
- with gr.Accordion("Detailed information"):
98
- detailed_analysis = gr.HTML()
99
 
100
  # Connect events
101
  load_button.click(
@@ -114,25 +110,45 @@ with gr.Blocks() as demo:
114
  outputs=[news_image])
115
  detection_button.click(generate_analysis_report,
116
  inputs=[news_title, news_content, news_image],
117
- outputs=[analyzed_information, detailed_analysis])
118
 
119
  # change Image
120
  #url_input.change(load_image, inputs=url_input, outputs=image_view)
121
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
  gr.Examples(
123
  examples=[
124
- ["https://www.bbc.com/travel/article/20250127-one-of-the-last-traders-on-the-silk-road"],
125
- ["https://bbc.com/future/article/20250110-how-often-you-should-wash-your-towels-according-to-science"],
 
126
  ],
127
- inputs=[url_input],
128
  label="Examples",
129
  example_labels=[
130
- "BBC news 1",
131
- "BBC news 2",
 
132
  ],
133
  )
134
 
135
- demo.launch()
136
 
137
 
138
  # https://www.bbc.com/travel/article/20250127-one-of-the-last-traders-on-the-silk-road
 
43
 
44
 
45
  def generate_analysis_report(news_title:str, news_content: str, news_image: Image):
46
+ news_analysis = NewsVerification()
47
  news_analysis.load_news(news_title, news_content, news_image)
48
+ news_analysis.generate_analysis_report()
49
+ return news_analysis.analyze_details()
50
+
51
 
 
52
  # Define the GUI
53
  with gr.Blocks() as demo:
54
  gr.Markdown("# NEWS VERIFICATION")
 
56
  with gr.Row():
57
  # SETTINGS
58
  with gr.Column(scale=1):
 
 
 
59
  with gr.Accordion("1. Enter a URL"):
60
  url_input = gr.Textbox(
61
  label="",
62
  show_label=False,
63
+ value="",
64
  )
65
  load_button = gr.Button("Load URL")
66
 
 
81
  )
82
  replace_button = gr.Button("Replace all")
83
 
84
+ # GENERATED CONTENT
85
+ with gr.Accordion("Input News"):
86
+ news_title = gr.Textbox(label="Title", value="")
87
+ news_image = gr.Image(label="Image", type="filepath")
88
+ news_content = gr.Textbox(label="Content", value="", lines=12)
 
89
 
90
  # NEWS ANALYSIS REPORT
91
+ with gr.Column(scale=2):
92
  with gr.Accordion("News Analysis"):
93
  detection_button = gr.Button("Verify news")
94
+ detailed_analysis = gr.HTML()
 
 
95
 
96
  # Connect events
97
  load_button.click(
 
110
  outputs=[news_image])
111
  detection_button.click(generate_analysis_report,
112
  inputs=[news_title, news_content, news_image],
113
+ outputs=[detailed_analysis])
114
 
115
  # change Image
116
  #url_input.change(load_image, inputs=url_input, outputs=image_view)
117
 
118
+ try:
119
+ with open('sample_1.txt','r', encoding='utf-8') as file:
120
+ text_sample_1 = file.read()
121
+ with open('sample_2.txt','r', encoding='utf-8') as file:
122
+ text_sample_2 = file.read()
123
+ with open('sample_3.txt','r', encoding='utf-8') as file:
124
+ text_sample_3 = file.read()
125
+ except FileNotFoundError:
126
+ print("File not found.")
127
+ except Exception as e:
128
+ print(f"An error occurred: {e}")
129
+
130
+ title_1 = "The ancient discovery that put a Silk Road city back on the map"
131
+ title_2 = "The modern rediscovery that erased a Silk Road city from the map"
132
+
133
+ image_1 = "sample_1.jpg.webp"
134
+ image_2 = "sample_2.jpg.webp"
135
+
136
  gr.Examples(
137
  examples=[
138
+ [title_1, image_1, text_sample_1],
139
+ [title_2, image_2, text_sample_2],
140
+ [title_1, image_2, text_sample_3],
141
  ],
142
+ inputs=[news_title, news_image, news_content],
143
  label="Examples",
144
  example_labels=[
145
+ "2 real news",
146
+ "2 modified news",
147
+ "1 real news & 1 fake news",
148
  ],
149
  )
150
 
151
+ demo.launch(share=False)
152
 
153
 
154
  # https://www.bbc.com/travel/article/20250127-one-of-the-last-traders-on-the-silk-road
application_2.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import gradio as gr
4
+ import requests
5
+ from PIL import Image
6
+
7
+ from src.application.content_detection import NewsVerification
8
+ from src.application.url_reader import URLReader
9
+ from src.application.content_generation import generate_fake_image, generate_fake_text, replace_text
10
+
11
+
12
+ GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
13
+ SEARCH_ENGINE_ID = os.getenv('SEARCH_ENGINE_ID')
14
+
15
+ AZURE_TEXT_MODEL = ["gpt-4o-mini", "gpt-4o"]
16
+ AZURE_IMAGE_MODEL = ["dall-e-3", "Stable Diffusion (not supported)"]
17
+
18
def load_url(url):
    """
    Load news content from the given URL.

    Fetches the article via URLReader, verifies the page is reachable,
    and tries to download the article's top image.

    Args:
        url: Address of the news article to load.

    Returns:
        A (title, text, image) tuple; `image` is a PIL Image or None
        when the top image could not be fetched or decoded.
    """
    content = URLReader(url)
    image = None
    # Spoof a browser UA: many news sites reject default client UAs.
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36'}
    try:
        # timeout added so a stalled server cannot hang the UI forever.
        response = requests.get(
            url,
            headers=header,
            stream=True,
            timeout=30,
        )
        response.raise_for_status()  # Raise an exception for bad status codes

        image_response = requests.get(content.top_image, stream=True, timeout=30)
        try:
            image = Image.open(image_response.raw)
        # FIX: was a bare `except:` which also swallowed SystemExit /
        # KeyboardInterrupt; narrow to Exception and keep best-effort behavior.
        except Exception:
            print(f"Error loading image from {content.top_image}")

    except (requests.exceptions.RequestException, FileNotFoundError) as e:
        print(f"Error fetching image: {e}")

    return content.title, content.text, image
43
+
44
+
45
def generate_analysis_report(news_title:str, news_content: str, news_image: Image):
    """
    Run the verification pipeline on one news item.

    Builds a fresh NewsVerification instance, feeds it the title, body
    text and image, generates the analysis report, and returns the
    detailed analysis (HTML) for display.
    """
    verifier = NewsVerification()
    verifier.load_news(news_title, news_content, news_image)
    verifier.generate_analysis_report()
    return verifier.analyze_details()
50
+
51
+
52
# Define the GUI
# Layout: left column = inputs (URL loader, hidden generation controls,
# editable news fields); right column = the verification report.
# NOTE(review): nesting below is reconstructed from a diff rendering that
# strips indentation — confirm against the original file.
with gr.Blocks() as demo:
    gr.Markdown("# NEWS VERIFICATION")

    with gr.Row():
        # SETTINGS
        with gr.Column(scale=1):
            with gr.Accordion("1. Enter a URL"):
                url_input = gr.Textbox(
                    label="",
                    show_label=False,
                    value="",
                )
                load_button = gr.Button("Load URL")

            # Hidden in this demo build (visible=False): model pickers for
            # generating fake text/images.
            with gr.Accordion("2. Select content-generation models", open=True, visible=False):
                with gr.Row():
                    text_generation_model = gr.Dropdown(choices=AZURE_TEXT_MODEL, label="Text-generation model")
                    image_generation_model = gr.Dropdown(choices=AZURE_IMAGE_MODEL, label="Image-generation model")
                generate_text_button = gr.Button("Generate text")
                generate_image_button = gr.Button("Generate image")

            # Also hidden: bulk find/replace over the news text.
            with gr.Accordion("3. Replace any terms", open=True, visible=False):
                replace_df = gr.Dataframe(
                    headers=["Find what:", "Replace with:"],
                    datatype=["str", "str"],
                    row_count=(1, "dynamic"),
                    col_count=(2, "fixed"),
                    interactive=True
                )
                replace_button = gr.Button("Replace all")

            # GENERATED CONTENT
            with gr.Accordion("Input News"):
                news_title = gr.Textbox(label="Title", value="")
                news_image = gr.Image(label="Image", type="filepath")
                news_content = gr.Textbox(label="Content", value="", lines=12)

        # NEWS ANALYSIS REPORT
        with gr.Column(scale=2):
            with gr.Accordion("News Analysis"):
                detection_button = gr.Button("Verify news")
                detailed_analysis = gr.HTML()

    # Connect events
    load_button.click(
        load_url,
        inputs=url_input,
        outputs=[news_title, news_content, news_image]
    )
    replace_button.click(replace_text,
                  inputs=[news_title, news_content, replace_df],
                  outputs=[news_title, news_content])
    generate_text_button.click(generate_fake_text,
                  inputs=[text_generation_model, news_title, news_content],
                  outputs=[news_title, news_content])
    generate_image_button.click(generate_fake_image,
                  inputs=[image_generation_model, news_title],
                  outputs=[news_image])
    detection_button.click(generate_analysis_report,
                  inputs=[news_title, news_content, news_image],
                  outputs=[detailed_analysis])

    # change Image
    #url_input.change(load_image, inputs=url_input, outputs=image_view)

    # Load canned sample articles used as clickable examples below.
    try:
        with open('sample_1.txt','r', encoding='utf-8') as file:
            text_sample_1 = file.read()
        with open('sample_2.txt','r', encoding='utf-8') as file:
            text_sample_2 = file.read()
        with open('sample_3.txt','r', encoding='utf-8') as file:
            text_sample_3 = file.read()
    except FileNotFoundError:
        print("File not found.")
    except Exception as e:
        print(f"An error occurred: {e}")

    title_1 = "The ancient discovery that put a Silk Road city back on the map"
    title_2 = "The modern rediscovery that erased a Silk Road city from the map"

    image_1 = "sample_1.jpg.webp"
    image_2 = "sample_2.jpg.webp"

    # NOTE(review): if any sample_*.txt is missing, the except branches above
    # only print — text_sample_* stays undefined and gr.Examples below raises
    # NameError. Consider defaulting the samples to "" before the try block.
    gr.Examples(
        examples=[
            [title_1, image_1, text_sample_1],
            [title_2, image_2, text_sample_2],
            [title_1, image_2, text_sample_3],
        ],
        inputs=[news_title, news_image, news_content],
        label="Examples",
        example_labels=[
            "2 real news",
            "2 modified news",
            "1 real news & 1 fake news",
        ],
    )

# share=False keeps the app local (no public gradio.live tunnel).
demo.launch(share=False)
demo.py DELETED
@@ -1,309 +0,0 @@
1
- import os
2
-
3
- from src.images.Search_Image.search import find_similar_img_from_url
4
-
5
- import re
6
-
7
- import gradio as gr
8
-
9
- from src.images.Search_Image.image_model_share import (
10
- image_generation_detection,
11
- )
12
- from src.texts.Search_Text._text_detection_share import (
13
- UNKNOWN,
14
- abstract_detect_generated_text,
15
- )
16
- from src.texts.Search_Text.fake_text_generation_share import (
17
- highlight_overlap_by_word_to_list,
18
- )
19
-
20
- os.environ["no_proxy"] = "localhost,127.0.0.1,::1"
21
-
22
- TEMP_IMAGE = "temp_image.jpg"
23
- TEMP_INPUT_IMAGE = "temp_input_image.jpg"
24
-
25
- HUMAN_IMAGE = "data/test_data/human_news.jpg"
26
-
27
- HUMAN_CAPTION = "Stoke City have secured West Brom striker Saido Berahino for Β£12 million on a five-and-a-half-year contract."
28
- HUMAN_CONTENT = """
29
- Tracey Jolliffe has already donated a kidney, 16 eggs and 80 pints of blood, and intends to leave her brain to science. She is now hoping to give away part of her liver to a person she may never meet.
30
- "If I had another spare kidney, I'd do it again," Tracey tells the BBC's Victoria Derbyshire programme.
31
- She is what is known as an "altruistic donor" - someone willing to give away an organ to potentially help save the life of a complete stranger.
32
- A microbiologist in the NHS, and the daughter of two nurses, she has spent her life learning about the importance of healthcare from a professional standpoint.
33
- But she has also been keen to make a difference on a personal level.
34
- "I signed up to donate blood, and to the bone marrow register, when I was 18," she says.
35
- Now 50, her wish to donate has become gradually more expansive.
36
- In 2012, she was one of fewer than 100 people that year to donate a kidney without knowing the recipient's identity - and now supports the charity Give A Kidney, encouraging others to do the same.
37
- As of 30 September 2016, 5,126 people remain on the NHS kidney transplant waiting list.
38
- Tracey's kidney donation, in all likelihood, will have saved someone's life.
39
- "I remind myself of it every day when I wake up," she says, rightly proud of her life-changing actions.
40
- It was not, however, a decision taken on the spur of a moment.
41
- Donating a kidney is an "involved process", she says, with suitability assessments taking at least three months to complete.
42
- Tests leading up to the transplant include X-rays, heart tracing and a special test of kidney function, which involves an injection and a series of blood tests.
43
- "It is not something to do if you're scared of needles," she jokes.
44
- The risks associated with donating, however, are relatively low for those deemed healthy enough to proceed, with a mortality rate of about one in 3,000 - roughly the same as having an appendix removed.
45
- Compared with the general public, NHS Blood and Transplant says, most kidney donors have equivalent - or better - life expectancy than the average person.
46
- Tracey says she was in hospital for five days after her operation but felt "back to normal" within six weeks.
47
- """
48
-
49
- HUMAN_NEWS_CNN = """
50
- Mayotte authorities fear hunger and disease after cyclone, as death toll rises in Mozambique
51
- Cyclone Chido caused devastation in Mayotte and authorities are now rushing to prevent disease and hunger spreading in the French overseas territory Sipa USA
52
- Authorities in Mayotte were racing on Tuesday to stop hunger, disease and lawlessness from spreading in the French overseas territory after the weekend’s devastating cyclone, while Mozambique reported dozens of deaths from the storm.
53
- Hundreds or even thousands could be dead in Mayotte, which took the strongest hit from Cyclone Chido, French officials have said. The storm laid waste to large parts of the archipelago off east Africa, France’s poorest overseas territory, before striking continental Africa.
54
- With many parts of Mayotte still inaccessible and some victims buried before their deaths could be officially counted, it may take days to discover the full extent of the destruction.
55
- So far, 22 deaths and more than 1,400 injuries have been confirmed, Ambdilwahedou Soumaila, the mayor of the capital Mamoudzou, told Radio France Internationale on Tuesday morning.
56
- β€œThe priority today is water and food,” Soumaila said. β€œThere are people who have unfortunately died where the bodies are starting to decompose that can create a sanitary problem.”
57
- β€œWe don’t have electricity. When night falls, there are people who take advantage of that situation.”
58
-
59
- Rescue workers operate in storm-hit Mayotte on Wednesday.
60
- Rescue workers operate in storm-hit Mayotte on Wednesday. Securite Civile via Reuters
61
- Twenty tonnes of food and water are due to start arriving on Tuesday by air and sea. The French government said late on Monday it expects 50% of water supplies to be restored within 48 hours and 95% within the week.
62
- France’s interior ministry announced that a curfew would go into effect on Tuesday night from 10 p.m. to 4 a.m. local time.
63
- Rescue workers have been searching for survivors amid the debris of shantytowns bowled over by 200 kph (124 mph) winds.
64
- Chido was the strongest storm to strike Mayotte in more than 90 years, French weather service Meteo France said. In Mozambique, it killed at least 34 people, officials said on Tuesday. Another seven died in Malawi.
65
- Drone footage from Mozambique’s Cabo Delgado province, already experiencing a humanitarian crisis due to an Islamist insurgency, showed razed thatched-roof houses near the beach and personal belongings scattered under the few palm trees still standing.
66
-
67
- Dispute over immigration
68
- French President Emmanuel Macron said after an emergency cabinet meeting on Monday that he would visit Mayotte in the coming days, as the disaster quickly fueled a political back-and-forth about immigration, the environment and France’s treatment of its overseas territories.
69
- Mayotte has been grappling with unrest in recent years, with many residents angry at illegal immigration and inflation.
70
- More than three-quarters of its roughly 321,000 people live in relative poverty, and about one-third are estimated to be undocumented migrants, most from nearby Comoros and Madagascar.
71
- The territory has become a stronghold for the far-right National Rally with 60% voting for Marine Le Pen in the 2022 presidential election runoff.
72
- France’s acting Interior Minister Bruno Retailleau, from the conservative Republicans party, told a news conference in Mayotte that the early warning system had worked β€œperfectly” but many of the undocumented had not come to designated shelters.
73
- People stand amid uprooted trees and debris after cyclone Chido hit Mecufi district, Cabo Delgado province, Mozambique, on December 16.
74
- People stand amid uprooted trees and debris after cyclone Chido hit Mecufi district, Cabo Delgado province, Mozambique, on December 16. UNICEF Mozambique via Reuters
75
- Other officials have said undocumented migrants may have been afraid to go to shelters for fear of being arrested.
76
- The toll of the cyclone, Retailleau said in a later post on X, underscored the need to address β€œthe migration question.”
77
- β€œMayotte is the symbol of the drift that (French) governments have allowed to take hold on this issue,” he said. β€œWe will need to legislate so that in Mayotte, like everywhere else on the national territory, France retakes control of its immigration.”
78
- Left-wing politicians, however, have pointed the finger at what they say is the government’s neglect of Mayotte and failure to prepare for natural disasters linked to climate change.
79
- Socialist Party chairman Olivier Faure blasted Retailleau’s comments in an X post.
80
- β€œHe could have interrogated the role of climate change in producing more and more intense climate disasters. He could have rallied against the extreme poverty that makes people more vulnerable to cyclones,” said Faure.
81
- β€œNo, he has resumed his crusade against migrants.”
82
- Prime Minister Francois Bayrou, appointed last week to steer France out of a political crisis, faced criticism after he went to the town of Pau, where he is the mayor, to attend a municipal council meeting on Monday, instead of visiting Mayotte.
83
- """
84
-
85
- HUMAN_NEWS_CNN_IMAGE = "human_cnn.webp"
86
- # generate a short news related to sport
87
-
88
- # opposite
89
- OPPOSITE_NEWS = """
90
- Tracey Jolliffe has never donated a kidney, any eggs, or blood, and has no plans to leave her brain to science. She is not considering giving away any part of her liver to someone she knows.
91
- "If I had another spare kidney, I wouldn't do it again," Tracey tells the BBC's Victoria Derbyshire programme.
92
- She is not an "altruistic donor" - someone unwilling to give away an organ to potentially save the life of a complete stranger.
93
- A microbiologist outside the NHS, with parents who were not in healthcare, she has spent her life without focusing on the importance of healthcare from a professional standpoint.
94
- She has also not been eager to make a difference on a personal level.
95
- "I never signed up to donate blood, nor to the bone marrow register, when I was 18," she says.
96
- Now 50, her interest in donating has not expanded.
97
- In 2012, she was not among the few people that year to donate a kidney without knowing the recipient's identity - and does not support the charity Give A Kidney, discouraging others from doing the same.
98
- As of 30 September 2016, 5,126 people remain on the NHS kidney transplant waiting list.
99
- Tracey's decision not to donate a kidney hasn't saved anyone's life.
100
- "I never think about it when I wake up," she says, indifferent about her choices.
101
- It was not a decision made after careful consideration.
102
- Donating a kidney is not an "involved process", she says, with suitability assessments taking less than three months to complete.
103
- Tests leading up to the transplant do not include X-rays, heart tracing, or a special test of kidney function, which does not involve an injection or any blood tests.
104
- "It is something to do if you're scared of needles," she jokes.
105
- The risks associated with donating, however, are relatively high for those not deemed healthy enough to proceed, with a high mortality rate - much greater than having an appendix removed.
106
- Compared with the general public, NHS Blood and Transplant says, most kidney donors have worse life expectancy than the average person.
107
- Tracey says she was not in hospital after any operation and did not feel "back to normal" within six weeks.
108
- """
109
-
110
- PARAPHASE_NEWS = """
111
- Tracey Jolliffe has generously donated a kidney, 16 eggs, and 80 pints of blood, and plans to donate her brain to science. She now hopes to donate part of her liver to someone she may never meet. "If I had another spare kidney, I'd do it again," she shares with the BBC's Victoria Derbyshire program. Known as an "altruistic donor," Tracey is willing to donate organs to help save the lives of strangers.
112
- As a microbiologist in the NHS and the daughter of two nurses, Tracey has always understood the importance of healthcare professionally. However, she also strives to make a personal impact. "I signed up to donate blood and joined the bone marrow register at 18," she explains. Now 50, her desire to donate has expanded over the years.
113
- In 2012, Tracey was among fewer than 100 people that year who donated a kidney without knowing the recipient. She now supports Give A Kidney, a charity that encourages others to donate. As of 30 September 2016, 5,126 people were on the NHS kidney transplant waiting list. Tracey's kidney donation likely saved a life. "I remind myself of it every day when I wake up," she says, proud of her life-changing decision.
114
- Donating a kidney was not a spontaneous decision for Tracey. It is a complex process, she explains, with suitability assessments taking at least three months. Pre-transplant tests include X-rays, heart monitoring, and a special kidney function test involving an injection and multiple blood tests. "It's not for those afraid of needles," she jokes.
115
- For healthy individuals, the risks of donating a kidney are relatively low, with a mortality rate of about one in 3,000, similar to having an appendix removed. According to NHS Blood and Transplant, most kidney donors have the same or better life expectancy compared to the general population. Tracey was hospitalized for five days after her operation and felt "back to normal" within six weeks.
116
- """
117
-
118
- MACHINE_IMAGE = "data/test_data/machine_news.png"
119
- # MACHINE_CAPTION = "Argentina Secures Victory in Thrilling Friendly Match Against Brazil"
120
- MACHINE_CONTENT = """
121
- Tracey Jolliffe has already donated a kidney, 16 eggs, and 80 pints of blood, and she intends to leave her brain to science. She is now hoping to give away part of her liver to a person she may never meet.
122
- "If I had another spare kidney, I'd do it again," Tracey tells the BBC's Victoria Derbyshire programme.
123
- She is what is known as an "altruistic donor"β€”someone willing to give away an organ to potentially help save the life of a complete stranger.
124
- A microbiologist in the NHS and the daughter of two nurses, she has spent her life learning about the importance of healthcare from a professional standpoint. But she has also been keen to make a difference on a personal level. "I signed up to donate blood and to the bone marrow register when I was 18," she says.
125
- Now 50, her wish to donate has become gradually more expansive. In 2012, she was one of fewer than 100 people that year to donate a kidney without knowing the recipient's identity, and she now supports the charity Give A Kidney, encouraging others to do the same.
126
- As of 30 September 2016, 5,126 people remain on the NHS kidney transplant waiting list. Tracey's kidney donation, in all likelihood, has saved someone's life. "I remind myself of it every day when I wake up," she says, rightly proud of her life-changing actions.
127
- It was not, however, a decision taken on the spur of a moment. Donating a kidney is an "involved process," she says, with suitability assessments taking at least three months to complete. Tests leading up to the transplant include X-rays, heart tracing, and a special test of kidney function, which involves an injection and a series of blood tests. "It is not something to do if you're scared of needles," she jokes.
128
- The risks associated with donating, however, are relatively low for those deemed healthy enough to proceed, with a mortality rate of about one in 3,000β€”roughly the same as having an appendix removed. Compared with the general public, NHS Blood and Transplant says, most kidney donors have equivalentβ€”or betterβ€”life expectancy than the average person.
129
- Tracey says she was in hospital for five days after her operation but felt "back to normal" within six weeks.
130
- """
131
-
132
- HUMAN_BBC_NEWS2 = """
133
- A message of hope at Washington march
134
- For such a divisive figure, Donald Trump managed to unify hundreds of thousands of Americans at the Women's March on Washington.
135
- Moments after Mr Trump was sworn in as the 45th president on Friday, he delivered a thundering speech in which he promised to improve the lives of millions of Americans.
136
- A day later, throngs of women, men and children streamed into the same area where he made that pledge, in order to take a stand for gender and racial equality.
137
- Though Mr Trump's named was mentioned frequently, the march, which organisers estimate attracted more than half a million, was not only about the new US president.
138
- Messages ranged from "Thank you for making me an activist Trump" to "We will not be silenced," but the common thread throughout the patchwork of signs was hope.
139
- "It's about solidarity and visualising the resistance," said Jonathon Meier, who took a bus from New York.
140
- "And I think it not only helps with the healing process, but it gives me hope for the next four years."
141
- A sea of activists, some clad in knitted, pink "pussy" hats and others draped in American flags, ambled about the National Mall, stopping to catch a glimpse of some of the high-profile speakers and singing along to songs like "This Little Light of Mine".
142
- Peppered among the many protest signs were images of ovaries and female genitals, a nod to concerns over losing access to birth control and abortion care under a Trump administration.
143
- """
144
-
145
- FREELY_GENERATION_NEWS = """
146
- A new study has indicated that criminals and terrorists are increasingly turning to the dark net to purchase weapons. The study, conducted by cybersecurity firm Recorded Future, found that these purchases are being made anonymously and with cryptocurrency, making it difficult for law enforcement agencies to track and intercept them. The dark net is a hidden part of the internet, accessible only through anonymous browsers, where users can buy and sell a variety of illegal goods and services. However, the study found that weapons purchases are becoming more popular on the dark net, with firearms and explosives being the most commonly traded items. Recorded Future's research showed that many of the weapons being sold on the dark net are military-grade, and the study suggests that this is due to the large number of surplus weapons available following military conflicts in various parts of the world. The report also found that the sellers on the dark net are often located in countries with lax gun laws, leading to concerns that these weapons could end up in the hands of criminals and terrorists who could use them to commit acts of violence. The use of cryptocurrency to purchase these weapons adds another layer of difficulty for law enforcement agencies trying to track down those responsible. The anonymity provided by cryptocurrency allows buyers and sellers to conduct their transactions without leaving a trace. The findings of this study serve as a stark reminder of the dangers posed by the dark net, and the need for law enforcement agencies to remain vigilant in their efforts to combat illegal activity on this hidden part of the internet.
147
- """
148
-
149
- HUMAN_BBC_NEWS2_IMAGE = "human_bbc_news_2.webp"
150
-
151
- HIGHLIGHT = "highlight"
152
-
153
-
154
- def highlight_text(words, indexes):
155
- final_words = words
156
- for index in indexes:
157
- final_words[index] = (
158
- f"<span style='color:#00FF00; font-weight:bold;'>{words[index]}</span>"
159
- )
160
- return " ".join(final_words)
161
-
162
-
163
- def format_pair(pair):
164
- input_sentence = highlight_text(pair[0], pair[2])
165
- source_sentence = highlight_text(pair[1], pair[3])
166
- return f"<tr><td>{input_sentence}</td><td>{source_sentence}</td></tr>"
167
-
168
-
169
- def create_table(data):
170
- table_rows = "\n".join([format_pair(pair) for pair in data])
171
- return f"""
172
- <h5> Comparison between input news and source news at the above link</h5>
173
- <table border="1" style="width:100%; text-align:left; border-collapse:collapse;">
174
- <thead>
175
- <tr>
176
- <th>Input sentence</th>
177
- <th>Source sentence</th>
178
- </tr>
179
- </thead>
180
- <tbody>
181
- {table_rows}
182
- </tbody>
183
- </table>
184
- """
185
-
186
-
187
- with gr.Blocks() as demo:
188
- image = gr.Image(
189
- value=HUMAN_IMAGE,
190
- label="News Image",
191
- height=200,
192
- width=200,
193
- type="filepath",
194
- )
195
- content = gr.Textbox(label="Content", lines=3, value=HUMAN_CONTENT)
196
-
197
- process_btn = gr.Button("Process")
198
-
199
- """
200
- 1. human bbc news
201
- 2. proofreading
202
- 3. opposite
203
- 4. human bbc news 2
204
- 5. human_cnn news
205
- 6. paraphrase
206
- 7. freely generation
207
- """
208
- gr.Examples(
209
- examples=[
210
- [HUMAN_IMAGE, HUMAN_CONTENT],
211
- [MACHINE_IMAGE, MACHINE_CONTENT],
212
- [MACHINE_IMAGE, OPPOSITE_NEWS],
213
- [HUMAN_BBC_NEWS2_IMAGE, HUMAN_BBC_NEWS2],
214
- [HUMAN_NEWS_CNN_IMAGE, HUMAN_NEWS_CNN],
215
- [MACHINE_IMAGE, PARAPHASE_NEWS],
216
- [MACHINE_IMAGE, FREELY_GENERATION_NEWS],
217
- ],
218
- inputs=[image, content],
219
- label="examples",
220
- example_labels=[
221
- "human bbc news",
222
- "proofreading",
223
- "opposite",
224
- "human bbc news 2",
225
- "human cnn news",
226
- "paraphrase",
227
- "freely generation",
228
- ],
229
- )
230
-
231
- overall = gr.HTML()
232
- matching_html = gr.HTML()
233
-
234
- def process(input_image, content):
235
- (
236
- search_engine_prediction,
237
- SOTA_prediction,
238
- SOTA_confidence,
239
- found_url,
240
- sentence_pairs,
241
- ) = abstract_detect_generated_text(content)
242
-
243
- final_table = []
244
- COLOR_MAPS = {
245
- "HUMAN": "<span style='color:#FFFF00'>",
246
- "MACHINE": "<span style='color:red'>",
247
- }
248
-
249
- source_image = []
250
- image_prediction_label, image_confidence = image_generation_detection(
251
- input_image,
252
- )
253
- # [found_img_url, image_different_score] = find_similar_img_from_url(input_image)
254
-
255
- # if 0 < image_different_score < 10:
256
- # search_engine_description = f'Most likely generated by {COLOR_MAPS["HUMAN"]} (score = {image_different_score})</span> with evidence link at <a href="{found_img_url}">{found_img_url} </a>'
257
- # else: # TODO add < 25 which is cropped images
258
- # search_engine_description = f'Most likely generated by {COLOR_MAPS["MACHINE"]} (score = {image_different_score})</span></a>'
259
-
260
- for (
261
- input_sentence,
262
- source_sentence,
263
- check_paraphrase,
264
- ) in sentence_pairs:
265
- input_words, source_words, input_indexes, source_indexes = (
266
- highlight_overlap_by_word_to_list(
267
- input_sentence,
268
- source_sentence,
269
- )
270
- )
271
- final_table.append(
272
- (input_words, source_words, input_indexes, source_indexes),
273
- )
274
-
275
- if search_engine_prediction == UNKNOWN:
276
- search_engine_description = "Cannot find any evidence link"
277
- final_prediction = SOTA_prediction
278
- else:
279
- final_prediction = search_engine_prediction
280
- search_engine_description = f'Most likely generated by {COLOR_MAPS[search_engine_prediction]}{search_engine_prediction}</span> with evidence link at <a href="{found_url}">{found_url} </a>'
281
-
282
- overall_html_result = f"""
283
- <h1>Image generation detection</h1>
284
- <ul>
285
- <li><strong>Prediction by SOTA method (provided by BDI members):</strong> Most likely generated by {COLOR_MAPS[image_prediction_label]}{image_prediction_label} </span>with confidence = {image_confidence}%</li>
286
- <li><strong>Prediction by our method (developed by BDI members)</strong>: {search_engine_description}
287
- </ul>
288
- <hr>
289
- <h1>Text generation detection</h1>
290
- <ul>
291
- <li><strong>Prediction by SOTA method (https://github.com/Hello-SimpleAI/chatgpt-comparison-detection)</strong>: Most likely generated by {COLOR_MAPS[SOTA_prediction]}{SOTA_prediction} </span>with confidence = {SOTA_confidence}</li>
292
- <li><strong>Prediction by our method (developed by BDI members)</strong>: {search_engine_description}
293
- <li><strong>Final prediction by our method (developed by BDI members)</strong>: Most likely generated by {COLOR_MAPS[final_prediction]}{final_prediction}&nbsp;</span></li>
294
- </ul>
295
- <p>&nbsp;</p>
296
- """
297
- if len(final_table) != 0:
298
- html_table = create_table(final_table)
299
- else:
300
- html_table = ""
301
- return overall_html_result, html_table
302
-
303
- process_btn.click(
304
- process,
305
- inputs=[image, content],
306
- outputs=[overall, matching_html],
307
- )
308
-
309
- demo.launch(share=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -20,4 +20,5 @@ torch
20
  sentence-transformers
21
 
22
  # Images
23
- pillow==10.1.0
 
 
20
  sentence-transformers
21
 
22
  # Images
23
+ pillow==10.1.0
24
+ imagehash==4.3.1
sample_1.jpg.webp ADDED
sample_1.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+
2
+ Muzzamil Hussain was in grade school when the first bombs fell on the playground outside of his classroom in Kargil, a mountain city in the Indian province of Ladakh. While the violent onset of the 1999 Kargil war between Pakistan and India unfolded around him, Hussain and his family escaped south to the remote Suru Valley.
3
+ After India claimed victory later that year and displaced families returned home, Hussain listened as his bedridden grandfather asked the family to visit an old property, originally built by Hussain's great-grandfather, near Kargil's bazaar to make sure it had survived the war. When Hussain's uncles cracked through an old rusty latch and peered through the hand-carved wooden doors, they discovered wooden crates stamped with names of cities around the world. Making space on the dusty floor, the family began to lay out silks from China, silver cookware from Afghanistan, rugs from Persia, turquoise from Tibet, saddles from Mongolia and luxury soaps and salves from London, New York and Munich.
4
+
5
+ This mysterious act of destruction is investigated in Miss Austen, a new four-part television drama based on Gill Hornby's best-selling and critically acclaimed novel of the same name. Years after Jane's death, Cassandra (Keeley Hawes) has travelled to the village of Kintbury, in Berkshire, where the Austen family's friends, the Fowles, lived. Cassandra is, ostensibly, there to help Isabella Fowle (Rose Leslie), whose father Fulwar is dying. However this is a house that holds many bitter-sweet memories for her (in real life, this is where she had been staying when Jane wrote to her about Tom Lefroy), and she has an ulterior motive. She wants to retrieve some letters written by the late Jane to their friend Eliza Fowle, Isabella's mother, which she fears might contain details damaging to the novelist's legacy. When she finds the correspondence, it revives powerful memories of the events of years ago. The series takes place in two timelines – in 1830 – with the unmarried Isabella facing eviction from her home after her father's death and Cassandra trying to protect her sister's legacy – and decades previously, with young Cassandra (SynnΓΈve Karlsen) and Jane (Patsy Ferran) navigating romances, family problems, and the ups and downs of life.
sample_2.jpg.webp ADDED
sample_2.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+
2
+ Muzzamil Hussain was in 3rd-grade school when the first bombs fell on the playground outside of his classroom in Kargil, a mountain city in India. While the violent onset of the 1998 Kargil war between Pakistan and India unfolded around him, Hussain and his family escaped south to the remote Suru Valley.
3
+ After India claimed victory later that year and displaced families returned home, Hussain listened as his bedridden grandfather asked the family to visit an old property, initially built by Hussain's great-grandfather, near Kargil's bazaar to make sure it had survived the war. When Hussain's uncles cracked through an old rusty latch and peered through the hand-carved wooden doors, they discovered wooden crates stamped with names of cities worldwide. Making space on the dusty floor, the family began to lay out silks from China, silver cookware from Afghanistan, rugs from Persia, turquoise from Tibet, saddles from Mongolia, and luxury soaps and salves from London, New York, and Munich.
4
+
5
+ This mysterious act of destruction is investigated in Miss Austen, a new four-part television drama based on Gill Hornby's best-selling and critically acclaimed novel. Years after Jane's death, Cassandra (Keeley Hawes) traveled to the village of Kintbury in Berkshire, where the Austen family's friends, the Fowles, lived. Cassandra is, ostensibly, there to help Isabella Fowle (Rose Leslie), whose father, Fulwar, is dying. However, this house holds many bitter-sweet memories for her (in real life, this is where she had been staying when Jane wrote about Tom Lefroy), and she has an ulterior motive. She wants to retrieve letters written by the late Jane to their friend Eliza Fowle, Isabella's mother, which she fears might contain details damaging the novelist's legacy. When she finds the correspondence, it revives powerful memories of the events of years ago. The series takes place in two timelines – in 1830 – with the unmarried Isabella facing eviction from her home after her father's death and Cassandra trying to protect her sister's legacy – and decades previously, with young Cassandra (SynnΓΈve Karlsen) and Jane (Patsy Ferran) navigating romances, family problems, and the ups and downs of life.
sample_3.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+
2
+ Muzzamil Hussain was in grade school when the first bombs fell on the playground outside of his classroom in Kargil, a mountain city in the Indian province of Ladakh. While the violent onset of the 1999 Kargil war between Pakistan and India unfolded around him, Hussain and his family escaped south to the remote Suru Valley.
3
+ After India claimed victory later that year and displaced families returned home, Hussain listened as his bedridden grandfather asked the family to visit an old property, originally built by Hussain's great-grandfather, near Kargil's bazaar to make sure it had survived the war. When Hussain's uncles cracked through an old rusty latch and peered through the hand-carved wooden doors, they discovered wooden crates stamped with names of cities around the world. Making space on the dusty floor, the family began to lay out silks from China, silver cookware from Afghanistan, rugs from Persia, turquoise from Tibet, saddles from Mongolia and luxury soaps and salves from London, New York and Munich.
4
+
5
+ Local squirrels have reportedly formed a highly organized nut-hoarding syndicate, causing widespread panic among residents. Experts warn this unprecedented squirrel activity could lead to a global nut shortage. One resident claims to have witnessed squirrels using tiny backpacks to transport their loot. Authorities are investigating the claims, but so far, the squirrels remain at large. The mayor has issued a statement urging citizens to remain calm and protect their acorns.
src/application/content_detection.py CHANGED
@@ -1,6 +1,9 @@
1
  from difflib import SequenceMatcher
 
 
2
  from src.application.image.image_detection import detect_image_by_ai_model, detect_image_by_reverse_search, detect_image_from_news_image
3
  from src.application.text.model_detection import detect_text_by_ai_model
 
4
  from src.application.text.search_detection import check_human, detect_text_by_relative_search
5
 
6
 
@@ -11,18 +14,19 @@ class NewsVerification():
11
  self.news_content = ""
12
  self.news_image = ""
13
 
14
- self.text_prediction_label = ""
15
- self.text_prediction_score = -1
16
- self.text_referent_url = None
17
- self.image_prediction_label = ""
18
- self.image_prediction_score = -1
19
- self.image_referent_url = None
20
  self.news_prediction_label = ""
21
  self.news_prediction_score = -1
22
 
23
- self.found_img_url = []
24
- self.aligned_sentences = []
25
- self.is_paraphrased = False
 
26
 
27
  def load_news(self, news_title, news_content, news_image):
28
  self.news_text = news_title + "\n\n" + news_content
@@ -45,22 +49,41 @@ class NewsVerification():
45
  print("CHECK TEXT:")
46
  print("\tFrom search engine:")
47
  # Classify by search engine
48
- self.is_paraphrased, self.text_referent_url, self.aligned_sentences, self.found_img_url = detect_text_by_relative_search(self.news_text)
 
 
49
 
50
- if self.is_paraphrased is False:
51
- self.text_prediction_label = "UNKNOWN"
52
- else:
53
- self.text_prediction_score = 100
54
- if check_human(self.aligned_sentences):
55
- self.text_prediction_label = "HUMAN"
 
 
 
 
 
 
 
56
  else:
57
- self.text_prediction_label = "MACHINE"
58
-
59
- # Classify text by AI model
60
- print("\tFrom AI model:")
61
- if self.text_prediction_label == "UNKNOWN":
62
- self.text_prediction_label, self.text_prediction_score = detect_text_by_ai_model(self.news_text)
63
- self.text_prediction_score *= 100
 
 
 
 
 
 
 
 
 
 
64
 
65
  def detect_image_origin(self):
66
  print("CHECK IMAGE:")
@@ -124,107 +147,33 @@ class NewsVerification():
124
  def generate_analysis_report(self):
125
  self.determine_text_origin()
126
  self.detect_image_origin()
127
- self.determine_news_origin()
128
-
129
- # Forensic analysis
130
- if self.text_prediction_label == "MACHINE":
131
- text_prediction_label = "The text is modified by GPT-4o (AI)"
132
- else:
133
- text_prediction_label = "The text is written by HUMAN"
134
-
135
- if self.image_prediction_label == "MACHINE":
136
- image_prediction_label = "The image is generated by Dall-e (AI)"
137
- else:
138
- image_prediction_label = "The image is generated by HUMAN"
139
 
140
- if self.news_prediction_label == "MACHINE":
141
- news_prediction_label = "The whole news generated by AI"
142
- else:
143
- news_prediction_label = "The whole news written by HUMAN"
144
-
145
- # Misinformation analysis
146
- out_of_context_results = "cohesive"
147
- if out_of_context_results == "cohesive":
148
- out_of_context_results = "The input news is cohesive (non-out-of-context)"
149
- else:
150
- out_of_context_results = "The input news is out-of-context"
151
- out_of_context_prediction_score = 96.7
152
 
153
- # Description
154
- description = "The description should be concise, clear, and aimed at helping general readers understand the case."
155
 
156
- if self.text_referent_url is None:
157
- referred_news = "<li>No referent information</li>"
158
- else:
159
- if len(self.text_referent_url) > 40:
160
- url_max_length = 40
161
- else:
162
- url_max_length = len(self.text_referent_url)
163
-
164
- referred_news = f"""<li><a href="{self.text_referent_url}" target="_blank">{"Referred news: " + self.text_referent_url[:url_max_length] + "..."}</a></li>"""
165
-
166
- if self.image_referent_url is None:
167
- referred_image = "<li>No referent information</li>"
168
- else:
169
- if len(self.image_referent_url) > 40:
170
- url_max_length = 40
171
- else:
172
- url_max_length = len(self.text_referent_url)
173
- referred_image = f"""<li><a href="{self.image_referent_url}" target="_blank">{"Referred news: " + self.image_referent_url[:url_max_length] + "..."}</a></li>"""
174
-
175
- html_template = f"""
176
- <div>
177
- <h3>Originality:</h3>
178
- <ul>
179
- {referred_news}
180
- {referred_image}
181
- </ul>
182
- </div>
183
-
184
- <div>
185
- <h3>Forensic:</h3>
186
- <b>{news_prediction_label} (confidence = {self.news_prediction_score:.2f}%)</b>
187
- <ul>
188
- <li>{text_prediction_label} (confidence = {self.text_prediction_score:.2f}%)</li>
189
- <li>{image_prediction_label} (confidence = {self.image_prediction_score:.2f}%)</li>
190
- </ul>
191
- </div>
192
-
193
- <div>
194
- <h3>Misinformation (placeholder):</h3>
195
- <ul>
196
- <li>The input news is {out_of_context_results} (confidence = {out_of_context_prediction_score:.2f}%)</li>
197
- </ul>
198
- </div>
199
-
200
- <div>
201
- <h3>Description (optional, placeholder):</h3>
202
- <ul>
203
- <li>{description}</li>
204
- </ul>
205
- </div>
206
- """
207
-
208
- return html_template
209
-
210
-
211
- def analyze_details(self):
212
- self.aligned_sentences
213
- final_table = []
214
-
215
  for pair in self.aligned_sentences:
 
 
 
216
  input_words, source_words, input_indexes, source_indexes = (
217
  self.highlight_overlap_by_word_to_list(
218
  pair["input_sentence"],
219
  pair["matched_sentence"],
220
  )
 
 
 
 
221
  )
222
- final_table.append(
223
  (input_words, source_words, input_indexes, source_indexes),
224
  )
225
 
226
- if len(final_table) != 0:
227
- html_table = self.create_table(final_table)
228
  else:
229
  html_table = ""
230
  return html_table
@@ -257,6 +206,7 @@ class NewsVerification():
257
  # LαΊ·p qua cΓ‘c Δ‘oαΊ‘n so khα»›p
258
  for match in matcher.get_matching_blocks():
259
  start1, start2, length = match
 
260
 
261
  # ThΓͺm cΓ‘c tα»« khΓ΄ng trΓΉng lαΊ·p vΓ o (giα»― nguyΓͺn)
262
  highlighted_text1.extend(words1[current_pos1:start1])
@@ -273,28 +223,152 @@ class NewsVerification():
273
  current_pos2 = start2 + length
274
 
275
  return words1, words2, index1, index2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
276
 
277
- def create_table(self, data):
278
- table_rows = "\n".join([self.format_pair(pair) for pair in data])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
279
  return f"""
280
- <h5>Comparison between input news and <a href={self.text_referent_url} target="_blank">source news</a></h5>
281
  <table border="1" style="width:100%; text-align:left; border-collapse:collapse;">
282
  <thead>
283
  <tr>
284
- <th>Input sentence</th>
285
- <th>Source sentence</th>
 
 
286
  </tr>
287
  </thead>
288
  <tbody>
289
- {table_rows}
290
  </tbody>
291
  </table>
 
 
292
  """
293
 
294
- def format_pair(self, pair):
295
- input_sentence = self.highlight_text(pair[0], pair[2])
296
- source_sentence = self.highlight_text(pair[1], pair[3])
297
- return f"<tr><td>{input_sentence}</td><td>{source_sentence}</td></tr>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
298
 
299
  def highlight_text(self, words, indexes):
300
  final_words = words
 
1
  from difflib import SequenceMatcher
2
+ import difflib
3
+ from src.application.highlight_text import generate_color
4
  from src.application.image.image_detection import detect_image_by_ai_model, detect_image_by_reverse_search, detect_image_from_news_image
5
  from src.application.text.model_detection import detect_text_by_ai_model
6
+ from src.application.text.preprocessing import split_into_sentences
7
  from src.application.text.search_detection import check_human, detect_text_by_relative_search
8
 
9
 
 
14
  self.news_content = ""
15
  self.news_image = ""
16
 
17
+ self.text_prediction_label:list[str] = []
18
+ self.text_prediction_score:list[float] = []
19
+ self.text_referent_url:list[str] = []
20
+ self.image_prediction_label:list[str] = []
21
+ self.image_prediction_score:list[str] = []
22
+ self.image_referent_url:list[str] = []
23
  self.news_prediction_label = ""
24
  self.news_prediction_score = -1
25
 
26
+ self.found_img_url:list[str] = []
27
+ self.aligned_sentences:list[dict] = []
28
+ self.is_paraphrased:list[bool] = []
29
+ self.analyzed_table:list[list] = []
30
 
31
  def load_news(self, news_title, news_content, news_image):
32
  self.news_text = news_title + "\n\n" + news_content
 
49
  print("CHECK TEXT:")
50
  print("\tFrom search engine:")
51
  # Classify by search engine
52
+ input_sentences = split_into_sentences(self.news_text)
53
+ for sentence in input_sentences:
54
+ paraphrase, text_url, aligned_sentence, img_urls = detect_text_by_relative_search(sentence)
55
 
56
+ text_prediction_label = "UNKNOWN"
57
+ if paraphrase is False:
58
+ # Classify text by AI model
59
+ print("\tFrom AI model:")
60
+ text_prediction_label, text_prediction_score = detect_text_by_ai_model(sentence)
61
+ if aligned_sentence == []:
62
+ aligned_sentence = {
63
+ "input_sentence": sentence,
64
+ "matched_sentence": "",
65
+ "similarity": text_prediction_score,
66
+ "is_paraphrase_sentence": False,
67
+ "url": "",
68
+ }
69
  else:
70
+ self.found_img_url.extend(img_urls)
71
+ text_prediction_score = aligned_sentence["similarity"]
72
+ if check_human(aligned_sentence):
73
+ text_prediction_label = "HUMAN"
74
+ else:
75
+ text_prediction_label = "MACHINE"
76
+
77
+ print(f"\ttext_prediction_label: {text_prediction_label}\n")
78
+ self.text_prediction_label.append(text_prediction_label)
79
+ self.aligned_sentences.append(aligned_sentence)
80
+ self.is_paraphrased.append(paraphrase)
81
+ self.text_referent_url.append(text_url)
82
+ self.text_prediction_score.append(text_prediction_score)
83
+ paraphrase = False
84
+ text_url = ""
85
+ aligned_sentence = {}
86
+ img_urls = []
87
 
88
  def detect_image_origin(self):
89
  print("CHECK IMAGE:")
 
147
  def generate_analysis_report(self):
148
  self.determine_text_origin()
149
  self.detect_image_origin()
 
 
 
 
 
 
 
 
 
 
 
 
150
 
151
+ def analyze_details(self):
152
+ self.analyzed_table = []
153
+ # IMAGES:
 
 
 
 
 
 
 
 
 
154
 
 
 
155
 
156
+ # TEXT
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
  for pair in self.aligned_sentences:
158
+ print(f"pair: {pair}")
159
+ if "input_sentence" not in pair:
160
+ continue
161
  input_words, source_words, input_indexes, source_indexes = (
162
  self.highlight_overlap_by_word_to_list(
163
  pair["input_sentence"],
164
  pair["matched_sentence"],
165
  )
166
+ # self.compare_sentences(
167
+ # pair["input_sentence"],
168
+ # pair["matched_sentence"],
169
+ # )
170
  )
171
+ self.analyzed_table.append(
172
  (input_words, source_words, input_indexes, source_indexes),
173
  )
174
 
175
+ if len(self.analyzed_table) != 0:
176
+ html_table = self.create_table()
177
  else:
178
  html_table = ""
179
  return html_table
 
206
  # LαΊ·p qua cΓ‘c Δ‘oαΊ‘n so khα»›p
207
  for match in matcher.get_matching_blocks():
208
  start1, start2, length = match
209
+ print(start1, start2, length)
210
 
211
  # ThΓͺm cΓ‘c tα»« khΓ΄ng trΓΉng lαΊ·p vΓ o (giα»― nguyΓͺn)
212
  highlighted_text1.extend(words1[current_pos1:start1])
 
223
  current_pos2 = start2 + length
224
 
225
  return words1, words2, index1, index2
226
+
227
+
228
+ def get_text_urls(self):
229
+ return set(self.text_referent_url)
230
+
231
+ def generate_colors_list(self, set_urls):
232
+ color_dict = {}
233
+ num_urls = len(set_urls)
234
+ for i in range(num_urls):
235
+ color_dict[i] = generate_color(i, num_urls)
236
+
237
+ return color_dict
238
+
239
+ def analyze_details_2(self):
240
+ html_text = ""
241
+
242
+ self.analyzed_table = []
243
+ # TEXT
244
+ # Assign unique colors to each index
245
+ set_urls = self.get_text_urls()
246
+ color_dict = self.generate_colors_list(set_urls)
247
+
248
+ # position of the color in the input contents
249
+ position = 0
250
+ for pair in self.aligned_sentences:
251
+ if "input_sentence" not in pair:
252
+ continue
253
+ common_phrases, position = self.compare_sentences(
254
+ pair["input_sentence"],
255
+ pair["matched_sentence"],
256
+ position,
257
+ color_dict["0"], # TODO: set color
258
+ )
259
+
260
+
261
+ if len(self.analyzed_table) != 0:
262
+ html_table = self.create_table()
263
+ else:
264
+ html_table = ""
265
+ return html_text, html_table
266
+
267
+ def compare_sentences(self, sentence_1, sentence_2, position, color):
268
+ """
269
+ Compares two sentences and identifies common phrases, outputting their start and end positions.
270
 
271
+ Args:
272
+ sentence_1: The first sentence (string).
273
+ sentence_2: The second sentence (string).
274
+
275
+ Returns:
276
+ A list of dictionaries, where each dictionary represents a common phrase and contains:
277
+ - "phrase": The common phrase (string).
278
+ - "start_1": The starting index of the phrase in sentence_1 (int).
279
+ - "end_1": The ending index of the phrase in sentence_1 (int).
280
+ - "start_2": The starting index of the phrase in sentence_2 (int).
281
+ - "end_2": The ending index of the phrase in sentence_2 (int).
282
+ Returns an empty list if no common phrases are found. Handles edge cases like empty strings.
283
+ """
284
+
285
+ if not sentence_1 or not sentence_2: # Handle empty strings
286
+ return []
287
+
288
+ s = difflib.SequenceMatcher(None, sentence_1, sentence_2)
289
+ common_phrases = []
290
+
291
+ for block in s.get_matching_blocks():
292
+ if block.size > 0: # Ignore zero-length matches
293
+ start_1 = block.a
294
+ end_1 = block.a + block.size
295
+ start_2 = block.b
296
+ end_2 = block.b + block.size
297
+
298
+ phrase = sentence_1[start_1:end_1] # Or sentence_2[start_2:end_2], they are the same
299
+
300
+ common_phrases.append({
301
+ "phrase": phrase,
302
+ "start_1": start_1 + position,
303
+ "end_1": end_1 + position,
304
+ "start_2": start_2,
305
+ "end_2": end_2,
306
+ "color": color,
307
+ })
308
+ position += len(sentence_1)
309
+ return common_phrases, position
310
+
311
+ def create_table(self):
312
+ #table_rows = "\n".join([self.format_row(row) for row in self.analyzed_table])
313
+ # loop of self.analyzed_table with index:
314
+ rows = []
315
+ max_length = 30 # TODO: put this in configuration
316
+ rows.append(self.format_image_row(max_length))
317
+
318
+ for index, row in enumerate(self.analyzed_table):
319
+ formatted_row = self.format_text_row(row, index, max_length)
320
+ rows.append(formatted_row)
321
+ table = "\n".join(rows)
322
  return f"""
323
+ <h5>Comparison between input news and source news</h5>
324
  <table border="1" style="width:100%; text-align:left; border-collapse:collapse;">
325
  <thead>
326
  <tr>
327
+ <th>Input news</th>
328
+ <th>Source content</th>
329
+ <th>Forensic</th>
330
+ <th>Originality</th>
331
  </tr>
332
  </thead>
333
  <tbody>
334
+ {table}
335
  </tbody>
336
  </table>
337
+
338
+ <style>
339
  """
340
 
341
+ def format_text_row(self, row, index = 0, max_length=30):
342
+ input_sentence = self.highlight_text(row[0], row[2]) # text, index of highlight words
343
+ source_sentence = self.highlight_text(row[1], row[3]) # text, index of highlight words
344
+
345
+ url = self.aligned_sentences[index]["url"] #
346
+ short_url = self.shorten_url(url, max_length)
347
+ source_text_url = f"""<a href="{url}">{short_url}</a>"""
348
+
349
+ # short_url = self.shorten_url(self.text_referent_url[index], max_length)
350
+ # source_text_url = f"""<a href="{self.text_referent_url[index]}">{short_url}</a>"""
351
+
352
+ self.text_prediction_score[index]
353
+ return f"""<tr><td>{input_sentence}</td><td>{source_sentence}</td><td>{self.text_prediction_label[index]}<br>({self.text_prediction_score[index]*100:.2f}%)</td><td>{source_text_url}</td></tr>"""
354
+
355
+ def format_image_row(self, max_length=30):
356
+ # input_image = f"""<img src="{self.news_image}" width="200" height="150">"""
357
+ print(f"self.news_image = {self.news_image}")
358
+ source_image = f"""<img src="{self.image_referent_url}" width="200" height="150">"""
359
+ short_url = self.shorten_url(self.image_referent_url, max_length)
360
+ source_image_url = f"""<a href="{self.image_referent_url}">{short_url}</a>"""
361
+ return f"""<tr><td>input image</td><td>{source_image}</td><td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td><td>{source_image_url}</td></tr>"""
362
+
363
+ def shorten_url(self, url, max_length=30):
364
+ if url is None:
365
+ return ""
366
+
367
+ if len(url) > max_length:
368
+ short_url = url[:max_length] + "..."
369
+ else:
370
+ short_url = url
371
+ return short_url
372
 
373
  def highlight_text(self, words, indexes):
374
  final_words = words
src/application/highlight_text.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import colorsys
3
+ from functools import partial
4
+ import random
5
+
6
+
7
+ def lighten_color(hex_color, factor=1.8):
8
+ """Lightens a HEX color by increasing its brightness in HSV space."""
9
+
10
+ hex_color = hex_color.lstrip("#")
11
+ r, g, b = int(hex_color[0:2], 16), int(hex_color[2:4], 16), int(hex_color[4:6], 16)
12
+
13
+ # Convert to HSV
14
+ h, s, v = colorsys.rgb_to_hsv(r / 255.0, g / 255.0, b / 255.0)
15
+ v = min(1.0, v * factor) # Increase brightness
16
+
17
+ # Convert back to HEX
18
+ r, g, b = [int(c * 255) for c in colorsys.hsv_to_rgb(h, s, v)]
19
+ return f'#{r:02x}{g:02x}{b:02x}'
20
+
21
+ def darken_color(hex_color, factor=0.7):
22
+ """Darkens a hex color by reducing its brightness in the HSV space."""
23
+
24
+ hex_color = hex_color.lstrip("#")
25
+ r, g, b = int(hex_color[0:2], 16), int(hex_color[2:4], 16), int(hex_color[4:6], 16)
26
+
27
+ # Convert to HSV to adjust brightness
28
+ h, s, v = colorsys.rgb_to_hsv(r / 255.0, g / 255.0, b / 255.0)
29
+ v = max(0, v * factor) # Reduce brightness
30
+
31
+ # Convert back to HEX
32
+ r, g, b = [int(c * 255) for c in colorsys.hsv_to_rgb(h, s, v)]
33
+ return f'#{r:02x}{g:02x}{b:02x}'
34
+
35
+ # Generate unique colors for pairs
36
+ def generate_color(index, total_colors=20):
37
+ """Generates a unique, evenly spaced color for each index using HSL."""
38
+
39
+ hue = index / total_colors # Spread hues in range [0,1]
40
+ saturation = 0.65 # Keep colors vivid
41
+ lightness = 0.75 # Balanced brightness
42
+
43
+ # Convert HSL to RGB
44
+ r, g, b = colorsys.hls_to_rgb(hue, lightness, saturation)
45
+ r, g, b = int(r * 255), int(g * 255), int(b * 255)
46
+
47
+ return f'#{r:02x}{g:02x}{b:02x}' # Convert to hex
48
+
49
+ def highlight_pairs(text1, text2):
50
+ """Highlight matching pairs between two paragraphs"""
51
+ # Predefined matching pairs
52
+ match_pairs = [
53
+ {"index": 1, "text1": "deep learning", "start1": 13, "end1": 26,
54
+ "text2": "deep learning", "start2": 12, "end2": 25},
55
+ {"index": 2, "text1": "neural networks", "start1": 56, "end1": 71,
56
+ "text2": "neural networks", "start2": 68, "end2": 83},
57
+ {"index": 3, "text1": "AI research", "start1": 86, "end1": 97,
58
+ "text2": "AI research", "start2": 55, "end2": 66},
59
+ ]
60
+
61
+ # Assign unique colors to each index
62
+ pair_colors = {pair["index"]: generate_color(pair["index"], total_colors=len(match_pairs)) for pair in match_pairs}
63
+
64
+
65
+ def apply_highlight(text, pairs, key_start, key_end, key_index, pair_colors):
66
+ highlighted_text = ""
67
+ prev_end = 0
68
+
69
+ for pair in sorted(pairs, key=lambda x: x[key_start]):
70
+ start, end, index = pair[key_start], pair[key_end], pair[key_index]
71
+ color = pair_colors.get(index, "#ddd") # Default color if not found
72
+ color = lighten_color(color, factor=2.2) # Lightened color for background text
73
+ label_color = darken_color(color, factor=0.7) # Make label color darker
74
+
75
+ # Style the index as a label
76
+ index_label = (f'<span style="background-color:{label_color}; color:white; '
77
+ f'padding:1px 4px; border-radius:4px; font-size:12px; '
78
+ f'font-weight:bold; display:inline-block; margin-right:4px;">{index}</span>')
79
+
80
+ # Append non-highlighted text
81
+ highlighted_text += text[prev_end:start]
82
+ # Append highlighted text with index label
83
+ highlighted_text += (f'<span style="background-color:{color}; '
84
+ f'border-radius:3px; font-size:14px; display:inline-block;">'
85
+ f'{index_label} {text[start:end]}</span>')
86
+ prev_end = end
87
+
88
+ # Append remaining text
89
+ highlighted_text += text[prev_end:]
90
+ return highlighted_text
91
+
92
+ # Apply highlighting to both paragraphs using the global MATCH_PAIRS
93
+ highlighted_text1 = apply_highlight(text1, match_pairs, "start1", "end1", "index", pair_colors)
94
+ highlighted_text2 = apply_highlight(text2, match_pairs, "start2", "end2", "index", pair_colors)
95
+
96
+ return highlighted_text1, highlighted_text2
97
+
98
+ if __name__ == '__main__':
99
+ # Create Gradio Interface
100
+ text1 = ""
101
+
102
+ with gr.Blocks() as demo:
103
+ gr.Markdown("### Highlight Matching Parts Between Two Paragraphs")
104
+ text1_input = gr.Textbox(
105
+ label="Paragraph 1",
106
+ lines=5,
107
+ value="The field of deep learning is advancing rapidly. Modern neural networks are improving AI research significantly."
108
+ )
109
+ text2_input = gr.Textbox(
110
+ label="Paragraph 2",
111
+ lines=5,
112
+ value="Advances in deep learning have led to breakthroughs in AI research. Neural networks are at the core of these innovations"
113
+ )
114
+ output1 = gr.HTML()
115
+ output2 = gr.HTML()
116
+ submit_button = gr.Button("Highlight Matches")
117
+
118
+ submit_button.click(
119
+ fn=highlight_pairs,
120
+ inputs=[text1_input, text2_input],
121
+ outputs=[output1, output2]
122
+ )
123
+
124
+ # Launch the Gradio app
125
+ demo.launch()
src/application/text/search_detection.py CHANGED
@@ -60,9 +60,10 @@ def detect_text_by_relative_search(input_text, is_support_opposite = False):
60
  print(f"\t\t\t↑↑↑ More than {MAX_CHAR_SIZE} characters")
61
  continue
62
 
63
- is_paraphrase, aligned_sentences = check_paraphrase(input_text, page_text)
64
- if is_paraphrase:
65
- return is_paraphrase, url, aligned_sentences, content.images
 
66
  return False, None, [], []
67
 
68
  def longest_common_subsequence(arr1, arr2):
@@ -147,13 +148,13 @@ def check_sentence(input_sentence, source_sentence, min_same_sentence_len,
147
  return False
148
 
149
 
150
- def check_paraphrase(input_text, page_text, verbose=False):
151
  """
152
  Checks if the input text is paraphrased in the content at the given URL.
153
 
154
  Args:
155
  input_text: The text to check for paraphrase.
156
- url: The URL of the web page to compare with.
157
  verbose: If True, print debug information.
158
 
159
  Returns:
@@ -199,24 +200,31 @@ def check_paraphrase(input_text, page_text, verbose=False):
199
  similarity_matrix = util.cos_sim(embeddings1, embeddings2).cpu().numpy()
200
 
201
  # Find sentence alignments
202
- alignment = []
203
  paraphrased_sentence_count = 0
204
  for i, sentence1 in enumerate(input_sentences):
 
205
  max_sim_index = np.argmax(similarity_matrix[i])
206
  max_similarity = similarity_matrix[i][max_sim_index]
207
 
208
  is_paraphrase_sentence = max_similarity > PARAPHRASE_THRESHOLD
209
 
210
- if 0.80 < max_similarity < 0.99:
211
- print(f"\t\tinput_sentence : {sentence1}")
212
- print(f"\t\tmatched_sentence: {page_sentences[max_sim_index]}")
213
- print(f"\t\t--> similarity: {max_similarity}\n")
214
- item = {
215
- "input_sentence": sentence1,
216
- "matched_sentence": page_sentences[max_sim_index],
217
- "similarity": max_similarity,
218
- "is_paraphrase_sentence": is_paraphrase_sentence,
219
- }
 
 
 
 
 
 
220
 
221
  # Check for individual sentence paraphrase if overall paraphrase not yet found
222
  if not is_paraphrase_text and check_sentence(
@@ -227,12 +235,12 @@ def check_paraphrase(input_text, page_text, verbose=False):
227
  print(f"Paraphrase found for individual sentence: {sentence1}")
228
  print(f"Matched sentence: {page_sentences[max_sim_index]}")
229
 
230
- alignment.append(item)
231
  paraphrased_sentence_count += 1 if is_paraphrase_sentence else 0
232
 
233
  # Check if enough sentences are paraphrases
234
 
235
- is_paraphrase_text = paraphrased_sentence_count >= min_matching_sentences
236
 
237
  if verbose:
238
  print (f"\t\tparaphrased_sentence_count: {paraphrased_sentence_count}, min_matching_sentences: {min_matching_sentences}, total_sentence_count: {len(input_sentences)}")
@@ -261,7 +269,7 @@ def similarity_ratio(a, b):
261
  return 0.0 # Handle cases where inputs are not strings or None
262
  return SequenceMatcher(None, a, b).ratio()
263
 
264
- def check_human(alligned_sentences, min_ratio=MIN_RATIO_PARAPHRASE_NUM):
265
  """
266
  Checks if a sufficient number of input sentences are found within
267
  source sentences.
@@ -271,16 +279,8 @@ def check_human(alligned_sentences, min_ratio=MIN_RATIO_PARAPHRASE_NUM):
271
  """
272
  if not alligned_sentences: # Handle empty data case
273
  return False
274
- min_matching = math.ceil(len(alligned_sentences) * min_ratio)
275
 
276
- count = 0
277
-
278
- #for input_sentence, source_sentence, similiarity, is_paraprhase in data:
279
- for sentence in alligned_sentences:
280
- if sentence["similarity"] >= 0.99:
281
- count += 1
282
- print(f"\tmatching_sentence_count : {count}, min_matching: {min_matching}")
283
- if count >= min_matching:
284
  return True
285
  return False
286
 
 
60
  print(f"\t\t\t↑↑↑ More than {MAX_CHAR_SIZE} characters")
61
  continue
62
 
63
+ is_paraphrase, aligned_sentences = check_paraphrase(input_text, page_text, url)
64
+ #if is_paraphrase:
65
+ return is_paraphrase, url, aligned_sentences, content.images
66
+
67
  return False, None, [], []
68
 
69
  def longest_common_subsequence(arr1, arr2):
 
148
  return False
149
 
150
 
151
+ def check_paraphrase(input_text, page_text, url, verbose=False):
152
  """
153
  Checks if the input text is paraphrased in the content at the given URL.
154
 
155
  Args:
156
  input_text: The text to check for paraphrase.
157
+ page_text: The text of the web page to compare with.
158
  verbose: If True, print debug information.
159
 
160
  Returns:
 
200
  similarity_matrix = util.cos_sim(embeddings1, embeddings2).cpu().numpy()
201
 
202
  # Find sentence alignments
203
+ alignment = {}
204
  paraphrased_sentence_count = 0
205
  for i, sentence1 in enumerate(input_sentences):
206
+ print(f"allign: {i}")
207
  max_sim_index = np.argmax(similarity_matrix[i])
208
  max_similarity = similarity_matrix[i][max_sim_index]
209
 
210
  is_paraphrase_sentence = max_similarity > PARAPHRASE_THRESHOLD
211
 
212
+ if 0.80 > max_similarity:
213
+ alignment = {
214
+ "input_sentence": sentence1,
215
+ "matched_sentence": "",
216
+ "similarity": max_similarity,
217
+ "is_paraphrase_sentence": is_paraphrase_sentence,
218
+ "url": "",
219
+ }
220
+ else:
221
+ alignment = {
222
+ "input_sentence": sentence1,
223
+ "matched_sentence": page_sentences[max_sim_index],
224
+ "similarity": max_similarity,
225
+ "is_paraphrase_sentence": is_paraphrase_sentence,
226
+ "url": url,
227
+ }
228
 
229
  # Check for individual sentence paraphrase if overall paraphrase not yet found
230
  if not is_paraphrase_text and check_sentence(
 
235
  print(f"Paraphrase found for individual sentence: {sentence1}")
236
  print(f"Matched sentence: {page_sentences[max_sim_index]}")
237
 
238
+ #alignment.append(item)
239
  paraphrased_sentence_count += 1 if is_paraphrase_sentence else 0
240
 
241
  # Check if enough sentences are paraphrases
242
 
243
+ is_paraphrase_text = paraphrased_sentence_count > 0 #min_matching_sentences
244
 
245
  if verbose:
246
  print (f"\t\tparaphrased_sentence_count: {paraphrased_sentence_count}, min_matching_sentences: {min_matching_sentences}, total_sentence_count: {len(input_sentences)}")
 
269
  return 0.0 # Handle cases where inputs are not strings or None
270
  return SequenceMatcher(None, a, b).ratio()
271
 
272
+ def check_human(alligned_sentences):
273
  """
274
  Checks if a sufficient number of input sentences are found within
275
  source sentences.
 
279
  """
280
  if not alligned_sentences: # Handle empty data case
281
  return False
 
282
 
283
+ if alligned_sentences["similarity"] >= 0.99:
 
 
 
 
 
 
 
284
  return True
285
  return False
286