pmkhanh7890 committed on
Commit
d952fbe
Β·
1 Parent(s): badcb49

revise demo

Browse files
Yandexsample.html DELETED
The diff for this file is too large to render. See raw diff
 
app.py CHANGED
@@ -1,23 +1,62 @@
1
- from bs4 import BeautifulSoup
2
- import requests
3
 
4
- from src.application.image.search_yandex import get_image_links
 
 
5
 
 
 
 
6
 
7
- img_search_url = """https://yandex.ru/images/search?cbir_id=4481385%2Fw-xYJ246B9thwtVBmNcpkg9409&rpt=imageview&lr=10636"""
8
- print(img_search_url)
 
 
 
 
 
 
 
9
 
 
 
10
 
11
- headers = {
12
- 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
13
- 'Content-Type': 'application/json',
14
- }
15
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
- response = requests.get(img_search_url, headers=headers)
18
- response.raise_for_status() # Raise an exception for bad status codes
19
 
20
- # Parse the HTML content
21
- soup = BeautifulSoup(response.content, 'html.parser')
22
- image_urls = get_image_links(soup.prettify())
23
- print(f"image_urls: {image_urls}")
 
1
+ import difflib
 
2
 
3
def compare_sentences(sentence_1, sentence_2):
    """
    Identify all character-level common phrases between two sentences.

    Uses difflib.SequenceMatcher to locate matching blocks and reports,
    for each non-empty match, the phrase text plus its start/end offsets
    in both inputs.

    Args:
        sentence_1: The first sentence (string).
        sentence_2: The second sentence (string).

    Returns:
        A list of dicts with keys "phrase", "start_1", "end_1",
        "start_2", "end_2". Empty list when either input is empty
        or no common phrase exists.
    """
    # Guard: nothing can match against an empty string.
    if not (sentence_1 and sentence_2):
        return []

    matcher = difflib.SequenceMatcher(None, sentence_1, sentence_2)

    # get_matching_blocks() ends with a zero-length sentinel; `if size`
    # drops it along with any other empty match.
    return [
        {
            "phrase": sentence_1[a:a + size],
            "start_1": a,
            "end_1": a + size,
            "start_2": b,
            "end_2": b + size,
        }
        for a, b, size in matcher.get_matching_blocks()
        if size
    ]
45
+
46
+
47
+
48
# Example usage:
# FIX: the original opened sentence_1 with a single double-quote and let the
# text run across several physical lines — a SyntaxError in Python. A
# triple-quoted literal preserves the exact same text legally.
sentence_1 = """
Muzzamil Hussain was in 3rd-grade school when the first bombs fell on the playground outside of his classroom in Kargil, a mountain city in India. While the violent onset of the 1998 Kargil war between Pakistan and India unfolded around him, Hussain and his family escaped south to the remote Suru Valley.
After India claimed victory later that year and displaced families returned home, Hussain listened as his bedridden grandfather asked the family to visit an old property, initially built by Hussain's great-grandfather, near Kargil's bazaar to make sure it had survived the war. When Hussain's uncles cracked through an old rusty latch and peered through the hand-carved wooden doors, they discovered wooden crates stamped with names of cities worldwide. Making space on the dusty floor, the family began to lay out silks from China, silver cookware from Afghanistan, rugs from Persia, turquoise from Tibet, saddles from Mongolia, and luxury soaps and salves from London, New York, and Munich. """
sentence_2 = "A quick brown fox jumps over a lazy cat."

common_phrases = compare_sentences(sentence_1, sentence_2)

# Print one dict per common phrase, or a notice when nothing overlaps.
if common_phrases:
    for phrase_data in common_phrases:
        print(phrase_data)
else:
    print("No common phrases found.")
61
 
 
 
62
 
 
 
 
 
application.py CHANGED
@@ -43,10 +43,12 @@ def load_url(url):
43
 
44
 
45
  def generate_analysis_report(news_title:str, news_content: str, news_image: Image):
 
46
  news_analysis.load_news(news_title, news_content, news_image)
47
- return news_analysis.generate_analysis_report(), news_analysis.analyze_details()
 
 
48
 
49
- news_analysis = NewsVerification()
50
  # Define the GUI
51
  with gr.Blocks() as demo:
52
  gr.Markdown("# NEWS VERIFICATION")
@@ -54,14 +56,11 @@ with gr.Blocks() as demo:
54
  with gr.Row():
55
  # SETTINGS
56
  with gr.Column(scale=1):
57
- with gr.Accordion("Settings"):
58
- gr.Markdown("Give an URL or fill in news by yourself")
59
-
60
  with gr.Accordion("1. Enter a URL"):
61
  url_input = gr.Textbox(
62
  label="",
63
  show_label=False,
64
- value="https://bbc.com/future/article/20250110-how-often-you-should-wash-your-towels-according-to-science",
65
  )
66
  load_button = gr.Button("Load URL")
67
 
@@ -82,20 +81,17 @@ with gr.Blocks() as demo:
82
  )
83
  replace_button = gr.Button("Replace all")
84
 
85
- # GENERATED CONTENT
86
- with gr.Column(scale=1):
87
- with gr.Accordion("Input News"):
88
- news_title = gr.Textbox(label="Title", value="")
89
- news_image = gr.Image(label="Image", type="filepath")
90
- news_content = gr.Textbox(label="Content", value="", lines=12)
91
 
92
  # NEWS ANALYSIS REPORT
93
- with gr.Column(scale=1):
94
  with gr.Accordion("News Analysis"):
95
  detection_button = gr.Button("Verify news")
96
- analyzed_information = gr.HTML()
97
- with gr.Accordion("Detailed information"):
98
- detailed_analysis = gr.HTML()
99
 
100
  # Connect events
101
  load_button.click(
@@ -114,25 +110,45 @@ with gr.Blocks() as demo:
114
  outputs=[news_image])
115
  detection_button.click(generate_analysis_report,
116
  inputs=[news_title, news_content, news_image],
117
- outputs=[analyzed_information, detailed_analysis])
118
 
119
  # change Image
120
  #url_input.change(load_image, inputs=url_input, outputs=image_view)
121
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
  gr.Examples(
123
  examples=[
124
- ["https://www.bbc.com/travel/article/20250127-one-of-the-last-traders-on-the-silk-road"],
125
- ["https://bbc.com/future/article/20250110-how-often-you-should-wash-your-towels-according-to-science"],
 
126
  ],
127
- inputs=[url_input],
128
  label="Examples",
129
  example_labels=[
130
- "BBC news 1",
131
- "BBC news 2",
 
132
  ],
133
  )
134
 
135
- demo.launch()
136
 
137
 
138
  # https://www.bbc.com/travel/article/20250127-one-of-the-last-traders-on-the-silk-road
 
43
 
44
 
45
  def generate_analysis_report(news_title:str, news_content: str, news_image: Image):
46
+ news_analysis = NewsVerification()
47
  news_analysis.load_news(news_title, news_content, news_image)
48
+ news_analysis.generate_analysis_report()
49
+ return news_analysis.analyze_details()
50
+
51
 
 
52
  # Define the GUI
53
  with gr.Blocks() as demo:
54
  gr.Markdown("# NEWS VERIFICATION")
 
56
  with gr.Row():
57
  # SETTINGS
58
  with gr.Column(scale=1):
 
 
 
59
  with gr.Accordion("1. Enter a URL"):
60
  url_input = gr.Textbox(
61
  label="",
62
  show_label=False,
63
+ value="",
64
  )
65
  load_button = gr.Button("Load URL")
66
 
 
81
  )
82
  replace_button = gr.Button("Replace all")
83
 
84
+ # GENERATED CONTENT
85
+ with gr.Accordion("Input News"):
86
+ news_title = gr.Textbox(label="Title", value="")
87
+ news_image = gr.Image(label="Image", type="filepath")
88
+ news_content = gr.Textbox(label="Content", value="", lines=12)
 
89
 
90
  # NEWS ANALYSIS REPORT
91
+ with gr.Column(scale=2):
92
  with gr.Accordion("News Analysis"):
93
  detection_button = gr.Button("Verify news")
94
+ detailed_analysis = gr.HTML()
 
 
95
 
96
  # Connect events
97
  load_button.click(
 
110
  outputs=[news_image])
111
  detection_button.click(generate_analysis_report,
112
  inputs=[news_title, news_content, news_image],
113
+ outputs=[detailed_analysis])
114
 
115
  # change Image
116
  #url_input.change(load_image, inputs=url_input, outputs=image_view)
117
 
118
+ try:
119
+ with open('sample_1.txt','r', encoding='utf-8') as file:
120
+ text_sample_1 = file.read()
121
+ with open('sample_2.txt','r', encoding='utf-8') as file:
122
+ text_sample_2 = file.read()
123
+ with open('sample_3.txt','r', encoding='utf-8') as file:
124
+ text_sample_3 = file.read()
125
+ except FileNotFoundError:
126
+ print("File not found.")
127
+ except Exception as e:
128
+ print(f"An error occurred: {e}")
129
+
130
+ title_1 = "The ancient discovery that put a Silk Road city back on the map"
131
+ title_2 = "The modern rediscovery that erased a Silk Road city from the map"
132
+
133
+ image_1 = "sample_1.jpg.webp"
134
+ image_2 = "sample_2.jpg.webp"
135
+
136
  gr.Examples(
137
  examples=[
138
+ [title_1, image_1, text_sample_1],
139
+ [title_2, image_2, text_sample_2],
140
+ [title_1, image_2, text_sample_3],
141
  ],
142
+ inputs=[news_title, news_image, news_content],
143
  label="Examples",
144
  example_labels=[
145
+ "2 real news",
146
+ "2 modified news",
147
+ "1 real news & 1 fake news",
148
  ],
149
  )
150
 
151
+ demo.launch(share=False)
152
 
153
 
154
  # https://www.bbc.com/travel/article/20250127-one-of-the-last-traders-on-the-silk-road
application_2.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import gradio as gr
4
+ import requests
5
+ from PIL import Image
6
+
7
+ from src.application.content_detection import NewsVerification
8
+ from src.application.url_reader import URLReader
9
+ from src.application.content_generation import generate_fake_image, generate_fake_text, replace_text
10
+
11
+
12
+ GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
13
+ SEARCH_ENGINE_ID = os.getenv('SEARCH_ENGINE_ID')
14
+
15
+ AZURE_TEXT_MODEL = ["gpt-4o-mini", "gpt-4o"]
16
+ AZURE_IMAGE_MODEL = ["dall-e-3", "Stable Diffusion (not supported)"]
17
+
18
def load_url(url):
    """
    Load news content from the given URL.

    Fetches the article via URLReader, verifies the page is reachable,
    and tries to download the article's top image.

    Args:
        url: Address of the news article to load.

    Returns:
        A (title, text, image) tuple; `image` is a PIL Image or None
        when the top image could not be fetched or decoded.
    """
    content = URLReader(url)
    image = None
    # Spoof a browser UA: many news sites reject default client UAs.
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36'}
    try:
        # timeout added so a stalled server cannot hang the UI forever.
        response = requests.get(
            url,
            headers=header,
            stream=True,
            timeout=30,
        )
        response.raise_for_status()  # Raise an exception for bad status codes

        image_response = requests.get(content.top_image, stream=True, timeout=30)
        try:
            image = Image.open(image_response.raw)
        # FIX: was a bare `except:` which also swallowed SystemExit /
        # KeyboardInterrupt; narrow to Exception and keep best-effort behavior.
        except Exception:
            print(f"Error loading image from {content.top_image}")

    except (requests.exceptions.RequestException, FileNotFoundError) as e:
        print(f"Error fetching image: {e}")

    return content.title, content.text, image
43
+
44
+
45
def generate_analysis_report(news_title:str, news_content: str, news_image: Image):
    """
    Run the verification pipeline on one news item.

    Builds a fresh NewsVerification instance, feeds it the title, body
    text and image, generates the analysis report, and returns the
    detailed analysis (HTML) for display.
    """
    verifier = NewsVerification()
    verifier.load_news(news_title, news_content, news_image)
    verifier.generate_analysis_report()
    return verifier.analyze_details()
50
+
51
+
52
# Define the GUI
# Layout: left column = inputs (URL loader, hidden generation controls,
# editable news fields); right column = the verification report.
# NOTE(review): nesting below is reconstructed from a diff rendering that
# strips indentation — confirm against the original file.
with gr.Blocks() as demo:
    gr.Markdown("# NEWS VERIFICATION")

    with gr.Row():
        # SETTINGS
        with gr.Column(scale=1):
            with gr.Accordion("1. Enter a URL"):
                url_input = gr.Textbox(
                    label="",
                    show_label=False,
                    value="",
                )
                load_button = gr.Button("Load URL")

            # Hidden in this demo build (visible=False): model pickers for
            # generating fake text/images.
            with gr.Accordion("2. Select content-generation models", open=True, visible=False):
                with gr.Row():
                    text_generation_model = gr.Dropdown(choices=AZURE_TEXT_MODEL, label="Text-generation model")
                    image_generation_model = gr.Dropdown(choices=AZURE_IMAGE_MODEL, label="Image-generation model")
                generate_text_button = gr.Button("Generate text")
                generate_image_button = gr.Button("Generate image")

            # Also hidden: bulk find/replace over the news text.
            with gr.Accordion("3. Replace any terms", open=True, visible=False):
                replace_df = gr.Dataframe(
                    headers=["Find what:", "Replace with:"],
                    datatype=["str", "str"],
                    row_count=(1, "dynamic"),
                    col_count=(2, "fixed"),
                    interactive=True
                )
                replace_button = gr.Button("Replace all")

            # GENERATED CONTENT
            with gr.Accordion("Input News"):
                news_title = gr.Textbox(label="Title", value="")
                news_image = gr.Image(label="Image", type="filepath")
                news_content = gr.Textbox(label="Content", value="", lines=12)

        # NEWS ANALYSIS REPORT
        with gr.Column(scale=2):
            with gr.Accordion("News Analysis"):
                detection_button = gr.Button("Verify news")
                detailed_analysis = gr.HTML()

    # Connect events
    load_button.click(
        load_url,
        inputs=url_input,
        outputs=[news_title, news_content, news_image]
    )
    replace_button.click(replace_text,
                  inputs=[news_title, news_content, replace_df],
                  outputs=[news_title, news_content])
    generate_text_button.click(generate_fake_text,
                  inputs=[text_generation_model, news_title, news_content],
                  outputs=[news_title, news_content])
    generate_image_button.click(generate_fake_image,
                  inputs=[image_generation_model, news_title],
                  outputs=[news_image])
    detection_button.click(generate_analysis_report,
                  inputs=[news_title, news_content, news_image],
                  outputs=[detailed_analysis])

    # change Image
    #url_input.change(load_image, inputs=url_input, outputs=image_view)

    # Load canned sample articles used as clickable examples below.
    try:
        with open('sample_1.txt','r', encoding='utf-8') as file:
            text_sample_1 = file.read()
        with open('sample_2.txt','r', encoding='utf-8') as file:
            text_sample_2 = file.read()
        with open('sample_3.txt','r', encoding='utf-8') as file:
            text_sample_3 = file.read()
    except FileNotFoundError:
        print("File not found.")
    except Exception as e:
        print(f"An error occurred: {e}")

    title_1 = "The ancient discovery that put a Silk Road city back on the map"
    title_2 = "The modern rediscovery that erased a Silk Road city from the map"

    image_1 = "sample_1.jpg.webp"
    image_2 = "sample_2.jpg.webp"

    # NOTE(review): if any sample_*.txt is missing, the except branches above
    # only print — text_sample_* stays undefined and gr.Examples below raises
    # NameError. Consider defaulting the samples to "" before the try block.
    gr.Examples(
        examples=[
            [title_1, image_1, text_sample_1],
            [title_2, image_2, text_sample_2],
            [title_1, image_2, text_sample_3],
        ],
        inputs=[news_title, news_image, news_content],
        label="Examples",
        example_labels=[
            "2 real news",
            "2 modified news",
            "1 real news & 1 fake news",
        ],
    )

# share=False keeps the app local (no public gradio.live tunnel).
demo.launch(share=False)
demo.py DELETED
@@ -1,309 +0,0 @@
1
- import os
2
-
3
- from src.images.Search_Image.search import find_similar_img_from_url
4
-
5
- import re
6
-
7
- import gradio as gr
8
-
9
- from src.images.Search_Image.image_model_share import (
10
- image_generation_detection,
11
- )
12
- from src.texts.Search_Text._text_detection_share import (
13
- UNKNOWN,
14
- abstract_detect_generated_text,
15
- )
16
- from src.texts.Search_Text.fake_text_generation_share import (
17
- highlight_overlap_by_word_to_list,
18
- )
19
-
20
- os.environ["no_proxy"] = "localhost,127.0.0.1,::1"
21
-
22
- TEMP_IMAGE = "temp_image.jpg"
23
- TEMP_INPUT_IMAGE = "temp_input_image.jpg"
24
-
25
- HUMAN_IMAGE = "data/test_data/human_news.jpg"
26
-
27
- HUMAN_CAPTION = "Stoke City have secured West Brom striker Saido Berahino for Β£12 million on a five-and-a-half-year contract."
28
- HUMAN_CONTENT = """
29
- Tracey Jolliffe has already donated a kidney, 16 eggs and 80 pints of blood, and intends to leave her brain to science. She is now hoping to give away part of her liver to a person she may never meet.
30
- "If I had another spare kidney, I'd do it again," Tracey tells the BBC's Victoria Derbyshire programme.
31
- She is what is known as an "altruistic donor" - someone willing to give away an organ to potentially help save the life of a complete stranger.
32
- A microbiologist in the NHS, and the daughter of two nurses, she has spent her life learning about the importance of healthcare from a professional standpoint.
33
- But she has also been keen to make a difference on a personal level.
34
- "I signed up to donate blood, and to the bone marrow register, when I was 18," she says.
35
- Now 50, her wish to donate has become gradually more expansive.
36
- In 2012, she was one of fewer than 100 people that year to donate a kidney without knowing the recipient's identity - and now supports the charity Give A Kidney, encouraging others to do the same.
37
- As of 30 September 2016, 5,126 people remain on the NHS kidney transplant waiting list.
38
- Tracey's kidney donation, in all likelihood, will have saved someone's life.
39
- "I remind myself of it every day when I wake up," she says, rightly proud of her life-changing actions.
40
- It was not, however, a decision taken on the spur of a moment.
41
- Donating a kidney is an "involved process", she says, with suitability assessments taking at least three months to complete.
42
- Tests leading up to the transplant include X-rays, heart tracing and a special test of kidney function, which involves an injection and a series of blood tests.
43
- "It is not something to do if you're scared of needles," she jokes.
44
- The risks associated with donating, however, are relatively low for those deemed healthy enough to proceed, with a mortality rate of about one in 3,000 - roughly the same as having an appendix removed.
45
- Compared with the general public, NHS Blood and Transplant says, most kidney donors have equivalent - or better - life expectancy than the average person.
46
- Tracey says she was in hospital for five days after her operation but felt "back to normal" within six weeks.
47
- """
48
-
49
- HUMAN_NEWS_CNN = """
50
- Mayotte authorities fear hunger and disease after cyclone, as death toll rises in Mozambique
51
- Cyclone Chido caused devastation in Mayotte and authorities are now rushing to prevent disease and hunger spreading in the French overseas territory Sipa USA
52
- Authorities in Mayotte were racing on Tuesday to stop hunger, disease and lawlessness from spreading in the French overseas territory after the weekend’s devastating cyclone, while Mozambique reported dozens of deaths from the storm.
53
- Hundreds or even thousands could be dead in Mayotte, which took the strongest hit from Cyclone Chido, French officials have said. The storm laid waste to large parts of the archipelago off east Africa, France’s poorest overseas territory, before striking continental Africa.
54
- With many parts of Mayotte still inaccessible and some victims buried before their deaths could be officially counted, it may take days to discover the full extent of the destruction.
55
- So far, 22 deaths and more than 1,400 injuries have been confirmed, Ambdilwahedou Soumaila, the mayor of the capital Mamoudzou, told Radio France Internationale on Tuesday morning.
56
- β€œThe priority today is water and food,” Soumaila said. β€œThere are people who have unfortunately died where the bodies are starting to decompose that can create a sanitary problem.”
57
- β€œWe don’t have electricity. When night falls, there are people who take advantage of that situation.”
58
-
59
- Rescue workers operate in storm-hit Mayotte on Wednesday.
60
- Rescue workers operate in storm-hit Mayotte on Wednesday. Securite Civile via Reuters
61
- Twenty tonnes of food and water are due to start arriving on Tuesday by air and sea. The French government said late on Monday it expects 50% of water supplies to be restored within 48 hours and 95% within the week.
62
- France’s interior ministry announced that a curfew would go into effect on Tuesday night from 10 p.m. to 4 a.m. local time.
63
- Rescue workers have been searching for survivors amid the debris of shantytowns bowled over by 200 kph (124 mph) winds.
64
- Chido was the strongest storm to strike Mayotte in more than 90 years, French weather service Meteo France said. In Mozambique, it killed at least 34 people, officials said on Tuesday. Another seven died in Malawi.
65
- Drone footage from Mozambique’s Cabo Delgado province, already experiencing a humanitarian crisis due to an Islamist insurgency, showed razed thatched-roof houses near the beach and personal belongings scattered under the few palm trees still standing.
66
-
67
- Dispute over immigration
68
- French President Emmanuel Macron said after an emergency cabinet meeting on Monday that he would visit Mayotte in the coming days, as the disaster quickly fueled a political back-and-forth about immigration, the environment and France’s treatment of its overseas territories.
69
- Mayotte has been grappling with unrest in recent years, with many residents angry at illegal immigration and inflation.
70
- More than three-quarters of its roughly 321,000 people live in relative poverty, and about one-third are estimated to be undocumented migrants, most from nearby Comoros and Madagascar.
71
- The territory has become a stronghold for the far-right National Rally with 60% voting for Marine Le Pen in the 2022 presidential election runoff.
72
- France’s acting Interior Minister Bruno Retailleau, from the conservative Republicans party, told a news conference in Mayotte that the early warning system had worked β€œperfectly” but many of the undocumented had not come to designated shelters.
73
- People stand amid uprooted trees and debris after cyclone Chido hit Mecufi district, Cabo Delgado province, Mozambique, on December 16.
74
- People stand amid uprooted trees and debris after cyclone Chido hit Mecufi district, Cabo Delgado province, Mozambique, on December 16. UNICEF Mozambique via Reuters
75
- Other officials have said undocumented migrants may have been afraid to go to shelters for fear of being arrested.
76
- The toll of the cyclone, Retailleau said in a later post on X, underscored the need to address β€œthe migration question.”
77
- β€œMayotte is the symbol of the drift that (French) governments have allowed to take hold on this issue,” he said. β€œWe will need to legislate so that in Mayotte, like everywhere else on the national territory, France retakes control of its immigration.”
78
- Left-wing politicians, however, have pointed the finger at what they say is the government’s neglect of Mayotte and failure to prepare for natural disasters linked to climate change.
79
- Socialist Party chairman Olivier Faure blasted Retailleau’s comments in an X post.
80
- β€œHe could have interrogated the role of climate change in producing more and more intense climate disasters. He could have rallied against the extreme poverty that makes people more vulnerable to cyclones,” said Faure.
81
- β€œNo, he has resumed his crusade against migrants.”
82
- Prime Minister Francois Bayrou, appointed last week to steer France out of a political crisis, faced criticism after he went to the town of Pau, where he is the mayor, to attend a municipal council meeting on Monday, instead of visiting Mayotte.
83
- """
84
-
85
- HUMAN_NEWS_CNN_IMAGE = "human_cnn.webp"
86
- # generate a short news related to sport
87
-
88
- # opposite
89
- OPPOSITE_NEWS = """
90
- Tracey Jolliffe has never donated a kidney, any eggs, or blood, and has no plans to leave her brain to science. She is not considering giving away any part of her liver to someone she knows.
91
- "If I had another spare kidney, I wouldn't do it again," Tracey tells the BBC's Victoria Derbyshire programme.
92
- She is not an "altruistic donor" - someone unwilling to give away an organ to potentially save the life of a complete stranger.
93
- A microbiologist outside the NHS, with parents who were not in healthcare, she has spent her life without focusing on the importance of healthcare from a professional standpoint.
94
- She has also not been eager to make a difference on a personal level.
95
- "I never signed up to donate blood, nor to the bone marrow register, when I was 18," she says.
96
- Now 50, her interest in donating has not expanded.
97
- In 2012, she was not among the few people that year to donate a kidney without knowing the recipient's identity - and does not support the charity Give A Kidney, discouraging others from doing the same.
98
- As of 30 September 2016, 5,126 people remain on the NHS kidney transplant waiting list.
99
- Tracey's decision not to donate a kidney hasn't saved anyone's life.
100
- "I never think about it when I wake up," she says, indifferent about her choices.
101
- It was not a decision made after careful consideration.
102
- Donating a kidney is not an "involved process", she says, with suitability assessments taking less than three months to complete.
103
- Tests leading up to the transplant do not include X-rays, heart tracing, or a special test of kidney function, which does not involve an injection or any blood tests.
104
- "It is something to do if you're scared of needles," she jokes.
105
- The risks associated with donating, however, are relatively high for those not deemed healthy enough to proceed, with a high mortality rate - much greater than having an appendix removed.
106
- Compared with the general public, NHS Blood and Transplant says, most kidney donors have worse life expectancy than the average person.
107
- Tracey says she was not in hospital after any operation and did not feel "back to normal" within six weeks.
108
- """
109
-
110
- PARAPHASE_NEWS = """
111
- Tracey Jolliffe has generously donated a kidney, 16 eggs, and 80 pints of blood, and plans to donate her brain to science. She now hopes to donate part of her liver to someone she may never meet. "If I had another spare kidney, I'd do it again," she shares with the BBC's Victoria Derbyshire program. Known as an "altruistic donor," Tracey is willing to donate organs to help save the lives of strangers.
112
- As a microbiologist in the NHS and the daughter of two nurses, Tracey has always understood the importance of healthcare professionally. However, she also strives to make a personal impact. "I signed up to donate blood and joined the bone marrow register at 18," she explains. Now 50, her desire to donate has expanded over the years.
113
- In 2012, Tracey was among fewer than 100 people that year who donated a kidney without knowing the recipient. She now supports Give A Kidney, a charity that encourages others to donate. As of 30 September 2016, 5,126 people were on the NHS kidney transplant waiting list. Tracey's kidney donation likely saved a life. "I remind myself of it every day when I wake up," she says, proud of her life-changing decision.
114
- Donating a kidney was not a spontaneous decision for Tracey. It is a complex process, she explains, with suitability assessments taking at least three months. Pre-transplant tests include X-rays, heart monitoring, and a special kidney function test involving an injection and multiple blood tests. "It's not for those afraid of needles," she jokes.
115
- For healthy individuals, the risks of donating a kidney are relatively low, with a mortality rate of about one in 3,000, similar to having an appendix removed. According to NHS Blood and Transplant, most kidney donors have the same or better life expectancy compared to the general population. Tracey was hospitalized for five days after her operation and felt "back to normal" within six weeks.
116
- """
117
-
118
- MACHINE_IMAGE = "data/test_data/machine_news.png"
119
- # MACHINE_CAPTION = "Argentina Secures Victory in Thrilling Friendly Match Against Brazil"
120
- MACHINE_CONTENT = """
121
- Tracey Jolliffe has already donated a kidney, 16 eggs, and 80 pints of blood, and she intends to leave her brain to science. She is now hoping to give away part of her liver to a person she may never meet.
122
- "If I had another spare kidney, I'd do it again," Tracey tells the BBC's Victoria Derbyshire programme.
123
- She is what is known as an "altruistic donor"β€”someone willing to give away an organ to potentially help save the life of a complete stranger.
124
- A microbiologist in the NHS and the daughter of two nurses, she has spent her life learning about the importance of healthcare from a professional standpoint. But she has also been keen to make a difference on a personal level. "I signed up to donate blood and to the bone marrow register when I was 18," she says.
125
- Now 50, her wish to donate has become gradually more expansive. In 2012, she was one of fewer than 100 people that year to donate a kidney without knowing the recipient's identity, and she now supports the charity Give A Kidney, encouraging others to do the same.
126
- As of 30 September 2016, 5,126 people remain on the NHS kidney transplant waiting list. Tracey's kidney donation, in all likelihood, has saved someone's life. "I remind myself of it every day when I wake up," she says, rightly proud of her life-changing actions.
127
- It was not, however, a decision taken on the spur of a moment. Donating a kidney is an "involved process," she says, with suitability assessments taking at least three months to complete. Tests leading up to the transplant include X-rays, heart tracing, and a special test of kidney function, which involves an injection and a series of blood tests. "It is not something to do if you're scared of needles," she jokes.
128
- The risks associated with donating, however, are relatively low for those deemed healthy enough to proceed, with a mortality rate of about one in 3,000β€”roughly the same as having an appendix removed. Compared with the general public, NHS Blood and Transplant says, most kidney donors have equivalentβ€”or betterβ€”life expectancy than the average person.
129
- Tracey says she was in hospital for five days after her operation but felt "back to normal" within six weeks.
130
- """
131
-
132
- HUMAN_BBC_NEWS2 = """
133
- A message of hope at Washington march
134
- For such a divisive figure, Donald Trump managed to unify hundreds of thousands of Americans at the Women's March on Washington.
135
- Moments after Mr Trump was sworn in as the 45th president on Friday, he delivered a thundering speech in which he promised to improve the lives of millions of Americans.
136
- A day later, throngs of women, men and children streamed into the same area where he made that pledge, in order to take a stand for gender and racial equality.
137
- Though Mr Trump's named was mentioned frequently, the march, which organisers estimate attracted more than half a million, was not only about the new US president.
138
- Messages ranged from "Thank you for making me an activist Trump" to "We will not be silenced," but the common thread throughout the patchwork of signs was hope.
139
- "It's about solidarity and visualising the resistance," said Jonathon Meier, who took a bus from New York.
140
- "And I think it not only helps with the healing process, but it gives me hope for the next four years."
141
- A sea of activists, some clad in knitted, pink "pussy" hats and others draped in American flags, ambled about the National Mall, stopping to catch a glimpse of some of the high-profile speakers and singing along to songs like "This Little Light of Mine".
142
- Peppered among the many protest signs were images of ovaries and female genitals, a nod to concerns over losing access to birth control and abortion care under a Trump administration.
143
- """
144
-
145
- FREELY_GENERATION_NEWS = """
146
- A new study has indicated that criminals and terrorists are increasingly turning to the dark net to purchase weapons. The study, conducted by cybersecurity firm Recorded Future, found that these purchases are being made anonymously and with cryptocurrency, making it difficult for law enforcement agencies to track and intercept them. The dark net is a hidden part of the internet, accessible only through anonymous browsers, where users can buy and sell a variety of illegal goods and services. However, the study found that weapons purchases are becoming more popular on the dark net, with firearms and explosives being the most commonly traded items. Recorded Future's research showed that many of the weapons being sold on the dark net are military-grade, and the study suggests that this is due to the large number of surplus weapons available following military conflicts in various parts of the world. The report also found that the sellers on the dark net are often located in countries with lax gun laws, leading to concerns that these weapons could end up in the hands of criminals and terrorists who could use them to commit acts of violence. The use of cryptocurrency to purchase these weapons adds another layer of difficulty for law enforcement agencies trying to track down those responsible. The anonymity provided by cryptocurrency allows buyers and sellers to conduct their transactions without leaving a trace. The findings of this study serve as a stark reminder of the dangers posed by the dark net, and the need for law enforcement agencies to remain vigilant in their efforts to combat illegal activity on this hidden part of the internet.
147
- """
148
-
149
- HUMAN_BBC_NEWS2_IMAGE = "human_bbc_news_2.webp"
150
-
151
- HIGHLIGHT = "highlight"
152
-
153
-
154
- def highlight_text(words, indexes):
155
- final_words = words
156
- for index in indexes:
157
- final_words[index] = (
158
- f"<span style='color:#00FF00; font-weight:bold;'>{words[index]}</span>"
159
- )
160
- return " ".join(final_words)
161
-
162
-
163
- def format_pair(pair):
164
- input_sentence = highlight_text(pair[0], pair[2])
165
- source_sentence = highlight_text(pair[1], pair[3])
166
- return f"<tr><td>{input_sentence}</td><td>{source_sentence}</td></tr>"
167
-
168
-
169
- def create_table(data):
170
- table_rows = "\n".join([format_pair(pair) for pair in data])
171
- return f"""
172
- <h5> Comparison between input news and source news at the above link</h5>
173
- <table border="1" style="width:100%; text-align:left; border-collapse:collapse;">
174
- <thead>
175
- <tr>
176
- <th>Input sentence</th>
177
- <th>Source sentence</th>
178
- </tr>
179
- </thead>
180
- <tbody>
181
- {table_rows}
182
- </tbody>
183
- </table>
184
- """
185
-
186
-
187
- with gr.Blocks() as demo:
188
- image = gr.Image(
189
- value=HUMAN_IMAGE,
190
- label="News Image",
191
- height=200,
192
- width=200,
193
- type="filepath",
194
- )
195
- content = gr.Textbox(label="Content", lines=3, value=HUMAN_CONTENT)
196
-
197
- process_btn = gr.Button("Process")
198
-
199
- """
200
- 1. human bbc news
201
- 2. proofreading
202
- 3. opposite
203
- 4. human bbc news 2
204
- 5. human_cnn news
205
- 6. paraphrase
206
- 7. freely generation
207
- """
208
- gr.Examples(
209
- examples=[
210
- [HUMAN_IMAGE, HUMAN_CONTENT],
211
- [MACHINE_IMAGE, MACHINE_CONTENT],
212
- [MACHINE_IMAGE, OPPOSITE_NEWS],
213
- [HUMAN_BBC_NEWS2_IMAGE, HUMAN_BBC_NEWS2],
214
- [HUMAN_NEWS_CNN_IMAGE, HUMAN_NEWS_CNN],
215
- [MACHINE_IMAGE, PARAPHASE_NEWS],
216
- [MACHINE_IMAGE, FREELY_GENERATION_NEWS],
217
- ],
218
- inputs=[image, content],
219
- label="examples",
220
- example_labels=[
221
- "human bbc news",
222
- "proofreading",
223
- "opposite",
224
- "human bbc news 2",
225
- "human cnn news",
226
- "paraphrase",
227
- "freely generation",
228
- ],
229
- )
230
-
231
- overall = gr.HTML()
232
- matching_html = gr.HTML()
233
-
234
- def process(input_image, content):
235
- (
236
- search_engine_prediction,
237
- SOTA_prediction,
238
- SOTA_confidence,
239
- found_url,
240
- sentence_pairs,
241
- ) = abstract_detect_generated_text(content)
242
-
243
- final_table = []
244
- COLOR_MAPS = {
245
- "HUMAN": "<span style='color:#FFFF00'>",
246
- "MACHINE": "<span style='color:red'>",
247
- }
248
-
249
- source_image = []
250
- image_prediction_label, image_confidence = image_generation_detection(
251
- input_image,
252
- )
253
- # [found_img_url, image_different_score] = find_similar_img_from_url(input_image)
254
-
255
- # if 0 < image_different_score < 10:
256
- # search_engine_description = f'Most likely generated by {COLOR_MAPS["HUMAN"]} (score = {image_different_score})</span> with evidence link at <a href="{found_img_url}">{found_img_url} </a>'
257
- # else: # TODO add < 25 which is cropped images
258
- # search_engine_description = f'Most likely generated by {COLOR_MAPS["MACHINE"]} (score = {image_different_score})</span></a>'
259
-
260
- for (
261
- input_sentence,
262
- source_sentence,
263
- check_paraphrase,
264
- ) in sentence_pairs:
265
- input_words, source_words, input_indexes, source_indexes = (
266
- highlight_overlap_by_word_to_list(
267
- input_sentence,
268
- source_sentence,
269
- )
270
- )
271
- final_table.append(
272
- (input_words, source_words, input_indexes, source_indexes),
273
- )
274
-
275
- if search_engine_prediction == UNKNOWN:
276
- search_engine_description = "Cannot find any evidence link"
277
- final_prediction = SOTA_prediction
278
- else:
279
- final_prediction = search_engine_prediction
280
- search_engine_description = f'Most likely generated by {COLOR_MAPS[search_engine_prediction]}{search_engine_prediction}</span> with evidence link at <a href="{found_url}">{found_url} </a>'
281
-
282
- overall_html_result = f"""
283
- <h1>Image generation detection</h1>
284
- <ul>
285
- <li><strong>Prediction by SOTA method (provided by BDI members):</strong> Most likely generated by {COLOR_MAPS[image_prediction_label]}{image_prediction_label} </span>with confidence = {image_confidence}%</li>
286
- <li><strong>Prediction by our method (developed by BDI members)</strong>: {search_engine_description}
287
- </ul>
288
- <hr>
289
- <h1>Text generation detection</h1>
290
- <ul>
291
- <li><strong>Prediction by SOTA method (https://github.com/Hello-SimpleAI/chatgpt-comparison-detection)</strong>: Most likely generated by {COLOR_MAPS[SOTA_prediction]}{SOTA_prediction} </span>with confidence = {SOTA_confidence}</li>
292
- <li><strong>Prediction by our method (developed by BDI members)</strong>: {search_engine_description}
293
- <li><strong>Final prediction by our method (developed by BDI members)</strong>: Most likely generated by {COLOR_MAPS[final_prediction]}{final_prediction}&nbsp;</span></li>
294
- </ul>
295
- <p>&nbsp;</p>
296
- """
297
- if len(final_table) != 0:
298
- html_table = create_table(final_table)
299
- else:
300
- html_table = ""
301
- return overall_html_result, html_table
302
-
303
- process_btn.click(
304
- process,
305
- inputs=[image, content],
306
- outputs=[overall, matching_html],
307
- )
308
-
309
- demo.launch(share=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -20,4 +20,5 @@ torch
20
  sentence-transformers
21
 
22
  # Images
23
- pillow==10.1.0
 
 
20
  sentence-transformers
21
 
22
  # Images
23
+ pillow==10.1.0
24
+ imagehash==4.3.1
sample_1.jpg.webp ADDED
sample_1.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+
2
+ Muzzamil Hussain was in grade school when the first bombs fell on the playground outside of his classroom in Kargil, a mountain city in the Indian province of Ladakh. While the violent onset of the 1999 Kargil war between Pakistan and India unfolded around him, Hussain and his family escaped south to the remote Suru Valley.
3
+ After India claimed victory later that year and displaced families returned home, Hussain listened as his bedridden grandfather asked the family to visit an old property, originally built by Hussain's great-grandfather, near Kargil's bazaar to make sure it had survived the war. When Hussain's uncles cracked through an old rusty latch and peered through the hand-carved wooden doors, they discovered wooden crates stamped with names of cities around the world. Making space on the dusty floor, the family began to lay out silks from China, silver cookware from Afghanistan, rugs from Persia, turquoise from Tibet, saddles from Mongolia and luxury soaps and salves from London, New York and Munich.
4
+
5
+ This mysterious act of destruction is investigated in Miss Austen, a new four-part television drama based on Gill Hornby's best-selling and critically acclaimed novel of the same name. Years after Jane's death, Cassandra (Keeley Hawes) has travelled to the village of Kintbury, in Berkshire, where the Austen family's friends, the Fowles, lived. Cassandra is, ostensibly, there to help Isabella Fowle (Rose Leslie), whose father Fulwar is dying. However this is a house that holds many bitter-sweet memories for her (in real life, this is where she had been staying when Jane wrote to her about Tom Lefroy), and she has an ulterior motive. She wants to retrieve some letters written by the late Jane to their friend Eliza Fowle, Isabella's mother, which she fears might contain details damaging to the novelist's legacy. When she finds the correspondence, it revives powerful memories of the events of years ago. The series takes place in two timelines – in 1830 – with the unmarried Isabella facing eviction from her home after her father's death and Cassandra trying to protect her sister's legacy – and decades previously, with young Cassandra (SynnΓΈve Karlsen) and Jane (Patsy Ferran) navigating romances, family problems, and the ups and downs of life.
sample_2.jpg.webp ADDED
sample_2.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+
2
+ Muzzamil Hussain was in 3rd-grade school when the first bombs fell on the playground outside of his classroom in Kargil, a mountain city in India. While the violent onset of the 1998 Kargil war between Pakistan and India unfolded around him, Hussain and his family escaped south to the remote Suru Valley.
3
+ After India claimed victory later that year and displaced families returned home, Hussain listened as his bedridden grandfather asked the family to visit an old property, initially built by Hussain's great-grandfather, near Kargil's bazaar to make sure it had survived the war. When Hussain's uncles cracked through an old rusty latch and peered through the hand-carved wooden doors, they discovered wooden crates stamped with names of cities worldwide. Making space on the dusty floor, the family began to lay out silks from China, silver cookware from Afghanistan, rugs from Persia, turquoise from Tibet, saddles from Mongolia, and luxury soaps and salves from London, New York, and Munich.
4
+
5
+ This mysterious act of destruction is investigated in Miss Austen, a new four-part television drama based on Gill Hornby's best-selling and critically acclaimed novel. Years after Jane's death, Cassandra (Keeley Hawes) traveled to the village of Kintbury in Berkshire, where the Austen family's friends, the Fowles, lived. Cassandra is, ostensibly, there to help Isabella Fowle (Rose Leslie), whose father, Fulwar, is dying. However, this house holds many bitter-sweet memories for her (in real life, this is where she had been staying when Jane wrote about Tom Lefroy), and she has an ulterior motive. She wants to retrieve letters written by the late Jane to their friend Eliza Fowle, Isabella's mother, which she fears might contain details damaging the novelist's legacy. When she finds the correspondence, it revives powerful memories of the events of years ago. The series takes place in two timelines – in 1830 – with the unmarried Isabella facing eviction from her home after her father's death and Cassandra trying to protect her sister's legacy – and decades previously, with young Cassandra (SynnΓΈve Karlsen) and Jane (Patsy Ferran) navigating romances, family problems, and the ups and downs of life.
sample_3.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+
2
+ Muzzamil Hussain was in grade school when the first bombs fell on the playground outside of his classroom in Kargil, a mountain city in the Indian province of Ladakh. While the violent onset of the 1999 Kargil war between Pakistan and India unfolded around him, Hussain and his family escaped south to the remote Suru Valley.
3
+ After India claimed victory later that year and displaced families returned home, Hussain listened as his bedridden grandfather asked the family to visit an old property, originally built by Hussain's great-grandfather, near Kargil's bazaar to make sure it had survived the war. When Hussain's uncles cracked through an old rusty latch and peered through the hand-carved wooden doors, they discovered wooden crates stamped with names of cities around the world. Making space on the dusty floor, the family began to lay out silks from China, silver cookware from Afghanistan, rugs from Persia, turquoise from Tibet, saddles from Mongolia and luxury soaps and salves from London, New York and Munich.
4
+
5
+ Local squirrels have reportedly formed a highly organized nut-hoarding syndicate, causing widespread panic among residents. Experts warn this unprecedented squirrel activity could lead to a global nut shortage. One resident claims to have witnessed squirrels using tiny backpacks to transport their loot. Authorities are investigating the claims, but so far, the squirrels remain at large. The mayor has issued a statement urging citizens to remain calm and protect their acorns.
src/application/content_detection.py CHANGED
@@ -1,6 +1,9 @@
1
  from difflib import SequenceMatcher
 
 
2
  from src.application.image.image_detection import detect_image_by_ai_model, detect_image_by_reverse_search, detect_image_from_news_image
3
  from src.application.text.model_detection import detect_text_by_ai_model
 
4
  from src.application.text.search_detection import check_human, detect_text_by_relative_search
5
 
6
 
@@ -11,18 +14,19 @@ class NewsVerification():
11
  self.news_content = ""
12
  self.news_image = ""
13
 
14
- self.text_prediction_label = ""
15
- self.text_prediction_score = -1
16
- self.text_referent_url = None
17
- self.image_prediction_label = ""
18
- self.image_prediction_score = -1
19
- self.image_referent_url = None
20
  self.news_prediction_label = ""
21
  self.news_prediction_score = -1
22
 
23
- self.found_img_url = []
24
- self.aligned_sentences = []
25
- self.is_paraphrased = False
 
26
 
27
  def load_news(self, news_title, news_content, news_image):
28
  self.news_text = news_title + "\n\n" + news_content
@@ -45,22 +49,41 @@ class NewsVerification():
45
  print("CHECK TEXT:")
46
  print("\tFrom search engine:")
47
  # Classify by search engine
48
- self.is_paraphrased, self.text_referent_url, self.aligned_sentences, self.found_img_url = detect_text_by_relative_search(self.news_text)
 
 
49
 
50
- if self.is_paraphrased is False:
51
- self.text_prediction_label = "UNKNOWN"
52
- else:
53
- self.text_prediction_score = 100
54
- if check_human(self.aligned_sentences):
55
- self.text_prediction_label = "HUMAN"
 
 
 
 
 
 
 
56
  else:
57
- self.text_prediction_label = "MACHINE"
58
-
59
- # Classify text by AI model
60
- print("\tFrom AI model:")
61
- if self.text_prediction_label == "UNKNOWN":
62
- self.text_prediction_label, self.text_prediction_score = detect_text_by_ai_model(self.news_text)
63
- self.text_prediction_score *= 100
 
 
 
 
 
 
 
 
 
 
64
 
65
  def detect_image_origin(self):
66
  print("CHECK IMAGE:")
@@ -124,107 +147,33 @@ class NewsVerification():
124
  def generate_analysis_report(self):
125
  self.determine_text_origin()
126
  self.detect_image_origin()
127
- self.determine_news_origin()
128
-
129
- # Forensic analysis
130
- if self.text_prediction_label == "MACHINE":
131
- text_prediction_label = "The text is modified by GPT-4o (AI)"
132
- else:
133
- text_prediction_label = "The text is written by HUMAN"
134
-
135
- if self.image_prediction_label == "MACHINE":
136
- image_prediction_label = "The image is generated by Dall-e (AI)"
137
- else:
138
- image_prediction_label = "The image is generated by HUMAN"
139
 
140
- if self.news_prediction_label == "MACHINE":
141
- news_prediction_label = "The whole news generated by AI"
142
- else:
143
- news_prediction_label = "The whole news written by HUMAN"
144
-
145
- # Misinformation analysis
146
- out_of_context_results = "cohesive"
147
- if out_of_context_results == "cohesive":
148
- out_of_context_results = "The input news is cohesive (non-out-of-context)"
149
- else:
150
- out_of_context_results = "The input news is out-of-context"
151
- out_of_context_prediction_score = 96.7
152
 
153
- # Description
154
- description = "The description should be concise, clear, and aimed at helping general readers understand the case."
155
 
156
- if self.text_referent_url is None:
157
- referred_news = "<li>No referent information</li>"
158
- else:
159
- if len(self.text_referent_url) > 40:
160
- url_max_length = 40
161
- else:
162
- url_max_length = len(self.text_referent_url)
163
-
164
- referred_news = f"""<li><a href="{self.text_referent_url}" target="_blank">{"Referred news: " + self.text_referent_url[:url_max_length] + "..."}</a></li>"""
165
-
166
- if self.image_referent_url is None:
167
- referred_image = "<li>No referent information</li>"
168
- else:
169
- if len(self.image_referent_url) > 40:
170
- url_max_length = 40
171
- else:
172
- url_max_length = len(self.text_referent_url)
173
- referred_image = f"""<li><a href="{self.image_referent_url}" target="_blank">{"Referred news: " + self.image_referent_url[:url_max_length] + "..."}</a></li>"""
174
-
175
- html_template = f"""
176
- <div>
177
- <h3>Originality:</h3>
178
- <ul>
179
- {referred_news}
180
- {referred_image}
181
- </ul>
182
- </div>
183
-
184
- <div>
185
- <h3>Forensic:</h3>
186
- <b>{news_prediction_label} (confidence = {self.news_prediction_score:.2f}%)</b>
187
- <ul>
188
- <li>{text_prediction_label} (confidence = {self.text_prediction_score:.2f}%)</li>
189
- <li>{image_prediction_label} (confidence = {self.image_prediction_score:.2f}%)</li>
190
- </ul>
191
- </div>
192
-
193
- <div>
194
- <h3>Misinformation (placeholder):</h3>
195
- <ul>
196
- <li>The input news is {out_of_context_results} (confidence = {out_of_context_prediction_score:.2f}%)</li>
197
- </ul>
198
- </div>
199
-
200
- <div>
201
- <h3>Description (optional, placeholder):</h3>
202
- <ul>
203
- <li>{description}</li>
204
- </ul>
205
- </div>
206
- """
207
-
208
- return html_template
209
-
210
-
211
- def analyze_details(self):
212
- self.aligned_sentences
213
- final_table = []
214
-
215
  for pair in self.aligned_sentences:
 
 
 
216
  input_words, source_words, input_indexes, source_indexes = (
217
  self.highlight_overlap_by_word_to_list(
218
  pair["input_sentence"],
219
  pair["matched_sentence"],
220
  )
 
 
 
 
221
  )
222
- final_table.append(
223
  (input_words, source_words, input_indexes, source_indexes),
224
  )
225
 
226
- if len(final_table) != 0:
227
- html_table = self.create_table(final_table)
228
  else:
229
  html_table = ""
230
  return html_table
@@ -257,6 +206,7 @@ class NewsVerification():
257
  # LαΊ·p qua cΓ‘c Δ‘oαΊ‘n so khα»›p
258
  for match in matcher.get_matching_blocks():
259
  start1, start2, length = match
 
260
 
261
  # ThΓͺm cΓ‘c tα»« khΓ΄ng trΓΉng lαΊ·p vΓ o (giα»― nguyΓͺn)
262
  highlighted_text1.extend(words1[current_pos1:start1])
@@ -273,28 +223,152 @@ class NewsVerification():
273
  current_pos2 = start2 + length
274
 
275
  return words1, words2, index1, index2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
276
 
277
- def create_table(self, data):
278
- table_rows = "\n".join([self.format_pair(pair) for pair in data])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
279
  return f"""
280
- <h5>Comparison between input news and <a href={self.text_referent_url} target="_blank">source news</a></h5>
281
  <table border="1" style="width:100%; text-align:left; border-collapse:collapse;">
282
  <thead>
283
  <tr>
284
- <th>Input sentence</th>
285
- <th>Source sentence</th>
 
 
286
  </tr>
287
  </thead>
288
  <tbody>
289
- {table_rows}
290
  </tbody>
291
  </table>
 
 
292
  """
293
 
294
- def format_pair(self, pair):
295
- input_sentence = self.highlight_text(pair[0], pair[2])
296
- source_sentence = self.highlight_text(pair[1], pair[3])
297
- return f"<tr><td>{input_sentence}</td><td>{source_sentence}</td></tr>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
298
 
299
  def highlight_text(self, words, indexes):
300
  final_words = words
 
1
  from difflib import SequenceMatcher
2
+ import difflib
3
+ from src.application.highlight_text import generate_color
4
  from src.application.image.image_detection import detect_image_by_ai_model, detect_image_by_reverse_search, detect_image_from_news_image
5
  from src.application.text.model_detection import detect_text_by_ai_model
6
+ from src.application.text.preprocessing import split_into_sentences
7
  from src.application.text.search_detection import check_human, detect_text_by_relative_search
8
 
9
 
 
14
  self.news_content = ""
15
  self.news_image = ""
16
 
17
+ self.text_prediction_label:list[str] = []
18
+ self.text_prediction_score:list[float] = []
19
+ self.text_referent_url:list[str] = []
20
+ self.image_prediction_label:list[str] = []
21
+ self.image_prediction_score:list[str] = []
22
+ self.image_referent_url:list[str] = []
23
  self.news_prediction_label = ""
24
  self.news_prediction_score = -1
25
 
26
+ self.found_img_url:list[str] = []
27
+ self.aligned_sentences:list[dict] = []
28
+ self.is_paraphrased:list[bool] = []
29
+ self.analyzed_table:list[list] = []
30
 
31
  def load_news(self, news_title, news_content, news_image):
32
  self.news_text = news_title + "\n\n" + news_content
 
49
  print("CHECK TEXT:")
50
  print("\tFrom search engine:")
51
  # Classify by search engine
52
+ input_sentences = split_into_sentences(self.news_text)
53
+ for sentence in input_sentences:
54
+ paraphrase, text_url, aligned_sentence, img_urls = detect_text_by_relative_search(sentence)
55
 
56
+ text_prediction_label = "UNKNOWN"
57
+ if paraphrase is False:
58
+ # Classify text by AI model
59
+ print("\tFrom AI model:")
60
+ text_prediction_label, text_prediction_score = detect_text_by_ai_model(sentence)
61
+ if aligned_sentence == []:
62
+ aligned_sentence = {
63
+ "input_sentence": sentence,
64
+ "matched_sentence": "",
65
+ "similarity": text_prediction_score,
66
+ "is_paraphrase_sentence": False,
67
+ "url": "",
68
+ }
69
  else:
70
+ self.found_img_url.extend(img_urls)
71
+ text_prediction_score = aligned_sentence["similarity"]
72
+ if check_human(aligned_sentence):
73
+ text_prediction_label = "HUMAN"
74
+ else:
75
+ text_prediction_label = "MACHINE"
76
+
77
+ print(f"\ttext_prediction_label: {text_prediction_label}\n")
78
+ self.text_prediction_label.append(text_prediction_label)
79
+ self.aligned_sentences.append(aligned_sentence)
80
+ self.is_paraphrased.append(paraphrase)
81
+ self.text_referent_url.append(text_url)
82
+ self.text_prediction_score.append(text_prediction_score)
83
+ paraphrase = False
84
+ text_url = ""
85
+ aligned_sentence = {}
86
+ img_urls = []
87
 
88
  def detect_image_origin(self):
89
  print("CHECK IMAGE:")
 
147
  def generate_analysis_report(self):
148
  self.determine_text_origin()
149
  self.detect_image_origin()
 
 
 
 
 
 
 
 
 
 
 
 
150
 
151
+ def analyze_details(self):
152
+ self.analyzed_table = []
153
+ # IMAGES:
 
 
 
 
 
 
 
 
 
154
 
 
 
155
 
156
+ # TEXT
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
  for pair in self.aligned_sentences:
158
+ print(f"pair: {pair}")
159
+ if "input_sentence" not in pair:
160
+ continue
161
  input_words, source_words, input_indexes, source_indexes = (
162
  self.highlight_overlap_by_word_to_list(
163
  pair["input_sentence"],
164
  pair["matched_sentence"],
165
  )
166
+ # self.compare_sentences(
167
+ # pair["input_sentence"],
168
+ # pair["matched_sentence"],
169
+ # )
170
  )
171
+ self.analyzed_table.append(
172
  (input_words, source_words, input_indexes, source_indexes),
173
  )
174
 
175
+ if len(self.analyzed_table) != 0:
176
+ html_table = self.create_table()
177
  else:
178
  html_table = ""
179
  return html_table
 
206
  # LαΊ·p qua cΓ‘c Δ‘oαΊ‘n so khα»›p
207
  for match in matcher.get_matching_blocks():
208
  start1, start2, length = match
209
+ print(start1, start2, length)
210
 
211
  # ThΓͺm cΓ‘c tα»« khΓ΄ng trΓΉng lαΊ·p vΓ o (giα»― nguyΓͺn)
212
  highlighted_text1.extend(words1[current_pos1:start1])
 
223
  current_pos2 = start2 + length
224
 
225
  return words1, words2, index1, index2
226
+
227
+
228
+ def get_text_urls(self):
229
+ return set(self.text_referent_url)
230
+
231
+ def generate_colors_list(self, set_urls):
232
+ color_dict = {}
233
+ num_urls = len(set_urls)
234
+ for i in range(num_urls):
235
+ color_dict[i] = generate_color(i, num_urls)
236
+
237
+ return color_dict
238
+
239
+ def analyze_details_2(self):
240
+ html_text = ""
241
+
242
+ self.analyzed_table = []
243
+ # TEXT
244
+ # Assign unique colors to each index
245
+ set_urls = self.get_text_urls()
246
+ color_dict = self.generate_colors_list(set_urls)
247
+
248
+ # position of the color in the input contents
249
+ position = 0
250
+ for pair in self.aligned_sentences:
251
+ if "input_sentence" not in pair:
252
+ continue
253
+ common_phrases, position = self.compare_sentences(
254
+ pair["input_sentence"],
255
+ pair["matched_sentence"],
256
+ position,
257
+ color_dict["0"], # TODO: set color
258
+ )
259
+
260
+
261
+ if len(self.analyzed_table) != 0:
262
+ html_table = self.create_table()
263
+ else:
264
+ html_table = ""
265
+ return html_text, html_table
266
+
267
+ def compare_sentences(self, sentence_1, sentence_2, position, color):
268
+ """
269
+ Compares two sentences and identifies common phrases, outputting their start and end positions.
270
 
271
+ Args:
272
+ sentence_1: The first sentence (string).
273
+ sentence_2: The second sentence (string).
274
+
275
+ Returns:
276
+ A list of dictionaries, where each dictionary represents a common phrase and contains:
277
+ - "phrase": The common phrase (string).
278
+ - "start_1": The starting index of the phrase in sentence_1 (int).
279
+ - "end_1": The ending index of the phrase in sentence_1 (int).
280
+ - "start_2": The starting index of the phrase in sentence_2 (int).
281
+ - "end_2": The ending index of the phrase in sentence_2 (int).
282
+ Returns an empty list if no common phrases are found. Handles edge cases like empty strings.
283
+ """
284
+
285
+ if not sentence_1 or not sentence_2: # Handle empty strings
286
+ return []
287
+
288
+ s = difflib.SequenceMatcher(None, sentence_1, sentence_2)
289
+ common_phrases = []
290
+
291
+ for block in s.get_matching_blocks():
292
+ if block.size > 0: # Ignore zero-length matches
293
+ start_1 = block.a
294
+ end_1 = block.a + block.size
295
+ start_2 = block.b
296
+ end_2 = block.b + block.size
297
+
298
+ phrase = sentence_1[start_1:end_1] # Or sentence_2[start_2:end_2], they are the same
299
+
300
+ common_phrases.append({
301
+ "phrase": phrase,
302
+ "start_1": start_1 + position,
303
+ "end_1": end_1 + position,
304
+ "start_2": start_2,
305
+ "end_2": end_2,
306
+ "color": color,
307
+ })
308
+ position += len(sentence_1)
309
+ return common_phrases, position
310
+
311
+ def create_table(self):
312
+ #table_rows = "\n".join([self.format_row(row) for row in self.analyzed_table])
313
+ # loop of self.analyzed_table with index:
314
+ rows = []
315
+ max_length = 30 # TODO: put this in configuration
316
+ rows.append(self.format_image_row(max_length))
317
+
318
+ for index, row in enumerate(self.analyzed_table):
319
+ formatted_row = self.format_text_row(row, index, max_length)
320
+ rows.append(formatted_row)
321
+ table = "\n".join(rows)
322
  return f"""
323
+ <h5>Comparison between input news and source news</h5>
324
  <table border="1" style="width:100%; text-align:left; border-collapse:collapse;">
325
  <thead>
326
  <tr>
327
+ <th>Input news</th>
328
+ <th>Source content</th>
329
+ <th>Forensic</th>
330
+ <th>Originality</th>
331
  </tr>
332
  </thead>
333
  <tbody>
334
+ {table}
335
  </tbody>
336
  </table>
337
+
338
+ <style>
339
  """
340
 
341
+ def format_text_row(self, row, index = 0, max_length=30):
342
+ input_sentence = self.highlight_text(row[0], row[2]) # text, index of highlight words
343
+ source_sentence = self.highlight_text(row[1], row[3]) # text, index of highlight words
344
+
345
+ url = self.aligned_sentences[index]["url"] #
346
+ short_url = self.shorten_url(url, max_length)
347
+ source_text_url = f"""<a href="{url}">{short_url}</a>"""
348
+
349
+ # short_url = self.shorten_url(self.text_referent_url[index], max_length)
350
+ # source_text_url = f"""<a href="{self.text_referent_url[index]}">{short_url}</a>"""
351
+
352
+ self.text_prediction_score[index]
353
+ return f"""<tr><td>{input_sentence}</td><td>{source_sentence}</td><td>{self.text_prediction_label[index]}<br>({self.text_prediction_score[index]*100:.2f}%)</td><td>{source_text_url}</td></tr>"""
354
+
355
+ def format_image_row(self, max_length=30):
356
+ # input_image = f"""<img src="{self.news_image}" width="200" height="150">"""
357
+ print(f"self.news_image = {self.news_image}")
358
+ source_image = f"""<img src="{self.image_referent_url}" width="200" height="150">"""
359
+ short_url = self.shorten_url(self.image_referent_url, max_length)
360
+ source_image_url = f"""<a href="{self.image_referent_url}">{short_url}</a>"""
361
+ return f"""<tr><td>input image</td><td>{source_image}</td><td>{self.image_prediction_label}<br>({self.image_prediction_score:.2f}%)</td><td>{source_image_url}</td></tr>"""
362
+
363
+ def shorten_url(self, url, max_length=30):
364
+ if url is None:
365
+ return ""
366
+
367
+ if len(url) > max_length:
368
+ short_url = url[:max_length] + "..."
369
+ else:
370
+ short_url = url
371
+ return short_url
372
 
373
  def highlight_text(self, words, indexes):
374
  final_words = words
src/application/highlight_text.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import colorsys
3
+ from functools import partial
4
+ import random
5
+
6
+
7
+ def lighten_color(hex_color, factor=1.8):
8
+ """Lightens a HEX color by increasing its brightness in HSV space."""
9
+
10
+ hex_color = hex_color.lstrip("#")
11
+ r, g, b = int(hex_color[0:2], 16), int(hex_color[2:4], 16), int(hex_color[4:6], 16)
12
+
13
+ # Convert to HSV
14
+ h, s, v = colorsys.rgb_to_hsv(r / 255.0, g / 255.0, b / 255.0)
15
+ v = min(1.0, v * factor) # Increase brightness
16
+
17
+ # Convert back to HEX
18
+ r, g, b = [int(c * 255) for c in colorsys.hsv_to_rgb(h, s, v)]
19
+ return f'#{r:02x}{g:02x}{b:02x}'
20
+
21
+ def darken_color(hex_color, factor=0.7):
22
+ """Darkens a hex color by reducing its brightness in the HSV space."""
23
+
24
+ hex_color = hex_color.lstrip("#")
25
+ r, g, b = int(hex_color[0:2], 16), int(hex_color[2:4], 16), int(hex_color[4:6], 16)
26
+
27
+ # Convert to HSV to adjust brightness
28
+ h, s, v = colorsys.rgb_to_hsv(r / 255.0, g / 255.0, b / 255.0)
29
+ v = max(0, v * factor) # Reduce brightness
30
+
31
+ # Convert back to HEX
32
+ r, g, b = [int(c * 255) for c in colorsys.hsv_to_rgb(h, s, v)]
33
+ return f'#{r:02x}{g:02x}{b:02x}'
34
+
35
+ # Generate unique colors for pairs
36
+ def generate_color(index, total_colors=20):
37
+ """Generates a unique, evenly spaced color for each index using HSL."""
38
+
39
+ hue = index / total_colors # Spread hues in range [0,1]
40
+ saturation = 0.65 # Keep colors vivid
41
+ lightness = 0.75 # Balanced brightness
42
+
43
+ # Convert HSL to RGB
44
+ r, g, b = colorsys.hls_to_rgb(hue, lightness, saturation)
45
+ r, g, b = int(r * 255), int(g * 255), int(b * 255)
46
+
47
+ return f'#{r:02x}{g:02x}{b:02x}' # Convert to hex
48
+
49
+ def highlight_pairs(text1, text2):
50
+ """Highlight matching pairs between two paragraphs"""
51
+ # Predefined matching pairs
52
+ match_pairs = [
53
+ {"index": 1, "text1": "deep learning", "start1": 13, "end1": 26,
54
+ "text2": "deep learning", "start2": 12, "end2": 25},
55
+ {"index": 2, "text1": "neural networks", "start1": 56, "end1": 71,
56
+ "text2": "neural networks", "start2": 68, "end2": 83},
57
+ {"index": 3, "text1": "AI research", "start1": 86, "end1": 97,
58
+ "text2": "AI research", "start2": 55, "end2": 66},
59
+ ]
60
+
61
+ # Assign unique colors to each index
62
+ pair_colors = {pair["index"]: generate_color(pair["index"], total_colors=len(match_pairs)) for pair in match_pairs}
63
+
64
+
65
+ def apply_highlight(text, pairs, key_start, key_end, key_index, pair_colors):
66
+ highlighted_text = ""
67
+ prev_end = 0
68
+
69
+ for pair in sorted(pairs, key=lambda x: x[key_start]):
70
+ start, end, index = pair[key_start], pair[key_end], pair[key_index]
71
+ color = pair_colors.get(index, "#ddd") # Default color if not found
72
+ color = lighten_color(color, factor=2.2) # Lightened color for background text
73
+ label_color = darken_color(color, factor=0.7) # Make label color darker
74
+
75
+ # Style the index as a label
76
+ index_label = (f'<span style="background-color:{label_color}; color:white; '
77
+ f'padding:1px 4px; border-radius:4px; font-size:12px; '
78
+ f'font-weight:bold; display:inline-block; margin-right:4px;">{index}</span>')
79
+
80
+ # Append non-highlighted text
81
+ highlighted_text += text[prev_end:start]
82
+ # Append highlighted text with index label
83
+ highlighted_text += (f'<span style="background-color:{color}; '
84
+ f'border-radius:3px; font-size:14px; display:inline-block;">'
85
+ f'{index_label} {text[start:end]}</span>')
86
+ prev_end = end
87
+
88
+ # Append remaining text
89
+ highlighted_text += text[prev_end:]
90
+ return highlighted_text
91
+
92
+ # Apply highlighting to both paragraphs using the global MATCH_PAIRS
93
+ highlighted_text1 = apply_highlight(text1, match_pairs, "start1", "end1", "index", pair_colors)
94
+ highlighted_text2 = apply_highlight(text2, match_pairs, "start2", "end2", "index", pair_colors)
95
+
96
+ return highlighted_text1, highlighted_text2
97
+
98
+ if __name__ == '__main__':
99
+ # Create Gradio Interface
100
+ text1 = ""
101
+
102
+ with gr.Blocks() as demo:
103
+ gr.Markdown("### Highlight Matching Parts Between Two Paragraphs")
104
+ text1_input = gr.Textbox(
105
+ label="Paragraph 1",
106
+ lines=5,
107
+ value="The field of deep learning is advancing rapidly. Modern neural networks are improving AI research significantly."
108
+ )
109
+ text2_input = gr.Textbox(
110
+ label="Paragraph 2",
111
+ lines=5,
112
+ value="Advances in deep learning have led to breakthroughs in AI research. Neural networks are at the core of these innovations"
113
+ )
114
+ output1 = gr.HTML()
115
+ output2 = gr.HTML()
116
+ submit_button = gr.Button("Highlight Matches")
117
+
118
+ submit_button.click(
119
+ fn=highlight_pairs,
120
+ inputs=[text1_input, text2_input],
121
+ outputs=[output1, output2]
122
+ )
123
+
124
+ # Launch the Gradio app
125
+ demo.launch()
src/application/text/search_detection.py CHANGED
@@ -60,9 +60,10 @@ def detect_text_by_relative_search(input_text, is_support_opposite = False):
60
  print(f"\t\t\t↑↑↑ More than {MAX_CHAR_SIZE} characters")
61
  continue
62
 
63
- is_paraphrase, aligned_sentences = check_paraphrase(input_text, page_text)
64
- if is_paraphrase:
65
- return is_paraphrase, url, aligned_sentences, content.images
 
66
  return False, None, [], []
67
 
68
  def longest_common_subsequence(arr1, arr2):
@@ -147,13 +148,13 @@ def check_sentence(input_sentence, source_sentence, min_same_sentence_len,
147
  return False
148
 
149
 
150
- def check_paraphrase(input_text, page_text, verbose=False):
151
  """
152
  Checks if the input text is paraphrased in the content at the given URL.
153
 
154
  Args:
155
  input_text: The text to check for paraphrase.
156
- url: The URL of the web page to compare with.
157
  verbose: If True, print debug information.
158
 
159
  Returns:
@@ -199,24 +200,31 @@ def check_paraphrase(input_text, page_text, verbose=False):
199
  similarity_matrix = util.cos_sim(embeddings1, embeddings2).cpu().numpy()
200
 
201
  # Find sentence alignments
202
- alignment = []
203
  paraphrased_sentence_count = 0
204
  for i, sentence1 in enumerate(input_sentences):
 
205
  max_sim_index = np.argmax(similarity_matrix[i])
206
  max_similarity = similarity_matrix[i][max_sim_index]
207
 
208
  is_paraphrase_sentence = max_similarity > PARAPHRASE_THRESHOLD
209
 
210
- if 0.80 < max_similarity < 0.99:
211
- print(f"\t\tinput_sentence : {sentence1}")
212
- print(f"\t\tmatched_sentence: {page_sentences[max_sim_index]}")
213
- print(f"\t\t--> similarity: {max_similarity}\n")
214
- item = {
215
- "input_sentence": sentence1,
216
- "matched_sentence": page_sentences[max_sim_index],
217
- "similarity": max_similarity,
218
- "is_paraphrase_sentence": is_paraphrase_sentence,
219
- }
 
 
 
 
 
 
220
 
221
  # Check for individual sentence paraphrase if overall paraphrase not yet found
222
  if not is_paraphrase_text and check_sentence(
@@ -227,12 +235,12 @@ def check_paraphrase(input_text, page_text, verbose=False):
227
  print(f"Paraphrase found for individual sentence: {sentence1}")
228
  print(f"Matched sentence: {page_sentences[max_sim_index]}")
229
 
230
- alignment.append(item)
231
  paraphrased_sentence_count += 1 if is_paraphrase_sentence else 0
232
 
233
  # Check if enough sentences are paraphrases
234
 
235
- is_paraphrase_text = paraphrased_sentence_count >= min_matching_sentences
236
 
237
  if verbose:
238
  print (f"\t\tparaphrased_sentence_count: {paraphrased_sentence_count}, min_matching_sentences: {min_matching_sentences}, total_sentence_count: {len(input_sentences)}")
@@ -261,7 +269,7 @@ def similarity_ratio(a, b):
261
  return 0.0 # Handle cases where inputs are not strings or None
262
  return SequenceMatcher(None, a, b).ratio()
263
 
264
- def check_human(alligned_sentences, min_ratio=MIN_RATIO_PARAPHRASE_NUM):
265
  """
266
  Checks if a sufficient number of input sentences are found within
267
  source sentences.
@@ -271,16 +279,8 @@ def check_human(alligned_sentences, min_ratio=MIN_RATIO_PARAPHRASE_NUM):
271
  """
272
  if not alligned_sentences: # Handle empty data case
273
  return False
274
- min_matching = math.ceil(len(alligned_sentences) * min_ratio)
275
 
276
- count = 0
277
-
278
- #for input_sentence, source_sentence, similiarity, is_paraprhase in data:
279
- for sentence in alligned_sentences:
280
- if sentence["similarity"] >= 0.99:
281
- count += 1
282
- print(f"\tmatching_sentence_count : {count}, min_matching: {min_matching}")
283
- if count >= min_matching:
284
  return True
285
  return False
286
 
 
60
  print(f"\t\t\t↑↑↑ More than {MAX_CHAR_SIZE} characters")
61
  continue
62
 
63
+ is_paraphrase, aligned_sentences = check_paraphrase(input_text, page_text, url)
64
+ #if is_paraphrase:
65
+ return is_paraphrase, url, aligned_sentences, content.images
66
+
67
  return False, None, [], []
68
 
69
  def longest_common_subsequence(arr1, arr2):
 
148
  return False
149
 
150
 
151
+ def check_paraphrase(input_text, page_text, url, verbose=False):
152
  """
153
  Checks if the input text is paraphrased in the content at the given URL.
154
 
155
  Args:
156
  input_text: The text to check for paraphrase.
157
+ page_text: The text of the web page to compare with.
158
  verbose: If True, print debug information.
159
 
160
  Returns:
 
200
  similarity_matrix = util.cos_sim(embeddings1, embeddings2).cpu().numpy()
201
 
202
  # Find sentence alignments
203
+ alignment = {}
204
  paraphrased_sentence_count = 0
205
  for i, sentence1 in enumerate(input_sentences):
206
+ print(f"allign: {i}")
207
  max_sim_index = np.argmax(similarity_matrix[i])
208
  max_similarity = similarity_matrix[i][max_sim_index]
209
 
210
  is_paraphrase_sentence = max_similarity > PARAPHRASE_THRESHOLD
211
 
212
+ if 0.80 > max_similarity:
213
+ alignment = {
214
+ "input_sentence": sentence1,
215
+ "matched_sentence": "",
216
+ "similarity": max_similarity,
217
+ "is_paraphrase_sentence": is_paraphrase_sentence,
218
+ "url": "",
219
+ }
220
+ else:
221
+ alignment = {
222
+ "input_sentence": sentence1,
223
+ "matched_sentence": page_sentences[max_sim_index],
224
+ "similarity": max_similarity,
225
+ "is_paraphrase_sentence": is_paraphrase_sentence,
226
+ "url": url,
227
+ }
228
 
229
  # Check for individual sentence paraphrase if overall paraphrase not yet found
230
  if not is_paraphrase_text and check_sentence(
 
235
  print(f"Paraphrase found for individual sentence: {sentence1}")
236
  print(f"Matched sentence: {page_sentences[max_sim_index]}")
237
 
238
+ #alignment.append(item)
239
  paraphrased_sentence_count += 1 if is_paraphrase_sentence else 0
240
 
241
  # Check if enough sentences are paraphrases
242
 
243
+ is_paraphrase_text = paraphrased_sentence_count > 0 #min_matching_sentences
244
 
245
  if verbose:
246
  print (f"\t\tparaphrased_sentence_count: {paraphrased_sentence_count}, min_matching_sentences: {min_matching_sentences}, total_sentence_count: {len(input_sentences)}")
 
269
  return 0.0 # Handle cases where inputs are not strings or None
270
  return SequenceMatcher(None, a, b).ratio()
271
 
272
+ def check_human(alligned_sentences):
273
  """
274
  Checks if a sufficient number of input sentences are found within
275
  source sentences.
 
279
  """
280
  if not alligned_sentences: # Handle empty data case
281
  return False
 
282
 
283
+ if alligned_sentences["similarity"] >= 0.99:
 
 
 
 
 
 
 
284
  return True
285
  return False
286