Spaces:

pmkhanh7890
/

news_verification

Running

File size: 13,407 Bytes

22e1b62

import warnings

import torchvision.transforms as transforms
from google_img_source_search import ReverseImageSearcher

# from src.images.CNN_model_classifier import predict_cnn
# from src.images.diffusion_model_classifier import (
#     ImageClassifier,
#     predict_single_image,
# )

warnings.simplefilter(
    action="ignore",
    category=FutureWarning,
)  # disable FutureWarning

import gradio as gr  # noqa: E402
from transformers import (  # noqa: E402
    AutoModelForSequenceClassification,
    AutoTokenizer,
    pipeline,
)

from src.texts.MAGE.deployment import (  # noqa: E402
    detect,
    preprocess,
)
from src.texts.PASTED.pasted_lexicon import Detector  # noqa: E402
from src.texts.Search_Text.search import (  # noqa: E402
    get_important_sentences,
    get_keywords,
    is_human_written,
)
from src.images.Search_Image.search import (
    compare_images,
    get_image_from_path,
    get_image_from_url,
)


def convert_score_range(score):
    """
    Rescale a score linearly so that 0 maps to -1 and 1 maps to 1.

    Args:
        score: The score to rescale; the formula is meant for values
            in the range [0, 1].

    Returns:
        ``2 * score - 1``, i.e. the score expressed in the range [-1, 1].
    """
    doubled = 2 * score
    return doubled - 1


def generate_highlighted_text(text_scores):
    """
    Render scored text segments as a run of HTML ``<span>`` elements.

    Each segment gets a red background whose alpha channel is
    ``1 - score``: a score of 1 is fully transparent and a score of 0 is
    solid red. The segment text is inserted as-is (it is not HTML-escaped).

    Args:
        text_scores: An iterable of ``(text, score)`` pairs.

    Returns:
        A single string with one ``<span>`` per pair, in input order.
    """
    spans = [
        f"<span style='background-color: rgba(255, 0, 0, {1 - score})'>"
        f"{segment}</span>"
        for segment, score in text_scores
    ]
    return "".join(spans)


def separate_characters_with_mask(text, mask):
    """Pair every character of a string with the same mask value.

    Args:
        text: The input string, split character by character.
        mask: The value attached to each character.

    Returns:
        A list of ``(character, mask)`` tuples in the order of ``text``.
    """
    pairs = []
    for character in text:
        pairs.append((character, mask))
    return pairs


def detect_ai_text(model_name, search_engine, text):
    """
    Classify ``text`` with the selected detector, optionally searching first.

    Args:
        model_name: One of "SimLLM", "MAGE", "chatgpt-detector-roberta" or
            "PASTED-Lexical". Any other value (including ``None``, which
            is what an unselected dropdown sends) runs no detector.
        search_engine: When exactly ``True``, the first important sentence
            of the text is checked with ``is_human_written`` before any
            model runs.
        text: The text to analyse.

    Returns:
        A list of ``(text, label_or_score)`` tuples, the shape consumed by
        the ``HighlightedText`` output:
        * search hit (``is_human_written`` returned -1): the caption
          "[Found exact match] " labelled 0, then the text labelled -1;
        * "PASTED-Lexical": one tuple per segment returned by the
          detector, with the score rescaled by ``convert_score_range``;
        * other known models: a single ``(text, label)`` tuple;
        * no/unknown model: ``(text, <search prediction or None>)``.
    """
    # Bound up front: previously an unselected or unknown model with the
    # search engine off reached the final zip() with `predictions`
    # undefined and raised UnboundLocalError.
    predictions = None

    if search_engine is True:
        keywords = get_keywords(text)
        important_sentences = get_important_sentences(text, keywords)
        # NOTE(review): assumes at least one important sentence is
        # returned; an empty result would raise here - confirm upstream.
        predictions = is_human_written(important_sentences[0])
        print("keywords: ", keywords)
        print("important_sentences: ", important_sentences)
        print("predictions: ", predictions)
        if predictions == -1:
            # -1 is reported to the user as an exact match.
            caption = "[Found exact match] "
            text_scores = list(zip([caption, text], [0, predictions]))
            print("text_scores: ", text_scores)
            return text_scores

    if model_name == "SimLLM":
        tokenize_input = SimLLM_tokenizer(text, return_tensors="pt")
        outputs = SimLLM_model(**tokenize_input)
        predicted_class = outputs.logits.argmax(dim=-1).item()
        # Class index 0 is reported as human-written, anything else as
        # machine-generated.
        predictions = (
            "human-written" if predicted_class == 0 else "machine-generated"
        )

    elif model_name == "MAGE":
        processed_text = preprocess(text)
        predictions = detect(
            processed_text,
            MAGE_tokenizer,
            MAGE_model,
            device,
        )

    elif model_name == "chatgpt-detector-roberta":
        label = roberta_pipeline_en(text)[0]["label"]
        # Every label other than "Human" is treated as machine-generated.
        predictions = (
            "human-written" if label == "Human" else "machine-generated"
        )

    elif model_name == "PASTED-Lexical":
        predictions = detector(text)

    if model_name != "PASTED-Lexical":
        return [(text, predictions)]

    # PASTED yields (segment, score) pairs; rescale each score. A separate
    # loop name is used so the `text` parameter is not overwritten.
    return [
        (segment, convert_score_range(score))
        for segment, score in predictions
    ]


# Checkpoint locations for the image classifiers. In this file they are
# referenced only by the commented-out classifier code.
cnn_model_path = "src/images/CNN/model_checkpoints/blur_jpg_prob0.5.pth"
diffusion_model_path = (
    "src/images/Diffusion/model_checkpoints"
    "/image-classifier-step=7007-val_loss=0.09.ckpt"
)


def _render_image_match(heading, page_url, image_url, difference):
    """Build the HTML summary (heading + details list) for the closest hit."""
    import html  # function-scope import keeps this block self-contained

    # Escape before interpolating: the URLs come from external search
    # results and are placed inside attribute values and element text.
    page = html.escape(str(page_url), quote=True)
    image = html.escape(str(image_url), quote=True)
    return (
        f"<h1>{heading}</h1>"
        "<ul>"
        f'<li>\nPage URL: <a href="{page}">{page}</a></li>'
        f'<li>\nImage URL: <a href="{image}">{image}</a></li>'
        f"<li>\nDifference score: {difference}</li>"
        "</ul>"
    )


def detect_ai_image(input_image_path, search_engine=True):
    """
    Reverse-search an image and report the closest search result.

    Every search result image is compared with the input image via
    ``compare_images``; the result with the smallest difference below
    5000 is reported. The search stops early on a difference of 0.

    Args:
        input_image_path: Path of the image file to look up. A falsy value
            (``None`` or "", e.g. when the image widget is cleared) returns
            an empty string without searching.
        search_engine: Currently unused; the search always runs. It has a
            default so the function can be called with the image path
            alone, which is how the UI in this file wires it.

    Returns:
        An HTML string:
        * difference 0: "LIKELY SIMILAR" heading with page/image links;
        * difference strictly between 0 and 10: "VARIATION" heading;
        * any other difference below 5000: "not similar" heading;
        * no result below 5000: "No search result found."
    """
    if not input_image_path:
        return ""

    no_match_threshold = 5000  # results at or above this are ignored

    searcher = ReverseImageSearcher()
    search_items = searcher.search_by_file(input_image_path)
    query_image = get_image_from_path(input_image_path)

    min_result_difference = no_match_threshold
    result_image_url = ""
    result_page_url = ""

    for search_item in search_items:
        # Compare each search result image with the input image
        result_image = get_image_from_url(search_item.image_url)
        result_difference = compare_images(result_image, query_image)

        print(f"Difference with search result: {result_difference}")
        print(f"Result image url: {search_item.page_url}\n")

        if min_result_difference > result_difference:
            min_result_difference = result_difference
            result_image_url = search_item.image_url
            result_page_url = search_item.page_url

        if result_difference == 0:
            break  # exact match, nothing can be closer

    # The minimum only ever decreases from the threshold, so reaching it
    # means no result was accepted.
    if min_result_difference >= no_match_threshold:
        return "<h1>No search result found.</h1>"

    if min_result_difference == 0:
        heading = "Input image is LIKELY SIMILAR to image from:"
    elif 10 > min_result_difference > 0:
        heading = "Input image is potentially a VARIATION from:"
    else:
        heading = "Input image is not similar to any search results."

    return _render_image_match(
        heading,
        result_page_url,
        result_image_url,
        min_result_difference,
    )

    # def get_prediction_diffusion(image):
    #     model = ImageClassifier.load_from_checkpoint(diffusion_model_path)

    #     prediction = predict_single_image(image, model)
    #     return (prediction >= 0.5, prediction)

    # def get_prediction_cnn(image):
    #     prediction = predict_cnn(image, cnn_model_path)
    #     return (prediction >= 0.5, prediction)

    # # Define the transformations for the image
    # transform = transforms.Compose(
    #     [
    #         transforms.Resize((224, 224)),  # Image size expected by ResNet50
    #         transforms.ToTensor(),
    #         transforms.Normalize(
    #             mean=[0.485, 0.456, 0.406],
    #             std=[0.229, 0.224, 0.225],
    #         ),
    #     ],
    # )
    # image_tensor = transform(inp)
    # pred_diff, prob_diff = get_prediction_diffusion(image_tensor)
    # pred_cnn, prob_cnn = get_prediction_cnn(image_tensor)
    # verdict = (
    #     "AI Generated" if (pred_diff or pred_cnn) else "No GenAI detected"
    # )
    # return (
    #     f"<h1>{verdict}</h1>"
    #     f"<ul>"
    #     f"<li>Diffusion detection score: {prob_diff:.1%} "
    #     f"{'(MATCH)' if pred_diff else ''}</li>"
    #     f"<li>CNN detection score: {prob_cnn:.1%} "
    #     f"{'(MATCH)' if pred_cnn else ''}</li>"
    #     f"</ul>"
    # )


# Define GPUs
# `device` is passed to the MAGE model and the PASTED detector below, and
# to detect() when MAGE is selected.
device = "cpu"  # use 'cuda:0' if GPU is available

# init MAGE
# Tokenizer and sequence classifier are loaded from the same id; only the
# model is moved to `device`.
model_dir = "yaful/MAGE"  # model in huggingface
MAGE_tokenizer = AutoTokenizer.from_pretrained(model_dir)
MAGE_model = AutoModelForSequenceClassification.from_pretrained(model_dir).to(
    device,
)

# init chatgpt-detector-roberta
# `model_dir` is rebound for each model in turn, so statement order matters.
# NOTE(review): no device is passed to pipeline() here - confirm that is
# intended if `device` is ever changed from "cpu".
model_dir = "Hello-SimpleAI/chatgpt-detector-roberta"  # model in huggingface
roberta_pipeline_en = pipeline(task="text-classification", model=model_dir)

# init PASTED
# The Detector instance is called directly on the input text by
# detect_ai_text.
model_dir = "linzw/PASTED-Lexical"
detector = Detector(model_dir, device)

# init SimLLM
# Loaded from a local directory rather than a hub id.
# NOTE(review): unlike MAGE, this model is not moved to `device` - confirm.
model_path = "./models/single_model_detector"
SimLLM_tokenizer = AutoTokenizer.from_pretrained(model_path)
SimLLM_model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Init variable for UI
# Page header rendered through gr.HTML at the top of the app. The bold
# credit line is now closed with </b>; it used to end with a second <b>,
# which left the tag open.
title = """
<center>

<h1> AI-generated content detection </h1>
<b> Demo by NICT & Tokyo Techies </b>

</center>
"""

# Sample rows for the text tab, in the order of its inputs:
# [detection model, use search engine, input text].
examples = [
    [
        "SimLLM",
        False,
        (
            "The BBC's long-running consumer rights series Watchdog is to end "
            "as a standalone programme, instead becoming part of The One Show. "
            "Watchdog began in 1980 as a strand of Nationwide, but proved so "
            "popular it became a separate programme in 1985. "
            "Co-host Steph McGovern has moved to Channel 4, but Matt Allwright "
            "and Nikki Fox will stay to front the new strand. "
            "The BBC said they would investigate viewer complaints all year "
            "round rather than for two series a year.\n"
        ),
    ],
    [
        "chatgpt-detector-roberta",
        False,
        (
            "Artificial intelligence (AI) is the science of making machines "
            "intelligent. "
            "It enables computers to learn from data, recognize patterns, and "
            "make decisions. "
            "AI powers many technologies we use daily, from voice assistants to "
            "self-driving cars. "
            "It's rapidly evolving, promising to revolutionize various "
            "industries and reshape the future."
        ),
    ],
]

# HTML footnote shown under the model selector, linking each detector to
# its source.
model_remark = (
    "<left>\n"
    "Model sources:\n"
    '<a href="https://github.com/Tokyo-Techies/prj-nict-ai-content-detection">SimLLM</a>,\n'  # noqa: E501
    '<a href="https://github.com/yafuly/MAGE">MAGE</a>,\n'
    '<a href="https://huggingface.co/Hello-SimpleAI/chatgpt-detector-roberta">chatgpt-detector-roberta</a>,\n'  # noqa: E501
    '<a href="https://github.com/Linzwcs/PASTED">PASTED-Lexical</a>.\n'
    "</left>\n"
)

# Example rows for the image tab: [image path or URL, caption].
image_samples = [
    ["src/images/samples/fake_dalle.jpg", "Generated (Dall-E)"],
    ["src/images/samples/fake_midjourney.png", "Generated (MidJourney)"],
    ["src/images/samples/fake_stable.jpg", "Generated (Stable Diffusion)"],
    ["src/images/samples/fake_cnn.png", "Generated (GAN)"],
    ["src/images/samples/real.png", "Organic"],
    [
        "https://p.potaufeu.asahi.com/1831-p/picture/27695628/89644a996fdd0cfc9e06398c64320fbe.jpg",  # noqa E501
        "Internet GenAI",
    ],
]
# First column only; handed to demo.launch(allowed_paths=...).
image_samples_path = [location for location, _caption in image_samples]

# UI
# Two tabs: "Text" runs detect_ai_text on any input change, "Images" runs
# detect_ai_image whenever the image changes.
with gr.Blocks() as demo:
    with gr.Row():
        gr.HTML(title)
    with gr.Row():
        with gr.Tab("Text"):
            with gr.Row():
                with gr.Column():
                    model = gr.Dropdown(
                        [
                            "SimLLM",
                            "MAGE",
                            "chatgpt-detector-roberta",
                            "PASTED-Lexical",
                        ],
                        label="Detection model",
                    )
                    search_engine = gr.Checkbox(label="Use search engine")
                    gr.HTML(model_remark)
                with gr.Column():
                    text_input = gr.Textbox(
                        label="Input text",
                        placeholder="Enter text here...",
                        lines=5,
                    )

            # Labels returned by the detectors are mapped to fixed colours.
            output = gr.HighlightedText(
                label="Detection results",
                combine_adjacent=True,
                show_legend=True,
                color_map={
                    "human-written": "#7d58cf",
                    "machine-generated": "#e34242",
                },
            )

            gr.Examples(
                examples=examples,
                inputs=[model, search_engine, text_input],
            )

            # Re-run detection whenever any of the three inputs changes;
            # listeners are registered in the same order as before.
            for trigger in (model, search_engine, text_input):
                trigger.change(
                    detect_ai_text,
                    inputs=[model, search_engine, text_input],
                    outputs=output,
                )
        with gr.Tab("Images"):
            with gr.Row():
                input_image = gr.Image(type="filepath")
                with gr.Column():
                    output_image = gr.Markdown(height=400)
            gr.Examples(
                examples=image_samples,
                inputs=input_image,
            )

            input_image.change(
                detect_ai_image,
                inputs=input_image,
                outputs=output_image,
            )


# allowed_paths lets the app serve the sample image files listed above.
demo.launch(allowed_paths=image_samples_path, share=True)