|
import gradio as gr |
|
import pandas as pd |
|
import torch |
|
from extract_text import extract_text_from_image |
|
from text_similarity import analyze_similarity |
|
|
|
def process_image(image, key_texts, similarity_threshold, fragment_threshold):
    """Extract text from an image and compare it against the key texts.

    Args:
        image: The uploaded image, or None when nothing was uploaded.
        key_texts: Reference texts, either a newline-separated string or an
            already-split sequence of strings.
        similarity_threshold: Direct-similarity threshold as a percentage;
            divided by 100 before being handed to ``analyze_similarity``.
        fragment_threshold: Fragment threshold as a percentage; divided by
            100 before being handed to ``analyze_similarity``.

    Returns:
        A 7-tuple ``(report, statistics, similar, fragments, combined,
        extracted_texts, gpu_available)``. On a validation failure or an
        exception, ``report`` carries the message and the remaining six
        entries are None, so every exit path has the same length.
    """

    def failure(message):
        # The success path returns seven values; error paths must match it,
        # otherwise the UI receives fewer values than it has outputs.
        return (message,) + (None,) * 6

    try:
        if image is None:
            return failure("Please upload an image for analysis.")

        # Normalise the key texts to a list of non-blank, stripped lines.
        if isinstance(key_texts, str):
            key_texts = [line.strip() for line in key_texts.split('\n') if line.strip()]

        # Covers None, an empty/whitespace-only string and an empty list.
        if not key_texts:
            return failure("Please enter key texts for comparison.")

        gpu_available = torch.cuda.is_available()
        extracted_texts = extract_text_from_image(image, gpu_available)

        results = analyze_similarity(
            extracted_texts,
            key_texts,
            similarity_threshold=similarity_threshold / 100,
            fragment_threshold=fragment_threshold / 100,
        )

        html_report = generate_html_report(results)
        dfs = generate_results_dataframe(results)

        # Categories without matches are absent from `dfs`; fall back to
        # empty frames that still carry the expected column headers.
        pair_columns = ["Index", "Original Text", "Key Text", "Similarity"]
        combined_columns = ["Indices", "Text 1", "Text 2", "Combined Text", "Key Text", "Similarity"]
        df_statistics = dfs.get("statistics", pd.DataFrame())
        df_similar = dfs.get("similar", pd.DataFrame(columns=pair_columns))
        df_fragments = dfs.get("fragments", pd.DataFrame(columns=pair_columns))
        df_combined = dfs.get("combined", pd.DataFrame(columns=combined_columns))

        return (
            html_report,
            df_statistics,
            df_similar,
            df_fragments,
            df_combined,
            extracted_texts,
            gpu_available,
        )

    except Exception as e:
        # UI boundary: report the failure in the report slot instead of raising.
        return failure(f"Erro ao processar: {str(e)}")
|
|
|
def process_manual_input(texts, key_texts, similarity_threshold, fragment_threshold):
    """Run the similarity analysis on text entered by hand.

    Both thresholds arrive as percentages and are divided by 100 before
    being passed to ``analyze_similarity``.

    Returns a 5-tuple: the HTML report followed by the statistics,
    direct-similarity, fragment and combined DataFrames. If either input is
    blank, or the analysis raises, the first entry is a message and the
    other four are None.
    """
    both_provided = texts.strip() and key_texts.strip()
    if not both_provided:
        return (
            "Please enter texts for analysis and key texts for comparison.",
            None,
            None,
            None,
            None,
        )

    pair_columns = ["Index", "Original Text", "Key Text", "Similarity"]
    combined_columns = ["Indices", "Text 1", "Text 2", "Combined Text", "Key Text", "Similarity"]

    try:
        results = analyze_similarity(
            texts,
            key_texts,
            similarity_threshold=similarity_threshold / 100,
            fragment_threshold=fragment_threshold / 100,
        )
        html_report = generate_html_report(results)
        frames = generate_results_dataframe(results)

        # A category with no matches has no entry in `frames`; substitute an
        # empty frame with the right headers.
        return (
            html_report,
            frames.get("statistics", pd.DataFrame()),
            frames.get("similar", pd.DataFrame(columns=pair_columns)),
            frames.get("fragments", pd.DataFrame(columns=pair_columns)),
            frames.get("combined", pd.DataFrame(columns=combined_columns)),
        )
    except Exception as e:
        return f"Erro ao processar: {str(e)}", None, None, None, None
|
|
|
def _render_similarity_table(title, headers, rows):
    """Render one titled HTML table of similarity matches.

    Args:
        title: Heading placed above the table.
        headers: Column captions; the last one is centred, the rest are
            left-aligned.
        rows: Iterable of sequences whose last element is the similarity
            ratio (rendered as a percentage) and whose other elements are
            text cells.

    Returns:
        The ``<h4>`` heading plus the ``<table>`` markup as one string.
    """
    from html import escape

    cell = "padding: 8px; border: 1px solid #ddd;"
    parts = [
        f"<h4>{title}</h4>",
        "<table width='100%' style='border-collapse: collapse; margin-bottom: 20px;'>",
        "<tr>",
    ]
    last = len(headers) - 1
    for position, caption in enumerate(headers):
        align = "center" if position == last else "left"
        parts.append(f"<th style='text-align: {align}; {cell}'>{caption}</th>")
    parts.append("</tr>")

    for *text_cells, similarity in rows:
        parts.append("<tr>")
        # Cell text comes from OCR or user input, so escape it rather than
        # letting it be interpreted as markup.
        parts.extend(f"<td style='{cell}'>{escape(str(text))}</td>" for text in text_cells)
        parts.append(f"<td style='text-align: center; {cell}'>{similarity:.2%}</td>")
        parts.append("</tr>")

    parts.append("</table>")
    return "".join(parts)


def generate_html_report(results):
    """Generate an HTML report about the detected similarities.

    Args:
        results: Mapping with a ``statistics`` entry (``total_analyzed`` and
            ``total_processed``) and three lists of match dicts:
            ``similar_texts``, ``fragments_detected`` and ``combined``.

    Returns:
        The report as one HTML string: a summary box, then one table per
        non-empty category, or a "nothing found" paragraph when all three
        categories are empty.

    Raises:
        KeyError: If ``results`` or a match dict lacks an expected key.
    """
    stats = results["statistics"]
    similar = results["similar_texts"]
    fragments = results["fragments_detected"]
    combined = results["combined"]

    parts = [
        "<h2>Similarity Report</h2>",
        # The opening tag previously lacked "style='", leaving it malformed.
        "<div style='padding: 15px; border-radius: 5px; margin-bottom: 20px;'>",
        f"<p><b>Total texts analyzed:</b> {stats['total_analyzed']}</p>",
        f"<p><b>Texts with detected similarity:</b> {stats['total_processed']}</p>",
        "</div>",
        "<h3>Detected Similarities</h3>",
    ]

    pair_headers = ["Original Text", "Key Text", "Similarity"]

    if similar:
        parts.append(_render_similarity_table(
            "Direct Similar Texts",
            pair_headers,
            [(item['text'], item['key_text'], item['similarity']) for item in similar],
        ))

    if fragments:
        parts.append(_render_similarity_table(
            "Text with Detected Fragments",
            pair_headers,
            [(item['text'], item['key_text'], item['similarity']) for item in fragments],
        ))

    if combined:
        parts.append(_render_similarity_table(
            "Text that need to be combined",
            ["Text 1", "Text 2", "Combination", "Key Text", "Similarity"],
            [
                (
                    item['texts'][0],
                    item['texts'][1],
                    item['combined_text'],
                    item['key_text'],
                    item['similarity'],
                )
                for item in combined
            ],
        ))

    if not (similar or fragments or combined):
        parts.append("<p>No significant similarity found with the current parameters.</p>")

    return "".join(parts)
|
|
|
def generate_results_dataframe(results):
    """Turn the analysis results into DataFrames keyed by category.

    The returned dict always has a ``statistics`` frame. The ``similar``,
    ``fragments`` and ``combined`` frames are present only when the matching
    result list is non-empty. Similarity values are stored as strings
    formatted as percentages with two decimals.
    """
    frames = {}
    pair_columns = ["Index", "Original Text", "Key Text", "Similarity"]

    # Direct matches and fragment matches share the same row layout.
    single_text_categories = (
        ("similar", "similar_texts"),
        ("fragments", "fragments_detected"),
    )
    for frame_name, result_key in single_text_categories:
        matches = results[result_key]
        if not matches:
            continue
        rows = []
        for match in matches:
            percent = format(match['similarity'], ".2%")
            rows.append((match['index'], match['text'], match['key_text'], percent))
        frames[frame_name] = pd.DataFrame(rows, columns=pair_columns)

    # Combined matches refer to two source texts and their concatenation.
    pairs = results["combined"]
    if pairs:
        rows = []
        for match in pairs:
            first_index, second_index = match['indices'][0], match['indices'][1]
            rows.append((
                f"{first_index},{second_index}",
                match['texts'][0],
                match['texts'][1],
                match['combined_text'],
                match['key_text'],
                format(match['similarity'], ".2%"),
            ))
        frames["combined"] = pd.DataFrame(
            rows,
            columns=["Indices", "Text 1", "Text 2", "Combined Text", "Key Text", "Similarity"],
        )

    # Summary counters, one (metric, value) row each.
    stats = results["statistics"]
    metric_keys = (
        ("Total analyzed", "total_analyzed"),
        ("Total with similarity", "total_processed"),
        ("Direct similarity", "direct_similarity"),
        ("Fragments", "fragments"),
        ("Combined", "combined"),
    )
    summary_rows = [(label, stats[key]) for label, key in metric_keys]
    frames["statistics"] = pd.DataFrame(summary_rows, columns=["Metric", "Value"])

    return frames
|
|
|
def _threshold_slider(label):
    """Create a percentage slider (50-100, default 70, step 1) with *label*.

    Must be called while a Gradio layout context is active so the component
    is placed in that container.
    """
    return gr.Slider(label=label, minimum=50, maximum=100, value=70, step=1)


def _result_tables():
    """Create the four result tables, each in its own sub-tab.

    Must be called while a Gradio layout context is active.

    Returns:
        The Dataframe components in the order
        ``(statistics, similar, fragments, combined)``.
    """
    with gr.Tabs():
        with gr.TabItem("Statistics"):
            statistics = gr.Dataframe(label="Statistics")
        with gr.TabItem("Direct Similarity"):
            similar = gr.Dataframe(label="Direct Similar Texts")
        with gr.TabItem("Fragments"):
            fragments = gr.Dataframe(label="Texts with Fragments")
        with gr.TabItem("Combined"):
            combined = gr.Dataframe(label="Combined Texts")
    return statistics, similar, fragments, combined


def generate_gradio():
    """Build the Gradio interface for the similarity detector.

    The interface has two tabs: "Image Analysis", wired to ``process_image``,
    and "Manual Analysis", wired to ``process_manual_input``.

    Returns:
        The assembled ``gr.Blocks`` app. It is not launched here.
    """
    intro = "\n".join([
        "This tool analyzes the similarity between texts extracted from an image and reference key texts.",
        "It can identify:",
        "- Direct similar texts",
        "- Key text fragments within the texts",
        "- Text combinations that match key texts",
    ])

    with gr.Blocks(title="Text Similarity Detector") as demo:
        gr.Markdown("# 🔍 Text Similarity Detector with Image Extraction")
        gr.Markdown(intro)

        with gr.Tabs():
            with gr.TabItem("Image Analysis"):
                with gr.Row():
                    # Left column: inputs.
                    with gr.Column(scale=1):
                        input_image = gr.Image(
                            label="Upload an image to extract text",
                            type="pil",
                            height=600,
                        )
                        key_texts_image = gr.Textbox(
                            label="Key Texts for Comparison",
                            placeholder="Paste your key texts here (one per line)",
                            lines=5,
                        )
                        with gr.Row():
                            similarity_threshold_image = _threshold_slider("Similarity Threshold (%)")
                            fragment_threshold_image = _threshold_slider("Fragment Similarity Threshold (%)")
                        analyze_image_btn = gr.Button("Analyze Image", variant="primary")

                    # Right column: outputs.
                    with gr.Column(scale=1):
                        gpu_available = gr.Checkbox(label="Used GPU")
                        extracted_texts = gr.Textbox(label="Extracted Texts from the Image", lines=5)
                        html_output = gr.HTML(label="Similarity Report")
                        (
                            statistics_output,
                            similar_texts_output,
                            fragments_output,
                            combined_output,
                        ) = _result_tables()

            with gr.TabItem("Manual Analysis"):
                with gr.Row():
                    # Left column: inputs.
                    with gr.Column(scale=1):
                        input_texts = gr.Textbox(
                            label="List of Texts for Analysis",
                            placeholder="Paste your list of texts here (one per line)",
                            lines=10,
                        )
                        key_texts_input = gr.Textbox(
                            label="Key Texts for Comparison",
                            placeholder="Paste your key texts here (one per line)",
                            lines=5,
                        )
                        with gr.Row():
                            similarity_threshold = _threshold_slider("Similarity Threshold (%)")
                            fragment_threshold = _threshold_slider("Fragment Similarity Threshold (%)")
                        # This tab analyzes pasted text, not an image; the
                        # label used to be a copy of the image tab's button.
                        analyze_btn = gr.Button("Analyze Texts", variant="primary")

                    # Right column: outputs.
                    with gr.Column(scale=1):
                        html_output_manual = gr.HTML(label="Manual Similarity Report")
                        (
                            statistics_output_manual,
                            similar_texts_output_manual,
                            fragments_output_manual,
                            combined_output_manual,
                        ) = _result_tables()

        # The handler returns one value per component listed in `outputs`,
        # in this order.
        analyze_image_btn.click(
            process_image,
            inputs=[input_image, key_texts_image, similarity_threshold_image, fragment_threshold_image],
            outputs=[
                html_output,
                statistics_output,
                similar_texts_output,
                fragments_output,
                combined_output,
                extracted_texts,
                gpu_available,
            ],
        )

        analyze_btn.click(
            process_manual_input,
            inputs=[input_texts, key_texts_input, similarity_threshold, fragment_threshold],
            outputs=[
                html_output_manual,
                statistics_output_manual,
                similar_texts_output_manual,
                fragments_output_manual,
                combined_output_manual,
            ],
        )

    return demo
|
|
|
|
|
|
|
if __name__ == "__main__":
    # generate_gradio is a factory: call it to build the app, then launch
    # the returned Blocks instance. The function object has no .launch().
    generate_gradio().launch()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|