Spaces:
Sleeping
Sleeping
import gradio as gr | |
import pandas as pd | |
import torch | |
from extract_text import extract_text_from_image | |
from text_similarity import analyze_similarity | |
def process_image(image, key_texts, similarity_threshold, fragment_threshold):
    """Extract text from an image and analyze its similarity to the key texts.

    Args:
        image: The uploaded image, or ``None`` when nothing was uploaded.
        key_texts: Key texts as a single string, one key text per line.
        similarity_threshold: Direct-similarity threshold, as a percentage.
        fragment_threshold: Fragment-similarity threshold, as a percentage.

    Returns:
        A 7-tuple ``(html_report, df_statistics, df_similar, df_fragments,
        df_combined, extracted_texts, gpu_available)``. When the input is
        invalid or processing fails, the first element carries a message and
        the remaining six are ``None``.
    """
    def failure(message):
        # The click handler binds seven output components, so every return
        # path (including errors) must yield exactly seven values.
        return (message,) + (None,) * 6

    similar_columns = ["Index", "Original Text", "Key Text", "Similarity"]
    combined_columns = ["Indices", "Text 1", "Text 2", "Combined Text", "Key Text", "Similarity"]

    try:
        if image is None:
            return failure("Please upload an image for analysis.")
        if not key_texts or not key_texts.strip():
            return failure("Please enter key texts for comparison.")

        # Extract text from the image, using the GPU when one is available
        gpu_available = torch.cuda.is_available()
        extracted_texts = extract_text_from_image(image, gpu_available)

        # One key text per non-empty line
        if isinstance(key_texts, str):
            key_texts = [text.strip() for text in key_texts.split('\n') if text.strip()]

        # Run the analysis; thresholds arrive as percentages
        results = analyze_similarity(
            extracted_texts,
            key_texts,
            similarity_threshold=similarity_threshold / 100,
            fragment_threshold=fragment_threshold / 100,
        )

        # Build the HTML report and the result tables
        html_report = generate_html_report(results)
        dfs = generate_results_dataframe(results)

        # Pick out the individual tables, falling back to empty ones
        df_statistics = dfs.get("statistics", pd.DataFrame())
        df_similar = dfs.get("similar", pd.DataFrame(columns=similar_columns))
        df_fragments = dfs.get("fragments", pd.DataFrame(columns=similar_columns))
        df_combined = dfs.get("combined", pd.DataFrame(columns=combined_columns))

        return (html_report, df_statistics, df_similar, df_fragments,
                df_combined, extracted_texts, gpu_available)
    except Exception as e:
        # UI boundary: report the failure in the report pane instead of crashing
        return failure(f"Erro ao processar: {str(e)}")
def process_manual_input(texts, key_texts, similarity_threshold, fragment_threshold):
    """Analyze manually entered texts against the key texts.

    Args:
        texts: Texts to analyze as a single string, one text per line.
        key_texts: Key texts as a single string, one key text per line.
        similarity_threshold: Direct-similarity threshold, as a percentage.
        fragment_threshold: Fragment-similarity threshold, as a percentage.

    Returns:
        A 5-tuple ``(html_report, df_statistics, df_similar, df_fragments,
        df_combined)``. When the input is empty or processing fails, the
        first element carries a message and the remaining four are ``None``.
    """
    # Validate input (also covers None, which has no .strip())
    if not texts or not key_texts or not texts.strip() or not key_texts.strip():
        return "Please enter texts for analysis and key texts for comparison.", None, None, None, None

    similar_columns = ["Index", "Original Text", "Key Text", "Similarity"]
    combined_columns = ["Indices", "Text 1", "Text 2", "Combined Text", "Key Text", "Similarity"]

    try:
        # Split both inputs into one entry per non-empty line, the same way
        # process_image prepares its key texts before calling the analyzer.
        # NOTE(review): assumes analyze_similarity accepts a list of strings
        # for the texts argument as well - confirm against text_similarity.
        if isinstance(texts, str):
            texts = [line.strip() for line in texts.split('\n') if line.strip()]
        if isinstance(key_texts, str):
            key_texts = [line.strip() for line in key_texts.split('\n') if line.strip()]

        # Run the analysis; thresholds arrive as percentages
        results = analyze_similarity(
            texts,
            key_texts,
            similarity_threshold=similarity_threshold / 100,
            fragment_threshold=fragment_threshold / 100,
        )

        # Build the HTML report and the result tables
        html_report = generate_html_report(results)
        dfs = generate_results_dataframe(results)

        # Pick out the individual tables, falling back to empty ones
        df_statistics = dfs.get("statistics", pd.DataFrame())
        df_similar = dfs.get("similar", pd.DataFrame(columns=similar_columns))
        df_fragments = dfs.get("fragments", pd.DataFrame(columns=similar_columns))
        df_combined = dfs.get("combined", pd.DataFrame(columns=combined_columns))

        return html_report, df_statistics, df_similar, df_fragments, df_combined
    except Exception as e:
        # UI boundary: report the failure in the report pane instead of crashing
        return f"Erro ao processar: {str(e)}", None, None, None, None
def generate_html_report(results):
    """Generate an HTML report about the detected similarities.

    Args:
        results: Mapping with a ``"statistics"`` mapping (keys
            ``"total_analyzed"`` and ``"total_processed"``) and the lists
            ``"similar_texts"``, ``"fragments_detected"`` and ``"combined"``.
            Similar/fragment items provide ``"text"``, ``"key_text"`` and
            ``"similarity"``; combined items provide ``"texts"`` (at least
            two entries), ``"combined_text"``, ``"key_text"`` and
            ``"similarity"``.

    Returns:
        The report as an HTML string. Text values are HTML-escaped because
        they come from OCR output or user input.
    """
    # Imported locally so the module-level import block stays untouched.
    from html import escape

    cell_css = "padding: 8px; border: 1px solid #ddd;"
    table_open = "<table width='100%' style='border-collapse: collapse; margin-bottom: 20px;'>"

    def header_row(titles):
        # Every heading is left-aligned except the last (similarity) column.
        last = len(titles) - 1
        cells = []
        for position, title in enumerate(titles):
            align = "center" if position == last else "left"
            cells.append(f"<th style='text-align: {align}; {cell_css}'>{title}</th>")
        return "<tr>" + "".join(cells) + "</tr>"

    def body_row(values, similarity):
        # Escape the free-text cells; the similarity is a formatted number.
        cells = [f"<td style='{cell_css}'>{escape(str(value))}</td>" for value in values]
        cells.append(f"<td style='text-align: center; {cell_css}'>{similarity:.2%}</td>")
        return "<tr>" + "".join(cells) + "</tr>"

    parts = ["<h2>Similarity Report</h2>"]

    # General statistics
    parts.append("<div style='padding: 15px; border-radius: 5px; margin-bottom: 20px;'>")
    parts.append(f"<p><b>Total texts analyzed:</b> {results['statistics']['total_analyzed']}</p>")
    parts.append(f"<p><b>Texts with detected similarity:</b> {results['statistics']['total_processed']}</p>")
    parts.append("</div>")

    # Results tables
    parts.append("<h3>Detected Similarities</h3>")

    # Similar texts
    if results["similar_texts"]:
        parts.append("<h4>Direct Similar Texts</h4>")
        parts.append(table_open)
        parts.append(header_row(["Original Text", "Key Text", "Similarity"]))
        for item in results["similar_texts"]:
            parts.append(body_row((item['text'], item['key_text']), item['similarity']))
        parts.append("</table>")

    # Detected fragments
    if results["fragments_detected"]:
        parts.append("<h4>Text with Detected Fragments</h4>")
        parts.append(table_open)
        parts.append(header_row(["Original Text", "Key Text", "Similarity"]))
        for item in results["fragments_detected"]:
            parts.append(body_row((item['text'], item['key_text']), item['similarity']))
        parts.append("</table>")

    # Combined texts
    if results["combined"]:
        parts.append("<h4>Text that need to be combined</h4>")
        parts.append(table_open)
        parts.append(header_row(["Text 1", "Text 2", "Combination", "Key Text", "Similarity"]))
        for item in results["combined"]:
            parts.append(body_row(
                (item['texts'][0], item['texts'][1], item['combined_text'], item['key_text']),
                item['similarity'],
            ))
        parts.append("</table>")

    if not (results["similar_texts"] or results["fragments_detected"] or results["combined"]):
        parts.append("<p>No significant similarity found with the current parameters.</p>")

    return "".join(parts)
def generate_results_dataframe(results):
    """Build the pandas DataFrames used to display the analysis results.

    Returns a dict that always contains ``"statistics"`` and contains
    ``"similar"``, ``"fragments"`` and ``"combined"`` only when the matching
    result list is non-empty. Similarities are rendered as percent strings.
    """
    frames = {}

    # Direct matches and fragments share the same four-column layout.
    match_columns = ("Index", "Original Text", "Key Text", "Similarity")
    for frame_name, result_key in (("similar", "similar_texts"),
                                   ("fragments", "fragments_detected")):
        entries = results[result_key]
        if not entries:
            continue
        rows = []
        for entry in entries:
            rows.append((
                entry['index'],
                entry['text'],
                entry['key_text'],
                f"{entry['similarity']:.2%}",
            ))
        frames[frame_name] = pd.DataFrame(rows, columns=list(match_columns))

    # Pairs of texts that only match a key text once joined together.
    merged_entries = results["combined"]
    if merged_entries:
        rows = []
        for entry in merged_entries:
            index_pair = f"{entry['indices'][0]},{entry['indices'][1]}"
            rows.append((
                index_pair,
                entry['texts'][0],
                entry['texts'][1],
                entry['combined_text'],
                entry['key_text'],
                f"{entry['similarity']:.2%}",
            ))
        frames["combined"] = pd.DataFrame(
            rows,
            columns=["Indices", "Text 1", "Text 2", "Combined Text", "Key Text", "Similarity"],
        )

    # Summary counters, one (label, value) row per metric.
    metric_keys = (
        ("Total analyzed", "total_analyzed"),
        ("Total with similarity", "total_processed"),
        ("Direct similarity", "direct_similarity"),
        ("Fragments", "fragments"),
        ("Combined", "combined"),
    )
    summary_rows = [(label, results["statistics"][key]) for label, key in metric_keys]
    frames["statistics"] = pd.DataFrame(summary_rows, columns=["Metric", "Value"])

    return frames
def generate_gradio():
    """Build the Gradio UI for the text similarity detector.

    The app has two tabs: "Image Analysis", wired to ``process_image``, and
    "Manual Analysis", wired to ``process_manual_input``. Each tab has its
    inputs in the left column and the report plus result tables on the right.

    Returns:
        The assembled ``gr.Blocks`` app. It is not launched here; the caller
        is expected to call ``.launch()`` on the returned object.
    """
    with gr.Blocks(title="Text Similarity Detector") as demo:
        gr.Markdown("# 🔍 Text Similarity Detector with Image Extraction")
        gr.Markdown("""
        This tool analyzes the similarity between texts extracted from an image and reference key texts.
        It can identify:
        - Direct similar texts
        - Key text fragments within the texts
        - Text combinations that match key texts
        """)

        with gr.Tabs():
            with gr.TabItem("Image Analysis"):
                with gr.Row():
                    with gr.Column(scale=1):  # Column for inputs on the left
                        input_image = gr.Image(label="Upload an image to extract text", type="pil", height=600)
                        key_texts_image = gr.Textbox(
                            label="Key Texts for Comparison",
                            placeholder="Paste your key texts here (one per line)",
                            lines=5
                        )
                        with gr.Row():
                            similarity_threshold_image = gr.Slider(
                                label="Similarity Threshold (%)",
                                minimum=50,
                                maximum=100,
                                value=70,
                                step=1
                            )
                            fragment_threshold_image = gr.Slider(
                                label="Fragment Similarity Threshold (%)",
                                minimum=50,
                                maximum=100,
                                value=70,
                                step=1
                            )
                        analyze_image_btn = gr.Button("Analyze Image", variant="primary")

                    with gr.Column(scale=1):  # Column for outputs on the right
                        gpu_available = gr.Checkbox(label="Used GPU")
                        extracted_texts = gr.Textbox(label="Extracted Texts from the Image", lines=5)
                        html_output = gr.HTML(label="Similarity Report")
                        with gr.Tabs():
                            with gr.TabItem("Statistics"):
                                statistics_output = gr.Dataframe(label="Statistics")
                            with gr.TabItem("Direct Similarity"):
                                similar_texts_output = gr.Dataframe(label="Direct Similar Texts")
                            with gr.TabItem("Fragments"):
                                fragments_output = gr.Dataframe(label="Texts with Fragments")
                            with gr.TabItem("Combined"):
                                combined_output = gr.Dataframe(label="Combined Texts")

            with gr.TabItem("Manual Analysis"):
                with gr.Row():
                    with gr.Column(scale=1):  # Column for inputs on the left
                        input_texts = gr.Textbox(
                            label="List of Texts for Analysis",
                            placeholder="Paste your list of texts here (one per line)",
                            lines=10
                        )
                        key_texts_input = gr.Textbox(
                            label="Key Texts for Comparison",
                            placeholder="Paste your key texts here (one per line)",
                            lines=5
                        )
                        with gr.Row():
                            similarity_threshold = gr.Slider(
                                label="Similarity Threshold (%)",
                                minimum=50,
                                maximum=100,
                                value=70,
                                step=1
                            )
                            fragment_threshold = gr.Slider(
                                label="Fragment Similarity Threshold (%)",
                                minimum=50,
                                maximum=100,
                                value=70,
                                step=1
                            )
                        # This tab analyzes pasted texts, not an image.
                        analyze_btn = gr.Button("Analyze Texts", variant="primary")

                    with gr.Column(scale=1):  # Column for outputs on the right
                        html_output_manual = gr.HTML(label="Manual Similarity Report")
                        with gr.Tabs():
                            with gr.TabItem("Statistics"):
                                statistics_output_manual = gr.Dataframe(label="Statistics")
                            with gr.TabItem("Direct Similarity"):
                                similar_texts_output_manual = gr.Dataframe(label="Direct Similar Texts")
                            with gr.TabItem("Fragments"):
                                fragments_output_manual = gr.Dataframe(label="Texts with Fragments")
                            with gr.TabItem("Combined"):
                                combined_output_manual = gr.Dataframe(label="Combined Texts")

        # Connect the image processing function to the button.
        # process_image must return one value per component listed in outputs.
        analyze_image_btn.click(
            process_image,
            inputs=[input_image, key_texts_image, similarity_threshold_image, fragment_threshold_image],
            outputs=[html_output, statistics_output, similar_texts_output, fragments_output,
                     combined_output, extracted_texts, gpu_available]
        )

        # Connect the manual text processing function to the button
        analyze_btn.click(
            process_manual_input,
            inputs=[input_texts, key_texts_input, similarity_threshold, fragment_threshold],
            outputs=[html_output_manual, statistics_output_manual, similar_texts_output_manual,
                     fragments_output_manual, combined_output_manual]
        )

    return demo
#app = gr.mount_gradio_app(app, demo, path="/") | |
if __name__ == "__main__":
    # generate_gradio is a factory function: call it to build the Blocks app,
    # then launch the returned app (the function object itself has no .launch).
    generate_gradio().launch()
# PORT = int(os.getenv("PORT", 7860)) | |
# if __name__ == "__main__": | |
# import uvicorn | |
# print(f"A arrancar na porta {PORT}...") | |
# uvicorn.run(app) | |
#demo.launch(server_name="0.0.0.0", server_port=7860) | |