# Hugging Face file-viewer header (page residue, not part of the program):
# author: MarioPrzBasto | commit 2f73fd7 "Add application file" | size: 16.7 kB
import gradio as gr
import pandas as pd
import torch
from extract_text import extract_text_from_image
from text_similarity import analyze_similarity
def process_image(image, key_texts, similarity_threshold, fragment_threshold):
    """Extract text from an image and compare it against the key texts.

    Args:
        image: Image to run text extraction on; ``None`` means nothing was
            uploaded.
        key_texts: Reference texts, either a newline-separated string (one
            key text per line) or an already-split sequence of strings.
        similarity_threshold: Direct-similarity threshold as a percentage.
        fragment_threshold: Fragment-similarity threshold as a percentage.

    Returns:
        A 7-tuple ``(html_report, df_statistics, df_similar, df_fragments,
        df_combined, extracted_texts, gpu_available)``. When the input is
        invalid or processing fails, the first element is a message and the
        remaining six elements are ``None``.
    """
    # Every return path must yield exactly seven values: the click handler
    # binds this function to seven output components.
    no_outputs = (None,) * 6

    try:
        if image is None:
            return ("Please upload an image for analysis.", *no_outputs)

        if isinstance(key_texts, str):
            # One key text per line; blank lines are ignored.
            key_texts = [text.strip() for text in key_texts.split('\n') if text.strip()]
        if not key_texts:
            return ("Please enter key texts for comparison.", *no_outputs)

        # Extract text from the image, using the GPU when one is available.
        gpu_available = torch.cuda.is_available()
        extracted_texts = extract_text_from_image(image, gpu_available)

        # Run the analysis; the sliders deliver percentages, the analyzer
        # receives fractions.
        results = analyze_similarity(
            extracted_texts,
            key_texts,
            similarity_threshold=similarity_threshold / 100,
            fragment_threshold=fragment_threshold / 100,
        )

        # Build the HTML report and the result tables.
        html_report = generate_html_report(results)
        dfs = generate_results_dataframe(results)

        # Pull out the individual tables, falling back to empty ones (with
        # the expected headers) for categories that produced no rows.
        match_columns = ["Index", "Original Text", "Key Text", "Similarity"]
        combined_columns = ["Indices", "Text 1", "Text 2", "Combined Text", "Key Text", "Similarity"]
        df_statistics = dfs.get("statistics", pd.DataFrame())
        df_similar = dfs.get("similar", pd.DataFrame(columns=match_columns))
        df_fragments = dfs.get("fragments", pd.DataFrame(columns=match_columns))
        df_combined = dfs.get("combined", pd.DataFrame(columns=combined_columns))

        return (
            html_report,
            df_statistics,
            df_similar,
            df_fragments,
            df_combined,
            extracted_texts,
            gpu_available,
        )
    except Exception as e:
        # UI boundary: report the failure in the report pane instead of
        # letting the handler crash.
        return (f"Erro ao processar: {str(e)}", *no_outputs)
def _split_nonblank_lines(value):
    """Return the stripped, non-blank lines of a string (or a list copy of a sequence)."""
    if value is None:
        return []
    if isinstance(value, str):
        return [line.strip() for line in value.split('\n') if line.strip()]
    return list(value)


def process_manual_input(texts, key_texts, similarity_threshold, fragment_threshold):
    """Analyze manually entered texts against the key texts.

    Args:
        texts: Texts to analyze, as a newline-separated string (one text per
            line) or an already-split sequence of strings.
        key_texts: Reference texts, in the same format as ``texts``.
        similarity_threshold: Direct-similarity threshold as a percentage.
        fragment_threshold: Fragment-similarity threshold as a percentage.

    Returns:
        A 5-tuple ``(html_report, df_statistics, df_similar, df_fragments,
        df_combined)``. When the input is empty or processing fails, the
        first element is a message and the remaining four are ``None``.
    """
    # Split both inputs into lists of lines so the analyzer receives the same
    # kind of arguments as it does from the image workflow.
    texts = _split_nonblank_lines(texts)
    key_texts = _split_nonblank_lines(key_texts)

    # Validate input
    if not texts or not key_texts:
        return "Please enter texts for analysis and key texts for comparison.", None, None, None, None

    try:
        # Run the analysis; the sliders deliver percentages, the analyzer
        # receives fractions.
        results = analyze_similarity(
            texts,
            key_texts,
            similarity_threshold=similarity_threshold / 100,
            fragment_threshold=fragment_threshold / 100,
        )

        # Build the HTML report and the result tables.
        html_report = generate_html_report(results)
        dfs = generate_results_dataframe(results)

        # Pull out the individual tables, falling back to empty ones (with
        # the expected headers) for categories that produced no rows.
        match_columns = ["Index", "Original Text", "Key Text", "Similarity"]
        combined_columns = ["Indices", "Text 1", "Text 2", "Combined Text", "Key Text", "Similarity"]
        df_statistics = dfs.get("statistics", pd.DataFrame())
        df_similar = dfs.get("similar", pd.DataFrame(columns=match_columns))
        df_fragments = dfs.get("fragments", pd.DataFrame(columns=match_columns))
        df_combined = dfs.get("combined", pd.DataFrame(columns=combined_columns))

        return html_report, df_statistics, df_similar, df_fragments, df_combined
    except Exception as e:
        # UI boundary: report the failure in the report pane instead of
        # letting the handler crash.
        return f"Erro ao processar: {str(e)}", None, None, None, None
def _similarity_table(title, headers, rows):
    """Render one titled HTML table; the last column is centered, the rest left-aligned.

    ``rows`` must contain cell strings that are already safe to embed in HTML.
    """
    last = len(headers) - 1
    parts = [
        f"<h4>{title}</h4>",
        "<table width='100%' style='border-collapse: collapse; margin-bottom: 20px;'>",
        "<tr>",
    ]
    for position, header in enumerate(headers):
        align = "center" if position == last else "left"
        parts.append(
            f"<th style='text-align: {align}; padding: 8px; border: 1px solid #ddd;'>{header}</th>"
        )
    parts.append("</tr>")
    for row in rows:
        parts.append("<tr>")
        for position, cell in enumerate(row):
            if position == last:
                style = "text-align: center; padding: 8px; border: 1px solid #ddd;"
            else:
                style = "padding: 8px; border: 1px solid #ddd;"
            parts.append(f"<td style='{style}'>{cell}</td>")
        parts.append("</tr>")
    parts.append("</table>")
    return "".join(parts)


def generate_html_report(results):
    """Generate an HTML report about the detected similarities.

    Args:
        results: Mapping with the keys ``"statistics"`` (containing
            ``"total_analyzed"`` and ``"total_processed"``),
            ``"similar_texts"``, ``"fragments_detected"`` and ``"combined"``.
            Entries of the first two lists provide ``"text"``,
            ``"key_text"`` and ``"similarity"``; entries of ``"combined"``
            provide ``"texts"`` (at least two items), ``"combined_text"``,
            ``"key_text"`` and ``"similarity"``.

    Returns:
        The report as an HTML string. Texts are HTML-escaped because they
        come from user input or from text extracted from an image.

    Raises:
        KeyError: If ``results`` lacks one of the keys listed above.
    """
    # Local import: keeps this function self-contained without touching the
    # module's import block.
    from html import escape

    def safe(value):
        return escape(str(value))

    stats = results["statistics"]
    similar = results["similar_texts"]
    fragments = results["fragments_detected"]
    combined = results["combined"]

    parts = [
        "<h2>Similarity Report</h2>",
        # General statistics
        "<div style='padding: 15px; border-radius: 5px; margin-bottom: 20px;'>",
        f"<p><b>Total texts analyzed:</b> {stats['total_analyzed']}</p>",
        f"<p><b>Texts with detected similarity:</b> {stats['total_processed']}</p>",
        "</div>",
        # Results tables
        "<h3>Detected Similarities</h3>",
    ]

    if similar:
        parts.append(_similarity_table(
            "Direct Similar Texts",
            ["Original Text", "Key Text", "Similarity"],
            [
                (safe(item['text']), safe(item['key_text']), f"{item['similarity']:.2%}")
                for item in similar
            ],
        ))

    if fragments:
        parts.append(_similarity_table(
            "Text with Detected Fragments",
            ["Original Text", "Key Text", "Similarity"],
            [
                (safe(item['text']), safe(item['key_text']), f"{item['similarity']:.2%}")
                for item in fragments
            ],
        ))

    if combined:
        parts.append(_similarity_table(
            "Text that need to be combined",
            ["Text 1", "Text 2", "Combination", "Key Text", "Similarity"],
            [
                (
                    safe(item['texts'][0]),
                    safe(item['texts'][1]),
                    safe(item['combined_text']),
                    safe(item['key_text']),
                    f"{item['similarity']:.2%}",
                )
                for item in combined
            ],
        ))

    if not (similar or fragments or combined):
        parts.append("<p>No significant similarity found with the current parameters.</p>")

    return "".join(parts)
def generate_results_dataframe(results):
    """Build the pandas DataFrames that present the analysis results.

    The returned dict always contains ``"statistics"``; ``"similar"``,
    ``"fragments"`` and ``"combined"`` are present only when the matching
    result list is non-empty. Similarity values are rendered as percentage
    strings with two decimals.
    """
    match_columns = ["Index", "Original Text", "Key Text", "Similarity"]
    tables = {}

    # Direct matches and fragment matches share the same row layout.
    for table_name, result_key in (("similar", "similar_texts"),
                                   ("fragments", "fragments_detected")):
        matches = results[result_key]
        if not matches:
            continue
        rows = []
        for match in matches:
            rows.append((
                match['index'],
                match['text'],
                match['key_text'],
                f"{match['similarity']:.2%}",
            ))
        tables[table_name] = pd.DataFrame(rows, columns=match_columns)

    # Pairs of texts that match a key text only once joined together.
    pairs = results["combined"]
    if pairs:
        rows = []
        for pair in pairs:
            rows.append((
                f"{pair['indices'][0]},{pair['indices'][1]}",
                pair['texts'][0],
                pair['texts'][1],
                pair['combined_text'],
                pair['key_text'],
                f"{pair['similarity']:.2%}",
            ))
        tables["combined"] = pd.DataFrame(
            rows,
            columns=["Indices", "Text 1", "Text 2", "Combined Text", "Key Text", "Similarity"],
        )

    # Summary counters, one (metric, value) row each.
    stats = results["statistics"]
    metric_keys = (
        ("Total analyzed", "total_analyzed"),
        ("Total with similarity", "total_processed"),
        ("Direct similarity", "direct_similarity"),
        ("Fragments", "fragments"),
        ("Combined", "combined"),
    )
    summary_rows = [(label, stats[key]) for label, key in metric_keys]
    tables["statistics"] = pd.DataFrame(summary_rows, columns=["Metric", "Value"])

    return tables
def _build_threshold_sliders():
    """Create the direct and fragment similarity threshold sliders (50-100 %, default 70)."""
    similarity_slider = gr.Slider(
        label="Similarity Threshold (%)",
        minimum=50,
        maximum=100,
        value=70,
        step=1
    )
    fragment_slider = gr.Slider(
        label="Fragment Similarity Threshold (%)",
        minimum=50,
        maximum=100,
        value=70,
        step=1
    )
    return similarity_slider, fragment_slider


def _build_result_tabs():
    """Create the four tabbed result tables and return them in handler-output order."""
    with gr.Tabs():
        with gr.TabItem("Statistics"):
            statistics_table = gr.Dataframe(label="Statistics")
        with gr.TabItem("Direct Similarity"):
            similar_table = gr.Dataframe(label="Direct Similar Texts")
        with gr.TabItem("Fragments"):
            fragments_table = gr.Dataframe(label="Texts with Fragments")
        with gr.TabItem("Combined"):
            combined_table = gr.Dataframe(label="Combined Texts")
    return statistics_table, similar_table, fragments_table, combined_table


def generate_gradio():
    """Build the Gradio interface and return it without launching it.

    The interface has two tabs: "Image Analysis", wired to ``process_image``,
    and "Manual Analysis", wired to ``process_manual_input``.

    Returns:
        The ``gr.Blocks`` application; call ``.launch()`` on it to serve it.
    """
    with gr.Blocks(title="Text Similarity Detector") as demo:
        gr.Markdown("# 🔍 Text Similarity Detector with Image Extraction")
        gr.Markdown("""
        This tool analyzes the similarity between texts extracted from an image and reference key texts.
        It can identify:
        - Direct similar texts
        - Key text fragments within the texts
        - Text combinations that match key texts
        """)

        with gr.Tabs():
            with gr.TabItem("Image Analysis"):
                with gr.Row():
                    with gr.Column(scale=1):  # Column for inputs on the left
                        input_image = gr.Image(label="Upload an image to extract text", type="pil", height=600)
                        key_texts_image = gr.Textbox(
                            label="Key Texts for Comparison",
                            placeholder="Paste your key texts here (one per line)",
                            lines=5
                        )
                        with gr.Row():
                            similarity_threshold_image, fragment_threshold_image = _build_threshold_sliders()
                        analyze_image_btn = gr.Button("Analyze Image", variant="primary")

                    with gr.Column(scale=1):  # Column for outputs on the right
                        gpu_available = gr.Checkbox(label="Used GPU")
                        extracted_texts = gr.Textbox(label="Extracted Texts from the Image", lines=5)
                        html_output = gr.HTML(label="Similarity Report")
                        (
                            statistics_output,
                            similar_texts_output,
                            fragments_output,
                            combined_output,
                        ) = _build_result_tabs()

            with gr.TabItem("Manual Analysis"):
                with gr.Row():
                    with gr.Column(scale=1):  # Column for inputs on the left
                        input_texts = gr.Textbox(
                            label="List of Texts for Analysis",
                            placeholder="Paste your list of texts here (one per line)",
                            lines=10
                        )
                        key_texts_input = gr.Textbox(
                            label="Key Texts for Comparison",
                            placeholder="Paste your key texts here (one per line)",
                            lines=5
                        )
                        with gr.Row():
                            similarity_threshold, fragment_threshold = _build_threshold_sliders()
                        # This tab analyzes pasted texts, not an image.
                        analyze_btn = gr.Button("Analyze Texts", variant="primary")

                    with gr.Column(scale=1):  # Column for outputs on the right
                        html_output_manual = gr.HTML(label="Manual Similarity Report")
                        (
                            statistics_output_manual,
                            similar_texts_output_manual,
                            fragments_output_manual,
                            combined_output_manual,
                        ) = _build_result_tabs()

        # Connect the image processing function to the button. The outputs
        # list must line up, element for element, with the tuple returned by
        # process_image.
        analyze_image_btn.click(
            process_image,
            inputs=[input_image, key_texts_image, similarity_threshold_image, fragment_threshold_image],
            outputs=[html_output, statistics_output, similar_texts_output, fragments_output, combined_output, extracted_texts, gpu_available]
        )

        # Connect the manual text processing function to the button.
        analyze_btn.click(
            process_manual_input,
            inputs=[input_texts, key_texts_input, similarity_threshold, fragment_threshold],
            outputs=[html_output_manual, statistics_output_manual, similar_texts_output_manual, fragments_output_manual, combined_output_manual]
        )

    return demo
#app = gr.mount_gradio_app(app, demo, path="/")
if __name__ == "__main__":
    # generate_gradio is a factory function: call it to build the Blocks app,
    # then launch the app it returns.
    generate_gradio().launch()
# PORT = int(os.getenv("PORT", 7860))
# if __name__ == "__main__":
# import uvicorn
# print(f"A arrancar na porta {PORT}...")
# uvicorn.run(app)
#demo.launch(server_name="0.0.0.0", server_port=7860)