Spaces:
Sleeping
Sleeping
File size: 16,729 Bytes
2f73fd7 72ac86a 2f73fd7 973182c 2f73fd7 973182c 2f73fd7 72ac86a 2f73fd7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 |
import gradio as gr
import pandas as pd
import torch
from extract_text import extract_text_from_image
from text_similarity import analyze_similarity
def process_image(image, key_texts, similarity_threshold, fragment_threshold):
    """Extract text from an image and analyze its similarity to the key texts.

    Args:
        image: Image supplied by the UI; ``None`` when nothing was uploaded.
        key_texts: Reference texts, either a single newline-separated string
            or an already split list of strings.
        similarity_threshold: Direct-similarity threshold as a percentage.
        fragment_threshold: Fragment-similarity threshold as a percentage.

    Returns:
        A 7-tuple ``(html_report, statistics_df, similar_df, fragments_df,
        combined_df, extracted_texts, gpu_available)``. When validation fails
        or an exception is raised, the first element holds the message and
        the remaining six elements are ``None``.
    """

    def _failure(message):
        # The click handler binds seven output components, so every return
        # path must yield seven values (the original returned only six here).
        return (message,) + (None,) * 6

    try:
        if image is None:
            return _failure("Please upload an image for analysis.")

        # Normalize the key texts to a list of non-empty, stripped lines.
        if isinstance(key_texts, str):
            key_texts = [line.strip() for line in key_texts.split('\n') if line.strip()]
        if not key_texts:
            return _failure("Please enter key texts for comparison.")

        # Extract text from the image, using the GPU when one is available.
        gpu_available = torch.cuda.is_available()
        extracted_texts = extract_text_from_image(image, gpu_available)

        results = analyze_similarity(
            extracted_texts,
            key_texts,
            similarity_threshold=similarity_threshold / 100,  # percentage -> fraction
            fragment_threshold=fragment_threshold / 100,  # percentage -> fraction
        )

        html_report = generate_html_report(results)
        dfs = generate_results_dataframe(results)

        # Fall back to empty frames (with headers) for categories without hits.
        match_columns = ["Index", "Original Text", "Key Text", "Similarity"]
        combined_columns = ["Indices", "Text 1", "Text 2", "Combined Text", "Key Text", "Similarity"]
        df_statistics = dfs.get("statistics", pd.DataFrame())
        df_similar = dfs.get("similar", pd.DataFrame(columns=match_columns))
        df_fragments = dfs.get("fragments", pd.DataFrame(columns=match_columns))
        df_combined = dfs.get("combined", pd.DataFrame(columns=combined_columns))

        return (
            html_report,
            df_statistics,
            df_similar,
            df_fragments,
            df_combined,
            extracted_texts,
            gpu_available,
        )
    except Exception as e:
        # UI boundary: report the failure in the report slot instead of crashing.
        return _failure(f"Erro ao processar: {str(e)}")
def process_manual_input(texts, key_texts, similarity_threshold, fragment_threshold):
    """Analyze manually entered texts against the key texts.

    Args:
        texts: Texts to analyze, as typed into the UI text box.
        key_texts: Reference key texts, as typed into the UI text box.
        similarity_threshold: Direct-similarity threshold as a percentage.
        fragment_threshold: Fragment-similarity threshold as a percentage.

    Returns:
        A 5-tuple ``(html_report, statistics_df, similar_df, fragments_df,
        combined_df)``; on validation failure or error the first element is
        the message and the other four are ``None``.
    """
    # Both inputs are mandatory; reject blank or whitespace-only values.
    missing_input = not texts.strip() or not key_texts.strip()
    if missing_input:
        return (
            "Please enter texts for analysis and key texts for comparison.",
            None,
            None,
            None,
            None,
        )

    try:
        # NOTE(review): the raw strings are forwarded unsplit here, whereas
        # process_image passes lists - confirm analyze_similarity accepts both.
        analysis = analyze_similarity(
            texts,
            key_texts,
            similarity_threshold=similarity_threshold / 100,  # percentage -> fraction
            fragment_threshold=fragment_threshold / 100,  # percentage -> fraction
        )

        report = generate_html_report(analysis)
        frames = generate_results_dataframe(analysis)

        # Categories without hits are shown as empty tables with headers.
        match_headers = ["Index", "Original Text", "Key Text", "Similarity"]
        combined_headers = ["Indices", "Text 1", "Text 2", "Combined Text", "Key Text", "Similarity"]
        statistics_frame = frames.get("statistics", pd.DataFrame())
        similar_frame = frames.get("similar", pd.DataFrame(columns=match_headers))
        fragments_frame = frames.get("fragments", pd.DataFrame(columns=match_headers))
        combined_frame = frames.get("combined", pd.DataFrame(columns=combined_headers))

        return report, statistics_frame, similar_frame, fragments_frame, combined_frame
    except Exception as error:
        return f"Erro ao processar: {str(error)}", None, None, None, None
def generate_html_report(results):
    """Build the HTML similarity report for an analysis result.

    Args:
        results: Mapping with the keys ``"statistics"`` (providing
            ``total_analyzed`` and ``total_processed``), ``"similar_texts"``,
            ``"fragments_detected"`` and ``"combined"``. Entries of the first
            two lists provide ``text``, ``key_text`` and ``similarity``;
            entries of ``"combined"`` provide ``texts`` (two items),
            ``combined_text``, ``key_text`` and ``similarity``.

    Returns:
        The report as a single HTML string.
    """
    # Local import: the texts come from OCR / user input and must be escaped
    # before being embedded in markup.
    from html import escape

    cell_style = "padding: 8px; border: 1px solid #ddd;"

    def header_cell(label, align="left"):
        return f"<th style='text-align: {align}; {cell_style}'>{label}</th>"

    def text_cell(value):
        return f"<td style='{cell_style}'>{escape(str(value))}</td>"

    def score_cell(score):
        return f"<td style='text-align: center; {cell_style}'>{score:.2%}</td>"

    def table(title, headers, rows):
        # rows: iterable of (text_values, similarity); the similarity column
        # is always rendered last and centered.
        header_row = "".join(header_cell(label) for label in headers)
        header_row += header_cell("Similarity", "center")
        pieces = [
            f"<h4>{title}</h4>",
            "<table width='100%' style='border-collapse: collapse; margin-bottom: 20px;'>",
            f"<tr>{header_row}</tr>",
        ]
        for values, score in rows:
            cells = "".join(text_cell(value) for value in values)
            pieces.append(f"<tr>{cells}{score_cell(score)}</tr>")
        pieces.append("</table>")
        return "".join(pieces)

    statistics = results["statistics"]
    similar = results["similar_texts"]
    fragments = results["fragments_detected"]
    combined = results["combined"]

    parts = [
        "<h2>Similarity Report</h2>",
        # The original markup omitted the style=' attribute opener here.
        "<div style='padding: 15px; border-radius: 5px; margin-bottom: 20px;'>",
        f"<p><b>Total texts analyzed:</b> {statistics['total_analyzed']}</p>",
        f"<p><b>Texts with detected similarity:</b> {statistics['total_processed']}</p>",
        "</div>",
        "<h3>Detected Similarities</h3>",
    ]

    if similar:
        parts.append(table(
            "Direct Similar Texts",
            ["Original Text", "Key Text"],
            (((item['text'], item['key_text']), item['similarity']) for item in similar),
        ))

    if fragments:
        parts.append(table(
            "Text with Detected Fragments",
            ["Original Text", "Key Text"],
            (((item['text'], item['key_text']), item['similarity']) for item in fragments),
        ))

    if combined:
        parts.append(table(
            "Text that need to be combined",
            ["Text 1", "Text 2", "Combination", "Key Text"],
            (
                (
                    (item['texts'][0], item['texts'][1], item['combined_text'], item['key_text']),
                    item['similarity'],
                )
                for item in combined
            ),
        ))

    if not (similar or fragments or combined):
        parts.append("<p>No significant similarity found with the current parameters.</p>")

    return "".join(parts)
def generate_results_dataframe(results):
    """Convert an analysis result into pandas DataFrames for display.

    Args:
        results: Mapping with the keys ``"similar_texts"``,
            ``"fragments_detected"``, ``"combined"`` and ``"statistics"``.

    Returns:
        A dict that always contains ``"statistics"`` and additionally
        ``"similar"``, ``"fragments"`` and/or ``"combined"`` for every
        category that has at least one entry. Similarities are rendered as
        percentage strings with two decimals.
    """
    frames = {}

    # Direct matches and fragment matches share the same row layout.
    match_columns = ["Index", "Original Text", "Key Text", "Similarity"]
    match_sources = (("similar", "similar_texts"), ("fragments", "fragments_detected"))
    for frame_name, result_key in match_sources:
        matches = results[result_key]
        if not matches:
            continue
        rows = [
            (match['index'], match['text'], match['key_text'], f"{match['similarity']:.2%}")
            for match in matches
        ]
        frames[frame_name] = pd.DataFrame(rows, columns=match_columns)

    # Combined matches: two source texts merged into one candidate.
    combinations = results["combined"]
    if combinations:
        rows = []
        for combo in combinations:
            first_index, second_index = combo['indices'][0], combo['indices'][1]
            rows.append((
                f"{first_index},{second_index}",
                combo['texts'][0],
                combo['texts'][1],
                combo['combined_text'],
                combo['key_text'],
                f"{combo['similarity']:.2%}",
            ))
        frames["combined"] = pd.DataFrame(
            rows,
            columns=["Indices", "Text 1", "Text 2", "Combined Text", "Key Text", "Similarity"],
        )

    # Summary counters, one (label, value) row per metric.
    metric_labels = (
        ("Total analyzed", "total_analyzed"),
        ("Total with similarity", "total_processed"),
        ("Direct similarity", "direct_similarity"),
        ("Fragments", "fragments"),
        ("Combined", "combined"),
    )
    summary_rows = [(label, results["statistics"][key]) for label, key in metric_labels]
    frames["statistics"] = pd.DataFrame(summary_rows, columns=["Metric", "Value"])

    return frames
def _threshold_sliders():
    """Create the similarity and fragment threshold sliders side by side."""
    with gr.Row():
        similarity = gr.Slider(
            label="Similarity Threshold (%)",
            minimum=50,
            maximum=100,
            value=70,
            step=1,
        )
        fragment = gr.Slider(
            label="Fragment Similarity Threshold (%)",
            minimum=50,
            maximum=100,
            value=70,
            step=1,
        )
    return similarity, fragment


def _result_tables():
    """Create the tabbed result tables, returned in click-output order."""
    with gr.Tabs():
        with gr.TabItem("Statistics"):
            statistics = gr.Dataframe(label="Statistics")
        with gr.TabItem("Direct Similarity"):
            similar = gr.Dataframe(label="Direct Similar Texts")
        with gr.TabItem("Fragments"):
            fragments = gr.Dataframe(label="Texts with Fragments")
        with gr.TabItem("Combined"):
            combined = gr.Dataframe(label="Combined Texts")
    return statistics, similar, fragments, combined


def generate_gradio():
    """Build the Gradio Blocks UI and wire both analysis tabs.

    The "Image Analysis" tab calls ``process_image`` and the
    "Manual Analysis" tab calls ``process_manual_input``.

    Returns:
        The assembled ``gr.Blocks`` application (not yet launched).
    """
    with gr.Blocks(title="Text Similarity Detector") as demo:
        gr.Markdown("# 🔍 Text Similarity Detector with Image Extraction")
        gr.Markdown("""
This tool analyzes the similarity between texts extracted from an image and reference key texts.
It can identify:
- Direct similar texts
- Key text fragments within the texts
- Text combinations that match key texts
""")

        with gr.Tabs():
            with gr.TabItem("Image Analysis"):
                with gr.Row():
                    with gr.Column(scale=1):  # Inputs on the left
                        input_image = gr.Image(
                            label="Upload an image to extract text",
                            type="pil",
                            height=600,
                        )
                        key_texts_image = gr.Textbox(
                            label="Key Texts for Comparison",
                            placeholder="Paste your key texts here (one per line)",
                            lines=5,
                        )
                        similarity_threshold_image, fragment_threshold_image = _threshold_sliders()
                        analyze_image_btn = gr.Button("Analyze Image", variant="primary")

                    with gr.Column(scale=1):  # Outputs on the right
                        gpu_available = gr.Checkbox(label="Used GPU")
                        extracted_texts = gr.Textbox(label="Extracted Texts from the Image", lines=5)
                        html_output = gr.HTML(label="Similarity Report")
                        (
                            statistics_output,
                            similar_texts_output,
                            fragments_output,
                            combined_output,
                        ) = _result_tables()

            with gr.TabItem("Manual Analysis"):
                with gr.Row():
                    with gr.Column(scale=1):  # Inputs on the left
                        input_texts = gr.Textbox(
                            label="List of Texts for Analysis",
                            placeholder="Paste your list of texts here (one per line)",
                            lines=10,
                        )
                        key_texts_input = gr.Textbox(
                            label="Key Texts for Comparison",
                            placeholder="Paste your key texts here (one per line)",
                            lines=5,
                        )
                        similarity_threshold, fragment_threshold = _threshold_sliders()
                        # This tab analyzes typed text; the button was previously
                        # mislabeled "Analyze Image" (copied from the image tab).
                        analyze_btn = gr.Button("Analyze Texts", variant="primary")

                    with gr.Column(scale=1):  # Outputs on the right
                        html_output_manual = gr.HTML(label="Manual Similarity Report")
                        (
                            statistics_output_manual,
                            similar_texts_output_manual,
                            fragments_output_manual,
                            combined_output_manual,
                        ) = _result_tables()

        # Event listeners must be registered inside the Blocks context.
        analyze_image_btn.click(
            process_image,
            inputs=[
                input_image,
                key_texts_image,
                similarity_threshold_image,
                fragment_threshold_image,
            ],
            outputs=[
                html_output,
                statistics_output,
                similar_texts_output,
                fragments_output,
                combined_output,
                extracted_texts,
                gpu_available,
            ],
        )
        analyze_btn.click(
            process_manual_input,
            inputs=[input_texts, key_texts_input, similarity_threshold, fragment_threshold],
            outputs=[
                html_output_manual,
                statistics_output_manual,
                similar_texts_output_manual,
                fragments_output_manual,
                combined_output_manual,
            ],
        )

    return demo
#app = gr.mount_gradio_app(app, demo, path="/")
if __name__ == "__main__":
    # generate_gradio is a factory function: call it to build the Blocks app,
    # then launch the returned app (the function object itself has no .launch).
    generate_gradio().launch()
# PORT = int(os.getenv("PORT", 7860))
# if __name__ == "__main__":
# import uvicorn
# print(f"A arrancar na porta {PORT}...")
# uvicorn.run(app)
#demo.launch(server_name="0.0.0.0", server_port=7860)
|