import gradio as gr
import pandas as pd
import torch

from extract_text import extract_text_from_image
from text_similarity import analyze_similarity

# Shared column layouts for the result DataFrames.
_PAIR_COLUMNS = ["Index", "Original Text", "Key Text", "Similarity"]
_COMBINED_COLUMNS = ["Indices", "Text 1", "Text 2", "Combined Text",
                     "Key Text", "Similarity"]


def _unpack_dfs(dfs):
    """Return (statistics, similar, fragments, combined) DataFrames.

    Missing entries are replaced with empty DataFrames carrying the
    expected columns so the Gradio tables always render consistently.
    """
    return (
        dfs.get("statistics", pd.DataFrame()),
        dfs.get("similar", pd.DataFrame(columns=_PAIR_COLUMNS)),
        dfs.get("fragments", pd.DataFrame(columns=_PAIR_COLUMNS)),
        dfs.get("combined", pd.DataFrame(columns=_COMBINED_COLUMNS)),
    )


def process_image(image, key_texts, similarity_threshold, fragment_threshold):
    """Extract text from an uploaded image and analyze its similarity.

    Args:
        image: PIL image uploaded by the user (or None).
        key_texts: newline-separated reference texts, as a single string.
        similarity_threshold: direct-similarity cutoff, in percent (50-100).
        fragment_threshold: fragment-similarity cutoff, in percent (50-100).

    Returns:
        A 7-tuple matching the Gradio outputs:
        (html_report, df_statistics, df_similar, df_fragments, df_combined,
         extracted_texts, gpu_available).  On validation failure or error,
        the first element is a message and the rest are None.
    """
    try:
        if image is None:
            return ("Please upload an image for analysis.",
                    None, None, None, None, None, None)
        if not key_texts.strip():
            return ("Please enter key texts for comparison.",
                    None, None, None, None, None, None)

        # OCR the image; use the GPU when one is available.
        gpu_available = torch.cuda.is_available()
        extracted_texts = extract_text_from_image(image, gpu_available)

        # Key texts arrive as one string with one entry per line.
        if isinstance(key_texts, str):
            key_texts = [t.strip() for t in key_texts.split('\n') if t.strip()]

        results = analyze_similarity(
            extracted_texts,
            key_texts,
            similarity_threshold=similarity_threshold / 100,  # percent -> fraction
            fragment_threshold=fragment_threshold / 100,      # percent -> fraction
        )

        html_report = generate_html_report(results)
        df_statistics, df_similar, df_fragments, df_combined = _unpack_dfs(
            generate_results_dataframe(results))

        return (html_report, df_statistics, df_similar, df_fragments,
                df_combined, extracted_texts, gpu_available)
    except Exception as e:
        # Keep the output arity identical to the success path (7 values).
        return (f"Error while processing: {e}",
                None, None, None, None, None, None)


def process_manual_input(texts, key_texts, similarity_threshold,
                         fragment_threshold):
    """Analyze similarity for manually entered texts.

    Args:
        texts: newline-separated texts to analyze, as a single string.
        key_texts: newline-separated reference texts, as a single string.
        similarity_threshold: direct-similarity cutoff, in percent (50-100).
        fragment_threshold: fragment-similarity cutoff, in percent (50-100).

    Returns:
        A 5-tuple matching the Gradio outputs:
        (html_report, df_statistics, df_similar, df_fragments, df_combined).
    """
    if not texts.strip() or not key_texts.strip():
        return ("Please enter texts for analysis and key texts for comparison.",
                None, None, None, None)
    try:
        results = analyze_similarity(
            texts,
            key_texts,
            similarity_threshold=similarity_threshold / 100,  # percent -> fraction
            fragment_threshold=fragment_threshold / 100,      # percent -> fraction
        )

        html_report = generate_html_report(results)
        df_statistics, df_similar, df_fragments, df_combined = _unpack_dfs(
            generate_results_dataframe(results))

        return html_report, df_statistics, df_similar, df_fragments, df_combined
    except Exception as e:
        return f"Error while processing: {e}", None, None, None, None


def generate_html_report(results):
    """Render the analysis results as an HTML fragment.

    Expects ``results`` as produced by ``analyze_similarity``: a dict with
    ``statistics``, ``similar_texts``, ``fragments_detected`` and ``combined``.
    """

    def _table(title, headers, rows):
        # Build one titled HTML table from pre-rendered cell values.
        out = f"<h4>{title}</h4><table border='1'><tr>"
        out += "".join(f"<th>{h}</th>" for h in headers)
        out += "</tr>"
        for row in rows:
            out += "<tr>" + "".join(f"<td>{cell}</td>" for cell in row) + "</tr>"
        return out + "</table>"

    stats = results["statistics"]
    html = "<div class='similarity-report'>"
    html += "<h2>Similarity Report</h2>"

    # General statistics
    html += f"<p>Total texts analyzed: {stats['total_analyzed']}</p>"
    html += f"<p>Texts with detected similarity: {stats['total_processed']}</p>"

    html += "<h3>Detected Similarities</h3>"

    # Directly similar texts
    if results["similar_texts"]:
        html += _table(
            "Direct Similar Texts",
            ["Original Text", "Key Text", "Similarity"],
            [(item['text'], item['key_text'], f"{item['similarity']:.2%}")
             for item in results["similar_texts"]],
        )

    # Texts containing fragments of key texts
    if results["fragments_detected"]:
        html += _table(
            "Texts with Detected Fragments",
            ["Original Text", "Key Text", "Similarity"],
            [(item['text'], item['key_text'], f"{item['similarity']:.2%}")
             for item in results["fragments_detected"]],
        )

    # Pairs of texts that only match a key text when combined
    if results["combined"]:
        html += _table(
            "Texts that need to be combined",
            ["Text 1", "Text 2", "Combination", "Key Text", "Similarity"],
            [(item['texts'][0], item['texts'][1], item['combined_text'],
              item['key_text'], f"{item['similarity']:.2%}")
             for item in results["combined"]],
        )

    if not (results["similar_texts"] or results["fragments_detected"]
            or results["combined"]):
        html += ("<p>No significant similarity found with the current "
                 "parameters.</p>")

    return html + "</div>"


def generate_results_dataframe(results):
    """Build pandas DataFrames that visualize the analysis results.

    Returns:
        dict with up to four entries: ``similar``, ``fragments`` and
        ``combined`` (present only when non-empty) and ``statistics``
        (always present).
    """
    dfs = {}

    # DataFrame for directly similar texts
    if results["similar_texts"]:
        data = [(item['index'], item['text'], item['key_text'],
                 f"{item['similarity']:.2%}")
                for item in results["similar_texts"]]
        dfs["similar"] = pd.DataFrame(data, columns=_PAIR_COLUMNS)

    # DataFrame for detected fragments
    if results["fragments_detected"]:
        data = [(item['index'], item['text'], item['key_text'],
                 f"{item['similarity']:.2%}")
                for item in results["fragments_detected"]]
        dfs["fragments"] = pd.DataFrame(data, columns=_PAIR_COLUMNS)

    # DataFrame for combined text pairs
    if results["combined"]:
        data = [(f"{item['indices'][0]},{item['indices'][1]}",
                 item['texts'][0], item['texts'][1], item['combined_text'],
                 item['key_text'], f"{item['similarity']:.2%}")
                for item in results["combined"]]
        dfs["combined"] = pd.DataFrame(data, columns=_COMBINED_COLUMNS)

    # Summary statistics (always present)
    stats = results["statistics"]
    data = [
        ("Total analyzed", stats["total_analyzed"]),
        ("Total with similarity", stats["total_processed"]),
        ("Direct similarity", stats["direct_similarity"]),
        ("Fragments", stats["fragments"]),
        ("Combined", stats["combined"]),
    ]
    dfs["statistics"] = pd.DataFrame(data, columns=["Metric", "Value"])

    return dfs


def generate_gradio():
    """Build and return the Gradio Blocks interface."""
    with gr.Blocks(title="Text Similarity Detector") as demo:
        gr.Markdown("# 🔍 Text Similarity Detector with Image Extraction")
        gr.Markdown("""
        This tool analyzes the similarity between texts extracted from an image and reference key texts.
        It can identify:
        - Direct similar texts
        - Key text fragments within the texts
        - Text combinations that match key texts
        """)

        with gr.Tabs():
            with gr.TabItem("Image Analysis"):
                with gr.Row():
                    with gr.Column(scale=1):
                        # Inputs on the left
                        input_image = gr.Image(
                            label="Upload an image to extract text",
                            type="pil",
                            height=600,
                        )
                        key_texts_image = gr.Textbox(
                            label="Key Texts for Comparison",
                            placeholder="Paste your key texts here (one per line)",
                            lines=5,
                        )
                        with gr.Row():
                            similarity_threshold_image = gr.Slider(
                                label="Similarity Threshold (%)",
                                minimum=50, maximum=100, value=70, step=1,
                            )
                            fragment_threshold_image = gr.Slider(
                                label="Fragment Similarity Threshold (%)",
                                minimum=50, maximum=100, value=70, step=1,
                            )
                        analyze_image_btn = gr.Button(
                            "Analyze Image", variant="primary")

                    with gr.Column(scale=1):
                        # Outputs on the right
                        gpu_available = gr.Checkbox(label="Used GPU")
                        extracted_texts = gr.Textbox(
                            label="Extracted Texts from the Image", lines=5)
                        html_output = gr.HTML(label="Similarity Report")
                        with gr.Tabs():
                            with gr.TabItem("Statistics"):
                                statistics_output = gr.Dataframe(
                                    label="Statistics")
                            with gr.TabItem("Direct Similarity"):
                                similar_texts_output = gr.Dataframe(
                                    label="Direct Similar Texts")
                            with gr.TabItem("Fragments"):
                                fragments_output = gr.Dataframe(
                                    label="Texts with Fragments")
                            with gr.TabItem("Combined"):
                                combined_output = gr.Dataframe(
                                    label="Combined Texts")

            with gr.TabItem("Manual Analysis"):
                with gr.Row():
                    with gr.Column(scale=1):
                        # Inputs on the left
                        input_texts = gr.Textbox(
                            label="List of Texts for Analysis",
                            placeholder="Paste your list of texts here (one per line)",
                            lines=10,
                        )
                        key_texts_input = gr.Textbox(
                            label="Key Texts for Comparison",
                            placeholder="Paste your key texts here (one per line)",
                            lines=5,
                        )
                        with gr.Row():
                            similarity_threshold = gr.Slider(
                                label="Similarity Threshold (%)",
                                minimum=50, maximum=100, value=70, step=1,
                            )
                            fragment_threshold = gr.Slider(
                                label="Fragment Similarity Threshold (%)",
                                minimum=50, maximum=100, value=70, step=1,
                            )
                        # Label fixed: this tab analyzes pasted texts, not an image.
                        analyze_btn = gr.Button("Analyze Texts",
                                                variant="primary")

                    with gr.Column(scale=1):
                        # Outputs on the right
                        html_output_manual = gr.HTML(
                            label="Manual Similarity Report")
                        with gr.Tabs():
                            with gr.TabItem("Statistics"):
                                statistics_output_manual = gr.Dataframe(
                                    label="Statistics")
                            with gr.TabItem("Direct Similarity"):
                                similar_texts_output_manual = gr.Dataframe(
                                    label="Direct Similar Texts")
                            with gr.TabItem("Fragments"):
                                fragments_output_manual = gr.Dataframe(
                                    label="Texts with Fragments")
                            with gr.TabItem("Combined"):
                                combined_output_manual = gr.Dataframe(
                                    label="Combined Texts")

        # Wire the image-analysis button (7 outputs).
        analyze_image_btn.click(
            process_image,
            inputs=[input_image, key_texts_image,
                    similarity_threshold_image, fragment_threshold_image],
            outputs=[html_output, statistics_output, similar_texts_output,
                     fragments_output, combined_output, extracted_texts,
                     gpu_available],
        )

        # Wire the manual-analysis button (5 outputs).
        analyze_btn.click(
            process_manual_input,
            inputs=[input_texts, key_texts_input,
                    similarity_threshold, fragment_threshold],
            outputs=[html_output_manual, statistics_output_manual,
                     similar_texts_output_manual, fragments_output_manual,
                     combined_output_manual],
        )

    return demo


if __name__ == "__main__":
    # Build the interface, then launch it (the original called .launch()
    # on the function object itself, which crashes at startup).
    generate_gradio().launch()