|
import gradio as gr |
|
import pandas as pd |
|
import cv2 |
|
import numpy as np |
|
import requests |
|
import torch |
|
import base64 |
|
import os |
|
import logging |
|
from io import BytesIO |
|
from PIL import Image |
|
from fastapi import FastAPI |
|
from fastapi.middleware.cors import CORSMiddleware |
|
from extract_text import extract_text_from_image |
|
from models import TextSimilarityRequest |
|
from text_similarity import analyze_similarity |
|
from starlette.responses import JSONResponse |
|
|
|
|
|
# Root logger configuration: timestamped, level-tagged messages used by all handlers below.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# FastAPI application instance; routes are registered on it via decorators below.
app = FastAPI()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@app.post("/text_similarity", summary="Perform images text similarity", response_model=float, tags=["Text Similarities"])
async def text_similarity(request: TextSimilarityRequest):
    """Compute the percentage of key texts detected in an image.

    Loads the image referenced by ``request.imageInfo.source`` (URL or
    base64), extracts its texts via OCR, compares them against
    ``request.keyTexts`` and returns the percentage (0-100) of key texts
    with a detected similarity.
    """
    image_info = request.imageInfo
    key_texts = request.keyTexts
    similarity_threshold = request.similarityThreshold
    origin_id = image_info.originId

    logging.info(f"Checking text similarity for main source with resource id {origin_id}")

    image = load_image_url(image_info.source)

    # OCR runs on GPU when one is available, otherwise falls back to CPU.
    gpu_available = torch.cuda.is_available()
    extracted_texts = extract_text_from_image(image, gpu_available)

    results = analyze_similarity(
        extracted_texts,
        key_texts,
        similarity_threshold=similarity_threshold/100,
        # Fragment matching is pinned to 100% (exact) for the API path.
        fragment_threshold=100/100
    )

    log_similarity_report(results, origin_id)

    total_texts = len(key_texts)
    passed_texts = results["statistics"]["total_processed"]

    # Guard against an empty key-text list, which would otherwise raise
    # ZeroDivisionError; an empty request trivially has 0% coverage.
    percentage_passed = (passed_texts / total_texts) * 100 if total_texts else 0.0

    logging.info(f"Text similarity for main source with resource id {origin_id} is {percentage_passed}%")

    return percentage_passed
|
|
|
def log_similarity_report(results, originId):
    """Write a human-readable summary of a similarity analysis to the log.

    Every line is prefixed with ``[originId]`` so interleaved requests can
    be told apart in the shared application log.
    """
    stats = results["statistics"]
    logging.info(f"[{originId}] Total texts analyzed: {stats['total_analyzed']}")
    logging.info(f"[{originId}] Texts with detected similarity: {stats['total_processed']}")

    similar = results["similar_texts"]
    if similar:
        logging.info(f"[{originId}] Direct Similar Texts Found: {len(similar)}")
        for entry in similar:
            logging.info(f"[{originId}] Similar Text: '{entry['text']}' -> Key Text: '{entry['key_text']}' with Similarity: {entry['similarity']:.2%}")

    fragments = results["fragments_detected"]
    if fragments:
        logging.info(f"[{originId}] Fragments Detected: {len(fragments)}")
        for entry in fragments:
            logging.info(f"[{originId}] Fragment: '{entry['text']}' -> Key Text: '{entry['key_text']}' with Similarity: {entry['similarity']:.2%}")

    combined = results["combined"]
    if combined:
        logging.info(f"[{originId}] Texts to be Combined: {len(combined)}")
        for entry in combined:
            logging.info(f"[{originId}] Combined Text: '{entry['combined_text']}' -> Key Text: '{entry['key_text']}' with Similarity: {entry['similarity']:.2%}")

    # Only report "nothing found" when all three categories came back empty.
    if not (similar or fragments or combined):
        logging.info(f"[{originId}] No significant similarity found.")

    logging.info(f"[{originId}] Direct similarity: {stats['direct_similarity']}")
    logging.info(f"[{originId}] Fragments: {stats['fragments']}")
    logging.info(f"[{originId}] Combined: {stats['combined']}")
|
|
|
def load_image_url(source):
    """Load an image as a grayscale numpy array for OCR.

    ``source`` is either an http(s) URL or a base64-encoded image payload.
    Returns a 2-D uint8 grayscale image.

    Raises ``requests.HTTPError`` when the URL responds with an error status.
    """
    # Disable PIL's decompression-bomb limit; sources are trusted but may be huge.
    Image.MAX_IMAGE_PIXELS = None

    if source.startswith('http'):
        # Bounded timeout so a stalled remote host cannot hang the request
        # forever, and fail fast on HTTP errors instead of trying to decode
        # an error page as image bytes.
        response = requests.get(source, timeout=30)
        response.raise_for_status()
        img = np.asarray(bytearray(response.content), dtype=np.uint8)
        img = cv2.imdecode(img, cv2.IMREAD_GRAYSCALE)
    else:
        img = base64.b64decode(source)
        # Normalize to 3-channel RGB first: COLOR_RGB2GRAY would fail on
        # RGBA, palette, or already-grayscale payloads.
        img = Image.open(BytesIO(img)).convert('RGB')
        img = np.array(img)
        img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)

    return img
|
|
|
def process_image(image, key_texts, similarity_threshold, fragment_threshold):
    """Processes the image, extracts text, and analyzes similarities.

    Returns a 7-tuple: (html_report, statistics df, similar df, fragments df,
    combined df, extracted_texts, gpu_available).  Validation failures and
    exceptions put the message in the first slot and ``None`` in the other
    six, so the tuple arity always matches the success path (the original
    error paths returned only 6 values, breaking the Gradio output binding).
    """
    try:
        if image is None:
            return "Please upload an image for analysis.", None, None, None, None, None, None

        if not key_texts.strip():
            return "Please enter key texts for comparison.", None, None, None, None, None, None

        # OCR on GPU when available, otherwise CPU.
        gpu_available = torch.cuda.is_available()
        extracted_texts = extract_text_from_image(image, gpu_available)

        # Textarea input arrives as a string: one key text per non-empty line.
        if isinstance(key_texts, str):
            key_texts = [text.strip() for text in key_texts.split('\n') if text.strip()]

        results = analyze_similarity(
            extracted_texts,
            key_texts,
            # Thresholds come from 0-100 sliders; helpers expect 0-1 ratios.
            similarity_threshold=similarity_threshold/100,
            fragment_threshold=fragment_threshold/100
        )

        html_report = generate_html_report(results)

        dfs = generate_results_dataframe(results)

        # Empty frames with the right columns keep the UI tables well-formed
        # when a category produced no rows.
        df_statistics = dfs.get("statistics", pd.DataFrame())
        df_similar = dfs.get("similar", pd.DataFrame(columns=["Index", "Original Text", "Key Text", "Similarity"]))
        df_fragments = dfs.get("fragments", pd.DataFrame(columns=["Index", "Original Text", "Key Text", "Similarity"]))
        df_combined = dfs.get("combined", pd.DataFrame(columns=["Indices", "Text 1", "Text 2", "Combined Text", "Key Text", "Similarity"]))

        return html_report, df_statistics, df_similar, df_fragments, df_combined, extracted_texts, gpu_available

    except Exception as e:
        return f"Erro ao processar: {str(e)}", None, None, None, None, None, None
|
|
|
def process_manual_input(texts, key_texts, similarity_threshold, fragment_threshold):
    """Processes the user's manual text input.

    Returns (html_report, statistics df, similar df, fragments df,
    combined df); on validation failure or error the message occupies the
    first slot and the remaining four are ``None``.
    """
    # Both text areas must contain something before analysis can run.
    if not (texts.strip() and key_texts.strip()):
        return "Please enter texts for analysis and key texts for comparison.", None, None, None, None

    try:
        results = analyze_similarity(
            texts,
            key_texts,
            # Sliders are 0-100; the analyzer expects 0-1 ratios.
            similarity_threshold=similarity_threshold/100,
            fragment_threshold=fragment_threshold/100
        )

        report_html = generate_html_report(results)
        frames = generate_results_dataframe(results)

        # Fall back to empty frames with the expected columns so the UI
        # tables stay well-formed when a category has no rows.
        match_cols = ["Index", "Original Text", "Key Text", "Similarity"]
        stats_df = frames.get("statistics", pd.DataFrame())
        similar_df = frames.get("similar", pd.DataFrame(columns=match_cols))
        fragments_df = frames.get("fragments", pd.DataFrame(columns=match_cols))
        combined_df = frames.get(
            "combined",
            pd.DataFrame(columns=["Indices", "Text 1", "Text 2", "Combined Text", "Key Text", "Similarity"])
        )

        return report_html, stats_df, similar_df, fragments_df, combined_df

    except Exception as e:
        return f"Erro ao processar: {str(e)}", None, None, None, None
|
|
|
def generate_html_report(results):
    """Generates an HTML report about the detected similarities.

    Renders a summary box plus one table per non-empty category (direct
    matches, fragments, combinations); falls back to a "nothing found"
    paragraph when all categories are empty.
    """
    html = "<h2>Similarity Report</h2>"

    # Summary box. Fixed: the original emitted "<div padding: ...'>" — the
    # style attribute opener ("style='") was missing, producing invalid HTML.
    html += "<div style='padding: 15px; border-radius: 5px; margin-bottom: 20px;'>"
    html += f"<p><b>Total texts analyzed:</b> {results['statistics']['total_analyzed']}</p>"
    html += f"<p><b>Texts with detected similarity:</b> {results['statistics']['total_processed']}</p>"
    html += "</div>"

    html += "<h3>Detected Similarities</h3>"

    if results["similar_texts"]:
        html += "<h4>Direct Similar Texts</h4>"
        html += "<table width='100%' style='border-collapse: collapse; margin-bottom: 20px;'>"
        html += "<tr><th style='text-align: left; padding: 8px; border: 1px solid #ddd;'>Original Text</th><th style='text-align: left; padding: 8px; border: 1px solid #ddd;'>Key Text</th><th style='text-align: center; padding: 8px; border: 1px solid #ddd;'>Similarity</th></tr>"

        for item in results["similar_texts"]:
            html += f"<tr><td style='padding: 8px; border: 1px solid #ddd;'>{item['text']}</td><td style='padding: 8px; border: 1px solid #ddd;'>{item['key_text']}</td><td style='text-align: center; padding: 8px; border: 1px solid #ddd;'>{item['similarity']:.2%}</td></tr>"

        html += "</table>"

    if results["fragments_detected"]:
        html += "<h4>Text with Detected Fragments</h4>"
        html += "<table width='100%' style='border-collapse: collapse; margin-bottom: 20px;'>"
        html += "<tr><th style='text-align: left; padding: 8px; border: 1px solid #ddd;'>Original Text</th><th style='text-align: left; padding: 8px; border: 1px solid #ddd;'>Key Text</th><th style='text-align: center; padding: 8px; border: 1px solid #ddd;'>Similarity</th></tr>"

        for item in results["fragments_detected"]:
            html += f"<tr><td style='padding: 8px; border: 1px solid #ddd;'>{item['text']}</td><td style='padding: 8px; border: 1px solid #ddd;'>{item['key_text']}</td><td style='text-align: center; padding: 8px; border: 1px solid #ddd;'>{item['similarity']:.2%}</td></tr>"

        html += "</table>"

    if results["combined"]:
        html += "<h4>Text that need to be combined</h4>"
        html += "<table width='100%' style='border-collapse: collapse; margin-bottom: 20px;'>"
        html += "<tr><th style='text-align: left; padding: 8px; border: 1px solid #ddd;'>Text 1</th><th style='text-align: left; padding: 8px; border: 1px solid #ddd;'>Text 2</th><th style='text-align: left; padding: 8px; border: 1px solid #ddd;'>Combination</th><th style='text-align: left; padding: 8px; border: 1px solid #ddd;'>Key Text</th><th style='text-align: center; padding: 8px; border: 1px solid #ddd;'>Similarity</th></tr>"

        for item in results["combined"]:
            html += f"<tr><td style='padding: 8px; border: 1px solid #ddd;'>{item['texts'][0]}</td><td style='padding: 8px; border: 1px solid #ddd;'>{item['texts'][1]}</td><td style='padding: 8px; border: 1px solid #ddd;'>{item['combined_text']}</td><td style='padding: 8px; border: 1px solid #ddd;'>{item['key_text']}</td><td style='text-align: center; padding: 8px; border: 1px solid #ddd;'>{item['similarity']:.2%}</td></tr>"

        html += "</table>"

    if not (results["similar_texts"] or results["fragments_detected"] or results["combined"]):
        html += "<p>No significant similarity found with the current parameters.</p>"

    return html
|
|
|
def generate_results_dataframe(results):
    """Generates pandas DataFrames to visualize the results.

    Returns a dict with an always-present "statistics" frame plus
    "similar", "fragments" and "combined" frames for each non-empty
    category.
    """
    match_columns = ["Index", "Original Text", "Key Text", "Similarity"]
    frames = {}

    similar = results["similar_texts"]
    if similar:
        rows = [
            (entry['index'], entry['text'], entry['key_text'], f"{entry['similarity']:.2%}")
            for entry in similar
        ]
        frames["similar"] = pd.DataFrame(rows, columns=match_columns)

    fragments = results["fragments_detected"]
    if fragments:
        rows = [
            (entry['index'], entry['text'], entry['key_text'], f"{entry['similarity']:.2%}")
            for entry in fragments
        ]
        frames["fragments"] = pd.DataFrame(rows, columns=match_columns)

    combined = results["combined"]
    if combined:
        rows = []
        for entry in combined:
            # Source indices are collapsed into a single "i,j" cell.
            rows.append((
                f"{entry['indices'][0]},{entry['indices'][1]}",
                entry['texts'][0],
                entry['texts'][1],
                entry['combined_text'],
                entry['key_text'],
                f"{entry['similarity']:.2%}",
            ))
        frames["combined"] = pd.DataFrame(
            rows,
            columns=["Indices", "Text 1", "Text 2", "Combined Text", "Key Text", "Similarity"],
        )

    stats = results["statistics"]
    summary = [
        ("Total analyzed", stats["total_analyzed"]),
        ("Total with similarity", stats["total_processed"]),
        ("Direct similarity", stats["direct_similarity"]),
        ("Fragments", stats["fragments"]),
        ("Combined", stats["combined"]),
    ]
    frames["statistics"] = pd.DataFrame(summary, columns=["Metric", "Value"])

    return frames
|
|
|
|
|
|
|
@app.get("/api")
def read_root():
    """Simple probe endpoint showing the FastAPI app is reachable alongside Gradio."""
    payload = {"message": "Hello from FastAPI inside Gradio!"}
    return JSONResponse(content=payload)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|