# NOTE(review): the three lines that stood here ("Spaces:" / "Sleeping" /
# "Sleeping") were residue from a Hugging Face Spaces status banner scraped
# together with this file — they were never part of the program.
import logging | |
import cv2 | |
import numpy as np | |
import requests | |
import torch | |
import base64 | |
import gradio as gr | |
from PIL import Image | |
from io import BytesIO | |
from fastapi import FastAPI | |
from models import TextSimilarityRequest | |
from extract_text import extract_text_from_image | |
from text_similarity import analyze_similarity | |
from app import generate_gradio | |
# Configure root logging once for the whole service.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

# FastAPI application instance; the Gradio UI is mounted onto it at startup.
app = FastAPI()
async def text_similarity(request: TextSimilarityRequest):
    """Return the percentage of ``request.keyTexts`` matched in the request's image.

    Loads the image (URL or base64), OCRs it, compares the extracted texts
    against the key texts, logs a detailed report, and returns the share of
    key texts that passed either by direct similarity or by combination.

    NOTE(review): no route decorator (e.g. ``@app.post(...)``) is visible in
    this view — confirm the endpoint is registered on ``app`` elsewhere.
    """
    image_info = request.imageInfo
    key_texts = request.keyTexts
    similarity_threshold = request.similarityThreshold
    origin_id = image_info.originId
    logging.info(f"Checking text similarity for main source with resource id {origin_id}")

    # Guard: with no key texts there is nothing to match, and the percentage
    # below would divide by zero. Report 0% and skip the OCR work entirely.
    if not key_texts:
        logging.warning(f"No key texts provided for resource id {origin_id}; returning 0%")
        return 0.0

    image = load_image_url(image_info.source)
    # Extract text from the image, using the GPU when one is available.
    gpu_available = torch.cuda.is_available()
    extracted_texts = extract_text_from_image(image, gpu_available)

    results = analyze_similarity(
        extracted_texts,
        key_texts,
        similarity_threshold=similarity_threshold / 100,  # Convert percentage to decimal
        fragment_threshold=100 / 100,  # Convert percentage to decimal
    )
    log_similarity_report(results, origin_id)

    total_texts = len(key_texts)
    # A key text counts as passed if matched directly or via combined fragments.
    passed_texts = results["statistics"]["direct_similarity"] + results["statistics"]["combined"]
    percentage_passed = (passed_texts / total_texts) * 100
    logging.info(f"Text similarity for main source with resource id {origin_id} is {percentage_passed}%")
    return percentage_passed
def log_similarity_report(results, originId):
    """Log a human-readable similarity report for one resource.

    Emits overall statistics, then one section per match category
    (direct matches, fragments, combined texts), and a fallback line
    when no category produced any result.
    """
    stats = results["statistics"]
    logging.info(f"[{originId}] Total texts analyzed: {stats['total_analyzed']}")
    logging.info(f"[{originId}] Texts with detected similarity: {stats['total_processed']}")

    # (results key, section header, per-entry label, field holding the matched text)
    sections = (
        ("similar_texts", "Direct Similar Texts Found", "Similar Text", "text"),
        ("fragments_detected", "Fragments Detected", "Fragment", "text"),
        ("combined", "Texts to be Combined", "Combined Text", "combined_text"),
    )
    for key, header, label, text_field in sections:
        entries = results[key]
        if not entries:
            continue
        logging.info(f"[{originId}] {header}: {len(entries)}")
        for entry in entries:
            logging.info(
                f"[{originId}] {label}: '{entry[text_field]}' -> Key Text: "
                f"'{entry['key_text']}' with Similarity: {entry['similarity']:.2%}"
            )

    # Fallback when every category came back empty.
    if not any(results[key] for key, _, _, _ in sections):
        logging.info(f"[{originId}] No significant similarity found.")

    logging.info(f"[{originId}] Direct similarity: {stats['direct_similarity']}")
    logging.info(f"[{originId}] Fragments: {stats['fragments']}")
    logging.info(f"[{originId}] Combined: {stats['combined']}")
def load_image_url(source):
    """Load an image as a grayscale numpy array.

    ``source`` is either an HTTP(S) URL (fetched with requests) or a
    base64-encoded image payload (decoded via PIL).

    Raises:
        requests.HTTPError: if the URL responds with an error status.
        ValueError: if the downloaded bytes cannot be decoded as an image.
    """
    # Disable PIL's decompression-bomb limit: sources are expected to be
    # large, trusted images. NOTE(review): confirm sources are trusted —
    # this also disables a safety check against malicious inputs.
    Image.MAX_IMAGE_PIXELS = None
    if source.startswith('http'):
        # Timeout so a stalled server cannot hang the request forever;
        # raise_for_status so an HTML error page is not fed to imdecode.
        response = requests.get(source, timeout=30)
        response.raise_for_status()
        buffer = np.asarray(bytearray(response.content), dtype=np.uint8)
        img = cv2.imdecode(buffer, cv2.IMREAD_GRAYSCALE)
        if img is None:
            # imdecode signals failure by returning None rather than raising.
            raise ValueError(f"Could not decode image fetched from {source}")
    else:
        raw = base64.b64decode(source)
        # Normalize the mode first: cvtColor(RGB2GRAY) requires 3 channels,
        # but PIL may open grayscale, palette, or RGBA images.
        pil_img = Image.open(BytesIO(raw)).convert("RGB")
        img = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2GRAY)
    return img
async def startup_event():
    """Mount the Gradio UI at the application root.

    NOTE(review): no ``@app.on_event("startup")`` decorator is visible in
    this view — confirm this hook is actually registered with FastAPI.
    """
    demo = generate_gradio()
    gr.mount_gradio_app(app, demo, path="/")