MarioPrzBasto committed
Commit d0b11df · 1 Parent(s): 51c00f9

Add application file

Files changed (4):
  1. extract_text.py +0 -29
  2. models.py +0 -11
  3. requirements.txt +2 -12
  4. text_similarity.py +0 -125
extract_text.py DELETED
@@ -1,29 +0,0 @@
-import cv2
-import numpy as np
-import easyocr
-import torch
-
-# Initialize EasyOCR
-device = "cuda" if torch.cuda.is_available() else "cpu"
-reader = easyocr.Reader(["en"], gpu=(device == "cuda"), verbose=False)
-
-def extract_text_from_image(img, gpu_available):
-    reader = easyocr.Reader(['en'], gpu=gpu_available, verbose=False)
-
-    img = np.array(img)
-    img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
-
-    # Resizing and blurring
-    scale_factor = 2
-    upscaled = cv2.resize(img, None, fx=scale_factor, fy=scale_factor, interpolation=cv2.INTER_LINEAR)
-    blur_img = cv2.blur(upscaled, (5, 5))
-
-    all_text_found = []
-    text_ = reader.readtext(blur_img, detail=1, paragraph=False, text_threshold=0.3)
-
-    for t in text_:
-        bbox, text, score = t
-        if score > 0.1:  # Filter weak detections
-            all_text_found.append(text)
-
-    return all_text_found
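A minimal sketch of how the deleted helper might have been called (the PIL input image and the GPU flag are assumptions, not taken from the commit):

import torch
from PIL import Image
from extract_text import extract_text_from_image

img = Image.open("screenshot.png")          # hypothetical input image
gpu_available = torch.cuda.is_available()   # mirrors the module's own device check
texts = extract_text_from_image(img, gpu_available)
print(texts)  # list of strings that passed the score > 0.1 filter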
 
models.py DELETED
@@ -1,11 +0,0 @@
-from pydantic import BaseModel
-from typing import List
-
-class RequestModel(BaseModel):
-    originId: int
-    source: str
-
-class TextSimilarityRequest(BaseModel):
-    imageInfo: RequestModel
-    keyTexts: List[str]
-    similarityThreshold: float
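A minimal sketch of how these pydantic models would back a FastAPI endpoint (the route path and handler body are assumptions; only the model definitions come from the deleted file):

from typing import List
from fastapi import FastAPI
from pydantic import BaseModel

class RequestModel(BaseModel):
    originId: int
    source: str

class TextSimilarityRequest(BaseModel):
    imageInfo: RequestModel
    keyTexts: List[str]
    similarityThreshold: float

app = FastAPI()

@app.post("/similarity")  # hypothetical route
def check_similarity(request: TextSimilarityRequest):
    # FastAPI validates the JSON body against the model before the handler runs
    return {"originId": request.imageInfo.originId, "keyCount": len(request.keyTexts)}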
 
requirements.txt CHANGED
@@ -1,12 +1,2 @@
-opencv-python
-numpy
-matplotlib
-easyocr
-scikit-image
-pillow
-pandas
-torch
-uvicorn
-gradio
-requests
-starlette
+fastapi
+uvicorn
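The trimmed requirements leave just enough to serve a bare FastAPI app. A minimal sketch, assuming the entry module is named app.py (the module name, route, and port are assumptions, not part of the commit):

# app.py
from fastapi import FastAPI

app = FastAPI()

@app.get("/health")
def health():
    return {"status": "ok"}

# Run with: uvicorn app:app --host 0.0.0.0 --port 7860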
 
text_similarity.py DELETED
@@ -1,125 +0,0 @@
-import re
-from difflib import SequenceMatcher
-from collections import defaultdict
-
-def extract_special_characters(text):
-    """Extracts all special characters found in a text."""
-    characters = re.findall(r'[^\w\s]', text)  # Finds non-alphanumeric, non-space characters
-    return ''.join(characters)
-
-def clean_text(text, keep=""):
-    """Removes special characters except those in 'keep', and converts to lowercase."""
-    pattern = rf'[^\w\s{re.escape(keep)}]'
-    return re.sub(pattern, '', text.lower())
-
-def text_similarity(text, key_text):
-    """Calculates the similarity between two texts using SequenceMatcher."""
-    return SequenceMatcher(None, text, key_text).ratio()
-
-def detect_fragments(text, key_texts, threshold=0.7):
-    """Checks if a text contains fragments of key texts."""
-    for key_text in key_texts:
-        characters_to_not_clean = extract_special_characters(key_text)
-        words = clean_text(text, characters_to_not_clean).split()
-
-        key_words = key_text.split()
-
-        # If the text is too short, we can't form an effective sliding window
-        if len(words) < len(key_words):
-            similarity = text_similarity(text, key_text)
-            if similarity >= threshold:
-                return True, key_text, similarity
-            continue
-
-        # Sliding window to compare word sequences
-        for i in range(len(words) - len(key_words) + 1):
-            fragment = " ".join(words[i:i+len(key_words)])
-            similarity = text_similarity(fragment, key_text)
-            if similarity >= threshold:
-                return True, key_text, similarity
-    return False, None, 0
-
-def analyze_similarity(text_list, key_texts, similarity_threshold=0.7, fragment_threshold=0.7):
-    """
-    Analyzes the similarity between a list of texts and key texts.
-    Returns a detailed report on the similarities found.
-    """
-    results = {
-        "similar_texts": [],
-        "fragments_detected": [],
-        "combined": [],
-        "statistics": defaultdict(int)
-    }
-
-    processed_texts = set()
-
-    # Check direct similarity
-    for i, text in enumerate(text_list):
-        if not text.strip():
-            continue
-
-        for key_text in key_texts:
-            if not key_text.strip():
-                continue
-
-            similarity = text_similarity(text, key_text)
-            if similarity >= similarity_threshold:
-                results["similar_texts"].append({
-                    "index": i,
-                    "text": text,
-                    "key_text": key_text,
-                    "similarity": similarity
-                })
-                results["statistics"]["direct_similarity"] += 1
-                processed_texts.add(i)
-
-    # Check fragments
-    # for i, text in enumerate(text_list):
-    #     if i in processed_texts or not text.strip():
-    #         continue
-
-    #     has_fragment, key_text, similarity = detect_fragments(text, key_texts, fragment_threshold)
-    #     if has_fragment:
-    #         results["fragments_detected"].append({
-    #             "index": i,
-    #             "text": text,
-    #             "key_text": key_text,
-    #             "similarity": similarity
-    #         })
-    #         results["statistics"]["fragments"] += 1
-    #         processed_texts.add(i)
-
-    # Check texts that can be combined
-    for i in range(len(text_list)):
-        if i in processed_texts or not text_list[i].strip():
-            continue
-
-        for j in range(i+1, len(text_list)):
-            if j in processed_texts or not text_list[j].strip():
-                continue
-
-            combined_text = text_list[i] + " " + text_list[j]
-            for key_text in key_texts:
-                if not key_text.strip():
-                    continue
-
-                similarity = text_similarity(combined_text, key_text)
-                if similarity >= similarity_threshold:
-                    results["combined"].append({
-                        "indices": [i, j],
-                        "texts": [text_list[i], text_list[j]],
-                        "combined_text": combined_text,
-                        "key_text": key_text,
-                        "similarity": similarity
-                    })
-                    results["statistics"]["combined"] += 1
-                    processed_texts.add(i)
-                    processed_texts.add(j)
-                    break
-
-    # Calculate overall statistics
-    valid_texts = sum(1 for text in text_list if text.strip())
-    results["statistics"]["total_analyzed"] = valid_texts
-    results["statistics"]["total_processed"] = len(processed_texts)
-
-    return results
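A minimal sketch of how the deleted module's entry point would be driven, for example with the OCR lines produced by extract_text.py (the sample strings and threshold are assumptions):

from text_similarity import analyze_similarity

ocr_lines = ["Total amount", "due: 42.00", "Invoice number 123"]
key_texts = ["total amount due"]

report = analyze_similarity(ocr_lines, key_texts, similarity_threshold=0.7)
print(report["similar_texts"])  # direct matches at or above the threshold
print(report["combined"])       # line pairs that only match once concatenated
print(report["statistics"])     # defaultdict with counts per match type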