MarioPrzBasto committed
Commit d0b11df · 1 Parent(s): 51c00f9

Add application file

Files changed (4):
  1. extract_text.py +0 -29
  2. models.py +0 -11
  3. requirements.txt +2 -12
  4. text_similarity.py +0 -125
extract_text.py DELETED
@@ -1,29 +0,0 @@
-import cv2
-import numpy as np
-import easyocr
-import torch
-
-# Initialize EasyOCR
-device = "cuda" if torch.cuda.is_available() else "cpu"
-reader = easyocr.Reader(["en"], gpu=(device == "cuda"), verbose=False)
-
-def extract_text_from_image(img, gpu_available):
-    reader = easyocr.Reader(['en'], gpu=gpu_available, verbose=False)
-
-    img = np.array(img)
-    img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
-
-    # Resizing and blurring
-    scale_factor = 2
-    upscaled = cv2.resize(img, None, fx=scale_factor, fy=scale_factor, interpolation=cv2.INTER_LINEAR)
-    blur_img = cv2.blur(upscaled, (5, 5))
-
-    all_text_found = []
-    text_ = reader.readtext(blur_img, detail=1, paragraph=False, text_threshold=0.3)
-
-    for t in text_:
-        bbox, text, score = t
-        if score > 0.1:  # Filter weak detections
-            all_text_found.append(text)
-
-    return all_text_found
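A minimal sketch of how the deleted helper might have been called (the PIL input image and the GPU flag are assumptions, not taken from the commit):

import torch
from PIL import Image
from extract_text import extract_text_from_image

img = Image.open("screenshot.png")          # hypothetical input image
gpu_available = torch.cuda.is_available()   # mirrors the module's own device check
texts = extract_text_from_image(img, gpu_available)
print(texts)  # list of strings that passed the score > 0.1 filter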
 
models.py DELETED
@@ -1,11 +0,0 @@
-from pydantic import BaseModel
-from typing import List
-
-class RequestModel(BaseModel):
-    originId: int
-    source: str
-
-class TextSimilarityRequest(BaseModel):
-    imageInfo: RequestModel
-    keyTexts: List[str]
-    similarityThreshold: float
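A minimal sketch of how these pydantic models would back a FastAPI endpoint (the route path and handler body are assumptions; only the model definitions come from the deleted file):

from typing import List
from fastapi import FastAPI
from pydantic import BaseModel

class RequestModel(BaseModel):
    originId: int
    source: str

class TextSimilarityRequest(BaseModel):
    imageInfo: RequestModel
    keyTexts: List[str]
    similarityThreshold: float

app = FastAPI()

@app.post("/similarity")  # hypothetical route
def check_similarity(request: TextSimilarityRequest):
    # FastAPI validates the JSON body against the model before the handler runs
    return {"originId": request.imageInfo.originId, "keyCount": len(request.keyTexts)}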
 
requirements.txt CHANGED
@@ -1,12 +1,2 @@
-opencv-python
-numpy
-matplotlib
-easyocr
-scikit-image
-pillow
-pandas
-torch
-uvicorn
-gradio
-requests
-starlette
+fastapi
+uvicorn
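The trimmed requirements leave just enough to serve a bare FastAPI app. A minimal sketch, assuming the entry module is named app.py (the module name, route, and port are assumptions, not part of the commit):

# app.py
from fastapi import FastAPI

app = FastAPI()

@app.get("/health")
def health():
    return {"status": "ok"}

# Run with: uvicorn app:app --host 0.0.0.0 --port 7860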
 
text_similarity.py DELETED
@@ -1,125 +0,0 @@
-import re
-from difflib import SequenceMatcher
-from collections import defaultdict
-
-def extract_special_characters(text):
-    """Extracts all special characters found in a text."""
-    characters = re.findall(r'[^\w\s]', text)  # Finds non-alphanumeric, non-space characters
-    return ''.join(characters)
-
-def clean_text(text, keep=""):
-    """Removes special characters except those in 'keep', and converts to lowercase."""
-    pattern = rf'[^\w\s{re.escape(keep)}]'
-    return re.sub(pattern, '', text.lower())
-
-def text_similarity(text, key_text):
-    """Calculates the similarity between two texts using SequenceMatcher."""
-    return SequenceMatcher(None, text, key_text).ratio()
-
-def detect_fragments(text, key_texts, threshold=0.7):
-    """Checks if a text contains fragments of key texts."""
-    for key_text in key_texts:
-        characters_to_not_clean = extract_special_characters(key_text)
-        words = clean_text(text, characters_to_not_clean).split()
-
-        key_words = key_text.split()
-
-        # If the text is too short, we can't form an effective sliding window
-        if len(words) < len(key_words):
-            similarity = text_similarity(text, key_text)
-            if similarity >= threshold:
-                return True, key_text, similarity
-            continue
-
-        # Sliding window to compare word sequences
-        for i in range(len(words) - len(key_words) + 1):
-            fragment = " ".join(words[i:i+len(key_words)])
-            similarity = text_similarity(fragment, key_text)
-            if similarity >= threshold:
-                return True, key_text, similarity
-    return False, None, 0
-
-def analyze_similarity(text_list, key_texts, similarity_threshold=0.7, fragment_threshold=0.7):
-    """
-    Analyzes the similarity between a list of texts and key texts.
-    Returns a detailed report on the similarities found.
-    """
-    results = {
-        "similar_texts": [],
-        "fragments_detected": [],
-        "combined": [],
-        "statistics": defaultdict(int)
-    }
-
-    processed_texts = set()
-
-    # Check direct similarity
-    for i, text in enumerate(text_list):
-        if not text.strip():
-            continue
-
-        for key_text in key_texts:
-            if not key_text.strip():
-                continue
-
-            similarity = text_similarity(text, key_text)
-            if similarity >= similarity_threshold:
-                results["similar_texts"].append({
-                    "index": i,
-                    "text": text,
-                    "key_text": key_text,
-                    "similarity": similarity
-                })
-                results["statistics"]["direct_similarity"] += 1
-                processed_texts.add(i)
-
-    # Check fragments
-    # for i, text in enumerate(text_list):
-    #     if i in processed_texts or not text.strip():
-    #         continue
-
-    #     has_fragment, key_text, similarity = detect_fragments(text, key_texts, fragment_threshold)
-    #     if has_fragment:
-    #         results["fragments_detected"].append({
-    #             "index": i,
-    #             "text": text,
-    #             "key_text": key_text,
-    #             "similarity": similarity
-    #         })
-    #         results["statistics"]["fragments"] += 1
-    #         processed_texts.add(i)
-
-    # Check texts that can be combined
-    for i in range(len(text_list)):
-        if i in processed_texts or not text_list[i].strip():
-            continue
-
-        for j in range(i+1, len(text_list)):
-            if j in processed_texts or not text_list[j].strip():
-                continue
-
-            combined_text = text_list[i] + " " + text_list[j]
-            for key_text in key_texts:
-                if not key_text.strip():
-                    continue
-
-                similarity = text_similarity(combined_text, key_text)
-                if similarity >= similarity_threshold:
-                    results["combined"].append({
-                        "indices": [i, j],
-                        "texts": [text_list[i], text_list[j]],
-                        "combined_text": combined_text,
-                        "key_text": key_text,
-                        "similarity": similarity
-                    })
-                    results["statistics"]["combined"] += 1
-                    processed_texts.add(i)
-                    processed_texts.add(j)
-                    break
-
-    # Calculate overall statistics
-    valid_texts = sum(1 for text in text_list if text.strip())
-    results["statistics"]["total_analyzed"] = valid_texts
-    results["statistics"]["total_processed"] = len(processed_texts)
-
-    return results
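A minimal sketch of how the deleted module's entry point would be driven, for example with the OCR lines produced by extract_text.py (the sample strings and threshold are assumptions):

from text_similarity import analyze_similarity

ocr_lines = ["Total amount", "due: 42.00", "Invoice number 123"]
key_texts = ["total amount due"]

report = analyze_similarity(ocr_lines, key_texts, similarity_threshold=0.7)
print(report["similar_texts"])  # direct matches at or above the threshold
print(report["combined"])       # line pairs that only match once concatenated
print(report["statistics"])     # defaultdict with counts per match type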