MarioPrzBasto committed
Commit 2f73fd7 · 1 Parent(s): 72ac86a

Add application file

Files changed (9)
  1. .gitattributes +0 -35
  2. Dockerfile +9 -4
  3. README.md +5 -7
  4. app.py +318 -13
  5. extract_text.py +29 -0
  6. main.py +107 -0
  7. models.py +11 -0
  8. requirements.txt +10 -6
  9. text_similarity.py +125 -0
.gitattributes DELETED
@@ -1,35 +0,0 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
Dockerfile CHANGED
@@ -1,10 +1,15 @@
  FROM python:3.9

- COPY . .
- WORKDIR /
- RUN pip install --no-cache-dir --upgrade -r /requirements.txt
- CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
+ WORKDIR /app
+ RUN mkdir -p /app/.EasyOCR && chmod 777 /app/.EasyOCR
+ ENV EASYOCR_MODULE_PATH="/app/.EasyOCR"
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
+ COPY ./*.py /app/
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
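Example (not part of this commit): the new image pre-creates a writable model cache and points EasyOCR at it via EASYOCR_MODULE_PATH, since the container user may not be able to write to its home directory and EasyOCR downloads its weights at runtime. A minimal in-container sanity check, assuming the paths set above:

import os

# Both should hold inside a container built from the Dockerfile above
print(os.environ.get("EASYOCR_MODULE_PATH"))  # expected: /app/.EasyOCR
print(os.access("/app/.EasyOCR", os.W_OK))    # expected: True, so EasyOCR can store models here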
README.md CHANGED
@@ -1,11 +1,9 @@
  ---
- title: Key Texts Image Finder
- emoji: 🏆
- colorFrom: purple
- colorTo: red
- sdk: gradio
- sdk_version: 5.22.0
- app_file: app.py
+ title: Similarity
+ emoji: 🌍
+ colorFrom: indigo
+ colorTo: gray
+ sdk: docker
  pinned: false
  ---
app.py CHANGED
@@ -1,17 +1,322 @@
- from fastapi import FastAPI
- from fastapi.middleware.cors import CORSMiddleware
-
- app = FastAPI()
-
- app.add_middleware(
-     CORSMiddleware,
-     allow_origins=["*"],
-     allow_credentials=True,
-     allow_methods=["*"],
-     allow_headers=["*"],
- )
-
- @app.get("/", tags=["Home"])
- def api_home():
-     return {'detail': 'Welcome to FastAPI TextGen Tutorial!'}
+ import gradio as gr
+ import pandas as pd
+ import torch
+ from extract_text import extract_text_from_image
+ from text_similarity import analyze_similarity
+
+ def process_image(image, key_texts, similarity_threshold, fragment_threshold):
+     """Processes the image, extracts text, and analyzes similarities."""
+     try:
+         if image is None:
+             return "Please upload an image for analysis.", None, None, None, None, None, None
+
+         if not key_texts.strip():
+             return "Please enter key texts for comparison.", None, None, None, None, None, None
+
+         # Extract text from the image using the user's method
+         gpu_available = torch.cuda.is_available()
+         extracted_texts = extract_text_from_image(image, gpu_available)
+
+         if isinstance(key_texts, str):
+             key_texts = [text.strip() for text in key_texts.split('\n') if text.strip()]
+
+         # Process the analysis
+         results = analyze_similarity(
+             extracted_texts,
+             key_texts,
+             similarity_threshold=similarity_threshold/100,  # Convert percentage to decimal
+             fragment_threshold=fragment_threshold/100  # Convert percentage to decimal
+         )
+
+         # Generate HTML report
+         html_report = generate_html_report(results)
+
+         # Generate DataFrames
+         dfs = generate_results_dataframe(results)
+
+         # Extract individual DataFrames (or create empty ones if they don't exist)
+         df_statistics = dfs.get("statistics", pd.DataFrame())
+         df_similar = dfs.get("similar", pd.DataFrame(columns=["Index", "Original Text", "Key Text", "Similarity"]))
+         df_fragments = dfs.get("fragments", pd.DataFrame(columns=["Index", "Original Text", "Key Text", "Similarity"]))
+         df_combined = dfs.get("combined", pd.DataFrame(columns=["Indices", "Text 1", "Text 2", "Combined Text", "Key Text", "Similarity"]))
+
+         return html_report, df_statistics, df_similar, df_fragments, df_combined, extracted_texts, gpu_available
+
+     except Exception as e:
+         # Match the arity of the success path (seven outputs)
+         return f"Processing error: {str(e)}", None, None, None, None, None, None
+
+ def process_manual_input(texts, key_texts, similarity_threshold, fragment_threshold):
+     """Processes the user's manual text input."""
+     # Validate input
+     if not texts.strip() or not key_texts.strip():
+         return "Please enter texts for analysis and key texts for comparison.", None, None, None, None
+
+     try:
+         # Split the multiline inputs into lists (one entry per line);
+         # analyze_similarity expects lists, not raw strings
+         texts = [text.strip() for text in texts.split('\n') if text.strip()]
+         key_texts = [text.strip() for text in key_texts.split('\n') if text.strip()]
+
+         # Process the analysis
+         results = analyze_similarity(
+             texts,
+             key_texts,
+             similarity_threshold=similarity_threshold/100,  # Convert percentage to decimal
+             fragment_threshold=fragment_threshold/100  # Convert percentage to decimal
+         )
+
+         # Generate HTML report
+         html_report = generate_html_report(results)
+
+         # Generate DataFrames
+         dfs = generate_results_dataframe(results)
+
+         # Extract individual DataFrames (or create empty ones if they don't exist)
+         df_statistics = dfs.get("statistics", pd.DataFrame())
+         df_similar = dfs.get("similar", pd.DataFrame(columns=["Index", "Original Text", "Key Text", "Similarity"]))
+         df_fragments = dfs.get("fragments", pd.DataFrame(columns=["Index", "Original Text", "Key Text", "Similarity"]))
+         df_combined = dfs.get("combined", pd.DataFrame(columns=["Indices", "Text 1", "Text 2", "Combined Text", "Key Text", "Similarity"]))
+
+         return html_report, df_statistics, df_similar, df_fragments, df_combined
+
+     except Exception as e:
+         return f"Processing error: {str(e)}", None, None, None, None
+
+ def generate_html_report(results):
+     """Generates an HTML report about the detected similarities."""
+     html = "<h2>Similarity Report</h2>"
+
+     # General statistics
+     html += "<div style='padding: 15px; border-radius: 5px; margin-bottom: 20px;'>"
+     html += f"<p><b>Total texts analyzed:</b> {results['statistics']['total_analyzed']}</p>"
+     html += f"<p><b>Texts with detected similarity:</b> {results['statistics']['total_processed']}</p>"
+     html += "</div>"
+
+     # Results table
+     html += "<h3>Detected Similarities</h3>"
+
+     # Similar texts
+     if results["similar_texts"]:
+         html += "<h4>Direct Similar Texts</h4>"
+         html += "<table width='100%' style='border-collapse: collapse; margin-bottom: 20px;'>"
+         html += "<tr><th style='text-align: left; padding: 8px; border: 1px solid #ddd;'>Original Text</th><th style='text-align: left; padding: 8px; border: 1px solid #ddd;'>Key Text</th><th style='text-align: center; padding: 8px; border: 1px solid #ddd;'>Similarity</th></tr>"
+
+         for item in results["similar_texts"]:
+             html += f"<tr><td style='padding: 8px; border: 1px solid #ddd;'>{item['text']}</td><td style='padding: 8px; border: 1px solid #ddd;'>{item['key_text']}</td><td style='text-align: center; padding: 8px; border: 1px solid #ddd;'>{item['similarity']:.2%}</td></tr>"
+
+         html += "</table>"
+
+     # Detected fragments
+     if results["fragments_detected"]:
+         html += "<h4>Texts with Detected Fragments</h4>"
+         html += "<table width='100%' style='border-collapse: collapse; margin-bottom: 20px;'>"
+         html += "<tr><th style='text-align: left; padding: 8px; border: 1px solid #ddd;'>Original Text</th><th style='text-align: left; padding: 8px; border: 1px solid #ddd;'>Key Text</th><th style='text-align: center; padding: 8px; border: 1px solid #ddd;'>Similarity</th></tr>"
+
+         for item in results["fragments_detected"]:
+             html += f"<tr><td style='padding: 8px; border: 1px solid #ddd;'>{item['text']}</td><td style='padding: 8px; border: 1px solid #ddd;'>{item['key_text']}</td><td style='text-align: center; padding: 8px; border: 1px solid #ddd;'>{item['similarity']:.2%}</td></tr>"
+
+         html += "</table>"
+
+     # Combined texts
+     if results["combined"]:
+         html += "<h4>Texts that need to be combined</h4>"
+         html += "<table width='100%' style='border-collapse: collapse; margin-bottom: 20px;'>"
+         html += "<tr><th style='text-align: left; padding: 8px; border: 1px solid #ddd;'>Text 1</th><th style='text-align: left; padding: 8px; border: 1px solid #ddd;'>Text 2</th><th style='text-align: left; padding: 8px; border: 1px solid #ddd;'>Combination</th><th style='text-align: left; padding: 8px; border: 1px solid #ddd;'>Key Text</th><th style='text-align: center; padding: 8px; border: 1px solid #ddd;'>Similarity</th></tr>"
+
+         for item in results["combined"]:
+             html += f"<tr><td style='padding: 8px; border: 1px solid #ddd;'>{item['texts'][0]}</td><td style='padding: 8px; border: 1px solid #ddd;'>{item['texts'][1]}</td><td style='padding: 8px; border: 1px solid #ddd;'>{item['combined_text']}</td><td style='padding: 8px; border: 1px solid #ddd;'>{item['key_text']}</td><td style='text-align: center; padding: 8px; border: 1px solid #ddd;'>{item['similarity']:.2%}</td></tr>"
+
+         html += "</table>"
+
+     if not (results["similar_texts"] or results["fragments_detected"] or results["combined"]):
+         html += "<p>No significant similarity found with the current parameters.</p>"
+
+     return html
+
+ def generate_results_dataframe(results):
+     """Generates pandas DataFrames to visualize the results."""
+     dfs = {}
+
+     # DataFrame for similar texts
+     if results["similar_texts"]:
+         data = [(item['index'], item['text'], item['key_text'], f"{item['similarity']:.2%}")
+                 for item in results["similar_texts"]]
+         dfs["similar"] = pd.DataFrame(data, columns=["Index", "Original Text", "Key Text", "Similarity"])
+
+     # DataFrame for fragments
+     if results["fragments_detected"]:
+         data = [(item['index'], item['text'], item['key_text'], f"{item['similarity']:.2%}")
+                 for item in results["fragments_detected"]]
+         dfs["fragments"] = pd.DataFrame(data, columns=["Index", "Original Text", "Key Text", "Similarity"])
+
+     # DataFrame for combined texts
+     if results["combined"]:
+         data = [(f"{item['indices'][0]},{item['indices'][1]}",
+                  item['texts'][0],
+                  item['texts'][1],
+                  item['combined_text'],
+                  item['key_text'],
+                  f"{item['similarity']:.2%}")
+                 for item in results["combined"]]
+         dfs["combined"] = pd.DataFrame(data, columns=["Indices", "Text 1", "Text 2",
+                                                       "Combined Text", "Key Text", "Similarity"])
+
+     # Statistics DataFrame
+     data = [
+         ("Total analyzed", results["statistics"]["total_analyzed"]),
+         ("Total with similarity", results["statistics"]["total_processed"]),
+         ("Direct similarity", results["statistics"]["direct_similarity"]),
+         ("Fragments", results["statistics"]["fragments"]),
+         ("Combined", results["statistics"]["combined"])
+     ]
+     dfs["statistics"] = pd.DataFrame(data, columns=["Metric", "Value"])
+
+     return dfs
+
+ def generate_gradio():
+     with gr.Blocks(title="Text Similarity Detector") as demo:
+         gr.Markdown("# 🔍 Text Similarity Detector with Image Extraction")
+         gr.Markdown("""
+         This tool analyzes the similarity between texts extracted from an image and reference key texts.
+         It can identify:
+         - Directly similar texts
+         - Key text fragments within the texts
+         - Text combinations that match key texts
+         """)
+
+         with gr.Tabs() as tabs:
+             with gr.TabItem("Image Analysis"):
+                 with gr.Row():
+                     with gr.Column(scale=1):  # Column for inputs on the left
+                         input_image = gr.Image(label="Upload an image to extract text", type="pil", height=600)
+                         key_texts_image = gr.Textbox(
+                             label="Key Texts for Comparison",
+                             placeholder="Paste your key texts here (one per line)",
+                             lines=5
+                         )
+
+                         with gr.Row():
+                             similarity_threshold_image = gr.Slider(
+                                 label="Similarity Threshold (%)",
+                                 minimum=50,
+                                 maximum=100,
+                                 value=70,
+                                 step=1
+                             )
+                             fragment_threshold_image = gr.Slider(
+                                 label="Fragment Similarity Threshold (%)",
+                                 minimum=50,
+                                 maximum=100,
+                                 value=70,
+                                 step=1
+                             )
+
+                         analyze_image_btn = gr.Button("Analyze Image", variant="primary")
+
+                     with gr.Column(scale=1):  # Column for outputs on the right
+                         gpu_available = gr.Checkbox(label="Used GPU")
+                         extracted_texts = gr.Textbox(label="Extracted Texts from the Image", lines=5)
+                         html_output = gr.HTML(label="Similarity Report")
+                         with gr.Tabs():
+                             with gr.TabItem("Statistics"):
+                                 statistics_output = gr.Dataframe(label="Statistics")
+                             with gr.TabItem("Direct Similarity"):
+                                 similar_texts_output = gr.Dataframe(label="Direct Similar Texts")
+                             with gr.TabItem("Fragments"):
+                                 fragments_output = gr.Dataframe(label="Texts with Fragments")
+                             with gr.TabItem("Combined"):
+                                 combined_output = gr.Dataframe(label="Combined Texts")
+
+             with gr.TabItem("Manual Analysis"):
+                 with gr.Row():
+                     with gr.Column(scale=1):  # Column for inputs on the left
+                         input_texts = gr.Textbox(
+                             label="List of Texts for Analysis",
+                             placeholder="Paste your list of texts here (one per line)",
+                             lines=10
+                         )
+                         key_texts_input = gr.Textbox(
+                             label="Key Texts for Comparison",
+                             placeholder="Paste your key texts here (one per line)",
+                             lines=5
+                         )
+
+                         with gr.Row():
+                             similarity_threshold = gr.Slider(
+                                 label="Similarity Threshold (%)",
+                                 minimum=50,
+                                 maximum=100,
+                                 value=70,
+                                 step=1
+                             )
+                             fragment_threshold = gr.Slider(
+                                 label="Fragment Similarity Threshold (%)",
+                                 minimum=50,
+                                 maximum=100,
+                                 value=70,
+                                 step=1
+                             )
+
+                         analyze_btn = gr.Button("Analyze Texts", variant="primary")
+
+                     with gr.Column(scale=1):  # Column for outputs on the right
+                         html_output_manual = gr.HTML(label="Manual Similarity Report")
+                         with gr.Tabs():
+                             with gr.TabItem("Statistics"):
+                                 statistics_output_manual = gr.Dataframe(label="Statistics")
+                             with gr.TabItem("Direct Similarity"):
+                                 similar_texts_output_manual = gr.Dataframe(label="Direct Similar Texts")
+                             with gr.TabItem("Fragments"):
+                                 fragments_output_manual = gr.Dataframe(label="Texts with Fragments")
+                             with gr.TabItem("Combined"):
+                                 combined_output_manual = gr.Dataframe(label="Combined Texts")
+
+         # Connect the image processing function to the button
+         analyze_image_btn.click(
+             process_image,
+             inputs=[input_image, key_texts_image, similarity_threshold_image, fragment_threshold_image],
+             outputs=[html_output, statistics_output, similar_texts_output, fragments_output, combined_output, extracted_texts, gpu_available]
+         )
+
+         # Connect the manual text processing function to the button
+         analyze_btn.click(
+             process_manual_input,
+             inputs=[input_texts, key_texts_input, similarity_threshold, fragment_threshold],
+             outputs=[html_output_manual, statistics_output_manual, similar_texts_output_manual, fragments_output_manual, combined_output_manual]
+         )
+
+     return demo
+
+ if __name__ == "__main__":
+     generate_gradio().launch()
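Example (not part of this commit): a quick smoke test of the manual-analysis path, assuming app.py is importable; the sample strings are invented.

from app import process_manual_input

html, stats, similar, fragments, combined = process_manual_input(
    "total amount due\nsomething unrelated",  # texts under analysis, one per line
    "total amount due",                       # key texts, one per line
    70,                                       # similarity threshold, percent
    70,                                       # fragment threshold, percent
)
print(stats)  # statistics DataFrame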
extract_text.py ADDED
@@ -0,0 +1,29 @@
+ import cv2
+ import numpy as np
+ import easyocr
+
+ def extract_text_from_image(img, gpu_available):
+     # Initialize EasyOCR (model weights are downloaded on first use)
+     reader = easyocr.Reader(['en'], gpu=gpu_available, verbose=False)
+
+     img = np.array(img)
+     if img.ndim == 3:
+         # The Gradio path supplies an RGB PIL image; the API path already passes grayscale
+         img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
+
+     # Resizing and blurring
+     scale_factor = 2
+     upscaled = cv2.resize(img, None, fx=scale_factor, fy=scale_factor, interpolation=cv2.INTER_LINEAR)
+     blur_img = cv2.blur(upscaled, (5, 5))
+
+     all_text_found = []
+     text_ = reader.readtext(blur_img, detail=1, paragraph=False, text_threshold=0.3)
+
+     for t in text_:
+         bbox, text, score = t
+         if score > 0.1:  # Filter weak detections
+             all_text_found.append(text)
+
+     return all_text_found
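Example (not part of this commit): minimal usage sketch; the image filename is a placeholder.

from PIL import Image
from extract_text import extract_text_from_image

img = Image.open("sample.png").convert("RGB")        # placeholder image path
texts = extract_text_from_image(img, gpu_available=False)
print(texts)  # list of detected strings with confidence above 0.1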
main.py ADDED
@@ -0,0 +1,107 @@
+ import logging
+ import cv2
+ import numpy as np
+ import requests
+ import torch
+ import base64
+ import gradio as gr
+ from PIL import Image
+ from io import BytesIO
+ from fastapi import FastAPI
+ from models import TextSimilarityRequest
+ from extract_text import extract_text_from_image
+ from text_similarity import analyze_similarity
+ from app import generate_gradio
+
+
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+ app = FastAPI()
+
+ @app.get("/teste", tags=["Home"])
+ def api_home():
+     return "oi"
+
+
+ @app.post("/text_similarity", summary="Perform image text similarity", response_model=float, tags=["Text Similarities"])
+ async def text_similarity(request: TextSimilarityRequest):
+     image_info = request.imageInfo
+     key_texts = request.keyTexts
+     similarity_threshold = request.similarityThreshold
+     origin_id = image_info.originId
+
+     logging.info(f"Checking text similarity for main source with resource id {origin_id}")
+
+     image = load_image_url(image_info.source)
+
+     # Extract text from the image using the user's method
+     gpu_available = torch.cuda.is_available()
+     extracted_texts = extract_text_from_image(image, gpu_available)
+
+     results = analyze_similarity(
+         extracted_texts,
+         key_texts,
+         similarity_threshold=similarity_threshold/100,  # Convert percentage to decimal
+         fragment_threshold=100/100  # Fragments effectively require an exact match here
+     )
+
+     log_similarity_report(results, origin_id)
+
+     total_texts = len(key_texts)
+     passed_texts = results["statistics"]["total_processed"]
+
+     percentage_passed = (passed_texts / total_texts) * 100
+
+     logging.info(f"Text similarity for main source with resource id {origin_id} is {percentage_passed}%")
+
+     return percentage_passed
+
+ def log_similarity_report(results, originId):
+     # General statistics
+     logging.info(f"[{originId}] Total texts analyzed: {results['statistics']['total_analyzed']}")
+     logging.info(f"[{originId}] Texts with detected similarity: {results['statistics']['total_processed']}")
+
+     # Similar texts
+     if results["similar_texts"]:
+         logging.info(f"[{originId}] Direct Similar Texts Found: {len(results['similar_texts'])}")
+         for item in results["similar_texts"]:
+             logging.info(f"[{originId}] Similar Text: '{item['text']}' -> Key Text: '{item['key_text']}' with Similarity: {item['similarity']:.2%}")
+
+     # Detected fragments
+     if results["fragments_detected"]:
+         logging.info(f"[{originId}] Fragments Detected: {len(results['fragments_detected'])}")
+         for item in results["fragments_detected"]:
+             logging.info(f"[{originId}] Fragment: '{item['text']}' -> Key Text: '{item['key_text']}' with Similarity: {item['similarity']:.2%}")
+
+     # Combined texts
+     if results["combined"]:
+         logging.info(f"[{originId}] Texts to be Combined: {len(results['combined'])}")
+         for item in results["combined"]:
+             logging.info(f"[{originId}] Combined Text: '{item['combined_text']}' -> Key Text: '{item['key_text']}' with Similarity: {item['similarity']:.2%}")
+
+     # If no significant similarity was found
+     if not (results["similar_texts"] or results["fragments_detected"] or results["combined"]):
+         logging.info(f"[{originId}] No significant similarity found.")
+
+     # Statistics
+     logging.info(f"[{originId}] Direct similarity: {results['statistics']['direct_similarity']}")
+     logging.info(f"[{originId}] Fragments: {results['statistics']['fragments']}")
+     logging.info(f"[{originId}] Combined: {results['statistics']['combined']}")
+
+ def load_image_url(source):
+     Image.MAX_IMAGE_PIXELS = None
+
+     if source.startswith('http'):
+         response = requests.get(source)
+         img = np.asarray(bytearray(response.content), dtype=np.uint8)
+         img = cv2.imdecode(img, cv2.IMREAD_GRAYSCALE)
+     else:
+         # Otherwise the source is expected to be a base64-encoded image
+         img = base64.b64decode(source)
+         img = Image.open(BytesIO(img))
+         img = np.array(img)
+         img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
+
+     return img
+
+ @app.on_event("startup")
+ async def startup_event():
+     gr.mount_gradio_app(app, generate_gradio(), path="/")
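Example (not part of this commit): a client call against the endpoint above might look like this, assuming the service runs locally on port 7860; the URL and payload values are placeholders.

import requests

payload = {
    "imageInfo": {"originId": 1, "source": "https://example.com/sample.png"},  # placeholder image URL
    "keyTexts": ["invoice number", "total amount"],
    "similarityThreshold": 70,  # percent; the endpoint divides by 100
}
response = requests.post("http://localhost:7860/text_similarity", json=payload)
print(response.json())  # similarity percentage returned by the service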
models.py ADDED
@@ -0,0 +1,11 @@
+ from pydantic import BaseModel
+ from typing import List
+
+ class RequestModel(BaseModel):
+     originId: int
+     source: str
+
+ class TextSimilarityRequest(BaseModel):
+     imageInfo: RequestModel
+     keyTexts: List[str]
+     similarityThreshold: float
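Example (not part of this commit): these models imply a request body of the following shape; the values are illustrative only.

from models import RequestModel, TextSimilarityRequest

request = TextSimilarityRequest(
    imageInfo=RequestModel(originId=123, source="https://example.com/img.png"),  # placeholder
    keyTexts=["some key text"],
    similarityThreshold=70.0,
)
print(request)  # validated request object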
requirements.txt CHANGED
@@ -1,7 +1,11 @@
- fastapi==0.99.1
- uvicorn
+ numpy
  requests
- pydantic==1.10.12
- langchain
- clarifai
- Pillow
+ fastapi
+ pydantic
+ scikit-image
+ pillow
+ uvicorn
+ opencv-python-headless
+ torch
+ easyocr
+ gradio
text_similarity.py ADDED
@@ -0,0 +1,125 @@
+ import re
+ from difflib import SequenceMatcher
+ from collections import defaultdict
+
+ def extract_special_characters(text):
+     """Extracts all unique special characters from a text."""
+     characters = re.findall(r'[^\w\s]', text)  # Finds non-alphanumeric, non-space characters
+     return ''.join(characters)
+
+ def clean_text(text, keep=""):
+     """Removes special characters except those specified in 'keep', and converts to lowercase."""
+     pattern = rf'[^\w\s{re.escape(keep)}]'
+     return re.sub(pattern, '', text.lower())
+
+ def text_similarity(text, key_text):
+     """Calculates the similarity between two texts using SequenceMatcher."""
+     return SequenceMatcher(None, text, key_text).ratio()
+
+ def detect_fragments(text, key_texts, threshold=0.7):
+     """Checks whether a text contains fragments of the key texts."""
+     for key_text in key_texts:
+         characters_to_not_clean = extract_special_characters(key_text)
+         words = clean_text(text, characters_to_not_clean).split()
+
+         key_words = key_text.split()
+
+         # If the text is too short, we can't use an effective sliding window
+         if len(words) < len(key_words):
+             similarity = text_similarity(text, key_text)
+             if similarity >= threshold:
+                 return True, key_text, similarity
+             continue
+
+         # Sliding window to compare word sequences
+         for i in range(len(words) - len(key_words) + 1):
+             fragment = " ".join(words[i:i+len(key_words)])
+             similarity = text_similarity(fragment, key_text)
+             if similarity >= threshold:
+                 return True, key_text, similarity
+     return False, None, 0
+
+ def analyze_similarity(text_list, key_texts, similarity_threshold=0.7, fragment_threshold=0.7):
+     """
+     Analyzes the similarity between a list of texts and key texts.
+     Returns a detailed report on the similarities found.
+     """
+     results = {
+         "similar_texts": [],
+         "fragments_detected": [],
+         "combined": [],
+         "statistics": defaultdict(int)
+     }
+
+     processed_texts = set()
+
+     # Check direct similarity
+     for i, text in enumerate(text_list):
+         if not text.strip():
+             continue
+
+         for key_text in key_texts:
+             if not key_text.strip():
+                 continue
+
+             similarity = text_similarity(text, key_text)
+             if similarity >= similarity_threshold:
+                 results["similar_texts"].append({
+                     "index": i,
+                     "text": text,
+                     "key_text": key_text,
+                     "similarity": similarity
+                 })
+                 results["statistics"]["direct_similarity"] += 1
+                 processed_texts.add(i)
+
+     # Fragment detection is currently disabled; re-enable by restoring this block:
+     # for i, text in enumerate(text_list):
+     #     if i in processed_texts or not text.strip():
+     #         continue
+     #
+     #     has_fragment, key_text, similarity = detect_fragments(text, key_texts, fragment_threshold)
+     #     if has_fragment:
+     #         results["fragments_detected"].append({
+     #             "index": i,
+     #             "text": text,
+     #             "key_text": key_text,
+     #             "similarity": similarity
+     #         })
+     #         results["statistics"]["fragments"] += 1
+     #         processed_texts.add(i)
+
+     # Check texts that can be combined
+     for i in range(len(text_list)):
+         if i in processed_texts or not text_list[i].strip():
+             continue
+
+         for j in range(i+1, len(text_list)):
+             if j in processed_texts or not text_list[j].strip():
+                 continue
+
+             combined_text = text_list[i] + " " + text_list[j]
+             for key_text in key_texts:
+                 if not key_text.strip():
+                     continue
+
+                 similarity = text_similarity(combined_text, key_text)
+                 if similarity >= similarity_threshold:
+                     results["combined"].append({
+                         "indices": [i, j],
+                         "texts": [text_list[i], text_list[j]],
+                         "combined_text": combined_text,
+                         "key_text": key_text,
+                         "similarity": similarity
+                     })
+                     results["statistics"]["combined"] += 1
+                     processed_texts.add(i)
+                     processed_texts.add(j)
+                     break
+
+     # Calculate overall statistics
+     valid_texts = sum(1 for text in text_list if text.strip())
+     results["statistics"]["total_analyzed"] = valid_texts
+     results["statistics"]["total_processed"] = len(processed_texts)
+
+     return results
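Example (not part of this commit): a quick sanity check of analyze_similarity; the sample strings are invented. "Total amount due" differs from the key text only in case, so its SequenceMatcher ratio (about 0.94) clears the 0.7 threshold.

from text_similarity import analyze_similarity

texts = ["Total amount due", "something unrelated"]
keys = ["total amount due"]

report = analyze_similarity(texts, keys, similarity_threshold=0.7, fragment_threshold=0.7)
print(dict(report["statistics"]))
# expected: {'direct_similarity': 1, 'total_analyzed': 2, 'total_processed': 1}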