MarioPrzBasto committed on
Commit
973182c
·
1 Parent(s): c5c3b15

Add application file

Browse files
Files changed (7) hide show
  1. Dockerfile +11 -0
  2. README.md +6 -4
  3. extract_text.py +29 -0
  4. main.py +427 -0
  5. models.py +11 -0
  6. requirements.txt +12 -0
  7. text_similarity.py +125 -0
Dockerfile ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.9
2
+
3
+ WORKDIR /app
4
+
5
+ COPY requirements.txt .
6
+
7
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
8
+
9
+ COPY ./*.py /app/
10
+
11
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,9 +1,11 @@
1
  ---
2
- title: Teste
3
- emoji: 🏃
4
- colorFrom: red
5
  colorTo: red
6
- sdk: docker
 
 
7
  pinned: false
8
  ---
9
 
 
1
  ---
2
+ title: Key Texts Image Finder
3
+ emoji: 🏆
4
+ colorFrom: purple
5
  colorTo: red
6
+ sdk: gradio
7
+ sdk_version: 5.22.0
8
+ app_file: app.py
9
  pinned: false
10
  ---
11
 
extract_text.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cv2
2
+ import numpy as np
3
+ import easyocr
4
+ import torch
5
+
6
+ # Inicializar EasyOCR
7
+ device = "cuda" if torch.cuda.is_available() else "cpu"
8
+ reader = easyocr.Reader(["en"], gpu=(device == "cuda"), verbose=False)
9
+
10
+ def extract_text_from_image(img, gpu_available):
11
+ reader = easyocr.Reader(['en'], gpu=gpu_available, verbose=False)
12
+
13
+ img = np.array(img)
14
+ img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
15
+
16
+ # Resizing and blurring
17
+ scale_factor = 2
18
+ upscaled = cv2.resize(img, None, fx=scale_factor, fy=scale_factor, interpolation=cv2.INTER_LINEAR)
19
+ blur_img = cv2.blur(upscaled, (5, 5))
20
+
21
+ all_text_found = []
22
+ text_ = reader.readtext(blur_img, detail=1, paragraph=False, text_threshold=0.3)
23
+
24
+ for t in text_:
25
+ bbox, text, score = t
26
+ if score > 0.1: # Filter weak detections
27
+ all_text_found.append(text)
28
+
29
+ return all_text_found
main.py ADDED
@@ -0,0 +1,427 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import cv2
4
+ import numpy as np
5
+ import requests
6
+ import torch
7
+ import base64
8
+ import os
9
+ import logging
10
+ from io import BytesIO
11
+ from PIL import Image
12
+ from fastapi import FastAPI
13
+ from fastapi.middleware.cors import CORSMiddleware
14
+ from extract_text import extract_text_from_image
15
+ from models import TextSimilarityRequest
16
+ from text_similarity import analyze_similarity
17
+ from starlette.responses import JSONResponse
18
+
19
+
20
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
21
+ app = FastAPI()
22
+ # app.add_middleware(
23
+ # CORSMiddleware,
24
+ # allow_origins=["*"],
25
+ # allow_credentials=True,
26
+ # allow_methods=["*"],
27
+ # allow_headers=["*"],
28
+ # )
29
+
30
+ @app.post("/text_similarity", summary="Perform images text similarity", response_model=float, tags=["Text Similarities"])
31
+ async def text_similarity(request: TextSimilarityRequest):
32
+ image_info = request.imageInfo
33
+ key_texts = request.keyTexts
34
+ similarity_threshold = request.similarityThreshold
35
+ origin_id = image_info.originId
36
+
37
+ logging.info(f"Checking text similarity for main source with resource id {origin_id}")
38
+
39
+ image = load_image_url(image_info.source)
40
+
41
+ # Extract text from the image using the user's method
42
+ gpu_available = torch.cuda.is_available()
43
+ extracted_texts = extract_text_from_image(image, gpu_available)
44
+
45
+ results = analyze_similarity(
46
+ extracted_texts,
47
+ key_texts,
48
+ similarity_threshold=similarity_threshold/100, # Convert percentage to decimal
49
+ fragment_threshold=100/100 # Convert percentage to decimal
50
+ )
51
+
52
+ log_similarity_report(results, origin_id)
53
+
54
+ total_texts = len(key_texts)
55
+ passed_texts = results["statistics"]["total_processed"]
56
+
57
+ percentage_passed = (passed_texts / total_texts) * 100
58
+
59
+ logging.info(f"Text similarity for main source with resource id {origin_id} is {percentage_passed}%")
60
+
61
+ return percentage_passed
62
+
63
+ def log_similarity_report(results, originId):
64
+ # General statistics
65
+ logging.info(f"[{originId}] Total texts analyzed: {results['statistics']['total_analyzed']}")
66
+ logging.info(f"[{originId}] Texts with detected similarity: {results['statistics']['total_processed']}")
67
+
68
+ # Similar texts
69
+ if results["similar_texts"]:
70
+ logging.info(f"[{originId}] Direct Similar Texts Found: {len(results['similar_texts'])}")
71
+ for item in results["similar_texts"]:
72
+ logging.info(f"[{originId}] Similar Text: '{item['text']}' -> Key Text: '{item['key_text']}' with Similarity: {item['similarity']:.2%}")
73
+
74
+ # Detected fragments
75
+ if results["fragments_detected"]:
76
+ logging.info(f"[{originId}] Fragments Detected: {len(results['fragments_detected'])}")
77
+ for item in results["fragments_detected"]:
78
+ logging.info(f"[{originId}] Fragment: '{item['text']}' -> Key Text: '{item['key_text']}' with Similarity: {item['similarity']:.2%}")
79
+
80
+ # Combined texts
81
+ if results["combined"]:
82
+ logging.info(f"[{originId}] Texts to be Combined: {len(results['combined'])}")
83
+ for item in results["combined"]:
84
+ logging.info(f"[{originId}] Combined Text: '{item['combined_text']}' -> Key Text: '{item['key_text']}' with Similarity: {item['similarity']:.2%}")
85
+
86
+ # If no significant similarity found
87
+ if not (results["similar_texts"] or results["fragments_detected"] or results["combined"]):
88
+ logging.info(f"[{originId}] No significant similarity found.")
89
+
90
+ # Statistics
91
+ logging.info(f"[{originId}] Direct similarity: {results['statistics']['direct_similarity']}")
92
+ logging.info(f"[{originId}] Fragments: {results['statistics']['fragments']}")
93
+ logging.info(f"[{originId}] Combined: {results['statistics']['combined']}")
94
+
95
+ def load_image_url(source):
96
+ Image.MAX_IMAGE_PIXELS = None
97
+
98
+ if source.startswith('http'):
99
+ response = requests.get(source)
100
+ img = np.asarray(bytearray(response.content), dtype=np.uint8)
101
+ img = cv2.imdecode(img, cv2.IMREAD_GRAYSCALE)
102
+ else:
103
+ img = base64.b64decode(source)
104
+ img = Image.open(BytesIO(img))
105
+ img = np.array(img)
106
+ img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
107
+
108
+ return img
109
+
110
+ def process_image(image, key_texts, similarity_threshold, fragment_threshold):
111
+ """Processes the image, extracts text, and analyzes similarities."""
112
+ try:
113
+ if image is None:
114
+ return "Please upload an image for analysis.", None, None, None, None, None
115
+
116
+ if not key_texts.strip():
117
+ return "Please enter key texts for comparison.", None, None, None, None, None
118
+
119
+ # Extract text from the image using the user's method
120
+ gpu_available = torch.cuda.is_available()
121
+ extracted_texts = extract_text_from_image(image, gpu_available)
122
+
123
+ if isinstance(key_texts, str):
124
+ key_texts = [text.strip() for text in key_texts.split('\n') if text.strip()]
125
+
126
+ # Process the analysis
127
+ results = analyze_similarity(
128
+ extracted_texts,
129
+ key_texts,
130
+ similarity_threshold=similarity_threshold/100, # Convert percentage to decimal
131
+ fragment_threshold=fragment_threshold/100 # Convert percentage to decimal
132
+ )
133
+
134
+ # Gerar relatório HTML
135
+ html_report = generate_html_report(results)
136
+
137
+ # Gerar DataFrames
138
+ dfs = generate_results_dataframe(results)
139
+
140
+ # Extrair DataFrames individuais (ou criar vazios se não existirem)
141
+ df_statistics = dfs.get("statistics", pd.DataFrame())
142
+ df_similar = dfs.get("similar", pd.DataFrame(columns=["Index", "Original Text", "Key Text", "Similarity"]))
143
+ df_fragments = dfs.get("fragments", pd.DataFrame(columns=["Index", "Original Text", "Key Text", "Similarity"]))
144
+ df_combined = dfs.get("combined", pd.DataFrame(columns=["Indices", "Text 1", "Text 2", "Combined Text", "Key Text", "Similarity"]))
145
+
146
+ return html_report, df_statistics, df_similar, df_fragments, df_combined, extracted_texts, gpu_available
147
+
148
+
149
+ except Exception as e:
150
+ return f"Erro ao processar: {str(e)}", None, None, None, None, None
151
+
152
+ def process_manual_input(texts, key_texts, similarity_threshold, fragment_threshold):
153
+ """Processes the user's manual text input."""
154
+ # Validate input
155
+ if not texts.strip() or not key_texts.strip():
156
+ return "Please enter texts for analysis and key texts for comparison.", None, None, None, None
157
+
158
+ try:
159
+ # Process the analysis
160
+ results = analyze_similarity(
161
+ texts,
162
+ key_texts,
163
+ similarity_threshold=similarity_threshold/100, # Convert percentage to decimal
164
+ fragment_threshold=fragment_threshold/100 # Convert percentage to decimal
165
+ )
166
+
167
+ # Generate HTML report
168
+ html_report = generate_html_report(results)
169
+
170
+ # Gerar DataFrames
171
+ dfs = generate_results_dataframe(results)
172
+
173
+ # Extract individual DataFrames (or create empty ones if they don't exist)
174
+ df_statistics = dfs.get("statistics", pd.DataFrame())
175
+ df_similar = dfs.get("similar", pd.DataFrame(columns=["Index", "Original Text", "Key Text", "Similarity"]))
176
+ df_fragments = dfs.get("fragments", pd.DataFrame(columns=["Index", "Original Text", "Key Text", "Similarity"]))
177
+ df_combined = dfs.get("combined", pd.DataFrame(columns=["Indices", "Text 1", "Text 2", "Combined Text", "Key Text", "Similarity"]))
178
+
179
+ return html_report, df_statistics, df_similar, df_fragments, df_combined
180
+
181
+ except Exception as e:
182
+ return f"Erro ao processar: {str(e)}", None, None, None, None
183
+
184
+ def generate_html_report(results):
185
+ """Generates an HTML report about the detected similarities."""
186
+ html = "<h2>Similarity Report</h2>"
187
+
188
+ # General statistics
189
+ html += "<div padding: 15px; border-radius: 5px; margin-bottom: 20px;'>"
190
+ html += f"<p><b>Total texts analyzed:</b> {results['statistics']['total_analyzed']}</p>"
191
+ html += f"<p><b>Texts with detected similarity:</b> {results['statistics']['total_processed']}</p>"
192
+ html += "</div>"
193
+
194
+ # Results table
195
+ html += "<h3>Detected Similarities</h3>"
196
+
197
+ # Similar texts
198
+ if results["similar_texts"]:
199
+ html += "<h4>Direct Similar Texts</h4>"
200
+ html += "<table width='100%' style='border-collapse: collapse; margin-bottom: 20px;'>"
201
+ html += "<tr><th style='text-align: left; padding: 8px; border: 1px solid #ddd;'>Original Text</th><th style='text-align: left; padding: 8px; border: 1px solid #ddd;'>Key Text</th><th style='text-align: center; padding: 8px; border: 1px solid #ddd;'>Similarity</th></tr>"
202
+
203
+ for item in results["similar_texts"]:
204
+ html += f"<tr><td style='padding: 8px; border: 1px solid #ddd;'>{item['text']}</td><td style='padding: 8px; border: 1px solid #ddd;'>{item['key_text']}</td><td style='text-align: center; padding: 8px; border: 1px solid #ddd;'>{item['similarity']:.2%}</td></tr>"
205
+
206
+ html += "</table>"
207
+
208
+ # Detected fragments
209
+ if results["fragments_detected"]:
210
+ html += "<h4>Text with Detected Fragments</h4>"
211
+ html += "<table width='100%' style='border-collapse: collapse; margin-bottom: 20px;'>"
212
+ html += "<tr><th style='text-align: left; padding: 8px; border: 1px solid #ddd;'>Original Text</th><th style='text-align: left; padding: 8px; border: 1px solid #ddd;'>Key Text</th><th style='text-align: center; padding: 8px; border: 1px solid #ddd;'>Similarity</th></tr>"
213
+
214
+ for item in results["fragments_detected"]:
215
+ html += f"<tr><td style='padding: 8px; border: 1px solid #ddd;'>{item['text']}</td><td style='padding: 8px; border: 1px solid #ddd;'>{item['key_text']}</td><td style='text-align: center; padding: 8px; border: 1px solid #ddd;'>{item['similarity']:.2%}</td></tr>"
216
+
217
+ html += "</table>"
218
+
219
+ # Combined texts
220
+ if results["combined"]:
221
+ html += "<h4>Text that need to be combined</h4>"
222
+ html += "<table width='100%' style='border-collapse: collapse; margin-bottom: 20px;'>"
223
+ html += "<tr><th style='text-align: left; padding: 8px; border: 1px solid #ddd;'>Text 1</th><th style='text-align: left; padding: 8px; border: 1px solid #ddd;'>Text 2</th><th style='text-align: left; padding: 8px; border: 1px solid #ddd;'>Combination</th><th style='text-align: left; padding: 8px; border: 1px solid #ddd;'>Key Text</th><th style='text-align: center; padding: 8px; border: 1px solid #ddd;'>Similarity</th></tr>"
224
+
225
+ for item in results["combined"]:
226
+ html += f"<tr><td style='padding: 8px; border: 1px solid #ddd;'>{item['texts'][0]}</td><td style='padding: 8px; border: 1px solid #ddd;'>{item['texts'][1]}</td><td style='padding: 8px; border: 1px solid #ddd;'>{item['combined_text']}</td><td style='padding: 8px; border: 1px solid #ddd;'>{item['key_text']}</td><td style='text-align: center; padding: 8px; border: 1px solid #ddd;'>{item['similarity']:.2%}</td></tr>"
227
+
228
+ html += "</table>"
229
+
230
+ if not (results["similar_texts"] or results["fragments_detected"] or results["combined"]):
231
+ html += "<p>No significant similarity found with the current parameters.</p>"
232
+
233
+ return html
234
+
235
+ def generate_results_dataframe(results):
236
+ """Generates pandas DataFrames to visualize the results."""
237
+ dfs = {}
238
+
239
+ # DataFrame for similar texts
240
+ if results["similar_texts"]:
241
+ data = [(item['index'], item['text'], item['key_text'], f"{item['similarity']:.2%}")
242
+ for item in results["similar_texts"]]
243
+ dfs["similar"] = pd.DataFrame(data, columns=["Index", "Original Text", "Key Text", "Similarity"])
244
+
245
+ # DataFrame for fragments
246
+ if results["fragments_detected"]:
247
+ data = [(item['index'], item['text'], item['key_text'], f"{item['similarity']:.2%}")
248
+ for item in results["fragments_detected"]]
249
+ dfs["fragments"] = pd.DataFrame(data, columns=["Index", "Original Text", "Key Text", "Similarity"])
250
+
251
+ # DataFrame for combined
252
+ if results["combined"]:
253
+ data = [(f"{item['indices'][0]},{item['indices'][1]}",
254
+ item['texts'][0],
255
+ item['texts'][1],
256
+ item['combined_text'],
257
+ item['key_text'],
258
+ f"{item['similarity']:.2%}")
259
+ for item in results["combined"]]
260
+ dfs["combined"] = pd.DataFrame(data, columns=["Indices", "Text 1", "Text 2",
261
+ "Combined Text", "Key Text", "Similarity"])
262
+
263
+ # Statistics DataFrame
264
+ data = [
265
+ ("Total analyzed", results["statistics"]["total_analyzed"]),
266
+ ("Total with similarity", results["statistics"]["total_processed"]),
267
+ ("Direct similarity", results["statistics"]["direct_similarity"]),
268
+ ("Fragments", results["statistics"]["fragments"]),
269
+ ("Combined", results["statistics"]["combined"])
270
+ ]
271
+ dfs["statistics"] = pd.DataFrame(data, columns=["Metric", "Value"])
272
+
273
+ return dfs
274
+
275
+ with gr.Blocks(title="Text Similarity Detector") as demo:
276
+ gr.Markdown("# 🔍 Text Similarity Detector with Image Extraction")
277
+ gr.Markdown("""
278
+ This tool analyzes the similarity between texts extracted from an image and reference key texts.
279
+ It can identify:
280
+ - Direct similar texts
281
+ - Key text fragments within the texts
282
+ - Text combinations that match key texts
283
+ """)
284
+
285
+ with gr.Tabs() as tabs:
286
+ with gr.TabItem("Image Analysis"):
287
+ with gr.Row():
288
+ with gr.Column(scale=1): # Column for inputs on the left
289
+ input_image = gr.Image(label="Upload an image to extract text", type="pil", height=600)
290
+ key_texts_image = gr.Textbox(
291
+ label="Key Texts for Comparison",
292
+ placeholder="Paste your key texts here (one per line)",
293
+ lines=5
294
+ )
295
+ # with gr.Row():
296
+ # key_texts_image = gr.Textbox(
297
+ # label="Key Texts for Comparison",
298
+ # placeholder="Paste your key texts here (one per line)",
299
+ # lines=5
300
+ # )
301
+
302
+ # min_similarity_per_key_image = gr.Textbox(
303
+ # label="Minimum Similarity for Each Key Text (%)",
304
+ # placeholder="Enter one value per line, matching the key texts",
305
+ # lines=5
306
+ # )
307
+
308
+ with gr.Row():
309
+ similarity_threshold_image = gr.Slider(
310
+ label="Similarity Threshold (%)",
311
+ minimum=50,
312
+ maximum=100,
313
+ value=70,
314
+ step=1
315
+ )
316
+ fragment_threshold_image = gr.Slider(
317
+ label="Fragment Similarity Threshold (%)",
318
+ minimum=50,
319
+ maximum=100,
320
+ value=70,
321
+ step=1
322
+ )
323
+
324
+ analyze_image_btn = gr.Button("Analyze Image", variant="primary")
325
+
326
+ with gr.Column(scale=1): # Column for outputs on the right
327
+ gpu_available = gr.Checkbox(label="Used GPU")
328
+ extracted_texts = gr.Textbox(label="Extracted Texts from the Image", lines=5)
329
+ html_output = gr.HTML(label="Similarity Report")
330
+ with gr.Tabs():
331
+ with gr.TabItem("Statistics"):
332
+ statistics_output = gr.Dataframe(label="Statistics")
333
+ with gr.TabItem("Direct Similarity"):
334
+ similar_texts_output = gr.Dataframe(label="Direct Similar Texts")
335
+ with gr.TabItem("Fragments"):
336
+ fragments_output = gr.Dataframe(label="Texts with Fragments")
337
+ with gr.TabItem("Combined"):
338
+ combined_output = gr.Dataframe(label="Combined Texts")
339
+
340
+ with gr.TabItem("Manual Analysis"):
341
+ with gr.Row():
342
+ with gr.Column(scale=1): # Column for inputs on the left
343
+ input_texts = gr.Textbox(
344
+ label="List of Texts for Analysis",
345
+ placeholder="Paste your list of texts here (one per line)",
346
+ lines=10
347
+ )
348
+ key_texts_input = gr.Textbox(
349
+ label="Key Texts for Comparison",
350
+ placeholder="Paste your key texts here (one per line)",
351
+ lines=5
352
+ )
353
+ # with gr.Row():
354
+ # key_texts_input = gr.Textbox(
355
+ # label="Key Texts for Comparison",
356
+ # placeholder="Paste your key texts here (one per line)",
357
+ # lines=5
358
+ # )
359
+
360
+ # min_similarity_per_key_input = gr.Textbox(
361
+ # label="Minimum Similarity for Each Key Text (%)",
362
+ # placeholder="Enter one value per line, matching the key texts",
363
+ # lines=5
364
+ # )
365
+
366
+ with gr.Row():
367
+ similarity_threshold = gr.Slider(
368
+ label="Similarity Threshold (%)",
369
+ minimum=50,
370
+ maximum=100,
371
+ value=70,
372
+ step=1
373
+ )
374
+ fragment_threshold = gr.Slider(
375
+ label="Fragment Similarity Threshold (%)",
376
+ minimum=50,
377
+ maximum=100,
378
+ value=70,
379
+ step=1
380
+ )
381
+
382
+ analyze_btn = gr.Button("Analyze Image", variant="primary")
383
+
384
+ with gr.Column(scale=1): # Column for outputs on the right
385
+ html_output_manual = gr.HTML(label="Manual Similarity Report")
386
+ with gr.Tabs():
387
+ with gr.TabItem("Statistics"):
388
+ statistics_output_manual = gr.Dataframe(label="Statistics")
389
+ with gr.TabItem("Direct Similarity"):
390
+ similar_texts_output_manual = gr.Dataframe(label="Direct Similar Texts")
391
+ with gr.TabItem("Fragments"):
392
+ fragments_output_manual = gr.Dataframe(label="Texts with Fragments")
393
+ with gr.TabItem("Combined"):
394
+ combined_output_manual = gr.Dataframe(label="Combined Texts")
395
+
396
+ # Connect the image processing function to the button
397
+ analyze_image_btn.click(
398
+ process_image,
399
+ inputs=[input_image, key_texts_image, similarity_threshold_image, fragment_threshold_image],
400
+ outputs=[html_output, statistics_output, similar_texts_output, fragments_output, combined_output, extracted_texts, gpu_available]
401
+ )
402
+
403
+ # Connect the manual text processing function to the button
404
+ analyze_btn.click(
405
+ process_manual_input,
406
+ inputs=[input_texts, key_texts_input, similarity_threshold, fragment_threshold],
407
+ outputs=[html_output_manual, statistics_output_manual, similar_texts_output_manual, fragments_output_manual, combined_output_manual]
408
+ )
409
+
410
+ #app = gr.mount_gradio_app(app, demo, path="/")
411
+
412
+ @app.get("/api")
413
+ def read_root():
414
+ return JSONResponse(content={"message": "Hello from FastAPI inside Gradio!"})
415
+
416
+ # if __name__ == "__main__":
417
+ # import uvicorn
418
+ # uvicorn.run(app)
419
+
420
+ # PORT = int(os.getenv("PORT", 7860))
421
+
422
+ # if __name__ == "__main__":
423
+ # import uvicorn
424
+ # print(f"A arrancar na porta {PORT}...")
425
+ # uvicorn.run(app)
426
+
427
+ #demo.launch(server_name="0.0.0.0", server_port=7860)
models.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import BaseModel
2
+ from typing import List
3
+
4
+ class RequestModel(BaseModel):
5
+ originId: int
6
+ source: str
7
+
8
+ class TextSimilarityRequest(BaseModel):
9
+ imageInfo: RequestModel
10
+ keyTexts: List[str]
11
+ similarityThreshold: float
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ opencv-python
2
+ numpy
3
+ matplotlib
4
+ easyocr
5
+ scikit-image
6
+ pillow
7
+ pandas
8
+ torch
9
+ uvicorn
10
+ gradio
11
+ requests
12
+ starlette
text_similarity.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from difflib import SequenceMatcher
3
+ from collections import defaultdict
4
+
5
+ def extract_special_characters(text):
6
+ """Extracts all unique special characters from a list of texts."""
7
+ characters = re.findall(r'[^\w\s]', text) # Finds non-alphanumeric and non-space characters
8
+ return ''.join(characters)
9
+
10
+ def clean_text(text, keep=""):
11
+ """Removes special characters except those specified in 'keep', and converts to lowercase."""
12
+ pattern = rf'[^\w\s{re.escape(keep)}]'
13
+ return re.sub(pattern, '', text.lower())
14
+
15
+ def text_similarity(text, key_text):
16
+ """Calculates the similarity between two texts using SequenceMatcher."""
17
+ return SequenceMatcher(None, text, key_text).ratio()
18
+
19
+ def detect_fragments(text, key_texts, threshold=0.7):
20
+ """Checks if a text contains fragments of key texts."""
21
+ for key_text in key_texts:
22
+ characters_to_not_clean = extract_special_characters(key_text)
23
+ words = clean_text(text, characters_to_not_clean).split()
24
+
25
+ key_words = key_text.split()
26
+
27
+ # If the text is too short, we can't make an effective sliding window
28
+ if len(words) < len(key_words):
29
+ similarity = text_similarity(text, key_text)
30
+ if similarity >= threshold:
31
+ return True, key_text, similarity
32
+ continue
33
+
34
+ # Sliding window to compare word sequences
35
+ for i in range(len(words) - len(key_words) + 1):
36
+ fragment = " ".join(words[i:i+len(key_words)])
37
+ similarity = text_similarity(fragment, key_text)
38
+ if similarity >= threshold:
39
+ return True, key_text, similarity
40
+ return False, None, 0
41
+
42
+ def analyze_similarity(text_list, key_texts, similarity_threshold=0.7, fragment_threshold=0.7):
43
+ """
44
+ Analyzes the similarity between a list of texts and key texts.
45
+ Returns a detailed report on the similarities found.
46
+ """
47
+ results = {
48
+ "similar_texts": [],
49
+ "fragments_detected": [],
50
+ "combined": [],
51
+ "statistics": defaultdict(int)
52
+ }
53
+
54
+ processed_texts = set()
55
+
56
+ # Check direct similarity
57
+ for i, text in enumerate(text_list):
58
+ if not text.strip():
59
+ continue
60
+
61
+ for key_text in key_texts:
62
+ if not key_text.strip():
63
+ continue
64
+
65
+ similarity = text_similarity(text, key_text)
66
+ if similarity >= similarity_threshold:
67
+ results["similar_texts"].append({
68
+ "index": i,
69
+ "text": text,
70
+ "key_text": key_text,
71
+ "similarity": similarity
72
+ })
73
+ results["statistics"]["direct_similarity"] += 1
74
+ processed_texts.add(i)
75
+
76
+ # Check fragments
77
+ # for i, text in enumerate(text_list):
78
+ # if i in processed_texts or not text.strip():
79
+ # continue
80
+
81
+ # has_fragment, key_text, similarity = detect_fragments(text, key_texts, fragment_threshold)
82
+ # if has_fragment:
83
+ # results["fragments_detected"].append({
84
+ # "index": i,
85
+ # "text": text,
86
+ # "key_text": key_text,
87
+ # "similarity": similarity
88
+ # })
89
+ # results["statistics"]["fragments"] += 1
90
+ # processed_texts.add(i)
91
+
92
+ # Check texts that can be combined
93
+ for i in range(len(text_list)):
94
+ if i in processed_texts or not text_list[i].strip():
95
+ continue
96
+
97
+ for j in range(i+1, len(text_list)):
98
+ if j in processed_texts or not text_list[j].strip():
99
+ continue
100
+
101
+ combined_text = text_list[i] + " " + text_list[j]
102
+ for key_text in key_texts:
103
+ if not key_text.strip():
104
+ continue
105
+
106
+ similarity = text_similarity(combined_text, key_text)
107
+ if similarity >= similarity_threshold:
108
+ results["combined"].append({
109
+ "indices": [i, j],
110
+ "texts": [text_list[i], text_list[j]],
111
+ "combined_text": combined_text,
112
+ "key_text": key_text,
113
+ "similarity": similarity
114
+ })
115
+ results["statistics"]["combined"] += 1
116
+ processed_texts.add(i)
117
+ processed_texts.add(j)
118
+ break
119
+
120
+ # Calculate overall statistics
121
+ valid_texts = sum(1 for text in text_list if text.strip())
122
+ results["statistics"]["total_analyzed"] = valid_texts
123
+ results["statistics"]["total_processed"] = len(processed_texts)
124
+
125
+ return results