import pymupdf4llm
from markdown_it import MarkdownIt
from mdit_plain.renderer import RendererPlain
import os
import re
from typing import Tuple, Optional, List, Dict, Any

import fitz  # PyMuPDF
from collections import defaultdict, Counter
import language_tool_python

import json
import traceback
import io
import tempfile
import gradio as gr

# Set JAVA_HOME so language_tool_python can locate the Java runtime.
os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-11-openjdk-amd64'


# --- Functions for PDF to Markdown to Plain Text ---
def convert_markdown_to_plain_text(markdown_text: str) -> str:
    """
    Converts a Markdown string to plain text.
    """
    if not markdown_text:
        return ""
    try:
        parser = MarkdownIt(renderer_cls=RendererPlain)
        plain_text = parser.render(markdown_text)
        return plain_text
    except Exception as e:
        print(f"Error converting Markdown to plain text: {e}")
        return markdown_text
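
# A minimal sketch of what the conversion does (the exact whitespace depends on
# mdit_plain's renderer, so treat the output below as approximate):
#   convert_markdown_to_plain_text("# Title\n\nSome **bold** text.")
#   -> "Title\n\nSome bold text."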

# --- Function for Rectangle Conversion ---
def convert_rect_to_dict(rect: fitz.Rect) -> Optional[Dict[str, float]]:
    """Converts a fitz.Rect object to a dictionary."""
    if not rect or not isinstance(rect, fitz.Rect):
        print(f"Warning: Invalid rect object received: {rect}")
        return None
    return {
        "x0": rect.x0,
        "y0": rect.y0,
        "x1": rect.x1,
        "y1": rect.y1,
        "width": rect.width,
        "height": rect.height
    }
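
# Note: PyMuPDF rectangles are measured in points (1/72 inch) with the origin
# at the page's top-left corner, so y values increase downward.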

# --- Helper function for mapping LT issues to PDF rectangles ---
def try_map_issues_to_page_rects(
    issues_to_map_for_context: List[Dict[str, Any]],
    pdf_rects: List[fitz.Rect],
    page_number_for_mapping: int # 1-based page number
) -> int:
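    """
    Heuristically pairs issues with rectangles by position: the i-th unmapped
    issue sharing a context string gets the i-th rectangle found for that
    string on the page. If a context occurs more or fewer times than it was
    flagged, some pairings may be off, so coordinates are best-effort.
    """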
    mapped_count = 0
    num_issues_to_try = len(issues_to_map_for_context)
    num_available_rects = len(pdf_rects)
    limit = min(num_issues_to_try, num_available_rects)

    for i in range(limit):
        issue_to_update = issues_to_map_for_context[i]
        if issue_to_update['is_mapped_to_pdf']:  # Already mapped; skip.
            continue
        pdf_rect = pdf_rects[i]
        coord_dict = convert_rect_to_dict(pdf_rect)
        if coord_dict:
            issue_to_update['pdf_coordinates_list'] = [coord_dict] # Store as list of dicts
            issue_to_update['is_mapped_to_pdf'] = True
            issue_to_update['mapped_page_number'] = page_number_for_mapping
            mapped_count += 1
        else:
            print(f"      Warning: Could not convert rect for context '{issue_to_update['context_text'][:30]}...' on page {page_number_for_mapping}")
    return mapped_count


# ------------------------------
# Analysis Functions
# ------------------------------

def extract_pdf_text(file_input: Any) -> str:
    """Extracts full text from a PDF file using PyMuPDF4LLM (as Markdown)."""
    temp_file_path_for_pymupdf4llm = None
    actual_path_to_process = None
    try:
        if isinstance(file_input, str):
            actual_path_to_process = file_input
        elif hasattr(file_input, 'read') and callable(file_input.read):
            temp_file_obj = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
            temp_file_path_for_pymupdf4llm = temp_file_obj.name
            file_input.seek(0)
            temp_file_obj.write(file_input.read())
            temp_file_obj.close()
            actual_path_to_process = temp_file_path_for_pymupdf4llm
        else:
            raise ValueError("Input 'file_input' must be a file path (str) or a file-like object.")

        doc_for_page_count = fitz.open(actual_path_to_process)
        page_count = len(doc_for_page_count)
        doc_for_page_count.close()
        print(f"PDF has {page_count} pages. Extracting Markdown using pymupdf4llm.")
        
        markdown_text = pymupdf4llm.to_markdown(actual_path_to_process)
        
        print(f"Total extracted Markdown text length: {len(markdown_text)} characters.")
        return markdown_text
        
    except Exception as e:
        print(f"Error extracting text from PDF: {str(e)}")
        traceback.print_exc()
        return ""
    finally:
        if temp_file_path_for_pymupdf4llm and os.path.exists(temp_file_path_for_pymupdf4llm):
            os.remove(temp_file_path_for_pymupdf4llm)


def check_text_presence(full_text: str, search_terms: List[str]) -> Dict[str, bool]:
    """Reports, case-insensitively, whether each search term occurs in the text."""
    return {term: term.lower() in full_text.lower() for term in search_terms}

def label_authors(full_text: str) -> str:
    # This function was in the original script but not directly used by analyze_pdf's output structure.
    # Keeping it in case it's called elsewhere or for future use.
    author_line_regex = r"^(?:.*\n)(.*?)(?:\n\n)"
    match = re.search(author_line_regex, full_text, re.MULTILINE)
    if match:
        authors = match.group(1).strip()
        return full_text.replace(authors, f"Authors: {authors}")
    return full_text

def check_metadata(plain_text: str) -> Dict[str, Any]:
    return {
        "author_email": bool(re.search(r'\b[\w.-]+?@\w+?\.\w+?\b', plain_text)),
        "list_of_authors": bool(re.search(r'Authors?:', plain_text, re.IGNORECASE)),
        "keywords_list": bool(re.search(r'Keywords?:', plain_text, re.IGNORECASE)),
        "word_count": len(plain_text.split()) or "Missing"
    }

def check_disclosures(plain_text: str) -> Dict[str, bool]:
    search_terms = [
        "conflict of interest statement",
        "ethics statement",
        "funding statement",
        "data access statement"
    ]
    results = check_text_presence(plain_text, search_terms)
    has_author_contribution = ("author contribution statement" in plain_text.lower() or
                               "author contributions statement" in plain_text.lower())
    results["author contribution statement"] = has_author_contribution
    return results

def check_figures_and_tables(plain_text: str) -> Dict[str, bool]:
    return {
        "figures_with_citations": bool(re.search(r'Figure \d+.*?citation', plain_text, re.IGNORECASE)),
        "figures_legends": bool(re.search(r'Figure \d+.*?legend', plain_text, re.IGNORECASE)),
        "tables_legends": bool(re.search(r'Table \d+.*?legend', plain_text, re.IGNORECASE))
    }

def check_references_summary(plain_text: str) -> Dict[str, Any]: # Renamed from check_references for clarity
    abstract_candidate = plain_text[:2000] 
    return {
        "old_references": bool(re.search(r'\b19[0-9]{2}\b', plain_text)), 
        "citations_in_abstract": bool(re.search(r'\[\d+\]', abstract_candidate, re.IGNORECASE)) or \
                                 bool(re.search(r'\bcit(?:ation|ed)\b', abstract_candidate, re.IGNORECASE)),
        "reference_count": len(re.findall(r'\[\d+(?:,\s*\d+)*\]', plain_text)), 
        "self_citations": bool(re.search(r'Self-citation', plain_text, re.IGNORECASE)) 
    }
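
# Example of the reference_count pattern: in "as shown in [1] and [2, 3]",
# r'\[\d+(?:,\s*\d+)*\]' finds ['[1]', '[2, 3]'], giving a reference_count of 2.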

def check_structure(plain_text: str) -> Dict[str, bool]:
    text_lower = plain_text.lower()
    return {
        "imrad_structure": all(section.lower() in text_lower for section in ["introduction", "method", "result", "discussion"]),
        "abstract_structure": "structured abstract" in text_lower
    }

def check_language_issues_and_regex(markdown_text_from_pdf: str) -> Dict[str, Any]:
    """
    Performs LanguageTool and specific regex checks on text derived from PDF's Markdown.
    Filters issues to only include those between "abstract" and "references/bibliography".
    Returns a list of issue dictionaries with fields for mapping.
    """
    if not markdown_text_from_pdf.strip():
        return {"total_issues": 0, "issues_list": [], "text_used_for_analysis": ""}

    plain_text_from_markdown = convert_markdown_to_plain_text(markdown_text_from_pdf)
    text_for_analysis = plain_text_from_markdown.replace('\n', ' ')
    text_for_analysis = re.sub(r'\s+', ' ', text_for_analysis).strip()

    if not text_for_analysis:
        return {"total_issues": 0, "issues_list": [], "text_used_for_analysis": ""}

    # --- Determine content boundaries ---
    text_for_analysis_lower = text_for_analysis.lower()
    
    abstract_match = re.search(r'\babstract\b', text_for_analysis_lower)
    # If "abstract" is found, analysis starts from its beginning. Otherwise, from text start.
    content_start_index = abstract_match.start() if abstract_match else 0
    if abstract_match:
        print(f"Found 'abstract' at index {content_start_index}")
    else:
        print(f"Did not find 'abstract', starting language analysis from index 0")

    references_match = re.search(r'\breferences\b', text_for_analysis_lower)
    bibliography_match = re.search(r'\bbibliography\b', text_for_analysis_lower)

    content_end_index = len(text_for_analysis) # Default to end of text

    if references_match and bibliography_match:
        content_end_index = min(references_match.start(), bibliography_match.start())
        print(f"Found 'references' at {references_match.start()} and 'bibliography' at {bibliography_match.start()}. Using {content_end_index} as end boundary.")
    elif references_match:
        content_end_index = references_match.start()
        print(f"Found 'references' at {content_end_index}. Using it as end boundary.")
    elif bibliography_match:
        content_end_index = bibliography_match.start()
        print(f"Found 'bibliography' at {content_end_index}. Using it as end boundary.")
    else:
        print(f"Did not find 'references' or 'bibliography'. Language analysis up to end of text (index {content_end_index}).")

    # If "abstract" is found after "references/bibliography", the range is invalid for filtering.
    # In such a case, or if no abstract is found, we might effectively process a very small or no region.
    # This logic correctly makes the valid region empty if abstract_start >= content_end.
    if content_start_index >= content_end_index:
        print(f"Warning: Content start index ({content_start_index}) is not before content end index ({content_end_index}). No language issues will be reported from this range.")
        # Effectively, no issues will pass the filter below.
    
    tool = None
    processed_issues: List[Dict[str, Any]] = []
    try:
        tool = language_tool_python.LanguageTool('en-US') 
        raw_lt_matches = tool.check(text_for_analysis)
        
        lt_issues_in_range = 0
        for idx, match in enumerate(raw_lt_matches):
            if match.ruleId == "EN_SPLIT_WORDS_HYPHEN": continue
            
            # Filter by content boundaries
            if not (content_start_index <= match.offset < content_end_index):
                continue
            lt_issues_in_range += 1

            context_str = text_for_analysis[match.offset : match.offset + match.errorLength]
            processed_issues.append({
                '_internal_id': f"lt_{idx}",
                'ruleId': match.ruleId,
                'message': match.message,
                'context_text': context_str, 
                'offset_in_text': match.offset,
                'error_length': match.errorLength,
                'replacements_suggestion': match.replacements[:3] if match.replacements else [],
                'category_name': match.category,
                'is_mapped_to_pdf': False,
                'pdf_coordinates_list': [], 
                'mapped_page_number': -1
            })
        print(f"LanguageTool found {len(raw_lt_matches)} raw issues, {lt_issues_in_range} issues within defined content range.")
        
        regex_pattern = r'\b(\w+)\[(\d+)\]'
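        # e.g. "text[12]" matches (word="text", number="12"); "text [12]" does not,
        # because the '[' must immediately follow the word.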
        regex_matches = list(re.finditer(regex_pattern, text_for_analysis))
        
        regex_issues_in_range = 0
        for reg_idx, match in enumerate(regex_matches):
            # Filter by content boundaries
            if not (content_start_index <= match.start() < content_end_index):
                continue
            regex_issues_in_range += 1
            
            word = match.group(1)
            number = match.group(2)
            processed_issues.append({
                '_internal_id': f"regex_{reg_idx}",
                'ruleId': "SPACE_BEFORE_BRACKET",
                'message': f"Missing space before '[' in '{word}[{number}]'. Should be '{word} [{number}]'.",
                'context_text': text_for_analysis[match.start():match.end()],
                'offset_in_text': match.start(),
                'error_length': match.end() - match.start(),
                'replacements_suggestion': [f"{word} [{number}]"],
                'category_name': "Formatting",
                'is_mapped_to_pdf': False,
                'pdf_coordinates_list': [],
                'mapped_page_number': -1
            })
        print(f"Regex check found {len(regex_matches)} raw matches, {regex_issues_in_range} issues within defined content range.")
        
        return {
            "total_issues": len(processed_issues),
            "issues_list": processed_issues,
            "text_used_for_analysis": text_for_analysis 
        }
    except Exception as e:
        print(f"Error in check_language_issues_and_regex: {e}")
        traceback.print_exc()
        return {"error": str(e), "total_issues": 0, "issues_list": [], "text_used_for_analysis": text_for_analysis}
    finally:
        if tool: tool.close()

def check_figure_order(plain_text: str) -> Dict[str, Any]:
    figure_pattern = r'Fig(?:ure)?\.?\s*(\d+)'  # matches "Fig 1", "Fig. 2", "Figure 3", ...
    figure_references_str = re.findall(figure_pattern, plain_text, re.IGNORECASE)
    
    valid_figure_numbers_int = []
    for num_str in figure_references_str:
        if num_str.isdigit():
            valid_figure_numbers_int.append(int(num_str))
    
    unique_sorted_figures = sorted(list(set(valid_figure_numbers_int)))
    is_sequential = all(unique_sorted_figures[i] + 1 == unique_sorted_figures[i+1] for i in range(len(unique_sorted_figures)-1))

    missing_figures = []
    if unique_sorted_figures:
        expected_figures = set(range(1, max(unique_sorted_figures) + 1))
        missing_figures = sorted(list(expected_figures - set(unique_sorted_figures)))
    
    counts = Counter(valid_figure_numbers_int)
    duplicate_refs = [num for num, count in counts.items() if count > 1]
    
    return {
        "sequential_order_of_unique_figures": is_sequential, 
        "figure_count_unique": len(unique_sorted_figures),
        "missing_figures_in_sequence_to_max": missing_figures, 
        "figure_order_as_encountered": valid_figure_numbers_int,
        "duplicate_references_to_same_figure_number": duplicate_refs
    }
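
# Example: text citing "Figure 2 ... Fig. 1 ... Figure 2" yields
# figure_order_as_encountered=[2, 1, 2], unique figures [1, 2] (sequential,
# none missing), and figure 2 reported as a duplicate reference.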

def check_reference_order(plain_text: str) -> Dict[str, Any]:
    reference_pattern = r'\[(\d+)\]' 
    references_str = re.findall(reference_pattern, plain_text)
    ref_numbers_int = [int(ref) for ref in references_str if ref.isdigit()]
    
    max_ref_val = 0
    out_of_order_details = []
    
    if ref_numbers_int:
        max_ref_val = max(ref_numbers_int)
        current_max_seen_in_text = 0
        for i, ref in enumerate(ref_numbers_int):
            if ref < current_max_seen_in_text:
                out_of_order_details.append({
                    "position_in_text_occurrences": i + 1,
                    "value": ref,
                    "previous_max_value_seen": current_max_seen_in_text,
                    "message": f"Reference [{ref}] appeared after a higher reference [{current_max_seen_in_text}] was already cited."
                })
            current_max_seen_in_text = max(current_max_seen_in_text, ref)

    all_expected_refs_up_to_max = set(range(1, max_ref_val + 1)) if max_ref_val > 0 else set()
    used_refs_set = set(ref_numbers_int)
    missing_refs_in_sequence_to_max = sorted(list(all_expected_refs_up_to_max - used_refs_set))
    
    is_ordered_in_text = all(ref_numbers_int[i] <= ref_numbers_int[i+1] for i in range(len(ref_numbers_int)-1))

    return {
        "max_reference_number_cited": max_ref_val,
        "out_of_order_citations_details": out_of_order_details, 
        "missing_references_up_to_max_cited": missing_refs_in_sequence_to_max,
        "is_citation_order_non_decreasing_in_text": is_ordered_in_text
    }
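
# Example: citations appearing in the order [1] [3] [2] give
# max_reference_number_cited=3, one out-of-order entry for [2] (it follows [3]),
# no missing references, and is_citation_order_non_decreasing_in_text=False.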

# ------------------------------
# Main Analysis Function
# ------------------------------

def analyze_pdf(filepath_or_stream: Any) -> Tuple[Dict[str, Any], None]:
    """
    Runs all checks on a PDF (file path or file-like object) and returns
    (results_dict, None). The second element is a placeholder where an
    annotated-PDF path used to be returned.
    """
    doc_for_mapping = None
    temp_fitz_file_path = None

    try:
        markdown_text = extract_pdf_text(filepath_or_stream)
        if not markdown_text:
            return {"error": "Failed to extract text (Markdown) from PDF."}, None
        
        plain_text_for_general_checks = convert_markdown_to_plain_text(markdown_text)
        cleaned_plain_text_for_regex = re.sub(r'\s+', ' ', plain_text_for_general_checks.replace('\n', ' ')).strip()

        # check_language_issues_and_regex filters issues to the Abstract..References region.
        language_and_regex_issue_report = check_language_issues_and_regex(markdown_text)
        
        if "error" in language_and_regex_issue_report:
            return {"error": f"Language/Regex check error: {language_and_regex_issue_report['error']}"}, None
        
        detailed_issues_for_mapping = language_and_regex_issue_report.get("issues_list", [])

        if detailed_issues_for_mapping:
            # Coordinate mapping operates on the already-filtered issue list.
            if isinstance(filepath_or_stream, str):
                pdf_path_for_fitz = filepath_or_stream
            elif hasattr(filepath_or_stream, 'read') and callable(filepath_or_stream.read):
                filepath_or_stream.seek(0)
                temp_fitz_file = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
                temp_fitz_file_path = temp_fitz_file.name
                temp_fitz_file.write(filepath_or_stream.read())
                temp_fitz_file.close()
                pdf_path_for_fitz = temp_fitz_file_path
            else:
                # This case should ideally be caught by extract_pdf_text, but good to have a fallback
                return {"error": "Invalid PDF input for coordinate mapping."}, None

            try:
                doc_for_mapping = fitz.open(pdf_path_for_fitz)
                if doc_for_mapping.page_count > 0:
                    # The enclosing `if detailed_issues_for_mapping:` guard guarantees
                    # there is at least one issue to map at this point.
                    print(f"\n--- Mapping {len(detailed_issues_for_mapping)} Issues (filtered) to PDF Coordinates ---")
                    for page_idx in range(doc_for_mapping.page_count):
                        page = doc_for_mapping[page_idx]
                        current_page_num_1_based = page_idx + 1

                        # Group the still-unmapped issues by their context string.
                        unmapped_issues_by_context = defaultdict(list)
                        for issue_dict in detailed_issues_for_mapping:
                            if not issue_dict['is_mapped_to_pdf']:
                                unmapped_issues_by_context[issue_dict['context_text']].append(issue_dict)

                        if not unmapped_issues_by_context:
                            break  # Every issue is mapped; no need to scan further pages.

                        for ctx_str, issues_for_ctx in unmapped_issues_by_context.items():
                            if not ctx_str.strip(): continue
                            try:
                                # TEXT_PRESERVE_LIGATURES and TEXT_PRESERVE_WHITESPACE improve matching
                                # against text derived from pymupdf4llm, which tries to preserve structure.
                                pdf_rects = page.search_for(ctx_str, flags=fitz.TEXT_PRESERVE_LIGATURES | fitz.TEXT_PRESERVE_WHITESPACE)
                                if pdf_rects:
                                    try_map_issues_to_page_rects(issues_for_ctx, pdf_rects, current_page_num_1_based)
                            except Exception as search_exc:
                                print(f"Warning: Error searching for context '{ctx_str[:30]}' on page {current_page_num_1_based}: {search_exc}")
                    total_mapped = sum(1 for iss in detailed_issues_for_mapping if iss['is_mapped_to_pdf'])
                    print(f"Finished coordinate mapping. Mapped issues: {total_mapped}/{len(detailed_issues_for_mapping)}.")
            except Exception as e_map:
                print(f"Error during PDF coordinate mapping: {e_map}")
                traceback.print_exc()
            finally:
                if doc_for_mapping: doc_for_mapping.close()
                if temp_fitz_file_path and os.path.exists(temp_fitz_file_path):
                    os.unlink(temp_fitz_file_path)
        
        final_formatted_issues_list = []
        for issue_data in detailed_issues_for_mapping: # This list is already filtered
            page_num_for_json = 0
            coords_for_json = []
            if issue_data['is_mapped_to_pdf'] and issue_data['pdf_coordinates_list']:
                # Assuming pdf_coordinates_list stores a list of dicts, take the first one
                coord_dict = issue_data['pdf_coordinates_list'][0] 
                coords_for_json = [coord_dict['x0'], coord_dict['y0'], coord_dict['x1'], coord_dict['y1']]
                page_num_for_json = issue_data['mapped_page_number']
            
            final_formatted_issues_list.append({
                "message": issue_data['message'],
                "context": issue_data['context_text'], 
                "suggestions": issue_data['replacements_suggestion'],
                "category": issue_data['category_name'],
                "rule_id": issue_data['ruleId'],
                "offset": issue_data['offset_in_text'], 
                "length": issue_data['error_length'],   
                "coordinates": coords_for_json,
                "page": page_num_for_json
            })

        results = {
            "issues": final_formatted_issues_list, # This will now contain only filtered issues
            "document_checks": { 
                "metadata": check_metadata(cleaned_plain_text_for_regex),
                "disclosures": check_disclosures(cleaned_plain_text_for_regex),
                "figures_and_tables": check_figures_and_tables(cleaned_plain_text_for_regex),
                "references_summary": check_references_summary(cleaned_plain_text_for_regex), 
                "structure": check_structure(cleaned_plain_text_for_regex),
                "figure_order_analysis": check_figure_order(cleaned_plain_text_for_regex), 
                "reference_order_analysis": check_reference_order(cleaned_plain_text_for_regex), 
                "plain_language_summary_present": bool(re.search(r'plain language summary', cleaned_plain_text_for_regex, re.IGNORECASE)),
                "readability_issues_detected": False, # Placeholder, not implemented
            }
        }
        
        return results, None 

    except Exception as e:
        print(f"Overall analysis error in analyze_pdf: {e}")
        traceback.print_exc()
        # Ensure cleanup even if an early error occurs
        if doc_for_mapping: doc_for_mapping.close()
        if temp_fitz_file_path and os.path.exists(temp_fitz_file_path):
            os.unlink(temp_fitz_file_path)
        return {"error": str(e)}, None

# ------------------------------
# Gradio Interface
# ------------------------------

def process_upload(file_data_binary: bytes) -> Tuple[str, Optional[str]]:
    if file_data_binary is None:
        return json.dumps({"error": "No file uploaded"}, indent=2), None

    temp_input_path = None
    try:
        # Create a temporary file with .pdf extension from the binary data
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_input_file:
            temp_input_file.write(file_data_binary)
            temp_input_path = temp_input_file.name
        print(f"Temporary PDF for analysis: {temp_input_path}")
        
        results_dict, _ = analyze_pdf(temp_input_path) # Pass the path to the temp file
        
        results_json = json.dumps(results_dict, indent=2, ensure_ascii=False)
        return results_json, None # No annotated PDF path to return for now

    except Exception as e:
        print(f"Error in process_upload: {e}")
        error_message = json.dumps({"error": str(e), "traceback": traceback.format_exc()}, indent=2)
        return error_message, None
    finally:
        if temp_input_path and os.path.exists(temp_input_path):
            os.unlink(temp_input_path)
            print(f"Cleaned up temporary file: {temp_input_path}")


def create_interface():
    with gr.Blocks(title="PDF Analyzer") as interface:
        gr.Markdown("# PDF Analyzer")
        gr.Markdown("Upload a PDF document to analyze its structure, references, language, and more. Language issues will include PDF coordinates if found, and are filtered to appear between 'Abstract' and 'References/Bibliography'.")
        
        with gr.Row():
            file_input = gr.File(
                label="Upload PDF",
                file_types=[".pdf"],
                type="binary" # Changed to binary to handle uploads directly
            )
        
        with gr.Row():
            analyze_btn = gr.Button("Analyze PDF")
        
        with gr.Row():
            results_output = gr.JSON(
                label="Analysis Results (Coordinates for issues in 'issues' list)",
                show_label=True
            )
        
        with gr.Row():
            # Keeping the placeholder for PDF output, but it's not functional for annotation
            pdf_output = gr.File(
                label="Annotated PDF (Functionality Removed - View Coordinates in JSON)",
                show_label=True,
                # value=None # Ensure it's empty initially
            )
        
        analyze_btn.click(
            fn=process_upload,
            inputs=[file_input],
            outputs=[results_output, pdf_output] # pdf_output will receive None
        )
    return interface

if __name__ == "__main__":
    print("\n--- Launching Gradio Interface ---")
    # JAVA_HOME is set near the top of this script for LanguageTool's Java backend.
    print("JAVA_HOME is set to:", os.environ.get('JAVA_HOME'))

    # Check if LanguageTool can be initialized (optional check)
    try:
        lt_test = language_tool_python.LanguageTool('en-US')
        lt_test.close()
        print("LanguageTool initialized successfully.")
    except Exception as lt_e:
        print(f"Warning: Could not initialize LanguageTool. Language checks might fail: {lt_e}")
        print("Please ensure Java is installed and JAVA_HOME is correctly set.")
        print("For example, on Ubuntu with OpenJDK 11: export JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64")


    interface = create_interface()
    interface.launch(
        share=False, # Set to True for public link if ngrok is installed
        server_port=None # Gradio will pick an available port
    )