Update app.py
app.py CHANGED
@@ -19,14 +19,14 @@ os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-11-openjdk-amd64'
 # Analysis Functions
 # ------------------------------
 
-def extract_pdf_text_by_page(file) -> List[str]:
-    """Extracts text from a PDF file, page by page, using PyMuPDF."""
-    if isinstance(file, str):
-        with fitz.open(file) as doc:
-            return [page.get_text("text") for page in doc]
-    else:
-        with fitz.open(stream=file.read(), filetype="pdf") as doc:
-            return [page.get_text("text") for page in doc]
+# def extract_pdf_text_by_page(file) -> List[str]:
+#     """Extracts text from a PDF file, page by page, using PyMuPDF."""
+#     if isinstance(file, str):
+#         with fitz.open(file) as doc:
+#             return [page.get_text("text") for page in doc]
+#     else:
+#         with fitz.open(stream=file.read(), filetype="pdf") as doc:
+#             return [page.get_text("text") for page in doc]
 
 def extract_pdf_text(file) -> str:
     """Extracts full text from a PDF file using PyMuPDF."""
@@ -34,23 +34,12 @@ def extract_pdf_text(file) -> str:
     doc = fitz.open(stream=file.read(), filetype="pdf") if not isinstance(file, str) else fitz.open(file)
     full_text = ""
 
-    for
-
-
-
-
-
-            text = block[4]  # The text content is at index 4
-
-            # Handle line-break hyphens
-            # text = re.sub(r'(\w+)-\s*\n\s*(\w+)', lambda m: m.group(1) + m.group(2), text)
-
-            # Preserve regular hyphens within words (e.g., "state-of-the-art")
-            processed_text += text + "\n"
-
-        full_text += processed_text
-        print(f"Extracted text from page {page_num}: {len(processed_text)} characters.")
-
+    for page_number in range(len(doc)):
+        page = doc[page_number]
+        words = page.get_text("word")
+        full_text += words
+
+    print(full_text)
     doc.close()
     print(f"Total extracted text length: {len(full_text)} characters.")
     return full_text
@@ -125,6 +114,10 @@ def check_language_issues(full_text: str) -> Dict[str, Any]:
 
     # Process LanguageTool matches
     for match in matches:
+        # Ignore issues with rule_id 'EN_SPLIT_WORDS_HYPHEN'
+        if match.ruleId == "EN_SPLIT_WORDS_HYPHEN":
+            continue
+
         issues.append({
             "message": match.message,
             "context": match.context.strip(),
@@ -249,18 +242,21 @@ def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> bytes:
     word_list = []  # List of tuples: (page_number, word, x0, y0, x1, y1)
     for page_number in range(len(doc)):
         page = doc[page_number]
+        print(page.get_text("words"))
         words = page.get_text("words")  # List of tuples: (x0, y0, x1, y1, "word", block_no, line_no, word_no)
         for w in words:
             # print(w)
             word_text = w[4]
             # **Fix:** Insert a space before '[' to ensure "globally [2]" instead of "globally[2]"
-            if '[' in word_text:
-                word_text = word_text.replace('[', ' [')
+            # if '[' in word_text:
+            #     word_text = word_text.replace('[', ' [')
             word_list.append((page_number, word_text, w[0], w[1], w[2], w[3]))
     # print(f"Total words extracted: {len(word_list)}")
 
     # Concatenate all words to form the full text
+    concatenated_text=""
     concatenated_text = " ".join([w[1] for w in word_list])
+
     # print(f"Concatenated text length: {concatenated_text} characters.")
 
     # Find "References" section and exclude from processing
@@ -277,8 +273,8 @@ def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> bytes:
             continue
 
 
-        error_text = concatenated_text[offset:offset+length]
-
+        error_text = concatenated_text[offset:offset+length]
+        print(f"\nIssue {idx}: '{error_text}' at offset {offset} with length {length}")
 
         # Find the words that fall within the error span
         current_pos = 0
@@ -350,7 +346,8 @@ def analyze_pdf(filepath: str) -> Tuple[Dict[str, Any], bytes]:
     full_text = extract_pdf_text(filepath)
     if not full_text:
         return {"error": "Failed to extract text from PDF."}, None
-
+
+    # print(full_text)
     language_issues = check_language_issues(full_text)
 
     # Handle potential errors from check_language_issues
@@ -456,7 +453,7 @@ def create_interface():
 if __name__ == "__main__":
     interface = create_interface()
     interface.launch(
-        share=
+        share=False,  # Set to False in production
         # server_name="0.0.0.0",
         server_port=None
     )