samyak152002 committed on
Commit
810882a
·
verified ·
1 Parent(s): 4bb46a1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -31
app.py CHANGED
@@ -19,14 +19,14 @@ os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-11-openjdk-amd64'
19
  # Analysis Functions
20
  # ------------------------------
21
 
22
- def extract_pdf_text_by_page(file) -> List[str]:
23
- """Extracts text from a PDF file, page by page, using PyMuPDF."""
24
- if isinstance(file, str):
25
- with fitz.open(file) as doc:
26
- return [page.get_text("text") for page in doc]
27
- else:
28
- with fitz.open(stream=file.read(), filetype="pdf") as doc:
29
- return [page.get_text("text") for page in doc]
30
 
31
  def extract_pdf_text(file) -> str:
32
  """Extracts full text from a PDF file using PyMuPDF."""
@@ -34,23 +34,12 @@ def extract_pdf_text(file) -> str:
34
  doc = fitz.open(stream=file.read(), filetype="pdf") if not isinstance(file, str) else fitz.open(file)
35
  full_text = ""
36
 
37
- for page_num, page in enumerate(doc, start=1):
38
- # Get text blocks with their coordinates
39
- blocks = page.get_text("blocks")
40
- processed_text = ""
41
-
42
- for block in blocks:
43
- text = block[4] # The text content is at index 4
44
-
45
- # Handle line-break hyphens
46
- # text = re.sub(r'(\w+)-\s*\n\s*(\w+)', lambda m: m.group(1) + m.group(2), text)
47
-
48
- # Preserve regular hyphens within words (e.g., "state-of-the-art")
49
- processed_text += text + "\n"
50
-
51
- full_text += processed_text
52
- print(f"Extracted text from page {page_num}: {len(processed_text)} characters.")
53
-
54
  doc.close()
55
  print(f"Total extracted text length: {len(full_text)} characters.")
56
  return full_text
@@ -125,6 +114,10 @@ def check_language_issues(full_text: str) -> Dict[str, Any]:
125
 
126
  # Process LanguageTool matches
127
  for match in matches:
 
 
 
 
128
  issues.append({
129
  "message": match.message,
130
  "context": match.context.strip(),
@@ -249,18 +242,21 @@ def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> byt
249
  word_list = [] # List of tuples: (page_number, word, x0, y0, x1, y1)
250
  for page_number in range(len(doc)):
251
  page = doc[page_number]
 
252
  words = page.get_text("words") # List of tuples: (x0, y0, x1, y1, "word", block_no, line_no, word_no)
253
  for w in words:
254
  # print(w)
255
  word_text = w[4]
256
  # **Fix:** Insert a space before '[' to ensure "globally [2]" instead of "globally[2]"
257
- if '[' in word_text:
258
- word_text = word_text.replace('[', ' [')
259
  word_list.append((page_number, word_text, w[0], w[1], w[2], w[3]))
260
  # print(f"Total words extracted: {len(word_list)}")
261
 
262
  # Concatenate all words to form the full text
 
263
  concatenated_text = " ".join([w[1] for w in word_list])
 
264
  # print(f"Concatenated text length: {concatenated_text} characters.")
265
 
266
  # Find "References" section and exclude from processing
@@ -277,8 +273,8 @@ def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> byt
277
  continue
278
 
279
 
280
- error_text = concatenated_text[offset:offset+length+1]
281
- # print(f"\nIssue {idx}: '{error_text}' at offset {offset} with length {length}")
282
 
283
  # Find the words that fall within the error span
284
  current_pos = 0
@@ -350,7 +346,8 @@ def analyze_pdf(filepath: str) -> Tuple[Dict[str, Any], bytes]:
350
  full_text = extract_pdf_text(filepath)
351
  if not full_text:
352
  return {"error": "Failed to extract text from PDF."}, None
353
-
 
354
  language_issues = check_language_issues(full_text)
355
 
356
  # Handle potential errors from check_language_issues
@@ -456,7 +453,7 @@ def create_interface():
456
  if __name__ == "__main__":
457
  interface = create_interface()
458
  interface.launch(
459
- share=True, # Set to False in production
460
  # server_name="0.0.0.0",
461
  server_port=None
462
  )
 
19
  # Analysis Functions
20
  # ------------------------------
21
 
22
+ # def extract_pdf_text_by_page(file) -> List[str]:
23
+ # """Extracts text from a PDF file, page by page, using PyMuPDF."""
24
+ # if isinstance(file, str):
25
+ # with fitz.open(file) as doc:
26
+ # return [page.get_text("text") for page in doc]
27
+ # else:
28
+ # with fitz.open(stream=file.read(), filetype="pdf") as doc:
29
+ # return [page.get_text("text") for page in doc]
30
 
31
  def extract_pdf_text(file) -> str:
32
  """Extracts full text from a PDF file using PyMuPDF."""
 
34
  doc = fitz.open(stream=file.read(), filetype="pdf") if not isinstance(file, str) else fitz.open(file)
35
  full_text = ""
36
 
37
+ for page_number in range(len(doc)):
38
+ page = doc[page_number]
39
+ words = page.get_text("word")
40
+ full_text += words
41
+
42
+ print(full_text)
 
 
 
 
 
 
 
 
 
 
 
43
  doc.close()
44
  print(f"Total extracted text length: {len(full_text)} characters.")
45
  return full_text
 
114
 
115
  # Process LanguageTool matches
116
  for match in matches:
117
+ # Ignore issues with rule_id 'EN_SPLIT_WORDS_HYPHEN'
118
+ if match.ruleId == "EN_SPLIT_WORDS_HYPHEN":
119
+ continue
120
+
121
  issues.append({
122
  "message": match.message,
123
  "context": match.context.strip(),
 
242
  word_list = [] # List of tuples: (page_number, word, x0, y0, x1, y1)
243
  for page_number in range(len(doc)):
244
  page = doc[page_number]
245
+ print(page.get_text("words"))
246
  words = page.get_text("words") # List of tuples: (x0, y0, x1, y1, "word", block_no, line_no, word_no)
247
  for w in words:
248
  # print(w)
249
  word_text = w[4]
250
  # **Fix:** Insert a space before '[' to ensure "globally [2]" instead of "globally[2]"
251
+ # if '[' in word_text:
252
+ # word_text = word_text.replace('[', ' [')
253
  word_list.append((page_number, word_text, w[0], w[1], w[2], w[3]))
254
  # print(f"Total words extracted: {len(word_list)}")
255
 
256
  # Concatenate all words to form the full text
257
+ concatenated_text=""
258
  concatenated_text = " ".join([w[1] for w in word_list])
259
+
260
  # print(f"Concatenated text length: {concatenated_text} characters.")
261
 
262
  # Find "References" section and exclude from processing
 
273
  continue
274
 
275
 
276
+ error_text = concatenated_text[offset:offset+length]
277
+ print(f"\nIssue {idx}: '{error_text}' at offset {offset} with length {length}")
278
 
279
  # Find the words that fall within the error span
280
  current_pos = 0
 
346
  full_text = extract_pdf_text(filepath)
347
  if not full_text:
348
  return {"error": "Failed to extract text from PDF."}, None
349
+
350
+ # print(full_text)
351
  language_issues = check_language_issues(full_text)
352
 
353
  # Handle potential errors from check_language_issues
 
453
  if __name__ == "__main__":
454
  interface = create_interface()
455
  interface.launch(
456
+ share=False, # Set to False in production
457
  # server_name="0.0.0.0",
458
  server_port=None
459
  )