samyak152002 committed on
Commit
54a4d88
·
verified ·
1 Parent(s): f5f575c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -1
app.py CHANGED
@@ -19,6 +19,10 @@ import gradio as gr
19
  # Set JAVA_HOME environment variable (from target script)
20
  os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-11-openjdk-amd64'
21
 
 
 
 
 
22
 
23
  # --- Functions for PDF to Markdown to Plain Text ---
24
  def convert_markdown_to_plain_text(markdown_text: str) -> str:
@@ -175,6 +179,7 @@ def check_structure(plain_text: str) -> Dict[str, bool]:
175
  "abstract_structure": "structured abstract" in text_lower
176
  }
177
 
 
178
  def check_language_issues_and_regex(markdown_text_from_pdf: str) -> Dict[str, Any]:
179
  """
180
  Performs LanguageTool and specific regex checks on text derived from PDF's Markdown.
@@ -230,7 +235,9 @@ def check_language_issues_and_regex(markdown_text_from_pdf: str) -> Dict[str, An
230
  processed_issues: List[Dict[str, Any]] = []
231
 
232
  try:
 
233
  tool = language_tool_python.LanguageTool('en-US')
 
234
  raw_lt_matches = tool.check(text_for_analysis)
235
 
236
  # Define a set of rule IDs to ignore
@@ -250,7 +257,7 @@ def check_language_issues_and_regex(markdown_text_from_pdf: str) -> Dict[str, An
250
  continue
251
  lt_issues_in_range +=1
252
 
253
- context_str = text_for_analysis[match.offset : match.offset + match.errorLength]
254
  processed_issues.append({
255
  '_internal_id': f"lt_{idx}",
256
  'ruleId': match.ruleId,
 
19
  # Set JAVA_HOME environment variable (from target script)
20
  os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-11-openjdk-amd64'
21
 
22
+ global_constants = {
23
+ "CONTEXT_LENGTH" : 3
24
+ }
25
+
26
 
27
  # --- Functions for PDF to Markdown to Plain Text ---
28
  def convert_markdown_to_plain_text(markdown_text: str) -> str:
 
179
  "abstract_structure": "structured abstract" in text_lower
180
  }
181
 
182
+
183
  def check_language_issues_and_regex(markdown_text_from_pdf: str) -> Dict[str, Any]:
184
  """
185
  Performs LanguageTool and specific regex checks on text derived from PDF's Markdown.
 
235
  processed_issues: List[Dict[str, Any]] = []
236
 
237
  try:
238
+
239
  tool = language_tool_python.LanguageTool('en-US')
240
+ print(text_for_analysis)
241
  raw_lt_matches = tool.check(text_for_analysis)
242
 
243
  # Define a set of rule IDs to ignore
 
257
  continue
258
  lt_issues_in_range +=1
259
 
260
+ context_str = text_for_analysis[match.offset - global_constants["CONTEXT_LENGTH"] : match.offset + match.errorLength + global_constants["CONTEXT_LENGTH"]]
261
  processed_issues.append({
262
  '_internal_id': f"lt_{idx}",
263
  'ruleId': match.ruleId,