Update app.py
Browse files
app.py
CHANGED
@@ -19,6 +19,10 @@ import gradio as gr
|
|
19 |
# Set JAVA_HOME environment variable (from target script)
|
20 |
os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-11-openjdk-amd64'
|
21 |
|
|
|
|
|
|
|
|
|
22 |
|
23 |
# --- Functions for PDF to Markdown to Plain Text ---
|
24 |
def convert_markdown_to_plain_text(markdown_text: str) -> str:
|
@@ -175,6 +179,7 @@ def check_structure(plain_text: str) -> Dict[str, bool]:
|
|
175 |
"abstract_structure": "structured abstract" in text_lower
|
176 |
}
|
177 |
|
|
|
178 |
def check_language_issues_and_regex(markdown_text_from_pdf: str) -> Dict[str, Any]:
|
179 |
"""
|
180 |
Performs LanguageTool and specific regex checks on text derived from PDF's Markdown.
|
@@ -230,7 +235,9 @@ def check_language_issues_and_regex(markdown_text_from_pdf: str) -> Dict[str, An
|
|
230 |
processed_issues: List[Dict[str, Any]] = []
|
231 |
|
232 |
try:
|
|
|
233 |
tool = language_tool_python.LanguageTool('en-US')
|
|
|
234 |
raw_lt_matches = tool.check(text_for_analysis)
|
235 |
|
236 |
# Define a set of rule IDs to ignore
|
@@ -250,7 +257,7 @@ def check_language_issues_and_regex(markdown_text_from_pdf: str) -> Dict[str, An
|
|
250 |
continue
|
251 |
lt_issues_in_range +=1
|
252 |
|
253 |
-
context_str = text_for_analysis[match.offset : match.offset + match.errorLength]
|
254 |
processed_issues.append({
|
255 |
'_internal_id': f"lt_{idx}",
|
256 |
'ruleId': match.ruleId,
|
|
|
19 |
# Set JAVA_HOME environment variable (from target script)
|
20 |
os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-11-openjdk-amd64'
|
21 |
|
22 |
+
global_constants = {
|
23 |
+
"CONTEXT_LENGTH" : 3
|
24 |
+
}
|
25 |
+
|
26 |
|
27 |
# --- Functions for PDF to Markdown to Plain Text ---
|
28 |
def convert_markdown_to_plain_text(markdown_text: str) -> str:
|
|
|
179 |
"abstract_structure": "structured abstract" in text_lower
|
180 |
}
|
181 |
|
182 |
+
|
183 |
def check_language_issues_and_regex(markdown_text_from_pdf: str) -> Dict[str, Any]:
|
184 |
"""
|
185 |
Performs LanguageTool and specific regex checks on text derived from PDF's Markdown.
|
|
|
235 |
processed_issues: List[Dict[str, Any]] = []
|
236 |
|
237 |
try:
|
238 |
+
|
239 |
tool = language_tool_python.LanguageTool('en-US')
|
240 |
+
print(text_for_analysis)
|
241 |
raw_lt_matches = tool.check(text_for_analysis)
|
242 |
|
243 |
# Define a set of rule IDs to ignore
|
|
|
257 |
continue
|
258 |
lt_issues_in_range +=1
|
259 |
|
260 |
+
# Build the context window: the flagged span plus CONTEXT_LENGTH characters on each side.
# Clamp the start at 0 -- a negative start index would be interpreted as counting from the
# END of the string, producing an empty or wrong context for matches near the beginning.
# (An end index past len(text) is already clamped safely by slicing.)
context_start = max(0, match.offset - global_constants["CONTEXT_LENGTH"])
context_end = match.offset + match.errorLength + global_constants["CONTEXT_LENGTH"]
context_str = text_for_analysis[context_start:context_end]
|
261 |
processed_issues.append({
|
262 |
'_internal_id': f"lt_{idx}",
|
263 |
'ruleId': match.ruleId,
|