Spaces:
Sleeping
Sleeping
Omar ID EL MOUMEN
committed on
Commit
·
07e2819
1
Parent(s):
c2b2088
Fixing missing references case
Browse files
app.py
CHANGED
@@ -77,7 +77,7 @@ async def extract_text_pdf(id_doc: str):
|
|
77 |
ref_pos = ref_pos.end()
|
78 |
|
79 |
if ref_pos is not None:
|
80 |
-
|
81 |
|
82 |
def remove_in_betweens(text):
|
83 |
removed_brackets = re.sub(r'\[.*?\]', ' ', text)
|
@@ -85,9 +85,9 @@ async def extract_text_pdf(id_doc: str):
|
|
85 |
return removed_parentheses
|
86 |
|
87 |
def remove_punctuations(text):
|
88 |
-
return re.sub(r"[\,\;\:\?\!\'\’\"\(\)\{\}\[\]\/\\\*\-]", '
|
89 |
|
90 |
-
postprocess_text = remove_in_betweens(
|
91 |
postprocess_text = remove_punctuations(postprocess_text)
|
92 |
regex_titles = r"(?:[IVX]+|[0-9]+)\.\s[A-Z0-9\s]+$"
|
93 |
titles = doc.get_toc()
|
|
|
77 |
ref_pos = ref_pos.end()
|
78 |
|
79 |
if ref_pos is not None:
|
80 |
+
pdf_text = pdf_text[:ref_pos - 10]
|
81 |
|
82 |
def remove_in_betweens(text):
|
83 |
removed_brackets = re.sub(r'\[.*?\]', ' ', text)
|
|
|
85 |
return removed_parentheses
|
86 |
|
87 |
def remove_punctuations(text):
|
88 |
+
return re.sub(r"[\,\;\:\?\!\'\’\"\(\)\{\}\[\]\/\\\*\-]", '', text)
|
89 |
|
90 |
+
postprocess_text = remove_in_betweens(pdf_text)
|
91 |
postprocess_text = remove_punctuations(postprocess_text)
|
92 |
regex_titles = r"(?:[IVX]+|[0-9]+)\.\s[A-Z0-9\s]+$"
|
93 |
titles = doc.get_toc()
|