Omar ID EL MOUMEN committed on
Commit
07e2819
·
1 Parent(s): c2b2088

Fixing missing references case

Browse files
Files changed (1) hide show
  1. app.py +3 -3
app.py CHANGED
@@ -77,7 +77,7 @@ async def extract_text_pdf(id_doc: str):
77
  ref_pos = ref_pos.end()
78
 
79
  if ref_pos is not None:
80
- postprocess_text = pdf_text[:ref_pos - 10]
81
 
82
  def remove_in_betweens(text):
83
  removed_brackets = re.sub(r'\[.*?\]', ' ', text)
@@ -85,9 +85,9 @@ async def extract_text_pdf(id_doc: str):
85
  return removed_parentheses
86
 
87
  def remove_punctuations(text):
88
- return re.sub(r"[\,\;\:\?\!\'\’\"\(\)\{\}\[\]\/\\\*\-]", ' ', text)
89
 
90
- postprocess_text = remove_in_betweens(postprocess_text)
91
  postprocess_text = remove_punctuations(postprocess_text)
92
  regex_titles = r"(?:[IVX]+|[0-9]+)\.\s[A-Z0-9\s]+$"
93
  titles = doc.get_toc()
 
77
  ref_pos = ref_pos.end()
78
 
79
  if ref_pos is not None:
80
+ pdf_text = pdf_text[:ref_pos - 10]
81
 
82
  def remove_in_betweens(text):
83
  removed_brackets = re.sub(r'\[.*?\]', ' ', text)
 
85
  return removed_parentheses
86
 
87
  def remove_punctuations(text):
88
+ return re.sub(r"[\,\;\:\?\!\'\’\"\(\)\{\}\[\]\/\\\*\-]", '', text)
89
 
90
+ postprocess_text = remove_in_betweens(pdf_text)
91
  postprocess_text = remove_punctuations(postprocess_text)
92
  regex_titles = r"(?:[IVX]+|[0-9]+)\.\s[A-Z0-9\s]+$"
93
  titles = doc.get_toc()