Spaces:

OrganizedProgrammers
/

arXiv

Sleeping

Omar ID EL MOUMEN committed on Mar 27

Commit

db5cf0a

1 Parent(s): a039626

Accept sub chapters

Files changed (1) hide show

app.py CHANGED Viewed

@@ -111,7 +111,7 @@ async def extract_text_pdf(document: DocumentID):
             main_titles = re.findall(regex_titles, postprocess_text, flags=re.MULTILINE)
         else:
             for title in titles:
-                if title[0] == 1:
                     main_titles.append(title[1])
         return {"pub_id": document.doc_id, "titles": [re.sub(r"\s+", " ", remove_punctuations(remove_in_betweens(t))).strip() for t in main_titles], "text": postprocess_text, "error": False} if len(main_titles) > 0 else {"pub_id": document.doc_id, "titles": "No titles found !", "text": postprocess_text, "error": False}
     else:

             main_titles = re.findall(regex_titles, postprocess_text, flags=re.MULTILINE)
         else:
             for title in titles:
+                if title[0] == 1 or title[0] == 2:
                     main_titles.append(title[1])
         return {"pub_id": document.doc_id, "titles": [re.sub(r"\s+", " ", remove_punctuations(remove_in_betweens(t))).strip() for t in main_titles], "text": postprocess_text, "error": False} if len(main_titles) > 0 else {"pub_id": document.doc_id, "titles": "No titles found !", "text": postprocess_text, "error": False}
     else: