Standard_Intelligence_Dev

Sleeping

App Files Community

heymenn committed on Apr 11, 2024

Commit

36947a6

verified ·

1 Parent(s): 2409e73

Update scrape_3gpp.py

Browse files

Files changed (1) hide show

scrape_3gpp.py +1 -6

scrape_3gpp.py CHANGED Viewed

@@ -424,22 +424,18 @@ def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Pro
                             extracted_content.append(discussion_details)
                         elif category == "pdf":
-                            print("Entered the PDF category")
                             tabLine = []
                             file = pdfReader
                             pdfNumberPages = len(file.pages)
-                            print(f"This is the number of pages : {pdfNumberPages}")
                             for pdfPage in range(0, pdfNumberPages):
                                 load_page = file.get_page(pdfPage)
                                 text = load_page.extract_text()
                                 lines = text.split("\n")
-                                print(f"This is the lines : {lines}")
-                                keyword = ["objective", "introduction", "summary", "scope"]
                                 for indexPdf,line in enumerate(lines):
-                                    print(line)
                                     if len(line) < 20:
                                       for key in keyword:
@@ -448,7 +444,6 @@ def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Pro
                                           if key in line:
                                               selectedText = lines[indexPdf:]
                                               tabLine.append([pdfPage,selectedText,key])
-                                              print(f"Selected line in keywords is: {line}")
                             for r in tabLine:
                                 extracted_content.append(f'PDF Page number {r[0]} extracted text from the KEYWORD {r[2]} : \n')

                             extracted_content.append(discussion_details)
                         elif category == "pdf":
                             tabLine = []
                             file = pdfReader
                             pdfNumberPages = len(file.pages)
                             for pdfPage in range(0, pdfNumberPages):
                                 load_page = file.get_page(pdfPage)
                                 text = load_page.extract_text()
                                 lines = text.split("\n")
+                                keyword = ["objective", "introduction", "summary", "scope", "conclusion"]
                                 for indexPdf,line in enumerate(lines):
                                     if len(line) < 20:
                                       for key in keyword:
                                           if key in line:
                                               selectedText = lines[indexPdf:]
                                               tabLine.append([pdfPage,selectedText,key])
                             for r in tabLine:
                                 extracted_content.append(f'PDF Page number {r[0]} extracted text from the KEYWORD {r[2]} : \n')