Standard_Intelligence_Dev

Sleeping

heymenn committed on Apr 11, 2024

Commit

2409e73

verified ·

1 Parent(s): aef2e85

Update scrape_3gpp.py

Files changed (1) hide show

scrape_3gpp.py CHANGED Viewed

@@ -437,17 +437,19 @@ def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Pro
                                 lines = text.split("\n")
                                 print(f"This is the lines : {lines}")
                                 keyword = ["objective", "introduction", "summary", "scope"]
-                                for line in lines:
                                     print(line)
                                     if len(line) < 20:
                                       for key in keyword:
                                           line = line.lower()
-                                          if key in line:
-                                            start_index = line.find(key)
-                                            selectedText = lines[start_index:]
-                                            tabLine.append([pdfPage,selectedText,key])
-                                            print(f"Selected line in keywords is: {line}")
                             for r in tabLine:
                                 extracted_content.append(f'PDF Page number {r[0]} extracted text from the KEYWORD {r[2]} : \n')
                                 extracted_content.append(' '.join(r[1]))

                                 lines = text.split("\n")
                                 print(f"This is the lines : {lines}")
                                 keyword = ["objective", "introduction", "summary", "scope"]
+                                for indexPdf,line in enumerate(lines):
                                     print(line)
                                     if len(line) < 20:
                                       for key in keyword:
                                           line = line.lower()
+                                          if key in line:
+                                              selectedText = lines[indexPdf:]
+                                              tabLine.append([pdfPage,selectedText,key])
+                                              print(f"Selected line in keywords is: {line}")
                             for r in tabLine:
                                 extracted_content.append(f'PDF Page number {r[0]} extracted text from the KEYWORD {r[2]} : \n')
                                 extracted_content.append(' '.join(r[1]))