Bentham committed on
Commit
6bb1418
·
verified ·
1 Parent(s): c08b912

fonctions tableaux

Browse files
Files changed (1) hide show
  1. main.py +39 -0
main.py CHANGED
@@ -13,6 +13,11 @@ import shutil
13
  import json
14
  import asyncio
15
  import hashlib
 
 
 
 
 
16
 
17
  from openai import AsyncOpenAI
18
  from readability import Document
@@ -721,6 +726,40 @@ def delete_temp_files(file_paths: list):
721
  # MODIFICATIONS START
722
  import hashlib
723
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
724
  def extract_images_from_pdf(input_filename: str) -> List[bytes]:
725
  images = []
726
  hashes = set() # Pour stocker les hashes des images uniques
 
13
  import json
14
  import asyncio
15
  import hashlib
16
+ import camelot
17
+ from pptx import Presentation
18
+ from pptx.enum.shapes import MSO_SHAPE_TYPE
19
+ from docx import Document as DocxDocument
20
+
21
 
22
  from openai import AsyncOpenAI
23
  from readability import Document
 
726
  # MODIFICATIONS START
727
  import hashlib
728
 
729
def extract_tables_from_docx(input_filename: str) -> List[str]:
    """Collect the text of the tables found in a DOCX file.

    Args:
        input_filename: Path of the document handed to ``DocxDocument``.

    Returns:
        One string per table row, with the stripped cell texts joined by
        tab characters. An empty string is appended after the rows of each
        table so that consecutive tables are separated.
    """
    lines = []
    for tbl in DocxDocument(input_filename).tables:
        lines.extend(
            "\t".join(c.text.strip() for c in r.cells) for r in tbl.rows
        )
        # Blank entry acting as a separator between tables.
        lines.append("")
    return lines
738
+
739
def extract_tables_from_pptx(input_filename: str) -> List[str]:
    """Collect the text of the table shapes found in a PPTX file.

    Args:
        input_filename: Path of the presentation handed to ``Presentation``.

    Returns:
        One string per table row, with the stripped cell texts joined by
        tab characters. An empty string is appended after the rows of each
        table so that consecutive tables are separated.
    """
    lines = []
    deck = Presentation(input_filename)
    for slide in deck.slides:
        for shp in slide.shapes:
            # Only shapes reported as tables are of interest here.
            if shp.shape_type != MSO_SHAPE_TYPE.TABLE:
                continue
            lines.extend(
                "\t".join(c.text.strip() for c in r.cells)
                for r in shp.table.rows
            )
            # Blank entry acting as a separator between tables.
            lines.append("")
    return lines
751
+
752
def extract_tables_from_pdf(input_filename: str) -> List[str]:
    """Collect the text of the tables camelot detects in a PDF file.

    The PDF is read with ``camelot.read_pdf`` over all pages using the
    ``stream`` flavor.

    Args:
        input_filename: Path of the PDF handed to ``camelot.read_pdf``.

    Returns:
        One string per data-frame row, with the stripped cell values joined
        by tab characters. An empty string is appended after the rows of
        each table so that consecutive tables are separated.
    """
    lines = []
    detected = camelot.read_pdf(input_filename, pages='all', flavor='stream')
    for tbl in detected:
        frame = tbl.df
        lines.extend(
            "\t".join(value.strip() for value in record)
            for _, record in frame.iterrows()
        )
        # Blank entry acting as a separator between tables.
        lines.append("")
    return lines
762
+
763
  def extract_images_from_pdf(input_filename: str) -> List[bytes]:
764
  images = []
765
  hashes = set() # Pour stocker les hashes des images uniques