convert_test

Sleeping

App Files Files Community

Bentham committed on Dec 19, 2024

Commit

6bb1418

verified ·

1 Parent(s): c08b912

fonctions tableaux

Browse files

Files changed (1) hide show

main.py +39 -0

main.py CHANGED Viewed

@@ -13,6 +13,11 @@ import shutil
 import json
 import asyncio
 import hashlib
 from openai import AsyncOpenAI
 from readability import Document
@@ -721,6 +726,40 @@ def delete_temp_files(file_paths: list):
 # MODIFICATIONS START
 import hashlib
 def extract_images_from_pdf(input_filename: str) -> List[bytes]:
     images = []
     hashes = set()  # Pour stocker les hashes des images uniques

 import json
 import asyncio
 import hashlib
+import camelot
+from pptx import Presentation
+from pptx.enum.shapes import MSO_SHAPE_TYPE
+from docx import Document as DocxDocument
 from openai import AsyncOpenAI
 from readability import Document
 # MODIFICATIONS START
 import hashlib
+def extract_tables_from_docx(input_filename: str) -> List[str]:
+    """Flatten the tables of a .docx file into tab-separated text lines.
+
+    The document is opened with ``DocxDocument(input_filename)``. Each row
+    of each table in ``doc.tables`` becomes one string: the stripped text
+    of its cells joined with tab characters.
+
+    Args:
+        input_filename: Path of the .docx file to read.
+
+    Returns:
+        One string per table row, with an empty string appended after the
+        rows of every table (including the last one) as a separator.
+    """
+    lines: List[str] = []
+    for tbl in DocxDocument(input_filename).tables:
+        lines.extend(
+            "\t".join(c.text.strip() for c in r.cells) for r in tbl.rows
+        )
+        lines.append("")  # Blank entry marks the end of this table
+    return lines
+def extract_tables_from_pptx(input_filename: str) -> List[str]:
+    """Flatten the tables of a .pptx file into tab-separated text lines.
+
+    The presentation is opened with ``Presentation(input_filename)`` and
+    the shapes of every slide are scanned. Only shapes whose
+    ``shape_type`` equals ``MSO_SHAPE_TYPE.TABLE`` are read; each of their
+    rows becomes one string made of the stripped cell texts joined with
+    tab characters.
+
+    Args:
+        input_filename: Path of the .pptx file to read.
+
+    Returns:
+        One string per table row, with an empty string appended after the
+        rows of every table (including the last one) as a separator.
+    """
+    lines: List[str] = []
+    for slide in Presentation(input_filename).slides:
+        for shp in slide.shapes:
+            # Skip anything that is not a table shape.
+            if shp.shape_type != MSO_SHAPE_TYPE.TABLE:
+                continue
+            for r in shp.table.rows:
+                lines.append("\t".join(c.text.strip() for c in r.cells))
+            lines.append("")  # Blank entry marks the end of this table
+    return lines
+def extract_tables_from_pdf(input_filename: str) -> List[str]:
+    """Flatten the tables camelot finds in a PDF into tab-separated lines.
+
+    Tables are read with ``camelot.read_pdf`` over all pages using the
+    ``stream`` flavor. Every row of each table's DataFrame (``table.df``)
+    becomes one string: its cell values stripped and joined with tab
+    characters.
+
+    Args:
+        input_filename: Path of the PDF file to read.
+
+    Returns:
+        One string per table row, with an empty string appended after the
+        rows of every table (including the last one) as a separator.
+    """
+    lines: List[str] = []
+    detected = camelot.read_pdf(input_filename, pages='all', flavor='stream')
+    for tbl in detected:
+        for _, record in tbl.df.iterrows():
+            lines.append("\t".join(value.strip() for value in record))
+        lines.append("")  # Blank entry marks the end of this table
+    return lines
 def extract_images_from_pdf(input_filename: str) -> List[bytes]:
     images = []
     hashes = set()  # Pour stocker les hashes des images uniques