Spaces:
Sleeping
Sleeping
fonctions tableaux
Browse files
main.py
CHANGED
@@ -13,6 +13,11 @@ import shutil
|
|
13 |
import json
|
14 |
import asyncio
|
15 |
import hashlib
|
|
|
|
|
|
|
|
|
|
|
16 |
|
17 |
from openai import AsyncOpenAI
|
18 |
from readability import Document
|
@@ -721,6 +726,40 @@ def delete_temp_files(file_paths: list):
|
|
721 |
# MODIFICATIONS START
|
722 |
import hashlib
|
723 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
724 |
def extract_images_from_pdf(input_filename: str) -> List[bytes]:
|
725 |
images = []
|
726 |
hashes = set() # Pour stocker les hashes des images uniques
|
|
|
13 |
import json
|
14 |
import asyncio
|
15 |
import hashlib
|
16 |
+
import camelot
|
17 |
+
from pptx import Presentation
|
18 |
+
from pptx.enum.shapes import MSO_SHAPE_TYPE
|
19 |
+
from docx import Document as DocxDocument
|
20 |
+
|
21 |
|
22 |
from openai import AsyncOpenAI
|
23 |
from readability import Document
|
|
|
726 |
# MODIFICATIONS START
|
727 |
import hashlib
|
728 |
|
729 |
+
def extract_tables_from_docx(input_filename: str) -> List[str]:
    """Return the text of every table in a .docx file, one string per row.

    Each row is rendered as its cells' stripped text joined by tab
    characters. An empty string is appended after the rows of each table
    so that consecutive tables are separated in the output.

    Args:
        input_filename: Path of the document handed to ``DocxDocument``.

    Returns:
        The row strings of all tables in document order, with one empty
        string following each table.
    """
    lines = []
    for tbl in DocxDocument(input_filename).tables:
        lines.extend(
            "\t".join(c.text.strip() for c in r.cells) for r in tbl.rows
        )
        # Blank entry marks the end of this table.
        lines.append("")
    return lines
|
738 |
+
|
739 |
+
def extract_tables_from_pptx(input_filename: str) -> List[str]:
    """Return the text of every table shape in a .pptx file, one string per row.

    Slides are visited in order; on each slide only shapes whose
    ``shape_type`` equals ``MSO_SHAPE_TYPE.TABLE`` are read. Each row is
    rendered as its cells' stripped text joined by tab characters, and an
    empty string is appended after the rows of each table.

    Args:
        input_filename: Path of the presentation handed to ``Presentation``.

    Returns:
        The row strings of all matched tables, with one empty string
        following each table.
    """
    lines = []
    for slide in Presentation(input_filename).slides:
        for shape in slide.shapes:
            # Anything that is not reported as a table shape is ignored.
            if not (shape.shape_type == MSO_SHAPE_TYPE.TABLE):
                continue
            lines.extend(
                "\t".join(c.text.strip() for c in r.cells)
                for r in shape.table.rows
            )
            # Blank entry marks the end of this table.
            lines.append("")
    return lines
|
751 |
+
|
752 |
+
def extract_tables_from_pdf(input_filename: str) -> List[str]:
    """Return the text of every table camelot finds in a PDF, one string per row.

    The file is parsed with ``camelot.read_pdf`` over all pages using the
    ``stream`` flavor. For each detected table, every row of its
    DataFrame is rendered as the stripped cell values joined by tab
    characters, and an empty string is appended after the rows of each
    table.

    Args:
        input_filename: Path of the PDF handed to ``camelot.read_pdf``.

    Returns:
        The row strings of all detected tables, with one empty string
        following each table.
    """
    lines = []
    detected = camelot.read_pdf(input_filename, pages='all', flavor='stream')
    for tbl in detected:
        lines.extend(
            "\t".join(value.strip() for value in record)
            for _, record in tbl.df.iterrows()
        )
        # Blank entry marks the end of this table.
        lines.append("")
    return lines
|
762 |
+
|
763 |
def extract_images_from_pdf(input_filename: str) -> List[bytes]:
|
764 |
images = []
|
765 |
hashes = set() # Pour stocker les hashes des images uniques
|