Spaces:
Sleeping
Sleeping
import os | |
from typing import Optional | |
import pandas as pd | |
from pdf2image import convert_from_path | |
from PIL import Image | |
from .config import settings | |
from striprtf.striprtf import rtf_to_text | |
from docx import Document | |
class FileProcessor:
    """Load a supported file from disk and return its content.

    Text-like formats (doc/docx/xls/xlsx/csv/txt) are returned as text;
    PDFs and images are returned as PIL image objects. Use
    :meth:`process_file` to dispatch on the file extension.
    """

    # An RTF ``.doc`` whose extracted text has more lines than this is
    # returned as a list of chunks instead of a single string.
    DOC_SPLIT_THRESHOLD = 200
    # Number of chunks a long ``.doc`` is split into.
    DOC_SPLIT_PARTS = 3

    @staticmethod
    def _split_text(text: str, parts: int = 3) -> list:
        """Split *text* on line boundaries into exactly *parts* chunks.

        Every chunk gets ``line_count // parts`` lines; the last chunk also
        absorbs the remainder, so joining the chunks with ``'\\n'``
        reproduces *text*. If *parts* is less than 2, or there are fewer
        lines than *parts*, the text is returned unsplit as ``[text]``.
        """
        if parts <= 1:
            return [text]
        lines = text.split('\n')
        chunk_len = len(lines) // parts
        if chunk_len == 0:
            return [text]
        # Explicit boundaries: stepping a range by chunk_len would emit an
        # extra tiny trailing chunk whenever the count is not divisible.
        bounds = [i * chunk_len for i in range(parts)] + [len(lines)]
        return [
            '\n'.join(lines[start:end])
            for start, end in zip(bounds, bounds[1:])
        ]

    def process_pdf(self, file_path: str) -> list:
        """Render the PDF at *file_path* and return the list of page images."""
        return convert_from_path(file_path)

    def process_image(self, file_path: str) -> "Image.Image":
        """Open the image at *file_path* and return it converted to RGB."""
        return Image.open(file_path).convert('RGB')

    def process_doc(self, file_path: str) -> "str | list":
        """Extract text from a ``.doc`` file (parsed as RTF).

        Returns the text as one string, or, when it has more than
        ``DOC_SPLIT_THRESHOLD`` lines, as a list of ``DOC_SPLIT_PARTS``
        strings. A path that does not end in ``.doc`` is handled as a
        ``.docx`` document.
        """
        if not file_path.lower().endswith('.doc'):
            return self.process_docx(file_path)

        # NOTE(review): the file is read as text and parsed as RTF. A binary
        # Word 97-2003 .doc is not RTF; with errors='ignore' it would decode
        # without raising -- confirm that inputs are RTF saved as .doc.
        with open(file_path, "r", encoding='utf-8', errors='ignore') as file:
            data = rtf_to_text(file.read())

        if len(data.split('\n')) > self.DOC_SPLIT_THRESHOLD:
            return self._split_text(data, self.DOC_SPLIT_PARTS)
        return data

    def process_docx(self, file_path: str) -> str:
        """Return the paragraph texts of a ``.docx`` file joined by newlines."""
        doc = Document(file_path)
        return '\n'.join(paragraph.text for paragraph in doc.paragraphs)

    def process_xlsx(self, file_path: str) -> str:
        """Read a spreadsheet with ``pd.read_excel`` and return it as text."""
        df = pd.read_excel(file_path)
        return df.to_string()

    def process_csv(self, file_path: str) -> str:
        """Read a CSV file with ``pd.read_csv`` and return it as text."""
        df = pd.read_csv(file_path)
        return df.to_string()

    def process_txt(self, file_path: str) -> str:
        """Return the full content of a text file, decoded as UTF-8."""
        # Explicit encoding: the default depends on the platform locale.
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()

    def process_file(self, file_path: str) -> "str | list | Image.Image":
        """Main method to process any supported file type.

        Dispatches on the (case-insensitive) extension of *file_path* and
        returns whatever the matching ``process_*`` method returns.

        Raises:
            ValueError: if the extension is not one of the supported types.
        """
        _, file_extension = os.path.splitext(file_path)
        processors = {
            '.pdf': self.process_pdf,
            '.jpeg': self.process_image,
            '.jpg': self.process_image,
            '.png': self.process_image,
            '.doc': self.process_doc,
            '.docx': self.process_docx,
            '.xls': self.process_xlsx,
            '.xlsx': self.process_xlsx,
            '.csv': self.process_csv,
            '.txt': self.process_txt,
        }
        processor = processors.get(file_extension.lower())
        if not processor:
            raise ValueError(f"Unsupported file format: {file_extension}")
        return processor(file_path)