import os
from typing import List, Optional, Union

import pandas as pd
from pdf2image import convert_from_path
from PIL import Image
from striprtf.striprtf import rtf_to_text
from docx import Document

from .config import settings

# Everything a processor method can hand back: plain text, text split into
# chunks, a single image, or a list of images (one list entry per item
# returned by pdf2image for a PDF).
ProcessedContent = Union[str, List[str], Image.Image, List[Image.Image]]


class FileProcessor:
    """Load a supported file from disk and return its content.

    Text-like formats are returned as strings (or a list of string chunks
    for long ``.doc`` files); PDFs and image files are returned as PIL
    images. ``process_file`` dispatches on the file extension.
    """

    # A ``.doc`` whose extracted text has more lines than this is returned
    # as a list of chunks instead of one string.
    DOC_SPLIT_LINE_THRESHOLD = 200
    # Number of chunks a long ``.doc`` is divided into.
    DOC_SPLIT_PARTS = 3

    def __init__(self):
        """Create a processor; no state is kept on the instance."""
        pass

    @staticmethod
    def _split_text(text: str, parts: int = 3) -> List[str]:
        """Split *text* on newlines into at most *parts* consecutive chunks.

        Every chunk except the last holds ``line_count // parts`` lines; the
        last chunk additionally receives whatever lines are left over, so an
        uneven line count never produces an extra tiny chunk.

        Args:
            text: The text to split.
            parts: Maximum number of chunks to produce.

        Returns:
            The chunks, each re-joined with ``'\\n'``. Fewer than *parts*
            chunks are returned only when the text has fewer lines than
            *parts*.
        """
        lines = text.split('\n')
        # Never let the chunk length reach zero, even for very short input.
        chunk_len = max(1, len(lines) // parts)
        chunks = []
        for index in range(parts):
            start = index * chunk_len
            # The final chunk is open-ended so it absorbs the remainder.
            end = None if index == parts - 1 else start + chunk_len
            chunk = lines[start:end]
            if chunk:
                chunks.append('\n'.join(chunk))
        return chunks

    def process_pdf(self, file_path: str) -> List[Image.Image]:
        """Return the PDF at *file_path* as a list of images.

        The conversion is done by ``pdf2image.convert_from_path`` with its
        default arguments.
        """
        return convert_from_path(file_path)

    def process_image(self, file_path: str) -> Image.Image:
        """Open the image at *file_path* and return it converted to RGB."""
        # ``convert`` returns a new image, so the source file can be closed
        # as soon as the conversion is done.
        with Image.open(file_path) as image:
            return image.convert('RGB')

    def process_doc(self, file_path: str) -> Union[str, List[str]]:
        """Extract the text of a ``.doc`` file.

        A path ending in ``.doc`` is read as text (UTF-8, undecodable bytes
        ignored) and passed through ``rtf_to_text``. Any other path is
        handled exactly like ``process_docx``.

        Returns:
            The extracted text as one string, or a list of
            ``DOC_SPLIT_PARTS`` chunks when the text has more than
            ``DOC_SPLIT_LINE_THRESHOLD`` lines.
        """
        if not file_path.lower().endswith('.doc'):
            return self.process_docx(file_path)

        with open(file_path, "r", encoding='utf-8', errors='ignore') as file:
            data = rtf_to_text(file.read())

        if len(data.split('\n')) > self.DOC_SPLIT_LINE_THRESHOLD:
            return self._split_text(data, self.DOC_SPLIT_PARTS)
        return data

    def process_docx(self, file_path: str) -> str:
        """Return the paragraph texts of a ``.docx`` file joined by newlines."""
        doc = Document(file_path)
        return '\n'.join(paragraph.text for paragraph in doc.paragraphs)

    def process_xlsx(self, file_path: str) -> str:
        """Return the spreadsheet at *file_path* rendered as a string.

        The file is loaded with ``pd.read_excel`` using its default
        arguments and rendered with ``DataFrame.to_string``.
        """
        df = pd.read_excel(file_path)
        return df.to_string()

    def process_csv(self, file_path: str) -> str:
        """Return the CSV at *file_path* rendered via ``DataFrame.to_string``."""
        df = pd.read_csv(file_path)
        return df.to_string()

    def process_txt(self, file_path: str) -> str:
        """Return the full content of the text file at *file_path*."""
        # NOTE: no encoding is given, so decoding uses the platform default.
        with open(file_path, 'r') as file:
            return file.read()

    def process_file(self, file_path: str) -> ProcessedContent:
        """Main method to process any supported file type.

        The handler is chosen by the (case-insensitive) file extension.

        Args:
            file_path: Path of the file to process.

        Returns:
            Whatever the matching ``process_*`` method returns.

        Raises:
            ValueError: If the extension has no registered handler.
        """
        _, file_extension = os.path.splitext(file_path)

        processors = {
            '.pdf': self.process_pdf,
            '.jpeg': self.process_image,
            '.jpg': self.process_image,
            '.png': self.process_image,
            '.doc': self.process_doc,
            '.docx': self.process_docx,
            '.xls': self.process_xlsx,
            '.xlsx': self.process_xlsx,
            '.csv': self.process_csv,
            '.txt': self.process_txt,
        }

        processor = processors.get(file_extension.lower())
        if not processor:
            raise ValueError(f"Unsupported file format: {file_extension}")

        return processor(file_path)