import re

import pdfplumber

# Patterns used by clean_text, compiled once at import time.
_HTML_TAG_RE = re.compile(r'<[^>]*?>')
_URL_RE = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)
# Anything that is neither an ASCII letter/digit nor whitespace.
_NON_ALNUM_RE = re.compile(r'[^a-zA-Z0-9\s]')


def clean_text(text):
    """Strip markup, URLs and punctuation from *text* and normalise whitespace.

    Steps, in order:

    1. Remove HTML tags (``<...>``).
    2. Remove ``http://`` / ``https://`` URLs.
    3. Remove every character that is not an ASCII letter, an ASCII digit
       or whitespace.
    4. Collapse every run of whitespace (spaces, tabs, newlines) into a
       single space and trim both ends.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string, containing only ASCII letters, digits and
        single spaces.
    """
    text = _HTML_TAG_RE.sub('', text)
    text = _URL_RE.sub('', text)
    # Whitespace is deliberately kept here: deleting newlines/tabs outright
    # would fuse the words on either side ("foo\nbar" -> "foobar").
    text = _NON_ALNUM_RE.sub('', text)
    # split() with no argument splits on any whitespace run and drops
    # leading/trailing whitespace, so this both collapses and trims.
    return ' '.join(text.split())


def extract_text_from_pdf(uploaded_file):
    """Return the text of every page of a PDF, joined by newlines.

    Args:
        uploaded_file: Whatever ``pdfplumber.open`` accepts (passed through
            unchanged), or ``None``.

    Returns:
        The pages' extracted text joined with ``"\\n"``; an empty string
        when *uploaded_file* is ``None`` or the PDF has no pages.
    """
    if uploaded_file is None:
        return ""

    with pdfplumber.open(uploaded_file) as pdf:
        # extract_text() may yield None for a page without extractable text;
        # coerce to "" so that str.join does not raise TypeError.
        pages = [page.extract_text() or "" for page in pdf.pages]

    return "\n".join(pages)