|
import re |
|
import pdfplumber |
|
|
|
|
|
|
|
def clean_text(text):
    """Reduce raw text to space-separated ASCII letters and digits.

    The steps run in this order:

    1. Strip HTML/XML-style tags (anything of the form ``<...>``).
    2. Strip ``http://`` and ``https://`` URLs.
    3. Drop every character that is not an ASCII letter, a digit or
       whitespace (punctuation, symbols, non-ASCII letters).
    4. Collapse each run of whitespace to one space and trim both ends.

    Args:
        text: The string to clean. ``None`` or an empty string yields ``""``.

    Returns:
        The cleaned string, or ``""`` when nothing survives the filtering.
    """
    if not text:
        return ""

    # Tags go first: '<' and '>' fall inside the URL pattern's '$-_' range,
    # so a URL directly followed by a tag would otherwise swallow it.
    text = re.sub(r'<[^>]*?>', '', text)

    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)

    # Keep ALL whitespace here (not only the space character): newlines and
    # tabs are word separators, and deleting them would glue neighbouring
    # words together ("one\ntwo" -> "onetwo").
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # split()/join collapses every whitespace run to a single space and
    # removes leading/trailing whitespace in one pass.
    return ' '.join(text.split())
|
|
|
def extract_text_from_pdf(uploaded_file):
    """Return the text of every page of a PDF, pages separated by newlines.

    Args:
        uploaded_file: The PDF source handed straight to ``pdfplumber.open``
            (presumably a path or a binary file-like object such as an
            upload handle -- confirm against the caller), or ``None``.

    Returns:
        The concatenated page texts joined with ``"\\n"``. Returns ``""``
        when ``uploaded_file`` is ``None`` or the document has no pages.
        A page without extractable text contributes an empty string.
    """
    if uploaded_file is None:
        return ""

    with pdfplumber.open(uploaded_file) as pdf:
        # extract_text() may give None for a page with no text layer
        # (e.g. a scanned image); coerce it so str.join does not raise.
        pages = [page.extract_text() or "" for page in pdf.pages]

    # Joining an empty list already yields "", so no extra guard is needed.
    return "\n".join(pages)