Spaces:

spine-crook
/

cold-email-generator

Running

MandarBhalerao

Initial commit

e5db578 2 months ago

865 Bytes

	import re
	import pdfplumber


	# Remove unneeded content and clean up the text
	def clean_text(text):
	    """Return *text* reduced to ASCII letters, digits and single spaces.
	
	    Processing order:
	      1. HTML/XML tags (``<...>``) are dropped.
	      2. ``http://`` / ``https://`` URLs are dropped.
	      3. Every whitespace character is turned into a plain space, so
	         words separated only by a newline or tab stay separate.
	      4. Every character other than ``a-z``, ``A-Z``, ``0-9`` and space
	         is dropped.
	      5. Runs of spaces are collapsed and the ends are trimmed.
	
	    Args:
	        text: The raw string to clean.
	
	    Returns:
	        The cleaned string; ``""`` when nothing survives the filtering.
	    """
	    # Remove HTML tags
	    text = re.sub(r'<[^>]*?>', '', text)
	    # Remove URLs. The class covers '!', the ASCII range '$'..'_' (digits,
	    # upper-case letters and the usual URL punctuation, '%' included) and
	    # lower-case letters, so the match stops at the first whitespace.
	    text = re.sub(r'https?://[!$-_a-z]+', '', text)
	    # Map newlines/tabs to spaces BEFORE stripping special characters;
	    # otherwise the next step deletes them and fuses neighbouring words.
	    text = re.sub(r'\s', ' ', text)
	    # Remove special characters
	    text = re.sub(r'[^a-zA-Z0-9 ]', '', text)
	    # Collapse repeated spaces and trim both ends in one pass
	    return ' '.join(text.split())

	def extract_text_from_pdf(uploaded_file):
	    """Return the text of every page of a PDF, joined with newlines.
	
	    Args:
	        uploaded_file: Whatever ``pdfplumber.open`` accepts (it is passed
	            through unchanged), or ``None`` when no file was provided.
	
	    Returns:
	        The page texts joined by ``"\\n"``; ``""`` when *uploaded_file* is
	        ``None`` or the document has no pages.
	    """
	    if uploaded_file is None:
	        return ""
	    with pdfplumber.open(uploaded_file) as pdf:
	        # extract_text() can yield None for a page with no extractable
	        # text (e.g. image-only pages on some pdfplumber releases);
	        # substitute "" so the join below cannot raise TypeError.
	        page_texts = [page.extract_text() or "" for page in pdf.pages]
	    # Joining an empty list already gives "", so no special case is needed.
	    return "\n".join(page_texts)