# txt-pdf / app.py
# Pavan178's picture
# Create app.py
# 94cefaf verified
import io
import os
import re
import tempfile
from concurrent.futures import ThreadPoolExecutor

import gradio as gr
import pdfplumber
import spaces
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
@spaces.GPU
def preprocess_text_for_tts(text):
    """Normalize raw PDF text so it reads cleanly when spoken aloud.

    Strips non-printable characters, URLs, e-mail addresses, and US-style
    phone numbers; de-shouts ALL-CAPS words (except well-known tech
    abbreviations); and repairs spacing around sentence punctuation.

    NOTE(review): this function is pure CPU/regex work; the @spaces.GPU
    decorator is kept for compatibility, but it presumably belongs on the
    model-inference path instead — confirm against the Space's ZeroGPU setup.
    """
    # Keep only printable ASCII, then drop URLs, e-mails, and phone numbers.
    cleaned = re.sub(r'[^\x20-\x7E]', ' ', text)
    cleaned = re.sub(r'http\S+|www\S+|https\S+', '', cleaned, flags=re.MULTILINE)
    cleaned = re.sub(r'\S+@\S+', '', cleaned)
    cleaned = re.sub(r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b', '', cleaned)
    cleaned = re.sub(r'\.{2,}', ' ', cleaned)  # dot leaders / ellipses -> space

    # ALL-CAPS words become Title Case unless they are common abbreviations.
    keep_upper = {'AI', 'ML', 'NLP', 'CV', 'API', 'GPU', 'CPU', 'RAM', 'ROM', 'USA', 'UK', 'EU'}

    def _decapitalize(match):
        token = match.group(0)
        return token if token in keep_upper else token.title()

    cleaned = re.sub(r'\b[A-Z]+\b', _decapitalize, cleaned)

    # Spacing/punctuation repairs for natural sentence flow.
    cleaned = re.sub(r'\s+', ' ', cleaned)
    cleaned = re.sub(r'\.([A-Za-z])', r'. \1', cleaned)          # missing space after '.'
    cleaned = re.sub(r'([a-z])([A-Z])', r'\1. \2', cleaned)      # camelCase seam -> sentence break
    cleaned = re.sub(r'([A-Za-z])\s([.,!?])', r'\1\2', cleaned)  # drop space before punctuation
    cleaned = re.sub(r'([.,!?])([A-Za-z])', r'\1 \2', cleaned)   # ensure space after punctuation
    return re.sub(r'\s+', ' ', cleaned).strip()
# --- Module-level model setup (runs once at startup; correct_text() reads
# --- the globals `device`, `tokenizer`, and `model` defined here) ---
# Check if CUDA (GPU) is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
# Load the model and tokenizer
# NOTE(review): the hub id really spells "Grammer" — do not "fix" the typo
# in the string or the download will 404.
model_name = "sherif31/T5-Grammer-Correction" # Replace with your actual model name
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Move the seq2seq model to the selected device once, at import time.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
def correct_text(text):
    """Run the grammar-correction model over `text` and return the result.

    The input is sliced into fixed-size character windows so each piece
    stays within the model's token budget; windows are corrected one at a
    time and re-joined with single spaces. Reads the module-level
    `tokenizer`, `model`, and `device` globals.
    """
    window = 512  # character window; encode() additionally truncates to 512 tokens
    corrected = []
    for start in range(0, len(text), window):
        piece = text[start:start + window]
        prompt = f"grammar: {piece}"
        ids = tokenizer.encode(prompt, return_tensors="pt", max_length=512, truncation=True).to(device)
        with torch.no_grad():  # inference only — no autograd bookkeeping
            generated = model.generate(ids, max_length=512, num_return_sequences=1, num_beams=5)
        corrected.append(tokenizer.decode(generated[0], skip_special_tokens=True))
    return ' '.join(corrected)
def extract_text_from_pages(pdf_bytes):
    """Extract, clean, and grammar-correct the text of every PDF page.

    Args:
        pdf_bytes: raw contents of the uploaded PDF file (bytes).

    Returns:
        dict mapping 1-based page number -> corrected text ("" for pages
        with no extractable text).
    """
    # pdfplumber.open() accepts a file-like object, so feed it the bytes
    # directly instead of round-tripping through a NamedTemporaryFile on
    # disk (faster, and avoids leaving user data on the filesystem).
    page_text_dict = {}
    with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
        for page_num, page in enumerate(pdf.pages, 1):
            raw_text = page.extract_text()
            if raw_text:
                cleaned_text = preprocess_text_for_tts(raw_text)
                page_text_dict[page_num] = correct_text(cleaned_text)
            else:
                page_text_dict[page_num] = ""
    return page_text_dict
def process_pdf(pdf_file):
    """Gradio handler: turn an uploaded PDF into a page-labelled text report.

    Args:
        pdf_file: file contents as bytes (from gr.File(type="binary")),
            or None when nothing was uploaded.

    Returns:
        A single string with "Page N:" headers, or an error message when
        no file was provided.
    """
    if pdf_file is None:
        return "No file uploaded. Please upload a PDF file."
    # extract_text_from_pages() already cleans AND grammar-corrects each
    # page, so do NOT run correct_text() a second time here. The original
    # code re-corrected already-corrected text through a ThreadPoolExecutor,
    # doubling model inference for no benefit.
    page_texts = extract_text_from_pages(pdf_file)
    return "".join(
        f"Page {page_num}:\n{text}\n\n" for page_num, text in page_texts.items()
    )
# Create the Gradio interface
# gr.File(type="binary") hands process_pdf the raw file bytes, matching
# the bytes-or-None contract that handler expects.
iface = gr.Interface(
fn=process_pdf,
inputs=gr.File(label="Upload PDF", type="binary"),
outputs=gr.Textbox(label="Extracted and Processed Text"),
title="PDF Text Extractor and Processor",
description="Upload a PDF file to extract, clean, and correct its text content."
)
# Launch the app
# NOTE(review): runs unconditionally at import — no __main__ guard; this is
# the usual pattern for a Hugging Face Space app.py.
iface.launch()