# txt-pdf / app.py
# Pavan178's picture
# Create app.py
# 94cefaf verified
import io
import os
import re
import tempfile
from concurrent.futures import ThreadPoolExecutor

import gradio as gr
import pdfplumber
import spaces
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
@spaces.GPU
def preprocess_text_for_tts(text):
    """Normalize raw PDF text so it reads cleanly when spoken aloud.

    Strips non-printable characters, URLs, e-mail addresses, and US-style
    phone numbers; de-shouts ALL-CAPS words (except well-known tech
    abbreviations); and repairs spacing around sentence punctuation.

    NOTE(review): this function is pure CPU/regex work; the @spaces.GPU
    decorator is kept for compatibility, but it presumably belongs on the
    model-inference path instead — confirm against the Space's ZeroGPU setup.
    """
    # Keep only printable ASCII, then drop URLs, e-mails, and phone numbers.
    cleaned = re.sub(r'[^\x20-\x7E]', ' ', text)
    cleaned = re.sub(r'http\S+|www\S+|https\S+', '', cleaned, flags=re.MULTILINE)
    cleaned = re.sub(r'\S+@\S+', '', cleaned)
    cleaned = re.sub(r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b', '', cleaned)
    cleaned = re.sub(r'\.{2,}', ' ', cleaned)  # dot leaders / ellipses -> space

    # ALL-CAPS words become Title Case unless they are common abbreviations.
    keep_upper = {'AI', 'ML', 'NLP', 'CV', 'API', 'GPU', 'CPU', 'RAM', 'ROM', 'USA', 'UK', 'EU'}

    def _decapitalize(match):
        token = match.group(0)
        return token if token in keep_upper else token.title()

    cleaned = re.sub(r'\b[A-Z]+\b', _decapitalize, cleaned)

    # Spacing/punctuation repairs for natural sentence flow.
    cleaned = re.sub(r'\s+', ' ', cleaned)
    cleaned = re.sub(r'\.([A-Za-z])', r'. \1', cleaned)          # missing space after '.'
    cleaned = re.sub(r'([a-z])([A-Z])', r'\1. \2', cleaned)      # camelCase seam -> sentence break
    cleaned = re.sub(r'([A-Za-z])\s([.,!?])', r'\1\2', cleaned)  # drop space before punctuation
    cleaned = re.sub(r'([.,!?])([A-Za-z])', r'\1 \2', cleaned)   # ensure space after punctuation
    return re.sub(r'\s+', ' ', cleaned).strip()
# --- Module-level model setup (runs once at startup; correct_text() reads
# --- the globals `device`, `tokenizer`, and `model` defined here) ---
# Check if CUDA (GPU) is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
# Load the model and tokenizer
# NOTE(review): the hub id really spells "Grammer" — do not "fix" the typo
# in the string or the download will 404.
model_name = "sherif31/T5-Grammer-Correction" # Replace with your actual model name
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Move the seq2seq model to the selected device once, at import time.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
def correct_text(text):
    """Run the grammar-correction model over `text` and return the result.

    The input is sliced into fixed-size character windows so each piece
    stays within the model's token budget; windows are corrected one at a
    time and re-joined with single spaces. Reads the module-level
    `tokenizer`, `model`, and `device` globals.
    """
    window = 512  # character window; encode() additionally truncates to 512 tokens
    corrected = []
    for start in range(0, len(text), window):
        piece = text[start:start + window]
        prompt = f"grammar: {piece}"
        ids = tokenizer.encode(prompt, return_tensors="pt", max_length=512, truncation=True).to(device)
        with torch.no_grad():  # inference only — no autograd bookkeeping
            generated = model.generate(ids, max_length=512, num_return_sequences=1, num_beams=5)
        corrected.append(tokenizer.decode(generated[0], skip_special_tokens=True))
    return ' '.join(corrected)
def extract_text_from_pages(pdf_bytes):
    """Extract, clean, and grammar-correct the text of every PDF page.

    Args:
        pdf_bytes: raw contents of the uploaded PDF file (bytes).

    Returns:
        dict mapping 1-based page number -> corrected text ("" for pages
        with no extractable text).
    """
    # pdfplumber.open() accepts a file-like object, so feed it the bytes
    # directly instead of round-tripping through a NamedTemporaryFile on
    # disk (faster, and avoids leaving user data on the filesystem).
    page_text_dict = {}
    with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
        for page_num, page in enumerate(pdf.pages, 1):
            raw_text = page.extract_text()
            if raw_text:
                cleaned_text = preprocess_text_for_tts(raw_text)
                page_text_dict[page_num] = correct_text(cleaned_text)
            else:
                page_text_dict[page_num] = ""
    return page_text_dict
def process_pdf(pdf_file):
    """Gradio handler: turn an uploaded PDF into a page-labelled text report.

    Args:
        pdf_file: file contents as bytes (from gr.File(type="binary")),
            or None when nothing was uploaded.

    Returns:
        A single string with "Page N:" headers, or an error message when
        no file was provided.
    """
    if pdf_file is None:
        return "No file uploaded. Please upload a PDF file."
    # extract_text_from_pages() already cleans AND grammar-corrects each
    # page, so do NOT run correct_text() a second time here. The original
    # code re-corrected already-corrected text through a ThreadPoolExecutor,
    # doubling model inference for no benefit.
    page_texts = extract_text_from_pages(pdf_file)
    return "".join(
        f"Page {page_num}:\n{text}\n\n" for page_num, text in page_texts.items()
    )
# Create the Gradio interface
# gr.File(type="binary") hands process_pdf the raw file bytes, matching
# the bytes-or-None contract that handler expects.
iface = gr.Interface(
fn=process_pdf,
inputs=gr.File(label="Upload PDF", type="binary"),
outputs=gr.Textbox(label="Extracted and Processed Text"),
title="PDF Text Extractor and Processor",
description="Upload a PDF file to extract, clean, and correct its text content."
)
# Launch the app
# NOTE(review): runs unconditionally at import — no __main__ guard; this is
# the usual pattern for a Hugging Face Space app.py.
iface.launch()