txt-pdf / app.py
Pavan178's picture
Create app.py
94cefaf verified
import gradio as gr
import pdfplumber
import re
import tempfile
import os
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from concurrent.futures import ThreadPoolExecutor
import spaces
def preprocess_text_for_tts(text):
    """Clean up raw text with an ordered series of regex substitutions.

    In order, the pipeline:
      1. replaces every character outside printable ASCII with a space
         (this includes newlines and tabs),
      2. strips URLs, e-mail-like tokens and 10-digit phone-like numbers,
      3. turns runs of two or more dots into a space,
      4. title-cases all-caps words unless they are a known abbreviation,
      5. collapses whitespace,
      6. repairs spacing around sentence punctuation, inserting ". "
         between a lowercase letter directly followed by an uppercase one,
      7. collapses whitespace again and trims both ends.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    protected_acronyms = {'AI', 'ML', 'NLP', 'CV', 'API', 'GPU', 'CPU', 'RAM', 'ROM', 'USA', 'UK', 'EU'}

    def convert_case(match):
        # Keep recognised abbreviations verbatim; title-case any other all-caps word.
        token = match.group(0)
        if token in protected_acronyms:
            return token
        return token.title()

    # (pattern, replacement, flags) applied strictly in this order: later
    # steps depend on the spacing produced by earlier ones.
    substitutions = (
        (r'[^\x20-\x7E]', ' ', 0),
        (r'http\S+|www\S+|https\S+', '', re.MULTILINE),
        (r'\S+@\S+', '', 0),
        (r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b', '', 0),
        (r'\.{2,}', ' ', 0),
        (r'\b[A-Z]+\b', convert_case, 0),
        (r'\s+', ' ', 0),
        (r'\.([A-Za-z])', r'. \1', 0),
        (r'([a-z])([A-Z])', r'\1. \2', 0),
        (r'([A-Za-z])\s([.,!?])', r'\1\2', 0),
        (r'([.,!?])([A-Za-z])', r'\1 \2', 0),
        (r'\s+', ' ', 0),
    )
    for pattern, replacement, flags in substitutions:
        text = re.sub(pattern, replacement, text, flags=flags)
    return text.strip()
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Load the tokenizer and the seq2seq model once at import time, then move
# the model onto the selected device.
model_name = "sherif31/T5-Grammer-Correction"  # Replace with your actual model name
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model = model.to(device)
def correct_text(text):
    """Run the grammar-correction model over ``text`` and return the result.

    The text is cut into consecutive 512-character slices. Each slice is
    prefixed with ``"grammar: "``, encoded (truncated to 512 tokens), passed
    through ``model.generate`` with 5-beam search, and decoded without
    special tokens. The decoded slices are joined with single spaces.

    Args:
        text: The string to correct.

    Returns:
        The corrected text, or ``''`` when ``text`` is empty.
    """
    # Split the text into chunks to avoid exceeding max token limit.
    # NOTE: slicing is by character count, so a slice boundary can fall in
    # the middle of a word.
    max_chunk_length = 512
    chunks = [text[i:i + max_chunk_length] for i in range(0, len(text), max_chunk_length)]

    corrected_chunks = []
    for chunk in chunks:
        input_text = f"grammar: {chunk}"
        input_ids = tokenizer.encode(
            input_text, return_tensors="pt", max_length=512, truncation=True
        ).to(device)
        # Inference only: skip gradient tracking.
        with torch.no_grad():
            output = model.generate(input_ids, max_length=512, num_return_sequences=1, num_beams=5)
        corrected_chunk = tokenizer.decode(output[0], skip_special_tokens=True)
        # Collect each decoded chunk; without this the function would always
        # return an empty string.
        corrected_chunks.append(corrected_chunk)
    return ' '.join(corrected_chunks)
def extract_text_from_pages(pdf_bytes):
    """Extract, clean and grammar-correct the text of every page in a PDF.

    The bytes are written to a temporary ``.pdf`` file, which is opened with
    pdfplumber. For each page (numbered from 1) the extracted text is passed
    through ``preprocess_text_for_tts`` and then ``correct_text``. Pages that
    yield no text map to an empty string.

    Args:
        pdf_bytes: Raw content of the PDF file.

    Returns:
        A dict mapping 1-based page number to the processed page text.
    """
    page_text_dict = {}

    # Persist the upload so pdfplumber can open it by path. The file is
    # closed (and therefore flushed) on leaving the ``with`` block, before
    # pdfplumber reads it.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
        temp_pdf.write(pdf_bytes)
        temp_pdf_path = temp_pdf.name

    try:
        with pdfplumber.open(temp_pdf_path) as pdf:
            for page_num, page in enumerate(pdf.pages, 1):
                raw_text = page.extract_text()
                if raw_text:
                    cleaned_text = preprocess_text_for_tts(raw_text)
                    page_text_dict[page_num] = correct_text(cleaned_text)
                else:
                    # Keep an entry for text-less pages so page numbering
                    # stays contiguous in the result.
                    page_text_dict[page_num] = ""
    finally:
        # delete=False means the temp file is our responsibility to remove.
        os.remove(temp_pdf_path)

    return page_text_dict
def process_pdf(pdf_file):
    """Turn an uploaded PDF into a page-by-page text report.

    Args:
        pdf_file: The uploaded PDF content, or ``None`` when nothing was
            uploaded.

    Returns:
        A message asking for an upload when ``pdf_file`` is ``None``;
        otherwise one ``"Page N:"`` section per page, each followed by a
        blank line.
    """
    if pdf_file is None:
        return "No file uploaded. Please upload a PDF file."

    # extract_text_from_pages already cleans and grammar-corrects every page,
    # so the pages are used as returned; running correct_text on them again
    # would repeat the expensive model pass on already-corrected text.
    result = extract_text_from_pages(pdf_file)

    # Combine the results
    return "".join(
        f"Page {page_num}:\n{text}\n\n" for page_num, text in result.items()
    )
# Create the Gradio interface
iface = gr.Interface(
inputs=gr.File(label="Upload PDF", type="binary"),
outputs=gr.Textbox(label="Extracted and Processed Text"),
title="PDF Text Extractor and Processor",
description="Upload a PDF file to extract, clean, and correct its text content."
# Launch the app