# Hugging Face Spaces: Pavan178 / txt-pdf (Paused) | File size: 3,880 Bytes | 94cefaf

import io
import os
import re
import tempfile
import textwrap
from concurrent.futures import ThreadPoolExecutor

import gradio as gr
import pdfplumber
import spaces
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM



@spaces.GPU
def preprocess_text_for_tts(text):
    """Clean raw extracted text into a single whitespace-normalised string.

    The substitutions run in a fixed order: characters outside printable
    ASCII become spaces, URL-, e-mail- and phone-shaped tokens are removed,
    runs of dots collapse to a space, all-caps words are title-cased unless
    they are in the preserved acronym set, and spacing around punctuation
    is normalised.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text with no leading or trailing whitespace.
    """
    # NOTE(review): everything below is plain regex work; the GPU decorator
    # above looks unnecessary for this function -- confirm before changing.

    # All-caps tokens in this set are kept verbatim by normalise_caps.
    preserved_acronyms = {'AI', 'ML', 'NLP', 'CV', 'API', 'GPU', 'CPU', 'RAM', 'ROM', 'USA', 'UK', 'EU'}

    def normalise_caps(found):
        token = found.group(0)
        if token in preserved_acronyms:
            return token
        return token.title()

    # Ordered (pattern, replacement, flags) steps. Order matters: later
    # steps operate on the output of the earlier ones.
    substitutions = (
        # Anything outside printable ASCII (newlines and tabs included).
        (r'[^\x20-\x7E]', ' ', 0),
        # URL-shaped tokens.
        (r'http\S+|www\S+|https\S+', '', re.MULTILINE),
        # Tokens containing an "@".
        (r'\S+@\S+', '', 0),
        # Ten digits grouped 3-3-4, optionally separated by "-" or ".".
        (r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b', '', 0),
        # Two or more consecutive dots.
        (r'\.{2,}', ' ', 0),
        # Whole words written entirely in capitals.
        (r'\b[A-Z]+\b', normalise_caps, 0),
        (r'\s+', ' ', 0),
        # Ensure a space after a period that is glued to a letter.
        (r'\.([A-Za-z])', r'. \1', 0),
        # A lowercase letter directly followed by an uppercase one gets
        # ". " inserted between them.
        (r'([a-z])([A-Z])', r'\1. \2', 0),
        # Drop a single whitespace character before punctuation.
        (r'([A-Za-z])\s([.,!?])', r'\1\2', 0),
        # Ensure a space after punctuation that is glued to a letter.
        (r'([.,!?])([A-Za-z])', r'\1 \2', 0),
        (r'\s+', ' ', 0),
    )
    for pattern, replacement, flags in substitutions:
        text = re.sub(pattern, replacement, text, flags=flags)

    return text.strip()

# Pick the compute device: CUDA when PyTorch reports it available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Load the tokenizer and the seq2seq model, then move the model to the device.
model_name = "sherif31/T5-Grammer-Correction"  # Replace with your actual model name
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model = model.to(device)


def correct_text(text):
    """Run the grammar-correction model over ``text`` and return the result.

    The text is split into chunks of at most 512 characters. Each chunk is
    prefixed with ``"grammar: "``, encoded (truncated to 512 tokens),
    passed through beam-search generation with 5 beams, and decoded. The
    decoded chunks are joined with single spaces.

    Args:
        text: The text to correct.

    Returns:
        The corrected text, or an empty string when ``text`` contains no
        non-whitespace characters.
    """
    max_chunk_length = 512

    # Break on whitespace rather than at fixed character offsets, so that
    # joining the corrected chunks with ' ' never puts a space in the
    # middle of a word. A single token longer than the limit is still
    # hard-split; hyphenated words are kept intact.
    chunks = textwrap.wrap(
        text,
        width=max_chunk_length,
        break_long_words=True,
        break_on_hyphens=False,
    )
    corrected_chunks = []

    for chunk in chunks:
        input_text = f"grammar: {chunk}"
        input_ids = tokenizer.encode(
            input_text, return_tensors="pt", max_length=512, truncation=True
        ).to(device)

        # Inference only: no gradients are needed.
        with torch.no_grad():
            output = model.generate(
                input_ids, max_length=512, num_return_sequences=1, num_beams=5
            )

        corrected_chunks.append(
            tokenizer.decode(output[0], skip_special_tokens=True)
        )

    return ' '.join(corrected_chunks)

def extract_text_from_pages(pdf_bytes):
    """Extract, clean and grammar-correct the text of every page of a PDF.

    Args:
        pdf_bytes: The raw bytes of the PDF document.

    Returns:
        A dict mapping each 1-based page number to its processed text.
        Pages from which no text could be extracted map to ``""``.
    """
    page_text_dict = {}

    # Parse the PDF straight from memory. This avoids writing a temporary
    # file to disk, which could be left behind if the write failed.
    with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
        for page_num, page in enumerate(pdf.pages, 1):
            raw_text = page.extract_text()
            if not raw_text:
                page_text_dict[page_num] = ""
                continue

            cleaned_text = preprocess_text_for_tts(raw_text)
            page_text_dict[page_num] = correct_text(cleaned_text)

    return page_text_dict

def process_pdf(pdf_file):
    """Gradio callback: turn an uploaded PDF into page-by-page processed text.

    Args:
        pdf_file: The raw bytes of the uploaded PDF, or ``None`` when
            nothing was uploaded.

    Returns:
        One ``"Page N:"`` section per page, each followed by that page's
        text and a blank line, or a prompt message when no file was given.
    """
    if pdf_file is None:
        return "No file uploaded. Please upload a PDF file."

    # extract_text_from_pages already cleans and grammar-corrects every
    # page, so its output is used as-is. Running the model over it a second
    # time would only repeat the inference work.
    page_texts = extract_text_from_pages(pdf_file)

    return "".join(
        f"Page {page_num}:\n{text}\n\n"
        for page_num, text in page_texts.items()
    )

# Build the Gradio interface: a binary PDF upload in, a text box out.
_pdf_upload = gr.File(label="Upload PDF", type="binary")
_result_box = gr.Textbox(label="Extracted and Processed Text")

iface = gr.Interface(
    fn=process_pdf,
    inputs=_pdf_upload,
    outputs=_result_box,
    title="PDF Text Extractor and Processor",
    description="Upload a PDF file to extract, clean, and correct its text content.",
)

# Start serving the app.
iface.launch()