# File size: 3,880 Bytes
# Revision: 94cefaf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import gradio as gr
import pdfplumber
import re
import tempfile
import os
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from concurrent.futures import ThreadPoolExecutor
import spaces



@spaces.GPU
def preprocess_text_for_tts(text):
    """Clean raw extracted text with a fixed sequence of regex rewrites.

    The rewrites run in this order:

    1. Every character outside the printable ASCII range 0x20-0x7E
       (newlines and tabs included) becomes a space.
    2. URL-like tokens starting with ``http``/``https``/``www`` are removed.
    3. Tokens containing ``@`` between non-space characters are removed.
    4. Ten-digit numbers grouped 3-3-4, optionally separated by ``-`` or
       ``.``, are removed.
    5. Runs of two or more dots become a single space.
    6. All-uppercase words are title-cased unless they are in a small
       allow-list of abbreviations.
    7. Whitespace is collapsed, spacing around ``. , ! ?`` is normalised,
       and ``". "`` is inserted wherever a lowercase letter is directly
       followed by an uppercase one.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string with surrounding whitespace stripped.
    """
    # NOTE(review): the body is pure regex work; confirm the GPU decorator
    # is intentionally placed on this function.
    keep_upper = {'AI', 'ML', 'NLP', 'CV', 'API', 'GPU', 'CPU', 'RAM', 'ROM', 'USA', 'UK', 'EU'}

    def _recase(found):
        # Allow-listed abbreviations stay as they are; other all-caps words
        # are converted to Title case.
        token = found.group(0)
        if token in keep_upper:
            return token
        return token.title()

    # (pattern, replacement, flags) triples, applied strictly in order.
    rewrites = (
        (r'[^\x20-\x7E]', ' ', 0),
        (r'http\S+|www\S+|https\S+', '', re.MULTILINE),
        (r'\S+@\S+', '', 0),
        (r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b', '', 0),
        (r'\.{2,}', ' ', 0),
        (r'\b[A-Z]+\b', _recase, 0),
        (r'\s+', ' ', 0),
        (r'\.([A-Za-z])', r'. \1', 0),
        (r'([a-z])([A-Z])', r'\1. \2', 0),
        (r'([A-Za-z])\s([.,!?])', r'\1\2', 0),
        (r'([.,!?])([A-Za-z])', r'\1 \2', 0),
        (r'\s+', ' ', 0),
    )

    cleaned = text
    for pattern, replacement, flags in rewrites:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)

    return cleaned.strip()

# Select the compute device: CUDA when torch reports it, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Load the tokenizer and the seq2seq model once at import time and move the
# model onto the selected device.
model_name = "sherif31/T5-Grammer-Correction"  # Replace with your actual model name
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)


def correct_text(text):
    """Run the grammar-correction model over ``text`` and return the result.

    The input is split into pieces of at most 512 characters. Each piece is
    sent to the model with a ``"grammar: "`` prefix, and the decoded outputs
    are joined with single spaces.

    Pieces are cut at whitespace, so a word is never split across two model
    calls and the final join does not insert a space inside a word. Only a
    single unbroken token longer than the limit is still cut. Tabs and
    newlines in the input are treated as spaces, and whitespace at the
    piece boundaries is dropped.

    Args:
        text: The text to correct.

    Returns:
        The corrected text, or an empty string when ``text`` contains no
        non-whitespace characters.
    """
    # NOTE(review): this is where the model runs on ``device``; confirm
    # whether this function (rather than the text-cleaning one) should carry
    # the ``spaces.GPU`` decorator.
    import textwrap  # local import: only this function needs it

    max_chunk_length = 512
    chunks = textwrap.wrap(
        text,
        width=max_chunk_length,
        break_long_words=True,   # guarantees every chunk fits the limit
        break_on_hyphens=False,  # keep hyphenated words in one piece
    )
    corrected_chunks = []

    for chunk in chunks:
        input_text = f"grammar: {chunk}"
        # truncation=True caps the encoded prompt at 512 tokens.
        input_ids = tokenizer.encode(
            input_text, return_tensors="pt", max_length=512, truncation=True
        ).to(device)

        # Inference only: no gradients are needed.
        with torch.no_grad():
            output = model.generate(
                input_ids, max_length=512, num_return_sequences=1, num_beams=5
            )

        corrected_chunks.append(tokenizer.decode(output[0], skip_special_tokens=True))

    return ' '.join(corrected_chunks)

def extract_text_from_pages(pdf_bytes):
    """Extract, clean and grammar-correct the text of every page in a PDF.

    The bytes are written to a temporary ``.pdf`` file, which is opened with
    pdfplumber and removed again once processing finishes or fails.

    Args:
        pdf_bytes: The raw content of the PDF file.

    Returns:
        A dict mapping 1-based page numbers to the processed page text.
        Pages for which no text could be extracted map to ``""``.
    """
    # Spill the upload to disk so pdfplumber can open it by path.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as handle:
        handle.write(pdf_bytes)
        pdf_path = handle.name

    pages = {}
    try:
        with pdfplumber.open(pdf_path) as document:
            for number, page in enumerate(document.pages, start=1):
                extracted = page.extract_text()
                if not extracted:
                    pages[number] = ""
                    continue
                # Clean first, then hand the cleaned text to the model.
                pages[number] = correct_text(preprocess_text_for_tts(extracted))
    finally:
        # delete=False above means the file must be removed explicitly.
        os.remove(pdf_path)

    return pages

def process_pdf(pdf_file):
    """Turn an uploaded PDF into a single page-by-page text report.

    Args:
        pdf_file: The uploaded PDF content, or ``None`` when nothing was
            uploaded.

    Returns:
        A string with one ``"Page N:"`` section per page, each followed by
        a blank line, or a short message asking for an upload when
        ``pdf_file`` is ``None``.
    """
    if pdf_file is None:
        return "No file uploaded. Please upload a PDF file."

    # extract_text_from_pages already cleans and grammar-corrects every
    # page, so its output is used as-is. A second correct_text pass would
    # double the model inference cost and re-correct model output.
    pages = extract_text_from_pages(pdf_file)

    return "".join(
        f"Page {page_num}:\n{text}\n\n" for page_num, text in pages.items()
    )

# Build the web UI: one binary file upload in, one text box out, with
# process_pdf as the handler.
iface = gr.Interface(
    title="PDF Text Extractor and Processor",
    description="Upload a PDF file to extract, clean, and correct its text content.",
    fn=process_pdf,
    inputs=gr.File(type="binary", label="Upload PDF"),
    outputs=gr.Textbox(label="Extracted and Processed Text"),
)

# Start serving the interface.
iface.launch()