import gradio as gr
import pdfplumber
import re
import tempfile
import os
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from concurrent.futures import ThreadPoolExecutor
import spaces


def preprocess_text_for_tts(text):
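    """Normalize raw PDF text for speech synthesis.

    Strips non-printable characters, URLs, emails, and phone numbers,
    title-cases all-caps words that are not known acronyms, and repairs
    spacing around sentence punctuation.
    """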
    # Drop non-printable characters, then URLs, email addresses, and
    # US-style phone numbers, none of which read well when spoken.
    text = re.sub(r'[^\x20-\x7E]', ' ', text)
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b', '', text)
    text = re.sub(r'\.{2,}', ' ', text)  # collapse ellipses and dot leaders

    def convert_case(match):
        # Keep well-known acronyms in caps; title-case other all-caps words
        # so the TTS engine does not spell them out letter by letter.
        word = match.group(0)
        common_abbreviations = {'AI', 'ML', 'NLP', 'CV', 'API', 'GPU', 'CPU',
                                'RAM', 'ROM', 'USA', 'UK', 'EU'}
        return word if word in common_abbreviations else word.title()

    text = re.sub(r'\b[A-Z]+\b', convert_case, text)
    text = re.sub(r'\s+', ' ', text)
    # Heuristics for sentence boundaries lost in PDF extraction: ensure a
    # space after a period, and treat a lowercase-to-uppercase transition
    # inside a word as a missing sentence break.
    text = re.sub(r'\.([A-Za-z])', r'. \1', text)
    text = re.sub(r'([a-z])([A-Z])', r'\1. \2', text)
    # Normalize spacing around punctuation.
    text = re.sub(r'([A-Za-z])\s([.,!?])', r'\1\2', text)
    text = re.sub(r'([.,!?])([A-Za-z])', r'\1 \2', text)
    text = re.sub(r'\s+', ' ', text).strip()

    return text
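
# Worked example (hypothetical input):
#   preprocess_text_for_tts("See http://x.co NASA AI.next")
# returns "See Nasa AI. next": the URL is stripped, "NASA" (not in the
# acronym set) is title-cased, and a space is inserted after the period.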


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

model_name = "sherif31/T5-Grammer-Correction"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)


@spaces.GPU  # model inference is the only GPU-bound step in this app
def correct_text(text):
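    """Run the T5 grammar-correction model over the input text.

    The text is split into 512-character chunks (comfortably under the
    model's 512-token input limit); each chunk is prefixed with "grammar: "
    and decoded with beam search.
    """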
    max_chunk_length = 512
    chunks = [text[i:i+max_chunk_length] for i in range(0, len(text), max_chunk_length)]
    corrected_chunks = []

    for chunk in chunks:
        # The fine-tuned model expects a "grammar: " task prefix.
        input_text = f"grammar: {chunk}"
        input_ids = tokenizer.encode(input_text, return_tensors="pt",
                                     max_length=512, truncation=True).to(device)

        with torch.no_grad():
            output = model.generate(input_ids, max_length=512, num_beams=5)

        corrected_chunks.append(tokenizer.decode(output[0], skip_special_tokens=True))

    return ' '.join(corrected_chunks)


def extract_text_from_pages(pdf_bytes):
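    """Extract and clean text from each page, returning {page_num: text}.

    Grammar correction is deliberately left to process_pdf, so each page
    runs through the model exactly once.
    """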
    page_text_dict = {}

    # pdfplumber wants a path, so spool the uploaded bytes to a temp file.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
        temp_pdf.write(pdf_bytes)
        temp_pdf_path = temp_pdf.name

    try:
        with pdfplumber.open(temp_pdf_path) as pdf:
            for page_num, page in enumerate(pdf.pages, 1):
                raw_text = page.extract_text()
                if raw_text:
                    # Clean only; correction happens once, in process_pdf.
                    page_text_dict[page_num] = preprocess_text_for_tts(raw_text)
                else:
                    page_text_dict[page_num] = ""
    finally:
        os.unlink(temp_pdf_path)

    return page_text_dict


def process_pdf(pdf_file):
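    """Gradio handler: bytes of an uploaded PDF -> per-page corrected text."""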
    if pdf_file is None:
        return "No file uploaded. Please upload a PDF file."

    # Map of page number -> cleaned (but not yet corrected) text.
    result = extract_text_from_pages(pdf_file)

    # Single correction pass over all pages; the threads overlap tokenization
    # and I/O while the model calls themselves largely serialize on the GPU.
    with ThreadPoolExecutor() as executor:
        corrected_texts = list(executor.map(correct_text, result.values()))

    output = ""
    for page_num, text in zip(result.keys(), corrected_texts):
        output += f"Page {page_num}:\n{text}\n\n"

    return output


iface = gr.Interface(
    fn=process_pdf,
    inputs=gr.File(label="Upload PDF", type="binary"),
    outputs=gr.Textbox(label="Extracted and Processed Text"),
    title="PDF Text Extractor and Processor",
    description="Upload a PDF file to extract, clean, and correct its text content."
)
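
# Correcting a long PDF can outlast default request timeouts; if that
# becomes a problem, Gradio's built-in request queue is an option:
# iface.queue().launch()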

iface.launch()