"""Streamlit app that translates the text of an uploaded PDF with T5.

The PDF text is extracted with pdfminer, split into sentences with NLTK and
translated sentence by sentence into German or French by the ``t5-base`` model.
"""

import nltk
import streamlit as st
from nltk import sent_tokenize
from pdfminer.high_level import extract_text
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Download the punkt tokenizer for sentence segmentation
nltk.download('punkt')

MODEL_NAME = "t5-base"

# Target language (as shown in the UI) -> task prefix prepended to each sentence.
TRANSLATION_PREFIXES = {
    "German": "translate English to German: ",
    "French": "translate English to French: ",
}


def _cache_resource(func):
    """Wrap *func* with ``st.cache_resource`` when this Streamlit version has it.

    On versions without ``st.cache_resource`` the function is returned
    unchanged, so the app still works but reloads on every call.
    """
    decorator = getattr(st, "cache_resource", None)
    return func if decorator is None else decorator(func)


@_cache_resource
def _load_model():
    """Load and return the ``(tokenizer, model)`` pair for ``MODEL_NAME``."""
    tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
    model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
    return tokenizer, model


def main():
    """Render the upload widget and translate the PDF on button click."""
    st.title("PDF Translation")

    # Upload the pdf
    uploaded_file = st.file_uploader(
        "Upload a PDF file and we will translate the text inside to German and French",
        type=["pdf"],
    )
    if uploaded_file is None:
        return

    # Extract text from pdf
    text = extract_text(uploaded_file)
    if not text.strip():
        # Nothing to translate (e.g. the PDF has no text layer); say so
        # instead of showing an empty translation.
        st.warning("No extractable text was found in this PDF.")
        return

    # Create every button before producing any output so the widgets keep
    # their position above the translation, as in the original layout.
    # No "already translated" flag is kept: a local variable is reset each
    # time the script runs, so such a flag could never suppress anything.
    clicked = {
        language: st.button(f"Translate to {language}")
        for language in TRANSLATION_PREFIXES
    }

    for language, prefix in TRANSLATION_PREFIXES.items():
        if not clicked[language]:
            continue
        # Loaded lazily so merely uploading a file does not pay for the model.
        tokenizer, model = _load_model()
        translations = translate_text(text, prefix, tokenizer, model)
        display_translation(translations, language)


def translate_text(text, prefix, tokenizer, model):
    """Translate *text* one sentence at a time.

    Args:
        text: Source text; it is split into sentences with ``sent_tokenize``.
        prefix: Task prefix prepended to every sentence before tokenization.
        tokenizer: Callable producing ``input_ids`` tensors, also used to
            decode the generated ids.
        model: Model whose ``generate`` method produces the translation.

    Returns:
        A list with one translated string per source sentence, in order.
        Empty when *text* contains no sentences.
    """
    translated_sentences = []
    for sentence in sent_tokenize(text):
        input_ids = tokenizer(prefix + sentence, return_tensors="pt").input_ids
        outputs = model.generate(
            input_ids=input_ids,
            max_length=500,
            num_beams=4,
            no_repeat_ngram_size=2,
        )
        # Only the first returned sequence is decoded.
        translated_sentences.append(
            tokenizer.decode(outputs[0], skip_special_tokens=True)
        )
    return translated_sentences


def display_translation(translations, language):
    """Write the language name and the space-joined *translations* to the page."""
    st.write(f"\nLanguage: {language}")
    st.write(f"Translation:\n {' '.join(translations)}")


if __name__ == "__main__":
    main()