Aytaj committed on
Commit
46d411c
·
1 Parent(s): 4f58b0b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +38 -29
app.py CHANGED
@@ -1,6 +1,11 @@
1
  import streamlit as st
2
  from transformers import T5Tokenizer, T5ForConditionalGeneration
3
  from pdfminer.high_level import extract_text
 
 
 
 
 
4
 
5
  def main():
6
  st.title("PDF Translation")
@@ -10,42 +15,46 @@ def main():
10
  if uploaded_file is not None:
11
  # Extract text from pdf
12
  text = extract_text(uploaded_file)
13
- tokenizer = T5Tokenizer.from_pretrained("t5-small")
14
- model = T5ForConditionalGeneration.from_pretrained("t5-small")
15
 
16
  # Define translation prefixes for each language
17
  translation_prefixes = {
18
  "german": "translate English to German: ",
19
  "french": "translate English to French: "
20
  }
21
-
22
- # Generate translations for each language
23
- translations = {}
24
-
25
  # Buttons to trigger translation
26
  translate_german = st.button("Translate to German")
27
  translate_french = st.button("Translate to French")
28
-
29
- for language, prefix in translation_prefixes.items():
30
- # Translate the entire text, not page by page
31
- text_to_translate = prefix + text
32
- input_ids = tokenizer(text_to_translate, return_tensors="pt").input_ids
33
- outputs = model.generate(input_ids=input_ids, max_length=150, num_beams=4, no_repeat_ngram_size=2)
34
- translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
35
- translations[language] = translated_text
36
-
37
- # Display the translations based on the button clicked
38
- if translate_german:
39
- display_translation(translations["german"], "German")
40
-
41
- if translate_french:
42
- display_translation(translations["french"], "French")
43
-
44
-
45
- def display_translation(translation, language):
 
 
 
 
 
 
 
 
 
46
  st.write(f"\nLanguage: {language}")
47
- st.write(f"Translation:\n {translation}")
48
-
49
-
50
- if __name__ == "__main__":
51
- main()
 
1
  import streamlit as st
2
  from transformers import T5Tokenizer, T5ForConditionalGeneration
3
  from pdfminer.high_level import extract_text
4
+ import nltk
5
+ from nltk import sent_tokenize
6
+
7
+ # Download the punkt tokenizer for sentence segmentation
8
+ nltk.download('punkt')
9
 
10
  def main():
11
  st.title("PDF Translation")
 
15
  if uploaded_file is not None:
16
  # Extract text from pdf
17
  text = extract_text(uploaded_file)
18
+ tokenizer = T5Tokenizer.from_pretrained("t5-base")
19
+ model = T5ForConditionalGeneration.from_pretrained("t5-base")
20
 
21
  # Define translation prefixes for each language
22
  translation_prefixes = {
23
  "german": "translate English to German: ",
24
  "french": "translate English to French: "
25
  }
26
+ # Variables to track translation state
27
+ translated_german = False
28
+ translated_french = False
 
29
  # Buttons to trigger translation
30
  translate_german = st.button("Translate to German")
31
  translate_french = st.button("Translate to French")
32
+ # Translate and display for German
33
+ if translate_german and not translated_german:
34
+ translated_sentences_german = translate_text(text, translation_prefixes["german"], tokenizer, model)
35
+ display_translation(translated_sentences_german, "German")
36
+ translated_german = True
37
+ # Translate and display for French
38
+ if translate_french and not translated_french:
39
+ translated_sentences_french = translate_text(text, translation_prefixes["french"], tokenizer, model)
40
+ display_translation(translated_sentences_french, "French")
41
+ translated_french = True
42
+
43
+ def translate_text(text, prefix, tokenizer, model):
44
+ # Split text into sentences
45
+ sentences = sent_tokenize(text)
46
+
47
+ # Translate each sentence
48
+ translated_sentences = []
49
+ for sentence in sentences:
50
+ text_to_translate = prefix + sentence
51
+ input_ids = tokenizer(text_to_translate, return_tensors="pt").input_ids
52
+ outputs = model.generate(input_ids=input_ids, max_length=500, num_beams=4, no_repeat_ngram_size=2)
53
+ translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
54
+ translated_sentences.append(translated_text)
55
+
56
+ return translated_sentences
57
+
58
+ def display_translation(translations, language):
59
  st.write(f"\nLanguage: {language}")
60
+ st.write(f"Translation:\n {' '.join(translations)}")