File size: 5,642 Bytes
8d64fe0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import html
import os

import streamlit as st
import sparknlp

from sparknlp.base import *
from sparknlp.common import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
from sparknlp.pretrained import PretrainedPipeline

# Page Configuration
# Wide page layout; the sidebar's initial state is left at "auto".
st.set_page_config(layout="wide", initial_sidebar_state="auto")

# Custom CSS for Styling
# Stylesheet for the page: `.main-title` formats the centred heading and
# `.section` formats the boxed description paragraph.
PAGE_STYLE = """

    <style>

        .main-title {

            font-size: 36px;

            color: #4A90E2;

            font-weight: bold;

            text-align: center;

        }

        .section {

            background-color: #f9f9f9;

            padding: 10px;

            border-radius: 10px;

            margin-top: 10px;

        }

        .section p, .section ul {

            color: #666666;

        }

    </style>

"""

# Raw HTML must be explicitly allowed for the <style> tag to be emitted.
st.markdown(PAGE_STYLE, unsafe_allow_html=True)

# Initialize Spark NLP
@st.cache_resource
def init_spark():
    """Start Spark NLP and return whatever ``sparknlp.start()`` yields.

    Decorated with ``st.cache_resource`` so the started session is reused
    rather than recreated each time the script runs.
    """
    session = sparknlp.start()
    return session

# Create a Spark NLP Pipeline for MarianTransformer
@st.cache_resource
def create_pipeline(model_name):
    """Build the (unfitted) translation pipeline for one Marian model.

    Stages, in order: document assembly (``text`` -> ``document``), sentence
    detection (``document`` -> ``sentences``) and translation
    (``sentences`` -> ``translation``).

    Args:
        model_name: Name of the pretrained Marian model to load, e.g.
            ``"opus_mt_en_fr"``.

    Returns:
        A ``Pipeline`` made of the three stages above.
    """
    document_assembler = (
        DocumentAssembler()
        .setInputCol("text")
        .setOutputCol("document")
    )

    # `pretrained` is a loader invoked on the class; there is no need to
    # construct an empty annotator first just to call it.
    sentence_detector = (
        SentenceDetectorDLModel.pretrained("sentence_detector_dl", "xx")
        .setInputCols(["document"])
        .setOutputCol("sentences")
    )

    marian_translator = (
        MarianTransformer.pretrained(model_name, "xx")
        .setInputCols(["sentences"])
        .setOutputCol("translation")
    )

    return Pipeline(stages=[document_assembler, sentence_detector, marian_translator])

# Process the Input Text Through the Pipeline
def fit_data(pipeline, text):
    """Run *text* through *pipeline* and collect the translation output.

    Args:
        pipeline: Pipeline that produces a ``translation`` column (such as the
            one returned by ``create_pipeline``).
        text: The source text to translate.

    Returns:
        The collected rows of ``translation.result`` for the one-row input
        DataFrame built from *text*.
    """
    # Obtain the cached session here rather than depending on a module-level
    # ``spark`` name that is only assigned later in the script.
    session = init_spark()
    data = session.createDataFrame([[text]]).toDF("text")
    result = pipeline.fit(data).transform(data)
    return result.select('translation.result').collect()

# Title and Subtitle
title = "Multilingual Text Translation with Spark NLP and MarianMT"
sub_title = """

The MarianTransformer is a powerful, state-of-the-art machine translation model based on the Transformer architecture. Developed by the MarianMT project, this annotator supports over 1,000 translation directions, making it one of the most versatile tools for multilingual natural language processing. Integrated within Spark NLP, the MarianTransformer Annotator allows for scalable and efficient text translation, leveraging the parallel processing capabilities of Apache Spark. Whether you're translating large documents or handling multiple languages simultaneously, this tool ensures high-quality translations with minimal latency.

"""

# Wrap the heading and the description in the `main-title` / `section`
# CSS classes and emit them as raw HTML.
heading_html = '<div class="main-title">' + title + '</div>'
intro_html = '<div class="section"><p>' + sub_title + '</p></div>'
st.markdown(heading_html, unsafe_allow_html=True)
st.markdown(intro_html, unsafe_allow_html=True)

# Mapping Models to Descriptions
# (language code, display name) for every language paired with English.
_PARTNER_LANGUAGES = (
    ("fr", "French"),
    ("it", "Italian"),
    ("es", "Spanish"),
    ("de", "German"),
    ("cpp", "Portuguese"),
)

# Model name -> description shown above the demo. The English -> X entries
# come first, followed by the reverse X -> English entries.
model_mappings = {
    f"opus_mt_en_{code}": f"Translate text from English to {name}"
    for code, name in _PARTNER_LANGUAGES
}
model_mappings.update(
    (f"opus_mt_{code}_en", f"Translate text from {name} to English")
    for code, name in _PARTNER_LANGUAGES
)

# Sidebar for Language Selection
st.sidebar.title("Language Selection")

# Display name -> code fragment used when composing the model name.
# NOTE(review): 'cpp' is labelled "Portuguese" here; it looks like the code
# for Portuguese-based creoles rather than 'pt' - confirm the intended model.
language_mapping = dict(
    English='en',
    French='fr',
    Italian='it',
    Spanish='es',
    German='de',
    Portuguese='cpp',
)

from_language = st.sidebar.selectbox("Translate From", list(language_mapping))

# Every offered pair involves English: English may go to any other language,
# while every other source language can only be translated to English.
target_options = (
    ['French', 'Italian', 'Spanish', 'German', 'Portuguese']
    if from_language == 'English'
    else ['English']
)
to_language = st.sidebar.selectbox("Translate To", target_options)

source_code = language_mapping[from_language]
target_code = language_mapping[to_language]
selected_model = f'opus_mt_{source_code}_{target_code}'
st.subheader(model_mappings[selected_model])

# Reference Notebook Link in Sidebar
# "Open In Colab" badge that links to the reference notebook.
link = (
    '<a href="https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/TRANSLATION_MARIAN.ipynb">'
    '<img src="https://colab.research.google.com/assets/colab-badge.svg" style="zoom: 1.3" alt="Open In Colab"/>'
    '</a>'
)
st.sidebar.title('')  # empty title, placed between the selectors and the link
st.sidebar.markdown('Reference notebook:')
st.sidebar.markdown(link, unsafe_allow_html=True)

# Load Sample Text Files
folder_path = f"inputs/{selected_model}"


def _load_examples(directory):
    """Return the sample text stored in each ``.txt`` file of *directory*.

    The sample is the second line of a file (stripped); files with fewer than
    two lines are skipped. Files are read in sorted name order so the result
    is stable across runs and platforms.

    Args:
        directory: Path of the folder holding the sample files.

    Returns:
        A list of sample strings; empty when the folder does not exist or
        holds no usable file.
    """
    try:
        filenames = sorted(os.listdir(directory))
    except FileNotFoundError:
        # No bundled samples for this model: custom input can still be used.
        return []

    samples = []
    for filename in filenames:
        if not filename.endswith('.txt'):
            continue
        # Close each file as soon as it is read instead of leaking the handle.
        with open(os.path.join(directory, filename), 'r', encoding='utf-8') as handle:
            lines = handle.readlines()
        if len(lines) >= 2:
            samples.append(lines[1].strip())
    return samples


examples = _load_examples(folder_path)

# Offer the bundled samples, but let non-empty free-form input take precedence.
sample_choice = st.selectbox("Select a Sample Text", examples)
custom_input = st.text_input("Try it for yourself!")
selected_text = custom_input or sample_choice

# Show which text is going to be translated.
st.subheader('Selected Text')
st.write(selected_text)

# Perform Translation and Display the Result
st.subheader("Translation Result")

# Nothing to translate (no sample available and no custom input): end this
# run with a hint instead of handing an empty value to Spark.
if not selected_text:
    st.info("Select a sample text or enter your own to see a translation.")
    st.stop()

spark = init_spark()
pipeline = create_pipeline(selected_model)
output = fit_data(pipeline, selected_text)

# The first column of the first row holds the translated pieces produced for
# the detected sentences; join them with a space so that consecutive
# sentences do not run together.
res = " ".join(output[0][0])
HTML_WRAPPER = """<div class="scroll entities" style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem; white-space:pre-wrap">{}</div>"""
# The text can originate from free-form user input, so escape it before
# embedding it in markup that is rendered as raw HTML.
st.markdown(HTML_WRAPPER.format(html.escape(res)), unsafe_allow_html=True)