"""Streamlit demo page: language detection with a pretrained Spark NLP model.

The script renders a model selector, a sample/custom text input, runs the
text through a DocumentAssembler -> SentenceDetector -> LanguageDetectorDL
pipeline and prints the detected language with its confidence.
"""

import html
import os

# Third-party imports are kept in their original relative order because the
# star imports below can shadow one another.
import streamlit as st
import sparknlp
import pandas as pd
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
from sparknlp.pretrained import PretrainedPipeline
from annotated_text import annotated_text

# Page configuration
st.set_page_config(
    layout="wide",
    page_title="Spark NLP Demos App",
    initial_sidebar_state="auto",
)

# CSS for styling
# NOTE(review): the style payload is blank in this copy of the file; the
# original stylesheet markup may have been lost -- confirm.
st.markdown(""" """, unsafe_allow_html=True)


@st.cache_resource
def init_spark():
    """Start a Spark NLP session.

    Wrapped in ``st.cache_resource``, so repeated calls reuse the same
    returned object instead of starting a new session.
    """
    return sparknlp.start()


@st.cache_resource
def create_pipeline(model):
    """Build the (unfitted) language-detection pipeline.

    Args:
        model: Name of the pretrained ``LanguageDetectorDL`` model to load.

    Returns:
        A ``pyspark.ml.Pipeline`` with three stages: document assembler,
        sentence detector and language detector.
    """
    document_assembler = (
        DocumentAssembler()
        .setInputCol("text")
        .setOutputCol("document")
    )

    sentence_detector = (
        SentenceDetector()
        .setInputCols(["document"])
        .setOutputCol("sentence")
    )

    language_detector = (
        LanguageDetectorDL.pretrained(model)
        .setInputCols("sentence")
        .setOutputCol("language")
        .setThreshold(0.5)
        .setCoalesceSentences(True)
    )

    return Pipeline(
        stages=[
            document_assembler,
            sentence_detector,
            language_detector,
        ]
    )


def fit_data(pipeline, data):
    """Fit ``pipeline`` on an empty frame and annotate ``data``.

    Args:
        pipeline: Pipeline as returned by ``create_pipeline``.
        data: Text to annotate.

    Returns:
        The first element of ``LightPipeline.fullAnnotate(data)``.
    """
    # Use the cached session directly rather than depending on a module
    # global that is only assigned further down the script.
    spark_session = init_spark()
    empty_df = spark_session.createDataFrame([[""]]).toDF("text")
    pipeline_model = pipeline.fit(empty_df)
    light_model = LightPipeline(pipeline_model)
    return light_model.fullAnnotate(data)[0]


def _load_examples(folder_path):
    """Return the stripped second line of every ``.txt`` file in a folder.

    Files with fewer than two lines are skipped. A missing folder yields an
    empty list so the page still works with custom input only. File names
    are visited in sorted order so the sample list is stable between runs.
    """
    if not os.path.isdir(folder_path):
        return []

    samples = []
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".txt"):
            continue
        # Context manager closes the handle; the previous comprehension
        # left every opened file to the garbage collector.
        with open(os.path.join(folder_path, filename), "r", encoding="utf-8") as handle:
            lines = handle.readlines()
        if len(lines) >= 2:
            samples.append(lines[1].strip())
    return samples


# Text shown inside the "supported languages" expander. Split across adjacent
# literals purely for line length; the concatenated value is unchanged.
SUPPORTED_LANGUAGES = (
    "Abkhaz, Iraqi Arabic, Adyghe, Afrikaans, Gulf Arabic, Afrihili, Assyrian Neo-Aramaic, Ainu, "
    "Aklanon, Gheg Albanian, Amharic, Aragonese, Old English, Uab Meto, North Levantine Arabic, "
    "Arabic, Algerian Arabic, Moroccan Arabic, Egyptian Arabic, Assamese, Asturian, Kotava, "
    "Awadhi, Aymara, Azerbaijani, Bashkir, Baluchi, Balinese, Bavarian, Central Bikol, "
    "Belarusian, Berber, Bulgarian, Bhojpuri, Bislama, Banjar, Bambara, Bengali, Tibetan, "
    "Breton, Bodo, Bosnian, Buryat, Baybayanon, Brithenig, Catalan, Cayuga, Chavacano, Chechen, "
    "Cebuano, Chamorro, Chagatai, Chinook Jargon, Choctaw, Cherokee, Jin Chinese, Chukchi, "
    "Central Mnong, Corsican, Chinese Pidgin English, Crimean Tatar, Seychellois Creole, Czech, "
    "Kashubian, Chuvash, Welsh, CycL, Cuyonon, Danish, German, Dungan, Drents, Lower Sorbian, "
    "Central Dusun, Dhivehi, Dutton World Speedwords, Ewe, Emilian, Greek, Erromintxela, "
    "English, Middle English, Esperanto, Spanish, Estonian, Basque, Evenki, Extremaduran, "
    "Persian, Finnish, Fijian, Kven Finnish, Faroese, French, Middle French, Old French, "
    "North Frisian, Pulaar, Friulian, Nigerian Fulfulde, Frisian, Irish, Ga, Gagauz, "
    "Gan Chinese, Garhwali, Guadeloupean Creole French, Scottish Gaelic, Gilbertese, Galician, "
    "Guarani, Konkani (Goan), Gronings, Gothic, Ancient Greek, Swiss German, Gujarati, Manx, "
    "Hausa, Hakka Chinese, Hawaiian, Ancient Hebrew, Hebrew, Hindi, Fiji Hindi, Hiligaynon, "
    "Hmong Njua (Green), Ho, Croatian, Hunsrik, Upper Sorbian, Xiang Chinese, Haitian Creole, "
    "Hungarian, Armenian, Interlingua, Iban, Indonesian, Interlingue, Igbo, Nuosu, Inuktitut, "
    "Ilocano, Ido, Icelandic, Italian, Ingrian, Japanese, Jamaican Patois, Lojban, "
    "Juhuri (Judeo-Tat), Jewish Palestinian Aramaic, Javanese, Georgian, Karakalpak, Kabyle, "
    "Kamba, Kekchi (Q'eqchi'), Khasi, Khakas, Kazakh, Greenlandic, Khmer, Kannada, Korean, "
    "Komi-Permyak, Komi-Zyrian, Karachay-Balkar, Karelian, Kashmiri, Kölsch, Kurdish, Kumyk, "
    "Cornish, Keningau Murut, Kyrgyz, Coastal Kadazan, Latin, Southern Subanen, Ladino, "
    "Luxembourgish, Láadan, Lingua Franca Nova, Luganda, Ligurian, Livonian, Lakota, Ladin, "
    "Lombard, Lingala, Lao, Louisiana Creole, Lithuanian, Latgalian, Latvian, Latvian, "
    "Literary Chinese, Laz, Madurese, Maithili, North Moluccan Malay, Moksha, Morisyen, "
    "Malagasy, Mambae, Marshallese, Meadow Mari, Maori, Mi'kmaq, Minangkabau, Macedonian, "
    "Malayalam, Mongolian, Manchu, Mon, Mohawk, Marathi, Hill Mari, Malay, Maltese, "
    "Tagal Murut, Mirandese, Hmong Daw (White), Burmese, Erzya, Nauruan, Nahuatl, "
    "Norwegian Bokmål, Central Huasteca Nahuatl, Low German (Low Saxon), Nepali, Newari, Ngeq, "
    "Guerrero Nahuatl, Niuean, Dutch, Orizaba Nahuatl, Norwegian Nynorsk, Norwegian, Nogai, "
    "Old Norse, Novial, Nepali, Naga (Tangshang), Navajo, Chinyanja, Nyungar, Old Aramaic, "
    "Occitan, Ojibwe, Odia (Oriya), Old East Slavic, Ossetian, Old Spanish, Old Saxon, "
    "Ottoman Turkish, Old Turkish, Punjabi (Eastern), Pangasinan, Kapampangan, Papiamento, "
    "Palauan, Picard, Pennsylvania German, Palatine German, Phoenician, Pali, Polish, "
    "Piedmontese, Punjabi (Western), Pipil, Old Prussian, Pashto, Portuguese, Quechua, "
    "K'iche', Quenya, Rapa Nui, Rendille, Tarifit, Romansh, Kirundi, Romanian, Romani, "
    "Russian, Rusyn, Kinyarwanda, Okinawan, Sanskrit, Yakut, Sardinian, Sicilian, Scots, "
    "Sindhi, Northern Sami, Sango, Samogitian, Shuswap, Tachawit, Sinhala, Sindarin, Slovak, "
    "Slovenian, Samoan, Southern Sami, Shona, Somali, Albanian, Serbian, Swazi, "
    "Southern Sotho, Saterland Frisian, Sundanese, Sumerian, Swedish, Swahili, Swabian, "
    "Swahili, Syriac, Tamil, Telugu, Tetun, Tajik, Thai, Tahaggart Tamahaq, Tigrinya, Tigre, "
    "Turkmen, Tokelauan, Tagalog, Klingon, Talysh, Jewish Babylonian Aramaic, Temuan, "
    "Setswana, Tongan, Tonga (Zambezi), Toki Pona, Tok Pisin, Old Tupi, Turkish, Tsonga, "
    "Tatar, Isan, Tuvaluan, Tahitian, Tuvinian, Talossan, Udmurt, Uyghur, Ukrainian, Umbundu, "
    "Urdu, Urhobo, Uzbek, Venetian, Veps, Vietnamese, Volapük, Võro, Walloon, Waray, Wolof, "
    "Shanghainese, Kalmyk, Xhosa, Mingrelian, Yiddish, Yoruba, Cantonese, Chinese, "
    "Malay (Vernacular), Malay, Zulu, and Zaza."
)

# Language code -> display name used when reporting the detection result.
language_map = {
    'ab': "Abkhaz",
    'ace': "Achinese",
    'acm': "Iraqi Arabic",
    'ady': "Adyghe",
    'af': "Afrikaans",
    'afb': "Gulf Arabic",
    'afh': "Afrihili",
    'aii': "Assyrian Neo-Aramaic",
    'ain': "Ainu",
    'akl': "Aklanon",
    'aln': "Gheg Albanian",
    'als': "Tosk Albanian",
    'am': "Amharic",
    'an': "Aragonese",
    'ang': "Old English",
    'aoz': "Uab Meto",
    'apc': "North Levantine Arabic",
    'ar': "Arabic",
    'arq': "Algerian Arabic",
    'ary': "Moroccan Arabic",
    'arz': "Egyptian Arabic",
    'as': "Assamese",
    'ast': "Asturian",
    'av': "Avaric",
    'avk': "Kotava",
    'awa': "Awadhi",
    'ay': "Aymara",
    'az': "Azerbaijani",
    'azb': "South Azerbaijani",
    'ba': "Bashkir",
    'bal': "Baluchi",
    'ban': "Balinese",
    'bar': "Bavarian",
    'bat-smg': "bat-smg",
    'bcl': "Central Bikol",
    'be': "Belarusian",
    'ber': "Berber",
    'bg': "Bulgarian",
    'bh': "bh",
    'bho': "Bhojpuri",
    'bi': "Bislama",
    'bjn': "Banjar",
    'bm': "Bambara",
    'bn': "Bengali",
    'bo': "Tibetan",
    'bpy': "Bishnupriya",
    'br': "Breton",
    'brx': "Bodo",
    'bs': "Bosnian",
    'bua': "Buryat",
    'bvy': "Baybayanon",
    'bxr': "Russia Buriat",
    'bzt': "Brithenig",
    'ca': "Catalan",
    'cay': "Cayuga",
    'cbk': "Chavacano",
    'cbk-zam': "cbk-zam",
    'cdo': "Min Dong Chinese",
    'ce': "Chechen",
    'ceb': "Cebuano",
    'ch': "Chamorro",
    'chg': "Chagatai",
    'chn': "Chinook Jargon",
    'cho': "Choctaw",
    'chr': "Cherokee",
    'cjy': "Jin Chinese",
    'ckb': "Central Kurdish (Soranî)",
    'ckt': "Chukchi",
    'cmo': "Central Mnong",
    'co': "Corsican",
    'cpi': "Chinese Pidgin English",
    'crh': "Crimean Tatar",
    'crs': "Seychellois Creole",
    'cs': "Czech",
    'ces': "Czech",
    'csb': "Kashubian",
    'cv': "Chuvash",
    'cy': "Welsh",
    'cycl': "CycL",
    'cyo': "Cuyonon",
    'da': "Danish",
    'de': "German",
    'deu': "German",
    'diq': "Dimli (individual language)",
    'dng': "Dungan",
    'drt': "Drents",
    'dsb': "Lower Sorbian",
    'dtp': "Central Dusun",
    'dty': "dty",
    'dv': "Dhivehi",
    'dws': "Dutton World Speedwords",
    'ee': "Ewe",
    'egl': "Emilian",
    'el': "Greek",
    'ell': "Greek",
    'eml': "eml",
    'emx': "Erromintxela",
    'en': "English",
    'enm': "Middle English",
    'eo': "Esperanto",
    'es': "Spanish",
    'et': "Estonian",
    'eu': "Basque",
    'evn': "Evenki",
    'ext': "Extremaduran",
    'fa': "Persian",
    'fi': "Finnish",
    'fiu-vro': "fiu-vro",
    'fj': "Fijian",
    'fkv': "Kven Finnish",
    'fo': "Faroese",
    'fr': "French",
    'fra': "French",
    'frm': "Middle French",
    'fro': "Old French",
    'frp': "Arpitan",
    'frr': "North Frisian",
    'fuc': "Pulaar",
    'fur': "Friulian",
    'fuv': "Nigerian Fulfulde",
    'fy': "Frisian",
    'ga': "Irish",
    'gaa': "Ga",
    'gag': "Gagauz",
    'gan': "Gan Chinese",
    'gbm': "Garhwali",
    'gcf': "Guadeloupean Creole French",
    'gd': "Scottish Gaelic",
    'gil': "Gilbertese",
    'gl': "Galician",
    'glk': "Gilaki",
    'gn': "Guarani",
    'gom': "Konkani (Goan)",
    'gos': "Gronings",
    'got': "Gothic",
    'grc': "Ancient Greek",
    'gsw': "Swiss German",
    'gu': "Gujarati",
    'gv': "Manx",
    'ha': "Hausa",
    'hak': "Hakka Chinese",
    'haw': "Hawaiian",
    'hbo': "Ancient Hebrew",
    'he': "Hebrew",
    'hi': "Hindi",
    'hif': "Fiji Hindi",
    'hil': "Hiligaynon",
    'hnj': "Hmong Njua (Green)",
    'hoc': "Ho",
    'hr': "Croatian",
    'hrx': "Hunsrik",
    'hsb': "Upper Sorbian",
    'hsn': "Xiang Chinese",
    'ht': "Haitian Creole",
    'hu': "Hungarian",
    'hy': "Armenian",
    'ia': "Interlingua",
    'iba': "Iban",
    'id': "Indonesian",
    'ie': "Interlingue",
    'ig': "Igbo",
    'ii': "Nuosu",
    'ike': "Inuktitut",
    'ilo': "Ilocano",
    'io': "Ido",
    'is': "Icelandic",
    'it': "Italian",
    'izh': "Ingrian",
    'ja': "Japanese",
    'jam': "Jamaican Patois",
    'jbo': "Lojban",
    'jdt': "Juhuri (Judeo-Tat)",
    'jpa': "Jewish Palestinian Aramaic",
    'jv': "Javanese",
    'ka': "Georgian",
    'kaa': "Karakalpak",
    'kab': "Kabyle",
    'kam': "Kamba",
    'kbd': "Kabardian",
    'kek': "Kekchi (Q'eqchi')",
    'kha': "Khasi",
    'kjh': "Khakas",
    'kk': "Kazakh",
    'kl': "Greenlandic",
    'km': "Khmer",
    'kn': "Kannada",
    'ko': "Korean",
    'koi': "Komi-Permyak",
    'kpv': "Komi-Zyrian",
    'krc': "Karachay-Balkar",
    'krl': "Karelian",
    'ks': "Kashmiri",
    'ksh': "Kölsch",
    'ku': "Kurdish",
    'kum': "Kumyk",
    'kv': "Komi",
    'kw': "Cornish",
    'kxi': "Keningau Murut",
    'ky': "Kyrgyz",
    'kzj': "Coastal Kadazan",
    'la': "Latin",
    'laa': "Southern Subanen",
    'lad': "Ladino",
    'lb': "Luxembourgish",
    'ldn': "Láadan",
    'lez': "Lezghian",
    'lfn': "Lingua Franca Nova",
    'lg': "Luganda",
    'li': "Limburgan",
    'lij': "Ligurian",
    'liv': "Livonian",
    'lkt': "Lakota",
    'lld': "Ladin",
    'lmo': "Lombard",
    'ln': "Lingala",
    'lo': "Lao",
    'lou': "Louisiana Creole",
    'lrc': "Northern Luri",
    'lt': "Lithuanian",
    'ltg': "Latgalian",
    'lv': "Latvian",
    'lvs': "Latvian",
    'lzh': "Literary Chinese",
    'lzz': "Laz",
    'mad': "Madurese",
    'mai': "Maithili",
    'map-bms': "map-bms",
    'max': "North Moluccan Malay",
    'mdf': "Moksha",
    'mfe': "Morisyen",
    'mg': "Malagasy",
    'mgm': "Mambae",
    'mh': "Marshallese",
    'mhr': "Meadow Mari",
    'mi': "Maori",
    'mic': "Mi'kmaq",
    'min': "Minangkabau",
    'mk': "Macedonian",
    'ml': "Malayalam",
    'mn': "Mongolian",
    'mnc': "Manchu",
    'mnw': "Mon",
    'moh': "Mohawk",
    'mr': "Marathi",
    'mrj': "Hill Mari",
    'ms': "Malay",
    'mt': "Maltese",
    'mvv': "Tagal Murut",
    'mwl': "Mirandese",
    'mww': "Hmong Daw (White)",
    'my': "Burmese",
    'myv': "Erzya",
    'mzn': "Mazanderani",
    'na': "Nauruan",
    'nah': "Nahuatl",
    'nap': "Neapolitan",
    'nb': "Norwegian Bokmål",
    'nch': "Central Huasteca Nahuatl",
    'nds': "Low German (Low Saxon)",
    'nds-nl': "nds-nl",
    'ne': "Nepali",
    'new': "Newari",
    'ngt': "Ngeq",
    'ngu': "Guerrero Nahuatl",
    'niu': "Niuean",
    'nl': "Dutch",
    'nlv': "Orizaba Nahuatl",
    'nn': "Norwegian Nynorsk",
    'no': "Norwegian",
    'nog': "Nogai",
    'non': "Old Norse",
    'nov': "Novial",
    'npi': "Nepali",
    'nrm': "Narom",
    'nso': "Pedi",
    'nst': "Naga (Tangshang)",
    'nv': "Navajo",
    'ny': "Chinyanja",
    'nys': "Nyungar",
    'oar': "Old Aramaic",
    'oc': "Occitan",
    'oj': "Ojibwe",
    'olo': "Livvi",
    'om': "Oromo",
    'or': "Odia (Oriya)",
    'orv': "Old East Slavic",
    'os': "Ossetian",
    'osp': "Old Spanish",
    'osx': "Old Saxon",
    'ota': "Ottoman Turkish",
    'otk': "Old Turkish",
    'pa': "Punjabi (Eastern)",
    'pag': "Pangasinan",
    'pam': "Kapampangan",
    'pap': "Papiamento",
    'pau': "Palauan",
    'pcd': "Picard",
    'pdc': "Pennsylvania German",
    'pfl': "Palatine German",
    'phn': "Phoenician",
    'pi': "Pali",
    'pl': "Polish",
    'pms': "Piedmontese",
    'pnb': "Punjabi (Western)",
    'ppl': "Pipil",
    'prg': "Old Prussian",
    'ps': "Pashto",
    'pt': "Portuguese",
    'qu': "Quechua",
    'quc': "K'iche'",
    'qya': "Quenya",
    'rap': "Rapa Nui",
    'rel': "Rendille",
    'rif': "Tarifit",
    'rm': "Romansh",
    'rn': "Kirundi",
    'ro': "Romanian",
    'ron': "Romanian",
    'roa-rup': "roa-rup",
    'roa-tara': "roa-tara",
    'rom': "Romani",
    'ru': "Russian",
    'rue': "Rusyn",
    'rw': "Kinyarwanda",
    'ryu': "Okinawan",
    'sa': "Sanskrit",
    'sah': "Yakut",
    'sc': "Sardinian",
    'scn': "Sicilian",
    'sco': "Scots",
    'sd': "Sindhi",
    'se': "Northern Sami",
    'sg': "Sango",
    'sgs': "Samogitian",
    'sh': "Serbo-Croatian",
    'shs': "Shuswap",
    'shy': "Tachawit",
    'si': "Sinhala",
    'sjn': "Sindarin",
    'sk': "Slovak",
    'slk': "Slovak",
    'sl': "Slovenian",
    'sm': "Samoan",
    'sma': "Southern Sami",
    'sn': "Shona",
    'so': "Somali",
    'sq': "Albanian",
    'sr': "Serbian",
    'srn': "Sranan Tongo",
    'ss': "Swazi",
    'st': "Southern Sotho",
    'stq': "Saterland Frisian",
    'su': "Sundanese",
    'sux': "Sumerian",
    'sv': "Swedish",
    'sw': "Swahili",
    'swg': "Swabian",
    'swh': "Swahili",
    'syc': "Syriac",
    'szl': "Silesian",
    'ta': "Tamil",
    'tcy': "Tulu",
    'te': "Telugu",
    'tet': "Tetun",
    'tg': "Tajik",
    'th': "Thai",
    'thv': "Tahaggart Tamahaq",
    'ti': "Tigrinya",
    'tig': "Tigre",
    'tk': "Turkmen",
    'tkl': "Tokelauan",
    'tl': "Tagalog",
    'tlh': "Klingon",
    'tly': "Talysh",
    'tmr': "Jewish Babylonian Aramaic",
    'tmw': "Temuan",
    'tn': "Setswana",
    'to': "Tongan",
    'toi': "Tonga (Zambezi)",
    'toki': "Toki Pona",
    'tpi': "Tok Pisin",
    'tpw': "Old Tupi",
    'tr': "Turkish",
    'ts': "Tsonga",
    'tt': "Tatar",
    'tts': "Isan",
    'tvl': "Tuvaluan",
    'ty': "Tahitian",
    'tyv': "Tuvinian",
    'tzl': "Talossan",
    'udm': "Udmurt",
    'ug': "Uyghur",
    'uk': "Ukrainian",
    'umb': "Umbundu",
    'ur': "Urdu",
    'urh': "Urhobo",
    'uz': "Uzbek",
    'vec': "Venetian",
    'vep': "Veps",
    'vi': "Vietnamese",
    'vls': "Vlaams",
    'vo': "Volapük",
    'vro': "Võro",
    'wa': "Walloon",
    'war': "Waray",
    'wo': "Wolof",
    'wuu': "Shanghainese",
    'xal': "Kalmyk",
    'xh': "Xhosa",
    'xmf': "Mingrelian",
    'yi': "Yiddish",
    'yo': "Yoruba",
    'yue': "Cantonese",
    'zea': "Zeeuws",
    'zh': "Chinese",
    'zh-classical': "zh-classical",
    'zh-min-nan': "zh-min-nan",
    'zh-yue': "zh-yue",
    'zlm': "Malay (Vernacular)",
    'zsm': "Malay",
    'zu': "Zulu",
    'zza': "Zaza",
}

# Set up the page layout
# NOTE(review): only the title text is present in this literal; any wrapping
# HTML element it once had is not visible here -- confirm before relying on
# the CSS above.
st.markdown('\nState-Of-The-Art Language Detection With Spark NLP\n', unsafe_allow_html=True)
st.subheader('Support for 375 different languages')

# Sidebar content
model = st.sidebar.selectbox(
    "Choose the pretrained model",
    ["ld_wiki_tatoeba_cnn_375"],
    help="For more info about the models visit: https://sparknlp.org/models",
)

with st.expander("View Supported Languges"):
    st.write(SUPPORTED_LANGUAGES)

# Reference notebook link in sidebar
# NOTE(review): the literal holds only the link text; the anchor/badge markup
# appears to be missing -- confirm.
link = """ Open In Colab """
st.sidebar.markdown('Reference notebook:')
st.sidebar.markdown(link, unsafe_allow_html=True)

# Load examples
folder_path = f"inputs/{model}"
examples = _load_examples(folder_path)

selected_text = st.selectbox("Select a sample text", examples)
custom_input = st.text_input("Try it for yourself!")

# Custom input, when provided, takes precedence over the selected sample.
if custom_input:
    selected_text = custom_input

# With no samples on disk and no custom input there is nothing to annotate.
if not selected_text:
    st.info("Select a sample text or enter your own to detect its language.")
    st.stop()

st.subheader('Selected Text')
# The literal was previously a plain string, so "{selected_text}" was shown
# verbatim. The text may be user supplied and is rendered with
# unsafe_allow_html=True, hence the escaping.
st.markdown(
    f"""
{html.escape(selected_text)}
""",
    unsafe_allow_html=True,
)

# Initialize Spark and create pipeline
spark = init_spark()
pipeline = create_pipeline(model)
output = fit_data(pipeline, selected_text)

# Display output
detections = output['language']
if not detections:
    st.warning("No language could be detected for this text.")
    st.stop()

top_detection = detections[0]
language_code = top_detection.result
# Labels missing from the map are displayed as the raw label instead of
# raising KeyError.
language = language_map.get(language_code, language_code)

# NOTE(review): the annotation metadata is assumed to be keyed by the language
# code (the same value as `.result`); the display name is tried as a fallback
# to stay compatible with the previous lookup -- confirm against real output.
raw_score = top_detection.metadata.get(
    language_code, top_detection.metadata.get(language)
)

st.markdown(f"This text is in **{language} ({language_code})** language.")
if raw_score is None:
    confidence = None
    st.markdown("Classification Confidence: **n/a**")
else:
    confidence = round(float(raw_score) * 100, 2)
    st.markdown(f"Classification Confidence: **{confidence}%**")