Spaces:

cdleong
/

langcode-search

Runtime error

File size: 4,547 Bytes

68a8c29
6570b48
6390590
1dcc788
 
58b95fa
5a1315d
9989672
1dcc788
58b95fa
39897d9
a0fdec6
 
abed01c
1392687
39897d9
1392687
39897d9
 
e3d850e
 
ff7c666
67daf03
cde8835
e3ca56d
726336c
cde8835
e3d850e
21247cf
606d796
a397155
0fef655
 
 
 
d6aa39c
0fef655
947dc2d
7e88eb4
a8b6772
0fef655
2f590b1
16fc4ca
92a84ae
 
 
 
9989672
5d305df
 
 
 
 
 
 
 
 
 
92a84ae
dedac74
8329262
b65ecd9
878ffe0
 
fa7755f
1392687
431f228
878ffe0
 
fa7755f
 
 
 
 
 
 
 
 
 
 
 
1392687
 
 
 
0ca7b2b
fa7755f
92a84ae
59b8b02
a1ef3a3
 
537dd73
92a84ae
ca54e5d
fa7755f
 
07e3fb8

import streamlit as st
import langcodes

# --- Page header and introduction -------------------------------------------
st.write("# Language code/tag search")
st.write("Fed up with language tag confusion? Here's your one-stop shop!")
st.write("Try typing your language below! Accepts either codes like `eng`/`en`, or full names like `English`, and we will use the [langcodes](https://github.com/rspeer/langcodes) library to figure out the correct modern BCP-47 code according to [official W3 Guidelines](https://www.w3.org/International/questions/qa-choosing-language-tags)")
# Background reading:
# https://huggingface.co/blog/streamlit-spaces
# https://github.com/psf/requests-html
# https://docs.streamlit.io/library/api-reference/write-magic/st.write
langtext = st.text_input("Language Code/Tag Lookup using langcodes", "english")

# --- Step 1: report whether the raw input is already a valid tag -------------
st.write("Checking whether the tag is valid. That is, the language, script, territory, and variants (if present) are all tags that have meanings assigned by IANA.")

if not langcodes.tag_is_valid(langtext):
  st.write(f"...False! '{langtext}' doesn't parse meaningfully as a language tag according to IANA, some of its subcomponents may be invalid or it might be a natural language description.")
else:
  st.write(f"...True! '{langtext}' parses meaningfully as a language tag according to IANA.")

# --- Step 2: try to parse the input directly as a language tag ---------------
# `lang` stays None unless the direct parse yields something with a known name.
lang = None
try:
  candidate = langcodes.Language.get(langtext)
  candidate_name = candidate.display_name()
  if "unknown" in candidate_name.lower():
    # Parsed syntactically, but langcodes has no name for it: fall through to
    # the natural-language search below.
    st.write(f"Attempting to lookup the code directly gives us '{candidate_name}', attempting to search for it as a natural language string.")
  else:
    lang = candidate
except langcodes.LanguageTagError:
  st.write("Could not lookup code directly, attempting to search for it as a natural language string.")
  lang = None

# --- Step 3: fall back to searching by language name -------------------------
if lang is None:
  try:
    name_match = langcodes.find(langtext)
    lang = name_match
    st.write(f"natural language search found the following BCP-47 tag: {lang}")
  except LookupError:
    # Nothing matched: point the user at external lookup tools instead.
    st.write("Unable to look up language code.")
    st.write("Try also: https://r12a.github.io/app-subtags/")
    st.write(f"Try also: https://glottolog.org/glottolog?search={langtext}")
    lang = None


def pull_obsolete_codes(iso_code):
  """Scrape the SIL ISO 639-3 page for `iso_code` and collect the older codes it lists.

  Fetches https://iso639-3.sil.org/code/<iso_code>, looks at every element
  matching the CSS selector ".views-field-nothing", and scans its text line by
  line for the labels "639-1", "639-2/B", "639-2/T" and "639-3".

  Args:
    iso_code: The code to append to the SIL URL (the caller passes an alpha-3
      code).

  Returns:
    A dict mapping each label found to the last whitespace-separated token of
    the line that mentions it. Matches from all elements are merged (a later
    match for the same label overwrites an earlier one). Returns an empty dict
    when the page contains no matching element or label.
  """
  from requests_html import HTMLSession

  code_labels = ("639-1", "639-2/B", "639-2/T", "639-3")

  # Created once, before the loops: previously it was (re)created inside the
  # element loop, so a page without any matching element raised
  # UnboundLocalError at the return statement.
  obsolete_codes = {}

  # The context manager closes the session (and its connections) when done.
  with HTMLSession() as session:
    r = session.get(f"https://iso639-3.sil.org/code/{iso_code}")
    # https://www.w3schools.com/cssref/css_selectors.asp
    for found_element in r.html.find(".views-field-nothing", clean=True):
      for line in found_element.text.splitlines():
        for obsolete_code_name in code_labels:
          if obsolete_code_name in line:
            # The code itself is the last token on the line.
            obsolete_codes[obsolete_code_name] = line.split()[-1]
  return obsolete_codes


#st.write(f"langcodes found the following tag: {type(found)}") # a Language object
# Render the results page once a language has been resolved.
if lang is not None:
  st.write("## Results")
  st.write(f"Best-match BCP-47 tag for '{langtext}', according to the langcodes library: {lang}")
  st.write(f"Breakdown of tag components: {lang.describe()}")
  st.write(f"Display name for {lang}: {lang.display_name()}")
  st.write(f"Autonym for {lang}: {lang.autonym()}")
  st.write(f"## Correct, standardized, BCP-47 tag for {langtext}, according to the langcodes library: {langcodes.standardize_tag(lang)}")


  st.write("## Further Information:")
  broader_tags = lang.broader_tags()
  st.write("Broader tags for this language, if any:")
  st.write(broader_tags)
  st.write(f"Try also: https://r12a.github.io/app-subtags/?lookup={lang}")

  # The alpha-3 variants must exist before the glottolog links are built;
  # they used to be assigned further down, which raised NameError here.
  # NOTE(review): langcodes documents that to_alpha3 raises LookupError when no
  # three-letter code is known; that case is not handled here.
  b_variant = lang.to_alpha3(variant='B')
  t_variant = lang.to_alpha3(variant='T')
  st.write(f"https://glottolog.org/glottolog?search={t_variant} may be of interest, with links to Ethnologue, etc. If that doesn't work, try https://glottolog.org/glottolog?search={b_variant}")

  # ethnologue prefers T for german (deu), and T for French
  st.write("## Older Codes")
  st.write(f"ISO 639-3 'alpha3' code, 'terminology' variant (deprecated): {t_variant}")
  st.write(f"ISO 639-3 'alpha3' code, 'bibliographic' variant (deprecated): {b_variant}")
  st.write(f"If it exists, the ISO 639 Code Tables entry for the T variant would be at https://iso639-3.sil.org/code/{t_variant}")
  st.write(f"If it exists, the ISO 639 Code Tables entry for the B variant would be at https://iso639-3.sil.org/code/{b_variant}")

  # Scrape the SIL page for the T variant; only show the section if anything was found.
  obsolete_codes = pull_obsolete_codes(t_variant)
  if obsolete_codes:
    st.write("Obsolete codes from previous ISO-639 iterations:")
    st.write(obsolete_codes)