umarigan committed
Commit 81805e8 · verified · 1 Parent(s): a1bb499

Update app.py

Files changed (1)
  1. app.py +22 -53
app.py CHANGED
@@ -22,21 +22,10 @@ def read_file(file):
         st.error("Unsupported file type")
         return None
 
-# Rest of your code remains the same
-example_list = [
-    "Mustafa Kemal Atatürk 1919 yılında Samsun'a çıktı.",
-    """Mustafa Kemal Atatürk, Türk asker, devlet adamı ve Türkiye Cumhuriyeti'nin kurucusudur.
-    # ... (rest of the example text)
-    """
-]
-
 st.title("Demo for Turkish NER Models")
 
 model_list = [
-    'akdeniz27/bert-base-turkish-cased-ner',
-    'akdeniz27/convbert-base-turkish-cased-ner',
     'girayyagmur/bert-base-turkish-ner-cased',
-    'FacebookAI/xlm-roberta-large',
     'savasy/bert-base-turkish-ner-cased',
     'xlm-roberta-large-finetuned-conll03-english',
     'asahi417/tner-xlm-roberta-base-ontonotes5'
@@ -46,46 +35,41 @@ st.sidebar.header("Select NER Model")
 model_checkpoint = st.sidebar.radio("", model_list)
 
 st.sidebar.write("For details of models: 'https://huggingface.co/akdeniz27/")
-st.sidebar.write("")
+st.sidebar.write("Only PDF, DOCX, and TXT files are supported.")
 
-if model_checkpoint in ["akdeniz27/xlm-roberta-base-turkish-ner", "xlm-roberta-large-finetuned-conll03-english", "asahi417/tner-xlm-roberta-base-ontonotes5"]:
-    aggregation = "simple"
-    if model_checkpoint != "akdeniz27/xlm-roberta-base-turkish-ner":
-        st.sidebar.write("The selected NER model is included just to show the zero-shot transfer learning capability of XLM-Roberta pretrained language model.")
-else:
-    aggregation = "first"
+# Determine aggregation strategy
+aggregation = "simple" if model_checkpoint in ["akdeniz27/xlm-roberta-base-turkish-ner",
+                                               "xlm-roberta-large-finetuned-conll03-english",
+                                               "asahi417/tner-xlm-roberta-base-ontonotes5"] else "first"
 
 st.subheader("Select Text Input Method")
 input_method = st.radio("", ('Select from Examples', 'Write or Paste New Text', 'Upload File'))
 
+example_list = [
+    "Mustafa Kemal Atatürk 1919 yılında Samsun'a çıktı.",
+    """Mustafa Kemal Atatürk, Türk asker, devlet adamı ve Türkiye Cumhuriyeti'nin kurucusudur."""
+]
+
 if input_method == 'Select from Examples':
-    selected_text = st.selectbox('Select Text from List', example_list, index=0, key=1)
-    input_text = st.text_area("Selected Text", selected_text, height=128, max_chars=None, key=2)
+    selected_text = st.selectbox('Select Text from List', example_list, index=0)
+    input_text = st.text_area("Selected Text", selected_text, height=128)
 elif input_method == "Write or Paste New Text":
-    input_text = st.text_area('Write or Paste Text Below', value="", height=128, max_chars=None, key=2)
+    input_text = st.text_area('Write or Paste Text Below', value="", height=128)
 else:
     uploaded_file = st.file_uploader("Choose a file", type=["txt", "pdf", "docx"])
     if uploaded_file is not None:
         input_text = read_file(uploaded_file)
         if input_text:
-            st.text_area("Extracted Text", input_text, height=128, max_chars=None, key=2)
+            st.text_area("Extracted Text", input_text, height=128)
     else:
         input_text = ""
 
-# Rest of your functions (setModel, get_html, entity_comb) remain the same
-
 @st.cache_resource
 def setModel(model_checkpoint, aggregation):
     model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)
     tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
     return pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy=aggregation)
 
-@st.cache_resource
-def get_html(html: str):
-    WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem">{}</div>"""
-    html = html.replace("\n", " ")
-    return WRAPPER.format(html)
-
 @st.cache_resource
 def entity_comb(output):
     output_comb = []
@@ -93,46 +77,31 @@ def entity_comb(output):
     if ind == 0:
         output_comb.append(entity)
     elif output[ind]["start"] == output[ind-1]["end"] and output[ind]["entity_group"] == output[ind-1]["entity_group"]:
-        output_comb[-1]["word"] = output_comb[-1]["word"] + output[ind]["word"]
+        output_comb[-1]["word"] += output[ind]["word"]
         output_comb[-1]["end"] = output[ind]["end"]
     else:
         output_comb.append(entity)
     return output_comb
 
-Run_Button = st.button("Run", key=None)
+Run_Button = st.button("Run")
 
-if Run_Button and input_text != "":
-    # Your existing processing code remains the same
+if Run_Button and input_text:
     ner_pipeline = setModel(model_checkpoint, aggregation)
     output = ner_pipeline(input_text)
 
     output_comb = entity_comb(output)
 
     df = pd.DataFrame.from_dict(output_comb)
-    cols_to_keep = ['word','entity_group','score','start','end']
+    cols_to_keep = ['word', 'entity_group', 'score', 'start', 'end']
     df_final = df[cols_to_keep]
 
     st.subheader("Recognized Entities")
     st.dataframe(df_final)
 
-    st.subheader("Spacy Style Display")
-    spacy_display = {}
-    spacy_display["ents"] = []
-    spacy_display["text"] = input_text
-    spacy_display["title"] = None
-
+    # Spacy display logic
+    spacy_display = {"ents": [], "text": input_text, "title": None}
     for entity in output_comb:
         spacy_display["ents"].append({"start": entity["start"], "end": entity["end"], "label": entity["entity_group"]})
-
-    tner_entity_list = ["person", "group", "facility", "organization", "geopolitical area", "location", "product", "event", "work of art", "law", "language", "date", "time", "percent", "money", "quantity", "ordinal number", "cardinal number"]
-    spacy_entity_list = ["PERSON", "NORP", "FAC", "ORG", "GPE", "LOC", "PRODUCT", "EVENT", "WORK_OF_ART", "LAW", "LANGUAGE", "DATE", "TIME", "PERCENT", "MONEY", "QUANTITY", "ORDINAL", "CARDINAL", "MISC"]
-
-    for ent in spacy_display["ents"]:
-        if model_checkpoint == "asahi417/tner-xlm-roberta-base-ontonotes5":
-            ent["label"] = spacy_entity_list[tner_entity_list.index(ent["label"])]
-        else:
-            if ent["label"] == "PER": ent["label"] = "PERSON"
 
-    html = spacy.displacy.render(spacy_display, style="ent", minify=True, manual=True, options={"ents": spacy_entity_list})
-    style = "<style>mark.entity { display: inline-block }</style>"
-    st.write(f"{style}{get_html(html)}", unsafe_allow_html=True)
+    html = spacy.displacy.render(spacy_display, style="ent", minify=True, manual=True)
+    st.write(html, unsafe_allow_html=True)
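The hunk header shows that the first context lines sit at the end of a read_file(file) helper that this commit does not touch. For orientation only, a plausible shape for that helper, assuming pypdf and python-docx as the PDF/DOCX backends; the body below is a guess, not code from this repo:

import streamlit as st
import docx  # python-docx; assumed backend
from pypdf import PdfReader  # assumed backend

def read_file(file):
    # file is the UploadedFile object produced by st.file_uploader
    if file.name.endswith(".txt"):
        return file.read().decode("utf-8")
    elif file.name.endswith(".pdf"):
        return "".join(page.extract_text() or "" for page in PdfReader(file).pages)
    elif file.name.endswith(".docx"):
        return "\n".join(p.text for p in docx.Document(file).paragraphs)
    st.error("Unsupported file type")
    return None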
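The aggregation change is behavior-preserving: the if/else block becomes a conditional expression, with the XLM-RoBERTa-based checkpoints still getting aggregation_strategy="simple" and the Turkish BERT checkpoints "first". A minimal sketch of how the strategy feeds the pipeline, using one checkpoint from model_list and the first example sentence:

from transformers import pipeline

# "simple" groups adjacent tokens that share a predicted entity; "first" decides
# each word's label from its first subword token before grouping.
ner = pipeline(
    "ner",
    model="savasy/bert-base-turkish-ner-cased",
    aggregation_strategy="first",
)

for ent in ner("Mustafa Kemal Atatürk 1919 yılında Samsun'a çıktı."):
    # each result dict carries entity_group, score, word, start, end
    print(ent["entity_group"], ent["word"])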
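setModel stays wrapped in @st.cache_resource, so each (model_checkpoint, aggregation) pair is loaded once and reused across Streamlit reruns; switching models in the sidebar only pays the download and load cost on first selection. A sketch of that effect, with the checkpoint name taken from model_list:

import streamlit as st
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline

@st.cache_resource  # cache key is the argument tuple
def setModel(model_checkpoint, aggregation):
    model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    return pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy=aggregation)

# First run builds the pipeline; later reruns with the same arguments return
# the cached object instead of reloading the weights.
ner_pipeline = setModel('savasy/bert-base-turkish-ner-cased', 'first')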
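entity_comb fuses adjacent pipeline spans when they touch character-wise (start equal to the previous span's end) and share an entity_group; the += introduced by the diff is equivalent to the concatenation it replaces. A standalone check of the merge rule, with fabricated spans rather than real model output:

# Fabricated sample spans: "Ata" + "türk" touch and share PER, so they merge;
# "Samsun" starts after a gap, so it stays a separate entity.
output = [
    {"entity_group": "PER", "word": "Ata", "start": 0, "end": 3},
    {"entity_group": "PER", "word": "türk", "start": 3, "end": 7},
    {"entity_group": "LOC", "word": "Samsun", "start": 8, "end": 14},
]

output_comb = []
for ind, entity in enumerate(output):
    if ind == 0:
        output_comb.append(entity)
    elif output[ind]["start"] == output[ind - 1]["end"] and output[ind]["entity_group"] == output[ind - 1]["entity_group"]:
        output_comb[-1]["word"] += output[ind]["word"]
        output_comb[-1]["end"] = output[ind]["end"]
    else:
        output_comb.append(entity)

print(output_comb)  # two entities: PER "Atatürk" (0-7) and LOC "Samsun" (8-14)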
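The rendering path is simpler now: the manual dict goes straight to displacy and the returned HTML straight to st.write, dropping the get_html wrapper, the extra <style> tag, and the tner-to-spaCy label mapping. One consequence worth noting: without the options={"ents": ...} filter, displacy renders every label it receives, and labels outside spaCy's builtin palette (such as PER from the Turkish models) fall back to a default highlight color. A minimal sketch with illustrative text and offsets:

import spacy

# Manual rendering needs no nlp object; a plain dict with text/ents/title is enough.
spacy_display = {
    "text": "Mustafa Kemal Atatürk 1919 yılında Samsun'a çıktı.",
    "ents": [
        {"start": 0, "end": 21, "label": "PER"},
        {"start": 35, "end": 41, "label": "LOC"},
    ],
    "title": None,
}

html = spacy.displacy.render(spacy_display, style="ent", minify=True, manual=True)
# In the app the string is injected with: st.write(html, unsafe_allow_html=True)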