umarigan committed
Commit c480c1f · verified · 1 Parent(s): 8bb7ed4

Update app.py

Files changed (1): app.py (+42, -16)
app.py CHANGED
@@ -6,6 +6,26 @@ import PyPDF2
 import docx
 import io
 
+def chunk_text(text, chunk_size=128):
+    words = text.split()
+    chunks = []
+    current_chunk = []
+    current_length = 0
+
+    for word in words:
+        if current_length + len(word) + 1 > chunk_size:
+            chunks.append(' '.join(current_chunk))
+            current_chunk = [word]
+            current_length = len(word)
+        else:
+            current_chunk.append(word)
+            current_length += len(word) + 1
+
+    if current_chunk:
+        chunks.append(' '.join(current_chunk))
+
+    return chunks
+
 st.set_page_config(layout="wide")
 
 # Function to read text from uploaded file
@@ -22,12 +42,6 @@ def read_file(file):
         st.error("Unsupported file type")
         return None
 
-# Function to generate text chunks
-def chunk_text(text, max_length=128):
-    words = text.split()
-    for i in range(0, len(words), max_length):
-        yield " ".join(words[i:i + max_length])
-
 st.title("Turkish NER Models Testing")
 
 model_list = [
@@ -44,9 +58,7 @@ st.sidebar.write("For details of models: 'https://huggingface.co/akdeniz27/")
 st.sidebar.write("Only PDF, DOCX, and TXT files are supported.")
 
 # Determine aggregation strategy
-aggregation = "simple" if model_checkpoint in ["akdeniz27/xlm-roberta-base-turkish-ner",
-                                               "xlm-roberta-large-finetuned-conll03-english",
-                                               "asahi417/tner-xlm-roberta-base-ontonotes5"] else "first"
+aggregation = "simple" if model_checkpoint in ["akdeniz27/xlm-roberta-base-turkish-ner", "xlm-roberta-large-finetuned-conll03-english", "asahi417/tner-xlm-roberta-base-ontonotes5"] else "first"
 
 st.subheader("Select Text Input Method")
 input_method = st.radio("", ('Write or Paste New Text', 'Upload File'))
@@ -86,23 +98,37 @@ Run_Button = st.button("Run")
 if Run_Button and input_text:
     ner_pipeline = setModel(model_checkpoint, aggregation)
 
-    # Process the text in chunks
-    output_comb = []
-    for chunk in chunk_text(input_text):
+    # Chunk the input text
+    chunks = chunk_text(input_text)
+
+    # Process each chunk
+    all_outputs = []
+    for i, chunk in enumerate(chunks):
         output = ner_pipeline(chunk)
-        output_comb.extend(entity_comb(output))
-
+
+        # Adjust start and end positions for entities in chunks after the first
+        if i > 0:
+            offset = len(' '.join(chunks[:i])) + 1
+            for entity in output:
+                entity['start'] += offset
+                entity['end'] += offset
+
+        all_outputs.extend(output)
+
+    # Combine entities
+    output_comb = entity_comb(all_outputs)
+
     df = pd.DataFrame.from_dict(output_comb)
     cols_to_keep = ['word', 'entity_group', 'score', 'start', 'end']
     df_final = df[cols_to_keep]
 
     st.subheader("Recognized Entities")
     st.dataframe(df_final)
-
+
     # Spacy display logic
     spacy_display = {"ents": [], "text": input_text, "title": None}
     for entity in output_comb:
         spacy_display["ents"].append({"start": entity["start"], "end": entity["end"], "label": entity["entity_group"]})
-
+
     html = spacy.displacy.render(spacy_display, style="ent", minify=True, manual=True)
     st.write(html, unsafe_allow_html=True)
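
A note on the new offset arithmetic (a sanity-check sketch, not part of the commit): chunk_text splits on whitespace and rejoins with single spaces, so ' '.join(chunks) reproduces the whitespace-normalized input, and len(' '.join(chunks[:i])) + 1 is exactly where chunk i starts in that joined string. The standalone sketch below verifies the formula under that assumption; the sample input is hypothetical. One caveat: the adjusted entity positions index into the normalized text, so they can drift from the original input_text when it contains newlines or runs of spaces.

# Standalone sketch: check the offset formula against the joined text.
# chunk_text is copied from the commit; the sample input is made up.
def chunk_text(text, chunk_size=128):
    words = text.split()
    chunks, current_chunk, current_length = [], [], 0
    for word in words:
        if current_length + len(word) + 1 > chunk_size:
            chunks.append(' '.join(current_chunk))
            current_chunk = [word]
            current_length = len(word)
        else:
            current_chunk.append(word)
            current_length += len(word) + 1
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks

text = " ".join(f"token{i}" for i in range(200))  # single-spaced dummy input
chunks = chunk_text(text)
assert " ".join(chunks) == text  # rejoining chunks reproduces the input
for i, chunk in enumerate(chunks):
    offset = len(" ".join(chunks[:i])) + 1 if i > 0 else 0
    assert text[offset:offset + len(chunk)] == chunk  # offsets line up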