Spaces:
Running
Running
Olivier CARON
committed on
Update app.py
Browse files
app.py
CHANGED
@@ -7,36 +7,46 @@ from gliner import GLiNER
|
|
7 |
from gliner_file import run_ner
|
8 |
import time
|
9 |
|
10 |
-
st.set_page_config(
|
|
|
|
|
|
|
11 |
|
12 |
# Modified function to load data from either an Excel or CSV file
|
13 |
@st.cache_data
|
14 |
def load_data(file):
|
15 |
_, file_ext = os.path.splitext(file.name)
|
16 |
-
if file_ext.lower() in [
|
17 |
return pl.read_excel(file)
|
18 |
-
elif file_ext.lower() ==
|
19 |
file.seek(0) # Go back to the beginning of the file
|
20 |
try:
|
21 |
-
sample = file.read(4096).decode(
|
22 |
-
|
|
|
|
|
23 |
except UnicodeDecodeError:
|
24 |
-
encoding =
|
25 |
file.seek(0)
|
26 |
sample = file.read(4096).decode(encoding)
|
27 |
-
|
28 |
file.seek(0)
|
29 |
dialect = csv.Sniffer().sniff(sample) # Detect the delimiter
|
30 |
|
31 |
file.seek(0)
|
32 |
-
if encoding !=
|
33 |
file_content = file.read().decode(encoding)
|
34 |
file = StringIO(file_content)
|
35 |
else:
|
36 |
-
file_content = file.read().decode(
|
37 |
file = StringIO(file_content)
|
38 |
-
|
39 |
-
return pl.read_csv(
|
|
|
|
|
|
|
|
|
|
|
40 |
else:
|
41 |
raise ValueError("The uploaded file must be a CSV or Excel file.")
|
42 |
|
@@ -44,51 +54,60 @@ def load_data(file):
|
|
44 |
# Function to perform NER and update the UI
|
45 |
def perform_ner(filtered_df, selected_column, labels_list):
|
46 |
ner_results_dict = {label: [] for label in labels_list}
|
47 |
-
|
48 |
progress_bar = st.progress(0)
|
49 |
progress_text = st.empty()
|
50 |
-
|
51 |
start_time = time.time() # Record start time for total runtime
|
52 |
|
53 |
for index, row in enumerate(filtered_df.to_pandas().itertuples(), 1):
|
54 |
iteration_start_time = time.time() # Start time for this iteration
|
55 |
-
|
56 |
if st.session_state.stop_processing:
|
57 |
progress_text.text("Process stopped by the user.")
|
58 |
break
|
59 |
|
60 |
text_to_analyze = getattr(row, selected_column)
|
61 |
-
ner_results = run_ner(
|
|
|
|
|
62 |
|
63 |
for label in labels_list:
|
64 |
texts = ner_results.get(label, [])
|
65 |
-
concatenated_texts =
|
66 |
ner_results_dict[label].append(concatenated_texts)
|
67 |
|
68 |
progress = index / filtered_df.height
|
69 |
progress_bar.progress(progress)
|
70 |
-
|
71 |
-
iteration_time =
|
|
|
|
|
72 |
total_time = time.time() - start_time # Calculate total elapsed time so far
|
73 |
-
|
74 |
-
progress_text.text(
|
|
|
|
|
75 |
|
76 |
end_time = time.time() # Record end time
|
77 |
total_execution_time = end_time - start_time # Calculate total runtime
|
78 |
-
|
79 |
-
progress_text.text(
|
80 |
-
|
|
|
|
|
81 |
for label, texts in ner_results_dict.items():
|
82 |
filtered_df = filtered_df.with_columns(pl.Series(name=label, values=texts))
|
83 |
|
84 |
return filtered_df
|
85 |
|
|
|
86 |
def main():
|
87 |
st.title("Online NER with GliNER")
|
88 |
st.markdown("Prototype v0.1")
|
89 |
|
90 |
# Ensure the stop_processing flag is initialized
|
91 |
-
if
|
92 |
st.session_state.stop_processing = False
|
93 |
|
94 |
uploaded_file = st.sidebar.file_uploader("Choose a file")
|
@@ -104,9 +123,15 @@ def main():
|
|
104 |
|
105 |
selected_column = st.selectbox("Select the column for NER:", df.columns, index=0)
|
106 |
filter_text = st.text_input("Filter column by input text", "")
|
107 |
-
ner_labels = st.text_input(
|
108 |
-
|
109 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
110 |
st.dataframe(filtered_df)
|
111 |
|
112 |
if st.button("Start NER"):
|
@@ -114,27 +139,35 @@ def main():
|
|
114 |
st.warning("Please enter some labels for NER.")
|
115 |
else:
|
116 |
# Load GLiNER model if not already loaded
|
117 |
-
if
|
118 |
-
with st.spinner(
|
119 |
-
st.session_state.gliner_model = GLiNER.from_pretrained(
|
|
|
|
|
120 |
st.session_state.gliner_model.eval()
|
121 |
-
|
122 |
labels_list = ner_labels.split(",")
|
123 |
updated_df = perform_ner(filtered_df, selected_column, labels_list)
|
124 |
st.dataframe(updated_df)
|
125 |
|
126 |
def to_excel(df):
|
127 |
output = BytesIO()
|
128 |
-
df.to_pandas().to_excel(output, index=False, engine=
|
129 |
return output.getvalue()
|
130 |
|
131 |
df_excel = to_excel(updated_df)
|
132 |
-
st.download_button(
|
133 |
-
|
134 |
-
|
135 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
136 |
|
137 |
-
st.button("Stop Processing", on_click=lambda: setattr(st.session_state, 'stop_processing', True))
|
138 |
|
139 |
if __name__ == "__main__":
|
140 |
main()
|
|
|
7 |
from gliner_file import run_ner
|
8 |
import time
|
9 |
|
10 |
+
# Set the page title, icon, layout and initial sidebar state for the app.
# The icon literal was mis-decoded (UTF-8 bytes read as a single-byte Greek
# code page); it is restored here to the emoji those bytes encode.
st.set_page_config(
    page_title="GliNER",
    page_icon="🧐",
    layout="wide",
    initial_sidebar_state="expanded",
)
|
13 |
+
|
14 |
|
15 |
# Modified function to load data from either an Excel or CSV file
|
16 |
@st.cache_data
def load_data(file):
    """Load an uploaded Excel or CSV file into a Polars DataFrame.

    Args:
        file: Uploaded file object. It must expose ``name`` (used to pick the
            parser from the extension) and, for CSV input, ``seek`` and a
            ``read`` that returns bytes.

    Returns:
        The DataFrame produced by ``pl.read_excel`` for ``.xls``/``.xlsx``
        files or by ``pl.read_csv`` for ``.csv`` files.

    Raises:
        ValueError: If the extension is not ``.xls``, ``.xlsx`` or ``.csv``.
    """
    _, file_ext = os.path.splitext(file.name)
    file_ext = file_ext.lower()

    if file_ext in (".xls", ".xlsx"):
        return pl.read_excel(file)

    if file_ext != ".csv":
        raise ValueError("The uploaded file must be a CSV or Excel file.")

    file.seek(0)  # Go back to the beginning of the file
    raw_bytes = file.read()

    # Decode the whole payload in one go. Decoding only a fixed-size byte
    # prefix can cut a multi-byte UTF-8 character in half (making a valid
    # UTF-8 file look like latin1), and it cannot detect invalid bytes that
    # appear after the prefix.
    try:
        file_content = raw_bytes.decode("utf-8")
    except UnicodeDecodeError:
        # latin1 maps every byte to a character, so this cannot fail.
        file_content = raw_bytes.decode("latin1")

    # Detect the delimiter from the start of the text. The sniffer raises
    # csv.Error when it cannot decide (e.g. a single-column file), in which
    # case fall back to a comma.
    try:
        delimiter = csv.Sniffer().sniff(file_content[:4096]).delimiter
    except csv.Error:
        delimiter = ","

    return pl.read_csv(
        StringIO(file_content),
        separator=delimiter,
        truncate_ragged_lines=True,
        ignore_errors=True,
    )
|
52 |
|
|
|
54 |
# Function to perform NER and update the UI
|
55 |
def perform_ner(filtered_df, selected_column, labels_list):
    """Run NER on every row of one column and append one result column per label.

    For each value in ``selected_column`` this calls ``run_ner`` with the
    model stored in ``st.session_state.gliner_model`` and joins the texts
    returned for each label with ``", "``. A Streamlit progress bar and a
    status line are updated after every row. Processing stops early when
    ``st.session_state.stop_processing`` is truthy.

    Args:
        filtered_df: Polars DataFrame holding the rows to analyze.
        selected_column: Name of the column whose values are passed to
            ``run_ner``.
        labels_list: Labels to extract; each becomes a new column name.

    Returns:
        ``filtered_df`` with one added column per label. Rows that were not
        processed because the user stopped the run hold null in those columns.
    """
    ner_results_dict = {label: [] for label in labels_list}

    progress_bar = st.progress(0)
    progress_text = st.empty()

    total_rows = filtered_df.height
    stopped = False
    start_time = time.time()  # Record start time for total runtime

    # Iterate the Polars column directly instead of pandas ``itertuples()``:
    # itertuples renames columns that are not valid Python identifiers
    # (spaces, leading underscore, ...), so attribute lookup by the original
    # column name fails for such columns.
    column_values = filtered_df.get_column(selected_column)
    for index, text_to_analyze in enumerate(column_values, 1):
        iteration_start_time = time.time()  # Start time for this iteration

        if st.session_state.stop_processing:
            progress_text.text("Process stopped by the user.")
            stopped = True
            break

        # The result is used as a mapping from label to a list of strings.
        ner_results = run_ner(
            st.session_state.gliner_model, text_to_analyze, labels_list
        )

        for label in labels_list:
            texts = ner_results.get(label, [])
            ner_results_dict[label].append(", ".join(texts))

        progress = index / total_rows
        progress_bar.progress(progress)

        iteration_time = time.time() - iteration_start_time
        total_time = time.time() - start_time  # Total elapsed time so far

        progress_text.text(
            f"Progress: {index}/{total_rows} - {progress * 100:.0f}% (Iteration: {iteration_time:.2f}s, Total: {total_time:.2f}s)"
        )

    # Only report completion when the loop actually finished; otherwise the
    # "stopped by the user" message would be overwritten immediately.
    if not stopped:
        total_execution_time = time.time() - start_time
        progress_text.text(
            f"Processing complete! Total execution time: {total_execution_time:.2f}s"
        )

    for label, texts in ner_results_dict.items():
        # After an early stop the lists are shorter than the frame; pad with
        # nulls so every new column matches the frame height.
        padded = texts + [None] * (total_rows - len(texts))
        filtered_df = filtered_df.with_columns(
            pl.Series(name=label, values=padded, dtype=pl.Utf8)
        )

    return filtered_df
|
103 |
|
104 |
+
|
105 |
def main():
|
106 |
st.title("Online NER with GliNER")
|
107 |
st.markdown("Prototype v0.1")
|
108 |
|
109 |
# Ensure the stop_processing flag is initialized
|
110 |
+
if "stop_processing" not in st.session_state:
|
111 |
st.session_state.stop_processing = False
|
112 |
|
113 |
uploaded_file = st.sidebar.file_uploader("Choose a file")
|
|
|
123 |
|
124 |
selected_column = st.selectbox("Select the column for NER:", df.columns, index=0)
|
125 |
filter_text = st.text_input("Filter column by input text", "")
|
126 |
+
ner_labels = st.text_input(
|
127 |
+
"Enter all your different labels, separated by a comma", ""
|
128 |
+
)
|
129 |
+
|
130 |
+
filtered_df = (
|
131 |
+
df.filter(pl.col(selected_column).str.contains(f"(?i).*{filter_text}.*"))
|
132 |
+
if filter_text
|
133 |
+
else df
|
134 |
+
)
|
135 |
st.dataframe(filtered_df)
|
136 |
|
137 |
if st.button("Start NER"):
|
|
|
139 |
st.warning("Please enter some labels for NER.")
|
140 |
else:
|
141 |
# Load GLiNER model if not already loaded
|
142 |
+
if "gliner_model" not in st.session_state:
|
143 |
+
with st.spinner("Loading GLiNER model... Please wait."):
|
144 |
+
st.session_state.gliner_model = GLiNER.from_pretrained(
|
145 |
+
"urchade/gliner_largev2"
|
146 |
+
)
|
147 |
st.session_state.gliner_model.eval()
|
148 |
+
|
149 |
labels_list = ner_labels.split(",")
|
150 |
updated_df = perform_ner(filtered_df, selected_column, labels_list)
|
151 |
st.dataframe(updated_df)
|
152 |
|
153 |
def to_excel(df):
|
154 |
output = BytesIO()
|
155 |
+
df.to_pandas().to_excel(output, index=False, engine="openpyxl")
|
156 |
return output.getvalue()
|
157 |
|
158 |
df_excel = to_excel(updated_df)
|
159 |
+
st.download_button(
|
160 |
+
label="π₯ Download Excel",
|
161 |
+
data=df_excel,
|
162 |
+
file_name="ner_results.xlsx",
|
163 |
+
mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
164 |
+
)
|
165 |
+
|
166 |
+
st.button(
|
167 |
+
"Stop Processing",
|
168 |
+
on_click=lambda: setattr(st.session_state, "stop_processing", True),
|
169 |
+
)
|
170 |
|
|
|
171 |
|
172 |
if __name__ == "__main__":
|
173 |
main()
|