oliviercaron committed on
Commit
9d9b743
·
verified ·
1 Parent(s): c949917

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -173
app.py DELETED
@@ -1,173 +0,0 @@
1
- import os
2
- import csv
3
- import streamlit as st
4
- import polars as pl
5
- from io import BytesIO, StringIO
6
- from gliner import GLiNER
7
- from gliner_file import run_ner
8
- import time
9
-
10
# Configure the Streamlit page: title, icon, wide layout and an expanded sidebar.
st.set_page_config(
    page_title="GliNER",
    page_icon="🧊",
    layout="wide",
    initial_sidebar_state="expanded",
)
13
-
14
-
15
- # Modified function to load data from either an Excel or CSV file
16
@st.cache_data
def load_data(file):
    """Load an uploaded Excel or CSV file into a Polars DataFrame.

    The file type is chosen from the extension of ``file.name``. For CSV
    files the whole content is decoded once (UTF-8 first, latin1 as a
    fallback) and the delimiter is sniffed from the first 4096 characters.

    Args:
        file: A binary file-like object exposing ``name``, ``seek`` and
            ``read`` (``read`` must return ``bytes``).

    Returns:
        The parsed ``polars.DataFrame``.

    Raises:
        ValueError: If the extension is not ``.xls``, ``.xlsx`` or ``.csv``,
            or if the CSV file contains no data.
    """
    _, file_ext = os.path.splitext(file.name)
    file_ext = file_ext.lower()

    if file_ext in (".xls", ".xlsx"):
        return pl.read_excel(file)

    if file_ext != ".csv":
        raise ValueError("The uploaded file must be a CSV or Excel file.")

    file.seek(0)  # Go back to the beginning of the file
    raw = file.read()

    # Decode the complete payload rather than a fixed-size byte sample: a
    # 4096-byte slice can end in the middle of a multi-byte UTF-8 character
    # and would wrongly push a valid UTF-8 file onto the latin1 path.
    # "utf-8-sig" also drops a leading BOM so it does not end up in the
    # first column name.
    try:
        text = raw.decode("utf-8-sig")
    except UnicodeDecodeError:
        text = raw.decode("latin1")  # latin1 can decode any byte sequence

    if not text.strip():
        raise ValueError("The uploaded CSV file is empty.")

    # The sniffer raises csv.Error when it cannot decide (for example on a
    # single-column file); fall back to a comma in that case.
    try:
        delimiter = csv.Sniffer().sniff(text[:4096]).delimiter
    except csv.Error:
        delimiter = ","

    return pl.read_csv(
        StringIO(text),
        separator=delimiter,
        truncate_ragged_lines=True,
        ignore_errors=True,
    )
52
-
53
-
54
- # Function to perform NER and update the UI
55
def perform_ner(filtered_df, selected_column, labels_list):
    """Run NER on one column row by row and append one column per label.

    A Streamlit progress bar and a status line are updated after each row.
    Processing stops early when ``st.session_state.stop_processing`` is set.

    Args:
        filtered_df: Polars DataFrame holding the rows to analyse.
        selected_column: Name of the column whose values are sent to the model.
        labels_list: Entity labels to extract; duplicates are collapsed.

    Returns:
        ``filtered_df`` with one extra string column per label. Each cell
        holds the entities found for that row joined by ``", "``. Rows that
        were not processed because the run was stopped hold null.
    """
    # Dict keys de-duplicate the labels, so each result list receives
    # exactly one entry per processed row.
    ner_results_dict = {label: [] for label in labels_list}
    total_rows = filtered_df.height

    progress_bar = st.progress(0)
    progress_text = st.empty()

    start_time = time.time()  # Record start time for total runtime
    stopped = False

    # Iterate over the Polars column directly. Going through
    # pandas.itertuples() and getattr() breaks for column names that are not
    # valid Python identifiers, because pandas renames those fields.
    for index, text_to_analyze in enumerate(filtered_df[selected_column], 1):
        iteration_start_time = time.time()  # Start time for this iteration

        if st.session_state.get("stop_processing", False):
            stopped = True
            break

        if text_to_analyze is None:
            # Null cell: nothing to analyse, record empty results.
            ner_results = {}
        else:
            ner_results = run_ner(
                st.session_state.gliner_model, text_to_analyze, labels_list
            )

        for label, collected in ner_results_dict.items():
            collected.append(", ".join(ner_results.get(label, [])))

        progress = index / total_rows
        progress_bar.progress(progress)

        iteration_time = time.time() - iteration_start_time
        total_time = time.time() - start_time  # Total elapsed time so far

        progress_text.text(
            f"Progress: {index}/{total_rows} - {progress * 100:.0f}% (Iteration: {iteration_time:.2f}s, Total: {total_time:.2f}s)"
        )

    total_execution_time = time.time() - start_time

    if stopped:
        # Keep the stop notice visible instead of overwriting it with the
        # completion message.
        progress_text.text("Process stopped by the user.")
    else:
        progress_text.text(
            f"Processing complete! Total execution time: {total_execution_time:.2f}s"
        )

    for label, texts in ner_results_dict.items():
        # Pad to the frame height so with_columns() does not fail on a
        # length mismatch after an early stop.
        padded = texts + [None] * (total_rows - len(texts))
        filtered_df = filtered_df.with_columns(
            pl.Series(name=label, values=padded, dtype=pl.Utf8)
        )

    return filtered_df
103
-
104
-
105
def main():
    """Render the Streamlit UI: upload a file, filter rows, run NER, export.

    Flow: initialise the stop flag, load the uploaded file, let the user pick
    a column, an optional text filter and a comma-separated label list, then
    on "Start NER" load the GLiNER model once per session, run the
    extraction and offer the result as an Excel download.
    """
    st.title("Online NER with GliNER")
    st.markdown("Prototype v0.1")

    # Ensure the stop_processing flag is initialized
    if "stop_processing" not in st.session_state:
        st.session_state.stop_processing = False

    uploaded_file = st.sidebar.file_uploader("Choose a file")
    if uploaded_file is None:
        st.warning("Please upload a file.")
        return

    try:
        df = load_data(uploaded_file)
    except ValueError as e:
        st.error(str(e))
        return

    selected_column = st.selectbox("Select the column for NER:", df.columns, index=0)
    filter_text = st.text_input("Filter column by input text", "")
    ner_labels = st.text_input(
        "Enter all your different labels, separated by a comma", ""
    )

    if filter_text:
        # Case-insensitive *literal* substring match. Interpolating raw user
        # input into a regex fails on characters such as "(" or "[".
        filtered_df = df.filter(
            pl.col(selected_column)
            .str.to_lowercase()
            .str.contains(filter_text.lower(), literal=True)
        )
    else:
        filtered_df = df
    st.dataframe(filtered_df)

    if st.button("Start NER"):
        # Trim whitespace, drop empty entries and de-duplicate (order kept),
        # so "person, city," yields ["person", "city"].
        labels_list = list(
            dict.fromkeys(
                label.strip() for label in ner_labels.split(",") if label.strip()
            )
        )

        if not labels_list:
            st.warning("Please enter some labels for NER.")
        else:
            # Load GLiNER model if not already loaded
            if "gliner_model" not in st.session_state:
                with st.spinner("Loading GLiNER model... Please wait."):
                    st.session_state.gliner_model = GLiNER.from_pretrained(
                        "urchade/gliner_largev2"
                    )
                    st.session_state.gliner_model.eval()

            # Clear a stop request left over from an earlier run; otherwise
            # every run after the first "Stop Processing" click would abort
            # immediately.
            st.session_state.stop_processing = False

            updated_df = perform_ner(filtered_df, selected_column, labels_list)
            st.dataframe(updated_df)

            def to_excel(df):
                """Serialise a Polars DataFrame to .xlsx bytes in memory."""
                output = BytesIO()
                df.to_pandas().to_excel(output, index=False, engine="openpyxl")
                return output.getvalue()

            df_excel = to_excel(updated_df)
            st.download_button(
                label="📥 Download Excel",
                data=df_excel,
                file_name="ner_results.xlsx",
                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            )

    st.button(
        "Stop Processing",
        on_click=lambda: setattr(st.session_state, "stop_processing", True),
    )
170
-
171
-
172
# Launch the app when this module is executed directly.
if __name__ == "__main__":
    main()