Archisman Karmakar committed on
Commit
4a97943
·
unverified ·
1 Parent(s): 179dfdb

stage1 deberta for seq classification hft model

Browse files
app_main_hf.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ import sys
4
+ import joblib
5
+ import importlib.util
6
+
7
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), )))
8
+
9
+ from dashboard import show_dashboard
10
+ from sentiment_analysis.sentiment_analysis import show_sentiment_analysis
11
+ from emotion_analysis import show_emotion_analysis
12
+ # from text_transformation import show_text_transformation
13
+
14
+
15
def main():
    """Sidebar-driven router: render the page for whichever stage is selected."""
    st.sidebar.title("Navigation")
    pages = [
        "Dashboard",
        "Stage 1: Sentiment Polarity Analysis",
        "Stage 2: Emotion Mood-tag Analysis",
        "Stage 3: Text Transformation & Normalization",
    ]
    selection = st.sidebar.radio("Go to", pages)

    if selection == pages[0]:
        show_dashboard()
    elif selection == pages[1]:
        show_sentiment_analysis()
    elif selection == pages[2]:
        # show_emotion_analysis() is not wired up yet.
        st.write("This section is under development.")
    elif selection == pages[3]:
        # show_text_transformation() is not wired up yet.
        st.write("This section is under development.")

    st.sidebar.title("About")
    st.sidebar.info("""
    **Contributors:**
    - Archisman Karmakar
      - [LinkedIn](https://www.linkedin.com/in/archismankarmakar/)
      - [GitHub](https://www.github.com/ArchismanKarmakar)
      - [Kaggle](https://www.kaggle.com/archismancoder)
    - Sumon Chatterjee
      - [LinkedIn](https://www.linkedin.com/in/sumon-chatterjee-3b3b43227)
      - [GitHub](https://github.com/Sumon670)
      - [Kaggle](https://www.kaggle.com/sumonchatterjee)

    **Mentors:**
    - Prof. Anupam Mondal
      - [LinkedIn](https://www.linkedin.com/in/anupam-mondal-ph-d-8a7a1a39/)
      - [Google Scholar](https://scholar.google.com/citations?user=ESRR9o4AAAAJ&hl=en)
      - [Website](https://sites.google.com/view/anupammondal/home)
    - Prof. Sainik Kumar Mahata
      - [LinkedIn](https://www.linkedin.com/in/mahatasainikk)
      - [Google Scholar](https://scholar.google.co.in/citations?user=OcJDM50AAAAJ&hl=en)
      - [Website](https://sites.google.com/view/sainik-kumar-mahata/home)

    This is our research project for our B.Tech final year and a journal which is yet to be published.
    """)


if __name__ == "__main__":
    main()
dashboard.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
def show_dashboard():
    """Landing page: project title plus a short overview of the three stages."""
    st.title("Tachygraphy Micro-text Analysis & Normalization")
    st.write("""
    Welcome to the Tachygraphy Micro-text Analysis & Normalization Project. This application is designed to analyze text data through three stages:
    1. Sentiment Polarity Analysis
    2. Emotion Mood-tag Analysis
    3. Text Transformation & Normalization
    """)
emotion_analysis.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
def show_emotion_analysis():
    """Stage 2 page: placeholder UI for the emotion mood-tag detector."""
    st.title("Stage 2: Emotion Mood-tag Analysis")
    st.write("This section will handle emotion detection.")
    # Add your emotion detection code here


if __name__ == "__main__":
    show_emotion_analysis()
imports.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel, DebertaV2Tokenizer, DebertaV2ForSequenceClassification
3
+ import torch
4
+ import numpy as np
5
+ import matplotlib.pyplot as plt
6
+ import plotly.express as px
7
+ import pandas as pd
8
+ import json
9
+ import gc
10
+ import psutil
11
+ import os
12
+ import importlib.util
13
+ import sys
requirements.txt ADDED
@@ -0,0 +1,823 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==1.4.0
2
+ accelerate==1.2.1
3
+ aiofiles==22.1.0
4
+ aiohappyeyeballs==2.4.6
5
+ aiohttp==3.11.12
6
+ aiosignal==1.3.2
7
+ aiosqlite==0.21.0
8
+ alabaster==1.0.0
9
+ albucore==0.0.19
10
+ albumentations==1.4.20
11
+ alembic==1.14.1
12
+ altair==5.5.0
13
+ annotated-types==0.7.0
14
+ annoy==1.17.3
15
+ ansicolors==1.1.8
16
+ antlr4-python3-runtime==4.9.3
17
+ anyio==3.7.1
18
+ argon2-cffi==23.1.0
19
+ argon2-cffi-bindings==21.2.0
20
+ args==0.1.0
21
+ array_record==0.5.1
22
+ arrow==1.3.0
23
+ arviz==0.20.0
24
+ astropy==6.1.7
25
+ astropy-iers-data==0.2024.12.16.0.35.48
26
+ asttokens==3.0.0
27
+ astunparse==1.6.3
28
+ async-timeout==5.0.1
29
+ atpublic==4.1.0
30
+ attrs==25.1.0
31
+ audioread==3.0.1
32
+ autograd==1.7.0
33
+ babel==2.16.0
34
+ backcall==0.2.0
35
+ bayesian-optimization==2.0.3
36
+ beautifulsoup4==4.12.3
37
+ betterproto==2.0.0b6
38
+ bigframes==1.29.0
39
+ bigquery-magics==0.4.0
40
+ bleach==6.2.0
41
+ blinker==1.9.0
42
+ blis==0.7.11
43
+ blobfile==3.0.0
44
+ blosc2==2.7.1
45
+ bokeh==3.6.2
46
+ Boruta==0.4.3
47
+ boto3==1.36.23
48
+ botocore==1.36.23
49
+ Bottleneck==1.4.2
50
+ -e git+https://github.com/SohierDane/BigQuery_Helper@8615a7f6c1663e7f2d48aa2b32c2dbcb600a440f#egg=bq_helper
51
+ bqplot==0.12.43
52
+ branca==0.8.1
53
+ bs4==0.0.2
54
+ CacheControl==0.14.1
55
+ cachetools==5.5.0
56
+ Cartopy==0.24.1
57
+ catalogue==2.0.10
58
+ catboost==1.2.7
59
+ category_encoders==2.7.0
60
+ certifi==2025.1.31
61
+ cesium==0.12.1
62
+ cffi==1.17.1
63
+ chardet==5.2.0
64
+ charset-normalizer==3.4.1
65
+ Chessnut==0.4.1
66
+ chex==0.1.88
67
+ clarabel==0.9.0
68
+ click==8.1.7
69
+ click-plugins==1.1.1
70
+ cligj==0.7.2
71
+ clint==0.5.1
72
+ cloudpathlib==0.20.0
73
+ cloudpickle==3.1.0
74
+ cmake==3.31.2
75
+ cmdstanpy==1.2.5
76
+ colorama==0.4.6
77
+ colorcet==3.1.0
78
+ colorlog==6.9.0
79
+ colorlover==0.3.0
80
+ colour==0.1.5
81
+ comm==0.2.2
82
+ community==1.0.0b1
83
+ confection==0.1.5
84
+ cons==0.4.6
85
+ contourpy==1.3.1
86
+ coverage==7.6.12
87
+ cryptography==44.0.1
88
+ cuda-bindings==12.8.0
89
+ cuda-python==12.8.0
90
+ cudf-cu12==25.2.0
91
+ cufflinks==0.17.3
92
+ cuml-cu12==25.2.0
93
+ cupy-cuda12x==12.2.0
94
+ cuvs-cu12==25.2.0
95
+ cvxopt==1.3.2
96
+ cvxpy==1.6.0
97
+ cycler==0.12.1
98
+ cymem==2.0.10
99
+ Cython==3.0.11
100
+ cytoolz==1.0.1
101
+ daal==2025.2.0
102
+ dacite==1.9.2
103
+ dask==2024.12.1
104
+ dask-cuda==25.2.0
105
+ dask-cudf-cu12==25.2.0
106
+ dask-expr==1.1.21
107
+ dataclasses-json==0.6.7
108
+ datascience==0.17.6
109
+ datasets==3.3.1
110
+ datashader==0.17.0
111
+ db-dtypes==1.3.1
112
+ dbus-python==1.2.18
113
+ deap==1.4.2
114
+ debugpy==1.8.0
115
+ decorator==4.4.2
116
+ deepdiff==8.2.0
117
+ deepspeed==0.16.4
118
+ defusedxml==0.7.1
119
+ Deprecated==1.2.15
120
+ diffusers==0.31.0
121
+ dill==0.3.8
122
+ dipy==1.10.0
123
+ distributed==2024.12.1
124
+ distributed-ucxx-cu12==0.42.0
125
+ distro==1.9.0
126
+ dlib==19.24.2
127
+ dm-tree==0.1.8
128
+ dnspython==2.7.0
129
+ docker==7.1.0
130
+ docker-pycreds==0.4.0
131
+ docstring-to-markdown==0.15
132
+ docstring_parser==0.16
133
+ docutils==0.21.2
134
+ dopamine_rl==4.1.0
135
+ duckdb==1.1.3
136
+ earthengine-api==1.4.3
137
+ easydict==1.13
138
+ easyocr==1.7.2
139
+ editdistance==0.8.1
140
+ eerepr==0.0.4
141
+ einops==0.8.0
142
+ eli5==0.13.0
143
+ email_validator==2.2.0
144
+ emoji==2.14.1
145
+ en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl#sha256=86cc141f63942d4b2c5fcee06630fd6f904788d2f0ab005cce45aadb8fb73889
146
+ entrypoints==0.4
147
+ et_xmlfile==2.0.0
148
+ etils==1.11.0
149
+ etuples==0.3.9
150
+ eval_type_backport==0.2.0
151
+ evaluate==0.4.3
152
+ exceptiongroup==1.2.2
153
+ execnb==0.1.11
154
+ Farama-Notifications==0.0.4
155
+ fastai==2.7.18
156
+ fastcore==1.7.27
157
+ fastdownload==0.0.7
158
+ fastjsonschema==2.21.1
159
+ fastprogress==1.0.3
160
+ fastrlock==0.8.2
161
+ fasttext==0.9.3
162
+ featuretools==1.31.0
163
+ filelock==3.17.0
164
+ fiona==1.10.1
165
+ firebase-admin==6.6.0
166
+ Flask==3.1.0
167
+ flatbuffers==24.3.25
168
+ flax==0.8.5
169
+ folium==0.19.2
170
+ fonttools==4.55.3
171
+ fqdn==1.5.1
172
+ frozendict==2.4.6
173
+ frozenlist==1.5.0
174
+ fsspec==2024.12.0
175
+ funcy==2.0
176
+ fury==0.12.0
177
+ future==1.0.0
178
+ fuzzywuzzy==0.18.0
179
+ gast==0.6.0
180
+ gatspy==0.3
181
+ gcsfs==2024.10.0
182
+ GDAL==3.6.4
183
+ gdown==5.2.0
184
+ geemap==0.35.1
185
+ gensim==4.3.3
186
+ geocoder==1.38.1
187
+ geographiclib==2.0
188
+ geojson==3.2.0
189
+ geopandas==0.14.4
190
+ geopy==2.4.1
191
+ ghapi==1.0.6
192
+ gin-config==0.5.0
193
+ gitdb==4.0.11
194
+ GitPython==3.1.43
195
+ glob2==0.7
196
+ google==2.0.3
197
+ google-ai-generativelanguage==0.6.10
198
+ google-api-core==1.34.1
199
+ google-api-python-client==2.155.0
200
+ google-auth==2.27.0
201
+ google-auth-httplib2==0.2.0
202
+ google-auth-oauthlib==1.2.1
203
+ google-cloud-aiplatform==1.74.0
204
+ google-cloud-automl==1.0.1
205
+ google-cloud-bigquery==3.25.0
206
+ google-cloud-bigquery-connection==1.17.0
207
+ google-cloud-bigtable==2.27.0
208
+ google-cloud-core==2.4.1
209
+ google-cloud-datastore==2.20.2
210
+ google-cloud-firestore==2.19.0
211
+ google-cloud-functions==1.19.0
212
+ google-cloud-iam==2.17.0
213
+ google-cloud-language==2.16.0
214
+ google-cloud-pubsub==2.27.1
215
+ google-cloud-resource-manager==1.14.0
216
+ google-cloud-storage==2.14.0
217
+ google-cloud-translate==3.12.1
218
+ google-cloud-videointelligence==2.16.0
219
+ google-cloud-vision==3.10.0
220
+ google-colab @ file:///colabtools/dist/google_colab-1.0.0.tar.gz
221
+ google-crc32c==1.6.0
222
+ google-genai==0.2.2
223
+ google-generativeai==0.8.3
224
+ google-pasta==0.2.0
225
+ google-resumable-media==2.7.2
226
+ googleapis-common-protos==1.66.0
227
+ googledrivedownloader==0.4
228
+ gpxpy==1.6.2
229
+ graphviz==0.20.3
230
+ greenlet==3.1.1
231
+ grpc-google-iam-v1==0.13.1
232
+ grpcio==1.68.1
233
+ grpcio-status==1.48.2
234
+ grpclib==0.4.8rc2
235
+ gspread==6.0.2
236
+ gspread-dataframe==3.3.1
237
+ gym==0.25.2
238
+ gym-notices==0.0.8
239
+ gymnasium==0.29.0
240
+ h11==0.14.0
241
+ h2==4.2.0
242
+ h2o==3.46.0.6
243
+ h5netcdf==1.4.1
244
+ h5py==3.12.1
245
+ haversine==2.9.0
246
+ hep_ml==0.7.3
247
+ hf_transfer==0.1.9
248
+ hjson==3.1.0
249
+ holidays==0.63
250
+ holoviews==1.20.0
251
+ hpack==4.1.0
252
+ html5lib==1.1
253
+ htmlmin==0.1.12
254
+ httpcore==1.0.7
255
+ httpimport==1.4.0
256
+ httplib2==0.22.0
257
+ httpx==0.28.1
258
+ huggingface-hub==0.29.0
259
+ humanize==4.11.0
260
+ hyperframe==6.1.0
261
+ hyperopt==0.2.7
262
+ ibis-framework==9.2.0
263
+ id==1.5.0
264
+ idna==3.10
265
+ igraph==0.11.8
266
+ ImageHash==4.3.1
267
+ imageio==2.36.1
268
+ imageio-ffmpeg==0.5.1
269
+ imagesize==1.4.1
270
+ imbalanced-learn==0.12.4
271
+ imgaug==0.4.0
272
+ immutabledict==4.2.1
273
+ importlib-resources==5.13.0
274
+ importlib_metadata==8.5.0
275
+ imutils==0.5.4
276
+ in-toto-attestation==0.9.3
277
+ inflect==7.4.0
278
+ iniconfig==2.0.0
279
+ intel-cmplr-lib-rt==2024.2.0
280
+ intel-cmplr-lib-ur==2024.2.0
281
+ intel-openmp==2024.2.0
282
+ ipyevents==2.0.2
283
+ ipyfilechooser==0.6.0
284
+ ipykernel==5.5.6
285
+ ipyleaflet==0.19.2
286
+ ipympl==0.9.6
287
+ ipyparallel==8.8.0
288
+ ipython==7.34.0
289
+ ipython-genutils==0.2.0
290
+ ipython-sql==0.5.0
291
+ ipytree==0.2.2
292
+ ipywidgets==8.1.5
293
+ isoduration==20.11.0
294
+ isoweek==1.3.3
295
+ itsdangerous==2.2.0
296
+ Janome==0.5.0
297
+ jax==0.4.33
298
+ jax-cuda12-pjrt==0.4.33
299
+ jax-cuda12-plugin==0.4.33
300
+ jaxlib==0.4.33
301
+ jedi==0.19.2
302
+ jeepney==0.7.1
303
+ jellyfish==1.1.0
304
+ jieba==0.42.1
305
+ Jinja2==3.1.4
306
+ jiter==0.8.2
307
+ jmespath==1.0.1
308
+ joblib==1.4.2
309
+ json5==0.10.0
310
+ jsonpatch==1.33
311
+ jsonpickle==4.0.1
312
+ jsonpointer==3.0.0
313
+ jsonschema==4.23.0
314
+ jsonschema-specifications==2024.10.1
315
+ jupyter-console==6.1.0
316
+ jupyter-events==0.12.0
317
+ jupyter-leaflet==0.19.2
318
+ jupyter-lsp==1.5.1
319
+ jupyter-ydoc==0.2.5
320
+ jupyter_client==8.6.3
321
+ jupyter_core==5.7.2
322
+ jupyter_server==2.12.5
323
+ jupyter_server_fileid==0.9.3
324
+ jupyter_server_terminals==0.5.3
325
+ jupyter_server_ydoc==0.8.0
326
+ jupyterlab==3.6.8
327
+ jupyterlab-lsp==3.10.2
328
+ jupyterlab_pygments==0.3.0
329
+ jupyterlab_server==2.27.3
330
+ jupyterlab_widgets==3.0.13
331
+ kaggle==1.6.17
332
+ kaggle-environments==1.16.11
333
+ kagglehub==0.3.9
334
+ keras==3.5.0
335
+ keras-core==0.1.7
336
+ keras-cv==0.9.0
337
+ keras-hub==0.18.1
338
+ keras-nlp==0.18.1
339
+ keras-tuner==1.4.7
340
+ keyring==23.5.0
341
+ kiwisolver==1.4.7
342
+ kornia==0.8.0
343
+ kornia_rs==0.1.8
344
+ kt-legacy==1.0.5
345
+ langchain==0.3.12
346
+ langchain-core==0.3.25
347
+ langchain-text-splitters==0.3.3
348
+ langcodes==3.5.0
349
+ langid==1.1.6
350
+ langsmith==0.2.3
351
+ language_data==1.3.0
352
+ launchpadlib==1.10.16
353
+ lazr.restfulclient==0.14.4
354
+ lazr.uri==1.0.6
355
+ lazy_loader==0.4
356
+ learntools @ git+https://github.com/Kaggle/learntools@010e3b5035354e15c073a0aca9e202c2e2beb742
357
+ leven==1.0.4
358
+ libclang==18.1.1
359
+ libcudf-cu12==25.2.0
360
+ libcuml-cu12==25.2.0
361
+ libcuvs-cu12==25.2.0
362
+ libkvikio-cu12==25.2.0
363
+ libpysal==4.9.2
364
+ libraft-cu12==25.2.0
365
+ librosa==0.10.2.post1
366
+ libucx-cu12==1.18.0
367
+ libucxx-cu12==0.42.0
368
+ lightgbm @ file:///tmp/lightgbm/lightgbm-4.5.0-py3-none-linux_x86_64.whl
369
+ lightning-utilities==0.12.0
370
+ lime==0.2.0.1
371
+ line_profiler==4.2.0
372
+ linkify-it-py==2.0.3
373
+ llvmlite==0.43.0
374
+ lml==0.1.0
375
+ locket==1.0.0
376
+ logical-unification==0.4.6
377
+ lxml==5.3.0
378
+ Mako==1.3.9
379
+ mamba==0.11.3
380
+ marisa-trie==1.2.1
381
+ Markdown==3.7
382
+ markdown-it-py==3.0.0
383
+ MarkupSafe==3.0.2
384
+ marshmallow==3.26.1
385
+ matplotlib==3.7.5
386
+ matplotlib-inline==0.1.7
387
+ matplotlib-venn==1.1.1
388
+ mdit-py-plugins==0.4.2
389
+ mdurl==0.1.2
390
+ miniKanren==1.0.3
391
+ missingno==0.5.2
392
+ mistune==0.8.4
393
+ mizani==0.13.1
394
+ mkl==2025.0.1
395
+ mkl-fft==1.3.8
396
+ mkl-random==1.2.4
397
+ mkl-service==2.4.1
398
+ mkl-umath==0.1.1
399
+ ml-dtypes==0.4.1
400
+ mlcrate==0.2.0
401
+ mlxtend==0.23.3
402
+ mne==1.9.0
403
+ model-signing==0.2.0
404
+ more-itertools==10.5.0
405
+ moviepy==1.0.3
406
+ mpld3==0.5.10
407
+ mpmath==1.3.0
408
+ msgpack==1.1.0
409
+ multidict==6.1.0
410
+ multimethod==1.12
411
+ multipledispatch==1.0.0
412
+ multiprocess==0.70.16
413
+ multitasking==0.0.11
414
+ murmurhash==1.0.11
415
+ music21==9.3.0
416
+ mypy-extensions==1.0.0
417
+ namex==0.0.8
418
+ narwhals==1.18.4
419
+ natsort==8.4.0
420
+ nbclassic==1.1.0
421
+ nbclient==0.5.13
422
+ nbconvert==6.4.5
423
+ nbdev==2.3.34
424
+ nbformat==5.10.4
425
+ ndindex==1.9.2
426
+ nest-asyncio==1.6.0
427
+ networkx==3.4.2
428
+ nibabel==5.3.2
429
+ nilearn==0.10.4
430
+ ninja==1.11.1.3
431
+ nltk==3.2.4
432
+ nose==1.3.7
433
+ notebook==6.5.4
434
+ notebook_shim==0.2.4
435
+ numba==0.60.0
436
+ numba-cuda==0.2.0
437
+ numexpr==2.10.2
438
+ numpy==1.26.4
439
+ nvidia-cublas-cu12==12.6.4.1
440
+ nvidia-cuda-cupti-cu12==12.6.80
441
+ nvidia-cuda-nvcc-cu12==12.6.85
442
+ nvidia-cuda-runtime-cu12==12.6.77
443
+ nvidia-cudnn-cu12==9.6.0.74
444
+ nvidia-cufft-cu12==11.3.0.4
445
+ nvidia-curand-cu12==10.3.7.77
446
+ nvidia-cusolver-cu12==11.7.1.2
447
+ nvidia-cusparse-cu12==12.5.4.2
448
+ nvidia-ml-py==12.570.86
449
+ nvidia-nccl-cu12==2.23.4
450
+ nvidia-nvcomp-cu12==4.1.0.6
451
+ nvidia-nvjitlink-cu12==12.6.85
452
+ nvtx==0.2.10
453
+ nx-cugraph-cu12 @ https://pypi.nvidia.com/nx-cugraph-cu12/nx_cugraph_cu12-24.10.0-py3-none-any.whl
454
+ oauth2client==4.1.3
455
+ oauthlib==3.2.2
456
+ odfpy==1.4.1
457
+ olefile==0.47
458
+ omegaconf==2.3.0
459
+ onnx==1.17.0
460
+ openai==1.57.4
461
+ opencv-contrib-python==4.10.0.84
462
+ opencv-python==4.10.0.84
463
+ opencv-python-headless==4.10.0.84
464
+ openpyxl==3.1.5
465
+ openslide-bin==4.0.0.6
466
+ openslide-python==1.4.1
467
+ opentelemetry-api==1.29.0
468
+ opentelemetry-sdk==1.29.0
469
+ opentelemetry-semantic-conventions==0.50b0
470
+ opt_einsum==3.4.0
471
+ optax==0.2.4
472
+ optim==0.1.0
473
+ optree==0.13.1
474
+ optuna==4.2.1
475
+ orbax-checkpoint==0.6.4
476
+ orderly-set==5.3.0
477
+ orjson==3.10.12
478
+ osqp==0.6.7.post3
479
+ overrides==7.7.0
480
+ packaging==24.2
481
+ pandas==2.2.3
482
+ pandas-datareader==0.10.0
483
+ pandas-gbq==0.25.0
484
+ pandas-profiling==3.6.6
485
+ pandas-stubs==2.2.2.240909
486
+ pandasql==0.7.3
487
+ pandocfilters==1.5.1
488
+ panel==1.5.4
489
+ papermill==2.6.0
490
+ param==2.2.0
491
+ parso==0.8.4
492
+ parsy==2.1
493
+ partd==1.4.2
494
+ path==17.1.0
495
+ path.py==12.5.0
496
+ pathlib==1.0.1
497
+ pathos==0.3.2
498
+ patsy==1.0.1
499
+ pdf2image==1.17.0
500
+ peewee==3.17.8
501
+ peft==0.14.0
502
+ pettingzoo==1.24.0
503
+ pexpect==4.9.0
504
+ phik==0.12.4
505
+ pickleshare==0.7.5
506
+ pillow==11.0.0
507
+ platformdirs==4.3.6
508
+ plotly==5.24.1
509
+ plotly-express==0.4.1
510
+ plotnine==0.14.4
511
+ pluggy==1.5.0
512
+ ply==3.11
513
+ polars==1.9.0
514
+ pooch==1.8.2
515
+ portalocker==3.1.1
516
+ portpicker==1.5.2
517
+ pox==0.3.5
518
+ ppft==1.7.6.9
519
+ preprocessing==0.1.13
520
+ preshed==3.0.9
521
+ prettytable==3.12.0
522
+ proglog==0.1.10
523
+ progressbar2==4.5.0
524
+ prometheus_client==0.21.1
525
+ promise==2.3
526
+ prompt_toolkit==3.0.48
527
+ propcache==0.2.1
528
+ prophet==1.1.6
529
+ proto-plus==1.25.0
530
+ protobuf==3.20.3
531
+ psutil==5.9.5
532
+ psycopg2==2.9.10
533
+ ptyprocess==0.7.0
534
+ pudb==2024.1.3
535
+ py-cpuinfo==9.0.0
536
+ py4j==0.10.9.7
537
+ pyaml==25.1.0
538
+ PyArabic==0.6.15
539
+ pyarrow==19.0.1
540
+ pyasn1==0.6.1
541
+ pyasn1_modules==0.4.1
542
+ pybind11==2.13.6
543
+ pyclipper==1.3.0.post6
544
+ pycocotools==2.0.8
545
+ pycparser==2.22
546
+ pycryptodome==3.21.0
547
+ pycryptodomex==3.21.0
548
+ pyct==0.5.0
549
+ pycuda==2025.1
550
+ pydantic==2.11.0a2
551
+ pydantic_core==2.29.0
552
+ pydata-google-auth==1.9.0
553
+ pydegensac==0.1.2
554
+ pydicom==3.0.1
555
+ pydot==3.0.3
556
+ pydotplus==2.0.2
557
+ PyDrive==1.3.1
558
+ PyDrive2==1.21.3
559
+ pydub==0.25.1
560
+ pyemd==1.0.0
561
+ pyerfa==2.0.1.5
562
+ pyexcel-io==0.6.7
563
+ pyexcel-ods==0.6.0
564
+ pygame==2.6.1
565
+ pygit2==1.16.0
566
+ pygltflib==1.16.3
567
+ Pygments==2.19.1
568
+ PyGObject==3.42.1
569
+ PyJWT==2.10.1
570
+ pyLDAvis==3.4.1
571
+ pylibcudf-cu12==25.2.0
572
+ pylibcugraph-cu12==24.10.0
573
+ pylibraft-cu12==25.2.0
574
+ pymc==5.19.1
575
+ pymc3==3.11.4
576
+ pymongo==4.11.1
577
+ Pympler==1.1
578
+ pymystem3==0.2.0
579
+ pynvjitlink-cu12==0.4.0
580
+ pynvml==12.0.0
581
+ pyogrio==0.10.0
582
+ Pyomo==6.8.2
583
+ PyOpenGL==3.1.7
584
+ pyOpenSSL==25.0.0
585
+ pyparsing==3.2.0
586
+ pypdf==5.3.0
587
+ pyperclip==1.9.0
588
+ pyproj==3.7.0
589
+ pyshp==2.3.1
590
+ PySocks==1.7.1
591
+ pyspark==3.5.3
592
+ pytensor==2.26.4
593
+ pytesseract==0.3.13
594
+ pytest==8.3.4
595
+ python-apt==0.0.0
596
+ python-bidi==0.6.6
597
+ python-box==7.3.0
598
+ python-dateutil==2.9.0.post0
599
+ python-json-logger==3.2.1
600
+ python-louvain==0.16
601
+ python-lsp-jsonrpc==1.1.2
602
+ python-lsp-server==1.12.2
603
+ python-slugify==8.0.4
604
+ python-utils==3.9.1
605
+ pytools==2025.1.1
606
+ pytorch-ignite==0.5.1
607
+ pytorch-lightning==2.5.0.post0
608
+ pytz==2025.1
609
+ PyUpSet==0.1.1.post7
610
+ pyviz_comms==3.0.3
611
+ PyWavelets==1.8.0
612
+ PyYAML==6.0.2
613
+ pyzmq==24.0.1
614
+ qdldl==0.1.7.post4
615
+ qgrid==1.3.1
616
+ qtconsole==5.6.1
617
+ QtPy==2.4.3
618
+ raft-dask-cu12==25.2.0
619
+ rapids-dask-dependency==25.2.0
620
+ ratelim==0.1.6
621
+ ray==2.42.1
622
+ referencing==0.35.1
623
+ regex==2024.11.6
624
+ requests==2.32.3
625
+ requests-oauthlib==1.3.1
626
+ requests-toolbelt==1.0.0
627
+ requirements-parser==0.9.0
628
+ rfc3161-client==0.1.2
629
+ rfc3339-validator==0.1.4
630
+ rfc3986-validator==0.1.1
631
+ rfc8785==0.1.4
632
+ rgf-python==3.12.0
633
+ rich==13.9.4
634
+ rmm-cu12==25.2.0
635
+ rouge_score==0.1.2
636
+ rpds-py==0.22.3
637
+ rpy2==3.4.2
638
+ rsa==4.9
639
+ Rtree==1.3.0
640
+ s3fs==0.4.2
641
+ s3transfer==0.11.2
642
+ sacrebleu==2.5.1
643
+ safetensors==0.4.5
644
+ scikit-image==0.25.0
645
+ scikit-learn==1.2.2
646
+ scikit-learn-intelex==2025.2.0
647
+ scikit-multilearn==0.2.0
648
+ scikit-optimize==0.10.2
649
+ scikit-plot==0.3.7
650
+ scikit-surprise==1.1.4
651
+ scipy==1.13.1
652
+ scooby==0.10.0
653
+ scs==3.2.7
654
+ seaborn==0.12.2
655
+ SecretStorage==3.3.1
656
+ securesystemslib==1.2.0
657
+ segment_anything @ git+https://github.com/facebookresearch/segment-anything.git@dca509fe793f601edb92606367a655c15ac00fdf
658
+ semver==3.0.4
659
+ Send2Trash==1.8.3
660
+ sentence-transformers==3.3.1
661
+ sentencepiece==0.2.0
662
+ sentry-sdk==2.19.2
663
+ setproctitle==1.3.4
664
+ setuptools-scm==8.1.0
665
+ shap==0.44.1
666
+ shapely==2.0.7
667
+ shellingham==1.5.4
668
+ Shimmy==1.3.0
669
+ sigstore==3.6.1
670
+ sigstore-protobuf-specs==0.3.2
671
+ sigstore-rekor-types==0.0.18
672
+ simple-parsing==0.1.6
673
+ SimpleITK==2.4.1
674
+ six==1.17.0
675
+ sklearn-pandas==2.2.0
676
+ slicer==0.0.7
677
+ smart-open==7.0.5
678
+ smmap==5.0.1
679
+ sniffio==1.3.1
680
+ snowballstemmer==2.2.0
681
+ sortedcontainers==2.4.0
682
+ soundfile==0.12.1
683
+ soupsieve==2.6
684
+ soxr==0.5.0.post1
685
+ spacy==3.7.5
686
+ spacy-legacy==3.0.12
687
+ spacy-loggers==1.0.5
688
+ Sphinx==8.1.3
689
+ sphinx-rtd-theme==0.2.4
690
+ sphinxcontrib-applehelp==2.0.0
691
+ sphinxcontrib-devhelp==2.0.0
692
+ sphinxcontrib-htmlhelp==2.1.0
693
+ sphinxcontrib-jsmath==1.0.1
694
+ sphinxcontrib-qthelp==2.0.0
695
+ sphinxcontrib-serializinghtml==2.0.0
696
+ SQLAlchemy==2.0.36
697
+ sqlglot==25.1.0
698
+ sqlparse==0.5.3
699
+ squarify==0.4.4
700
+ srsly==2.5.0
701
+ stable-baselines3==2.1.0
702
+ stanio==0.5.1
703
+ statsmodels==0.14.4
704
+ stopit==1.1.2
705
+ StrEnum==0.4.15
706
+ stringzilla==3.11.1
707
+ stumpy==1.13.0
708
+ sympy==1.13.1
709
+ tables==3.10.1
710
+ tabulate==0.9.0
711
+ tbb==2022.0.0
712
+ tbb4py==2022.0.0
713
+ tblib==3.0.0
714
+ tcmlib==1.2.0
715
+ tenacity==9.0.0
716
+ tensorboard==2.17.1
717
+ tensorboard-data-server==0.7.2
718
+ tensorflow==2.17.1
719
+ tensorflow-cloud==0.1.5
720
+ tensorflow-datasets==4.9.7
721
+ tensorflow-hub==0.16.1
722
+ tensorflow-io==0.37.1
723
+ tensorflow-io-gcs-filesystem==0.37.1
724
+ tensorflow-metadata==1.13.1
725
+ tensorflow-probability==0.24.0
726
+ tensorflow-text==2.17.0
727
+ tensorflow_decision_forests==1.10.0
728
+ tensorstore==0.1.71
729
+ termcolor==2.5.0
730
+ terminado==0.18.1
731
+ testpath==0.6.0
732
+ text-unidecode==1.3
733
+ textblob==0.17.1
734
+ texttable==1.7.0
735
+ tf-slim==1.1.0
736
+ tf_keras==2.17.0
737
+ Theano==1.0.5
738
+ Theano-PyMC==1.1.2
739
+ thinc==8.2.5
740
+ threadpoolctl==3.5.0
741
+ tifffile==2024.12.12
742
+ tiktoken==0.9.0
743
+ timm==1.0.12
744
+ tinycss2==1.4.0
745
+ tokenizers==0.21.0
746
+ toml==0.10.2
747
+ tomli==2.2.1
748
+ toolz==0.12.1
749
+ torch @ https://download.pytorch.org/whl/cu121_full/torch-2.5.1%2Bcu121-cp310-cp310-linux_x86_64.whl
750
+ torchaudio @ https://download.pytorch.org/whl/cu121/torchaudio-2.5.1%2Bcu121-cp310-cp310-linux_x86_64.whl
751
+ torchinfo==1.8.0
752
+ torchmetrics==1.6.1
753
+ torchsummary==1.5.1
754
+ torchtune==0.5.0
755
+ torchvision @ https://download.pytorch.org/whl/cu121/torchvision-0.20.1%2Bcu121-cp310-cp310-linux_x86_64.whl
756
+ tornado==6.3.3
757
+ TPOT==0.12.1
758
+ tqdm==4.67.1
759
+ traitlets==5.7.1
760
+ traittypes==0.2.1
761
+ transformers==4.47.0
762
+ treelite==4.4.1
763
+ trx-python==0.3
764
+ tsfresh==0.20.2
765
+ tuf==5.1.0
766
+ tweepy==4.14.0
767
+ typeguard==4.4.1
768
+ typer==0.15.1
769
+ types-python-dateutil==2.9.0.20241206
770
+ types-pytz==2024.2.0.20241003
771
+ types-setuptools==75.6.0.20241126
772
+ typing-inspect==0.9.0
773
+ typing_extensions==4.12.2
774
+ tzdata==2025.1
775
+ tzlocal==5.2
776
+ uc-micro-py==1.0.3
777
+ ucx-py-cu12==0.42.0
778
+ ucxx-cu12==0.42.0
779
+ ujson==5.10.0
780
+ umf==0.9.1
781
+ update-checker==0.18.0
782
+ uri-template==1.3.0
783
+ uritemplate==4.1.1
784
+ urllib3==2.3.0
785
+ urwid==2.6.16
786
+ urwid_readline==0.15.1
787
+ vega-datasets==0.9.0
788
+ visions==0.7.6
789
+ vtk==9.3.1
790
+ wadllib==1.3.6
791
+ Wand==0.6.13
792
+ wandb==0.19.1
793
+ wasabi==1.1.3
794
+ watchdog==6.0.0
795
+ wavio==0.0.9
796
+ wcwidth==0.2.13
797
+ weasel==0.4.1
798
+ webcolors==24.11.1
799
+ webencodings==0.5.1
800
+ websocket-client==1.8.0
801
+ websockets==14.1
802
+ Werkzeug==3.1.3
803
+ widgetsnbextension==4.0.13
804
+ woodwork==0.31.0
805
+ wordcloud==1.9.4
806
+ wrapt==1.17.0
807
+ wurlitzer==3.1.1
808
+ xarray==2024.11.0
809
+ xarray-einstats==0.8.0
810
+ xgboost==2.0.3
811
+ xlrd==2.0.1
812
+ xvfbwrapper==0.2.9
813
+ xxhash==3.5.0
814
+ xyzservices==2024.9.0
815
+ y-py==0.6.2
816
+ yarl==1.18.3
817
+ ydata-profiling==4.12.2
818
+ ydf==0.9.0
819
+ yellowbrick==1.5
820
+ yfinance==0.2.50
821
+ ypy-websocket==0.8.4
822
+ zict==3.0.0
823
+ zipp==3.21.0
sentiment_analysis/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # from . import sentiment_analysis
sentiment_analysis/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (166 Bytes). View file
 
sentiment_analysis/__pycache__/sentiment_analysis.cpython-310.pyc ADDED
Binary file (4.86 kB). View file
 
sentiment_analysis/config/stage1_models.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "1": {
3
+ "name": "DeBERTa v3 Base for Sequence Classification",
4
+ "type": "hf_automodel_finetuned_dbt3",
5
+ "module_path": "hmv_cfg_base_stage1.model1",
6
+ "hf_location": "tachygraphy-microtrext-norm-org/DeBERTa-v3-seqClassfication-LV1-SentimentPolarities-Batch8",
7
+ "tokenizer_class": "DebertaV2Tokenizer",
8
+ "model_class": "DebertaV2ForSequenceClassification",
9
+ "problem_type": "regression",
10
+ "base_model": "microsoft/deberta-v3-base",
11
+ "num_labels": 3,
12
+ "device": "cpu",
13
+ "load_function": "load_model",
14
+ "predict_function": "predict"
15
+ }
16
+ }
sentiment_analysis/hmv_cfg_base_stage1/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # from . import model1
sentiment_analysis/hmv_cfg_base_stage1/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (186 Bytes). View file
 
sentiment_analysis/hmv_cfg_base_stage1/__pycache__/model1.cpython-310.pyc ADDED
Binary file (1.95 kB). View file
 
sentiment_analysis/hmv_cfg_base_stage1/imports.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel, DebertaV2Tokenizer, DebertaV2ForSequenceClassification
3
+ import torch
4
+ import numpy as np
5
+ import matplotlib.pyplot as plt
6
+ import plotly.express as px
7
+ import pandas as pd
8
+ import json
9
+ import gc
10
+ import psutil
11
+ import os
12
+ import importlib.util
13
+ import sys
sentiment_analysis/hmv_cfg_base_stage1/model1.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch.nn as nn
2
+ from imports import *
3
+ import torch.nn.functional as F
4
+
5
+
6
+ BASE_DIR = os.path.dirname(os.path.abspath(__file__))
7
+ CONFIG_STAGE1 = os.path.join(BASE_DIR, "..", "config", "stage1_models.json")
8
+
9
+ MODEL_OPTIONS = {
10
+ "1": {
11
+ "name": "DeBERTa v3 Base for Sequence Classification",
12
+ "type": "hf_automodel_finetuned_dbt3",
13
+ "module_path": "hmv_cfg_base_stage1.model1",
14
+ "hf_location": "tachygraphy-microtrext-norm-org/DeBERTa-v3-seqClassfication-LV1-SentimentPolarities-Batch8",
15
+ "tokenizer_class": "DebertaV2Tokenizer",
16
+ "model_class": "DebertaV2ForSequenceClassification",
17
+ "problem_type": "regression",
18
+ "base_model": "microsoft/deberta-v3-base",
19
+ "num_labels": 3,
20
+ "device": "cpu",
21
+ "load_function": "load_model",
22
+ "predict_function": "predict"
23
+ }
24
+ }
25
+
26
+
27
@st.cache_resource
def load_model(model_key="1"):
    """Load the fine-tuned DeBERTa-v3 sequence-classification model and tokenizer.

    Fixed: the original took no arguments, but the __main__ guard at the bottom
    of this file calls ``load_model("1")`` — a guaranteed TypeError. The key is
    now a parameter with a default of "1", so both the zero-argument call and
    the keyed call work (backward compatible).

    Args:
        model_key: Key into MODEL_OPTIONS selecting the model configuration.

    Returns:
        (model, tokenizer) tuple ready for CPU inference.
    """
    model_info = MODEL_OPTIONS[model_key]
    hf_location = model_info["hf_location"]

    # Resolve the concrete transformers classes named in the config
    # (e.g. DebertaV2Tokenizer) from this module's namespace.
    tokenizer_class = globals()[model_info["tokenizer_class"]]
    model_class = globals()[model_info["model_class"]]
    tokenizer = tokenizer_class.from_pretrained(hf_location)
    model = model_class.from_pretrained(
        hf_location,
        problem_type=model_info["problem_type"],
        num_labels=model_info["num_labels"],
    )
    return model, tokenizer
43
+
44
+
45
def predict(text, model, tokenizer, device, max_len=128):
    """Run sentiment-polarity inference and return per-label scores in [0, 1].

    Args:
        text: Input string (or list of strings) to classify.
        model: Sequence-classification model producing ``.logits``.
        tokenizer: Matching tokenizer; must support ``return_tensors="pt"``.
        device: Torch device the model lives on; inputs are moved there.
        max_len: Currently UNUSED — tokenization runs with truncation=False.
            NOTE(review): presumably this was meant to cap sequence length;
            confirm before wiring it into the tokenizer call, as enabling
            truncation would change outputs for long inputs.

    Returns:
        numpy array of shape (batch, num_labels) with values clamped to [0, 1].
    """
    # Tokenize and pad the input text, then move tensors to the model's device.
    inputs = tokenizer(
        text,
        add_special_tokens=True,
        padding=True,
        truncation=False,
        return_tensors="pt",
        return_token_type_ids=False,
    ).to(device)

    with torch.no_grad():
        outputs = model(**inputs)

    # Fixed: the original wrapped an existing tensor in torch.tensor(...)
    # before clamping — an unnecessary copy that also raises a UserWarning.
    # Clamp the tensor directly; the numeric result is identical.
    probabilities = torch.relu(outputs.logits)
    probabilities = torch.clamp(probabilities, min=0.0, max=1.0).cpu().numpy()

    return probabilities
67
+
68
+
69
if __name__ == "__main__":
    # Smoke test. Fixed: the original passed "1" positionally, but
    # load_model() is defined in this file without a required positional
    # parameter — the call raised TypeError. Calling with no arguments
    # loads the default (and only) configured model.
    model, tokenizer = load_model()
    print("Model and tokenizer loaded successfully.")
sentiment_analysis/hmv_cfg_base_stage1/stage1_bert_architecture.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch.nn as nn
2
+
3
class BERT_architecture(nn.Module):
    """Three-class classification head on top of a BERT encoder.

    The pooled [CLS] output of ``bert`` (dim 768) is passed through
    LayerNorm -> Linear(768, 256) -> ReLU -> Dropout(0.3) -> Linear(256, 3)
    -> LogSoftmax, yielding log-probabilities over 3 classes.
    Attribute names are kept stable so saved state_dicts remain loadable.
    """

    def __init__(self, bert):
        super(BERT_architecture, self).__init__()
        self.bert = bert

        self.dropout = nn.Dropout(0.3)  # Increased dropout for regularization
        self.layer_norm = nn.LayerNorm(768)  # Layer normalization

        self.fc1 = nn.Linear(768, 256)  # Dense layer
        self.fc2 = nn.Linear(256, 3)  # Output layer with 3 classes

        self.relu = nn.ReLU()
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask, token_type_ids):
        """Return log-probabilities of shape (batch, 3)."""
        # Second element of the tuple is the pooled [CLS] representation.
        _, pooled = self.bert(
            sent_id,
            attention_mask=mask,
            token_type_ids=token_type_ids,
            return_dict=False,
        )
        hidden = self.relu(self.fc1(self.layer_norm(pooled)))
        hidden = self.dropout(hidden)
        return self.softmax(self.fc2(hidden))
sentiment_analysis/sentiment_analysis.py ADDED
@@ -0,0 +1,296 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from imports import *
2
+ import importlib.util
3
+ import os
4
+ import sys
5
+ import joblib
6
+
7
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), )))
8
+
9
+ # from hmv_cfg_base_stage1.model1 import load_model as load_model1
10
+ # from hmv_cfg_base_stage1.model1 import predict as predict1
11
+
12
+ BASE_DIR = os.path.dirname(os.path.abspath(__file__))
13
+ CONFIG_STAGE1 = os.path.join(BASE_DIR, "config", "stage1_models.json")
14
+ LOADERS_STAGE1 = os.path.join(BASE_DIR, "hmv-cfg-base-stage1")
15
+
16
+ # Load the model and tokenizer
17
+ # model_name = "tachygraphy-microtrext-norm-org/DeBERTa-v3-seqClassfication-LV1-SentimentPolarities-Batch8"
18
+ # tokenizer = AutoTokenizer.from_pretrained(model_name)
19
+ # model = AutoModel.from_pretrained(model_name)
20
+
21
+ SENTIMENT_POLARITY_LABELS = [
22
+ "negative", "neutral", "positive"
23
+ ]
24
+
25
+ current_model = None
26
+ current_tokenizer = None
27
+
28
# Cached so the JSON registry is read from disk only once per session.
@st.cache_resource
def load_model_config():
    """Read the stage-1 model registry JSON and index it by display name."""
    with open(CONFIG_STAGE1, "r") as config_file:
        registry = json.load(config_file)

    # Map human-readable model name -> full model entry, for the dropdown.
    by_name = {entry["name"]: entry for entry in registry.values()}
    return registry, by_name
37
+
38
+ MODEL_DATA, MODEL_OPTIONS = load_model_config()
39
+
40
+
41
+
42
+ # def load_model():
43
+ # model = DebertaV2ForSequenceClassification.from_pretrained(model_name)
44
+ # tokenizer = DebertaV2Tokenizer.from_pretrained(model_name)
45
+ # return model, tokenizer
46
+
47
+
48
+ # ✅ Dynamically Import Model Functions
49
def import_from_module(module_name, function_name):
    """Resolve ``function_name`` inside ``module_name`` at runtime.

    Returns the callable, or None (after surfacing the error in the UI)
    when either the module or the attribute cannot be found.
    """
    try:
        target = importlib.import_module(module_name)
    except ModuleNotFoundError as exc:
        st.error(f"❌ Import Error: {exc}")
        return None
    try:
        return getattr(target, function_name)
    except AttributeError as exc:
        st.error(f"❌ Import Error: {exc}")
        return None
56
+
57
+
58
def free_memory():
    """Release the currently loaded model/tokenizer and reclaim memory."""
    global current_model, current_tokenizer

    # Drop the strong references so the objects become collectable.
    if current_model is not None:
        del current_model
        current_model = None

    if current_tokenizer is not None:
        del current_tokenizer
        current_tokenizer = None

    gc.collect()  # force CPU-side garbage collection

    if torch.cuda.is_available():
        # Return cached GPU blocks to the driver and clean up IPC handles.
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()

    # CPU-only path: refresh memory stats via psutil (best-effort).
    try:
        if not torch.cuda.is_available():
            psutil.virtual_memory()
    except Exception as e:
        print(f"Memory cleanup error: {e}")
82
+
83
+
84
def load_selected_model(model_name):
    """Load the model, tokenizer and predict function for ``model_name``.

    Looks the model up in MODEL_OPTIONS, dynamically imports its loader and
    predictor, frees whatever was loaded before, and stores the new objects
    in the module-level ``current_model`` / ``current_tokenizer`` references.

    Returns:
        (model, tokenizer, predict_func), or (None, None, None) on failure.
    """
    global current_model, current_tokenizer

    free_memory()  # release the previously selected model first

    if model_name not in MODEL_OPTIONS:
        st.error(f"⚠️ Model '{model_name}' not found in config!")
        return None, None, None

    model_info = MODEL_OPTIONS[model_name]
    # NOTE(review): model_info["hf_location"] is resolved inside each model
    # module's load function, so it is not read here.
    model_module = model_info["module_path"]

    load_model_func = import_from_module(model_module, model_info["load_function"])
    predict_func = import_from_module(model_module, model_info["predict_function"])

    if load_model_func is None or predict_func is None:
        st.error("❌ Model functions could not be loaded!")
        return None, None, None

    model, tokenizer = load_model_func()

    current_model, current_tokenizer = model, tokenizer
    return model, tokenizer, predict_func
115
+
116
+ # def load_selected_model(model_name):
117
+ # # """Load model and tokenizer based on user selection."""
118
+ # global current_model, current_tokenizer
119
+
120
+ # # Free memory before loading a new model
121
+ # free_memory()
122
+
123
+ # if model_name not in MODEL_OPTIONS:
124
+ # st.error(f"⚠️ Model '{model_name}' not found in config!")
125
+ # return None, None
126
+
127
+ # model_info = MODEL_OPTIONS[model_name]
128
+ # hf_location = model_info["hf_location"]
129
+
130
+ # model_module = model_info["module_path"]
131
+ # # load_function = "load_model"
132
+ # # predict_function = "predict"
133
+
134
+ # load_function = model_info["load_function"]
135
+ # predict_function = model_info["predict_function"]
136
+
137
+ # # tokenizer_class = globals()[model_info["tokenizer_class"]]
138
+ # # model_class = globals()[model_info["model_class"]]
139
+
140
+ # # tokenizer = tokenizer_class.from_pretrained(hf_location)
141
+
142
+
143
+ # load_model_func = import_from_module(model_module, load_function)
144
+ # predict_func = import_from_module(model_module, predict_function)
145
+
146
+ # # # Load model
147
+ # # if model_info["type"] == "custom_checkpoint" or model_info["type"] == "custom_model":
148
+ # # model = torch.load(hf_location, map_location="cpu") # Load PyTorch model
149
+ # # elif model_info["type"] == "hf_automodel_finetuned_dbt3":
150
+ # # tokenizer_class = globals()[model_info["tokenizer_class"]]
151
+ # # model_class = globals()[model_info["model_class"]]
152
+ # # tokenizer = tokenizer_class.from_pretrained(hf_location)
153
+ # # model = model_class.from_pretrained(hf_location,
154
+ # # problem_type=model_info["problem_type"],
155
+ # # num_labels=model_info["num_labels"]
156
+ # # )
157
+ # # else:
158
+ # # st.error("Invalid model selection")
159
+ # # return None, None
160
+
161
+
162
+ # if load_model_func is None or predict_func is None:
163
+ # st.error("❌ Model functions could not be loaded!")
164
+ # return None, None
165
+
166
+ # # current_model, current_tokenizer = model, tokenizer # Store references
167
+ # # return model, tokenizer
168
+
169
+ # model, tokenizer = load_model_func(hf_location)
170
+
171
+ # current_model, current_tokenizer = model, tokenizer
172
+ # return model, tokenizer, predict_func
173
+
174
+
175
+
176
def predict(text, model, tokenizer, device, max_len=128):
    """Tokenize ``text`` and return the model's raw logits as a numpy array.

    Args:
        text: input string (or list of strings) to classify.
        model / tokenizer: Hugging Face sequence-classification pair.
        device: torch device the model lives on.
        max_len: maximum token length; longer inputs are truncated.

    Returns:
        numpy array of raw logits, shape (batch, num_labels).
    """
    # Tokenize and pad the input text; move tensors to the model's device.
    inputs = tokenizer(
        text,
        add_special_tokens=True,
        padding=True,
        truncation=True,      # was False: over-length input would exceed the model's positional limit
        max_length=max_len,   # honor the (previously unused) max_len parameter
        return_tensors="pt",
        return_token_type_ids=False,
    ).to(device)

    with torch.no_grad():
        outputs = model(**inputs)

    # NOTE(review): these are raw logits, not calibrated probabilities,
    # despite the downstream variable name.
    probabilities = outputs.logits.cpu().numpy()

    return probabilities
194
+
195
+ # def show_sentiment_analysis():
196
+
197
+ # Add your sentiment analysis code here
198
+
199
+ # user_input = st.text_input("Enter text for sentiment analysis:")
200
+ # user_input = st.text_area("Enter text for sentiment analysis:", height=200)
201
+ # user_input = st.text_area("Enter text for sentiment analysis:", max_chars=500)
202
+
203
def show_sentiment_analysis():
    """Render the Stage-1 sentiment-polarity page in Streamlit.

    Lets the user pick a model from MODEL_OPTIONS, loads it on demand, runs
    the model's predict function on the entered text, and shows the scores
    as text, a Plotly polar plot and a Matplotlib stacked bar chart.
    """
    st.title("Stage 1: Sentiment Polarity Analysis")
    st.write("This section will handle sentiment analysis.")

    # Initialise session state on first visit.
    if "selected_model" not in st.session_state:
        st.session_state.selected_model = list(MODEL_OPTIONS.keys())[0]  # Default selection

    if "clear_output" not in st.session_state:
        st.session_state.clear_output = False

    # The selectbox writes its choice straight into st.session_state.selected_model.
    st.selectbox("Choose a model:", list(MODEL_OPTIONS.keys()), key="selected_model")

    selected_model = st.session_state.selected_model

    if selected_model not in MODEL_OPTIONS:
        st.error(f"❌ Selected model '{selected_model}' not found!")
        st.stop()

    st.session_state.clear_output = True  # Reset output when model changes

    user_input = st.text_input("Enter text for sentiment analysis:")

    if user_input:
        # Load the chosen model lazily; this frees any previously loaded one.
        model, tokenizer, predict_func = load_selected_model(selected_model)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        if model is None:
            st.error("⚠️ Error: Model failed to load! Check model selection or configuration.")
            st.stop()

        model.to(device)

        # NOTE(review): predictions are the model's raw scores — presumably
        # logits, not normalized probabilities; confirm per predict_func.
        predictions = predict_func(user_input, model, tokenizer, device)

        # Squeeze predictions to remove extra dimensions (batch of one).
        predictions_array = predictions.squeeze()

        # Convert to a one-hot vector marking the argmax class.
        binary_predictions = np.zeros_like(predictions_array)
        max_indices = np.argmax(predictions_array)
        binary_predictions[max_indices] = 1

        # Display raw predictions
        st.write(f"**Predicted Sentiment Scores:** {predictions_array}")

        # Display binary classification result
        st.write(f"**Predicted Sentiment:**")
        st.write(f"**NEGATIVE:** {binary_predictions[0]}, **NEUTRAL:** {binary_predictions[1]}, **POSITIVE:** {binary_predictions[2]}")

        # 1) Polar plot of the three polarity scores (Plotly).
        sentiment_polarities = predictions_array.tolist()
        fig_polar = px.line_polar(
            pd.DataFrame(dict(r=sentiment_polarities, theta=SENTIMENT_POLARITY_LABELS)),
            r='r', theta='theta', line_close=True
        )
        st.plotly_chart(fig_polar)

        # 2) Normalized horizontal bar chart (Matplotlib).
        # NOTE(review): assumes predictions_array.sum() is non-zero — confirm
        # this holds for every model's score range.
        normalized_predictions = predictions_array / predictions_array.sum()

        fig, ax = plt.subplots(figsize=(8, 2))
        left = 0
        for i in range(len(normalized_predictions)):
            # Stack each class segment after the previous one.
            ax.barh(0, normalized_predictions[i], color=plt.cm.tab10(i), left=left, label=SENTIMENT_POLARITY_LABELS[i])
            left += normalized_predictions[i]

        # Configure the chart
        ax.set_xlim(0, 1)
        ax.set_yticks([])
        ax.set_xticks(np.arange(0, 1.1, 0.1))
        ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15), ncol=len(SENTIMENT_POLARITY_LABELS))
        plt.title("Sentiment Polarity Prediction Distribution")

        # Display in Streamlit
        st.pyplot(fig)
292
+
293
+
294
+
295
if __name__ == "__main__":
    # Allow running this page standalone, outside the multi-page app.
    show_sentiment_analysis()
src/bq-helper/LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
src/bq-helper/README.md ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Summary
2
+
3
+ BigQuery_Helper is a helper class to simplify common read-only BigQuery tasks. It makes it easy to execute queries while you're learning SQL, and provides a convenient stepping stone on the path to using [the core BigQuery python API](https://googlecloudplatform.github.io/google-cloud-python/latest/bigquery/reference.html). You can try it for yourself by forking [this Kaggle kernel](https://www.kaggle.com/sohier/introduction-to-the-bq-helper-package/).
4
+
5
+ ## Installation
6
+ You can install BigQuery_Helper with the following command in your console:
7
+
8
+
9
+ `pip install -e git+https://github.com/SohierDane/BigQuery_Helper#egg=bq_helper`
10
+
11
+ If you aren't running BigQuery_Helper on [Kaggle](http://kaggle.com/), you will also need to go through the [standard BigQuery client setup and authentication process](https://cloud.google.com/bigquery/docs/reference/libraries).
12
+
13
+ This repo has only been tested on Python 3.6+ and v0.29+ of the BigQuery API.
14
+
15
+ ## Changelog
16
+ #### 0.4.0:
17
+ - `BigQueryHelper.table_schema` has been overhauled. It now returns a Pandas DataFrame and unrolls nested fields so that the results are in the format expected by queries. For example, the `github_repos.commits` nested field `author` now returns sub-fields names in the format like `author.email`.
18
+
19
+ #### 0.3.0:
20
+ - Each helper instance now logs the total bytes counted towards your quota or bill used across all queries run with that helper instance. You can access it with `BigQueryHelper.total_gb_used_net_cache`. Repeated queries are likely to hit the cache and may show up as 0 GB used.
21
+ - Queries that take longer than the maximum wait time, which defaults to 3 minutes, will be cancelled.
22
+ - Contributing to bq_helper should be easier now that there is a set of tests.
23
+
24
+ #### 0.2.0:
25
+ - `query_to_pandas` now returns an empty DataFrame when the query returns no results. Previously, this returned `None`.
src/bq-helper/__pycache__/version.cpython-312.pyc ADDED
Binary file (190 Bytes). View file
 
src/bq-helper/bq_helper.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Helper class to simplify common read-only BigQuery tasks.
3
+ """
4
+
5
+
6
+ import pandas as pd
7
+ import time
8
+
9
+ from google.cloud import bigquery
10
+
11
+
12
class BigQueryHelper(object):
    """
    Helper class to simplify common BigQuery tasks like executing queries,
    showing table schemas, etc without worrying about table or dataset pointers.

    See the BigQuery docs for details of the steps this class lets you skip:
    https://googlecloudplatform.github.io/google-cloud-python/latest/bigquery/reference.html
    """

    def __init__(self, active_project, dataset_name, max_wait_seconds=180):
        self.project_name = active_project
        self.dataset_name = dataset_name
        self.max_wait_seconds = max_wait_seconds  # query timeout for query_to_pandas
        self.client = bigquery.Client()
        self.__dataset_ref = self.client.dataset(self.dataset_name, project=self.project_name)
        self.dataset = None  # fetched lazily by __fetch_dataset
        self.tables = dict()  # {table name (str): table object}
        self.__table_refs = dict()  # {table name (str): table reference}
        self.total_gb_used_net_cache = 0  # running GB billed across all queries
        self.BYTES_PER_GB = 2**30

    def __fetch_dataset(self):
        """
        Lazy loading of dataset. For example,
        if the user only calls `self.query_to_pandas` then the
        dataset never has to be fetched.
        """
        if self.dataset is None:
            self.dataset = self.client.get_dataset(self.__dataset_ref)

    def __fetch_table(self, table_name):
        """
        Lazy loading of a table (and, transitively, the dataset).
        """
        self.__fetch_dataset()
        if table_name not in self.__table_refs:
            self.__table_refs[table_name] = self.dataset.table(table_name)
        if table_name not in self.tables:
            self.tables[table_name] = self.client.get_table(self.__table_refs[table_name])

    def __handle_record_field(self, row, schema_details, top_level_name=''):
        """
        Unpack a single schema row into `schema_details`, recursing into any
        nested RECORD fields. Nested names are flattened as 'parent.child'.
        """
        name = row['name']
        if top_level_name != '':
            name = top_level_name + '.' + name
        schema_details.append([{
            'name': name,
            'type': row['type'],
            'mode': row['mode'],
            # `pd.np` was removed in pandas 1.0; use a plain NaN float so the
            # 'fields' column still exists and is dropped later.
            'fields': float('nan'),
            'description': row['description']
        }])
        # Leaf rows either lack a 'fields' entry or carry a NaN placeholder,
        # both of which are floats here.
        if isinstance(row.get('fields', 0.0), float):
            return None
        for entry in row['fields']:
            self.__handle_record_field(entry, schema_details, name)

    def __unpack_all_schema_fields(self, schema):
        """
        Unrolls nested schemas. Returns dataframe with one row per field,
        and the field names in the format accepted by the API.
        Results will look similar to the website schema, such as:
        https://bigquery.cloud.google.com/table/bigquery-public-data:github_repos.commits?pli=1

        Args:
            schema: DataFrame derived from api repr of raw table.schema
        Returns:
            Dataframe of the unrolled schema.
        """
        schema_details = []
        schema.apply(lambda row:
                     self.__handle_record_field(row, schema_details), axis=1)
        result = pd.concat([pd.DataFrame.from_dict(x) for x in schema_details])
        result.reset_index(drop=True, inplace=True)
        del result['fields']  # placeholder column, no longer needed
        return result

    def table_schema(self, table_name):
        """
        Get the schema for a specific table from a dataset.
        Unrolls nested field names into the format that can be copied
        directly into queries. For example, for the `github.commits` table,
        this will return `committer.name`.

        This is a very different return signature than BigQuery's table.schema.
        """
        self.__fetch_table(table_name)
        raw_schema = self.tables[table_name].schema
        schema = pd.DataFrame.from_dict([x.to_api_repr() for x in raw_schema])
        # The api_repr only has the fields column for tables with nested data.
        if 'fields' in schema.columns:
            schema = self.__unpack_all_schema_fields(schema)
        # Set the column order.
        schema = schema[['name', 'type', 'mode', 'description']]
        return schema

    def list_tables(self):
        """
        List the names of the tables in a dataset.
        """
        self.__fetch_dataset()
        return [x.table_id for x in self.client.list_tables(self.dataset)]

    def estimate_query_size(self, query):
        """
        Estimate gigabytes scanned by query.
        Does not consider if there is a cached query table.
        See https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.dryRun
        """
        my_job_config = bigquery.job.QueryJobConfig()
        my_job_config.dry_run = True  # validate & estimate without running
        my_job = self.client.query(query, job_config=my_job_config)
        return my_job.total_bytes_processed / self.BYTES_PER_GB

    def query_to_pandas(self, query):
        """
        Execute a SQL query & return a pandas dataframe.
        Cancels the job and returns None if it exceeds `max_wait_seconds`.
        """
        my_job = self.client.query(query)
        start_time = time.time()
        while not my_job.done():
            if (time.time() - start_time) > self.max_wait_seconds:
                print("Max wait time elapsed, query cancelled.")
                self.client.cancel_job(my_job.job_id)
                return None
            time.sleep(0.1)
        # Queries that hit errors will return an exception type.
        # Those exceptions don't get raised until we call my_job.to_dataframe().
        # In that case, my_job.total_bytes_billed can be called but is None.
        if my_job.total_bytes_billed:
            self.total_gb_used_net_cache += my_job.total_bytes_billed / self.BYTES_PER_GB
        return my_job.to_dataframe()

    def query_to_pandas_safe(self, query, max_gb_scanned=1):
        """
        Execute a query, but only if the query would scan less than `max_gb_scanned` of data.
        Returns None (with a message) when the estimate exceeds the limit.
        """
        query_size = self.estimate_query_size(query)
        if query_size <= max_gb_scanned:
            return self.query_to_pandas(query)
        msg = "Query cancelled; estimated size of {0} exceeds limit of {1} GB"
        print(msg.format(query_size, max_gb_scanned))

    def head(self, table_name, num_rows=5, start_index=None, selected_columns=None):
        """
        Get the first n rows of a table as a DataFrame.
        Does not perform a full table scan; should use a trivial amount of data as long as n is small.
        """
        self.__fetch_table(table_name)
        active_table = self.tables[table_name]
        schema_subset = None
        if selected_columns:
            schema_subset = [col for col in active_table.schema if col.name in selected_columns]
        results = self.client.list_rows(active_table, selected_fields=schema_subset,
                                        max_results=num_rows, start_index=start_index)
        results = [x for x in results]
        if not results:
            # Empty table (or start_index past the end): return an empty frame
            # instead of raising IndexError on results[0].
            return pd.DataFrame()
        return pd.DataFrame(
            data=[list(x.values()) for x in results], columns=list(results[0].keys()))
src/bq-helper/setup.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Packaging script for the bq_helper module.
from setuptools import setup
from version import __version__ as version


# Package metadata gathered in one place, then handed to setuptools.
_PACKAGE_INFO = {
    'name': 'bq_helper',
    'version': version,
    'description': 'Helper class to simplify common read-only BigQuery tasks.',
    'author': 'Sohier Dane',
    'url': 'https://github.com/SohierDane/BigQuery_Helper',
    'license': 'Apache 2.0',
    'install_requires': ['pandas', 'google-cloud-bigquery'],
    'classifiers': ['Programming Language :: Python :: 3'],
}

setup(**_PACKAGE_INFO)
src/bq-helper/test_helper.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Tests all public methods of the BigQueryHelper class.
3
+
4
+ Run from command line with:
5
+ python -m unittest test_helper.py
6
+
7
+
8
+ BILLING WARNING:
9
+ Running these tests requires a working BigQuery account and MAY CAUSE CHARGES.
10
+ However the dataset used for the tests is only ~2 MB, so any charges should
11
+ be very minimal. The downside is that this particular dataset is completely
12
+ refreshed every hour, so it's not possible to check for any specific return values.
13
+
14
+ For details on the test dataset, please see:
15
+ https://bigquery.cloud.google.com/table/bigquery-public-data:openaq.global_air_quality?tab=details
16
+ """
17
+
18
+
19
+ import unittest
20
+
21
+
22
+ from bq_helper import BigQueryHelper
23
+ from google.api_core.exceptions import BadRequest
24
+ from pandas.core.frame import DataFrame
25
+ from random import random
26
+
27
+
28
class TestBQHelper(unittest.TestCase):
    """Integration tests for BigQueryHelper's public methods.

    NOTE(review): these tests hit the live `bigquery-public-data.openaq`
    dataset, so they require working BigQuery credentials and may incur
    (minimal) charges. The dataset refreshes hourly, so assertions check
    types and shapes rather than specific values.
    """

    def setUp(self):
        self.my_bq = BigQueryHelper("bigquery-public-data", "openaq")
        self.query = "SELECT location FROM `bigquery-public-data.openaq.global_air_quality`"
        # Query randomized so it won't hit the cache across multiple test runs
        self.randomizable_query = """
            SELECT value FROM `bigquery-public-data.openaq.global_air_quality`
            WHERE value = {0}"""

    def test_list_tables(self):
        self.assertEqual(self.my_bq.list_tables(), ['global_air_quality'])

    def test_list_schema(self):
        self.assertEqual(len(self.my_bq.table_schema('global_air_quality')), 11)

    def test_estimate_query_size(self):
        self.assertIsInstance(self.my_bq.estimate_query_size(self.query), float)

    def test_query_to_pandas(self):
        self.assertIsInstance(self.my_bq.query_to_pandas(self.query), DataFrame)

    def test_query_safe_passes(self):
        self.assertIsInstance(self.my_bq.query_to_pandas_safe(self.query), DataFrame)

    def test_query_safe_fails(self):
        # Different query must be used for this test to ensure we don't hit the
        # cache and end up passing by testing a query that would use zero bytes.
        fail_query = self.randomizable_query.format(random())
        self.assertIsNone(self.my_bq.query_to_pandas_safe(fail_query, 10**-10))

    def test_head(self):
        self.assertIsInstance(self.my_bq.head('global_air_quality'), DataFrame)

    def test_usage_tracker(self):
        # FIX: renamed from misspelled `test_useage_tracker` (still discovered
        # by unittest via the `test_` prefix; no external callers).
        self.my_bq.query_to_pandas(self.randomizable_query.format(random()))
        self.assertNotEqual(self.my_bq.total_gb_used_net_cache, 0)

    def test_bad_query_raises_right_error(self):
        with self.assertRaises(BadRequest):
            self.my_bq.query_to_pandas("Not a valid query")

    def test_list_nested_schema(self):
        # github_repos `commits` has nested/repeated fields; the flattened
        # schema listing should expose all 33 of them.
        nested_helper = BigQueryHelper("bigquery-public-data", "github_repos")
        self.assertEqual(len(nested_helper.table_schema('commits')), 33)
72
+
73
+
74
# Allow running the suite directly (`python test_helper.py`) in addition
# to `python -m unittest test_helper.py`.
if __name__ == '__main__':
    unittest.main()
src/bq-helper/version.py ADDED
@@ -0,0 +1 @@
 
 
1
# Single source of truth for the package version; imported by setup.py
# (`from version import __version__`).
__version__ = '0.4.1'