Archisman Karmakar
committed on
stage1 deberta for seq classification hft model
- app_main_hf.py +56 -0
- dashboard.py +10 -0
- emotion_analysis.py +9 -0
- imports.py +13 -0
- requirements.txt +823 -0
- sentiment_analysis/__init__.py +1 -0
- sentiment_analysis/__pycache__/__init__.cpython-310.pyc +0 -0
- sentiment_analysis/__pycache__/sentiment_analysis.cpython-310.pyc +0 -0
- sentiment_analysis/config/stage1_models.json +16 -0
- sentiment_analysis/hmv_cfg_base_stage1/__init__.py +1 -0
- sentiment_analysis/hmv_cfg_base_stage1/__pycache__/__init__.cpython-310.pyc +0 -0
- sentiment_analysis/hmv_cfg_base_stage1/__pycache__/model1.cpython-310.pyc +0 -0
- sentiment_analysis/hmv_cfg_base_stage1/imports.py +13 -0
- sentiment_analysis/hmv_cfg_base_stage1/model1.py +71 -0
- sentiment_analysis/hmv_cfg_base_stage1/stage1_bert_architecture.py +26 -0
- sentiment_analysis/sentiment_analysis.py +296 -0
- src/bq-helper/LICENSE +201 -0
- src/bq-helper/README.md +25 -0
- src/bq-helper/__pycache__/version.cpython-312.pyc +0 -0
- src/bq-helper/bq_helper.py +172 -0
- src/bq-helper/setup.py +13 -0
- src/bq-helper/test_helper.py +75 -0
- src/bq-helper/version.py +1 -0
app_main_hf.py
ADDED
@@ -0,0 +1,56 @@
import streamlit as st
import os
import sys
import joblib
import importlib.util

sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), )))

from dashboard import show_dashboard
from sentiment_analysis.sentiment_analysis import show_sentiment_analysis
from emotion_analysis import show_emotion_analysis
# from text_transformation import show_text_transformation


def main():
    st.sidebar.title("Navigation")
    selection = st.sidebar.radio("Go to", ["Dashboard", "Stage 1: Sentiment Polarity Analysis", "Stage 2: Emotion Mood-tag Analysis", "Stage 3: Text Transformation & Normalization"])

    if selection == "Dashboard":
        show_dashboard()
    elif selection == "Stage 1: Sentiment Polarity Analysis":
        show_sentiment_analysis()
    elif selection == "Stage 2: Emotion Mood-tag Analysis":
        # show_emotion_analysis()
        st.write("This section is under development.")
    elif selection == "Stage 3: Text Transformation & Normalization":
        # show_text_transformation()
        st.write("This section is under development.")

    st.sidebar.title("About")
    st.sidebar.info("""
    **Contributors:**
    - Archisman Karmakar
      - [LinkedIn](https://www.linkedin.com/in/archismankarmakar/)
      - [GitHub](https://www.github.com/ArchismanKarmakar)
      - [Kaggle](https://www.kaggle.com/archismancoder)
    - Sumon Chatterjee
      - [LinkedIn](https://www.linkedin.com/in/sumon-chatterjee-3b3b43227)
      - [GitHub](https://github.com/Sumon670)
      - [Kaggle](https://www.kaggle.com/sumonchatterjee)

    **Mentors:**
    - Prof. Anupam Mondal
      - [LinkedIn](https://www.linkedin.com/in/anupam-mondal-ph-d-8a7a1a39/)
      - [Google Scholar](https://scholar.google.com/citations?user=ESRR9o4AAAAJ&hl=en)
      - [Website](https://sites.google.com/view/anupammondal/home)
    - Prof. Sainik Kumar Mahata
      - [LinkedIn](https://www.linkedin.com/in/mahatasainikk)
      - [Google Scholar](https://scholar.google.co.in/citations?user=OcJDM50AAAAJ&hl=en)
      - [Website](https://sites.google.com/view/sainik-kumar-mahata/home)

    This is our B.Tech final year research project; the accompanying journal article is yet to be published.
    """)

if __name__ == "__main__":
    main()
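Usage note: this file is the Streamlit entry point for the Space. Locally the app can be launched with `streamlit run app_main_hf.py`; on Hugging Face Spaces the runtime starts it automatically, presumably with app_main_hf.py configured as the Space's app file.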
dashboard.py
ADDED
@@ -0,0 +1,10 @@
import streamlit as st

def show_dashboard():
    st.title("Tachygraphy Micro-text Analysis & Normalization")
    st.write("""
    Welcome to the Tachygraphy Micro-text Analysis & Normalization Project. This application is designed to analyze text data through three stages:
    1. Sentiment Polarity Analysis
    2. Emotion Mood-tag Analysis
    3. Text Transformation & Normalization
    """)
emotion_analysis.py
ADDED
@@ -0,0 +1,9 @@
import streamlit as st

def show_emotion_analysis():
    st.title("Stage 2: Emotion Mood-tag Analysis")
    st.write("This section will handle emotion detection.")
    # Add your emotion detection code here

if __name__ == "__main__":
    show_emotion_analysis()
imports.py
ADDED
@@ -0,0 +1,13 @@
import streamlit as st
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel, DebertaV2Tokenizer, DebertaV2ForSequenceClassification
import torch
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import pandas as pd
import json
import gc
import psutil
import os
import importlib.util
import sys
requirements.txt
ADDED
@@ -0,0 +1,823 @@
absl-py==1.4.0
accelerate==1.2.1
aiofiles==22.1.0
aiohappyeyeballs==2.4.6
aiohttp==3.11.12
aiosignal==1.3.2
aiosqlite==0.21.0
alabaster==1.0.0
albucore==0.0.19
albumentations==1.4.20
alembic==1.14.1
altair==5.5.0
annotated-types==0.7.0
annoy==1.17.3
ansicolors==1.1.8
antlr4-python3-runtime==4.9.3
anyio==3.7.1
argon2-cffi==23.1.0
argon2-cffi-bindings==21.2.0
args==0.1.0
array_record==0.5.1
arrow==1.3.0
arviz==0.20.0
astropy==6.1.7
astropy-iers-data==0.2024.12.16.0.35.48
asttokens==3.0.0
astunparse==1.6.3
async-timeout==5.0.1
atpublic==4.1.0
attrs==25.1.0
audioread==3.0.1
autograd==1.7.0
babel==2.16.0
backcall==0.2.0
bayesian-optimization==2.0.3
beautifulsoup4==4.12.3
betterproto==2.0.0b6
bigframes==1.29.0
bigquery-magics==0.4.0
bleach==6.2.0
blinker==1.9.0
blis==0.7.11
blobfile==3.0.0
blosc2==2.7.1
bokeh==3.6.2
Boruta==0.4.3
boto3==1.36.23
botocore==1.36.23
Bottleneck==1.4.2
-e git+https://github.com/SohierDane/BigQuery_Helper@8615a7f6c1663e7f2d48aa2b32c2dbcb600a440f#egg=bq_helper
bqplot==0.12.43
branca==0.8.1
bs4==0.0.2
CacheControl==0.14.1
cachetools==5.5.0
Cartopy==0.24.1
catalogue==2.0.10
catboost==1.2.7
category_encoders==2.7.0
certifi==2025.1.31
cesium==0.12.1
cffi==1.17.1
chardet==5.2.0
charset-normalizer==3.4.1
Chessnut==0.4.1
chex==0.1.88
clarabel==0.9.0
click==8.1.7
click-plugins==1.1.1
cligj==0.7.2
clint==0.5.1
cloudpathlib==0.20.0
cloudpickle==3.1.0
cmake==3.31.2
cmdstanpy==1.2.5
colorama==0.4.6
colorcet==3.1.0
colorlog==6.9.0
colorlover==0.3.0
colour==0.1.5
comm==0.2.2
community==1.0.0b1
confection==0.1.5
cons==0.4.6
contourpy==1.3.1
coverage==7.6.12
cryptography==44.0.1
cuda-bindings==12.8.0
cuda-python==12.8.0
cudf-cu12==25.2.0
cufflinks==0.17.3
cuml-cu12==25.2.0
cupy-cuda12x==12.2.0
cuvs-cu12==25.2.0
cvxopt==1.3.2
cvxpy==1.6.0
cycler==0.12.1
cymem==2.0.10
Cython==3.0.11
cytoolz==1.0.1
daal==2025.2.0
dacite==1.9.2
dask==2024.12.1
dask-cuda==25.2.0
dask-cudf-cu12==25.2.0
dask-expr==1.1.21
dataclasses-json==0.6.7
datascience==0.17.6
datasets==3.3.1
datashader==0.17.0
db-dtypes==1.3.1
dbus-python==1.2.18
deap==1.4.2
debugpy==1.8.0
decorator==4.4.2
deepdiff==8.2.0
deepspeed==0.16.4
defusedxml==0.7.1
Deprecated==1.2.15
diffusers==0.31.0
dill==0.3.8
dipy==1.10.0
distributed==2024.12.1
distributed-ucxx-cu12==0.42.0
distro==1.9.0
dlib==19.24.2
dm-tree==0.1.8
dnspython==2.7.0
docker==7.1.0
docker-pycreds==0.4.0
docstring-to-markdown==0.15
docstring_parser==0.16
docutils==0.21.2
dopamine_rl==4.1.0
duckdb==1.1.3
earthengine-api==1.4.3
easydict==1.13
easyocr==1.7.2
editdistance==0.8.1
eerepr==0.0.4
einops==0.8.0
eli5==0.13.0
email_validator==2.2.0
emoji==2.14.1
en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl#sha256=86cc141f63942d4b2c5fcee06630fd6f904788d2f0ab005cce45aadb8fb73889
entrypoints==0.4
et_xmlfile==2.0.0
etils==1.11.0
etuples==0.3.9
eval_type_backport==0.2.0
evaluate==0.4.3
exceptiongroup==1.2.2
execnb==0.1.11
Farama-Notifications==0.0.4
fastai==2.7.18
fastcore==1.7.27
fastdownload==0.0.7
fastjsonschema==2.21.1
fastprogress==1.0.3
fastrlock==0.8.2
fasttext==0.9.3
featuretools==1.31.0
filelock==3.17.0
fiona==1.10.1
firebase-admin==6.6.0
Flask==3.1.0
flatbuffers==24.3.25
flax==0.8.5
folium==0.19.2
fonttools==4.55.3
fqdn==1.5.1
frozendict==2.4.6
frozenlist==1.5.0
fsspec==2024.12.0
funcy==2.0
fury==0.12.0
future==1.0.0
fuzzywuzzy==0.18.0
gast==0.6.0
gatspy==0.3
gcsfs==2024.10.0
GDAL==3.6.4
gdown==5.2.0
geemap==0.35.1
gensim==4.3.3
geocoder==1.38.1
geographiclib==2.0
geojson==3.2.0
geopandas==0.14.4
geopy==2.4.1
ghapi==1.0.6
gin-config==0.5.0
gitdb==4.0.11
GitPython==3.1.43
glob2==0.7
google==2.0.3
google-ai-generativelanguage==0.6.10
google-api-core==1.34.1
google-api-python-client==2.155.0
google-auth==2.27.0
google-auth-httplib2==0.2.0
google-auth-oauthlib==1.2.1
google-cloud-aiplatform==1.74.0
google-cloud-automl==1.0.1
google-cloud-bigquery==3.25.0
google-cloud-bigquery-connection==1.17.0
google-cloud-bigtable==2.27.0
google-cloud-core==2.4.1
google-cloud-datastore==2.20.2
google-cloud-firestore==2.19.0
google-cloud-functions==1.19.0
google-cloud-iam==2.17.0
google-cloud-language==2.16.0
google-cloud-pubsub==2.27.1
google-cloud-resource-manager==1.14.0
google-cloud-storage==2.14.0
google-cloud-translate==3.12.1
google-cloud-videointelligence==2.16.0
google-cloud-vision==3.10.0
google-colab @ file:///colabtools/dist/google_colab-1.0.0.tar.gz
google-crc32c==1.6.0
google-genai==0.2.2
google-generativeai==0.8.3
google-pasta==0.2.0
google-resumable-media==2.7.2
googleapis-common-protos==1.66.0
googledrivedownloader==0.4
gpxpy==1.6.2
graphviz==0.20.3
greenlet==3.1.1
grpc-google-iam-v1==0.13.1
grpcio==1.68.1
grpcio-status==1.48.2
grpclib==0.4.8rc2
gspread==6.0.2
gspread-dataframe==3.3.1
gym==0.25.2
gym-notices==0.0.8
gymnasium==0.29.0
h11==0.14.0
h2==4.2.0
h2o==3.46.0.6
h5netcdf==1.4.1
h5py==3.12.1
haversine==2.9.0
hep_ml==0.7.3
hf_transfer==0.1.9
hjson==3.1.0
holidays==0.63
holoviews==1.20.0
hpack==4.1.0
html5lib==1.1
htmlmin==0.1.12
httpcore==1.0.7
httpimport==1.4.0
httplib2==0.22.0
httpx==0.28.1
huggingface-hub==0.29.0
humanize==4.11.0
hyperframe==6.1.0
hyperopt==0.2.7
ibis-framework==9.2.0
id==1.5.0
idna==3.10
igraph==0.11.8
ImageHash==4.3.1
imageio==2.36.1
imageio-ffmpeg==0.5.1
imagesize==1.4.1
imbalanced-learn==0.12.4
imgaug==0.4.0
immutabledict==4.2.1
importlib-resources==5.13.0
importlib_metadata==8.5.0
imutils==0.5.4
in-toto-attestation==0.9.3
inflect==7.4.0
iniconfig==2.0.0
intel-cmplr-lib-rt==2024.2.0
intel-cmplr-lib-ur==2024.2.0
intel-openmp==2024.2.0
ipyevents==2.0.2
ipyfilechooser==0.6.0
ipykernel==5.5.6
ipyleaflet==0.19.2
ipympl==0.9.6
ipyparallel==8.8.0
ipython==7.34.0
ipython-genutils==0.2.0
ipython-sql==0.5.0
ipytree==0.2.2
ipywidgets==8.1.5
isoduration==20.11.0
isoweek==1.3.3
itsdangerous==2.2.0
Janome==0.5.0
jax==0.4.33
jax-cuda12-pjrt==0.4.33
jax-cuda12-plugin==0.4.33
jaxlib==0.4.33
jedi==0.19.2
jeepney==0.7.1
jellyfish==1.1.0
jieba==0.42.1
Jinja2==3.1.4
jiter==0.8.2
jmespath==1.0.1
joblib==1.4.2
json5==0.10.0
jsonpatch==1.33
jsonpickle==4.0.1
jsonpointer==3.0.0
jsonschema==4.23.0
jsonschema-specifications==2024.10.1
jupyter-console==6.1.0
jupyter-events==0.12.0
jupyter-leaflet==0.19.2
jupyter-lsp==1.5.1
jupyter-ydoc==0.2.5
jupyter_client==8.6.3
jupyter_core==5.7.2
jupyter_server==2.12.5
jupyter_server_fileid==0.9.3
jupyter_server_terminals==0.5.3
jupyter_server_ydoc==0.8.0
jupyterlab==3.6.8
jupyterlab-lsp==3.10.2
jupyterlab_pygments==0.3.0
jupyterlab_server==2.27.3
jupyterlab_widgets==3.0.13
kaggle==1.6.17
kaggle-environments==1.16.11
kagglehub==0.3.9
keras==3.5.0
keras-core==0.1.7
keras-cv==0.9.0
keras-hub==0.18.1
keras-nlp==0.18.1
keras-tuner==1.4.7
keyring==23.5.0
kiwisolver==1.4.7
kornia==0.8.0
kornia_rs==0.1.8
kt-legacy==1.0.5
langchain==0.3.12
langchain-core==0.3.25
langchain-text-splitters==0.3.3
langcodes==3.5.0
langid==1.1.6
langsmith==0.2.3
language_data==1.3.0
launchpadlib==1.10.16
lazr.restfulclient==0.14.4
lazr.uri==1.0.6
lazy_loader==0.4
learntools @ git+https://github.com/Kaggle/learntools@010e3b5035354e15c073a0aca9e202c2e2beb742
leven==1.0.4
libclang==18.1.1
libcudf-cu12==25.2.0
libcuml-cu12==25.2.0
libcuvs-cu12==25.2.0
libkvikio-cu12==25.2.0
libpysal==4.9.2
libraft-cu12==25.2.0
librosa==0.10.2.post1
libucx-cu12==1.18.0
libucxx-cu12==0.42.0
lightgbm @ file:///tmp/lightgbm/lightgbm-4.5.0-py3-none-linux_x86_64.whl
lightning-utilities==0.12.0
lime==0.2.0.1
line_profiler==4.2.0
linkify-it-py==2.0.3
llvmlite==0.43.0
lml==0.1.0
locket==1.0.0
logical-unification==0.4.6
lxml==5.3.0
Mako==1.3.9
mamba==0.11.3
marisa-trie==1.2.1
Markdown==3.7
markdown-it-py==3.0.0
MarkupSafe==3.0.2
marshmallow==3.26.1
matplotlib==3.7.5
matplotlib-inline==0.1.7
matplotlib-venn==1.1.1
mdit-py-plugins==0.4.2
mdurl==0.1.2
miniKanren==1.0.3
missingno==0.5.2
mistune==0.8.4
mizani==0.13.1
mkl==2025.0.1
mkl-fft==1.3.8
mkl-random==1.2.4
mkl-service==2.4.1
mkl-umath==0.1.1
ml-dtypes==0.4.1
mlcrate==0.2.0
mlxtend==0.23.3
mne==1.9.0
model-signing==0.2.0
more-itertools==10.5.0
moviepy==1.0.3
mpld3==0.5.10
mpmath==1.3.0
msgpack==1.1.0
multidict==6.1.0
multimethod==1.12
multipledispatch==1.0.0
multiprocess==0.70.16
multitasking==0.0.11
murmurhash==1.0.11
music21==9.3.0
mypy-extensions==1.0.0
namex==0.0.8
narwhals==1.18.4
natsort==8.4.0
nbclassic==1.1.0
nbclient==0.5.13
nbconvert==6.4.5
nbdev==2.3.34
nbformat==5.10.4
ndindex==1.9.2
nest-asyncio==1.6.0
networkx==3.4.2
nibabel==5.3.2
nilearn==0.10.4
ninja==1.11.1.3
nltk==3.2.4
nose==1.3.7
notebook==6.5.4
notebook_shim==0.2.4
numba==0.60.0
numba-cuda==0.2.0
numexpr==2.10.2
numpy==1.26.4
nvidia-cublas-cu12==12.6.4.1
nvidia-cuda-cupti-cu12==12.6.80
nvidia-cuda-nvcc-cu12==12.6.85
nvidia-cuda-runtime-cu12==12.6.77
nvidia-cudnn-cu12==9.6.0.74
nvidia-cufft-cu12==11.3.0.4
nvidia-curand-cu12==10.3.7.77
nvidia-cusolver-cu12==11.7.1.2
nvidia-cusparse-cu12==12.5.4.2
nvidia-ml-py==12.570.86
nvidia-nccl-cu12==2.23.4
nvidia-nvcomp-cu12==4.1.0.6
nvidia-nvjitlink-cu12==12.6.85
nvtx==0.2.10
nx-cugraph-cu12 @ https://pypi.nvidia.com/nx-cugraph-cu12/nx_cugraph_cu12-24.10.0-py3-none-any.whl
oauth2client==4.1.3
oauthlib==3.2.2
odfpy==1.4.1
olefile==0.47
omegaconf==2.3.0
onnx==1.17.0
openai==1.57.4
opencv-contrib-python==4.10.0.84
opencv-python==4.10.0.84
opencv-python-headless==4.10.0.84
openpyxl==3.1.5
openslide-bin==4.0.0.6
openslide-python==1.4.1
opentelemetry-api==1.29.0
opentelemetry-sdk==1.29.0
opentelemetry-semantic-conventions==0.50b0
opt_einsum==3.4.0
optax==0.2.4
optim==0.1.0
optree==0.13.1
optuna==4.2.1
orbax-checkpoint==0.6.4
orderly-set==5.3.0
orjson==3.10.12
osqp==0.6.7.post3
overrides==7.7.0
packaging==24.2
pandas==2.2.3
pandas-datareader==0.10.0
pandas-gbq==0.25.0
pandas-profiling==3.6.6
pandas-stubs==2.2.2.240909
pandasql==0.7.3
pandocfilters==1.5.1
panel==1.5.4
papermill==2.6.0
param==2.2.0
parso==0.8.4
parsy==2.1
partd==1.4.2
path==17.1.0
path.py==12.5.0
pathlib==1.0.1
pathos==0.3.2
patsy==1.0.1
pdf2image==1.17.0
peewee==3.17.8
peft==0.14.0
pettingzoo==1.24.0
pexpect==4.9.0
phik==0.12.4
pickleshare==0.7.5
pillow==11.0.0
platformdirs==4.3.6
plotly==5.24.1
plotly-express==0.4.1
plotnine==0.14.4
pluggy==1.5.0
ply==3.11
polars==1.9.0
pooch==1.8.2
portalocker==3.1.1
portpicker==1.5.2
pox==0.3.5
ppft==1.7.6.9
preprocessing==0.1.13
preshed==3.0.9
prettytable==3.12.0
proglog==0.1.10
progressbar2==4.5.0
prometheus_client==0.21.1
promise==2.3
prompt_toolkit==3.0.48
propcache==0.2.1
prophet==1.1.6
proto-plus==1.25.0
protobuf==3.20.3
psutil==5.9.5
psycopg2==2.9.10
ptyprocess==0.7.0
pudb==2024.1.3
py-cpuinfo==9.0.0
py4j==0.10.9.7
pyaml==25.1.0
PyArabic==0.6.15
pyarrow==19.0.1
pyasn1==0.6.1
pyasn1_modules==0.4.1
pybind11==2.13.6
pyclipper==1.3.0.post6
pycocotools==2.0.8
pycparser==2.22
pycryptodome==3.21.0
pycryptodomex==3.21.0
pyct==0.5.0
pycuda==2025.1
pydantic==2.11.0a2
pydantic_core==2.29.0
pydata-google-auth==1.9.0
pydegensac==0.1.2
pydicom==3.0.1
pydot==3.0.3
pydotplus==2.0.2
PyDrive==1.3.1
PyDrive2==1.21.3
pydub==0.25.1
pyemd==1.0.0
pyerfa==2.0.1.5
pyexcel-io==0.6.7
pyexcel-ods==0.6.0
pygame==2.6.1
pygit2==1.16.0
pygltflib==1.16.3
Pygments==2.19.1
PyGObject==3.42.1
PyJWT==2.10.1
pyLDAvis==3.4.1
pylibcudf-cu12==25.2.0
pylibcugraph-cu12==24.10.0
pylibraft-cu12==25.2.0
pymc==5.19.1
pymc3==3.11.4
pymongo==4.11.1
Pympler==1.1
pymystem3==0.2.0
pynvjitlink-cu12==0.4.0
pynvml==12.0.0
pyogrio==0.10.0
Pyomo==6.8.2
PyOpenGL==3.1.7
pyOpenSSL==25.0.0
pyparsing==3.2.0
pypdf==5.3.0
pyperclip==1.9.0
pyproj==3.7.0
pyshp==2.3.1
PySocks==1.7.1
pyspark==3.5.3
pytensor==2.26.4
pytesseract==0.3.13
pytest==8.3.4
python-apt==0.0.0
python-bidi==0.6.6
python-box==7.3.0
python-dateutil==2.9.0.post0
python-json-logger==3.2.1
python-louvain==0.16
python-lsp-jsonrpc==1.1.2
python-lsp-server==1.12.2
python-slugify==8.0.4
python-utils==3.9.1
pytools==2025.1.1
pytorch-ignite==0.5.1
pytorch-lightning==2.5.0.post0
pytz==2025.1
PyUpSet==0.1.1.post7
pyviz_comms==3.0.3
PyWavelets==1.8.0
PyYAML==6.0.2
pyzmq==24.0.1
qdldl==0.1.7.post4
qgrid==1.3.1
qtconsole==5.6.1
QtPy==2.4.3
raft-dask-cu12==25.2.0
rapids-dask-dependency==25.2.0
ratelim==0.1.6
ray==2.42.1
referencing==0.35.1
regex==2024.11.6
requests==2.32.3
requests-oauthlib==1.3.1
requests-toolbelt==1.0.0
requirements-parser==0.9.0
rfc3161-client==0.1.2
rfc3339-validator==0.1.4
rfc3986-validator==0.1.1
rfc8785==0.1.4
rgf-python==3.12.0
rich==13.9.4
rmm-cu12==25.2.0
rouge_score==0.1.2
rpds-py==0.22.3
rpy2==3.4.2
rsa==4.9
Rtree==1.3.0
s3fs==0.4.2
s3transfer==0.11.2
sacrebleu==2.5.1
safetensors==0.4.5
scikit-image==0.25.0
scikit-learn==1.2.2
scikit-learn-intelex==2025.2.0
scikit-multilearn==0.2.0
scikit-optimize==0.10.2
scikit-plot==0.3.7
scikit-surprise==1.1.4
scipy==1.13.1
scooby==0.10.0
scs==3.2.7
seaborn==0.12.2
SecretStorage==3.3.1
securesystemslib==1.2.0
segment_anything @ git+https://github.com/facebookresearch/segment-anything.git@dca509fe793f601edb92606367a655c15ac00fdf
semver==3.0.4
Send2Trash==1.8.3
sentence-transformers==3.3.1
sentencepiece==0.2.0
sentry-sdk==2.19.2
setproctitle==1.3.4
setuptools-scm==8.1.0
shap==0.44.1
shapely==2.0.7
shellingham==1.5.4
Shimmy==1.3.0
sigstore==3.6.1
sigstore-protobuf-specs==0.3.2
sigstore-rekor-types==0.0.18
simple-parsing==0.1.6
SimpleITK==2.4.1
six==1.17.0
sklearn-pandas==2.2.0
slicer==0.0.7
smart-open==7.0.5
smmap==5.0.1
sniffio==1.3.1
snowballstemmer==2.2.0
sortedcontainers==2.4.0
soundfile==0.12.1
soupsieve==2.6
soxr==0.5.0.post1
spacy==3.7.5
spacy-legacy==3.0.12
spacy-loggers==1.0.5
Sphinx==8.1.3
sphinx-rtd-theme==0.2.4
sphinxcontrib-applehelp==2.0.0
sphinxcontrib-devhelp==2.0.0
sphinxcontrib-htmlhelp==2.1.0
sphinxcontrib-jsmath==1.0.1
sphinxcontrib-qthelp==2.0.0
sphinxcontrib-serializinghtml==2.0.0
SQLAlchemy==2.0.36
sqlglot==25.1.0
sqlparse==0.5.3
squarify==0.4.4
srsly==2.5.0
stable-baselines3==2.1.0
stanio==0.5.1
statsmodels==0.14.4
stopit==1.1.2
StrEnum==0.4.15
stringzilla==3.11.1
stumpy==1.13.0
sympy==1.13.1
tables==3.10.1
tabulate==0.9.0
tbb==2022.0.0
tbb4py==2022.0.0
tblib==3.0.0
tcmlib==1.2.0
tenacity==9.0.0
tensorboard==2.17.1
tensorboard-data-server==0.7.2
tensorflow==2.17.1
tensorflow-cloud==0.1.5
tensorflow-datasets==4.9.7
tensorflow-hub==0.16.1
tensorflow-io==0.37.1
tensorflow-io-gcs-filesystem==0.37.1
tensorflow-metadata==1.13.1
tensorflow-probability==0.24.0
tensorflow-text==2.17.0
tensorflow_decision_forests==1.10.0
tensorstore==0.1.71
termcolor==2.5.0
terminado==0.18.1
testpath==0.6.0
text-unidecode==1.3
textblob==0.17.1
texttable==1.7.0
tf-slim==1.1.0
tf_keras==2.17.0
Theano==1.0.5
Theano-PyMC==1.1.2
thinc==8.2.5
threadpoolctl==3.5.0
tifffile==2024.12.12
tiktoken==0.9.0
timm==1.0.12
tinycss2==1.4.0
tokenizers==0.21.0
toml==0.10.2
tomli==2.2.1
toolz==0.12.1
torch @ https://download.pytorch.org/whl/cu121_full/torch-2.5.1%2Bcu121-cp310-cp310-linux_x86_64.whl
torchaudio @ https://download.pytorch.org/whl/cu121/torchaudio-2.5.1%2Bcu121-cp310-cp310-linux_x86_64.whl
torchinfo==1.8.0
torchmetrics==1.6.1
torchsummary==1.5.1
torchtune==0.5.0
torchvision @ https://download.pytorch.org/whl/cu121/torchvision-0.20.1%2Bcu121-cp310-cp310-linux_x86_64.whl
tornado==6.3.3
TPOT==0.12.1
tqdm==4.67.1
traitlets==5.7.1
traittypes==0.2.1
transformers==4.47.0
treelite==4.4.1
trx-python==0.3
tsfresh==0.20.2
tuf==5.1.0
tweepy==4.14.0
typeguard==4.4.1
typer==0.15.1
types-python-dateutil==2.9.0.20241206
types-pytz==2024.2.0.20241003
types-setuptools==75.6.0.20241126
typing-inspect==0.9.0
typing_extensions==4.12.2
tzdata==2025.1
tzlocal==5.2
uc-micro-py==1.0.3
ucx-py-cu12==0.42.0
ucxx-cu12==0.42.0
ujson==5.10.0
umf==0.9.1
update-checker==0.18.0
uri-template==1.3.0
uritemplate==4.1.1
urllib3==2.3.0
urwid==2.6.16
urwid_readline==0.15.1
vega-datasets==0.9.0
visions==0.7.6
vtk==9.3.1
wadllib==1.3.6
Wand==0.6.13
wandb==0.19.1
wasabi==1.1.3
watchdog==6.0.0
wavio==0.0.9
wcwidth==0.2.13
weasel==0.4.1
webcolors==24.11.1
webencodings==0.5.1
websocket-client==1.8.0
websockets==14.1
Werkzeug==3.1.3
widgetsnbextension==4.0.13
woodwork==0.31.0
wordcloud==1.9.4
wrapt==1.17.0
wurlitzer==3.1.1
xarray==2024.11.0
xarray-einstats==0.8.0
xgboost==2.0.3
xlrd==2.0.1
xvfbwrapper==0.2.9
xxhash==3.5.0
xyzservices==2024.9.0
y-py==0.6.2
yarl==1.18.3
ydata-profiling==4.12.2
ydf==0.9.0
yellowbrick==1.5
yfinance==0.2.50
ypy-websocket==0.8.4
zict==3.0.0
zipp==3.21.0
sentiment_analysis/__init__.py
ADDED
@@ -0,0 +1 @@
# from . import sentiment_analysis
sentiment_analysis/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (166 Bytes).
sentiment_analysis/__pycache__/sentiment_analysis.cpython-310.pyc
ADDED
Binary file (4.86 kB).
sentiment_analysis/config/stage1_models.json
ADDED
@@ -0,0 +1,16 @@
{
    "1": {
        "name": "DeBERTa v3 Base for Sequence Classification",
        "type": "hf_automodel_finetuned_dbt3",
        "module_path": "hmv_cfg_base_stage1.model1",
        "hf_location": "tachygraphy-microtrext-norm-org/DeBERTa-v3-seqClassfication-LV1-SentimentPolarities-Batch8",
        "tokenizer_class": "DebertaV2Tokenizer",
        "model_class": "DebertaV2ForSequenceClassification",
        "problem_type": "regression",
        "base_model": "microsoft/deberta-v3-base",
        "num_labels": 3,
        "device": "cpu",
        "load_function": "load_model",
        "predict_function": "predict"
    }
}
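This registry keeps the UI decoupled from model code: "module_path" names the Python module, and "load_function"/"predict_function" name the callables to resolve at runtime, which is what sentiment_analysis.py below does via importlib. A minimal resolution sketch (not part of this commit), assuming it runs with the sentiment_analysis directory on sys.path so that hmv_cfg_base_stage1 is importable:

import importlib
import json

with open("sentiment_analysis/config/stage1_models.json") as f:
    registry = json.load(f)

entry = registry["1"]
module = importlib.import_module(entry["module_path"])   # hmv_cfg_base_stage1.model1
load_model = getattr(module, entry["load_function"])     # resolves to model1.load_model
predict = getattr(module, entry["predict_function"])     # resolves to model1.predict
model, tokenizer = load_model()                          # downloads the HF checkpoint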
sentiment_analysis/hmv_cfg_base_stage1/__init__.py
ADDED
@@ -0,0 +1 @@
# from . import model1
sentiment_analysis/hmv_cfg_base_stage1/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (186 Bytes).
sentiment_analysis/hmv_cfg_base_stage1/__pycache__/model1.cpython-310.pyc
ADDED
Binary file (1.95 kB).
sentiment_analysis/hmv_cfg_base_stage1/imports.py
ADDED
@@ -0,0 +1,13 @@
import streamlit as st
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel, DebertaV2Tokenizer, DebertaV2ForSequenceClassification
import torch
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import pandas as pd
import json
import gc
import psutil
import os
import importlib.util
import sys
sentiment_analysis/hmv_cfg_base_stage1/model1.py
ADDED
@@ -0,0 +1,71 @@
import torch.nn as nn
from imports import *
import torch.nn.functional as F


BASE_DIR = os.path.dirname(os.path.abspath(__file__))
CONFIG_STAGE1 = os.path.join(BASE_DIR, "..", "config", "stage1_models.json")

MODEL_OPTIONS = {
    "1": {
        "name": "DeBERTa v3 Base for Sequence Classification",
        "type": "hf_automodel_finetuned_dbt3",
        "module_path": "hmv_cfg_base_stage1.model1",
        "hf_location": "tachygraphy-microtrext-norm-org/DeBERTa-v3-seqClassfication-LV1-SentimentPolarities-Batch8",
        "tokenizer_class": "DebertaV2Tokenizer",
        "model_class": "DebertaV2ForSequenceClassification",
        "problem_type": "regression",
        "base_model": "microsoft/deberta-v3-base",
        "num_labels": 3,
        "device": "cpu",
        "load_function": "load_model",
        "predict_function": "predict"
    }
}


@st.cache_resource
def load_model():
    model_key = "1"
    model_info = MODEL_OPTIONS[model_key]
    hf_location = model_info["hf_location"]

    tokenizer_class = globals()[model_info["tokenizer_class"]]
    model_class = globals()[model_info["model_class"]]
    tokenizer = tokenizer_class.from_pretrained(hf_location)
    model = model_class.from_pretrained(hf_location,
                                        problem_type=model_info["problem_type"],
                                        num_labels=model_info["num_labels"]
                                        )

    return model, tokenizer


def predict(text, model, tokenizer, device, max_len=128):
    # Tokenize and pad the input text
    inputs = tokenizer(
        text,
        add_special_tokens=True,
        padding=True,
        truncation=False,
        return_tensors="pt",
        return_token_type_ids=False,
    ).to(device)  # Move input tensors to the correct device

    with torch.no_grad():
        outputs = model(**inputs)

    # probabilities = outputs.logits.cpu().numpy()

    # Clamp the ReLU'd regression logits into [0, 1] so they read as per-label scores
    probabilities = torch.relu(outputs.logits)
    probabilities = torch.clamp(probabilities, min=0.0, max=1.0).cpu().numpy()
    # probabilities /= probabilities.sum()
    # probabilities = probabilities.cpu().numpy()

    return probabilities


if __name__ == "__main__":
    model, tokenizer = load_model()
    print("Model and tokenizer loaded successfully.")
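A minimal usage sketch for the load_model/predict pair above (not part of this commit; assumes the wildcard imports from imports.py are in scope and the Hugging Face checkpoint is downloadable). The result is a (1, 3) array of clamped scores in the label order negative, neutral, positive:

model, tokenizer = load_model()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

scores = predict("this app looks great", model, tokenizer, device)
print(scores)  # e.g. [[0.02 0.11 0.93]] -- illustrative values, one score per polarity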
sentiment_analysis/hmv_cfg_base_stage1/stage1_bert_architecture.py
ADDED
@@ -0,0 +1,26 @@
import torch.nn as nn

class BERT_architecture(nn.Module):

    def __init__(self, bert):
        super(BERT_architecture, self).__init__()
        self.bert = bert

        self.dropout = nn.Dropout(0.3)  # Increased dropout for regularization
        self.layer_norm = nn.LayerNorm(768)  # Layer normalization

        self.fc1 = nn.Linear(768, 256)  # Dense layer
        self.fc2 = nn.Linear(256, 3)  # Output layer with 3 classes

        self.relu = nn.ReLU()
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask, token_type_ids):
        _, cls_hs = self.bert(sent_id, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
        x = self.layer_norm(cls_hs)
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.softmax(x)
        return x
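BERT_architecture wraps an externally supplied encoder rather than instantiating one, and forward unpacks the pooled [CLS] vector from the tuple returned with return_dict=False, so the wrapped model must expose a 768-dimensional pooled output. A hypothetical instantiation sketch (bert-base-uncased is an assumption; this class is not wired into the stage-1 config in this commit):

from transformers import AutoModel, AutoTokenizer
import torch

bert = AutoModel.from_pretrained("bert-base-uncased")  # assumption: any encoder with a 768-dim pooler works
model = BERT_architecture(bert)

tok = AutoTokenizer.from_pretrained("bert-base-uncased")
batch = tok(["sample text"], return_tensors="pt")
log_probs = model(batch["input_ids"], batch["attention_mask"], batch["token_type_ids"])
print(log_probs.shape)  # torch.Size([1, 3]); LogSoftmax output, pairs with nn.NLLLoss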
sentiment_analysis/sentiment_analysis.py
ADDED
@@ -0,0 +1,296 @@
from imports import *
import importlib.util
import os
import sys
import joblib

sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), )))

# from hmv_cfg_base_stage1.model1 import load_model as load_model1
# from hmv_cfg_base_stage1.model1 import predict as predict1

BASE_DIR = os.path.dirname(os.path.abspath(__file__))
CONFIG_STAGE1 = os.path.join(BASE_DIR, "config", "stage1_models.json")
LOADERS_STAGE1 = os.path.join(BASE_DIR, "hmv_cfg_base_stage1")

# Load the model and tokenizer
# model_name = "tachygraphy-microtrext-norm-org/DeBERTa-v3-seqClassfication-LV1-SentimentPolarities-Batch8"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModel.from_pretrained(model_name)

SENTIMENT_POLARITY_LABELS = [
    "negative", "neutral", "positive"
]

current_model = None
current_tokenizer = None

# Enabling Resource caching
@st.cache_resource
def load_model_config():
    with open(CONFIG_STAGE1, "r") as f:
        model_data = json.load(f)

    model_options = {v["name"]: v for v in model_data.values()}  # Extract names for dropdown
    return model_data, model_options

MODEL_DATA, MODEL_OPTIONS = load_model_config()


# def load_model():
#     model = DebertaV2ForSequenceClassification.from_pretrained(model_name)
#     tokenizer = DebertaV2Tokenizer.from_pretrained(model_name)
#     return model, tokenizer


# ✅ Dynamically Import Model Functions
def import_from_module(module_name, function_name):
    try:
        module = importlib.import_module(module_name)
        return getattr(module, function_name)
    except (ModuleNotFoundError, AttributeError) as e:
        st.error(f"❌ Import Error: {e}")
        return None


def free_memory():
    # """Free up CPU & GPU memory before loading a new model."""
    global current_model, current_tokenizer

    if current_model is not None:
        del current_model  # Delete the existing model
        current_model = None  # Reset reference

    if current_tokenizer is not None:
        del current_tokenizer  # Delete the tokenizer
        current_tokenizer = None

    gc.collect()  # Force garbage collection for CPU memory

    if torch.cuda.is_available():
        torch.cuda.empty_cache()  # Free GPU memory
        torch.cuda.ipc_collect()  # Clean up PyTorch GPU cache

    # If running on CPU, reclaim memory using OS-level commands
    try:
        if torch.cuda.is_available() is False:
            psutil.virtual_memory()  # Refresh memory stats
    except Exception as e:
        print(f"Memory cleanup error: {e}")


def load_selected_model(model_name):
    global current_model, current_tokenizer

    free_memory()

    # st.write("DEBUG: Available Models:", MODEL_OPTIONS.keys())  # ✅ See available models
    # st.write("DEBUG: Selected Model:", MODEL_OPTIONS[model_name])  # ✅ Check selected model
    # st.write("DEBUG: Model Name:", model_name)  # ✅ Check selected model

    if model_name not in MODEL_OPTIONS:
        st.error(f"⚠️ Model '{model_name}' not found in config!")
        return None, None, None

    model_info = MODEL_OPTIONS[model_name]
    hf_location = model_info["hf_location"]

    model_module = model_info["module_path"]
    load_function = model_info["load_function"]
    predict_function = model_info["predict_function"]

    load_model_func = import_from_module(model_module, load_function)
    predict_func = import_from_module(model_module, predict_function)

    if load_model_func is None or predict_func is None:
        st.error("❌ Model functions could not be loaded!")
        return None, None, None

    model, tokenizer = load_model_func()

    current_model, current_tokenizer = model, tokenizer
    return model, tokenizer, predict_func

# def load_selected_model(model_name):
#     # """Load model and tokenizer based on user selection."""
#     global current_model, current_tokenizer

#     # Free memory before loading a new model
#     free_memory()

#     if model_name not in MODEL_OPTIONS:
#         st.error(f"⚠️ Model '{model_name}' not found in config!")
#         return None, None

#     model_info = MODEL_OPTIONS[model_name]
#     hf_location = model_info["hf_location"]

#     model_module = model_info["module_path"]
#     # load_function = "load_model"
#     # predict_function = "predict"

#     load_function = model_info["load_function"]
#     predict_function = model_info["predict_function"]

#     # tokenizer_class = globals()[model_info["tokenizer_class"]]
#     # model_class = globals()[model_info["model_class"]]

#     # tokenizer = tokenizer_class.from_pretrained(hf_location)

#     load_model_func = import_from_module(model_module, load_function)
#     predict_func = import_from_module(model_module, predict_function)

#     # # Load model
#     # if model_info["type"] == "custom_checkpoint" or model_info["type"] == "custom_model":
#     #     model = torch.load(hf_location, map_location="cpu")  # Load PyTorch model
#     # elif model_info["type"] == "hf_automodel_finetuned_dbt3":
#     #     tokenizer_class = globals()[model_info["tokenizer_class"]]
#     #     model_class = globals()[model_info["model_class"]]
#     #     tokenizer = tokenizer_class.from_pretrained(hf_location)
#     #     model = model_class.from_pretrained(hf_location,
#     #         problem_type=model_info["problem_type"],
#     #         num_labels=model_info["num_labels"]
#     #     )
#     # else:
#     #     st.error("Invalid model selection")
#     #     return None, None

#     if load_model_func is None or predict_func is None:
#         st.error("❌ Model functions could not be loaded!")
#         return None, None

#     # current_model, current_tokenizer = model, tokenizer  # Store references
#     # return model, tokenizer

#     model, tokenizer = load_model_func(hf_location)

#     current_model, current_tokenizer = model, tokenizer
#     return model, tokenizer, predict_func


def predict(text, model, tokenizer, device, max_len=128):
    # Tokenize and pad the input text
    inputs = tokenizer(
        text,
        add_special_tokens=True,
        padding=True,
        truncation=False,
        return_tensors="pt",
        return_token_type_ids=False,
    ).to(device)  # Move input tensors to the correct device

    with torch.no_grad():
        outputs = model(**inputs)

    # Return the raw regression logits as scores (no activation is applied here;
    # per-model predict functions handle any clamping)
    probabilities = outputs.logits.cpu().numpy()

    return probabilities

# def show_sentiment_analysis():

# Add your sentiment analysis code here

# user_input = st.text_input("Enter text for sentiment analysis:")
# user_input = st.text_area("Enter text for sentiment analysis:", height=200)
# user_input = st.text_area("Enter text for sentiment analysis:", max_chars=500)

def show_sentiment_analysis():
    st.title("Stage 1: Sentiment Polarity Analysis")
    st.write("This section will handle sentiment analysis.")

    if "selected_model" not in st.session_state:
        st.session_state.selected_model = list(MODEL_OPTIONS.keys())[0]  # Default selection

    if "clear_output" not in st.session_state:
        st.session_state.clear_output = False

    st.selectbox("Choose a model:", list(MODEL_OPTIONS.keys()), key="selected_model")

    selected_model = st.session_state.selected_model

    if selected_model not in MODEL_OPTIONS:
        st.error(f"❌ Selected model '{selected_model}' not found!")
        st.stop()

    st.session_state.clear_output = True  # Reset output when model changes

    # st.write("DEBUG: Available Models:", MODEL_OPTIONS.keys())  # ✅ See available models
    # st.write("DEBUG: Selected Model:", MODEL_OPTIONS[selected_model])  # ✅ Check selected model

    user_input = st.text_input("Enter text for sentiment analysis:")

    if user_input:
        # Make prediction

        # model, tokenizer = load_model()
        # model, tokenizer = load_selected_model(selected_model)

        model, tokenizer, predict_func = load_selected_model(selected_model)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        if model is None:
            st.error("⚠️ Error: Model failed to load! Check model selection or configuration.")
            st.stop()

        model.to(device)

        # predictions = predict(user_input, model, tokenizer, device)

        predictions = predict_func(user_input, model, tokenizer, device)

        # Squeeze predictions to remove extra dimensions
        predictions_array = predictions.squeeze()

        # Convert to binary predictions (argmax)
        binary_predictions = np.zeros_like(predictions_array)
        max_indices = np.argmax(predictions_array)
        binary_predictions[max_indices] = 1

        # Display raw predictions
        st.write(f"**Predicted Sentiment Scores:** {predictions_array}")

        # Display binary classification result
        st.write("**Predicted Sentiment:**")
        st.write(f"**NEGATIVE:** {binary_predictions[0]}, **NEUTRAL:** {binary_predictions[1]}, **POSITIVE:** {binary_predictions[2]}")
        # st.write(f"**NEUTRAL:** {binary_predictions[1]}")
        # st.write(f"**POSITIVE:** {binary_predictions[2]}")

        # 1️⃣ **Polar Plot (Plotly)**
        sentiment_polarities = predictions_array.tolist()
        fig_polar = px.line_polar(
            pd.DataFrame(dict(r=sentiment_polarities, theta=SENTIMENT_POLARITY_LABELS)),
            r='r', theta='theta', line_close=True
        )
        st.plotly_chart(fig_polar)

        # 2️⃣ **Normalized Horizontal Bar Chart (Matplotlib)**
        normalized_predictions = predictions_array / predictions_array.sum()

        fig, ax = plt.subplots(figsize=(8, 2))
        left = 0
        for i in range(len(normalized_predictions)):
            ax.barh(0, normalized_predictions[i], color=plt.cm.tab10(i), left=left, label=SENTIMENT_POLARITY_LABELS[i])
            left += normalized_predictions[i]

        # Configure the chart
        ax.set_xlim(0, 1)
        ax.set_yticks([])
        ax.set_xticks(np.arange(0, 1.1, 0.1))
        ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15), ncol=len(SENTIMENT_POLARITY_LABELS))
        plt.title("Sentiment Polarity Prediction Distribution")

        # Display in Streamlit
        st.pyplot(fig)


if __name__ == "__main__":
    show_sentiment_analysis()
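For clarity on the argmax-to-one-hot step in show_sentiment_analysis above, a tiny worked example (the scores are illustrative):

import numpy as np

predictions_array = np.array([0.10, 0.25, 0.90])  # illustrative [negative, neutral, positive] scores
binary_predictions = np.zeros_like(predictions_array)
binary_predictions[np.argmax(predictions_array)] = 1
print(binary_predictions)  # [0. 0. 1.] -> positive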
src/bq-helper/LICENSE
ADDED
@@ -0,0 +1,201 @@
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.
|
122 |
+
|
123 |
+
You may add Your own copyright statement to Your modifications and
|
124 |
+
may provide additional or different license terms and conditions
|
125 |
+
for use, reproduction, or distribution of Your modifications, or
|
126 |
+
for any such Derivative Works as a whole, provided Your use,
|
127 |
+
reproduction, and distribution of the Work otherwise complies with
|
128 |
+
the conditions stated in this License.
|
129 |
+
|
130 |
+
5. Submission of Contributions. Unless You explicitly state otherwise,
|
131 |
+
any Contribution intentionally submitted for inclusion in the Work
|
132 |
+
by You to the Licensor shall be under the terms and conditions of
|
133 |
+
this License, without any additional terms or conditions.
|
134 |
+
Notwithstanding the above, nothing herein shall supersede or modify
|
135 |
+
the terms of any separate license agreement you may have executed
|
136 |
+
with Licensor regarding such Contributions.
|
137 |
+
|
138 |
+
6. Trademarks. This License does not grant permission to use the trade
|
139 |
+
names, trademarks, service marks, or product names of the Licensor,
|
140 |
+
except as required for reasonable and customary use in describing the
|
141 |
+
origin of the Work and reproducing the content of the NOTICE file.
|
142 |
+
|
143 |
+
7. Disclaimer of Warranty. Unless required by applicable law or
|
144 |
+
agreed to in writing, Licensor provides the Work (and each
|
145 |
+
Contributor provides its Contributions) on an "AS IS" BASIS,
|
146 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
147 |
+
implied, including, without limitation, any warranties or conditions
|
148 |
+
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
149 |
+
PARTICULAR PURPOSE. You are solely responsible for determining the
|
150 |
+
appropriateness of using or redistributing the Work and assume any
|
151 |
+
risks associated with Your exercise of permissions under this License.
|
152 |
+
|
153 |
+
8. Limitation of Liability. In no event and under no legal theory,
|
154 |
+
whether in tort (including negligence), contract, or otherwise,
|
155 |
+
unless required by applicable law (such as deliberate and grossly
|
156 |
+
negligent acts) or agreed to in writing, shall any Contributor be
|
157 |
+
liable to You for damages, including any direct, indirect, special,
|
158 |
+
incidental, or consequential damages of any character arising as a
|
159 |
+
result of this License or out of the use or inability to use the
|
160 |
+
Work (including but not limited to damages for loss of goodwill,
|
161 |
+
work stoppage, computer failure or malfunction, or any and all
|
162 |
+
other commercial damages or losses), even if such Contributor
|
163 |
+
has been advised of the possibility of such damages.
|
164 |
+
|
165 |
+
9. Accepting Warranty or Additional Liability. While redistributing
|
166 |
+
the Work or Derivative Works thereof, You may choose to offer,
|
167 |
+
and charge a fee for, acceptance of support, warranty, indemnity,
|
168 |
+
or other liability obligations and/or rights consistent with this
|
169 |
+
License. However, in accepting such obligations, You may act only
|
170 |
+
on Your own behalf and on Your sole responsibility, not on behalf
|
171 |
+
of any other Contributor, and only if You agree to indemnify,
|
172 |
+
defend, and hold each Contributor harmless for any liability
|
173 |
+
incurred by, or claims asserted against, such Contributor by reason
|
174 |
+
of your accepting any such warranty or additional liability.
|
175 |
+
|
176 |
+
END OF TERMS AND CONDITIONS
|
177 |
+
|
178 |
+
APPENDIX: How to apply the Apache License to your work.
|
179 |
+
|
180 |
+
To apply the Apache License to your work, attach the following
|
181 |
+
boilerplate notice, with the fields enclosed by brackets "[]"
|
182 |
+
replaced with your own identifying information. (Don't include
|
183 |
+
the brackets!) The text should be enclosed in the appropriate
|
184 |
+
comment syntax for the file format. We also recommend that a
|
185 |
+
file or class name and description of purpose be included on the
|
186 |
+
same "printed page" as the copyright notice for easier
|
187 |
+
identification within third-party archives.
|
188 |
+
|
189 |
+
Copyright [yyyy] [name of copyright owner]
|
190 |
+
|
191 |
+
Licensed under the Apache License, Version 2.0 (the "License");
|
192 |
+
you may not use this file except in compliance with the License.
|
193 |
+
You may obtain a copy of the License at
|
194 |
+
|
195 |
+
http://www.apache.org/licenses/LICENSE-2.0
|
196 |
+
|
197 |
+
Unless required by applicable law or agreed to in writing, software
|
198 |
+
distributed under the License is distributed on an "AS IS" BASIS,
|
199 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
200 |
+
See the License for the specific language governing permissions and
|
201 |
+
limitations under the License.
|
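The Appendix above is easiest to follow by example. Here is a sketch of that boilerplate notice wrapped in Python comment syntax (matching the Python sources added by this commit); the year and owner are hypothetical placeholders, not values taken from this repository.

```python
# Copyright 2018 Jane Doe  (hypothetical year and owner; substitute your own)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
```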
src/bq-helper/README.md
ADDED
@@ -0,0 +1,25 @@
+## Summary
+
+BigQuery_Helper is a helper class that simplifies common read-only BigQuery tasks. It makes it easy to execute queries while you're learning SQL, and provides a convenient stepping stone on the path to using [the core BigQuery Python API](https://googlecloudplatform.github.io/google-cloud-python/latest/bigquery/reference.html). You can try it for yourself by forking [this Kaggle kernel](https://www.kaggle.com/sohier/introduction-to-the-bq-helper-package/).
+
+## Installation
+
+You can install BigQuery_Helper with the following command in your console:
+
+`pip install -e git+https://github.com/SohierDane/BigQuery_Helper#egg=bq_helper`
+
+If you aren't running BigQuery_Helper on [Kaggle](http://kaggle.com/), you will also need to go through the [standard BigQuery client setup and authentication process](https://cloud.google.com/bigquery/docs/reference/libraries).
+
+This repo has only been tested on Python 3.6+ and v0.29+ of the BigQuery API.
+
+## Changelog
+
+#### 0.4.0:
+- `BigQueryHelper.table_schema` has been overhauled. It now returns a pandas DataFrame and unrolls nested fields so that the results are in the format expected by queries. For example, the `github_repos.commits` nested field `author` now returns sub-field names such as `author.email`.
+
+#### 0.3.0:
+- Each helper instance now logs the total bytes counted towards your quota or bill across all queries run with that instance. You can access it with `BigQueryHelper.total_gb_used_net_cache`. Repeated queries are likely to hit the cache and may show up as 0 GB used.
+- Queries that take longer than the maximum wait time (default: 3 minutes) will be cancelled.
+- Contributing to bq_helper should be easier now that there is a set of tests.
+
+#### 0.2.0:
+- `query_to_pandas` now returns an empty DataFrame when the query returns no results. Previously, it returned `None`.
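To tie the installation and changelog notes together, here is a minimal usage sketch. It assumes BigQuery credentials are already configured as described above; the `bigquery-public-data` project and `openaq` dataset come from this repo's own test suite, and the query itself is only illustrative.

```python
from bq_helper import BigQueryHelper

# Read-only helper bound to one dataset; queries are billed to your own project.
bq = BigQueryHelper(active_project="bigquery-public-data", dataset_name="openaq")

query = """
    SELECT location, value
    FROM `bigquery-public-data.openaq.global_air_quality`
    LIMIT 100
"""

# Dry-run estimate of gigabytes scanned (cached results are not considered).
print(bq.estimate_query_size(query))

# Runs only if the estimate stays under max_gb_scanned; prints and returns None otherwise.
df = bq.query_to_pandas_safe(query, max_gb_scanned=1)

# Net-of-cache usage accumulated by this helper instance, in GB (see the 0.3.0 notes).
print(bq.total_gb_used_net_cache)
```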
src/bq-helper/__pycache__/version.cpython-312.pyc
ADDED
Binary file (190 Bytes).
src/bq-helper/bq_helper.py
ADDED
@@ -0,0 +1,172 @@
+"""
+Helper class to simplify common read-only BigQuery tasks.
+"""
+
+
+import time
+
+import numpy as np  # pandas removed the pd.np alias; use numpy directly
+import pandas as pd
+
+from google.cloud import bigquery
+
+
+class BigQueryHelper(object):
+    """
+    Helper class to simplify common BigQuery tasks like executing queries,
+    showing table schemas, etc. without worrying about table or dataset pointers.
+
+    See the BigQuery docs for details of the steps this class lets you skip:
+    https://googlecloudplatform.github.io/google-cloud-python/latest/bigquery/reference.html
+    """
+
+    def __init__(self, active_project, dataset_name, max_wait_seconds=180):
+        self.project_name = active_project
+        self.dataset_name = dataset_name
+        self.max_wait_seconds = max_wait_seconds
+        self.client = bigquery.Client()
+        self.__dataset_ref = self.client.dataset(self.dataset_name, project=self.project_name)
+        self.dataset = None
+        self.tables = dict()  # {table name (str): table object}
+        self.__table_refs = dict()  # {table name (str): table reference}
+        self.total_gb_used_net_cache = 0
+        self.BYTES_PER_GB = 2**30
+
+    def __fetch_dataset(self):
+        """
+        Lazy loading of dataset. For example,
+        if the user only calls `self.query_to_pandas` then the
+        dataset never has to be fetched.
+        """
+        if self.dataset is None:
+            self.dataset = self.client.get_dataset(self.__dataset_ref)
+
+    def __fetch_table(self, table_name):
+        """
+        Lazy loading of table.
+        """
+        self.__fetch_dataset()
+        if table_name not in self.__table_refs:
+            self.__table_refs[table_name] = self.dataset.table(table_name)
+        if table_name not in self.tables:
+            self.tables[table_name] = self.client.get_table(self.__table_refs[table_name])
+
+    def __handle_record_field(self, row, schema_details, top_level_name=''):
+        """
+        Unpack a single row, including any nested fields.
+        """
+        name = row['name']
+        if top_level_name != '':
+            name = top_level_name + '.' + name
+        schema_details.append([{
+            'name': name,
+            'type': row['type'],
+            'mode': row['mode'],
+            'fields': np.nan,
+            'description': row['description']
+        }])
+        # float check is to dodge row['fields'] == np.nan
+        if type(row.get('fields', 0.0)) == float:
+            return None
+        for entry in row['fields']:
+            self.__handle_record_field(entry, schema_details, name)
+
+    def __unpack_all_schema_fields(self, schema):
+        """
+        Unrolls nested schemas. Returns a DataFrame with one row per field,
+        and the field names in the format accepted by the API.
+        Results will look similar to the website schema, such as:
+        https://bigquery.cloud.google.com/table/bigquery-public-data:github_repos.commits?pli=1
+
+        Args:
+            schema: DataFrame derived from the api repr of the raw table.schema
+        Returns:
+            DataFrame of the unrolled schema.
+        """
+        schema_details = []
+        schema.apply(lambda row:
+                     self.__handle_record_field(row, schema_details), axis=1)
+        result = pd.concat([pd.DataFrame.from_dict(x) for x in schema_details])
+        result.reset_index(drop=True, inplace=True)
+        del result['fields']
+        return result
+
+    def table_schema(self, table_name):
+        """
+        Get the schema for a specific table from a dataset.
+        Unrolls nested field names into the format that can be copied
+        directly into queries. For example, for the `github.commits` table,
+        this will return `committer.name`.
+
+        This is a very different return signature than BigQuery's table.schema.
+        """
+        self.__fetch_table(table_name)
+        raw_schema = self.tables[table_name].schema
+        schema = pd.DataFrame.from_dict([x.to_api_repr() for x in raw_schema])
+        # the api_repr only has the fields column for tables with nested data
+        if 'fields' in schema.columns:
+            schema = self.__unpack_all_schema_fields(schema)
+        # Set the column order
+        schema = schema[['name', 'type', 'mode', 'description']]
+        return schema
+
+    def list_tables(self):
+        """
+        List the names of the tables in a dataset.
+        """
+        self.__fetch_dataset()
+        return [x.table_id for x in self.client.list_tables(self.dataset)]
+
+    def estimate_query_size(self, query):
+        """
+        Estimate gigabytes scanned by query.
+        Does not consider if there is a cached query table.
+        See https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.dryRun
+        """
+        my_job_config = bigquery.job.QueryJobConfig()
+        my_job_config.dry_run = True
+        my_job = self.client.query(query, job_config=my_job_config)
+        return my_job.total_bytes_processed / self.BYTES_PER_GB
+
+    def query_to_pandas(self, query):
+        """
+        Execute a SQL query and return a pandas DataFrame.
+        """
+        my_job = self.client.query(query)
+        start_time = time.time()
+        while not my_job.done():
+            if (time.time() - start_time) > self.max_wait_seconds:
+                print("Max wait time elapsed, query cancelled.")
+                self.client.cancel_job(my_job.job_id)
+                return None
+            time.sleep(0.1)
+        # Queries that hit errors will return an exception type.
+        # Those exceptions don't get raised until we call my_job.to_dataframe().
+        # In that case, my_job.total_bytes_billed can be called but is None.
+        if my_job.total_bytes_billed:
+            self.total_gb_used_net_cache += my_job.total_bytes_billed / self.BYTES_PER_GB
+        return my_job.to_dataframe()
+
+    def query_to_pandas_safe(self, query, max_gb_scanned=1):
+        """
+        Execute a query, but only if the query would scan less than `max_gb_scanned` of data.
+        """
+        query_size = self.estimate_query_size(query)
+        if query_size <= max_gb_scanned:
+            return self.query_to_pandas(query)
+        msg = "Query cancelled; estimated size of {0} exceeds limit of {1} GB"
+        print(msg.format(query_size, max_gb_scanned))
+
+    def head(self, table_name, num_rows=5, start_index=None, selected_columns=None):
+        """
+        Get the first n rows of a table as a DataFrame.
+        Does not perform a full table scan; should use a trivial amount of data as long as n is small.
+        """
+        self.__fetch_table(table_name)
+        active_table = self.tables[table_name]
+        schema_subset = None
+        if selected_columns:
+            schema_subset = [col for col in active_table.schema if col.name in selected_columns]
+        results = self.client.list_rows(active_table, selected_fields=schema_subset,
+                                        max_results=num_rows, start_index=start_index)
+        results = [x for x in results]
+        return pd.DataFrame(
+            data=[list(x.values()) for x in results], columns=list(results[0].keys()))
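As a quick illustration of the schema-exploration methods defined above, here is a sketch under the same assumptions as the README example (working credentials, and the public `openaq` dataset used by this package's tests):

```python
from bq_helper import BigQueryHelper

bq = BigQueryHelper("bigquery-public-data", "openaq")

# Dataset and table metadata are fetched lazily, only when first needed.
print(bq.list_tables())

# One DataFrame row per field, with nested names unrolled (e.g. `author.email`).
print(bq.table_schema("global_air_quality"))

# Preview a few rows via list_rows, so no full table scan is billed.
print(bq.head("global_air_quality", num_rows=5,
              selected_columns=["location", "value"]))
```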
src/bq-helper/setup.py
ADDED
@@ -0,0 +1,13 @@
+from setuptools import setup
+from version import __version__ as version
+
+
+setup(name='bq_helper',
+      version=version,
+      description='Helper class to simplify common read-only BigQuery tasks.',
+      author='Sohier Dane',
+      url='https://github.com/SohierDane/BigQuery_Helper',
+      license='Apache 2.0',
+      install_requires=['pandas', 'google-cloud-bigquery'],
+      classifiers=['Programming Language :: Python :: 3'],
+      )
src/bq-helper/test_helper.py
ADDED
@@ -0,0 +1,75 @@
+"""
+Tests all public methods of the BigQueryHelper class.
+
+Run from the command line with:
+python -m unittest test_helper.py
+
+
+BILLING WARNING:
+Running these tests requires a working BigQuery account and MAY CAUSE CHARGES.
+However, the dataset used for the tests is only ~2 MB, so any charges should
+be very minimal. The downside is that this particular dataset is completely
+refreshed every hour, so it's not possible to check for any specific return values.
+
+For details on the test dataset, please see:
+https://bigquery.cloud.google.com/table/bigquery-public-data:openaq.global_air_quality?tab=details
+"""
+
+
+import unittest
+
+
+from bq_helper import BigQueryHelper
+from google.api_core.exceptions import BadRequest
+from pandas.core.frame import DataFrame
+from random import random
+
+
+class TestBQHelper(unittest.TestCase):
+    def setUp(self):
+        self.my_bq = BigQueryHelper("bigquery-public-data", "openaq")
+        self.query = "SELECT location FROM `bigquery-public-data.openaq.global_air_quality`"
+        # Query randomized so it won't hit the cache across multiple test runs
+        self.randomizable_query = """
+            SELECT value FROM `bigquery-public-data.openaq.global_air_quality`
+            WHERE value = {0}"""
+
+    def test_list_tables(self):
+        self.assertEqual(self.my_bq.list_tables(), ['global_air_quality'])
+
+    def test_list_schema(self):
+        self.assertEqual(len(self.my_bq.table_schema('global_air_quality')), 11)
+
+    def test_estimate_query_size(self):
+        self.assertIsInstance(self.my_bq.estimate_query_size(self.query), float)
+
+    def test_query_to_pandas(self):
+        self.assertIsInstance(self.my_bq.query_to_pandas(self.query), DataFrame)
+
+    def test_query_safe_passes(self):
+        self.assertIsInstance(self.my_bq.query_to_pandas_safe(self.query), DataFrame)
+
+    def test_query_safe_fails(self):
+        # A different query must be used for this test to ensure we don't hit the
+        # cache and end up passing by testing a query that would use zero bytes.
+        fail_query = self.randomizable_query.format(random())
+        self.assertIsNone(self.my_bq.query_to_pandas_safe(fail_query, 10**-10))
+
+    def test_head(self):
+        self.assertIsInstance(self.my_bq.head('global_air_quality'), DataFrame)
+
+    def test_usage_tracker(self):
+        self.my_bq.query_to_pandas(self.randomizable_query.format(random()))
+        self.assertNotEqual(self.my_bq.total_gb_used_net_cache, 0)
+
+    def test_bad_query_raises_right_error(self):
+        with self.assertRaises(BadRequest):
+            self.my_bq.query_to_pandas("Not a valid query")
+
+    def test_list_nested_schema(self):
+        nested_helper = BigQueryHelper("bigquery-public-data", "github_repos")
+        self.assertEqual(len(nested_helper.table_schema('commits')), 33)
+
+
+if __name__ == '__main__':
+    unittest.main()
src/bq-helper/version.py
ADDED
@@ -0,0 +1 @@
+__version__ = '0.4.1'