Archisman Karmakar committed on
Commit
4a97943
·
unverified ·
1 Parent(s): 179dfdb

stage1 deberta for seq classification hft model

Browse files
app_main_hf.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ import sys
4
+ import joblib
5
+ import importlib.util
6
+
7
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), )))
8
+
9
+ from dashboard import show_dashboard
10
+ from sentiment_analysis.sentiment_analysis import show_sentiment_analysis
11
+ from emotion_analysis import show_emotion_analysis
12
+ # from text_transformation import show_text_transformation
13
+
14
+
15
def main():
    """Sidebar-driven router: render the page for whichever stage is selected."""
    st.sidebar.title("Navigation")
    pages = [
        "Dashboard",
        "Stage 1: Sentiment Polarity Analysis",
        "Stage 2: Emotion Mood-tag Analysis",
        "Stage 3: Text Transformation & Normalization",
    ]
    selection = st.sidebar.radio("Go to", pages)

    if selection == pages[0]:
        show_dashboard()
    elif selection == pages[1]:
        show_sentiment_analysis()
    elif selection == pages[2]:
        # show_emotion_analysis() is not wired up yet.
        st.write("This section is under development.")
    elif selection == pages[3]:
        # show_text_transformation() is not wired up yet.
        st.write("This section is under development.")

    st.sidebar.title("About")
    st.sidebar.info("""
    **Contributors:**
    - Archisman Karmakar
      - [LinkedIn](https://www.linkedin.com/in/archismankarmakar/)
      - [GitHub](https://www.github.com/ArchismanKarmakar)
      - [Kaggle](https://www.kaggle.com/archismancoder)
    - Sumon Chatterjee
      - [LinkedIn](https://www.linkedin.com/in/sumon-chatterjee-3b3b43227)
      - [GitHub](https://github.com/Sumon670)
      - [Kaggle](https://www.kaggle.com/sumonchatterjee)

    **Mentors:**
    - Prof. Anupam Mondal
      - [LinkedIn](https://www.linkedin.com/in/anupam-mondal-ph-d-8a7a1a39/)
      - [Google Scholar](https://scholar.google.com/citations?user=ESRR9o4AAAAJ&hl=en)
      - [Website](https://sites.google.com/view/anupammondal/home)
    - Prof. Sainik Kumar Mahata
      - [LinkedIn](https://www.linkedin.com/in/mahatasainikk)
      - [Google Scholar](https://scholar.google.co.in/citations?user=OcJDM50AAAAJ&hl=en)
      - [Website](https://sites.google.com/view/sainik-kumar-mahata/home)

    This is our research project for our B.Tech final year and a journal which is yet to be published.
    """)


if __name__ == "__main__":
    main()
dashboard.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
def show_dashboard():
    """Landing page: project title plus a short overview of the three stages."""
    st.title("Tachygraphy Micro-text Analysis & Normalization")
    st.write("""
    Welcome to the Tachygraphy Micro-text Analysis & Normalization Project. This application is designed to analyze text data through three stages:
    1. Sentiment Polarity Analysis
    2. Emotion Mood-tag Analysis
    3. Text Transformation & Normalization
    """)
emotion_analysis.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
def show_emotion_analysis():
    """Stage 2 page: placeholder UI for the emotion mood-tag detector."""
    st.title("Stage 2: Emotion Mood-tag Analysis")
    st.write("This section will handle emotion detection.")
    # Add your emotion detection code here


if __name__ == "__main__":
    show_emotion_analysis()
imports.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel, DebertaV2Tokenizer, DebertaV2ForSequenceClassification
3
+ import torch
4
+ import numpy as np
5
+ import matplotlib.pyplot as plt
6
+ import plotly.express as px
7
+ import pandas as pd
8
+ import json
9
+ import gc
10
+ import psutil
11
+ import os
12
+ import importlib.util
13
+ import sys
requirements.txt ADDED
@@ -0,0 +1,823 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==1.4.0
2
+ accelerate==1.2.1
3
+ aiofiles==22.1.0
4
+ aiohappyeyeballs==2.4.6
5
+ aiohttp==3.11.12
6
+ aiosignal==1.3.2
7
+ aiosqlite==0.21.0
8
+ alabaster==1.0.0
9
+ albucore==0.0.19
10
+ albumentations==1.4.20
11
+ alembic==1.14.1
12
+ altair==5.5.0
13
+ annotated-types==0.7.0
14
+ annoy==1.17.3
15
+ ansicolors==1.1.8
16
+ antlr4-python3-runtime==4.9.3
17
+ anyio==3.7.1
18
+ argon2-cffi==23.1.0
19
+ argon2-cffi-bindings==21.2.0
20
+ args==0.1.0
21
+ array_record==0.5.1
22
+ arrow==1.3.0
23
+ arviz==0.20.0
24
+ astropy==6.1.7
25
+ astropy-iers-data==0.2024.12.16.0.35.48
26
+ asttokens==3.0.0
27
+ astunparse==1.6.3
28
+ async-timeout==5.0.1
29
+ atpublic==4.1.0
30
+ attrs==25.1.0
31
+ audioread==3.0.1
32
+ autograd==1.7.0
33
+ babel==2.16.0
34
+ backcall==0.2.0
35
+ bayesian-optimization==2.0.3
36
+ beautifulsoup4==4.12.3
37
+ betterproto==2.0.0b6
38
+ bigframes==1.29.0
39
+ bigquery-magics==0.4.0
40
+ bleach==6.2.0
41
+ blinker==1.9.0
42
+ blis==0.7.11
43
+ blobfile==3.0.0
44
+ blosc2==2.7.1
45
+ bokeh==3.6.2
46
+ Boruta==0.4.3
47
+ boto3==1.36.23
48
+ botocore==1.36.23
49
+ Bottleneck==1.4.2
50
+ -e git+https://github.com/SohierDane/BigQuery_Helper@8615a7f6c1663e7f2d48aa2b32c2dbcb600a440f#egg=bq_helper
51
+ bqplot==0.12.43
52
+ branca==0.8.1
53
+ bs4==0.0.2
54
+ CacheControl==0.14.1
55
+ cachetools==5.5.0
56
+ Cartopy==0.24.1
57
+ catalogue==2.0.10
58
+ catboost==1.2.7
59
+ category_encoders==2.7.0
60
+ certifi==2025.1.31
61
+ cesium==0.12.1
62
+ cffi==1.17.1
63
+ chardet==5.2.0
64
+ charset-normalizer==3.4.1
65
+ Chessnut==0.4.1
66
+ chex==0.1.88
67
+ clarabel==0.9.0
68
+ click==8.1.7
69
+ click-plugins==1.1.1
70
+ cligj==0.7.2
71
+ clint==0.5.1
72
+ cloudpathlib==0.20.0
73
+ cloudpickle==3.1.0
74
+ cmake==3.31.2
75
+ cmdstanpy==1.2.5
76
+ colorama==0.4.6
77
+ colorcet==3.1.0
78
+ colorlog==6.9.0
79
+ colorlover==0.3.0
80
+ colour==0.1.5
81
+ comm==0.2.2
82
+ community==1.0.0b1
83
+ confection==0.1.5
84
+ cons==0.4.6
85
+ contourpy==1.3.1
86
+ coverage==7.6.12
87
+ cryptography==44.0.1
88
+ cuda-bindings==12.8.0
89
+ cuda-python==12.8.0
90
+ cudf-cu12==25.2.0
91
+ cufflinks==0.17.3
92
+ cuml-cu12==25.2.0
93
+ cupy-cuda12x==12.2.0
94
+ cuvs-cu12==25.2.0
95
+ cvxopt==1.3.2
96
+ cvxpy==1.6.0
97
+ cycler==0.12.1
98
+ cymem==2.0.10
99
+ Cython==3.0.11
100
+ cytoolz==1.0.1
101
+ daal==2025.2.0
102
+ dacite==1.9.2
103
+ dask==2024.12.1
104
+ dask-cuda==25.2.0
105
+ dask-cudf-cu12==25.2.0
106
+ dask-expr==1.1.21
107
+ dataclasses-json==0.6.7
108
+ datascience==0.17.6
109
+ datasets==3.3.1
110
+ datashader==0.17.0
111
+ db-dtypes==1.3.1
112
+ dbus-python==1.2.18
113
+ deap==1.4.2
114
+ debugpy==1.8.0
115
+ decorator==4.4.2
116
+ deepdiff==8.2.0
117
+ deepspeed==0.16.4
118
+ defusedxml==0.7.1
119
+ Deprecated==1.2.15
120
+ diffusers==0.31.0
121
+ dill==0.3.8
122
+ dipy==1.10.0
123
+ distributed==2024.12.1
124
+ distributed-ucxx-cu12==0.42.0
125
+ distro==1.9.0
126
+ dlib==19.24.2
127
+ dm-tree==0.1.8
128
+ dnspython==2.7.0
129
+ docker==7.1.0
130
+ docker-pycreds==0.4.0
131
+ docstring-to-markdown==0.15
132
+ docstring_parser==0.16
133
+ docutils==0.21.2
134
+ dopamine_rl==4.1.0
135
+ duckdb==1.1.3
136
+ earthengine-api==1.4.3
137
+ easydict==1.13
138
+ easyocr==1.7.2
139
+ editdistance==0.8.1
140
+ eerepr==0.0.4
141
+ einops==0.8.0
142
+ eli5==0.13.0
143
+ email_validator==2.2.0
144
+ emoji==2.14.1
145
+ en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl#sha256=86cc141f63942d4b2c5fcee06630fd6f904788d2f0ab005cce45aadb8fb73889
146
+ entrypoints==0.4
147
+ et_xmlfile==2.0.0
148
+ etils==1.11.0
149
+ etuples==0.3.9
150
+ eval_type_backport==0.2.0
151
+ evaluate==0.4.3
152
+ exceptiongroup==1.2.2
153
+ execnb==0.1.11
154
+ Farama-Notifications==0.0.4
155
+ fastai==2.7.18
156
+ fastcore==1.7.27
157
+ fastdownload==0.0.7
158
+ fastjsonschema==2.21.1
159
+ fastprogress==1.0.3
160
+ fastrlock==0.8.2
161
+ fasttext==0.9.3
162
+ featuretools==1.31.0
163
+ filelock==3.17.0
164
+ fiona==1.10.1
165
+ firebase-admin==6.6.0
166
+ Flask==3.1.0
167
+ flatbuffers==24.3.25
168
+ flax==0.8.5
169
+ folium==0.19.2
170
+ fonttools==4.55.3
171
+ fqdn==1.5.1
172
+ frozendict==2.4.6
173
+ frozenlist==1.5.0
174
+ fsspec==2024.12.0
175
+ funcy==2.0
176
+ fury==0.12.0
177
+ future==1.0.0
178
+ fuzzywuzzy==0.18.0
179
+ gast==0.6.0
180
+ gatspy==0.3
181
+ gcsfs==2024.10.0
182
+ GDAL==3.6.4
183
+ gdown==5.2.0
184
+ geemap==0.35.1
185
+ gensim==4.3.3
186
+ geocoder==1.38.1
187
+ geographiclib==2.0
188
+ geojson==3.2.0
189
+ geopandas==0.14.4
190
+ geopy==2.4.1
191
+ ghapi==1.0.6
192
+ gin-config==0.5.0
193
+ gitdb==4.0.11
194
+ GitPython==3.1.43
195
+ glob2==0.7
196
+ google==2.0.3
197
+ google-ai-generativelanguage==0.6.10
198
+ google-api-core==1.34.1
199
+ google-api-python-client==2.155.0
200
+ google-auth==2.27.0
201
+ google-auth-httplib2==0.2.0
202
+ google-auth-oauthlib==1.2.1
203
+ google-cloud-aiplatform==1.74.0
204
+ google-cloud-automl==1.0.1
205
+ google-cloud-bigquery==3.25.0
206
+ google-cloud-bigquery-connection==1.17.0
207
+ google-cloud-bigtable==2.27.0
208
+ google-cloud-core==2.4.1
209
+ google-cloud-datastore==2.20.2
210
+ google-cloud-firestore==2.19.0
211
+ google-cloud-functions==1.19.0
212
+ google-cloud-iam==2.17.0
213
+ google-cloud-language==2.16.0
214
+ google-cloud-pubsub==2.27.1
215
+ google-cloud-resource-manager==1.14.0
216
+ google-cloud-storage==2.14.0
217
+ google-cloud-translate==3.12.1
218
+ google-cloud-videointelligence==2.16.0
219
+ google-cloud-vision==3.10.0
220
+ google-colab @ file:///colabtools/dist/google_colab-1.0.0.tar.gz
221
+ google-crc32c==1.6.0
222
+ google-genai==0.2.2
223
+ google-generativeai==0.8.3
224
+ google-pasta==0.2.0
225
+ google-resumable-media==2.7.2
226
+ googleapis-common-protos==1.66.0
227
+ googledrivedownloader==0.4
228
+ gpxpy==1.6.2
229
+ graphviz==0.20.3
230
+ greenlet==3.1.1
231
+ grpc-google-iam-v1==0.13.1
232
+ grpcio==1.68.1
233
+ grpcio-status==1.48.2
234
+ grpclib==0.4.8rc2
235
+ gspread==6.0.2
236
+ gspread-dataframe==3.3.1
237
+ gym==0.25.2
238
+ gym-notices==0.0.8
239
+ gymnasium==0.29.0
240
+ h11==0.14.0
241
+ h2==4.2.0
242
+ h2o==3.46.0.6
243
+ h5netcdf==1.4.1
244
+ h5py==3.12.1
245
+ haversine==2.9.0
246
+ hep_ml==0.7.3
247
+ hf_transfer==0.1.9
248
+ hjson==3.1.0
249
+ holidays==0.63
250
+ holoviews==1.20.0
251
+ hpack==4.1.0
252
+ html5lib==1.1
253
+ htmlmin==0.1.12
254
+ httpcore==1.0.7
255
+ httpimport==1.4.0
256
+ httplib2==0.22.0
257
+ httpx==0.28.1
258
+ huggingface-hub==0.29.0
259
+ humanize==4.11.0
260
+ hyperframe==6.1.0
261
+ hyperopt==0.2.7
262
+ ibis-framework==9.2.0
263
+ id==1.5.0
264
+ idna==3.10
265
+ igraph==0.11.8
266
+ ImageHash==4.3.1
267
+ imageio==2.36.1
268
+ imageio-ffmpeg==0.5.1
269
+ imagesize==1.4.1
270
+ imbalanced-learn==0.12.4
271
+ imgaug==0.4.0
272
+ immutabledict==4.2.1
273
+ importlib-resources==5.13.0
274
+ importlib_metadata==8.5.0
275
+ imutils==0.5.4
276
+ in-toto-attestation==0.9.3
277
+ inflect==7.4.0
278
+ iniconfig==2.0.0
279
+ intel-cmplr-lib-rt==2024.2.0
280
+ intel-cmplr-lib-ur==2024.2.0
281
+ intel-openmp==2024.2.0
282
+ ipyevents==2.0.2
283
+ ipyfilechooser==0.6.0
284
+ ipykernel==5.5.6
285
+ ipyleaflet==0.19.2
286
+ ipympl==0.9.6
287
+ ipyparallel==8.8.0
288
+ ipython==7.34.0
289
+ ipython-genutils==0.2.0
290
+ ipython-sql==0.5.0
291
+ ipytree==0.2.2
292
+ ipywidgets==8.1.5
293
+ isoduration==20.11.0
294
+ isoweek==1.3.3
295
+ itsdangerous==2.2.0
296
+ Janome==0.5.0
297
+ jax==0.4.33
298
+ jax-cuda12-pjrt==0.4.33
299
+ jax-cuda12-plugin==0.4.33
300
+ jaxlib==0.4.33
301
+ jedi==0.19.2
302
+ jeepney==0.7.1
303
+ jellyfish==1.1.0
304
+ jieba==0.42.1
305
+ Jinja2==3.1.4
306
+ jiter==0.8.2
307
+ jmespath==1.0.1
308
+ joblib==1.4.2
309
+ json5==0.10.0
310
+ jsonpatch==1.33
311
+ jsonpickle==4.0.1
312
+ jsonpointer==3.0.0
313
+ jsonschema==4.23.0
314
+ jsonschema-specifications==2024.10.1
315
+ jupyter-console==6.1.0
316
+ jupyter-events==0.12.0
317
+ jupyter-leaflet==0.19.2
318
+ jupyter-lsp==1.5.1
319
+ jupyter-ydoc==0.2.5
320
+ jupyter_client==8.6.3
321
+ jupyter_core==5.7.2
322
+ jupyter_server==2.12.5
323
+ jupyter_server_fileid==0.9.3
324
+ jupyter_server_terminals==0.5.3
325
+ jupyter_server_ydoc==0.8.0
326
+ jupyterlab==3.6.8
327
+ jupyterlab-lsp==3.10.2
328
+ jupyterlab_pygments==0.3.0
329
+ jupyterlab_server==2.27.3
330
+ jupyterlab_widgets==3.0.13
331
+ kaggle==1.6.17
332
+ kaggle-environments==1.16.11
333
+ kagglehub==0.3.9
334
+ keras==3.5.0
335
+ keras-core==0.1.7
336
+ keras-cv==0.9.0
337
+ keras-hub==0.18.1
338
+ keras-nlp==0.18.1
339
+ keras-tuner==1.4.7
340
+ keyring==23.5.0
341
+ kiwisolver==1.4.7
342
+ kornia==0.8.0
343
+ kornia_rs==0.1.8
344
+ kt-legacy==1.0.5
345
+ langchain==0.3.12
346
+ langchain-core==0.3.25
347
+ langchain-text-splitters==0.3.3
348
+ langcodes==3.5.0
349
+ langid==1.1.6
350
+ langsmith==0.2.3
351
+ language_data==1.3.0
352
+ launchpadlib==1.10.16
353
+ lazr.restfulclient==0.14.4
354
+ lazr.uri==1.0.6
355
+ lazy_loader==0.4
356
+ learntools @ git+https://github.com/Kaggle/learntools@010e3b5035354e15c073a0aca9e202c2e2beb742
357
+ leven==1.0.4
358
+ libclang==18.1.1
359
+ libcudf-cu12==25.2.0
360
+ libcuml-cu12==25.2.0
361
+ libcuvs-cu12==25.2.0
362
+ libkvikio-cu12==25.2.0
363
+ libpysal==4.9.2
364
+ libraft-cu12==25.2.0
365
+ librosa==0.10.2.post1
366
+ libucx-cu12==1.18.0
367
+ libucxx-cu12==0.42.0
368
+ lightgbm @ file:///tmp/lightgbm/lightgbm-4.5.0-py3-none-linux_x86_64.whl
369
+ lightning-utilities==0.12.0
370
+ lime==0.2.0.1
371
+ line_profiler==4.2.0
372
+ linkify-it-py==2.0.3
373
+ llvmlite==0.43.0
374
+ lml==0.1.0
375
+ locket==1.0.0
376
+ logical-unification==0.4.6
377
+ lxml==5.3.0
378
+ Mako==1.3.9
379
+ mamba==0.11.3
380
+ marisa-trie==1.2.1
381
+ Markdown==3.7
382
+ markdown-it-py==3.0.0
383
+ MarkupSafe==3.0.2
384
+ marshmallow==3.26.1
385
+ matplotlib==3.7.5
386
+ matplotlib-inline==0.1.7
387
+ matplotlib-venn==1.1.1
388
+ mdit-py-plugins==0.4.2
389
+ mdurl==0.1.2
390
+ miniKanren==1.0.3
391
+ missingno==0.5.2
392
+ mistune==0.8.4
393
+ mizani==0.13.1
394
+ mkl==2025.0.1
395
+ mkl-fft==1.3.8
396
+ mkl-random==1.2.4
397
+ mkl-service==2.4.1
398
+ mkl-umath==0.1.1
399
+ ml-dtypes==0.4.1
400
+ mlcrate==0.2.0
401
+ mlxtend==0.23.3
402
+ mne==1.9.0
403
+ model-signing==0.2.0
404
+ more-itertools==10.5.0
405
+ moviepy==1.0.3
406
+ mpld3==0.5.10
407
+ mpmath==1.3.0
408
+ msgpack==1.1.0
409
+ multidict==6.1.0
410
+ multimethod==1.12
411
+ multipledispatch==1.0.0
412
+ multiprocess==0.70.16
413
+ multitasking==0.0.11
414
+ murmurhash==1.0.11
415
+ music21==9.3.0
416
+ mypy-extensions==1.0.0
417
+ namex==0.0.8
418
+ narwhals==1.18.4
419
+ natsort==8.4.0
420
+ nbclassic==1.1.0
421
+ nbclient==0.5.13
422
+ nbconvert==6.4.5
423
+ nbdev==2.3.34
424
+ nbformat==5.10.4
425
+ ndindex==1.9.2
426
+ nest-asyncio==1.6.0
427
+ networkx==3.4.2
428
+ nibabel==5.3.2
429
+ nilearn==0.10.4
430
+ ninja==1.11.1.3
431
+ nltk==3.2.4
432
+ nose==1.3.7
433
+ notebook==6.5.4
434
+ notebook_shim==0.2.4
435
+ numba==0.60.0
436
+ numba-cuda==0.2.0
437
+ numexpr==2.10.2
438
+ numpy==1.26.4
439
+ nvidia-cublas-cu12==12.6.4.1
440
+ nvidia-cuda-cupti-cu12==12.6.80
441
+ nvidia-cuda-nvcc-cu12==12.6.85
442
+ nvidia-cuda-runtime-cu12==12.6.77
443
+ nvidia-cudnn-cu12==9.6.0.74
444
+ nvidia-cufft-cu12==11.3.0.4
445
+ nvidia-curand-cu12==10.3.7.77
446
+ nvidia-cusolver-cu12==11.7.1.2
447
+ nvidia-cusparse-cu12==12.5.4.2
448
+ nvidia-ml-py==12.570.86
449
+ nvidia-nccl-cu12==2.23.4
450
+ nvidia-nvcomp-cu12==4.1.0.6
451
+ nvidia-nvjitlink-cu12==12.6.85
452
+ nvtx==0.2.10
453
+ nx-cugraph-cu12 @ https://pypi.nvidia.com/nx-cugraph-cu12/nx_cugraph_cu12-24.10.0-py3-none-any.whl
454
+ oauth2client==4.1.3
455
+ oauthlib==3.2.2
456
+ odfpy==1.4.1
457
+ olefile==0.47
458
+ omegaconf==2.3.0
459
+ onnx==1.17.0
460
+ openai==1.57.4
461
+ opencv-contrib-python==4.10.0.84
462
+ opencv-python==4.10.0.84
463
+ opencv-python-headless==4.10.0.84
464
+ openpyxl==3.1.5
465
+ openslide-bin==4.0.0.6
466
+ openslide-python==1.4.1
467
+ opentelemetry-api==1.29.0
468
+ opentelemetry-sdk==1.29.0
469
+ opentelemetry-semantic-conventions==0.50b0
470
+ opt_einsum==3.4.0
471
+ optax==0.2.4
472
+ optim==0.1.0
473
+ optree==0.13.1
474
+ optuna==4.2.1
475
+ orbax-checkpoint==0.6.4
476
+ orderly-set==5.3.0
477
+ orjson==3.10.12
478
+ osqp==0.6.7.post3
479
+ overrides==7.7.0
480
+ packaging==24.2
481
+ pandas==2.2.3
482
+ pandas-datareader==0.10.0
483
+ pandas-gbq==0.25.0
484
+ pandas-profiling==3.6.6
485
+ pandas-stubs==2.2.2.240909
486
+ pandasql==0.7.3
487
+ pandocfilters==1.5.1
488
+ panel==1.5.4
489
+ papermill==2.6.0
490
+ param==2.2.0
491
+ parso==0.8.4
492
+ parsy==2.1
493
+ partd==1.4.2
494
+ path==17.1.0
495
+ path.py==12.5.0
496
+ pathlib==1.0.1
497
+ pathos==0.3.2
498
+ patsy==1.0.1
499
+ pdf2image==1.17.0
500
+ peewee==3.17.8
501
+ peft==0.14.0
502
+ pettingzoo==1.24.0
503
+ pexpect==4.9.0
504
+ phik==0.12.4
505
+ pickleshare==0.7.5
506
+ pillow==11.0.0
507
+ platformdirs==4.3.6
508
+ plotly==5.24.1
509
+ plotly-express==0.4.1
510
+ plotnine==0.14.4
511
+ pluggy==1.5.0
512
+ ply==3.11
513
+ polars==1.9.0
514
+ pooch==1.8.2
515
+ portalocker==3.1.1
516
+ portpicker==1.5.2
517
+ pox==0.3.5
518
+ ppft==1.7.6.9
519
+ preprocessing==0.1.13
520
+ preshed==3.0.9
521
+ prettytable==3.12.0
522
+ proglog==0.1.10
523
+ progressbar2==4.5.0
524
+ prometheus_client==0.21.1
525
+ promise==2.3
526
+ prompt_toolkit==3.0.48
527
+ propcache==0.2.1
528
+ prophet==1.1.6
529
+ proto-plus==1.25.0
530
+ protobuf==3.20.3
531
+ psutil==5.9.5
532
+ psycopg2==2.9.10
533
+ ptyprocess==0.7.0
534
+ pudb==2024.1.3
535
+ py-cpuinfo==9.0.0
536
+ py4j==0.10.9.7
537
+ pyaml==25.1.0
538
+ PyArabic==0.6.15
539
+ pyarrow==19.0.1
540
+ pyasn1==0.6.1
541
+ pyasn1_modules==0.4.1
542
+ pybind11==2.13.6
543
+ pyclipper==1.3.0.post6
544
+ pycocotools==2.0.8
545
+ pycparser==2.22
546
+ pycryptodome==3.21.0
547
+ pycryptodomex==3.21.0
548
+ pyct==0.5.0
549
+ pycuda==2025.1
550
+ pydantic==2.11.0a2
551
+ pydantic_core==2.29.0
552
+ pydata-google-auth==1.9.0
553
+ pydegensac==0.1.2
554
+ pydicom==3.0.1
555
+ pydot==3.0.3
556
+ pydotplus==2.0.2
557
+ PyDrive==1.3.1
558
+ PyDrive2==1.21.3
559
+ pydub==0.25.1
560
+ pyemd==1.0.0
561
+ pyerfa==2.0.1.5
562
+ pyexcel-io==0.6.7
563
+ pyexcel-ods==0.6.0
564
+ pygame==2.6.1
565
+ pygit2==1.16.0
566
+ pygltflib==1.16.3
567
+ Pygments==2.19.1
568
+ PyGObject==3.42.1
569
+ PyJWT==2.10.1
570
+ pyLDAvis==3.4.1
571
+ pylibcudf-cu12==25.2.0
572
+ pylibcugraph-cu12==24.10.0
573
+ pylibraft-cu12==25.2.0
574
+ pymc==5.19.1
575
+ pymc3==3.11.4
576
+ pymongo==4.11.1
577
+ Pympler==1.1
578
+ pymystem3==0.2.0
579
+ pynvjitlink-cu12==0.4.0
580
+ pynvml==12.0.0
581
+ pyogrio==0.10.0
582
+ Pyomo==6.8.2
583
+ PyOpenGL==3.1.7
584
+ pyOpenSSL==25.0.0
585
+ pyparsing==3.2.0
586
+ pypdf==5.3.0
587
+ pyperclip==1.9.0
588
+ pyproj==3.7.0
589
+ pyshp==2.3.1
590
+ PySocks==1.7.1
591
+ pyspark==3.5.3
592
+ pytensor==2.26.4
593
+ pytesseract==0.3.13
594
+ pytest==8.3.4
595
+ python-apt==0.0.0
596
+ python-bidi==0.6.6
597
+ python-box==7.3.0
598
+ python-dateutil==2.9.0.post0
599
+ python-json-logger==3.2.1
600
+ python-louvain==0.16
601
+ python-lsp-jsonrpc==1.1.2
602
+ python-lsp-server==1.12.2
603
+ python-slugify==8.0.4
604
+ python-utils==3.9.1
605
+ pytools==2025.1.1
606
+ pytorch-ignite==0.5.1
607
+ pytorch-lightning==2.5.0.post0
608
+ pytz==2025.1
609
+ PyUpSet==0.1.1.post7
610
+ pyviz_comms==3.0.3
611
+ PyWavelets==1.8.0
612
+ PyYAML==6.0.2
613
+ pyzmq==24.0.1
614
+ qdldl==0.1.7.post4
615
+ qgrid==1.3.1
616
+ qtconsole==5.6.1
617
+ QtPy==2.4.3
618
+ raft-dask-cu12==25.2.0
619
+ rapids-dask-dependency==25.2.0
620
+ ratelim==0.1.6
621
+ ray==2.42.1
622
+ referencing==0.35.1
623
+ regex==2024.11.6
624
+ requests==2.32.3
625
+ requests-oauthlib==1.3.1
626
+ requests-toolbelt==1.0.0
627
+ requirements-parser==0.9.0
628
+ rfc3161-client==0.1.2
629
+ rfc3339-validator==0.1.4
630
+ rfc3986-validator==0.1.1
631
+ rfc8785==0.1.4
632
+ rgf-python==3.12.0
633
+ rich==13.9.4
634
+ rmm-cu12==25.2.0
635
+ rouge_score==0.1.2
636
+ rpds-py==0.22.3
637
+ rpy2==3.4.2
638
+ rsa==4.9
639
+ Rtree==1.3.0
640
+ s3fs==0.4.2
641
+ s3transfer==0.11.2
642
+ sacrebleu==2.5.1
643
+ safetensors==0.4.5
644
+ scikit-image==0.25.0
645
+ scikit-learn==1.2.2
646
+ scikit-learn-intelex==2025.2.0
647
+ scikit-multilearn==0.2.0
648
+ scikit-optimize==0.10.2
649
+ scikit-plot==0.3.7
650
+ scikit-surprise==1.1.4
651
+ scipy==1.13.1
652
+ scooby==0.10.0
653
+ scs==3.2.7
654
+ seaborn==0.12.2
655
+ SecretStorage==3.3.1
656
+ securesystemslib==1.2.0
657
+ segment_anything @ git+https://github.com/facebookresearch/segment-anything.git@dca509fe793f601edb92606367a655c15ac00fdf
658
+ semver==3.0.4
659
+ Send2Trash==1.8.3
660
+ sentence-transformers==3.3.1
661
+ sentencepiece==0.2.0
662
+ sentry-sdk==2.19.2
663
+ setproctitle==1.3.4
664
+ setuptools-scm==8.1.0
665
+ shap==0.44.1
666
+ shapely==2.0.7
667
+ shellingham==1.5.4
668
+ Shimmy==1.3.0
669
+ sigstore==3.6.1
670
+ sigstore-protobuf-specs==0.3.2
671
+ sigstore-rekor-types==0.0.18
672
+ simple-parsing==0.1.6
673
+ SimpleITK==2.4.1
674
+ six==1.17.0
675
+ sklearn-pandas==2.2.0
676
+ slicer==0.0.7
677
+ smart-open==7.0.5
678
+ smmap==5.0.1
679
+ sniffio==1.3.1
680
+ snowballstemmer==2.2.0
681
+ sortedcontainers==2.4.0
682
+ soundfile==0.12.1
683
+ soupsieve==2.6
684
+ soxr==0.5.0.post1
685
+ spacy==3.7.5
686
+ spacy-legacy==3.0.12
687
+ spacy-loggers==1.0.5
688
+ Sphinx==8.1.3
689
+ sphinx-rtd-theme==0.2.4
690
+ sphinxcontrib-applehelp==2.0.0
691
+ sphinxcontrib-devhelp==2.0.0
692
+ sphinxcontrib-htmlhelp==2.1.0
693
+ sphinxcontrib-jsmath==1.0.1
694
+ sphinxcontrib-qthelp==2.0.0
695
+ sphinxcontrib-serializinghtml==2.0.0
696
+ SQLAlchemy==2.0.36
697
+ sqlglot==25.1.0
698
+ sqlparse==0.5.3
699
+ squarify==0.4.4
700
+ srsly==2.5.0
701
+ stable-baselines3==2.1.0
702
+ stanio==0.5.1
703
+ statsmodels==0.14.4
704
+ stopit==1.1.2
705
+ StrEnum==0.4.15
706
+ stringzilla==3.11.1
707
+ stumpy==1.13.0
708
+ sympy==1.13.1
709
+ tables==3.10.1
710
+ tabulate==0.9.0
711
+ tbb==2022.0.0
712
+ tbb4py==2022.0.0
713
+ tblib==3.0.0
714
+ tcmlib==1.2.0
715
+ tenacity==9.0.0
716
+ tensorboard==2.17.1
717
+ tensorboard-data-server==0.7.2
718
+ tensorflow==2.17.1
719
+ tensorflow-cloud==0.1.5
720
+ tensorflow-datasets==4.9.7
721
+ tensorflow-hub==0.16.1
722
+ tensorflow-io==0.37.1
723
+ tensorflow-io-gcs-filesystem==0.37.1
724
+ tensorflow-metadata==1.13.1
725
+ tensorflow-probability==0.24.0
726
+ tensorflow-text==2.17.0
727
+ tensorflow_decision_forests==1.10.0
728
+ tensorstore==0.1.71
729
+ termcolor==2.5.0
730
+ terminado==0.18.1
731
+ testpath==0.6.0
732
+ text-unidecode==1.3
733
+ textblob==0.17.1
734
+ texttable==1.7.0
735
+ tf-slim==1.1.0
736
+ tf_keras==2.17.0
737
+ Theano==1.0.5
738
+ Theano-PyMC==1.1.2
739
+ thinc==8.2.5
740
+ threadpoolctl==3.5.0
741
+ tifffile==2024.12.12
742
+ tiktoken==0.9.0
743
+ timm==1.0.12
744
+ tinycss2==1.4.0
745
+ tokenizers==0.21.0
746
+ toml==0.10.2
747
+ tomli==2.2.1
748
+ toolz==0.12.1
749
+ torch @ https://download.pytorch.org/whl/cu121_full/torch-2.5.1%2Bcu121-cp310-cp310-linux_x86_64.whl
750
+ torchaudio @ https://download.pytorch.org/whl/cu121/torchaudio-2.5.1%2Bcu121-cp310-cp310-linux_x86_64.whl
751
+ torchinfo==1.8.0
752
+ torchmetrics==1.6.1
753
+ torchsummary==1.5.1
754
+ torchtune==0.5.0
755
+ torchvision @ https://download.pytorch.org/whl/cu121/torchvision-0.20.1%2Bcu121-cp310-cp310-linux_x86_64.whl
756
+ tornado==6.3.3
757
+ TPOT==0.12.1
758
+ tqdm==4.67.1
759
+ traitlets==5.7.1
760
+ traittypes==0.2.1
761
+ transformers==4.47.0
762
+ treelite==4.4.1
763
+ trx-python==0.3
764
+ tsfresh==0.20.2
765
+ tuf==5.1.0
766
+ tweepy==4.14.0
767
+ typeguard==4.4.1
768
+ typer==0.15.1
769
+ types-python-dateutil==2.9.0.20241206
770
+ types-pytz==2024.2.0.20241003
771
+ types-setuptools==75.6.0.20241126
772
+ typing-inspect==0.9.0
773
+ typing_extensions==4.12.2
774
+ tzdata==2025.1
775
+ tzlocal==5.2
776
+ uc-micro-py==1.0.3
777
+ ucx-py-cu12==0.42.0
778
+ ucxx-cu12==0.42.0
779
+ ujson==5.10.0
780
+ umf==0.9.1
781
+ update-checker==0.18.0
782
+ uri-template==1.3.0
783
+ uritemplate==4.1.1
784
+ urllib3==2.3.0
785
+ urwid==2.6.16
786
+ urwid_readline==0.15.1
787
+ vega-datasets==0.9.0
788
+ visions==0.7.6
789
+ vtk==9.3.1
790
+ wadllib==1.3.6
791
+ Wand==0.6.13
792
+ wandb==0.19.1
793
+ wasabi==1.1.3
794
+ watchdog==6.0.0
795
+ wavio==0.0.9
796
+ wcwidth==0.2.13
797
+ weasel==0.4.1
798
+ webcolors==24.11.1
799
+ webencodings==0.5.1
800
+ websocket-client==1.8.0
801
+ websockets==14.1
802
+ Werkzeug==3.1.3
803
+ widgetsnbextension==4.0.13
804
+ woodwork==0.31.0
805
+ wordcloud==1.9.4
806
+ wrapt==1.17.0
807
+ wurlitzer==3.1.1
808
+ xarray==2024.11.0
809
+ xarray-einstats==0.8.0
810
+ xgboost==2.0.3
811
+ xlrd==2.0.1
812
+ xvfbwrapper==0.2.9
813
+ xxhash==3.5.0
814
+ xyzservices==2024.9.0
815
+ y-py==0.6.2
816
+ yarl==1.18.3
817
+ ydata-profiling==4.12.2
818
+ ydf==0.9.0
819
+ yellowbrick==1.5
820
+ yfinance==0.2.50
821
+ ypy-websocket==0.8.4
822
+ zict==3.0.0
823
+ zipp==3.21.0
sentiment_analysis/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # from . import sentiment_analysis
sentiment_analysis/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (166 Bytes). View file
 
sentiment_analysis/__pycache__/sentiment_analysis.cpython-310.pyc ADDED
Binary file (4.86 kB). View file
 
sentiment_analysis/config/stage1_models.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "1": {
3
+ "name": "DeBERTa v3 Base for Sequence Classification",
4
+ "type": "hf_automodel_finetuned_dbt3",
5
+ "module_path": "hmv_cfg_base_stage1.model1",
6
+ "hf_location": "tachygraphy-microtrext-norm-org/DeBERTa-v3-seqClassfication-LV1-SentimentPolarities-Batch8",
7
+ "tokenizer_class": "DebertaV2Tokenizer",
8
+ "model_class": "DebertaV2ForSequenceClassification",
9
+ "problem_type": "regression",
10
+ "base_model": "microsoft/deberta-v3-base",
11
+ "num_labels": 3,
12
+ "device": "cpu",
13
+ "load_function": "load_model",
14
+ "predict_function": "predict"
15
+ }
16
+ }
sentiment_analysis/hmv_cfg_base_stage1/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # from . import model1
sentiment_analysis/hmv_cfg_base_stage1/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (186 Bytes). View file
 
sentiment_analysis/hmv_cfg_base_stage1/__pycache__/model1.cpython-310.pyc ADDED
Binary file (1.95 kB). View file
 
sentiment_analysis/hmv_cfg_base_stage1/imports.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel, DebertaV2Tokenizer, DebertaV2ForSequenceClassification
3
+ import torch
4
+ import numpy as np
5
+ import matplotlib.pyplot as plt
6
+ import plotly.express as px
7
+ import pandas as pd
8
+ import json
9
+ import gc
10
+ import psutil
11
+ import os
12
+ import importlib.util
13
+ import sys
sentiment_analysis/hmv_cfg_base_stage1/model1.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch.nn as nn
2
+ from imports import *
3
+ import torch.nn.functional as F
4
+
5
+
6
+ BASE_DIR = os.path.dirname(os.path.abspath(__file__))
7
+ CONFIG_STAGE1 = os.path.join(BASE_DIR, "..", "config", "stage1_models.json")
8
+
9
+ MODEL_OPTIONS = {
10
+ "1": {
11
+ "name": "DeBERTa v3 Base for Sequence Classification",
12
+ "type": "hf_automodel_finetuned_dbt3",
13
+ "module_path": "hmv_cfg_base_stage1.model1",
14
+ "hf_location": "tachygraphy-microtrext-norm-org/DeBERTa-v3-seqClassfication-LV1-SentimentPolarities-Batch8",
15
+ "tokenizer_class": "DebertaV2Tokenizer",
16
+ "model_class": "DebertaV2ForSequenceClassification",
17
+ "problem_type": "regression",
18
+ "base_model": "microsoft/deberta-v3-base",
19
+ "num_labels": 3,
20
+ "device": "cpu",
21
+ "load_function": "load_model",
22
+ "predict_function": "predict"
23
+ }
24
+ }
25
+
26
+
27
@st.cache_resource
def load_model(model_key="1"):
    """Load the fine-tuned DeBERTa-v3 sequence-classification model and tokenizer.

    Fixed: the original took no arguments, but the __main__ guard at the bottom
    of this file calls ``load_model("1")`` — a guaranteed TypeError. The key is
    now a parameter with a default of "1", so both the zero-argument call and
    the keyed call work (backward compatible).

    Args:
        model_key: Key into MODEL_OPTIONS selecting the model configuration.

    Returns:
        (model, tokenizer) tuple ready for CPU inference.
    """
    model_info = MODEL_OPTIONS[model_key]
    hf_location = model_info["hf_location"]

    # Resolve the concrete transformers classes named in the config
    # (e.g. DebertaV2Tokenizer) from this module's namespace.
    tokenizer_class = globals()[model_info["tokenizer_class"]]
    model_class = globals()[model_info["model_class"]]
    tokenizer = tokenizer_class.from_pretrained(hf_location)
    model = model_class.from_pretrained(
        hf_location,
        problem_type=model_info["problem_type"],
        num_labels=model_info["num_labels"],
    )
    return model, tokenizer
43
+
44
+
45
def predict(text, model, tokenizer, device, max_len=128):
    """Run sentiment-polarity inference and return per-label scores in [0, 1].

    Args:
        text: Input string (or list of strings) to classify.
        model: Sequence-classification model producing ``.logits``.
        tokenizer: Matching tokenizer; must support ``return_tensors="pt"``.
        device: Torch device the model lives on; inputs are moved there.
        max_len: Currently UNUSED — tokenization runs with truncation=False.
            NOTE(review): presumably this was meant to cap sequence length;
            confirm before wiring it into the tokenizer call, as enabling
            truncation would change outputs for long inputs.

    Returns:
        numpy array of shape (batch, num_labels) with values clamped to [0, 1].
    """
    # Tokenize and pad the input text, then move tensors to the model's device.
    inputs = tokenizer(
        text,
        add_special_tokens=True,
        padding=True,
        truncation=False,
        return_tensors="pt",
        return_token_type_ids=False,
    ).to(device)

    with torch.no_grad():
        outputs = model(**inputs)

    # Fixed: the original wrapped an existing tensor in torch.tensor(...)
    # before clamping — an unnecessary copy that also raises a UserWarning.
    # Clamp the tensor directly; the numeric result is identical.
    probabilities = torch.relu(outputs.logits)
    probabilities = torch.clamp(probabilities, min=0.0, max=1.0).cpu().numpy()

    return probabilities
67
+
68
+
69
if __name__ == "__main__":
    # Smoke test. Fixed: the original passed "1" positionally, but
    # load_model() is defined in this file without a required positional
    # parameter — the call raised TypeError. Calling with no arguments
    # loads the default (and only) configured model.
    model, tokenizer = load_model()
    print("Model and tokenizer loaded successfully.")
sentiment_analysis/hmv_cfg_base_stage1/stage1_bert_architecture.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch.nn as nn
2
+
3
class BERT_architecture(nn.Module):
    """Three-class classification head on top of a BERT encoder.

    The pooled [CLS] output of ``bert`` (dim 768) is passed through
    LayerNorm -> Linear(768, 256) -> ReLU -> Dropout(0.3) -> Linear(256, 3)
    -> LogSoftmax, yielding log-probabilities over 3 classes.
    Attribute names are kept stable so saved state_dicts remain loadable.
    """

    def __init__(self, bert):
        super(BERT_architecture, self).__init__()
        self.bert = bert

        self.dropout = nn.Dropout(0.3)  # Increased dropout for regularization
        self.layer_norm = nn.LayerNorm(768)  # Layer normalization

        self.fc1 = nn.Linear(768, 256)  # Dense layer
        self.fc2 = nn.Linear(256, 3)  # Output layer with 3 classes

        self.relu = nn.ReLU()
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask, token_type_ids):
        """Return log-probabilities of shape (batch, 3)."""
        # Second element of the tuple is the pooled [CLS] representation.
        _, pooled = self.bert(
            sent_id,
            attention_mask=mask,
            token_type_ids=token_type_ids,
            return_dict=False,
        )
        hidden = self.relu(self.fc1(self.layer_norm(pooled)))
        hidden = self.dropout(hidden)
        return self.softmax(self.fc2(hidden))
sentiment_analysis/sentiment_analysis.py ADDED
@@ -0,0 +1,296 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from imports import *
2
+ import importlib.util
3
+ import os
4
+ import sys
5
+ import joblib
6
+
7
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), )))
8
+
9
+ # from hmv_cfg_base_stage1.model1 import load_model as load_model1
10
+ # from hmv_cfg_base_stage1.model1 import predict as predict1
11
+
12
+ BASE_DIR = os.path.dirname(os.path.abspath(__file__))
13
+ CONFIG_STAGE1 = os.path.join(BASE_DIR, "config", "stage1_models.json")
14
+ LOADERS_STAGE1 = os.path.join(BASE_DIR, "hmv-cfg-base-stage1")
15
+
16
+ # Load the model and tokenizer
17
+ # model_name = "tachygraphy-microtrext-norm-org/DeBERTa-v3-seqClassfication-LV1-SentimentPolarities-Batch8"
18
+ # tokenizer = AutoTokenizer.from_pretrained(model_name)
19
+ # model = AutoModel.from_pretrained(model_name)
20
+
21
+ SENTIMENT_POLARITY_LABELS = [
22
+ "negative", "neutral", "positive"
23
+ ]
24
+
25
+ current_model = None
26
+ current_tokenizer = None
27
+
28
# Cached so the JSON registry is read from disk only once per session.
@st.cache_resource
def load_model_config():
    """Read the stage-1 model registry JSON and index it by display name."""
    with open(CONFIG_STAGE1, "r") as config_file:
        registry = json.load(config_file)

    # Map human-readable model name -> full model entry, for the dropdown.
    by_name = {entry["name"]: entry for entry in registry.values()}
    return registry, by_name
37
+
38
+ MODEL_DATA, MODEL_OPTIONS = load_model_config()
39
+
40
+
41
+
42
+ # def load_model():
43
+ # model = DebertaV2ForSequenceClassification.from_pretrained(model_name)
44
+ # tokenizer = DebertaV2Tokenizer.from_pretrained(model_name)
45
+ # return model, tokenizer
46
+
47
+
48
+ # ✅ Dynamically Import Model Functions
49
def import_from_module(module_name, function_name):
    """Resolve ``function_name`` inside ``module_name`` at runtime.

    Returns the callable, or None (after surfacing the error in the UI)
    when either the module or the attribute cannot be found.
    """
    try:
        target = importlib.import_module(module_name)
    except ModuleNotFoundError as exc:
        st.error(f"❌ Import Error: {exc}")
        return None
    try:
        return getattr(target, function_name)
    except AttributeError as exc:
        st.error(f"❌ Import Error: {exc}")
        return None
56
+
57
+
58
def free_memory():
    """Release the currently loaded model/tokenizer and reclaim memory."""
    global current_model, current_tokenizer

    # Drop the strong references so the objects become collectable.
    if current_model is not None:
        del current_model
        current_model = None

    if current_tokenizer is not None:
        del current_tokenizer
        current_tokenizer = None

    gc.collect()  # force CPU-side garbage collection

    if torch.cuda.is_available():
        # Return cached GPU blocks to the driver and clean up IPC handles.
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()

    # CPU-only path: refresh memory stats via psutil (best-effort).
    try:
        if not torch.cuda.is_available():
            psutil.virtual_memory()
    except Exception as e:
        print(f"Memory cleanup error: {e}")
82
+
83
+
84
def load_selected_model(model_name):
    """Load the model, tokenizer and predict function for ``model_name``.

    Looks the model up in MODEL_OPTIONS, dynamically imports its loader and
    predictor, frees whatever was loaded before, and stores the new objects
    in the module-level ``current_model`` / ``current_tokenizer`` references.

    Returns:
        (model, tokenizer, predict_func), or (None, None, None) on failure.
    """
    global current_model, current_tokenizer

    free_memory()  # release the previously selected model first

    if model_name not in MODEL_OPTIONS:
        st.error(f"⚠️ Model '{model_name}' not found in config!")
        return None, None, None

    model_info = MODEL_OPTIONS[model_name]
    # NOTE(review): model_info["hf_location"] is resolved inside each model
    # module's load function, so it is not read here.
    model_module = model_info["module_path"]

    load_model_func = import_from_module(model_module, model_info["load_function"])
    predict_func = import_from_module(model_module, model_info["predict_function"])

    if load_model_func is None or predict_func is None:
        st.error("❌ Model functions could not be loaded!")
        return None, None, None

    model, tokenizer = load_model_func()

    current_model, current_tokenizer = model, tokenizer
    return model, tokenizer, predict_func
115
+
116
+ # def load_selected_model(model_name):
117
+ # # """Load model and tokenizer based on user selection."""
118
+ # global current_model, current_tokenizer
119
+
120
+ # # Free memory before loading a new model
121
+ # free_memory()
122
+
123
+ # if model_name not in MODEL_OPTIONS:
124
+ # st.error(f"⚠️ Model '{model_name}' not found in config!")
125
+ # return None, None
126
+
127
+ # model_info = MODEL_OPTIONS[model_name]
128
+ # hf_location = model_info["hf_location"]
129
+
130
+ # model_module = model_info["module_path"]
131
+ # # load_function = "load_model"
132
+ # # predict_function = "predict"
133
+
134
+ # load_function = model_info["load_function"]
135
+ # predict_function = model_info["predict_function"]
136
+
137
+ # # tokenizer_class = globals()[model_info["tokenizer_class"]]
138
+ # # model_class = globals()[model_info["model_class"]]
139
+
140
+ # # tokenizer = tokenizer_class.from_pretrained(hf_location)
141
+
142
+
143
+ # load_model_func = import_from_module(model_module, load_function)
144
+ # predict_func = import_from_module(model_module, predict_function)
145
+
146
+ # # # Load model
147
+ # # if model_info["type"] == "custom_checkpoint" or model_info["type"] == "custom_model":
148
+ # # model = torch.load(hf_location, map_location="cpu") # Load PyTorch model
149
+ # # elif model_info["type"] == "hf_automodel_finetuned_dbt3":
150
+ # # tokenizer_class = globals()[model_info["tokenizer_class"]]
151
+ # # model_class = globals()[model_info["model_class"]]
152
+ # # tokenizer = tokenizer_class.from_pretrained(hf_location)
153
+ # # model = model_class.from_pretrained(hf_location,
154
+ # # problem_type=model_info["problem_type"],
155
+ # # num_labels=model_info["num_labels"]
156
+ # # )
157
+ # # else:
158
+ # # st.error("Invalid model selection")
159
+ # # return None, None
160
+
161
+
162
+ # if load_model_func is None or predict_func is None:
163
+ # st.error("❌ Model functions could not be loaded!")
164
+ # return None, None
165
+
166
+ # # current_model, current_tokenizer = model, tokenizer # Store references
167
+ # # return model, tokenizer
168
+
169
+ # model, tokenizer = load_model_func(hf_location)
170
+
171
+ # current_model, current_tokenizer = model, tokenizer
172
+ # return model, tokenizer, predict_func
173
+
174
+
175
+
176
def predict(text, model, tokenizer, device, max_len=128):
    """Tokenize ``text`` and return the model's raw logits as a numpy array.

    Args:
        text: input string (or list of strings) to classify.
        model / tokenizer: Hugging Face sequence-classification pair.
        device: torch device the model lives on.
        max_len: maximum token length; longer inputs are truncated.

    Returns:
        numpy array of raw logits, shape (batch, num_labels).
    """
    # Tokenize and pad the input text; move tensors to the model's device.
    inputs = tokenizer(
        text,
        add_special_tokens=True,
        padding=True,
        truncation=True,      # was False: over-length input would exceed the model's positional limit
        max_length=max_len,   # honor the (previously unused) max_len parameter
        return_tensors="pt",
        return_token_type_ids=False,
    ).to(device)

    with torch.no_grad():
        outputs = model(**inputs)

    # NOTE(review): these are raw logits, not calibrated probabilities,
    # despite the downstream variable name.
    probabilities = outputs.logits.cpu().numpy()

    return probabilities
194
+
195
+ # def show_sentiment_analysis():
196
+
197
+ # Add your sentiment analysis code here
198
+
199
+ # user_input = st.text_input("Enter text for sentiment analysis:")
200
+ # user_input = st.text_area("Enter text for sentiment analysis:", height=200)
201
+ # user_input = st.text_area("Enter text for sentiment analysis:", max_chars=500)
202
+
203
def show_sentiment_analysis():
    """Render the Stage-1 sentiment-polarity page in Streamlit.

    Lets the user pick a model from MODEL_OPTIONS, loads it on demand, runs
    the model's predict function on the entered text, and shows the scores
    as text, a Plotly polar plot and a Matplotlib stacked bar chart.
    """
    st.title("Stage 1: Sentiment Polarity Analysis")
    st.write("This section will handle sentiment analysis.")

    # Initialise session state on first visit.
    if "selected_model" not in st.session_state:
        st.session_state.selected_model = list(MODEL_OPTIONS.keys())[0]  # Default selection

    if "clear_output" not in st.session_state:
        st.session_state.clear_output = False

    # The selectbox writes its choice straight into st.session_state.selected_model.
    st.selectbox("Choose a model:", list(MODEL_OPTIONS.keys()), key="selected_model")

    selected_model = st.session_state.selected_model

    if selected_model not in MODEL_OPTIONS:
        st.error(f"❌ Selected model '{selected_model}' not found!")
        st.stop()

    st.session_state.clear_output = True  # Reset output when model changes

    user_input = st.text_input("Enter text for sentiment analysis:")

    if user_input:
        # Load the chosen model lazily; this frees any previously loaded one.
        model, tokenizer, predict_func = load_selected_model(selected_model)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        if model is None:
            st.error("⚠️ Error: Model failed to load! Check model selection or configuration.")
            st.stop()

        model.to(device)

        # NOTE(review): predictions are the model's raw scores — presumably
        # logits, not normalized probabilities; confirm per predict_func.
        predictions = predict_func(user_input, model, tokenizer, device)

        # Squeeze predictions to remove extra dimensions (batch of one).
        predictions_array = predictions.squeeze()

        # Convert to a one-hot vector marking the argmax class.
        binary_predictions = np.zeros_like(predictions_array)
        max_indices = np.argmax(predictions_array)
        binary_predictions[max_indices] = 1

        # Display raw predictions
        st.write(f"**Predicted Sentiment Scores:** {predictions_array}")

        # Display binary classification result
        st.write(f"**Predicted Sentiment:**")
        st.write(f"**NEGATIVE:** {binary_predictions[0]}, **NEUTRAL:** {binary_predictions[1]}, **POSITIVE:** {binary_predictions[2]}")

        # 1) Polar plot of the three polarity scores (Plotly).
        sentiment_polarities = predictions_array.tolist()
        fig_polar = px.line_polar(
            pd.DataFrame(dict(r=sentiment_polarities, theta=SENTIMENT_POLARITY_LABELS)),
            r='r', theta='theta', line_close=True
        )
        st.plotly_chart(fig_polar)

        # 2) Normalized horizontal bar chart (Matplotlib).
        # NOTE(review): assumes predictions_array.sum() is non-zero — confirm
        # this holds for every model's score range.
        normalized_predictions = predictions_array / predictions_array.sum()

        fig, ax = plt.subplots(figsize=(8, 2))
        left = 0
        for i in range(len(normalized_predictions)):
            # Stack each class segment after the previous one.
            ax.barh(0, normalized_predictions[i], color=plt.cm.tab10(i), left=left, label=SENTIMENT_POLARITY_LABELS[i])
            left += normalized_predictions[i]

        # Configure the chart
        ax.set_xlim(0, 1)
        ax.set_yticks([])
        ax.set_xticks(np.arange(0, 1.1, 0.1))
        ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15), ncol=len(SENTIMENT_POLARITY_LABELS))
        plt.title("Sentiment Polarity Prediction Distribution")

        # Display in Streamlit
        st.pyplot(fig)
292
+
293
+
294
+
295
if __name__ == "__main__":
    # Allow running this page standalone, outside the multi-page app.
    show_sentiment_analysis()
src/bq-helper/LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
src/bq-helper/README.md ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Summary
2
+
3
+ BigQuery_Helper is a helper class to simplify common read-only BigQuery tasks. It makes it easy to execute queries while you're learning SQL, and provides a convenient stepping stone on the path to using [the core BigQuery python API](https://googlecloudplatform.github.io/google-cloud-python/latest/bigquery/reference.html). You can try it for yourself by forking [this Kaggle kernel](https://www.kaggle.com/sohier/introduction-to-the-bq-helper-package/).
4
+
5
+ ## Installation
6
+ You can install BigQuery_Helper with the following command in your console:
7
+
8
+
9
+ `pip install -e git+https://github.com/SohierDane/BigQuery_Helper#egg=bq_helper`
10
+
11
+ If you aren't running BigQuery_Helper on [Kaggle](http://kaggle.com/), you will also need to go through the [standard BigQuery client setup and authentication process](https://cloud.google.com/bigquery/docs/reference/libraries).
12
+
13
+ This repo has only been tested on Python 3.6+ and v0.29+ of the BigQuery API.
14
+
15
+ ## Changelog
16
+ #### 0.4.0:
17
+ - `BigQueryHelper.table_schema` has been overhauled. It now returns a Pandas DataFrame and unrolls nested fields so that the results are in the format expected by queries. For example, the `github_repos.commits` nested field `author` now returns sub-fields names in the format like `author.email`.
18
+
19
+ #### 0.3.0:
20
+ - Each helper instance now logs the total bytes counted towards your quota or bill used across all queries run with that helper instance. You can access it with `BigQueryHelper.total_gb_used_net_cache`. Repeated queries are likely to hit the cache and may show up as 0 GB used.
21
+ - Queries that take longer than the maximum wait time, which defaults to 3 minutes, will be cancelled.
22
+ - Contributing to bq_helper should be easier now that there is a set of tests.
23
+
24
+ #### 0.2.0:
25
+ - `query_to_pandas` now returns an empty DataFrame when the query returns no results. Previously, this returned `None`.
src/bq-helper/__pycache__/version.cpython-312.pyc ADDED
Binary file (190 Bytes). View file
 
src/bq-helper/bq_helper.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Helper class to simplify common read-only BigQuery tasks.
3
+ """
4
+
5
+
6
+ import pandas as pd
7
+ import time
8
+
9
+ from google.cloud import bigquery
10
+
11
+
12
class BigQueryHelper(object):
    """
    Helper class to simplify common BigQuery tasks like executing queries,
    showing table schemas, etc without worrying about table or dataset pointers.

    See the BigQuery docs for details of the steps this class lets you skip:
    https://googlecloudplatform.github.io/google-cloud-python/latest/bigquery/reference.html
    """

    def __init__(self, active_project, dataset_name, max_wait_seconds=180):
        self.project_name = active_project
        self.dataset_name = dataset_name
        self.max_wait_seconds = max_wait_seconds  # query timeout for query_to_pandas
        self.client = bigquery.Client()
        self.__dataset_ref = self.client.dataset(self.dataset_name, project=self.project_name)
        self.dataset = None  # fetched lazily by __fetch_dataset
        self.tables = dict()  # {table name (str): table object}
        self.__table_refs = dict()  # {table name (str): table reference}
        self.total_gb_used_net_cache = 0  # running GB billed across all queries
        self.BYTES_PER_GB = 2**30

    def __fetch_dataset(self):
        """
        Lazy loading of dataset. For example,
        if the user only calls `self.query_to_pandas` then the
        dataset never has to be fetched.
        """
        if self.dataset is None:
            self.dataset = self.client.get_dataset(self.__dataset_ref)

    def __fetch_table(self, table_name):
        """
        Lazy loading of a table (and, transitively, the dataset).
        """
        self.__fetch_dataset()
        if table_name not in self.__table_refs:
            self.__table_refs[table_name] = self.dataset.table(table_name)
        if table_name not in self.tables:
            self.tables[table_name] = self.client.get_table(self.__table_refs[table_name])

    def __handle_record_field(self, row, schema_details, top_level_name=''):
        """
        Unpack a single schema row into `schema_details`, recursing into any
        nested RECORD fields. Nested names are flattened as 'parent.child'.
        """
        name = row['name']
        if top_level_name != '':
            name = top_level_name + '.' + name
        schema_details.append([{
            'name': name,
            'type': row['type'],
            'mode': row['mode'],
            # `pd.np` was removed in pandas 1.0; use a plain NaN float so the
            # 'fields' column still exists and is dropped later.
            'fields': float('nan'),
            'description': row['description']
        }])
        # Leaf rows either lack a 'fields' entry or carry a NaN placeholder,
        # both of which are floats here.
        if isinstance(row.get('fields', 0.0), float):
            return None
        for entry in row['fields']:
            self.__handle_record_field(entry, schema_details, name)

    def __unpack_all_schema_fields(self, schema):
        """
        Unrolls nested schemas. Returns dataframe with one row per field,
        and the field names in the format accepted by the API.
        Results will look similar to the website schema, such as:
        https://bigquery.cloud.google.com/table/bigquery-public-data:github_repos.commits?pli=1

        Args:
            schema: DataFrame derived from api repr of raw table.schema
        Returns:
            Dataframe of the unrolled schema.
        """
        schema_details = []
        schema.apply(lambda row:
                     self.__handle_record_field(row, schema_details), axis=1)
        result = pd.concat([pd.DataFrame.from_dict(x) for x in schema_details])
        result.reset_index(drop=True, inplace=True)
        del result['fields']  # placeholder column, no longer needed
        return result

    def table_schema(self, table_name):
        """
        Get the schema for a specific table from a dataset.
        Unrolls nested field names into the format that can be copied
        directly into queries. For example, for the `github.commits` table,
        this will return `committer.name`.

        This is a very different return signature than BigQuery's table.schema.
        """
        self.__fetch_table(table_name)
        raw_schema = self.tables[table_name].schema
        schema = pd.DataFrame.from_dict([x.to_api_repr() for x in raw_schema])
        # The api_repr only has the fields column for tables with nested data.
        if 'fields' in schema.columns:
            schema = self.__unpack_all_schema_fields(schema)
        # Set the column order.
        schema = schema[['name', 'type', 'mode', 'description']]
        return schema

    def list_tables(self):
        """
        List the names of the tables in a dataset.
        """
        self.__fetch_dataset()
        return [x.table_id for x in self.client.list_tables(self.dataset)]

    def estimate_query_size(self, query):
        """
        Estimate gigabytes scanned by query.
        Does not consider if there is a cached query table.
        See https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.dryRun
        """
        my_job_config = bigquery.job.QueryJobConfig()
        my_job_config.dry_run = True  # validate & estimate without running
        my_job = self.client.query(query, job_config=my_job_config)
        return my_job.total_bytes_processed / self.BYTES_PER_GB

    def query_to_pandas(self, query):
        """
        Execute a SQL query & return a pandas dataframe.
        Cancels the job and returns None if it exceeds `max_wait_seconds`.
        """
        my_job = self.client.query(query)
        start_time = time.time()
        while not my_job.done():
            if (time.time() - start_time) > self.max_wait_seconds:
                print("Max wait time elapsed, query cancelled.")
                self.client.cancel_job(my_job.job_id)
                return None
            time.sleep(0.1)
        # Queries that hit errors will return an exception type.
        # Those exceptions don't get raised until we call my_job.to_dataframe().
        # In that case, my_job.total_bytes_billed can be called but is None.
        if my_job.total_bytes_billed:
            self.total_gb_used_net_cache += my_job.total_bytes_billed / self.BYTES_PER_GB
        return my_job.to_dataframe()

    def query_to_pandas_safe(self, query, max_gb_scanned=1):
        """
        Execute a query, but only if the query would scan less than `max_gb_scanned` of data.
        Returns None (with a message) when the estimate exceeds the limit.
        """
        query_size = self.estimate_query_size(query)
        if query_size <= max_gb_scanned:
            return self.query_to_pandas(query)
        msg = "Query cancelled; estimated size of {0} exceeds limit of {1} GB"
        print(msg.format(query_size, max_gb_scanned))

    def head(self, table_name, num_rows=5, start_index=None, selected_columns=None):
        """
        Get the first n rows of a table as a DataFrame.
        Does not perform a full table scan; should use a trivial amount of data as long as n is small.
        """
        self.__fetch_table(table_name)
        active_table = self.tables[table_name]
        schema_subset = None
        if selected_columns:
            schema_subset = [col for col in active_table.schema if col.name in selected_columns]
        results = self.client.list_rows(active_table, selected_fields=schema_subset,
                                        max_results=num_rows, start_index=start_index)
        results = [x for x in results]
        if not results:
            # Empty table (or start_index past the end): return an empty frame
            # instead of raising IndexError on results[0].
            return pd.DataFrame()
        return pd.DataFrame(
            data=[list(x.values()) for x in results], columns=list(results[0].keys()))
src/bq-helper/setup.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Packaging script for the bq_helper module.
from setuptools import setup
from version import __version__ as version


# Package metadata gathered in one place, then handed to setuptools.
_PACKAGE_INFO = {
    'name': 'bq_helper',
    'version': version,
    'description': 'Helper class to simplify common read-only BigQuery tasks.',
    'author': 'Sohier Dane',
    'url': 'https://github.com/SohierDane/BigQuery_Helper',
    'license': 'Apache 2.0',
    'install_requires': ['pandas', 'google-cloud-bigquery'],
    'classifiers': ['Programming Language :: Python :: 3'],
}

setup(**_PACKAGE_INFO)
src/bq-helper/test_helper.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Tests all public methods of the BigQueryHelper class.
3
+
4
+ Run from command line with:
5
+ python -m unittest test_helper.py
6
+
7
+
8
+ BILLING WARNING:
9
+ Running these tests requires a working BigQuery account and MAY CAUSE CHARGES.
10
+ However the dataset used for the tests is only ~2 MB, so any charges should
11
+ be very minimal. The downside is that this particular dataset is completely
12
+ refreshed every hour, so it's not possible to check for any specific return values.
13
+
14
+ For details on the test dataset, please see:
15
+ https://bigquery.cloud.google.com/table/bigquery-public-data:openaq.global_air_quality?tab=details
16
+ """
17
+
18
+
19
+ import unittest
20
+
21
+
22
+ from bq_helper import BigQueryHelper
23
+ from google.api_core.exceptions import BadRequest
24
+ from pandas.core.frame import DataFrame
25
+ from random import random
26
+
27
+
28
class TestBQHelper(unittest.TestCase):
    """Integration tests for BigQueryHelper's public methods.

    NOTE(review): these tests hit the live `bigquery-public-data.openaq`
    dataset, so they require working BigQuery credentials and may incur
    (minimal) charges. The dataset refreshes hourly, so assertions check
    types and shapes rather than specific values.
    """

    def setUp(self):
        self.my_bq = BigQueryHelper("bigquery-public-data", "openaq")
        self.query = "SELECT location FROM `bigquery-public-data.openaq.global_air_quality`"
        # Query randomized so it won't hit the cache across multiple test runs
        self.randomizable_query = """
            SELECT value FROM `bigquery-public-data.openaq.global_air_quality`
            WHERE value = {0}"""

    def test_list_tables(self):
        self.assertEqual(self.my_bq.list_tables(), ['global_air_quality'])

    def test_list_schema(self):
        self.assertEqual(len(self.my_bq.table_schema('global_air_quality')), 11)

    def test_estimate_query_size(self):
        self.assertIsInstance(self.my_bq.estimate_query_size(self.query), float)

    def test_query_to_pandas(self):
        self.assertIsInstance(self.my_bq.query_to_pandas(self.query), DataFrame)

    def test_query_safe_passes(self):
        self.assertIsInstance(self.my_bq.query_to_pandas_safe(self.query), DataFrame)

    def test_query_safe_fails(self):
        # Different query must be used for this test to ensure we don't hit the
        # cache and end up passing by testing a query that would use zero bytes.
        fail_query = self.randomizable_query.format(random())
        self.assertIsNone(self.my_bq.query_to_pandas_safe(fail_query, 10**-10))

    def test_head(self):
        self.assertIsInstance(self.my_bq.head('global_air_quality'), DataFrame)

    def test_usage_tracker(self):
        # FIX: renamed from misspelled `test_useage_tracker` (still discovered
        # by unittest via the `test_` prefix; no external callers).
        self.my_bq.query_to_pandas(self.randomizable_query.format(random()))
        self.assertNotEqual(self.my_bq.total_gb_used_net_cache, 0)

    def test_bad_query_raises_right_error(self):
        with self.assertRaises(BadRequest):
            self.my_bq.query_to_pandas("Not a valid query")

    def test_list_nested_schema(self):
        # github_repos `commits` has nested/repeated fields; the flattened
        # schema listing should expose all 33 of them.
        nested_helper = BigQueryHelper("bigquery-public-data", "github_repos")
        self.assertEqual(len(nested_helper.table_schema('commits')), 33)
72
+
73
+
74
# Allow running the suite directly (`python test_helper.py`) in addition
# to `python -m unittest test_helper.py`.
if __name__ == '__main__':
    unittest.main()
src/bq-helper/version.py ADDED
@@ -0,0 +1 @@
 
 
1
# Single source of truth for the package version; imported by setup.py
# (`from version import __version__`).
__version__ = '0.4.1'