# Spaces: Running
import streamlit as st | |
import pandas as pd | |
import plotly.express as px | |
import sparknlp | |
from sparknlp.base import DocumentAssembler | |
from sparknlp.annotator import AutoGGUFModel | |
from pyspark.ml import Pipeline | |
from sparknlp.base import LightPipeline | |
# Page configuration: use Streamlit's "wide" layout for the whole app.
st.set_page_config(layout="wide")
# CSS styling: defines the .main-title, .section and .table classes and is
# injected as raw HTML (hence unsafe_allow_html=True).
_PAGE_CSS = """
<style>
.main-title {
font-size: 36px;
color: #4A90E2;
font-weight: bold;
text-align: center;
}
.section {
background-color: #f9f9f9;
padding: 10px;
border-radius: 10px;
margin-top: 10px;
}
.section p, .section ul {
color: #666666;
}
.table {
width: 100%;
border-collapse: collapse;
margin-top: 20px;
}
.table th, .table td {
border: 1px solid #ddd;
padding: 8px;
text-align: left;
}
.table th {
background-color: #4A90E2;
color: white;
}
.table td {
background-color: #f2f2f2;
}
</style>
"""
st.markdown(_PAGE_CSS, unsafe_allow_html=True)
# Helper Functions | |
def init_spark(gpu=True):
    """Start Spark NLP and return the resulting session object.

    Args:
        gpu: Forwarded unchanged to ``sparknlp.start``. Defaults to ``True``
            so existing zero-argument callers keep the original behaviour;
            pass ``False`` to start without GPU support.

    Returns:
        The object returned by ``sparknlp.start``. The rest of this script
        uses its ``createDataFrame`` method and ``version`` attribute.
    """
    return sparknlp.start(gpu=gpu)
def create_pipeline(model, prompt):
    """Build the two-stage Spark NLP pipeline used for text generation.

    Args:
        model: Name passed to ``AutoGGUFModel.pretrained``.
        prompt: System prompt configured on the GGUF model.

    Returns:
        An unfitted ``Pipeline`` whose stages are a ``DocumentAssembler``
        ("text" -> "document") followed by the GGUF model
        ("document" -> "completions").
    """
    assembler = (
        DocumentAssembler()
        .setInputCol("text")
        .setOutputCol("document")
    )

    gguf = AutoGGUFModel.pretrained(model)
    gguf = gguf.setInputCols(["document"]).setOutputCol("completions")
    gguf = gguf.setUseChatTemplate(True).setSystemPrompt(prompt)
    # Generation settings.
    # NOTE(review): -1 presumably means "no limit on predicted tokens" --
    # confirm against the AutoGGUFModel documentation.
    gguf = gguf.setNPredict(-1).setTemperature(0.2)
    gguf = gguf.setTopP(0.9).setRepeatPenalty(1.3)

    return Pipeline().setStages([assembler, gguf])
def fit_data(pipeline, data):
    """Fit the pipeline on a one-row frame and annotate ``data`` with it.

    Reads the module-level ``spark`` object, so that name must be assigned
    before this function is called.

    Args:
        pipeline: Unfitted pipeline, e.g. from ``create_pipeline``.
        data: The text to process; used both as the single "text" row the
            pipeline is fitted on and as the input to ``annotate``.

    Returns:
        Whatever ``LightPipeline.annotate`` returns for ``data``; the caller
        in this script reads ``['completions'][0]`` from it.
    """
    single_row_df = spark.createDataFrame([[data]]).toDF('text')
    fitted = pipeline.fit(single_row_df)
    return LightPipeline(fitted).annotate(data)
def render_footer():
    """Render the page footer with library versions and a copyright line.

    Reads the module-level ``spark`` object for the Apache Spark version, so
    it must be called after ``spark`` has been assigned.
    """
    nlp_ver = sparknlp.version()
    spark_ver = spark.version
    # Raw HTML, so it has to be rendered with unsafe_allow_html=True.
    st.markdown(
        f"""
<hr style="margin-top: 50px; border: none; border-top: 1px solid #ddd;">
<div style="text-align: center; font-size: 14px; color: #666;">
<p><strong>Powered by:</strong> Spark NLP v{nlp_ver} | Apache Spark v{spark_ver}</p>
<p>Developed by <a href="https://www.johnsnowlabs.com/" target="_blank" style="color: #4A90E2; text-decoration: none;">John Snow Labs</a></p>
<p>© {pd.Timestamp.now().year} All Rights Reserved</p>
</div>
""",
        unsafe_allow_html=True,
    )
# Page title and subtitle, followed by a horizontal rule.
# NOTE(review): the expansion of "GGUF" given in the title does not match the
# expansions commonly cited for the format -- confirm before release.
title = "GGUF (General-purpose Graph Universal Format) in Spark NLP"
sub_title = "Showcasing the Power of AutoGGUFModel in Spark NLP for various Text Generation Tasks"

title_html = f'<div class="main-title">{title}</div>'
sub_title_html = f'<div style="text-align: center; color: #666666;">{sub_title}</div>'
for header_html in (title_html, sub_title_html):
    st.markdown(header_html, unsafe_allow_html=True)
st.markdown('---')
# Task and model selection.
# Maps each task shown in the "Task:" selectbox to the default system prompt
# pre-filled in the "Prompt for the Model:" input.
tasks_prompt_map = {
    # Fixed: the original prompt read "...of the given text within, focusing..."
    # with a dangling "within".
    "Summarization": "You are a summarization assistant. Provide a concise and accurate summary of the given text, focusing on the main ideas and key points. Avoid unnecessary details and ensure clarity.",
    "Text Completion": "You are a creative and precise assistant. Complete the given text naturally and fluently, ensuring coherence with the provided context and maintaining the tone and style.",
    "Translation": "You are a professional translator. Translate the given text accurately and naturally, preserving its meaning, tone, and context. Ensure fluency and clarity in the target language.",
    "Paraphrasing": "You are a paraphrasing assistant. Rewrite the given text to convey the same meaning in a different way, ensuring clarity and grammatical accuracy without altering the original intent.",
    "Question Answering": "You are an expert question-answering assistant. Based on the provided context, give accurate and concise answers to the questions. Ensure your responses are clear and directly address the query.",
    "Code Generation": "You are a coding assistant. Write clean, efficient, and error-free code to solve the given problem or implement the specified functionality. Adhere to best practices and include comments as needed.",
}

# Model names offered in the "Model:" selectbox; the selected one is passed to
# AutoGGUFModel.pretrained() by create_pipeline().
model_list = [
    "phi3.5_mini_4k_instruct_q4_gguf",
    "meta_llama_3_8b_instruct_iq3_m",
    "qwen2.5_3b_instruct_q3_k_l",
    "mistral_7b_instruct_v0.3_q3_k_l",
]
# Long overview text shared by the Summarization and Translation examples
# (the two entries were identical copies of the same literal).
_SPARK_NLP_OVERVIEW = (
    "Spark NLP is an open-source text processing library designed for advanced natural language processing (NLP) in Python, Java, and Scala. Built on top of Apache Spark and its Spark ML library, it provides APIs for scalable, production-grade NLP pipelines that incorporate recent academic research. The library includes pre-trained neural network models, pipelines, embeddings, and support for training custom models, making it a robust tool for building sophisticated NLP workflows. The library’s design revolves around a pipeline structure, an ordered set of text annotators that facilitate tasks such as tokenization, normalization, stemming, lemmatization, and regular expression-based processing. Advanced features include text matching with tools like TextMatcher and DateMatcher, sentence detection (both traditional and deep learning-based), part-of-speech tagging, sentiment analysis, named entity recognition, dependency parsing, document classification, and spell checking. The library supports a wide range of languages, including East Asian (e.g., Chinese, Japanese, Korean) and right-to-left languages (e.g., Urdu, Farsi, Arabic, Hebrew). The Models Hub offers access to open-source and licensed pre-trained models and pipelines for over 200 languages. These resources include tokenization, lemmatization, part-of-speech tagging, named entity recognition, and embeddings such as GloVe, ELMo, BERT, ALBERT, XLNet, Small BERT, and ELECTRA. Sentence embeddings like Universal Sentence Embeddings (USE) and Language Agnostic BERT Sentence Embeddings (LaBSE) are also available. Spark NLP for Healthcare is a commercial extension designed specifically for clinical and biomedical text mining. It provides domain-specific annotators and pipelines for tasks such as clinical entity recognition, entity linking, normalization, assertion detection, de-identification, and relation extraction. \n"
    "Additionally, it includes pre-trained models like JSL-BERT-Clinical, BioBERT, and ClinicalBERT, which are optimized for identifying clinical concepts, drugs, risk factors, anatomy, demographics, and sensitive data. Another notable extension is Spark OCR, which is designed for optical character recognition (OCR) on images, scanned PDFs, and DICOM files. Built on Apache Spark, it supports tasks like text extraction, de-noising, skew correction, layout analysis, and masking sensitive information. Tight integration with Spark NLP allows users to combine OCR and NLP tasks, enabling workflows such as extracting and processing text from images or de-identifying sensitive information in scanned documents. It supports multiple output formats, including JSON, CSV, PDF, and annotated images, and is scalable across Spark clusters. Spark NLP is licensed under the Apache 2.0 license, with source code publicly available on GitHub. Prebuilt versions are distributed through PyPi and Anaconda for Python, Maven Central for Java and Scala, and Spark Packages for Spark development. Recognized for its contributions to NLP, the library received the Open Source Award in 2019."
)

# Default example text per task; the text area is pre-filled with
# examples[task] for the task currently selected.
examples = {
    "Summarization": _SPARK_NLP_OVERVIEW,
    "Text Completion": "The rise of artificial intelligence has transformed various industries. One of the most significant advancements is its ability to process natural language. With tools like large language models, we can now create personalized content, automate customer service, and even generate creative works. However, this rapid progress also raises ethical questions and concerns about...",
    "Translation": _SPARK_NLP_OVERVIEW,
    "Paraphrasing": "Spark NLP is an open-source text processing library designed for advanced natural language processing (NLP) in Python, Java, and Scala. Built on top of Apache Spark and its Spark ML library, it provides APIs for scalable, production-grade NLP pipelines that incorporate recent academic research. The library includes pre-trained neural network models, pipelines, embeddings, and support for training custom models, making it a robust tool for building sophisticated NLP workflows. The library’s design revolves around a pipeline structure, an ordered set of text annotators that facilitate tasks such as tokenization, normalization, stemming, lemmatization, and regular expression-based processing. Advanced features include text matching with tools like TextMatcher and DateMatcher, sentence detection (both traditional and deep learning-based), part-of-speech tagging, sentiment analysis, named entity recognition, dependency parsing, document classification, and spell checking. The library supports a wide range of languages, including East Asian (e.g., Chinese, Japanese, Korean) and right-to-left languages (e.g., Arabic, Hebrew). Prebuilt versions are distributed through PyPi and Anaconda for Python, Maven Central for Java and Scala, and Spark Packages for Spark development.",
    # Fixed: the question previously ended with a stray comma ("tasks?,").
    "Question Answering": """
Context:
Spark NLP is an open-source library designed for advanced natural language processing (NLP) tasks in Python, Java, and Scala. It is built on top of Apache Spark and provides APIs for creating scalable, production-grade NLP pipelines. The library includes pre-trained models, pipelines, and embeddings, supporting a wide range of languages and tasks. Its features include tokenization, part-of-speech tagging, named entity recognition, sentiment analysis, dependency parsing, and document classification. Spark NLP also offers tools for healthcare and OCR-related workflows through its commercial extensions, making it a comprehensive tool for NLP solutions.
Question:
What are the main features of Spark NLP, and what extensions does it offer for specialized tasks?
""",
    "Code Generation": "Generate a Python script that reads a CSV file, removes rows with missing values, and saves the cleaned data to a new file.",
}
# Two-column page layout: inputs on the left (sec1), model output on the
# right (sec2).
sec1, sec2 = st.columns(2)

with sec1, st.container():
    # Task and model pickers sit side by side.
    col1, col2 = st.columns(2)
    task = col1.selectbox("Task:", tasks_prompt_map.keys())
    model = col2.selectbox("Model:", model_list)

    # System prompt, pre-filled with the default for the selected task.
    prompt = st.text_input('Prompt for the Model:', tasks_prompt_map[task])

    # Translation additionally needs the source and target languages.
    if task == 'Translation':
        col1, col2 = st.columns(2)
        from_lang = col1.text_input("From Language:", "English")
        to_lang = col2.text_input("To Language:", "French")

    text_to_analyze = st.text_area("Form an Example text:", examples[task], height=620)

    # Prepend a task-specific instruction for the two tasks that need one;
    # every other task sends the text unchanged.
    if task == 'Translation':
        text_to_analyze = f"Translate the following document from {from_lang} to {to_lang}: {text_to_analyze}"
    elif task == 'Summarization':
        text_to_analyze = f"Summarize the following document: {text_to_analyze}"
with sec2:
    with st.container():
        # NLP pipeline execution.
        # `spark` has to remain a module-level name: fit_data() and
        # render_footer() both read it as a global.
        spark = init_spark()
        pipeline = create_pipeline(model, prompt)
        output = fit_data(pipeline, text_to_analyze)

        # Guard against an empty completion list so the page shows a warning
        # instead of crashing with IndexError. `result` is always left as a
        # string because later code calls result.split().
        try:
            result = output['completions'][0]
        except IndexError:
            result = ""
            st.warning("The model returned no completion for this input.")

        # Display results
        st.write('Result:')
        st.container(border=True).write(result)
# Additional analysis (Summarization task only): word counts for the original
# text and the generated summary, shown as a bar chart and a donut chart.
# NOTE(review): the nesting was reconstructed from a flattened source; this
# panel is placed at top level (full page width) -- confirm that is intended.
if task == 'Summarization':
    with st.container(border=True):
        # text_to_analyze carries the instruction prefix that was prepended
        # for the model; strip it so only the user's document is counted.
        instruction_prefix = "Summarize the following document: "
        if text_to_analyze.startswith(instruction_prefix):
            source_text = text_to_analyze[len(instruction_prefix):]
        else:
            source_text = text_to_analyze

        # Calculate stats
        original_wc = len(source_text.split())
        summary_wc = len(result.split())
        # Clamp at zero: a summary longer than the source would otherwise
        # feed a negative slice value to the pie chart.
        compressed_wc = max(original_wc - summary_wc, 0)
        compression_ratio = (summary_wc / original_wc) * 100 if original_wc else 0

        # Prepare pie chart data for visualization
        pie_data = pd.DataFrame({
            "Category": ["Summary", "Compressed"],
            "Words": [summary_wc, compressed_wc],
        })

        st.subheader("Summary Statistics")
        col1, col2 = st.columns(2)
        with col1:
            data = pd.DataFrame({"Text": ["Original", "Summary"], "Count": [original_wc, summary_wc]})
            st.bar_chart(data.set_index("Text"), height=400)
            # Two spaces before "\n" make a Markdown hard line break, so the
            # two figures render on separate lines.
            st.markdown(f"**Original Word Count:** {original_wc}  \n**Summary Word Count:** {summary_wc}")
        with col2:
            fig = px.pie(
                pie_data,
                values="Words",
                names="Category",
                # color= is what ties color_discrete_map to the categories.
                color="Category",
                color_discrete_map={"Summary": "#1f77b4", "Compressed": "#ff7f0e"},
                title=f"Compression Ratio: {compression_ratio:.1f}%",
                hole=0.4,
            )
            fig.update_layout(width=400, height=400)
            st.plotly_chart(fig, use_container_width=False)
            st.markdown(f"**Words Compressed:** {compressed_wc}  \n**Compression Ratio:** {compression_ratio:.1f}%")

render_footer()