Upload app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import matplotlib.pyplot as plt
|
| 4 |
+
from collections import Counter
|
| 5 |
+
import joblib
|
| 6 |
+
import logomaker
|
| 7 |
+
from sklearn.feature_extraction.text import CountVectorizer
|
| 8 |
+
|
| 9 |
+
# Load model and vectorizer
@st.cache_resource
def _load_artifacts():
    """Load the pickled classifier and its fitted vectorizer.

    Streamlit re-executes this script on every widget interaction, so the
    load is wrapped in ``st.cache_resource`` to read the files from disk only
    once per server process instead of on every rerun.

    Returns:
        A ``(model, vectorizer)`` tuple as stored in the two ``.pkl`` files.
    """
    # NOTE: joblib.load unpickles arbitrary Python objects, so these files
    # must come from a trusted source (never from user uploads).
    classifier = joblib.load("Model/naive_bayes_model.pkl")  # Update path if needed
    fitted_vectorizer = joblib.load("Model/count_vectorizer.pkl")  # Update path if needed
    return classifier, fitted_vectorizer


model, vectorizer = _load_artifacts()
|
| 12 |
+
|
| 13 |
+
# Class mapping: integer label -> human-readable family name.
# The position of each name in the list is its integer key (0 through 6).
class_mappings = dict(
    enumerate(
        [
            "G Protein Coupled Receptors",
            "Tyrosine Kinase",
            "Tyrosine Phosphatase",
            "Synthetase",
            "Synthase",
            "Ion Channel",
            "Transcription Factor",
        ]
    )
)
|
| 23 |
+
|
| 24 |
+
# Function to extract k-mers
|
| 25 |
+
def get_kmers(sequence, size=6):
    """Return every overlapping window of ``size`` characters from ``sequence``.

    Windows are taken at consecutive start offsets, left to right. A sequence
    shorter than ``size`` produces an empty list.
    """
    window_count = len(sequence) - size + 1
    windows = []
    for start in range(window_count):
        windows.append(sequence[start:start + size])
    return windows
|
| 27 |
+
|
| 28 |
+
# Page title
# NOTE(review): the heading emoji below were restored from mis-encoded text;
# the icons on "Top 10 6-mers" and "Sequence Logo" are best guesses.
st.title("🧬 DNA Sequence Classifier")

# Sidebar
st.sidebar.header("Input Options")
uploaded_file = st.sidebar.file_uploader("Upload DNA Sequence File (.txt)", type=["txt"])

# Read uploaded file
sequence = ""
if uploaded_file:
    # "utf-8-sig" drops a leading byte-order mark, which would otherwise hide
    # a FASTA ">" header on the first line. Undecodable bytes are replaced
    # instead of aborting the run with a traceback.
    raw = uploaded_file.read().decode("utf-8-sig", errors="replace")

    # Remove FASTA headers if present. Lines are stripped first so that an
    # indented ">" header is still recognised as a header.
    stripped_lines = (line.strip() for line in raw.splitlines())
    sequence = "".join(
        line for line in stripped_lines if not line.startswith(">")
    ).upper()

    st.subheader("📥 Input DNA Sequence")
    st.text_area("Sequence (first 1000 characters shown)", sequence[:1000], height=150)

    # Base Distribution
    st.subheader("🔬 Nucleotide Distribution")
    base_counts = Counter(sequence)
    bases = ['A', 'T', 'G', 'C']
    counts = [base_counts.get(base, 0) for base in bases]
    fig1, ax1 = plt.subplots()
    ax1.bar(bases, counts, color=['green', 'red', 'blue', 'orange'])
    ax1.set_ylabel("Count")
    st.pyplot(fig1)
    # pyplot keeps every figure alive until it is closed; without this the
    # figures pile up across Streamlit reruns.
    plt.close(fig1)

    # Top k-mers
    st.subheader("📊 Top 10 6-mers")
    kmers = get_kmers(sequence, size=6)
    top_kmers = Counter(kmers).most_common(10)
    df_top = pd.DataFrame(top_kmers, columns=["6-mer", "Count"])
    st.dataframe(df_top)

    # Prediction
    st.subheader("🤖 Predicted Class")
    if kmers:
        kmers_text = ' '.join(kmers)
        vectorized = vectorizer.transform([kmers_text])
        pred = model.predict(vectorized)[0]
        proba = model.predict_proba(vectorized)[0]
        # predict_proba columns are ordered like model.classes_, so look the
        # predicted label up there instead of assuming label == column index.
        pred_column = list(model.classes_).index(pred)

        st.markdown(f"### 🧬 Class: `{class_mappings[pred]}`")
        st.markdown(f"Confidence: `{proba[pred_column]*100:.2f}%`")
    else:
        # No 6-mer fits in the sequence, so the feature vector would be all
        # zeros and any prediction would be meaningless.
        st.warning("The sequence is shorter than 6 bases, so it cannot be classified.")

    # Optional: Show sequence logo (if short enough and not empty)
    if 0 < len(sequence) <= 100:
        st.subheader("📈 Sequence Logo")
        logo_df = logomaker.alignment_to_matrix([sequence])
        fig2, ax2 = plt.subplots(figsize=(10, 3))
        logomaker.Logo(logo_df, ax=ax2)
        st.pyplot(fig2)
        plt.close(fig2)
else:
    st.info("Please upload a DNA sequence file to begin.")
|