Spaces:

ahmadtalha
/

DNA-Sequence-Classification

Running

App Files Files Community

ahmadtalha committed 14 days ago

Commit

699dac2

verified ·

1 Parent(s): 3b68c42

Upload app.py

Browse files

Files changed (1) hide show

app.py +79 -0

app.py ADDED Viewed

	@@ -0,0 +1,79 @@

+import streamlit as st
+import pandas as pd
+import matplotlib.pyplot as plt
+from collections import Counter
+import joblib
+import logomaker
+from sklearn.feature_extraction.text import CountVectorizer
+# Load model and vectorizer
+model = joblib.load("Model/naive_bayes_model.pkl")  # Update path if needed
+vectorizer = joblib.load("Model/count_vectorizer.pkl")  # Update path if needed
+# Class mapping
+class_mappings = {
+    0: "G Protein Coupled Receptors",
+    1: "Tyrosine Kinase",
+    2: "Tyrosine Phosphatase",
+    3: "Synthetase",
+    4: "Synthase",
+    5: "Ion Channel",
+    6: "Transcription Factor"
+}
+# Function to extract k-mers
+def get_kmers(sequence, size=6):
+    return [sequence[i:i+size] for i in range(len(sequence)-size+1)]
+# Page title
+st.title("🧬 DNA Sequence Classifier")
+# Sidebar
+st.sidebar.header("Input Options")
+uploaded_file = st.sidebar.file_uploader("Upload DNA Sequence File (.txt)", type=["txt"])
+# Read uploaded file
+sequence = ""
+if uploaded_file:
+    raw = uploaded_file.read().decode("utf-8")
+    # Remove FASTA headers if present
+    sequence = ''.join([line.strip() for line in raw.splitlines() if not line.startswith(">")]).upper()
+    st.subheader("📥 Input DNA Sequence")
+    st.text_area("Sequence (first 1000 characters shown)", sequence[:1000], height=150)
+    # Base Distribution
+    st.subheader("🔬 Nucleotide Distribution")
+    base_counts = Counter(sequence)
+    bases = ['A', 'T', 'G', 'C']
+    counts = [base_counts.get(base, 0) for base in bases]
+    fig1, ax1 = plt.subplots()
+    ax1.bar(bases, counts, color=['green', 'red', 'blue', 'orange'])
+    ax1.set_ylabel("Count")
+    st.pyplot(fig1)
+    # Top k-mers
+    st.subheader("🔠 Top 10 6-mers")
+    kmers = get_kmers(sequence, size=6)
+    top_kmers = Counter(kmers).most_common(10)
+    df_top = pd.DataFrame(top_kmers, columns=["6-mer", "Count"])
+    st.dataframe(df_top)
+    # Prediction
+    st.subheader("🤖 Predicted Class")
+    kmers_text = ' '.join(kmers)
+    vectorized = vectorizer.transform([kmers_text])
+    pred = model.predict(vectorized)[0]
+    proba = model.predict_proba(vectorized)[0]
+    st.markdown(f"### 🧬 Class: `{class_mappings[pred]}`")
+    st.markdown(f"Confidence: `{proba[pred]*100:.2f}%`")
+    # Optional: Show sequence logo (if short enough)
+    if len(sequence) <= 100:
+        st.subheader("📈 Sequence Logo")
+        logo_df = logomaker.alignment_to_matrix([sequence])
+        fig2, ax2 = plt.subplots(figsize=(10, 3))
+        logomaker.Logo(logo_df, ax=ax2)
+        st.pyplot(fig2)
+else:
+    st.info("Please upload a DNA sequence file to begin.")