Update app.py
Browse files
app.py
CHANGED
@@ -1,6 +1,20 @@
|
|
1 |
import streamlit as st
|
2 |
import numpy as np
|
|
|
|
|
3 |
from Bio.Seq import Seq
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
|
5 |
def calculate_gc_content(seq):
|
6 |
gc_count = seq.count('G') + seq.count('C')
|
@@ -27,7 +41,7 @@ def find_potential_regulatory_regions(seq, window_size=50, gc_threshold=60):
|
|
27 |
if in_region:
|
28 |
regulatory_regions.append((start, len(seq)))
|
29 |
|
30 |
-
return regulatory_regions
|
31 |
|
32 |
def analyze_dark_matter(sequence):
|
33 |
seq = Seq(sequence)
|
@@ -41,34 +55,75 @@ def analyze_dark_matter(sequence):
|
|
41 |
caat_box = seq.count("CCAAT")
|
42 |
|
43 |
# Find potential regulatory regions based on GC content
|
44 |
-
regulatory_regions = find_potential_regulatory_regions(seq)
|
45 |
|
46 |
-
return length, gc_content, tata_box, caat_box, regulatory_regions
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
47 |
|
48 |
# Streamlit app
|
49 |
st.title("Genomic Dark Matter Analyzer")
|
50 |
|
51 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
52 |
|
53 |
if st.button("Analyze"):
|
54 |
if sequence:
|
55 |
-
length, gc_content, tata_box, caat_box, regulatory_regions = analyze_dark_matter(sequence)
|
|
|
|
|
56 |
|
57 |
-
st.write(f"Sequence Length
|
58 |
-
st.write(
|
59 |
-
|
60 |
-
st.write(f"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
61 |
|
62 |
st.subheader("Potential Regulatory Regions (based on GC content):")
|
63 |
-
|
64 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
65 |
|
66 |
# Visualize the sequence with highlighted regions
|
|
|
67 |
highlighted_seq = list(sequence)
|
68 |
for start, end in regulatory_regions:
|
69 |
for i in range(start, min(end, len(highlighted_seq))):
|
70 |
highlighted_seq[i] = f"<span style='background-color: yellow'>{highlighted_seq[i]}</span>"
|
71 |
|
72 |
st.markdown("".join(highlighted_seq), unsafe_allow_html=True)
|
|
|
73 |
else:
|
74 |
-
st.write("Please enter a DNA sequence.")
|
|
|
1 |
import html
from io import StringIO

import matplotlib.pyplot as plt
import numpy as np
import streamlit as st
from Bio import Entrez, SeqIO
from Bio.Seq import Seq
|
7 |
+
|
8 |
+
# Set your email for NCBI Entrez
# NOTE(review): the value below looks like a redacted placeholder rather than a
# real address -- confirm and replace it with a valid contact email before use.
Entrez.email = "[email protected]"
|
10 |
+
|
11 |
+
def fetch_sequence_from_ncbi(accession):
    """Fetch a nucleotide sequence from NCBI and return it as a plain string.

    Args:
        accession: Identifier passed as ``id`` to ``Entrez.efetch`` against
            the ``nucleotide`` database.

    Returns:
        The sequence of the single FASTA record as ``str``, or ``None`` when
        the accession is empty or the download/parse fails.
    """
    # Nothing to look up: skip the network round-trip entirely.
    if not accession:
        return None

    try:
        handle = Entrez.efetch(db="nucleotide", id=accession, rettype="fasta", retmode="text")
        try:
            record = SeqIO.read(handle, "fasta")
        finally:
            # Always release the underlying connection, even if parsing fails.
            handle.close()
    except Exception:
        # Best-effort lookup: callers treat None as "fetch failed". Unlike a
        # bare ``except:``, this lets KeyboardInterrupt/SystemExit propagate.
        return None

    return str(record.seq)
|
18 |
|
19 |
def calculate_gc_content(seq):
|
20 |
gc_count = seq.count('G') + seq.count('C')
|
|
|
41 |
if in_region:
|
42 |
regulatory_regions.append((start, len(seq)))
|
43 |
|
44 |
+
return regulatory_regions, gc_content
|
45 |
|
46 |
def analyze_dark_matter(sequence):
|
47 |
seq = Seq(sequence)
|
|
|
55 |
caat_box = seq.count("CCAAT")
|
56 |
|
57 |
# Find potential regulatory regions based on GC content
|
58 |
+
regulatory_regions, gc_distribution = find_potential_regulatory_regions(seq)
|
59 |
|
60 |
+
return length, gc_content, tata_box, caat_box, regulatory_regions, gc_distribution
|
61 |
+
|
62 |
+
def plot_gc_distribution(gc_distribution, gc_threshold=60):
    """Build a line plot of GC content values with a threshold marker.

    Args:
        gc_distribution: Sequence of numeric values, plotted against their
            index (presumably GC percentages -- verify against the producer).
        gc_threshold: Y value at which the dashed red reference line is
            drawn; also shown in the legend label. Defaults to 60.

    Returns:
        The matplotlib ``Figure`` containing the plot. The caller is
        responsible for rendering and closing it.
    """
    fig, ax = plt.subplots(figsize=(10, 4))
    ax.plot(gc_distribution)
    ax.set_xlabel('Sequence Position')
    ax.set_ylabel('GC Content (%)')
    ax.set_title('GC Content Distribution')
    # Keep the reference line and its legend label in sync with the threshold.
    ax.axhline(y=gc_threshold, color='r', linestyle='--', label=f'GC Threshold ({gc_threshold}%)')
    ax.legend()
    return fig
|
71 |
|
72 |
# Streamlit app
st.title("Genomic Dark Matter Analyzer")

sequence_input = st.radio("Choose input method:", ("Enter sequence", "Fetch from NCBI"))

# Bind `sequence` up front so the "Analyze" handler never hits an unbound name
# when the NCBI option is selected but no accession has been entered yet.
sequence = None

if sequence_input == "Enter sequence":
    sequence = st.text_area("Paste your DNA sequence here", height=150)
else:
    accession = st.text_input("Enter NCBI accession number")
    if accession:
        sequence = fetch_sequence_from_ncbi(accession)
        if sequence:
            st.success(f"Successfully fetched sequence for {accession}")
        else:
            st.error("Failed to fetch sequence. Please check the accession number.")

if st.button("Analyze"):
    if sequence:
        length, gc_content, tata_box, caat_box, regulatory_regions, gc_distribution = analyze_dark_matter(sequence)

        st.subheader("Analysis Results")

        st.write(f"**Sequence Length:** {length} base pairs")
        st.write("*Description: This is the total number of nucleotides in the sequence.*")

        st.write(f"**Overall GC Content:** {gc_content:.2f}%")
        st.write("*Description: GC content is the percentage of G and C bases in the DNA. Higher GC content (>60%) is often associated with gene-rich regions or regulatory elements.*")

        st.write(f"**TATA Box motifs:** {tata_box}")
        st.write("*Description: TATA boxes are common promoter elements in eukaryotes, typically found about 25-35 base pairs upstream of the transcription start site.*")

        st.write(f"**CAAT Box motifs:** {caat_box}")
        st.write("*Description: CAAT boxes are another common promoter element, often found about 75-80 base pairs upstream of the transcription start site.*")

        st.subheader("Potential Regulatory Regions (based on GC content):")
        if regulatory_regions:
            for start, end in regulatory_regions:
                st.write(f"Region from base {start} to {end}")
        else:
            st.write("No potential regulatory regions identified based on GC content.")
        st.write("*Description: These regions have a GC content above 60% over a 50 base pair window, which may indicate regulatory function.*")

        st.subheader("GC Content Distribution")
        fig = plot_gc_distribution(gc_distribution)
        st.pyplot(fig)
        # The figure has been rendered; close it so pyplot does not keep
        # accumulating figures across Streamlit reruns.
        plt.close(fig)
        st.write("*Description: This plot shows how GC content varies along the sequence. Peaks above the red line (60% threshold) may indicate potential regulatory regions.*")

        # Visualize the sequence with highlighted regions
        st.subheader("Sequence Visualization")
        # The sequence is user-supplied and rendered as raw HTML below, so
        # escape every character before wrapping any of them in markup.
        highlighted_seq = [html.escape(base) for base in sequence]
        for start, end in regulatory_regions:
            for i in range(start, min(end, len(highlighted_seq))):
                highlighted_seq[i] = f"<span style='background-color: yellow'>{highlighted_seq[i]}</span>"

        st.markdown("".join(highlighted_seq), unsafe_allow_html=True)
        st.write("*Description: This is a visualization of the sequence with potential regulatory regions highlighted in yellow.*")
    else:
        st.write("Please enter a DNA sequence or provide a valid NCBI accession number.")
|