Accelernate committed on
Commit
bed5689
·
verified ·
1 Parent(s): ca3e4c2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +67 -12
app.py CHANGED
@@ -1,6 +1,20 @@
1
  import streamlit as st
2
  import numpy as np
 
 
3
  from Bio.Seq import Seq
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
  def calculate_gc_content(seq):
6
  gc_count = seq.count('G') + seq.count('C')
@@ -27,7 +41,7 @@ def find_potential_regulatory_regions(seq, window_size=50, gc_threshold=60):
27
  if in_region:
28
  regulatory_regions.append((start, len(seq)))
29
 
30
- return regulatory_regions
31
 
32
  def analyze_dark_matter(sequence):
33
  seq = Seq(sequence)
@@ -41,34 +55,75 @@ def analyze_dark_matter(sequence):
41
  caat_box = seq.count("CCAAT")
42
 
43
  # Find potential regulatory regions based on GC content
44
- regulatory_regions = find_potential_regulatory_regions(seq)
45
 
46
- return length, gc_content, tata_box, caat_box, regulatory_regions
 
 
 
 
 
 
 
 
 
 
47
 
48
  # Streamlit app
49
  st.title("Genomic Dark Matter Analyzer")
50
 
51
- sequence = st.text_area("Paste your DNA sequence here", height=150)
 
 
 
 
 
 
 
 
 
 
 
52
 
53
  if st.button("Analyze"):
54
  if sequence:
55
- length, gc_content, tata_box, caat_box, regulatory_regions = analyze_dark_matter(sequence)
 
 
56
 
57
- st.write(f"Sequence Length: {length}")
58
- st.write(f"Overall GC Content: {gc_content:.2f}%")
59
- st.write(f"TATA Box motifs: {tata_box}")
60
- st.write(f"CAAT Box motifs: {caat_box}")
 
 
 
 
 
 
 
61
 
62
  st.subheader("Potential Regulatory Regions (based on GC content):")
63
- for start, end in regulatory_regions:
64
- st.write(f"Region from base {start} to {end}")
 
 
 
 
 
 
 
 
 
65
 
66
  # Visualize the sequence with highlighted regions
 
67
  highlighted_seq = list(sequence)
68
  for start, end in regulatory_regions:
69
  for i in range(start, min(end, len(highlighted_seq))):
70
  highlighted_seq[i] = f"<span style='background-color: yellow'>{highlighted_seq[i]}</span>"
71
 
72
  st.markdown("".join(highlighted_seq), unsafe_allow_html=True)
 
73
  else:
74
- st.write("Please enter a DNA sequence.")
 
1
  import streamlit as st
2
  import numpy as np
3
+ import matplotlib.pyplot as plt
4
+ from Bio import Entrez, SeqIO
5
  from Bio.Seq import Seq
6
+ from io import StringIO
7
+
8
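+ # Set the contact email sent with NCBI Entrez requests (the address below appears redacted in this view; substitute a real one before running)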
9
+ Entrez.email = "[email protected]"
10
+
11
def fetch_sequence_from_ncbi(accession):
    """Fetch a nucleotide sequence from NCBI by accession.

    Requests the record from the Entrez ``nucleotide`` database as FASTA
    text and parses it as a single FASTA record.

    Args:
        accession: Identifier passed to ``Entrez.efetch`` as ``id``.

    Returns:
        The record's sequence as a ``str``, or ``None`` if the request or
        the parsing failed for any reason.
    """
    try:
        handle = Entrez.efetch(
            db="nucleotide", id=accession, rettype="fasta", retmode="text"
        )
        # Release the underlying connection even when parsing fails.
        try:
            record = SeqIO.read(handle, "fasta")
        finally:
            handle.close()
    except Exception:
        # The caller treats None as "could not fetch". Catching Exception
        # (rather than a bare `except:`) keeps that best-effort contract
        # while letting KeyboardInterrupt / SystemExit propagate.
        return None
    return str(record.seq)
18
 
19
  def calculate_gc_content(seq):
20
  gc_count = seq.count('G') + seq.count('C')
 
41
  if in_region:
42
  regulatory_regions.append((start, len(seq)))
43
 
44
+ return regulatory_regions, gc_content
45
 
46
  def analyze_dark_matter(sequence):
47
  seq = Seq(sequence)
 
55
  caat_box = seq.count("CCAAT")
56
 
57
  # Find potential regulatory regions based on GC content
58
+ regulatory_regions, gc_distribution = find_potential_regulatory_regions(seq)
59
 
60
+ return length, gc_content, tata_box, caat_box, regulatory_regions, gc_distribution
61
+
62
def plot_gc_distribution(gc_distribution, gc_threshold=60):
    """Build a line plot of GC content with a horizontal threshold marker.

    Args:
        gc_distribution: Sequence of GC-content values to plot, in order.
        gc_threshold: Y position (in percent) of the dashed red reference
            line; also shown in the legend label. Defaults to 60, the value
            that was previously hard-coded here.

    Returns:
        The matplotlib ``Figure`` containing the plot. The caller is
        responsible for rendering and closing it.
    """
    fig, ax = plt.subplots(figsize=(10, 4))
    ax.plot(gc_distribution)
    ax.set_xlabel('Sequence Position')
    ax.set_ylabel('GC Content (%)')
    ax.set_title('GC Content Distribution')
    # Keep the drawn line and its legend text driven by the same value so
    # they cannot drift apart.
    ax.axhline(
        y=gc_threshold,
        color='r',
        linestyle='--',
        label=f'GC Threshold ({gc_threshold}%)',
    )
    ax.legend()
    return fig
71
 
72
  # Streamlit app
73
  st.title("Genomic Dark Matter Analyzer")
74
 
75
# --- Input selection -------------------------------------------------------
# Local import: only the sequence visualization below needs it.
import html

sequence_input = st.radio("Choose input method:", ("Enter sequence", "Fetch from NCBI"))

# Bind `sequence` on every path. Without this, choosing "Fetch from NCBI",
# leaving the accession box empty and pressing "Analyze" raised NameError.
sequence = None

if sequence_input == "Enter sequence":
    sequence = st.text_area("Paste your DNA sequence here", height=150)
else:
    accession = st.text_input("Enter NCBI accession number")
    if accession:
        sequence = fetch_sequence_from_ncbi(accession)
        if sequence:
            st.success(f"Successfully fetched sequence for {accession}")
        else:
            st.error("Failed to fetch sequence. Please check the accession number.")

# --- Analysis and report ---------------------------------------------------
if st.button("Analyze"):
    if sequence:
        (
            length,
            gc_content,
            tata_box,
            caat_box,
            regulatory_regions,
            gc_distribution,
        ) = analyze_dark_matter(sequence)

        st.subheader("Analysis Results")

        st.write(f"**Sequence Length:** {length} base pairs")
        st.write("*Description: This is the total number of nucleotides in the sequence.*")

        st.write(f"**Overall GC Content:** {gc_content:.2f}%")
        st.write("*Description: GC content is the percentage of G and C bases in the DNA. Higher GC content (>60%) is often associated with gene-rich regions or regulatory elements.*")

        st.write(f"**TATA Box motifs:** {tata_box}")
        st.write("*Description: TATA boxes are common promoter elements in eukaryotes, typically found about 25-35 base pairs upstream of the transcription start site.*")

        st.write(f"**CAAT Box motifs:** {caat_box}")
        st.write("*Description: CAAT boxes are another common promoter element, often found about 75-80 base pairs upstream of the transcription start site.*")

        st.subheader("Potential Regulatory Regions (based on GC content):")
        if regulatory_regions:
            for start, end in regulatory_regions:
                st.write(f"Region from base {start} to {end}")
        else:
            st.write("No potential regulatory regions identified based on GC content.")
        st.write("*Description: These regions have a GC content above 60% over a 50 base pair window, which may indicate regulatory function.*")

        st.subheader("GC Content Distribution")
        fig = plot_gc_distribution(gc_distribution)
        st.pyplot(fig)
        # pyplot keeps every figure it created alive until it is closed;
        # close it after rendering so figures do not pile up across reruns.
        plt.close(fig)
        st.write("*Description: This plot shows how GC content varies along the sequence. Peaks above the red line (60% threshold) may indicate potential regulatory regions.*")

        # Visualize the sequence with highlighted regions
        st.subheader("Sequence Visualization")
        # The sequence is user-supplied and rendered with unsafe_allow_html,
        # so escape each character before wrapping it in markup. One list
        # entry per original character keeps region indices aligned.
        highlighted_seq = [html.escape(base) for base in sequence]
        for start, end in regulatory_regions:
            for i in range(start, min(end, len(highlighted_seq))):
                highlighted_seq[i] = f"<span style='background-color: yellow'>{highlighted_seq[i]}</span>"

        st.markdown("".join(highlighted_seq), unsafe_allow_html=True)
        st.write("*Description: This is a visualization of the sequence with potential regulatory regions highlighted in yellow.*")
    else:
        st.write("Please enter a DNA sequence or provide a valid NCBI accession number.")