Accelernate committed on
Commit
28578a5
·
verified ·
1 Parent(s): a7ea9ce

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +75 -0
app.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import numpy as np
3
+ from Bio import SeqIO
4
+ from Bio.Seq import Seq
5
+ from hmmlearn import hmm
6
+
7
+ # Function to encode DNA sequence
8
def encode_sequence(seq):
    """Encode a DNA sequence as an integer array for the HMM.

    Bases are mapped A->0, C->1, G->2, T->3, case-insensitively. Any other
    character (ambiguity codes such as N, gaps, whitespace, ...) is skipped,
    so the result may be shorter than ``seq``.

    Args:
        seq: Iterable of single-character strings (e.g. a ``str``).

    Returns:
        A 1-D integer ``numpy`` array with one entry per recognised base.
        Empty input produces an empty integer array.
    """
    encoding = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    # Upper-case each character individually so lowercase (soft-masked)
    # bases are encoded rather than silently dropped.
    codes = (encoding.get(base.upper()) for base in seq)
    # An explicit dtype keeps the empty result an integer array; without it
    # numpy would default an empty list to float64.
    return np.array([code for code in codes if code is not None], dtype=int)
11
+
12
# Placeholder two-state HMM with hand-set parameters (it is not trained on
# any data). Emission columns follow the A, C, G, T order used by
# encode_sequence: state 0 emits all four bases uniformly, state 1 favours
# C and G.
#
# Newer hmmlearn releases model integer-coded symbols with CategoricalHMM,
# while MultinomialHMM there expects per-sample count vectors and rejects the
# (n_samples, 1) symbol column this app decodes. Prefer CategoricalHMM and
# fall back to MultinomialHMM only on older releases that do not provide it.
_DiscreteHMM = getattr(hmm, "CategoricalHMM", None) or hmm.MultinomialHMM

model = _DiscreteHMM(n_components=2, random_state=42)
model.startprob_ = np.array([0.5, 0.5])
model.transmat_ = np.array([[0.7, 0.3],
                            [0.3, 0.7]])
model.emissionprob_ = np.array([[0.25, 0.25, 0.25, 0.25],
                                [0.20, 0.30, 0.30, 0.20]])
19
+
20
def _state_runs(states, target):
    """Return half-open (start, end) index pairs for each run of ``target`` in ``states``."""
    runs = []
    run_start = None
    for i, state in enumerate(states):
        if state == target:
            if run_start is None:
                run_start = i
        elif run_start is not None:
            runs.append((run_start, i))
            run_start = None
    if run_start is not None:
        # The final run extends to the end of the sequence.
        runs.append((run_start, len(states)))
    return runs


def analyze_dark_matter(sequence):
    """Compute basic statistics and HMM-derived regions for a DNA sequence.

    Args:
        sequence: The DNA sequence as text. Upper- and lowercase A/C/G/T are
            analysed; every other character is ignored by the GC and HMM
            steps.

    Returns:
        A tuple ``(length, gc_content, tata_box, caat_box, regulatory_regions)``:

        * ``length`` - number of characters in ``sequence`` as given.
        * ``gc_content`` - percentage (0-100) of G and C among the recognised
          A/C/G/T bases; ``0.0`` when there are none.
        * ``tata_box`` / ``caat_box`` - non-overlapping occurrence counts of
          ``TATAAA`` and ``CCAAT``, ignoring case and whitespace.
        * ``regulatory_regions`` - list of half-open ``(start, end)`` index
          pairs into ``sequence`` covering each run the HMM decodes as
          state 1.
    """
    # Fold lowercase bases to uppercase one character at a time, so every
    # index into `text` is still a valid index into the caller's input.
    text = str(sequence).translate(str.maketrans("acgt", "ACGT"))
    length = len(text)

    # Remember where each usable base sits in the input; the HMM only sees
    # these bases, and its results are mapped back through this list.
    base_positions = [i for i, base in enumerate(text) if base in "ACGT"]
    bases = "".join(text[i] for i in base_positions)

    # GC percentage over recognised bases. Bio.SeqIO has no GC() helper, so
    # it is computed directly here.
    gc_count = bases.count("G") + bases.count("C")
    gc_content = 100.0 * gc_count / len(bases) if bases else 0.0

    # Look for common regulatory motifs. Whitespace is removed first so a
    # motif split by a line wrap in pasted text is still counted.
    motif_text = "".join(text.split())
    tata_box = motif_text.count("TATAAA")
    caat_box = motif_text.count("CCAAT")

    # Nothing to decode: skip the HMM rather than pass it an empty array.
    if not bases:
        return length, gc_content, tata_box, caat_box, []

    # HMM analysis
    encoded_seq = encode_sequence(bases)
    _logprob, hidden_states = model.decode(encoded_seq.reshape(-1, 1))

    # Translate runs from encoded coordinates back to input coordinates. The
    # two are identical when the input contains only A/C/G/T.
    regulatory_regions = [
        (base_positions[start], base_positions[end - 1] + 1)
        for start, end in _state_runs(hidden_states, 1)
    ]

    return length, gc_content, tata_box, caat_box, regulatory_regions
48
+
49
# Streamlit app: collect a pasted sequence, run the analysis on demand, and
# render the statistics plus the sequence with decoded regions highlighted.
import html  # stdlib; escapes user text before it is rendered as raw HTML

st.title("Genomic Dark Matter Analyzer")

sequence = st.text_area("Paste your DNA sequence here", height=150)

if st.button("Analyze"):
    # Treat whitespace-only input the same as no input.
    cleaned = sequence.strip()
    if cleaned:
        length, gc_content, tata_box, caat_box, regulatory_regions = analyze_dark_matter(cleaned)

        st.write(f"Sequence Length: {length}")
        st.write(f"GC Content: {gc_content:.2f}%")
        st.write(f"TATA Box motifs: {tata_box}")
        st.write(f"CAAT Box motifs: {caat_box}")

        st.subheader("Potential Regulatory Regions (based on HMM):")
        if not regulatory_regions:
            st.write("No potential regulatory regions found.")
        for start, end in regulatory_regions:
            st.write(f"Region from base {start} to {end}")

        # Visualize the sequence with highlighted regions. Each character is
        # HTML-escaped first because the result is rendered with
        # unsafe_allow_html=True and the text comes straight from the user.
        highlighted_seq = [html.escape(char) for char in cleaned]
        for start, end in regulatory_regions:
            # Clamp the end so an out-of-range region cannot raise IndexError.
            for i in range(start, min(end, len(highlighted_seq))):
                highlighted_seq[i] = (
                    f"<span style='background-color: yellow'>{highlighted_seq[i]}</span>"
                )

        st.markdown("".join(highlighted_seq), unsafe_allow_html=True)
    else:
        st.write("Please enter a DNA sequence.")