Accelernate committed on
Commit
ddb223f
·
verified ·
1 Parent(s): 971b948

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -35
app.py CHANGED
@@ -1,26 +1,33 @@
1
  import streamlit as st
2
  import numpy as np
3
  from Bio.Seq import Seq
4
- from hmmlearn import hmm
5
 
6
- # Function to encode DNA sequence
7
- def encode_sequence(seq):
8
- encoding = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
9
- return np.array([encoding.get(base.upper(), -1) for base in seq])
10
-
11
- # Function to calculate GC content
12
  def calculate_gc_content(seq):
13
  gc_count = seq.count('G') + seq.count('C')
14
  total_count = len(seq)
15
  return (gc_count / total_count) * 100 if total_count > 0 else 0
16
 
17
- # Simple HMM model (this is a placeholder and would need proper training)
18
- model = hmm.MultinomialHMM(n_components=2, random_state=42)
19
- model.startprob_ = np.array([0.5, 0.5])
20
- model.transmat_ = np.array([[0.7, 0.3],
21
- [0.3, 0.7]])
22
- model.emissionprob_ = np.array([[0.25, 0.25, 0.25, 0.25],
23
- [0.20, 0.30, 0.30, 0.20]])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
  def analyze_dark_matter(sequence):
26
  seq = Seq(sequence)
@@ -33,25 +40,8 @@ def analyze_dark_matter(sequence):
33
  tata_box = seq.count("TATAAA")
34
  caat_box = seq.count("CCAAT")
35
 
36
- # HMM analysis
37
- encoded_seq = encode_sequence(str(seq))
38
- valid_indices = encoded_seq != -1
39
- if np.any(valid_indices):
40
- logprob, hidden_states = model.decode(encoded_seq[valid_indices].reshape(-1, 1))
41
- else:
42
- hidden_states = []
43
-
44
- regulatory_regions = []
45
- current_start = None
46
- for i, state in enumerate(hidden_states):
47
- if state == 1 and current_start is None:
48
- current_start = i
49
- elif state == 0 and current_start is not None:
50
- regulatory_regions.append((current_start, i))
51
- current_start = None
52
-
53
- if current_start is not None:
54
- regulatory_regions.append((current_start, len(hidden_states)))
55
 
56
  return length, gc_content, tata_box, caat_box, regulatory_regions
57
 
@@ -65,11 +55,11 @@ if st.button("Analyze"):
65
  length, gc_content, tata_box, caat_box, regulatory_regions = analyze_dark_matter(sequence)
66
 
67
  st.write(f"Sequence Length: {length}")
68
- st.write(f"GC Content: {gc_content:.2f}%")
69
  st.write(f"TATA Box motifs: {tata_box}")
70
  st.write(f"CAAT Box motifs: {caat_box}")
71
 
72
- st.subheader("Potential Regulatory Regions (based on HMM):")
73
  for start, end in regulatory_regions:
74
  st.write(f"Region from base {start} to {end}")
75
 
 
1
  import streamlit as st
2
  import numpy as np
3
  from Bio.Seq import Seq
 
4
 
 
 
 
 
 
 
5
  def calculate_gc_content(seq):
6
  gc_count = seq.count('G') + seq.count('C')
7
  total_count = len(seq)
8
  return (gc_count / total_count) * 100 if total_count > 0 else 0
9
 
10
def find_potential_regulatory_regions(seq, window_size=50, gc_threshold=60):
    """Locate GC-rich stretches of *seq* with a sliding window.

    Every window of ``window_size`` consecutive bases is scored with
    ``calculate_gc_content``; runs of consecutive windows whose score is
    strictly greater than ``gc_threshold`` are merged into one region.

    Args:
        seq: Sliceable sequence (the caller passes a Biopython ``Seq``).
        window_size: Number of bases per window.
        gc_threshold: GC percentage a window must exceed to qualify.

    Returns:
        A list of ``(start, end)`` tuples. ``start`` is the index of the
        first base of the first qualifying window and ``end`` is one past
        the last base of the last qualifying window. The list is empty
        when ``seq`` is shorter than ``window_size``.
    """
    window_gc = [
        calculate_gc_content(seq[i:i + window_size])
        for i in range(len(seq) - window_size + 1)
    ]

    regulatory_regions = []
    region_start = None
    for i, gc in enumerate(window_gc):
        if gc > gc_threshold:
            if region_start is None:
                region_start = i
        elif region_start is not None:
            # Window i is the first one that fails, so the last qualifying
            # window began at i - 1 and ends (exclusive) at i - 1 + window_size.
            # Using i + window_size here would over-extend the region by one
            # base and disagree with the trailing-region case below.
            regulatory_regions.append((region_start, i - 1 + window_size))
            region_start = None

    if region_start is not None:
        # The run reached the final window, which ends at len(seq).
        regulatory_regions.append((region_start, len(seq)))

    return regulatory_regions
31
 
32
  def analyze_dark_matter(sequence):
33
  seq = Seq(sequence)
 
40
  tata_box = seq.count("TATAAA")
41
  caat_box = seq.count("CCAAT")
42
 
43
+ # Find potential regulatory regions based on GC content
44
+ regulatory_regions = find_potential_regulatory_regions(seq)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
  return length, gc_content, tata_box, caat_box, regulatory_regions
47
 
 
55
  length, gc_content, tata_box, caat_box, regulatory_regions = analyze_dark_matter(sequence)
56
 
57
  st.write(f"Sequence Length: {length}")
58
+ st.write(f"Overall GC Content: {gc_content:.2f}%")
59
  st.write(f"TATA Box motifs: {tata_box}")
60
  st.write(f"CAAT Box motifs: {caat_box}")
61
 
62
+ st.subheader("Potential Regulatory Regions (based on GC content):")
63
  for start, end in regulatory_regions:
64
  st.write(f"Region from base {start} to {end}")
65