Spaces:

WANDSAI
/

GenSeq

Sleeping

App Files Community

Accelernate committed on Jul 12, 2024

Commit

ddb223f

verified ·

1 Parent(s): 971b948

Update app.py

Browse files

Files changed (1) hide show

app.py +25 -35

app.py CHANGED Viewed

@@ -1,26 +1,33 @@
 import streamlit as st
 import numpy as np
 from Bio.Seq import Seq
-from hmmlearn import hmm
-# Function to encode DNA sequence
-def encode_sequence(seq):
-    encoding = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
-    return np.array([encoding.get(base.upper(), -1) for base in seq])
-# Function to calculate GC content
 def calculate_gc_content(seq):
     gc_count = seq.count('G') + seq.count('C')
     total_count = len(seq)
     return (gc_count / total_count) * 100 if total_count > 0 else 0
-# Simple HMM model (this is a placeholder and would need proper training)
-model = hmm.MultinomialHMM(n_components=2, random_state=42)
-model.startprob_ = np.array([0.5, 0.5])
-model.transmat_ = np.array([[0.7, 0.3],
-                            [0.3, 0.7]])
-model.emissionprob_ = np.array([[0.25, 0.25, 0.25, 0.25],
-                                [0.20, 0.30, 0.30, 0.20]])
 def analyze_dark_matter(sequence):
     seq = Seq(sequence)
@@ -33,25 +40,8 @@ def analyze_dark_matter(sequence):
     tata_box = seq.count("TATAAA")
     caat_box = seq.count("CCAAT")
-    # HMM analysis
-    encoded_seq = encode_sequence(str(seq))
-    valid_indices = encoded_seq != -1
-    if np.any(valid_indices):
-        logprob, hidden_states = model.decode(encoded_seq[valid_indices].reshape(-1, 1))
-    else:
-        hidden_states = []
-    regulatory_regions = []
-    current_start = None
-    for i, state in enumerate(hidden_states):
-        if state == 1 and current_start is None:
-            current_start = i
-        elif state == 0 and current_start is not None:
-            regulatory_regions.append((current_start, i))
-            current_start = None
-    if current_start is not None:
-        regulatory_regions.append((current_start, len(hidden_states)))
     return length, gc_content, tata_box, caat_box, regulatory_regions
@@ -65,11 +55,11 @@ if st.button("Analyze"):
         length, gc_content, tata_box, caat_box, regulatory_regions = analyze_dark_matter(sequence)
         st.write(f"Sequence Length: {length}")
-        st.write(f"GC Content: {gc_content:.2f}%")
         st.write(f"TATA Box motifs: {tata_box}")
         st.write(f"CAAT Box motifs: {caat_box}")
-        st.subheader("Potential Regulatory Regions (based on HMM):")
         for start, end in regulatory_regions:
             st.write(f"Region from base {start} to {end}")

 import streamlit as st
 import numpy as np
 from Bio.Seq import Seq
 def calculate_gc_content(seq):
     gc_count = seq.count('G') + seq.count('C')
     total_count = len(seq)
     return (gc_count / total_count) * 100 if total_count > 0 else 0
+def find_potential_regulatory_regions(seq, window_size=50, gc_threshold=60):
+    gc_content = []
+    for i in range(len(seq) - window_size + 1):
+        window = seq[i:i+window_size]
+        gc_content.append(calculate_gc_content(window))
+    regulatory_regions = []
+    in_region = False
+    start = 0
+    for i, gc in enumerate(gc_content):
+        if gc > gc_threshold and not in_region:
+            in_region = True
+            start = i
+        elif gc <= gc_threshold and in_region:
+            in_region = False
+            regulatory_regions.append((start, i + window_size))
+    if in_region:
+        regulatory_regions.append((start, len(seq)))
+    return regulatory_regions
 def analyze_dark_matter(sequence):
     seq = Seq(sequence)
     tata_box = seq.count("TATAAA")
     caat_box = seq.count("CCAAT")
+    # Find potential regulatory regions based on GC content
+    regulatory_regions = find_potential_regulatory_regions(seq)
     return length, gc_content, tata_box, caat_box, regulatory_regions
         length, gc_content, tata_box, caat_box, regulatory_regions = analyze_dark_matter(sequence)
         st.write(f"Sequence Length: {length}")
+        st.write(f"Overall GC Content: {gc_content:.2f}%")
         st.write(f"TATA Box motifs: {tata_box}")
         st.write(f"CAAT Box motifs: {caat_box}")
+        st.subheader("Potential Regulatory Regions (based on GC content):")
         for start, end in regulatory_regions:
             st.write(f"Region from base {start} to {end}")