DNA_SEQV2 / app.py
lyimo's picture
Create app.py
e0a58f9 verified
import streamlit as st
from Bio import pairwise2
from Bio.Seq import Seq
import re
from collections import defaultdict
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
# Define resistance genes and mutation patterns
RESISTANCE_GENES = {
'rpoB': {
'start': 759807,
'end': 763325,
'description': 'RNA polymerase β subunit',
'drug': 'Rifampicin',
'mutations': {
'531': {'from': 'S', 'to': ['L'], 'freq': 'High', 'confidence': 'High'},
'526': {'from': 'H', 'to': ['Y', 'D', 'R'], 'freq': 'High', 'confidence': 'High'},
'516': {'from': 'D', 'to': ['V', 'G'], 'freq': 'Moderate', 'confidence': 'High'},
'511': {'from': 'L', 'to': ['P'], 'freq': 'Low', 'confidence': 'Moderate'}
}
},
'katG': {
'start': 2153889,
'end': 2156111,
'description': 'Catalase-peroxidase',
'drug': 'Isoniazid',
'mutations': {
'315': {'from': 'S', 'to': ['T', 'N'], 'freq': 'High', 'confidence': 'High'},
'463': {'from': 'R', 'to': ['L'], 'freq': 'Moderate', 'confidence': 'Moderate'}
}
},
'inhA': {
'start': 1674202,
'end': 1675011,
'description': 'Enoyl-ACP reductase',
'drug': 'Isoniazid/Ethionamide',
'mutations': {
'-15': {'from': 'C', 'to': ['T'], 'freq': 'High', 'confidence': 'High'},
'94': {'from': 'S', 'to': ['A'], 'freq': 'Moderate', 'confidence': 'High'}
}
},
'gyrA': {
'start': 7302,
'end': 9818,
'description': 'DNA gyrase subunit A',
'drug': 'Fluoroquinolones',
'mutations': {
'90': {'from': 'A', 'to': ['V'], 'freq': 'High', 'confidence': 'High'},
'94': {'from': 'D', 'to': ['G', 'A', 'N'], 'freq': 'High', 'confidence': 'High'}
}
}
}
# File reading functions
def read_fasta_file(file_path):
"""Read a FASTA file from disk"""
try:
with open(file_path, 'r') as handle:
content = handle.read().strip()
parts = content.split('\n', 1)
sequence = ''.join(parts[1].split('\n')).replace(' ', '')
return sequence.upper()
except Exception as e:
st.error(f"Error reading file {file_path}: {str(e)}")
return None
def read_fasta_from_upload(uploaded_file):
"""Read a FASTA file from Streamlit upload"""
try:
content = uploaded_file.getvalue().decode('utf-8').strip()
parts = content.split('\n', 1)
sequence = ''.join(parts[1].split('\n')).replace(' ', '')
return sequence.upper()
except Exception as e:
st.error(f"Error reading uploaded file: {str(e)}")
return None
# Region extraction function
def extract_gene_region(genome_seq, gene_start, gene_end):
"""Extract a gene region with 200bp flanking for alignment context"""
flank = 200
start = max(0, gene_start - flank)
end = min(len(genome_seq), gene_end + flank)
extracted_seq = genome_seq[start:end]
return extracted_seq, start
# Codon alignment extraction
def extract_codon_alignment(ref_aligned, query_aligned, gene_start, gene_end, offset):
"""Extract codon-level differences from aligned sequences"""
codon_list = []
real_pos = 0
ref_codon = []
query_codon = []
for i in range(len(ref_aligned)):
ref_base = ref_aligned[i]
query_base = query_aligned[i]
if ref_base != '-':
real_pos += 1
ref_codon.append(ref_base)
query_codon.append(query_base if query_base != '-' else 'N')
if len(ref_codon) == 3:
codon_start_pos = offset + (real_pos - 3)
if (codon_start_pos >= gene_start) and (codon_start_pos + 2 <= gene_end):
ref_aa = str(Seq(''.join(ref_codon)).translate())
query_aa = str(Seq(''.join(query_codon)).translate())
gene_nt_pos = codon_start_pos - gene_start + 1
codon_number = (gene_nt_pos - 1) // 3 + 1
if ref_aa != query_aa:
codon_list.append({
'codon_number': codon_number,
'ref_aa': ref_aa,
'query_aa': query_aa
})
ref_codon = []
query_codon = []
return codon_list
# Mutation detection
def find_mutations_with_context(ref_seq, query_seq, gene_start, gene_end, offset=0):
"""Find codon-level and nucleotide-level mutations"""
alignments = pairwise2.align.globalms(ref_seq, query_seq, match=2, mismatch=-3, open=-10, extend=-0.5)
if not alignments:
return {'codon_diffs': [], 'nt_diffs': []}
ref_aligned, query_aligned = alignments[0][0], alignments[0][1]
codon_diffs = extract_codon_alignment(ref_aligned, query_aligned, gene_start, gene_end, offset)
nt_diffs = []
ref_pos = 0
for i in range(len(ref_aligned)):
ref_base = ref_aligned[i]
query_base = query_aligned[i]
if ref_base != '-':
ref_pos += 1
actual_genome_pos = offset + ref_pos
if ref_base != query_base and query_base != '-':
nt_diffs.append({
'genome_pos': actual_genome_pos,
'ref_base': ref_base,
'query_base': query_base
})
return {'codon_diffs': codon_diffs, 'nt_diffs': nt_diffs}
# Resistance analysis
def analyze_resistance(mutation_data, gene_info):
"""Match mutations to known resistance patterns"""
codon_diffs = mutation_data['codon_diffs']
nt_diffs = mutation_data['nt_diffs']
resistance_found = []
for key_str, pattern in gene_info['mutations'].items():
key_val = int(key_str)
if key_val > 0:
for diff in codon_diffs:
if diff['codon_number'] == key_val and diff['ref_aa'] == pattern['from'] and diff['query_aa'] in pattern['to']:
resistance_found.append({
'position': key_str,
'change': f"{pattern['from']}{key_str}{diff['query_aa']}",
'frequency': pattern['freq'],
'confidence': pattern['confidence']
})
else:
promoter_genome_pos = gene_info['start'] + key_val
for diff in nt_diffs:
if diff['genome_pos'] == promoter_genome_pos and diff['ref_base'] == pattern['from'] and diff['query_base'] in pattern['to']:
resistance_found.append({
'position': key_str,
'change': f"{pattern['from']}{key_str}{diff['query_base']}",
'frequency': pattern['freq'],
'confidence': pattern['confidence']
})
return resistance_found
# Main Streamlit app
def main():
st.title("M. tuberculosis Drug Resistance Analysis")
st.markdown("""
### Automated Drug Resistance Analysis Tool
Upload your query genome (clinical isolate) in FASTA format to compare with the H37Rv reference.
**Note**: Detects codon-based (e.g., rpoB S531L) and nucleotide-based (e.g., inhA -15C>T) mutations.
""")
debug_mode = st.checkbox("Enable debug mode")
ref_genome = read_fasta_file("NC_000962.3.fasta")
if not ref_genome:
st.error("Failed to load reference genome")
return
st.success(f"Reference genome loaded (length: {len(ref_genome)}bp)")
query_file = st.file_uploader("Upload Query Genome (FASTA)", type=['fasta', 'fa'])
if query_file and st.button("Analyze Drug Resistance"):
query_genome = read_fasta_from_upload(query_file)
if not query_genome:
return
st.success(f"Query genome loaded (length: {len(query_genome)}bp)")
progress_bar = st.progress(0)
status_text = st.empty()
all_results = {}
for i, (gene, info) in enumerate(RESISTANCE_GENES.items()):
status_text.text(f"Analyzing {gene} ({info['drug']})...")
progress_bar.progress((i + 1) / len(RESISTANCE_GENES))
if debug_mode:
st.subheader(f"Analyzing {gene}")
st.write(f"Gene region: {info['start']}-{info['end']}")
ref_region, ref_start = extract_gene_region(ref_genome, info['start'], info['end'])
query_region, _ = extract_gene_region(query_genome, info['start'], info['end'])
if ref_region and query_region:
mutation_data = find_mutations_with_context(ref_region, query_region, info['start'], info['end'], ref_start)
resistance = analyze_resistance(mutation_data, info)
all_results[gene] = {'mutation_data': mutation_data, 'resistance': resistance}
if debug_mode:
st.write(f"Codon diffs: {len(mutation_data['codon_diffs'])}", mutation_data['codon_diffs'])
st.write(f"Nucleotide diffs: {len(mutation_data['nt_diffs'])}", mutation_data['nt_diffs'])
st.write(f"Resistance patterns: {len(resistance)}")
else:
st.error(f"Failed to analyze {gene}")
progress_bar.empty()
status_text.empty()
# Summary table
summary_data = [
{
'Gene': gene,
'Drug': RESISTANCE_GENES[gene]['drug'],
'Codon Diffs': len(results['mutation_data']['codon_diffs']),
'Nucleotide Diffs': len(results['mutation_data']['nt_diffs']),
'Resistance Mutations': len(results['resistance'])
}
for gene, results in all_results.items()
]
summary_df = pd.DataFrame(summary_data)
def highlight_resistance(row):
return ['background-color: yellow' if row['Resistance Mutations'] > 0 else '' for _ in row]
styled_summary = summary_df.style.apply(highlight_resistance, axis=1)
st.subheader("Summary of Analysis")
st.dataframe(styled_summary)
# Detailed results
for gene, results in all_results.items():
st.subheader(f"{gene} Analysis")
info = RESISTANCE_GENES[gene]
st.write(f"**Drug**: {info['drug']}")
st.write(f"**Codon-level differences**: {len(results['mutation_data']['codon_diffs'])}")
st.write(f"**Nucleotide-level differences**: {len(results['mutation_data']['nt_diffs'])}")
if results['resistance']:
st.warning(f"Potential resistance mutations found in {gene}")
resistance_df = pd.DataFrame(results['resistance'])
st.dataframe(resistance_df)
else:
st.info(f"No known resistance mutations found in {gene}")
# Download button
st.markdown("### Download Complete Analysis")
st.write("Download a CSV file with all mutation and resistance data.")
report_data = []
for gene, results in all_results.items():
for diff in results['mutation_data']['codon_diffs']:
report_data.append({
'Gene': gene,
'Drug': RESISTANCE_GENES[gene]['drug'],
'Type': 'Codon_diff',
'Codon Number': diff['codon_number'],
'Reference AA': diff['ref_aa'],
'Query AA': diff['query_aa']
})
for diff in results['mutation_data']['nt_diffs']:
report_data.append({
'Gene': gene,
'Drug': RESISTANCE_GENES[gene]['drug'],
'Type': 'Nucleotide_diff',
'Genome Position': diff['genome_pos'],
'Reference Base': diff['ref_base'],
'Query Base': diff['query_base']
})
for res in results['resistance']:
report_data.append({
'Gene': gene,
'Drug': RESISTANCE_GENES[gene]['drug'],
'Type': 'Resistance',
'Position': res['position'],
'Change': res['change'],
'Frequency': res['frequency'],
'Confidence': res['confidence']
})
report_df = pd.DataFrame(report_data)
csv = report_df.to_csv(index=False)
st.download_button(
label="Download Full Report (CSV)",
data=csv,
file_name="mtb_analysis_report.csv",
mime="text/csv"
)
if __name__ == "__main__":
main()