Last commit not found
import streamlit as st | |
from stmol import showmol | |
import py3Dmol | |
import requests | |
import biotite.structure.io as bsio | |
import random | |
import hashlib | |
import urllib3 | |
from Bio.Blast import NCBIWWW, NCBIXML | |
from Bio.Seq import Seq | |
from Bio.SeqRecord import SeqRecord | |
import time | |
import urllib.parse | |
# The ESMFold request further down is sent with certificate verification
# turned off; silence the warning urllib3 would otherwise emit for it.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Page-level configuration followed by the static sidebar header.
st.set_page_config(layout="wide")
st.sidebar.title(
    "🔮 GenPro2 Protein Generator & Structure Predictor"
)
st.sidebar.write(
    "GenPro2 is an end-to-end single sequence protein generator and structure "
    "predictor based [*ESMFold*](https://esmatlas.com/about) and the ESM-2 "
    "language model."
)
def generate_sequence_from_words(words, length):
    """Derive a reproducible pseudo-random amino-acid sequence from seed words.

    The words are joined with single spaces, hashed with MD5, and the hex
    digest seeds a private random generator, so the same words and length
    always give the same sequence.

    Args:
        words: Iterable of strings used as the seed material.
        length: Number of residues to generate.

    Returns:
        A string of ``length`` characters drawn from the 20 one-letter
        amino-acid codes.
    """
    seed = ' '.join(words).encode('utf-8')
    # A dedicated Random instance yields exactly the values that
    # random.seed(...) + random.choice(...) would, but leaves the
    # process-wide generator untouched for every other caller.
    rng = random.Random(hashlib.md5(seed).hexdigest())
    amino_acids = "ACDEFGHIKLMNPQRSTVWY"
    return ''.join(rng.choice(amino_acids) for _ in range(length))
def render_mol(pdb):
    """Show a PDB-format structure as a spinning, spectrum-coloured cartoon.

    Args:
        pdb: The structure as PDB-format text.
    """
    canvas_width, canvas_height = 800, 500

    viewer = py3Dmol.view(width=canvas_width, height=canvas_height)
    viewer.addModel(pdb, 'pdb')

    # Appearance: cartoon trace coloured along the chain on a white canvas.
    viewer.setStyle({'cartoon': {'color': 'spectrum'}})
    viewer.setBackgroundColor('white')

    # Camera: fit the model, zoom in, then keep it rotating.
    viewer.zoomTo()
    viewer.zoom(2, 800)
    viewer.spin(True)

    showmol(viewer, height=canvas_height, width=canvas_width)
def _parse_blast_title(title):
    """Split a BLAST hit title into ``(protein_name, organism)``.

    The protein name is whatever follows the last ``|`` in the title. The
    organism is taken from an ``OS=... OX=`` field when one is present,
    otherwise from the last ``[...]`` group, otherwise ``'Unknown'``.
    """
    protein_name = title.rsplit('|', 1)[-1].strip()
    if 'OS=' in title:
        organism = title.split('OS=', 1)[1].split('OX=')[0].strip()
    else:
        start = title.rfind('[')
        end = title.rfind(']')
        organism = title[start + 1:end].strip() if 0 <= start < end else ''
    return protein_name, organism or 'Unknown'


def perform_blast_analysis(sequence):
    """Run a remote blastp search against Swiss-Prot and report the top hit.

    Writes the best match's name, organism, percent identity and E-value to
    the page, or a "no matches" note when BLAST returns no alignments. Any
    failure is reported on the page instead of being raised.

    Args:
        sequence: Protein sequence (one-letter codes) to search with.
    """
    st.subheader('Protein Analysis')
    with st.spinner("Analyzing generated protein... This may take a few minutes."):
        # Purely cosmetic animation: the bar fills before the BLAST request
        # below is sent and does not track its progress.
        progress_bar = st.progress(0)
        for i in range(100):
            progress_bar.progress(i + 1)
            time.sleep(0.1)  # Simulate analysis time

        result_handle = None
        try:
            result_handle = NCBIWWW.qblast("blastp", "swissprot", Seq(sequence))
            blast_record = NCBIXML.read(result_handle)
            if blast_record.alignments:
                alignment = blast_record.alignments[0]  # Get the top hit
                hsp = alignment.hsps[0]  # Get the first (best) HSP

                # Extract protein name and organism
                protein_name, organism = _parse_blast_title(alignment.title)

                # Percent identity is measured over the aligned region, not
                # over the full length of the database sequence
                # (alignment.length); fall back to the latter only if the
                # HSP carries no alignment length.
                aligned_length = getattr(hsp, 'align_length', None) or alignment.length
                identity_percentage = (hsp.identities / aligned_length) * 100

                st.write(f"**Top Match:** {protein_name}")
                st.write(f"**Organism:** {organism}")
                st.write(f"**Sequence Identity:** {identity_percentage:.2f}%")
                st.write(f"**E-value:** {hsp.expect:.2e}")

                # Fetch protein function (if available)
                description = getattr(alignment, 'description', None)
                if description:
                    st.write(f"**Potential Function:** {description}")

                # Link to BLAST results
                blast_link = "https://blast.ncbi.nlm.nih.gov/Blast.cgi?PROGRAM=blastp&PAGE_TYPE=BlastSearch&LINK_LOC=blasthome"
                st.markdown(f"[View full BLAST results (may require re-running the search)]({blast_link})")
            else:
                st.write("No significant matches found. This might be a unique protein sequence!")
        except Exception as e:
            # UI boundary: show the failure to the user rather than crash the app.
            st.error(f"An error occurred during protein analysis: {str(e)}")
            st.write("Please try again later or contact support if the issue persists.")
        finally:
            # Release the result handle even when parsing fails.
            if result_handle is not None:
                result_handle.close()
def update(sequence, word1, word2, word3, sequence_length):
    """Fold ``sequence`` via the ESMFold API and store the result in session state.

    On success ``st.session_state.structure_info`` holds the PDB text, the
    mean B-factor rounded to two decimals, and the seed words/length, and
    ``st.session_state.show_analyze_button`` is set to True. On a request
    failure an error is shown on the page and session state is left as is.

    Args:
        sequence: Protein sequence posted to the API as the request body.
        word1, word2, word3: Seed words, stored alongside the result.
        sequence_length: Requested sequence length, stored alongside the result.
    """
    # Function-scope imports keep this block self-contained.
    import os
    import tempfile

    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
    }
    try:
        # NOTE(review): verify=False disables TLS certificate verification.
        # Kept as in the original; confirm whether the endpoint still needs it.
        response = requests.post(
            'https://api.esmatlas.com/foldSequence/v1/pdb/',
            headers=headers,
            data=sequence,
            verify=False,
            timeout=300,
        )
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        st.error(f"An error occurred while calling the API: {str(e)}")
        st.write("Please try again later or contact support if the issue persists.")
        return

    pdb_string = response.content.decode('utf-8')

    # The structure loader is given a file path, so the PDB text goes through
    # a file. A per-call temporary directory (instead of a fixed
    # 'predicted.pdb' in the working directory) keeps concurrent sessions
    # from overwriting each other's file and leaves nothing behind.
    with tempfile.TemporaryDirectory() as tmp_dir:
        pdb_path = os.path.join(tmp_dir, 'predicted.pdb')
        with open(pdb_path, 'w', encoding='utf-8') as f:
            f.write(pdb_string)
        struct = bsio.load_structure(pdb_path, extra_fields=["b_factor"])

    b_value = round(struct.b_factor.mean(), 2)
    st.session_state.structure_info = {
        'pdb_string': pdb_string,
        'b_value': b_value,
        'word1': word1,
        'word2': word2,
        'word3': word3,
        'sequence_length': sequence_length
    }
    st.session_state.show_analyze_button = True
def share_on_twitter(word1, word2, word3, length, plddt):
    """Build a tweet-intent URL pre-filled with the generation summary.

    Args:
        word1, word2, word3: Seed words quoted in the message.
        length: Sequence length mentioned in the message.
        plddt: plDDT score shown as a percentage in the message.

    Returns:
        The ``https://twitter.com/intent/tweet`` URL with the percent-encoded
        message as its ``text`` query parameter.
    """
    message = (
        f"I generated a unique protein structure from the words '{word1}', '{word2}', and '{word3}' "
        f"with length {length}! plDDT Score: {plddt}% "
        "Try it yourself at [Your App URL] #GenPro2 #ProteinFolding"
    )
    encoded_message = urllib.parse.quote(message)
    return "https://twitter.com/intent/tweet?text=" + encoded_message
# Initialize session state variables so later reads never hit a missing key.
if 'sequence' not in st.session_state:
    st.session_state.sequence = None
if 'show_analyze_button' not in st.session_state:
    st.session_state.show_analyze_button = False
if 'structure_info' not in st.session_state:
    st.session_state.structure_info = None

st.title("Word-Seeded Protein Sequence Generator and Structure Predictor")

# Sidebar inputs: three seed words plus the desired sequence length.
st.sidebar.subheader("Generate Sequence from Words")
word1 = st.sidebar.text_input("Word 1")
word2 = st.sidebar.text_input("Word 2")
word3 = st.sidebar.text_input("Word 3")
sequence_length = st.sidebar.number_input("Sequence Length", min_value=50, max_value=400, value=100, step=10)

if st.sidebar.button('Generate and Predict'):
    if word1 and word2 and word3:
        sequence = generate_sequence_from_words([word1, word2, word3], sequence_length)
        st.session_state.sequence = sequence
        # Drop any earlier prediction before requesting a new one. If the
        # request fails, the page would otherwise keep showing the old
        # structure while "Analyze Protein" searched with the new sequence.
        st.session_state.structure_info = None
        st.session_state.show_analyze_button = False
        st.sidebar.text_area("Generated Sequence", sequence, height=100)
        st.sidebar.info("Note: The same words and sequence length will always produce the same sequence.")
        with st.spinner("Predicting protein structure... This may take a few minutes."):
            update(sequence, word1, word2, word3, sequence_length)
    else:
        st.sidebar.warning("Please enter all three words to generate a sequence.")

# Display structure information if available
if st.session_state.structure_info:
    info = st.session_state.structure_info
    st.subheader(f'Predicted protein structure using seed: {info["word1"]}, {info["word2"]}, and {info["word3"]} + length {info["sequence_length"]}')
    render_mol(info['pdb_string'])

    st.subheader('plDDT Score')
    st.write('plDDT is a per-residue estimate of the confidence in prediction on a scale from 0-100%.')
    # Round instead of truncating: int(0.57 * 100) is 56 because the float
    # product is 56.99999999999999, which under-reports the score by one.
    plddt_score = int(round(float(info["b_value"]) * 100))
    st.info(f'Average plDDT: {plddt_score}%')

    col1, col2 = st.columns(2)
    with col1:
        if st.button('Analyze Protein'):
            perform_blast_analysis(st.session_state.sequence)
    with col2:
        st.download_button(
            label="Download PDB",
            data=info['pdb_string'],
            file_name='predicted.pdb',
            mime='text/plain',
        )

    st.subheader("Share your unique protein on X")
    st.write("1. Take a screenshot of the protein structure above.")
    st.write("2. Click the 'Share Results' button below to open a pre-filled tweet.")
    st.write("3. Attach your screenshot to the tweet before posting.")
    tweet_url = share_on_twitter(info["word1"], info["word2"], info["word3"], info["sequence_length"], plddt_score)
    st.markdown(f"[Share Results]({tweet_url})")

# NOTE(review): assumed to be shown unconditionally (top level); the original
# indentation was not recoverable, so confirm it does not belong inside the
# structure-display branch above.
st.markdown("""
## What to do next:
If you find interesting results from the sequence folding, you can explore further:
1. Learn more about protein structures and sequences.
2. Visit the [Protein Data Bank (PDB)](https://www.rcsb.org/) for known protein structures.
3. Compare your folded structure with known functional proteins by downloading your results.
4. Read about similar proteins to gain insights into potential functions.
5. Click the "Analyze Protein" button to get more information about your generated protein.
**Remember, this folding is based on randomly generated sequences. Interpret the results with caution.**
Enjoy exploring the world of protein sequences! Share your high-confidence protein images with us on X [*@WandsAI*](https://x.com/wandsai)!
""")