# NOTE: web file-viewer residue (file-size banner, commit-hash gutter, line-number gutter) was removed here; it was never part of the program.
# Install required dependencies if not present.
# Only packages that cannot be imported are installed, and pip is invoked
# through the running interpreter so they land in the environment that
# executes this script. A failed install does not raise here (check=False);
# the imports that follow will surface the problem instead.
import importlib.util
import os
import subprocess
import sys

_REQUIRED_PACKAGES = ("streamlit", "pandas", "xlsxwriter", "openpyxl")
_missing_packages = [
    name for name in _REQUIRED_PACKAGES if importlib.util.find_spec(name) is None
]
if _missing_packages:
    subprocess.run(
        [sys.executable, "-m", "pip", "install", *_missing_packages],
        check=False,
    )
import streamlit as st
import pandas as pd
import xlsxwriter
from io import BytesIO
from collections import defaultdict
# Function to find repeated amino acids in the protein sequence
def find_homorepeats(protein):
    """Count the homorepeats (runs of one repeated residue) in *protein*.

    The sequence is scanned left to right and split into maximal runs of
    identical consecutive characters. Every run longer than one character
    is counted under the run's own text, e.g. ``"AAA"``.

    Args:
        protein: The sequence to scan, as a string.

    Returns:
        A ``defaultdict(int)`` mapping each repeat string to the number of
        times that exact run occurs. Empty when there are no repeats.
    """
    freq = defaultdict(int)
    n = len(protein)
    start = 0
    while start < n:
        residue = protein[start]
        # Advance `end` to the first position that breaks the current run.
        end = start + 1
        while end < n and protein[end] == residue:
            end += 1
        run_length = end - start
        # Only consider repeats of length > 1
        if run_length > 1:
            freq[residue * run_length] += 1
        start = end
    return freq
# Function to process a single CSV file and return its analysis
def process_csv(file):
    """Read a CSV of protein sequences and count homorepeats per entry.

    The first three columns are read by position as entry ID, protein name
    and sequence; any further columns are ignored. Double quotes and spaces
    are removed from each sequence before it is analysed.

    Args:
        file: Anything ``pandas.read_csv`` accepts (path or file-like object).

    Returns:
        ``None`` if the file has fewer than three columns (an error is shown
        through ``st.error``). Otherwise a tuple
        ``(homorepeats, sequence_data)`` where ``homorepeats`` is the set of
        all distinct repeat strings found and ``sequence_data`` is a list of
        ``(entry_id, protein_name, freq)`` tuples, ``freq`` being the mapping
        returned by ``find_homorepeats``.
    """
    df = pd.read_csv(file)
    if len(df.columns) < 3:
        st.error("Error: The file must have at least three columns: ID, Protein Name, Sequence")
        return None

    homorepeats = set()
    sequence_data = []
    for _, row in df.iterrows():
        # .iloc makes the positional lookup explicit; plain row[0] on a
        # label-indexed Series relies on a deprecated positional fallback.
        entry_id = str(row.iloc[0])
        protein_name = str(row.iloc[1])
        sequence = str(row.iloc[2]).replace('"', '').replace(' ', '')

        freq = find_homorepeats(sequence)
        homorepeats.update(freq)  # Collect unique homorepeats
        sequence_data.append((entry_id, protein_name, freq))
    return homorepeats, sequence_data
import pandas as pd
import streamlit as st
from io import BytesIO
import xlsxwriter
# Function to process the Excel file
def process_excel(excel_data):
    """Collect per-entry repeat counts from every sheet of an Excel workbook.

    Each sheet must contain ``'Entry ID'`` and ``'Protein Name'`` columns.
    Every column from the third position onward is treated as a repeat
    column whose header is the repeat and whose cell is its count.

    Args:
        excel_data: Object exposing ``sheet_names`` and ``parse(sheet_name)``
            returning a DataFrame (the caller passes a ``pandas.ExcelFile``).

    Returns:
        A tuple ``(homorepeats, sequence_data)``: the set of repeat column
        headers seen in sheets that have at least one row, and a list of
        ``(entry_id, protein_name, freq)`` tuples where ``freq`` maps each
        repeat header to that row's cell value.

    Raises:
        KeyError: If a non-empty sheet lacks ``'Entry ID'`` or
            ``'Protein Name'``.
    """
    homorepeats = set()
    sequence_data = []
    for sheet_name in excel_data.sheet_names:
        df = excel_data.parse(sheet_name)
        if len(df.index) == 0:
            # A sheet without rows contributes neither entries nor repeats.
            continue
        # Assuming repeats start from 3rd column
        repeat_columns = list(df.columns[2:])
        homorepeats.update(repeat_columns)
        for _, row in df.iterrows():
            freq = {repeat: row[repeat] for repeat in repeat_columns}
            sequence_data.append((row['Entry ID'], row['Protein Name'], freq))
    return homorepeats, sequence_data
# Characters a worksheet name may not contain, each mapped to an underscore.
_INVALID_SHEET_CHARS = str.maketrans({ch: "_" for ch in "[]:*?/\\"})


def _unique_sheet_name(filename, used_names):
    """Return a worksheet name derived from *filename* that is valid and unused.

    Forbidden characters become underscores, leading/trailing apostrophes
    are stripped, the result is limited to 31 characters, and a numeric
    suffix is appended when the name (compared case-insensitively) is
    already in *used_names*. The chosen name is recorded in *used_names*.
    """
    base = str(filename).translate(_INVALID_SHEET_CHARS).strip("'")[:31].strip("'")
    if not base:
        base = f"Sheet{len(used_names) + 1}"
    candidate = base
    suffix = 2
    while candidate.lower() in used_names:
        tag = f"_{suffix}"
        # Trim the base so the suffixed name still fits in 31 characters.
        candidate = base[:31 - len(tag)] + tag
        suffix += 1
    used_names.add(candidate.lower())
    return candidate


# Function to generate and download Excel workbook with separate sheets for each input file
def create_excel(sequences_data, homorepeats, filenames):
    """Build an in-memory Excel workbook with one sheet per input file.

    Every sheet has the header ``Entry ID``, ``Protein Name`` followed by
    all repeats in sorted order, then one row per entry holding the count
    for each repeat (0 when the entry has no value for it).

    Args:
        sequences_data: One list per input file, each containing
            ``(entry_id, protein_name, freq)`` tuples where ``freq``
            supports ``.get(repeat, default)``.
        homorepeats: Collection of all repeat names; defines the columns.
        filenames: Names of the input files, aligned by index with
            ``sequences_data``; used (sanitised) as sheet names.

    Returns:
        A ``BytesIO`` positioned at the start, holding the workbook bytes.
    """
    output = BytesIO()
    workbook = xlsxwriter.Workbook(output, {'in_memory': True})

    # The column order is the same on every sheet, so sort only once.
    ordered_repeats = sorted(homorepeats)
    used_sheet_names = set()

    # Iterate through sequences data grouped by filenames and create separate sheets
    for file_index, file_data in enumerate(sequences_data):
        sheet_name = _unique_sheet_name(filenames[file_index], used_sheet_names)
        worksheet = workbook.add_worksheet(sheet_name)

        # Write the header for the current file
        worksheet.write(0, 0, "Entry ID")
        worksheet.write(0, 1, "Protein Name")
        for col, repeat in enumerate(ordered_repeats, start=2):
            worksheet.write(0, col, repeat)

        # Write data for each sequence in the current file
        for row, (entry_id, protein_name, freq) in enumerate(file_data, start=1):
            worksheet.write(row, 0, entry_id)
            worksheet.write(row, 1, protein_name)
            for col, repeat in enumerate(ordered_repeats, start=2):
                worksheet.write(row, col, freq.get(repeat, 0))

    workbook.close()
    output.seek(0)
    return output
# Streamlit UI components
st.title("Protein Homorepeat Analysis")

# Step 1: Upload Excel Files
uploaded_files = st.file_uploader("Upload Excel files", accept_multiple_files=True, type=["xlsx"])

# Step 2: Process files and display results
if uploaded_files:
    all_homorepeats = set()
    all_sequences_data = []
    filenames = []

    for file in uploaded_files:
        excel_data = pd.ExcelFile(file)
        homorepeats, sequence_data = process_excel(excel_data)
        # Only files that produced a result are kept; the three collections
        # below stay aligned by index.
        if homorepeats is not None:
            all_homorepeats.update(homorepeats)
            all_sequences_data.append(sequence_data)
            filenames.append(file.name)

    if all_sequences_data:
        # Report the number of files actually processed, not merely uploaded.
        st.success(f"Processed {len(all_sequences_data)} files successfully!")

        # Step 3: Generate and download the Excel report
        excel_file = create_excel(all_sequences_data, all_homorepeats, filenames)

        # Download the Excel file
        st.download_button(
            label="Download Excel file",
            data=excel_file,
            file_name="protein_homorepeat_results.xlsx",
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        )

        # Step 4: Display summary table
        if st.checkbox("Show Results Table"):
            # Convert the sequences data into a DataFrame for easy display
            ordered_repeats = sorted(all_homorepeats)  # same column order for every row
            rows = []
            for file_index, file_data in enumerate(all_sequences_data):
                filename = filenames[file_index]
                for entry_id, protein_name, freq in file_data:
                    row = {"Filename": filename, "Entry ID": entry_id, "Protein Name": protein_name}
                    row.update({repeat: freq.get(repeat, 0) for repeat in ordered_repeats})
                    rows.append(row)

            result_df = pd.DataFrame(rows)
            st.dataframe(result_df)