|
|
|
import importlib.util
import os
import subprocess
import sys

# Best-effort dependency bootstrap.  Only packages that cannot be imported are
# installed, so pip is not invoked again on every run once they are present.
# pip is run through the current interpreter (sys.executable) so packages land
# in the environment that is executing this script, and the argument list is
# passed without a shell.
_REQUIRED_PACKAGES = ("streamlit", "pandas", "xlsxwriter", "openpyxl")
_missing_packages = [
    package
    for package in _REQUIRED_PACKAGES
    if importlib.util.find_spec(package) is None
]
if _missing_packages:
    # check=False keeps the original behaviour of not raising here when pip
    # fails; a failed install surfaces as an ImportError on the imports below.
    subprocess.run(
        [sys.executable, "-m", "pip", "install", *_missing_packages],
        check=False,
    )
|
|
|
import streamlit as st |
|
import pandas as pd |
|
import xlsxwriter |
|
from io import BytesIO |
|
from collections import defaultdict |
|
|
|
|
|
def find_homorepeats(protein):
    """Count the homorepeats (runs of one repeated character) in a sequence.

    The sequence is scanned left to right and split into maximal runs of
    identical consecutive characters.  Every run longer than one character
    is counted, keyed by the run text itself (for example ``"AAA"``).

    Args:
        protein: The sequence string to scan.

    Returns:
        A ``defaultdict(int)`` mapping each homorepeat string to the number
        of times a run of exactly that text occurs in ``protein``.
    """
    freq = defaultdict(int)
    n = len(protein)
    start = 0

    while start < n:
        residue = protein[start]

        # Advance ``end`` to the first position that breaks the current run.
        end = start + 1
        while end < n and protein[end] == residue:
            end += 1

        # Single characters are not repeats; only runs of length >= 2 count.
        if end - start > 1:
            # One slice per run instead of per-character concatenation.
            freq[protein[start:end]] += 1

        start = end

    return freq
|
|
|
|
|
def process_excel(excel_data):
    """Extract homorepeat counts for every row of every sheet in a workbook.

    Each sheet is parsed into a DataFrame whose first three columns are read
    by position as entry ID, protein name and sequence.  Double quotes and
    spaces are removed from the sequence before it is scanned.

    Args:
        excel_data: An object exposing ``sheet_names`` and ``parse(name)``
            (the caller in this file passes a ``pd.ExcelFile``).

    Returns:
        A tuple ``(homorepeats, sequence_data)`` where ``homorepeats`` is the
        set of all repeat strings seen and ``sequence_data`` is a list of
        ``(entry_id, protein_name, freq)`` tuples.  If any sheet has fewer
        than three columns an error is shown with ``st.error`` and
        ``(None, None)`` is returned.
    """
    homorepeats = set()
    sequence_data = []

    for sheet_name in excel_data.sheet_names:
        df = excel_data.parse(sheet_name)
        if len(df.columns) < 3:
            st.error(f"Error: The sheet '{sheet_name}' must have at least three columns: ID, Protein Name, Sequence")
            return None, None

        for _, row in df.iterrows():
            # Use .iloc for positional access: plain row[0] on a Series whose
            # index holds the column labels depends on a deprecated
            # integer-key fallback in pandas.
            entry_id = str(row.iloc[0])
            protein_name = str(row.iloc[1])
            sequence = str(row.iloc[2]).replace('"', '').replace(' ', '')

            freq = find_homorepeats(sequence)
            sequence_data.append((entry_id, protein_name, freq))
            homorepeats.update(freq.keys())

    return homorepeats, sequence_data
|
|
|
|
|
def _unique_sheet_name(raw_name, used_names):
    """Return a worksheet name derived from ``raw_name`` that Excel accepts.

    Characters Excel forbids in sheet names are replaced with ``_``, the
    result is limited to 31 characters, leading/trailing apostrophes are
    dropped, and a numeric suffix is appended when the name (compared
    case-insensitively) was already handed out.  A name that is already
    valid and unused is returned as ``raw_name[:31]`` unchanged.

    Args:
        raw_name: The desired name (here, an uploaded file's name).
        used_names: Set of lower-cased names already used; updated in place.
    """
    cleaned = "".join("_" if ch in "[]:*?/\\" else ch for ch in str(raw_name))
    base = cleaned[:31].strip("'") or "Sheet"
    if base.lower() == "history":
        # "History" is reserved by Excel and cannot be used as a sheet name.
        base = "History_"

    candidate = base
    counter = 2
    while candidate.lower() in used_names:
        suffix = f"_{counter}"
        # Trim the base so the suffixed name still fits in 31 characters.
        candidate = base[:31 - len(suffix)] + suffix
        counter += 1

    used_names.add(candidate.lower())
    return candidate


def create_excel(sequences_data, homorepeats, filenames):
    """Build an in-memory .xlsx workbook with one worksheet per input file.

    Each worksheet has a header row ("Entry ID", "Protein Name", then every
    homorepeat in sorted order) followed by one row per sequence containing
    that sequence's count for each homorepeat (0 when absent).

    Args:
        sequences_data: One list per file of ``(entry_id, protein_name, freq)``
            tuples, where ``freq`` maps repeat strings to counts.
        homorepeats: Iterable of all repeat strings to use as columns.
        filenames: File names, index-aligned with ``sequences_data``; used
            (sanitized and de-duplicated) as worksheet names.

    Returns:
        A ``BytesIO`` positioned at offset 0 holding the workbook bytes.
    """
    output = BytesIO()
    workbook = xlsxwriter.Workbook(output, {'in_memory': True})

    # Sort once; the same column order is used for every header and data row.
    ordered_repeats = sorted(homorepeats)
    used_sheet_names = set()

    for file_index, file_data in enumerate(sequences_data):
        filename = filenames[file_index]
        worksheet = workbook.add_worksheet(
            _unique_sheet_name(filename, used_sheet_names)
        )

        headers = ["Entry ID", "Protein Name", *ordered_repeats]
        for col, header in enumerate(headers):
            worksheet.write(0, col, header)

        for row, (entry_id, protein_name, freq) in enumerate(file_data, start=1):
            worksheet.write(row, 0, entry_id)
            worksheet.write(row, 1, protein_name)
            for col, repeat in enumerate(ordered_repeats, start=2):
                worksheet.write(row, col, freq.get(repeat, 0))

    workbook.close()
    # Rewind so the caller can read the workbook from the beginning.
    output.seek(0)
    return output
|
|
|
|
|
# ---- Streamlit page: upload workbooks, analyse them, offer the results ----
st.title("Protein Homorepeat Analysis")

uploaded_files = st.file_uploader("Upload Excel files", accept_multiple_files=True, type=["xlsx"])

if uploaded_files:
    all_homorepeats = set()
    all_sequences_data = []
    filenames = []

    for uploaded_file in uploaded_files:
        excel_data = pd.ExcelFile(uploaded_file)
        homorepeats, sequence_data = process_excel(excel_data)
        # process_excel returns (None, None) for a rejected workbook; such
        # files are left out of the combined results.
        if homorepeats is not None:
            all_homorepeats.update(homorepeats)
            all_sequences_data.append(sequence_data)
            filenames.append(uploaded_file.name)

    if all_sequences_data:
        # Report only the files that were actually processed, not every
        # upload (some may have been rejected above).
        st.success(f"Processed {len(all_sequences_data)} files successfully!")

        excel_file = create_excel(all_sequences_data, all_homorepeats, filenames)

        st.download_button(
            label="Download Excel file",
            data=excel_file,
            file_name="protein_homorepeat_results.xlsx",
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        )

        if st.checkbox("Show Results Table"):
            # Sort once so every row gets its repeat columns in the same order.
            ordered_repeats = sorted(all_homorepeats)

            rows = []
            for file_index, file_data in enumerate(all_sequences_data):
                filename = filenames[file_index]
                for entry_id, protein_name, freq in file_data:
                    row = {"Filename": filename, "Entry ID": entry_id, "Protein Name": protein_name}
                    row.update({repeat: freq.get(repeat, 0) for repeat in ordered_repeats})
                    rows.append(row)

            result_df = pd.DataFrame(rows)
            st.dataframe(result_df)
|
|