Jayesh13 committed on
Commit
82bea84
·
verified ·
1 Parent(s): a361d73

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -42
app.py CHANGED
@@ -8,8 +8,6 @@ import xlsxwriter
8
  from io import BytesIO
9
  from collections import defaultdict
10
 
11
-
12
-
13
  # Function to find repeated amino acids in the protein sequence
14
  def find_homorepeats(protein):
15
  n = len(protein)
@@ -29,31 +27,26 @@ def find_homorepeats(protein):
29
 
30
  return freq
31
 
32
- # Function to process a single CSV file and return its analysis
33
- def process_csv(file):
34
- df = pd.read_csv(file)
35
- if len(df.columns) < 3:
36
- st.error(f"Error: The file must have at least three columns: ID, Protein Name, Sequence")
37
- return None
38
-
39
- # Storing entry ID, protein name, and sequence
40
- sequences = []
41
- for _, row in df.iterrows():
42
- entry_id = str(row[0])
43
- protein_name = str(row[1])
44
- sequence = str(row[2]).replace('"', '').replace(' ', '')
45
- sequences.append((entry_id, protein_name, sequence))
46
-
47
- # Analyzing homorepeats in the sequences
48
  homorepeats = set()
49
  sequence_data = []
50
- for entry_id, protein_name, sequence in sequences:
51
- freq = find_homorepeats(sequence)
52
- homorepeats.update(freq.keys()) # Collect unique homorepeats
53
- sequence_data.append((entry_id, protein_name, freq))
54
 
55
- return homorepeats, sequence_data
 
 
 
 
 
 
 
 
 
 
 
 
56
 
 
57
 
58
  # Function to generate and download Excel workbook with separate sheets for each input file
59
  def create_excel(sequences_data, homorepeats, filenames):
@@ -88,23 +81,6 @@ def create_excel(sequences_data, homorepeats, filenames):
88
  output.seek(0)
89
  return output
90
 
91
- # Function to process the Excel file
92
- def process_excel(excel_data):
93
- # Custom logic to process each sheet within the Excel file
94
- homorepeats = set()
95
- sequence_data = []
96
-
97
- for sheet_name in excel_data.sheet_names:
98
- df = excel_data.parse(sheet_name)
99
- for index, row in df.iterrows():
100
- entry_id = row['Entry ID']
101
- protein_name = row['Protein Name']
102
- freq = {repeat: row[repeat] for repeat in df.columns[2:]} # Assuming repeats start from 3rd column
103
- sequence_data.append((entry_id, protein_name, freq))
104
- homorepeats.update(freq.keys())
105
-
106
- return homorepeats, sequence_data
107
-
108
  # Streamlit UI components
109
  st.title("Protein Homorepeat Analysis")
110
 
@@ -119,7 +95,7 @@ if uploaded_files:
119
 
120
  for file in uploaded_files:
121
  excel_data = pd.ExcelFile(file)
122
- homorepeats, sequence_data = process_excel(excel_data) # Modify your process_csv function to process_excel
123
  if homorepeats is not None:
124
  all_homorepeats.update(homorepeats)
125
  all_sequences_data.append(sequence_data)
@@ -152,4 +128,3 @@ if uploaded_files:
152
 
153
  result_df = pd.DataFrame(rows)
154
  st.dataframe(result_df)
155
-
 
8
  from io import BytesIO
9
  from collections import defaultdict
10
 
 
 
11
  # Function to find repeated amino acids in the protein sequence
12
  def find_homorepeats(protein):
13
  n = len(protein)
 
27
 
28
  return freq
29
 
30
# Function to process every sheet of an Excel workbook and return its analysis
def process_excel(excel_data):
    """Collect homorepeat frequencies for every row of every sheet.

    Args:
        excel_data: Workbook object exposing ``sheet_names`` and
            ``parse(sheet_name)`` (the caller passes a ``pd.ExcelFile``).
            The first three columns of each sheet are read, by position,
            as entry ID, protein name and sequence.

    Returns:
        A ``(homorepeats, sequence_data)`` tuple, where ``homorepeats`` is
        the set of every key returned by ``find_homorepeats`` and
        ``sequence_data`` is a list of ``(entry_id, protein_name, freq)``
        tuples in sheet/row order. Returns ``(None, None)`` after reporting
        an error through ``st.error`` as soon as a sheet with fewer than
        three columns is encountered.
    """
    homorepeats = set()
    sequence_data = []

    for sheet_name in excel_data.sheet_names:
        df = excel_data.parse(sheet_name)
        if len(df.columns) < 3:
            st.error(f"Error: The sheet '{sheet_name}' must have at least three columns: ID, Protein Name, Sequence")
            return None, None

        # Pick the first three columns strictly by position. Indexing a row
        # Series with ``row[0]`` is label-based when the header is numeric
        # and is deprecated as a positional lookup in recent pandas.
        first_three = df.iloc[:, :3]
        for entry_id, protein_name, sequence in first_three.itertuples(index=False, name=None):
            # Strip quotes and spaces left over from the export.
            # NOTE(review): an empty cell arrives as NaN and is analysed as
            # the literal text "nan" -- confirm whether such rows should be
            # skipped instead.
            sequence = str(sequence).replace('"', '').replace(' ', '')
            freq = find_homorepeats(sequence)
            sequence_data.append((str(entry_id), str(protein_name), freq))
            homorepeats.update(freq.keys())  # Collect unique homorepeats

    return homorepeats, sequence_data
50
 
51
  # Function to generate and download Excel workbook with separate sheets for each input file
52
  def create_excel(sequences_data, homorepeats, filenames):
 
81
  output.seek(0)
82
  return output
83
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  # Streamlit UI components
85
  st.title("Protein Homorepeat Analysis")
86
 
 
95
 
96
  for file in uploaded_files:
97
  excel_data = pd.ExcelFile(file)
98
+ homorepeats, sequence_data = process_excel(excel_data)
99
  if homorepeats is not None:
100
  all_homorepeats.update(homorepeats)
101
  all_sequences_data.append(sequence_data)
 
128
 
129
  result_df = pd.DataFrame(rows)
130
  st.dataframe(result_df)