diff --git "a/code/pdb_featureVector.py" "b/code/pdb_featureVector.py"
new file mode 100644
--- /dev/null
+++ "b/code/pdb_featureVector.py"
@@ -0,0 +1,1716 @@
+# IMPORT NECESSARY MODULES AND LIBRARIES
+from timeit import default_timer as timer
+import xml.etree.ElementTree as ET
+from collections import Counter
+from bs4 import BeautifulSoup
+from io import StringIO
+from decimal import *
+import pandas as pd
+import requests
+import os.path as op
+import subprocess
+import shutil
+import ssbio.utils
+import warnings
+import sys
+import pathlib
+from pathlib import Path
+import os, glob
+import math
+import ssbio
+import ssl
+from Bio.Align import substitution_matrices
+from Bio.PDB.Polypeptide import *
+from Bio.PDB import PDBList
+from Bio import Align
+from Bio import SeqIO
+from Bio.PDB import *
+import streamlit as st
+from urllib.error import HTTPError
+import Bio
+
+warnings.filterwarnings("ignore")  # install a global "ignore" filter so no warnings are shown
+start = timer()  # timestamp captured at module load; its consumer is not visible in this chunk
+
+# FUNCTIONS
+
+
+# Helper functions are imported from the modules below.
+from calc_pc_property import *
+from add_domains import *
+from add_annotations import *
+from add_sequence import *
+from add_structure import *
+from add_alignment import *
+from manage_files import *
+from add_3Dalignment import *
+from add_sasa import *
+from standard import *
+from add_interface_pos import *
+from standard import *
+from uniprotSequenceMatch import uniprotSequenceMatch
+from process_input import clean_data
+
+
+def pdb(input_set, mode, impute):
+    aligner = Align.PairwiseAligner()
+    """
+    STEP 1
+    Get input data as a console input.
+    Add datapoint identifier and remove non-standard input.
+    """
+    data = clean_data(input_set)
+    path_to_input_files, path_to_output_files, path_to_domains, fisher_path, path_to_interfaces, buffer = manage_files(
+        mode)
+    print('Creating directories...')
+
+    annotation_list = ['disulfide', 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
+                       'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis', 'strand',
+                       'helix', 'turn', 'metalBinding', 'repeat', 'topologicalDomain', 'caBinding', 'bindingSite',
+                       'region',
+                       'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', 'peptide',
+                       'transitPeptide', 'glycosylation', 'propeptide']
+
+    print('Feature vector generation started...\n')
+    cont = True
+    try:
+        if cont == False:
+            print('Feature vectore generation terminated.')
+        else:
+            """
+            STEP 2
+            Add physicochemical properties.
+            """
+            print('Adding physicochemical properties...\n')
+
+            data = add_physicochemical(data)
+
+            """
+            STEP 3
+            Add domain-related information.
+            """
+            print('Adding domains\n')
+
+            data = add_domains(data, path_to_domains)
+            data = data.astype(str)
+            data = data.replace({'NaN': 'nan'})
+            data.domain = data.domain.replace({'nan': '-1'})
+            data.domStart = data.domStart.replace({'nan': '-1'})
+            data.domEnd = data.domEnd.replace({'nan': '-1'})
+            data.distance = data.distance.replace({'nan': '-1'})
+
+            """
+            STEP 4
+            Retrieve canonical and isoform UniProt sequences.
+            Add to the data frame.
+            """
+            print('Retrieving UniProt sequences...\n')
+
+            canonical_fasta = pd.DataFrame(columns=['uniprotID', 'uniprotSequence'])
+            up_list = list(set(data['uniprotID'].to_list()))
+            for i in range(len(up_list)):
+                canonical_fasta.at[i, 'uniprotSequence'] = get_uniprot_seq(up_list[i])
+                canonical_fasta.at[i, 'uniprotID'] = up_list[i]
+            canonical_fasta = canonical_fasta.drop_duplicates()
+            isoform_fasta = pd.DataFrame(columns=['uniprotID', 'isoformSequence'])
+            iso_dict = []
+            for i in range(len(up_list)):
+                iso_dict.append(get_isoforms(up_list[i]))
+
+            index = 0
+            for i in iso_dict:
+                for key, val in i.items():
+                    isoform_fasta.at[index, 'uniprotID'] = key
+                    isoform_fasta.at[index, 'isoformSequence'] = val
+                    index += 1
+            isoform_fasta = isoform_fasta.drop_duplicates()
+            for i in isoform_fasta.index:
+                isoform_fasta.at[i, 'whichIsoform'] = isoform_fasta.at[i, 'uniprotID'][7:10].strip()
+                isoform_fasta.at[i, 'uniprotID'] = isoform_fasta.at[i, 'uniprotID'][0:6]
+            print('Sequence files created...\n')
+
+            data = data.merge(canonical_fasta, on='uniprotID', how='left')
+            data = data.astype(str)
+            data['whichIsoform'] = 'nan'
+            data.replace({'': 'nan'}, inplace=True)
+            data['wt_sequence_match'] = ''
+            for i in data.index:
+                if len(data.at[i, 'uniprotSequence']) >= int(data.at[i, 'pos']):
+                    wt = data.at[i, 'wt']
+                    can = str(data.at[i, 'uniprotSequence'])[int(data.at[i, 'pos']) - 1]
+                    if wt == can:
+                        data.at[i, 'wt_sequence_match'] = 'm'
+                    elif wt != can:
+                        isoList = isoform_fasta[
+                            isoform_fasta['uniprotID'] == data.at[i, 'uniprotID']].isoformSequence.to_list()
+                        for k in isoList:
+                            if len(k) >= int(data.at[i, 'pos']):
+                                resInIso = k[int(int(data.at[i, 'pos']) - 1)]
+                                if wt == resInIso:
+                                    whichIsoform = \
+                                        isoform_fasta[isoform_fasta.isoformSequence == k].whichIsoform.to_list()[0]
+                                    data.at[i, 'wt_sequence_match'] = 'i'
+                                    data.at[i, 'whichIsoform'] = whichIsoform
+                                    break
+
+                elif len(data.at[i, 'uniprotSequence']) < int(data.at[i, 'pos']):
+                    isoList = isoform_fasta[
+                        isoform_fasta['uniprotID'] == data.at[i, 'uniprotID']].isoformSequence.to_list()
+                    for k in isoList:
+                        if len(k) >= int(data.at[i, 'pos']):
+                            resInIso = k[int(int(data.at[i, 'pos']) - 1)]
+                            wt = data.at[i, 'wt']
+                            if wt == resInIso:
+                                whichIsoform = isoform_fasta[isoform_fasta.isoformSequence == k].whichIsoform.to_list()[
+                                    0]
+                                data.at[i, 'wt_sequence_match'] = 'i'
+                                data.at[i, 'whichIsoform'] = whichIsoform
+                                break
+
+            data.wt_sequence_match = data.wt_sequence_match.astype('str')
+            data.replace({'': 'nan'}, inplace=True)
+            data_size = len(data.drop_duplicates(['datapoint']))
+            not_match_in_uniprot = data[(data.uniprotSequence == 'nan') | (data.wt_sequence_match == 'nan')]
+            uniprot_matched = data[(data.uniprotSequence != 'nan') & (data.wt_sequence_match != 'nan')]
+            data = None
+
+            print(
+                'You have %d data points that failed to match a UniProt Sequence\nProceeding with %d remaining...\n'
+                % (len(not_match_in_uniprot.drop_duplicates(['datapoint'])),
+                   len(uniprot_matched.drop_duplicates(['datapoint']))))
+
+            """
+            STEP 5
+            Retrieve related PDB sequences, extract their sequences.
+            Add to the data frame.
+            """
+
+            pdb_fasta = pd.DataFrame(columns=['pdbID', 'chain', 'pdbSequence'])
+            pdb_info = pd.DataFrame(columns=['uniprotID', 'pdbID', 'chain', 'resolution'])
+
+            print('Retrieving PDB structures...\n')
+            pdbs = []
+            protein = uniprot_matched.uniprotID.to_list()
+            protein = list(set(protein))
+            # pdbs = get_pdb_ids(protein)
+
+            for prot in protein:
+                pdbs.append(get_pdb_ids(prot))
+
+            pdbs = [item for sublist in pdbs for item in sublist]
+            print('Processing PDB structures...\n')
+            if pdbs == []:
+                print('No PDB structure found for the query. ')
+
+            print('Starting PDB structures download...\n')
+            pdbs = list(filter(None, pdbs))
+            pdbs = (set(pdbs))
+            pdbs = [i.lower() for i in pdbs]
+            pdbl = PDBList()
+            parser = PDBParser()
+            index = 0
+
+            try:
+                shutil.rmtree('obsolete')
+            except OSError as e:
+                pass
+
+            existing_pdb = list(Path(path_to_output_files / 'pdb_structures').glob("*"))
+            existing_pdb = [str(i) for i in existing_pdb]
+            existing_pdb = [i.split('/')[-1].split('.')[0].lower() for i in existing_pdb]
+
+            cnt = 0
+            for search in pdbs:
+                try:
+                    if search.lower() not in existing_pdb:
+
+                        # Specify the URL of the PDB file you want to download
+                        pdb_url = f"https://files.rcsb.org/download/{search}.pdb"
+                        # Set the path within your Hugging Face space where you want to store the PDB files
+                        pdb_folder_path = Path(path_to_output_files / 'pdb_structures')
+                        # Extract the PDB filename from the URL
+                        pdb_filename = pdb_url.split("/")[-1]
+
+                        # Set the path for the downloaded file
+                        pdb_file_path = os.path.join(pdb_folder_path, pdb_filename)
+
+                        # Send a GET request to download the PDB file
+                        response = requests.get(pdb_url)
+                        if response.status_code == 200:
+                            # Save the file to the specified path
+                            with open(pdb_file_path, "wb") as file:
+                                file.write(response.content)
+                            print("PDB file downloaded successfully!")
+                        else:
+                            print("Failed to download the PDB file.")
+
+                    else:
+                        print('PDB structure file exists..')
+                        for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")):
+                            filename_replace_ext = filename.with_suffix(".pdb")
+                            filename.rename(filename_replace_ext)
+
+                        file = Path(path_to_output_files / 'pdb_structures' / f'{search}.pdb')
+
+                        base = os.path.splitext(str(file))[0]
+                        base = '/'.join(base.split('/')[0:-1]) + '/pdb' + base.split('/')[-1]
+                        os.rename(file, base + ".ent")
+                        file = base + '.ent'
+
+                    # Parse the PDB file
+                    structure = parser.get_structure("structure", file)
+                    # Get the resolution from the Structure object
+                    resolution = structure.header["resolution"]
+
+                    for record in SeqIO.parse(file, "pdb-seqres"):
+                        if record.dbxrefs[0].split(':')[0] == 'UNP':
+                            pdb_fasta.at[index, 'pdbID'] = record.id.split(':')[0]
+                            pdb_fasta.at[index, 'chain'] = record.id.split(':')[1]
+                            pdb_fasta.at[index, 'pdbSequence'] = str(record.seq)
+                            pdb_info.at[index, 'uniprotID'] = record.dbxrefs[0].split(':')[1]
+                            pdb_info.at[index, 'pdbID'] = record.id.split(':')[0]
+                            pdb_info.at[index, 'chain'] = record.annotations["chain"]
+                            pdb_info.at[index, 'resolution'] = resolution
+                        index += 1
+
+                except:
+                    IndexError
+                    pdb_info.at[index, 'uniprotID'] = 'nan'
+                    pdb_info.at[index, 'pdbID'] = 'nan'
+                    pdb_info.at[index, 'chain'] = 'nan'
+                    pdb_info.at[index, 'resolution'] = 'nan'
+                    index += 1
+                cnt += 1
+
+            print('PDB file processing finished..')
+            for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")):
+                try:
+                    filename_replace_ext = filename.with_suffix(".pdb")
+                    filename.rename(filename_replace_ext)
+                except:
+                    FileNotFoundError
+
+            for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")):
+                try:
+                    if filename.stem.startswith("pdb"):
+                        filename_replace_ext = filename.with_name(filename.stem[3:])
+                        filename.rename(filename_replace_ext.with_suffix('.pdb'))
+                except:
+                    FileNotFoundError
+
+            uniprot_matched = pd.merge(uniprot_matched, pdb_info, on='uniprotID', how='left')
+            uniprot_matched = uniprot_matched.astype(str)
+            uniprot_matched = uniprot_matched.drop_duplicates()
+
+            uniprot_matched = uniprot_matched.merge(pdb_fasta, on=['pdbID', 'chain'], how='left')
+            uniprot_matched = uniprot_matched.astype(str)
+
+            with_pdb = uniprot_matched[(uniprot_matched.pdbID != 'nan') & (
+                    (uniprot_matched.resolution != 'nan') & (uniprot_matched.resolution != 'OT') & (
+                    uniprot_matched.resolution != 'None'))].drop_duplicates()
+            no_pdb = uniprot_matched[(uniprot_matched.pdbID == 'nan') | (
+                    (uniprot_matched.resolution == 'nan') | (uniprot_matched.resolution == 'OT') | (
+                    uniprot_matched.resolution == 'None'))]
+            no_pdb = no_pdb[~no_pdb.datapoint.isin(with_pdb.datapoint.to_list())]
+            no_pdb.drop(columns=['chain', 'pdbID', 'pdbSequence', 'resolution'], inplace=True)
+
+            print(
+                'PDB Information successfully added...\nPDB structures are found for %d of %d.\n%d of %d failed to match with PDB structure.\n'
+                % (len(with_pdb.drop_duplicates(['datapoint'])), len(uniprot_matched.drop_duplicates(['datapoint'])),
+                   len(no_pdb.drop_duplicates(['datapoint'])), len(uniprot_matched.drop_duplicates(['datapoint']))))
+
+            with_pdb = with_pdb.sort_values(['uniprotID', 'resolution'], axis=0, ascending=True)
+            with_pdb = with_pdb.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'pdbSequence'], keep='first')
+            with_pdb.replace({'': 'nan'}, inplace=True)
+
+            if len(with_pdb) == 0:
+                with_pdb['pdbInfo'] = ''
+            else:
+                for i in with_pdb.index:
+                    try:
+                        res = str(with_pdb.at[i, 'resolution'])
+                        chain = with_pdb.at[i, 'chain']
+                        new = with_pdb.at[i, 'pdbID'] + ':' + chain + ':' + res
+                        with_pdb.at[i, 'pdbInfo'] = new
+                    except:
+                        TypeError
+                        with_pdb.at[i, 'pdbInfo'] = 'nan'
+
+            with_pdb = with_pdb[['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
+                                 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', 'pdbSequence',
+                                 'wt_sequence_match',
+                                 'whichIsoform', 'pdbID', 'resolution', 'chain', 'pdbInfo', 'datapoint']]
+
+            # Data points that end up in the not_match_in_uniprot data frame will not yield any results.
+            # Data points that end up in the no_pdb data frame will be searched in the modbase and swiss_model steps.
+            # Data points that end up in the with_pdb data frame will be processed in the following steps.
+
+            """
+            STEP 6
+            Retrieve sequence annotations.
+            Add to the data frame.
+            """
+
+            if len(with_pdb) > 0:
+                with_pdb = add_annotations(with_pdb)
+            else:
+                new_cols = with_pdb.columns.to_list() + ['disulfide', 'intMet', 'intramembrane', 'naturalVariant',
+                                                         'dnaBinding',
+                                                         'activeSite',
+                                                         'nucleotideBinding', 'lipidation', 'site', 'transmembrane',
+                                                         'crosslink', 'mutagenesis', 'strand',
+                                                         'helix', 'turn', 'metalBinding', 'repeat', 'topologicalDomain',
+                                                         'caBinding', 'bindingSite', 'region',
+                                                         'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif',
+                                                         'coiledCoil', 'peptide',
+                                                         'transitPeptide', 'glycosylation', 'propeptide',
+                                                         'disulfideBinary',
+                                                         'intMetBinary', 'intramembraneBinary',
+                                                         'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary',
+                                                         'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary',
+                                                         'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary',
+                                                         'strandBinary', 'helixBinary', 'turnBinary',
+                                                         'metalBindingBinary',
+                                                         'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary',
+                                                         'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary',
+                                                         'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary',
+                                                         'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary',
+                                                         'glycosylationBinary', 'propeptideBinary']
+                with_pdb = pd.DataFrame(columns=new_cols)
+            try:
+                with_pdb.whichIsoform = with_pdb.whichIsoform.astype('str')
+            except:
+                AttributeError
+                with_pdb['whichIsoform'] = ''
+
+            with_pdb = with_pdb.astype(str)
+            with_pdb = with_pdb.replace({'NaN': 'nan'})
+            with_pdb.replace({'[]': 'nan'}, inplace=True)
+            with_pdb.replace({'nan-nan': 'nan'}, inplace=True)
+            with_pdb.replace({'': 'nan'}, inplace=True)
+
+            """
+            STEP 7
+            Do alignment for PDB
+            """
+            # Canonical matches, i.e. labelled as m, canonical sequences will be aligned with PDB sequences.
+            # Isoform matches, i.e. labelled as i, isoform sequences will be aligned with PDB sequences.
+            with_pdb['uniprotSequence'] = with_pdb['uniprotSequence'].str.replace('U', 'C')
+            with_pdb['pdbSequence'] = with_pdb['pdbSequence'].str.replace('U', 'C')
+
+            dfM = with_pdb[with_pdb.wt_sequence_match == 'm']
+            dfM = dfM.sort_values(['uniprotID', 'resolution'], axis=0, ascending=True)
+            dfM = dfM.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'pdbSequence'], keep='first')
+
+            dfNM = with_pdb[with_pdb.wt_sequence_match == 'i']
+            dfNM = dfNM.sort_values(['uniprotID', 'resolution'], axis=0, ascending=True)
+            dfNM = dfNM.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'pdbSequence'], keep='first')
+            dfNM.rename(columns={'isoformSequence': 'uniprotSequence'}, inplace=True)
+
+            dfM = dfM.astype(str)
+            dfNM = dfNM.astype(str)
+
+            dfM.reset_index(inplace=True)
+            dfM.drop(['index'], axis=1, inplace=True)
+            dfNM.reset_index(inplace=True)
+            dfNM.drop(['index'], axis=1, inplace=True)
+
+            uniprot_matched_size = len(uniprot_matched.drop_duplicates(['datapoint']))
+            uniprot_matched = None
+            pdb_fasta = None
+            pdb_info = None
+            pdbs = None
+            existing_pdb = None
+            with_pdb_size = len(with_pdb.drop_duplicates(['datapoint']))
+            with_pdb = None
+
+            print('Aligning sequences...\n')
+
+            aligned_m = final_stage(dfM, annotation_list, Path(path_to_output_files / 'alignment_files'))
+            aligned_nm = final_stage(dfNM, annotation_list, Path(path_to_output_files / 'alignment_files'))
+            # When the PDB sequence is nan, the row is wrongly aligned to the UniProt sequence; reset its alignment fields to nan.
+            for i in aligned_m.index:
+                if aligned_m.at[i, 'pdbSequence'] == 'nan':
+                    aligned_m.at[i, 'mutationPositionOnPDB'] = 'nan'
+                    aligned_m.at[i, 'domainStartonPDB'] = 'nan'
+                    aligned_m.at[i, 'domainEndonPDB'] = 'nan'
+                    aligned_m.at[i, 'pdb_alignStatus'] = 'nan'
+
+            for i in aligned_nm.index:
+                if aligned_nm.at[i, 'pdbSequence'] == 'nan':
+                    aligned_nm.at[i, 'mutationPositionOnPDB'] = 'nan'
+                    aligned_nm.at[i, 'domainStartonPDB'] = 'nan'
+                    aligned_nm.at[i, 'domainEndonPDB'] = 'nan'
+                    aligned_nm.at[i, 'pdb_alignStatus'] = 'nan'
+
+            # Check that they have the same column names before merging.
+            aligned_m = aligned_m.astype(str)
+            aligned_nm = aligned_nm.astype(str)
+
+            frames = [aligned_m, aligned_nm]
+            after_up_pdb_alignment = pd.concat(frames, sort=False)
+            if len(after_up_pdb_alignment) == 0:
+                after_up_pdb_alignment['pdb_alignStatus'] = ''
+                after_up_pdb_alignment['mutationPositionOnPDB'] = ''
+                after_up_pdb_alignment['domainStartonPDB'] = ''
+                after_up_pdb_alignment['domainEndonPDB'] = ''
+
+            after_up_pdb_alignment = after_up_pdb_alignment.sort_values(
+                by=['uniprotID', 'wt', 'mut', 'pos', 'pdb_alignStatus', 'resolution', 'chain'],
+                ascending=[True, True, True, True, True, True, True])
+
+            after_up_pdb_alignment = after_up_pdb_alignment.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos'],
+                                                                            keep='first')
+
+            after_up_pdb_alignment = after_up_pdb_alignment.astype('str')
+
+            pdb_aligned = after_up_pdb_alignment[
+                (after_up_pdb_alignment.pdbID != 'nan') & (after_up_pdb_alignment.mutationPositionOnPDB != 'nan')]
+            yes_pdb_no_match = after_up_pdb_alignment[
+                (after_up_pdb_alignment.pdbID != 'nan') & (after_up_pdb_alignment.mutationPositionOnPDB == 'nan')]
+            no_pdb = no_pdb.copy()
+
+            print('PDB matching is completed...\n')
+            print('SUMMARY')
+            print('-------')
+            print('%d data points that failed to match a UniProt Sequence are discarded.' % len(
+                not_match_in_uniprot.drop_duplicates(['datapoint'])))
+            print('Of the remaining %d:' % uniprot_matched_size)
+            print('--%d of %d successfully aligned with PDB structures.' % (
+                len(pdb_aligned.drop_duplicates(['datapoint'])), with_pdb_size))
+            print('--%d of %d not found on the covered area by the structure.' % (
+                len(yes_pdb_no_match.drop_duplicates(['datapoint'])), with_pdb_size))
+            print('--PDB structures not found for %d datapoints.' % len(no_pdb.drop_duplicates(['datapoint'])))
+            print('--%d will be searched in Swiss-Model database.\n' % (
+                    len(yes_pdb_no_match.drop_duplicates(['datapoint'])) + len(no_pdb.drop_duplicates(['datapoint']))))
+
+            dfM = None
+            dfNM = None
+            aligned_nm = None
+            aligned_m = None
+            after_up_pdb_alignment = None
+
+            print('Proceeding to  SwissModel search...')
+            print('------------------------------------\n')
+
+            # At this point we have 4 dataframes
+            # 1. after_up_pdb_alignment --- This is after PDB sequence alignment. There may be mutations that weren't matched after the alignment. These will be searched in other databases as well.
+            # 1a. pdb_aligned --- we are done with this.
+            # 1b. yes_pdb_no_match --- They have PDB structures but were not matched, so they will be searched in the other databases.
+            # 2. not_match_in_uniprot --- This won't be aligned with anything because these proteins don't have a UniProt ID. Only basic info is present.
+            # 3. no_pdb --- No PDB structures were found for them. They will be searched in other databases.
+
+            """
+            Step 8
+            Neutralize data points that are to be searched in Swiss-Model
+            # One point is that yes_pdb_no_match's annotations are the adjusted according to the PDBs they are matched before.
+            # They need to be converted to their old original UniProt annotation positions.
+            """
+            yes_pdb_no_match.drop(['disulfide', 'intMet',
+                                   'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
+                                   'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink',
+                                   'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
+                                   'caBinding', 'topologicalDomain', 'bindingSite', 'region',
+                                   'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
+                                   'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'disulfideBinary',
+                                   'intMetBinary', 'intramembraneBinary',
+                                   'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary',
+                                   'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary',
+                                   'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary',
+                                   'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
+                                   'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary',
+                                   'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary',
+                                   'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary',
+                                   'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary',
+                                   'glycosylationBinary', 'propeptideBinary', 'pdbSequence', 'pdbInfo', 'pdbID',
+                                   'chain', 'resolution', 'pdb_alignStatus', 'mutationPositionOnPDB',
+                                   'domainStartonPDB', 'domainEndonPDB'], axis=1, inplace=True)
+
+            to_swiss = pd.concat(
+                [yes_pdb_no_match.drop_duplicates(['datapoint']), no_pdb.drop_duplicates(['datapoint'])])
+            no_pdb = None
+            to_swiss.reset_index(inplace=True)
+            to_swiss.drop(['index'], axis=1, inplace=True)
+            to_swiss = to_swiss.astype('str')
+            to_swiss = to_swiss.replace({'NaN': 'nan'})
+            # Create model summary dataframe.
+            if len(to_swiss) != 0:
+                # import zipfile
+                # with zipfile.ZipFile(Path(path_to_input_files / 'swissmodel_structures.txt.zip'),"r") as zip_ref:
+                #    zip_ref.extractall(Path(path_to_input_files))
+
+                print('Generating SwissModel file...\n')
+
+                swiss_model = pd.read_csv(Path(path_to_input_files / 'swissmodel_structures.txt'), sep='\t',
+                                          dtype=str, header=None, skiprows=1,
+                                          names=['UniProtKB_ac', 'iso_id', 'uniprot_seq_length', 'uniprot_seq_md5',
+                                                 'coordinate_id', 'provider', 'from', 'to', 'template', 'qmean',
+                                                 'qmean_norm', 'seqid', 'url'])
+
+            else:
+                swiss_model = pd.DataFrame(
+                    columns=['UniProtKB_ac', 'iso_id', 'uniprot_seq_length', 'uniprot_seq_md5', 'coordinate_id',
+                             'provider', 'from', 'to', 'template', 'qmean', 'qmean_norm', 'seqid', 'url',
+                             'whichIsoform'])
+            swiss_model = swiss_model.astype('str')
+            try:
+                swiss_model.iso_id = swiss_model.iso_id.astype('str')
+            except:
+                AttributeError
+                swiss_model['iso_id'] = 'nan'
+            swiss_model = swiss_model[swiss_model.UniProtKB_ac != 'nan']
+            for ind in swiss_model.index:
+                swiss_model.at[ind, 'UniProtKB_ac'] = swiss_model.at[ind, 'UniProtKB_ac'].split('-')[0]
+                if swiss_model.at[ind, 'iso_id'] != 'nan':
+
+                    swiss_model.at[ind, 'whichIsoform'] = swiss_model.at[ind, 'iso_id'].split('-')[1]
+                else:
+                    swiss_model.at[ind, 'whichIsoform'] = 'nan'
+            #        swiss_model.drop(['input'], axis=1, inplace=True)
+            swiss_model = swiss_model[swiss_model.provider == 'SWISSMODEL']
+            print('Index File Processed...\n')
+
+            # Get relevant columns
+            swiss_model = swiss_model[
+                ['UniProtKB_ac', 'from', 'to', 'template', 'qmean_norm', 'seqid', 'url', 'whichIsoform']]
+            # Sort models on qmean score and identity. Some proteins have more than one model; we will pick one.
+            swiss_model = swiss_model.sort_values(by=['UniProtKB_ac', 'qmean_norm', 'seqid'], ascending=False)
+            swiss_model.reset_index(inplace=True)
+            swiss_model.drop(['index'], axis=1, inplace=True)
+
+            # Get protein IDs for which there exist models.
+            swiss_model_ids = set(swiss_model.UniProtKB_ac.to_list())
+            to_swiss = to_swiss.astype(str)
+            no_swiss_models = pd.DataFrame()
+            for i in to_swiss.index:
+                if to_swiss.at[i, 'uniprotID'] not in swiss_model_ids:
+                    k = pd.Series(to_swiss.iloc[i])
+                    no_swiss_models = no_swiss_models.append(k, ignore_index=True)
+
+            no_swiss_models = no_swiss_models.astype(str)
+            if len(no_swiss_models) == 0:
+                no_swiss_models = pd.DataFrame(columns=to_swiss.columns)
+            else:
+                no_swiss_models = no_swiss_models[to_swiss.columns]
+                no_swiss_models.reset_index(inplace=True)
+                no_swiss_models.drop('index', axis=1, inplace=True)
+
+            with_swiss_models = pd.concat([to_swiss, no_swiss_models]).drop_duplicates(['datapoint'], keep=False)
+            with_swiss_models = with_swiss_models[to_swiss.columns]
+
+            # Add model info.
+
+            with_swiss_models = with_swiss_models.astype(str)
+            swiss_model = swiss_model.astype(str)
+            swiss_models_with_data = pd.merge(with_swiss_models, swiss_model, left_on=['uniprotID', 'whichIsoform'],
+                                              right_on=['UniProtKB_ac', 'whichIsoform'],
+                                              how='left')
+            swiss_models_with_data = swiss_models_with_data.astype(str)
+            swiss_models_with_data = swiss_models_with_data.sort_values(
+                by=['uniprotID', 'wt', 'mut', 'pos', 'qmean_norm'],
+                ascending=False)
+            swiss_models_with_data = swiss_models_with_data.drop_duplicates()
+            swiss_models_with_data = swiss_models_with_data.drop(['UniProtKB_ac', 'seqid'], axis=1)
+            swiss_models_with_data.pos = swiss_models_with_data.pos.astype('int')
+            swiss_models_with_data = swiss_models_with_data.astype(str)
+
+            # Get the ones in the list but without model url and add to the list to go to modbase.
+            url_nan = swiss_models_with_data[swiss_models_with_data.url == 'nan']
+
+            # Add these rows without a model URL to the no-model set. They will be searched in ModBase because they don't have URLs here.
+            url_nan = url_nan.drop(['from', 'qmean_norm', 'template', 'to', 'url'], axis=1)
+
+            no_swiss_models_2 = pd.concat([no_swiss_models, url_nan])
+            swiss_models_with_data = swiss_models_with_data[swiss_models_with_data.url != 'nan']
+            for i in swiss_models_with_data.index:
+                try:
+                    swiss_models_with_data.at[i, 'chain'] = swiss_models_with_data.at[i, 'template'].split('.')[2]
+                    swiss_models_with_data.at[i, 'template'] = swiss_models_with_data.at[i, 'template'].split('.')[0]
+                except:
+                    IndexError
+            if len(swiss_models_with_data) == 0:
+                swiss_models_with_data['chain'] = ''
+                swiss_models_with_data['template'] = ''
+
+            swiss_models_with_data.qmean_norm = swiss_models_with_data.qmean_norm.astype('str')
+            swiss_models_with_data.chain = swiss_models_with_data.chain.astype('str')
+            swiss_models_with_data['qmean_norm'] = swiss_models_with_data.qmean_norm.apply(lambda x: round(float(x), 2))
+            swiss_models_with_data = swiss_models_with_data.astype(str)
+
+            # swiss_models_with_data: These data points will be aligned with their corresponding model sequences.
+            # Add sequences
+
+            no_swiss_models_2.reset_index(inplace=True)
+            no_swiss_models_2.drop('index', axis=1, inplace=True)
+
+            swiss_models_with_data.reset_index(inplace=True)
+            swiss_models_with_data.drop('index', axis=1, inplace=True)
+
+            swiss_model_ids = None
+            with_swiss_models = None
+            swiss_model = None
+            no_swiss_models = None
+            url_nan = None
+
+            # At this point we have:
+            # pdb_aligned --- Aligned in the PDB phase
+            # not_match_in_uniprot --- These won't be aligned with anything because these proteins don't have a UniProt ID. Only basic info is present.
+            # to_swiss (no_pdb + yes_pdb_no_match) --- to be searched in SwissModel database
+            # to_swiss (with_swiss_models & no_swiss_models)
+            # swiss_models_with_data --- We found swiss models for them.
+            # no_swiss_models_2 (no_swiss_models + url_nan) --- to be searched in ModBase (the ones having swissmodels but not matching with the boundaries & broken_swiss will be added here)
+
+            """
+            STEP 9
+            Associated model IDs are added. 
+            Download model files.
+            """
+            print('Beginning SwissModel files download...')
+            existing_swiss = list(Path(path_to_output_files / 'swissmodel_structures').glob("*"))
+            existing_swiss = [str(i) for i in existing_swiss]
+            existing_swiss = ['.'.join(i.split('/')[-1].split('.')[:-1]) for i in existing_swiss]
+            swissmodels_fasta = pd.DataFrame()
+
+            for i in swiss_models_with_data.index:
+                protein = swiss_models_with_data.at[i, 'uniprotID']
+                template = swiss_models_with_data.at[i, 'template'].split('.')[0]
+                qmean_norm = str(round(float(swiss_models_with_data.at[i, 'qmean_norm']), 2))
+                if protein + '_' + template + '_' + qmean_norm not in existing_swiss:
+                    url = swiss_models_with_data.at[i, 'url'].strip('\"').strip('}').replace('\\', '').strip(
+                        '\"').replace(
+                        'https',
+                        'https:')
+                    req = requests.get(url)
+                    name = Path(
+                        path_to_output_files / 'swissmodel_structures' / f'{protein}_{template}_{qmean_norm}.txt')
+                    print('Downloading for Protein:', protein + ' Model: ' + template)
+                    with open(name, 'wb') as f:
+                        f.write(req.content)
+                else:
+                    print('Model exists.')
+                    name = Path(
+                        path_to_output_files / 'swissmodel_structures' / f'{protein}_{template}_{qmean_norm}.txt')
+                with open(name, encoding="utf8") as f:
+                    fasta = ''
+                    lines = f.readlines()
+                    chain = ''
+                    for row in lines:
+                        if row[0:4] == 'ATOM' and row[13:15] == 'CA':
+                            chain = row[20:22].strip()
+                            fasta += threeToOne(row[17:20])
+                        if row[0:3] == 'TER':
+                            k = pd.Series([protein, template, qmean_norm, chain.upper(), fasta])
+                            swissmodels_fasta = swissmodels_fasta.append(k, ignore_index=True)
+                            fasta = ''
+
+            if len(swissmodels_fasta) == 0:
+                swissmodels_fasta = pd.DataFrame(columns=['uniprotID', 'template', 'qmean_norm', 'chain', 'fasta'])
+            else:
+                swissmodels_fasta.columns = ['uniprotID', 'template', 'qmean_norm', 'chain', 'fasta']
+
+            swissmodels_fasta = swissmodels_fasta.astype(str)
+
+            swiss_models_with_data.qmean_norm = swiss_models_with_data.qmean_norm.astype(float)
+            swissmodels_fasta.qmean_norm = swissmodels_fasta.qmean_norm.astype(float)
+
+            swissmodels_fasta = swissmodels_fasta.sort_values(['uniprotID', 'template', 'qmean_norm', 'chain'],
+                                                              axis=0)  # example = 3gdh
+            swissmodels_fasta.reset_index(inplace=True)
+            swissmodels_fasta.drop(['index'], axis=1, inplace=True)
+            swissmodels_fasta = swissmodels_fasta.drop_duplicates(['uniprotID', 'template', 'qmean_norm', 'chain'])
+            swissmodels_fasta = swissmodels_fasta.drop_duplicates(['uniprotID', 'template', 'chain', 'fasta'])
+            swissmodels_fasta = swissmodels_fasta.drop_duplicates(['uniprotID', 'template', 'fasta'])
+            # Some files were broken, thus their PDBs couldn't be recorded.
+            swissmodels_fasta = swissmodels_fasta.drop_duplicates()
+            swissmodels_fasta = swissmodels_fasta.astype(str)
+
+            swiss_models_with_data = swiss_models_with_data.astype(str)
+            swissmodels_fasta = swissmodels_fasta.astype(str)
+            swiss_models_with_data1 = swiss_models_with_data.merge(swissmodels_fasta,
+                                                                   on=['uniprotID', 'template', 'qmean_norm', 'chain'])
+
+            swiss_models_with_data1 = swiss_models_with_data1.sort_values(['datapoint', 'fasta'], axis=0,
+                                                                          ascending=[True, False])
+            swiss_models_with_data1 = swiss_models_with_data1.drop_duplicates(['datapoint', 'template'])
+
+            swiss_models_with_data1_dp = list(set(swiss_models_with_data1.datapoint.to_list()))
+            swiss_models_with_data.reset_index(inplace=True)
+            swiss_models_with_data.drop(['index'], axis=1, inplace=True)
+            broken_swiss = pd.DataFrame()
+            c = 0
+            for i in swiss_models_with_data.index:  # en baştaki dfde var ama model gelende yok.
+                if swiss_models_with_data.at[i, 'datapoint'] not in swiss_models_with_data1_dp:
+                    k = pd.Series(swiss_models_with_data.iloc[i])
+                    broken_swiss = broken_swiss.append(k, ignore_index=True)
+                    c += 1
+
+            if len(broken_swiss) == 0:
+                broken_swiss = pd.DataFrame(columns=swiss_models_with_data.columns.to_list())
+
+            swiss_models_with_data = swiss_models_with_data1.copy()
+
+            swiss_models_with_data.qmean_norm = swiss_models_with_data.qmean_norm.astype('float')
+            swiss_models_with_data = swiss_models_with_data.sort_values(['uniprotID', 'wt', 'mut', 'qmean_norm'],
+                                                                        axis=0, ascending=[True, True, True, False])
+
+            # Delete the same model sequence with lower quality
+            swiss_models_with_data = swiss_models_with_data.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'fasta'],
+                                                                            keep='first')
+            swiss_models_with_data.uniprotSequence = swiss_models_with_data.uniprotSequence.astype('str')
+            swiss_models_with_data.pos = swiss_models_with_data.pos.astype('int')
+            len(swiss_models_with_data.drop_duplicates(['datapoint'])) + len(
+                broken_swiss.drop_duplicates(['datapoint'])) + len(
+                no_swiss_models_2.drop_duplicates(['datapoint'])) == len(to_swiss.drop_duplicates(['datapoint']))
+            # This printed data here includes all possible models with different qualities,
+            # because we may get a hit in either of them.
+            swiss_models_with_data.rename({'fasta': 'pdbSequence'}, axis=1, inplace=True)  # for convenience.
+
+            # NOW DO ALIGNMENT HERE
+
+            swiss_models_with_data = swiss_models_with_data.replace({'[\'?\']': 'nan'})
+            swiss_models_with_data = swiss_models_with_data.replace({'[]': 'nan'})
+            swiss_models_with_data.rename({'template': 'pdbID'}, axis=1,
+                                          inplace=True)  # Only to be able use the alignment code above.
+            swiss_models_with_data = swiss_models_with_data.astype(str)
+            swiss_models_with_data.pdbSequence = swiss_models_with_data.pdbSequence.astype('str')
+            swiss_models_with_data = add_annotations(swiss_models_with_data)
+            swiss_models_with_data = swiss_models_with_data.astype(str)
+            swiss_models_with_data.replace({'NaN': 'nan'}, inplace=True)
+            swiss_models_with_data_copy = swiss_models_with_data.copy()
+            swiss_models_with_data1_dp = None
+            swiss_models_with_data1 = None
+            existing_swiss = None
+            swissmodels_fasta = None
+
+            print('Aligning sequences...\n')
+
+            swiss_models_with_data['uniprotSequence'] = swiss_models_with_data['uniprotSequence'].str.replace('U', 'C')
+            swiss_models_with_data['pdbSequence'] = swiss_models_with_data['pdbSequence'].str.replace('U', 'C')
+            swiss_model_aligned = alignment(swiss_models_with_data, annotation_list,
+                                            path_to_output_files / 'alignment_files')
+            swiss_models_with_data = None
+
+            if len(swiss_model_aligned) == 0:
+                swiss_model_aligned = pd.DataFrame(columns=pdb_aligned.columns)
+                swiss_model_aligned['qmean_norm'] = 'nan'
+            else:
+                swiss_model_aligned = swiss_model_aligned.astype(str)
+                swiss_model_aligned.replace({'NaN': 'nan'}, inplace=True)
+
+            # Some datapoints appear in both nan and not_nan. If not_nan we take it only once.
+            nan = swiss_model_aligned[swiss_model_aligned.mutationPositionOnPDB == 'nan']
+            not_nan = swiss_model_aligned[swiss_model_aligned.mutationPositionOnPDB != 'nan']
+            not_nan.qmean_norm = not_nan.qmean_norm.astype('float')
+            not_nan.sort_values(['datapoint', 'pdb_alignStatus', 'qmean_norm'], ascending=[True, True, False],
+                                inplace=True)
+
+            which_ones_are_match = pd.concat([not_nan, nan]).drop_duplicates(['datapoint'], keep='first')
+            swiss_match = which_ones_are_match[which_ones_are_match.mutationPositionOnPDB != 'nan']
+            swiss_not_match = which_ones_are_match[which_ones_are_match.mutationPositionOnPDB == 'nan']
+
+            swiss_match.qmean_norm = swiss_match.qmean_norm.astype('float')
+            swiss_match.sort_values(['uniprotID', 'wt', 'pos', 'mut', 'pdb_alignStatus', 'qmean_norm'],
+                                    ascending=[True, True, True, True, True, False], inplace=True)
+            swiss_match.drop_duplicates(['uniprotID', 'wt', 'pos', 'mut'], keep='first', inplace=True)
+            swiss_not_match = swiss_not_match[no_swiss_models_2.columns]
+            broken_swiss = broken_swiss[no_swiss_models_2.columns]
+            swiss_not_match = swiss_not_match.drop_duplicates(['datapoint'])
+            broken_swiss = broken_swiss.drop_duplicates(['datapoint'])
+
+            to_modbase = pd.concat([no_swiss_models_2, broken_swiss]).drop_duplicates()
+            to_modbase = pd.concat([to_modbase, swiss_not_match]).drop_duplicates()
+            to_modbase = to_modbase.astype(str)
+            to_swiss_columns = to_swiss.columns
+            to_swiss_size = len(to_swiss.drop_duplicates(['datapoint']))
+            to_swiss = None
+
+            # CONTROL
+
+            """
+            # This should be the whole data.
+            len(swiss_match.drop_duplicates(['datapoint'])) + len(aligned.drop_duplicates(['datapoint'])) + len(to_modbase.drop_duplicates(['datapoint'])) + len(not_match_in_uniprot.drop_duplicates(['datapoint'])) ,len(data)
+            len(aligned.drop_duplicates(['datapoint'])) + len(not_match_in_uniprot.drop_duplicates(['datapoint'])) +len(to_swiss.drop_duplicates(['datapoint']))== len(data)
+            """
+            print('SwissModel matching is completed...\n')
+            print('SUMMARY')
+            print('-------')
+            print('%d data points that failed to match a UniProt Sequence are discarded.' % len(
+                not_match_in_uniprot.drop_duplicates(['datapoint'])))
+            print('Of the remaining %d:' % uniprot_matched_size)
+            print('--%d of %d successfully aligned with PDB structures.' % (
+                len(pdb_aligned.drop_duplicates(['datapoint'])), with_pdb_size))
+            print('--%d of %d successfully aligned with SwissModels structures.' % (
+                len(swiss_match.drop_duplicates(['datapoint'])), to_swiss_size))
+            print('--%d will be searched in ModBase database.\n' % len(to_modbase.drop_duplicates(['datapoint'])))
+
+            print('Proceeding to ModBase search...')
+            print('------------------------------------\n')
+            no_swiss_models_2 = None
+            broken_swiss = None
+            swiss_model_aligned = None
+            nan = None
+            not_nan = None
+            which_ones_are_match = None
+            swiss_not_match = None
+
+            # STEP :  GO TO MODBASE
+            # Should not include anything related to prev models.
+            if len(to_modbase) != 0:
+                to_modbase = to_modbase.astype(str)
+
+                # GET MODBASE MODELS
+
+                # Get IDs from data to retrieve only their models from MODBASE
+                to_modbase.reset_index(inplace=True)
+                to_modbase.drop(['index'], axis=1, inplace=True)
+
+                existing_modbase_models = list(Path(path_to_output_files / 'modbase_structures').glob("*"))
+                existing_modbase_models = [str(i) for i in existing_modbase_models]
+                existing_modbase_models = [i.split('/')[-1].split('.')[0] for i in existing_modbase_models]
+
+                existing_modbase_models_ind = list(
+                    Path(path_to_output_files / 'modbase_structures_individual').glob("*"))
+                existing_modbase_models_ind = [str(i) for i in existing_modbase_models_ind]
+                existing_modbase_models_ind = [i.split('/')[-1].split('.')[0] for i in existing_modbase_models_ind]
+
+                modbase_reduced = pd.DataFrame()
+                modbase_fasta = pd.DataFrame()
+
+                print('Retrieving ModBase models...\n')
+                # Get model files associated with each UniProtID
+                for protein in list(set(to_modbase.uniprotID.to_list())):
+                    if protein not in existing_modbase_models:
+                        print('Downloading Modbase models for ', protein)
+                        url = 'https://salilab.org/modbase/retrieve/modbase/?databaseID=' + protein
+                        print(url)
+                        req = requests.get(url)
+                        name = path_to_output_files / 'modbase_structures' / f'{protein}.txt'
+                        with open(name, 'wb') as f:
+                            f.write(req.content)
+                    else:
+                        print('Model exists for', protein)
+                        name = Path(path_to_output_files / 'modbase_structures' / f'{protein}.txt')
+                    with open(name, encoding="utf8") as f:
+                        a = open(name, 'r').read()
+                        soup = BeautifulSoup(a, 'lxml')
+                        for pdb in soup.findAll('pdbfile'):
+                            model_id = str(pdb.contents[1])[10:-11]
+                            if model_id not in existing_modbase_models_ind:
+                                with open(path_to_output_files / 'modbase_structures_individual' / f'{model_id}.txt',
+                                          'w',
+                                          encoding="utf8") as individual:
+                                    individual.write(str('UniProt ID: ' + protein))
+                                    individual.write('\n')
+                                    individual.write(str(pdb.contents[3])[10:-11].strip())
+                            with open(path_to_output_files / 'modbase_structures_individual' / f'{model_id}.txt',
+                                      encoding="utf8") as f:
+                                fasta = ''
+                                chain = ''
+                                template_chain = ''
+                                score = -999
+                                for ind_line in f.readlines():
+                                    if ind_line[0:10] == 'UniProt ID':
+                                        uniprot_id = ind_line.split(':')[1].strip()
+                                    if ind_line[0:23] == 'REMARK 220 TARGET BEGIN':
+                                        target_begin = ind_line[40:43].strip()
+                                    if ind_line[0:21] == 'REMARK 220 TARGET END':
+                                        target_end = ind_line[40:43].strip()
+                                    if ind_line[0:25] == 'REMARK 220 TEMPLATE BEGIN':
+                                        pdb_begin = ind_line[40:43].strip()
+                                    if ind_line[0:23] == 'REMARK 220 TEMPLATE END':
+                                        pdb_end = ind_line[40:43].strip()
+                                    if ind_line[0:23] == 'REMARK 220 TEMPLATE PDB':
+                                        pdb_code = ind_line[40:43].strip()
+                                    if ind_line[0:25] == 'REMARK 220 TEMPLATE CHAIN':
+                                        pdb_chain = ind_line[40:43].strip()
+                                    if ind_line[0:32] == 'REMARK 220 ModPipe Quality Score':
+                                        quality_score = ind_line[40:].strip()
+                                    if ind_line[0:27] == 'REMARK 220 MODPIPE MODEL ID':
+                                        model_id = ind_line[40:].strip()
+                                    if ind_line[0:25] == 'REMARK 220 TEMPLATE CHAIN':
+                                        template_chain = ind_line[40:42].strip()
+                                    if ind_line[0:4] == 'ATOM' and ind_line[13:15] == 'CA':
+                                        fasta += threeToOne(ind_line[17:20])
+                                    if ind_line[0:32] == 'REMARK 220 ModPipe Quality Score':
+                                        try:
+                                            score = ind_line[40:].strip()
+                                        except (ValueError):
+                                            score = -999
+                                    if ind_line[0:3] == 'TER' or ind_line[0:3] == 'END':
+                                        k = pd.Series([uniprot_id, model_id, str(score), template_chain, fasta])
+                                        modbase_fasta = modbase_fasta.append(k, ignore_index=True)
+                                        fasta = ''
+                                try:
+                                    k = pd.Series(
+                                        [uniprot_id, target_begin, target_end, pdb_code, pdb_chain, pdb_begin, pdb_end,
+                                         quality_score,
+                                         model_id])
+                                    modbase_reduced = modbase_reduced.append(k, ignore_index=True)
+                                except:
+                                    NameError
+                                    print('This file doesnt have Quality Score. Replacer: -999', model_id)
+                                    quality_score = -999
+
+                print()
+                if len(modbase_fasta) != 0:
+                    modbase_fasta.columns = ['uniprotID', 'template', 'score', 'chain', 'fasta']
+                else:
+                    modbase_fasta = pd.DataFrame(columns=['uniprotID', 'template', 'score', 'chain', 'fasta'])
+                modbase_fasta = modbase_fasta.astype(str)
+                modbase_fasta = modbase_fasta.replace({'': 'nan'})
+                modbase_fasta = modbase_fasta.replace({'NaN': 'nan'})
+                modbase_fasta = modbase_fasta[modbase_fasta.fasta != 'nan']
+
+                print('Modbase model frame constructed.\n')
+                if len(modbase_reduced) != 0:
+                    modbase_reduced.columns = ['UniprotID', 'TargetBeg', 'TargetEnd', 'PDBCode', 'PDBChain', 'PDBBegin',
+                                               'PDBEnd',
+                                               'ModPipeQualityScore', 'ModelID']
+                else:
+                    modbase_reduced = pd.DataFrame(
+                        columns=['UniprotID', 'TargetBeg', 'TargetEnd', 'PDBCode', 'PDBChain', 'PDBBegin', 'PDBEnd',
+                                 'ModPipeQualityScore', 'ModelID'])
+
+                to_modbase = add_annotations(to_modbase)
+
+                to_modbase = to_modbase.astype(str)
+                to_modbase.fillna('nan', inplace=True)
+                to_modbase = to_modbase.replace({'NaN': 'nan'})
+                to_modbase.replace({'[]': 'nan'}, inplace=True)
+                to_modbase.replace({'nan-nan': 'nan'}, inplace=True)
+                to_modbase.replace({'': 'nan'}, inplace=True)
+                model_info_added = to_modbase.merge(modbase_reduced, right_on='UniprotID', left_on='uniprotID',
+                                                    how='left')
+                modbase_reduced = None
+                existing_modbase_models = None
+                existing_modbase_models_ind = None
+
+                model_info_added = model_info_added.drop(['UniprotID'], axis=1)
+                model_info_added = model_info_added.rename(columns={'TargetBeg': 'from', 'TargetEnd': 'to',
+                                                                    'PDBCode': 'template', 'PDBChain': 'chain',
+                                                                    'ModPipeQualityScore': 'score',
+                                                                    'ModelID': 'pdbID'})
+                model_info_added.drop(['PDBEnd', 'PDBBegin'], axis=1, inplace=True)
+                model_info_added.score = model_info_added.score.astype(float)
+                model_info_added = model_info_added.sort_values(by=['datapoint', 'score'],
+                                                                ascending=False)
+                model_info_added.reset_index(inplace=True)
+                model_info_added.drop(['index'], axis=1, inplace=True)
+                model_info_added = model_info_added.drop_duplicates()
+
+                model_info_added = model_info_added.astype(str)
+                model_info_added = model_info_added.replace({'NaN': 'nan'})
+                no_info = model_info_added[model_info_added.pdbID == 'nan']
+                with_modbase_info = model_info_added[model_info_added.pdbID != 'nan']
+                model_info_added = None
+
+                len(no_info.drop_duplicates(['datapoint'])), len(with_modbase_info.drop_duplicates(['datapoint']))
+                len(no_info.drop_duplicates(['datapoint'])) + len(
+                    with_modbase_info.drop_duplicates(['datapoint'])) == len(
+                    to_modbase.drop_duplicates(['datapoint']))
+
+                # Add no_info to the rest down below!
+                no_info = no_info[to_swiss_columns]
+
+                with_modbase_info.score = with_modbase_info.score.astype(float)
+                modbase_fasta.score = modbase_fasta.score.astype(float)
+
+                modbase_fasta = modbase_fasta.sort_values(['uniprotID', 'score', 'template', 'chain'],
+                                                          ascending=[True, False, True, True], axis=0)  # example = 3gdh
+
+                # I added these newly downloaded ones to the main model file.
+
+                modbase_fasta = modbase_fasta.rename(columns={'template': 'pdbID'})
+                with_modbase_info.pos = with_modbase_info.pos.astype('int')
+                with_modbase_info.score = with_modbase_info.score.astype(float)
+                with_modbase_info.score = with_modbase_info.score.apply(lambda x: round(x, 2))
+                modbase_fasta.score = modbase_fasta.score.astype(float)
+                modbase_fasta.score = modbase_fasta.score.apply(lambda x: round(x, 2))
+
+                with_modbase_info = with_modbase_info.merge(modbase_fasta, on='pdbID', how='left')
+
+                with_modbase_info.drop(['score_y'], axis=1, inplace=True)
+                with_modbase_info.rename(columns={'score_x': 'score'}, inplace=True)
+                with_modbase_info.drop(['uniprotID_y', 'chain_y'], axis=1, inplace=True)
+                with_modbase_info.rename(columns={'uniprotID_x': 'uniprotID', 'chain_x': 'chain'}, inplace=True)
+
+                with_modbase_info.score = with_modbase_info.score.astype('float')
+                with_modbase_info = with_modbase_info.sort_values(
+                    ['uniprotID', 'wt', 'mut', 'pos', 'score', 'from', 'to'],
+                    axis=0,
+                    ascending=[True, True, True, True, False, True, False])
+                with_modbase_info = with_modbase_info.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'fasta'],
+                                                                      keep='first')
+
+                with_modbase_info = with_modbase_info.replace({'[\'?\']': 'nan'})
+                with_modbase_info = with_modbase_info.replace({'[]': 'nan'})
+                with_modbase_info = with_modbase_info.replace({'\'?\', ': ''})
+                with_modbase_info = with_modbase_info.replace({', \'?\'': ''})
+                with_modbase_info = with_modbase_info.replace({'(': ''})
+                with_modbase_info = with_modbase_info.replace(
+                    {')': ''})
+                with_modbase_info = with_modbase_info.astype(str)
+                with_modbase_info.fasta = with_modbase_info.fasta.astype('str')
+                with_modbase_info.reset_index(inplace=True)
+                with_modbase_info.drop('index', axis=1, inplace=True)
+
+                align = with_modbase_info[
+                    with_modbase_info.fasta != 'nan']
+                yes_pdb_no_match = with_modbase_info[
+                    with_modbase_info.fasta == 'nan']
+                yes_pdb_no_match = yes_pdb_no_match[~yes_pdb_no_match.datapoint.isin(align.datapoint.to_list())]
+
+                align.rename(columns={'fasta': 'pdbSequence'}, inplace=True)
+                align['uniprotSequence'] = align['uniprotSequence'].str.replace('U', 'C')
+                align['pdbSequence'] = align['pdbSequence'].str.replace('U', 'C')
+
+                to_modbase_size = len(to_modbase.drop_duplicates(['datapoint']))
+                modbase_fasta = None
+                to_modbase = None
+                print('Aligning sequences...\n')
+                modbase_aligned = alignment(align, annotation_list, path_to_output_files / 'alignment_files')
+                modbase_aligned = modbase_aligned.astype(str)
+                modbase_aligned = modbase_aligned.replace({'NaN': 'nan'})
+
+                # Get the ones whose models couldn't be found. Add to no_modbase (i.e. nothing has matched at all by this point).
+                if len(with_modbase_info) != 0:
+                    not_in_aligned = pd.concat([modbase_aligned.drop_duplicates(['datapoint']),
+                                                with_modbase_info.drop_duplicates(['datapoint'])]).drop_duplicates(
+                        ['datapoint'],
+                        keep=False)
+                else:
+                    not_in_aligned = pd.DataFrame(
+                        columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
+                                 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
+                                 'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide',
+                                 'intMet',
+                                 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
+                                 'nucleotideBinding', 'lipidation', 'site', 'transmembrane',
+                                 'crosslink',
+                                 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
+                                 'topologicalDomain', 'caBinding', 'bindingSite', 'region',
+                                 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif',
+                                 'coiledCoil',
+                                 'peptide', 'transitPeptide', 'glycosylation', 'propeptide',
+                                 'disulfide',
+                                 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding',
+                                 'activeSite',
+                                 'nucleotideBinding', 'lipidation', 'site', 'transmembrane',
+                                 'crosslink',
+                                 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
+                                 'topologicalDomain', 'caBinding', 'bindingSite', 'region',
+                                 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif',
+                                 'coiledCoil',
+                                 'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'from',
+                                 'to', 'template', 'chain', 'score', 'pdbID', 'pdbSequence', 'fasta'])
+                with_modbase_info = None
+                if len(not_in_aligned) != 0:
+                    not_models = pd.concat([yes_pdb_no_match.drop_duplicates(['datapoint']),
+                                            not_in_aligned.drop_duplicates(['datapoint'])]).drop_duplicates(
+                        ['datapoint'],
+                        keep='first')
+                # Retain the best model among the aligned ones.
+                else:
+                    not_models = pd.DataFrame(columns=not_in_aligned.columns)
+
+                yes_pdb_no_match = None
+                # Some datapoints appear in both nan and not_nan. If a not_nan entry exists, we keep only that one.
+                modbase_aligned = modbase_aligned.astype(str)
+                if len(modbase_aligned) != 0:
+                    nan = modbase_aligned[modbase_aligned.mutationPositionOnPDB == 'nan']
+                    not_nan = modbase_aligned[modbase_aligned.mutationPositionOnPDB != 'nan']
+                    not_nan.score = not_nan.score.astype(float)
+                    not_nan.sort_values(['datapoint', 'pdb_alignStatus', 'score'], ascending=[True, True, False],
+                                        inplace=True)
+
+                    not_nan = not_nan.sort_values(['datapoint', 'mutationPositionOnPDB', 'score'],
+                                                  ascending=[True, True, False])
+                    not_nan = not_nan.drop_duplicates(['datapoint'], keep='first')
+                else:
+                    nan = pd.DataFrame(columns=modbase_aligned.columns)
+                    not_nan = pd.DataFrame(columns=modbase_aligned.columns)
+                modbase_aligned = None
+                which_ones_are_match = pd.concat([not_nan, nan]).drop_duplicates(['datapoint'], keep='first')
+                if len(which_ones_are_match) == 0:
+                    which_ones_are_match = pd.DataFrame(
+                        columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
+                                 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
+                                 'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', 'intMet',
+                                 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
+                                 'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink',
+                                 'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
+                                 'topologicalDomain', 'caBinding', 'bindingSite', 'region',
+                                 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
+                                 'peptide', 'transitPeptide', 'glycosylation', 'propeptide',
+                                 'disulfideBinary', 'intMetBinary', 'intramembraneBinary',
+                                 'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary',
+                                 'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary',
+                                 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary',
+                                 'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
+                                 'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary',
+                                 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary',
+                                 'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary',
+                                 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary',
+                                 'glycosylationBinary', 'propeptideBinary', 'from', 'to', 'template',
+                                 'chain', 'score', 'pdbID', 'pdbSequence', 'pdb_alignStatus',
+                                 'mutationPositionOnPDB', 'domainStartonPDB', 'domainEndonPDB'])
+                    modbase_match = which_ones_are_match[which_ones_are_match.mutationPositionOnPDB != 'nan']
+                    modbase_not_match = which_ones_are_match[which_ones_are_match.mutationPositionOnPDB == 'nan']
+
+                else:
+                    modbase_match = which_ones_are_match[which_ones_are_match.mutationPositionOnPDB != 'nan']
+                    modbase_not_match = which_ones_are_match[which_ones_are_match.mutationPositionOnPDB == 'nan']
+
+                which_ones_are_match = None
+                modbase_match.score = modbase_match.score.astype('float')
+                modbase_match = modbase_match.sort_values(['datapoint', 'mutationPositionOnPDB', 'score'],
+                                                          ascending=[True, True, False])
+                modbase_match.drop_duplicates(['datapoint'], keep='first', inplace=True)
+                not_nan = None
+                nan = None
+
+                # merge not_in_align and modbase_not_match as they were both excluded from modbase match.
+
+                # No model
+                no_info = no_info[to_swiss_columns]
+                no_info = no_info.drop_duplicates()
+
+                # Model present, no sequence
+                not_models = not_models[to_swiss_columns]
+                not_models = not_models.drop_duplicates()
+
+                # Modbase model and sequence present, no match in PDB
+                modbase_not_match = modbase_not_match[to_swiss_columns]
+                modbase_not_match = modbase_not_match.drop_duplicates()
+                if len(not_in_aligned) != 0 and len(modbase_not_match) != 0 and len(no_info) != 0:
+                    rest = pd.concat([not_in_aligned, modbase_not_match, no_info])
+                elif len(not_in_aligned) != 0 and len(modbase_not_match) != 0 and len(no_info) == 0:
+                    rest = pd.concat([not_in_aligned, modbase_not_match])
+                elif len(not_in_aligned) == 0 and len(modbase_not_match) != 0 and len(no_info) != 0:
+                    rest = pd.concat([modbase_not_match, no_info])
+                elif len(not_in_aligned) != 0 and len(modbase_not_match) == 0 and len(no_info) != 0:
+                    rest = pd.concat([not_in_aligned, no_info])
+                elif len(not_in_aligned) != 0 and len(modbase_not_match) == 0 and len(no_info) == 0:
+                    rest = not_in_aligned
+                elif len(not_in_aligned) == 0 and len(modbase_not_match) != 0 and len(no_info) == 0:
+                    rest = modbase_not_match
+                elif len(not_in_aligned) == 0 and len(modbase_not_match) == 0 and len(no_info) != 0:
+                    rest = no_info
+                else:
+                    rest = pd.DataFrame(
+                        columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
+                                 'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
+                                 'wt_sequence_match', 'whichIsoform', 'datapoint'])
+
+                rest = rest[to_swiss_columns]
+                rest = rest.drop_duplicates()
+
+                rest.reset_index(inplace=True)
+                rest.drop(['index'], axis=1, inplace=True)
+                rest = rest.astype('str')
+
+
+            else:
+
+                modbase_match = pd.DataFrame(
+                    columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
+                             'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
+                             'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', 'intMet',
+                             'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
+                             'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink',
+                             'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
+                             'topologicalDomain', 'caBinding', 'bindingSite', 'region',
+                             'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
+                             'peptide', 'transitPeptide', 'glycosylation', 'propeptide',
+                             'disulfideBinary', 'intMetBinary', 'intramembraneBinary',
+                             'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary',
+                             'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary',
+                             'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary',
+                             'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
+                             'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary',
+                             'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary',
+                             'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary',
+                             'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary',
+                             'glycosylationBinary', 'propeptideBinary', 'from', 'to', 'template',
+                             'chain', 'score', 'pdbID', 'pdbSequence', 'pdb_alignStatus',
+                             'mutationPositionOnPDB', 'domainStartonPDB', 'domainEndonPDB'])
+                not_in_aligned = pd.DataFrame(
+                    columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
+                             'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
+                             'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', 'intMet',
+                             'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
+                             'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink',
+                             'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
+                             'topologicalDomain', 'caBinding', 'bindingSite', 'region',
+                             'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
+                             'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'disulfide',
+                             'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
+                             'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink',
+                             'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
+                             'topologicalDomain', 'caBinding', 'bindingSite', 'region',
+                             'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
+                             'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'from',
+                             'to', 'template', 'chain', 'score', 'pdbID', 'pdbSequence', 'fasta'])
+                no_info = pd.DataFrame(
+                    columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
+                             'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
+                             'wt_sequence_match', 'whichIsoform', 'datapoint'])
+                rest = pd.DataFrame(
+                    columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
+                             'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
+                             'wt_sequence_match', 'whichIsoform', 'datapoint'])
+
+                rest = rest[to_swiss_columns]
+                rest = rest.drop_duplicates()
+
+                rest.reset_index(inplace=True)
+                rest.drop(['index'], axis=1, inplace=True)
+                rest = rest.astype('str')
+                to_modbase_size = 0
+
+            print('Modbase matching is completed...\n')
+            print('SUMMARY')
+            print('-------')
+            print('%d data points that failed to match a UniProt Sequence are discarded.' % len(
+                not_match_in_uniprot.drop_duplicates(['datapoint'])))
+            print('Of the remaining %d:' % uniprot_matched_size)
+            print('--%d of %d successfully aligned with PDB structures.' % (
+                len(pdb_aligned.drop_duplicates(['datapoint'])), with_pdb_size))
+            print('--%d of %d successfully aligned with SwissModels structures.' % (
+                len(swiss_match.drop_duplicates(['datapoint'])), to_swiss_size))
+            print('--%d of %d successfully aligned with Modbase structures.\n' % (
+                len(modbase_match.drop_duplicates(['datapoint'])), to_modbase_size))
+            print('--Remaining %d not found to match any models.' % len(rest.drop_duplicates(['datapoint'])))
+            print('--A total of %d datapoints will not be evaluated.\n' % (
+                    len(rest.drop_duplicates(['datapoint'])) + len(
+                not_match_in_uniprot.drop_duplicates(['datapoint']))))
+
+            print('FOR CHECKING : ',
+                  len(rest.drop_duplicates(['datapoint'])) + len(
+                      not_match_in_uniprot.drop_duplicates(['datapoint'])) + len(
+                      pdb_aligned.drop_duplicates(['datapoint'])) + len(
+                      swiss_match.drop_duplicates(['datapoint'])) + len(
+                      modbase_match.drop_duplicates(['datapoint'])) == data_size)
+            no_info = None
+            align = None
+            not_in_aligned = None
+            not_models = None
+            modbase_not_match = None
+
+            # Final corrections
+
+            # Now 3D alignment.
+            pdb = pdb_aligned.copy()
+            swiss = swiss_match.copy()
+            modbase = modbase_match.copy()
+
+            pdb_aligned = None
+            swiss_match = None
+            modbase_match = None
+
+            """
+            WHAT DO WE HAVE NOW?
+            - uniprot sequence not found
+            - pdb aligned
+            - swiss aligned
+            - modbase aligned
+            - not aligned with anything (rest)
+            """
+
+            # Fix the axes and  merge all data.
+
+            pdb.drop(['pdbInfo'], axis=1, inplace=True)
+            pdb.rename(columns={'resolution': 'score'}, inplace=True)
+            swiss.rename(columns={'qmean_norm': 'score'}, inplace=True)
+            modbase.rename(columns={'qmean_norm': 'score'}, inplace=True)
+
+            swiss = swiss[pdb.columns]
+            modbase = modbase[pdb.columns]
+            pdb['source'] = 'PDB'
+            swiss['source'] = 'SWISSMODEL'
+            modbase['source'] = 'MODBASE'
+            data = pd.concat([swiss, modbase, pdb])
+
+            data.reset_index(inplace=True)
+            data.drop(['index'], axis=1, inplace=True)
+            data = data.astype('str')
+            data_spare = pd.concat([not_match_in_uniprot, rest])
+            not_match_in_uniprot = None
+            pdb = None
+            swiss = None
+            modbase = None
+            rest = None
+
+            print('Generating FreeSASA files...')
+            print('------------------------------------\n')
+            # Folder for the calculated RSA values.
+
+            existing_free_sasa = list(Path(path_to_output_files / 'freesasa_files').glob("*"))
+
+            existing_free_sasa = [str(i) for i in existing_free_sasa]
+            existing_free_sasa = [i.split('/')[-1].split('.')[0] for i in existing_free_sasa]
+
+            print('Calculation RSA for PDB Structure Files...\n')
+
+            pdb_only = data[data.source == 'PDB']
+            for pdbID in pdb_only.pdbID.to_list():
+                if pdbID not in existing_free_sasa:
+                    (run_freesasa(Path(path_to_output_files / 'pdb_structures' / f'{pdbID.lower()}.pdb'),
+                                  Path(path_to_output_files / 'freesasa_files' / f'{pdbID.lower()}.txt'),
+                                  include_hetatms=True,
+                                  outdir=None, force_rerun=False, file_type='pdb'))
+
+            print('Calculation RSA for SwissModel Files...\n')
+            swiss_only = data[data.source == 'SWISSMODEL']
+            swiss_dp = []
+            for i in swiss_only.index:
+                swiss_dp.append(swiss_only.at[i, 'uniprotID'] + '_' + swiss_only.at[i, 'pdbID'].lower() + '_' + str(
+                    round(float(swiss_only.at[i, 'score']), 2)))
+            for pdbID in swiss_dp:
+                if pdbID not in existing_free_sasa:
+                    (run_freesasa(Path(path_to_output_files / 'swissmodel_structures' / f'{pdbID}.txt'),
+                                  Path(path_to_output_files / 'freesasa_files' / f'{pdbID}.txt'), include_hetatms=True,
+                                  outdir=None, force_rerun=False, file_type='pdb'))
+
+            print('Calculation RSA for Modbase Model Files...\n')
+            modbase_only = data[data.source == 'MODBASE']
+            for pdbID in modbase_only.pdbID.to_list():
+                if pdbID not in existing_free_sasa:
+                    (run_freesasa(Path(path_to_output_files / 'modbase_structures_individual' / f'{pdbID.lower()}.txt'),
+                                  Path(path_to_output_files / 'freesasa_files' / f'{pdbID.lower()}.txt'),
+                                  include_hetatms=True,
+                                  outdir=None, force_rerun=False, file_type='pdb'))
+
+            # This annotation list is different than the prev one, keep it.
+
+            annotation_list += ['domainStartonPDB', 'domainEndonPDB']
+
+            folder_path = path_to_output_files / 'freesasa_files'
+
+            aligner = Align.PairwiseAligner()
+            print('Proceeding to 3D distance calculation...\n')
+
+            data.domainEndonPDB = data.domainEndonPDB.astype(str)
+            data.domainStartonPDB = data.domainStartonPDB.astype(str)
+
+            existing_free_sasa = None
+            swiss_dp = None
+            pdb_only = None
+            swiss_only = None
+            modbase_only = None
+            data['uniprotSequence'] = data['uniprotSequence'].str.replace('U', 'C')
+            data['pdbSequence'] = data['pdbSequence'].str.replace('U', 'C')
+            for i in data.index:
+                id_ = data.at[i, 'pdbID'].lower()
+                up_id_ = data.at[i, 'uniprotID']
+                score_ = str(data.at[i, 'score'])
+                if data.at[i, 'source'] == 'PDB':
+                    pdb_path = Path(path_to_output_files / 'pdb_structures' / f'{id_}.pdb')
+                elif data.at[i, 'source'] == 'MODBASE':
+                    pdb_path = Path(path_to_output_files / 'modbase_structures_individual' / f'{id_}.txt')
+                elif data.at[i, 'source'] == 'SWISSMODEL':
+                    pdb_path = Path(path_to_output_files / 'swissmodel_structures' / f'{up_id_}_{id_}_{score_}.txt')
+
+                pdbSequence = data.at[i, 'pdbSequence']
+                source = data.at[i, 'source']
+                chain = data.at[i, 'chain']
+                uniprotID = data.at[i, 'uniprotID']
+                pdbID = data.at[i, 'pdbID']
+                alignments = get_alignments_3D(uniprotID, 'nan', pdb_path, pdbSequence, source, chain, pdbID, mode,
+                                               Path(path_to_output_files / '3D_alignment'), file_format='gzip')
+                mutPos = data.at[i, 'mutationPositionOnPDB']
+                try:
+                    coordMut = get_coords(mutPos, alignments, 'nan', 'nan', mode)[0]
+                except:
+                    ValueError
+                    coordMut = 'nan'
+                try:
+                    sasa_pos = get_coords(mutPos, alignments, 'nan', 'nan', mode)[2]
+                    data.at[i, 'sasa'] = sasa(data.at[i, 'source'], data.at[i, 'pdbID'], data.at[i, 'uniprotID'],
+                                              sasa_pos, data.at[i, 'wt'], mode, path_to_output_files, file_type='pdb')
+                except:
+                    ValueError
+                    data.at[i, 'sasa'] = 'nan'  # mutation position is nan
+                for annot in annotation_list:
+                    annotx = []
+                    try:
+                        positions_of_annotations = data.at[i, annot].split(',')
+                        for pos in positions_of_annotations:
+                            pos = pos.strip().strip('\'').strip('[\'').strip('\']')
+                            try:
+                                if '-' not in pos:
+                                    pos = int(float(pos))
+                                    coordAnnot = get_coords(pos, alignments, 'nan', 'nan', mode)[0]
+                                    try:
+                                        annotx.append(find_distance(coordMut, coordAnnot))
+                                    except:
+                                        ValueError
+
+                                else:
+                                    for r in range(int(pos.split('-')[0]), int(pos.split('-')[1]) + 1):
+                                        coordAnnot = get_coords(r, alignments, 'nan', 'nan', mode)[0]
+                                        annotx.append(find_distance(coordMut, coordAnnot))
+                            except:
+                                ValueError
+                        try:
+                            data.at[i, annot] = min([float(i) for i in annotx])
+                        except:
+                            ValueError
+                            data.at[i, annot] = 'nan'
+
+                    except:
+                        ValueError
+
+                if (str(data.at[i, 'domainStartonPDB']) == 'NaN' or str(data.at[i, 'domainStartonPDB']) == 'nan') and (
+                        str(data.at[i, 'domainEndonPDB']) != 'NaN' and str(data.at[i, 'domainEndonPDB']) != 'nan'):
+                    data.at[i, 'domainStartonPDB'] = 100000
+                elif (str(data.at[i, 'domainEndonPDB']) == 'NaN' or str(data.at[i, 'domainEndonPDB']) == 'nan') and (
+                        str(data.at[i, 'domainStartonPDB']) != 'NaN' and str(data.at[i, 'domainStartonPDB']) != 'nan'):
+                    data.at[i, 'domainEndonPDB'] = 100000
+                elif (str(data.at[i, 'domainStartonPDB']) == 'NaN' and str(data.at[i, 'domainEndonPDB']) == 'nan'):
+                    data.at[i, 'domaindistance3D'] = 'nan'
+
+                data.at[i, 'domaindistance3D'] = min(float(data.at[i, 'domainStartonPDB']),
+                                                     float(data.at[i, 'domainEndonPDB']))
+                data.at[i, 'domaindistance3D'] = min(float(data.at[i, 'domainStartonPDB']),
+                                                     float(data.at[i, 'domainEndonPDB']))
+
+            data = data.astype(str)
+            data.replace({'NaN': 'nan'}, inplace=True)
+
+            # Now unify all 3 separate data sets. We have with_pdb: the ones that have PDB structures, swiss, modbase, the ones that didn't match with any, and the ones that didn't have a wt seq match.
+
+            # Get interface positions from ECLAIR. Download HQ human
+            print()
+            print('Assigning surface regions...')
+            print('------------------------------------\n')
+
+            print('Extracting interface residues...\n')
+            data_interface = pd.read_csv(path_to_interfaces, sep='\t')
+
+            positions = get_interface_positions(data_interface, 'P1', 'P2')
+
+            interface_dataframe = pd.DataFrame()
+
+            for key, val in positions.items():
+                k = pd.Series((key, str(list(set(val)))))
+                interface_dataframe = interface_dataframe.append(k, ignore_index=True)
+            interface_dataframe.columns = ['uniprotID', 'positions']
+
+            if len(data) == 0:
+                data = pd.DataFrame(
+                    columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
+                             'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
+                             'pdbSequence', 'wt_sequence_match', 'whichIsoform', 'pdbID', 'score',
+                             'chain', 'datapoint', 'disulfide', 'intMet', 'intramembrane',
+                             'naturalVariant', 'dnaBinding', 'activeSite', 'nucleotideBinding',
+                             'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis',
+                             'strand', 'helix', 'turn', 'metalBinding', 'repeat',
+                             'topologicalDomain', 'caBinding', 'bindingSite', 'region',
+                             'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
+                             'peptide', 'transitPeptide', 'glycosylation', 'propeptide',
+                             'disulfideBinary', 'intMetBinary', 'intramembraneBinary',
+                             'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary',
+                             'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary',
+                             'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary',
+                             'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
+                             'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary',
+                             'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary',
+                             'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary',
+                             'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary',
+                             'glycosylationBinary', 'propeptideBinary', 'pdb_alignStatus',
+                             'mutationPositionOnPDB', 'domainStartonPDB', 'domainEndonPDB',
+                             'source', 'sasa', 'domaindistance3D', 'threeState_trsh4_HQ', 'domain_fisher'])
+            else:
+                data.sasa = data.sasa.astype('str')
+
+            for i in data.index:
+                if '*' in data.at[i, 'sasa']:
+                    data.at[i, 'sasa'] = data.at[i, 'sasa'].split('*')[0]
+
+            data.sasa = data.sasa.replace({'N/A': 'nan'})
+            data.sasa = data.sasa.replace({'None': 'nan'})
+            data.replace({'   N/A': 'nan'}, inplace=True)
+            data.replace({'None': 'nan'}, inplace=True)
+            data.sasa = data.sasa.astype(float)
+            data = data.astype(str)
+            for i in data.index:
+                if float(data.at[i, 'sasa']) < 5:
+                    data.at[i, 'trsh4'] = 'core'
+                elif float(data.at[i, 'sasa']) >= 5:
+                    data.at[i, 'trsh4'] = 'surface'
+                elif data.at[i, 'sasa'] == 'nan':
+                    data.at[i, 'trsh4'] = 'nan'
+
+            data = data.merge(interface_dataframe, on='uniprotID', how='left')
+            data.positions = data.positions.astype('str')
+            for i in data.index:
+                if (str(data.at[i, 'pos']) in data.at[i, 'positions']) and data.at[i, 'trsh4'] == 'surface':
+                    print((str(data.at[i, 'pos']) in data.at[i, 'positions']))
+                    data.at[i, 'threeState_trsh4_HQ'] = 'interface'
+                elif (str(data.at[i, 'pos']) not in data.at[i, 'positions']) and data.at[i, 'trsh4'] == 'surface':
+                    data.at[i, 'threeState_trsh4_HQ'] = 'surface'
+                elif (str(data.at[i, 'pos']) not in data.at[i, 'positions']) and data.at[i, 'trsh4'] == 'core':
+                    data.at[i, 'threeState_trsh4_HQ'] = 'core'
+                elif (str(data.at[i, 'pos']) in data.at[i, 'positions']) and data.at[i, 'trsh4'] == 'core':
+                    data.at[i, 'threeState_trsh4_HQ'] = 'conflict'
+                elif data.at[i, 'trsh4'] == 'nan':
+                    data.at[i, 'threeState_trsh4_HQ'] = 'nan'
+
+            data.drop(['positions'], axis=1, inplace=True)
+
+            # OPTIONAL
+            # DOMAIN SELECTION
+            # Next step: Replace all other domains with 'NULL'. R is capable of handling 53 categories. We will keep the 52 most
+            # significant domains and the 53rd category will be NULL.
+
+            fisherResult = pd.read_csv(fisher_path, sep='\t')
+
+            significant_domains = fisherResult.domain.to_list()
+            for i in data.index:
+                if data.at[i, 'domain'] in significant_domains:
+                    data.at[i, 'domain_fisher'] = data.at[i, 'domain']
+                else:
+                    data.at[i, 'domain_fisher'] = 'NULL'
+
+            # Change the numbering for binary annotations and create 3 classes:
+            # nan--> 0, 0 -->1 and 1 -->2
+
+            print('Final adjustments are being done...\n')
+            binaryCols = ['disulfideBinary', 'intMetBinary', 'intramembraneBinary', 'naturalVariantBinary',
+                          'dnaBindingBinary',
+                          'activeSiteBinary', 'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary',
+                          'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary',
+                          'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
+                          'repeatBinary', 'caBindingBinary', 'topologicalDomainBinary',
+                          'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary',
+                          'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary',
+                          'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary',
+                          'glycosylationBinary', 'propeptideBinary']
+            data = data.astype(str)
+            data.replace({'NaN': 'nan'}, inplace=True)
+            for i in data.index:
+                for j in binaryCols:
+                    data[j] = data[j].astype('str')
+                    if (data.at[i, j] == '0') or (data.at[i, j] == '0.0'):
+                        data.at[i, j] = '1'
+                    elif data.at[i, j] == 'nan':
+                        data.at[i, j] = '0'
+                    elif (data.at[i, j] == '1') or (data.at[i, j] == '1.0'):
+                        data.at[i, j] = '2'
+
+            annotCols = ['disulfide', 'intMet', 'intramembrane',
+                         'naturalVariant', 'dnaBinding', 'activeSite', 'nucleotideBinding',
+                         'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis',
+                         'strand', 'helix', 'turn', 'metalBinding', 'repeat', 'caBinding',
+                         'topologicalDomain', 'bindingSite', 'region', 'signalPeptide',
+                         'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', 'peptide',
+                         'transitPeptide', 'glycosylation', 'propeptide']
+
+            for i in data.index:
+                for annot in annotCols:
+                    binaryName = str(annot) + 'Binary'
+                    if data.at[i, binaryName] == '2':
+                        data.at[i, annot] = '0.0'
+            data.replace({'100000': 'nan'}, inplace=True)
+            data = add_physicochemical(data)
+            data.rename(
+                columns={'uniprotID': 'prot_uniprotAcc', 'wt': 'wt_residue', 'pos': 'position', 'mut': 'mut_residue',
+                         'datapoint': 'meta_merged', 'datapoint_disease': 'meta-lab_merged', 'label': 'source_db',
+                         'family': 'prot_family', 'domain': 'domains_all', 'domain_fisher': 'domains_sig',
+                         'domaindistance3D': 'domains_3Ddist', 'threeState_trsh4_HQ': 'location_3state',
+                         'disulfideBinary': 'disulfide_bin', 'intMetBinary': 'intMet_bin',
+                         'intramembraneBinary': 'intramembrane_bin',
+                         'naturalVariantBinary': 'naturalVariant_bin', 'dnaBindingBinary': 'dnaBinding_bin',
+                         'activeSiteBinary': 'activeSite_bin',
+                         'nucleotideBindingBinary': 'nucleotideBinding_bin', 'lipidationBinary': 'lipidation_bin',
+                         'siteBinary': 'site_bin',
+                         'transmembraneBinary': 'transmembrane_bin', 'crosslinkBinary': 'crosslink_bin',
+                         'mutagenesisBinary': 'mutagenesis_bin',
+                         'strandBinary': 'strand_bin', 'helixBinary': 'helix_bin', 'turnBinary': 'turn_bin',
+                         'metalBindingBinary': 'metalBinding_bin',
+                         'repeatBinary': 'repeat_bin', 'topologicalDomainBinary': 'topologicalDomain_bin',
+                         'caBindingBinary': 'caBinding_bin',
+                         'bindingSiteBinary': 'bindingSite_bin', 'regionBinary': 'region_bin',
+                         'signalPeptideBinary': 'signalPeptide_bin',
+                         'modifiedResidueBinary': 'modifiedResidue_bin', 'zincFingerBinary': 'zincFinger_bin',
+                         'motifBinary': 'motif_bin',
+                         'coiledCoilBinary': 'coiledCoil_bin', 'peptideBinary': 'peptide_bin',
+                         'transitPeptideBinary': 'transitPeptide_bin',
+                         'glycosylationBinary': 'glycosylation_bin', 'propeptideBinary': 'propeptide_bin',
+                         'disulfide': 'disulfide_dist', 'intMet': 'intMet_dist',
+                         'intramembrane': 'intramembrane_dist', 'naturalVariant': 'naturalVariant_dist',
+                         'dnaBinding': 'dnaBinding_dist', 'activeSite': 'activeSite_dist',
+                         'nucleotideBinding': 'nucleotideBinding_dist', 'lipidation': 'lipidation_dist',
+                         'site': 'site_dist',
+                         'transmembrane': 'transmembrane_dist', 'crosslink': 'crosslink_dist',
+                         'mutagenesis': 'mutagenesis_dist', 'strand': 'strand_dist', 'helix': 'helix_dist',
+                         'turn': 'turn_dist',
+                         'metalBinding': 'metalBinding_dist', 'repeat': 'repeat_dist',
+                         'topologicalDomain': 'topologicalDomain_dist', 'caBinding': 'caBinding_dist',
+                         'bindingSite': 'bindingSite_dist', 'region': 'region_dist',
+                         'signalPeptide': 'signalPeptide_dist', 'modifiedResidue': 'modifiedResidue_dist',
+                         'zincFinger': 'zincFinger_dist', 'motif': 'motif_dist', 'coiledCoil': 'coiledCoil_dist',
+                         'peptide': 'peptide_dist', 'transitPeptide': 'transitPeptide_dist',
+                         'glycosylation': 'glycosylation_dist', 'propeptide': 'propeptide_dist'}, inplace=True)
+
+            data = data[
+                ['prot_uniprotAcc', 'wt_residue', 'mut_residue', 'position', 'meta_merged', 'composition', 'polarity',
+                 'volume',
+                 'granthamScore', 'domains_all',
+                 'domains_sig', 'domains_3Ddist', 'sasa', 'location_3state', 'disulfide_bin', 'intMet_bin',
+                 'intramembrane_bin', 'naturalVariant_bin', 'dnaBinding_bin',
+                 'activeSite_bin', 'nucleotideBinding_bin', 'lipidation_bin', 'site_bin',
+                 'transmembrane_bin', 'crosslink_bin', 'mutagenesis_bin', 'strand_bin',
+                 'helix_bin', 'turn_bin', 'metalBinding_bin', 'repeat_bin',
+                 'caBinding_bin', 'topologicalDomain_bin', 'bindingSite_bin',
+                 'region_bin', 'signalPeptide_bin', 'modifiedResidue_bin',
+                 'zincFinger_bin', 'motif_bin', 'coiledCoil_bin', 'peptide_bin',
+                 'transitPeptide_bin', 'glycosylation_bin', 'propeptide_bin', 'disulfide_dist', 'intMet_dist',
+                 'intramembrane_dist',
+                 'naturalVariant_dist', 'dnaBinding_dist', 'activeSite_dist',
+                 'nucleotideBinding_dist', 'lipidation_dist', 'site_dist',
+                 'transmembrane_dist', 'crosslink_dist', 'mutagenesis_dist',
+                 'strand_dist', 'helix_dist', 'turn_dist', 'metalBinding_dist',
+                 'repeat_dist', 'caBinding_dist', 'topologicalDomain_dist',
+                 'bindingSite_dist', 'region_dist', 'signalPeptide_dist',
+                 'modifiedResidue_dist', 'zincFinger_dist', 'motif_dist',
+                 'coiledCoil_dist', 'peptide_dist', 'transitPeptide_dist',
+                 'glycosylation_dist', 'propeptide_dist']]
+
+            ready = data.copy()
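+            # Imputation (optional): fill missing values in the last 30 (*_dist) columns, and in domains_3Ddist, sasa and location_3state, with fixed defaults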
+            if (impute == 'True') or (impute == 'true') or (impute == True):
+                filler = [17.84, 30.8, 24.96, 13.12, 23.62, 18.97, 20.87, 29.59, 20.7, 12.7, 22.85, 17.21, 9.8, 9,
+                          15.99, 16.82,
+                          20.46, 24.58, 9.99, 17.43, 20.08, 30.91, 20.86, 22.14, 21.91, 28.45, 17.81, 25.12, 20.33,
+                          22.36]
+                col_index = 0
+                for col_ in ready.columns[-30:]:
+                    ready[col_] = ready[col_].fillna(filler[col_index])
+                    ready[col_] = ready[col_].replace({'nan': filler[col_index]})
+                    col_index += 1
+                ready['domains_3Ddist'] = ready['domains_3Ddist'].fillna(24.5)
+                ready['sasa'] = ready['sasa'].fillna(29.5)
+                ready['location_3state'] = ready['location_3state'].fillna('unknown')
+            elif (impute == 'False') or (impute == 'false') or (impute == False):
+                pass
+            ready = ready.replace({'nan': np.NaN})
+            ready = ready.astype(str)
+            ready.to_csv(path_to_output_files / 'featurevector_pdb.txt', sep='\t', index=False)
+            if len(ready) == 0:
+                print(
+                    'No feature vector could be produced for input data. Please check the presence of a structure for the input proteins.')
+            #st.write(ready)
+            print('Feature vector successfully created...')
+        end = timer()
+        hours, rem = divmod(end - start, 3600)
+        minutes, seconds = divmod(rem, 60)
+        print("Time passed: {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
+
+        return ready
+
+    except:
+        AttributeError
+        
+