#include <stdlib.h>
#include <ctype.h>
#include <stdio.h>
#include <string.h>

#include "include/weights.h"
#include "include/twister.h"

/* printf-style format string used by WriteWeightsFile() to emit one weight
   per line in scientific notation: 8 fractional digits when USE_FLOAT is
   defined, 16 otherwise.
   NOTE(review): presumably chosen to match the precision of numeric_t in
   each build mode -- confirm against the numeric_t typedef. */
#ifdef USE_FLOAT
char* weights_fmt = "%0.8e\n";
#else
char* weights_fmt = "%0.16e\n";
#endif

void MSAReweightSequences(alignment_t *ali, options_t *options) {
    /* Reweight sequences by their inverse neighborhood size.

       For theta on [0,1], the neighborhood of a sequence is itself plus every
       other sequence that matches it at >= (1 - theta) * nSites positions;
       the sequence's weight is options->scale / (neighborhood size).
       For theta outside [0,1] no reweighting is applied and every weight is
       options->scale.

       Writes ali->weights[0 .. nSeqs-1], sets ali->nEff to the sum of the
       weights, and prints a one-line summary to stderr.

       When 0 < options->fastWeights < nSeqs an approximate scheme is used:
       sequences are clustered around fastWeights consensus sequences and
       pairwise comparisons are only made between clusters whose consensus
       sequences are sufficiently similar. Otherwise all pairs are compared.
    */
    for (int i = 0; i < ali->nSeqs; i++) ali->weights[i] = 1.0;

    /* Only apply reweighting if theta is on [0,1] */
    if (options->theta >= 0 && options->theta <= 1) {
        /* The neighborhood size of each sequence is the number of sequences
           in the alignment within theta percent divergence */

        if (options->fastWeights > 0 && options->fastWeights < ali->nSeqs) {
            /* Cluster the sequences with k-consensus */
            int nClusters = options->fastWeights;
            int nIterations = 10;
            int nSeqs = ali->nSeqs;
            int nCodes = ali->nCodes;
            int nSites = ali->nSites;

#define COUNTS(i,j,a) counts[i * nSites * nCodes + j * nCodes + a]
#define CONSENSUS(i,j) consensus[i * nSites + j]
#define ALI(i,j) aliPermute[i * nSites + j]

            /* Pick initial clusters with Reservoir sampling */
            int *clusters = (int *) malloc(nClusters * sizeof(int));
            letter_t *consensus = (letter_t *) malloc(nClusters * nSites * sizeof(letter_t));
            for (int i = 0; i < nClusters; i++) clusters[i] = i;
            for (int i = nClusters; i < nSeqs; i++) {
                /* Sequence i is the (i + 1)-th candidate seen, so the slot is
                   drawn from [0, i]; drawing from [0, i - 1] would bias the
                   sample (and always swap in sequence nClusters) */
                int ix = genrand_int32() % (i + 1);
                if (ix < nClusters) clusters[ix] = i;
            }
            for (int i = 0; i < nClusters; i++)
                for (int j = 0; j < nSites; j++)
                    CONSENSUS(i,j) = seq(clusters[i], j);
            free(clusters);

            /* EM steps */
            int *assignment = (int *) malloc(nSeqs * sizeof(int));
            int *counts = (int *) malloc(nClusters * nSites * nCodes * sizeof(int));
            int *radii = (int *) malloc(nClusters * sizeof(int));
            for (int i = 0; i < nSeqs; i++) assignment[i] = 0;
            fprintf(stderr, "Clustering");
            for (int t = 0; t < nIterations; t++) {
                fprintf(stderr, ".");
                /* Step 1. Update the assignments */
                for (int i = 0; i < nClusters; i++) radii[i] = 0;
#pragma omp parallel for
                for (int s = 0; s < nSeqs; s++) {
                    int ixOld = assignment[s];
                    /* Current distance to current assignment */
                    numeric_t distance = 0;
                    for (int j = 0; j < nSites; j++)
                        distance += (CONSENSUS(ixOld, j) != seq(s, j));
                    /* Find closest */
                    int ixNew = ixOld;
                    for (int i = 0; i < nClusters; i++) {
                        numeric_t distanceI = 0;
                        for (int j = 0; j < nSites; j++)
                            distanceI += (CONSENSUS(i, j) != seq(s, j));
                        if (distanceI < distance) {
                            ixNew = i;
                            distance = distanceI;
                        }
                    }
                    if (ixNew != ixOld) assignment[s] = ixNew;
                    /* radii[] is shared between threads and several sequences
                       can map to the same cluster, so serialize the
                       read-modify-write to avoid a data race */
#pragma omp critical
                    {
                        if (radii[ixNew] < distance) radii[ixNew] = distance;
                    }
                }
                /* --------------------------_DEBUG_--------------------------*/
                // for (int s = 0; s < nClusters; s++) {
                //     int size = 0;
                //     for (int i = 0; i < nSeqs; i++) size += (assignment[i] == s);
                //     fprintf(stderr, ">Cluster %d, %d members, radius %d\n", s, size, radii[s]);
                //     for (int i = 0; i < ali->nSites; i++)
                //         if (CONSENSUS(s,i) >= 0) {
                //             fprintf(stderr, "%c", ali->alphabet[CONSENSUS(s,i)]);
                //         } else {
                //             fprintf(stderr, " ");
                //         }
                //     fprintf(stderr, "\n");
                // }
                /* --------------------------^DEBUG^--------------------------*/

                /* Step 2. Update the consensus sequences */
                /* Update the counts (skipped on the last iteration, whose
                   assignments are final) */
                if (t < nIterations - 1) {
                    for (int i = 0; i < nClusters * nSites * nCodes; i++)
                        counts[i] = 0;
                    for (int s = 0; s < nSeqs; s++)
                        for (int j = 0; j < nSites; j++)
                            COUNTS(assignment[s], j, seq(s, j)) += 1;
#pragma omp parallel for
                    for (int i = 0; i < nClusters; i++)
                        for (int j = 0; j < nSites; j++) {
                            /* Most frequent code at this site; ties keep the
                               lowest code */
                            int topCode = 0;
                            int topCounts = COUNTS(i, j, 0);
                            for (int b = 1; b < nCodes; b++)
                                if (COUNTS(i, j, b) > topCounts) {
                                    topCode = b;
                                    topCounts = COUNTS(i, j, b);
                                }
                            CONSENSUS(i ,j) = topCode;
                        }
                }
            }
            fprintf(stderr, "\n");

            /* Profile-profile distances: clusterID holds the number of
               identical sites between each pair of consensus sequences */
            numeric_t *clusterID = (numeric_t *) malloc(nClusters * nClusters * sizeof(numeric_t));
            for (int i = 0; i < nClusters * nClusters; i++) clusterID[i] = 0;
#pragma omp parallel for
            for (int pi = 0; pi < nClusters; pi++)
                for (int pj = 0; pj < nClusters; pj++)
                    for (int j = 0; j < nSites; j++)
                        clusterID[pi + pj * nClusters] +=
                                (CONSENSUS(pi,j) == CONSENSUS(pj,j));

            free(consensus);
            free(counts);

            /* Permute alignment so that each cluster's members are contiguous */
            int *clusterSizes = (int *) malloc(nClusters * sizeof(int));
            int *clusterStart = (int *) malloc(nClusters * sizeof(int));
            int *clusterEnd = (int *) malloc(nClusters * sizeof(int));
            int *permuteMap = (int *) malloc(nSeqs * sizeof(int));
            numeric_t *weightsP = (numeric_t *) malloc(nSeqs * sizeof(numeric_t));
            letter_t *aliPermute = (letter_t *) malloc(nSeqs * nSites * sizeof(letter_t));
            for (int i = 0; i < nClusters; i++) clusterSizes[i] = 0;
            for (int s = 0; s < ali->nSeqs; s++)
                clusterSizes[assignment[s]] += 1;
            int ix = 0;
            for (int i = 0; i < nClusters; i++) {
                clusterStart[i] = ix;
                ix += clusterSizes[i];
                clusterEnd[i] = ix;
            }
            ix = 0;
            for (int i = 0; i < nClusters; i++)
                for (int s = 0; s < ali->nSeqs; s++)
                    if (assignment[s] == i) {
                        for (int j = 0; j < nSites; j++)
                            ALI(ix,j) = seq(s,j);
                        /* Remember the original index of permuted row ix */
                        permuteMap[ix] = s;
                        ix++;
                    }

            /* ----------------------------_DEBUG_----------------------------*/
            // for (int s = 0; s < nSeqs; s++) {
            //     fprintf(stdout, ">Seq %d\n", s);
            //     for (int i = 0; i < ali->nSites; i++)
            //             fprintf(stdout, "%c", ali->alphabet[ALI(s,i)]);
            //     fprintf(stdout, "\n");
            // }
            /* ----------------------------^DEBUG^----------------------------*/

            /* Sequence weights. Only cluster pairs whose consensus identity
               reaches 90% of the cutoff are compared sequence-by-sequence.
               Each outer iteration ci writes only weightsP[s] for s inside
               cluster ci, so the parallel loop has no write conflicts. */
            numeric_t cutoff = (numeric_t) ((1 - options->theta) * ali->nSites);
            for (int s = 0; s < nSeqs; s++) weightsP[s] = 1;
#pragma omp parallel for
            for (int ci = 0; ci < nClusters; ci++)
                for (int cj = 0; cj < nClusters; cj++)
                    if (clusterID[ci * nClusters + cj] >= 0.9 * cutoff)
                        for (int s = clusterStart[ci]; s < clusterEnd[ci]; s++)
                            for (int t = clusterStart[cj]; t < clusterEnd[cj]; t++)
                                if (s != t) {
                                    int id = 0;
                                    for (int n = 0; n < ali->nSites; n++)
                                        id += (ALI(s, n) == ALI(t, n));
                                    if (id >= cutoff) weightsP[s] += 1.0;
                                }
            /* Map the neighborhood sizes back to the original ordering */
            for (int s = 0; s < nSeqs; s++)
                ali->weights[permuteMap[s]] = weightsP[s];

#undef COUNTS
#undef CONSENSUS
#undef ALI

            free(assignment);
            free(clusterID);
            free(clusterSizes);
            free(clusterStart);
            free(clusterEnd);
            free(permuteMap);
            free(weightsP);
            free(radii);
            free(aliPermute);
        } else {
            /* Deterministic sequence weights */
#if defined(_OPENMP)
            /* Naive parallelization is faster ignoring symmetry */
            #pragma omp parallel for
            for (int s = 0; s < ali->nSeqs; s++)
                for (int t = 0; t < ali->nSeqs; t++)
                    if (s != t) {
                        int id = 0;
                        for (int n = 0; n < ali->nSites; n++)
                            id += (seq(s, n) == seq(t, n));
                        if (id >= ((1 - options->theta) * ali->nSites))
                            ali->weights[s] += 1.0;
                    }
#else
            /* For a single core, take advantage of symmetry */
            for (int s = 0; s < ali->nSeqs - 1; s++)
                for (int t = s + 1; t < ali->nSeqs; t++) {
                    int id = 0;
                    for (int n = 0; n < ali->nSites; n++)
                        id += (seq(s, n) == seq(t, n));
                    if (id >= ((1 - options->theta) * ali->nSites)) {
                        ali->weights[s] += 1.0;
                        ali->weights[t] += 1.0;
                    }
                }
#endif
        }

        /* Reweight sequences by the inverse of the neighborhood size
           (always >= 1 because each sequence counts itself) */
        for (int i = 0; i < ali->nSeqs; i++)
            ali->weights[i] = 1.0 / ali->weights[i];
    }

    /* Scale sets the effective number of samples per neighborhood */
    for (int i = 0; i < ali->nSeqs; i++)
        ali->weights[i] *= options->scale;

    /* The effective number of sequences is then the sum of the weights */
    ali->nEff = 0;
    for (int i = 0; i < ali->nSeqs; i++) ali->nEff += ali->weights[i];

    if (options->theta >= 0 && options->theta <= 1) {
        fprintf(stderr,
                "Effective number of samples: %.1f\t(%.0f%% identical neighborhood = %.3f samples)\n",
                ali->nEff, 100 * (1 - options->theta), options->scale);
    } else {
        fprintf(stderr,
                "Theta not between 0 and 1, no sequence reweighting applied (N = %.2f)\n",
                ali->nEff);
    }
}

int ValidateCustomWeightsFile(char *weightsFile, alignment_t *ali) {
    /* Check that the weights file can be opened and that it contains exactly
       one line per sequence of the raw alignment (ali->nSeqs retained plus
       ali->nSkippedSeqs skipped sequences).
       Returns 0 if the file is valid, 1 otherwise; the reason is reported on
       stderr. The file is opened and closed here. */
    FILE *fp = fopen(weightsFile, "r");
    if (fp == NULL) {
        /* Nothing to close: passing NULL to fclose() is undefined behavior */
        fprintf(stderr, "Error: weights file %s does not exist\n", weightsFile);
        return 1;
    }

    /* Count number of lines in file. fgets() returns a long line in several
       chunks, so a chunk only starts a new line when the previous chunk
       ended with '\n' (or it is the first chunk). A final line without a
       trailing newline is still counted. */
    int nLines = 0;
    int midLine = 0;
    char line[1024];
    while (fgets(line, sizeof(line), fp) != NULL) {
        if (!midLine) nLines++;
        size_t len = strlen(line);
        midLine = (len > 0 && line[len - 1] != '\n');
    }
    fclose(fp);

    int nSeqsRaw = ali->nSeqs + ali->nSkippedSeqs;
    if (nLines != nSeqsRaw) {
        fprintf(stderr, "Error: weights file %s has %d lines, but alignment has %d sequences\n",
                weightsFile, nLines, nSeqsRaw);
        return 1;
    }

    return 0;
}

void ReadCustomWeightsFile(char *weightsFile, alignment_t *ali) {
    /* Load precomputed sequence weights from weightsFile into ali->weights
       and set ali->nEff to their sum.

       The file must hold one weight per sequence of the raw alignment
       (retained and skipped). Weights at the positions listed in
       ali->skippedSeqs are discarded; the rest fill ali->weights in order.
       Any validation or read failure is reported on stderr and terminates
       the process with exit(1).

       Note: Not using options->scale (or options->theta) for now (assuming
       this is done in original weights calc). */

    int validCode = ValidateCustomWeightsFile(weightsFile, ali);
    if (validCode != 0) {
        fprintf(stderr, "Error: weights file %s is invalid\n", weightsFile);
        exit(1);
    }

    /* Load weights (float array) into ali->weights and set ali->nEff */
    FILE *fp = fopen(weightsFile, "r");
    if (fp == NULL) {
        fprintf(stderr, "Error: could not open weights file %s\n", weightsFile);
        exit(1);
    }
    /* Reinitialize array just in case */
    for (int i = 0; i < ali->nSeqs; i++) ali->weights[i] = 1.0;

    /* Read weights, one float per line */
    int skippedIdx = 0, reducedIdx = 0, nWarnings = 0, maxWarnings = 64;
    int nSeqsTotal = ali->nSeqs + ali->nSkippedSeqs;
    for (int i = 0; i < nSeqsTotal; i++) {
        long double w;
        if (fscanf(fp, "%Lf", &w) != 1) {
            fprintf(stderr, "Error reading weights file %s at position %d\n", weightsFile, i);
            exit(1);
        }
        /* Skip invalid sequence weights. skippedSeqs is walked in step with
           i, which presumably relies on it being sorted ascending --
           TODO confirm where the alignment is loaded. */
        if ((skippedIdx < ali->nSkippedSeqs) && (i == ali->skippedSeqs[skippedIdx])) {
            /* The message says "nonzero", so negative weights are flagged
               as well as positive ones */
            if (w != 0 && nWarnings < maxWarnings) {
                fprintf(stderr, "Warning: Skipped nonzero weight in file %s at position %d\n", weightsFile, i);
                nWarnings++;
            }
            skippedIdx++;
            continue;
        }
        /* Never write past ali->weights: this can only trigger if the
           skipped-sequence list does not line up with the loop order */
        if (reducedIdx >= ali->nSeqs) {
            fprintf(stderr,
                    "Error: skipped-sequence indices are inconsistent with weights file %s at position %d\n",
                    weightsFile, i);
            fclose(fp);
            exit(1);
        }
        ali->weights[reducedIdx] = (numeric_t)w;
        reducedIdx++;
    }
    fclose(fp);

    /* The effective number of sequences is then the sum of the weights */
    ali->nEff = 0;
    for (int i = 0; i < ali->nSeqs; i++) ali->nEff += ali->weights[i];

    fprintf(stderr,
            "Weights loaded successfully. Effective number of samples (to 1 decimal place): %.1f.\n",
            ali->nEff);
}

void WriteWeightsFile(char *weightsFile, alignment_t *ali) {
    /* Write the current ali->weights to weightsFile, one value per line, in
       raw-alignment order: positions listed in ali->skippedSeqs are written
       as "0.0" and every other position takes the next entry of
       ali->weights, formatted with weights_fmt.
       Exits with status 1 (after reporting on stderr) if the file cannot be
       opened or fully written.

       Note: Ignoring options->scale and options->theta here, writing out
       the weights exactly as stored. */
    FILE *fpOutput = fopen(weightsFile, "w");
    if (fpOutput == NULL) {
        fprintf(stderr, "Error: could not open weights file %s\n", weightsFile);
        exit(1);
    }
    // Write out weights, one float per line, and include invalid seqs as weight 0
    // Copied from plm.OutputParametersFull()
    int skipix = 0, reducedix = 0;
    for (int i = 0; i < (ali->nSeqs + ali->nSkippedSeqs); i++) {
        if (skipix < ali->nSkippedSeqs && i == ali->skippedSeqs[skipix]) {
            /* Placeholder line for a skipped sequence */
            fprintf(fpOutput, "0.0\n");
            skipix++;
        } else {
            numeric_t w = ali->weights[reducedix];
            fprintf(fpOutput, weights_fmt, w);
            reducedix++;
        }
    }

    /* Buffered output may only fail when flushed, so check both the stream
       error flag and the result of fclose() before reporting success */
    int writeFailed = ferror(fpOutput);
    if (fclose(fpOutput) != 0) writeFailed = 1;
    if (writeFailed) {
        fprintf(stderr, "Error: could not write weights file %s\n", weightsFile);
        exit(1);
    }
}