/* TODO:
 * +  prune all unused code
 * + profile
 */
/*
 * fec.c -- forward error correction based on Vandermonde matrices
 * 980624
 * (C) 1997-98 Luigi Rizzo (luigi@iet.unipi.it)
 *
 * Portions derived from code by Phil Karn (karn@ka9q.ampr.org),
 * Robert Morelos-Zaragoza (robert@spectra.eng.hawaii.edu) and Hari
 * Thirumoorthy (harit@spectra.eng.hawaii.edu), Aug 1995
 *
 * Modifications by Dan Rubenstein (see Modifications.txt for 
 * their description.
 * Modifications (C) 1998 Dan Rubenstein (drubenst@cs.umass.edu)
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above
 *    copyright notice, this list of conditions and the following
 *    disclaimer in the documentation and/or other materials
 *    provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
 * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
 * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
 * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
 * OF SUCH DAMAGE.
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>

#include "fec.h"


/*
 * If you get an error returned (negative value) from a fec_* function,
 * look in here for the error message.
 *
 * fec_error holds the most recent message written by ERR().  The buffer is
 * one byte longer than the FEC_ERROR_SIZE limit handed to snprintf(), and
 * fec_new() stores a '\0' in that final byte.
 */

#define FEC_ERROR_SIZE 1025
char fec_error[FEC_ERROR_SIZE+1];

/* ERR(fmt, ...) formats a printf-style message into fec_error. */
#define ERR(...) (snprintf(fec_error, FEC_ERROR_SIZE, __VA_ARGS__))

/*
 * Primitive polynomial - see Lin & Costello, Appendix A,
 * and  Lee & Messerschmitt, p. 453.
 *
 * Pp[i] is the coefficient of x^i written as an ASCII digit, so the string
 * below reads 1 + x^2 + x^3 + x^4 + x^8.  generate_gf() scans Pp[0..7] to
 * build the polynomial representation of \alpha ** 8.
 */
static const char*const Pp="101110001";


/*
 * To speed up computations, we have tables for logarithm, exponent and
 * inverse of a number.  We use a table for multiplication as well (it takes
 * 64K, no big deal even on a PDA, especially because it can be
 * pre-initialized and put into a ROM!), otherwise we use a table of
 * logarithms. In any case the macro gf_mul(x,y) takes care of
 * multiplications.
 *
 * The three tables below are filled in by generate_gf().  gf_exp[] has 510
 * entries because the 255 powers of \alpha are stored twice, back to back.
 */

static gf gf_exp[510];  /* index->poly form conversion table    */
static int gf_log[256]; /* Poly->index form conversion table    */
static gf inverse[256]; /* inverse of field elem.               */
                                /* inverse[\alpha**i]=\alpha**(255-i)   */

/*
 * modnn(x) reduces x modulo 255 (2**8 - 1, the number of non-zero field
 * elements) without a slow divide.  It relies on 256 being congruent to 1
 * modulo 255, so the high and low bytes can simply be added together.
 * Arguments already below 255 are returned unchanged (converted to gf).
 */
static inline gf
modnn(int x) {
    for (;;) {
        if (x < 255)
            return x;
        x -= 255;
        x = (x >> 8) + (x & 255);
    }
}

/*
 * SWAP(a, b, t) exchanges two lvalues of type t.
 *
 * The body is wrapped in do { } while (0) so that "SWAP(x, y, T);" is one
 * ordinary statement and may be used as the unbraced branch of an if/else.
 * Both a and b are evaluated twice, so they must be free of side effects.
 */
#define SWAP(a,b,t) do { t swap_tmp_ = (a); (a) = (b); (b) = swap_tmp_; } while (0)

/*
 * gf_mul(x,y) multiplies two field elements by looking the product up in
 * gf_mul_table, which init_mul_table() fills in.
 *
 * USE_GF_MULC, GF_MULC0(c) and GF_ADDMULC(dst, x) can be used when
 * multiplying many numbers by the same constant:
 *   USE_GF_MULC;         declares a local pointer to one table row
 *   GF_MULC0(c);         points it at the row of products with c
 *   GF_ADDMULC(dst, x);  performs dst ^= c * x through that row
 * See usage in addmul1().
 *
 * The local pointer uses a name without a double underscore, because
 * identifiers containing "__" are reserved for the implementation.
 */
static gf gf_mul_table[256][256];

#define gf_mul(x,y) (gf_mul_table[(x)][(y)])

#define USE_GF_MULC register gf * gf_mulc_row_
#define GF_MULC0(c) (gf_mulc_row_ = gf_mul_table[(c)])
#define GF_ADDMULC(dst, x) ((dst) ^= gf_mulc_row_[(x)])

/*
 * Lookup tables for the field, built by generate_gf() from the primitive
 * polynomial Pp:
 *     index->polynomial form          gf_exp[i] contains j = \alpha^i
 *     polynomial form -> index form   gf_log[ j = \alpha^i ] = i
 * \alpha=x is the primitive element of the field.
 *
 * gf_exp[] has size 2*255, so the sum of the logarithms of two non-zero
 * elements is always a valid index into it.
 *
 * init_mul_table() uses those tables to fill gf_mul_table[a][b] with the
 * product of a and b for every pair of field elements.  It has to run after
 * generate_gf().  Any product involving 0 is 0; those entries are written
 * directly because gf_log[0] is only a placeholder value.
 */
static void
init_mul_table () {
    int a, b;

    for (a = 0; a < 256; a++) {
        for (b = 0; b < 256; b++) {
            if (a == 0 || b == 0)
                gf_mul_table[a][b] = 0;
            else
                gf_mul_table[a][b] = gf_exp[modnn (gf_log[a] + gf_log[b])];
        }
    }
}

/*
 * i use malloc so many times, it is easier to put checks all in
 * one place.
 */
static void *
my_malloc (int sz, char *err_string) {
    void *p = malloc (sz);
    if (p == NULL) {
        ERR("Malloc failure allocating %s\n", err_string);
        exit (1);
    }
    return p;
}

/*
 * NEW_GF_MATRIX(rows, cols) allocates an uninitialized rows x cols matrix
 * of gf through my_malloc() and yields a gf pointer to it.
 *
 * rows and cols are parenthesized so that expressions such as "n - k" can
 * be passed safely.  The description handed to my_malloc() contains the
 * source line of the allocation; __LINE__ has to go through two macro
 * levels to be expanded before it is turned into a string.
 */
#define FEC_STRINGIFY_(x) #x
#define FEC_STRINGIFY(x) FEC_STRINGIFY_(x)
#define NEW_GF_MATRIX(rows, cols) \
    (gf *)my_malloc((rows) * (cols) * sizeof(gf), \
                    "gf matrix (line " FEC_STRINGIFY(__LINE__) ")")

/*
 * generate_gf() fills the tables used for arithmetic in the field:
 *   gf_exp[i]  = \alpha ** i (polynomial representation), stored twice
 *   gf_log[x]  = i such that \alpha ** i == x
 *   inverse[x] = multiplicative inverse of x
 */
static void
generate_gf (void) {
    int n;
    gf bit = 1;                   /* x ** 0 = 1 */

    /*
     * The first 8 powers of \alpha are single bits shifted to the left.
     * While walking them, accumulate \alpha ** 8 from the primitive
     * polynomial: a '1' in Pp[n] means \alpha ** n is part of it.
     */
    gf_exp[8] = 0;
    for (n = 0; n < 8; n++) {
        gf_exp[n] = bit;
        gf_log[gf_exp[n]] = n;
        if (Pp[n] == '1')
            gf_exp[8] ^= bit;
        bit <<= 1;
    }
    /* \alpha ** 8 is complete now, so its logarithm can be recorded */
    gf_log[gf_exp[8]] = 8;

    /*
     * Each later power is the previous one multiplied by \alpha: shift
     * left by one bit and, if the top bit was set, clear it before the
     * shift and fold in \alpha ** 8 instead.
     */
    bit = 1 << 7;
    for (n = 9; n < 255; n++) {
        if (gf_exp[n - 1] < bit)
            gf_exp[n] = gf_exp[n - 1] << 1;
        else
            gf_exp[n] = gf_exp[8] ^ ((gf_exp[n - 1] ^ bit) << 1);
        gf_log[gf_exp[n]] = n;
    }

    /* log(0) is not defined, so use a special value */
    gf_log[0] = 255;

    /* second copy of the powers, used for fast multiply */
    for (n = 255; n < 510; n++)
        gf_exp[n] = gf_exp[n - 255];

    /*
     * Inverses.  0 has no inverse; the slot is set to 0 and nobody is
     * supposed to read it.  1 is its own inverse, and every other x has
     * inverse \alpha ** (255 - log x).
     */
    inverse[0] = 0;
    inverse[1] = 1;
    for (n = 2; n <= 255; n++)
        inverse[n] = gf_exp[255 - gf_log[n]];
}

/*
 * Various linear algebra operations that i use often.
 */

/*
 * addmul() computes dst[] = dst[] + c * src[]
 * This is used often, so better optimize it! Currently the loop is
 * unrolled 16 times, a good value for 486 and pentium-class machines.
 * The case c=0 is also optimized (nothing is done), whereas c=1 is not.
 * Those calls are infrequent in typical applications.
 *
 * The expansion is wrapped in do { } while (0) so that a following "else"
 * cannot attach to the macro's internal "if".  c is evaluated twice and
 * must therefore have no side effects.
 */
#define addmul(dst, src, c, sz)                 \
    do { if ((c) != 0) addmul1((dst), (src), (c), (sz)); } while (0)

#define UNROLL 16               /* 1, 4, 8, 16 */
/*
 * addmul1() -- dst1[i] ^= c * src1[i] for every i in [0, sz).
 *
 * dst1  destination buffer of at least sz elements, updated in place
 * src1  source buffer of at least sz elements
 * c     constant multiplier; callers go through the addmul() macro, which
 *       skips the call altogether when c is 0
 * sz    number of elements to process
 *
 * Progress is tracked with a count of remaining elements instead of a
 * limit pointer, so no pointer outside the buffers is ever formed, however
 * small sz is.  sz is a size_t so that the lengths handed in by the
 * encode/decode entry points are not narrowed.
 */
static void
addmul1 (gf * dst1, const gf * src1, gf c, size_t sz) {
    USE_GF_MULC;
    register gf *dst = dst1;
    register const gf *src = src1;
    size_t left = sz;             /* elements still to be processed */

    GF_MULC0 (c);

#if (UNROLL > 1)                /* unrolling by 8/16 is quite effective on the pentium */
    for (; left >= UNROLL; left -= UNROLL, dst += UNROLL, src += UNROLL) {
        GF_ADDMULC (dst[0], src[0]);
        GF_ADDMULC (dst[1], src[1]);
        GF_ADDMULC (dst[2], src[2]);
        GF_ADDMULC (dst[3], src[3]);
#if (UNROLL > 4)
        GF_ADDMULC (dst[4], src[4]);
        GF_ADDMULC (dst[5], src[5]);
        GF_ADDMULC (dst[6], src[6]);
        GF_ADDMULC (dst[7], src[7]);
#endif
#if (UNROLL > 8)
        GF_ADDMULC (dst[8], src[8]);
        GF_ADDMULC (dst[9], src[9]);
        GF_ADDMULC (dst[10], src[10]);
        GF_ADDMULC (dst[11], src[11]);
        GF_ADDMULC (dst[12], src[12]);
        GF_ADDMULC (dst[13], src[13]);
        GF_ADDMULC (dst[14], src[14]);
        GF_ADDMULC (dst[15], src[15]);
#endif
    }
#endif
    for (; left > 0; left--, dst++, src++)       /* final components */
        GF_ADDMULC (*dst, *src);
}

/*
 * matmul() computes C = AB over the field.
 *
 * a  n*k matrix, row-major
 * b  k*m matrix, row-major
 * c  n*m result matrix, row-major; every entry is overwritten
 *
 * Addition in the field is XOR, so each entry of C is the XOR of the k
 * products of a row of A with a column of B.
 */
static void
matmul (gf * a, gf * b, gf * c, int n, int k, int m) {
    int r, col, t;

    for (r = 0; r < n; r++) {
        const gf *a_row = a + r * k;
        gf *c_row = c + r * m;

        for (col = 0; col < m; col++) {
            gf sum = 0;

            for (t = 0; t < k; t++)
                sum ^= gf_mul (a_row[t], b[t * m + col]);
            c_row[col] = sum;
        }
    }
}

/*
 * invert_mat() takes a matrix and produces its inverse, in place.
 * (Gauss-Jordan, adapted from Numerical Recipes in C)
 *
 * src  row-major k*k matrix; overwritten with its inverse.  If the
 *      inversion fails, src is left partially reduced.
 * k    the size of the matrix.
 *
 * Returns 0 on success and -1 (with a message in fec_error) if the matrix
 * is singular or the bookkeeping is found to be inconsistent.
 */
static int
invert_mat (gf * src, int k) {
    gf c, *p;
    int irow, icol, row, col, i, ix;

    int error = -1;
    /* indxr[col] / indxc[col] remember the pivot position chosen at step col */
    int *indxc = (int *) my_malloc (k * sizeof (int), "indxc");
    int *indxr = (int *) my_malloc (k * sizeof (int), "indxr");
    int *ipiv = (int *) my_malloc (k * sizeof (int), "ipiv");
    /* all zeros except for a temporary 1; compared against the pivot row */
    gf *id_row = NEW_GF_MATRIX (1, k);

    memset (id_row, '\0', k * sizeof (gf));
    /*
     * ipiv marks elements already used as pivots.
     */
    for (i = 0; i < k; i++)
        ipiv[i] = 0;

    for (col = 0; col < k; col++) {
        gf *pivot_row;
        /*
         * Zeroing column 'col', look for a non-zero element.
         * First try on the diagonal, if it fails, look elsewhere.
         */
        irow = icol = -1;
        if (ipiv[col] != 1 && src[col * k + col] != 0) {
            irow = col;
            icol = col;
            goto found_piv;
        }
        for (row = 0; row < k; row++) {
            if (ipiv[row] != 1) {
                for (ix = 0; ix < k; ix++) {
                    if (ipiv[ix] == 0) {
                        if (src[row * k + ix] != 0) {
                            irow = row;
                            icol = ix;
                            goto found_piv;
                        }
                    } else if (ipiv[ix] > 1) {
                        /* a column was picked as pivot more than once */
                        ERR("singular matrix");
                        goto fail;
                    }
                }
            }
        }
        if (icol == -1) {
          ERR("Pivot not found!");
          goto fail;
        }
      found_piv:
        ++(ipiv[icol]);
        /*
         * swap rows irow and icol, so afterwards the diagonal
         * element will be correct. Rarely done, not worth
         * optimizing.
         */
        if (irow != icol) {
            for (ix = 0; ix < k; ix++) {
                SWAP (src[irow * k + ix], src[icol * k + ix], gf);
            }
        }
        indxr[col] = irow;
        indxc[col] = icol;
        pivot_row = &src[icol * k];
        c = pivot_row[icol];
        if (c == 0) {
            ERR("singular matrix 2");
            goto fail;
        }
        if (c != 1) {                       /* otherwise this is a NOP */
            /*
             * Scale the pivot row by the inverse of the pivot.  The pivot
             * entry is set to 1 first, so after the scaling it holds the
             * inverse of the pivot; this is what turns src into its own
             * inverse in place.
             */
            c = inverse[c];
            pivot_row[icol] = 1;
            for (ix = 0; ix < k; ix++)
                pivot_row[ix] = gf_mul (c, pivot_row[ix]);
        }
        /*
         * From all other rows, remove multiples of the pivot row.  The
         * entry in column icol is cleared before the addmul, so afterwards
         * it holds c times the pivot row's own entry in that column.
         * If the pivot row equals the unit vector with a 1 in column icol,
         * this whole step would leave every row unchanged, so it is
         * skipped.
         */
        id_row[icol] = 1;
        if (memcmp (pivot_row, id_row, k * sizeof (gf)) != 0) {
            for (p = src, ix = 0; ix < k; ix++, p += k) {
                if (ix != icol) {
                    c = p[icol];
                    p[icol] = 0;
                    addmul (p, pivot_row, c, k);
                }
            }
        }
        id_row[icol] = 0;
    }                           /* done all columns */
    /*
     * Undo the row swaps, in reverse order, by swapping the matching
     * columns.
     */
    for (col = k - 1; col >= 0; col--) {
        if (indxr[col] < 0 || indxr[col] >= k) {
            ERR("AARGH, indxr[col] %d\n", indxr[col]);
            goto fail;
        } else if (indxc[col] < 0 || indxc[col] >= k) {
            ERR("AARGH, indxc[col] %d\n", indxc[col]);
            goto fail;
        } else if (indxr[col] != indxc[col]) {
            for (row = 0; row < k; row++) {
                SWAP (src[row * k + indxr[col]], src[row * k + indxc[col]], gf);
            }
        }
    }
    error = 0;
  fail:
    free (indxc);
    free (indxr);
    free (ipiv);
    free (id_row);
    return error;
}

/*
 * Fast in-place inversion of a k*k Vandermonde matrix.
 *
 * NOTE: the matrix is assumed to be non-singular and to really _BE_ a
 * Vandermonde matrix.  Only its second column, which holds the p_i's, is
 * read; the rest of the input is ignored and then overwritten.
 *
 * Algorithm borrowed from "Numerical recipes in C" -- sec.2.8, but largely
 * revised.
 *
 * src  row-major k*k matrix, replaced by its inverse
 * k    matrix size
 *
 * Always returns 0.
 */
int
invert_vdm (gf * src, int k) {
    gf *coef, *quot, *roots;
    int idx, deg, r;

    if (k == 1)                   /* degenerate case, matrix must be p^0 = 1 */
        return 0;

    /*
     * coef  holds the coefficients of P(x) = Prod (x - p_i), i=0..k-1
     * quot  holds the coefficients used for the matrix inversion
     * roots holds a copy of the p_i's, taken from column 1 of src
     */
    coef = NEW_GF_MATRIX (1, k);
    quot = NEW_GF_MATRIX (1, k);
    roots = NEW_GF_MATRIX (1, k);

    for (idx = 0; idx < k; idx++) {
        coef[idx] = 0;
        roots[idx] = src[idx * k + 1];
    }

    /*
     * Build the coefficients recursively.  The leading coefficient
     * coef[k] = 1 is implicit.  Start with P_0 = x - p_0, then multiply by
     * x - p_i at each stage: P_i = x P_{i-1} - p_i P_{i-1}.
     * Subtraction is the same as addition (XOR) in this field.
     */
    coef[k - 1] = roots[0];
    for (idx = 1; idx < k; idx++) {
        gf root = roots[idx];

        for (deg = k - idx; deg < k - 1; deg++)
            coef[deg] ^= gf_mul (root, coef[deg + 1]);
        coef[k - 1] ^= root;
    }

    for (r = 0; r < k; r++) {
        /*
         * synthetic division of P(x) by (x - p_r); denom accumulates the
         * value of the quotient polynomial at p_r.
         */
        gf x = roots[r];
        gf denom = 1;
        gf scale;

        quot[k - 1] = 1;          /* this is in fact coef[k] */
        for (deg = k - 2; deg >= 0; deg--) {
            quot[deg] = coef[deg + 1] ^ gf_mul (x, quot[deg + 1]);
            denom = gf_mul (x, denom) ^ quot[deg];
        }

        scale = inverse[denom];
        for (idx = 0; idx < k; idx++)
            src[idx * k + r] = gf_mul (scale, quot[idx]);
    }

    free (coef);
    free (quot);
    free (roots);
    return 0;
}

/* Zero until init_fec() has built the lookup tables, 1 afterwards. */
static int fec_initialized;

/*
 * init_fec() builds every lookup table of this file.  generate_gf() must
 * come first because init_mul_table() reads the tables it produces.
 */
static void
init_fec (void) {
    generate_gf ();
    init_mul_table ();

    fec_initialized = 1;
}

/*
 * This section contains the proper FEC encoding/decoding routines.
 * The encoding matrix is computed starting with a Vandermonde matrix,
 * and then transforming it into a systematic matrix.
 */

/*
 * Constant that fec_new() XORs with k, n and the address of the encoding
 * matrix to form the descriptor's magic field; fec_free() recomputes the
 * value to check the descriptor before releasing it.
 */
#define FEC_MAGIC	0xFECC0DEC

/*
 * fec_free() releases a descriptor created by fec_new(), together with its
 * encoding matrix.
 *
 * The descriptor is only freed when it is non-NULL and its magic field
 * matches the value recomputed from k, n and the matrix address.  Otherwise
 * nothing is freed and a message is left in fec_error.
 */
void
fec_free (fec_t *p) {
    if (p != NULL &&
        p->magic == (((FEC_MAGIC ^ p->k) ^ p->n) ^ (unsigned long) (p->enc_matrix))) {
        free (p->enc_matrix);
        free (p);
        return;
    }
    ERR("bad parameters to fec_free");
}

/*
 * create a new encoder, returning a descriptor. This contains k,n and
 * the encoding matrix.
 *
 * k  number of rows of the identity part of the matrix; must be >= 1
 * n  total number of rows of the encoding matrix; must be >= k
 *
 * The encoding matrix has n rows of k entries.  Its top k rows are the
 * identity matrix; the remaining n-k rows are the matching rows of a
 * Vandermonde matrix multiplied by the inverse of that matrix's top k*k
 * part.
 *
 * Returns the new descriptor, to be released with fec_free().  If k or n
 * is out of range, NULL is returned and a message is left in fec_error.
 * Allocation failures terminate the process inside my_malloc().
 */
fec_t *
fec_new (unsigned char k, unsigned char n) {
    unsigned char row, col;
    gf *p, *tmp_m;

    fec_t *retval;

    fec_error[FEC_ERROR_SIZE] = '\0';

    /*
     * k == 0 or k > n would make the code below write outside the
     * matrices, so reject those values up front.
     */
    if (k < 1 || k > n) {
        ERR("fec_new: invalid parameters k=%d n=%d (need 1 <= k <= n)", k, n);
        return NULL;
    }

    if (fec_initialized == 0)
        init_fec ();

    retval = (fec_t *) my_malloc (sizeof (fec_t), "new_code");
    retval->k = k;
    retval->n = n;
    retval->enc_matrix = NEW_GF_MATRIX (n, k);
    retval->magic = ((FEC_MAGIC ^ k) ^ n) ^ (unsigned long) (retval->enc_matrix);
    tmp_m = NEW_GF_MATRIX (n, k);
    /*
     * fill the matrix with powers of field elements, starting from 0.
     * The first row is special, cannot be computed with exp. table.
     */
    tmp_m[0] = 1;
    for (col = 1; col < k; col++)
        tmp_m[col] = 0;
    for (p = tmp_m + k, row = 0; row < n - 1; row++, p += k) {
        for (col = 0; col < k; col++)
            p[col] = gf_exp[modnn (row * col)];
    }

    /*
     * quick code to build systematic matrix: invert the top
     * k*k vandermonde matrix, multiply right the bottom n-k rows
     * by the inverse, and construct the identity matrix at the top.
     */
    invert_vdm (tmp_m, k);        /* much faster than invert_mat */
    matmul (tmp_m + k * k, tmp_m, retval->enc_matrix + k * k, n - k, k, k);
    /*
     * the upper matrix is I so do not bother with a slow multiply.
     * The diagonal is addressed by index, so no pointer is advanced past
     * the end of the matrix (which matters when n == k).
     */
    memset (retval->enc_matrix, '\0', k * k * sizeof (gf));
    for (col = 0; col < k; col++)
        retval->enc_matrix[col * (k + 1)] = 1;
    free (tmp_m);

    return retval;
}

/*
 * fec_encode_all() produces the check blocks named in share_ids.
 *
 * code           descriptor from fec_new()
 * src            the code->k input blocks, each sz bytes long
 * fecs           output buffers of sz bytes each; they are filled in order,
 *                one per entry of share_ids that is >= code->k
 * share_ids      ids of the requested blocks; ids below code->k are skipped
 *                and consume no output buffer
 * num_share_ids  number of entries in share_ids
 * sz             size of every block in bytes
 *
 * Each output is the combination of the inputs given by the row of the
 * encoding matrix selected by the block id.
 */
void
fec_encode_all(const fec_t* code, const gf*restrict const*restrict const src, gf*restrict const*restrict const fecs, const unsigned char*restrict const share_ids, unsigned char num_share_ids, size_t sz) {
    unsigned out = 0;             /* next free slot in fecs */
    unsigned s, j;

    for (s = 0; s < num_share_ids; s++) {
        const unsigned char id = share_ids[s];
        gf* coeffs;

        if (id < code->k)
            continue;

        memset(fecs[out], 0, sz);
        coeffs = &(code->enc_matrix[id * code->k]);
        for (j = 0; j < code->k; j++) {
            addmul (fecs[out], src[j], coeffs[j], sz);
        }
        out++;
    }
}

#if 0
/*
 * Disabled alternative implementation of fec_encode_all().
 *
 * By turning the nested loop inside out, we might incur different cache
 * usage and therefore go slower or faster.  However in practice I'm not
 * able to detect a difference, since >90% of the time is spent in my
 * Python test script anyway.  :-)
 */
void
fec_encode_all(const fec_t* code, const gf*restrict const*restrict const src, gf*restrict const*restrict const fecs, const unsigned char*restrict const share_ids, unsigned char num_share_ids, size_t sz) {
    for (unsigned j=0; j < code->k; j++) {
        unsigned fecs_ix = 0; /* index into the fecs array */
        for (unsigned i=0; i<num_share_ids; i++) {
            unsigned char fecnum=share_ids[i];
            if (fecnum >= code->k) {
                if (j == 0)
                    memset(fecs[fecs_ix], 0, sz);
                gf* p = &(code->enc_matrix[fecnum * code->k]);
                addmul (fecs[fecs_ix], src[j], p[j], sz);
                fecs_ix++;
            }
        }
    }
}
#endif

/**
 * Build decode matrix into some memory space.
 *
 * Row i of the matrix is taken from the block id index[i]: an id below k
 * gives a row that is all zero except for a 1 in column i; any other id
 * gives a copy of the matching row of the encoding matrix.  The assembled
 * matrix is then inverted in place.
 *
 * The byte counts passed to memset/memcpy are scaled by sizeof(gf), like
 * the other matrix operations in this file.
 *
 * This function has no way of reporting a failed inversion: when
 * invert_mat() fails, the only trace is the message it leaves in fec_error,
 * and the contents of matrix must not be used.
 *
 * @param code the descriptor that supplies the encoding matrix
 * @param index the block id for each of the k rows
 * @param k the size of the matrix
 * @param matrix a space allocated for a k by k matrix
 */
void
build_decode_matrix_into_space(const fec_t*restrict const code, const unsigned char*const restrict index, const unsigned char k, gf*restrict const matrix) {
    unsigned i;
    gf* p;
    for (i=0, p=matrix; i < k; i++, p += k) {
        if (index[i] < k) {
            memset(p, 0, k * sizeof (gf));
            p[i] = 1;
        } else {
            memcpy(p, &(code->enc_matrix[index[i] * code->k]), k * sizeof (gf));
        }
    }
    (void) invert_mat (matrix, k);
}

/*
 * fec_decode_all() rebuilds blocks from the code->k received ones.
 *
 * code     descriptor from fec_new()
 * inpkts   the code->k received blocks, each sz bytes long
 * outpkts  output buffers of sz bytes each; one is filled, in order, for
 *          every position whose index[] entry is >= code->k
 * index    block id of each entry of inpkts
 * sz       size of every block in bytes
 *
 * The decoding matrix lives in a variable-length array on the stack and is
 * filled by build_decode_matrix_into_space().  For every position that
 * holds a block with id >= code->k, the matching row of that matrix is
 * applied to the received blocks and the result is written to the next
 * output buffer.
 */
void
fec_decode_all(const fec_t* code, const gf*restrict const*restrict const inpkts, gf*restrict const*restrict const outpkts, const unsigned char*restrict const index, size_t sz) {
    gf m_dec[code->k * code->k];
    unsigned out = 0;             /* next free slot in outpkts */
    unsigned r, col;

    build_decode_matrix_into_space(code, index, code->k, m_dec);

    for (r = 0; r < code->k; r++) {
        const gf *coeffs;

        if (index[r] < code->k)
            continue;

        coeffs = &m_dec[r * code->k];
        memset(outpkts[out], 0, sz);
        for (col = 0; col < code->k; col++) {
            addmul(outpkts[out], inpkts[col], coeffs[col], sz);
        }
        out++;
    }
}