#include <stdlib.h>
#include <stdio.h>
#include <string.h>

//Forward transform of blocksize bytes. block needs room for 2*blocksize bytes; the result is written to outblock.
//Returns the position of the unrotated input among the sorted rotations, or 0xFFFFFFFF on allocation failure.
unsigned int BWT(unsigned char *block, unsigned char *outblock, unsigned int blocksize);
//Sorts num_strings string pointers by partitioning on one bit at a time, starting from bit index 'bit'.
//tempbuffer is scratch space for pointers; level is the current recursion depth.
void recursive_sort(unsigned char **strings, unsigned int string_length, unsigned int num_strings, unsigned long long bit, unsigned char **tempbuffer, unsigned int level);
//Returns 1 if every string holds the same byte at index start, otherwise 0.
unsigned int skipsymbols(unsigned char **strings, unsigned int numstrings, unsigned int start);
//Inverse transform: rebuilds blocksize bytes into out from the transformed data in 'in' and the offset returned by BWT.
void iBWT(unsigned char *in, unsigned char *out, unsigned int blocksize, unsigned int offset);
//Fallback comparison sort of the string pointers, comparing bytes start_at..string_length-1 with memcmp.
void bubbleSort(unsigned char **strings, unsigned int string_length, unsigned int num_strings, unsigned int start_at);

//Parses one 8-character hexadecimal header field (not NUL-terminated).
//Returns 1 and stores the value on success, 0 if the field is not exactly 8 hex digits.
static int parse_hex_field(const char *field, unsigned int *value){
  char digits[9];
  memcpy(digits, field, 8);
  digits[8]=0;
  if(strspn(digits, "0123456789abcdefABCDEF")!=8)
    return(0);
  //strtoul rather than strtol: 8 hex digits do not fit a 32-bit signed long.
  *value=(unsigned int)strtoul(digits, NULL, 16);
  return(1);
}

//Decodes a stream of blocks from stdin to stdout.
//Each block is a 16-character header (8 hex digits of block size, 8 hex digits of offset) followed by the transformed bytes.
//Returns 0 on success, -1 on malformed input, I/O error or allocation failure.
static int decode_stream(void){
  size_t capacity=256;
  unsigned char *enc=malloc(capacity);
  unsigned char *dec=malloc(capacity);
  char header[16];
  int status=0;
  if(!enc || !dec){
    fprintf(stderr, "Unable to allocate memory.\n");
    status=-1;
  }
  //A missing or short header simply ends the stream, so empty input decodes to empty output.
  while(status==0 && fread(header, 1, sizeof header, stdin)==sizeof header){
    unsigned int this_blocksize,offset;
    if(!parse_hex_field(header, &this_blocksize) || !parse_hex_field(header+8, &offset)){
      fprintf(stderr, "Malformed block header.\n");
      status=-1;
      break;
    }
    if(this_blocksize==0)
      continue; //Empty block: no payload, and its offset is meaningless.
    if(offset>=this_blocksize){ //Would index outside the block during the inverse transform.
      fprintf(stderr, "Invalid block offset.\n");
      status=-1;
      break;
    }
    if(this_blocksize>capacity){
      free(enc);free(dec);
      enc=malloc(this_blocksize);dec=malloc(this_blocksize);
      if(!enc || !dec){
        fprintf(stderr, "Unable to allocate memory.\n");
        status=-1;
        break;
      }
      capacity=this_blocksize;
    }
    if(fread(enc, 1, this_blocksize, stdin)!=this_blocksize){
      fprintf(stderr, "Truncated block.\n");
      status=-1;
      break;
    }
    iBWT(enc, dec, this_blocksize, offset);
    if(fwrite(dec, 1, this_blocksize, stdout)!=this_blocksize){
      fprintf(stderr, "Error writing output.\n");
      status=-1;
      break;
    }
  }
  free(enc);
  free(dec);
  return(status);
}

//Encodes stdin to stdout in blocks of at most max_blocksize bytes.
//Returns 0 on success, -1 on allocation failure, BWT failure or write error.
static int encode_stream(unsigned int max_blocksize){
  //BWT needs the block followed by an equally sized scratch area; make sure the doubled size fits in size_t.
  if(max_blocksize>((size_t)-1)/2){
    fprintf(stderr, "Block size too large.\n");
    return(-1);
  }
  unsigned char *block=malloc((size_t)max_blocksize*2);
  if(!block){
    fprintf(stderr, "Unable to allocate memory.\n");
    return(-1);
  }
  int status=0;
  for(;;){
    unsigned int this_blocksize=(unsigned int)fread(block, 1, max_blocksize, stdin);
    if(this_blocksize==0)
      break; //End of input (or read error): never hand an empty block to BWT.
    unsigned int offset=BWT(block, block, this_blocksize);
    if(offset==0xFFFFFFFF){
      fprintf(stderr, "Error in BWT, probably unable to allocate memory.\n");
      status=-1;
      break;
    }
    printf("%08X%08X", this_blocksize, offset); //Hex encoding used so that if the input is all printable, so will be the output.
    fwrite(block, 1, this_blocksize, stdout);
    fflush(stdout);
    if(ferror(stdout)){
      fprintf(stderr, "Error writing output.\n");
      status=-1;
      break;
    }
    if(this_blocksize<max_blocksize)
      break; //A short read means the input is exhausted.
  }
  free(block);
  return(status);
}

//Usage: [blocksize] [-d]
//A positive decimal argument sets the encoder block size (default 1 MiB); "-d" selects decoding.
//Any other argument, including a non-positive number, is ignored.
int main(int argc, char *argv[]){
  unsigned int max_blocksize=1024*1024;
  int n,inverse=0;
  for(n=1;n<argc;n++){
    if(!strcmp(argv[n],"-d")){
      inverse=1;
      continue;
    }
    long requested=strtol(argv[n], NULL, 10);
    if(requested>0 && (unsigned long)requested<=0xFFFFFFFFUL)
      max_blocksize=(unsigned int)requested;
  }
  if(inverse)
    return(decode_stream());
  return(encode_stream(max_blocksize));
}

unsigned int BWT(unsigned char *block, unsigned char *outblock, unsigned int blocksize){
  //Carries out the BWT. Requires:
  //block must be a buffer of 2*blocksize bytes. The first half contains the source data. The second half is used as a scratch space.
  //outblock must be a buffer of blocksize bytes. It may be equal to block, in which case the first blocksize bytes of input will be overwritten.
  //Temporary memory will be malloced. Rather a lot of it: 8*blocksize on a 32 bit binary, 16*blocksize on a 64-bit binary.
  //Arrays of pointers, thus why bigger on 64-bit.
  //Returns the index of the unrotated input among the sorted rotations (0 for an empty block).
  //Returns -1 (ie, all-ones, as it's unsigned) on error. Error probably being insufficient memory to malloc.
  if(blocksize==0)
    return(0); //Nothing to sort; also avoids treating a NULL from malloc(0) as an error.
  if(blocksize>((size_t)-1)/sizeof(unsigned char*))
    return(0xFFFFFFFF); //The pointer arrays would not fit in size_t.
  unsigned char **offsets=malloc(sizeof *offsets*blocksize);
  if(!offsets)
    return(0xFFFFFFFF);
  unsigned char **scratchbuffer=malloc(sizeof *scratchbuffer*blocksize);
  if(!scratchbuffer){
    free(offsets);
    return(0xFFFFFFFF);
  }
  //Duplicate the data so that every rotation is a contiguous run of blocksize bytes starting at block+n.
  memcpy(block+blocksize, block, blocksize);
  unsigned int n;
  for(n=0; n<blocksize; n++)
    offsets[n]=block+n;
  recursive_sort(offsets, blocksize, blocksize, 0, scratchbuffer, 0);
  unsigned int origin=0;
  for(n=0; n<blocksize; n++){
    unsigned char *t=offsets[n];
    if(t==block)
      origin=n; //Exactly one rotation starts at the beginning of the input.
    //Last byte of this rotation. Only the rotation starting at block reads from the first half (block[blocksize-1]),
    //and when outblock==block that byte is not overwritten before the final iteration, so in-place output is safe.
    outblock[n]=t[blocksize-1];
  }

  free(offsets);
  free(scratchbuffer);
  return(origin);
}

void recursive_sort(unsigned char **strings, unsigned int string_length, unsigned int num_strings, unsigned long long bit, unsigned char **tempbuffer, unsigned int level){
  //Binary radix sort, most significant bit first.
  //strings: num_strings pointers, each to at least string_length readable bytes. Sorted in place.
  //bit: index of the bit to partition on (bit>>3 is the byte, the high bit of each byte comes first).
  //tempbuffer: scratch space for at least num_strings pointers.
  //level: current recursion depth, used to bail out to bubbleSort before the stack gets too deep.
  if(num_strings<=1)
    return;
  unsigned int byte=bit>>3;
  unsigned char sub_bit=128>>(bit&0x07);
  if(sub_bit==128){ //Partial mitigation of the  worst case.
    //At a byte boundary, skip every byte that all strings share. The bounds check comes first so no byte past string_length is read.
    unsigned int first_byte=byte;
    while((byte<string_length) && skipsymbols(strings, num_strings, byte))
      byte++;
    //Carry the skip over into the bit index, otherwise the recursion below would walk through all the skipped bits again.
    if(byte!=first_byte)
      bit=(unsigned long long)byte<<3;
  }
  if((num_strings<=6) || level>32000){ //Too deep! Might run out of stack. Also, on tiny lists, recursion overhead probably less than bubble inefficiency.
    bubbleSort(strings, string_length, num_strings, byte);
    return;
  }
  if(byte>=string_length)
    return; //Every byte has been examined: the remaining strings are identical.

  unsigned int pos=0,zeros=0,ones=0,n;
  //Collect the strings with the bit set, in order, into the scratch buffer...
  for(n=0;n<num_strings;n++){
    unsigned char *t=strings[n];
    if((t[byte]&sub_bit)){
      tempbuffer[pos++]=strings[n];
      ones++;
    }
  }
  //...compact the strings with the bit clear to the front, keeping their order...
  pos=0;
  for(n=0;n<num_strings;n++){
    unsigned char *t=strings[n];
    if(!(t[byte]&sub_bit)){
      strings[pos++]=strings[n];
      zeros++;
    }
  }
  //...and append the set-bit strings after them.
  for(n=0;n<ones;n++)
    strings[pos++]=tempbuffer[n];

  //Sort each half on the next bit. The scratch buffer is free again at this point, so both calls can share it.
  bit++;
  level++;
  recursive_sort(strings, string_length, zeros, bit, tempbuffer, level);
  recursive_sort(&strings[zeros], string_length, ones, bit, tempbuffer, level);
}

unsigned int skipsymbols(unsigned char **strings, unsigned int numstrings, unsigned int start){
  //Returns 1 if every string has the same byte at index start, 0 as soon as one differs.
  //An empty list counts as "all the same" and is answered without touching strings.
  unsigned int n; //Unsigned to match numstrings: no signed/unsigned comparison, no overflow on very long lists.
  if(numstrings==0)
    return(1);
  unsigned char first=strings[0][start];
  for(n=1;n<numstrings;n++){
    if(strings[n][start]!=first)
      return(0);
  }
  return(1);
}

void bubbleSort(unsigned char **strings, unsigned int string_length, unsigned int num_strings, unsigned int start_at){
  //Simple fallback sort for when the recursive sort would otherwise recur too far and exhaust the stack, and for tiny lists.
  //Orders the pointers by memcmp over bytes start_at..string_length-1 of each string.
  //Implemented as an insertion sort: strings that compare equal keep their relative order.
  //Lists of fewer than two strings, or an empty comparison range, are left untouched.
  if(num_strings<2 || start_at>=string_length)
    return;
  unsigned int cmp_len=string_length-start_at;
  unsigned int i;
  for(i=1;i<num_strings;i++){
    unsigned char *key=strings[i];
    unsigned int j=i;
    //Shift strictly greater strings up by one slot until the key's position is found.
    while(j>0 && memcmp(strings[j-1]+start_at, key+start_at, cmp_len)>0){
      strings[j]=strings[j-1];
      j--;
    }
    strings[j]=key;
  }
}

void iBWT(unsigned char *in, unsigned char *out, unsigned int blocksize, unsigned int offset){
  //Inverse BWT.
  //in: blocksize transformed bytes. out: receives blocksize decoded bytes; must not overlap in.
  //offset: the value BWT returned for this block; must be less than blocksize.
  //Needs sizeof(unsigned int)*blocksize bytes of temporary memory.
  //There is no return value, so on an out-of-range offset or an allocation failure out is filled with zero bytes instead.
  if(blocksize==0)
    return;
  unsigned int *next=NULL;
  if(offset<blocksize && blocksize<=((size_t)-1)/sizeof *next)
    next=malloc(sizeof *next*blocksize);
  if(!next){
    memset(out, 0, blocksize);
    return;
  }

  //Counting sort of the positions by symbol: first a histogram, then the first slot of each symbol.
  unsigned int slot[256]={0};
  unsigned int n,sym,total=0;
  for(n=0;n<blocksize;n++)
    slot[in[n]]++;
  for(sym=0;sym<256;sym++){
    unsigned int count=slot[sym];
    slot[sym]=total;
    total+=count;
  }
  //next lists the positions of in ordered by symbol, equal symbols in increasing position.
  for(n=0;n<blocksize;n++)
    next[slot[in[n]]++]=n;

  //Follow the chain from the starting offset, emitting one byte per step.
  for(n=0;n<blocksize;n++){
    offset=next[offset];
    out[n]=in[offset];
  }
  free(next);
}
