/*
dupdelete V2.01
Improvements from V1.x:
 - Can read hash lists from a folder, not just one file.
 - Now supports unicode in file/folder names under Windows - though it took a lot of #ifdefs to do that, UTF-16 is kind of evil.
 - Can use GDBM to store a cache of file hashes, greatly improving performance if a set of files is hashed on a regular basis.
 - Larger hash buffer (1M, up from 8k) improves disk read speed, and helps the OS best arrange the queue.

Limitations:
 - Uses a linear search on the hash list. Not really an issue unless you have a ridiculous number of hashes to check, but still...

V2.01: Fix tiny bug in Windows version that caused an error message to print incorrectly.
V2.00: GDBM, performance improvements, improved Windows support.
*/




#define _LARGEFILE64_SOURCE 1 
#define _FILE_OFFSET_BITS   64

//Support for wchar/UTF-16 on Windows. Linux uses UTF-8 and so needs no special handling. Every time you see a function starting with _t, curse Microsoft.
#ifdef __MINGW32__
  #include <tchar.h>
#endif

#include <stdio.h>
#include <stdint.h>
#include <dirent.h>
#include <string.h>
#include <dirent.h>
#include <sys/stat.h>
#include <stdlib.h>
#include "md5.h"
#include <unistd.h>


//Size in bytes of the read buffer used while hashing file contents (1 MiB).
//Parenthesised so the macro expands safely inside larger expressions (e.g. x / HASH_BUFFER_SIZE).
#define HASH_BUFFER_SIZE (1024*1024)


//Linux uses UTF-8, so needs none of this windows wchar bodgery:
//When _UNICODE is not defined, each generic-text (_t...) name used in this file is mapped
//straight onto its ordinary narrow-char type or function, so the rest of the code can be
//written once against the _t names.
#ifndef _UNICODE
  //Character type used for file and folder names.
  #define _TCHAR char
  //File metadata and deletion.
  #define _tstat64 stat64
  #define _tunlink unlink
  //String helpers.
  #define _tcscpy strcpy
  #define _tcslen strlen
  //Directory stream type and close call.
  #define _WDIR DIR
  #define _tclosedir closedir
  //Opening files and directories.
  #define _tfopen fopen
  #define _topendir opendir
  #define _tcscat strcat
  //Directory iteration: read call and entry struct tag.
  #define _treaddir readdir
  #define _tdirent dirent
#endif

//Optional hash cache: only compiled in when _use_gdbm is defined.
#ifdef _use_gdbm
//Derives the 16-byte cache key for a file from its name, size and modification time.
int getfilekey(_TCHAR *filename, uint8_t *key, struct stat64 *attrib);
  //MinGW builds use a gdbm.h found on the quote-include path; other builds use the system header.
  #ifdef __MINGW32__
    #include "gdbm.h"
  #else
    #include <gdbm.h>
  #endif
#endif

//Stats a path, then either hands a regular file to dofile() or recurses into a directory.
void dofolder(_TCHAR *foldername);
//int isdirectory(char *path);
//Hashes one regular file and prints its result line; statstruct is the caller's stat result for it.
void dofile(_TCHAR *filename, struct stat64 *statstruct);
//Returns 1 if the 16-byte hash is present in deletelist, 0 otherwise.
int hashisinlist(uint8_t *hash);
//Prints a 16-byte digest to stdout as 32 lower-case hex digits.
void MDPrint(md5_byte_t *digest);
//Parses the first 32 characters of string as hex into 16 bytes. Returns 0 on success, 1 on failure.
int stringtohash(char *string, uint8_t *hashgoeshere);
//Loads hashes into deletelist from a file, from every entry of a folder, or from stdin ("-").
void gethashlistfromfile(char *filename);
//Writes the MD5 of the len bytes at string into the 16-byte outbuffer.
void md5string(uint8_t *outbuffer, unsigned char *string, unsigned int len);
//Computes (or fetches from the cache) the MD5 of a file's contents into hash.
int getfilehash(_TCHAR *filename, uint8_t *hash, struct stat64 *attrib);


//Capacity of deletelist, counted in 16-byte hash entries (the buffer is listallocated*16 bytes).
unsigned int listallocated; //Memory allocated to blacklist.
//Number of hashes currently stored in deletelist.
unsigned int deletelistlen=0;
//Hashes loaded via -l, packed back to back at 16 bytes each.
uint8_t *deletelist;
//Set by -s: a buffer whose first 16 bytes are reserved for a file digest, followed by the
//NUL-terminated salt text. Remains NULL when no salt was given.
unsigned char *saltstring;
//Set by -d: delete files whose hash is on the list instead of just flagging them.
char enabledelete;
//Cleared by -q: when non-zero, file names are printed after the hash.
char enablenamelogging;
//Non-zero to print files that are not on the hash list; cleared by -l, set again by -v.
char logunflagged;
//Set by -ss: print the file size after the hash.
char logsize;
//Size threshold given with -ms; 0 means no limit. Despite the name it acts as an upper bound:
//dofile() skips files on the large side of it.
unsigned long long minsize;
//Counters reported when keepstats (-stats) is set: files actually hashed vs. answered from the cache.
unsigned int stats_hashed=0,stats_cached=0;char keepstats=0;
unsigned char *md5tempbuffer; //Place to store pieces of file when hashing. Global to save on malloc calls.
//Handles the cache of already-calculated hashes.
#ifdef _use_gdbm
  //Open cache database handle (valid only when usedbm is non-zero).
  GDBM_FILE dbf;
  //Set once -gdbm has successfully opened a cache file.
  char usedbm=0;
#endif

/* Prints the command-line help text to stdout. */
static void print_usage(void)
{
  printf(" dupdelete <directory> [options] : recursively md5s every file in directory and subdirectories.\n");
  printf(" -l <file>    - reads list of md5s. Also enables 'show flagged only' mode, unless overridden by -v.\n");
  printf("                you can use '-l -' to read from stdin.\n");
  printf("                Files with hashes found on this list will be marked as (Flagged!) in the output list.\n");
  printf(" -s <string>  - Use <string> to salt the hash. Format is md5(md5(file)+salt).\n");
  printf(" -d           - causes files matching the hash list from -l to be deleted on sight, rather than flagged.\n");
  printf("                Does nothing without -l.\n");
  printf(" -q           - Surpresses filenames from output.\n");
  printf(" -v           - Show unflagged files too (Does nothing without -l)\n");
  printf(" -ss\t\t- Show file size too.\n");
  printf(" -ms <size>\t\t- Hash only files of <size> bytes or less.\n");
  //The counts go to stderr so they never end up inside a piped hash list.
  printf(" -stats       - Display counts on stderr.\n");
#ifdef _use_gdbm
  printf(" -gdbm <file.gdbm> - Use a cache file to speed up multiple passes.\n");
#endif
#ifdef __MINGW32__
  printf("\n\nExamples:\n  dupdelete c:\\stuff-I-have\\ | dupdelete c:\\new-stuff\\ -l - -d       Leaves behind only files you have not seen before.\n");
  printf("  dupdelete c:\\user_folders\\ -l c:\\contraband.md5                            Create a list of files in user areas matching a 'contraband' list.\n");
  printf("Please be aware that this program under Windows has some limitations in unicode handling: It'll mostly work, but can't use unicode for the initial arguments.\n");
#else
  printf("\n\nExamples:\n  dupdelete ~/stuff-I-have/ | dupdelete ~/new-stuff/ -l - -d       Leaves behind only files you have not seen before.\n");
  printf("  dupdelete /home/ -l ~/contraband.md5                            Create a list of files in user areas matching a 'contraband' list.\n");
#endif
  printf("\nFull description at http://birds-are-nice.me/programming/dupdelete.shtml\n");
}

/*
 * Entry point: dupdelete <directory> [options].
 *
 * argv[1] is always the file or folder to process; options are read from argv[2] onwards, in
 * order (so "-l list -v" shows unflagged files, while "-v -l list" does not).
 * Returns 1 when called without arguments (after printing the help), -1 on a bad option or
 * allocation failure, and 0 otherwise.
 */
int main (int argc, char *argv[])
{
  enabledelete=0;
  enablenamelogging=1;
  logunflagged=1;
  logsize=0;
  minsize=0;
  listallocated=1024;
  deletelist=malloc((size_t)listallocated*16);
  if(deletelist==NULL){
    fprintf(stderr, "Out of memory.\n");
    return(-1);
  }

  if(argc==1){
    print_usage();
    return(1);
  }

  for(int n=2;n<argc;n++){
    const char *opt=argv[n];
    if(opt[0]!='-')
      continue;
    int hasvalue=(n<(argc-1)); //Is there another argument to use as this option's value?

    if(!strcmp(opt, "-l")){
      if(!hasvalue){
        fprintf(stderr, "-l given without hash list file.\n");
        return(-1);
      }
      logunflagged=0;
      gethashlistfromfile(argv[++n]);
    }
    else if(!strcmp(opt, "-s")){
      if(!hasvalue){
        fprintf(stderr, "-s given without salt string.\n");
        return(-1);
      }
      //Layout: 16 bytes reserved for the file digest, then the salt text and its NUL terminator.
      size_t saltlen=strlen(argv[++n]);
      free(saltstring); //A repeated -s replaces the earlier salt.
      saltstring=malloc(16+saltlen+1);
      if(saltstring==NULL){
        fprintf(stderr, "Out of memory.\n");
        return(-1);
      }
      memcpy(saltstring+16, argv[n], saltlen+1);
    }
#ifdef _use_gdbm
    else if(!strcmp(opt, "-gdbm")){
      if(!hasvalue){
        fprintf(stderr, "-gdbm given without filename.\n");
        return(-1);
      }
      dbf = gdbm_open(argv[++n], 0, GDBM_WRCREAT, 0666, NULL);
      if(!dbf){
        fprintf(stderr,"%s open failed.\n", argv[n]);
        return(-1);
      }
      usedbm=1;
    }
#endif
    else if(!strcmp(opt, "-d"))
      enabledelete=1;
    else if(!strcmp(opt, "-q"))
      enablenamelogging=0;
    else if(!strcmp(opt, "-stats"))
      keepstats=1;
    else if(!strcmp(opt, "-v"))
      logunflagged=1;
    else if(!strcmp(opt, "-ss"))
      logsize=1;
    else if(!strcmp(opt, "-ms")){
      if(!hasvalue){
        fprintf(stderr, "-ms given without size.\n");
        return(-1);
      }
      //Only plain decimal digits are accepted: a negative or garbled value would otherwise
      //silently turn into "no limit" or an enormous limit.
      const char *arg=argv[++n];
      char *end=NULL;
      unsigned long long limit=strtoull(arg, &end, 10);
      if(arg[0]<'0' || arg[0]>'9' || *end!='\0'){
        fprintf(stderr, "-ms given with invalid size '%s'.\n", arg);
        return(-1);
      }
      minsize=limit;
    }
    //Unrecognised options are ignored, as before.
  }

  //Remove one trailing / if present. Or \, could be running on windows.
  //A lone "/" is left alone so the root directory can still be named, and an empty
  //argument is never indexed at -1.
  size_t pathlen=strlen(argv[1]);
  if(pathlen>1 && (argv[1][pathlen-1]=='/' || argv[1][pathlen-1]=='\\'))
    argv[1][pathlen-1]=0;

  md5tempbuffer=malloc(HASH_BUFFER_SIZE);
  if(md5tempbuffer==NULL){
    fprintf(stderr, "Out of memory.\n");
    return(-1);
  }

#ifdef _UNICODE
  //One ugly unicode conversion: widen each byte of the narrow argument.
  size_t l=strlen(argv[1])+1;
  _TCHAR *tempstring=malloc(l*sizeof(_TCHAR));
  if(tempstring==NULL){
    fprintf(stderr, "Out of memory.\n");
    return(-1);
  }
  for(size_t n=0;n<l;n++)
    tempstring[n]=(unsigned char)argv[1][n]; //Via unsigned char so bytes >= 0x80 are not sign-extended.
  dofolder(tempstring);
  free(tempstring);
#else
  dofolder(argv[1]);
#endif

  if(keepstats)
    fprintf(stderr, "Hashed: %u    From cache: %u\n", stats_hashed, stats_cached);
#ifdef _use_gdbm
  //Close the cache so everything stored during this run is written out properly.
  if(usedbm)
    gdbm_close(dbf);
#endif
  return 0;
}

















/*
 * Loads MD5 hashes into the global deletelist.
 *
 * filename may be:
 *   "-"         read hash lines from stdin;
 *   a directory every entry whose name does not start with '.' is loaded recursively;
 *   a file      read one hash per line.
 * Each line is handed to stringtohash(); lines it rejects are skipped. Problems with the path
 * itself are reported on stderr and the function returns; failure to open the list or to grow
 * deletelist terminates the program with exit(1).
 */
void gethashlistfromfile(char *filename){
  int fromstdin=(strcmp(filename, "-")==0);

  //Only stat real paths: for "-" there is nothing to stat, and the stat buffer must not be inspected.
  if(!fromstdin){
    struct stat64 statbuf;
    //filename is always a narrow string here, so narrow printf is right on every platform.
    if(stat64(filename, &statbuf) == -1){
      fprintf(stderr, "Error statting %s\n", filename);
      return;
    }

    if(S_ISDIR(statbuf.st_mode)){
      DIR *dp = opendir(filename);
      if(dp == NULL){
        fprintf(stderr, "Could not read hash folder %s\n", filename);
        return;
      }
      struct dirent *ep;
      while((ep = readdir(dp)) != NULL){
        if(ep->d_name[0]=='.') //Skips ".", ".." and any other dot-entry.
          continue;
        char *subfilename=malloc(strlen(filename)+strlen(ep->d_name)+2); //+2: One for the /, one for the null term.
        if(subfilename==NULL){
          fprintf(stderr, "Out of memory reading hash folder %s\n", filename);
          break;
        }
        strcpy(subfilename, filename);
        #ifdef __MINGW32__
          strcat(subfilename, "\\");
        #else
          strcat(subfilename, "/");
        #endif
        strcat(subfilename, ep->d_name);
        gethashlistfromfile(subfilename);
        free(subfilename);
      }
      (void) closedir(dp);
      return;
    }

    if(!S_ISREG(statbuf.st_mode)){
      fprintf(stderr, "Error opening hash list - appears neither file nor directory?\n");
      return; //What is this non-file, non-folder?
    }
  }

  FILE *hashlistfile = fromstdin ? stdin : fopen(filename, "r");
  if(hashlistfile==NULL){
    fprintf(stderr, "Could not open hashlist!\n");
    exit(1);
  }

  char line[128];
  //Loop on fgets itself: it returns NULL on end of file AND on a read error, so a failing
  //read can no longer spin forever the way a feof()-only test would.
  while(fgets(line, sizeof line, hashlistfile) != NULL){
    //An over-long line fills the buffer without reaching its newline. Throw the rest away,
    //otherwise the tail would be parsed as if it were a line of its own.
    size_t linelen=strlen(line);
    if(linelen==sizeof line-1 && line[linelen-1]!='\n'){
      int c;
      while((c=fgetc(hashlistfile))!=EOF && c!='\n')
        ;
    }

    //Make sure there is room for one more 16-byte entry before parsing into it.
    if(deletelist==NULL || deletelistlen>=listallocated){
      unsigned int newallocated=listallocated+1024;
      uint8_t *grown=realloc(deletelist, (size_t)newallocated*16);
      if(grown==NULL){
        fprintf(stderr, "Out of memory growing hash list.\n");
        exit(1);
      }
      deletelist=grown;
      listallocated=newallocated;
    }

    if(!stringtohash(line, deletelist+((size_t)deletelistlen*16)))
      deletelistlen++;
  }

  if(!fromstdin)
    fclose(hashlistfile);
  //The count is the running total across every list read so far.
  fprintf(stderr, "Read %u hashes from file.\n", deletelistlen);
}

/*
 * Processes one path: a regular file is handed to dofile(); a directory is walked and
 * dofolder() is called again on every entry whose name does not start with '.'.
 * Anything that is neither a regular file nor a directory is silently ignored.
 * Errors are reported on stderr and the path is skipped.
 */
void dofolder(_TCHAR *foldername){
  struct stat64 statbuf;

  if (_tstat64(foldername, &statbuf) == -1){
#ifdef _UNICODE
    fwprintf(stderr, L"Error statting %s\n", foldername);
#else
    fprintf(stderr, "Error statting %s\n", foldername);
#endif
    return;
  }
  if(S_ISREG(statbuf.st_mode)){
    dofile(foldername, &statbuf);
    return;
  }
  if(! S_ISDIR(statbuf.st_mode))
    return; //What is this non-file, non-folder?

  _WDIR *dp = _topendir (foldername);
  if (dp == NULL){
    //foldername is wide in unicode builds, so it needs the wide printf there.
#ifdef _UNICODE
    fwprintf(stderr, L"Could not read folder %s\n", foldername);
#else
    fprintf(stderr, "Could not read folder %s\n", foldername);
#endif
    return;
  }

  struct _tdirent *ep;
  while ((ep = _treaddir (dp)) != NULL){
    if(ep->d_name[0]=='.') //Skips ".", ".." and any other dot-entry.
      continue;
    size_t needed=_tcslen(foldername)+_tcslen(ep->d_name)+2; //+2: One for the /, one for the null term.
    _TCHAR *filename=malloc(needed*sizeof(_TCHAR));
    if(filename==NULL){
      fprintf(stderr, "Out of memory while scanning folder.\n");
      break;
    }
    _tcscpy(filename, foldername);
    //The separator literal must match the width of _TCHAR in this build.
    #ifdef _UNICODE
      _tcscat(filename, L"\\");
    #elif defined(__MINGW32__)
      _tcscat(filename, "\\");
    #else
      _tcscat(filename, "/");
    #endif
    _tcscat(filename, ep->d_name);
    dofolder(filename);
    free(filename);
  }
  (void) _tclosedir (dp);
}

/*
 * Hashes one regular file and prints its result line on stdout:
 *   <md5>[ <size>][ <name>][ (Flagged!) | (Deleted!) | (Delete failed)]
 * filename   path of the file.
 * statstruct the caller's stat result for that file (size and mtime are used).
 * Files larger than the -ms limit are skipped, and so are files that could not be hashed:
 * there is no valid digest to print, compare or act on in that case.
 */
void dofile(_TCHAR *filename, struct stat64 *statstruct){
  unsigned long long filesize=statstruct->st_size;
  //-ms is documented as "<size> bytes or less", so a file of exactly the limit is still hashed.
  if(minsize && (filesize>minsize)){
    return;
  }

  md5_byte_t digest[16];
  if(getfilehash(filename, digest, statstruct)!=0)
    return; //Never flag or delete on the strength of an uninitialised digest.

  if(saltstring){
    //saltstring is [16 bytes reserved for the digest][salt text]: result is md5(md5(file)+salt).
    memcpy(saltstring, digest, 16);
    md5string(digest, saltstring, 16+strlen((char *)saltstring+16));
  }

  char flagged=hashisinlist(digest);
  if(!(flagged || (!deletelist) || logunflagged))
    return;

  MDPrint(digest);
  if(logsize)
    printf(" %llu", filesize);
  if(enablenamelogging)
#ifdef _UNICODE
    _tprintf(L" %s", filename);
#else
    printf(" %s", filename);
#endif
  if(flagged){
    if(enabledelete){
      if(!_tunlink(filename))
        printf(" (Deleted!)");
      else{
        printf(" (Delete failed)");
        //filename is wide in unicode builds, so it needs the wide printf there.
#ifdef _UNICODE
        fwprintf(stderr, L"Failed to delete %s\n", filename);
#else
        fprintf(stderr, "Failed to delete %s\n", filename);
#endif
      }
    }
    else
      printf(" (Flagged!)");
  }
  printf("\n");
}

#ifdef _use_gdbm
/*
 * Builds the 16-byte cache key for a file: the MD5 of its name, size and modification time,
 * so a changed size or mtime produces a different key. Always returns 0.
 * Note: the size and mtime fields are hashed in full. Where a pointer is narrower than those
 * fields (e.g. 32-bit builds) this yields different keys than older versions did, so such
 * files are simply hashed again once.
 */
int getfilekey(_TCHAR *filename, uint8_t *key, struct stat64 *attrib){
  md5_state_t state;
  md5_init(&state);
  md5_append(&state, (void *)filename, _tcslen(filename)*sizeof(_TCHAR));
  md5_append(&state, (void *)&(attrib->st_size), sizeof(attrib->st_size));
  md5_append(&state, (void *)&(attrib->st_mtime), sizeof(attrib->st_mtime));
  md5_finish(&state, key);
  return(0);
}
#endif

/*
 * Gets the MD5 of the contents of filename into the 16-byte buffer hash.
 * With the gdbm cache enabled, a cached digest for the same name/size/mtime is returned without
 * reading the file, and freshly computed digests of files larger than 4096 bytes are stored.
 * Returns 0 on success, -1 on error (over-long name, open failure or read failure); hash is
 * only valid when 0 is returned.
 */
int getfilehash(_TCHAR *filename, uint8_t *hash, struct stat64 *attrib){
  if(_tcslen(filename)>8192)
    return(-1); //Tidy up your filesystem!

#ifdef _use_gdbm
  uint8_t keyval[16];
  if(usedbm){
    getfilekey(filename, keyval, attrib);
    datum key={(char *)keyval, 16};
    datum value=gdbm_fetch(dbf, key);
    //Only trust a record that really holds a whole digest; 16 bytes are about to be copied out of it.
    if(value.dptr && (value.dsize>=16)){
      memcpy(hash, value.dptr, 16);
      free(value.dptr);
      stats_cached++;
      return(0);
    }
    free(value.dptr);
  }
#endif

#ifdef _UNICODE
  FILE *currentfile=_tfopen(filename, L"rb");
#else
  FILE *currentfile=fopen(filename, "rb");
#endif
  if(currentfile==NULL){
#ifdef _UNICODE
    fwprintf(stderr, L"Could not open file %s\n", filename);
#else
    fprintf(stderr, "Could not open file %s\n", filename);
#endif
    return(-1);
  }

  md5_state_t state;
  md5_init(&state);
  //Stop as soon as fread delivers nothing: that covers end of file and read errors alike,
  //whereas testing feof() alone never terminates on an error.
  size_t newbytes;
  while((newbytes=fread(md5tempbuffer, 1, HASH_BUFFER_SIZE, currentfile))>0)
    md5_append(&state, md5tempbuffer, (int)newbytes);

  int readfailed=ferror(currentfile);
  fclose(currentfile);
  if(readfailed){
#ifdef _UNICODE
    fwprintf(stderr, L"Error reading file %s\n", filename);
#else
    fprintf(stderr, "Error reading file %s\n", filename);
#endif
    return(-1);
  }

  md5_finish(&state, hash);
  stats_hashed++;

#ifdef _use_gdbm
  if(usedbm){
    unsigned long long filesize=attrib->st_size;
    if(filesize>4096){ //Pollute not my database with your trivialities.
      datum key={(char *)keyval, 16};
      datum insvalue={(char *)hash, 16};
      int ret=gdbm_store(dbf, key, insvalue, GDBM_INSERT);
      if(ret==-1)
        fprintf(stderr,"gdbm store error: Nonfatal, but couldn't add cache line.\n");
    }
  }
#endif
  return(0);
}
/*
 * One-shot MD5 of a memory buffer.
 * outbuffer receives the 16-byte digest; string/len describe the bytes to hash.
 */
void md5string(uint8_t *outbuffer, unsigned char *string, unsigned int len){
  md5_state_t ctx;

  md5_init(&ctx);
  md5_append(&ctx, string, len);
  //The input has been fully consumed by now, so the digest can be written straight to the caller.
  md5_finish(&ctx, outbuffer);
}

/*
 * Looks a 16-byte hash up in the global deletelist.
 * Returns 1 if an identical entry exists, 0 if not (including when the list is empty).
 */
int hashisinlist(uint8_t *hash){
  //Ye olde linear search. Unless you have a hash list of ridiculous length, it'll do.
  const uint8_t *entry=deletelist;
  unsigned int remaining=deletelistlen;

  while(remaining>0){
    if(memcmp(hash, entry, 16)==0)
      return(1);
    entry+=16; //Entries are packed back to back.
    remaining--;
  }
  return(0);
}

/* Writes a 16-byte digest to stdout as 32 lower-case hex digits, with no separator or newline. */
void MDPrint(md5_byte_t *digest)
{
  const md5_byte_t *end = digest + 16;

  for (const md5_byte_t *cursor = digest; cursor != end; cursor++)
    printf ("%02x", *cursor);
}

/*
 * Converts the first 32 characters of string, read as hex digits (either case), into 16 bytes.
 * string       NUL-terminated text; anything after the 32nd character is ignored.
 *              NOTE: the first 32 characters are overwritten during conversion.
 * hashgoeshere receives the 16 decoded bytes; only written on success.
 * Returns 0 on success, 1 if the text is shorter than 32 characters or contains a non-hex
 * character among the first 32.
 */
int stringtohash(char *string, uint8_t *hashgoeshere){
  if(strlen(string)<32)
    return(1);

  //Pass 1: replace each hex digit, in place, with its numeric value 0..15.
  for(int pos=0; pos<32; pos++){
    char c=string[pos];
    if(c>'Z') //Past the upper-case letters: shift down by 32, which maps a-z onto A-Z.
      c-=32;
    string[pos]=c;

    int isdecimal=(c>='0')&&(c<='9');
    int isletter=(c>='A')&&(c<='F');
    if(!isdecimal && !isletter) //Outside of allowable range.
      return(1);

    string[pos]=isletter ? (char)(c-'A'+10) : (char)(c-'0');
  }

  //Pass 2: pack each pair of 4-bit values into one byte, high nibble first.
  const char *nibble=string;
  for(uint8_t *out=hashgoeshere; out!=hashgoeshere+16; out++){
    *out=(nibble[0]<<4)|nibble[1];
    nibble+=2;
  }

  return(0);
}
