/********** Heterogeneous Distance Function Comparison ***********************
 By D. Randall Wilson (randy@axon.cs.byu.edu), July 5, 1996.
 
 Code to read a training set, and use several heterogeneous distance
 functions in an instance-based learning system to compare their
 respective accuracy.
 
 This code was used to obtain results reported in:
 
 Wilson, D. Randall, and Tony R. Martinez, "Improved Heterogeneous Distance
   Functions," Journal of Artificial Intelligence Research, vol. 6, no. 1,
   pp. 1-34.
   
 Sorry if it's ugly, but I didn't originally plan on anyone else looking at it. :)

 --- 
 Usage: h [source.num] [-d debug-level] [-o output-file] [-s]
          [-a {1,2,3,4,5,6}]
   where:
     source.num is the '.num' file containing the database to be learned.
     -s    See what's going on while it runs.
     -d    Output debugging information.
     -o    output-file to rewrite standard out to. (or you can redirect
           output to a file if you prefer).
     -i    Incremental (outputs what accuracy is using only 1%, 2%, etc.,
           instead of all available instances in the training set).  
     -u    Unknown values treated as another value in VDM schemes.
     -a    Do only algorithm 1..6 {EUCLIDEAN, HCGC, HVDM, DVDM,
                                   IVDM, WVDM} instead of all 6 of them.
 ** Input file format:
 The input file (referred to as "source.num" in the Usage, above) has
 the following form:
 ---
 numInputs numOutputs numInstances
 for each input and output attribute a,
   <#values for attribute a.  0=>continuous; -1=>ignore; -2 or lower=>linear,
     so negate and that is how many values (e.g., -5 => 5 values, 0..4, and
     they are linear rather than nominal)>
 for each instance i
   for each input and output attribute i
     value (0 through #values-1 if nominal or linear; real value if continuous;
            '?' if unknown; '*' if "don't care" (i.e., match anything).)
 ---
 for example, a simple file might be:
 
   5 1 3             <- 5 inputs, 1 output, 3 instances
   5 2 2   0 -9  2   <- first input is nominal, with values 0..4,
   3 1 0 4.3  6  0      2nd & 3rd inputs are nominal, with values 0..1 (i.e., boolean)
   4 0 ? 1.1  8  1      4th input is continuous, with real-valued inputs
   0 ? 1 9.3  0  1      5th input is linear, with discrete values 0..8
                        Output class is boolean (nominal with values 0..1).
                        
 The question marks ('?') indicate unknown values. 

******************************************************************************/

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <values.h>
#include <string.h>
#include "t-test.h"

#define MAX_LEN    100
#define DONT_KNOW   -1
#define DONT_CARE   -2
#define F_DONT_KNOW MAXDOUBLE
#define F_DONT_CARE -MAXDOUBLE
#define KNN          1

/* define our random functions so they're easily changed. */
#define rnd()      drand48()
#define srnd(x)    srand48(x)
#define rndInt(x)  (lrand48()%x)
	     
typedef struct {
    int    d;  /* Discrete value */
    double f;  /* Floating-point (continuous) value */
    } Value;

typedef struct {
   Value *in;  /* Array of 'numVars' values, including input and outputs. */
   Value *out; /* Points to first output in array 'in', thus overlapping. */
   long  class; /* Integer composition of output values. */
   double sigFac; /* 1/Radius of significant influence of basis function
		    (= 1/distance to nearest neighbor) */
   double **p;  /* Probability p[var][class] for WVDM */
 } Instance;
/* Note that "STYLES" just counts how many styles (i.e., 6), but is not
   a style itself. */
enum {EUCLIDEAN=0, HCGC, HVDM, DVDM, IVDM, WVDM, STYLES};
char *styleName[STYLES]={"Euclid","HCGC","HVDM","DVDM","IVDM","WVDM"};
Instance *instance; /* Main list of instances */

double     *normFac;    /* Normalization factor for each input variable */
double     *rangeFac;   /* Normalization factor using ranges instead of s.d. */
int numInputs, numOutputs, numInstances; /* Number of inputs, outputs and
                                            instances in the dataset. */
int numVars; /* number of variables = numInputs + numOutputs */
int *numStates; /* >0 => number of discrete nominal classes.
		   =0 => continuously-valued variable.
		   -1 => ignored variable. (Stripped out in ReadFile)
		  <-1 => linear discrete variable. */
int *numValues; /* Number of discrete values (whether linear, nominal, or discretized) */
int numClasses; /* number of possible classes = product of numStates for all outputs*/
int  discNumStates; /* number of discrete states for discretized continuous variables*/
char filename[MAX_LEN]; /* Name of input file */
int debug=0, SEE=0;     /* Used for debugging and watching progress */
char outfilename[MAX_LEN]; /* Name of output file, if any (default is screen)*/
Instance **train; /* Instances in the current training set. */
Instance **test;  /* Instances in the current test set. */
int    numTrain, numTest; /* number of instances in training & test set (sum=numInstances)*/
int    partitions = 10; /* 10-fold cross-validation */
int    unknownVDM = 0; /* 0=> attribute difference is always 1 when a '?'
			  is involved.  1=> '?' is treated like another
			  discrete value in anything except plain Euclidean. */
long lastChecked=0;
long *checked;
double **accuracy; /* generalization accuracy[whichStyle][whichTrial] */
double ***P; /* P[i][a][c] = Ni,a,c / Ni,a; Assigned in 'BuildVDM()' */
int    randomShuffle = 0; /* Flag: 1=> randomly shuffle nominal values, to
                             show that VDM doesn't care about order. */
#define UNITS 24
int    units=UNITS; /* Number of divisions to use in incrementally measuring
		     accuracy on percent[unit]% of the available training data.*/
int    percent[UNITS]={1,2,3,4,5,6,7,8,9,10,12,15,20,25,30,35,40,45,50,60,70,80,90,100};
double ***results; /* results[trial][metric][unit] = accuracy obtained during
		      trial using metric and only percent[unit]% of the training data. */
int    incremental = 0; /* Flag to determine whether to do incremental stuff*/
int    dumpVDM = 0;     /* Flag for whether to dump VDM probability values. */
double *minVal,*maxVal; /* Minimum & maximum for each attribute (0 if discrete)*/
double *theWidth;       /* Width of a discretization range */
int globalV; /* Global variable needed by CompareVal and set in BuildWindowP */
Instance ***wvdmList; /* wvdmList[numInstances][wvdmNum[numInstances]]; */
int *wvdmNum; /* Number of instances in wvdmList[v] (after '?' and '*' removed) */
double *p1=NULL, *p2=NULL; /* arrays p1[numClasses] used by ivdm and wvdm */
int     onlyAlg=0; /* if 1..6, only do algorithm Euclid..WVDM */

/*********************************************************************/
/***************** Function Prototypes *******************************/
/*********************************************************************/
void   PrintReport();
double TestIBL(int metric, double *results);
double TestRBF(int metric);
void   FindSigmas(int metric);
void   BuildTrainingSet(int t);
void   BuildNormalization();
void   Discretize();
void   BuildVDM();
void   DumpVDM();
void   InterpolateP(int v, Instance *a, double *p);
int    CompareVal(const void *aa, const void *bb);
void   BuildWindowP();
int    BSearch(const void *key, const void *base, size_t nel, size_t size,
	      int (*compar)(const void *aa, const void *bb));
void   InterpolateWP(int v, Instance *a, double *p);
double FindDimDistance(int v, Instance *a, Instance *b, int metric);
double Distance(Instance *a, Instance *b, int metric);
int    SameClass(Instance *a, Instance *b);
char  *Allocate(long size, char *errorMessage);
void   AllocateStuff();
void   PrintNum(int num, int width);
void   PrintDouble(double d, int width);
void   PrintInstance(Instance *i);
void   PrintInstances();
void   ShuffleNominalValues();
int    Continuous(int v);
int    Nominal(int v);
int    LinearDiscrete(int v);
int    Discrete(int v);
int    Linear(int v);
void   ReadFile(char *filename);
int    GetInteger(FILE *fd);
double GetReal(FILE *fd);
void   GetValue(FILE *fd, char *s);
char   GetChar(FILE *fd);
FILE  *OpenFile(char *filename, char *mode);
void   Barf(char *s);

FILE *dump=NULL;

/*************************************************************/
/******************** main ***********************************/
/*************************************************************/
main(int argc, char *argv[])
{ FILE *fd;
  int i, trial;
  
  outfilename[0]='\0';
  srnd(1); /* Initialize Random Number Generator */
  
  /*** Parse Arguments */
  if (argc<2 || argv[1][0]=='-')
    { printf("Usage: %s infile.num [-o outfile] [-d debug_LEVEL] [-s]\n",argv[0]);
      printf("             [-r] [-c cv] [-v] [-a 1..6]\n",argv[0]);
      printf("  -r = randomly shuffle order of nominal attribute values.\n");
      printf("  -d = debugging level.\n");
      printf("  -s = see what's going on.\n");
      printf("  -c = change the number of cross-validation partitions (default=10).\n");
      printf("  -o = write output to outfile while running.\n");
      printf("  -i = Incremental: Use 1%, then 2%, etc.\n");
      printf("  -u = Unknown values treated as another value in VDM\n");
      printf("       [default is to make distance=1.0 if either is '?']\n");
      printf("  -v = Dump VDM probability values.\n");
      printf("  -a = Use only one algorithm [1=Euclid, 2=H-Overlap, 3=HVDM,\n");
      printf("         4=DVDM, 5=IVDM, 6=WVDM].\n");
      printf(" [ Using test.num as default ]\n");
      strcpy(filename,"test.num");
    }
  else strcpy(filename,argv[1]);
  for (i=1; i<argc; i++)
    { if (strcmp(argv[i],"-d")==0)
       debug = atoi(argv[++i]);
      if (strcmp(argv[i],"-s")==0)
        SEE = 1;
      if (strcmp(argv[i],"-c")==0)
        partitions = atoi(argv[++i]);
      if (strcmp(argv[i],"-r")==0)
	randomShuffle = 1;
      if (strcmp(argv[i],"-o")==0)
       { strcpy(outfilename,argv[++i]);
	 dump = freopen(outfilename,"wt",stdout);
       }
      if (strcmp(argv[i],"-u")==0)
	unknownVDM = 1;
      if (strcmp(argv[i],"-i")==0)
	incremental = 1;
      if (strcmp(argv[i],"-v")==0)
	dumpVDM = 1;
      if (strcmp(argv[i],"-a")==0)
        onlyAlg = atoi(argv[++i]);
    }

   printf("%s...\n",filename);
  
   /*** Read instances ***/
  ReadFile(filename); /* sets numInstances */
  AllocateStuff();
  if (debug>0) printf("Instances (%d)...\n",numInstances);
  if (debug>1) PrintInstances();
  if (randomShuffle)
    ShuffleNominalValues();
  for (trial=0; trial<partitions; trial++)
  { int unit, maxTrain;
    if (debug || SEE)
      printf("Trial %d...\n",trial);
    fflush(stdout);
    BuildTrainingSet(trial);
    maxTrain = numTrain; /* Remember how many training instances there are*/
    for (unit=0; unit<units; unit++)
    { if (incremental)
      { if (unit==units-1)
	numTrain = maxTrain; /* use all for last try */
	else numTrain = ((long)maxTrain) * ((long)percent[unit]) / ((long)100);
	if (numTrain<1)
	  numTrain=1; /* Make sure we're using at least one instance */
	if (numTrain>maxTrain)
	  numTrain = maxTrain; /* Make sure we don't use more than available*/
	if (debug>1 && incremental)
	  printf("  u%d, using %d/%d (%d percent) of training instances.\n",
		 unit,numTrain,maxTrain,percent[unit]);
      }
      else numTrain = maxTrain;
      BuildNormalization(); /* Assign NormFac[v] */
      Discretize(); /* Get integer value for each continuous one. */
      BuildVDM();   /* Find distance between values for VDM. i.e., P[][][] 
		      and i->sigFac */
      BuildWindowP();
      if (dumpVDM)
	DumpVDM();
      for (i=0; i<STYLES; i++)
        if (onlyAlg==0 || i==onlyAlg-1)
        { accuracy[i][trial]=TestIBL(i,NULL/* results[trial][i] */);
	  if (incremental)
	    results[trial][i][unit]=accuracy[i][trial];
        }
      if (!incremental)
	unit=units; /* terminate loop */
    } /* for each unit setting */
  } /* for each trial */
  PrintReport(); /* summarize results of all 'TestWeights' */
} /* main */

/******************** PrintReport ************************************/
void PrintReport()
{ /* Prints a report of the accuracy of each trial, the average, st.d., etc.*/
  double tt, conf, avg;
  int t, s, v, unit;
  int numContinuous=0, numLinear=0, numNominal=0;
  
  for (v=0; v<numInputs; v++)
  { if (numStates[v] < -1)
      numLinear++;
    else if (numStates[v] > 0)
      numNominal++;
    else numContinuous++;
  }
  printf("Report for %s\n", filename);
  printf("  %d instances, %d inputs (%d continuous, %d linear, %d nominal), %d outputs\n",
    numInstances, numInputs, numContinuous, numLinear, numNominal, numOutputs);
  
  printf("trial");
  for (s=0; s<STYLES; s++)
    if (!onlyAlg || s==onlyAlg-1)
      printf("%9s",styleName[s]);
  printf("\n");
  for (t=0; t<partitions; t++)
  { int s;
    printf("%3d  ",t);
    for (s=0; s<STYLES; s++)
      if (!onlyAlg || s==onlyAlg-1)
        printf("   %6.2lf",accuracy[s][t]);
    printf("\n");
  } /* for */
  printf("Avg: ");
  for (s=0; s<STYLES; s++)
    if (!onlyAlg || s==onlyAlg-1)
    { avg = Average(accuracy[s],partitions);
      printf("   %6.2lf",avg);
    }
  printf("\nSD. :");
  for (s=0; s<STYLES; s++)
    if (!onlyAlg || s==onlyAlg-1)
    { double sd;
      sd = StandardDeviation(accuracy[s],partitions);
      printf("   %6.2lf",sd);
    }
  if (!onlyAlg)
  { printf("\nTPair");
    for (s=0; s<STYLES; s++)
    { tt = TPair(accuracy[s],accuracy[HVDM],partitions);
      printf("   %6.2lf",tt);
    }
    printf("\nConf:");
    for (s=0; s<STYLES; s++)
    { conf = Confidence(accuracy[s],accuracy[HVDM],partitions);
      printf("   %6.2lf",conf);
    }
    printf("\n");
  }
  if (incremental)
  { printf("\nProgression:\n");
    for (unit=0; unit<units; unit++)
    { printf("Percent %d",percent[unit]);
      for (s=0; s<STYLES; s++)
	if (!onlyAlg || s==onlyAlg-1)
	  for (t=0; t<partitions; t++)
	    accuracy[s][t]=results[t][s][unit];
      for (s=0; s<STYLES; s++)
        if (!onlyAlg || s==onlyAlg-1)
        { avg = Average(accuracy[s],partitions);
	  printf("\t%.2lf",avg);
        }
      if (!onlyAlg)
      { printf("\tConf:");
        for (s=IVDM; s<STYLES; s++)
        { conf = Confidence(accuracy[s],accuracy[HVDM],partitions);
	  printf("\t%.0lf",conf*100.0);
        }
      }
      printf("\n");
    } /* for each unit */
  }
} /* PrintReport */

/******************** TestIBL *************************************/
double TestIBL(int metric, double *results)
{ /* Using the distance 'metric' indicated, find the nearest neighbor
     in the training set of each instance in the test set, and return
     the proportion of those that have the same class.
     Also, if 'results' is not NULL, store the accuracy that would
     have been obtained from using only unit * (1/units) of the
     available training instances to generalize from.  e.g., if
     units=100, then results[0..99] would contain the accuracy
     from using 1..100% of the training instance to classify
     each test instance. */
  int s,t,nearest=-1,correct=0,wrong=0,v,same;
  double d,nearestDist=MAXDOUBLE,accuracy;
  static int *numCorrect=NULL, *numWrong=NULL;
  int unit, newUnit;
  
  if (results!=NULL && (numCorrect==NULL || numWrong==NULL))
  { numCorrect = (int *) Allocate(sizeof(int)*(units+1),"numCorrect");
    numWrong   = (int *) Allocate(sizeof(int)*(units+1),"numWrong");
  }
  if (results!=NULL)
    for (unit=0; unit<units; unit++)
      numCorrect[unit]=numWrong[unit] = 0;
  
  for (s=0; s<numTest; s++)
  { nearestDist = MAXDOUBLE;
    nearest     = -1;
    unit = 0;
    for (t=0; t<numTrain; t++)
    { d = Distance(test[s],train[t],metric);
      if (d<nearestDist)
      { nearestDist=d;
	nearest=t;
      }
      if (results!=NULL)
      { newUnit = (t*units) / numTrain;
	while (unit < newUnit)
	{ /* We have used all the training instances allowed for
	     classification, so let's see if we're right. */
	  if (nearest>=0 && test[s]->class==train[nearest]->class)
	    numCorrect[unit]++;
	  else numWrong[unit]++;
	  unit++;
	}
      }
    } /* for each training instance */
    
    if (results!=NULL)
    { newUnit = units;
      while (unit < newUnit)
      { /* finish up with last unit(s). */
	if (nearest>=0 && test[s]->class==train[nearest]->class)
	  numCorrect[unit]++;
	else numWrong[unit]++;
	unit++;
      }
    }
    
    if (nearest>=0)
    { if (test[s]->class == train[nearest]->class)
        same=1;
      else same=0;
    }
    else
    { printf("Couldn't find nearest.  Strange.\n");
      same=0;
    }
    if (same)
      correct++;
    else wrong++;
  } /* for each test instance */
  if (results!=NULL)
  { for (unit=0; unit<units; unit++)
    { if (numCorrect[unit]==0 && numWrong[unit]==0)
        printf("Error: numCorrect and numWrong are both 0!\n");
      else results[unit] = ((double)numCorrect[unit]) /
	                   ((double)(numCorrect[unit]+numWrong[unit]));
    }
  }
  accuracy = ((double)correct) / ((double)(correct+wrong));
  return(accuracy*100.0);
} /* TestIBL */

/**************** FindSigmas **********************************/
void FindSigmas(int metric)
{ /* Finds the 'sigFac' of each training instance.  Does this by finding
     the nearest neighbor of each training instance and setting 'sigFac'
     to 1/2distance^2 to that neighbor, using the distance metric given. */
  int t,n;
  double d, nearest;
  for (t=0; t<numTrain; t++)
  { nearest=MAXDOUBLE;
    for (n=0; n<numTrain; n++)
      if (t!=n)
      { d = Distance(train[t],train[n],metric);
	if (d<nearest)
	  nearest=d;
      }
    train[t]->sigFac = 1.0 / (2.0 * nearest); /* nearest=dist^2 */
  } /* for */
} /* FindSigmas */

/******************** TestRBF ************************************/
double TestRBF(int metric)
{ /* Using the distance metric indicated, use a Generalized Radial
     Neural Network to find the probability of each class for each
     member of the test set.  Return the proportion of the instances
     in the test set whose class matches that found to have the highest
     probability. */
  int i,c,s,t,bestClass;
  int correct=0, wrong=0;
  double accuracy,d,h;
  static double *p=NULL, *sum=NULL;
  
  FindSigmas(metric); /* Set sigFac for each training instance */
  /** Allocate stuff first time through */
  if (p==NULL)
    p = (double *) Allocate(sizeof(double)*numClasses,"p");
  for (s=0; s<numTest; s++)
  { /*** Classify one instance from the test set using the entire training set*/
    /* First zero our counters...*/
    for (c=0; c<numClasses; c++)
        p[c]=0.0;
    
    for (t=0; t<numTrain; t++)
    { d = Distance(train[t],test[s],metric); /* Actually returns distance squared*/
      h = exp(-(d)*train[t]->sigFac);
      p[train[t]->class] += h;
    } /* for each training instance */
    
    /* At this point we could normalize the p's in order to get
       the probability of each class.  However, what we want is
       the class with the highest probability, and the highest
       will still be the highest, whether or not we divide all
       the values by the overall sum. */
    /* So, now we find the maximum value of p for each output var,
       and see if it matches the output vars of the test instance.*/
    bestClass=0;
    for (c=1; c<numClasses; c++)
      { if (p[c] >p[bestClass])
	  bestClass=c;
      }
    if (bestClass == test[s]->class)
      correct++;
    else wrong++;
  } /* for each test instance */    
  
  accuracy = ((double)correct) / ((double)(correct+wrong));
  return(accuracy*100.0);
} /* TestRBF */
    
/******************** BuildTrainingSet *******************************/
void BuildTrainingSet(int t)
{ /* Builds a training set for trial 't'.  Assumes that
     fin[numVars][numInstances] contains the continuous inputs & outputs,
     and nin[numVars][numInstaces] contains the nominal & linear values.
     Builds integer arrays train[1..numTrain] and test[1..numTest]
     which contain the index of instances in training and test sets,
     respectively.  Thus, to get input variable 'v' for instance 'i'
     in the training set, use fin[v][train[i]]. */
  /* for now, assume randomized 10-fold cross-validation. */
  static Instance **r;
  int i,j;
  Instance *k;
  int start,end;
  
  if (t==0)
  {  r = (Instance **) Allocate(sizeof(Instance *)*numInstances,"r (BuildTrainingSet)");
     /* Set r[i]=i, then swap each with some other to create random list */
     for (i=0; i<numInstances; i++)
	r[i]=&instance[i];
     if (debug)
     { printf("Original ordering...\n");
	for (i=0; i<numInstances; i++)
 	 PrintInstance(r[i]);
     }
     for (i=0; i<numInstances; i++)
     { j=rndInt(numInstances);  /* pick a random instance j */
       k=r[i];  /* swap the pointers contained in spots i and j */
       r[i]=r[j];
       r[j]=k;
     }
     if (debug)
     { printf("Final ordering...\n");
	for (i=0; i<numInstances; i++)
 	  PrintInstance(r[i]);
     }  
  }
  start = (t * numInstances) / partitions;
  end   = ((t+1) * numInstances) / partitions; /* start of next batch */
  if (end > numInstances)
     end = numInstances;
  for (i=numTrain=numTest=0; i<numInstances; i++)
  { if (i>=start && i<end) /* this instance will be in the test set */
      test[numTest++] = r[i];
    else train[numTrain++] = r[i];
  }
  if (debug)
    printf("New training set for partition %d: start=%d, end=%d\n",t,start,end);
  if (debug>1)
  { printf("Training set:\n");
    for (t=0; t<numTrain; t++)
      PrintInstance(train[t]);
    printf("Test set:\n");
    for (t=0; t<numTest; t++)
      PrintInstance(test[t]);
  }
} /* BuildTrainingSet */

/********************** BuildNormalization **************************/
void BuildNormalization()
{ /* Finds the standard deviation for each input variable that
     is continuous or linear-discrete, and sets normFac[v] to 1/(4sd[v]).
     i.e., normalizes so that 95% of values will be in the range -.5..+.5,
     so that the maximum distance between two values is usually less than 1.0.
     Also sets rangeFac[v] to 1/(range(v)).  Sets normFac and rangeFac for
     nominal attribtes to 1.0.  Uses ONLY THE TRAINING SET to pick the
     ranges, so values in the test set may be outside the specified range. */
  double min, max, range, sd;
  int i,v,t;
  static double *x=NULL;
  
  if (x==NULL)
    x = (double *) Allocate(sizeof(double)*numInstances,"x (Normalization)");
  for (v=0; v<numInputs; v++)
  { max = -MAXDOUBLE;
    min = +MAXDOUBLE;
    if (Nominal(v))
      normFac[v] = 1.0;
    else
    { if (LinearDiscrete(v))
      { for (i=0; i<numTrain; i++)
        { x[i]=(double)(train[i]->in[v].d);
          if (x[i]<min) min=x[i];
          if (x[i]>max) max=x[i];
        }
      }
      else if (Continuous(v))
      { for (i=0; i<numTrain; i++)
        { x[i]=train[i]->in[v].f;
          if (x[i]<min) min=x[i];
          if (x[i]>max) max=x[i];
        }
      } /* continuous */
      else Barf("Strange number of states in BuildNormalization.");
      sd = StandardDeviation(x,numTrain);
      if (sd == 0.0)
  	normFac[v] = 1.0;
      else
  	normFac[v] = 1.0 / (4.0 * sd); 
      if (min==MAXDOUBLE || max==-MAXDOUBLE || min==max)
        rangeFac[v] = 1.0;
      else rangeFac[v] = 1.0 / (max-min);
    } /* else not nominal */
  } /* for v */
  if (debug>1)
  { int t,v;
    Instance x;
    x.in = (Value *)Allocate(sizeof(Value) * numVars,"x in debug");
    
    printf("NormFac: ");
    for (v=0; v<numInputs; v++)
      printf("%lg ",normFac[v]);
    printf("\n");
    printf("rangeFac: ");
    for (v=0; v<numInputs; v++)
      printf("%lg ",rangeFac[v]);
    printf("\n");
    printf("This results in instances sort of like this...\n");
    for (t=0; t<numTrain; t++)
    { for (v=0; v<numVars; v++)
      { x.in[v].f = train[t]->in[v].f * normFac[v];
        x.in[v].d = train[t]->in[v].d;
      }
      x.class   = train[t]->class;
      PrintInstance(&x);
    }
    free(x.in);
  }
} /* BuildNormalization */

/******************** Discretize ***************************************/
void Discretize()
{ /* Discretizes any continuous values in the current training set by
     dividing the range into 5 chunks or as many chunks as there are
     classes, whichever is greater.  Thus, the in[v].d value for each
     in[v].f will be the discretized value.
     Uses only the training instances to decide on how to discretize,
     but then discretizes test instances as well using this scheme so
     that they are ready to be classified. (That's not cheating). */
  double max, min, width;
  int i,v;
  if (numClasses > 5)
    discNumStates = numClasses;
  else discNumStates = 5;
 
  for (v=0; v<numInputs; v++)
  { if (Continuous(v))
    { max = -MAXDOUBLE;
      min =  MAXDOUBLE;
      for (i=0; i<numTrain; i++)
      { if (train[i]->in[v].f!=F_DONT_CARE && train[i]->in[v].f!=F_DONT_KNOW)
        { if (train[i]->in[v].f < min)
  	  min = train[i]->in[v].f;
	  if (train[i]->in[v].f > max)
	    max = train[i]->in[v].f;
        }
      }
      width = (max - min) / ((double)discNumStates);
      if (width<=0.0)
	width=1.0;
      
      minVal[v]=min;
      maxVal[v]=max;
      theWidth[v]=width;
      
      for (i=0; i<numInstances; i++)
      { if (instance[i].in[v].f==F_DONT_CARE)
	instance[i].in[v].d=DONT_CARE;
	else if (instance[i].in[v].f==F_DONT_KNOW)
	  instance[i].in[v].d=DONT_KNOW;
	else
	{ instance[i].in[v].d = (int)((instance[i].in[v].f - min)/width);
	  if (instance[i].in[v].d < 0)
	    instance[i].in[v].d = 0;
	  if (instance[i].in[v].d >= discNumStates)
	    instance[i].in[v].d = discNumStates-1;
	}
      } /* for each instance i */
      numValues[v]=discNumStates;
    } /* if continuous */
    else if (numStates[v]<-1)
      numValues[v]= -numStates[v];
    else numValues[v]= numStates[v];
  }
  if (debug>1)
  { int t;
    printf("After discretization:\n");
    printf("Training set:\n");
    for (t=0; t<numTrain; t++)
      PrintInstance(train[t]);
    printf("Test set:\n");
    for (t=0; t<numTest; t++)
      PrintInstance(test[t]);
  }
} /* Discretize */

/******************** BuildVDM *****************************************/
void BuildVDM()
{ /* Builds the array P[numInputs][numStates][numClasses].
     P[v][a][c] = the number of times variable 'v' had attribute value 'a'
     and output class 'c' divided by the number of times 'v' had value 'a'.
     Assumes that "Discretize" has already been run, and that a training
     set has already been built. If variable 'v' NEVER had attribute value
     'a' and output class 'c', then P[v][a][c]=0.*/
  int v,ns,i,c,s;
  if (P==NULL)
  { P = (double ***) Allocate(sizeof(double **) * numInputs,"P");
    for (v=0; v<numInputs; v++)
    { P[v] = (double **) Allocate(sizeof(double *) * (numValues[v]+1),"P[v]");
      for (s=0; s<numValues[v]+1; s++)
	 P[v][s]=(double *) Allocate(sizeof(double) * numClasses,"P[v][s]");
    }
  }
  
  /* Zero out the whole P array. */
  for (v=0; v<numInputs; v++)
    for (s=0; s<numValues[v]+1; s++) /* add one to account for '?' */
      for (c=0; c<numClasses; c++)
	 P[v][s][c]=0.0;

  /* Set P[var][val][class] to the number of times in the training set
     that attribute 'var' has value 'val' and the output is 'class'. */
  for (v=0; v<numInputs; v++)
    for (i=0; i<numTrain; i++)
    { int val, class;
      val = train[i]->in[v].d;
      class = train[i]->class;
      if (class < 0 || class >= numClasses)
	Barf("Class out of range in BuildVDM");
      if (val==DONT_KNOW && unknownVDM==1)
        P[v][numValues[v]][train[i]->class] += 1.0;	
      else if (val!=DONT_KNOW && val!=DONT_CARE)
	P[v][val][train[i]->class] += 1.0;
    } /* for i & v */
  
  /* Now set P[var][val][class] to the number of times in the training set
     that attribute 'var' has value 'val' and the output is 'class' DIVIDED
     BY #times attribute 'var' has value 'val' (and any class). 
     i.e., P[i][a][c] = Ni,a,c/Ni,a (going by my formula). */
  if (debug>=3)
  { printf("VDM P-values:\n");
    printf("var val class 0..%d\n",numClasses-1);
  }
  for (v=0; v<numInputs; v++)
  { int val;
    for (val=0; val<numValues[v]+1; val++)
    { double sum;
      sum=0.0;
      for (c=0; c<numClasses; c++)
	sum = sum + P[v][val][c];
      if (sum > 0)
      { for (c=0; c<numClasses; c++)
          P[v][val][c] = P[v][val][c] / sum;
      }
      if (debug>=3)
      { printf("%3d %3d",v,val);
	for (c=0; c<numClasses; c++)
	  printf("  %.2lf",P[v][val][c]);
	printf("\n");
      }
    } /* for each value 'val' */
  } /* for each variable v */
} /* BuildVDM */

/******************** DumpVDM ******************************************/
void DumpVDM()
{ /* Displays the probability values P[][][] and train[i]->p for the current
     partition of the training set.  Also displays the interpolated p-values
     using both IVDM and WVDM. */
  int v, val, c, i;
  static double *p=NULL;
  
  if (p==NULL)
    p = (double *) Allocate(sizeof(double) * numClasses,"p in DumpVDM");
  
  /** Do this:----------
  Dumping VDM...
  P[var][val][class]:
  var     val     mid     c0      c1      c2
  0       0       4.91    1.000   0.000   0.000
  0       1       5.53    0.000   0.667   0.333
  0       2       6.15    0.000   0.571   0.429
  0       3       6.77    0.000   0.500   0.500
  0       4       7.39    0.000   0.000   1.000
  0       '?'     0       0.000   0.000   0.000
  1       0       2.36    0.000   0.000   1.000
  1       1       2.68    0.000   0.800   0.200
  -----------... and so on */
  
  printf("Dumping VDM. numTrain=%d, numTest=%d\n",numTrain,numTest);
  printf("P[var][val][class]:\n");
  printf("var     val     mid     ");
  for (c=0; c<numClasses; c++)
    printf("c%-7d",c);
  printf("\n");
  for (v=0; v<numInputs; v++)
  { for (val=0; val<numValues[v]+1; val++)
    { printf("%-8d",v);
      if (val==numValues[v]) /* '?' */
	printf("%-8s%-8lg","?",0.0);
      else printf("%-8d%-8lg",val,minVal[v] + theWidth[v]*(((double)val) + 0.5));
      for (c=0; c<numClasses; c++)
	printf("%-8.3lg",P[v][val][c]);
      printf("\n");
    }
  }
  
  /**** Do this -------------------
    Interpolated P[var][val][class] (vs. discretized P[][][]):
    var	val	c0	c1	c2	dc0	dc1	dc2
    0	4.900	0.984	0	0	1	0	0
    0	5.100	0.694	0.204	0.102	1	0	0
    1	3.100	0.519	0.206	0.275	0.3	0.3	0.4
    1	3.800	0.167	0	0.333	0.333	0	0.667
    2	1.500	0.594	0	0	1	0	0
    2	1.500	0.594	0	0	1	0	0
    3	0.100	0.5	0	0	1	0	0
    3	0.300	0.917	0	0	1	0	0
  --------------------------------*/
  
  printf("\nInterpolated P[var][val][class] (vs. discretized P[][][]):\n");
  printf("var     val     ");
  for (c=0; c<numClasses; c++)
    printf("c%-7d",c);
  for (c=0; c<numClasses; c++)
    printf("dc%-6d",c);
  printf("\n");
  for (v=0; v<numInputs; v++)
    if (Continuous(v))
    { for (i=0; i<numTest; i++)
      { printf("%-8d",v);
	PrintDouble(test[i]->in[v].f,-8);
	InterpolateP(v,test[i],p);
	for (c=0; c<numClasses; c++) /* print IVDM's p's */
	  printf("%-8.3lg",p[c]);
	for (c=0; c<numClasses; c++) /* DVDM's p's */
	{ val = test[i]->in[v].d;
	  if (val==DONT_KNOW)
	    val=numValues[v];
	  if (val==DONT_CARE)
	    printf("%-8.3lg",0);
	  else printf("%-8.3lg",P[v][val][c]);
	}
	printf("\n");
      }
    }
  
  /***** Do this: --------------------
    wvdmList[i]->in[var].p[var][class]:
    var	val	c0	c1	c2
    0	4.6	1.000	0.000	0.000
    0	4.7	1.000	0.000	0.000
    0	4.8	1.000	0.000	0.000
    0	4.8	1.000	0.000	0.000
    0	4.9	1.000	0.000	0.000
    0	4.9	1.000	0.000	0.000
    0	5.2	1.000	0.000	0.000
    0	5.7	0.000	0.600	0.400
    0	5.7	0.000	0.600	0.400...
   -------------------- */
  
  printf("\nwvdmList[i]->in[var].p[var][class]: (i.e., p-value of each\n");
  printf("      attribute value using a WVDM 'window').\n");
  printf("var     val     ");
  for (c=0; c<numClasses; c++)
    printf("c%-7d",c);
  printf("\n");
  for (v=0; v<numInputs; v++)
    if (Continuous(v))
    { for (i=0; i<wvdmNum[v]; i++)
      { printf("%-8d",v);
	PrintDouble(wvdmList[v][i]->in[v].f,-8);
	for (c=0; c<numClasses; c++)
	  printf("%-8.3lg",wvdmList[v][i]->p[v][c]);
	printf("\n");
      }
    }
  
  /*** Do this ------------------------
    test[t]->in[var].p[class] (Interpolated from adjacent values):
    var	val	c0	c1	c2
    0	4.9	1.000	0.000	0.000
    0	5.1	1.000	0.000	0.000
    1	3.1	0.455	0.182	0.364
    1	3.8	0.000	0.000	1.000
    2	1.5	1.000	0.000	0.000
    2	1.5	1.000	0.000	0.000
    3	0.1	1.000	0.000	0.000
    3	0.3	1.000	0.000	0.000
  *---------------------------***************/
  
  printf("\ntest[t]->in[var].p[class] (Interpolated from adjacent values):\n");
  printf("var     val     ");
  for (c=0; c<numClasses; c++)
    printf("c%-7d",c);
  printf("\n");
  for (v=0; v<numInputs; v++)
    if (Continuous(v))
    { for (i=0; i<numTest; i++)
      { printf("%-8d",v);
	PrintDouble(test[i]->in[v].f,-8);
	InterpolateWP(v,test[i],p);
	for (c=0; c<numClasses; c++)
	  printf("%-8.3lg",p[c]);
	printf("\n");
      }
    }
  exit(0);
} /* DumpVDM */  
  
/******************** InterpolateP *************************************/
void InterpolateP(int v, Instance *a, double *q)
{ /* Finds the probability that the output class is 'c' for c=0..numClasses-1,
     given that the input value for variable 'v' is 'x' (which is discretized
     as 'd'), and puts this probability in p[c].  Interpolates between
     P[v][u][c] and P[v][u+1][c], where mid[u] <= x < mid[u+1]. 
     Assumes that 'v' is a Continuous attribute. */
  int u, c;  /* u=range whose midpoint is highest but still less than x. */
  double fu, /* floating-point value of u */
    mid1,    /* midpoint of range u */
    mid2,    /* midpoint of range u+1.  i.e., mid1 <= x < mid2 */
    p1, p2;  /* Probability at midpoint mid1&mid2 for current class c. */
  double x;
  
  u = a->in[v].d;
  fu = (double) u;
  x = a->in[v].f;
  while (x <  (mid1 = minVal[v] + theWidth[v]*(fu + 0.5)) )
  { u--;
    fu = (double) u;
  }
  mid2 = mid1 + theWidth[v];
  for (c=0; c<numClasses; c++)
  { if (u<0 || u >= discNumStates)
      p1 = 0.0;
    else p1 = P[v][u][c];
    if ((u+1)<0 || (u+1) >= discNumStates)
      p2 = 0.0;
    else p2 = P[v][u+1][c];
    if (mid1==mid2)
      q[c] = p1; /* which should equal p2.  Actually this shouldn't even happen*/
    else q[c] = p1 + ((x-mid1)/(mid2-mid1)) * (p2 - p1);
  } /* for */
} /* InterpolateP */

/************************ CompareVal *****************************/
int CompareVal(const void *aa, const void *bb)
{ /* Comparison function for the qsort() call in BuildWindowP
     and the BSearch() call in InterpolateWP.  looks at the given
     instance's value f for input 'globalVal' and returns -1 if a<b,
     0 if a=b, and 1 if a>b.*/
  Instance *a, *b, **c, **d;
  c = (Instance **) aa; /* c now points to a pointer to an instance */
  d = (Instance **) bb; /* so does d */
  a =  *c; /* a now points to an instance */
  b =  *d; /* b now points to another instance */
  if (debug>3)
  { printf("I think %lg is ",a->in[globalV].f);
    if (a->in[globalV].f < b->in[globalV].f)
      printf("<");
    else if (a->in[globalV].f > b->in[globalV].f)
      printf(">");
    else printf("=");
    printf(" to %lg for globalV=%d\n", b->in[globalV].f,globalV);
    PrintInstance(a);
    PrintInstance(b);
  }
  if (a->in[globalV].f < b->in[globalV].f)
    return -1;
  else if (a->in[globalV].f > b->in[globalV].f)
    return 1;
  else return(0);
} /* CompareVal */

/******************** BuildWindowP *************************************/
void BuildWindowP()
{ /* Build a list wvdmList[0..numInputs-1][0..numTrain-1] of instances,
     sorted by attribute value for each attribute.  Then compute
     p[0..numInputs-1][0..numClasses-1] for each continuous input
     value of each instance. O(numTrain*numInputs) operation. */
  /* Assumes wvdmList[numInputs][numInstances] has been allocated as well
     as instance[i].p[numInputs][numClasses]. */
  int first, i, last; /* Fist, current, and last instance in i's window.
			 Actually, 'last' points to the next instance just
			 outside of the window, so those in the window are
			 really first..last-1. */
  int v; /* variable currently being used */
  int c; /* class currently being considered */
  static int *nc=NULL; /* nc[c] = #instances in window with output of class c. */
  int n; /* #instances in window at all.  Note: n and nc[] don't count instances
	    with '?' or '*' as their value for the current variable */
  double min, max, x; /* min/max value in range; x=i's value. */
  double f; /* temporary double */
  Instance **ww;
  
  first=last=0;
  if (nc==NULL)
    nc = (int *) Allocate(sizeof(int) * numClasses,"nc");
  for (v=0; v<numInputs; v++)
    if (Continuous(v))
    { /* First sort wvdmList[v] by attribute value */
      for (i=0; i<numTrain; i++)
	wvdmList[v][i] = train[i];
      if (debug>1)
      { printf("Var. %d is continuous.  Before sort the order is:\n",v);
	for (i=0; i<numTrain; i++)
	  printf("[%d] %lg,",i,wvdmList[v][i]->in[v].f);
	printf("\n");
      }
      globalV = v; /* Use global to pass v to comparison function */
      
      /* Sort instances by variable v. */
      ww = wvdmList[v]; /* so ww[0..numTrain] point to train[0..numTrain] */
      qsort((void *)ww,numTrain,sizeof(Instance *),CompareVal);
      
      /* Get rid of all '?' and '*' */
      if (debug>1)
      { printf("After sort the order is:\n");
	for (i=0; i<numTrain; i++)
	  printf("[%d] %lg,",i,wvdmList[v][i]->in[v].f);
	printf("\n");
      }
      wvdmNum[v]=0;
      for (i=0; i<numTrain; i++)
        if (wvdmList[v][i]->in[v].f != F_DONT_CARE && wvdmList[v][i]->in[v].f != F_DONT_KNOW)
	  wvdmList[v][wvdmNum[v]++] = wvdmList[v][i];    
      if (debug>1)
      { printf("After stripping '?' and '*' wvdmNum[%d]=%d and the order is:\n",
	       v,wvdmNum[v]);
	for (i=0; i<wvdmNum[v]; i++)
	  printf("[%d] %lg,",i,wvdmList[v][i]->in[v].f);
	printf("\n");
      }
      
      /* Initialize window counters */
      for (c=0; c<numClasses; c++)
	nc[c]=0;
      n=0;
      first=last=0;
      
      for (i=0; i<wvdmNum[v]; i++)
      { /* Figure out the P value for instance i's attribute v.*/
	x = wvdmList[v][i]->in[v].f;
	if (x==F_DONT_CARE || x==F_DONT_KNOW)
	  Barf("Yikes! We have a don't care in BuildWindowP()");
	min = x - (theWidth[v]*0.5);
	max = min + theWidth[v];
	
	 /* Update 'n' to be #instances in window, and 'nc[c]' to
	    be #instances in window with output class 'c'. */
	if (debug>1)
	  printf("Window: min=%lg, x=%lg, max=%lg.\n",min,x,max);
	if (last<wvdmNum[v])
	  f = wvdmList[v][last]->in[v].f;
	while (f < max  && last<wvdmNum[v]) /* Widen window to include up to max*/
	{ nc[wvdmList[v][last]->class]++;
	  n++;
	  last++;
	  if (last<wvdmNum[v])
	    f = wvdmList[v][last]->in[v].f;
	}
	if (first < wvdmNum[v])             /* Shrink window to exclude up to min */
  	  f = wvdmList[v][first]->in[v].f;
	while (f < min && first < wvdmNum[v])
	{ nc[wvdmList[v][first]->class]--;
	  n--;
	  first++;
	  if (first < wvdmNum[v])
	    f = wvdmList[v][first]->in[v].f;
	}

        /* Calculate p[v][c] for each instance = probability that output
	   class is 'c' given that the input for variable 'v' is the
	   instance's value for variable 'v'. Note that we do NOT set
	   the p values for instances with '?' or '*' for variable 'v'.
	   However, this is ok, since such variables will automatically
	   get a value of 1 or 0 (respectively) for their distance. */
	for (c=0; c<numClasses; c++)
	{ if (n>0)
	  wvdmList[v][i]->p[v][c] = ((double)nc[c])/((double)n);
	  else wvdmList[v][i]->p[v][c] = 0.0;
	}
        if (debug)
	{ int j;
	  for (j=first; j<last; j++)
	    printf("  instance %d: %lg -> %d\n",i,wvdmList[v][j]->in[v].f,
		   wvdmList[v][j]->class);
	  for (c=0; c<numClasses; c++)
	    printf(" p(%d)=%d/%d=%lg\n",c,nc[c],n,wvdmList[v][i]->p[v][c]);
	}
      } /* if continuous */
    } /* for each instance i, and for each variable v */
} /* BuildWindowP */

/********************* BSearch *****************************************/
int BSearch(const void *key, const void *base, size_t nel, size_t size,
	      int (*compar)(const void *, const void *))
{ /* Generic binary search function similar to 'bsearch', except that if
     the element 'key' is not found in the list, the element just before
     'key' is returned.  So, it does a search on elements base[0..nel-1],
     and returns an index a into base[], where base[a]<=key<base[a+1].
     Returns -1 if key<base[0], and returns nel-1 if base[nel-1]<key.*/
  int first=0,middle,last=nel-1;
  int comp;
  Instance **b;
  b = (Instance **)base;
  do {
    middle = (first + last)>>1;
    comp = compar(&key,&b[middle]);
    if (comp<0)
      last = middle;
    else if (comp>0)
      first = middle;
    else return middle; /* found an exact match */
  } while (first < last-1);
  if (comp<0)              /* then key < last. */
  { if (first>0)           /* then key > first, too. */
      return first;
    compar(&key,&b[0]);    /* else key<last and perhaps key<first, too. */
    if (comp<0)    /* then yes, key<first, and first=0, so return NULL. */
      return -1;
    else return 0; /* first<key and key<last, so return first (i.e., 0). */
  }
  else /* comp>0, so key > first */
  { if (last < nel-1) /* then key<last, so return 'first' */
      return first;
    else compar(&key,&b[last]); /* compare key with very last element in array*/
    if (comp<0)
      return first;
    else return last;
  }
  printf("Yikes! We're not supposed to get here!\n");
} /* BSearch */

/******************** InterpolateWP ************************************/
void InterpolateWP(int v, Instance *a, double *p)
{ /* Given an instance 'a' from the test set, find the two instances
     in wvdmList[v][0..numTrain-1] whose values span a->in[v].f, and interpolate
     the p for each class from these two instances.  Assumes that a->in[v].f
     is a real number, not '?' or '*', and that wvdmList[v][0..wvdmNum[v]] does
     not have any instances with '?' or '*' for attribute 'v'. */
  Instance *a1, *a2; /* Two instances whose values for variable 'v' span a's.*/
  double x1, x2, x; /* the values for variable 'v' for a1 and a2 and a. */
  double p1, p2;    /* the probability value for current class at x1&x2. */
  int c;
  int i; /* Index of which instances surround 'a' in attribute 'v'. */
  
  x = a->in[v].f;
  globalV = v;
  i = BSearch(a,wvdmList[v],wvdmNum[v],sizeof(Instance *),CompareVal);
  if (i<0)
  { a1=NULL; /* so x1=min[v]-width[v]/2, and p1=0 for all c. */
    x1 = minVal[v] - theWidth[v]*0.5;
    p1 = 0.0;
  }
  else 
  { a1 = wvdmList[v][i];
    x1 = a1->in[v].f;
  }
  
  /* Find the next larger attribute value in wvdmList. */
  if (i<0)
     i=0; /* Make sure we don't try to access wvdmList[v][-1]! */
  while (i<wvdmNum[v] && wvdmList[v][i]->in[v].f < x)
    i++; /* Really shouldn't happen more than once. */
  if (i<wvdmNum[v])
  { a2 = wvdmList[v][i];
    x2 = a2->in[v].f;
  }
  else
  { a2 = NULL;
    x2 = maxVal[v] + theWidth[v]*0.5;
    p2 = 0.0;
  }
  
  for (c=0; c<numClasses; c++)
  { if (a1!=NULL)
      p1 = a1->p[v][c];
    if (a2!=NULL)
      p2 = a2->p[v][c];
    if (x2==x1)
      p[c] = p1; /* which should equal p2 */
    else p[c] = p1 + ((x-x1)/(x2-x1)) * (p2-p1);
  } /* For each class c */
} /* InterpolateWP */

/*********************** dvdm *************************************/
double dvdm(int v, Instance *a, Instance *b)
{ /* Returns the Discretized Value Difference Metric distance between
     attribute 'v' of instances 'a' and 'b'.  One or both values could
     be '?'. Assumes 'a' and 'b' are both pointers to valid instances,
     and that neither value is '*' (dont' care).  Uses the discrete
     values for attribute v, so if v is Continuous, assumes that
     "Discretize" has already been run. */
  double sum=0.0, dist;
  int c, val1, val2;
  
  val1 = a->in[v].d;
  if (val1==DONT_KNOW)
    val1 = numValues[v];
  val2 = b->in[v].d;
  if (val2==DONT_KNOW)
    val2 = numValues[v];
  for (c=0; c<numClasses; c++)
  { dist = P[v][val1][c] - P[v][val2][c];
    sum = sum + (dist*dist);
  }
  dist = sum; /*  Don't do this anymore: * (double)numClasses; */
  /* To get distance in range 0..1 instead of 0..1/numClasses.
    'dist' is in range 0..1/C (on average), so dist*dist is
    in range 0..1/C^2, so sum is in range 0..C(1/C^2)=0..1/C,
    so we multiply by numClasses to get range 0..1.  Because of
    squared distances, the value will still usually be << 1.*/
  /* Actually, further research indicates that better normalization
    is achieved by NOT multiplying by numClasses.  The range of
    'dist' is 0..1/C on average, but can be anywhere in the range
    0..1, especially when very different values are used.  Thus
    the max distance is in range C*(0..1)=C.  Multiplying by C
    again puts the distance in the range 0..C^2.  Empirically,
    NOT multiplying by C results in better normalization. */
  return dist;
} /* dvdm */

/*********************** ivdm *************************************/
double ivdm(int v, Instance *a, Instance *b)
{ /* Returns the (squared) Interpolated Value Difference Metric distance
     between attribute 'v' of instances 'a' and 'b'.  One or both values
     could be '?'. Assumes 'a' and 'b' are both pointers to valid instances,
     and that neither value is '*' (dont' care).  Uses the continuous
     values for attribute v, so assumes that v is continuous. */
  double sum=0.0, dist;
  int c;
  
  if (a->in[v].d == DONT_KNOW)
  { for (c=0; c<numClasses; c++)
      p1[c] = P[v][numValues[v]][c];
  }
  else InterpolateP(v,a,p1);
  if (b->in[v].d == DONT_KNOW)
  { for (c=0; c<numClasses; c++)
      p2[c] = P[v][numValues[v]][c];
  }
  else InterpolateP(v,b,p2);
  for (c=0; c<numClasses; c++)
  { dist = p1[c] - p2[c];
    sum  = sum + (dist * dist);
  }
  dist = sum;
  return(dist);
} /* ivdm */

/*********************** wvdm *************************************/
double wvdm(int v, Instance *a, Instance *b)
{ /* Returns the (squared) Windowed Value Difference Metric distance
     between attribute 'v' of instances 'a' and 'b'.  One or both values
     could be '?'. Assumes 'a' and 'b' are both pointers to valid instances,
     and that neither value is '*' (dont' care).  Uses the continuous
     values for attribute v, so assumes that v is continuous. */
  double sum=0.0, dist;
  int c;
  
  if (a->in[v].d == DONT_KNOW)
  { for (c=0; c<numClasses; c++)
      p1[c] = P[v][numValues[v]][c];
  }
  else InterpolateWP(v,a,p1);
  if (b->in[v].d == DONT_KNOW)
  { for (c=0; c<numClasses; c++)
      p2[c] = P[v][numValues[v]][c];
  }
  else
  { /* We could do "InterpolateWP(v,b,p2);", but since 'b' is a training
	 instance, there is no real need to interpolate its 'p', since it
	 has already been calculated from its window. We will just set
	 p2[c] to b->p[v][c] for each c, instead of interpolating them. */
    for (c=0; c<numClasses; c++)
      p2[c] = b->p[v][c]; 
  }
  for (c=0; c<numClasses; c++)
  { dist = p1[c] - p2[c];
    sum  = sum + (dist * dist);
  }
  dist = sum;
  return(dist);
} /* wvdm */

/******************** FindDimDistance **********************************/
double FindDimDistance(int v, Instance *a, Instance *b, int metric)
{ /* Finds the squared distance--in the v-dimension only--between instances 
     a and b, using the 'metric' indicated.
     Returns MAXDOUBLE if either instance is NULL.*/
  double dist;
  int c;
  double sum;
  
  /* Screen for bogus instances or variables...*/
  if (a==NULL || b==NULL)
    return(MAXDOUBLE);
  if (v<0 || v>=numInputs)
    Barf("Bad value for 'v' in FindDimDistance");
  if (p1==NULL)
  { p1 = (double *) Allocate(sizeof(double)*numClasses,"p1");
    p2 = (double *) Allocate(sizeof(double)*numClasses,"p2");
  }
  /* Check for '*' */
  if (Discrete(v) && (a->in[v].d==DONT_CARE || b->in[v].d==DONT_CARE))
    return(0.0); /* distance to '*' is always 0.0 */
  if (Continuous(v) && (a->in[v].f==F_DONT_CARE || b->in[v].f==F_DONT_CARE))
    return(0.0); /* distance to '*' is always 0.0 */
  
  if (!unknownVDM)
  { /* Then use 1.0 as distance when either value is '?' */
    if (Discrete(v) && (a->in[v].d==DONT_KNOW || b->in[v].d==DONT_KNOW))
      return 1.0;
    if (Continuous(v) && (a->in[v].f==F_DONT_KNOW || b->in[v].f==F_DONT_KNOW))
      return 1.0;
  }
  /* Assume now that we have two actual values to compare, unless 'unknownVDM'
     is true, in which case '?' will be treated as a special VDM value.
                  |-Linear-|
            |Discrete|
             Nom. Int. Cont.
     Euclid   E    E    E     where E=Euclidean (linear) distance.
     DVDM     D    D    D           D=Discretized VDM
     IVDM     D    D    I           I=Interpolated VDM
     WVDM     D    D    W           W=Windowed VDM
     HVDM     D    E    E           O=Overlap (Hamming) distance (1 or 0).
     HCGC     O    E    E  
     ****************************/
  if (metric==HCGC)
  { /* Heterogeneous distance function proposed by Christophe Giraud-Carrier (CGC)
       Uses Overlap metric (hamming distance: 1 if different, 0 if same)
       for nominal attributes, and Euclidean distance normalized by range
       for linear attributes, thus putting the distance for each attribute
       in the approximate range 0..1.*/
    if (a->in[v].d==DONT_KNOW || b->in[v].d==DONT_KNOW)
      dist=1.0;
    else if (Nominal(v))
    { if (a->in[v].d == b->in[v].d)
        dist = 0.0;
      else dist = 1.0;
    }
    else /* Linear */
    { if (Continuous(v))
        dist = a->in[v].f - b->in[v].f;
      else dist = (double)(a->in[v].d - b->in[v].d);
      dist = dist * rangeFac[v]; /* normalize: may still be negative */
      dist = dist * dist;        /* square: definately >= 0 now */
    }
  } /* HCGC */
  else if (metric==DVDM 
      || (Nominal (v) && metric!=EUCLIDEAN)
      || (Discrete(v) && metric!=EUCLIDEAN && metric!=HVDM))
  { /* Use Discretized Value Difference Metric */
    dist = dvdm(v,a,b);
  }
  else if (Continuous(v))
  { if (metric==IVDM)
      dist = ivdm(v,a,b); /* Interpolate between discretized VDM values */
    else if (metric==WVDM)
      dist = wvdm(v,a,b); /* Windowed VDM */
    else /* EUCLIDEAN or HETERO */
    { /* Use floating-point Euclidean distance */
      if (a->in[v].f==F_DONT_KNOW || b->in[v].f==F_DONT_KNOW)
	dist = 1.0; /* Difference to unknown is always 1.0 for Euclidean. */
      else
      { dist = normFac[v] * (a->in[v].f - b->in[v].f);
        dist = dist*dist;
      }
    }
  }
  else /* Nominal & Euclid., or Linear & Euclid-or-hetero */
  { /* Use integer Euclidean distance */
    if (a->in[v].d==DONT_KNOW || b->in[v].d==DONT_KNOW)
      dist = 1.0; /* Difference to unknown is always 1.0 for Euclidean. */
    else
    { dist = normFac[v] * (double) (a->in[v].d - b->in[v].d);
      dist = dist*dist;
    }
  }
  return(dist);
} /* FindDimDistance */

/********************* Distance *********************************/
double Distance(Instance *a, Instance *b, int metric)
{ /* Finds the distance^2 between two instances 'a' and 'b' using the
     given 'metric'.  Returns sum of the squared differences of the
     individual attributes. Does not take the square root of the distance. */
  int v;
  double d;
  if (a==NULL || b==NULL)
    Barf("Null instance in 'Distance");
  d=0.0;
  for (v=0; v<numInputs; v++)
    d = d + FindDimDistance(v,a,b,metric);
  return(d);
} /* Distance */

/********************** SameClass ******************************/
int SameClass(Instance *a, Instance *b)
{ /* Returns 1 if instances a and b are of the same class, else returns 0.
     a and b are */
  if (a==NULL || b==NULL)
    Barf("Null instance in SameClass");
  if (a->class == b->class)
    return(1);
  else return(0);
} /* SameClass */

/**************** Allocate *****************************************/
char *Allocate(long size, char *s)
{ char *p;
  p = (char *) malloc(size);
  if (p==NULL)
  { printf("Couldn't allocate memory for %s\n",s);
    exit(0);
  }
  else return(p);
} /* Allocate */

/**************** AllocateStuff *************************************/
void AllocateStuff()
{ /* Allocate dynamic arrays test[numInstances], train[numInstances],
     proj[numInputs][numInstances], myProj[numInputs][numInstances],
     indiv[popSize]. */
  int t,s,v;
  test = (Instance **) Allocate(sizeof(Instance *) * numInstances,"test");
  train = (Instance **) Allocate(sizeof(Instance *) * numInstances,"train");
  normFac = (double *) Allocate(sizeof(double) * numInputs,"normFac");
  rangeFac= (double *) Allocate(sizeof(double) * numInputs,"rangeFac");
  minVal  = (double *) Allocate(sizeof(double) * numInputs,"minVal");
  maxVal  = (double *) Allocate(sizeof(double) * numInputs,"maxVal");
  theWidth= (double *) Allocate(sizeof(double) * numInputs,"theWidth");
  
  accuracy  = (double **) Allocate(sizeof(double *)*STYLES,"accuracy");
  for (s=0; s<STYLES; s++)
    accuracy[s] = (double *) Allocate(sizeof(double)*partitions,"accuracy[s]");
  results = (double ***) Allocate(sizeof(double **) * partitions, "results[][][]");
  for (t=0; t<partitions; t++)
    results[t] = (double **) Allocate(sizeof(double *) * STYLES, "results[trial][][]");
  for (t=0; t<partitions; t++)
  { for (s=0; s<STYLES; s++)
      if (incremental)
	results[t][s] = (double *) Allocate(sizeof(double)*units,"results[trial][style][]");
      else results[t][s] = NULL; /* flag as non-incremental */
  }
  wvdmList = (Instance ***) Allocate(sizeof(Instance **) * numInputs,"wvdmList");
  for (v=0; v<numInputs; v++)
  { if (Continuous(v))
    wvdmList[v] = (Instance **) Allocate(sizeof(Instance *)*numInstances,"wvdmList[v]");
    else wvdmList[v]=NULL;
  }
  wvdmNum = (int *) Allocate(sizeof(int) * numInputs,"wvdmNum");
}  /* AllocateStuff */

/************* ShuffleNominalValues ****************************/
void ShuffleNominalValues()
{ /* Randomly shuffles the order of attribute values for each nominal
     input value.  This is done to see if the ordering of nominal values
     are truly arbitrary, or whether there is really some inherent
     ordering even in seemingly unordered values. */
  int v, i, j, temp;
  int *r;
  for (v=0; v<numInputs; v++)
     if (Nominal(v))
     { r = (int *) Allocate(sizeof(int) * numStates[v],"r (shuffle nominal)");
       for (i=0; i<numStates[v]; i++)
	 r[i]=i;
       for (i=0; i<numStates[v]; i++)
       { j = rndInt(numStates[v]);
	 temp = r[i];
	 r[i] = r[j];
	 r[j] = temp;
       }
       for (i=0; i<numInstances; i++)
       { int oldVal, newVal;
	 oldVal = instance[i].in[v].d;
	 newVal = r[oldVal];
	 instance[i].in[v].d = newVal;
       }
       free(r);
     } /* for v */
} /* ShuffleNominalValues */

/************* PRINTING ROUTINES *******************************/
/**************** PrintNum *************************************/
void PrintNum(int num, int width)
{ /* Prints an integer in the field width given */
  /* Translates DONT_KNOW into '?' and DONT_CARE into '*' */
  if (num>=0)
    printf("%*d",width,num);
  else if (num==DONT_CARE)
    printf("%*c",width,'*');
  else if (num==DONT_KNOW)
    printf("%*c",width,'?');
  else printf("%*d",width,num); /* shouldn't ever happen. */
} /* PrintNum */

/******************* PrintDouble ********************************/
void PrintDouble(double d, int width)
{ /* Prints a double in the field width given (use negative for left-
     justified), translates F_DONT_KNOW into '?' and F_DONT_CARE into '*'. */
  if (d==F_DONT_CARE)
    printf("%*s","*");
  else if (d==F_DONT_KNOW)
    printf("%*s","?");
  else printf("%*.3lg",width,d);
} /* PrintDouble */

/********************* PrintInstance ****************************/
void PrintInstance(Instance *i)
  { /* Print a single instance, i */
    int v;
    for (v=0; v<numVars; v++)
      { if (v==numInputs)
          printf(" -> ");
        if (Continuous(v))
	{ if (i->in[v].f == F_DONT_CARE)
	    printf("    *    (*) ");
	  else if (i->in[v].f == F_DONT_KNOW)
	    printf("    ?    (?) ");
          else printf("%8.3lg (%2d)",i->in[v].f, i->in[v].d);
	}
	else printf("%4d ",i->in[v].d);
      }
    printf(" [%d]",i->class);
    printf("\n");
  } /* PrintInstance */

/********************** PrintInstances **************************/
void PrintInstances()
{ /* Print the entire list of instances. */
  int i;
  printf("Inputs: %d;  Outputs: %d;  Instances: %d\n",numInputs,numOutputs,numInstances);
  printf("  NumStates[i]=");
  for (i=0; i<numVars; i++)
    printf("%d ",numStates[i]);
  printf("\n");
  for (i=0; i<numInstances; i++)
  { printf("%5d) ",i);
    PrintInstance(&instance[i]);
  } 
  printf("\n");
} /* PrintInstances */

/********** Continuous/Discrete/Linear/LinearDiscrete/Nominal *********/
int Continuous(int v)    { if (numStates[v] == 0) return(1); else return(0); }
int Nominal(int v)       { if (numStates[v] >  0) return(1); else return(0); }
int LinearDiscrete(int v){ if (numStates[v] < -1) return(1); else return(0); }
int Discrete(int v)      { if (numStates[v]< -1 || numStates[v]>0) return(1); else return(0);}
int Linear(int v)        { if (numStates[v]< -1 || numStates[v]==0)return(1); else return(0);}

/******************** FILE I/O *********************************/
/******************** ReadFile *********************************/
void ReadFile(char *filename)
{ /* Reads the file 'oldfilename' (in NUM format) into a list, and returns
     a pointer to the list. */
  FILE *fd;
  int done=0, var,v, c, i, iNum;
  int *oldNumStates;
  int oldNumInputs, oldNumOutputs;
  int debug=0;
  char junk[MAX_LEN];
  
  if (debug>0 || debug > 0) printf("Reading %s...\n",filename); 
  fd = OpenFile(filename,"rt");
  
  /***** Read Header ******/
  if (debug>1 || debug > 1) printf("Reading Header...\n");
  oldNumInputs  = numInputs    = GetInteger(fd);
  oldNumOutputs = numOutputs   = GetInteger(fd);
  numInstances  = GetInteger(fd);
  numVars       = numInputs + numOutputs;
  oldNumStates  = (int *) Allocate(numVars * sizeof(int),"oldNumStates");
  numStates     = (int *) Allocate(numVars * sizeof(int),"numStates");
  numValues     = (int *) Allocate(numVars * sizeof(int),"numValues");

  /*** Get number of states for each variable.  Strip "ignores" and analogs. */
  for (var=0; var<numVars; var++)
    oldNumStates[var]=GetInteger(fd);
  for (v=var=0; var<numVars; var++)
    { if (oldNumStates[var]==0 && var >=oldNumInputs)
        Barf("Continuously-valued outputs are not allowed.");
      if (oldNumStates[var]== -1) /* 'ignore' variable */
        { if (var<oldNumInputs)
            numInputs--;
          else numOutputs--;
        }
      else numStates[v++]=oldNumStates[var];
    }
  numVars=numInputs+numOutputs;

   /** Allocate arrays to hold inputs and outputs.  Note that we first
       allocate an array of pointers, then an array for each pointer.
       Also, 'inf' points to the arrays for floating-point inputs (&outputs),
       and 'inn' points to the arrays for integer inputs.  For each variable,
       either 'inn' or 'inf' points to a real array, and the other is 'NULL'.
       Also, inf[numInputs..numInputs+numOutputs-1] point to the same arrays
       as outf[0..numOuputs-1] to make it easier to allocate them.  The same
       is true with 'inn' and 'outn' **/
  instance = (Instance *) Allocate(numInstances * sizeof(Instance),"instance");
  for (i=0; i<numInstances; i++)
  { instance[i].in = (Value *) Allocate(numVars * sizeof(Value),"value");
    instance[i].out = &instance[i].in[numInputs];
  }

  i=0;
  /** Read each instance */
  for (iNum=0; iNum<numInstances && !feof(fd); iNum++)
    { 
      for (v=var=0; var<oldNumInputs+oldNumOutputs; var++)
        { if (oldNumStates[var] == -1) /* ignore it */
            GetValue(fd,junk);
          else if (oldNumStates[var]==0) /* get analog */
            instance[i].in[v++].f=GetReal(fd);
          else instance[i].in[v++].d=GetInteger(fd);
        }
      if (oldNumStates[var] == 0 ||
	  (instance[i].out[0].d != DONT_KNOW &&
	   instance[i].out[0].d != DONT_CARE))
	i++;
    } /* for each instance */
  if (numInstances > i)
    printf("Deleted %d instances with bogus outputs.\n",numInstances-i);
  numInstances=i;
  free(oldNumStates);
  
  /** Assign 'class' of each output, in case there are multiple outputs */
  for (i=0; i<numInstances; i++)
  { instance[i].class = instance[i].out[0].d;
    for (v=1; v<numOutputs; v++)
    { int ns;
      ns = numStates[numInputs+v];
      if (ns< -1)
	 ns = -ns;
      if (ns>0)
	instance[i].class = (instance[i].class * ns) + instance[i].out[v].d;
    }
  } /* for */
  numClasses = 1;
  for (v=0; v<numOutputs; v++)
  { if (numStates[numInputs+v] < -1)
      numClasses = numClasses * (-numStates[numInputs+v]);
    else if (numStates[numInputs+v] > 0)
      numClasses = numClasses * numStates[numInputs+v];
    else if (numStates[numInputs+v] == 0)
      printf("Warning: continuous output.\n");
  }
  /* Allocate memory for WVDM's p[var][class] */
  for (i=0; i<numInstances; i++)
  { instance[i].p = (double **) Allocate(sizeof(double *)*numInputs,"instance[i].p");
    for (v=0; v<numInputs; v++)
    { if (numStates[v]==0) /* Continuous */
        instance[i].p[v] = (double *) Allocate(sizeof(double) * numClasses, "instance[i].p[v]");
      else instance[i].p[v] = NULL; /* Discrete attr. don't need p-values*/
    }
  }
} /* ReadFile */

/************************ GetInteger ******************************/
int GetInteger(fd)
  FILE *fd;
{ /* Reads a number from the file fd. If it reads a '*', it returns
     DONT_CARE (-1); if it reads a '?' it returns DONT_KNOW (-2);
     Calls GetChar(fd) for each character, which skips comments.
     Comments can be enclosed in {}'s or preceded by '|'*/
  char ch,buf[MAX_LEN];
  int  pos, val;
  ch=' ';
  while ((ch = GetChar(fd))<=' ' && !feof(fd))
    ; /* eat white space */
  if (ch=='*')
    val = DONT_CARE;
  else if (ch=='?')
    val = DONT_KNOW;
  else
    { pos=0;
      while (!feof(fd) && ((ch>='0' && ch<='9') || ch=='-'))
        { buf[pos++]=ch;
          ch = GetChar(fd);
        }
      ungetc(ch,fd); /* put back non-digit character */
      buf[pos]='\0';
      val = atoi(buf);
    }
  return(val);
} /* GetInteger */

/************************ GetReal ******************************/
double GetReal(fd)
  FILE *fd;
{ /* Reads a floating-point value from the file fd, and returns it.
     Returns F_DONT_CARE if it reads a '*', and it returns
     F_DONT_KNOW (-1) if it reads a '?'.  These two special values
     correspond to the largest negative and positive double values.
     Calls GetChar(fd) for each character, which skips comments.
     Comments can be enclosed in {}'s or preceded by '|'*/
  char ch,buf[MAX_LEN];
  int  pos;
  double d = 0.0;
  ch=' ';
  while ((ch = GetChar(fd))<=' ' && !feof(fd))
    ; /* eat white space */
  if (ch=='*')
    d = F_DONT_CARE;
  else if (ch=='?')
    d = F_DONT_KNOW;
  else
    { pos=0;
      while (!feof(fd) && ch>' ')
        { buf[pos++]=ch;
          ch = GetChar(fd);
        }
      ungetc(ch,fd); /* put back non-digit character */
      buf[pos]='\0';
      d = atof(buf);
    }
  return(d);
} /* GetReal */

/************************ GetValue *****************************/
void GetValue(fd,s)
  FILE *fd; char *s;
{ /* Reads a white-space-delimited string from the file 'fd', and puts
     it into the string 's'.  Eats leading white space, and skips over
     comments by using GetChar.  Comments can be enclosed in '{' and '}',
     or preceded by '|' on a line.  Reads at most MAX_LEN characters.
     Use this routine to read values of actual instances. */
  char ch;
  int pos=0;
  while (!feof(fd) && (ch=GetChar(fd)) <= ' ')
    ; /* eat white space */
  while (!feof(fd) && (s[pos++]=ch) > ' ' && pos<MAX_LEN)
    ch=GetChar(fd);
} /* GetValue */


/************************ GetChar ******************************/
char GetChar(fd)
  FILE *fd;
{ /* Reads a character from the fild fd.  If it reads a '|', it
     assumes there is a comment until the end of line, and thus
     reads until '\n', and returns '\n'.  If it reads a '{', it
     assumes there is a comment until '}', which it disposes of,
     after which it continues to try to read a character. */
  char ch;
  ch = fgetc(fd);
  if (ch=='|')
    { while (ch!='\n' && !feof(fd))
        ch=fgetc(fd);
      return('\n');
    }
  if (ch=='{')
    { while (ch!='}' && !feof(fd))
        ch=fgetc(fd);
      return(GetChar(fd));
    }
  return(ch);
} /* GetChar */

/*********************** OpenFile *****************************/
FILE *OpenFile(filename, mode)
  char *filename, *mode;
  { FILE *fd;
    fd = fopen(filename,mode);
    if (fd==NULL)
      { printf("Couldn't open file '%s' in mode '%s'.\n",filename,mode);
        printf("(Program terminated).\n");
        exit(0);
      }
    return(fd);
  } /* OpenFile */

/************************ Barf *********************************/
void Barf(char *s)
{ /* Prints an error message and terminates. */
  printf("%s\n",s);
  exit(0);
} /* Barf */
