/* munge.c -- simplistic program to collect program component statistics
 *
 * Usage:  munge file1 file2 ....
 *
 * The input files are assumed to be formatted from the PDF ISU CoP protocols
 * using "pdftotext -layout".  This program will parse the files and extract
 * the program component marks for each competitor, and do some statistical
 * analysis on them.  Specifically, we are interested in the standard deviation
 * of the marks for each competitor, by program component and by judge.
 * If the judges are marking the program components the way the ISU asserts
 * they are supposed to be, then we will see less variation between
 * different judges' marks for the same component, than in the same judge's
 * marks for different components for that competitor.
 *
 * The output is some statistics for the entire set of input files as a whole,
 * plus more detailed statistics for each segment and competitor if you've
 * defined the right compile-time verbosity flags.
 *
 * This program is (c) 2003 by Sandra Loosemore.  Permission is granted to
 * modify and redistribute this program in source form provided that
 * the original authorship and source is acknowledged.
 */

/* Control verbosity of output here.  Each flag is tested with #if, so set
 * it to 1 to enable the corresponding output and 0 to compile it out.
 */
/* Print the parsed marks for every competitor (one row per component,
 * one column per judge); useful for checking the parser.
 */
#define PRINT_MARKS 0
/* Print the mean and standard deviation for every individual component
 * and every individual judge of every competitor.
 */
#define PRINT_DETAILED_SDEV 0
/* Print each competitor's average standard deviation by component and
 * by judge.
 */
#define PRINT_SDEV_BY_SKATER 0
/* Print a summary after each input file (segment) has been processed. */
#define PRINT_AVERAGE_SDEV_BY_SEGMENT 1

/* Maximum number of judges' marks stored per component.  The sscanf call
 * in collect_competitor_statistics spells out one destination per judge
 * by hand, so that call must be edited too if this value changes.
 */
#define MAXJUDGES 14
/* Maximum number of program components stored per competitor.  Reading
 * this many component lines without reaching the "Interpretation" line
 * is treated as a fatal error.
 */
#define MAXCOMPONENTS 5
/* Size in bytes of the buffer used to read one line of input with fgets. */
#define BUFSIZE 1024


#include <ctype.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Running totals over all input files.  They are updated once per file,
 * at the end of collect_segment_statistics, by adding in the segment
 * values below.
 */

/* Sum of the standard deviations computed across one judge's marks for
 * the different components of one competitor, and the number of
 * standard deviations summed.
 */
float total_sdev_within_judge = 0.0;
int n_total_sdev_within_judge = 0;

/* Sum of the standard deviations computed across the different judges'
 * marks for one component of one competitor, and the number of
 * standard deviations summed.
 */
float total_sdev_within_component = 0.0;
int n_total_sdev_within_component = 0;

/* Number of competitors processed, and how many of them had a larger
 * average standard deviation by judge than by component.
 */
int total_competitors = 0;
int total_more_judge_than_component = 0;

/* The same quantities for the file currently being processed.  They are
 * reset at the start of collect_segment_statistics and updated by
 * collect_competitor_statistics for each competitor.
 */
float segment_sdev_within_judge = 0.0;
int n_segment_sdev_within_judge = 0;

float segment_sdev_within_component = 0.0;
int n_segment_sdev_within_component = 0;

int segment_competitors = 0;
int segment_more_judge_than_component = 0;


/* Helper function:  skip any leading whitespace in C, then require PREFIX
 * to appear next.  Returns a pointer to the first character after the
 * prefix, or NULL if the remaining text does not begin with the prefix.
 * The comparison is case-sensitive, and an empty prefix always matches.
 */

const char *match_prefix (const char *c, const char *prefix)
{
  /* The <ctype.h> functions are only defined for EOF and values that fit
   * in an unsigned char; plain char may be negative for non-ASCII bytes,
   * so convert before classifying.  isspace is false for the terminating
   * NUL, so this loop cannot run off the end of the string.
   */
  while (isspace ((unsigned char) *c))
    c++;

  /* A NUL in C never equals a non-NUL prefix character, so running out
   * of input before the prefix is exhausted also lands on the mismatch
   * return.
   */
  while (*prefix)
    {
      if (*c != *prefix)
        return NULL;
      c++;
      prefix++;
    }
  return c;
}


/* Helper for collect_competitor_statistics:  compute the mean and the
 * population standard deviation (divisor COUNT, not COUNT - 1) of the
 * COUNT floats in VALUES.  The mean is stored through MEAN and the
 * standard deviation is returned.  COUNT must be positive.
 */

static float mean_and_sdev (const float *values, int count, float *mean)
{
  float total = 0.0;
  float diff;
  int i;

  for (i = 0; i < count; i++)
    total += values[i];
  *mean = total / count;

  total = 0.0;
  for (i = 0; i < count; i++)
    {
      diff = values[i] - *mean;
      total += (diff * diff);
    }
  return sqrt (total / count);
}

/* Read marks for a single competitor, and process them.
 *
 * Lines are read from FP until the "Interpretation" component line has
 * been parsed; that line is taken to end the competitor's set of
 * component marks.  Lines that do not start with a known component label
 * are skipped.  Each component line is expected to hold a factor, one
 * mark per judge, and a final panel value.
 *
 * Returns 1 after a competitor has been processed, in which case the
 * segment_* accumulators have been updated.  Returns 0 if end of input
 * (or a read error) is hit first; any component lines read for an
 * unfinished competitor are then discarded.
 *
 * A component line whose numbers cannot be parsed, a change in the number
 * of judges within one competitor, or MAXCOMPONENTS component lines
 * without an "Interpretation" line is fatal:  a diagnostic is written to
 * stderr and the program exits with a failure status.
 */

int collect_competitor_statistics (FILE *fp)
{
  /* Labels of the component lines that do not end a competitor's set.
   * "Performance/Execution" has to be tried before "Performance", which
   * is a prefix of it.
   */
  static const char *const other_labels[] =
    {
      "Skating Skills", "Transitions", "Performance/Execution",
      "Choreography", "Timing", "Performance"
    };
  const int n_other_labels
    = (int) (sizeof other_labels / sizeof other_labels[0]);

  float marks[MAXJUDGES][MAXCOMPONENTS];
  float column[MAXJUDGES];
  char buffer[BUFSIZE];
  const char *p;
  float factor, panel;
  float mean, sdev;
  float sdevbyjudge;
  float sdevbycomp;
  int njudges = 0;
  int ncomps = 0;
  int done = 0;
  int i, j, c, n;

  while (!done)
    {
      if (!fgets (buffer, BUFSIZE, fp))
        return 0;

      /* General idea is to skip over all input lines that don't match one
       * of the program component output lines.  Interpretation is always
       * the last component of the set for a skater.
       */
      p = match_prefix (buffer, "Interpretation");
      if (p)
        {
          done = 1;
        }
      else
        {
          for (i = 0; !p && i < n_other_labels; i++)
            p = match_prefix (buffer, other_labels[i]);
        }
      if (!p)
        continue;

      /* One factor, up to MAXJUDGES marks, then the panel value.  The
       * panel value lands in whichever slot follows the last judge's
       * mark, so the number of judges is the conversion count minus two
       * (the factor and the panel value).
       */
      n = sscanf (p,
                  " %f %f %f %f %f %f %f %f %f %f %f %f %f %f %f %f",
                  &factor,
                  &marks[0][ncomps], &marks[1][ncomps],
                  &marks[2][ncomps], &marks[3][ncomps],
                  &marks[4][ncomps], &marks[5][ncomps],
                  &marks[6][ncomps], &marks[7][ncomps],
                  &marks[8][ncomps], &marks[9][ncomps],
                  &marks[10][ncomps], &marks[11][ncomps],
                  &marks[12][ncomps], &marks[13][ncomps],
                  &panel);
      if (n <= 2)
        {
          fprintf (stderr, "couldn't parse marks\n");
          exit (EXIT_FAILURE);
        }
      if (njudges && (njudges != (n - 2)))
        {
          fprintf (stderr, "confused about number of judges\n");
          exit (EXIT_FAILURE);
        }
      njudges = n - 2;
      ncomps ++;

      /* Guard the next write into marks[][ncomps]. */
      if (!done && ncomps >= MAXCOMPONENTS)
        {
          fprintf (stderr, "got confused and read too many marks\n");
          exit (EXIT_FAILURE);
        }
    }

#if PRINT_MARKS
  /* This is useful for debugging, to make sure we have parsed the marks
   * correctly.
   */
  for (c = 0; c < ncomps; c++)
    {
      for (j = 0; j < njudges; j++)
        printf ("%6.2f ", marks[j][c]);
      printf ("\n");
    }
  printf ("\n");
#endif

  /* Compute the mean and standard deviation by component (across the
   * judges), printing output along the way if we want to be verbose
   * about it.
   */
#if PRINT_DETAILED_SDEV
  printf ("mean/sdev by component:\n");
#endif
  sdevbycomp = 0.0;
  for (c = 0; c < ncomps; c++)
    {
      /* Marks are stored judge-major, so gather this component's marks
       * into a contiguous array first.
       */
      for (j = 0; j < njudges; j++)
        column[j] = marks[j][c];
      sdev = mean_and_sdev (column, njudges, &mean);
#if PRINT_DETAILED_SDEV
      printf ("%6.2f %f\n", mean, sdev);
#endif
      sdevbycomp += sdev;
      segment_sdev_within_component += sdev;
      n_segment_sdev_within_component ++;
    }
  sdevbycomp = sdevbycomp/ncomps;
#if PRINT_DETAILED_SDEV
  printf ("\n");
#endif

  /* Ditto for the mean and standard deviation by judge (across the
   * components).
   */
#if PRINT_DETAILED_SDEV
  printf ("mean/sdev by judge:\n");
#endif
  sdevbyjudge = 0.0;
  for (j = 0; j < njudges; j++)
    {
      sdev = mean_and_sdev (marks[j], ncomps, &mean);
#if PRINT_DETAILED_SDEV
      printf ("%6.2f %f\n", mean, sdev);
#endif
      sdevbyjudge += sdev;
      segment_sdev_within_judge += sdev;
      n_segment_sdev_within_judge ++;
    }
  sdevbyjudge = sdevbyjudge/njudges;
#if PRINT_DETAILED_SDEV
  printf ("\n");
#endif

  segment_competitors++;
  if (sdevbyjudge > sdevbycomp)
    segment_more_judge_than_component ++;

#if PRINT_SDEV_BY_SKATER
  printf ("average sdev by component = %f\n", sdevbycomp);
  printf ("average sdev by judge = %f\n", sdevbyjudge);
  printf ("\n");
#endif

  return 1;
}

/* Read marks for an entire competition segment, and process them.
 *
 * FILENAME names one input file.  The segment_* accumulators are reset,
 * every competitor in the file is processed, a per-segment summary is
 * printed if PRINT_AVERAGE_SDEV_BY_SEGMENT is set, and the segment values
 * are then added into the total_* accumulators.
 *
 * If the file cannot be opened, a diagnostic is written to stderr and the
 * program exits with a failure status.
 */

void collect_segment_statistics (char *filename)
{
  FILE *fp = fopen (filename, "r");
  if (!fp)
    {
      fprintf (stderr, "can't open %s\n", filename);
      exit (EXIT_FAILURE);
    }

  segment_sdev_within_judge = 0.0;
  segment_sdev_within_component = 0.0;
  n_segment_sdev_within_judge = 0;
  n_segment_sdev_within_component = 0;
  segment_competitors = 0;
  segment_more_judge_than_component = 0;

  /* Each call consumes one competitor; it returns 0 at end of input. */
  while (collect_competitor_statistics (fp))
    continue;
  fclose (fp);

#if PRINT_AVERAGE_SDEV_BY_SEGMENT
  /* A file in which no competitor was found leaves both counts at zero;
   * report an average of 0 in that case rather than dividing by zero.
   */
  printf ("for segment %s:\n", filename);
  printf ("segment sdev by component = %f\n",
          n_segment_sdev_within_component
          ? segment_sdev_within_component / n_segment_sdev_within_component
          : 0.0);
  printf ("segment sdev by judge = %f\n",
          n_segment_sdev_within_judge
          ? segment_sdev_within_judge / n_segment_sdev_within_judge
          : 0.0);
  printf ("total competitors marked = %d\n", segment_competitors);
  printf ("number with more variation within judge than component = %d\n",
          segment_more_judge_than_component);
  printf ("\n");
#endif

  total_sdev_within_judge += segment_sdev_within_judge;
  n_total_sdev_within_judge += n_segment_sdev_within_judge;

  total_sdev_within_component += segment_sdev_within_component;
  n_total_sdev_within_component += n_segment_sdev_within_component;

  total_competitors += segment_competitors;
  total_more_judge_than_component += segment_more_judge_than_component;
}


/* Entry point:  process every file named on the command line as one
 * segment, then print the statistics accumulated over all of them.
 * With no file arguments, a usage message is written to stderr and the
 * program exits with a failure status.
 */
int main (int argc, char **argv)
{
  int i;
  if (argc < 2)
    {
      fprintf (stderr, "usage:  munge filename...\n");
      exit (EXIT_FAILURE);
    }
  for (i = 1; i < argc; i++)
    collect_segment_statistics (argv[i]);

  /* If no competitor was found in any file the counts are still zero;
   * report an average of 0 in that case rather than dividing by zero.
   */
  printf ("overall sdev by component = %f\n",
          n_total_sdev_within_component
          ? total_sdev_within_component / n_total_sdev_within_component
          : 0.0);
  printf ("overall sdev by judge = %f\n",
          n_total_sdev_within_judge
          ? total_sdev_within_judge / n_total_sdev_within_judge
          : 0.0);
  printf ("total competitors marked = %d\n", total_competitors);
  printf ("number with more variation within judge than component = %d\n",
          total_more_judge_than_component);
  return 0;
}
