///////////////////////////////////////////////////////////////////////////// 
/* 
  Copyright 2004,2008 Ronald S. Burkey <info@sandroid.org>
 
  This file is part of GutenMark. 
 
  GutenMark is free software; you can redistribute it and/or modify 
  it under the terms of the GNU General Public License as published by 
  the Free Software Foundation; either version 2 of the License, or 
  (at your option) any later version. 
 
  GutenMark is distributed in the hope that it will be useful, 
  but WITHOUT ANY WARRANTY; without even the implied warranty of 
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
  GNU General Public License for more details. 
 
  You should have received a copy of the GNU General Public License 
  along with GutenMark; if not, write to the Free Software 
  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 
  Filename:	GutenSplit.c 
  Purpose:	Splits the HTML files output by GutenMark into smaller
  		HTML files at <h1> boundaries, and adds a table of 
		contents and links between the files. 
  Mods:		01/21/04 RSB	Wrote. 
  		02/21/04 RSB	Oops!  Forgot to add the HTML header to
				the files after the first two.
		05/26/08 JP	(Jason Pollock.)  The options -1, -2, 
				-3, -4, --no-toc, and --no-skip were
				added.

  I don't know if this program works on arbitrary HTML files, as it
  is specialized to GutenMark output files.  It might, for all I know.
*/

#define _GNU_SOURCE
#include <string.h>

#include <stdio.h>

#if defined (WIN32) || defined (__APPLE__)
// Unfortunately, in MinGW for Windows and in early versions of Mac OS X
// (such as 10.2), there is no strcasestr() function. So we have to add one. 
// Fallback implementation of the (non-standard) strcasestr() function.
//
// Finds the first occurrence of needle within haystack, comparing characters
// without regard to case.  Returns a pointer to the start of the match inside
// haystack, or NULL if there is no match.  As with strstr(), an empty needle
// matches at the very beginning of haystack (even when haystack is empty).
char *
strcasestr (const char *haystack, const char *needle)
{
  const char *s;
  size_t Length;

  Length = strlen (needle);
  // The scanning loop below never runs for an empty haystack, so the 
  // empty-needle case has to be handled up front.
  if (Length == 0)
    return ((char *) haystack);
  for (s = haystack; *s; s++)
    {
      if (!strncasecmp (s, needle, Length))
        return ((char *) s);
    }
  return (NULL);
}
#endif // WIN32

// On Linux, strcasestr() is a GNU extension (which is presumably why
// _GNU_SOURCE is defined above).  On OS X, it isn't an extension.

/* Change the algorithm slightly:
   States:
   1) Normal
   2) HeadingTags
   3) InHeadingTag

   Normal:
   If we see a heading tag, split the file and go to InHeadingTag.

   InHeadingTag:
   If we see the end tag, go to HeadingTags.
   Append the contents of the tag to the TOC link.

   HeadingTags:
   If we see anything other than a heading, go to Normal --- everything must
   be contained in a heading tag, or be blank.  (We look for a <p> tag.)

 */
// The states of the parsing loop in main() that walks the body of the 
// input file.
typedef enum {
  // Copying body text to the current output file while watching for an
  // opening heading tag (which starts a split) or for </body>.
  Normal = 1,
  // A heading has just been closed.  Lines are copied through unchanged
  // until one containing a <p> tag is seen, which returns us to Normal.
  HeadingTags = 2,
  // Between an opening heading tag and its closing tag.  The text is copied
  // to the output file and (if there is one) to the table of contents.
  InHeadingTag = 3,
  // </body> has been seen; the parsing loop terminates.
  Done = 4
} State_t;

// A buffer used for holding the header of the document --- i.e., the stuff up
// to and including <body>.  It is assumed that this is followed by a bunch 
// of stuff, followed by </body></html>.  main() fills this once and then
// writes it at the top of every output file.
static char HeaderBuffer[32768];
// The number of bytes of HeaderBuffer currently in use.
static int HeaderSize = 0;

// A buffer used for inputting lines from the file.  Heading and body tags
// are located (and temporarily NUL-terminated) in place within this buffer.
static char LineBuffer[16384];
//static int LineSize;

// A buffer for the filename.  Receives the name of each output file
// (OutputBase followed by a 3-digit index and ".html") just before the
// file is opened.
static char Filename[1024];

// A buffer for the title.  Holds the text found between <title> and </title>
// in the header; it is emitted as a centered <h1> in the output files.
static char Title[1024];

// Which chapters to split on, bit field.  One bit per heading level.
typedef enum { 
  H1 = 1,
  H2 = 2,
  H3 = 4,
  H4 = 8
} Split_t;

// The number of entries in TagBits.
int NumTagBits = 4;
// TagBits[i] is the Split_t bit for heading level i+1.
int TagBits[4] = { H1, H2, H3, H4 };

// The OR of the Split_t bits selected with the -1, -2, -3, and -4 switches.
// If none is given, main() defaults this to H1.
int SplitTags = 0;

// Whether or not to output a ToC.  Cleared by --no-toc.
char OutputToc = 1;

// Whether or not to skip the first <H*> Tag --- i.e., whether the first 
// heading encountered stays in the first output file rather than starting a
// new one.  Cleared by --no-skip.
char SkipFirst = 1;

// Searches line (case-insensitively) for a heading tag of any of the levels
// selected in SplitTags.
//
//   line       The NUL-terminated text to search.
//   tagFormat  A printf-style pattern containing a single %d, which is 
//              replaced by the heading level (1, 2, ...) to form the tag
//              text searched for.
//
// Returns a pointer to the EARLIEST match within line, or NULL if no selected
// heading level matches.  Taking the earliest match (rather than the match
// for the lowest-numbered level) matters when several levels are enabled
// and more than one heading appears on the same line:  the caller processes
// the line from left to right, so it must be handed the leftmost tag.
char *findTag(char *line, char *tagFormat) {
  char *earliest = NULL;
  char tag[16];		// Ample room for "</h%d>" with a small level number.
  int i;

  for (i = 0; i < NumTagBits; ++i)
    {
      char *hit;

      // Skip heading levels the user did not ask to split on.
      if (!(SplitTags & TagBits[i]))
	continue;

      snprintf(tag, sizeof(tag), tagFormat, i + 1);
      hit = strcasestr(line, tag);
      if (hit != NULL && (earliest == NULL || hit < earliest))
	earliest = hit;
    }

  return earliest;
}
// Looks in line for the start of an opening heading tag ("<hN", where N is
// one of the heading levels selected for splitting).  The closing '>' is 
// not part of the pattern, so tags carrying attributes are matched as well.
// Returns whatever findTag() reports:  a pointer into line, or NULL.
char *findOpenHeading(char *line)
{
  char *openTag = findTag(line, "<h%d");

  return openTag;
}

// Looks in line for a closing heading tag ("</hN>", where N is one of the
// heading levels selected for splitting).  Returns whatever findTag() 
// reports:  a pointer into line, or NULL.
char *findCloseHeading(char *line) {
  char *closeTag = findTag(line, "</h%d>");

  return closeTag;
}

// Builds the name "<OutputBase>NNN.html" in the global Filename buffer and
// opens that file for writing.  Returns NULL if the name does not fit in
// Filename or if the file cannot be created.
static FILE *
OpenNumberedOutput (const char *OutputBase, int Number)
{
  int Length;

  Length = snprintf (Filename, sizeof (Filename), "%s%03d.html", OutputBase, Number);
  if (Length < 0 || (size_t) Length >= sizeof (Filename))
    return (NULL);
  return (fopen (Filename, "w"));
}

// The program's entry point.  Parses the command line, copies the HTML 
// header (everything through the <body ...> tag) of the input file into
// HeaderBuffer, and then splits the body into numbered output files at the
// selected heading tags, optionally building a table-of-contents file.
//
// Return value:
//   0  success
//   1  bad command line (or --help), or a file could not be opened
//   2  error while reading the HTML header
//   3  error while writing the header to the first output files
//   4  error while splitting the body
//   5  everything was split, but the final "The End" page could not be created
int main (int argc, char *argv[])
{
  int i, RetVal = 1, Index = 0, Headings = 0, HaveToc = 0;
  size_t j;
  char *s = NULL, *ss, *InputFile = NULL, *OutputBase = NULL;
  char *endOfTag = NULL;
  FILE *fin = NULL, *fout = NULL, *toc = NULL;
  State_t state = Normal;
  
  printf ("GutenSplit --- An HTML-splitter provided with GutenMark.\n");
  printf ("Version " __DATE__ "\n");
  printf ("Copyright (C)2004 Ronald Burkey\n");

  // First, parse the command-line inputs.
  RetVal = 1;
  for (i = 1; i < argc; i++)
    {
      if (!strcmp (argv[i], "--help") || !strcmp (argv[i], "/?"))
        goto Usage;
      else if (strcmp ("-1", argv[i]) == 0)
        SplitTags |= H1;
      else if (strcmp ("-2", argv[i]) == 0)
        SplitTags |= H2;
      else if (strcmp ("-3", argv[i]) == 0)
        SplitTags |= H3;
      else if (strcmp ("-4", argv[i]) == 0)
        SplitTags |= H4;
      else if (strcmp ("--no-toc", argv[i]) == 0)
        OutputToc = 0;
      else if (strcmp ("--no-skip", argv[i]) == 0)
        SkipFirst = 0;
      else if ('-' == argv[i][0])
        {
          printf ("Unrecognized command-line switch: %s\n", argv[i]);
          goto Usage;
        }
      else if (InputFile == NULL)
        InputFile = argv[i];
      else if (OutputBase == NULL)
        OutputBase = argv[i];
      else
        {
          printf ("Too many command-line arguments.\n");
          goto Usage;
        }
    }

  // With no explicit selection, split on <h1> only.
  if (SplitTags == 0)
    SplitTags = H1;

  // OutputBase can only be set after InputFile, so this one test covers both.
  if (OutputBase == NULL)
    {
      printf ("Too few command-line arguments.\n");
      goto Usage;
    }
  fin = fopen (InputFile, "r");
  if (fin == NULL)
    {
      printf ("The chosen input file (\"%s\") was not found.\n", InputFile);
      goto Usage;
    }

  // The table of contents, if wanted, is always file number 000.
  if (OutputToc)
    {
      toc = OpenNumberedOutput (OutputBase, Index++);
      if (toc == NULL)
        {
          printf ("Cannot create the table-of-contents output file.\n");
          goto Usage;
        }
    }

  fout = OpenNumberedOutput (OutputBase, Index++);
  if (fout == NULL)
    {
      printf ("Cannot create output file %03d.\n", Index - 1);
      goto Usage;
    }

  // Read the header of the input file.  We don't do any checking of the HTML
  // at all.  We assume it's valid HTML, and we assume that the <body>...</body>
  // area is completely carved up with <h1> headings.
  RetVal = 2;
  LineBuffer[sizeof (LineBuffer) - 1] = 0;
  HeaderSize = 0;
  HeaderBuffer[0] = 0;
  while (NULL != fgets (LineBuffer, sizeof (LineBuffer) - 1, fin))
    {
      // Pick up the title if <title>...</title> is wholly on this line.  The
      // copy is bounded, since a line can be far longer than Title.
      s = strcasestr (LineBuffer, "<title>");
      if (s != NULL)
        {
          ss = strcasestr (s + 7, "</title>");
          if (ss != NULL)
            snprintf (Title, sizeof (Title), "%.*s", (int) (ss - (s + 7)), s + 7);
        }
      // The header ends just past the '>' of the <body ...> tag.  Anything
      // after that on the same line is left "lingering" in s for the body
      // parser below.
      s = strcasestr (LineBuffer, "<body");
      if (s == NULL)
        j = strlen (LineBuffer);
      else 
        {
          endOfTag = strchr (s, '>');
          if (endOfTag == NULL)
            {
              printf ("Cannot find end of body tag, exiting\n");
              goto Usage;
            }
          s = endOfTag + 1;
          j = (size_t) (s - LineBuffer);
        }
      // ">=" rather than ">" so that there is always room for the terminating
      // NUL; HeaderBuffer is later printed with %s.
      if ((size_t) HeaderSize + j >= sizeof (HeaderBuffer))
        {
          printf ("The size of this file\'s header has exceeded the max allowed.\n");
          printf ("Perhaps this is not an HTML file.\n");
          goto Usage;
        }
      memcpy (HeaderBuffer + HeaderSize, LineBuffer, j);
      HeaderSize += (int) j;
      HeaderBuffer[HeaderSize] = 0;

      if (s != NULL)
        break;
    }

  // Write header to the table of contents and first section.
  RetVal = 3;
  if (toc)
    {
      if ((size_t) HeaderSize != fwrite (HeaderBuffer, 1, (size_t) HeaderSize, toc))
        {
          printf ("Write-error in table of contents header.\n");
          goto Usage;
        }
      fprintf (toc, "<h1 style=\"text-align: center;\">%s</h1>\n", Title);
      fprintf (toc, "<hr style=\"width: 100%%; height: 2px;\">");
      fprintf (toc, "<h1>Table of Contents</h1>\n");
      fprintf (toc, "<ul>\n");
    }
  if ((size_t) HeaderSize != fwrite (HeaderBuffer, 1, (size_t) HeaderSize, fout))
    {
      printf ("Write-error in output file %03d.\n", Index - 1);
      goto Usage;
    }

  fprintf (fout, "<h1 style=\"text-align: center;\">%s</h1>\n", Title);
  fprintf (fout, "<div style=\"text-align: center;\">");
  if (toc)
    {
      fprintf (fout, "<a href=\"%s000.html\">Contents</a> &nbsp;&nbsp;", OutputBase);
    }
  fprintf (fout, "<a href=\"%s%03d.html\">Next</a></div>\n", OutputBase, Index);
  fprintf (fout, "<hr style=\"width: 100%%; height: 2px;\">");

  // Now begin parsing the input for the non-header stuff.  Some of it
  // may still be lingering in s.
  RetVal = 4;
  while (state != Done)
    {
      // Get new input, if none already lingering.
      if (s == NULL)
        {
          s = fgets (LineBuffer, sizeof (LineBuffer) - 1, fin);
          if (s == NULL)
            break;
        }
      switch (state) 
        {
        case InHeadingTag:
          ss = findCloseHeading (s);
          if (!ss)
            {
              // The heading continues on the next line; everything here is
              // heading text.
              if (toc) 
                {
                  fprintf (toc, "%s", s);
                }
              fprintf (fout, "%s", s);
              s = NULL;
              continue;
            }
          // ss points at "</hN>".  Chop off the final '>' to print the
          // closing tag to the output file, then chop at the '<' so that 
          // only the heading text goes into the ToC entry.
          *(ss + 4) = 0;
          fprintf (fout, "%s>\n", s);
          *ss = 0;
          if (toc)
            {
              fprintf (toc, "%s</a></li>\n", s);
            }
          state = HeadingTags;
          s = ss + 5;
          break;
        case Normal:
          // Check for start of new heading.
          ss = findOpenHeading (s);
          if (ss)
            {
              endOfTag = strchr (ss, '>');
              if (!endOfTag) 
                {
                  printf ("Cannot find end of header tag, exiting\n");
                  goto Usage;
                }

              // Isolate the tag text (between '<' and '>') so that it can be
              // re-emitted below, and flush whatever preceded the tag to 
              // the current output file.
              *endOfTag = 0;
              *ss = 0;
              fprintf (fout, "%s", s);
              s = endOfTag + 1;
              // Go to next output file.  For the FIRST heading we encounter,
              // we're actually already there and don't need to worry about it.
              Headings++;
              if (!SkipFirst || Headings != 1)
                {
                  // Finish off the current file ...
                  fprintf (fout, "<hr style=\"width: 100%%; height: 2px;\">\n");
                  fprintf (fout, "<div style=\"text-align: center;\">");
                  if (Index - 2 >= 0)
                    {
                      fprintf (fout, "<a href=\"%s%03d.html\">Previous</a> &nbsp;&nbsp;", OutputBase, Index - 2);
                    }
                  if (toc)
                    {
                      fprintf (fout, "<a href=\"%s000.html\">Contents</a> &nbsp;&nbsp;", OutputBase);
                    }
                  fprintf (fout, "<a href=\"%s%03d.html\">Next</a></div>\n", OutputBase, Index);
                  fprintf (fout, "\n</body>\n</html>\n");	      
                  fclose (fout);
                  // ... and begin the next one.
                  fout = OpenNumberedOutput (OutputBase, Index++);
                  if (fout == NULL)
                    {
                      printf ("Cannot create output file %03d.\n", Index - 1);
                      goto Usage;
                    }
                  fprintf (fout, "%s\n", HeaderBuffer);
                  fprintf (fout, "<h1 style=\"text-align: center;\">%s</h1>\n", Title);
                  fprintf (fout, "<div style=\"text-align: center;\">");
                  fprintf (fout, "<a href=\"%s%03d.html\">Previous</a> &nbsp;&nbsp;", OutputBase, Index - 2);
                  if (toc)
                    {
                      fprintf (fout, "<a href=\"%s000.html\">Contents</a> &nbsp;&nbsp;", OutputBase);
                    }
                  fprintf (fout, "<a href=\"%s%03d.html\">Next</a></div>\n", OutputBase, Index);
                  fprintf (fout, "<hr style=\"width: 100%%; height: 2px;\">\n");
                }
              // Re-emit the opening heading tag, and start the ToC entry 
              // that links to the file now being written.
              fprintf (fout, "<%s>", ss + 1);
              if (toc) 
                {
                  fprintf (toc, "<li><a href=\"%s%03d.html\">", OutputBase, Index - 1);
                }
              state = InHeadingTag;
              continue;
            }
          // If we've gotten to here, we're not in a heading, and the string
          // doesn't contain any new heading starts.  Check for end-of-body.
          ss = strcasestr (s, "</body>");
          if (ss)
            {
              *ss = 0;
              fprintf (fout, "%s", s);
              state = Done;	// All done!
              break;
            }

          // Okay, contains nothing fishy at all.
          fprintf (fout, "%s", s);
          s = NULL;
          break;
        case HeadingTags:
          // Only back to normal on <p ...> or <p>, looking for the body.
          ss = strcasestr (s, "<p>");
          if (!ss) 
            {
              ss = strcasestr (s, "<p ");
            }
          if (ss) 
            {
              state = Normal;
            }
          fprintf (fout, "%s", s);
          s = NULL;
          break;
        default:
          printf ("Invalid state when parsing file, exiting\n");
          goto Usage;
        }
    }

  // All done!
  RetVal = 0;
  goto Done;

Usage:
  printf ("USAGE:\n");
  printf ("\tGutenSplit [OPTIONS] InputFile OutputBase\n");
  printf ("The InputFile is the name of an HTML file as output by\n");
  printf ("GutenMark.  The OutputBase is a basename of HTML files\n");
  printf ("which will be created by GutenSplit.  For example, if\n");
  printf ("OutputBase is \"TomSawyer\", then GutenSplit will create\n");
  printf ("HTML files called TomSawyer000.html, TomSawyer001.html,\n");
  printf ("and so on.  The first of these will be the table of contents.\n");
  printf ("Options:\n");
  printf ("\t--help: Display this help message\n");
  printf ("\t-[1|2|3|4]: Perform the chapter split on H1/H2/H3/H4 tags.\n");
  printf ("\t            Defaults to H1, multiple options are allowed.\n");
  printf ("\t            -1 -2 -3 -4 results in chapter splits on all four tags\n");
  printf ("\t--no-toc: Do not create a Table of Contents\n");
  printf ("\t--no-skip: Do not skip over the first heading tag\n");

Done:
  // Remember whether there was a ToC before its stream is closed; the 
  // stream pointer must not be consulted afterward.
  HaveToc = (toc != NULL);
  if (fin != NULL)
    {
      fclose (fin);
      fin = NULL;
    }
  if (toc != NULL)
    {
      fprintf (toc, "</ul>\n");
      fprintf (toc, "<hr style=\"width: 100%%; height: 2px;\">\n");
      fprintf (toc, "<div style=\"text-align: center;\">");
      fprintf (toc, "<a href=\"%s001.html\">Begin</a></div>\n", OutputBase);
      fprintf (toc, "</body>\n</html>\n");
      fclose (toc);
      toc = NULL;
    }
  if (fout != NULL)
    {
      fprintf (fout, "<hr style=\"width: 100%%; height: 2px;\">");
      fprintf (fout, "<div style=\"text-align: center;\">");
      // With no ToC and no splits, the current file is number 000 and has
      // no predecessor.
      if (Index - 2 >= 0)
        {
          fprintf (fout, "<a href=\"%s%03d.html\">Previous</a> &nbsp;&nbsp;", OutputBase, Index - 2);
        }
      if (HaveToc)
        {
          fprintf (fout, "<a href=\"%s000.html\">Contents</a> &nbsp;&nbsp;", OutputBase);
        }
      fprintf (fout, "<a href=\"%s%03d.html\">Next</a></div>\n", OutputBase, Index);
      fprintf (fout, "\n</body>\n</html>\n");
      fclose (fout);
      // Add a fictitious last page, so that the "Next" links above will have
      // something to refer to.
      fout = OpenNumberedOutput (OutputBase, Index++);
      if (fout == NULL)
        {
          // We are already cleaning up, so just report the problem rather
          // than jumping back into the error path (which would run this 
          // cleanup a second time on streams that are already closed).
          printf ("Cannot create output file %03d.\n", Index - 1);
          if (RetVal == 0)
            RetVal = 5;
        }
      else
        {
          fprintf (fout, "%s\n", HeaderBuffer);
          fprintf (fout, "<h1 style=\"text-align: center;\">%s</h1>\n", Title);
          fprintf (fout, "<div style=\"text-align: center;\">");
          fprintf (fout, "<a href=\"%s%03d.html\">Previous</a> &nbsp;&nbsp;", OutputBase, Index - 2);
          if (HaveToc) 
            {
              fprintf (fout, "<a href=\"%s000.html\">Contents</a>", OutputBase);
            }
          fprintf (fout, "<hr style=\"width: 100%%; height: 2px;\">\n");
          fprintf (fout, "<h1>The End</h1>\n");
          fprintf (fout, "\n</body>\n</html>\n");
          fclose (fout);
          fout = NULL;
        }
    }
  return (RetVal);
}
