///////////////////////////////////////////////////////////////////////////
/*
  Copyright 2001 Ronald S. Burkey

  This file is part of GutenMark.

  GutenMark is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  GutenMark is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with GutenMark; if not, write to the Free Software
  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

  Filename:	words197.c
  Purpose:	Converts Whitaker's Latin dictionary in plain-text format
  		to the fully-expanded one-word-per-line wordlist format 
		needed by GutenMark.  See below for licensing information
		concerning the wordlist itself.
  Mods:		11/09/01 RSB	Began.
*/
///////////////////////////////////////////////////////////////////////////

/*
  A quick program thrown together to process William Whitaker's Latin word
  list (http://users.erols.com/whitaker/dictpage.txt) into a form usable 
  by GutenMark.  I can't find a specific license in Whitaker's materials,
  but here is a quote from the documentation:
  
    "This is a free Shareware program, which means it is proper to 
    copy it and pass it on to your friends. Consider it a developmental 
    item for which there is no charge. However, it is Copyrighted (c), 
    so please don't sell it as your own without at least telling me. 

    "This version is distributed without obligation, but the developer 
    would appreciate comments and suggestions."
    
  Of course, as processed for GutenMark, most of the dictionary
  (namely the definitions, declensions, conjugations and, of course,
  Whitaker's software itself) is discarded, leaving just the wordlist,
  but it would seem reasonable that someone who wanted to SELL the 
  wordlist should inform Whitaker (whitaker@erols.com).  I have sent a
  request for approval of this usage to Whitaker, but have not yet 
  received a reply.
  
  The conversion provided by words197 is not perfect, because
  the format of the wordlist is very word-processory, and because there
  are formatting errors in the dictionary itself, but hopefully
  the amount of manual work left over will be small.  The wordlist
  has lines of the form
  	word, word, ... word: definition
  Unfortunately, additional lines, like
  	A:
  are also thrown in.  Furthermore, there are things other than
  words in the word-fields, such as
  	ABB.		Indicates an abbreviation.
	(GEN.)		Perhaps unknown genitive.
	-		Unknown or illegal declension/conjugation.
  Also, words may be followed (prior to the comma-delimiter) by 
  alternate suffixes, such as -US -A -UM /IS.  (These may, or may not,
  be preceded by space.)  Or, the words may be followed
  by optional additional stuff, in parentheses, such as (I).  Either
  I(I) or I(II) seems to mean that the word may end in II.
  Another difficulty is that the words are in all-caps.  Fortunately,
  in MOST cases, the defining part begins with a '\' delimiter,
  and what follows is in lower-case for regular words or begins with
  an upper-case character for names.  (There are a few instances in
  which this fails, though.  We just have to accept those, I'm afraid.)	
  And, of course, don't forget the AE ligature.		
*/

#include <stdio.h>
#include <string.h>
#include <ctype.h>

// Working storage shared by main().  Everything lives at file scope, so
// it starts out zero-initialized.

// Maximum number of alternate endings kept for one word field.
#define MAX_ALTS 20

char s[1000];                   // The current input line.
char *ss;                       // General-purpose scanning pointer.
char *scolon;                   // Text following the first ':' of the line.
char *sdef;                     // The '\' that starts the definition, or NULL.
char *scomma;                   // Start of the comma-delimited field in work.
char *snext;                    // Where the following field begins.
char *sgen;                     // Scratch pointer for searches and edits.

int Capitalized;                // 1 if the definition starts in upper case.

char Word[200];                 // The base word of the current field.
char AltEndings[MAX_ALTS][200]; // Alternate suffixes listed after the word.

int NumAlts;                    // How many entries of AltEndings are in use.
int i;                          // Loop counter.

// Convert a NUL-terminated string to lower case, in place.
// Each character is handed to tolower() as an unsigned char: the <ctype.h>
// functions are undefined for negative arguments, and a plain char holding
// a high-bit character (an AE ligature, for instance) may be negative.
void
strlwr (char *s)
{
  for (; *s; s++)
    *s = (char) tolower ((unsigned char) *s);
}

// Classify a character as a lower-case vowel.
// Returns 1, 2, 3, 4 or 5 for 'a', 'e', 'i', 'o' or 'u' respectively,
// and 0 for every other character (upper-case vowels included).
int
islowvowel (char c)
{
  switch (c)
    {
    case 'a':
      return (1);
    case 'e':
      return (2);
    case 'i':
      return (3);
    case 'o':
      return (4);
    case 'u':
      return (5);
    default:
      return (0);
    }
}

//------------------------------------------------------------------

// Annotations that show up inside the word fields of the dictionary but
// are not words themselves; they are always stripped out of a field
// before it is parsed.
const char *FunkyStrings[] = {
  "(GEN.)",
  "GEN.",
  "ABB.",
  "UNDECLINED"
};

// Number of entries in FunkyStrings.
#define NUM_FUNKY (sizeof FunkyStrings / sizeof *FunkyStrings)

int
main (void)
{
  // Loop on lines of the input file.
  while (NULL != fgets (s, sizeof (s) - 1, stdin))
    {
      // Trim the input line.
      for (ss = &s[strlen (s) - 1]; ss >= &s[0]; ss--)
	if (isspace (*ss))
	  *ss = '\0';
	else
	  break;
      // Line is empty, or a header for an alphabetical section,
      // like a simple "A:", "B:", etc.   
      if (ss < &s[0] || *ss == ':')
	continue;
      // Discard lines not having a words:definition demarcation.
      scolon = strstr (s, ":");
      if (NULL == scolon)
	continue;
      *scolon = '\0';
      scolon++;
      // Find the definition part.
      sdef = strstr (scolon, "\\");
      Capitalized = (sdef != NULL) && isupper (sdef[1]);
      // Now parse the comma-delimited fields:
      for (scomma = s; *scomma; scomma = snext)
	{
	  // Get rid of leading space.
	  while (isspace (*scomma))
	    scomma++;
	  // Look for the comma.
	  for (ss = scomma; *ss && *ss != ','; ss++);
	  if (*ss == ',')
	    {
	      *ss = '\0';
	      snext = ss + 1;
	    }
	  else
	    snext = ss;
	  // At this point, scomma points to the beginning of the
	  // field, and the field is nul-terminated.  It now needs
	  // to be parsed into sub-fields.  To make things simpler
	  // various cases can be immediately eliminated.
	  for (i = 0; i < NUM_FUNKY; i++)
	    {
	      sgen = strstr (scomma, FunkyStrings[i]);
	      if (sgen != NULL)
		strcpy (sgen, sgen + strlen (FunkyStrings[i]));
	    }
	  sgen = strstr (scomma, "I(I)");
	  if (sgen != NULL)
	    sgen[3] = 'I';
	  if (*scomma == '-' || isspace (*scomma))
	    continue;
	  for (ss = scomma; *ss; ss++)
	    if (*ss == '-' || *ss == '/' || *ss == '(' || *ss == ')')
	      *ss = ' ';
	  // After the transformations above, the field should look 
	  // like
	  //    word alt-suffix1 alt-suffix2 ... alt-suffixN
	  // with no /, -, (, etc.    
	  NumAlts = sscanf (scomma, "%s%s%s%s%s%s%s%s%s%s%s"
			    "%s%s%s%s%s%s%s%s%s%s",
			    Word, AltEndings[0], AltEndings[1],
			    AltEndings[2], AltEndings[3],
			    AltEndings[4], AltEndings[5],
			    AltEndings[6], AltEndings[7],
			    AltEndings[8], AltEndings[9],
			    AltEndings[10], AltEndings[11],
			    AltEndings[12], AltEndings[13],
			    AltEndings[14], AltEndings[15],
			    AltEndings[16], AltEndings[17],
			    AltEndings[18], AltEndings[19]) - 1;
	  if (NumAlts < 0 || NumAlts > 20)
	    continue;
	  // Take care of capitalization.   
	  strlwr (&Word[Capitalized]);
#ifdef NOT_OBSOLETE
	  // Take care of the AE and ae ligatures.  Perhaps it's a sign
	  // of ignorance (and I'm certainly ignorant), but I believe 
	  // that it's probably 99% accurate to convert all "ae" to 
	  // ligatures, except those that are part of "aet".
	  if (Word[0] == 'A' && Word[1] == 'e' && Word[2] != 't')
	    {
	      Word[0] = 198;
	      strcpy (&Word[1], &Word[2]);
	    }
	  for (sgen = strstr (Word, "ae"); sgen != NULL;
	       sgen = strstr (sgen + 1, "ae"))
	    {
	      if (sgen[2] != 't')
		{
		  sgen[0] = 230;
		  strcpy (sgen + 1, sgen + 2);
		}
	    }
#endif
	  // Output the word.
	  puts (Word);
	  // Now take care of the alternate endings.  This is done by
	  // backtracking over the trailing consonents (if any), and
	  // then past all preceding vowels.  That's the point at which
	  // the alternate endings must be added.  (I just made up this
	  // rule by inspection.  I don't know if it's the actual rule
	  // Whitaker &co. use.)
	  if (NumAlts > 0)
	    {
	      for (i = 0; i < NumAlts; i++)
		strlwr (AltEndings[i]);
	      sgen = &Word[strlen (Word) - 1];
	      for (; sgen >= &Word[0] && !islowvowel (*sgen); sgen--);
	      for (; sgen >= &Word[0] && islowvowel (*sgen); sgen--);
	      sgen++;
	      for (i = 0; i < NumAlts; i++)
		{
		  strcpy (sgen, AltEndings[i]);
		  puts (Word);
		}
	    }
	}
    }
  return (0);
}
