/* ========================================================================
 *
 * The ModelObjects Group Software License, Version 1.0
 *
 *
 * Copyright (c) 2000-2001 ModelObjects Group.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer. 
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. The end-user documentation included with the redistribution,
 *    if any, must include the following acknowledgment:  
 *       "This product includes software developed by the
 *        ModelObjects Group (http://www.modelobjects.com)."
 *    Alternately, this acknowledgment may appear in the software itself,
 *    if and wherever such third-party acknowledgments normally appear.
 *
 * 4. The name "ModelObjects" must not be used to endorse or promote
 *    products derived from this software without prior written permission.
 *    For written permission, please contact djacobs@modelobjects.com.
 *
 * 5. Products derived from this software may not be called "ModelObjects",
 *    nor may "ModelObjects" appear in their name, without prior written
 *    permission of the ModelObjects Group.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE MODEL OBJECTS GROUP OR ITS
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 * ========================================================================
 */


package modelobjects.util;

import java.io.FileReader;
import java.io.IOException;
import java.io.PushbackReader;
import java.io.Reader;
import java.io.StringReader;
import java.util.Hashtable;

/**
 *  This class implements a fairly simple lexical analyzer that should
 *  meet the needs of most applications.
 *
 *  @author Dan Jacobs  --  ModelObjects Group
 */
public class Lexer implements LexerTokenTypes
{
  /**
   *  Style for Java identifiers: start and part characters are decided by
   *  Character.isJavaIdentifierStart and Character.isJavaIdentifierPart.
   *  Any style value that is not one of the constants below is lexed the
   *  same way.
   */
  public static final int JAVA_ID_STYLE     = 1;

  /**
   *  Style for HTML identifiers: starts with a letter, '_', '-' or '.',
   *  and continues with letters, digits, '_', '-' or '.'.
   */
  public static final int HTML_ID_STYLE     = 2;

  /**
   *  Style for SQL identifiers: starts with a letter or '_', and
   *  continues with letters, digits or '_'.
   */
  public static final int SQL_ID_STYLE      = 3;

  /** Style for alphabetic-only identifiers: letters only. */
  public static final int ALPHA_ID_STYLE    = 4;

  /**
   *  Style for alphanumeric identifiers: starts with a letter and
   *  continues with letters or digits.  This is the style a newly
   *  constructed Lexer uses.
   */
  public static final int ALPHANUM_ID_STYLE = 5;

  /**
   *  Construct a Lexer to lex the specified String into tokens.  The
   *  String is wrapped in a StringReader and handed to the Reader-based
   *  constructor, so all defaults are the same as for that constructor.
   *
   *  @param stringSource the text to be lexed
   */
  public Lexer(String stringSource)
  {
    this(new StringReader(stringSource));
  }

  /**
   *  Construct a Lexer that pulls its characters from the given Reader.
   *  The Reader is wrapped in a PushbackReader so that single-character
   *  lookahead can be undone.  Line numbers start at 1 and positions at 0,
   *  identifiers are lexed with ALPHANUM_ID_STYLE, and string internment
   *  is enabled.  The comment and backslash-escape options are not
   *  assigned here, so they keep their default value of false.
   *
   *  @param source the character source to be lexed
   */
  public Lexer(Reader source)
  {
    // input and scratch buffers
    this.reader            = new PushbackReader(source);
    this.tokenSource       = new StringBuffer();
    this.tokenWhiteSpace   = new StringBuffer();
    this.stringPool        = new Hashtable();

    // location bookkeeping
    this.lineNumber        = 1;
    this.tokenLineNumber   = 1;
    this.position          = 0;
    this.tokenPosition     = 0;
    this.done              = false;

    // lexing options
    this.idStyle           = ALPHANUM_ID_STYLE;
    this.internmentEnabled = true;
  }

  /**
   *  Return the text collected for the last variable token (identifiers,
   *  numbers, quoted strings and line literals).  For quoted strings the
   *  enclosing quote characters are not part of the returned text.
   *  The result is passed through the local string pool, so while
   *  internment is enabled equal token texts are returned as the same
   *  String instance.  Internment can be turned off with
   *  setInternmentEnabled(false).
   *
   *  @return the text of the last variable token
   */
  public String getTokenSource()
  {
    String text = tokenSource.toString();
    return internString(text);
  }

  /**
   *  Return a String that represents the last token, given the token type
   *  of the last token.  For the variable-text token types (IDENTIFIER,
   *  INTEGER, HEX_INTEGER, FLOAT, DBL_QUOTE_STR, SNGL_QUOTE_STR and
   *  LINE_LITERAL) this is exactly getTokenSource(); note that for quoted
   *  strings the enclosing quote characters are therefore NOT included.
   *  For every other type the result is getOperatorName(tokenType).
   *
   *  @param tokenType the type code returned for the last token
   *  @return the textual form of the last token
   */
  public String getTokenString(int tokenType)
  {
    boolean hasVariableText =
      (tokenType == IDENTIFIER)     ||
      (tokenType == INTEGER)        ||
      (tokenType == HEX_INTEGER)    ||
      (tokenType == FLOAT)          ||
      (tokenType == DBL_QUOTE_STR)  ||
      (tokenType == SNGL_QUOTE_STR) ||
      (tokenType == LINE_LITERAL);

    return hasVariableText ? getTokenSource() : getOperatorName(tokenType);
  }

  /**
   *  Return the fixed spelling of a non-variable token type.  EOF maps to
   *  the empty string.  A type code that is not one of the named operator
   *  tokens is taken to be a raw character value and is returned as a
   *  one-character String.
   *
   *  @param tokenType the token type code
   *  @return the source spelling of that token type
   */
  public static String getOperatorName(int tokenType)
  {
    final String name;

    switch (tokenType)
      {
      // comparison and assignment
      case SNGL_EQUALS: name = "=";   break;
      case DBL_EQUALS:  name = "==";  break;
      case NE:          name = "!=";  break;
      case LT:          name = "<";   break;
      case GT:          name = ">";   break;
      case LE:          name = "<=";  break;
      case GE:          name = ">=";  break;
      case LTGT:        name = "<>";  break;

      // shifts and arithmetic
      case LSH:         name = "<<";  break;
      case RSH:         name = ">>";  break;
      case PLUS:        name = "+";   break;
      case MINUS:       name = "-";   break;
      case TIMES:       name = "*";   break;
      case DIVIDE:      name = "/";   break;

      // brackets
      case LPAR:        name = "(";   break;
      case RPAR:        name = ")";   break;
      case LBRACE:      name = "{";   break;
      case RBRACE:      name = "}";   break;
      case LSQUARE:     name = "[";   break;
      case RSQUARE:     name = "]";   break;

      // punctuation
      case DOT:         name = ".";   break;
      case COMMA:       name = ",";   break;
      case COLON:       name = ":";   break;
      case SEMICOLON:   name = ";";   break;
      case ATSIGN:      name = "@";   break;
      case QUESTION:    name = "?";   break;
      case SHARP:       name = "#";   break;
      case PERCENT:     name = "%";   break;
      case DOLLAR:      name = "$";   break;

      // logical and bitwise
      case SNGL_AND:    name = "&";   break;
      case SNGL_OR:     name = "|";   break;
      case DBL_AND:     name = "&&";  break;
      case DBL_OR:      name = "||";  break;
      case NOT:         name = "!";   break;
      case TILDE:       name = "~";   break;
      case CARET:       name = "^";   break;

      case EOF:         name = "";    break;

      // anything else is an otherwise unrecognized character value
      default:          name = String.valueOf((char)tokenType);  break;
      }

    return name;
  }

  /**
   *  Return the whitespace (including any recognized comments) that
   *  preceded the last token.  Only strings of at most 16 characters go
   *  through the string pool; longer ones are returned as fresh Strings,
   *  so results with the same contents are not guaranteed to be '=='.
   *
   *  @return the whitespace collected before the last token
   */
  public String getWhiteSpaceBeforeToken()
  {
    String space = tokenWhiteSpace.toString();
    return (space.length() > 16) ? space : internString(space);
  }

  /**
   *  Return the current line number.  Lines are counted from 1 and the
   *  count advances each time a newline character is consumed, so the
   *  value is the line on which the last token ended, i.e. where the
   *  whitespace before the next token begins.
   *
   *  @return the current one-based line number
   */
  public int getLineNumber()
  {
    return lineNumber;
  }

  /**
   *  Return the zero-based position in the source of the first character
   *  of the last token.
   *
   *  @return the zero-based character offset of the last token
   */
  public int getTokenPosition()
  {
    return tokenPosition;
  }

  /**
   *  Return the line number of the last token.  The value is the line
   *  number of the first character that is actually in the token, and
   *  not in the preceding whitespace.
   *
   *  @return the one-based line number on which the last token started
   */
  public int getTokenLineNumber()
  {
    return tokenLineNumber;
  }

  /**
   *  Return the style currently used to lex identifiers.
   *
   *  @return one of the *_ID_STYLE constants defined in this class
   */
  public int getIdStyle()
  {
    return idStyle;
  }

  /**
   *  Select the style used to lex identifiers.  A value that is not one
   *  of the *_ID_STYLE constants is treated like JAVA_ID_STYLE when
   *  identifiers are lexed.
   *
   *  @param style one of the *_ID_STYLE constants defined in this class
   */
  public void setIdStyle(int style)
  {
    idStyle = style;
  }

  /**
   *  Report whether double-slash comments (running to the end of the
   *  line) are recognized and treated as whitespace.
   *
   *  @return true if double-slash comments are skipped as whitespace
   */
  public boolean getLexSlashSlashComments()
  {
    return slashSlashComments;
  }

  /**
   *  Choose whether double-slash comments (running to the end of the
   *  line) are recognized and treated as whitespace.
   *
   *  @param val true to skip double-slash comments as whitespace
   */
  public void setLexSlashSlashComments(boolean val)
  {
    this.slashSlashComments = val;
  }

  /**
   *  Report whether slash-star comments are recognized and treated as
   *  whitespace.
   *
   *  @return true if slash-star comments are skipped as whitespace
   */
  public boolean getLexSlashStarComments()
  {
    return slashStarComments;
  }

  /**
   *  Choose whether slash-star comments are recognized and treated as
   *  whitespace.
   *
   *  @param val true to skip slash-star comments as whitespace
   */
  public void setLexSlashStarComments(boolean val)
  {
    this.slashStarComments = val;
  }

  /**
   *  Report whether a backslash within a quoted string is recognized as
   *  a special escape character.
   *
   *  @return true if backslash escapes are interpreted inside strings
   */
  public boolean getUseBackslashEscapes()
  {
    return useBackslashEscapes;
  }

  /**
   *  Choose whether a backslash within a quoted string is recognized as
   *  a special escape character.
   *
   *  @param val true to interpret backslash escapes inside strings
   */
  public void setUseBackslashEscapes(boolean val)
  {
    this.useBackslashEscapes = val;
  }

  /**
   *  Report whether token strings are interned in this Lexer's local
   *  string pool.
   *
   *  @return true if internment of token strings is enabled
   */
  public boolean isInternmentEnabled()
  {
    return internmentEnabled;
  }

  /**
   *  Choose whether token strings are interned in this Lexer's local
   *  string pool.
   *
   *  @param enabled true to intern token strings, false to return them
   *                 as they are
   */
  public void setInternmentEnabled(boolean enabled)
  {
    this.internmentEnabled = enabled;
  }

  /**
   *  Extract the next token from the source and return its type, which is
   *  either one of the token-type constants this class takes from
   *  LexerTokenTypes, or the value of an otherwise unrecognized character.
   *  If pushBack() was called since the last token was read, the previous
   *  token type is returned again and no input is consumed.
   *
   *  @return the type of the next token
   *  @throws IOException if reading from the underlying Reader fails
   */
  public int getToken()
       throws IOException
  {
    if (!tokenPushedBack)
      {
        lastTokenType = getToken(true);
      }
    else
      {
        // hand the remembered token out once more
        tokenPushedBack = false;
      }

    return lastTokenType;
  }

  /**
   *  Read the rest of the current line verbatim and return it, setting
   *  the token-type to LINE_LITERAL.  The return value does not include the
   *  newline character at the end of the line; carriage returns are
   *  dropped as well.
   *
   *  This method can be used by parsers to perform special-purpose handling
   *  of line-oriented input.
   *
   *  @return the remainder of the current line, or null when the end of
   *          the input has already been reached and there is nothing
   *          left to read
   *  @throws IOException if reading from the underlying Reader fails
   */
  public String readCurrentLine()
       throws IOException
  {
    if (done)
      {
        return(null);
      }

    tokenWhiteSpace.setLength(0);
    tokenSource.setLength(0);

    tokenLineNumber = lineNumber;
    tokenPosition = position;
    lastTokenType = LINE_LITERAL;

    boolean consumedAny = false;
    int n;
    while ((n = read()) != -1)
      {
        consumedAny = true;
        char c = (char)n;
        if (c == '\n')
          {
            lineNumber++;
            break;
          }
        if (c != '\r')          // carriage returns are just consumed
          {
            tokenSource.append(c);
          }
      }

    if (!consumedAny)
      {
        // The very first read hit end of input, so there is no line to
        // return.  Record the end of input and report it with null, the
        // same value later calls return; otherwise a caller looping
        // until null would receive empty strings forever.
        lastTokenType = endOfFile();
        return(null);
      }

    return(tokenSource.toString());
  }

  /**
   *  Push back the last read token so that the next call to getToken()
   *  returns the same token type again instead of reading further input.
   *  Only one token is remembered; repeated calls have no further effect.
   */
  public void pushBack()
  {
    this.tokenPushedBack = true;
  }


  ////////////////////////////////////////////////////////////////////////////
  ///
  ///  Private internals
  ///
  ////////////////////////////////////////////////////////////////////////////

  /**
   *  Read one character from the underlying reader and advance the
   *  running source position.  The position is advanced before the read,
   *  and also when the read reports end of input.
   *
   *  @return the character read, or -1 at end of input
   *  @throws IOException if the underlying Reader fails
   */
  private int read()
       throws IOException
  {
    position += 1;
    return reader.read();
  }

  /**
   *  Undo the most recent read(): step the source position back by one
   *  and return the character to the pushback reader.
   *
   *  The end-of-input marker (-1) is never handed to the reader.
   *  PushbackReader.unread(int) stores its argument as a char, so pushing
   *  back -1 would make the next read() return the character U+FFFF
   *  instead of -1, and the lexer would report a bogus extra token just
   *  before EOF.  Skipping the push-back is enough, because the next
   *  read() on the exhausted reader reports -1 again.
   *
   *  @param c the value most recently returned by read(), possibly -1
   *  @throws IOException if the underlying Reader fails
   */
  private void unread(int c)
       throws IOException
  {
    position--;
    if (c != -1)
      {
        reader.unread(c);
      }
  }

  /**
   *  Record that the end of the input has been reached: mark the lexer as
   *  done, close the underlying reader, and place the token location at
   *  the current position and line.
   *
   *  @return the EOF token type
   *  @throws IOException if closing the Reader fails
   */
  private int endOfFile()
       throws IOException
  {
    this.done = true;
    this.reader.close();

    this.tokenPosition   = this.position;
    this.tokenLineNumber = this.lineNumber;

    return EOF;
  }

  /**
   *  Decide whether the character may begin an identifier under the
   *  currently selected identifier style.
   *
   *  @param c the candidate character
   *  @return true if c can start an identifier
   */
  private boolean isIdentifierStart(char c)
  {
    switch (idStyle)
      {
      case HTML_ID_STYLE:
        return Character.isLetter(c) || (c == '_') || (c == '-') || (c == '.');

      case SQL_ID_STYLE:
        return Character.isLetter(c) || (c == '_');

      case ALPHA_ID_STYLE:
      case ALPHANUM_ID_STYLE:
        return Character.isLetter(c);

      default:
        // JAVA_ID_STYLE, and any style value that is not recognized
        return Character.isJavaIdentifierStart(c);
      }
  }

  /**
   *  Decide whether the character may appear after the first character
   *  of an identifier under the currently selected identifier style.
   *
   *  @param c the candidate character
   *  @return true if c can continue an identifier
   */
  private boolean isIdentifierPart(char c)
  {
    switch (idStyle)
      {
      case HTML_ID_STYLE:
        return Character.isLetterOrDigit(c) ||
               (c == '_') || (c == '-') || (c == '.');

      case SQL_ID_STYLE:
        return Character.isLetterOrDigit(c) || (c == '_');

      case ALPHA_ID_STYLE:
        return Character.isLetter(c);

      case ALPHANUM_ID_STYLE:
        return Character.isLetterOrDigit(c);

      default:
        // JAVA_ID_STYLE, and any style value that is not recognized
        return Character.isJavaIdentifierPart(c);
      }
  }

  /**
   *  Core tokenizer.  Skips (but collects) whitespace and control
   *  characters, records where the token starts, and then classifies the
   *  token by its first character, delegating to the lexXxx helpers for
   *  anything longer than one character.
   *
   *  @param clearBuffers true to empty the whitespace and token-text
   *         buffers first; the comment lexer passes false when it resumes
   *         after a comment so that the comment text already collected as
   *         whitespace is kept
   *  @return the token type, or the character value of an unrecognized
   *          character, or EOF at end of input
   *  @throws IOException if reading from the underlying Reader fails
   */
  private int getToken(boolean clearBuffers)
       throws IOException
  {
    if (done)
      {
        return endOfFile();
      }

    if (clearBuffers)
      {
        tokenWhiteSpace.setLength(0);
        tokenSource.setLength(0);
      }

    // Consume leading whitespace and control characters, remembering
    // them so that the caller can retrieve them afterwards.
    int raw = read();
    while (raw != -1)
      {
        char blank = (char)raw;
        if (!(Character.isWhitespace(blank) || Character.isISOControl(blank)))
          {
            break;
          }

        tokenWhiteSpace.append(blank);
        if (blank == '\n')
          {
            lineNumber++;
          }
        raw = read();
      }

    if (raw == -1)
      {
        return endOfFile();
      }

    // The token proper starts at the character just read.
    char first = (char)raw;
    tokenLineNumber = lineNumber;
    tokenPosition   = position - 1;

    // Numbers and identifiers are tried first, because the identifier
    // style decides how some of the characters below are interpreted.
    if (Character.isDigit(first))
      {
        return lexNumber(first);
      }
    if (isIdentifierStart(first))
      {
        return lexIdentifier(first);
      }

    switch (first)
      {
      // tokens that are always exactly one character long
      case '+':   return PLUS;
      case '-':   return MINUS;
      case '*':   return TIMES;
      case '(':   return LPAR;
      case ')':   return RPAR;
      case '{':   return LBRACE;
      case '}':   return RBRACE;
      case '[':   return LSQUARE;
      case ']':   return RSQUARE;
      case ',':   return COMMA;
      case ':':   return COLON;
      case ';':   return SEMICOLON;
      case '@':   return ATSIGN;
      case '?':   return QUESTION;
      case '#':   return SHARP;
      case '%':   return PERCENT;
      case '~':   return TILDE;
      case '^':   return CARET;
      case '$':   return DOLLAR;

      // tokens that need a character of lookahead
      case '.':   return lexDOT();
      case '=':   return lexEQ();
      case '!':   return lexNOT();
      case '<':   return lexLT();
      case '>':   return lexGT();
      case '&':   return lexAND();
      case '|':   return lexOR();

      // a slash only needs lookahead when some comment style is enabled
      case '/':
        return (slashSlashComments || slashStarComments) ? lexSLASH()
                                                         : DIVIDE;

      // quoted strings; the opening quote selects the closing quote
      case '\'':
      case '"':
        return lexQuotedString(first);

      // anything else is returned as its own character value
      default:
        return first;
      }
  }
  
  /**
   *  Lex a token that starts with '.': either a floating point literal
   *  with no integer part, such as ".5", or a plain DOT.
   *
   *  @return FLOAT if a digit follows the dot, otherwise DOT
   *  @throws IOException if reading from the underlying Reader fails
   */
  private int lexDOT()
       throws IOException
  {
    int c = read();

    if (Character.isDigit((char)c))
      {
        // Keep the decimal point in the token text, exactly as lexNumber
        // does for "1.5"; without it ".5" would be reported as "5".
        tokenSource.append('.');
        return(lexFraction(c));
      }
    else
      {
        unread(c);
        return(DOT);
      }
  }

  /**
   *  Lex a token that starts with '=': "==" or a single "=".
   *
   *  @return DBL_EQUALS or SNGL_EQUALS
   *  @throws IOException if reading from the underlying Reader fails
   */
  private int lexEQ()
       throws IOException
  {
    int next = read();

    if (next != '=')
      {
        unread(next);
        return SNGL_EQUALS;
      }

    return DBL_EQUALS;
  }

  /**
   *  Lex a token that starts with '!': "!=" or a single "!".
   *
   *  @return NE or NOT
   *  @throws IOException if reading from the underlying Reader fails
   */
  private int lexNOT()
       throws IOException
  {
    int next = read();

    if (next != '=')
      {
        unread(next);
        return NOT;
      }

    return NE;
  }

  /**
   *  Lex a token that starts with '<': one of "<=", "<<", "<>" or a
   *  single "<".
   *
   *  @return LE, LSH, LTGT or LT
   *  @throws IOException if reading from the underlying Reader fails
   */
  private int lexLT()
       throws IOException
  {
    int next = read();

    switch (next)
      {
      case '=':
        return LE;

      case '<':
        return LSH;

      case '>':
        return LTGT;            // special '<>' token for preprocessor

      default:
        unread(next);
        return LT;
      }
  }

  /**
   *  Lex a token that starts with '>': one of ">=", ">>" or a single ">".
   *
   *  @return GE, RSH or GT
   *  @throws IOException if reading from the underlying Reader fails
   */
  private int lexGT()
       throws IOException
  {
    int next = read();

    switch (next)
      {
      case '=':
        return GE;

      case '>':
        return RSH;

      default:
        unread(next);
        return GT;
      }
  }

  /**
   *  Lex a token that starts with '&': "&&" or a single "&".
   *
   *  @return DBL_AND or SNGL_AND
   *  @throws IOException if reading from the underlying Reader fails
   */
  private int lexAND()
       throws IOException
  {
    int next = read();

    if (next != '&')
      {
        unread(next);
        return SNGL_AND;
      }

    return DBL_AND;
  }

  /**
   *  Lex a token that starts with '|': "||" or a single "|".
   *
   *  @return DBL_OR or SNGL_OR
   *  @throws IOException if reading from the underlying Reader fails
   */
  private int lexOR()
       throws IOException
  {
    int next = read();

    if (next != '|')
      {
        unread(next);
        return SNGL_OR;
      }

    return DBL_OR;
  }

  /**
   *  Lex a token that starts with '/' when at least one comment style is
   *  enabled.  A recognized comment is appended, delimiters included, to
   *  the whitespace buffer, and lexing then continues with the token that
   *  follows it (without clearing the buffers, so the comment text is
   *  kept).  Anything else is a DIVIDE token.
   *
   *  A slash-star comment only ends at a star-slash that comes after the
   *  opening delimiter, so the three characters slash-star-slash open a
   *  comment but do not close it.  A comment that runs into the end of
   *  the input marks the lexer as done.
   *
   *  @return the type of the token after the comment, or DIVIDE
   *  @throws IOException if reading from the underlying Reader fails
   */
  private int lexSLASH()
       throws IOException
  {
    int c = read();

    if (slashSlashComments && (c == '/'))
      {
        tokenWhiteSpace.append('/');

        // collect everything up to and including the end of the line
        while (c != -1)
          {
            tokenWhiteSpace.append((char)c);

            if (c == '\n')
              {
                lineNumber++;
                break;
              }

            c = read();
          }

        if (c == -1)
          {
            done = true;
          }

        return(getToken(false));
      }
    else if (slashStarComments && (c == '*'))
      {
        // Record the opening delimiter and step past it before looking
        // for the terminator, so that the opener's own '*' can never be
        // taken for the first half of the closing delimiter.
        tokenWhiteSpace.append('/');
        tokenWhiteSpace.append('*');
        c = read();

        int prevc = 0;
        while (c != -1)
          {
            tokenWhiteSpace.append((char)c);

            if ((prevc == '*') && (c == '/'))
              {
                break;
              }
            else if (c == '\n')
              {
                lineNumber++;
              }

            prevc = c;
            c = read();
          }

        if (c == -1)
          {
            done = true;
          }

        return(getToken(false));
      }
    else
      {
        unread(c);
        return(DIVIDE);
      }
  }

  /**
   *  Lex a quoted string whose opening quote has already been consumed.
   *  The characters up to the matching closing quote are collected in the
   *  token text without the quotes themselves.  When backslash escapes
   *  are enabled, the character after a backslash is translated
   *  (\b \f \n \r \t, unicode and octal escapes) or, for any other
   *  character, taken literally with the backslash dropped.  Newlines
   *  inside the string advance the line number.  If the input ends before
   *  the closing quote, the lexer is marked as done and the text
   *  collected so far is the token.
   *
   *  @param quoteChar the quote character that opened the string
   *  @return DBL_QUOTE_STR for a double-quoted string, otherwise
   *          SNGL_QUOTE_STR
   *  @throws IOException if reading from the underlying Reader fails
   */
  private int lexQuotedString(char quoteChar)
       throws IOException
  {
    int c = read();

    while ((c != -1) && (c != quoteChar))
      {
        if (c == '\n')
          {
            lineNumber++;
          }
        else if ((c == '\\') && useBackslashEscapes)
          {
            c = read();

            if (c == -1)
              {
                // A backslash was the last character of the input.  Stop
                // here rather than appending the end-of-input marker to
                // the token text as if it were a character.
                break;
              }

            switch (c)
              {
              case '\'':                                break;
              case '\"':                                break;
              case '\\':                                break;
              case '\n':        lineNumber++;           break;
              case 'b':         c = '\b';               break;
              case 'f':         c = '\f';               break;
              case 'n':         c = '\n';               break;
              case 'r':         c = '\r';               break;
              case 't':         c = '\t';               break;
              case 'u':         c = readUnicode();      break;
              case '0': case '1': case '2': case '3':
                                c = readOctal(c);       break;
              }
          }

        tokenSource.append((char)c);
        c = read();
      }

    if (c == -1)
      {
        done = true;
      }

    if (quoteChar == '"')
      {
        return(DBL_QUOTE_STR);
      }
    else
      {
        return(SNGL_QUOTE_STR);
      }
  }

  /**
   *  Read the four hexadecimal digits of a unicode escape (the backslash
   *  and the 'u' have already been consumed) and return the character
   *  they encode.  All four characters are always consumed.  If any of
   *  them is not a hexadecimal digit, a message is printed on System.err
   *  and '?' is returned instead.
   *
   *  @return the decoded character, or '?' for a malformed escape
   *  @throws IOException if reading from the underlying Reader fails
   */
  private char readUnicode()
       throws IOException
  {
    int     value     = 0;
    boolean malformed = false;

    for (int i = 0; i < 4; i++)
      {
        int nibble = Character.digit((char)read(), 16);
        if (nibble < 0)
          {
            malformed = true;
          }
        value = (value << 4) + nibble;
      }

    if (malformed)
      {
        System.err.println("invalid unicode escape sequence on line " +
                           lineNumber);
        return '?';
      }

    return (char)value;
  }

  /**
   *  Read an octal character escape of the form \0 or \21 or \377.
   *  The argument is the first digit, either '0', '1', '2', or '3', which
   *  has already been consumed.  Up to two further octal digits
   *  ('0' to '7') are read; the first character that is not an octal
   *  digit ends the escape and is pushed back so that it is lexed as
   *  ordinary string content (or as the closing quote).
   *
   *  @param c the first octal digit of the escape
   *  @return the character with the given octal value
   *  @throws IOException if reading from the underlying Reader fails
   */
  private char readOctal(int c)
       throws IOException
  {
    int value = Character.digit((char)c, 8);    // first char is always ok

    for (int extra = 0; extra < 2; extra++)
      {
        int next = read();

        if ((next < '0') || (next > '7'))
          {
            // Not part of the escape.  The end-of-input marker is not
            // pushed back: the reader would hand it back as a character,
            // and the next read reports end of input again anyway.
            if (next != -1)
              {
                unread(next);
              }
            break;
          }

        value = (value << 3) + (next - '0');
      }

    return((char)value);
  }

  /**
   *  Lex a numeric literal whose first digit has already been read.
   *  A leading "0x" or "0X" hands over to the hexadecimal lexer and a
   *  decimal point hands over to the fraction lexer.  Otherwise the
   *  digits are collected, optionally followed by an exponent ('e' or
   *  'E', an optional sign, and digits).  The character that ends the
   *  number is pushed back.
   *
   *  @param n the first digit of the number
   *  @return INTEGER, HEX_INTEGER or FLOAT
   *  @throws IOException if reading from the underlying Reader fails
   */
  private int lexNumber(int n)
       throws IOException
  {
    char ch = (char)n;

    // a leading zero may introduce a hexadecimal literal
    if (n == '0')
      {
        tokenSource.append(ch);
        n  = read();
        ch = (char)n;

        if ((ch == 'x') || (ch == 'X'))
          {
            tokenSource.append(ch);
            return lexHexNumber(read());
          }
      }

    // integer digits
    while (Character.isDigit(ch))
      {
        tokenSource.append(ch);
        n  = read();
        ch = (char)n;
      }

    // a decimal point turns the literal into a fraction
    if (ch == '.')
      {
        tokenSource.append(ch);
        return lexFraction(read());
      }

    // an exponent makes it a float even without a decimal point
    boolean hasExponent = (ch == 'e') || (ch == 'E');
    if (hasExponent)
      {
        tokenSource.append(ch);
        n  = read();
        ch = (char)n;

        if ((ch == '-') || (ch == '+'))
          {
            tokenSource.append(ch);
            n  = read();
            ch = (char)n;
          }

        while (Character.isDigit(ch))
          {
            tokenSource.append(ch);
            n  = read();
            ch = (char)n;
          }
      }

    // the character that ended the number belongs to the next token
    unread(n);
    return hasExponent ? FLOAT : INTEGER;
  }

  /**
   *  Collect the digits of a hexadecimal literal whose "0x" prefix is
   *  already in the token text.  Digits are '0'-'9', 'a'-'f' and 'A'-'F'.
   *  The first character that is not a hexadecimal digit is pushed back.
   *
   *  @param n the first character after the "0x" prefix
   *  @return HEX_INTEGER
   *  @throws IOException if reading from the underlying Reader fails
   */
  private int lexHexNumber(int n)
       throws IOException
  {
    for (;;)
      {
        char ch = (char)n;

        boolean hexDigit = ((ch >= '0') && (ch <= '9')) ||
                           ((ch >= 'a') && (ch <= 'f')) ||
                           ((ch >= 'A') && (ch <= 'F'));
        if (!hexDigit)
          {
            break;
          }

        tokenSource.append(ch);
        n = read();
      }

    unread(n);
    return HEX_INTEGER;
  }

  /**
   *  Collect the part of a floating point literal that follows the
   *  decimal point: the fraction digits and an optional exponent ('e' or
   *  'E', an optional sign, and digits).  The character that ends the
   *  literal is pushed back.
   *
   *  @param n the first character after the decimal point
   *  @return FLOAT
   *  @throws IOException if reading from the underlying Reader fails
   */
  private int lexFraction(int n)
       throws IOException
  {
    char ch = (char)n;

    // fraction digits
    while (Character.isDigit(ch))
      {
        tokenSource.append(ch);
        n  = read();
        ch = (char)n;
      }

    // optional exponent
    if ((ch == 'e') || (ch == 'E'))
      {
        tokenSource.append(ch);
        n  = read();
        ch = (char)n;

        boolean signed = (ch == '-') || (ch == '+');
        if (signed)
          {
            tokenSource.append(ch);
            n  = read();
            ch = (char)n;
          }

        while (Character.isDigit(ch))
          {
            tokenSource.append(ch);
            n  = read();
            ch = (char)n;
          }
      }

    unread(n);
    return FLOAT;
  }

  /**
   *  Lex an identifier whose first character has already been read.
   *  Characters are collected for as long as the current identifier style
   *  accepts them.  The character that ends the identifier is pushed
   *  back; if the identifier is ended by the end of the input instead,
   *  the lexer is marked as done so that the next token is EOF.
   *
   *  @param n the first character of the identifier
   *  @return IDENTIFIER
   *  @throws IOException if reading from the underlying Reader fails
   */
  private int lexIdentifier(int n)
       throws IOException
  {
    char c = (char)n;

    while ((n != -1) && isIdentifierPart(c))
      {
        tokenSource.append(c);
        n = read();
        c = (char)n;
      }

    if (n == -1)
      {
        // End of input.  The marker must not be pushed back: once it is
        // narrowed to a char the reader would return it as the character
        // U+FFFF and a spurious token would appear before EOF.
        done = true;
      }
    else
      {
        unread(n);
      }

    return(IDENTIFIER);
  }

  /**
   *  Return the pooled instance of the given string.  The first time some
   *  contents are seen, the string itself becomes the pooled instance.
   *  When internment is disabled the argument is returned untouched and
   *  the pool is not consulted.
   *
   *  @param s the string to intern
   *  @return the pooled String with the same contents, or s itself
   */
  private String internString(String s)
  {
    if (internmentEnabled)
      {
        String pooled = (String)stringPool.get(s);
        if (pooled != null)
          {
            return pooled;
          }
        stringPool.put(s, s);
      }

    return s;
  }

  ///
  ///  Testing
  ///

  /**
   *  Simple test driver.  Lexes the file named by the first argument with
   *  both comment styles, backslash escapes and Java identifiers enabled.
   *  For every token, including the final EOF, it prints the line number,
   *  position and text on System.err, and echoes the preceding whitespace
   *  followed by the token text on System.out.  Exits with status 0 on
   *  success and 1 if no file was named or if lexing fails.
   *
   *  @param args command-line arguments; args[0] is the file to lex
   */
  public static void main(String args[])
  {
    if (args.length < 1)
      {
        // report the missing argument instead of failing on args[0]
        System.err.println("usage: java modelobjects.util.Lexer <source-file>");
        System.exit(1);
      }

    try {
      FileReader reader = new FileReader(args[0]);
      Lexer lexer = new Lexer(reader);

      lexer.setLexSlashSlashComments(true);
      lexer.setLexSlashStarComments(true);
      lexer.setUseBackslashEscapes(true);
      lexer.setIdStyle(JAVA_ID_STYLE);

      while (true)
        {
          int ttype = lexer.getToken();

          // the token text is needed for both outputs; compute it once
          String tokString = lexer.getTokenString(ttype);

          System.err.println(lexer.getTokenLineNumber() + "\t" +
                             lexer.getTokenPosition() + "\t" +
                             tokString);

          showToken(lexer, tokString);

          if (ttype == EOF)
            {
              break;
            }
        }
    }
    catch (Exception e) {
      e.printStackTrace();
      System.exit(1);
    }

    System.exit(0);
  }

  /**
   *  Echo one token on System.out: first the whitespace that preceded it
   *  in the source, then the token text itself.  The stream is flushed
   *  after each token.
   *
   *  @param lexer the lexer that produced the token
   *  @param tokenString the text of the token
   */
  static void showToken(Lexer lexer, String tokenString)
  {
    System.out.print(lexer.getWhiteSpaceBeforeToken());
    System.out.print(tokenString);
    System.out.flush();
  }

  ///
  ///  Representation
  ///

  // type of the most recently lexed token; handed out again after pushBack()
  private   int                 lastTokenType;
  // true when the next getToken() should return lastTokenType again
  private   boolean             tokenPushedBack;
  // character source, wrapped for single-character lookahead
  private   PushbackReader      reader;
  // text of the current variable token (identifier, number, string, line)
  private   StringBuffer        tokenSource;
  // whitespace and comment text collected before the current token
  private   StringBuffer        tokenWhiteSpace;
  // zero-based source position of the first character of the current token
  private   int                 tokenPosition;
  // line number on which the current token starts
  private   int                 tokenLineNumber;
  // running count of characters read, less those pushed back
  private   int                 position;
  // current line number, starting at 1
  private   int                 lineNumber;
  // set once the end of the input has been reached
  private   boolean             done;
  // whether double-slash comments are lexed as whitespace
  private   boolean             slashSlashComments;
  // whether slash-star comments are lexed as whitespace
  private   boolean             slashStarComments;
  // whether backslash is an escape character inside quoted strings
  private   boolean             useBackslashEscapes;
  // identifier style, one of the *_ID_STYLE constants
  private   int                 idStyle;
  // whether token strings are interned through stringPool
  private   boolean             internmentEnabled;
  // local intern pool; each pooled String is stored as both key and value
  private   Hashtable           stringPool;
}
