[ Team LiB ] Previous Section Next Section

19.4 An XML Pull Parser

The SAX API for parsing XML is called a "push-parsing" API because the SAX parser "pushes" tokens or events to your code. It is also possible to parse XML using the reverse architecture, in which your code "pulls" tokens from the XML parser as needed. The TAX class of Example 19-6 does just that. TAX is an acronym for Trivial API for XML. It is a pull parser (actually more of a fancy tokenizer) for a subset of XML. Despite its simplicity, it is useful for a variety of XML parsing tasks. The TAX.Parser class relies on the Tokenizer interface (defined in Example 2-8) and on the various implementations of that interface (see Chapter 2, Chapter 3, and Chapter 6). The TAX class is simply a holder for inner classes and token type constants. In addition to the TAX.Parser class, TAX also holds a Token class, a TokenType class (an enumerated type), and a ParseException class.

Before diving into the details of the TAX parser, it is probably easiest to first study how the parser is typically used. Example 19-5 is a program much like the ListServlets program of Example 19-1: it parses a web.xml file and outputs the servlet name-to-class mappings and name-to-URL mappings defined in the file.

Example 19-5. ListServlets2.java
package je3.xml;
import java.io.*;

/**
 * Parse a web.xml file using the TAX pull parser and print out the servlet 
 * name-to-class and name-to-url mappings.
 **/
/**
 * Demonstrates the TAX pull parser: reads the web.xml deployment descriptor
 * named by args[0] and prints each servlet's name-to-class mapping and each
 * servlet-mapping's name-to-URL-pattern mapping.
 **/
public class ListServlets2  {
    public static void main(String[  ] args)
        throws IOException, TAX.ParseException
    {
        // Build a parser for the specified file.  The default configuration
        // returns TAG, TEXT, and ENDTAG tokens; we have no use for end tags,
        // so chain a call to suppress them (config methods return the parser).
        TAX.Parser parser =
            new TAX.Parser(new FileReader(args[0])).ignoreTokens(TAX.ENDTAG);

        // Pull tokens one at a time until the parser reports end of input
        TAX.Token token;
        while((token = parser.next( )) != null) {
            // Only element start tags are of interest here
            if (token.type( ) != TAX.TAG) continue;

            String tagname = token.text( );
            if (tagname.equals("servlet")) {
                // Inside <servlet>, require <servlet-name> and then
                // <servlet-class>, each immediately followed by its text
                parser.expect("servlet-name");
                String name = parser.expect(TAX.TEXT).text( );
                parser.expect("servlet-class");
                String classname = parser.expect(TAX.TEXT).text( );
                // Output name to class mapping
                System.out.println("Servlet " + name + 
                                   " implemented by " + classname);
            }
            else if (tagname.equals("servlet-mapping")) {
                // Inside <servlet-mapping>, require <servlet-name> and then
                // <url-pattern> in the same way
                parser.expect("servlet-name");
                String name = parser.expect(TAX.TEXT).text( );
                parser.expect("url-pattern");
                String pattern = parser.expect(TAX.TEXT).text( );
                System.out.println("Servlet " + name + 
                                   " mapped to " + pattern);
            }
        }
    }
}
Example 19-6. TAX.java
package je3.xml;
import java.util.*;
import java.io.*;
import java.nio.channels.*;
import java.nio.charset.*;
import je3.classes.Tokenizer;
import je3.classes.CharSequenceTokenizer;
import je3.io.ReaderTokenizer;
import je3.nio.ChannelTokenizer;

/**
 * This class, whose name is an acronym for "Trivial API for XML", is a 
 * container for a simple Parser class for parsing XML and its related Token, 
 * TokenType and ParseException classes and constants.
 * 
 * TAX.Parser is a simple, lightweight pull-parser that is useful for a variety
 * of simple XML parsing tasks. Note, however, that it is more of a tokenizer
 * than a true parser and that the grammar it parses is not actually XML, but a
 * simplified subset of XML. The parser has (at least) these limitations:
 *
 *   It does not enforce well-formedness. For example, it does not require
 *      tags to be properly nested.
 *   It is not a validating parser, and does not read external DTDs
 *   It does not parse the internal subset of the DOCTYPE tag, and cannot
 *      recognize any entities defined there.
 *   It is not namespace-aware
 *   It does not handle entity or character references in attribute values,
 *      not even pre-defined entities such as &quot;
 *   It strips all whitespace from the start and end of document text, which,
 *      while useful for many documents, is not generally correct.
 *   It makes no attempt to do error recovery.  The results of calling next( )
 *      after a ParseException is thrown are undefined.
 *   It does not provide enough detail to reconstruct the source document
 * 
 * TAX.Parser always replaces entity references with their values, or throws
 * a TAX.ParseException if no replacement value is known.  The parser coalesces
 * adjacent text and entities into a single TEXT token. CDATA sections are
 * also returned as TEXT tokens, but are not coalesced.
 **/
public class TAX {
    // Enumerated type return values for Token.type( )
    public static final TokenType TAG = new TokenType("TAG");
    public static final TokenType ENDTAG = new TokenType("ENDTAG");
    public static final TokenType TEXT = new TokenType("TEXT");
    public static final TokenType COMMENT = new TokenType("COMMENT");
    public static final TokenType PI = new TokenType("PI");
    public static final TokenType DOCTYPE = new TokenType("DOCTYPE");
    public static final TokenType XMLDECL = new TokenType("XMLDECL");

    // A type-safe enumeration of token types (the pre-Java-5 enum idiom).
    // The private constructor guarantees the only instances are the
    // constants declared above, so token types may be compared with ==.
    public static class TokenType {
        private static int nextOrdinal = 0;  // how many instances exist; also sizes Parser.returnTokenType
        private final int ordinal = nextOrdinal++;  // index of this type in Parser.returnTokenType
        private final String name;  // display name, e.g. "TAG"
        private TokenType(String name) { this.name = name; }
        public String toString( ) { return name; }
    }

    // Token objects are the return value of the Parser.next( ) method.
    // A Token bundles the kind of construct parsed, its text, and the
    // position in the input at which it began.
    public static class Token {
        TokenType type;    // One of the TAX token-type constants
        String text;       // Tagname for TAG & XMLDECL, 
                           // Complete text minus delimiters for other types
        int line, column;  // Position of start of token
        Map attributes;    // name/value map for TAG and XMLDECL,null otherwise
        boolean empty;     // true for XMLDECL and TAGs ending with "/>".
        
        // This constructor suffices for token types without attributes
        Token(TokenType type, String text, int line, int column) {
            this.type = type;
            this.text = text;
            this.line = line;
            this.column = column;
        }

        // TAG and XMLDECL tokens additionally carry an attribute map and
        // an "empty element" flag; delegate the common fields above.
        Token(TokenType type, String text, int line, int column,
              Map attributes, boolean empty) {
            this(type, text, line, column);
            this.attributes = attributes;
            this.empty = empty;
        }

        // Property accessor methods
        public TokenType type( ) { return type; }
        public String text( ) { return text; }
        public int line( ) { return line; }
        public int column( ) { return column; }
        public Map attributes( ) { return attributes; }
        public boolean empty( ) { return empty; }
    }

    // Exceptions of this type are thrown for syntax errors or unknown entities
    public static class ParseException extends Exception {
        public ParseException(String msg) { super(msg); }

        // Factory for "expected X" errors reported by Parser.expect( ).
        // The token may be null when input ended before the expected
        // construct; guard against that instead of throwing
        // NullPointerException from t.line( ).
        static ParseException expected(Token t, String expected) {
            if (t == null)
                return new ParseException("Expected " + expected +
                                          " but reached end of input");
            return new ParseException("Expected " + expected + " at line " +
                                      t.line( ) + ", column " + t.column( ));
        }
    }

    // This is the parser class. It relies internally on a Tokenizer.
    // The public constructors allow you to parse XML from a CharSequence,
    // a Reader, or a Channel.   By default, it will return tokens of type TAG,
    // ENDTAG, and TEXT, and will ignore all others.  You can change this
    // behavior by passing token type constants to returnTokens( ) or
    // ignoreTokens( ).  By default the parser will replace character entities
    // and the pre-defined entities &amp;, &lt;, &gt;, &quot;, and &apos; with
    // their values.  You can define new entity name/replacement pairs by
    // calling defineEntity( ).  These configuration methods all return the
    // Parser objects so calls can be chained.  After configuring your Parser,
    // call the next( ) method repeatedly until it returns null.
    public static class Parser {
        Tokenizer tokenizer;   // Used to break up the input
        Map entityMap;         // Map entity name to replacment
        // Should we return tokens of these types?
        boolean[  ] returnTokenType = new boolean[TokenType.nextOrdinal];

        // Parse XML held in an in-memory character sequence
        public Parser(CharSequence text) {
            this(new CharSequenceTokenizer(text));
        }
        // Parse XML read from a character stream
        public Parser(Reader in) {
            this(new ReaderTokenizer(in));
        }
        // Parse XML read from a byte channel, decoded with the given charset
        public Parser(ReadableByteChannel in, Charset encoding) {
            this(new ChannelTokenizer(in, encoding));
        }

        // Internal constructor: all public constructors delegate here.
        // It configures the underlying Tokenizer, installs the five
        // pre-defined XML entities, and sets the default token types.
        Parser(Tokenizer tokenizer) {
            this.tokenizer = tokenizer;
            tokenizer.tokenizeSpaces(true);  // always tokenize spaces
            tokenizer.trackPosition(true);   // track line and column #
            // We don't always want the tokenizer to tokenize words, but when
            // we do, this is how we want the words formed.  These rules
            // approximate XML Name syntax: a name starts with a letter, '_',
            // or ':' and continues with letters, digits, '_', '-', '.', ':',
            // or Unicode combining/modifier characters.
            tokenizer.wordRecognizer(new Tokenizer.WordRecognizer( ) {
                    public boolean isWordStart(char c) {
                        return Character.isLetter(c) || c == '_' || c == ':';
                    }
                    public boolean isWordPart(char c, char first) {
                        if (Character.isLetterOrDigit(c) ||
                            c == '_' || c=='-' || c=='.' || c==':')
                            return true;
                        int type = Character.getType(c);
                        return type == Character.COMBINING_SPACING_MARK ||
                            type == Character.ENCLOSING_MARK ||
                            type == Character.NON_SPACING_MARK ||
                            type == Character.MODIFIER_LETTER;
                    }
                });

            // Set pre-defined entities; defineEntity( ) may add more or
            // override these.
            entityMap = new HashMap( );
            entityMap.put("lt", "<");
            entityMap.put("gt", ">");
            entityMap.put("amp", "&");
            entityMap.put("quot", "\"");
            entityMap.put("apos", "'");

            // Set default values for what token types to return: markup
            // details (COMMENT, PI, DOCTYPE, XMLDECL) are skipped by default.
            returnTokenType[TAG.ordinal] = true;
            returnTokenType[ENDTAG.ordinal] = true;
            returnTokenType[TEXT.ordinal] = true;
        }

        // Request that tokens of type t be returned by next( ); by default
        // only TAG, ENDTAG, and TEXT are returned.  Returns this Parser so
        // configuration calls can be chained.
        public Parser returnTokens(TokenType t) {
            returnTokenType[t.ordinal] = true;
            return this;
        }

        // Request that tokens of type t be parsed but not returned by
        // next( ).  Returns this Parser so configuration calls can be chained.
        public Parser ignoreTokens(TokenType t) {
            returnTokenType[t.ordinal] = false;
            return this;
        }

        // Define a mapping from entity name to entity replacement.
        // Note that the entity name should not include the & or ; delimiters.
        // A later definition silently overrides an earlier one, including
        // the five pre-defined XML entities.  Returns this for chaining.
        public Parser defineEntity(String name, String replacement) {
            entityMap.put(name, replacement);
            return this;
        }

        // This utility method is for reporting parsing errors: it always
        // throws a ParseException tagged with the line:column position of
        // the current token.  It never returns normally.
        void syntax(String msg) throws ParseException {
            throw new ParseException(msg + " at " + 
                                     tokenizer.tokenLine( ) + ":" +
                                     tokenizer.tokenColumn( ));
        }

        // This method returns the next XML token of input or null if there
        // is no more input to parse.  Tokens whose types have been disabled
        // with ignoreTokens( ) are still parsed, but are silently discarded.
        public Token next( ) throws ParseException, IOException { 
            Token token = null;

            // Loop until we find a token of a type the caller wants returned
            for(;;) {
                // Invariant: we keep the tokenizer on the first unparsed token
                // This means we start our methods by calling tokenType( )
                // to examine what we're currently on, not by calling next( ).
                // But we end by calling next( ) to consume the stuff we've
                // already seen.
                int t = tokenizer.tokenType( );

                // If we're at the tokenizer's start state, then read a token
                if (t == Tokenizer.BOF) t = tokenizer.next( );

                // If there is no more input, return null
                if (t == Tokenizer.EOF) return null;

                // Skip any space. This is not technically correct: we don't
                // know if this is ignorable whitespace or not. But in
                // practice, most clients will want to ignore it.
                if (t == Tokenizer.SPACE) {
                    tokenizer.next( );
                    continue;
                }
                
                // If the token is a open angle bracket, then this is markup
                // otherwise it is text.
                if (t == '<') token = parseMarkup( );
                else token = parseText( );

                // If the token we've parsed is one of the kind to be returned,
                // then return it.  Otherwise, continue looping for a new token
                if (returnTokenType[token.type.ordinal]) return token;
            }
        }

        // Read the next token and return it if it is a TAG with the specified
        // tagname.  Otherwise throw a ParseException.
        public Token expect(String tagname) throws ParseException,IOException {
            Token t = next( );
            // Report end-of-input explicitly: passing a null token on to
            // ParseException.expected( ) would dereference it and throw
            // NullPointerException instead of a ParseException.
            if (t == null)
                throw new ParseException("Expected <" + tagname +
                                         "> but reached end of input");
            if (t.type( ) != TAG || !t.text( ).equals(tagname)) 
                throw ParseException.expected(t, "<" + tagname + ">");
            return t;
        }

        // Read and return the next token, if it is of the specified type.
        // Otherwise throw a ParseException.
        public Token expect(TokenType type) throws ParseException,IOException {
            Token t = next( );
            // Report end-of-input explicitly: passing a null token on to
            // ParseException.expected( ) would dereference it and throw
            // NullPointerException instead of a ParseException.
            if (t == null)
                throw new ParseException("Expected " + type +
                                         " but reached end of input");
            if (t.type( ) != type)
                throw ParseException.expected(t, type.toString( ));
            return t;
        }

        // This method is called with a current token of '<' to parse the
        // various forms of XML markup: XMLDECL, PI, DOCTYPE, CDATA, comment,
        // start tag, and end tag.  The negative column adjustments back up
        // over delimiter characters already consumed (e.g. "<?") so the
        // token reports the position of the opening '<' — TODO confirm
        // against the Tokenizer position semantics.
        Token parseMarkup( ) throws ParseException, IOException {
            assert tokenizer.tokenType( ) == '<' : tokenizer.tokenType( );
            try {
                // Turn on word tokenizing. It is turned off in finally clause.
                tokenizer.tokenizeWords(true); 
                int t = tokenizer.next( );
                if (t == '?') {    // Markup is a PI or XMLDECL
                    t = tokenizer.next( );
                    if (t != Tokenizer.WORD) syntax("XMLDECL or PI expected");
                    if (tokenizer.tokenText( ).equals("xml")) {
                        // "<?xml ...?>": parse pseudo-attributes like a tag
                        Token token =
                            new Token(XMLDECL, tokenizer.tokenText( ),
                                      tokenizer.tokenLine( ),
                                      tokenizer.tokenColumn( ) - 2,
                                      parseAttributes( ),
                                      true);

                        if (tokenizer.tokenType( )!='?') syntax("'?' expected");
                        if (tokenizer.next( ) != '>') syntax("'>' expected");
                        return token;
                    }
                    else {
                        // A processing instruction with some other target
                        Token token = new Token(PI, null,tokenizer.tokenLine( ),
                                                tokenizer.tokenColumn( )-2);
                        // Read to end of PI
                        tokenizer.scan("?>", true, true, false, true);
                        token.text = tokenizer.tokenText( );
                        return token;
                    }
                }
                
                if (t == '!') {         // Markup is DOCTYPE, CDATA, or Comment
                    t = tokenizer.next( );
                    if (t == Tokenizer.WORD &&
                        tokenizer.tokenText( ).equals("DOCTYPE")) {
                        return parseDoctype( );
                    }
                    else if (t == '[') {
                        // "<![CDATA[ ... ]]>" is returned as a TEXT token
                        if (tokenizer.next( ) == Tokenizer.WORD &&
                            tokenizer.tokenText( ).equals("CDATA") &&
                            tokenizer.next( ) == '[') {
                            Token token = new Token(TEXT, null,
                                                    tokenizer.tokenLine( ),
                                                    tokenizer.tokenColumn( )-8);
                            tokenizer.scan("]]>", true, false, false, true);
                            token.text = tokenizer.tokenText( );
                            return token;
                        }
                        else syntax("CDATA expected");
                    }
                    else if (t == '-' && tokenizer.next( ) == '-') {
                        // a COMMENT token: everything up to "-->"
                        Token token = new Token(COMMENT, null, 
                                                tokenizer.tokenLine( ),
                                                tokenizer.tokenColumn( )-4);
                        tokenizer.scan("-->", true, false, false, true);
                        token.text = tokenizer.tokenText( );
                        return token;
                    }
                    else syntax("DOCTYPE, CDATA, or Comment expected");
                }
                if (t == '/') {    // Markup is an element end tag
                    t = tokenizer.next( );
                    if (t == Tokenizer.WORD) {
                        Token token = new Token(ENDTAG, tokenizer.tokenText( ),
                                                tokenizer.tokenLine( ),
                                                tokenizer.tokenColumn( )-2);
                        
                        // Optional whitespace is allowed before the '>'
                        t = tokenizer.next( );
                        if (t == Tokenizer.SPACE) t = tokenizer.next( );
                        if (t != '>') syntax("Expected '>'");
                        return token;
                    }
                    else syntax("ENDTAG expected.");
                }
                if (t == Tokenizer.WORD) { // Markup is an element start tag
                    // parseAttributes( ) consumes the tagname and attributes;
                    // a trailing '/' marks an empty element ("<br/>")
                    Token token = new Token(TAG, tokenizer.tokenText( ),
                                            tokenizer.tokenLine( ),
                                            tokenizer.tokenColumn( ) - 1,
                                            parseAttributes( ),
                                            tokenizer.tokenType( ) == '/');
                    
                    if (tokenizer.tokenType( ) == '/') tokenizer.next( );
                    if (tokenizer.tokenType( ) != '>') syntax("'>' expected");
                    return token;
                }
                
                // If none of the above matched, this is a syntax error
                syntax("Invalid character following '<'");

                // The compiler doesn't realize that syntax( ) never returns,
                // so it requires a return statement here.
                return null;
            }
            finally {
                // restore tokenizer state
                tokenizer.tokenizeWords(false);
                // Consume the trailing '>' (or scan delimiter) so the
                // tokenizer is left on the first unparsed token, preserving
                // the invariant documented in next( ).
                tokenizer.next( );           
            }
        }

        // Called from parseMarkup( ) with the tokenizer on the "DOCTYPE"
        // word.  Accumulates the doctype's text (minus delimiters) into a
        // DOCTYPE token.  An internal subset ("[...]") is appended verbatim
        // but not parsed, per the class comment.
        Token parseDoctype( ) throws IOException {
            assert (tokenizer.tokenType( ) == Tokenizer.WORD &&
                    tokenizer.tokenText( ).equals("DOCTYPE"));

            int line = tokenizer.tokenLine( );
            int column = tokenizer.tokenColumn( );
            StringBuffer b = new StringBuffer( );

            // Collect tokens until '>', the start of an internal subset, or EOF
            int t = tokenizer.next( );
            while(t != '>' && t != '[' && t != Tokenizer.EOF) {
                b.append(tokenizer.tokenText( ));
                t = tokenizer.next( );
            }
            
            if (t == '[') { // If there is an internal subset, scan for its end
                // NOTE(review): scan( ) appears to consume the "]>" here,
                // after which the finally clause in parseMarkup( ) advances
                // the tokenizer again — verify that a token following the
                // doctype is not skipped in this branch.
                tokenizer.scan("]>", true, true, false, true);
                b.append(tokenizer.tokenText( ));
                b.append(']');
            }

            return new Token(DOCTYPE, b.toString( ), line, column);
        }

        // Parse a sequence of name=value attributes, where value is always
        // quoted in single or double quotes, and return them as a Map.
        // When this method is called, the tokenizer is looking at the element
        // name, not at the first token to parse.
        // This is used when parsing element start tags and XMLDECLs.
        // Note (per the class comment) that entity and character references
        // in attribute values are NOT expanded here.
        Map parseAttributes( ) throws ParseException, IOException {
            try {
                // Adjust tokenizer to recognize quotes.
                // Defaults are restored in finally clause below
                tokenizer.quotes("'\"", "'\"");
                int t = tokenizer.next( ); // Consume the element name

                // Skip optional space
                if (t == Tokenizer.SPACE) t = tokenizer.next( );

                // This is a special case for elements with no attributes.
                // EMPTY_MAP is a shared immutable map; callers must not
                // attempt to modify the returned attributes.
                if (t != Tokenizer.WORD) return Collections.EMPTY_MAP; 

                Map m = new HashMap( );  // Where we'll store attributes

                while(t == Tokenizer.WORD) {
                    String name = tokenizer.tokenText( );  // get attribute name
                    // The next token must be '='
                    if (tokenizer.next( ) != '=') syntax("'=' expected");
                    t = tokenizer.next( );
                    // The next token must be a quoted string
                    if (t != '"' && t != '\'')
                        syntax("quoted attribute value expected");
                    // Map attribute name to attribute value.
                    // The tokenizer strips the quotes for us.
                    // Note that we do not handle entity references here.
                    m.put(name, tokenizer.tokenText( ));
                    // Consume the value and skip an optional space after it
                    t = tokenizer.next( );
                    if (t == Tokenizer.SPACE) t=tokenizer.next( );
                }
                return m;
            }
            finally { // Always turn off quote tokenizing
                tokenizer.quotes("", "");
            }
        }

        // Coalesce any character data and entity references into a single 
        // TEXT token and return it, or throw an exception for undefined
        // entities.  Note that CDATA elements are also returned as TEXT 
        // tokens but are not coalesced like this.  When this method is called
        // we know that the tokenizer is looking at a char other than '<'.
        Token parseText( ) throws ParseException, IOException {
            assert tokenizer.tokenType( ) != '<' : tokenizer.tokenType( );
            // Save line and column info of the start of the text
            int line = tokenizer.tokenLine( );
            int column = tokenizer.tokenColumn( );
            StringBuffer b = new StringBuffer( ); // where we accumulate text

            // Alternate between entity references and literal text runs
            // until the next markup ('<') begins
            int t;
            while((t = tokenizer.tokenType( )) != '<') {
                if (t == '&') b.append(parseEntityReference( ));
                else {
                    // Otherwise we've found some text
                    tokenizer.scan("<&",  // scan until we find one of these
                       false, // just match one, not the whole string
                       true,  // extend the token we've already started
                       false, // don't include delimiter char in the token
                       false);// don't skip delimiter; save for next token
                    b.append(tokenizer.tokenText( ));
                    tokenizer.next( );
                }
            }
            // Strip leading and trailing space (trim( ) removes both ends,
            // as documented in the class comment) and return a TEXT token
            return new Token(TEXT, b.toString( ).trim( ), line, column);
        }


        // Parse a reference to a general entity ("&name;") or character
        // entity ("&#65;" or "&#x41;") and return its replacement value as a
        // string, or throw a ParseException for undefined entities or
        // malformed references.  Called when tokenizer is looking at an '&'.
        String parseEntityReference( ) throws ParseException, IOException {
            assert tokenizer.tokenType( ) == '&' : tokenizer.tokenType( );
            String s = null;
            try {
                tokenizer.tokenizeWords(true);
                int t = tokenizer.next( );
                if (t == '#') {  // if it's a character reference
                    tokenizer.tokenizeNumbers(true);
                    t = tokenizer.next( );
                    String text = tokenizer.tokenText( );
                    if (t == Tokenizer.NUMBER) {  // a decimal character ref
                        int n = Integer.parseInt(text);  // parse as base-10
                        // Character.toChars( ) handles supplementary code
                        // points (unlike a (char) cast, which truncates) and
                        // rejects invalid ones with IllegalArgumentException,
                        // converted to a syntax error below.
                        s = new String(Character.toChars(n));
                    }
                    else if (t == Tokenizer.WORD && text.charAt(0) == 'x') {
                        // A hexadecimal character reference such as "&#x41;":
                        // the digits tokenize as a WORD beginning with the
                        // required lowercase 'x'.  (The original comparison
                        // was "!= 'x'", which rejected every legal hex
                        // reference as a syntax error.)
                        String hex = text.substring(1);    // skip the 'x'
                        int n = Integer.parseInt(hex, 16); // parse as hex
                        s = new String(Character.toChars(n));
                    }
                    else syntax("illegal character following '&#'");
                }
                else { // otherwise a regular entity reference
                    if (t != Tokenizer.WORD) syntax("entity expected");
                    // look up entity replacement
                    s = (String) entityMap.get(tokenizer.tokenText( ));
                    if (s == null) syntax("Undefined entity: '&" + 
                                          tokenizer.tokenText( ) + ";'");
                }
            }
            catch (IllegalArgumentException e) {
                // NumberFormatException (a subclass of IAE) covers
                // unparseable digits; other IllegalArgumentExceptions come
                // from Character.toChars( ) on out-of-range code points.
                // Report both as syntax errors.
                syntax("malformed character entity");
            }
            finally {  // Restore tokenizer state
                tokenizer.tokenizeWords(false).tokenizeNumbers(false);
            }
            
            // Require and consume the trailing semicolon
            if (tokenizer.next( ) != ';') syntax("';' expected");
            tokenizer.next( );
            return s;
        }
    }
}
    [ Team LiB ] Previous Section Next Section