19.4 An XML Pull Parser
The SAX
API for parsing XML is called a
"push-parsing"
API because the SAX parser "pushes"
tokens or events to your code. It is also possible to parse XML using
the reverse architecture, in which your code
"pulls" tokens from the XML parser
as needed. The TAX class of Example 19-6 does just that. TAX is an
acronym for Trivial API for XML. It is a pull parser (actually more
of a fancy tokenizer) for a subset of XML. Despite its simplicity, it
is useful for a variety of XML parsing tasks. The
TAX.Parser class relies on the
Tokenizer
interface (defined in Example 2-8) and on the
various implementations of that interface (see Chapter 2, Chapter 3, and Chapter 6). The TAX class is simply a
holder for inner classes and token type constants. In addition to the
TAX.Parser class, TAX also
holds a Token class, a
TokenType class (an enumerated type), and a
ParseException class.
Before diving into the details of the TAX parser,
it is probably easiest to first study how the parser is typically
used. Example 19-5 is a program much like the
ListServlets program of Example 19-1: it parses a web.xml file
and outputs the servlet name-to-class mappings and name-to-URL
mappings defined in the file.
Example 19-5. ListServlets2.java
package je3.xml;
import java.io.*;
/**
* Parse a web.xml file using the TAX pull parser and print out the servlet
* name-to-class and name-to-url mappings.
**/
public class ListServlets2 {
    public static void main(String[] args)
        throws IOException, TAX.ParseException
    {
        // Build a pull parser over the web.xml file named on the command line
        TAX.Parser parser = new TAX.Parser(new FileReader(args[0]));
        // The parser returns TAG, TEXT and ENDTAG tokens by default;
        // this program has no use for ENDTAG tokens, so suppress them too.
        parser.ignoreTokens(TAX.ENDTAG);
        // Pull tokens from the parser until the input is exhausted
        TAX.Token token;
        while ((token = parser.next()) != null) {
            // Only element start tags are of interest here
            if (token.type() != TAX.TAG) continue;
            String tag = token.text();
            if (tag.equals("servlet")) {
                // We found <servlet>: report the name-to-class mapping
                parser.expect("servlet-name");          // <servlet-name> must follow
                String name = parser.expect(TAX.TEXT).text();
                parser.expect("servlet-class");         // then <servlet-class>
                String impl = parser.expect(TAX.TEXT).text();
                System.out.println("Servlet " + name +
                                   " implemented by " + impl);
            }
            else if (tag.equals("servlet-mapping")) {
                // We found <servlet-mapping>: report the name-to-URL mapping
                parser.expect("servlet-name");
                String name = parser.expect(TAX.TEXT).text();
                parser.expect("url-pattern");
                String pattern = parser.expect(TAX.TEXT).text();
                System.out.println("Servlet " + name +
                                   " mapped to " + pattern);
            }
        }
    }
}
Example 19-6. TAX.java
package je3.xml;
import java.util.*;
import java.io.*;
import java.nio.channels.*;
import java.nio.charset.*;
import je3.classes.Tokenizer;
import je3.classes.CharSequenceTokenizer;
import je3.io.ReaderTokenizer;
import je3.nio.ChannelTokenizer;
/**
* This class, whose name is an acronym for "Trivial API for XML", is a
* container for a simple Parser class for parsing XML and its related Token,
* TokenType and ParseException classes and constants.
*
* TAX.Parser is a simple, lightweight pull-parser that is useful for a variety
* of simple XML parsing tasks. Note, however, that it is more of a tokenizer
* than a true parser and that the grammar it parses is not actually XML, but a
* simplified subset of XML. The parser has (at least) these limitations:
*
* It does not enforce well-formedness. For example, it does not require
* tags to be properly nested.
* It is not a validating parser, and does not read external DTDs
* It does not parse the internal subset of the DOCTYPE tag, and cannot
* recognize any entities defined there.
* It is not namespace-aware
* It does not handle entity or character references in attribute values,
* not even pre-defined entities such as &amp;quot;
* It strips all whitespace from the start and end of document text, which,
* while useful for many documents, is not generally correct.
* It makes no attempt to do error recovery. The results of calling next( )
* after a ParseException is thrown are undefined.
* It does not provide enough detail to reconstruct the source document
*
* TAX.Parser always replaces entity references with their values, or throws
* a Tax.ParseException if no replacement value is known. The parser coalesces
* adjacent text and entities into a single TEXT token. CDATA sections are
* also returned as TEXT tokens, but are not coalesced.
**/
public class TAX {
    // Enumerated type return values for Token.type()
    public static final TokenType TAG = new TokenType("TAG");
    public static final TokenType ENDTAG = new TokenType("ENDTAG");
    public static final TokenType TEXT = new TokenType("TEXT");
    public static final TokenType COMMENT = new TokenType("COMMENT");
    public static final TokenType PI = new TokenType("PI");
    public static final TokenType DOCTYPE = new TokenType("DOCTYPE");
    public static final TokenType XMLDECL = new TokenType("XMLDECL");

    // A type-safe enumeration for token types. Note the private constructor:
    // the only instances are the constants declared above.
    public static class TokenType {
        private static int nextOrdinal = 0;
        private final int ordinal = nextOrdinal++;
        private final String name;
        private TokenType(String name) { this.name = name; }
        public String toString() { return name; }
    }

    // Token objects are the return value of the Parser.next() method.
    // They provide details about what was parsed and where.
    public static class Token {
        TokenType type;  // One of the constants above
        String text;     // Tagname for TAG & XMLDECL,
                         // complete text minus delimiters for other types
        int line, column;// Position of start of token
        Map attributes;  // name/value map for TAG and XMLDECL, null otherwise
        boolean empty;   // true for XMLDECL and TAGs ending with "/>".

        // We use this constructor for TAG and XMLDECL tokens
        Token(TokenType t, String s, int l, int c, Map a, boolean e) {
            this(t, s, l, c);
            this.attributes = a;
            this.empty = e;
        }
        // This constructor for other token types
        Token(TokenType type, String text, int line, int column) {
            this.type = type;
            this.text = text;
            this.line = line;
            this.column = column;
        }
        // Property accessor methods
        public TokenType type() { return type; }
        public String text() { return text; }
        public int line() { return line; }
        public int column() { return column; }
        public Map attributes() { return attributes; }
        public boolean empty() { return empty; }
    }

    // Exceptions of this type are thrown for syntax errors or unknown entities
    public static class ParseException extends Exception {
        public ParseException(String msg) { super(msg); }
        // Factory for "expected X" errors. FIX: the token may be null when
        // the input ended prematurely; the original dereferenced it and threw
        // NullPointerException instead of the documented ParseException.
        static ParseException expected(Token t, String expected) {
            if (t == null)
                return new ParseException("Expected " + expected +
                                          " at end of input");
            return new ParseException("Expected " + expected + " at line " +
                                      t.line() + ", column " + t.column());
        }
    }

    // This is the parser class. It relies internally on a Tokenizer.
    // The public constructors allow you to parse XML from a CharSequence,
    // a Reader, or a Channel. By default, it will return tokens of type TAG,
    // ENDTAG, and TEXT, and will ignore all others. You can change this
    // behavior by passing token type constants to returnTokens() or
    // ignoreTokens(). By default the parser will replace character entities
    // and the pre-defined entities &amp;, &lt;, &gt;, &quot;, and &apos; with
    // their values. You can define new entity name/replacement pairs by
    // calling defineEntity(). These configuration methods all return the
    // Parser object so calls can be chained. After configuring your Parser,
    // call the next() method repeatedly until it returns null.
    public static class Parser {
        Tokenizer tokenizer; // Used to break up the input
        Map entityMap;       // Maps entity name to replacement text
        // Should we return tokens of these types? Indexed by TokenType.ordinal
        boolean[] returnTokenType = new boolean[TokenType.nextOrdinal];

        public Parser(CharSequence text) {
            this(new CharSequenceTokenizer(text));
        }
        public Parser(Reader in) {
            this(new ReaderTokenizer(in));
        }
        public Parser(ReadableByteChannel in, Charset encoding) {
            this(new ChannelTokenizer(in, encoding));
        }
        Parser(Tokenizer tokenizer) {
            this.tokenizer = tokenizer;
            tokenizer.tokenizeSpaces(true); // always tokenize spaces
            tokenizer.trackPosition(true);  // track line and column #
            // We don't always want the tokenizer to tokenize words, but when
            // we do, this is how we want the words formed. This approximates
            // the XML Name production: names start with a letter, '_' or ':'.
            tokenizer.wordRecognizer(new Tokenizer.WordRecognizer() {
                public boolean isWordStart(char c) {
                    return Character.isLetter(c) || c == '_' || c == ':';
                }
                public boolean isWordPart(char c, char first) {
                    if (Character.isLetterOrDigit(c) ||
                        c == '_' || c == '-' || c == '.' || c == ':')
                        return true;
                    int type = Character.getType(c);
                    return type == Character.COMBINING_SPACING_MARK ||
                           type == Character.ENCLOSING_MARK ||
                           type == Character.NON_SPACING_MARK ||
                           type == Character.MODIFIER_LETTER;
                }
            });
            // Set pre-defined entities (XML 1.0 section 4.6)
            entityMap = new HashMap();
            entityMap.put("lt", "<");
            entityMap.put("gt", ">");
            entityMap.put("amp", "&");
            entityMap.put("quot", "\"");
            entityMap.put("apos", "'");
            // Set default values for what token types to return
            returnTokenType[TAG.ordinal] = true;
            returnTokenType[ENDTAG.ordinal] = true;
            returnTokenType[TEXT.ordinal] = true;
        }
        // Ask next() to return tokens of the specified type.
        public Parser returnTokens(TokenType t) {
            returnTokenType[t.ordinal] = true;
            return this;
        }
        // Ask next() to silently skip tokens of the specified type.
        public Parser ignoreTokens(TokenType t) {
            returnTokenType[t.ordinal] = false;
            return this;
        }
        // Define a mapping from entity name to entity replacement.
        // Note that the entity name should not include the & or ; delimiters.
        public Parser defineEntity(String name, String replacement) {
            entityMap.put(name, replacement);
            return this;
        }
        // This utility method is for reporting parsing errors. It always
        // throws, appending the tokenizer's current position to the message.
        void syntax(String msg) throws ParseException {
            throw new ParseException(msg + " at " +
                                     tokenizer.tokenLine() + ":" +
                                     tokenizer.tokenColumn());
        }
        // This method returns the next XML token of input or null if there
        // is no more input to parse.
        public Token next() throws ParseException, IOException {
            Token token = null;
            // Loop until we find a token of a type we're configured to return
            for (;;) {
                // Invariant: we keep the tokenizer on the first unparsed token
                // This means we start our methods by calling tokenType()
                // to examine what we're currently on, not by calling next().
                // But we end by calling next() to consume the stuff we've
                // already seen.
                int t = tokenizer.tokenType();
                // If we're at the tokenizer's start state, then read a token
                if (t == Tokenizer.BOF) t = tokenizer.next();
                // If there is no more input, return null
                if (t == Tokenizer.EOF) return null;
                // Skip any space. This is not technically correct: we don't
                // know if this is ignorable whitespace or not. But in
                // practice, most clients will want to ignore it.
                if (t == Tokenizer.SPACE) {
                    tokenizer.next();
                    continue;
                }
                // If the token is an open angle bracket, then this is markup;
                // otherwise it is text.
                if (t == '<') token = parseMarkup();
                else token = parseText();
                // If the token we've parsed is one of the kind to be returned,
                // then return it. Otherwise, continue looping for a new token
                if (returnTokenType[token.type.ordinal]) return token;
            }
        }
        // Read the next token and return it if it is a TAG with the specified
        // tagname. Otherwise throw a ParseException (also at end of input).
        public Token expect(String tagname) throws ParseException, IOException {
            Token t = next();
            if (t == null || t.type() != TAG || !t.text().equals(tagname))
                throw ParseException.expected(t, "<" + tagname + ">");
            return t;
        }
        // Read and return the next token, if it is of the specified type.
        // Otherwise throw a ParseException (also at end of input).
        public Token expect(TokenType type) throws ParseException, IOException {
            Token t = next();
            if (t == null || t.type() != type)
                throw ParseException.expected(t, type.toString());
            return t;
        }
        // This method is called with a current token of '<' to parse various
        // forms of XML markup: XMLDECL, PI, DOCTYPE, CDATA, COMMENT,
        // ENDTAG and TAG tokens all begin with '<'.
        Token parseMarkup() throws ParseException, IOException {
            assert tokenizer.tokenType() == '<' : tokenizer.tokenType();
            try {
                // Turn on word tokenizing. It is turned off in finally clause.
                tokenizer.tokenizeWords(true);
                int t = tokenizer.next();
                if (t == '?') { // Markup is a PI or XMLDECL
                    t = tokenizer.next();
                    if (t != Tokenizer.WORD) syntax("XMLDECL or PI expected");
                    if (tokenizer.tokenText().equals("xml")) {
                        Token token =
                            new Token(XMLDECL, tokenizer.tokenText(),
                                      tokenizer.tokenLine(),
                                      tokenizer.tokenColumn() - 2,
                                      parseAttributes(),
                                      true);
                        if (tokenizer.tokenType() != '?') syntax("'?' expected");
                        if (tokenizer.next() != '>') syntax("'>' expected");
                        return token;
                    }
                    else {
                        Token token = new Token(PI, null, tokenizer.tokenLine(),
                                                tokenizer.tokenColumn() - 2);
                        // Read to end of PI
                        tokenizer.scan("?>", true, true, false, true);
                        token.text = tokenizer.tokenText();
                        return token;
                    }
                }
                if (t == '!') { // Markup is DOCTYPE, CDATA, or Comment
                    t = tokenizer.next();
                    if (t == Tokenizer.WORD &&
                        tokenizer.tokenText().equals("DOCTYPE")) {
                        return parseDoctype();
                    }
                    else if (t == '[') {
                        if (tokenizer.next() == Tokenizer.WORD &&
                            tokenizer.tokenText().equals("CDATA") &&
                            tokenizer.next() == '[') {
                            Token token = new Token(TEXT, null,
                                                    tokenizer.tokenLine(),
                                                    tokenizer.tokenColumn() - 8);
                            tokenizer.scan("]]>", true, false, false, true);
                            token.text = tokenizer.tokenText();
                            return token;
                        }
                        else syntax("CDATA expected");
                    }
                    else if (t == '-' && tokenizer.next() == '-') {
                        // a COMMENT token
                        Token token = new Token(COMMENT, null,
                                                tokenizer.tokenLine(),
                                                tokenizer.tokenColumn() - 4);
                        tokenizer.scan("-->", true, false, false, true);
                        token.text = tokenizer.tokenText();
                        return token;
                    }
                    else syntax("DOCTYPE, CDATA, or Comment expected");
                }
                if (t == '/') { // Markup is an element end tag
                    t = tokenizer.next();
                    if (t == Tokenizer.WORD) {
                        Token token = new Token(ENDTAG, tokenizer.tokenText(),
                                                tokenizer.tokenLine(),
                                                tokenizer.tokenColumn() - 2);
                        t = tokenizer.next();
                        if (t == Tokenizer.SPACE) t = tokenizer.next();
                        if (t != '>') syntax("Expected '>'");
                        return token;
                    }
                    else syntax("ENDTAG expected.");
                }
                if (t == Tokenizer.WORD) { // Markup is an element start tag
                    Token token = new Token(TAG, tokenizer.tokenText(),
                                            tokenizer.tokenLine(),
                                            tokenizer.tokenColumn() - 1,
                                            parseAttributes(),
                                            tokenizer.tokenType() == '/');
                    if (tokenizer.tokenType() == '/') tokenizer.next();
                    if (tokenizer.tokenType() != '>') syntax("'>' expected");
                    return token;
                }
                // If none of the above matched, this is a syntax error
                syntax("Invalid character following '<'");
                // The compiler doesn't realize that syntax() never returns,
                // so it requires a return statement here.
                return null;
            }
            finally {
                // restore tokenizer state
                tokenizer.tokenizeWords(false);
                // Get the next token ready
                tokenizer.next();
            }
        }
        // Parse a DOCTYPE declaration. Called with the tokenizer on the
        // "DOCTYPE" word. The internal subset (between '[' and ']'), if any,
        // is captured verbatim but not parsed.
        Token parseDoctype() throws IOException {
            assert (tokenizer.tokenType() == Tokenizer.WORD &&
                    tokenizer.tokenText().equals("DOCTYPE"));
            int line = tokenizer.tokenLine();
            int column = tokenizer.tokenColumn();
            StringBuffer b = new StringBuffer();
            int t = tokenizer.next();
            while (t != '>' && t != '[' && t != Tokenizer.EOF) {
                b.append(tokenizer.tokenText());
                t = tokenizer.next();
            }
            if (t == '[') { // If there is an internal subset, scan for its end
                tokenizer.scan("]>", true, true, false, true);
                b.append(tokenizer.tokenText());
                b.append(']');
            }
            return new Token(DOCTYPE, b.toString(), line, column);
        }
        // Parse a sequence of name=value attributes, where value is always
        // quoted in single or double quotes, and return them as a Map.
        // When this method is called, the tokenizer is looking at the element
        // name, not at the first token to parse.
        // This is used when parsing element start tags and XMLDECLs
        Map parseAttributes() throws ParseException, IOException {
            try {
                // Adjust tokenizer to recognize quotes.
                // Defaults are restored in finally clause below
                tokenizer.quotes("'\"", "'\"");
                int t = tokenizer.next(); // Consume the element name
                // Skip optional space
                if (t == Tokenizer.SPACE) t = tokenizer.next();
                // This is a special case for elements with no attributes
                if (t != Tokenizer.WORD) return Collections.EMPTY_MAP;
                Map m = new HashMap(); // Where we'll store attributes
                while (t == Tokenizer.WORD) {
                    String name = tokenizer.tokenText(); // get attribute name
                    // The next token must be '='
                    if (tokenizer.next() != '=') syntax("'=' expected");
                    t = tokenizer.next();
                    // The next token must be a quoted string
                    if (t != '"' && t != '\'')
                        syntax("quoted attribute value expected");
                    // Map attribute name to attribute value.
                    // The tokenizer strips the quotes for us.
                    // Note that we do not handle entity references here.
                    m.put(name, tokenizer.tokenText());
                    // Consume the value and skip an optional space after it
                    t = tokenizer.next();
                    if (t == Tokenizer.SPACE) t = tokenizer.next();
                }
                return m;
            }
            finally { // Always turn off quote tokenizing
                tokenizer.quotes("", "");
            }
        }
        // Coalesce any character data and entity references into a single
        // TEXT token and return it, or throw an exception for undefined
        // entities. Note that CDATA elements are also returned as TEXT
        // tokens but are not coalesced like this. When this method is called
        // we know that the tokenizer is looking at a char other than '<'.
        Token parseText() throws ParseException, IOException {
            assert tokenizer.tokenType() != '<' : tokenizer.tokenType();
            // Save line and column info of the start of the text
            int line = tokenizer.tokenLine();
            int column = tokenizer.tokenColumn();
            StringBuffer b = new StringBuffer(); // where we accumulate text
            int t;
            while ((t = tokenizer.tokenType()) != '<') {
                if (t == '&') b.append(parseEntityReference());
                else {
                    // Otherwise we've found some text
                    tokenizer.scan("<&", // scan until we find one of these
                        false, // just match one, not the whole string
                        true,  // extend the token we've already started
                        false, // don't include delimiter char in the token
                        false);// don't skip delimiter; save for next token
                    b.append(tokenizer.tokenText());
                    tokenizer.next();
                }
            }
            // Strip leading and trailing space and return as a TEXT token
            return new Token(TEXT, b.toString().trim(), line, column);
        }
        // Parse a reference to a general entity or character entity and
        // return its value as a string, or throw an exception for undefined
        // entities. Called when tokenizer is looking at an '&'.
        String parseEntityReference() throws ParseException, IOException {
            assert tokenizer.tokenType() == '&' : tokenizer.tokenType();
            String s = null;
            try {
                tokenizer.tokenizeWords(true);
                int t = tokenizer.next();
                if (t == '#') { // if it's a character reference
                    tokenizer.tokenizeNumbers(true);
                    t = tokenizer.next();
                    String text = tokenizer.tokenText();
                    if (t == Tokenizer.NUMBER) { // a decimal character ref
                        int n = Integer.parseInt(text);  // parse as base-10
                        s = Character.toString((char)n); // convert to string
                    }
                    // FIX: hexadecimal references (XML 1.0 sect. 4.1) look
                    // like "&#x41;", so the word must START with 'x'. The
                    // original condition was inverted (!= 'x'), which
                    // rejected every legal hex reference.
                    else if (t == Tokenizer.WORD && text.charAt(0) == 'x') {
                        // a hexadecimal character reference
                        String hex = text.substring(1);    // skip the 'x'
                        int n = Integer.parseInt(hex, 16); // parse as hex
                        s = Character.toString((char)n);   // convert to string
                    }
                    else syntax("illegal character following '&#'");
                }
                else { // otherwise a regular entity reference
                    if (t != Tokenizer.WORD) syntax("entity expected");
                    // look up entity replacement
                    s = (String) entityMap.get(tokenizer.tokenText());
                    if (s == null) syntax("Undefined entity: '&" +
                                          tokenizer.tokenText() + ";'");
                }
            }
            catch (NumberFormatException e) {
                // Convert NFE errors to syntax errors
                syntax("malformed character entity");
            }
            finally { // Restore tokenizer state
                tokenizer.tokenizeWords(false).tokenizeNumbers(false);
            }
            // Require and consume the trailing semicolon
            if (tokenizer.next() != ';') syntax("';' expected");
            tokenizer.next();
            return s;
        }
    }
}
|