[Opensrf-commits] r1880 - in trunk: include/opensrf src/c-apps src/libopensrf (scottmk)

svn at svn.open-ils.org svn at svn.open-ils.org
Tue Dec 29 08:22:50 EST 2009


Author: scottmk
Date: 2009-12-29 08:22:49 -0500 (Tue, 29 Dec 2009)
New Revision: 1880

Added:
   trunk/include/opensrf/jsonpush.h
   trunk/src/c-apps/format_json.c
   trunk/src/libopensrf/jsonpush.c
Log:
Add a stream parser for JSON, and a format_json utility
that uses it.

A    include/opensrf/jsonpush.h
A    src/c-apps/format_json.c
A    src/libopensrf/jsonpush.c


Added: trunk/include/opensrf/jsonpush.h
===================================================================
--- trunk/include/opensrf/jsonpush.h	                        (rev 0)
+++ trunk/include/opensrf/jsonpush.h	2009-12-29 13:22:49 UTC (rev 1880)
@@ -0,0 +1,90 @@
+/*
+Copyright (C) 2009 Equinox Software Inc.
+Scott McKellar <scott at esilibrary.com>
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+*/
+
+/**
+	@file jsonpush.h
+	@brief Push parser for JSON.
+
+	This parser provides a way to parse JSON incrementally, without necessarily holding the
+	entire JSON string (or any representation thereof) in memory at once.  It can therefore
+	be used, for example, to parse large input files.
+
+	How to use it:
+
+	1. Call jsonNewPushParser() to create a parser, designating a series of callback
+	functions to be called when the parser encounters various syntactic features.
+
+	2. Pass one or more buffers to jsonPush() for parsing.
+
+	3. When the last buffer has been parsed, call jsonPushParserFinish() to tell the parser
+	that no more input will be forthcoming.
+
+	4. Call jsonPushParserFree() to free the parser when you're done with it.
+
+	By using jsonPushParserReset(), you can reuse a parser for multiple streams, without
+	having to free and recreate it.
+
+	By using jsonPushParserResume(), you can accept multiple JSON values in the same stream.
+	It is identical to jsonPushParserReset(), except that it does not reset the line number
+	and column number used in error messages.
+
+	This parser does @em not give any special attention to OSRF-specific conventions for
+	encoding class information.
+*/
+
+#ifndef JSONPUSH_H
+#define JSONPUSH_H
+#include <stddef.h>	/* size_t, used by the jsonPush() prototype; makes this header self-contained */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct JSONPushParserStruct;
+typedef struct JSONPushParserStruct JSONPushParser;
+
+/** @brief A collection of callback pointers, passed to jsonNewPushParser(). */
+typedef struct {
+	/* Each callback gets back, as blob, the pointer that was given to jsonNewPushParser(). */
+	int (*handleString)( void* blob, const char* str );	/**< String literal other than an object key. */
+	int (*handleNumber)( void* blob, const char* str );	/**< Numeric literal, passed as text. */
+	int (*handleBeginArray )( void* blob );	/**< Left square bracket opening an array. */
+	int (*handleEndArray )( void* blob );	/**< Right square bracket closing an array. */
+	int (*handleBeginObj )( void* blob );	/**< Left curly brace opening an object. */
+	int (*handleObjKey )( void* blob, const char* key );	/**< Key of a key/value pair. */
+	int (*handleEndObj )( void* blob );	/**< Right curly brace closing an object. */
+	int (*handleBool)   ( void* blob, int b );	/**< Keyword true or false; b is used as a boolean. */
+	int (*handleNull)   ( void* blob );	/**< Keyword null. */
+	void (*handleEndJSON )( void* blob );	/**< End of a JSON value. */
+	void (*handleError)( void* blob, const char* msg, unsigned line, unsigned pos );	/**< Syntax error. */
+	/* The int callbacks return 0 for success; non-zero reports an error and stops the parser. */
+} JSONHandlerMap;
+/** @brief Create a parser that reports to the callbacks in @a map, handing each one @a blob. */
+JSONPushParser* jsonNewPushParser( const JSONHandlerMap* map, void* blob );
+/** @brief Prepare the parser for a new stream; line and column numbers start over. */
+void jsonPushParserReset( JSONPushParser* parser );
+/** @brief Prepare the parser for another JSON value, keeping the line and column numbers. */
+void jsonPushParserResume( JSONPushParser* parser );
+/** @brief Tell the parser that no more input is coming; format_json treats non-zero as an error. */
+int jsonPushParserFinish( JSONPushParser* parser );
+/** @brief Free the parser when you're done with it. */
+void jsonPushParserFree( JSONPushParser* parser );
+/** @brief Parse a buffer of @a length characters; format_json treats non-zero as an error. */
+int jsonPush( JSONPushParser* parser, const char* str, size_t length );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif

Added: trunk/src/c-apps/format_json.c
===================================================================
--- trunk/src/c-apps/format_json.c	                        (rev 0)
+++ trunk/src/c-apps/format_json.c	2009-12-29 13:22:49 UTC (rev 1880)
@@ -0,0 +1,605 @@
+/*
+Copyright (C) 2009  Equinox Software Inc.
+Scott McKellar <scott at esilibrary.com>
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+*/
+
+/**
+	@file format_json.c
+	@brief Pretty-print JSON.
+
+	Read JSON from a file and output it to standard output with consistent indentation
+	and white space.
+
+	Synopsis:
+
+		format_json  [ filename [ ... ] ]
+
+	Each command-line argument is the name of a file that format_json will read in turn
+	and format as JSON.  A single hyphen denotes standard input.  If no file is specified,
+	format_json reads standard input.
+
+	The input file[s] may contain multiple JSON values, but a JSON value may not span more
+	than a single file.  In the output, successive JSON values are separated by blank lines.
+
+	The primary purpose of this formatter is to translate JSON into a canonical format that
+	can be easily read and parsed by, for example, a perl script, without having to create
+	a full JSON parser.  For that reason, every square bracket and curly brace is put on a
+	line by itself, although it might be more aesthetically pleasing to put it at the end of
+	the same line as whatever precedes it.
+
+	A secondary purpose is to make ugly, all-run-together JSON more readable to the human eye.
+
+	Finally, this program serves as an example of how to use the stream parser, especially
+	for files that are too big to be loaded into memory at once.  To that end, the internal
+	logic is extensively commented.
+
+	Implementation details:
+
+	When using a stream parser it is almost always necessary to implement a finite state
+	automaton, and this formatter is no exception.
+
+	We define a collection of callback functions for the parser to call at various points.
+	We also set up a structure (called a Formatter) for the parser to pass back to the
+	callbacks via a void pointer.  The Formatter supplies information about where we are and
+	what we're doing; in particular, it includes the state variable for our finite state
+	automaton.
+
+	The parser is also a finite state automaton internally, and it also needs a struct (called
+	a JSONPushParser) to keep track of where it is and what it's doing.  As a result, we have
+	two finite state automatons passing control back and forth.  The parser handles syntax and
+	the Formatter handles semantics.
+
+	With a couple of exceptions, each callback returns a status code back to the parser that
+	calls it: 0 for success and non-zero for error.  For example, a numeric literal might be
+	out of range, or an object key might be misspelled or out of place, or we might encounter
+	an object when we expect an array.  Those rules reflect the semantics of the particular
+	kind of JSON that we're trying to parse.  If a callback returns non-zero, the parser stops.
+
+	In the case of this formatter, any JSON is okay as long as the syntax is valid, and the
+	parser takes care of the syntax.  Hence the callback functions routinely return zero.
+*/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include "opensrf/utils.h"
+#include "opensrf/osrf_utf8.h"
+#include "opensrf/jsonpush.h"
+
+/**
+	@brief States of the finite state automaton that drives the formatting callbacks.
+*/
+typedef enum {
+	CTX_OPEN,           /**< Not currently within a JSON value. */
+	CTX_ARRAY_BEGIN,    /**< At the beginning of a JSON array; no element written yet. */
+	CTX_ARRAY,          /**< In a JSON array with at least one value so far. */
+	CTX_OBJ_BEGIN,      /**< At the beginning of a JSON object; no key written yet. */
+	CTX_OBJ_KEY,        /**< Between a key and its value in a JSON object. */
+	CTX_OBJ             /**< In a JSON object with at least one entry so far. */
+} Context;
+
+/**
+	@brief Node for storing a Context in a stack (or for waiting on the free list).
+*/
+struct ContextNode {
+	struct ContextNode* next;      /**< Next node down the stack, or next in the free list. */
+	Context context;               /**< The Context being stored for eventual restoration. */
+};
+typedef struct ContextNode ContextNode;
+
+/**
+	@brief State shared by the callbacks; the parser passes it back to each one as a void pointer.
+*/
+typedef struct {
+	const char* filename;         /**< Name of input file, or NULL for standard input */
+	Context context;              /**< Current state. */
+	ContextNode* context_stack;   /**< Stack of states saved on entering an array or object. */
+	int indent;                   /**< How many current levels of indentation. */
+	growing_buffer* buf;          /**< For formatting strings with escaped characters. */
+	JSONPushParser* parser;       /**< Points to the current parser. */
+} Formatter;
+
+static int format_file( Formatter* formatter, FILE* infile );
+static void install_parser( Formatter* formatter );
+
+static void indent( unsigned n );
+static int formatString( void* blob, const char* str );
+static int formatNumber( void* blob, const char* str );
+static int formatLeftBracket( void* blob );
+static int formatRightBracket( void* blob );
+static int formatKey( void* blob, const char* str );
+static int formatLeftBrace( void* blob );
+static int formatRightBrace( void* blob );
+static int formatBool( void* blob, int b );
+static int formatNull( void* blob );
+static void formatEnd( void* blob );
+
+static void show_error( void* blob, const char* msg, unsigned line, unsigned pos );
+
+static void push_context( Formatter* formatter );
+static void pop_context( Formatter* formatter );
+
+static ContextNode* free_context = NULL;    // Free list: nodes popped off the stack, reused by push_context()
+
+/**
+	@brief Format each file named on the command line, or standard input if there is none.
+	@param argc Number of command line parameters, plus one.
+	@param argv Pointer to ragged array representing the command line.
+	@return EXIT_SUCCESS on success, or EXIT_FAILURE if any input can't be opened or parsed.
+*/
+int main( int argc, char* argv[] ) {
+	int rc = EXIT_SUCCESS;
+	int open_failed = 0;      // set when a file can't be opened; reflected in the exit status
+
+	// Declare and initialize a Formatter
+	static Formatter formatter;
+	formatter.filename = NULL;
+	formatter.context = CTX_OPEN;
+	formatter.context_stack = NULL;
+	formatter.indent = 0;
+	formatter.buf = buffer_init( 32 );
+	install_parser( &formatter );
+
+	if( argc > 1 ) {
+		int i = 0;
+		while( (++i < argc) && (EXIT_SUCCESS == rc) ) {
+			// Iterate over the command line arguments, stopping at the first bad JSON.
+			// An argument "-" means to read standard input.
+			const char* filename = argv[ i ];
+			FILE* in;
+			if( '-' == filename[ 0 ] && '\0' == filename[ 1 ] ) {
+				in = stdin;
+				formatter.filename = NULL;
+			} else {
+				in = fopen( filename, "r" );
+				formatter.filename = filename;
+			}
+
+			if( !in ) {
+				// Nothing was parsed, so we can still go on to the remaining files.
+				fprintf( stderr, "Unable to open %s\n", filename );
+				open_failed = 1;
+			} else {
+				// Reset the parser: we're starting a new JSON value, and the line and position
+				// counters used in error messages must start over.  (Harmless for the first file.)
+				jsonPushParserReset( formatter.parser );
+
+				// Format the file
+				if( format_file( &formatter, in ) )
+					rc = EXIT_FAILURE;
+				if( formatter.filename )
+					fclose( in );
+			}
+		} // end while
+	} else {
+		// No arguments: read standard input.  No parser reset is needed to parse just once.
+		if( format_file( &formatter, stdin ) )
+			rc = EXIT_FAILURE;
+	}
+
+	// Clean up the formatter
+	jsonPushParserFree( formatter.parser );
+	buffer_free( formatter.buf );
+	while( formatter.context_stack )
+		pop_context( &formatter );
+
+	// Free the free ContextNodes shed from the stack
+	while( free_context ) {
+		ContextNode* temp = free_context->next;
+		free( free_context );
+		free_context = temp;
+	}
+
+	return open_failed ? EXIT_FAILURE : rc;
+}
+
+/**
+	@brief Read and format a JSON file.
+	@param formatter Pointer to the current Formatter.
+	@param infile Pointer to the input file.
+	@return 0 if successful, or 1 upon a read error or an error in the JSON.
+*/
+static int format_file( Formatter* formatter, FILE* infile ) {
+	char buf[ 4096 ];           // fixed-size chunk buffer (not a variable-length array)
+	size_t num_read;            // fread() returns a size_t, not an int
+	int rc = 0;
+
+	do {
+		num_read = fread( buf, 1, sizeof buf, infile );
+		if( num_read > 0 && jsonPush( formatter->parser, buf, num_read ) )
+			rc = 1;
+	} while( sizeof buf == num_read && 0 == rc );
+
+	if( ferror( infile ) ) {    // a short read may mean a read error, not just end-of-file
+		fprintf( stderr, "\nError reading input\n" );
+		rc = 1;
+	}
+	if( jsonPushParserFinish( formatter->parser ) )
+		rc = 1;
+	if( rc )
+		fprintf( stderr, "\nError found in JSON file\n" );
+
+	return rc;
+}
+
+/**
+	@brief Build a JSONPushParser and attach it to a Formatter.
+	@param formatter Pointer to the Formatter that will own the new parser.
+
+	The JSONHandlerMap names the callback that the parser invokes for each syntactic
+	feature.  jsonNewPushParser makes its own copy of the map, so the map may be a local
+	variable here.  The Formatter itself is what the parser hands back to every callback.
+*/
+static void install_parser( Formatter* formatter ) {
+
+	// Each callback is named explicitly, so the order of the members doesn't matter.
+	JSONHandlerMap callbacks = {
+		.handleString     = formatString,
+		.handleNumber     = formatNumber,
+		.handleBeginArray = formatLeftBracket,
+		.handleEndArray   = formatRightBracket,
+		.handleBeginObj   = formatLeftBrace,
+		.handleObjKey     = formatKey,
+		.handleEndObj     = formatRightBrace,
+		.handleBool       = formatBool,         // keyword true or false
+		.handleNull       = formatNull,         // keyword null
+		.handleEndJSON    = formatEnd,          // end of a JSON value
+		.handleError      = show_error
+	};
+
+	formatter->parser = jsonNewPushParser( &callbacks, formatter );
+}
+
+/**
+	@brief Write a string literal.
+	@param blob Pointer to Formatter, cast to a void pointer.
+	@param str Pointer to the contents of the string, with all escape sequences decoded.
+	@return zero.
+
+	The parser calls this for every string literal except the key of a key/value pair
+	in a JSON object.
+
+	The literal goes out in double quotes, with special and multibyte characters escaped
+	as needed, preceded by whatever comma and white space the current state calls for.
+*/
+static int formatString( void* blob, const char* str ) {
+	Formatter* fmt = blob;
+	const Context ctx = fmt->context;
+
+	// The value of a key/value pair stays on the same line as its key.  Anything else
+	// starts a new line, after a comma if it follows an earlier element of its array.
+	if( ctx != CTX_OBJ_KEY ) {
+		fputs( CTX_ARRAY == ctx ? ",\n" : "\n", stdout );
+		indent( fmt->indent );
+	}
+
+	// Escape characters as needed
+	growing_buffer* escaped = fmt->buf;
+	buffer_reset( escaped );
+	buffer_append_utf8( escaped, str );
+	printf( "\"%s\"", OSRF_BUFFER_C_STR( escaped ) );
+
+	// Pick the next state
+	if( CTX_OBJ_KEY == ctx )
+		fmt->context = CTX_OBJ;
+	else if( CTX_ARRAY_BEGIN == ctx )
+		fmt->context = CTX_ARRAY;
+	return 0;
+}
+
+/**
+	@brief Write a numeric literal.
+	@param blob Pointer to Formatter, cast to a void pointer.
+	@param str Pointer to a string containing the numeric literal.
+	@return zero.
+
+	The parser calls this for every numeric literal.  The literal goes out just as it was
+	received, preceded by whatever comma and white space the current state calls for.
+*/
+static int formatNumber( void* blob, const char* str ) {
+	Formatter* fmt = blob;
+	const char* lead = "\n";        // what to write ahead of the literal; NULL for nothing
+	Context next = fmt->context;    // state to be in afterwards
+
+	switch( fmt->context ) {
+		case CTX_ARRAY :       lead = ",\n";                 break;
+		case CTX_ARRAY_BEGIN : next = CTX_ARRAY;             break;
+		case CTX_OBJ_KEY :     lead = NULL; next = CTX_OBJ;  break;
+		default :                                            break;
+	}
+
+	// The value of a key/value pair stays on the same line as its key
+	if( lead ) {
+		printf( "%s", lead );
+		indent( fmt->indent );
+	}
+	printf( "%s", str );
+	fmt->context = next;
+	return 0;
+}
+
+/**
+	@brief Format a left square bracket.
+	@param blob Pointer to Formatter, cast to a void pointer.
+	@return zero.
+
+	Called by the parser when it finds a left square bracket opening a JSON array.
+	Write a left square bracket, with a comma and white space as needed.
+*/
+static int formatLeftBracket( void* blob ) {
+	Formatter* formatter = blob;
+	if( CTX_ARRAY == formatter->context || CTX_OBJ == formatter->context )
+		printf( "," );
+	printf( "\n" );
+	indent( formatter->indent++ );
+	printf( "[" );
+
+	// Pick the state to return to when we close the array.  An array that is the value of a
+	// key completes the pair, so we return in CTX_OBJ and the next key gets its comma.
+	if( CTX_ARRAY_BEGIN == formatter->context )
+		formatter->context = CTX_ARRAY;
+	else if ( CTX_OBJ_KEY == formatter->context )
+		formatter->context = CTX_OBJ;
+	push_context( formatter );
+
+	formatter->context = CTX_ARRAY_BEGIN;
+	return 0;
+}
+
+/**
+	@brief Write a right square bracket.
+	@param blob Pointer to Formatter, cast to a void pointer.
+	@return zero.
+
+	The parser calls this for the right square bracket that closes a JSON array.  The
+	bracket goes on a line of its own, one indent to the left of the array's contents;
+	then the state saved when the array was opened is restored.
+*/
+static int formatRightBracket( void* blob ) {
+	Formatter* fmt = blob;
+	fmt->indent -= 1;
+	printf( "\n" );
+	indent( fmt->indent );
+	printf( "]" );
+	pop_context( fmt );
+	return 0;
+}
+
+/**
+	@brief Format a left curly brace.
+	@param blob Pointer to Formatter, cast to a void pointer.
+	@return zero.
+
+	Called by the parser when it finds a left curly brace opening a JSON object.
+	Write a left curly brace, with a comma and white space as needed.
+*/
+static int formatLeftBrace( void* blob ) {
+	Formatter* formatter = blob;
+	if( CTX_ARRAY == formatter->context || CTX_OBJ == formatter->context )
+		printf( "," );
+	printf( "\n" );
+	indent( formatter->indent++ );
+	printf( "{" );
+
+	// Pick the state to return to when we close the object.  An object that is the value of
+	// a key completes the pair, so we return in CTX_OBJ and the next key gets its comma.
+	if( CTX_ARRAY_BEGIN == formatter->context )
+		formatter->context = CTX_ARRAY;
+	else if ( CTX_OBJ_KEY == formatter->context )
+		formatter->context = CTX_OBJ;
+	push_context( formatter );
+
+	formatter->context = CTX_OBJ_BEGIN;
+	return 0;
+}
+
+/**
+	@brief Write a right curly brace.
+	@param blob Pointer to Formatter, cast to a void pointer.
+	@return zero.
+
+	The parser calls this for the right curly brace that closes a JSON object.  The
+	brace goes on a line of its own, one indent to the left of the object's contents;
+	then the state saved when the object was opened is restored.
+*/
+static int formatRightBrace( void* blob ) {
+	Formatter* fmt = blob;
+	fmt->indent -= 1;
+	printf( "\n" );
+	indent( fmt->indent );
+	printf( "}" );
+	pop_context( fmt );
+	return 0;
+}
+
+/**
+	@brief Write the key of a key/value pair in a JSON object.
+	@param blob Pointer to Formatter, cast to a void pointer.
+	@param str Pointer to a string containing the key.
+	@return zero.
+
+	The parser calls this as soon as it finds the key of a key/value pair.  It has not seen
+	the colon yet; if the colon turns out to be missing, the parser will report an error.
+	The key goes out in double quotes on a new line, followed by a colon.
+*/
+static int formatKey( void* blob, const char* str ) {
+	Formatter* fmt = blob;
+
+	// In state CTX_OBJ an entry has already been written, so end it with a comma
+	if( CTX_OBJ == fmt->context )
+		printf( "," );
+	printf( "\n" );
+	indent( fmt->indent );
+
+	// Escape characters as needed
+	growing_buffer* escaped = fmt->buf;
+	buffer_reset( escaped );
+	buffer_append_utf8( escaped, str );
+	printf( "\"%s\" : ", OSRF_BUFFER_C_STR( escaped ) );
+
+	fmt->context = CTX_OBJ_KEY;
+	return 0;
+}
+
+/**
+	@brief Write a boolean value.
+	@param blob Pointer to Formatter, cast to a void pointer.
+	@param b An int used as a boolean: non-zero for true, zero for false.
+	@return zero.
+
+	The parser calls this for the JSON keyword "true" or "false".  The keyword goes out
+	without quotes, preceded by whatever comma and white space the current state calls for.
+*/
+static int formatBool( void* blob, int b ) {
+	Formatter* fmt = blob;
+	const int after_key = ( CTX_OBJ_KEY == fmt->context );
+
+	// The value of a key/value pair stays on the same line as its key
+	if( !after_key ) {
+		if( CTX_ARRAY == fmt->context )
+			printf( "," );
+		printf( "\n" );
+		indent( fmt->indent );
+	}
+
+	fputs( b ? "true" : "false", stdout );
+
+	// Pick the next state.
+	if( after_key )
+		fmt->context = CTX_OBJ;
+	else if( CTX_ARRAY_BEGIN == fmt->context )
+		fmt->context = CTX_ARRAY;
+	return 0;
+}
+
+/**
+	@brief Write a null value.
+	@param blob Pointer to Formatter, cast to a void pointer.
+	@return zero.
+
+	The parser calls this for the JSON keyword "null".  The keyword goes out without
+	quotes, preceded by whatever comma and white space the current state calls for.
+*/
+static int formatNull( void* blob ) {
+	Formatter* fmt = blob;
+
+	switch( fmt->context ) {
+		case CTX_OBJ_KEY :        // value of a key/value pair: stay on the key's line
+			fmt->context = CTX_OBJ;
+			break;
+		case CTX_ARRAY :          // an element precedes this one: end it with a comma
+			printf( "," );
+			// fall through
+		default :
+			if( CTX_ARRAY_BEGIN == fmt->context )
+				fmt->context = CTX_ARRAY;
+			printf( "\n" );
+			indent( fmt->indent );
+			break;
+	}
+	printf( "null" );
+	return 0;
+}
+
+/**
+	@brief Respond to the end of a JSON value.
+	@param blob Pointer to Formatter, cast to a void pointer.
+
+	The parser calls this when it reaches the end of a JSON value.
+
+	This formatter accepts multiple JSON values in succession, so we tell the parser to
+	look for another one.  Otherwise the parser would treat anything other than white
+	space beyond this point as an error.
+
+	Note that jsonPushParserResume() does @em not reset the line number and column number
+	used by the parser for error messages.  If you want to do that, call jsonPushParserReset().
+*/
+static void formatEnd( void* blob ) {
+	Formatter* fmt = blob;
+	jsonPushParserResume( fmt->parser );
+	putchar( '\n' );
+}
+
+/**
+	@brief Report a syntax error detected by the parser.
+	@param blob Pointer to Formatter, cast to a void pointer.
+	@param msg Pointer to a message describing the syntax error.
+	@param line Line number in the current file where the error was detected.
+	@param pos Column position in the current line where the error was detected.
+
+	The parser calls this when it encounters a syntax error.
+
+	The message goes to standard error, along with the line number, the column position,
+	and the file name saved in the Formatter ("standard input" if there is none).
+*/
+static void show_error( void* blob, const char* msg, unsigned line, unsigned pos ) {
+	const Formatter* fmt = blob;
+
+	// A NULL file name means that we are reading standard input
+	const char* source = fmt->filename ? fmt->filename : "standard input";
+	fprintf( stderr, "\nError in %s at line %u, position %u:\n%s\n",
+		source, line, pos, msg );
+}
+
+/**
+	@brief Write leading white space: four spaces for each level of indentation.
+	@param n How many levels of indentation to write.
+*/
+static void indent( unsigned n ) {
+	unsigned level;
+	for( level = 0; level < n; ++level ) {
+		fputs( "    ", stdout );
+	}
+}
+
+/**
+	@brief Save the current state on the stack.
+	@param formatter Pointer to the current Formatter.
+
+	Called upon entering a JSON array or object.  When the array or object ends,
+	pop_context() restores the state saved here.
+*/
+static void push_context( Formatter* formatter ) {
+	ContextNode* saved = free_context;
+
+	// Recycle a node from the free list if there is one; otherwise go to the heap
+	if( saved )
+		free_context = saved->next;
+	else
+		saved = safe_malloc( sizeof( ContextNode ) );
+
+	// Link the node in at the top of the stack
+	saved->next = formatter->context_stack;
+	saved->context = formatter->context;
+	formatter->context_stack = saved;
+}
+
+/**
+	@brief Restore the state most recently saved on the stack.
+	@param formatter Pointer to the current Formatter.
+
+	Called at the end of a JSON array or object, to undo the push_context() done when the
+	array or object began.  The vacated node goes onto the free list for reuse.
+*/
+static void pop_context( Formatter* formatter ) {
+	ContextNode* top = formatter->context_stack;
+
+	if( top ) {                    // an empty stack shouldn't happen
+		formatter->context = top->context;
+		formatter->context_stack = top->next;
+
+		// Recycle the node rather than freeing it
+		top->next = free_context;
+		free_context = top;
+	}
+}

Added: trunk/src/libopensrf/jsonpush.c
===================================================================
--- trunk/src/libopensrf/jsonpush.c	                        (rev 0)
+++ trunk/src/libopensrf/jsonpush.c	2009-12-29 13:22:49 UTC (rev 1880)
@@ -0,0 +1,1281 @@
+/*
+Copyright (C) 2009 Equinox Software Inc.
+Scott McKellar <scott at esilibrary.com>
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+*/
+
+/**
+	@file jsonpush.c
+	@brief Push parser for JSON.
+
+	This parser parses JSON incrementally, without necessarily holding the entire JSON string
+	(or any representation thereof) in memory at once.  It is therefore suitable for parsing
+	large input files.
+
+	A format such as JSON, with its arbitrarily nestable elements, cries out piteously for a
+	recursive descent parser to match the recursive structure of the format.  Unfortunately,
+	recursive descent doesn't work for an incremental parser, because the boundaries of
+	incoming chunks don't respect syntactic boundaries.
+
+	This parser is based on a finite state automaton, using a structure to retain state across
+	chunks, and a stack to simulate recursion.  The calling code designates a series of
+	callback functions to respond to various syntactic features as they are encountered.
+*/
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <ctype.h>
+#include "opensrf/osrf_json.h"
+#include "opensrf/jsonpush.h"
+
+/**
+	Enumeration of states for a finite state automaton.
+
+	The parser keeps its current state in JSONPushParserStruct::state, and jsonPush()
+	dispatches each input character to the handler function for that state.
+*/
+typedef enum {
+	PP_BEGIN,            // outside of any JSON
+	PP_STR,              // inside a string literal
+	PP_SLASH,            // found a backslash in a string literal
+	PP_UTF8,             // collecting a UTF8 sequence
+	PP_NUM,              // inside a numeric literal
+	PP_ARRAY_BEGIN,      // started an array
+	PP_ARRAY_VALUE,      // found an array element
+	PP_ARRAY_COMMA,      // found a comma between array elements
+	PP_OBJ_BEGIN,        // started a JSON object
+	PP_OBJ_KEY,          // found a string for a key in an object
+	PP_OBJ_COLON,        // found a colon after a key in an object
+	PP_OBJ_VALUE,        // found a value for a key in an object
+	PP_OBJ_COMMA,        // found a comma separating entries in an object
+	PP_TRUE,             // true keyword
+	PP_FALSE,            // false keyword
+	PP_NULL,             // null keyword
+	PP_END,              // reached the end of the JSON stream
+	PP_ERROR             // encountered invalid JSON; can't continue
+} PPState;
+
+// Forward declaration, so that a StateNode can contain a pointer to another StateNode
+struct StateNodeStruct;
+typedef struct StateNodeStruct StateNode;
+
+/**
+	@brief Represents a parser state at a given level of nesting.
+
+	The parser maintains a stack of StateNodes to simulate recursive descent.  The
+	same node type is also used for the parser's free list of unused nodes.
+*/
+struct StateNodeStruct {
+	StateNode* next;            /**< For a linked list to implement the stack */
+	PPState state;              /**< State to which we will return */
+	osrfStringArray* keylist;   /**< List of key strings, if the level is for a JSON object */
+};
+
+/**
+	@brief A collection of things the parser needs to remember about what it's doing.
+
+	This structure enables the parser to retain state from one chunk of input to the next.
+*/
+struct JSONPushParserStruct {
+	JSONHandlerMap handlers;  /**< Copy of the caller's callback function pointers. */
+	void* blob;               /**< To be passed back to callback functions. */
+	unsigned line;            /**< Line number. */
+	unsigned pos;             /**< Character position within line. */
+	PPState state;            /**< For finite state automaton. */
+	char again;               /**< If non-zero, jsonPush() processes the current input
+	                               character a second time instead of advancing. */
+	growing_buffer* buf;      /**< For accumulating strings and numbers. */
+	StateNode* state_stack;   /**< For simulating recursive descent. */
+	StateNode* free_states;   /**< Free list of unused StateNodes. */
+	unsigned word_idx;        /**< Index of the current character within a keyword
+	                               such as "true", "false", or "null"; also counts
+	                               the hex digits of a \\u escape sequence. */
+	unsigned int point_code;  /**< for UTF-8 transformations. */
+	osrfStringArray* keylist; /**< Stores keys in current JSON object; used to detect
+	                               duplicate keys. */
+};
+
+// State handlers for the finite state automaton.  Each one receives the parser and
+// the current input character; jsonPush() calls the one matching the current state
+// and treats a non-zero return value as a reason to stop.
+static int do_begin( JSONPushParser* parser, char c );
+static int do_str  ( JSONPushParser* parser, char c );
+static int do_slash( JSONPushParser* parser, char c );
+static int do_utf8 ( JSONPushParser* parser, char c );
+static int do_num  ( JSONPushParser* parser, char c );
+static int do_array_begin( JSONPushParser* parser, char c );
+static int do_array_value( JSONPushParser* parser, char c );
+static int do_array_comma( JSONPushParser* parser, char c );
+static int do_obj_begin( JSONPushParser* parser, char c );
+static int do_obj_key  ( JSONPushParser* parser, char c );
+static int do_obj_colon( JSONPushParser* parser, char c );
+static int do_obj_value( JSONPushParser* parser, char c );
+static int do_obj_comma( JSONPushParser* parser, char c );
+static int do_true ( JSONPushParser* parser, char c );
+static int do_false( JSONPushParser* parser, char c );
+static int do_null ( JSONPushParser* parser, char c );
+static int do_end( JSONPushParser* parser, char c );
+
+// Supporting routines called by the public functions and state handlers below
+static int found_keyword( JSONPushParser* parser, char c,
+		const char* keyword, unsigned maxlen );
+static void push_pp_state( JSONPushParser* parser, PPState state );
+static void pop_pp_state( JSONPushParser* parser );
+static void check_pp_end( JSONPushParser* parser );
+static void report_pp_error( JSONPushParser* parser, const char* msg, ... );
+
+/**
+	@brief Create a new JSONPushParser.
+	@param map Pointer to a JSONHandlerMap designating the callback functions to call.
+	@param blob An arbitrary pointer to be passed to the callback functions.
+	@return A pointer to the new parser, or NULL if @a map is NULL.
+
+	The parser keeps its own copy of the JSONHandlerMap, so the caller's copy need not
+	outlive this call.
+
+	The calling code can use the @a blob parameter to specify its own context for the
+	callback functions.
+
+	The calling code is responsible for freeing the parser by calling jsonPushParserFree().
+*/
+JSONPushParser* jsonNewPushParser( const JSONHandlerMap* map, void* blob )
+{
+	if( ! map )
+		return NULL;
+
+	JSONPushParser* parser = safe_malloc( sizeof( JSONPushParser ) );
+	parser->handlers    = *map;
+	parser->blob        = blob;
+	parser->line        = 1;
+	parser->pos         = 1;
+	parser->state       = PP_BEGIN;
+	parser->again       = '\0';
+	parser->buf         = buffer_init( 64 );
+	parser->state_stack = NULL;
+	parser->free_states = NULL;
+	parser->word_idx    = 0;
+	parser->point_code  = 0;     // set explicitly so that no member is left indeterminate
+	parser->keylist     = osrfNewStringArray( 8 );
+	return parser;
+}
+
+/**
+	@brief Restore a JSONPushParser to its original pristine state.
+	@param parser Pointer to the JSONPushParser to be reset.  If NULL, nothing happens.
+
+	This function makes it possible to reuse the same parser for multiple documents, e.g.
+	multiple input files, without having to destroy and recreate it.  The expectation is
+	that it be called after jsonPush() returns.
+
+	Besides the line number, position, and state, we also clear the "again" flag.
+	jsonPush() stops without clearing that flag when a state handler returns non-zero, so
+	the flag may still be set from the previous document; if it stayed set, the first
+	character of the next document would be processed twice.
+
+	NOTE(review): the state stack and the key list are left untouched here.  Confirm
+	whether they also need to be unwound when a parser is reset after an error in the
+	middle of a nested array or object.
+*/
+void jsonPushParserReset( JSONPushParser* parser ) {
+	if( parser ) {
+		parser->line = 1;
+		parser->pos = 1;
+		parser->state = PP_BEGIN;
+		parser->again = '\0';
+	}
+}
+
+/**
+	@brief Put a JSONPushParser back into its starting state, ready for another value.
+	@param parser Pointer to the JSONPushParser to be resumed.  If NULL, nothing happens.
+
+	This is a lighter-weight cousin of jsonPushParserReset():
+	- It makes sense only when the parser is between JSON values; anywhere else the
+	parser could not continue sensibly.
+	- The line number and position used in error messages are left alone.
+
+	The purpose is to allow several JSON values to be parsed from a single stream.  The
+	callback function that responds to end-of-JSON is the expected caller.
+*/
+void jsonPushParserResume( JSONPushParser* parser ) {
+	if( ! parser )
+		return;
+
+	parser->state = PP_BEGIN;
+}
+
+/**
+	@brief Tell the JSON push parser that there is no more input to parse.
+	@param parser Pointer to the parser.
+	@return 0 if successful; 1 upon error (including a NULL @a parser); or the non-zero
+	value returned by a callback function, if one was called and failed.
+
+	A call to this function is comparable to an end-of-file marker.  Without it, the parser
+	would be unable to recognize certain tokens at the very end of the last buffer, because
+	it wouldn't know that the token was finished.
+
+	For example: if the last byte is part of a number, the parser will not have reported the
+	numeric token because it was waiting to see if the next character was numeric.
+
+	Likewise, certain kinds of errors would be unrecognizable, such as a failure to complete
+	the current JSON expression.
+
+	Whenever an error is detected here, the parser is left in the PP_ERROR state.
+*/
+int jsonPushParserFinish( JSONPushParser* parser ) {
+	if( ! parser )
+		return 1;
+
+	int rc = 0;
+
+	// If we're currently accumulating a token, finish it
+	if( PP_NUM == parser->state ) {
+		const char* num_str = OSRF_BUFFER_C_STR( parser->buf );
+
+		// Validate the number; if it isn't valid as it stands, try to fix it
+		const char* valid = num_str;
+		char* scrubbed = NULL;
+		if( ! jsonIsNumeric( num_str ) ) {
+			scrubbed = jsonScrubNumber( num_str );
+			valid = scrubbed;
+		}
+
+		if( valid ) {
+			if( parser->handlers.handleNumber )
+				rc = parser->handlers.handleNumber( parser->blob, valid );
+			free( scrubbed );              // no-op if no scrubbing was needed
+			pop_pp_state( parser );
+			check_pp_end( parser );
+		} else {                           // Can't be fixed
+			report_pp_error( parser, "Invalid number: \"%s\"", num_str );
+			rc = 1;
+			parser->state = PP_ERROR;
+		}
+	} else if( PP_TRUE == parser->state ) {
+		if( 3 == parser->word_idx ) {
+			if( parser->handlers.handleBool )
+				rc = parser->handlers.handleBool( parser->blob, 1 );
+			pop_pp_state( parser );
+			check_pp_end( parser );
+		} else {
+			// Stay in the error state; popping here would overwrite it
+			report_pp_error( parser, "Keyword \"true\" is incomplete at end of input" );
+			rc = 1;
+			parser->state = PP_ERROR;
+		}
+	} else if( PP_FALSE == parser->state ) {
+		if( 4 == parser->word_idx ) {
+			if( parser->handlers.handleBool )
+				rc = parser->handlers.handleBool( parser->blob, 0 );
+			pop_pp_state( parser );
+			check_pp_end( parser );
+		} else {
+			report_pp_error( parser, "Keyword \"false\" is incomplete at end of input" );
+			rc = 1;
+			parser->state = PP_ERROR;
+		}
+	} else if( PP_NULL == parser->state ) {
+		if( 3 == parser->word_idx ) {
+			if( parser->handlers.handleNull )
+				rc = parser->handlers.handleNull( parser->blob );
+			pop_pp_state( parser );
+			check_pp_end( parser );
+		} else {
+			report_pp_error( parser, "Keyword \"null\" is incomplete at end of input" );
+			rc = 1;
+			parser->state = PP_ERROR;
+		}
+	}
+
+	// At this point the state should be PP_END, or possibly PP_BEGIN if the JSON value is
+	// empty, or PP_ERROR if we already encountered an error.  Anything else means that the
+	// JSON value is incomplete.
+
+	const char* incomplete = NULL;    // message to report, if the JSON is incomplete
+
+	switch( parser->state ) {
+		case PP_BEGIN       :
+			parser->state = PP_END;       // JSON value was empty
+			break;
+		case PP_STR         :
+		case PP_SLASH       :
+		case PP_UTF8        :
+			incomplete = "String literal not closed";
+			break;
+		case PP_ARRAY_BEGIN :
+			incomplete = "Empty JSON array not closed";
+			break;
+		case PP_ARRAY_VALUE :
+			incomplete = "JSON array begun but not closed";
+			break;
+		case PP_ARRAY_COMMA :
+			incomplete = "JSON array not closed";
+			break;
+		case PP_OBJ_BEGIN   :
+			incomplete = "Empty JSON object not closed";
+			break;
+		case PP_OBJ_KEY     :
+			incomplete = "JSON object not continued after key";
+			break;
+		case PP_OBJ_COLON   :
+			incomplete = "JSON object not continued after colon";
+			break;
+		case PP_OBJ_VALUE   :
+			incomplete = "JSON object begun but not closed";
+			break;
+		case PP_OBJ_COMMA   :
+			incomplete = "JSON object not closed";
+			break;
+		case PP_NUM         :   // not possible
+		case PP_TRUE        :   // not possible
+		case PP_FALSE       :   // not possible
+		case PP_NULL        :   // not possible
+		case PP_END         :   // okay
+		case PP_ERROR       :   // previous error, presumably already reported
+			break;
+	}
+
+	if( incomplete ) {
+		report_pp_error( parser, "%s", incomplete );
+		parser->state = PP_ERROR;
+		rc = 1;
+	}
+
+	return rc;
+}
+
+/**
+	@brief Incrementally parse a chunk of JSON.
+	@param parser Pointer to the JSONPushParser that will do the parsing.
+	@param str Pointer to a chunk of JSON, either all or part of a JSON stream.
+	@param length Length of the chunk of JSON.
+	@return 0 if successful; 1 upon error; or the non-zero value returned by a callback
+	function that failed.
+
+	Parse a fragment of JSON, possibly preceded or followed by one or more other chunks
+	in the same JSON stream.  Respond to various syntactical features by calling the
+	corresponding callback functions that were designated when the parser was created.
+
+	Parsing stops at the end of the chunk, at the first nul byte, or at the first error,
+	whichever comes first.  No byte at or beyond str[ length ] is ever examined.
+
+	Line and position numbers are updated as each character is consumed, so a newline
+	is counted even when it is the first character of a chunk.
+*/
+int jsonPush( JSONPushParser* parser, const char* str, size_t length ) {
+	if( ! parser )
+		return 1;
+	else if( ! str ) {
+		report_pp_error( parser, "JSON parser received a NULL parameter for input" );
+		return 1;
+	} else if( PP_ERROR == parser->state ) {
+		report_pp_error( parser, "JSON parser cannot continue due to previous error" );
+		return 1;
+	}
+
+	int rc = 0;
+	// Loop through the chunk.  The bounds check comes first so that
+	// we never read past the end of the caller's buffer.
+	size_t i = 0;
+	while( i < length && str[i] && parser->state != PP_ERROR ) {
+		const char c = str[i];
+
+		// branch on the current parser state
+		switch( parser->state ) {
+			case PP_BEGIN :
+				rc = do_begin( parser, c );
+				break;
+			case PP_STR :
+				rc = do_str( parser, c );
+				break;
+			case PP_SLASH :
+				rc = do_slash( parser, c );
+				break;
+			case PP_UTF8 :
+				rc = do_utf8( parser, c );
+				break;
+			case PP_NUM :
+				rc = do_num( parser, c );
+				break;
+			case PP_ARRAY_BEGIN :
+				rc = do_array_begin( parser, c );
+				break;
+			case PP_ARRAY_VALUE :
+				rc = do_array_value( parser, c );
+				break;
+			case PP_ARRAY_COMMA :
+				rc = do_array_comma( parser, c );
+				break;
+			case PP_OBJ_BEGIN :
+				rc = do_obj_begin( parser, c );
+				break;
+			case PP_OBJ_KEY :
+				rc = do_obj_key( parser, c );
+				break;
+			case PP_OBJ_COLON :
+				rc = do_obj_colon( parser, c );
+				break;
+			case PP_OBJ_VALUE :
+				rc = do_obj_value( parser, c );
+				break;
+			case PP_OBJ_COMMA :
+				rc = do_obj_comma( parser, c );
+				break;
+			case PP_TRUE :
+				rc = do_true( parser, c );
+				break;
+			case PP_FALSE :
+				rc = do_false( parser, c );
+				break;
+			case PP_NULL :
+				rc = do_null( parser, c );
+				break;
+			case PP_END :
+				rc = do_end( parser, c );
+				break;
+			default :
+				// PP_ERROR is excluded by the loop condition, so getting here
+				// means the state is not one we know how to handle
+				report_pp_error( parser, "JSON parser is in an unrecognized state" );
+				rc = 1;
+				break;
+		}
+		if( rc )
+			break;
+		else if( parser->again )
+			parser->again = '\0';  // reuse the current character
+		else {
+			// We have consumed the current character; account for it
+			// and advance to the next one
+			if( '\n' == c ) {
+				++parser->line;
+				parser->pos = 1;
+			} else
+				++parser->pos;
+			++i;
+		}
+	}
+
+	if( 1 == rc )
+		parser->state = PP_ERROR;
+
+	return rc;
+}
+
+// -------- Beginning of state handlers --------------------------
+
+/**
+	@brief Look for the beginning of a JSON value.
+	@param parser Pointer to the current JSONPushParser.
+	@param c The current input character.
+	@return 0 if successful, or 1 upon error.
+
+	Skip any leading white space, then expect the start of the value that makes up the
+	entire JSON stream.  Whatever kind of value it turns out to be, the parser will go to
+	the PP_END state once that value is complete.
+*/
+static int do_begin( JSONPushParser* parser, char c ) {
+	if( isspace( (unsigned char) c ) )
+		return 0;                      // skip white space
+
+	// Anything that could plausibly start a numeric literal
+	if( isdigit( (unsigned char) c )
+			|| '-' == c || '+' == c || '.' == c || 'e' == c || 'E' == c ) {
+		buffer_reset( parser->buf );
+		buffer_add_char( parser->buf, c );
+		push_pp_state( parser, PP_END );
+		parser->state = PP_NUM;
+		return 0;
+	}
+
+	int rc = 0;
+	switch( c ) {
+		case '\"' :                    // Found a string
+			buffer_reset( parser->buf );
+			push_pp_state( parser, PP_END );
+			parser->state = PP_STR;
+			break;
+		case '[' :                     // Found an array
+			if( parser->handlers.handleBeginArray )
+				rc = parser->handlers.handleBeginArray( parser->blob );
+			push_pp_state( parser, PP_END );
+			parser->state = PP_ARRAY_BEGIN;
+			break;
+		case '{' :                     // Found an object
+			if( parser->handlers.handleBeginObj )
+				rc = parser->handlers.handleBeginObj( parser->blob );
+			push_pp_state( parser, PP_END );
+			parser->state = PP_OBJ_BEGIN;
+			break;
+		case 't' :                     // Presumably the start of "true"
+			push_pp_state( parser, PP_END );
+			parser->word_idx = 0;
+			parser->state = PP_TRUE;
+			break;
+		case 'f' :                     // Presumably the start of "false"
+			push_pp_state( parser, PP_END );
+			parser->word_idx = 0;
+			parser->state = PP_FALSE;
+			break;
+		case 'n' :                     // Presumably the start of "null"
+			push_pp_state( parser, PP_END );
+			parser->word_idx = 0;
+			parser->state = PP_NULL;
+			break;
+		default :
+			report_pp_error( parser, "Unexpected character \'%c\' at beginning of JSON string", c );
+			rc = 1;
+			break;
+	}
+
+	return rc;
+}
+
+/**
+	@brief Accumulate characters in a string literal.
+	@param parser Pointer to the current JSONPushParser.
+	@param c The current input character.
+	@return 0 if successful; 1 upon error; or the non-zero value returned by a callback
+	function that failed.
+
+	A closing quote ends the literal, which is then reported either as an object key or as
+	a string value, depending on the state to which we return.  A key that duplicates an
+	earlier key in the same object is an error.
+
+	A backslash switches to the PP_SLASH state to handle an escape sequence.  A control
+	character or other non-printable character is an error.  Anything else is appended to
+	the buffer.
+*/
+static int do_str    ( JSONPushParser* parser, char c ) {
+	int rc = 0;
+	if( '\"' == c ) {
+		// Reached the end of the string.  Report it either as a string
+		// or as a key, depending on the context.
+		pop_pp_state( parser );
+		if( PP_OBJ_KEY == parser->state ) {         // Report as a key
+			const char* key = OSRF_BUFFER_C_STR( parser->buf );
+			if( osrfStringArrayContains( parser->keylist, key ) ) {
+				report_pp_error( parser, "Duplicate key \"%s\" in JSON object", key );
+				rc = 1;
+			} else {
+				osrfStringArrayAdd( parser->keylist, key );
+				if( parser->handlers.handleObjKey ) {
+					rc = parser->handlers.handleObjKey(
+							parser->blob, key );
+				}
+			}
+		} else {                                    // Report as a string
+			if( parser->handlers.handleString ) {
+				rc = parser->handlers.handleString(
+						parser->blob, OSRF_BUFFER_C_STR( parser->buf ) );
+			}
+			check_pp_end( parser );
+		}
+	} else if( '\\' == c ) {
+		parser->state = PP_SLASH;       // Handle an escaped special character
+	} else if( iscntrl( (unsigned char) c ) || ! isprint( (unsigned char) c ) ) {
+		// NOTE(review): in the "C" locale isprint() is false for every byte >= 0x80,
+		// so raw (unescaped) multibyte UTF-8 is rejected here -- confirm that is intended.
+		//
+		// Convert by way of unsigned char; otherwise a negative char would be
+		// sign-extended and displayed as something like 0xFFFFFFC3.
+		report_pp_error( parser, "Illegal character 0x%02X in string literal",
+			(unsigned int) (unsigned char) c );
+		rc = 1;
+	} else {
+		buffer_add_char( parser->buf, c );
+	}
+
+	return rc;
+}
+
+/**
+	@brief Translate the character following a backslash in a string literal.
+	@param parser Pointer to the current JSONPushParser.
+	@param c The current input character.
+	@return 0 if successful, or 1 upon error.
+
+	For a single-character escape, append the character it stands for to the buffer and
+	return to the PP_STR state.  For 'u', prepare to collect hex digits in the PP_UTF8
+	state.  Any other character is an error.
+*/
+static int do_slash( JSONPushParser* parser, char c ) {
+	char decoded = '\0';     // the character that the escape sequence stands for
+
+	switch( c ) {
+		case '\"' : decoded = '\"'; break;
+		case '\\' : decoded = '\\'; break;
+		case '/'  : decoded = '/';  break;
+		case 'b'  : decoded = '\b'; break;
+		case 'f'  : decoded = '\f'; break;
+		case 'n'  : decoded = '\n'; break;
+		case 'r'  : decoded = '\r'; break;
+		case 't'  : decoded = '\t'; break;
+		case 'u'  :
+			// Start of a \uXXXX sequence; the hex digits are yet to come
+			parser->word_idx = 0;
+			parser->point_code = 0;
+			parser->state = PP_UTF8;
+			return 0;
+		default :
+			report_pp_error( parser,
+				"Unexpected character '%c' escaped by preceding backslash", c );
+			return 1;
+	}
+
+	// A single-character escape: store the translation and resume the string
+	OSRF_BUFFER_ADD_CHAR( parser->buf, decoded );
+	parser->state = PP_STR;
+	return 0;
+}
+
+/**
+	@brief Accumulate and convert hex digits into a multibyte UTF-8 character.
+	@param parser Pointer to the current JSONPushParser.
+	@param c The current input character (should be a hex digit).
+	@return 0 if successful, or 1 upon error.
+
+	Each hex digit is converted to its numeric value and added into a running code point,
+	most significant digit first.  Once the fourth digit has arrived, the code point is
+	encoded as a UTF-8 sequence of one, two, or three bytes and appended to the buffer, and
+	the parser returns to the PP_STR state.  A code point of zero is rejected.
+
+	The conversion of a letter to its numeric value assumes that the characters [a-f] and
+	[A-F] are contiguous in the execution character set, and that the lower 3 bits of 'a'
+	and 'A' are 001.  Those assumptions are true for ASCII and EBCDIC, but there may be
+	some character sets for which they are not.
+*/
+static int do_utf8( JSONPushParser* parser, char c ) {
+	if( ! isxdigit( (unsigned char) c ) ) {
+		report_pp_error( parser, "Non-hex character '%c' found in UTF-8 sequence", c );
+		return 1;
+	}
+
+	// Numeric value of the hex digit
+	unsigned char digit;
+	if( c <= '9' )
+		digit = c - '0';
+	else
+		digit = (c & 7) + 9;
+
+	// The first three digits are shifted into place by 12, 8, and 4 bits respectively
+	if( parser->word_idx < 3 ) {
+		parser->point_code += digit << ( 4 * ( 3 - parser->word_idx ) );
+		++parser->word_idx;
+		return 0;
+	}
+
+	// This is the fourth digit.  Complete the code point and encode it as UTF-8.
+	const unsigned int code = parser->point_code + digit;
+	unsigned char utf8[ 4 ];
+	unsigned len = 0;
+
+	if( code < 0x80 ) {
+		utf8[ len++ ] = code;
+	} else if( code < 0x800 ) {
+		utf8[ len++ ] = 0xc0 | (code >> 6);
+		utf8[ len++ ] = 0x80 | (code & 0x3f);
+	} else {
+		utf8[ len++ ] = 0xe0 | (code >> 12);
+		utf8[ len++ ] = 0x80 | ((code >> 6) & 0x3f);
+		utf8[ len++ ] = 0x80 | (code & 0x3f);
+	}
+	utf8[ len ] = '\0';
+
+	if( '\0' == utf8[ 0 ] ) {
+		report_pp_error( parser, "UTF-8 sequence codes for nul character" );
+		return 1;
+	}
+
+	// Append the UTF-8 sequence to the buffer
+	OSRF_BUFFER_ADD( parser->buf, (char*) utf8 );
+	parser->state = PP_STR;
+	return 0;
+}
+
+/**
+	@brief Accumulate characters into a numeric literal.
+	@param parser Pointer to the current JSONPushParser.
+	@param c The current input character.
+	@return 0 if successful, or 1 upon error.
+
+	As long as the characters could belong to a numeric literal, keep collecting them.  The
+	first character that couldn't belong ends the literal.  At that point, check that what
+	we collected is a well-formed number by JSON rules; if it isn't, try to massage it into
+	something valid (e.g. by removing a leading plus sign, which official JSON doesn't
+	allow).  The character that ended the literal is flagged for reprocessing in whatever
+	state we return to.
+*/
+static int do_num  ( JSONPushParser* parser, char c ) {
+	// Still within the literal?  Then just collect the character.
+	if( isdigit( (unsigned char) c )
+			|| '-' == c || '+' == c || '.' == c || 'e' == c || 'E' == c ) {
+		buffer_add_char( parser->buf, c );
+		return 0;
+	}
+
+	// The literal has ended.  Validate it, repairing it if necessary and possible.
+	const char* num_str = OSRF_BUFFER_C_STR( parser->buf );
+	const char* to_report = num_str;
+	char* scrubbed = NULL;
+
+	if( ! jsonIsNumeric( num_str ) ) {
+		scrubbed = jsonScrubNumber( num_str );
+		if( ! scrubbed ) {                  // Can't be fixed
+			report_pp_error( parser, "Invalid number: \"%s\"", num_str );
+			return 1;
+		}
+		to_report = scrubbed;
+	}
+
+	int rc = 0;
+	if( parser->handlers.handleNumber )
+		rc = parser->handlers.handleNumber( parser->blob, to_report );
+	free( scrubbed );                       // no-op if no scrubbing was needed
+
+	parser->again = c;                      // look at this character again
+	pop_pp_state( parser );
+	check_pp_end( parser );
+
+	return rc;
+}
+
+/**
+	@brief Look for the first element of a JSON array, or the end of the array.
+	@param parser Pointer to the current JSONPushParser.
+	@param c The current input character.
+	@return 0 if successful, or 1 upon error.
+
+	We have just entered a JSON array.  After optional white space we expect either the
+	start of a value or, for an empty array, a right square bracket.  Anything else is an
+	error.
+*/
+static int do_array_begin( JSONPushParser* parser, char c ) {
+	if( isspace( (unsigned char) c ) )
+		return 0;                      // skip white space
+
+	// Anything that could plausibly start a numeric literal
+	if( isdigit( (unsigned char) c )
+			|| '-' == c || '+' == c || '.' == c || 'e' == c || 'E' == c ) {
+		buffer_reset( parser->buf );
+		buffer_add_char( parser->buf, c );
+		push_pp_state( parser, PP_ARRAY_VALUE );
+		parser->state = PP_NUM;
+		return 0;
+	}
+
+	int rc = 0;
+	switch( c ) {
+		case '\"' :                    // Found a string
+			buffer_reset( parser->buf );
+			push_pp_state( parser, PP_ARRAY_VALUE );
+			parser->state = PP_STR;
+			break;
+		case '[' :                     // Found a nested array
+			if( parser->handlers.handleBeginArray )
+				rc = parser->handlers.handleBeginArray( parser->blob );
+			push_pp_state( parser, PP_ARRAY_VALUE );
+			parser->state = PP_ARRAY_BEGIN;
+			break;
+		case '{' :                     // Found a nested object
+			if( parser->handlers.handleBeginObj )
+				rc = parser->handlers.handleBeginObj( parser->blob );
+			push_pp_state( parser, PP_ARRAY_VALUE );
+			parser->state = PP_OBJ_BEGIN;
+			break;
+		case ']' :                     // End of (empty) array
+			if( parser->handlers.handleEndArray )
+				rc = parser->handlers.handleEndArray( parser->blob );
+			pop_pp_state( parser );
+			check_pp_end( parser );
+			break;
+		case 't' :                     // Presumably the start of "true"
+			push_pp_state( parser, PP_ARRAY_VALUE );
+			parser->word_idx = 0;
+			parser->state = PP_TRUE;
+			break;
+		case 'f' :                     // Presumably the start of "false"
+			push_pp_state( parser, PP_ARRAY_VALUE );
+			parser->word_idx = 0;
+			parser->state = PP_FALSE;
+			break;
+		case 'n' :                     // Presumably the start of "null"
+			push_pp_state( parser, PP_ARRAY_VALUE );
+			parser->word_idx = 0;
+			parser->state = PP_NULL;
+			break;
+		default :
+			report_pp_error( parser, "Unexpected character \'%c\' at beginning of array", c );
+			rc = 1;
+			break;
+	}
+
+	return rc;
+}
+
+/**
+	@brief Look for the comma after a value in an array, or the end of the array.
+	@param parser Pointer to the current JSONPushParser.
+	@param c The current input character.
+	@return 0 if successful, or 1 upon error.
+
+	A value in a JSON array has just been completed.  After optional white space, the only
+	acceptable characters are a separating comma and a right square bracket.
+*/
+static int do_array_value( JSONPushParser* parser, char c ) {
+	if( isspace( (unsigned char) c ) )
+		return 0;                      // skip white space
+
+	int rc = 0;
+	switch( c ) {
+		case ',' :                     // Another element should follow
+			parser->state = PP_ARRAY_COMMA;
+			break;
+		case ']' :                     // End of array
+			if( parser->handlers.handleEndArray )
+				rc = parser->handlers.handleEndArray( parser->blob );
+			pop_pp_state( parser );
+			check_pp_end( parser );
+			break;
+		default :
+			report_pp_error( parser,
+				"Unexpected character \'%c\' in array; expected comma or right bracket", c );
+			rc = 1;
+			break;
+	}
+
+	return rc;
+}
+
+/**
+	@brief Look for the next element of a JSON array.
+	@param parser Pointer to the current JSONPushParser.
+	@param c The current input character.
+	@return 0 if successful, or 1 upon error.
+
+	We have just passed a separator comma within a JSON array.  After optional white space
+	we expect the start of a value.  Anything else, including a right square bracket, is an
+	error.
+*/
+static int do_array_comma( JSONPushParser* parser, char c ) {
+	if( isspace( (unsigned char) c ) )
+		return 0;                      // skip white space
+
+	// Anything that could plausibly start a numeric literal
+	if( isdigit( (unsigned char) c )
+			|| '-' == c || '+' == c || '.' == c || 'e' == c || 'E' == c ) {
+		buffer_reset( parser->buf );
+		buffer_add_char( parser->buf, c );
+		push_pp_state( parser, PP_ARRAY_VALUE );
+		parser->state = PP_NUM;
+		return 0;
+	}
+
+	int rc = 0;
+	switch( c ) {
+		case '\"' :                    // Found a string
+			buffer_reset( parser->buf );
+			push_pp_state( parser, PP_ARRAY_VALUE );
+			parser->state = PP_STR;
+			break;
+		case '[' :                     // Found a nested array
+			if( parser->handlers.handleBeginArray )
+				rc = parser->handlers.handleBeginArray( parser->blob );
+			push_pp_state( parser, PP_ARRAY_VALUE );
+			parser->state = PP_ARRAY_BEGIN;
+			break;
+		case '{' :                     // Found a nested object
+			if( parser->handlers.handleBeginObj )
+				rc = parser->handlers.handleBeginObj( parser->blob );
+			push_pp_state( parser, PP_ARRAY_VALUE );
+			parser->state = PP_OBJ_BEGIN;
+			break;
+		case 't' :                     // Presumably the start of "true"
+			push_pp_state( parser, PP_ARRAY_VALUE );
+			parser->word_idx = 0;
+			parser->state = PP_TRUE;
+			break;
+		case 'f' :                     // Presumably the start of "false"
+			push_pp_state( parser, PP_ARRAY_VALUE );
+			parser->word_idx = 0;
+			parser->state = PP_FALSE;
+			break;
+		case 'n' :                     // Presumably the start of "null"
+			push_pp_state( parser, PP_ARRAY_VALUE );
+			parser->word_idx = 0;
+			parser->state = PP_NULL;
+			break;
+		default :
+			report_pp_error( parser, "Expected array value; found \'%c\'", c );
+			rc = 1;
+			break;
+	}
+
+	return rc;
+}
+
+/**
+	@brief Look for the first entry of a JSON object, or the end of the object.
+	@param parser Pointer to the current JSONPushParser.
+	@param c The current input character.
+	@return 0 if successful, or 1 upon error.
+
+	We have just entered a JSON object.  After optional white space we expect either a
+	quotation mark, beginning the key of the first entry, or a right curly brace ending an
+	empty object.  Anything else is an error.
+*/
+static int do_obj_begin( JSONPushParser* parser, char c ) {
+	if( isspace( (unsigned char) c ) )
+		return 0;                      // skip white space
+
+	int rc = 0;
+	switch( c ) {
+		case '\"' :                    // Found a string, to serve as a key
+			buffer_reset( parser->buf );
+			push_pp_state( parser, PP_OBJ_KEY );
+			parser->state = PP_STR;
+			break;
+		case '}' :                     // End of (empty) object
+			if( parser->handlers.handleEndObj )
+				rc = parser->handlers.handleEndObj( parser->blob );
+			pop_pp_state( parser );
+			check_pp_end( parser );
+			break;
+		default :
+			report_pp_error( parser, "Unexpected character \'%c\' at beginning of object", c );
+			rc = 1;
+			break;
+	}
+
+	return rc;
+}
+
+/**
+	@brief Look for a colon between the key and value of an entry in a JSON object.
+	@param parser Pointer to the current JSONPushParser.
+	@param c The current input character.
+	@return 0 if successful, or 1 upon error.
+
+	The key of an entry in a JSON object has just been completed.  After optional white
+	space, the next character must be a colon.  Anything else is an error.
+*/
+static int do_obj_key  ( JSONPushParser* parser, char c ) {
+	if( isspace( (unsigned char) c ) )
+		return 0;                      // skip white space
+
+	if( ':' != c ) {
+		report_pp_error( parser, "Expected colon within JSON object; found \'%c\'", c );
+		return 1;
+	}
+
+	parser->state = PP_OBJ_COLON;
+	return 0;
+}
+
+/**
+	@brief Look for a value in a JSON object.
+	@param parser Pointer to the current JSONPushParser.
+	@param c The current input character.
+	@return 0 if successful, or 1 upon error.
+
+	We have just passed the colon that follows the key of an entry in a JSON object.  After
+	optional white space we expect the start of the associated value.  Anything else is an
+	error.
+*/
+static int do_obj_colon( JSONPushParser* parser, char c ) {
+	if( isspace( (unsigned char) c ) )
+		return 0;                      // skip white space
+
+	// Anything that could plausibly start a numeric literal
+	if( isdigit( (unsigned char) c )
+			|| '-' == c || '+' == c || '.' == c || 'e' == c || 'E' == c ) {
+		buffer_reset( parser->buf );
+		buffer_add_char( parser->buf, c );
+		push_pp_state( parser, PP_OBJ_VALUE );
+		parser->state = PP_NUM;
+		return 0;
+	}
+
+	int rc = 0;
+	switch( c ) {
+		case '\"' :                    // Found a string
+			buffer_reset( parser->buf );
+			push_pp_state( parser, PP_OBJ_VALUE );
+			parser->state = PP_STR;
+			break;
+		case '[' :                     // Found a nested array
+			if( parser->handlers.handleBeginArray )
+				rc = parser->handlers.handleBeginArray( parser->blob );
+			push_pp_state( parser, PP_OBJ_VALUE );
+			parser->state = PP_ARRAY_BEGIN;
+			break;
+		case '{' :                     // Found a nested object
+			if( parser->handlers.handleBeginObj )
+				rc = parser->handlers.handleBeginObj( parser->blob );
+			push_pp_state( parser, PP_OBJ_VALUE );
+			parser->state = PP_OBJ_BEGIN;
+			break;
+		case 't' :                     // Presumably the start of "true"
+			push_pp_state( parser, PP_OBJ_VALUE );
+			parser->word_idx = 0;
+			parser->state = PP_TRUE;
+			break;
+		case 'f' :                     // Presumably the start of "false"
+			push_pp_state( parser, PP_OBJ_VALUE );
+			parser->word_idx = 0;
+			parser->state = PP_FALSE;
+			break;
+		case 'n' :                     // Presumably the start of "null"
+			push_pp_state( parser, PP_OBJ_VALUE );
+			parser->word_idx = 0;
+			parser->state = PP_NULL;
+			break;
+		default :
+			report_pp_error( parser,
+				"Unexpected character \'%c\' after colon within JSON object", c );
+			rc = 1;
+			break;
+	}
+
+	return rc;
+}
+
+/**
+	@brief Decide what follows a completed key/value entry in a JSON object.
+	@param parser Pointer to the current JSONPushParser.
+	@param c The current input character.
+	@return 0 if successful, or 1 upon error (including a non-zero return from the
+	end-of-object callback).
+
+	A key/value entry has just been completed.  White space is skipped.  A comma means that
+	another entry follows; a right curly brace closes the object.  Any other character is
+	an error.
+*/
+static int do_obj_value( JSONPushParser* parser, char c ) {
+	if( isspace( (unsigned char) c ) )
+		return 0;                         // nothing to do for white space
+
+	if( ',' == c ) {
+		parser->state = PP_OBJ_COMMA;     // expect another key next
+		return 0;
+	}
+
+	if( '}' == c ) {
+		int result = 0;
+		// Notify the client, if it cares, that the object is finished
+		if( parser->handlers.handleEndObj )
+			result = parser->handlers.handleEndObj( parser->blob );
+		// Return to whatever state was saved when the object began
+		pop_pp_state( parser );
+		check_pp_end( parser );
+		return result;
+	}
+
+	report_pp_error( parser, "Expected comma or '}' within JSON object; found \'%c\'", c );
+	return 1;
+}
+
+/**
+	@brief Look for the key of the next entry in a JSON object.
+	@param parser Pointer to the current JSONPushParser.
+	@param c The current input character.
+	@return 0 if successful, or 1 upon error.
+
+	A separator comma has just been seen inside a JSON object.  After optional white space
+	the only acceptable character is a double quote, opening the string that serves as the
+	key of the next entry.
+*/
+static int do_obj_comma( JSONPushParser* parser, char c ) {
+	if( isspace( (unsigned char) c ) )
+		return 0;                            // ignore white space
+
+	if( '\"' != c ) {
+		report_pp_error( parser, "Expected key string in a JSON object; found \'%c\'", c );
+		return 1;
+	}
+
+	// Opening quote of the key: start with an empty buffer, and arrange to be in
+	// the PP_OBJ_KEY state once the string has been parsed.
+	buffer_reset( parser->buf );
+	push_pp_state( parser, PP_OBJ_KEY );
+	parser->state = PP_STR;
+	return 0;
+}
+
+/**
+	@brief Accumulate characters of the keyword "true".
+	@param parser Pointer to the current JSONPushParser.
+	@param c The current input character.
+	@return 0 if successful, or 1 upon error (including a non-zero return from the
+	boolean callback).
+
+	Keywords could be recognized in several ways: by collecting the characters and examining
+	the result, by giving each letter its own parser state, and so on.
+
+	This parser knows only three keywords, each beginning with a different letter, and no
+	other bare words are allowed.  Having seen the opening "t" we expect "rue" to follow
+	(and likewise for "false" and "null").  Each incoming letter is compared with the one
+	expected at that position, and a mismatch is an error.
+*/
+static int do_true( JSONPushParser* parser, char c ) {
+	const int status = found_keyword( parser, c, "true", 4 );
+
+	if( -1 == status )
+		return 1;          // wrong character found (already reported)
+
+	if( 1 == status ) {    // we have all the right characters
+		int result = 0;
+		if( parser->handlers.handleBool )
+			result = parser->handlers.handleBool( parser->blob, 1 );
+		parser->again = c;         // Revisit this character next time around
+		pop_pp_state( parser );
+		check_pp_end( parser );
+		return result;
+	}
+
+	return 0;              // so far so good
+}
+
+/**
+	@brief Accumulate characters of the keyword "false".
+	@param parser Pointer to the current JSONPushParser.
+	@param c The current input character.
+	@return 0 if successful, or 1 upon error (including a non-zero return from the
+	boolean callback).
+
+	Works the same way as do_true(), but for the keyword "false".
+*/
+static int do_false( JSONPushParser* parser, char c ) {
+	const int status = found_keyword( parser, c, "false", 5 );
+
+	if( -1 == status )
+		return 1;          // wrong character found (already reported)
+
+	if( 1 == status ) {    // we have all the right characters
+		int result = 0;
+		if( parser->handlers.handleBool )
+			result = parser->handlers.handleBool( parser->blob, 0 );
+		parser->again = c;         // Revisit this character next time around
+		pop_pp_state( parser );
+		check_pp_end( parser );
+		return result;
+	}
+
+	return 0;              // so far so good
+}
+
+/**
+	@brief Accumulate characters of the keyword "null".
+	@param parser Pointer to the current JSONPushParser.
+	@param c The current input character.
+	@return 0 if successful, or 1 upon error (including a non-zero return from the
+	null callback).
+
+	Works the same way as do_true(), but for the keyword "null".
+*/
+static int do_null( JSONPushParser* parser, char c ) {
+	const int status = found_keyword( parser, c, "null", 4 );
+
+	if( -1 == status )
+		return 1;          // wrong character found (already reported)
+
+	if( 1 == status ) {    // we have all the right characters
+		int result = 0;
+		if( parser->handlers.handleNull )
+			result = parser->handlers.handleNull( parser->blob );
+		parser->again = c;         // Revisit this character next time around
+		pop_pp_state( parser );
+		check_pp_end( parser );
+		return result;
+	}
+
+	return 0;              // so far so good
+}
+
+/**
+	@brief Accumulate a character for a specified keyword
+	@param parser Pointer to the current JSONPushParser
+	@param c The current input character
+	@param keyword The keyword we're looking for
+	@param maxlen The length of the keyword (obviating strlen())
+	@return 0 If @a c is the correct next letter in the keyword,
+	or 1 if the keyword is finished correctly, or -1 upon error.
+
+	Accumulate successive letters in a specified keyword.  We don't actually store the
+	letters anywhere; we just check to make sure they're the letters we expect.
+
+	The parser's word_idx member tracks our position within the keyword.  It is advanced
+	on every call, so after the increment it is the index of @a c within the keyword, or
+	equal to @a maxlen when @a c is the character that follows the keyword.
+*/
+static int found_keyword( JSONPushParser* parser, char c,
+		const char* keyword, unsigned maxlen ) {
+	int rc = 0;
+	if( ++parser->word_idx >= maxlen ) {
+		// We have all the characters; now check the one following.  It had better be
+		// either white space or punctuation.
+		if( !isspace( (unsigned char) c ) && !ispunct( (unsigned char) c ) ) {
+			// Name the keyword actually being parsed, not always "true"
+			report_pp_error( parser, "Unexpected character '%c' after \"%s\" keyword",
+				c, keyword );
+			return -1;     // bad character at end of keyword -- e.g. "trueY"
+		} else
+			return 1;
+	} else if( keyword[ parser->word_idx ] == c ) {
+		;        // so far so good
+	} else {
+		report_pp_error( parser, "Expected '%c' in keyword \"%s\"; found '%c'",
+			keyword[ parser->word_idx ], keyword, c );
+		rc = -1;
+	}
+	return rc;
+}
+
+/**
+	@brief We have reached the end of the JSON string.  There should be nothing but white space.
+	@param parser Pointer to the current JSONPushParser.
+	@param c The current input character.
+	@return 0 if successful, or 1 upon error.
+
+	White space is silently accepted.  Any other character is reported as an error.
+*/
+static int do_end( JSONPushParser* parser, char c ) {
+	int rc = 0;
+	if( isspace( (unsigned char) c ) )   // skip white space
+		;
+	else {
+		report_pp_error( parser,
+			"Expected nothing but white space after a JSON string; found \'%c\'", c );
+		rc = 1;
+	}
+
+	return rc;
+}
+
+// -------- End of state handlers --------------------------
+
+/**
+	@brief Push a parser state onto the state stack.
+	@param parser Pointer to the current JSONPushParser.
+	@param state The state to which we will return when we pop it off.
+
+	The stack simulates recursive descent.  Wherever a recursive descent parser would
+	descend, we push a state onto the stack, namely the state we want to be in when we
+	come back.  Wherever a recursive descent parser would return from the descent, we pop
+	the previously stored state off the stack.
+
+	Note that the state we push is not the current state, but some other state.  We simulate
+	a descent in order to parse some JSON value, and after parsing it, we need to be in some
+	other state.  So we push that future state onto the stack in advance.
+
+	The parser's key list is exchanged with the key list of the stack node, so the list
+	in effect before the push is held by the node until the matching pop.
+*/
+static void push_pp_state( JSONPushParser* parser, PPState state ) {
+	// Reuse a StateNode from the free list when one is available;
+	// otherwise allocate a new one, with its own key list, from the heap.
+	StateNode* node = parser->free_states;
+	if( node ) {
+		parser->free_states = node->next;
+	} else {
+		node = safe_malloc( sizeof( StateNode ) );
+		node->keylist = osrfNewStringArray( 8 );
+	}
+
+	// Populate the node...
+	node->state = state;
+	osrfStringArraySwap( parser->keylist, node->keylist );
+
+	// ...and make it the new top of the stack.
+	node->next = parser->state_stack;
+	parser->state_stack = node;
+}
+
+/**
+	@brief Restore the previous state of the parser.
+	@param parser Pointer to the current JSONPushParser.
+
+	Removes the top node from the state stack, makes its saved state the current state,
+	swaps its key list back into the parser, and puts the node (with its key list cleared)
+	on the free list for reuse.  If the stack is empty, the parser goes to PP_END instead.
+
+	See also push_pp_state().
+*/
+static void pop_pp_state( JSONPushParser* parser ) {
+	StateNode* top = parser->state_stack;
+
+	if( ! top ) {
+		parser->state = PP_END;    // shouldn't happen
+		return;
+	}
+
+	// Unlink the top node from the stack
+	parser->state_stack = top->next;
+
+	// Transfer the contents of the popped node to the parser
+	parser->state = top->state;
+	osrfStringArraySwap( parser->keylist, top->keylist );
+	osrfStringArrayClear( top->keylist );
+
+	// Keep the emptied node on the free list for later reuse
+	top->next = parser->free_states;
+	parser->free_states = top;
+}
+
+/**
+	@brief Notify the client if the parser has reached the end of the JSON.
+	@param parser Pointer to the current JSONPushParser.
+
+	If the parser is in the PP_END state and an end-of-JSON callback is installed,
+	call that callback.  Otherwise do nothing.
+*/
+static void check_pp_end( JSONPushParser* parser ) {
+	if( PP_END != parser->state )
+		return;                     // not finished yet
+
+	if( parser->handlers.handleEndJSON )
+		parser->handlers.handleEndJSON( parser->blob );
+}
+
+/**
+	@brief Issue an error message from the parser.
+	@param parser Pointer to the parser issuing the message
+	@param msg A printf-style format string.  Subsequent parameters, if any, will be
+	expanded and inserted into the output message.
+
+	The message goes to the client's error callback, along with the parser's current line
+	and position, if such a callback is installed.  Otherwise it is written to the log.
+*/
+static void report_pp_error( JSONPushParser* parser, const char* msg, ... ) {
+	// Must come first: the macro supplies VA_BUF, used below as the formatted message
+	VA_LIST_TO_STRING( msg );
+
+	if( ! parser->handlers.handleError ) {
+		// No callback installed; fall back to the log
+		osrfLogError( OSRF_LOG_MARK, "JSON Error at line %u, position %u: %s",
+			parser->line, parser->pos, VA_BUF );
+	} else {
+		parser->handlers.handleError( parser->blob, VA_BUF, parser->line, parser->pos );
+	}
+}
+
+/**
+	@brief Free a JSONPushParser and everything it owns.
+	@param parser Pointer to the JSONPushParser to be freed.  A NULL pointer is ignored.
+
+	Frees the parser's character buffer, every StateNode (whether still on the state stack
+	or already on the free list) together with its key list, the parser's own key list,
+	and finally the parser itself.
+*/
+void jsonPushParserFree( JSONPushParser* parser ) {
+	if( ! parser )
+		return;
+
+	buffer_free( parser->buf );
+
+	// Pop every remaining StateNode, so that they all end up on the free list
+	while( parser->state_stack )
+		pop_pp_state( parser );
+
+	// Now release the free list, one node at a time
+	while( parser->free_states ) {
+		StateNode* doomed = parser->free_states;
+		parser->free_states = doomed->next;
+		osrfStringArrayFree( doomed->keylist );
+		free( doomed );
+	}
+
+	osrfStringArrayFree( parser->keylist );
+	free( parser );
+}



More information about the opensrf-commits mailing list