[Opensrf-commits] r1880 - in trunk: include/opensrf src/c-apps src/libopensrf (scottmk)
svn at svn.open-ils.org
svn at svn.open-ils.org
Tue Dec 29 08:22:50 EST 2009
Author: scottmk
Date: 2009-12-29 08:22:49 -0500 (Tue, 29 Dec 2009)
New Revision: 1880
Added:
trunk/include/opensrf/jsonpush.h
trunk/src/c-apps/format_json.c
trunk/src/libopensrf/jsonpush.c
Log:
Add a stream parser for JSON, and a format_json utility
that uses it.
A include/opensrf/jsonpush.h
A src/c-apps/format_json.c
A src/libopensrf/jsonpush.c
Added: trunk/include/opensrf/jsonpush.h
===================================================================
--- trunk/include/opensrf/jsonpush.h (rev 0)
+++ trunk/include/opensrf/jsonpush.h 2009-12-29 13:22:49 UTC (rev 1880)
@@ -0,0 +1,90 @@
+/*
+Copyright (C) 2009 Equinox Software Inc.
+Scott McKellar <scott at esilibrary.com>
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+*/
+
+/**
+ @file jsonpush.h
+ @brief Push parser for JSON.
+
+ This parser provides a way to parse JSON incrementally, without necessarily holding the
+ entire JSON string (or any representation thereof) in memory at once. It can therefore
+ be used, for example, to parse large input files.
+
+ How to use it:
+
+ 1. Call jsonNewPushParser() to create a parser, designating a series of callback
+ functions to be called when the parser encounters various syntactic features.
+
+ 2. Pass one or more buffers to jsonPush() for parsing.
+
+ 3. When the last buffer has been parsed, call jsonPushParserFinish() to tell the parser
+ that no more input will be forthcoming.
+
+ 4. Call jsonPushParserFree() to free the parser when you're done with it.
+
+ By using jsonPushParserReset(), you can reuse a parser for multiple streams, without
+ having to free and recreate it.
+
+ By using jsonPushParserResume(), you can accept multiple JSON values in the same stream.
+ It is identical to jsonPushParserReset(), except that it does not reset the line number
+ and column number used in error messages.
+
+ This parser does @em not give any special attention to OSRF-specific conventions for
+ encoding class information.
+*/
+
+#ifndef JSONPUSH_H
+#define JSONPUSH_H
+
+#include <stddef.h>   /* size_t, used in the prototype of jsonPush() */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @brief Opaque parser handle; the structure itself is defined in jsonpush.c. */
+struct JSONPushParserStruct;
+typedef struct JSONPushParserStruct JSONPushParser;
+
+/**
+ @brief A collection of callback pointers.
+
+ Every callback receives, as @a blob, the pointer that was passed to jsonNewPushParser().
+ A callback that returns int reports 0 for success or non-zero for an error.
+*/
+typedef struct {
+
+ int (*handleString)( void* blob, const char* str ); /**< A string literal other than an object key. */
+ int (*handleNumber)( void* blob, const char* str ); /**< A numeric literal, passed as text; may be NULL. */
+ int (*handleBeginArray )( void* blob ); /**< Start of a JSON array. */
+ int (*handleEndArray )( void* blob ); /**< End of a JSON array. */
+ int (*handleBeginObj )( void* blob ); /**< Start of a JSON object. */
+ int (*handleObjKey )( void* blob, const char* key ); /**< Key of a key/value pair in an object. */
+ int (*handleEndObj )( void* blob ); /**< End of a JSON object. */
+ int (*handleBool) ( void* blob, int b ); /**< Keyword true (b is 1) or false (b is 0); may be NULL. */
+ int (*handleNull) ( void* blob ); /**< Keyword null; may be NULL. */
+ void (*handleEndJSON )( void* blob ); /**< End of a complete JSON value. */
+ void (*handleError)( void* blob, const char* msg, unsigned line, unsigned pos ); /**< Error report: message, line number, position in line. */
+
+} JSONHandlerMap;
+
+JSONPushParser* jsonNewPushParser( const JSONHandlerMap* map, void* blob ); /* Create a parser holding a copy of *map; returns NULL if map is NULL. */
+
+void jsonPushParserReset( JSONPushParser* parser ); /* Start over for a new document; line and position counters restart at 1. */
+
+void jsonPushParserResume( JSONPushParser* parser ); /* Expect another JSON value in the same stream; counters are kept. */
+
+int jsonPushParserFinish( JSONPushParser* parser ); /* Signal end of input; returns 0 if successful, or 1 upon error. */
+
+void jsonPushParserFree( JSONPushParser* parser ); /* Free a parser created by jsonNewPushParser(). */
+
+int jsonPush( JSONPushParser* parser, const char* str, size_t length ); /* Parse one buffer of input; callers treat a non-zero return as an error. */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
Added: trunk/src/c-apps/format_json.c
===================================================================
--- trunk/src/c-apps/format_json.c (rev 0)
+++ trunk/src/c-apps/format_json.c 2009-12-29 13:22:49 UTC (rev 1880)
@@ -0,0 +1,605 @@
+/*
+Copyright (C) 2009 Equinox Software Inc.
+Scott McKellar <scott at esilibrary.com>
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+*/
+
+/**
+ @file format_json.c
+ @brief Pretty-print JSON.
+
+ Read JSON from a file and output it to standard output with consistent indentation
+ and white space.
+
+ Synopsis:
+
+ format_json [ filename [ ... ] ]
+
+ Each command-line argument is the name of a file that format_json will read in turn
+ and format as JSON. A single hyphen denotes standard input. If no file is specified,
+ format_json reads standard input.
+
+ The input file[s] may contain multiple JSON values, but a JSON value may not span more
+ than a single file. In the output, successive JSON values are separated by blank lines.
+
+ The primary purpose of this formatter is to translate JSON into a canonical format that
+ can be easily read and parsed by, for example, a perl script, without having to create
+ a full JSON parser. For that reason, every square bracket and curly brace is put on a
+ line by itself, although it might be more aesthetically pleasing to put it at the end of
+ the same line as whatever precedes it.
+
+ A secondary purpose is to make ugly, all-run-together JSON more readable to the human eye.
+
+ Finally, this program serves as an example of how to use the stream parser, especially
+ for files that are too big to be loaded into memory at once. To that end, the internal
+ logic is extensively commented.
+
+ Implementation details:
+
+ When using a stream parser it is almost always necessary to implement a finite state
+ automaton, and this formatter is no exception.
+
+ We define a collection of callback functions for the parser to call at various points.
+ We also set up a structure (called a Formatter) for the parser to pass back to the
+ callbacks via a void pointer. The Formatter supplies information about where we are and
+ what we're doing; in particular, it includes the state variable for our finite state
+ automaton.
+
+ The parser is also a finite state automaton internally, and it also needs a struct (called
+ a JSONPushParser) to keep track of where it is and what it's doing. As a result, we have
+ two finite state automatons passing control back and forth. The parser handles syntax and
+ the Formatter handles semantics.
+
+ With a couple of exceptions, each callback returns a status code back to the parser that
+ calls it: 0 for success and non-zero for error. For example, a numeric literal might be
+ out of range, or an object key might be misspelled or out of place, or we might encounter
+ an object when we expect an array. Those rules reflect the semantics of the particular
+ kind of JSON that we're trying to parse. If a callback returns non-zero, the parser stops.
+
+ In the case of this formatter, any JSON is okay as long as the syntax is valid, and the
+ parser takes care of the syntax. Hence the callback functions routinely return zero.
+*/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include "opensrf/utils.h"
+#include "opensrf/osrf_utf8.h"
+#include "opensrf/jsonpush.h"
+
+/**
+ @brief Enumeration of states for the formatter's finite state automaton.
+
+ The callbacks consult the current state to decide whether a comma, a newline, or
+ indentation must be written in front of the next token.
+*/
+typedef enum {
+ CTX_OPEN, /**< Not currently within a JSON value. */
+ CTX_ARRAY_BEGIN, /**< At the beginning of a JSON array; no element written yet. */
+ CTX_ARRAY, /**< In a JSON array with at least one value so far. */
+ CTX_OBJ_BEGIN, /**< At the beginning of a JSON object; no key written yet. */
+ CTX_OBJ_KEY, /**< Between a key and its value in a JSON object. */
+ CTX_OBJ /**< In a JSON object with at least one entry so far. */
+} Context;
+
+/**
+ @brief Node for storing a Context in a stack.
+
+ A node is linked either into a Formatter's context_stack or into the free_context
+ free list, from which push_context() recycles nodes.
+*/
+struct ContextNode {
+ struct ContextNode* next; /**< Next node down the stack, or next node on the free list. */
+ Context context; /**< The Context being stored for eventual restoration. */
+};
+typedef struct ContextNode ContextNode;
+
+/**
+ @brief Structure to be passed back to callback functions to keep track of where we are.
+
+ The parser receives a pointer to this structure as its blob and hands it to every callback.
+*/
+typedef struct {
+ const char* filename; /**< Name of input file, or NULL for standard input; used in error messages. */
+ Context context; /**< Current state of the formatter's automaton. */
+ ContextNode* context_stack; /**< States saved on entering an array or object, innermost first. */
+ int indent; /**< Current nesting depth, i.e. how many indents precede a line. */
+ growing_buffer* buf; /**< Scratch buffer, reset for each string or key before escaping it. */
+ JSONPushParser* parser; /**< Points to the current parser. */
+} Formatter;
+
+static int format_file( Formatter* formatter, FILE* infile ); // returns 0 if successful, or 1 upon error
+static void install_parser( Formatter* formatter );
+
+static void indent( unsigned n );
+static int formatString( void* blob, const char* str ); // callback: string literal
+static int formatNumber( void* blob, const char* str ); // callback: numeric literal
+static int formatLeftBracket( void* blob ); // callback: begin array
+static int formatRightBracket( void* blob ); // callback: end array
+static int formatKey( void* blob, const char* str ); // callback: object key
+static int formatLeftBrace( void* blob ); // callback: begin object
+static int formatRightBrace( void* blob ); // callback: end object
+static int formatBool( void* blob, int b ); // callback: keyword true or false
+static int formatNull( void* blob ); // callback: keyword null
+static void formatEnd( void* blob ); // callback: end of a JSON value
+
+static void show_error( void* blob, const char* msg, unsigned line, unsigned pos ); // callback: error handler
+
+static void push_context( Formatter* formatter );
+static void pop_context( Formatter* formatter );
+
+static ContextNode* free_context = NULL; // Free list of ContextNodes: filled by pop_context(), reused by push_context()
+
+/**
+ @brief Format each file named on the command line, or standard input if none is named.
+ @param argc Number of command line parameters, plus one.
+ @param argv Pointer to ragged array representing the command line.
+ @return EXIT_SUCCESS if every input was opened and formatted without error;
+ otherwise EXIT_FAILURE.
+
+ A file that cannot be opened is reported and skipped; the remaining files are still
+ processed, but the exit status becomes EXIT_FAILURE. A file that fails to format ends
+ the run, and any later files are not read.
+*/
+int main( int argc, char* argv[] ) {
+
+    int rc = EXIT_SUCCESS;
+    int format_failed = 0;    // set when format_file() reports an error; ends the loop
+
+    // Declare and initialize a Formatter
+    static Formatter formatter;
+    formatter.filename = NULL;
+    formatter.context = CTX_OPEN;
+    formatter.context_stack = NULL;
+    formatter.indent = 0;
+    formatter.buf = buffer_init( 32 );
+    install_parser( &formatter );
+
+    if( argc > 1 ) {
+        // Iterate over the command line arguments.
+        // An argument "-" means to read standard input.
+        int i;
+        for( i = 1; i < argc && !format_failed; ++i ) {
+            const char* filename = argv[ i ];
+            FILE* in;
+            if( '-' == filename[ 0 ] && '\0' == filename[ 1 ] ) {
+                in = stdin;
+                formatter.filename = NULL;
+            } else {
+                in = fopen( filename, "r" );
+                formatter.filename = filename;
+            }
+
+            if( !in ) {
+                // Remember the failure for the exit status, but go on to the next file
+                fprintf( stderr, "Unable to open %s\n", filename );
+                rc = EXIT_FAILURE;
+                continue;
+            }
+
+            // Reset the parser. This tells the parser that we're starting over for a new
+            // JSON value, and that it needs to reset the line counter and position counter
+            // for error messages. (We don't really need this for the first file, but it
+            // does no harm.)
+            jsonPushParserReset( formatter.parser );
+
+            // Format the file
+            if( format_file( &formatter, in ) ) {
+                rc = EXIT_FAILURE;
+                format_failed = 1;
+            }
+
+            // Close only what we opened ourselves
+            if( in != stdin )
+                fclose( in );
+        }
+    } else {
+        // No command line arguments? Read standard input. Note that we don't have to
+        // reset the parser in this case, because we're only parsing once anyway.
+        if( format_file( &formatter, stdin ) )
+            rc = EXIT_FAILURE;
+    }
+
+    // Clean up the formatter
+    jsonPushParserFree( formatter.parser );
+    buffer_free( formatter.buf );
+    while( formatter.context_stack )
+        pop_context( &formatter );
+
+    // Free the free ContextNodes shed from the stack
+    while( free_context ) {
+        ContextNode* temp = free_context->next;
+        free( free_context );
+        free_context = temp;
+    }
+
+    return rc;
+}
+
+/**
+ @brief Read and format a JSON file.
+ @param formatter Pointer to the current Formatter.
+ @param infile Pointer to the input file.
+ @return 0 if successful, or 1 upon error (invalid JSON or a failure to read the input).
+
+ Read the input in fixed-size chunks and push each chunk to the parser. A chunk shorter
+ than the buffer marks the end of the input. Then tell the parser that no more input is
+ coming, so that it can finish any pending token and detect an incomplete JSON value.
+*/
+static int format_file( Formatter* formatter, FILE* infile ) {
+
+    char buf[ 4096 ];      // a true fixed-size array, not a variable-length array
+    size_t num_read;       // fread() returns a size_t
+    int rc = 0;
+
+    do {
+        num_read = fread( buf, 1, sizeof buf, infile );
+        if( num_read > 0 && jsonPush( formatter->parser, buf, num_read ) )
+            rc = 1;
+    } while( sizeof buf == num_read && 0 == rc );
+
+    // A short read means end-of-file or an I/O error; only ferror() can tell which
+    const int read_failed = ferror( infile );
+    if( read_failed ) {
+        fprintf( stderr, "\nError reading %s\n",
+            formatter->filename ? formatter->filename : "standard input" );
+        rc = 1;
+    }
+
+    if( jsonPushParserFinish( formatter->parser ) )
+        rc = 1;
+
+    // The read failure, if any, has already been reported more precisely above
+    if( rc && !read_failed )
+        fprintf( stderr, "\nError found in JSON file\n" );
+
+    return rc;
+}
+
+/**
+ @brief Build a JSONPushParser wired to this program's callbacks and store it in a Formatter.
+ @param formatter Pointer to the Formatter that will own the parser; it is also the blob
+ that the parser hands back to every callback.
+
+ The JSONHandlerMap below lives only for the duration of this call. That is sufficient,
+ because jsonNewPushParser() keeps its own copy of the map.
+*/
+static void install_parser( Formatter* formatter ) {
+
+    // Name each slot explicitly rather than relying on member order
+    JSONHandlerMap callbacks = {
+        .handleString     = formatString,
+        .handleNumber     = formatNumber,
+        .handleBeginArray = formatLeftBracket,
+        .handleEndArray   = formatRightBracket,
+        .handleBeginObj   = formatLeftBrace,
+        .handleObjKey     = formatKey,
+        .handleEndObj     = formatRightBrace,
+        .handleBool       = formatBool,
+        .handleNull       = formatNull,
+        .handleEndJSON    = formatEnd,
+        .handleError      = show_error
+    };
+
+    formatter->parser = jsonNewPushParser( &callbacks, formatter );
+}
+
+/**
+ @brief Write a string literal.
+ @param blob Pointer to Formatter, cast to a void pointer.
+ @param str Pointer to the contents of the string, with all escape sequences decoded.
+ @return zero.
+
+ The parser calls this for every string literal except the key of a key/value pair.
+
+ The value of a key continues the key's line. Anywhere else the literal starts a new,
+ indented line, preceded by a comma if it follows an earlier array element. The literal
+ is written in double quotes, with special and multibyte characters escaped as needed.
+*/
+static int formatString( void* blob, const char* str ) {
+    Formatter* fmt = blob;
+    const Context ctx = fmt->context;
+
+    if( CTX_OBJ_KEY != ctx ) {
+        printf( "%s", CTX_ARRAY == ctx ? ",\n" : "\n" );
+        indent( fmt->indent );
+    }
+
+    // Escape characters as needed
+    buffer_reset( fmt->buf );
+    buffer_append_utf8( fmt->buf, str );
+    printf( "\"%s\"", OSRF_BUFFER_C_STR( fmt->buf ) );
+
+    // Pick the next state
+    switch( ctx ) {
+        case CTX_ARRAY_BEGIN :
+            fmt->context = CTX_ARRAY;
+            break;
+        case CTX_OBJ_KEY :
+            fmt->context = CTX_OBJ;
+            break;
+        default :
+            break;
+    }
+
+    return 0;
+}
+
+/**
+ @brief Write a numeric literal.
+ @param blob Pointer to Formatter, cast to a void pointer.
+ @param str Pointer to a string containing the numeric literal.
+ @return zero.
+
+ The parser calls this when it finds a numeric literal.
+
+ The value of a key continues the key's line. Anywhere else the number starts a new,
+ indented line, preceded by a comma if it follows an earlier array element.
+*/
+static int formatNumber( void* blob, const char* str ) {
+    Formatter* fmt = blob;
+    const Context ctx = fmt->context;
+
+    if( CTX_OBJ_KEY != ctx ) {
+        printf( "%s", CTX_ARRAY == ctx ? ",\n" : "\n" );
+        indent( fmt->indent );
+    }
+
+    printf( "%s", str );
+
+    // Pick the next state
+    switch( ctx ) {
+        case CTX_ARRAY_BEGIN :
+            fmt->context = CTX_ARRAY;
+            break;
+        case CTX_OBJ_KEY :
+            fmt->context = CTX_OBJ;
+            break;
+        default :
+            break;
+    }
+
+    return 0;
+}
+
+/**
+ @brief Format a left square bracket.
+ @param blob Pointer to Formatter, cast to a void pointer.
+ @return zero.
+
+ Called by the parser when it finds a left square bracket opening a JSON array.
+
+ Write a left square bracket on a line of its own, with a comma and white space as needed.
+ Then save the state to return to when the array closes, and enter CTX_ARRAY_BEGIN.
+
+ The saved state must describe the enclosing container as already holding a value.
+ In particular an array that is the value of a key must save CTX_OBJ, not CTX_OBJ_KEY;
+ otherwise formatKey() would omit the comma in front of the next key.
+*/
+static int formatLeftBracket( void* blob ) {
+    Formatter* formatter = blob;
+    if( CTX_ARRAY == formatter->context || CTX_OBJ == formatter->context )
+        printf( "," );
+    printf( "\n" );
+    indent( formatter->indent++ );
+    printf( "[" );
+
+    // Pick the state to return to when we close the array.
+    switch( formatter->context ) {
+        case CTX_ARRAY_BEGIN :      // first element of the enclosing array
+            formatter->context = CTX_ARRAY;
+            break;
+        case CTX_OBJ_KEY :          // value of a key in the enclosing object
+        case CTX_OBJ_BEGIN :        // not expected (a key should come first); kept as before
+            formatter->context = CTX_OBJ;
+            break;
+        default :
+            break;
+    }
+    push_context( formatter );
+
+    formatter->context = CTX_ARRAY_BEGIN;
+    return 0;
+}
+
+/**
+ @brief Write a right square bracket.
+ @param blob Pointer to Formatter, cast to a void pointer.
+ @return zero.
+
+ The parser calls this when it finds the right square bracket that closes a JSON array.
+
+ Step back one level of indentation, write the bracket on a line of its own, and restore
+ the state that was saved when the array was opened.
+*/
+static int formatRightBracket( void* blob ) {
+    Formatter* fmt = blob;
+
+    fmt->indent -= 1;
+    printf( "\n" );
+    indent( fmt->indent );
+    printf( "]" );
+
+    pop_context( fmt );
+    return 0;
+}
+
+/**
+ @brief Format a left curly brace.
+ @param blob Pointer to Formatter, cast to a void pointer.
+ @return zero.
+
+ Called by the parser when it finds a left curly brace opening a JSON object.
+
+ Write a left curly brace on a line of its own, with a comma and white space as needed.
+ Then save the state to return to when the object closes, and enter CTX_OBJ_BEGIN.
+
+ The saved state must describe the enclosing container as already holding a value.
+ In particular an object that is the value of a key must save CTX_OBJ, not CTX_OBJ_KEY;
+ otherwise formatKey() would omit the comma in front of the next key.
+*/
+static int formatLeftBrace( void* blob ) {
+    Formatter* formatter = blob;
+    if( CTX_ARRAY == formatter->context || CTX_OBJ == formatter->context )
+        printf( "," );
+    printf( "\n" );
+    indent( formatter->indent++ );
+    printf( "{" );
+
+    // Pick the state to return to when we close the object.
+    switch( formatter->context ) {
+        case CTX_ARRAY_BEGIN :      // first element of the enclosing array
+            formatter->context = CTX_ARRAY;
+            break;
+        case CTX_OBJ_KEY :          // value of a key in the enclosing object
+        case CTX_OBJ_BEGIN :        // not expected (a key should come first); kept as before
+            formatter->context = CTX_OBJ;
+            break;
+        default :
+            break;
+    }
+    push_context( formatter );
+
+    formatter->context = CTX_OBJ_BEGIN;
+    return 0;
+}
+
+/**
+ @brief Write a right curly brace.
+ @param blob Pointer to Formatter, cast to a void pointer.
+ @return zero.
+
+ The parser calls this when it finds the right curly brace that closes a JSON object.
+
+ Step back one level of indentation, write the brace on a line of its own, and restore
+ the state that was saved when the object was opened.
+*/
+static int formatRightBrace( void* blob ) {
+    Formatter* fmt = blob;
+
+    fmt->indent -= 1;
+    printf( "\n" );
+    indent( fmt->indent );
+    printf( "}" );
+
+    pop_context( fmt );
+    return 0;
+}
+
+/**
+ @brief Write the key of a key/value pair in a JSON object.
+ @param blob Pointer to Formatter, cast to a void pointer.
+ @param str Pointer to a string containing the key.
+ @return zero.
+
+ The parser calls this when it finds a key. At that point it has not yet seen the colon
+ that should follow; if the colon never arrives, the parser reports the error itself.
+
+ Start a new, indented line (after a comma, if the object already has an entry) and write
+ the key in double quotes, escaped as needed, followed by a colon.
+*/
+static int formatKey( void* blob, const char* str ) {
+    Formatter* fmt = blob;
+
+    printf( "%s", CTX_OBJ == fmt->context ? ",\n" : "\n" );
+    indent( fmt->indent );
+
+    // Escape characters as needed
+    buffer_reset( fmt->buf );
+    buffer_append_utf8( fmt->buf, str );
+    printf( "\"%s\" : ", OSRF_BUFFER_C_STR( fmt->buf ) );
+
+    // The value belonging to this key comes next
+    fmt->context = CTX_OBJ_KEY;
+    return 0;
+}
+
+/**
+ @brief Write a boolean value.
+ @param blob Pointer to Formatter, cast to a void pointer.
+ @param b An int used as a boolean: non-zero for true, zero for false.
+ @return zero.
+
+ The parser calls this when it finds the JSON keyword "true" or "false".
+
+ The value of a key continues the key's line. Anywhere else the keyword starts a new,
+ indented line, preceded by a comma if it follows an earlier array element.
+*/
+static int formatBool( void* blob, int b ) {
+    Formatter* fmt = blob;
+    const Context ctx = fmt->context;
+
+    if( CTX_OBJ_KEY != ctx ) {
+        printf( "%s", CTX_ARRAY == ctx ? ",\n" : "\n" );
+        indent( fmt->indent );
+    }
+
+    if( b )
+        printf( "%s", "true" );
+    else
+        printf( "%s", "false" );
+
+    // Pick the next state.
+    switch( ctx ) {
+        case CTX_ARRAY_BEGIN :
+            fmt->context = CTX_ARRAY;
+            break;
+        case CTX_OBJ_KEY :
+            fmt->context = CTX_OBJ;
+            break;
+        default :
+            break;
+    }
+
+    return 0;
+}
+
+/**
+ @brief Write a null value.
+ @param blob Pointer to Formatter, cast to a void pointer.
+ @return zero.
+
+ The parser calls this when it finds the JSON keyword "null".
+
+ The value of a key continues the key's line. Anywhere else the keyword starts a new,
+ indented line, preceded by a comma if it follows an earlier array element.
+*/
+static int formatNull( void* blob ) {
+    Formatter* fmt = blob;
+    const Context ctx = fmt->context;
+
+    if( CTX_OBJ_KEY != ctx ) {
+        printf( "%s", CTX_ARRAY == ctx ? ",\n" : "\n" );
+        indent( fmt->indent );
+    }
+
+    printf( "null" );
+
+    // Pick the next state.
+    switch( ctx ) {
+        case CTX_ARRAY_BEGIN :
+            fmt->context = CTX_ARRAY;
+            break;
+        case CTX_OBJ_KEY :
+            fmt->context = CTX_OBJ;
+            break;
+        default :
+            break;
+    }
+
+    return 0;
+}
+
+/**
+ @brief Respond to the end of a JSON value.
+ @param blob Pointer to Formatter, cast to a void pointer.
+
+ The parser calls this when it reaches the end of a JSON value.
+
+ This formatter accepts multiple JSON values in succession, so tell the parser to look
+ for another one. Otherwise the parser would treat anything other than white space
+ beyond this point as an error.
+
+ Note that jsonPushParserResume() does @em not reset the line number and column number
+ used by the parser for error messages. To reset them, call jsonPushParserReset() instead.
+*/
+static void formatEnd( void* blob ) {
+    JSONPushParser* current = ( (Formatter*) blob )->parser;
+
+    jsonPushParserResume( current );
+    printf( "\n" );
+}
+
+/**
+ @brief Report a syntax error detected by the parser.
+ @param blob Pointer to Formatter, cast to a void pointer.
+ @param msg Pointer to a message describing the syntax error.
+ @param line Line number in the current file where the error was detected.
+ @param pos Column position in the current line where the error was detected.
+
+ The parser calls this when it encounters a syntax error.
+
+ Write the message to standard error, together with the line number, the column
+ position, and the file name saved in the Formatter ("standard input" if there is none).
+*/
+static void show_error( void* blob, const char* msg, unsigned line, unsigned pos ) {
+    const Formatter* fmt = blob;
+    const char* source = fmt->filename ? fmt->filename : "standard input";
+
+    fprintf( stderr, "\nError in %s at line %u, position %u:\n%s\n",
+        source, line, pos, msg );
+}
+
+/**
+ @brief Write a specified number of indents, four spaces per indent.
+ @param n How many indents to write.
+
+ Writes to standard output. Nothing is written when @a n is zero.
+*/
+static void indent( unsigned n ) {
+    while( n ) {
+        printf( "    " );    // exactly four spaces, as documented
+        --n;
+    }
+}
+
+/**
+ @brief Save the current state on the Formatter's stack.
+ @param formatter Pointer to the current Formatter.
+
+ Called on entering a JSON array or object. When the array or object ends,
+ pop_context() restores the state saved here.
+*/
+static void push_context( Formatter* formatter ) {
+    // Reuse a node from the free list when there is one; otherwise allocate
+    ContextNode* fresh = free_context;
+    if( fresh )
+        free_context = fresh->next;
+    else
+        fresh = safe_malloc( sizeof( ContextNode ) );
+
+    // Record the state and make the node the new top of the stack
+    fresh->next = formatter->context_stack;
+    fresh->context = formatter->context;
+    formatter->context_stack = fresh;
+}
+
+/**
+ @brief Restore the most recently saved state from the Formatter's stack.
+ @param formatter Pointer to the current Formatter.
+
+ Called at the end of a JSON array or object, to return to the state that was saved on
+ entering it. The node that held the state goes onto the free list for reuse. If the
+ stack is empty (which shouldn't happen), do nothing.
+*/
+static void pop_context( Formatter* formatter ) {
+    ContextNode* top = formatter->context_stack;
+
+    if( top ) {
+        // Unlink the top node and take back the state it holds
+        formatter->context_stack = top->next;
+        formatter->context = top->context;
+
+        // Recycle the node
+        top->next = free_context;
+        free_context = top;
+    }
+}
Added: trunk/src/libopensrf/jsonpush.c
===================================================================
--- trunk/src/libopensrf/jsonpush.c (rev 0)
+++ trunk/src/libopensrf/jsonpush.c 2009-12-29 13:22:49 UTC (rev 1880)
@@ -0,0 +1,1281 @@
+/*
+Copyright (C) 2009 Equinox Software Inc.
+Scott McKellar <scott at esilibrary.com>
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+*/
+
+/**
+ @file jsonpush.c
+ @brief Push parser for JSON.
+
+ This parser parses JSON incrementally, without necessarily holding the entire JSON string
+ (or any representation thereof) in memory at once. It is therefore suitable for parsing
+ large input files.
+
+ A format such as JSON, with its arbitrarily nestable elements, cries out piteously for a
+ recursive descent parser to match the recursive structure of the format. Unfortunately,
+ recursive descent doesn't work for an incremental parser, because the boundaries of
+ incoming chunks don't respect syntactic boundaries.
+
+ This parser is based on a finite state automaton, using a structure to retain state across
+ chunks, and a stack to simulate recursion. The calling code designates a series of
+ callback functions to respond to various syntactic features as they are encountered.
+*/
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <ctype.h>
+#include "opensrf/osrf_json.h"
+#include "opensrf/jsonpush.h"
+
+/** @brief Enumeration of states for the parser's finite state automaton. */
+typedef enum {
+ PP_BEGIN, // outside of any JSON; the starting state, also set by reset and resume
+ PP_STR, // inside a string literal
+ PP_SLASH, // found a backslash in a string literal
+ PP_UTF8, // collecting a UTF8 sequence
+ PP_NUM, // inside a numeric literal
+ PP_ARRAY_BEGIN, // started an array
+ PP_ARRAY_VALUE, // found an array element
+ PP_ARRAY_COMMA, // found a comma between array elements
+ PP_OBJ_BEGIN, // started a JSON object
+ PP_OBJ_KEY, // found a string for a key in an object
+ PP_OBJ_COLON, // found a colon after a key in an object
+ PP_OBJ_VALUE, // found a value for a key in an object
+ PP_OBJ_COMMA, // found a comma separating entries in an object
+ PP_TRUE, // inside the keyword true
+ PP_FALSE, // inside the keyword false
+ PP_NULL, // inside the keyword null
+ PP_END, // reached the end of the JSON stream
+ PP_ERROR // encountered invalid JSON; can't continue
+} PPState;
+
+struct StateNodeStruct;
+typedef struct StateNodeStruct StateNode;
+
+/**
+ @brief Represents a parser state at a given level of nesting.
+
+ The parser maintains a stack of StateNodes to simulate recursive descent. The parser
+ structure also keeps a free list of StateNodes that are not currently in use.
+*/
+struct StateNodeStruct {
+ StateNode* next; /**< Next node in the linked list that implements the stack */
+ PPState state; /**< State to which we will return */
+ osrfStringArray* keylist; /**< List of key strings, if the level is for a JSON object */
+};
+
+/**
+ @brief A collection of things the parser needs to remember about what it's doing.
+
+ This structure enables the parser to retain state from one chunk of input to the next.
+ It is created by jsonNewPushParser().
+*/
+struct JSONPushParserStruct {
+ JSONHandlerMap handlers; /**< The parser's own copy of the caller's callback map. */
+ void* blob; /**< To be passed back to callback functions. */
+ unsigned line; /**< Line number, starting at 1. */
+ unsigned pos; /**< Character position within line, starting at 1. */
+ PPState state; /**< For finite state automaton. */
+ char again; /**< If non-zero, re-read it as the next character. */
+ growing_buffer* buf; /**< For accumulating strings and numbers. */
+ StateNode* state_stack; /**< For simulating recursive descent. */
+ StateNode* free_states; /**< Free list of unused StateNodes. */
+ unsigned word_idx; /**< Index of the current character within a keyword,
+ such as "true", "false", or "null". */
+ unsigned int point_code; /**< for UTF-8 transformations. */
+ osrfStringArray* keylist; /**< Stores keys in current JSON object. */
+};
+
+// State handlers for the finite state automaton, named after the PPState values.
+// Each receives the parser and the current input character.
+static int do_begin( JSONPushParser* parser, char c );
+static int do_str ( JSONPushParser* parser, char c );
+static int do_slash( JSONPushParser* parser, char c );
+static int do_utf8 ( JSONPushParser* parser, char c );
+static int do_num ( JSONPushParser* parser, char c );
+static int do_array_begin( JSONPushParser* parser, char c );
+static int do_array_value( JSONPushParser* parser, char c );
+static int do_array_comma( JSONPushParser* parser, char c );
+static int do_obj_begin( JSONPushParser* parser, char c );
+static int do_obj_key ( JSONPushParser* parser, char c );
+static int do_obj_colon( JSONPushParser* parser, char c );
+static int do_obj_value( JSONPushParser* parser, char c );
+static int do_obj_comma( JSONPushParser* parser, char c );
+static int do_true ( JSONPushParser* parser, char c );
+static int do_false( JSONPushParser* parser, char c );
+static int do_null ( JSONPushParser* parser, char c );
+static int do_end( JSONPushParser* parser, char c );
+
+static int found_keyword( JSONPushParser* parser, char c,
+ const char* keyword, unsigned maxlen );
+static void push_pp_state( JSONPushParser* parser, PPState state ); // helpers for the stack of StateNodes
+static void pop_pp_state( JSONPushParser* parser );
+static void check_pp_end( JSONPushParser* parser );
+static void report_pp_error( JSONPushParser* parser, const char* msg, ... ); // msg is a printf-style format
+
+/**
+ @brief Create a new JSONPushParser.
+ @param map Pointer to a JSONHandlerMap designating the callback functions to call.
+ @param blob An arbitrary pointer to be passed to the callback functions.
+ @return A pointer to the new parser, or NULL if @a map is NULL.
+
+ The parser keeps its own copy of the JSONHandlerMap, so the caller's map need not
+ outlive this call.
+
+ The calling code can use the @a blob parameter to specify its own context for the
+ callback functions.
+
+ The calling code is responsible for freeing the parser by calling jsonPushParserFree().
+*/
+JSONPushParser* jsonNewPushParser( const JSONHandlerMap* map, void* blob )
+{
+    if( ! map )
+        return NULL;
+
+    JSONPushParser* parser = safe_malloc( sizeof( JSONPushParser ) );
+    parser->handlers = *map;
+    parser->blob = blob;
+    parser->line = 1;
+    parser->pos = 1;
+    parser->state = PP_BEGIN;
+    parser->again = '\0';
+    parser->buf = buffer_init( 64 );
+    parser->state_stack = NULL;
+    parser->free_states = NULL;
+    parser->word_idx = 0;
+    parser->point_code = 0;     // give every member a defined starting value
+    parser->keylist = osrfNewStringArray( 8 );
+    return parser;
+}
+
+/**
+ @brief Prepare a JSONPushParser to parse a new document from the beginning.
+ @param parser Pointer to the JSONPushParser to be reset. If it is NULL, nothing happens.
+
+ This function makes it possible to reuse the same parser for multiple documents, e.g.
+ multiple input files, without having to destroy and recreate it. The expectation is
+ that it be called after jsonPush() returns.
+
+ Besides restarting the line and position counters, discard any character queued for
+ re-reading and any partially accumulated token, so that leftovers from a document
+ that ended abruptly cannot leak into the next one.
+*/
+void jsonPushParserReset( JSONPushParser* parser ) {
+    if( parser ) {
+        parser->line = 1;
+        parser->pos = 1;
+        parser->state = PP_BEGIN;
+        parser->again = '\0';           // forget any character queued for re-reading
+        parser->word_idx = 0;           // forget any partially matched keyword
+        buffer_reset( parser->buf );    // forget any partially accumulated string or number
+
+        // NOTE(review): state_stack and keylist are left untouched. If the previous
+        // document was abandoned inside an array or object they may still hold
+        // entries; confirm whether they need to be unwound here as well.
+    }
+}
+
+/**
+ @brief Put a JSONPushParser back into its starting state, ready for another JSON value.
+ @param parser Pointer to the JSONPushParser to be resumed.
+
+ This function differs from jsonPushParserReset() in two ways:
+ - It is meant to be used only when the parser is between JSON values. Anywhere else
+ the parser would not be able to continue sensibly.
+ - It leaves the line number and position number used for error messages as they are.
+
+ Purpose: make it possible to parse multiple JSON values in the same stream. The
+ expectation is that it be called by the callback function that responds to end-of-JSON.
+*/
+void jsonPushParserResume( JSONPushParser* parser ) {
+    if( ! parser )
+        return;
+
+    parser->state = PP_BEGIN;
+}
+
+/**
+ @brief Tell the JSON push parser that there is no more input to parse.
+ @param parser Pointer to the parser.
+ @return 0 if successful, or 1 upon error.
+
+ A call to this function is comparable to an end-of-file marker. Without it, the parser
+ would be unable to recognize certain tokens at the very end of the last buffer, because
+ it wouldn't know that the token was finished.
+
+ For example: if the last byte is part of a number, the parser will not have reported the
+ numeric token because it was waiting to see if the next character was numeric.
+
+ Likewise, certain kinds of errors would be unrecognizable, such as a failure to complete
+ the current JSON expression.
+*/
+int jsonPushParserFinish( JSONPushParser* parser ) {
+    // Be consistent with jsonPush(): a missing parser is an error, not a crash
+    if( ! parser )
+        return 1;
+
+    int rc = 0;
+
+    // If we're currently accumulating a token, finish it.  In each case we pop the
+    // saved state only when the token is acceptable.  Popping after an error would
+    // overwrite PP_ERROR with the saved state, and could even trigger the end-of-JSON
+    // callback for a document that is in fact malformed.
+    if( PP_NUM == parser->state ) {
+        const char* num_str = OSRF_BUFFER_C_STR( parser->buf );
+
+        // Validate number
+        if( jsonIsNumeric( num_str ) ) {
+            if( parser->handlers.handleNumber )
+                rc = parser->handlers.handleNumber( parser->blob, num_str );
+            pop_pp_state( parser );
+            check_pp_end( parser );
+        } else {                            // Not numeric?  Try to fix it
+            char* temp = jsonScrubNumber( num_str );
+            if( temp ) {                    // Fixed
+                if( parser->handlers.handleNumber )
+                    rc = parser->handlers.handleNumber( parser->blob, temp );
+                free( temp );
+                pop_pp_state( parser );
+                check_pp_end( parser );
+            } else {                        // Can't be fixed
+                report_pp_error( parser, "Invalid number: \"%s\"", num_str );
+                rc = 1;
+                parser->state = PP_ERROR;
+            }
+        }
+    } else if( PP_TRUE == parser->state ) {
+        // word_idx is the index of the last letter matched so far; 3 means that
+        // all four letters of "true" have been seen
+        if( 3 == parser->word_idx ) {
+            if( parser->handlers.handleBool )
+                rc = parser->handlers.handleBool( parser->blob, 1 );
+            pop_pp_state( parser );
+            check_pp_end( parser );
+        } else {
+            report_pp_error( parser, "Keyword \"true\" is incomplete at end of input" );
+            rc = 1;
+            parser->state = PP_ERROR;
+        }
+    } else if( PP_FALSE == parser->state ) {
+        // "false" has five letters, so the last index is 4
+        if( 4 == parser->word_idx ) {
+            if( parser->handlers.handleBool )
+                rc = parser->handlers.handleBool( parser->blob, 0 );
+            pop_pp_state( parser );
+            check_pp_end( parser );
+        } else {
+            report_pp_error( parser, "Keyword \"false\" is incomplete at end of input" );
+            rc = 1;
+            parser->state = PP_ERROR;
+        }
+    } else if( PP_NULL == parser->state ) {
+        if( 3 == parser->word_idx ) {
+            if( parser->handlers.handleNull )
+                rc = parser->handlers.handleNull( parser->blob );
+            pop_pp_state( parser );
+            check_pp_end( parser );
+        } else {
+            report_pp_error( parser, "Keyword \"null\" is incomplete at end of input" );
+            rc = 1;
+            parser->state = PP_ERROR;
+        }
+    }
+
+    // At this point the state should be PP_END, or possibly PP_BEGIN if the JSON value is
+    // empty, or PP_ERROR if we already encountered an error.  Anything else means that the
+    // JSON value is incomplete; pick the message that describes how.
+
+    const char* problem = NULL;
+
+    switch( parser->state ) {
+        case PP_BEGIN :
+            parser->state = PP_END;         // JSON value was empty
+            break;
+        case PP_STR :
+        case PP_SLASH :
+        case PP_UTF8 :
+            problem = "String literal not closed";
+            break;
+        case PP_ARRAY_BEGIN :
+            problem = "Empty JSON array not closed";
+            break;
+        case PP_ARRAY_VALUE :
+            problem = "JSON array begun but not closed";
+            break;
+        case PP_ARRAY_COMMA :
+            problem = "JSON array not closed";
+            break;
+        case PP_OBJ_BEGIN :
+            problem = "Empty JSON object not closed";
+            break;
+        case PP_OBJ_KEY :
+            problem = "JSON object not continued after key";
+            break;
+        case PP_OBJ_COLON :
+            problem = "JSON object not continued after colon";
+            break;
+        case PP_OBJ_VALUE :
+            problem = "JSON object begun but not closed";
+            break;
+        case PP_OBJ_COMMA :
+            problem = "JSON object not closed";
+            break;
+        case PP_NUM :       // not possible; handled above
+        case PP_TRUE :      // not possible; handled above
+        case PP_FALSE :     // not possible; handled above
+        case PP_NULL :      // not possible; handled above
+        case PP_END :       // okay
+        case PP_ERROR :     // previous error, presumably already reported
+            break;
+    }
+
+    if( problem ) {
+        report_pp_error( parser, "%s", problem );
+        parser->state = PP_ERROR;
+        rc = 1;
+    }
+
+    return rc;
+}
+
+/**
+ @brief Incrementally parse a chunk of JSON.
+ @param parser Pointer to the JSONPushParser that will do the parsing.
+ @param str Pointer to a chunk of JSON, either all or part of a JSON stream.
+ @param length Length of the chunk of JSON.
+ @return 0 if successful, or 1 upon error.
+
+ Parse a fragment of JSON, possibly preceded or followed by one or more other chunks
+ in the same JSON stream. Respond to various syntactical features by calling the
+ corresponding callback functions that were designated when the parser was created.
+*/
+int jsonPush( JSONPushParser* parser, const char* str, size_t length ) {
+    if( ! parser )
+        return 1;
+    else if( ! str ) {
+        report_pp_error( parser, "JSON parser received a NULL parameter for input" );
+        return 1;
+    } else if( PP_ERROR == parser->state ) {
+        report_pp_error( parser, "JSON parser cannot continue due to previous error" );
+        return 1;
+    }
+
+    int rc = 0;
+
+    // Loop through the chunk.  The bounds check comes first so that we never read
+    // str[ length ]; the chunk is not required to be nul-terminated.  A nul byte
+    // within the chunk still ends the parsing of that chunk.
+    size_t i = 0;
+    while( i < length && str[i] && parser->state != PP_ERROR ) {
+        // branch on the current parser state
+        switch( parser->state ) {
+            case PP_BEGIN :
+                rc = do_begin( parser, str[i] );
+                break;
+            case PP_STR :
+                rc = do_str( parser, str[i] );
+                break;
+            case PP_SLASH :
+                rc = do_slash( parser, str[i] );
+                break;
+            case PP_UTF8 :
+                rc = do_utf8( parser, str[i] );
+                break;
+            case PP_NUM :
+                rc = do_num( parser, str[i] );
+                break;
+            case PP_ARRAY_BEGIN :
+                rc = do_array_begin( parser, str[i] );
+                break;
+            case PP_ARRAY_VALUE :
+                rc = do_array_value( parser, str[i] );
+                break;
+            case PP_ARRAY_COMMA :
+                rc = do_array_comma( parser, str[i] );
+                break;
+            case PP_OBJ_BEGIN :
+                rc = do_obj_begin( parser, str[i] );
+                break;
+            case PP_OBJ_KEY :
+                rc = do_obj_key( parser, str[i] );
+                break;
+            case PP_OBJ_COLON :
+                rc = do_obj_colon( parser, str[i] );
+                break;
+            case PP_OBJ_VALUE :
+                rc = do_obj_value( parser, str[i] );
+                break;
+            case PP_OBJ_COMMA :
+                rc = do_obj_comma( parser, str[i] );
+                break;
+            case PP_TRUE :
+                rc = do_true( parser, str[i] );
+                break;
+            case PP_FALSE :
+                rc = do_false( parser, str[i] );
+                break;
+            case PP_NULL :
+                rc = do_null( parser, str[i] );
+                break;
+            case PP_END :
+                rc = do_end( parser, str[i] );
+                break;
+            default :
+                // PP_ERROR is excluded by the loop condition, so getting here means
+                // that the state is corrupted.  Don't silently skip input.
+                report_pp_error( parser, "JSON parser is in an unrecognized state" );
+                rc = 1;
+                break;
+        }
+
+        if( rc )
+            break;
+        else if( parser->again )
+            parser->again = '\0';    // reuse the current character
+        else {
+            // Update the line and position according to the character just consumed,
+            // then advance.  Looking at the consumed character (rather than peeking at
+            // the next one) avoids reading past the end of the chunk, and also counts
+            // a newline correctly when it is the first character of a chunk.
+            if( '\n' == str[i] ) {
+                ++parser->line;
+                parser->pos = 1;
+            } else
+                ++parser->pos;
+            ++i;
+        }
+    }
+
+    if( 1 == rc )
+        parser->state = PP_ERROR;
+
+    return rc;
+}
+
+// -------- Beginning of state handlers --------------------------
+
+/**
+ @brief Look for the beginning of a JSON value.
+ @param parser Pointer to the current JSONPushParser.
+ @param c The current input character.
+ @return 0 if successful, or 1 upon error.
+
+ After some optional leading white space, look for a value comprising the entire
+ JSON stream.
+*/
+static int do_begin( JSONPushParser* parser, char c ) {
+    // Leading white space is simply ignored
+    if( isspace( (unsigned char) c ) )
+        return 0;
+
+    int rc = 0;
+
+    // Whatever value we find, we want to be in the PP_END state once it is
+    // complete, so that is the state we push in every case.
+    switch( c ) {
+        case '\"' :                         // start of a string literal
+            buffer_reset( parser->buf );
+            push_pp_state( parser, PP_END );
+            parser->state = PP_STR;
+            break;
+        case '[' :                          // start of an array
+            if( parser->handlers.handleBeginArray )
+                rc = parser->handlers.handleBeginArray( parser->blob );
+            push_pp_state( parser, PP_END );
+            parser->state = PP_ARRAY_BEGIN;
+            break;
+        case '{' :                          // start of an object
+            if( parser->handlers.handleBeginObj )
+                rc = parser->handlers.handleBeginObj( parser->blob );
+            push_pp_state( parser, PP_END );
+            parser->state = PP_OBJ_BEGIN;
+            break;
+        case 't' :                          // presumably "true"
+            push_pp_state( parser, PP_END );
+            parser->word_idx = 0;
+            parser->state = PP_TRUE;
+            break;
+        case 'f' :                          // presumably "false"
+            push_pp_state( parser, PP_END );
+            parser->word_idx = 0;
+            parser->state = PP_FALSE;
+            break;
+        case 'n' :                          // presumably "null"
+            push_pp_state( parser, PP_END );
+            parser->word_idx = 0;
+            parser->state = PP_NULL;
+            break;
+        case '0' : case '1' : case '2' : case '3' : case '4' :
+        case '5' : case '6' : case '7' : case '8' : case '9' :
+        case '-' : case '+' : case '.' : case 'e' : case 'E' :
+            // Anything that can occur in a number starts one; the accumulated
+            // text is validated later, when the number ends.
+            buffer_reset( parser->buf );
+            buffer_add_char( parser->buf, c );
+            push_pp_state( parser, PP_END );
+            parser->state = PP_NUM;
+            break;
+        default :
+            report_pp_error( parser, "Unexpected character \'%c\' at beginning of JSON string", c );
+            rc = 1;
+            break;
+    }
+
+    return rc;
+}
+
+/**
+ @brief Accumulate characters in a string literal.
+ @param parser Pointer to the current JSONPushParser.
+ @param c The current input character.
+ @return 0 if successful, or 1 upon error.
+*/
+static int do_str ( JSONPushParser* parser, char c ) {
+    int rc = 0;
+    if( '\"' == c ) {
+        // Reached the end of the string.  Report it either as a string
+        // or as a key, depending on the context.
+        pop_pp_state( parser );
+        if( PP_OBJ_KEY == parser->state ) {      // Report as a key
+            const char* key = OSRF_BUFFER_C_STR( parser->buf );
+            if( osrfStringArrayContains( parser->keylist, key ) ) {
+                report_pp_error( parser, "Duplicate key \"%s\" in JSON object", key );
+                rc = 1;
+            } else {
+                // Remember the key so that we can detect a later duplicate
+                osrfStringArrayAdd( parser->keylist, key );
+                if( parser->handlers.handleObjKey ) {
+                    rc = parser->handlers.handleObjKey(
+                        parser->blob, key );
+                }
+            }
+        } else {                                 // Report as a string
+            if( parser->handlers.handleString ) {
+                rc = parser->handlers.handleString(
+                    parser->blob, OSRF_BUFFER_C_STR( parser->buf ) );
+            }
+            check_pp_end( parser );
+        }
+    } else if( '\\' == c ) {
+        parser->state = PP_SLASH;    // Handle an escaped special character
+    } else if( iscntrl( (unsigned char) c ) || ! isprint( (unsigned char) c ) ) {
+        // NOTE(review): in the "C" locale isprint() is false for every byte >= 0x80, so
+        // raw (unescaped) multibyte UTF-8 inside a string is rejected here -- confirm
+        // that this is intended.
+        //
+        // Convert through unsigned char so that a negative char is reported as a
+        // two-digit byte value rather than as a sign-extended 0xFFFFFFxx.
+        report_pp_error( parser, "Illegal character 0x%02X in string literal",
+            (unsigned int) (unsigned char) c );
+        rc = 1;
+    } else {
+        buffer_add_char( parser->buf, c );
+    }
+
+    return rc;
+}
+
+/**
+ @brief Look for an escaped special character.
+ @param parser Pointer to the current JSONPushParser.
+ @param c The current input character.
+ @return 0 if successful, or 1 upon error.
+*/
+static int do_slash( JSONPushParser* parser, char c ) {
+    char decoded;    // the character that the escape sequence stands for
+
+    switch( c ) {
+        case '\"' : decoded = '\"'; break;
+        case '\\' : decoded = '\\'; break;
+        case '/'  : decoded = '/';  break;
+        case 'b'  : decoded = '\b'; break;
+        case 'f'  : decoded = '\f'; break;
+        case 'n'  : decoded = '\n'; break;
+        case 'r'  : decoded = '\r'; break;
+        case 't'  : decoded = '\t'; break;
+        case 'u'  :
+            // Start of a four-digit hex sequence.  Nothing is appended yet; the
+            // digits are collected in the PP_UTF8 state.
+            parser->word_idx = 0;
+            parser->point_code = 0;
+            parser->state = PP_UTF8;
+            return 0;
+        default :
+            report_pp_error( parser,
+                "Unexpected character '%c' escaped by preceding backslash", c );
+            return 1;
+    }
+
+    // A simple one-character escape: append what it stands for, and go back
+    // to accumulating the string literal.
+    OSRF_BUFFER_ADD_CHAR( parser->buf, decoded );
+    parser->state = PP_STR;
+    return 0;
+}
+
+/**
+ @brief Accumulate and convert hex digits into a multibyte UTF-8 character.
+ @param parser Pointer to the current JSONPushParser.
+ @param c The current input character (should be a hex digit).
+ @return 0 if successful, or 1 upon error.
+
+ Convert each character to the corresponding numeric value and incorporate it into a sum.
+ When all four characters have been accumulated, translate the result into a multibyte
+ UTF-8 character and append it to the buffer.
+
+ The algorithm for converting the input character into a numeric value assumes that
+ the characters [a-f] and [A-F] are contiguous in the execution character set, and that
+ the lower 4 bits for 'a' and 'A' are 0001. Those assumptions are true for ASCII and
+ EBCDIC, but there may be some character sets for which it is not true.
+*/
+static int do_utf8( JSONPushParser* parser, char c ) {
+    // Only hex digits are acceptable here
+    if( ! isxdigit( (unsigned char) c ) ) {
+        report_pp_error( parser, "Non-hex character '%c' found in UTF-8 sequence", c );
+        return 1;
+    }
+
+    // Numeric value of the hex digit.  For letters we rely on the low three
+    // bits of 'a'..'f' and 'A'..'F' being 1..6.
+    unsigned char digit;
+    if( c <= '9' )
+        digit = c - '0';
+    else
+        digit = (c & 7) + 9;
+
+    int rc = 0;
+
+    // Branch according to how many digits we have seen so far
+    switch( parser->word_idx ) {
+        case 0 :
+        case 1 :
+        case 2 :
+            // The first three digits are shifted left by 12, 8, and 4 bits respectively
+            parser->point_code += digit << ( 12 - 4 * parser->word_idx );
+            ++parser->word_idx;
+            break;
+        default : {
+            // This is the fourth and last digit.  Complete the code point
+            // and encode it as a nul-terminated UTF-8 sequence.
+            unsigned int code = parser->point_code + digit;
+            unsigned char utf8[ 4 ];
+            int len = 0;
+
+            if( code < 0x80 ) {
+                utf8[ len++ ] = code;
+            } else if( code < 0x800 ) {
+                utf8[ len++ ] = 0xc0 | (code >> 6);
+                utf8[ len++ ] = 0x80 | (code & 0x3f);
+            } else {
+                utf8[ len++ ] = 0xe0 | (code >> 12);
+                utf8[ len++ ] = 0x80 | ((code >> 6) & 0x3f);
+                utf8[ len++ ] = 0x80 | (code & 0x3f);
+            }
+            utf8[ len ] = '\0';
+
+            if( ! utf8[ 0 ] ) {
+                // We can't store a nul within a nul-terminated string
+                report_pp_error( parser, "UTF-8 sequence codes for nul character" );
+                rc = 1;
+            } else {
+                // Append the sequence and resume accumulating the string literal
+                OSRF_BUFFER_ADD( parser->buf, (char*) utf8 );
+                parser->state = PP_STR;
+            }
+        }
+    }
+
+    return rc;
+}
+
+/**
+ @brief Accumulate characters into a numeric literal.
+ @param parser Pointer to the current JSONPushParser.
+ @param c The current input character.
+ @return 0 if successful, or 1 upon error.
+
+ Once we see a character that doesn't belong in a numeric literal, we check to make sure
+ that the characters we accumulate are a well-formed number according to JSON rules. If
+ they aren't, we try to massage them into something valid (e.g. by removing a leading
+ plus sign, which official JSON doesn't allow).
+*/
+static int do_num ( JSONPushParser* parser, char c ) {
+    // Any character that can occur in a numeric literal is just accumulated
+    if( isdigit( (unsigned char) c )
+        || '-' == c || '+' == c || '.' == c || 'e' == c || 'E' == c ) {
+        buffer_add_char( parser->buf, c );
+        return 0;
+    }
+
+    // Anything else ends the number.  See whether what we collected is well
+    // formed, and if it isn't, try to repair it.
+    const char* raw = OSRF_BUFFER_C_STR( parser->buf );
+    const char* number = raw;      // the text to be reported
+    char* scrubbed = NULL;         // repaired copy, if we needed one
+
+    if( ! jsonIsNumeric( raw ) ) {
+        scrubbed = jsonScrubNumber( raw );
+        if( ! scrubbed ) {
+            report_pp_error( parser, "Invalid number: \"%s\"", raw );
+            return 1;
+        }
+        number = scrubbed;
+    }
+
+    int rc = 0;
+    if( parser->handlers.handleNumber )
+        rc = parser->handlers.handleNumber( parser->blob, number );
+    free( scrubbed );              // harmless if no repair was needed
+
+    // The current character isn't part of the number, so arrange for it to be
+    // looked at again in whatever state we return to.
+    parser->again = c;
+    pop_pp_state( parser );
+    check_pp_end( parser );
+
+    return rc;
+}
+
+/**
+ @brief Look for the first element of a JSON array, or the end of the array.
+ @param parser Pointer to the current JSONPushParser.
+ @param c The current input character.
+ @return 0 if successful, or 1 upon error.
+
+ We have just entered a JSON array. We expect to see either a value or (in the case of
+ an empty array) a closing right square bracket. Anything else is an error.
+*/
+static int do_array_begin( JSONPushParser* parser, char c ) {
+    // White space before the first element is ignored
+    if( isspace( (unsigned char) c ) )
+        return 0;
+
+    int rc = 0;
+
+    // When a value begins, we push PP_ARRAY_VALUE: that is the state to
+    // return to once the value is complete.
+    switch( c ) {
+        case '\"' :                         // string element
+            buffer_reset( parser->buf );
+            push_pp_state( parser, PP_ARRAY_VALUE );
+            parser->state = PP_STR;
+            break;
+        case '[' :                          // nested array
+            if( parser->handlers.handleBeginArray )
+                rc = parser->handlers.handleBeginArray( parser->blob );
+            push_pp_state( parser, PP_ARRAY_VALUE );
+            parser->state = PP_ARRAY_BEGIN;
+            break;
+        case '{' :                          // nested object
+            if( parser->handlers.handleBeginObj )
+                rc = parser->handlers.handleBeginObj( parser->blob );
+            push_pp_state( parser, PP_ARRAY_VALUE );
+            parser->state = PP_OBJ_BEGIN;
+            break;
+        case ']' :                          // the array is empty
+            if( parser->handlers.handleEndArray )
+                rc = parser->handlers.handleEndArray( parser->blob );
+            pop_pp_state( parser );
+            check_pp_end( parser );
+            break;
+        case 't' :
+            push_pp_state( parser, PP_ARRAY_VALUE );
+            parser->word_idx = 0;
+            parser->state = PP_TRUE;
+            break;
+        case 'f' :
+            push_pp_state( parser, PP_ARRAY_VALUE );
+            parser->word_idx = 0;
+            parser->state = PP_FALSE;
+            break;
+        case 'n' :
+            push_pp_state( parser, PP_ARRAY_VALUE );
+            parser->word_idx = 0;
+            parser->state = PP_NULL;
+            break;
+        case '0' : case '1' : case '2' : case '3' : case '4' :
+        case '5' : case '6' : case '7' : case '8' : case '9' :
+        case '-' : case '+' : case '.' : case 'e' : case 'E' :
+            // numeric element; validated when the number ends
+            buffer_reset( parser->buf );
+            buffer_add_char( parser->buf, c );
+            push_pp_state( parser, PP_ARRAY_VALUE );
+            parser->state = PP_NUM;
+            break;
+        default :
+            report_pp_error( parser, "Unexpected character \'%c\' at beginning of array", c );
+            rc = 1;
+            break;
+    }
+
+    return rc;
+}
+
+/**
+ @brief Look for the comma after a value in an array, or the end of the array.
+ @param parser Pointer to the current JSONPushParser.
+ @param c The current input character.
+ @return 0 if successful, or 1 upon error.
+
+ We have just passed a value in a JSON array. We expect to see either a separating
+ comma or a right square bracket.
+*/
+static int do_array_value( JSONPushParser* parser, char c ) {
+    // White space after a value is ignored
+    if( isspace( (unsigned char) c ) )
+        return 0;
+
+    int rc = 0;
+    switch( c ) {
+        case ',' :                          // another element should follow
+            parser->state = PP_ARRAY_COMMA;
+            break;
+        case ']' :                          // end of the array
+            if( parser->handlers.handleEndArray )
+                rc = parser->handlers.handleEndArray( parser->blob );
+            pop_pp_state( parser );
+            check_pp_end( parser );
+            break;
+        default :
+            report_pp_error( parser,
+                "Unexpected character \'%c\' in array; expected comma or right bracket", c );
+            rc = 1;
+            break;
+    }
+
+    return rc;
+}
+
+/**
+ @brief Look for the next element of a JSON array, or the end of the array.
+ @param parser Pointer to the current JSONPushParser.
+ @param c The current input character.
+ @return 0 if successful, or 1 upon error.
+
+ We have just passed a separator comma within a JSON array. We expect to see a value.
+ Anything else is an error.
+*/
+static int do_array_comma( JSONPushParser* parser, char c ) {
+    // White space after the comma is ignored
+    if( isspace( (unsigned char) c ) )
+        return 0;
+
+    int rc = 0;
+
+    // A value must follow.  Whatever kind it is, we come back to
+    // PP_ARRAY_VALUE when it is complete.
+    switch( c ) {
+        case '\"' :                         // string element
+            buffer_reset( parser->buf );
+            push_pp_state( parser, PP_ARRAY_VALUE );
+            parser->state = PP_STR;
+            break;
+        case '[' :                          // nested array
+            if( parser->handlers.handleBeginArray )
+                rc = parser->handlers.handleBeginArray( parser->blob );
+            push_pp_state( parser, PP_ARRAY_VALUE );
+            parser->state = PP_ARRAY_BEGIN;
+            break;
+        case '{' :                          // nested object
+            if( parser->handlers.handleBeginObj )
+                rc = parser->handlers.handleBeginObj( parser->blob );
+            push_pp_state( parser, PP_ARRAY_VALUE );
+            parser->state = PP_OBJ_BEGIN;
+            break;
+        case 't' :
+            push_pp_state( parser, PP_ARRAY_VALUE );
+            parser->word_idx = 0;
+            parser->state = PP_TRUE;
+            break;
+        case 'f' :
+            push_pp_state( parser, PP_ARRAY_VALUE );
+            parser->word_idx = 0;
+            parser->state = PP_FALSE;
+            break;
+        case 'n' :
+            push_pp_state( parser, PP_ARRAY_VALUE );
+            parser->word_idx = 0;
+            parser->state = PP_NULL;
+            break;
+        case '0' : case '1' : case '2' : case '3' : case '4' :
+        case '5' : case '6' : case '7' : case '8' : case '9' :
+        case '-' : case '+' : case '.' : case 'e' : case 'E' :
+            // numeric element; validated when the number ends
+            buffer_reset( parser->buf );
+            buffer_add_char( parser->buf, c );
+            push_pp_state( parser, PP_ARRAY_VALUE );
+            parser->state = PP_NUM;
+            break;
+        default :
+            report_pp_error( parser, "Expected array value; found \'%c\'", c );
+            rc = 1;
+            break;
+    }
+
+    return rc;
+}
+
+/**
+ @brief Look for the first entry of a JSON object, or the end of the object.
+ @param parser Pointer to the current JSONPushParser.
+ @param c The current input character.
+ @return 0 if successful, or 1 upon error.
+
+ We have just entered a JSON object. We expect to see a string literal (the key for the
+ first entry), or the end of the object. Anything else is an error.
+*/
+static int do_obj_begin( JSONPushParser* parser, char c ) {
+    // White space before the first key is ignored
+    if( isspace( (unsigned char) c ) )
+        return 0;
+
+    int rc = 0;
+    switch( c ) {
+        case '\"' :
+            // Start of the first key.  Pushing PP_OBJ_KEY is what later tells
+            // do_str() to report the string as a key.
+            buffer_reset( parser->buf );
+            push_pp_state( parser, PP_OBJ_KEY );
+            parser->state = PP_STR;
+            break;
+        case '}' :                          // the object is empty
+            if( parser->handlers.handleEndObj )
+                rc = parser->handlers.handleEndObj( parser->blob );
+            pop_pp_state( parser );
+            check_pp_end( parser );
+            break;
+        default :
+            report_pp_error( parser, "Unexpected character \'%c\' at beginning of object", c );
+            rc = 1;
+            break;
+    }
+
+    return rc;
+}
+
+/**
+ @brief Look for a colon between the key and value of an entry in a JSON object.
+ @param parser Pointer to the current JSONPushParser.
+ @param c The current input character.
+ @return 0 if successful, or 1 upon error.
+
+ We have just found the key for an entry in a JSON object. We expect to see a colon next.
+ Anything else is an error.
+*/
+static int do_obj_key ( JSONPushParser* parser, char c ) {
+    // White space between the key and the colon is ignored
+    if( isspace( (unsigned char) c ) )
+        return 0;
+
+    // The only thing allowed after a key is a colon
+    if( ':' != c ) {
+        report_pp_error( parser, "Expected colon within JSON object; found \'%c\'", c );
+        return 1;
+    }
+
+    parser->state = PP_OBJ_COLON;
+    return 0;
+}
+
+/**
+ @brief Look for a value in a JSON object.
+ @param parser Pointer to the current JSONPushParser.
+ @param c The current input character.
+ @return 0 if successful, or 1 upon error.
+
+ We have just found a colon after the key of an entry in a JSON object. We expect to see
+ the associated value next. Anything else is an error.
+*/
+static int do_obj_colon( JSONPushParser* parser, char c ) {
+    // White space after the colon is ignored
+    if( isspace( (unsigned char) c ) )
+        return 0;
+
+    int rc = 0;
+
+    // A value must follow.  Whatever kind it is, we come back to
+    // PP_OBJ_VALUE when it is complete.
+    switch( c ) {
+        case '\"' :                         // string value
+            buffer_reset( parser->buf );
+            push_pp_state( parser, PP_OBJ_VALUE );
+            parser->state = PP_STR;
+            break;
+        case '[' :                          // nested array
+            if( parser->handlers.handleBeginArray )
+                rc = parser->handlers.handleBeginArray( parser->blob );
+            push_pp_state( parser, PP_OBJ_VALUE );
+            parser->state = PP_ARRAY_BEGIN;
+            break;
+        case '{' :                          // nested object
+            if( parser->handlers.handleBeginObj )
+                rc = parser->handlers.handleBeginObj( parser->blob );
+            push_pp_state( parser, PP_OBJ_VALUE );
+            parser->state = PP_OBJ_BEGIN;
+            break;
+        case 't' :
+            push_pp_state( parser, PP_OBJ_VALUE );
+            parser->word_idx = 0;
+            parser->state = PP_TRUE;
+            break;
+        case 'f' :
+            push_pp_state( parser, PP_OBJ_VALUE );
+            parser->word_idx = 0;
+            parser->state = PP_FALSE;
+            break;
+        case 'n' :
+            push_pp_state( parser, PP_OBJ_VALUE );
+            parser->word_idx = 0;
+            parser->state = PP_NULL;
+            break;
+        case '0' : case '1' : case '2' : case '3' : case '4' :
+        case '5' : case '6' : case '7' : case '8' : case '9' :
+        case '-' : case '+' : case '.' : case 'e' : case 'E' :
+            // numeric value; validated when the number ends
+            buffer_reset( parser->buf );
+            buffer_add_char( parser->buf, c );
+            push_pp_state( parser, PP_OBJ_VALUE );
+            parser->state = PP_NUM;
+            break;
+        default :
+            report_pp_error( parser,
+                "Unexpected character \'%c\' after colon within JSON object", c );
+            rc = 1;
+            break;
+    }
+
+    return rc;
+}
+
+/**
+ @brief Look for a comma in a JSON object, or for the end of the object.
+ @param parser Pointer to the current JSONPushParser.
+ @param c The current input character.
+ @return 0 if successful, or 1 upon error.
+
+ We have just finished a key/value entry in a JSON object. We expect to see either a comma
+ or a right curly brace. Anything else is an error.
+*/
+static int do_obj_value( JSONPushParser* parser, char c ) {
+    // White space after an entry is ignored
+    if( isspace( (unsigned char) c ) )
+        return 0;
+
+    int rc = 0;
+    switch( c ) {
+        case ',' :                          // another entry should follow
+            parser->state = PP_OBJ_COMMA;
+            break;
+        case '}' :                          // end of the object
+            if( parser->handlers.handleEndObj )
+                rc = parser->handlers.handleEndObj( parser->blob );
+            pop_pp_state( parser );
+            check_pp_end( parser );
+            break;
+        default :
+            report_pp_error( parser, "Expected comma or '}' within JSON object; found \'%c\'", c );
+            rc = 1;
+            break;
+    }
+
+    return rc;
+}
+
+/**
+ @brief Look for the next entry in a JSON object.
+ @param parser Pointer to the current JSONPushParser.
+ @param c The current input character.
+ @return 0 if successful, or 1 upon error.
+
+ We have just found a separator comma within a JSON object. We expect to find a string to
+ serve as the key for the next entry. Anything else is an error.
+*/
+static int do_obj_comma( JSONPushParser* parser, char c ) {
+    // White space after the comma is ignored
+    if( isspace( (unsigned char) c ) )
+        return 0;
+
+    // The next entry has to start with a key, i.e. a string literal
+    if( '\"' != c ) {
+        report_pp_error( parser, "Expected key string in a JSON object; found \'%c\'", c );
+        return 1;
+    }
+
+    // Accumulate the key; pushing PP_OBJ_KEY makes do_str() report it as a key
+    buffer_reset( parser->buf );
+    push_pp_state( parser, PP_OBJ_KEY );
+    parser->state = PP_STR;
+    return 0;
+}
+
+/**
+ @brief Accumulate characters of the keyword "true".
+ @param parser Pointer to the current JSONPushParser.
+ @param c The current input character.
+ @return 0 if successful, or 1 upon error.
+
+ There are several ways to recognize keywords. You can accumulate characters and then
+ look at the whole thing; you can have a distinct parser state for each letter; etc.
+
+ In this parser we have only three keywords to recognize, starting with three different
+ letters; no other bare words are allowed. When we see the opening "t" we expect to
+ see "rue" following it, and similarly for "false" and "null". We compare each letter
+ to the letter we expect to see at that position, and complain if they don't match.
+*/
+static int do_true( JSONPushParser* parser, char c ) {
+    int rc = 0;
+    const int status = found_keyword( parser, c, "true", 4 );
+
+    if( -1 == status ) {
+        rc = 1;         // wrong character found (already reported)
+    } else if( 1 == status ) {
+        // The keyword is complete, and c is the character after it.  Report
+        // the value, then have c examined again in the state we return to.
+        if( parser->handlers.handleBool )
+            rc = parser->handlers.handleBool( parser->blob, 1 );
+        parser->again = c;
+        pop_pp_state( parser );
+        check_pp_end( parser );
+    }
+    // Otherwise: so far so good; keep accumulating
+
+    return rc;
+}
+
+/**
+ @brief Accumulate characters of the keyword "false".
+ @param parser Pointer to the current JSONPushParser.
+ @param c The current input character.
+ @return 0 if successful, or 1 upon error.
+
+ See the discussion of do_true().
+*/
+static int do_false( JSONPushParser* parser, char c ) {
+    int rc = 0;
+    const int status = found_keyword( parser, c, "false", 5 );
+
+    if( -1 == status ) {
+        rc = 1;         // wrong character found (already reported)
+    } else if( 1 == status ) {
+        // The keyword is complete, and c is the character after it.  Report
+        // the value, then have c examined again in the state we return to.
+        if( parser->handlers.handleBool )
+            rc = parser->handlers.handleBool( parser->blob, 0 );
+        parser->again = c;
+        pop_pp_state( parser );
+        check_pp_end( parser );
+    }
+    // Otherwise: so far so good; keep accumulating
+
+    return rc;
+}
+
+/**
+ @brief Accumulate characters of the keyword "null".
+ @param parser Pointer to the current JSONPushParser.
+ @param c The current input character.
+ @return 0 if successful, or 1 upon error.
+
+ See the discussion of do_true().
+*/
+static int do_null( JSONPushParser* parser, char c ) {
+    int rc = 0;
+    const int status = found_keyword( parser, c, "null", 4 );
+
+    if( -1 == status ) {
+        rc = 1;         // wrong character found (already reported)
+    } else if( 1 == status ) {
+        // The keyword is complete, and c is the character after it.  Report
+        // the null, then have c examined again in the state we return to.
+        if( parser->handlers.handleNull )
+            rc = parser->handlers.handleNull( parser->blob );
+        parser->again = c;
+        pop_pp_state( parser );
+        check_pp_end( parser );
+    }
+    // Otherwise: so far so good; keep accumulating
+
+    return rc;
+}
+
+/**
+ @brief Accumulate a character for a specified keyword
+ @param parser Pointer to the current JSONPushParser
+ @param c The current input character
+ @param keyword The keyword we're looking for
+ @param maxlen The length of the keyword (obviating strlen())
+ @return 0 If @a c is the correct next letter in the keyword,
+ or 1 if the keyword is finished correctly, or -1 upon error.
+
+ Accumulate successive letters in a specified keyword. We don't actually store the
+ letters anywhere; we just check to make sure they're the letters we expect.
+*/
+static int found_keyword( JSONPushParser* parser, char c,
+        const char* keyword, unsigned maxlen ) {
+    int rc = 0;
+
+    // word_idx was set to zero when the first letter of the keyword was seen, so
+    // after the increment it is the index of the letter that c should match.
+    if( ++parser->word_idx >= maxlen ) {
+        // We have all the characters; now check the one following.  It had better be
+        // either white space or punctuation.
+        if( !isspace( (unsigned char) c ) && !ispunct( (unsigned char) c ) ) {
+            // Name the keyword we were actually parsing, which may be "false" or
+            // "null" rather than "true"
+            report_pp_error( parser, "Unexpected character '%c' after \"%s\" keyword",
+                c, keyword );
+            rc = -1;    // bad character at end of keyword -- e.g. "trueY"
+        } else
+            rc = 1;
+    } else if( keyword[ parser->word_idx ] != c ) {
+        report_pp_error( parser, "Expected '%c' in keyword \"%s\"; found '%c'\n",
+            keyword[ parser->word_idx ], keyword, c );
+        rc = -1;
+    }
+    // Otherwise c is the right letter: so far so good
+
+    return rc;
+}
+
+/**
+ @brief We have reached the end of the JSON string. There should be nothing but white space.
+ @param parser Pointer to the current JSONPushParser.
+ @param c The current input character.
+ @return 0 if successful, or 1 upon error.
+
+*/
+static int do_end( JSONPushParser* parser, char c ) {
+    // Trailing white space is fine; anything else is not
+    if( ! isspace( (unsigned char) c ) ) {
+        report_pp_error( parser,
+            "Expected nothing but white space afer a JSON string; found \'%c\'", c );
+        return 1;
+    }
+
+    return 0;
+}
+
+// -------- End of state handlers --------------------------
+
+/**
+ @brief Push the current parser state onto a stack.
+ @param parser Pointer to the current JSONPushParser.
+ @param state The state to which we will return when we pop it off.
+
+ We use a stack to simulate recursive descent. At every point where a recursive descent
+ parser would descend, we push a state onto the stack, i.e. the state we want to
+ go to when we come back. Where a recursive descent parser would return from the descent,
+ we pop the previously stored state off the stack.
+
+ Note that the state we push is not the current state, but some other state. We simulate
+ a descent in order to parse some JSON value, and after parsing it, we need to be in some
+ other state. So we push that future state onto the stack in advance.
+*/
+static void push_pp_state( JSONPushParser* parser, PPState state ) {
+    // Reuse a node from the free list if there is one; otherwise make a new
+    // one, complete with its own (empty) list of keys.
+    StateNode* fresh = parser->free_states;
+    if( fresh ) {
+        parser->free_states = fresh->next;
+    } else {
+        fresh = safe_malloc( sizeof( StateNode ) );
+        fresh->keylist = osrfNewStringArray( 8 );
+    }
+
+    // Record the state to come back to, and put the node on top of the stack
+    fresh->state = state;
+    fresh->next = parser->state_stack;
+    parser->state_stack = fresh;
+
+    // Park the parser's current key list in the node, leaving the parser with
+    // the node's list for use at the new level of nesting.
+    osrfStringArraySwap( parser->keylist, fresh->keylist );
+}
+
+/**
+ @brief Restore the previous state of the parser.
+ @param parser Pointer to the current JSONPushParser.
+
+ See also push_pp_state().
+*/
+static void pop_pp_state( JSONPushParser* parser ) {
+    StateNode* top = parser->state_stack;
+    if( ! top ) {
+        parser->state = PP_END;     // shouldn't happen
+        return;
+    }
+
+    // Unlink the top node from the stack and hand it to the free list
+    parser->state_stack = top->next;
+    top->next = parser->free_states;
+    parser->free_states = top;
+
+    // Restore the saved state and the saved key list.  After the swap the node
+    // holds the keys from the level we just left; discard those.
+    parser->state = top->state;
+    osrfStringArraySwap( parser->keylist, top->keylist );
+    osrfStringArrayClear( top->keylist );
+}
+
+static void check_pp_end( JSONPushParser* parser ) {
+    // Only the PP_END state marks the completion of a JSON value
+    if( parser->state != PP_END )
+        return;
+
+    // Notify the client code, if it asked to be notified
+    if( parser->handlers.handleEndJSON )
+        parser->handlers.handleEndJSON( parser->blob );
+}
+
+/**
+ @brief Issue an error message from the parser.
+ @param parser Pointer to the parser issuing the message
+ @param msg A printf-style format string. Subsequent parameters, if any, will be
+ expanded and inserted into the output message.
+*/
+static void report_pp_error( JSONPushParser* parser, const char* msg, ... ) {
+ // Presumably expands the format string and its arguments into VA_BUF -- the macro
+ // is defined elsewhere; confirm there.
+ VA_LIST_TO_STRING( msg );
+ // If the client code supplied an error callback, give it the message along with the
+ // parser's current line and position.  Otherwise fall back to the error log.
+ if( parser->handlers.handleError )
+ parser->handlers.handleError( parser->blob, VA_BUF, parser->line, parser->pos );
+ else
+ osrfLogError( OSRF_LOG_MARK, "JSON Error at line %u, position %u: %s",
+ parser->line, parser->pos, VA_BUF );
+}
+
+/**
+ @brief Free a JSONPushParser and everything it owns.
+ @param parser Pointer to the JSONPushParser to be freed.
+*/
+void jsonPushParserFree( JSONPushParser* parser ) {
+    // Freeing a null parser is a no-op
+    if( ! parser )
+        return;
+
+    buffer_free( parser->buf );
+
+    // Popping moves each StateNode from the stack to the free list, so after
+    // this loop every node is on the free list.
+    while( parser->state_stack )
+        pop_pp_state( parser );
+
+    // Now free every node on the free list, along with its key list
+    StateNode* node = parser->free_states;
+    while( node ) {
+        StateNode* following = node->next;
+        osrfStringArrayFree( node->keylist );
+        free( node );
+        node = following;
+    }
+
+    osrfStringArrayFree( parser->keylist );
+    free( parser );
+}
More information about the opensrf-commits
mailing list