[Opensrf-commits] r1891 - in trunk: include/opensrf src/libopensrf (scottmk)

svn at svn.open-ils.org svn at svn.open-ils.org
Sat Jan 16 14:57:20 EST 2010


Author: scottmk
Date: 2010-01-16 14:57:15 -0500 (Sat, 16 Jan 2010)
New Revision: 1891

Modified:
   trunk/include/opensrf/osrf_system.h
   trunk/src/libopensrf/osrf_system.c
Log:
Refactored the signal handling so that we shut down in an orderly
fashion instead of calling _exit() from inside a signal handler.

M    include/opensrf/osrf_system.h
M    src/libopensrf/osrf_system.c


Modified: trunk/include/opensrf/osrf_system.h
===================================================================
--- trunk/include/opensrf/osrf_system.h	2010-01-15 04:52:00 UTC (rev 1890)
+++ trunk/include/opensrf/osrf_system.h	2010-01-16 19:57:15 UTC (rev 1891)
@@ -1,3 +1,8 @@
+/**
+	@file osrf_system.h
+	@brief Header for various top-level system routines.
+*/
+
 #ifndef OSRF_SYSTEM_H
 #define OSRF_SYSTEM_H
 
@@ -12,51 +17,22 @@
 extern "C" {
 #endif
 
-/** Connects to jabber.  Returns 1 on success, 0 on failure 
-	contextnode is the location in the config file where we collect config info
-*/
-
-
 int osrf_system_bootstrap_client( char* config_file, char* contextnode );
 
-/* bootstraps a client adding the given resource string to the host/pid, etc. resource string */
-/**
-  Sets up the global connection.
-  @param configFile The OpenSRF bootstrap config file
-  @param contextNode The location in the config file where we'll find the necessary info
-  @param resource The login resource.  If NULL a default will be created
-  @return 1 on successs, 0 on failure.
-  */
-int osrfSystemBootstrapClientResc( const char* configFile,
-		const char* contextNode, const char* resource );
+int osrfSystemBootstrapClientResc( const char* config_file,
+		const char* contextnode, const char* resource );
 
-/**
-  Bootstrap the server.
-  @param hostname The name of this host.  This is the name that will be used to 
-	load the settings.
-  @param configfile The OpenSRF bootstrap config file
-  @param contextnode The config context
-  @return 0 on success, -1 on error
-  */
-int osrfSystemBootstrap( const char* hostName, const char* configfile,
+int osrfSystemBootstrap( const char* hostname, const char* configfile,
 		const char* contextNode );
 
 transport_client* osrfSystemGetTransportClient( void );
 
-/* disconnects and destroys the current client connection */
 int osrf_system_disconnect_client();
-int osrf_system_shutdown( void ); 
 
+int osrf_system_shutdown( void );
 
-/* this will clear the global transport client pointer without
- * actually destroying the socket.  this is useful for allowing
- * children to have their own socket, even though their parent
- * already created a socket
- */
 void osrfSystemIgnoreTransportClient();
 
-
-/** Initialize the cache connection */
 int osrfSystemInitCache(void);
 
 #ifdef __cplusplus

Modified: trunk/src/libopensrf/osrf_system.c
===================================================================
--- trunk/src/libopensrf/osrf_system.c	2010-01-15 04:52:00 UTC (rev 1890)
+++ trunk/src/libopensrf/osrf_system.c	2010-01-16 19:57:15 UTC (rev 1891)
@@ -1,3 +1,8 @@
+/**
+	@file osrf_system.c
+	@brief Launch a collection of servers.
+*/
+
 #include <sys/types.h>
 #include <sys/time.h>
 #include <unistd.h>
@@ -22,18 +27,6 @@
 struct child_node;
 typedef struct child_node ChildNode;
 
-static void handleKillSignal(int signo) {
-	/* we are the top-level process and we've been
-		killed. Kill all of our children */
-	kill(0, SIGTERM);
-	sleep(1); /* give the children a chance to die before we reap them */
-	pid_t child_pid;
-	int status;
-	while( (child_pid=waitpid(-1,&status,WNOHANG)) > 0)
-		osrfLogInfo(OSRF_LOG_MARK, "Killed child %d", child_pid);
-	_exit(0);
-}
-
 /**
 	@brief Represents a child process.
 */
@@ -52,12 +45,69 @@
 /** Pointer to the global transport_client; i.e. our connection to Jabber. */
 static transport_client* osrfGlobalTransportClient = NULL;
 
+/** Switch to be set by signal handler */
+static volatile sig_atomic_t sig_caught;
+
+/** Boolean: set to true when we finish shutting down. */
+static int shutdownComplete = 0;
+
 static void add_child( pid_t pid, const char* app, const char* libfile );
 static void delete_child( ChildNode* node );
 static void delete_all_children( void );
 static ChildNode* seek_child( pid_t pid );
 
 /**
+	@brief Wait on all dead child processes so that they won't be zombies.
+*/
+static void reap_children( void ) {
+	if( sig_caught ) {
+		if( SIGTERM == sig_caught || SIGINT == sig_caught ) {
+			osrfLogInfo( OSRF_LOG_MARK, "Killed by %s; terminating",
+					SIGTERM == sig_caught ? "SIGTERM" : "SIGINT" );
+		} else
+			osrfLogInfo( OSRF_LOG_MARK, "Killed by signal %d; terminating", (int) sig_caught );
+	}
+
+	// If we caught a signal, then the signal handler already did a kill().
+	// If we didn't, then do the kill() now.
+	if( ! sig_caught )
+		kill( 0, SIGTERM );
+
+	sleep(1); /* Give the children a chance to die before we reap them. */
+
+	// Wait for each dead child.  The WNOHANG option means to return immediately if
+	// there are no dead children, instead of waiting for them to die.  It is therefore
+	// possible for a child still to be alive when we exit this function, either because
+	// it intercepted the SIGTERM and ignored it, or because it took longer to die than
+	// the time we gave it.
+	pid_t child_pid;
+	while( (child_pid = waitpid(-1, NULL, WNOHANG)) > 0 )
+		osrfLogInfo(OSRF_LOG_MARK, "Killed child %d", child_pid);
+
+	// Remove all nodes from the list of child processes.
+	delete_all_children();
+}
+
+/**
+	@brief Signal handler for SIGTERM and SIGINT.
+
+	Kill all child processes, and set a switch so that we'll know that the signal arrived.
+*/
+static void handleKillSignal( int signo ) {
+	// First ignore SIGTERM.  Otherwise we would send SIGTERM to ourself, intercept it,
+	// and kill() again in an endless loop.
+	signal( SIGTERM, SIG_IGN );
+
+	//Kill all child processes.  This is safe to do in a signal handler, because POSIX
+	// specifies that kill() is reentrant.  It is necessary because, if we did the kill()
+	// only in reap_children() (above), then there would be a narrow window of vulnerability
+	// in the main loop: if the signal arrives between checking sig_caught and calling wait(),
+	// we would wait indefinitely for a child to die on its own.
+	kill( 0, SIGTERM );
+	sig_caught = signo;
+}
+
+/**
 	@brief Return a pointer to the global transport_client.
 	@return Pointer to the global transport_client, or NULL.
 
@@ -65,7 +115,7 @@
 	file scope.  This function returns that pointer.
 
 	If the connection has been opened by a previous call to osrfSystemBootstrapClientResc(),
-	Return the pointer.  Otherwise return NULL.
+	return the pointer.  Otherwise return NULL.
 */
 transport_client* osrfSystemGetTransportClient( void ) {
 	return osrfGlobalTransportClient;
@@ -82,10 +132,27 @@
 	osrfGlobalTransportClient = NULL;
 }
 
+/**
+	@brief Bootstrap a generic application from info in the configuration file.
+	@param config_file Name of the configuration file.
+	@param contextnode Name of an aggregate within the configuration file, containing the
+	relevant subset of configuration stuff.
+	@return 1 if successful; zero or -1 if error.
+
+	- Load the configuration file.
+	- Open the log.
+	- Open a connection to Jabber.
+
+	A thin wrapper for osrfSystemBootstrapClientResc, passing it NULL for a resource.
+*/
 int osrf_system_bootstrap_client( char* config_file, char* contextnode ) {
 	return osrfSystemBootstrapClientResc(config_file, contextnode, NULL);
 }
 
+/**
+	@brief Connect to one or more cache servers.
+	@return Zero in all cases.
+*/
 int osrfSystemInitCache( void ) {
 
 	jsonObject* cacheServers = osrf_settings_host_value_object("/cache/global/servers/server");
@@ -117,7 +184,6 @@
 	return 0;
 }
 
-
 /**
 	@brief Launch a collection of servers, as defined by the settings server.
 	@param hostname Full network name of the host where the process is running; or
@@ -140,6 +206,8 @@
 		return -1;
 	}
 
+	shutdownComplete = 0;
+
 	// Get a list of applications to launch from the settings server
 	int retcode = osrf_settings_retrieve(hostname);
 	osrf_system_disconnect_client();
@@ -209,35 +277,46 @@
 					exit(0);
 				}
 			} // language == c
-		}
-	} // should we do something if there are no apps? does the wait(NULL) below do that for us?
+		} // end while
+	}
 
 	osrfStringArrayFree(arr);
 
 	signal(SIGTERM, handleKillSignal);
 	signal(SIGINT, handleKillSignal);
 
-	while(1) {
-		errno = 0;
-		int status;
-		pid_t pid = wait( &status );
-		if(-1 == pid) {
-			if(errno == ECHILD)
-				osrfLogError(OSRF_LOG_MARK, "We have no more live services... exiting");
-			else
+	// Wait indefinitely for all the child processes to terminate, or for a signal to
+	// tell us to stop.  When there are no more child processes, wait() returns an
+	// ECHILD error and we break out of the loop.
+	int status;
+	pid_t pid;
+	while( ! sig_caught ) {
+		pid = wait( &status );
+		if( -1 == pid ) {
+			if( errno == ECHILD )
+				osrfLogError( OSRF_LOG_MARK, "We have no more live services... exiting" );
+			else if( errno != EINTR )
 				osrfLogError(OSRF_LOG_MARK, "Exiting top-level system loop with error: %s",
-						strerror(errno));
+						strerror( errno ) );
+
 			break;
 		} else {
 			report_child_status( pid, status );
 		}
 	}
 
-	delete_all_children();
+	reap_children();
+	osrfConfigCleanup();
+	osrf_system_disconnect_client();
+	osrf_settings_free_host_config(NULL);
 	return 0;
 }
 
-
+/**
+	@brief Report the exit status of a dead child process, then remove it from the list.
+	@param pid Process ID of the child.
+	@param status Exit status as captured by wait().
+*/
 static void report_child_status( pid_t pid, int status )
 {
 	const char* app;
@@ -248,7 +327,7 @@
 		app     = node->app     ? node->app     : "[unknown]";
 		libfile = node->libfile ? node->libfile : "[none]";
 	} else
-		app = libfile = NULL;
+		app = libfile = "";
 
 	if( WIFEXITED( status ) )
 	{
@@ -276,6 +355,12 @@
 
 /*----------- Routines to manage list of children --*/
 
+/**
+	@brief Add a node to the list of child processes.
+	@param pid Process ID of the child process.
+	@param app Name of the child application.
+	@param libfile Name of the shared library where the child process resides.
+*/
 static void add_child( pid_t pid, const char* app, const char* libfile )
 {
 	/* Construct new child node */
@@ -305,6 +390,10 @@
 	child_list = node;
 }
 
+/**
+	@brief Remove a node from the list of child processes.
+	@param node Pointer to the node to be removed.
+*/
 static void delete_child( ChildNode* node ) {
 
 	/* Sanity check */
@@ -329,12 +418,20 @@
 	free( node );
 }
 
+/**
+	@brief Remove all nodes from the list of child processes, rendering it empty.
+*/
 static void delete_all_children( void ) {
 
 	while( child_list )
 		delete_child( child_list );
 }
 
+/**
+	@brief Find the node for a child process of a given process ID.
+	@param pid The process ID of the child process.
+	@return A pointer to the corresponding node if found; otherwise NULL.
+*/
 static ChildNode* seek_child( pid_t pid ) {
 
 	/* Return a pointer to the child node for the */
@@ -356,7 +453,7 @@
 /**
 	@brief Bootstrap a generic application from info in the configuration file.
 	@param config_file Name of the configuration file.
-	@param context_node Name of an aggregate within the configuration file, containing the
+	@param contextnode Name of an aggregate within the configuration file, containing the
 	relevant subset of configuration stuff.
 	@param resource Used to construct a Jabber resource name; may be NULL.
 	@return 1 if successful; zero or -1 if error.
@@ -523,15 +620,28 @@
 	return 0;
 }
 
-static int shutdownComplete = 0;
+/**
+	@brief Shut down a laundry list of facilities typically used by servers.
+
+	Things to shut down:
+	- Settings from configuration file
+	- Cache
+	- Connection to Jabber
+	- Settings from settings server
+	- Application sessions
+	- Logs
+*/
 int osrf_system_shutdown( void ) {
-	if(shutdownComplete) return 0;
-	osrfConfigCleanup();
-	osrfCacheCleanup();
-	osrf_system_disconnect_client();
-	osrf_settings_free_host_config(NULL);
-	osrfAppSessionCleanup();
-	osrfLogCleanup();
-	shutdownComplete = 1;
-	return 1;
+	if(shutdownComplete)
+		return 0;
+	else {
+		osrfConfigCleanup();
+		osrfCacheCleanup();
+		osrf_system_disconnect_client();
+		osrf_settings_free_host_config(NULL);
+		osrfAppSessionCleanup();
+		osrfLogCleanup();
+		shutdownComplete = 1;
+		return 1;
+	}
 }



More information about the opensrf-commits mailing list