Fixing a race condition in the Crash Generation Server which has to

do with simultaneous handling of dump requests and client process
termination events.

http://breakpad.appspot.com/430002/ 



git-svn-id: http://google-breakpad.googlecode.com/svn/trunk@1013 4c0a9323-5329-0410-9bdc-e9ce6186880e
This commit is contained in:
ivan.penkov@gmail.com 2012-08-15 22:09:42 +00:00
parent f8c1de5643
commit d7de392b05
4 changed files with 184 additions and 152 deletions

View file

@ -85,16 +85,38 @@ bool ClientInfo::Initialize() {
return dump_generated_handle_ != NULL; return dump_generated_handle_ != NULL;
} }
ClientInfo::~ClientInfo() { void ClientInfo::UnregisterDumpRequestWaitAndBlockUntilNoPending() {
if (dump_request_wait_handle_) { if (dump_request_wait_handle_) {
// Wait for callbacks that might already be running to finish. // Wait for callbacks that might already be running to finish.
UnregisterWaitEx(dump_request_wait_handle_, INVALID_HANDLE_VALUE); UnregisterWaitEx(dump_request_wait_handle_, INVALID_HANDLE_VALUE);
dump_request_wait_handle_ = NULL;
} }
}
void ClientInfo::UnregisterProcessExitWait(bool block_until_no_pending) {
if (process_exit_wait_handle_) { if (process_exit_wait_handle_) {
// Wait for the callback that might already be running to finish. if (block_until_no_pending) {
UnregisterWaitEx(process_exit_wait_handle_, INVALID_HANDLE_VALUE); // Wait for the callback that might already be running to finish.
UnregisterWaitEx(process_exit_wait_handle_, INVALID_HANDLE_VALUE);
} else {
UnregisterWait(process_exit_wait_handle_);
}
process_exit_wait_handle_ = NULL;
} }
}
ClientInfo::~ClientInfo() {
// Waiting for the callback to finish here is safe because ClientInfo's are
// never destroyed from the dump request handling callback.
UnregisterDumpRequestWaitAndBlockUntilNoPending();
// This is a little tricky because ClientInfo's may be destroyed by the same
// callback (OnClientEnd) and waiting for it to finish will cause a deadlock.
// Regardless of this complication, wait for any running callbacks to finish
// so that the common case is properly handled. In order to avoid deadlocks,
// the OnClientEnd callback must call UnregisterProcessExitWait(false)
// before deleting the ClientInfo.
UnregisterProcessExitWait(true);
if (process_handle_) { if (process_handle_) {
CloseHandle(process_handle_); CloseHandle(process_handle_);
@ -109,18 +131,6 @@ ClientInfo::~ClientInfo() {
} }
} }
void ClientInfo::UnregisterWaits() {
if (dump_request_wait_handle_) {
UnregisterWait(dump_request_wait_handle_);
dump_request_wait_handle_ = NULL;
}
if (process_exit_wait_handle_) {
UnregisterWait(process_exit_wait_handle_);
process_exit_wait_handle_ = NULL;
}
}
bool ClientInfo::GetClientExceptionInfo(EXCEPTION_POINTERS** ex_info) const { bool ClientInfo::GetClientExceptionInfo(EXCEPTION_POINTERS** ex_info) const {
SIZE_T bytes_count = 0; SIZE_T bytes_count = 0;
if (!ReadProcessMemory(process_handle_, if (!ReadProcessMemory(process_handle_,

View file

@ -67,24 +67,22 @@ class ClientInfo {
HANDLE dump_generated_handle() const { return dump_generated_handle_; } HANDLE dump_generated_handle() const { return dump_generated_handle_; }
DWORD crash_id() const { return crash_id_; } DWORD crash_id() const { return crash_id_; }
HANDLE dump_request_wait_handle() const {
return dump_request_wait_handle_;
}
void set_dump_request_wait_handle(HANDLE value) { void set_dump_request_wait_handle(HANDLE value) {
dump_request_wait_handle_ = value; dump_request_wait_handle_ = value;
} }
HANDLE process_exit_wait_handle() const {
return process_exit_wait_handle_;
}
void set_process_exit_wait_handle(HANDLE value) { void set_process_exit_wait_handle(HANDLE value) {
process_exit_wait_handle_ = value; process_exit_wait_handle_ = value;
} }
// Unregister all waits for the client. // Unregister the dump request wait operation and wait for all callbacks
void UnregisterWaits(); // that might already be running to complete before returning.
void UnregisterDumpRequestWaitAndBlockUntilNoPending();
// Unregister the process exit wait operation. If block_until_no_pending is
// true, wait for all callbacks that might already be running to complete
// before returning.
void UnregisterProcessExitWait(bool block_until_no_pending);
bool Initialize(); bool Initialize();
bool GetClientExceptionInfo(EXCEPTION_POINTERS** ex_info) const; bool GetClientExceptionInfo(EXCEPTION_POINTERS** ex_info) const;

View file

@ -76,13 +76,6 @@ static const ULONG kPipeIOThreadFlags = WT_EXECUTEINWAITTHREAD;
static const ULONG kDumpRequestThreadFlags = WT_EXECUTEINWAITTHREAD | static const ULONG kDumpRequestThreadFlags = WT_EXECUTEINWAITTHREAD |
WT_EXECUTELONGFUNCTION; WT_EXECUTELONGFUNCTION;
// Maximum delay during server shutdown if some work items
// are still executing.
static const int kShutdownDelayMs = 10000;
// Interval for each sleep during server shutdown.
static const int kShutdownSleepIntervalMs = 5;
static bool IsClientRequestValid(const ProtocolMessage& msg) { static bool IsClientRequestValid(const ProtocolMessage& msg) {
return msg.tag == MESSAGE_TAG_UPLOAD_REQUEST || return msg.tag == MESSAGE_TAG_UPLOAD_REQUEST ||
(msg.tag == MESSAGE_TAG_REGISTRATION_REQUEST && (msg.tag == MESSAGE_TAG_REGISTRATION_REQUEST &&
@ -123,8 +116,7 @@ CrashGenerationServer::CrashGenerationServer(
server_state_(IPC_SERVER_STATE_UNINITIALIZED), server_state_(IPC_SERVER_STATE_UNINITIALIZED),
shutting_down_(false), shutting_down_(false),
overlapped_(), overlapped_(),
client_info_(NULL), client_info_(NULL) {
cleanup_item_count_(0) {
InitializeCriticalSection(&sync_); InitializeCriticalSection(&sync_);
if (dump_path) { if (dump_path) {
@ -132,91 +124,85 @@ CrashGenerationServer::CrashGenerationServer(
} }
} }
// This should never be called from the OnPipeConnected callback.
// Otherwise the UnregisterWaitEx call below will cause a deadlock.
CrashGenerationServer::~CrashGenerationServer() { CrashGenerationServer::~CrashGenerationServer() {
// New scope to release the lock automatically. // New scope to release the lock automatically.
{ {
// Make sure no clients are added or removed beyond this point.
// Before adding or removing any clients, the critical section
// must be entered and the shutting_down_ flag checked. The
// critical section is then exited only after the clients_ list
// modifications are done and the list is in a consistent state.
AutoCriticalSection lock(&sync_); AutoCriticalSection lock(&sync_);
// Indicate to existing threads that server is shutting down. // Indicate to existing threads that server is shutting down.
shutting_down_ = true; shutting_down_ = true;
}
// No one will modify the clients_ list beyond this point -
// not even from another thread.
// Even if there are no current worker threads running, it is possible that // Even if there are no current worker threads running, it is possible that
// an I/O request is pending on the pipe right now but not yet done. // an I/O request is pending on the pipe right now but not yet done.
// In fact, it's very likely this is the case unless we are in an ERROR // In fact, it's very likely this is the case unless we are in an ERROR
// state. If we don't wait for the pending I/O to be done, then when the I/O // state. If we don't wait for the pending I/O to be done, then when the I/O
// completes, it may write to invalid memory. AppVerifier will flag this // completes, it may write to invalid memory. AppVerifier will flag this
// problem too. So we disconnect from the pipe and then wait for the server // problem too. So we disconnect from the pipe and then wait for the server
// to get into error state so that the pending I/O will fail and get // to get into error state so that the pending I/O will fail and get
// cleared. // cleared.
DisconnectNamedPipe(pipe_); DisconnectNamedPipe(pipe_);
int num_tries = 100; int num_tries = 100;
while (num_tries-- && server_state_ != IPC_SERVER_STATE_ERROR) { while (num_tries-- && server_state_ != IPC_SERVER_STATE_ERROR) {
lock.Release(); Sleep(10);
Sleep(10);
lock.Acquire();
}
// Unregister wait on the pipe.
if (pipe_wait_handle_) {
// Wait for already executing callbacks to finish.
UnregisterWaitEx(pipe_wait_handle_, INVALID_HANDLE_VALUE);
}
// Close the pipe to avoid further client connections.
if (pipe_) {
CloseHandle(pipe_);
}
// Request all ClientInfo objects to unregister all waits.
std::list<ClientInfo*>::iterator iter;
for (iter = clients_.begin(); iter != clients_.end(); ++iter) {
ClientInfo* client_info = *iter;
client_info->UnregisterWaits();
}
} }
// Now that all waits have been unregistered, wait for some time // Unregister wait on the pipe.
// for all pending work items to finish. if (pipe_wait_handle_) {
int total_wait = 0; // Wait for already executing callbacks to finish.
while (cleanup_item_count_ > 0) { UnregisterWaitEx(pipe_wait_handle_, INVALID_HANDLE_VALUE);
Sleep(kShutdownSleepIntervalMs);
total_wait += kShutdownSleepIntervalMs;
if (total_wait >= kShutdownDelayMs) {
break;
}
} }
// New scope to hold the lock for the shortest time. // Close the pipe to avoid further client connections.
{ if (pipe_) {
AutoCriticalSection lock(&sync_); CloseHandle(pipe_);
// Clean up all the ClientInfo objects.
std::list<ClientInfo*>::iterator iter;
for (iter = clients_.begin(); iter != clients_.end(); ++iter) {
ClientInfo* client_info = *iter;
delete client_info;
}
if (server_alive_handle_) {
// Release the mutex before closing the handle so that clients requesting
// dumps wait for a long time for the server to generate a dump.
ReleaseMutex(server_alive_handle_);
CloseHandle(server_alive_handle_);
}
if (overlapped_.hEvent) {
CloseHandle(overlapped_.hEvent);
}
} }
// Request all ClientInfo objects to unregister all waits.
// No need to enter the critical section because no one is allowed to modify
// the clients_ list once the shutting_down_ flag is set.
std::list<ClientInfo*>::iterator iter;
for (iter = clients_.begin(); iter != clients_.end(); ++iter) {
ClientInfo* client_info = *iter;
// Unregister waits. Wait for already executing callbacks to finish.
// Unregister the client process exit wait first and only then unregister
// the dump request wait. The reason is that the OnClientExit callback
// also unregisters the dump request wait and such a race (doing the same
// unregistration from two threads) is undesirable.
client_info->UnregisterProcessExitWait(true);
client_info->UnregisterDumpRequestWaitAndBlockUntilNoPending();
// Destroying the ClientInfo here is safe because all wait operations for
// this ClientInfo were unregistered and no pending or running callbacks
// for this ClientInfo can possible exist (block_until_no_pending option
// was used).
delete client_info;
}
if (server_alive_handle_) {
// Release the mutex before closing the handle so that clients requesting
// dumps wait for a long time for the server to generate a dump.
ReleaseMutex(server_alive_handle_);
CloseHandle(server_alive_handle_);
}
if (overlapped_.hEvent) {
CloseHandle(overlapped_.hEvent);
}
DeleteCriticalSection(&sync_); DeleteCriticalSection(&sync_);
} }
bool CrashGenerationServer::Start() { bool CrashGenerationServer::Start() {
AutoCriticalSection lock(&sync_);
if (server_state_ != IPC_SERVER_STATE_UNINITIALIZED) { if (server_state_ != IPC_SERVER_STATE_UNINITIALIZED) {
return false; return false;
} }
@ -455,6 +441,7 @@ void CrashGenerationServer::HandleReadDoneState() {
return; return;
} }
// This is only valid as long as it can be found in the clients_ list
client_info_ = client_info.release(); client_info_ = client_info.release();
// Note that the asynchronous write issued by RespondToClient function // Note that the asynchronous write issued by RespondToClient function
@ -532,7 +519,32 @@ void CrashGenerationServer::HandleReadingAckState() {
// The connection handshake with the client is now complete; perform // The connection handshake with the client is now complete; perform
// the callback. // the callback.
if (connect_callback_) { if (connect_callback_) {
connect_callback_(connect_context_, client_info_); // Note that there is only a single copy of the ClientInfo of the
// currently connected client. However it is being referenced from
// two different places:
// - the client_info_ member
// - the clients_ list
// The lifetime of this ClientInfo depends on the lifetime of the
// client process - basically it can go away at any time.
// However, as long as it is referenced by the clients_ list it
// is guaranteed to be valid. Enter the critical section and check
// to see whether the client_info_ can be found in the list.
// If found, execute the callback and only then leave the critical
// section.
AutoCriticalSection lock(&sync_);
bool client_is_still_alive = false;
std::list<ClientInfo*>::iterator iter;
for (iter = clients_.begin(); iter != clients_.end(); ++iter) {
if (client_info_ == *iter) {
client_is_still_alive = true;
break;
}
}
if (client_is_still_alive) {
connect_callback_(connect_context_, client_info_);
}
} }
} else { } else {
// We should never get an I/O incomplete since we should not execute this // We should never get an I/O incomplete since we should not execute this
@ -605,16 +617,39 @@ bool CrashGenerationServer::PrepareReply(const ClientInfo& client_info,
return true; return true;
} }
// Closing of remote handles (belonging to a different process) can
// only be done through DuplicateHandle.
if (reply->dump_request_handle) { if (reply->dump_request_handle) {
CloseHandle(reply->dump_request_handle); DuplicateHandle(client_info.process_handle(), // hSourceProcessHandle
reply->dump_request_handle, // hSourceHandle
NULL, // hTargetProcessHandle
0, // lpTargetHandle
0, // dwDesiredAccess
FALSE, // bInheritHandle
DUPLICATE_CLOSE_SOURCE); // dwOptions
reply->dump_request_handle = NULL;
} }
if (reply->dump_generated_handle) { if (reply->dump_generated_handle) {
CloseHandle(reply->dump_generated_handle); DuplicateHandle(client_info.process_handle(), // hSourceProcessHandle
reply->dump_generated_handle, // hSourceHandle
NULL, // hTargetProcessHandle
0, // lpTargetHandle
0, // dwDesiredAccess
FALSE, // bInheritHandle
DUPLICATE_CLOSE_SOURCE); // dwOptions
reply->dump_generated_handle = NULL;
} }
if (reply->server_alive_handle) { if (reply->server_alive_handle) {
CloseHandle(reply->server_alive_handle); DuplicateHandle(client_info.process_handle(), // hSourceProcessHandle
reply->server_alive_handle, // hSourceHandle
NULL, // hTargetProcessHandle
0, // lpTargetHandle
0, // dwDesiredAccess
FALSE, // bInheritHandle
DUPLICATE_CLOSE_SOURCE); // dwOptions
reply->server_alive_handle = NULL;
} }
return false; return false;
@ -676,21 +711,15 @@ bool CrashGenerationServer::RespondToClient(ClientInfo* client_info) {
// Takes over ownership of client_info. We MUST return true if AddClient // Takes over ownership of client_info. We MUST return true if AddClient
// succeeds. // succeeds.
if (!AddClient(client_info)) { return AddClient(client_info);
return false;
}
return true;
} }
// The server thread servicing the clients runs this method. The method // The server thread servicing the clients runs this method. The method
// implements the state machine described in ReadMe.txt along with the // implements the state machine described in ReadMe.txt along with the
// helper methods HandleXXXState. // helper methods HandleXXXState.
void CrashGenerationServer::HandleConnectionRequest() { void CrashGenerationServer::HandleConnectionRequest() {
AutoCriticalSection lock(&sync_); // If the server is shutting down, get into ERROR state, reset the event so
// more workers don't run and return immediately.
// If we are shutting doen then get into ERROR state, reset the event so more
// workers don't run and return immediately.
if (shutting_down_) { if (shutting_down_) {
server_state_ = IPC_SERVER_STATE_ERROR; server_state_ = IPC_SERVER_STATE_ERROR;
ResetEvent(overlapped_.hEvent); ResetEvent(overlapped_.hEvent);
@ -776,6 +805,10 @@ bool CrashGenerationServer::AddClient(ClientInfo* client_info) {
// New scope to hold the lock for the shortest time. // New scope to hold the lock for the shortest time.
{ {
AutoCriticalSection lock(&sync_); AutoCriticalSection lock(&sync_);
if (shutting_down_) {
// If server is shutting down, don't add new clients
return false;
}
clients_.push_back(client_info); clients_.push_back(client_info);
} }
@ -812,56 +845,55 @@ void CALLBACK CrashGenerationServer::OnClientEnd(void* context, BOOLEAN) {
CrashGenerationServer* crash_server = client_info->crash_server(); CrashGenerationServer* crash_server = client_info->crash_server();
assert(crash_server); assert(crash_server);
client_info->UnregisterWaits(); crash_server->HandleClientProcessExit(client_info);
InterlockedIncrement(&crash_server->cleanup_item_count_);
if (!QueueUserWorkItem(CleanupClient, context, WT_EXECUTEDEFAULT)) {
InterlockedDecrement(&crash_server->cleanup_item_count_);
}
} }
// static void CrashGenerationServer::HandleClientProcessExit(ClientInfo* client_info) {
DWORD WINAPI CrashGenerationServer::CleanupClient(void* context) {
assert(context);
ClientInfo* client_info = reinterpret_cast<ClientInfo*>(context);
CrashGenerationServer* crash_server = client_info->crash_server();
assert(crash_server);
if (crash_server->exit_callback_) {
crash_server->exit_callback_(crash_server->exit_context_, client_info);
}
crash_server->DoCleanup(client_info);
InterlockedDecrement(&crash_server->cleanup_item_count_);
return 0;
}
void CrashGenerationServer::DoCleanup(ClientInfo* client_info) {
assert(client_info); assert(client_info);
// Must unregister the dump request wait operation and wait for any
// dump requests that might be pending to finish before proceeding
// with the client_info cleanup.
client_info->UnregisterDumpRequestWaitAndBlockUntilNoPending();
if (exit_callback_) {
exit_callback_(exit_context_, client_info);
}
// Start a new scope to release lock automatically. // Start a new scope to release lock automatically.
{ {
AutoCriticalSection lock(&sync_); AutoCriticalSection lock(&sync_);
if (shutting_down_) {
// The crash generation server is shutting down and as part of the
// shutdown process it will delete all clients from the clients_ list.
return;
}
clients_.remove(client_info); clients_.remove(client_info);
} }
// Explicitly unregister the process exit wait using the non-blocking method.
// Otherwise, the destructor will attempt to unregister it using the blocking
// method which will lead to a deadlock because it is being called from the
// callback of the same wait operation
client_info->UnregisterProcessExitWait(false);
delete client_info; delete client_info;
} }
void CrashGenerationServer::HandleDumpRequest(const ClientInfo& client_info) { void CrashGenerationServer::HandleDumpRequest(const ClientInfo& client_info) {
bool execute_callback = true;
// Generate the dump only if it's explicitly requested by the // Generate the dump only if it's explicitly requested by the
// server application; otherwise the server might want to generate // server application; otherwise the server might want to generate
// dump in the callback. // dump in the callback.
std::wstring dump_path; std::wstring dump_path;
if (generate_dumps_) { if (generate_dumps_) {
if (!GenerateDump(client_info, &dump_path)) { if (!GenerateDump(client_info, &dump_path)) {
return; // client proccess terminated or some other error
execute_callback = false;
} }
} }
if (dump_callback_) { if (dump_callback_ && execute_callback) {
std::wstring* ptr_dump_path = (dump_path == L"") ? NULL : &dump_path; std::wstring* ptr_dump_path = (dump_path == L"") ? NULL : &dump_path;
dump_callback_(dump_context_, &client_info, ptr_dump_path); dump_callback_(dump_context_, &client_info, ptr_dump_path);
} }

View file

@ -189,11 +189,8 @@ class CrashGenerationServer {
// Callback for client process exit event. // Callback for client process exit event.
static void CALLBACK OnClientEnd(void* context, BOOLEAN timer_or_wait); static void CALLBACK OnClientEnd(void* context, BOOLEAN timer_or_wait);
// Releases resources for a client. // Handles client process exit.
static DWORD WINAPI CleanupClient(void* context); void HandleClientProcessExit(ClientInfo* client_info);
// Cleans up for the given client.
void DoCleanup(ClientInfo* client_info);
// Adds the given client to the list of registered clients. // Adds the given client to the list of registered clients.
bool AddClient(ClientInfo* client_info); bool AddClient(ClientInfo* client_info);
@ -216,8 +213,7 @@ class CrashGenerationServer {
// asynchronous IO operation. // asynchronous IO operation.
void EnterStateWhenSignaled(IPCServerState state); void EnterStateWhenSignaled(IPCServerState state);
// Sync object for thread-safe access to the shared list of clients and // Sync object for thread-safe access to the shared list of clients.
// the server's state.
CRITICAL_SECTION sync_; CRITICAL_SECTION sync_;
// List of clients. // List of clients.
@ -286,10 +282,6 @@ class CrashGenerationServer {
// Client Info for the client that's connecting to the server. // Client Info for the client that's connecting to the server.
ClientInfo* client_info_; ClientInfo* client_info_;
// Count of clean-up work items that are currently running or are
// already queued to run.
volatile LONG cleanup_item_count_;
// Disable copy ctor and operator=. // Disable copy ctor and operator=.
CrashGenerationServer(const CrashGenerationServer& crash_server); CrashGenerationServer(const CrashGenerationServer& crash_server);
CrashGenerationServer& operator=(const CrashGenerationServer& crash_server); CrashGenerationServer& operator=(const CrashGenerationServer& crash_server);