Fixing a race condition in the Crash Generation Server which has to
do with simultaneous handling of dump requests and client process termination events. http://breakpad.appspot.com/430002/ git-svn-id: http://google-breakpad.googlecode.com/svn/trunk@1013 4c0a9323-5329-0410-9bdc-e9ce6186880e
This commit is contained in:
parent
f8c1de5643
commit
d7de392b05
4 changed files with 184 additions and 152 deletions
|
@ -85,16 +85,38 @@ bool ClientInfo::Initialize() {
|
|||
return dump_generated_handle_ != NULL;
|
||||
}
|
||||
|
||||
ClientInfo::~ClientInfo() {
|
||||
void ClientInfo::UnregisterDumpRequestWaitAndBlockUntilNoPending() {
|
||||
if (dump_request_wait_handle_) {
|
||||
// Wait for callbacks that might already be running to finish.
|
||||
UnregisterWaitEx(dump_request_wait_handle_, INVALID_HANDLE_VALUE);
|
||||
dump_request_wait_handle_ = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
void ClientInfo::UnregisterProcessExitWait(bool block_until_no_pending) {
|
||||
if (process_exit_wait_handle_) {
|
||||
// Wait for the callback that might already be running to finish.
|
||||
UnregisterWaitEx(process_exit_wait_handle_, INVALID_HANDLE_VALUE);
|
||||
if (block_until_no_pending) {
|
||||
// Wait for the callback that might already be running to finish.
|
||||
UnregisterWaitEx(process_exit_wait_handle_, INVALID_HANDLE_VALUE);
|
||||
} else {
|
||||
UnregisterWait(process_exit_wait_handle_);
|
||||
}
|
||||
process_exit_wait_handle_ = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
ClientInfo::~ClientInfo() {
|
||||
// Waiting for the callback to finish here is safe because ClientInfo's are
|
||||
// never destroyed from the dump request handling callback.
|
||||
UnregisterDumpRequestWaitAndBlockUntilNoPending();
|
||||
|
||||
// This is a little tricky because ClientInfo's may be destroyed by the same
|
||||
// callback (OnClientEnd) and waiting for it to finish will cause a deadlock.
|
||||
// Regardless of this complication, wait for any running callbacks to finish
|
||||
// so that the common case is properly handled. In order to avoid deadlocks,
|
||||
// the OnClientEnd callback must call UnregisterProcessExitWait(false)
|
||||
// before deleting the ClientInfo.
|
||||
UnregisterProcessExitWait(true);
|
||||
|
||||
if (process_handle_) {
|
||||
CloseHandle(process_handle_);
|
||||
|
@ -109,18 +131,6 @@ ClientInfo::~ClientInfo() {
|
|||
}
|
||||
}
|
||||
|
||||
void ClientInfo::UnregisterWaits() {
|
||||
if (dump_request_wait_handle_) {
|
||||
UnregisterWait(dump_request_wait_handle_);
|
||||
dump_request_wait_handle_ = NULL;
|
||||
}
|
||||
|
||||
if (process_exit_wait_handle_) {
|
||||
UnregisterWait(process_exit_wait_handle_);
|
||||
process_exit_wait_handle_ = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
bool ClientInfo::GetClientExceptionInfo(EXCEPTION_POINTERS** ex_info) const {
|
||||
SIZE_T bytes_count = 0;
|
||||
if (!ReadProcessMemory(process_handle_,
|
||||
|
|
|
@ -67,24 +67,22 @@ class ClientInfo {
|
|||
HANDLE dump_generated_handle() const { return dump_generated_handle_; }
|
||||
DWORD crash_id() const { return crash_id_; }
|
||||
|
||||
HANDLE dump_request_wait_handle() const {
|
||||
return dump_request_wait_handle_;
|
||||
}
|
||||
|
||||
void set_dump_request_wait_handle(HANDLE value) {
|
||||
dump_request_wait_handle_ = value;
|
||||
}
|
||||
|
||||
HANDLE process_exit_wait_handle() const {
|
||||
return process_exit_wait_handle_;
|
||||
}
|
||||
|
||||
void set_process_exit_wait_handle(HANDLE value) {
|
||||
process_exit_wait_handle_ = value;
|
||||
}
|
||||
|
||||
// Unregister all waits for the client.
|
||||
void UnregisterWaits();
|
||||
// Unregister the dump request wait operation and wait for all callbacks
|
||||
// that might already be running to complete before returning.
|
||||
void UnregisterDumpRequestWaitAndBlockUntilNoPending();
|
||||
|
||||
// Unregister the process exit wait operation. If block_until_no_pending is
|
||||
// true, wait for all callbacks that might already be running to complete
|
||||
// before returning.
|
||||
void UnregisterProcessExitWait(bool block_until_no_pending);
|
||||
|
||||
bool Initialize();
|
||||
bool GetClientExceptionInfo(EXCEPTION_POINTERS** ex_info) const;
|
||||
|
|
|
@ -76,13 +76,6 @@ static const ULONG kPipeIOThreadFlags = WT_EXECUTEINWAITTHREAD;
|
|||
static const ULONG kDumpRequestThreadFlags = WT_EXECUTEINWAITTHREAD |
|
||||
WT_EXECUTELONGFUNCTION;
|
||||
|
||||
// Maximum delay during server shutdown if some work items
|
||||
// are still executing.
|
||||
static const int kShutdownDelayMs = 10000;
|
||||
|
||||
// Interval for each sleep during server shutdown.
|
||||
static const int kShutdownSleepIntervalMs = 5;
|
||||
|
||||
static bool IsClientRequestValid(const ProtocolMessage& msg) {
|
||||
return msg.tag == MESSAGE_TAG_UPLOAD_REQUEST ||
|
||||
(msg.tag == MESSAGE_TAG_REGISTRATION_REQUEST &&
|
||||
|
@ -123,8 +116,7 @@ CrashGenerationServer::CrashGenerationServer(
|
|||
server_state_(IPC_SERVER_STATE_UNINITIALIZED),
|
||||
shutting_down_(false),
|
||||
overlapped_(),
|
||||
client_info_(NULL),
|
||||
cleanup_item_count_(0) {
|
||||
client_info_(NULL) {
|
||||
InitializeCriticalSection(&sync_);
|
||||
|
||||
if (dump_path) {
|
||||
|
@ -132,91 +124,85 @@ CrashGenerationServer::CrashGenerationServer(
|
|||
}
|
||||
}
|
||||
|
||||
// This should never be called from the OnPipeConnected callback.
|
||||
// Otherwise the UnregisterWaitEx call below will cause a deadlock.
|
||||
CrashGenerationServer::~CrashGenerationServer() {
|
||||
// New scope to release the lock automatically.
|
||||
{
|
||||
// Make sure no clients are added or removed beyond this point.
|
||||
// Before adding or removing any clients, the critical section
|
||||
// must be entered and the shutting_down_ flag checked. The
|
||||
// critical section is then exited only after the clients_ list
|
||||
// modifications are done and the list is in a consistent state.
|
||||
AutoCriticalSection lock(&sync_);
|
||||
|
||||
// Indicate to existing threads that server is shutting down.
|
||||
shutting_down_ = true;
|
||||
}
|
||||
// No one will modify the clients_ list beyond this point -
|
||||
// not even from another thread.
|
||||
|
||||
// Even if there are no current worker threads running, it is possible that
|
||||
// an I/O request is pending on the pipe right now but not yet done.
|
||||
// In fact, it's very likely this is the case unless we are in an ERROR
|
||||
// state. If we don't wait for the pending I/O to be done, then when the I/O
|
||||
// completes, it may write to invalid memory. AppVerifier will flag this
|
||||
// problem too. So we disconnect from the pipe and then wait for the server
|
||||
// to get into error state so that the pending I/O will fail and get
|
||||
// cleared.
|
||||
DisconnectNamedPipe(pipe_);
|
||||
int num_tries = 100;
|
||||
while (num_tries-- && server_state_ != IPC_SERVER_STATE_ERROR) {
|
||||
lock.Release();
|
||||
Sleep(10);
|
||||
lock.Acquire();
|
||||
}
|
||||
|
||||
// Unregister wait on the pipe.
|
||||
if (pipe_wait_handle_) {
|
||||
// Wait for already executing callbacks to finish.
|
||||
UnregisterWaitEx(pipe_wait_handle_, INVALID_HANDLE_VALUE);
|
||||
}
|
||||
|
||||
// Close the pipe to avoid further client connections.
|
||||
if (pipe_) {
|
||||
CloseHandle(pipe_);
|
||||
}
|
||||
|
||||
// Request all ClientInfo objects to unregister all waits.
|
||||
std::list<ClientInfo*>::iterator iter;
|
||||
for (iter = clients_.begin(); iter != clients_.end(); ++iter) {
|
||||
ClientInfo* client_info = *iter;
|
||||
client_info->UnregisterWaits();
|
||||
}
|
||||
// Even if there are no current worker threads running, it is possible that
|
||||
// an I/O request is pending on the pipe right now but not yet done.
|
||||
// In fact, it's very likely this is the case unless we are in an ERROR
|
||||
// state. If we don't wait for the pending I/O to be done, then when the I/O
|
||||
// completes, it may write to invalid memory. AppVerifier will flag this
|
||||
// problem too. So we disconnect from the pipe and then wait for the server
|
||||
// to get into error state so that the pending I/O will fail and get
|
||||
// cleared.
|
||||
DisconnectNamedPipe(pipe_);
|
||||
int num_tries = 100;
|
||||
while (num_tries-- && server_state_ != IPC_SERVER_STATE_ERROR) {
|
||||
Sleep(10);
|
||||
}
|
||||
|
||||
// Now that all waits have been unregistered, wait for some time
|
||||
// for all pending work items to finish.
|
||||
int total_wait = 0;
|
||||
while (cleanup_item_count_ > 0) {
|
||||
Sleep(kShutdownSleepIntervalMs);
|
||||
|
||||
total_wait += kShutdownSleepIntervalMs;
|
||||
|
||||
if (total_wait >= kShutdownDelayMs) {
|
||||
break;
|
||||
}
|
||||
// Unregister wait on the pipe.
|
||||
if (pipe_wait_handle_) {
|
||||
// Wait for already executing callbacks to finish.
|
||||
UnregisterWaitEx(pipe_wait_handle_, INVALID_HANDLE_VALUE);
|
||||
}
|
||||
|
||||
// New scope to hold the lock for the shortest time.
|
||||
{
|
||||
AutoCriticalSection lock(&sync_);
|
||||
|
||||
// Clean up all the ClientInfo objects.
|
||||
std::list<ClientInfo*>::iterator iter;
|
||||
for (iter = clients_.begin(); iter != clients_.end(); ++iter) {
|
||||
ClientInfo* client_info = *iter;
|
||||
delete client_info;
|
||||
}
|
||||
|
||||
if (server_alive_handle_) {
|
||||
// Release the mutex before closing the handle so that clients requesting
|
||||
// dumps wait for a long time for the server to generate a dump.
|
||||
ReleaseMutex(server_alive_handle_);
|
||||
CloseHandle(server_alive_handle_);
|
||||
}
|
||||
|
||||
if (overlapped_.hEvent) {
|
||||
CloseHandle(overlapped_.hEvent);
|
||||
}
|
||||
// Close the pipe to avoid further client connections.
|
||||
if (pipe_) {
|
||||
CloseHandle(pipe_);
|
||||
}
|
||||
|
||||
// Request all ClientInfo objects to unregister all waits.
|
||||
// No need to enter the critical section because no one is allowed to modify
|
||||
// the clients_ list once the shutting_down_ flag is set.
|
||||
std::list<ClientInfo*>::iterator iter;
|
||||
for (iter = clients_.begin(); iter != clients_.end(); ++iter) {
|
||||
ClientInfo* client_info = *iter;
|
||||
// Unregister waits. Wait for already executing callbacks to finish.
|
||||
// Unregister the client process exit wait first and only then unregister
|
||||
// the dump request wait. The reason is that the OnClientExit callback
|
||||
// also unregisters the dump request wait and such a race (doing the same
|
||||
// unregistration from two threads) is undesirable.
|
||||
client_info->UnregisterProcessExitWait(true);
|
||||
client_info->UnregisterDumpRequestWaitAndBlockUntilNoPending();
|
||||
|
||||
// Destroying the ClientInfo here is safe because all wait operations for
|
||||
// this ClientInfo were unregistered and no pending or running callbacks
|
||||
// for this ClientInfo can possible exist (block_until_no_pending option
|
||||
// was used).
|
||||
delete client_info;
|
||||
}
|
||||
|
||||
if (server_alive_handle_) {
|
||||
// Release the mutex before closing the handle so that clients requesting
|
||||
// dumps wait for a long time for the server to generate a dump.
|
||||
ReleaseMutex(server_alive_handle_);
|
||||
CloseHandle(server_alive_handle_);
|
||||
}
|
||||
|
||||
if (overlapped_.hEvent) {
|
||||
CloseHandle(overlapped_.hEvent);
|
||||
}
|
||||
|
||||
DeleteCriticalSection(&sync_);
|
||||
}
|
||||
|
||||
bool CrashGenerationServer::Start() {
|
||||
AutoCriticalSection lock(&sync_);
|
||||
|
||||
if (server_state_ != IPC_SERVER_STATE_UNINITIALIZED) {
|
||||
return false;
|
||||
}
|
||||
|
@ -455,6 +441,7 @@ void CrashGenerationServer::HandleReadDoneState() {
|
|||
return;
|
||||
}
|
||||
|
||||
// This is only valid as long as it can be found in the clients_ list
|
||||
client_info_ = client_info.release();
|
||||
|
||||
// Note that the asynchronous write issued by RespondToClient function
|
||||
|
@ -532,7 +519,32 @@ void CrashGenerationServer::HandleReadingAckState() {
|
|||
// The connection handshake with the client is now complete; perform
|
||||
// the callback.
|
||||
if (connect_callback_) {
|
||||
connect_callback_(connect_context_, client_info_);
|
||||
// Note that there is only a single copy of the ClientInfo of the
|
||||
// currently connected client. However it is being referenced from
|
||||
// two different places:
|
||||
// - the client_info_ member
|
||||
// - the clients_ list
|
||||
// The lifetime of this ClientInfo depends on the lifetime of the
|
||||
// client process - basically it can go away at any time.
|
||||
// However, as long as it is referenced by the clients_ list it
|
||||
// is guaranteed to be valid. Enter the critical section and check
|
||||
// to see whether the client_info_ can be found in the list.
|
||||
// If found, execute the callback and only then leave the critical
|
||||
// section.
|
||||
AutoCriticalSection lock(&sync_);
|
||||
|
||||
bool client_is_still_alive = false;
|
||||
std::list<ClientInfo*>::iterator iter;
|
||||
for (iter = clients_.begin(); iter != clients_.end(); ++iter) {
|
||||
if (client_info_ == *iter) {
|
||||
client_is_still_alive = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (client_is_still_alive) {
|
||||
connect_callback_(connect_context_, client_info_);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// We should never get an I/O incomplete since we should not execute this
|
||||
|
@ -605,16 +617,39 @@ bool CrashGenerationServer::PrepareReply(const ClientInfo& client_info,
|
|||
return true;
|
||||
}
|
||||
|
||||
// Closing of remote handles (belonging to a different process) can
|
||||
// only be done through DuplicateHandle.
|
||||
if (reply->dump_request_handle) {
|
||||
CloseHandle(reply->dump_request_handle);
|
||||
DuplicateHandle(client_info.process_handle(), // hSourceProcessHandle
|
||||
reply->dump_request_handle, // hSourceHandle
|
||||
NULL, // hTargetProcessHandle
|
||||
0, // lpTargetHandle
|
||||
0, // dwDesiredAccess
|
||||
FALSE, // bInheritHandle
|
||||
DUPLICATE_CLOSE_SOURCE); // dwOptions
|
||||
reply->dump_request_handle = NULL;
|
||||
}
|
||||
|
||||
if (reply->dump_generated_handle) {
|
||||
CloseHandle(reply->dump_generated_handle);
|
||||
DuplicateHandle(client_info.process_handle(), // hSourceProcessHandle
|
||||
reply->dump_generated_handle, // hSourceHandle
|
||||
NULL, // hTargetProcessHandle
|
||||
0, // lpTargetHandle
|
||||
0, // dwDesiredAccess
|
||||
FALSE, // bInheritHandle
|
||||
DUPLICATE_CLOSE_SOURCE); // dwOptions
|
||||
reply->dump_generated_handle = NULL;
|
||||
}
|
||||
|
||||
if (reply->server_alive_handle) {
|
||||
CloseHandle(reply->server_alive_handle);
|
||||
DuplicateHandle(client_info.process_handle(), // hSourceProcessHandle
|
||||
reply->server_alive_handle, // hSourceHandle
|
||||
NULL, // hTargetProcessHandle
|
||||
0, // lpTargetHandle
|
||||
0, // dwDesiredAccess
|
||||
FALSE, // bInheritHandle
|
||||
DUPLICATE_CLOSE_SOURCE); // dwOptions
|
||||
reply->server_alive_handle = NULL;
|
||||
}
|
||||
|
||||
return false;
|
||||
|
@ -676,21 +711,15 @@ bool CrashGenerationServer::RespondToClient(ClientInfo* client_info) {
|
|||
|
||||
// Takes over ownership of client_info. We MUST return true if AddClient
|
||||
// succeeds.
|
||||
if (!AddClient(client_info)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
return AddClient(client_info);
|
||||
}
|
||||
|
||||
// The server thread servicing the clients runs this method. The method
|
||||
// implements the state machine described in ReadMe.txt along with the
|
||||
// helper methods HandleXXXState.
|
||||
void CrashGenerationServer::HandleConnectionRequest() {
|
||||
AutoCriticalSection lock(&sync_);
|
||||
|
||||
// If we are shutting doen then get into ERROR state, reset the event so more
|
||||
// workers don't run and return immediately.
|
||||
// If the server is shutting down, get into ERROR state, reset the event so
|
||||
// more workers don't run and return immediately.
|
||||
if (shutting_down_) {
|
||||
server_state_ = IPC_SERVER_STATE_ERROR;
|
||||
ResetEvent(overlapped_.hEvent);
|
||||
|
@ -776,6 +805,10 @@ bool CrashGenerationServer::AddClient(ClientInfo* client_info) {
|
|||
// New scope to hold the lock for the shortest time.
|
||||
{
|
||||
AutoCriticalSection lock(&sync_);
|
||||
if (shutting_down_) {
|
||||
// If server is shutting down, don't add new clients
|
||||
return false;
|
||||
}
|
||||
clients_.push_back(client_info);
|
||||
}
|
||||
|
||||
|
@ -812,56 +845,55 @@ void CALLBACK CrashGenerationServer::OnClientEnd(void* context, BOOLEAN) {
|
|||
CrashGenerationServer* crash_server = client_info->crash_server();
|
||||
assert(crash_server);
|
||||
|
||||
client_info->UnregisterWaits();
|
||||
InterlockedIncrement(&crash_server->cleanup_item_count_);
|
||||
|
||||
if (!QueueUserWorkItem(CleanupClient, context, WT_EXECUTEDEFAULT)) {
|
||||
InterlockedDecrement(&crash_server->cleanup_item_count_);
|
||||
}
|
||||
crash_server->HandleClientProcessExit(client_info);
|
||||
}
|
||||
|
||||
// static
|
||||
DWORD WINAPI CrashGenerationServer::CleanupClient(void* context) {
|
||||
assert(context);
|
||||
ClientInfo* client_info = reinterpret_cast<ClientInfo*>(context);
|
||||
|
||||
CrashGenerationServer* crash_server = client_info->crash_server();
|
||||
assert(crash_server);
|
||||
|
||||
if (crash_server->exit_callback_) {
|
||||
crash_server->exit_callback_(crash_server->exit_context_, client_info);
|
||||
}
|
||||
|
||||
crash_server->DoCleanup(client_info);
|
||||
|
||||
InterlockedDecrement(&crash_server->cleanup_item_count_);
|
||||
return 0;
|
||||
}
|
||||
|
||||
void CrashGenerationServer::DoCleanup(ClientInfo* client_info) {
|
||||
void CrashGenerationServer::HandleClientProcessExit(ClientInfo* client_info) {
|
||||
assert(client_info);
|
||||
|
||||
// Must unregister the dump request wait operation and wait for any
|
||||
// dump requests that might be pending to finish before proceeding
|
||||
// with the client_info cleanup.
|
||||
client_info->UnregisterDumpRequestWaitAndBlockUntilNoPending();
|
||||
|
||||
if (exit_callback_) {
|
||||
exit_callback_(exit_context_, client_info);
|
||||
}
|
||||
|
||||
// Start a new scope to release lock automatically.
|
||||
{
|
||||
AutoCriticalSection lock(&sync_);
|
||||
if (shutting_down_) {
|
||||
// The crash generation server is shutting down and as part of the
|
||||
// shutdown process it will delete all clients from the clients_ list.
|
||||
return;
|
||||
}
|
||||
clients_.remove(client_info);
|
||||
}
|
||||
|
||||
// Explicitly unregister the process exit wait using the non-blocking method.
|
||||
// Otherwise, the destructor will attempt to unregister it using the blocking
|
||||
// method which will lead to a deadlock because it is being called from the
|
||||
// callback of the same wait operation
|
||||
client_info->UnregisterProcessExitWait(false);
|
||||
|
||||
delete client_info;
|
||||
}
|
||||
|
||||
void CrashGenerationServer::HandleDumpRequest(const ClientInfo& client_info) {
|
||||
bool execute_callback = true;
|
||||
// Generate the dump only if it's explicitly requested by the
|
||||
// server application; otherwise the server might want to generate
|
||||
// dump in the callback.
|
||||
std::wstring dump_path;
|
||||
if (generate_dumps_) {
|
||||
if (!GenerateDump(client_info, &dump_path)) {
|
||||
return;
|
||||
// client proccess terminated or some other error
|
||||
execute_callback = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (dump_callback_) {
|
||||
if (dump_callback_ && execute_callback) {
|
||||
std::wstring* ptr_dump_path = (dump_path == L"") ? NULL : &dump_path;
|
||||
dump_callback_(dump_context_, &client_info, ptr_dump_path);
|
||||
}
|
||||
|
|
|
@ -189,11 +189,8 @@ class CrashGenerationServer {
|
|||
// Callback for client process exit event.
|
||||
static void CALLBACK OnClientEnd(void* context, BOOLEAN timer_or_wait);
|
||||
|
||||
// Releases resources for a client.
|
||||
static DWORD WINAPI CleanupClient(void* context);
|
||||
|
||||
// Cleans up for the given client.
|
||||
void DoCleanup(ClientInfo* client_info);
|
||||
// Handles client process exit.
|
||||
void HandleClientProcessExit(ClientInfo* client_info);
|
||||
|
||||
// Adds the given client to the list of registered clients.
|
||||
bool AddClient(ClientInfo* client_info);
|
||||
|
@ -216,8 +213,7 @@ class CrashGenerationServer {
|
|||
// asynchronous IO operation.
|
||||
void EnterStateWhenSignaled(IPCServerState state);
|
||||
|
||||
// Sync object for thread-safe access to the shared list of clients and
|
||||
// the server's state.
|
||||
// Sync object for thread-safe access to the shared list of clients.
|
||||
CRITICAL_SECTION sync_;
|
||||
|
||||
// List of clients.
|
||||
|
@ -286,10 +282,6 @@ class CrashGenerationServer {
|
|||
// Client Info for the client that's connecting to the server.
|
||||
ClientInfo* client_info_;
|
||||
|
||||
// Count of clean-up work items that are currently running or are
|
||||
// already queued to run.
|
||||
volatile LONG cleanup_item_count_;
|
||||
|
||||
// Disable copy ctor and operator=.
|
||||
CrashGenerationServer(const CrashGenerationServer& crash_server);
|
||||
CrashGenerationServer& operator=(const CrashGenerationServer& crash_server);
|
||||
|
|
Loading…
Reference in a new issue