List:Commits« Previous MessageNext Message »
From:vasil.dimov Date:July 22 2010 8:17am
Subject:bzr commit into mysql-next-mr-innodb branch (vasil.dimov:3241)
View as plain text  
#At file:///usr/local/devel/bzrroot/server/mysql-next-mr-innodb/ based on revid:vasil.dimov@stripped

 3241 Vasil Dimov	2010-07-22 [merge]
      Merge mysql-trunk-innodb -> mysql-next-mr-innodb

    modified:
      storage/innobase/CMakeLists.txt
      storage/innobase/include/os0file.h
      storage/innobase/include/os0sync.h
      storage/innobase/include/os0sync.ic
      storage/innobase/include/srv0srv.h
      storage/innobase/mtr/mtr0mtr.c
      storage/innobase/os/os0file.c
      storage/innobase/os/os0sync.c
      storage/innobase/os/os0thread.c
      storage/innobase/srv/srv0srv.c
      storage/innobase/srv/srv0start.c
      storage/innobase/sync/sync0arr.c
      storage/innobase/sync/sync0sync.c
=== modified file 'storage/innobase/CMakeLists.txt'
--- a/storage/innobase/CMakeLists.txt	revid:vasil.dimov@stripped
+++ b/storage/innobase/CMakeLists.txt	revid:vasil.dimov@stripped
@@ -188,11 +188,7 @@ IF(SIZEOF_PTHREAD_T)
 ENDIF()
 
 IF(MSVC)
-  # Windows atomics do not perform well. Disable Windows atomics by default.
-  # See bug#52102 for details.
-
-  #ADD_DEFINITIONS(-DHAVE_WINDOWS_ATOMICS -DINNODB_RW_LOCKS_USE_ATOMICS -DHAVE_IB_PAUSE_INSTRUCTION)
-  ADD_DEFINITIONS(-DHAVE_IB_PAUSE_INSTRUCTION)
+  ADD_DEFINITIONS(-DHAVE_WINDOWS_ATOMICS -DHAVE_IB_PAUSE_INSTRUCTION)
 ENDIF()
 
 

=== modified file 'storage/innobase/include/os0file.h'
--- a/storage/innobase/include/os0file.h	revid:vasil.dimov@stripped
+++ b/storage/innobase/include/os0file.h	revid:vasil.dimov@stripped
@@ -177,6 +177,13 @@ log. */
 #define OS_WIN95	2	/*!< Microsoft Windows 95 */
 #define OS_WINNT	3	/*!< Microsoft Windows NT 3.x */
 #define OS_WIN2000	4	/*!< Microsoft Windows 2000 */
+#define OS_WINXP	5	/*!< Microsoft Windows XP
+				or Windows Server 2003 */
+#define OS_WINVISTA	6	/*!< Microsoft Windows Vista
+				or Windows Server 2008 */
+#define OS_WIN7		7	/*!< Microsoft Windows 7
+				or Windows Server 2008 R2 */
+
 
 extern ulint	os_n_file_reads;
 extern ulint	os_n_file_writes;
@@ -368,7 +375,8 @@ typedef DIR*	os_file_dir_t;	/*!< directo
 
 /***********************************************************************//**
 Gets the operating system version. Currently works only on Windows.
-@return	OS_WIN95, OS_WIN31, OS_WINNT, or OS_WIN2000 */
+@return	OS_WIN95, OS_WIN31, OS_WINNT, OS_WIN2000, OS_WINXP, OS_WINVISTA,
+OS_WIN7. */
 UNIV_INTERN
 ulint
 os_get_os_version(void);

=== modified file 'storage/innobase/include/os0sync.h'
--- a/storage/innobase/include/os0sync.h	revid:vasil.dimov@stripped
+++ b/storage/innobase/include/os0sync.h	revid:vasil.dimov@stripped
@@ -38,28 +38,18 @@ Created 9/6/1995 Heikki Tuuri
 #include "ut0lst.h"
 
 #ifdef __WIN__
-
+/** Native event (slow) */
+typedef HANDLE			os_native_event_t;
 /** Native mutex */
-#define os_fast_mutex_t CRITICAL_SECTION
-
-/** Native event */
-typedef HANDLE		os_native_event_t;
-
-/** Operating system event */
-typedef struct os_event_struct	os_event_struct_t;
-/** Operating system event handle */
-typedef os_event_struct_t*	os_event_t;
-
-/** An asynchronous signal sent between threads */
-struct os_event_struct {
-	os_native_event_t		  handle;
-					/*!< Windows event */
-	UT_LIST_NODE_T(os_event_struct_t) os_event_list;
-					/*!< list of all created events */
-};
+typedef CRITICAL_SECTION	os_fast_mutex_t;
+/** Native condition variable. */
+typedef CONDITION_VARIABLE	os_cond_t;
 #else
 /** Native mutex */
-typedef pthread_mutex_t	os_fast_mutex_t;
+typedef pthread_mutex_t		os_fast_mutex_t;
+/** Native condition variable */
+typedef pthread_cond_t		os_cond_t;
+#endif
 
 /** Operating system event */
 typedef struct os_event_struct	os_event_struct_t;
@@ -68,6 +58,10 @@ typedef os_event_struct_t*	os_event_t;
 
 /** An asynchronous signal sent between threads */
 struct os_event_struct {
+#ifdef __WIN__
+	HANDLE		handle;		/*!< kernel event object, slow,
+					used on older Windows */
+#endif
 	os_fast_mutex_t	os_mutex;	/*!< this mutex protects the next
 					fields */
 	ibool		is_set;		/*!< this is TRUE when the event is
@@ -76,24 +70,17 @@ struct os_event_struct {
 					this event */
 	ib_int64_t	signal_count;	/*!< this is incremented each time
 					the event becomes signaled */
-	pthread_cond_t	cond_var;	/*!< condition variable is used in
+	os_cond_t	cond_var;	/*!< condition variable is used in
 					waiting for the event */
 	UT_LIST_NODE_T(os_event_struct_t) os_event_list;
 					/*!< list of all created events */
 };
-#endif
 
 /** Operating system mutex */
 typedef struct os_mutex_struct	os_mutex_str_t;
 /** Operating system mutex handle */
 typedef os_mutex_str_t*		os_mutex_t;
 
-/** Denotes an infinite delay for os_event_wait_time() */
-#define OS_SYNC_INFINITE_TIME	((ulint)(-1))
-
-/** Return value of os_event_wait_time() when the time is exceeded */
-#define OS_SYNC_TIME_EXCEEDED	1
-
 /** Mutex protecting counts and the event and OS 'slow' mutex lists */
 extern os_mutex_t	os_sync_mutex;
 
@@ -187,42 +174,14 @@ os_event_wait_low(
 
 #define os_event_wait(event) os_event_wait_low(event, 0)
 
-/**********************************************************//**
-Waits for an event object until it is in the signaled state or
-a timeout is exceeded. In Unix the timeout is always infinite.
-@return	0 if success, OS_SYNC_TIME_EXCEEDED if timeout was exceeded */
-UNIV_INTERN
-ulint
-os_event_wait_time(
-/*===============*/
-	os_event_t	event,	/*!< in: event to wait */
-	ulint		time);	/*!< in: timeout in microseconds, or
-				OS_SYNC_INFINITE_TIME */
-#ifdef __WIN__
-/**********************************************************//**
-Waits for any event in an OS native event array. Returns if even a single
-one is signaled or becomes signaled.
-@return	index of the event which was signaled */
-UNIV_INTERN
-ulint
-os_event_wait_multiple(
-/*===================*/
-	ulint			n,	/*!< in: number of events in the
-					array */
-	os_native_event_t*	native_event_array);
-					/*!< in: pointer to an array of event
-					handles */
-#endif
 /*********************************************************//**
 Creates an operating system mutex semaphore. Because these are slow, the
 mutex semaphore of InnoDB itself (mutex_t) should be used where possible.
 @return	the mutex handle */
 UNIV_INTERN
 os_mutex_t
-os_mutex_create(
-/*============*/
-	const char*	name);	/*!< in: the name of the mutex, if NULL
-				the mutex is created without a name */
+os_mutex_create(void);
+/*=================*/
 /**********************************************************//**
 Acquires ownership of a mutex semaphore. */
 UNIV_INTERN

=== modified file 'storage/innobase/include/os0sync.ic'
--- a/storage/innobase/include/os0sync.ic	revid:vasil.dimov@stripped
+++ b/storage/innobase/include/os0sync.ic	revid:vasil.dimov@stripped
@@ -28,8 +28,7 @@ Created 9/6/1995 Heikki Tuuri
 #endif
 
 /**********************************************************//**
-Acquires ownership of a fast mutex. Currently in Windows this is the same
-as os_fast_mutex_lock!
+Acquires ownership of a fast mutex.
 @return	0 if success, != 0 if was reserved by another thread */
 UNIV_INLINE
 ulint
@@ -38,9 +37,13 @@ os_fast_mutex_trylock(
 	os_fast_mutex_t*	fast_mutex)	/*!< in: mutex to acquire */
 {
 #ifdef __WIN__
-	EnterCriticalSection(fast_mutex);
+	if (TryEnterCriticalSection(fast_mutex)) {
 
-	return(0);
+		return(0);
+	} else {
+
+		return(1);
+	}
 #else
 	/* NOTE that the MySQL my_pthread.h redefines pthread_mutex_trylock
 	so that it returns 0 on success. In the operating system

=== modified file 'storage/innobase/include/srv0srv.h'
--- a/storage/innobase/include/srv0srv.h	revid:vasil.dimov@stripped
+++ b/storage/innobase/include/srv0srv.h	revid:vasil.dimov@stripped
@@ -112,6 +112,9 @@ OS (provided we compiled Innobase with i
 use simulated aio we build below with threads.
 Currently we support native aio on windows and linux */
 extern my_bool	srv_use_native_aio;
+#ifdef __WIN__
+extern ibool	srv_use_native_conditions;
+#endif
 extern ulint	srv_n_data_files;
 extern char**	srv_data_file_names;
 extern ulint*	srv_data_file_sizes;

=== modified file 'storage/innobase/mtr/mtr0mtr.c'
--- a/storage/innobase/mtr/mtr0mtr.c	revid:vasil.dimov@stripped
+++ b/storage/innobase/mtr/mtr0mtr.c	revid:vasil.dimov@stripped
@@ -337,9 +337,12 @@ mtr_memo_release(
 		slot = dyn_array_get_element(memo, offset);
 
 		if (object == slot->object && type == slot->type) {
-			if (mtr->modifications) {
-				mtr_memo_slot_note_modification(mtr, slot);
-			}
+
+			/* We cannot release a page that has been written
+			to in the middle of a mini-transaction. */
+
+			ut_ad(!(mtr->modifications
+			       	&& slot->type == MTR_MEMO_PAGE_X_FIX));
 
 			mtr_memo_slot_release(mtr, slot);
 

=== modified file 'storage/innobase/os/os0file.c'
--- a/storage/innobase/os/os0file.c	revid:vasil.dimov@stripped
+++ b/storage/innobase/os/os0file.c	revid:vasil.dimov@stripped
@@ -184,7 +184,7 @@ struct os_aio_slot_struct{
 					which pending aio operation was
 					completed */
 #ifdef WIN_ASYNC_IO
-	os_event_t	event;		/*!< event object we need in the
+	HANDLE		handle;		/*!< handle object we need in the
 					OVERLAPPED struct */
 	OVERLAPPED	control;	/*!< Windows control block for the
 					aio request */
@@ -226,7 +226,7 @@ struct os_aio_array_struct{
 				aio array outside the ibuf segment */
 	os_aio_slot_t*	slots;	/*!< Pointer to the slots in the array */
 #ifdef __WIN__
-	os_native_event_t* native_events;
+	HANDLE*		handles;
 				/*!< Pointer to an array of OS native
 				event handles where we copied the
 				handles from slots, in the same
@@ -305,7 +305,8 @@ UNIV_INTERN ulint	os_n_pending_reads = 0
 
 /***********************************************************************//**
 Gets the operating system version. Currently works only on Windows.
-@return	OS_WIN95, OS_WIN31, OS_WINNT, OS_WIN2000 */
+@return	OS_WIN95, OS_WIN31, OS_WINNT, OS_WIN2000, OS_WINXP, OS_WINVISTA,
+OS_WIN7. */
 UNIV_INTERN
 ulint
 os_get_os_version(void)
@@ -323,10 +324,18 @@ os_get_os_version(void)
 	} else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_WINDOWS) {
 		return(OS_WIN95);
 	} else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_NT) {
-		if (os_info.dwMajorVersion <= 4) {
-			return(OS_WINNT);
-		} else {
-			return(OS_WIN2000);
+		switch (os_info.dwMajorVersion) {
+		case 3:
+		case 4:
+			return OS_WINNT;
+		case 5:
+			return (os_info.dwMinorVersion == 0) ? OS_WIN2000
+							     : OS_WINXP;
+		case 6:
+			return (os_info.dwMinorVersion == 0) ? OS_WINVISTA
+							     : OS_WIN7;
+		default:
+			return OS_WIN7;
 		}
 	} else {
 		ut_error;
@@ -674,10 +683,10 @@ os_io_init_simple(void)
 {
 	ulint	i;
 
-	os_file_count_mutex = os_mutex_create(NULL);
+	os_file_count_mutex = os_mutex_create();
 
 	for (i = 0; i < OS_FILE_N_SEEK_MUTEXES; i++) {
-		os_file_seek_mutexes[i] = os_mutex_create(NULL);
+		os_file_seek_mutexes[i] = os_mutex_create();
 	}
 }
 
@@ -3235,7 +3244,7 @@ os_aio_array_create(
 
 	array = ut_malloc(sizeof(os_aio_array_t));
 
-	array->mutex		= os_mutex_create(NULL);
+	array->mutex		= os_mutex_create();
 	array->not_full		= os_event_create(NULL);
 	array->is_empty		= os_event_create(NULL);
 
@@ -3247,7 +3256,7 @@ os_aio_array_create(
 	array->cur_seg		= 0;
 	array->slots		= ut_malloc(n * sizeof(os_aio_slot_t));
 #ifdef __WIN__
-	array->native_events	= ut_malloc(n * sizeof(os_native_event_t));
+	array->handles		= ut_malloc(n * sizeof(HANDLE));
 #endif
 
 #if defined(LINUX_NATIVE_AIO)
@@ -3291,13 +3300,13 @@ skip_native_aio:
 		slot->pos = i;
 		slot->reserved = FALSE;
 #ifdef WIN_ASYNC_IO
-		slot->event = os_event_create(NULL);
+		slot->handle = CreateEvent(NULL,TRUE, FALSE, NULL);
 
 		over = &(slot->control);
 
-		over->hEvent = slot->event->handle;
+		over->hEvent = slot->handle;
 
-		*((array->native_events) + i) = over->hEvent;
+		*((array->handles) + i) = over->hEvent;
 
 #elif defined(LINUX_NATIVE_AIO)
 
@@ -3323,12 +3332,12 @@ os_aio_array_free(
 
 	for (i = 0; i < array->n_slots; i++) {
 		os_aio_slot_t*	slot = os_aio_array_get_nth_slot(array, i);
-		os_event_free(slot->event);
+		CloseHandle(slot->handle);
 	}
 #endif /* WIN_ASYNC_IO */
 
 #ifdef __WIN__
-	ut_free(array->native_events);
+	ut_free(array->handles);
 #endif /* __WIN__ */
 	os_mutex_free(array->mutex);
 	os_event_free(array->not_full);
@@ -3481,7 +3490,7 @@ os_aio_array_wake_win_aio_at_shutdown(
 
 	for (i = 0; i < array->n_slots; i++) {
 
-		os_event_set((array->slots + i)->event);
+		SetEvent((array->slots + i)->handle);
 	}
 }
 #endif
@@ -3720,7 +3729,7 @@ found:
 	control = &(slot->control);
 	control->Offset = (DWORD)offset;
 	control->OffsetHigh = (DWORD)offset_high;
-	os_event_reset(slot->event);
+	ResetEvent(slot->handle);
 
 #elif defined(LINUX_NATIVE_AIO)
 
@@ -3792,7 +3801,7 @@ os_aio_array_free_slot(
 
 #ifdef WIN_ASYNC_IO
 
-	os_event_reset(slot->event);
+	ResetEvent(slot->handle);
 
 #elif defined(LINUX_NATIVE_AIO)
 
@@ -4226,13 +4235,20 @@ os_aio_windows_handle(
 	n = array->n_slots / array->n_segments;
 
 	if (array == os_aio_sync_array) {
-		os_event_wait(os_aio_array_get_nth_slot(array, pos)->event);
+		WaitForSingleObject(
+			os_aio_array_get_nth_slot(array, pos)->handle,
+			INFINITE);
 		i = pos;
 	} else {
 		srv_set_io_thread_op_info(orig_seg, "wait Windows aio");
-		i = os_event_wait_multiple(n,
-					   (array->native_events)
-					   + segment * n);
+		i = WaitForMultipleObjects((DWORD) n,
+					   array->handles + segment * n,
+					   FALSE,
+					   INFINITE);
+	}
+
+	if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
+		os_thread_exit(NULL);
 	}
 
 	os_mutex_enter(array->mutex);

=== modified file 'storage/innobase/os/os0sync.c'
--- a/storage/innobase/os/os0sync.c	revid:vasil.dimov@stripped
+++ b/storage/innobase/os/os0sync.c	revid:vasil.dimov@stripped
@@ -35,6 +35,7 @@ Created 9/6/1995 Heikki Tuuri
 
 #include "ut0mem.h"
 #include "srv0start.h"
+#include "srv0srv.h"
 
 /* Type definition for an operating system mutex struct */
 struct os_mutex_struct{
@@ -76,6 +77,155 @@ event embedded inside a mutex, on free, 
 This version of the free event function doesn't acquire the global lock */
 static void os_event_free_internal(os_event_t	event);
 
+/* On Windows (Vista and later), load function pointers for condition
+variable handling. Those functions are not available in prior versions,
+so we have to use them via runtime loading, as long as we support XP. */
+static void os_cond_module_init(void);
+
+#ifdef __WIN__
+/* Prototypes and function pointers for condition variable functions */
+typedef VOID (WINAPI* InitializeConditionVariableProc)
+	     (PCONDITION_VARIABLE ConditionVariable);
+static InitializeConditionVariableProc initialize_condition_variable;
+
+typedef BOOL (WINAPI* SleepConditionVariableCSProc)
+	     (PCONDITION_VARIABLE ConditionVariable,
+	      PCRITICAL_SECTION CriticalSection,
+	      DWORD dwMilliseconds);
+static SleepConditionVariableCSProc sleep_condition_variable;
+
+typedef VOID (WINAPI* WakeAllConditionVariableProc)
+	     (PCONDITION_VARIABLE ConditionVariable);
+static WakeAllConditionVariableProc wake_all_condition_variable;
+
+typedef VOID (WINAPI* WakeConditionVariableProc)
+	     (PCONDITION_VARIABLE ConditionVariable);
+static WakeConditionVariableProc wake_condition_variable;
+#endif
+
+/*********************************************************//**
+Initializes condition variable */
+UNIV_INLINE
+void
+os_cond_init(
+/*=========*/
+	os_cond_t*	cond)	/*!< in: condition variable. */
+{
+	ut_a(cond);
+
+#ifdef __WIN__
+	ut_a(initialize_condition_variable != NULL);
+	initialize_condition_variable(cond);
+#else
+	ut_a(pthread_cond_init(cond, NULL) == 0);
+#endif
+}
+
+/*********************************************************//**
+Wait on condition variable */
+UNIV_INLINE
+void
+os_cond_wait(
+/*=========*/
+	os_cond_t*		cond,	/*!< in: condition variable. */
+	os_fast_mutex_t*	mutex)	/*!< in: fast mutex */
+{
+	ut_a(cond);
+	ut_a(mutex);
+
+#ifdef __WIN__
+	ut_a(sleep_condition_variable != NULL);
+	ut_a(sleep_condition_variable(cond, mutex, INFINITE));
+#else
+	ut_a(pthread_cond_wait(cond, mutex) == 0);
+#endif
+}
+
+/*********************************************************//**
+Wakes all threads waiting for condition variable */
+UNIV_INLINE
+void
+os_cond_broadcast(
+/*==============*/
+	os_cond_t*	cond)	/*!< in: condition variable. */
+{
+	ut_a(cond);
+
+#ifdef __WIN__
+	ut_a(wake_all_condition_variable != NULL);
+	wake_all_condition_variable(cond);
+#else
+	ut_a(pthread_cond_broadcast(cond) == 0);
+#endif
+}
+
+/*********************************************************//**
+Wakes one thread waiting for condition variable */
+UNIV_INLINE
+void
+os_cond_signal(
+/*==========*/
+	os_cond_t*	cond)	/*!< in: condition variable. */
+{
+	ut_a(cond);
+
+#ifdef __WIN__
+	ut_a(wake_condition_variable != NULL);
+	wake_condition_variable(cond);
+#else
+	ut_a(pthread_cond_signal(cond) == 0);
+#endif
+}
+
+/*********************************************************//**
+Destroys condition variable */
+UNIV_INLINE
+void
+os_cond_destroy(
+/*============*/
+	os_cond_t*	cond)	/*!< in: condition variable. */
+{
+#ifdef __WIN__
+	/* Do nothing */
+#else
+	ut_a(pthread_cond_destroy(cond) == 0);
+#endif
+}
+
+/*********************************************************//**
+On Windows (Vista and later), load function pointers for condition variable
+handling. Those functions are not available in prior versions, so we have to
+use them via runtime loading, as long as we support XP. */
+static
+void
+os_cond_module_init(void)
+/*=====================*/
+{
+#ifdef __WIN__
+	HMODULE		h_dll;
+
+	if (!srv_use_native_conditions)
+		return;
+
+	h_dll = GetModuleHandle("kernel32");
+
+	initialize_condition_variable = (InitializeConditionVariableProc)
+			 GetProcAddress(h_dll, "InitializeConditionVariable");
+	sleep_condition_variable = (SleepConditionVariableCSProc)
+			  GetProcAddress(h_dll, "SleepConditionVariableCS");
+	wake_all_condition_variable = (WakeAllConditionVariableProc)
+			     GetProcAddress(h_dll, "WakeAllConditionVariable");
+	wake_condition_variable = (WakeConditionVariableProc)
+			 GetProcAddress(h_dll, "WakeConditionVariable");
+
+	/* When using native condition variables, check function pointers */
+	ut_a(initialize_condition_variable);
+	ut_a(sleep_condition_variable);
+	ut_a(wake_all_condition_variable);
+	ut_a(wake_condition_variable);
+#endif
+}
+
 /*********************************************************//**
 Initializes global event and OS 'slow' mutex lists. */
 UNIV_INTERN
@@ -89,7 +239,10 @@ os_sync_init(void)
 	os_sync_mutex = NULL;
 	os_sync_mutex_inited = FALSE;
 
-	os_sync_mutex = os_mutex_create(NULL);
+	/* Now for Windows only */
+	os_cond_module_init();
+
+	os_sync_mutex = os_mutex_create();
 
 	os_sync_mutex_inited = TRUE;
 }
@@ -143,42 +296,45 @@ os_event_create(
 	const char*	name)	/*!< in: the name of the event, if NULL
 				the event is created without a name */
 {
+	os_event_t	event;
+
 #ifdef __WIN__
-	os_event_t event;
+	if(!srv_use_native_conditions) {
 
-	event = ut_malloc(sizeof(struct os_event_struct));
+		event = ut_malloc(sizeof(struct os_event_struct));
 
-	event->handle = CreateEvent(NULL, /* No security attributes */
-				    TRUE, /* Manual reset */
-				    FALSE, /* Initial state nonsignaled */
-				    (LPCTSTR) name);
-	if (!event->handle) {
-		fprintf(stderr,
-			"InnoDB: Could not create a Windows event semaphore;"
-			" Windows error %lu\n",
-			(ulong) GetLastError());
-	}
-#else /* Unix */
-	os_event_t	event;
+		event->handle = CreateEvent(NULL,
+					    TRUE,
+					    FALSE,
+					    (LPCTSTR) name);
+		if (!event->handle) {
+			fprintf(stderr,
+				"InnoDB: Could not create a Windows event"
+				" semaphore; Windows error %lu\n",
+				(ulong) GetLastError());
+		}
+	} else /* Windows with condition variables */
+#endif
 
-	UT_NOT_USED(name);
+	{
+		UT_NOT_USED(name);
 
-	event = ut_malloc(sizeof(struct os_event_struct));
+		event = ut_malloc(sizeof(struct os_event_struct));
 
-	os_fast_mutex_init(&(event->os_mutex));
+		os_fast_mutex_init(&(event->os_mutex));
 
-	ut_a(0 == pthread_cond_init(&(event->cond_var), NULL));
+		os_cond_init(&(event->cond_var));
 
-	event->is_set = FALSE;
+		event->is_set = FALSE;
 
-	/* We return this value in os_event_reset(), which can then be
-	be used to pass to the os_event_wait_low(). The value of zero
-	is reserved in os_event_wait_low() for the case when the
-	caller does not want to pass any signal_count value. To
-	distinguish between the two cases we initialize signal_count
-	to 1 here. */
-	event->signal_count = 1;
-#endif /* __WIN__ */
+		/* We return this value in os_event_reset(), which can then be
+		used to pass to the os_event_wait_low(). The value of zero
+		is reserved in os_event_wait_low() for the case when the
+		caller does not want to pass any signal_count value. To
+		distinguish between the two cases we initialize signal_count
+		to 1 here. */
+		event->signal_count = 1;
+	}
 
 	/* The os_sync_mutex can be NULL because during startup an event
 	can be created [ because it's embedded in the mutex/rwlock ] before
@@ -208,10 +364,15 @@ os_event_set(
 /*=========*/
 	os_event_t	event)	/*!< in: event to set */
 {
-#ifdef __WIN__
 	ut_a(event);
-	ut_a(SetEvent(event->handle));
-#else
+
+#ifdef __WIN__
+	if (!srv_use_native_conditions) {
+		ut_a(SetEvent(event->handle));
+		return;
+	}
+#endif
+
 	ut_a(event);
 
 	os_fast_mutex_lock(&(event->os_mutex));
@@ -221,11 +382,10 @@ os_event_set(
 	} else {
 		event->is_set = TRUE;
 		event->signal_count += 1;
-		ut_a(0 == pthread_cond_broadcast(&(event->cond_var)));
+		os_cond_broadcast(&(event->cond_var));
 	}
 
 	os_fast_mutex_unlock(&(event->os_mutex));
-#endif
 }
 
 /**********************************************************//**
@@ -244,12 +404,14 @@ os_event_reset(
 {
 	ib_int64_t	ret = 0;
 
-#ifdef __WIN__
 	ut_a(event);
 
-	ut_a(ResetEvent(event->handle));
-#else
-	ut_a(event);
+#ifdef __WIN__
+	if(!srv_use_native_conditions) {
+		ut_a(ResetEvent(event->handle));
+		return(0);
+	}
+#endif
 
 	os_fast_mutex_lock(&(event->os_mutex));
 
@@ -261,7 +423,6 @@ os_event_reset(
 	ret = event->signal_count;
 
 	os_fast_mutex_unlock(&(event->os_mutex));
-#endif
 	return(ret);
 }
 
@@ -274,19 +435,21 @@ os_event_free_internal(
 	os_event_t	event)	/*!< in: event to free */
 {
 #ifdef __WIN__
-	ut_a(event);
+	if(!srv_use_native_conditions) {
+		ut_a(event);
+		ut_a(CloseHandle(event->handle));
+	} else
+#endif
+	{
+		ut_a(event);
 
-	ut_a(CloseHandle(event->handle));
-#else
-	ut_a(event);
+		/* This is to avoid freeing the mutex twice */
+		os_fast_mutex_free(&(event->os_mutex));
 
-	/* This is to avoid freeing the mutex twice */
-	os_fast_mutex_free(&(event->os_mutex));
+		os_cond_destroy(&(event->cond_var));
+	}
 
-	ut_a(0 == pthread_cond_destroy(&(event->cond_var)));
-#endif
 	/* Remove from the list of events */
-
 	UT_LIST_REMOVE(os_event_list, os_event_list, event);
 
 	os_event_count--;
@@ -303,18 +466,19 @@ os_event_free(
 	os_event_t	event)	/*!< in: event to free */
 
 {
-#ifdef __WIN__
 	ut_a(event);
+#ifdef __WIN__
+	if(!srv_use_native_conditions){
+		ut_a(CloseHandle(event->handle));
+	} else /*Windows with condition variables */
+#endif
+	{
+		os_fast_mutex_free(&(event->os_mutex));
 
-	ut_a(CloseHandle(event->handle));
-#else
-	ut_a(event);
+		os_cond_destroy(&(event->cond_var));
+	}
 
-	os_fast_mutex_free(&(event->os_mutex));
-	ut_a(0 == pthread_cond_destroy(&(event->cond_var)));
-#endif
 	/* Remove from the list of events */
-
 	os_mutex_enter(os_sync_mutex);
 
 	UT_LIST_REMOVE(os_event_list, os_event_list, event);
@@ -355,23 +519,27 @@ os_event_wait_low(
 					returned by previous call of
 					os_event_reset(). */
 {
+	ib_int64_t	old_signal_count;
+
 #ifdef __WIN__
-	DWORD	err;
+	if(!srv_use_native_conditions) {
+		DWORD	err;
 
-	ut_a(event);
+		ut_a(event);
 
-	UT_NOT_USED(reset_sig_count);
+		UT_NOT_USED(reset_sig_count);
 
-	/* Specify an infinite time limit for waiting */
-	err = WaitForSingleObject(event->handle, INFINITE);
+		/* Specify an infinite wait */
+		err = WaitForSingleObject(event->handle, INFINITE);
 
-	ut_a(err == WAIT_OBJECT_0);
+		ut_a(err == WAIT_OBJECT_0);
 
-	if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
-		os_thread_exit(NULL);
+		if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
+			os_thread_exit(NULL);
+		}
+		return;
 	}
-#else
-	ib_int64_t	old_signal_count;
+#endif
 
 	os_fast_mutex_lock(&(event->os_mutex));
 
@@ -396,123 +564,29 @@ os_event_wait_low(
 			return;
 		}
 
-		pthread_cond_wait(&(event->cond_var), &(event->os_mutex));
+		os_cond_wait(&(event->cond_var), &(event->os_mutex));
 
 		/* Solaris manual said that spurious wakeups may occur: we
 		have to check if the event really has been signaled after
 		we came here to wait */
 	}
-#endif
 }
 
-/**********************************************************//**
-Waits for an event object until it is in the signaled state or
-a timeout is exceeded. In Unix the timeout is always infinite.
-@return	0 if success, OS_SYNC_TIME_EXCEEDED if timeout was exceeded */
-UNIV_INTERN
-ulint
-os_event_wait_time(
-/*===============*/
-	os_event_t	event,	/*!< in: event to wait */
-	ulint		time)	/*!< in: timeout in microseconds, or
-				OS_SYNC_INFINITE_TIME */
-{
-#ifdef __WIN__
-	DWORD	err;
-
-	ut_a(event);
-
-	if (time != OS_SYNC_INFINITE_TIME) {
-		err = WaitForSingleObject(event->handle, (DWORD) time / 1000);
-	} else {
-		err = WaitForSingleObject(event->handle, INFINITE);
-	}
-
-	if (err == WAIT_OBJECT_0) {
-
-		return(0);
-	} else if (err == WAIT_TIMEOUT) {
-
-		return(OS_SYNC_TIME_EXCEEDED);
-	} else {
-		ut_error;
-		return(1000000); /* dummy value to eliminate compiler warn. */
-	}
-#else
-	UT_NOT_USED(time);
-
-	/* In Posix this is just an ordinary, infinite wait */
-
-	os_event_wait(event);
-
-	return(0);
-#endif
-}
-
-#ifdef __WIN__
-/**********************************************************//**
-Waits for any event in an OS native event array. Returns if even a single
-one is signaled or becomes signaled.
-@return	index of the event which was signaled */
-UNIV_INTERN
-ulint
-os_event_wait_multiple(
-/*===================*/
-	ulint			n,	/*!< in: number of events in the
-					array */
-	os_native_event_t*	native_event_array)
-					/*!< in: pointer to an array of event
-					handles */
-{
-	DWORD	index;
-
-	ut_a(native_event_array);
-	ut_a(n > 0);
-
-	index = WaitForMultipleObjects((DWORD) n, native_event_array,
-				       FALSE,	   /* Wait for any 1 event */
-				       INFINITE); /* Infinite wait time
-						  limit */
-	ut_a(index >= WAIT_OBJECT_0);	/* NOTE: Pointless comparison */
-	ut_a(index < WAIT_OBJECT_0 + n);
-
-	if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
-		os_thread_exit(NULL);
-	}
-
-	return(index - WAIT_OBJECT_0);
-}
-#endif
-
 /*********************************************************//**
 Creates an operating system mutex semaphore. Because these are slow, the
 mutex semaphore of InnoDB itself (mutex_t) should be used where possible.
 @return	the mutex handle */
 UNIV_INTERN
 os_mutex_t
-os_mutex_create(
-/*============*/
-	const char*	name)	/*!< in: the name of the mutex, if NULL
-				the mutex is created without a name */
+os_mutex_create(void)
+/*=================*/
 {
-#ifdef __WIN__
-	HANDLE		mutex;
-	os_mutex_t	mutex_str;
-
-	mutex = CreateMutex(NULL,	/* No security attributes */
-			    FALSE,		/* Initial state: no owner */
-			    (LPCTSTR) name);
-	ut_a(mutex);
-#else
 	os_fast_mutex_t*	mutex;
 	os_mutex_t		mutex_str;
 
-	UT_NOT_USED(name);
-
 	mutex = ut_malloc(sizeof(os_fast_mutex_t));
 
 	os_fast_mutex_init(mutex);
-#endif
 	mutex_str = ut_malloc(sizeof(os_mutex_str_t));
 
 	mutex_str->handle = mutex;
@@ -543,25 +617,11 @@ os_mutex_enter(
 /*===========*/
 	os_mutex_t	mutex)	/*!< in: mutex to acquire */
 {
-#ifdef __WIN__
-	DWORD	err;
-
-	ut_a(mutex);
-
-	/* Specify infinite time limit for waiting */
-	err = WaitForSingleObject(mutex->handle, INFINITE);
-
-	ut_a(err == WAIT_OBJECT_0);
-
-	(mutex->count)++;
-	ut_a(mutex->count == 1);
-#else
 	os_fast_mutex_lock(mutex->handle);
 
 	(mutex->count)++;
 
 	ut_a(mutex->count == 1);
-#endif
 }
 
 /**********************************************************//**
@@ -577,11 +637,7 @@ os_mutex_exit(
 	ut_a(mutex->count == 1);
 
 	(mutex->count)--;
-#ifdef __WIN__
-	ut_a(ReleaseMutex(mutex->handle));
-#else
 	os_fast_mutex_unlock(mutex->handle);
-#endif
 }
 
 /**********************************************************//**
@@ -610,15 +666,9 @@ os_mutex_free(
 		os_mutex_exit(os_sync_mutex);
 	}
 
-#ifdef __WIN__
-	ut_a(CloseHandle(mutex->handle));
-
-	ut_free(mutex);
-#else
 	os_fast_mutex_free(mutex->handle);
 	ut_free(mutex->handle);
 	ut_free(mutex);
-#endif
 }
 
 /*********************************************************//**

=== modified file 'storage/innobase/os/os0thread.c'
--- a/storage/innobase/os/os0thread.c	revid:vasil.dimov@stripped
+++ b/storage/innobase/os/os0thread.c	revid:vasil.dimov@stripped
@@ -242,7 +242,7 @@ os_thread_yield(void)
 /*=================*/
 {
 #if defined(__WIN__)
-	Sleep(0);
+	SwitchToThread();
 #elif (defined(HAVE_SCHED_YIELD) && defined(HAVE_SCHED_H))
 	sched_yield();
 #elif defined(HAVE_PTHREAD_YIELD_ZERO_ARG)

=== modified file 'storage/innobase/srv/srv0srv.c'
--- a/storage/innobase/srv/srv0srv.c	revid:vasil.dimov@stripped
+++ b/storage/innobase/srv/srv0srv.c	revid:vasil.dimov@stripped
@@ -143,6 +143,21 @@ use simulated aio we build below with th
 Currently we support native aio on windows and linux */
 UNIV_INTERN my_bool	srv_use_native_aio = TRUE;
 
+#ifdef __WIN__
+/* Windows native condition variables. We use runtime loading / function
+pointers, because they are not available on Windows Server 2003 and
+Windows XP/2000.
+
+We use condition variables for events on Windows if possible, even if os_event
+resembles Windows kernel event object well API-wise. The reason is
+performance, kernel objects are heavyweights and WaitForSingleObject() is a
+performance killer causing calling thread to context switch. Besides, Innodb
+is preallocating large number (often millions) of os_events. With kernel event
+objects it takes a big chunk out of non-paged pool, which is better suited
+for tasks like IO than for storing idle event objects. */
+UNIV_INTERN ibool	srv_use_native_conditions = FALSE;
+#endif /* __WIN__ */
+
 UNIV_INTERN ulint	srv_n_data_files = 0;
 UNIV_INTERN char**	srv_data_file_names = NULL;
 /* size in database pages */
@@ -3035,6 +3050,8 @@ srv_purge_thread(
 
 	slot_no = srv_table_reserve_slot(SRV_WORKER);
 
+	slot = srv_table_get_nth_slot(slot_no);
+
 	++srv_n_threads_active[SRV_WORKER];
 
 	mutex_exit(&kernel_mutex);
@@ -3086,20 +3103,16 @@ srv_purge_thread(
 
 	mutex_enter(&kernel_mutex);
 
+	ut_ad(srv_table_get_nth_slot(slot_no) == slot);
+
 	/* Decrement the active count. */
 	srv_suspend_thread();
 
-	mutex_exit(&kernel_mutex);
+	slot->in_use = FALSE;
 
 	/* Free the thread local memory. */
 	thr_local_free(os_thread_get_curr_id());
 
-	mutex_enter(&kernel_mutex);
-
-	/* Free the slot for reuse. */
-	slot = srv_table_get_nth_slot(slot_no);
-	slot->in_use = FALSE;
-
 	mutex_exit(&kernel_mutex);
 
 #ifdef UNIV_DEBUG_THREAD_CREATION

=== modified file 'storage/innobase/srv/srv0start.c'
--- a/storage/innobase/srv/srv0start.c	revid:vasil.dimov@stripped
+++ b/storage/innobase/srv/srv0start.c	revid:vasil.dimov@stripped
@@ -1160,9 +1160,17 @@ innobase_start_or_create_for_mysql(void)
 
 		srv_use_native_aio = FALSE;
 		break;
+
+	case OS_WIN2000:
+	case OS_WINXP:
+		/* On 2000 and XP, async IO is available. */
+		srv_use_native_aio = TRUE;
+		break;
+
 	default:
-		/* On Win 2000 and XP use async i/o */
+		/* Vista and later have both async IO and condition variables */
 		srv_use_native_aio = TRUE;
+		srv_use_native_conditions = TRUE;
 		break;
 	}
 
@@ -1695,20 +1703,6 @@ innobase_start_or_create_for_mysql(void)
 	/* fprintf(stderr, "Max allowed record size %lu\n",
 	page_get_free_space_of_empty() / 2); */
 
-	/* Create the thread which watches the timeouts for lock waits */
-	os_thread_create(&srv_lock_timeout_thread, NULL,
-			 thread_ids + 2 + SRV_MAX_N_IO_THREADS);
-
-	/* Create the thread which warns of long semaphore waits */
-	os_thread_create(&srv_error_monitor_thread, NULL,
-			 thread_ids + 3 + SRV_MAX_N_IO_THREADS);
-
-	/* Create the thread which prints InnoDB monitor info */
-	os_thread_create(&srv_monitor_thread, NULL,
-			 thread_ids + 4 + SRV_MAX_N_IO_THREADS);
-
-	srv_is_being_started = FALSE;
-
 	if (trx_doublewrite == NULL) {
 		/* Create the doublewrite buffer to a new tablespace */
 
@@ -1721,8 +1715,29 @@ innobase_start_or_create_for_mysql(void)
 	We create the new segments only if it's a new database or
 	the database was shutdown cleanly. */
 
+	/* Note: When creating the extra rollback segments during an upgrade
+	we violate the latching order, even if the change buffer is empty.
+	We make an exception in sync0sync.c and check srv_is_being_started
+	for that violation. It cannot create a deadlock because we are
+	essentially still running in single-threaded mode: only the IO
+	threads should be running at this stage. */
+
 	trx_sys_create_rsegs(TRX_SYS_N_RSEGS - 1);
 
+	/* Create the thread which watches the timeouts for lock waits */
+	os_thread_create(&srv_lock_timeout_thread, NULL,
+			 thread_ids + 2 + SRV_MAX_N_IO_THREADS);
+
+	/* Create the thread which warns of long semaphore waits */
+	os_thread_create(&srv_error_monitor_thread, NULL,
+			 thread_ids + 3 + SRV_MAX_N_IO_THREADS);
+
+	/* Create the thread which prints InnoDB monitor info */
+	os_thread_create(&srv_monitor_thread, NULL,
+			 thread_ids + 4 + SRV_MAX_N_IO_THREADS);
+
+	srv_is_being_started = FALSE;
+
 	err = dict_create_or_check_foreign_constraint_tables();
 
 	if (err != DB_SUCCESS) {

=== modified file 'storage/innobase/sync/sync0arr.c'
--- a/storage/innobase/sync/sync0arr.c	revid:vasil.dimov@stripped
+++ b/storage/innobase/sync/sync0arr.c	revid:vasil.dimov@stripped
@@ -250,7 +250,7 @@ sync_array_create(
 
 	/* Then create the mutex to protect the wait array complex */
 	if (protection == SYNC_ARRAY_OS_MUTEX) {
-		arr->os_mutex = os_mutex_create(NULL);
+		arr->os_mutex = os_mutex_create();
 	} else if (protection == SYNC_ARRAY_MUTEX) {
 		mutex_create(syn_arr_mutex_key,
 			     &arr->mutex, SYNC_NO_ORDER_CHECK);

=== modified file 'storage/innobase/sync/sync0sync.c'
--- a/storage/innobase/sync/sync0sync.c	revid:vasil.dimov@stripped
+++ b/storage/innobase/sync/sync0sync.c	revid:vasil.dimov@stripped
@@ -40,6 +40,9 @@ Created 9/5/1995 Heikki Tuuri
 #include "srv0srv.h"
 #include "buf0types.h"
 #include "os0sync.h" /* for HAVE_ATOMIC_BUILTINS */
+#ifdef UNIV_SYNC_DEBUG
+# include "srv0start.h" /* srv_is_being_started */
+#endif /* UNIV_SYNC_DEBUG */
 
 /*
 	REASONS FOR IMPLEMENTING THE SPIN LOCK MUTEX
@@ -1152,6 +1155,13 @@ sync_thread_add_level(
 	case SYNC_TREE_NODE_FROM_HASH:
 		/* Do no order checking */
 		break;
+	case SYNC_TRX_SYS_HEADER:
+		if (srv_is_being_started) {
+			/* This is violated during trx_sys_create_rsegs()
+			when creating additional rollback segments when
+			upgrading in innobase_start_or_create_for_mysql(). */
+			break;
+		}
 	case SYNC_MEM_POOL:
 	case SYNC_MEM_HASH:
 	case SYNC_RECV:
@@ -1160,7 +1170,6 @@ sync_thread_add_level(
 	case SYNC_LOG_FLUSH_ORDER:
 	case SYNC_THR_LOCAL:
 	case SYNC_ANY_LATCH:
-	case SYNC_TRX_SYS_HEADER:
 	case SYNC_FILE_FORMAT_TAG:
 	case SYNC_DOUBLEWRITE:
 	case SYNC_SEARCH_SYS:
@@ -1222,8 +1231,12 @@ sync_thread_add_level(
 			ut_a(sync_thread_levels_g(array, SYNC_IBUF_BITMAP - 1,
 						  TRUE));
 		} else {
-			ut_a(sync_thread_levels_g(array, SYNC_IBUF_BITMAP,
-						  TRUE));
+			/* This is violated during trx_sys_create_rsegs()
+			when creating additional rollback segments when
+			upgrading in innobase_start_or_create_for_mysql(). */
+			ut_a(srv_is_being_started
+			     || sync_thread_levels_g(array, SYNC_IBUF_BITMAP,
+						     TRUE));
 		}
 		break;
 	case SYNC_FSP_PAGE:


Attachment: [text/bzr-bundle] bzr/vasil.dimov@oracle.com-20100722081702-n5hadky1zapo8uah.bundle
Thread
bzr commit into mysql-next-mr-innodb branch (vasil.dimov:3241) vasil.dimov22 Jul