List: Commits
From: vasil.dimov
Date: April 21 2010 10:00am
Subject: bzr push into mysql-trunk-innodb-persistent-stats branch (vasil.dimov:3035 to 3037)
 3037 Vasil Dimov	2010-04-20
      Import the Persistent Stats code from (svn) branches/innodb+_persistent_stats
      
      The history of this WIP is irrelevant and not worth the effort of
      importing into BZR; if needed, it can still be viewed in SVN.

    added:
      mysql-test/suite/innodb/include/innodb_stats.inc
      mysql-test/suite/innodb/r/innodb_stats.result
      mysql-test/suite/innodb/t/innodb_stats.test
      storage/innobase/dict/dict0stats.c
      storage/innobase/include/dict0stats.h
      storage/innobase/scripts/
      storage/innobase/scripts/persistent_storage.sql
    modified:
      storage/innobase/CMakeLists.txt
      storage/innobase/Makefile.am
      storage/innobase/btr/btr0cur.c
      storage/innobase/btr/btr0pcur.c
      storage/innobase/data/data0type.c
      storage/innobase/dict/dict0dict.c
      storage/innobase/dict/dict0load.c
      storage/innobase/handler/ha_innodb.cc
      storage/innobase/handler/ha_innodb.h
      storage/innobase/handler/handler0alter.cc
      storage/innobase/include/btr0cur.h
      storage/innobase/include/btr0pcur.h
      storage/innobase/include/btr0pcur.ic
      storage/innobase/include/db0err.h
      storage/innobase/include/dict0dict.h
      storage/innobase/include/dict0mem.h
      storage/innobase/include/lock0lock.h
      storage/innobase/include/srv0srv.h
      storage/innobase/lock/lock0lock.c
      storage/innobase/row/row0mysql.c
      storage/innobase/srv/srv0srv.c
 3036 Vasil Dimov	2010-04-20
      Update tree name in .bzr-mysql/default.conf

    modified:
      .bzr-mysql/default.conf
 3035 Vasil Dimov	2010-04-20
      Adjust the result for the mysql-test sys_vars.all_vars after the
      addition of a new config variable.

    modified:
      mysql-test/suite/sys_vars/r/all_vars.result
=== modified file '.bzr-mysql/default.conf'
--- a/.bzr-mysql/default.conf	2010-04-14 20:05:38 +0000
+++ b/.bzr-mysql/default.conf	2010-04-20 15:18:27 +0000
@@ -1,4 +1,4 @@
 [MYSQL]
 post_commit_to = commits@stripped, innodb_dev_ww@stripped
 post_push_to = commits@stripped, innodb_dev_ww@stripped
-tree_name = "mysql-trunk-innodb"
+tree_name = "mysql-trunk-innodb-persistent-stats"

=== added file 'mysql-test/suite/innodb/include/innodb_stats.inc'
--- a/mysql-test/suite/innodb/include/innodb_stats.inc	1970-01-01 00:00:00 +0000
+++ b/mysql-test/suite/innodb/include/innodb_stats.inc	2010-04-20 15:23:28 +0000
@@ -0,0 +1,23 @@
+-- disable_query_log
+DELETE FROM test_innodb_stats;
+
+-- enable_query_log
+-- eval $insert
+
+-- disable_query_log
+-- disable_result_log
+ANALYZE TABLE test_innodb_stats;
+
+-- enable_result_log
+SELECT
+stat_name,
+stat_value,
+sample_size,
+stat_description
+FROM innodb.index_stats
+WHERE
+database_name = DATABASE() AND
+table_name = 'test_innodb_stats' AND
+index_name = 'a_key' AND
+stat_name IN ('n_diff_pfx01', 'n_diff_pfx02', 'n_leaf_pages', 'size')
+ORDER BY stat_name;

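A note on the stat_name values selected above: judging by the dict0stats.c
code further down, 'n_diff_pfxNN' holds the estimated number of distinct
values in the first NN columns of the index (for a secondary index this
includes the internally appended PK columns, hence pfx02 for a single-column
key), while 'n_leaf_pages' and 'size' are page counts of the index tree. A
one-off query against the storage, sketched under the assumption that the
innodb.index_stats table from the test below exists and has been populated
by ANALYZE TABLE, could look like:

    SELECT stat_name, stat_value, sample_size, stat_description
    FROM innodb.index_stats
    WHERE database_name = 'test'
      AND table_name = 'test_innodb_stats'
      AND index_name = 'a_key'
    ORDER BY stat_name;
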
=== added file 'mysql-test/suite/innodb/r/innodb_stats.result'
--- a/mysql-test/suite/innodb/r/innodb_stats.result	1970-01-01 00:00:00 +0000
+++ b/mysql-test/suite/innodb/r/innodb_stats.result	2010-04-20 15:23:28 +0000
@@ -0,0 +1,32 @@
+SELECT 'dummy INSERT, the table should be empty';
+dummy INSERT, the table should be empty
+dummy INSERT, the table should be empty
+index_size	n_leaf_pages	n_diff_key_vals_pfx02
+1	1	0
+INSERT INTO test_innodb_stats (a) VALUES (1);
+index_size	n_leaf_pages	n_diff_key_vals_pfx02
+1	1	1
+INSERT INTO test_innodb_stats (a) VALUES (1), (1);
+index_size	n_leaf_pages	n_diff_key_vals_pfx02
+1	1	1
+INSERT INTO test_innodb_stats (a) VALUES (1), (1), (1);
+index_size	n_leaf_pages	n_diff_key_vals_pfx02
+1	1	1
+INSERT INTO test_innodb_stats (a) VALUES (1), (1), (1), (1), (1), (1), (1), (1), (1), (1);
+index_size	n_leaf_pages	n_diff_key_vals_pfx02
+1	1	1
+INSERT INTO test_innodb_stats (a) VALUES (1), (2);
+index_size	n_leaf_pages	n_diff_key_vals_pfx02
+1	1	2
+INSERT INTO test_innodb_stats (a) VALUES (1), (1), (2);
+index_size	n_leaf_pages	n_diff_key_vals_pfx02
+1	1	2
+INSERT INTO test_innodb_stats (a) VALUES (1), (2), (3);
+index_size	n_leaf_pages	n_diff_key_vals_pfx02
+1	1	3
+INSERT INTO test_innodb_stats (a) VALUES (1), (1), (2), (3), (3);
+index_size	n_leaf_pages	n_diff_key_vals_pfx02
+1	1	3
+INSERT INTO test_innodb_stats (a) VALUES (1), (2), (3), (4), (5), (1), (2), (3), (4), (5);
+index_size	n_leaf_pages	n_diff_key_vals_pfx02
+1	1	5

=== added file 'mysql-test/suite/innodb/t/innodb_stats.test'
--- a/mysql-test/suite/innodb/t/innodb_stats.test	1970-01-01 00:00:00 +0000
+++ b/mysql-test/suite/innodb/t/innodb_stats.test	2010-04-20 15:23:28 +0000
@@ -0,0 +1,93 @@
+#
+# Test the persistent stats feature
+#
+
+-- source include/have_innodb.inc
+
+-- disable_warnings
+-- disable_query_log
+DROP DATABASE IF EXISTS innodb;
+CREATE DATABASE innodb;
+
+CREATE TABLE innodb.table_stats (
+	database_name			VARCHAR(512) NOT NULL,
+	table_name			VARCHAR(512) NOT NULL,
+	stats_timestamp			TIMESTAMP NOT NULL,
+	n_rows				BIGINT UNSIGNED NOT NULL,
+	clustered_index_size		BIGINT UNSIGNED NOT NULL,
+	sum_of_other_index_sizes	BIGINT UNSIGNED NOT NULL,
+	PRIMARY KEY (database_name, table_name)
+) ENGINE=INNODB;
+
+CREATE TABLE innodb.index_stats (
+	database_name			VARCHAR(512) NOT NULL,
+	table_name			VARCHAR(512) NOT NULL,
+	index_name			VARCHAR(512) NOT NULL,
+	stat_timestamp			TIMESTAMP NOT NULL,
+	/* there are at least:
+	stat_name='index_size'
+	stat_name='n_leaf_pages'
+	stat_name='n_diff_pfx%' */
+	stat_name			VARCHAR(64) NOT NULL,
+	stat_value			BIGINT UNSIGNED NOT NULL,
+	sample_size			BIGINT UNSIGNED,
+	stat_description		VARCHAR(1024) NOT NULL,
+	PRIMARY KEY (database_name, table_name, index_name, stat_name),
+	FOREIGN KEY (database_name, table_name)
+	  REFERENCES table_stats (database_name, table_name)
+) ENGINE=INNODB;
+
+DROP TABLE IF EXISTS test_innodb_stats;
+
+CREATE TABLE test_innodb_stats (
+	a INT,
+	KEY a_key (a)
+) ENGINE=INNODB;
+
+SET SESSION innodb_analyze_is_persistent=ON;
+
+-- enable_warnings
+
+# test empty table
+-- let $insert = SELECT 'dummy INSERT, the table should be empty'
+-- source suite/innodb/include/innodb_stats.inc
+
+# test table with 1 row
+-- let $insert = INSERT INTO test_innodb_stats (a) VALUES (1)
+-- source suite/innodb/include/innodb_stats.inc
+
+# test table with 2 eq rows
+-- let $insert = INSERT INTO test_innodb_stats (a) VALUES (1), (1)
+-- source suite/innodb/include/innodb_stats.inc
+
+# test table with 3 eq rows
+-- let $insert = INSERT INTO test_innodb_stats (a) VALUES (1), (1), (1)
+-- source suite/innodb/include/innodb_stats.inc
+
+# test table with 10 eq rows
+-- let $insert = INSERT INTO test_innodb_stats (a) VALUES (1), (1), (1), (1), (1), (1), (1), (1), (1), (1)
+-- source suite/innodb/include/innodb_stats.inc
+
+# test table with 2 diff rows
+-- let $insert = INSERT INTO test_innodb_stats (a) VALUES (1), (2)
+-- source suite/innodb/include/innodb_stats.inc
+
+# test table with 2 diff rows, 3 rows in total
+-- let $insert = INSERT INTO test_innodb_stats (a) VALUES (1), (1), (2)
+-- source suite/innodb/include/innodb_stats.inc
+
+# test table with 3 diff rows
+-- let $insert = INSERT INTO test_innodb_stats (a) VALUES (1), (2), (3)
+-- source suite/innodb/include/innodb_stats.inc
+
+# test table with 3 diff rows, 5 rows in total
+-- let $insert = INSERT INTO test_innodb_stats (a) VALUES (1), (1), (2), (3), (3)
+-- source suite/innodb/include/innodb_stats.inc
+
+# test table with 5 diff rows, 10 rows in total
+-- let $insert = INSERT INTO test_innodb_stats (a) VALUES (1), (2), (3), (4), (5), (1), (2), (3), (4), (5)
+-- source suite/innodb/include/innodb_stats.inc
+
+-- disable_query_log
+DROP TABLE test_innodb_stats;
+DROP DATABASE innodb;

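For reference, a minimal interactive session exercising the same flow as the
test above, assuming a server built with this patch and the
innodb.table_stats / innodb.index_stats tables created as in the test, might
look like:

    SET SESSION innodb_analyze_is_persistent=ON;
    CREATE TABLE test.t (a INT, KEY a_key (a)) ENGINE=INNODB;
    INSERT INTO test.t (a) VALUES (1), (1), (2);
    ANALYZE TABLE test.t;
    SELECT n_rows, clustered_index_size, sum_of_other_index_sizes
    FROM innodb.table_stats
    WHERE database_name = 'test' AND table_name = 't';
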
=== modified file 'storage/innobase/CMakeLists.txt'
--- a/storage/innobase/CMakeLists.txt	2010-04-12 14:45:05 +0000
+++ b/storage/innobase/CMakeLists.txt	2010-04-20 15:23:28 +0000
@@ -210,7 +210,7 @@ ENDIF()
 SET(INNOBASE_SOURCES	btr/btr0btr.c btr/btr0cur.c btr/btr0pcur.c btr/btr0sea.c
 			buf/buf0buddy.c buf/buf0buf.c buf/buf0flu.c buf/buf0lru.c buf/buf0rea.c
 			data/data0data.c data/data0type.c
-			dict/dict0boot.c dict/dict0crea.c dict/dict0dict.c dict/dict0load.c dict/dict0mem.c
+			dict/dict0boot.c dict/dict0crea.c dict/dict0dict.c dict/dict0load.c dict/dict0mem.c dict/dict0stats.c
 			dyn/dyn0dyn.c
 			eval/eval0eval.c eval/eval0proc.c
 			fil/fil0fil.c

=== modified file 'storage/innobase/Makefile.am'
--- a/storage/innobase/Makefile.am	2010-04-12 14:45:05 +0000
+++ b/storage/innobase/Makefile.am	2010-04-20 15:23:28 +0000
@@ -66,6 +66,7 @@ noinst_HEADERS=		\
 			include/dict0load.ic	\
 			include/dict0mem.h	\
 			include/dict0mem.ic	\
+			include/dict0stats.h	\
 			include/dict0types.h	\
 			include/dyn0dyn.h	\
 			include/dyn0dyn.ic	\
@@ -247,6 +248,7 @@ libinnobase_a_SOURCES=	\
 			dict/dict0dict.c		\
 			dict/dict0load.c		\
 			dict/dict0mem.c			\
+			dict/dict0stats.c		\
 			dyn/dyn0dyn.c			\
 			eval/eval0eval.c		\
 			eval/eval0proc.c		\

=== modified file 'storage/innobase/btr/btr0cur.c'
--- a/storage/innobase/btr/btr0cur.c	2010-04-19 15:44:15 +0000
+++ b/storage/innobase/btr/btr0cur.c	2010-04-20 15:23:28 +0000
@@ -3347,7 +3347,9 @@ btr_estimate_n_rows_in_range(
 /*******************************************************************//**
 Estimates the number of different key values in a given index, for
 each n-column prefix of the index where n <= dict_index_get_n_unique(index).
-The estimates are stored in the array index->stat_n_diff_key_vals. */
+The estimates are stored in the array index->stat_n_diff_key_vals[] and
+the number of pages that were sampled is saved in
+index->stat_n_sample_sizes[]. */
 UNIV_INTERN
 void
 btr_estimate_number_of_different_key_vals(
@@ -3382,14 +3384,14 @@ btr_estimate_number_of_different_key_val
 
 	/* It makes no sense to test more pages than are contained
 	in the index, thus we lower the number if it is too high */
-	if (srv_stats_sample_pages > index->stat_index_size) {
+	if (srv_stats_transient_sample_pages > index->stat_index_size) {
 		if (index->stat_index_size > 0) {
 			n_sample_pages = index->stat_index_size;
 		} else {
 			n_sample_pages = 1;
 		}
 	} else {
-		n_sample_pages = srv_stats_sample_pages;
+		n_sample_pages = srv_stats_transient_sample_pages;
 	}
 
 	/* We sample some pages in the index to get an estimate */
@@ -3517,6 +3519,8 @@ btr_estimate_number_of_different_key_val
 		}
 
 		index->stat_n_diff_key_vals[j] += add_on;
+
+		index->stat_n_sample_sizes[j] = n_sample_pages;
 	}
 
 	mem_free(n_diff);

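Two things happen in the btr0cur.c hunks above: srv_stats_sample_pages is
renamed to srv_stats_transient_sample_pages, separating the old quick
sampling from the new persistent sampling driven by
srv_stats_persistent_sample_pages in dict0stats.c below, and the number of
sampled pages is now recorded in index->stat_n_sample_sizes[], which
presumably feeds the sample_size column of innodb.index_stats. A sketch of
reading that back, under the same assumptions as the test above:

    SELECT index_name, stat_name, stat_value, sample_size
    FROM innodb.index_stats
    WHERE stat_name LIKE 'n_diff_pfx%'
    ORDER BY index_name, stat_name;
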
=== modified file 'storage/innobase/btr/btr0pcur.c'
--- a/storage/innobase/btr/btr0pcur.c	2010-03-11 11:57:05 +0000
+++ b/storage/innobase/btr/btr0pcur.c	2010-04-20 15:23:28 +0000
@@ -572,8 +572,8 @@ btr_pcur_open_on_user_rec_func(
 	ulint		line,		/*!< in: line where called */
 	mtr_t*		mtr)		/*!< in: mtr */
 {
-	btr_pcur_open_func(index, tuple, mode, latch_mode, cursor,
-			   file, line, mtr);
+	btr_pcur_open_low(index, 0, tuple, mode, latch_mode, cursor,
+			  file, line, mtr);
 
 	if ((mode == PAGE_CUR_GE) || (mode == PAGE_CUR_G)) {
 

=== modified file 'storage/innobase/data/data0type.c'
--- a/storage/innobase/data/data0type.c	2009-12-26 19:17:43 +0000
+++ b/storage/innobase/data/data0type.c	2010-04-20 15:23:28 +0000
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software

=== modified file 'storage/innobase/dict/dict0dict.c'
--- a/storage/innobase/dict/dict0dict.c	2010-04-06 12:18:47 +0000
+++ b/storage/innobase/dict/dict0dict.c	2010-04-20 15:23:28 +0000
@@ -41,6 +41,7 @@ UNIV_INTERN dict_index_t*	dict_ind_compa
 #include "dict0boot.h"
 #include "dict0mem.h"
 #include "dict0crea.h"
+#include "dict0stats.h"
 #include "trx0undo.h"
 #include "btr0btr.h"
 #include "btr0cur.h"
@@ -671,7 +672,7 @@ dict_table_get(
 			/* If table->ibd_file_missing == TRUE, this will
 			print an error message and return without doing
 			anything. */
-			dict_update_statistics(table);
+			dict_stats_update(table, DICT_STATS_UPD_FETCH);
 		}
 	}
 
@@ -1588,13 +1589,20 @@ undo_size_ok:
 		new_index->stat_n_diff_key_vals = mem_heap_alloc(
 			new_index->heap,
 			(1 + dict_index_get_n_unique(new_index))
-			* sizeof(ib_int64_t));
+			* sizeof(*new_index->stat_n_diff_key_vals));
+
+		new_index->stat_n_sample_sizes = mem_heap_alloc(
+			new_index->heap,
+			(1 + dict_index_get_n_unique(new_index))
+			* sizeof(*new_index->stat_n_sample_sizes));
+
 		/* Give some sensible values to stat_n_... in case we do
 		not calculate statistics quickly enough */
 
 		for (i = 0; i <= dict_index_get_n_unique(new_index); i++) {
 
 			new_index->stat_n_diff_key_vals[i] = 100;
+			new_index->stat_n_sample_sizes[i] = 0;
 		}
 	}
 
@@ -4115,101 +4123,6 @@ dict_index_calc_min_rec_len(
 	return(sum);
 }
 
-/*********************************************************************//**
-Calculates new estimates for table and index statistics. The statistics
-are used in query optimization. */
-UNIV_INTERN
-void
-dict_update_statistics_low(
-/*=======================*/
-	dict_table_t*	table,		/*!< in/out: table */
-	ibool		has_dict_mutex __attribute__((unused)))
-					/*!< in: TRUE if the caller has the
-					dictionary mutex */
-{
-	dict_index_t*	index;
-	ulint		size;
-	ulint		sum_of_index_sizes	= 0;
-
-	if (table->ibd_file_missing) {
-		ut_print_timestamp(stderr);
-		fprintf(stderr,
-			"  InnoDB: cannot calculate statistics for table %s\n"
-			"InnoDB: because the .ibd file is missing.  For help,"
-			" please refer to\n"
-			"InnoDB: " REFMAN "innodb-troubleshooting.html\n",
-			table->name);
-
-		return;
-	}
-
-	/* If we have set a high innodb_force_recovery level, do not calculate
-	statistics, as a badly corrupted index can cause a crash in it. */
-
-	if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE) {
-
-		return;
-	}
-
-	/* Find out the sizes of the indexes and how many different values
-	for the key they approximately have */
-
-	index = dict_table_get_first_index(table);
-
-	if (index == NULL) {
-		/* Table definition is corrupt */
-
-		return;
-	}
-
-	while (index) {
-		size = btr_get_size(index, BTR_TOTAL_SIZE);
-
-		index->stat_index_size = size;
-
-		sum_of_index_sizes += size;
-
-		size = btr_get_size(index, BTR_N_LEAF_PAGES);
-
-		if (size == 0) {
-			/* The root node of the tree is a leaf */
-			size = 1;
-		}
-
-		index->stat_n_leaf_pages = size;
-
-		btr_estimate_number_of_different_key_vals(index);
-
-		index = dict_table_get_next_index(index);
-	}
-
-	index = dict_table_get_first_index(table);
-
-	table->stat_n_rows = index->stat_n_diff_key_vals[
-		dict_index_get_n_unique(index)];
-
-	table->stat_clustered_index_size = index->stat_index_size;
-
-	table->stat_sum_of_other_index_sizes = sum_of_index_sizes
-		- index->stat_index_size;
-
-	table->stat_initialized = TRUE;
-
-	table->stat_modified_counter = 0;
-}
-
-/*********************************************************************//**
-Calculates new estimates for table and index statistics. The statistics
-are used in query optimization. */
-UNIV_INTERN
-void
-dict_update_statistics(
-/*===================*/
-	dict_table_t*	table)	/*!< in/out: table */
-{
-	dict_update_statistics_low(table, FALSE);
-}
-
 /**********************************************************************//**
 Prints info of a foreign key constraint. */
 static
@@ -4287,7 +4200,7 @@ dict_table_print_low(
 
 	ut_ad(mutex_own(&(dict_sys->mutex)));
 
-	dict_update_statistics_low(table, TRUE);
+	dict_stats_update(table, DICT_STATS_UPD_FETCH);
 
 	fprintf(stderr,
 		"--------------------------------------\n"

=== modified file 'storage/innobase/dict/dict0load.c'
--- a/storage/innobase/dict/dict0load.c	2010-02-20 16:45:41 +0000
+++ b/storage/innobase/dict/dict0load.c	2010-04-20 15:23:28 +0000
@@ -37,6 +37,7 @@ Created 4/24/1996 Heikki Tuuri
 #include "mach0data.h"
 #include "dict0dict.h"
 #include "dict0boot.h"
+#include "dict0stats.h"
 #include "rem0cmp.h"
 #include "srv0start.h"
 #include "srv0srv.h"
@@ -222,7 +223,8 @@ loop:
 			is no index */
 
 			if (dict_table_get_first_index(table)) {
-				dict_update_statistics_low(table, TRUE);
+				dict_stats_update(table,
+						  DICT_STATS_UPD_FETCH);
 			}
 
 			dict_table_print_low(table);

=== added file 'storage/innobase/dict/dict0stats.c'
--- a/storage/innobase/dict/dict0stats.c	1970-01-01 00:00:00 +0000
+++ b/storage/innobase/dict/dict0stats.c	2010-04-20 15:23:28 +0000
@@ -0,0 +1,2747 @@
+/*****************************************************************************
+
+Copyright (c) 2009, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file dict/dict0stats.c
+Code used for calculating and manipulating table statistics.
+
+Created Jan 06, 2010 Vasil Dimov
+*******************************************************/
+
+#ifndef UNIV_HOTBACKUP
+
+#include "univ.i"
+
+#include "btr0btr.h" /* btr_get_size() */
+#include "btr0cur.h" /* btr_estimate_number_of_different_key_vals() */
+#include "dict0dict.h" /* dict_table_get_first_index() */
+#include "dict0mem.h" /* DICT_TABLE_MAGIC_N */
+#include "dict0stats.h"
+#include "data0type.h" /* dtype_t */
+#include "db0err.h" /* db_err */
+#include "dyn0dyn.h" /* dyn_array* */
+#include "lock0lock.h" /* lock_table_by_name() */
+#include "pars0pars.h" /* pars_info_create() */
+#include "pars0types.h" /* pars_info_t */
+#include "que0que.h" /* que_eval_sql() */
+#include "rem0cmp.h" /* cmp_rec_rec_with_match() */
+#include "row0sel.h" /* sel_node_struct */
+#include "row0types.h" /* sel_node_t */
+#include "trx0trx.h" /* trx_create() */
+#include "trx0roll.h" /* trx_rollback_for_mysql() */
+#include "usr0types.h" /* sess_t */
+#include "ut0rnd.h" /* ut_rnd_interval() */
+
+/* names of the tables from the persistent storage */
+#define TABLE_STATS_NAME	"innodb/table_stats"
+#define INDEX_STATS_NAME	"innodb/index_stats"
+
+#if 1
+#define DEBUG_PRINTF(fmt, ...)	printf(fmt, ## __VA_ARGS__)
+#else
+#define DEBUG_PRINTF(fmt, ...)	/* noop */
+#endif
+
+/* number of distinct records on a given level that is required to stop
+descending to lower levels and fetch srv_stats_persistent_sample_pages
+records from that level */
+#define N_DIFF_REQUIRED	(srv_stats_persistent_sample_pages * 10)
+
+/*********************************************************************//**
+Calculates new estimates for table and index statistics. This function
+is relatively quick and is used to calculate transient statistics that
+are not saved on disk.
+dict_stats_update_transient() @{ */
+static
+void
+dict_stats_update_transient(
+/*========================*/
+	dict_table_t*	table)	/*!< in/out: table */
+{
+	dict_index_t*	index;
+	ulint		size;
+	ulint		sum_of_index_sizes	= 0;
+
+	/* Find out the sizes of the indexes and how many different values
+	for the key they approximately have */
+
+	index = dict_table_get_first_index(table);
+
+	if (index == NULL) {
+		/* Table definition is corrupt */
+
+		return;
+	}
+
+	while (index) {
+		size = btr_get_size(index, BTR_TOTAL_SIZE);
+
+		index->stat_index_size = size;
+
+		sum_of_index_sizes += size;
+
+		size = btr_get_size(index, BTR_N_LEAF_PAGES);
+
+		if (size == 0) {
+			/* The root node of the tree is a leaf */
+			size = 1;
+		}
+
+		index->stat_n_leaf_pages = size;
+
+		btr_estimate_number_of_different_key_vals(index);
+
+		index = dict_table_get_next_index(index);
+	}
+
+	index = dict_table_get_first_index(table);
+
+	table->stat_n_rows = index->stat_n_diff_key_vals[
+		dict_index_get_n_unique(index)];
+
+	table->stat_clustered_index_size = index->stat_index_size;
+
+	table->stat_sum_of_other_index_sizes = sum_of_index_sizes
+		- index->stat_index_size;
+
+	table->stat_initialized = TRUE;
+
+	table->stat_modified_counter = 0;
+}
+/* @} */
+
+/* auxiliary structs for checking a table definition @{ */
+struct column_data_struct {
+	const char*	name;
+	ulint		mtype;
+	ulint		prtype_mask;
+	ulint		len;
+};
+
+typedef struct column_data_struct	column_data_t;
+
+struct table_schema_struct {
+	const char*	table_name;
+	ulint		n_cols;
+	column_data_t*	columns;
+};
+
+typedef struct table_schema_struct	table_schema_t;
+/* @} */
+
+/*********************************************************************//**
+Checks whether a table exists and whether it has the given structure.
+The caller must own the dictionary mutex.
+dict_stats_table_check() @{
+@return TRUE if the table exists and contains the necessary columns */
+static
+ibool
+dict_stats_table_check(
+/*===================*/
+	table_schema_t*	req_schema)	/*!< in/out: required table schema */
+{
+	dict_table_t*	table;
+	ulint		i;
+
+	ut_ad(mutex_own(&dict_sys->mutex));
+
+	table = dict_table_get_low(req_schema->table_name);
+
+	if (table == NULL) {
+		/* no such table */
+
+		return(FALSE);
+	}
+
+	if (table->n_def - DATA_N_SYS_COLS != req_schema->n_cols) {
+
+		/* the table has a different number of columns than
+		required */
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: %s has %d columns but should have %lu.\n",
+			req_schema->table_name,
+			table->n_def,
+			req_schema->n_cols);
+
+		goto err_exit;
+	}
+
+	/* For each column from req_schema->columns[] search
+	whether it is present in table->cols[].
+	The following algorithm is O(n_cols^2), but is optimized to
+	be O(n_cols) if the columns are in the same order in both arrays. */
+
+	for (i = 0; i < req_schema->n_cols; i++) {
+		ulint	j;
+
+		/* check if i'th column is the same in both arrays */
+		if (strcasecmp(req_schema->columns[i].name,
+			       dict_table_get_col_name(table, i)) == 0) {
+
+			/* we found the column in table->cols[] quickly */
+			j = i;
+		} else {
+
+			/* columns in both arrays are not in the same order,
+			do a full scan of the second array */
+			for (j = 0; j < table->n_def; j++) {
+
+				if (strcasecmp(req_schema->columns[i].name,
+					       dict_table_get_col_name(table, j))
+				    == 0) {
+
+					/* found the column on j'th position */
+					break;
+				}
+			}
+
+			if (j == table->n_def) {
+
+				ut_print_timestamp(stderr);
+				fprintf(stderr,
+					" InnoDB: required column %s.%s "
+					"not found.\n",
+					req_schema->table_name,
+					req_schema->columns[i].name);
+
+				goto err_exit;
+			}
+		}
+
+		/* we found a column with the same name on j'th position,
+		compare column types and flags */
+
+		/* check length for exact match */
+		if (req_schema->columns[i].len != table->cols[j].len) {
+
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				" InnoDB: Column %s.%s has length %d "
+				"but should have length %lu.\n",
+				req_schema->table_name,
+				req_schema->columns[i].name,
+				table->cols[j].len,
+				req_schema->columns[i].len);
+
+			goto err_exit;
+		}
+
+		/* check mtype for exact match */
+		if (req_schema->columns[i].mtype != table->cols[j].mtype) {
+
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				" InnoDB: Column %s.%s is of type %d "
+				"but should be of type %lu.\n",
+				req_schema->table_name,
+				req_schema->columns[i].name,
+				table->cols[j].mtype,
+				req_schema->columns[i].mtype);
+
+			goto err_exit;
+		}
+
+		/* check whether required prtype mask is set */
+		if (req_schema->columns[i].prtype_mask != 0
+		    && (table->cols[j].prtype
+			& req_schema->columns[i].prtype_mask)
+		       != req_schema->columns[i].prtype_mask) {
+
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				" InnoDB: Column %s.%s flag %#lx "
+				"is not set in column's flags %#x.\n",
+				req_schema->table_name,
+				req_schema->columns[i].name,
+				req_schema->columns[i].prtype_mask,
+				table->cols[j].prtype);
+
+			goto err_exit;
+		}
+	}
+
+	return(TRUE);
+
+err_exit:
+
+	/* XXX add pointer to the doc */
+	ut_print_timestamp(stderr);
+	fprintf(stderr,
+		" InnoDB: Try to recreate %s with the required structure.\n",
+		req_schema->table_name);
+
+	return(FALSE);
+}
+/* @} */
+
+/*********************************************************************//**
+Checks whether the persistent storage exists and that all tables have the
+proper structure.
+dict_stats_persistent_storage_check() @{
+@return TRUE if exists and all tables are ok */
+static
+ibool
+dict_stats_persistent_storage_check()
+/*=================================*/
+{
+	/* definition for the table TABLE_STATS_NAME */
+	column_data_t	table_stats_columns[] = {
+		{"database_name", DATA_VARCHAR,
+			DATA_NOT_NULL, 512},
+
+		{"table_name", DATA_VARCHAR,
+			DATA_NOT_NULL, 512},
+
+		{"stats_timestamp", DATA_INT,
+			DATA_NOT_NULL | DATA_UNSIGNED, 4},
+
+		{"n_rows", DATA_INT,
+			DATA_NOT_NULL | DATA_UNSIGNED, 8},
+
+		{"clustered_index_size", DATA_INT,
+			DATA_NOT_NULL | DATA_UNSIGNED, 8},
+
+		{"sum_of_other_index_sizes", DATA_INT,
+			DATA_NOT_NULL | DATA_UNSIGNED, 8}
+	};
+	table_schema_t	table_stats_schema = {
+		TABLE_STATS_NAME,
+		UT_ARR_SIZE(table_stats_columns),
+		table_stats_columns
+	};
+
+	/* definition for the table INDEX_STATS_NAME */
+	column_data_t	index_stats_columns[] = {
+		{"database_name", DATA_VARCHAR,
+			DATA_NOT_NULL, 512},
+
+		{"table_name", DATA_VARCHAR,
+			DATA_NOT_NULL, 512},
+
+		{"index_name", DATA_VARCHAR,
+			DATA_NOT_NULL, 512},
+
+		{"stat_timestamp", DATA_INT,
+			DATA_NOT_NULL | DATA_UNSIGNED, 4},
+
+		{"stat_name", DATA_VARCHAR,
+			DATA_NOT_NULL, 64},
+
+		{"stat_value", DATA_INT,
+			DATA_NOT_NULL | DATA_UNSIGNED, 8},
+
+		{"sample_size", DATA_INT,
+			DATA_UNSIGNED, 8},
+
+		{"stat_description", DATA_VARCHAR,
+			DATA_NOT_NULL, 1024}
+	};
+	table_schema_t	index_stats_schema = {
+		INDEX_STATS_NAME,
+		UT_ARR_SIZE(index_stats_columns),
+		index_stats_columns
+	};
+
+	ibool	ret;
+
+	mutex_enter(&(dict_sys->mutex));
+
+	ret = dict_stats_table_check(&table_stats_schema)
+	   && dict_stats_table_check(&index_stats_schema);
+
+	mutex_exit(&(dict_sys->mutex));
+
+	return(ret);
+}
+/* @} */
+
+/* @{ Pseudo code about the relation between the following functions
+
+let N = srv_stats_persistent_sample_pages
+
+dict_stats_analyze_index()
+  for each n_prefix
+    search for good enough level
+      dict_stats_analyze_index_level() // only called if level has <= N pages
+        // full scan of the level in one mtr
+        collect statistics about the given level
+    we have found a good enough level here
+    dict_stats_analyze_index_for_n_prefix(that level, stats collected above)
+      // full scan of the level in one mtr
+      dive below some records and analyze the leaf page there:
+      dict_stats_analyze_index_below_pcur()
+@} */
+
+/*********************************************************************//**
+Find the total number and the number of distinct keys on a given level in
+an index. Each of the 1..n_uniq prefixes is looked up and the results are
+saved in the array n_diff[]. Notice that n_diff[] must be able to store
+n_uniq+1 numbers because the results are saved in
+n_diff[1] .. n_diff[n_uniq]. The total number of records on the level is
+saved in total_recs.
+Also, the index of the last record in each group of equal records is saved
+in n_diff_boundaries[1..n_uniq]; record indexing starts from the leftmost
+record on the level, continues across page boundaries, and counts from 0.
+dict_stats_analyze_index_level() @{ */
+static
+void
+dict_stats_analyze_index_level(
+/*===========================*/
+	dict_index_t*	index,		/*!< in: index */
+	ulint		level,		/*!< in: level */
+	ib_uint64_t*	n_diff,		/*!< out: array for number of
+					distinct keys for all prefixes */
+	ib_uint64_t*	total_recs,	/*!< out: total number of records */
+	ib_uint64_t*	total_pages,	/*!< out: total number of pages */
+	dyn_array_t*	n_diff_boundaries)/*!< out: boundaries of the groups
+					of distinct keys */
+{
+	ulint		n_uniq;
+	mem_heap_t*	heap;
+	dtuple_t*	dtuple;
+	btr_pcur_t	pcur;
+	mtr_t		mtr;
+	page_t*		page;
+	rec_t*		rec;
+	rec_t*		prev_rec;
+	byte*		prev_rec_buf = NULL;
+	ulint		prev_rec_buf_size = 0;
+	ulint		i;
+
+	DEBUG_PRINTF("    %s(table=%s, index=%s, level=%lu)\n", __func__,
+		     index->table->name, index->name, level);
+
+	n_uniq = dict_index_get_n_unique(index);
+
+	/* elements in the n_diff array are 1..n_uniq (inclusive) */
+	memset(n_diff, 0x0, (n_uniq + 1) * sizeof(ib_int64_t));
+
+	heap = mem_heap_create(256);
+
+	/* reset the dynamic arrays n_diff_boundaries[1..n_uniq];
+	n_diff_boundaries[0] is ignored to follow the same convention
+	as n_diff[] */
+	if (n_diff_boundaries != NULL) {
+		for (i = 1; i <= n_uniq; i++) {
+			dyn_array_free(&n_diff_boundaries[i]);
+
+			dyn_array_create(&n_diff_boundaries[i]);
+		}
+	}
+
+	/* craft a record that is always smaller than the others,
+	this way we are sure that the cursor pcur will be positioned
+	on the leftmost record on the leftmost page on the desired level */
+	dtuple = dtuple_create(heap, dict_index_get_n_unique(index));
+	dict_table_copy_types(dtuple, index->table);
+	dtuple_set_info_bits(dtuple, REC_INFO_MIN_REC_FLAG);
+
+	mtr_start(&mtr);
+
+	btr_pcur_open_low(index, level, dtuple, PAGE_CUR_LE, BTR_SEARCH_LEAF,
+			  &pcur, __FILE__, __LINE__, &mtr);
+
+	page = btr_pcur_get_page(&pcur);
+
+	/* check that we are indeed on the desired level */
+	ut_a(btr_page_get_level(page, &mtr) == level);
+
+	/* there should not be any pages on the left */
+	ut_a(btr_page_get_prev(page, &mtr) == FIL_NULL);
+
+	/* check whether the first record on the leftmost page is marked
+	as such, if we are on a non-leaf level */
+	ut_a(level == 0 || REC_INFO_MIN_REC_FLAG
+	     & rec_get_info_bits(page_rec_get_next(page_get_infimum_rec(page)),
+				 page_is_comp(page)));
+
+	if (btr_pcur_is_before_first_on_page(&pcur)) {
+		btr_pcur_move_to_next_on_page(&pcur);
+	}
+
+	if (btr_pcur_is_after_last_on_page(&pcur)) {
+		btr_pcur_move_to_prev_on_page(&pcur);
+	}
+
+	prev_rec = NULL;
+
+	/* no records by default */
+	*total_recs = 0;
+
+	*total_pages = 0;
+
+	/* iterate over all user records on this level
+	and compare each pair of adjacent ones, even the last on page
+	X and the first on page X+1 */
+	while (btr_pcur_is_on_user_rec(&pcur)) {
+
+		ulint	matched_fields = 0;
+		ulint	matched_bytes = 0;
+		ulint	offsets_rec_onstack[REC_OFFS_NORMAL_SIZE];
+		ulint*	offsets_rec;
+
+		rec_offs_init(offsets_rec_onstack);
+
+		rec = btr_pcur_get_rec(&pcur);
+
+		offsets_rec = rec_get_offsets(rec, index, offsets_rec_onstack,
+					      n_uniq, &heap);
+
+		(*total_recs)++;
+
+		if (prev_rec != NULL) {
+
+			ulint	offsets_prev_rec_onstack[REC_OFFS_NORMAL_SIZE];
+			ulint*	offsets_prev_rec;
+
+			rec_offs_init(offsets_prev_rec_onstack);
+
+			offsets_prev_rec = rec_get_offsets(prev_rec, index,
+							   offsets_prev_rec_onstack,
+							   n_uniq, &heap);
+
+			cmp_rec_rec_with_match(rec,
+					       prev_rec,
+					       offsets_rec,
+					       offsets_prev_rec,
+					       index,
+					       &matched_fields,
+					       &matched_bytes);
+
+			for (i = matched_fields + 1; i <= n_uniq; i++) {
+
+				if (n_diff_boundaries != NULL) {
+					/* push the index of the previous record,
+					that is, the last one from a group of
+					equal keys */
+
+					void*		p;
+					ib_uint64_t	idx;
+
+					/* the index of the current record is
+					total_recs - 1, the index of the
+					previous record is total_recs - 2;
+					we know that idx is not going to become
+					negative here because if we are in this
+					branch then there is a previous record
+					and thus total_recs >= 2 */
+					idx = *total_recs - 2;
+
+					p = dyn_array_push(&n_diff_boundaries[i],
+							   sizeof(ib_uint64_t));
+					memcpy(p, &idx, sizeof(ib_uint64_t));
+				}
+
+				/* increment the number of different keys
+				for n_prefix=i */
+				n_diff[i]++;
+			}
+		} else {
+			/* this is the first record */
+			for (i = 1; i <= n_uniq; i++) {
+				n_diff[i] = 1;
+			}
+		}
+
+		/* we need to copy the record instead of assigning like
+		prev_rec = rec; because when we traverse the records
+		on this level at some point we will jump from one page
+		to the next and then rec and prev_rec will be on different
+		pages and btr_pcur_move_to_next_user_rec() will release
+		the latch on the page that prev_rec is on */
+		prev_rec = rec_copy_prefix_to_buf(rec, index,
+						  rec_offs_n_fields(offsets_rec),
+						  &prev_rec_buf, &prev_rec_buf_size);
+
+		/* increment the pages counter at the end of each page */
+		if (page_rec_is_supremum(page_rec_get_next(rec))) {
+
+			(*total_pages)++;
+		}
+
+		btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+	}
+
+	/* if *total_pages is left untouched then the above loop was not
+	entered at all and there is one page which is empty */
+	if (*total_pages == 0) {
+
+		ut_ad(*total_recs == 0);
+
+		*total_pages = 1;
+	}
+
+	/* if there are records on this level and boundaries should be saved */
+	if (*total_recs > 0 && n_diff_boundaries != NULL) {
+
+		/* remember the index of the last record on the level as the
+		last one from the last group of equal keys; this holds for all
+		possible prefixes */
+		for (i = 1; i <= n_uniq; i++) {
+			void*		p;
+			ib_uint64_t	idx;
+
+			idx = *total_recs - 1;
+
+			p = dyn_array_push(&n_diff_boundaries[i],
+					   sizeof(ib_uint64_t));
+
+			memcpy(p, &idx, sizeof(ib_uint64_t));
+		}
+	}
+
+	/* now in n_diff_boundaries[i] there are exactly n_diff[i] integers,
+	for i=1..n_uniq */
+
+	for (i = 1; i <= n_uniq; i++) {
+
+		DEBUG_PRINTF("    %s(): total recs: %llu, total pages: %llu, "
+			     "n_diff[%lu]: %lld\n",
+			     __func__, *total_recs, *total_pages,
+			     i, n_diff[i]);
+
+#if 0
+		if (n_diff_boundaries != NULL) {
+			ib_int64_t	j;
+
+			printf("boundaries: ");
+			for (j = 0; j < n_diff[i]; j++) {
+				ib_uint64_t	idx;
+
+				idx = *(ib_uint64_t*) dyn_array_get_element(
+					&n_diff_boundaries[i], j * sizeof(ib_uint64_t));
+
+				printf("%lld=%llu, ", j, idx);
+			}
+			printf("\n");
+		}
+#endif
+	}
+
+	btr_pcur_close(&pcur);
+
+	mtr_commit(&mtr);
+
+	if (prev_rec_buf != NULL) {
+
+		mem_free(prev_rec_buf);
+	}
+
+	mem_heap_free(heap);
+}
+/* @} */
+
+/*********************************************************************//**
+Dive below the current position of a cursor and calculate the number of
+distinct records on the leaf page, when looking at the first n_prefix
+columns. The result is returned in n_diff.
+dict_stats_analyze_index_below_pcur() @{ */
+static
+void
+dict_stats_analyze_index_below_pcur(
+/*================================*/
+	dict_index_t*	index,		/*!< in: index */
+	btr_pcur_t*	pcur,		/*!< in: cursor, not modified */
+	ulint		n_prefix,	/*!< in: look at the first n_prefix
+					columns when comparing records */
+	ib_uint64_t*	n_diff,		/*!< out: number of distinct records
+					on the leaf page */
+	mtr_t*		mtr)		/*!< in/out: mini-transaction */
+{
+	ulint		space;
+	ulint		zip_size;
+	buf_block_t*	block;
+	ulint		page_no;
+	page_t*		page;
+	mem_heap_t*	heap;
+	rec_t*		supremum;
+	rec_t*		rec;
+	rec_t*		next_rec;
+	ulint		offsets_onstack1[REC_OFFS_NORMAL_SIZE];
+	ulint		offsets_onstack2[REC_OFFS_NORMAL_SIZE];
+	ulint*		offsets_rec = offsets_onstack1;
+	ulint*		offsets_next_rec = offsets_onstack2;
+	ulint		root_height;
+
+	rec_offs_init(offsets_onstack1);
+	rec_offs_init(offsets_onstack2);
+
+#if 0
+	DEBUG_PRINTF("      %s(table=%s, index=%s, rec=%p, n_prefix=%lu)\n",
+		     __func__, index->table->name, index->name,
+		     btr_pcur_get_rec(pcur),
+		     n_prefix);
+#endif
+
+	heap = mem_heap_create(256);
+
+	root_height = btr_page_get_level(btr_root_get(index, mtr), mtr);
+
+	space = dict_index_get_space(index);
+	zip_size = dict_table_zip_size(index->table);
+
+	rec = btr_pcur_get_rec(pcur);
+
+	offsets_rec = rec_get_offsets(rec, index, offsets_rec,
+				      ULINT_UNDEFINED, &heap);
+
+	page_no = btr_node_ptr_get_child_page_no(rec, offsets_rec);
+
+	/* descend to the leaf level on the B-tree */
+	for (;;) {
+
+		block = buf_page_get_gen(space, zip_size, page_no, RW_S_LATCH,
+					 NULL /* no guessed block */,
+					 BUF_GET, __FILE__, __LINE__, mtr);
+
+		page = buf_block_get_frame(block);
+
+		if (btr_page_get_level(page, mtr) == 0) {
+			/* leaf level */
+			break;
+		}
+		/* else */
+
+		/* search for a non-boring record on the page */
+
+		supremum = page_get_supremum_rec(page);
+		rec = page_rec_get_next(page_get_infimum_rec(page));
+
+		/* empty pages are allowed only if the whole B-tree is empty
+		and contains a single empty page */
+		if (root_height > 0) {
+
+			ut_a(rec != supremum);
+		} else if (rec == supremum) {
+			/* the whole B-tree consists of a single empty page */
+			*n_diff = 0;
+			goto end;
+		}
+
+		offsets_rec = rec_get_offsets(rec, index, offsets_rec,
+					      ULINT_UNDEFINED, &heap);
+
+		next_rec = page_rec_get_next(rec);
+
+		for (;;) {
+			ulint	matched_fields = 0;
+			ulint	matched_bytes = 0;
+
+			if (next_rec == supremum) {
+				/* page has all boring keys, no need to
+				descend to the leaf level */
+				*n_diff = 1;
+				goto end;
+			}
+			/* else */
+
+			offsets_next_rec = rec_get_offsets(next_rec, index,
+							   offsets_next_rec,
+							   ULINT_UNDEFINED, &heap);
+
+			/* check whether rec != next_rec when looking at
+			the first n_prefix fields */
+			cmp_rec_rec_with_match(rec, next_rec,
+                                               offsets_rec, offsets_next_rec,
+                                               index, &matched_fields,
+                                               &matched_bytes);
+
+
+			if (matched_fields < n_prefix) {
+				/* rec != next_rec, => rec is non-boring */
+				break;
+			}
+
+			rec = next_rec;
+			{
+				ulint*	offsets_tmp;
+				offsets_tmp = offsets_rec;
+				offsets_rec = offsets_next_rec;
+				offsets_next_rec = offsets_tmp;
+			}
+			next_rec = page_rec_get_next(next_rec);
+		}
+
+		/* now we got a non-boring record in rec, descend below it */
+
+		page_no = btr_node_ptr_get_child_page_no(rec, offsets_rec);
+	}
+
+	/* make sure we got a leaf page as a result from the above loop */
+	ut_ad(btr_page_get_level(page, mtr) == 0);
+
+	/* scan the leaf page and find the number of distinct keys,
+	when looking only at the first n_prefix columns */
+
+	supremum = page_get_supremum_rec(page);
+	rec = page_rec_get_next(page_get_infimum_rec(page));
+
+	if (root_height > 0) {
+
+		/* empty pages are allowed only if the whole B-tree is empty
+		and contains a single empty page */
+		ut_a(rec != supremum);
+
+		/* start with 1 */
+		*n_diff = 1;
+	} else if (rec == supremum) {
+		/* the whole B-tree consists of a single empty page */
+		*n_diff = 0;
+		goto end;
+	}
+
+	offsets_rec = rec_get_offsets(rec, index, offsets_rec,
+				      ULINT_UNDEFINED, &heap);
+
+	next_rec = page_rec_get_next(rec);
+
+	/* iterate over the records on the page */
+	while (next_rec != supremum) {
+
+		ulint	matched_fields = 0;
+		ulint	matched_bytes = 0;
+
+		offsets_next_rec = rec_get_offsets(next_rec, index,
+						   offsets_next_rec,
+						   ULINT_UNDEFINED, &heap);
+
+		/* check whether rec != next_rec when looking at
+		the first n_prefix columns */
+		cmp_rec_rec_with_match(rec, next_rec,
+				       offsets_rec, offsets_next_rec,
+				       index, &matched_fields,
+				       &matched_bytes);
+
+		if (matched_fields < n_prefix) {
+			(*n_diff)++;
+		}
+
+		rec = next_rec;
+		{
+			ulint*	offsets_tmp;
+			offsets_tmp = offsets_rec;
+			offsets_rec = offsets_next_rec;
+			offsets_next_rec = offsets_tmp;
+		}
+		next_rec = page_rec_get_next(next_rec);
+	}
+
+end:
+#if 0
+	DEBUG_PRINTF("      %s(): n_diff below page_no=%lu: %llu\n",
+		     __func__, page_no, *n_diff);
+#endif
+
+	mem_heap_free(heap);
+}
+/* @} */
+
+/*********************************************************************//**
+For a given level in an index select srv_stats_persistent_sample_pages
+(or less) records from that level and dive below them to the corresponding
+leaf pages, then scan those leaf pages and save the sampling results in
+index->stat_n_diff_key_vals[n_prefix] and the number of pages scanned in
+index->stat_n_sample_sizes[n_prefix].
+dict_stats_analyze_index_for_n_prefix() @{ */
+static
+void
+dict_stats_analyze_index_for_n_prefix(
+/*==================================*/
+	dict_index_t*	index,			/*!< in/out: index */
+	ulint		level,			/*!< in: level, must be >= 1 */
+	ib_uint64_t	total_recs_on_level,	/*!< in: total number of
+						records on the given level */
+	ulint		n_prefix,		/*!< in: look at first n_prefix
+						columns when comparing records */
+	ib_uint64_t	n_diff_for_this_prefix,	/*!< in: number of distinct
+						records on the given level, when
+						looking at the first n_prefix
+						columns */
+
+	dyn_array_t*	boundaries)		/*!< in: array that contains
+						n_diff_for_this_prefix integers
+						each of which represents the
+						index (on the level, counting from
+						left/smallest to right/biggest
+						from 0) of the last record from
+						each group of distinct keys */
+{
+	mem_heap_t*	heap;
+	dtuple_t*	dtuple;
+	btr_pcur_t	pcur;
+	mtr_t		mtr;
+	page_t*		page;
+	ib_uint64_t	rec_idx;
+	ib_uint64_t	last_idx_on_level;
+	ib_uint64_t	n_recs_to_dive_below;
+	ib_uint64_t	n_diff_sum_of_all_analyzed_pages;
+	ib_uint64_t	i;
+
+#if 0
+	DEBUG_PRINTF("    %s(table=%s, index=%s, level=%lu, n_prefix=%lu, "
+		     "n_diff_for_this_prefix=%llu)\n",
+		     __func__, index->table->name, index->name, level,
+		     n_prefix, n_diff_for_this_prefix);
+#endif
+
+	/* if either of these is 0 then there is exactly one page in the
+	B-tree and it is empty, so we should have done a full scan and
+	should not be here */
+	ut_ad(total_recs_on_level > 0);
+	ut_ad(n_diff_for_this_prefix > 0);
+
+	/* this is configured to be at least 1; if it is 0 then
+	someone has changed the code */
+	ut_ad(srv_stats_persistent_sample_pages > 0);
+
+	heap = mem_heap_create(256);
+
+	/* craft a record that is always smaller than the others,
+	this way we are sure that the cursor pcur will be positioned
+	on the leftmost record on the leftmost page on the desired level */
+	dtuple = dtuple_create(heap, dict_index_get_n_unique(index));
+	dict_table_copy_types(dtuple, index->table);
+	dtuple_set_info_bits(dtuple, REC_INFO_MIN_REC_FLAG);
+
+	mtr_start(&mtr);
+
+	btr_pcur_open_low(index, level, dtuple, PAGE_CUR_LE, BTR_SEARCH_LEAF,
+			  &pcur, __FILE__, __LINE__, &mtr);
+
+	page = btr_pcur_get_page(&pcur);
+
+	/* check that we are indeed on the desired level */
+	ut_a(btr_page_get_level(page, &mtr) == level);
+
+	/* there should not be any pages on the left */
+	ut_a(btr_page_get_prev(page, &mtr) == FIL_NULL);
+
+	/* check whether the first record on the leftmost page is marked
+	as such, if we are on a non-leaf level */
+	ut_a(level == 0 || REC_INFO_MIN_REC_FLAG
+	     & rec_get_info_bits(page_rec_get_next(page_get_infimum_rec(page)),
+				 page_is_comp(page)));
+
+	if (btr_pcur_is_before_first_on_page(&pcur)) {
+		btr_pcur_move_to_next_on_page(&pcur);
+	}
+
+	if (btr_pcur_is_after_last_on_page(&pcur)) {
+		btr_pcur_move_to_prev_on_page(&pcur);
+	}
+
+	last_idx_on_level = *(ib_uint64_t*) dyn_array_get_element(boundaries,
+		(n_diff_for_this_prefix - 1) * sizeof(ib_uint64_t));
+
+	rec_idx = 0;
+
+	n_diff_sum_of_all_analyzed_pages = 0;
+
+	n_recs_to_dive_below = ut_min(srv_stats_persistent_sample_pages,
+				      n_diff_for_this_prefix);
+
+	for (i = 0; i < n_recs_to_dive_below; i++) {
+		ib_uint64_t	left;
+		ib_uint64_t	right;
+		ulint		rnd;
+		ib_uint64_t	dive_below_idx;
+		ib_uint64_t	n_diff_on_leaf;
+
+		/* there are n_diff_for_this_prefix elements
+		in the array boundaries[] and we divide those elements
+		into n_recs_to_dive_below segments, for example:
+
+		let n_diff_for_this_prefix=100, n_recs_to_dive_below=4, then:
+		segment i=0:  [0, 24]
+		segment i=1: [25, 49]
+		segment i=2: [50, 74]
+		segment i=3: [75, 99] or
+
+		let n_diff_for_this_prefix=1, n_recs_to_dive_below=1, then:
+		segment i=0: [0, 0] or
+
+		let n_diff_for_this_prefix=2, n_recs_to_dive_below=2, then:
+		segment i=0: [0, 0]
+		segment i=1: [1, 1] or
+
+		let n_diff_for_this_prefix=13, n_recs_to_dive_below=7, then:
+		segment i=0:  [0,  0]
+		segment i=1:  [1,  2]
+		segment i=2:  [3,  4]
+		segment i=3:  [5,  6]
+		segment i=4:  [7,  8]
+		segment i=5:  [9, 10]
+		segment i=6: [11, 12]
+
+		then we select a random record from each segment and dive
+		below it */
+		left = n_diff_for_this_prefix * i / n_recs_to_dive_below;
+		right = n_diff_for_this_prefix * (i + 1) / n_recs_to_dive_below - 1;
+
+		ut_a(left <= right);
+		ut_a(right <= last_idx_on_level);
+
+		/* we do not pass (left, right) because we do not want to ask
+		ut_rnd_interval() to work with too big numbers since
+		ib_uint64_t could be bigger than ulint */
+		rnd = ut_rnd_interval(0, right - left);
+
+		dive_below_idx = *(ib_uint64_t*) dyn_array_get_element(
+			boundaries, (left + rnd) * sizeof(ib_uint64_t));
+
+#if 0
+		DEBUG_PRINTF("    %s(): dive below rec_idx=%llu\n",
+			     __func__, dive_below_idx);
+#endif
+
+		/* seek to the record with index dive_below_idx */
+		while (rec_idx < dive_below_idx
+		       && btr_pcur_is_on_user_rec(&pcur)) {
+
+			btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+			rec_idx++;
+		}
+
+		/* if the level has finished before the record we are
+		searching for, this means that the B-tree has changed in
+		the meantime, abort */
+		if (rec_idx < dive_below_idx) {
+
+			ut_ad(!btr_pcur_is_on_user_rec(&pcur));
+			break;
+		}
+
+		ut_a(rec_idx == dive_below_idx);
+
+		dict_stats_analyze_index_below_pcur(index, &pcur, n_prefix,
+						    &n_diff_on_leaf, &mtr);
+
+		n_diff_sum_of_all_analyzed_pages += n_diff_on_leaf;
+	}
+
+	index->stat_n_diff_key_vals[n_prefix]
+		= total_recs_on_level * n_diff_sum_of_all_analyzed_pages
+		/ n_recs_to_dive_below;
+
+	index->stat_n_sample_sizes[n_prefix] = n_recs_to_dive_below;
+
+	DEBUG_PRINTF("    %s(): n_diff=%llu for n_prefix=%lu\n",
+		     __func__, index->stat_n_diff_key_vals[n_prefix],
+		     n_prefix);
+
+	btr_pcur_close(&pcur);
+
+	mtr_commit(&mtr);
+
+	mem_heap_free(heap);
+}
+/* @} */
+
+/*********************************************************************//**
+Calculates new statistics for a given index and saves them to the index
+members stat_n_diff_key_vals[], stat_n_sample_sizes[], stat_index_size and
+stat_n_leaf_pages. This function could be slow.
+dict_stats_analyze_index() @{
+@return DB_SUCCESS or error code */
+static
+enum db_err
+dict_stats_analyze_index(
+/*=====================*/
+	dict_index_t*	index)	/*!< in/out: index to analyze */
+{
+	ulint		root_level;
+	ulint		level;
+	ibool		level_is_analyzed;
+	ulint		n_uniq;
+	ulint		n_prefix;
+	ib_uint64_t*	n_diff_on_level;
+	ib_uint64_t	total_recs;
+	ib_uint64_t	total_pages;
+	dyn_array_t*	n_diff_boundaries;
+	mtr_t		mtr;
+	ulint		i;
+
+	DEBUG_PRINTF("  %s(index=%s)\n", __func__, index->name);
+
+	index->stat_index_size = btr_get_size(index, BTR_TOTAL_SIZE);
+
+	index->stat_n_leaf_pages = btr_get_size(index, BTR_N_LEAF_PAGES);
+	if (index->stat_n_leaf_pages == 0) {
+		/* The root node of the tree is a leaf */
+		index->stat_n_leaf_pages = 1;
+	}
+
+	mtr_start(&mtr);
+
+	mtr_s_lock(dict_index_get_lock(index), &mtr);
+
+	root_level = btr_page_get_level(btr_root_get(index, &mtr), &mtr);
+
+	mtr_commit(&mtr);
+
+	n_uniq = dict_index_get_n_unique(index);
+
+	/* if the tree has just one level (and one page) or if the user
+	has requested to sample too many pages then do full scan */
+	if (root_level == 0
+	    /* for each n-column prefix (for n=1..n_uniq)
+	    srv_stats_persistent_sample_pages will be sampled, so in total
+	    srv_stats_persistent_sample_pages * n_uniq leaf pages will be
+	    sampled. If that number is bigger than the total number of leaf
+	    pages then do full scan of the leaf level instead since it will
+	    be faster and will give better results. */
+	    || srv_stats_persistent_sample_pages * n_uniq
+	       > index->stat_n_leaf_pages) {
+
+		if (root_level == 0) {
+			DEBUG_PRINTF("  %s(): just one page, "
+				     "doing full scan\n", __func__);
+		} else {
+			DEBUG_PRINTF("  %s(): too many pages requested for "
+				     "sampling, doing full scan\n", __func__);
+		}
+
+		/* do full scan of level 0; save results directly
+		into the index */
+
+		dict_stats_analyze_index_level(index,
+					       0 /* leaf level */,
+					       index->stat_n_diff_key_vals,
+					       &total_recs,
+					       &total_pages,
+					       NULL /* boundaries not needed */);
+
+		for (i = 1; i <= n_uniq; i++) {
+			index->stat_n_sample_sizes[i] = total_pages;
+		}
+
+		return(DB_SUCCESS);
+	}
+	/* else */
+
+	/* set to zero */
+	n_diff_on_level = (ib_uint64_t*) mem_zalloc((n_uniq + 1) * sizeof(ib_uint64_t));
+
+	n_diff_boundaries = (dyn_array_t*) mem_alloc((n_uniq + 1) * sizeof(dyn_array_t));
+
+	for (i = 1; i <= n_uniq; i++) {
+		/* initialize the dynamic arrays, the first one
+		(index=0) is ignored to follow the same indexing
+		scheme as n_diff_on_level[] */
+		dyn_array_create(&n_diff_boundaries[i]);
+	}
+
+	/* total_recs is also used to estimate the number of pages on one
+	level below, so at the start we have 1 page (the root) */
+	total_recs = 1;
+
+	/* Here we use the following optimization:
+	If we find that level L is the first one (searching from the
+	root) that contains at least D distinct keys when looking at
+	the first n_prefix columns, then:
+	if we look at the first n_prefix-1 columns then the first
+	level that contains D distinct keys will be either L or a
+	lower one.
+	So if we find that the first level containing D distinct
+	keys (on n_prefix columns) is L, we continue from L when
+	searching for D distinct keys on n_prefix-1 columns. */
+	level = (long) root_level;
+	level_is_analyzed = FALSE;
+	for (n_prefix = n_uniq; n_prefix >= 1; n_prefix--) {
+
+		DEBUG_PRINTF("  %s(): searching level with >=%llu "
+			     "distinct records, n_prefix=%lu\n",
+			     __func__, N_DIFF_REQUIRED, n_prefix);
+
+		/* check whether we should pick the current level;
+		we pick level 1 even if it does not have enough
+		distinct records because we do not want to scan the
+		leaf level because it may contain too many records */
+		if (level_is_analyzed
+		    && (n_diff_on_level[n_prefix] >= N_DIFF_REQUIRED
+			|| level == 1)) {
+
+			goto found_level;
+		}
+		/* else */
+
+		/* search for a level that contains enough distinct records */
+
+		if (level_is_analyzed && level > 1) {
+
+			/* if this does not hold we should be on
+			"found_level" instead of here */
+			ut_ad(n_diff_on_level[n_prefix] < N_DIFF_REQUIRED);
+
+			level--;
+			level_is_analyzed = FALSE;
+		}
+
+		for (;;) {
+
+			/* make sure we do not scan the leaf level
+			accidentally, it may contain too many pages */
+			ut_ad(level > 0);
+
+			/* scanning the same level twice is an optimization
+			bug */
+			ut_ad(!level_is_analyzed);
+
+			/* Do not scan if this would read too many pages.
+			Here we use the following fact:
+			the number of pages on level L equals the number
+			of records on level L+1, thus we deduce that the
+			following call would scan total_recs pages, because
+			total_recs is left from the previous iteration when
+			we scanned one level upper or we have not scanned any
+			levels yet in which case total_recs is 1. */
+			if (total_recs > srv_stats_persistent_sample_pages) {
+
+				/* if the above cond is true then we are not
+				at the root level since on the root level
+				total_recs == 1 and cannot
+				be > srv_stats_persistent_sample_pages */
+				ut_a(level != root_level);
+
+				/* step one level back and be satisfied with
+				whatever it contains */
+				level++;
+				level_is_analyzed = TRUE;
+
+				break;
+			}
+
+			dict_stats_analyze_index_level(index,
+						       level,
+						       n_diff_on_level,
+						       &total_recs,
+						       &total_pages,
+						       n_diff_boundaries);
+
+			level_is_analyzed = TRUE;
+
+			if (n_diff_on_level[n_prefix] >= N_DIFF_REQUIRED
+			    || level == 1) {
+				/* we found a good level with many distinct
+				records or we have reached the last level we
+				could scan */
+				break;
+			}
+			/* else */
+
+			level--;
+			level_is_analyzed = FALSE;
+		}
+found_level:
+
+		DEBUG_PRINTF("  %s(): found level %lu that has %llu "
+			     "distinct records for n_prefix=%lu\n",
+			     __func__, level, n_diff_on_level[n_prefix],
+			     n_prefix);
+
+		/* here we are either on level 1 or the level that we are on
+		contains >= N_DIFF_REQUIRED distinct keys or we did not scan
+		deeper levels because they would contain too many pages */
+
+		ut_ad(level > 0);
+
+		ut_ad(level_is_analyzed);
+
+		/* pick some records from this level and dive below them for the
+		given n_prefix */
+
+		dict_stats_analyze_index_for_n_prefix(
+			index, level, total_recs, n_prefix,
+			n_diff_on_level[n_prefix],
+			&n_diff_boundaries[n_prefix]);
+	}
+
+	for (i = 1; i <= n_uniq; i++) {
+		dyn_array_free(&n_diff_boundaries[i]);
+	}
+
+	mem_free(n_diff_boundaries);
+
+	mem_free(n_diff_on_level);
+
+	return(DB_SUCCESS);
+}
+/* @} */
+
+/*********************************************************************//**
+Calculates new estimates for table and index statistics. This function
+is relatively slow and is used to calculate persistent statistics that
+will be saved on disk.
+dict_stats_update_persistent() @{
+@return DB_SUCCESS or error code */
+static
+enum db_err
+dict_stats_update_persistent(
+/*=========================*/
+	dict_table_t*	table)		/*!< in/out: table */
+{
+	dict_index_t*	index;
+
+	DEBUG_PRINTF("%s(table=%s)\n", __func__, table->name);
+
+	/* XXX quit if interrupted, e.g. SIGTERM */
+
+	/* analyze the clustered index first */
+
+	index = dict_table_get_first_index(table);
+
+	if (index == NULL) {
+		/* Table definition is corrupt */
+		return(DB_CORRUPTION);
+	}
+
+	dict_stats_analyze_index(index);
+
+	table->stat_n_rows
+		= index->stat_n_diff_key_vals[dict_index_get_n_unique(index)];
+
+	table->stat_clustered_index_size = index->stat_index_size;
+
+	/* analyze other indexes from the table, if any */
+
+	table->stat_sum_of_other_index_sizes = 0;
+
+	for (index = dict_table_get_next_index(index);
+	     index != NULL;
+	     index = dict_table_get_next_index(index)) {
+
+		dict_stats_analyze_index(index);
+
+		table->stat_sum_of_other_index_sizes += index->stat_index_size;
+	}
+
+	table->stat_initialized = TRUE;
+
+	table->stat_modified_counter = 0;
+
+	return(DB_SUCCESS);
+}
+/* @} */
+
+/*********************************************************************//**
+Save an individual index's statistic into the persistent storage.
+dict_stats_save_index_stat() @{
+@return DB_SUCCESS or error code */
+static
+enum db_err
+dict_stats_save_index_stat(
+/*=======================*/
+	dict_index_t*	index,		/*!< in: index */
+	lint		stat_timestamp,	/*!< in: timestamp of the stat */
+	const char*	stat_name,	/*!< in: name of the stat */
+	ib_uint64_t	stat_value,	/*!< in: value of the stat */
+	ib_uint64_t*	sample_size,	/*!< in: n pages sampled or NULL */
+	const char*	stat_description,/*!< in: description of the stat */
+	trx_t*		trx)		/*!< in/out: transaction to use */
+{
+	pars_info_t*	pinfo;
+	ulint		ret;
+
+	pinfo = pars_info_create();
+
+	/* we do this because the slash in INDEX_STATS_NAME confuses the
+	parser if used directly inside the SQL */
+	pars_info_add_id(pinfo, "index_stats", INDEX_STATS_NAME);
+
+	pars_info_add_literal(pinfo, "database_name", index->table->name,
+			      dict_get_db_name_len(index->table->name),
+			      DATA_VARCHAR, 0);
+
+	pars_info_add_str_literal(pinfo, "table_name",
+				  dict_remove_db_name(index->table->name));
+
+	pars_info_add_str_literal(pinfo, "index_name", index->name);
+
+	pars_info_add_int4_literal(pinfo, "stat_timestamp", stat_timestamp);
+
+	pars_info_add_str_literal(pinfo, "stat_name", stat_name);
+
+	pars_info_add_uint64_literal(pinfo, "stat_value", stat_value);
+
+	if (sample_size != NULL) {
+		pars_info_add_uint64_literal(pinfo, "sample_size", *sample_size);
+	} else {
+		pars_info_add_literal(pinfo, "sample_size", NULL,
+				      UNIV_SQL_NULL, DATA_FIXBINARY, 0);
+	}
+
+	pars_info_add_str_literal(pinfo, "stat_description",
+				  stat_description);
+
+	ret = que_eval_sql(pinfo,
+			   "PROCEDURE INDEX_STATS_SAVE () IS\n"
+			   "dummy CHAR;\n"
+			   "BEGIN\n"
+
+			   "SELECT database_name INTO dummy\n"
+			   "FROM $index_stats\n"
+			   "WHERE\n"
+			   "database_name = :database_name AND\n"
+			   "table_name = :table_name AND\n"
+			   "index_name = :index_name AND\n"
+			   "stat_name = :stat_name\n"
+			   "FOR UPDATE;\n"
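+			   /* the SELECT above probes for an existing row;
+			   SQL % NOTFOUND below decides between INSERT of
+			   a new row and UPDATE of the existing one */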
+
+			   "IF (SQL % NOTFOUND) THEN\n"
+			   "  INSERT INTO $index_stats\n"
+			   "  VALUES\n"
+			   "  (\n"
+			   "  :database_name,\n"
+			   "  :table_name,\n"
+			   "  :index_name,\n"
+			   "  :stat_timestamp,\n"
+			   "  :stat_name,\n"
+			   "  :stat_value,\n"
+			   "  :sample_size,\n"
+			   "  :stat_description\n"
+			   "  );\n"
+			   "ELSE\n"
+			   "  UPDATE $index_stats SET\n"
+			   "  stat_value = :stat_value,\n"
+			   "  sample_size = :sample_size,\n"
+			   "  stat_description = :stat_description\n"
+			   "  WHERE\n"
+			   "  database_name = :database_name AND\n"
+			   "  table_name = :table_name AND\n"
+			   "  index_name = :index_name AND\n"
+			   "  stat_name = :stat_name;\n"
+			   "END IF;\n"
+			   "END;",
+		TRUE, trx);
+
+	/* pinfo is freed by que_eval_sql() */
+
+	if (ret != DB_SUCCESS) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: Error while trying to save index "
+			"statistics for table %s, index %s, "
+			"stat name %s: %s\n",
+			index->table->name, index->name,
+			stat_name, ut_strerr(ret));
+	}
+
+	return(ret);
+}
+/* @} */
+
+/*********************************************************************//**
+Save the table's statistics into the persistent storage.
+dict_stats_save() @{
+@return DB_SUCCESS or error code */
+static
+enum db_err
+dict_stats_save(
+/*============*/
+	dict_table_t*	table)		/*!< in: table */
+{
+	trx_t*		trx;
+	pars_info_t*	pinfo;
+	dict_index_t*	index;
+	lint		now;
+	ulint		ret;
+
+	/* MySQL's TIMESTAMP is 4 bytes, so we use
+	pars_info_add_int4_literal(), which takes a lint arg; hence "now"
+	is declared as lint */
+	now = (lint) ut_time();
+
+	mutex_enter(&kernel_mutex);
+	trx = trx_create(trx_dummy_sess);
+	mutex_exit(&kernel_mutex);
+
+	trx->op_info = "";
+	trx->isolation_level = TRX_ISO_READ_UNCOMMITTED;
+	trx_start(trx, ULINT_UNDEFINED);
+
+	pinfo = pars_info_create();
+
+	/* we do this because the slash in TABLE_STATS_NAME confuses
+	the parser if used directly inside the SQL */
+	pars_info_add_id(pinfo, "table_stats", TABLE_STATS_NAME);
+
+	pars_info_add_literal(pinfo, "database_name", table->name,
+			      dict_get_db_name_len(table->name),
+			      DATA_VARCHAR, 0);
+
+	pars_info_add_str_literal(pinfo, "table_name",
+				  dict_remove_db_name(table->name));
+
+	pars_info_add_int4_literal(pinfo, "stats_timestamp", now);
+
+	pars_info_add_uint64_literal(pinfo, "n_rows", table->stat_n_rows);
+
+	pars_info_add_uint64_literal(pinfo, "clustered_index_size",
+				     table->stat_clustered_index_size);
+
+	pars_info_add_uint64_literal(pinfo, "sum_of_other_index_sizes",
+				     table->stat_sum_of_other_index_sizes);
+
+	ret = que_eval_sql(pinfo,
+			   "PROCEDURE TABLE_STATS_SAVE () IS\n"
+			   "dummy CHAR;\n"
+			   "BEGIN\n"
+
+			   "SELECT database_name INTO dummy\n"
+			   "FROM $table_stats\n"
+			   "WHERE\n"
+			   "database_name = :database_name AND\n"
+			   "table_name = :table_name\n"
+			   "FOR UPDATE;\n"
+
+			   "IF (SQL % NOTFOUND) THEN\n"
+			   "  INSERT INTO $table_stats\n"
+			   "  VALUES\n"
+			   "  (\n"
+			   "  :database_name,\n"
+			   "  :table_name,\n"
+			   "  :stats_timestamp,\n"
+			   "  :n_rows,\n"
+			   "  :clustered_index_size,\n"
+			   "  :sum_of_other_index_sizes\n"
+			   "  );\n"
+			   "ELSE\n"
+			   "  UPDATE $table_stats SET\n"
+			   "  stats_timestamp = :stats_timestamp,\n"
+			   "  n_rows = :n_rows,\n"
+			   "  clustered_index_size = :clustered_index_size,\n"
+			   "  sum_of_other_index_sizes = :sum_of_other_index_sizes\n"
+			   "  WHERE\n"
+			   "  database_name = :database_name AND\n"
+			   "  table_name = :table_name;\n"
+			   "END IF;\n"
+			   "END;",
+			   TRUE, trx);
+
+	/* pinfo is freed by que_eval_sql() */
+
+	if (ret != DB_SUCCESS) {
+
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: Error while trying to save table "
+			"statistics for table %s: %s\n",
+			table->name, ut_strerr(ret));
+
+		goto end_rollback;
+	}
+
+	for (index = dict_table_get_first_index(table);
+	     index != NULL;
+	     index = dict_table_get_next_index(index)) {
+
+		ulint	i;
+
+		ret = dict_stats_save_index_stat(index, now, "size",
+						 index->stat_index_size,
+						 NULL,
+						 "Number of pages "
+						 "in the index",
+						 trx);
+		if (ret != DB_SUCCESS) {
+			goto end_rollback;
+		}
+
+		ret = dict_stats_save_index_stat(index, now, "n_leaf_pages",
+						 index->stat_n_leaf_pages,
+						 NULL,
+						 "Number of leaf pages "
+						 "in the index",
+						 trx);
+		if (ret != DB_SUCCESS) {
+			goto end_rollback;
+		}
+
+		for (i = 1; i <= dict_index_get_n_unique(index); i++) {
+
+			char	stat_name[16];
+			char	stat_description[1024];
+			ulint	j;
+
+			ut_snprintf(stat_name, sizeof(stat_name),
+				    "n_diff_pfx%02lu", i);
+
+			/* craft a string that contains the column names */
+			ut_snprintf(stat_description,
+				    sizeof(stat_description),
+				    "%s", index->fields[0].name);
+			for (j = 2; j <= i; j++) {
+				size_t	len;
+
+				len = strlen(stat_description);
+
+				ut_snprintf(stat_description + len,
+					    sizeof(stat_description) - len,
+					    ",%s", index->fields[j - 1].name);
+			}
+
+			ret = dict_stats_save_index_stat(
+				index, now, stat_name,
+				index->stat_n_diff_key_vals[i],
+				&index->stat_n_sample_sizes[i],
+				stat_description, trx);
+
+			if (ret != DB_SUCCESS) {
+				goto end_rollback;
+			}
+		}
+	}
+
+	goto end_commit;
+
+end_rollback:
+
+	trx_rollback_for_mysql(trx);
+	goto end_free;
+
+end_commit:
+
+	trx_commit_for_mysql(trx);
+	ret = DB_SUCCESS;
+
+end_free:
+
+	mutex_enter(&kernel_mutex);
+	trx_free(trx);
+	mutex_exit(&kernel_mutex);
+
+	return(ret);
+}
+/* @} */
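
For illustration, here is a minimal standalone sketch (an annotation for
this mail, not part of the patch) of how the loop above crafts the
"n_diff_pfxNN" stat names and their comma-separated column-list
descriptions, for a hypothetical 3-column index on (a, b, c); plain
snprintf() stands in for ut_snprintf():

	#include <stdio.h>
	#include <string.h>

	int
	main(void)
	{
		const char*	fields[] = {"a", "b", "c"};
		unsigned long	n_uniq = 3;
		unsigned long	i;

		for (i = 1; i <= n_uniq; i++) {
			char		stat_name[16];
			char		stat_description[1024];
			unsigned long	j;

			snprintf(stat_name, sizeof(stat_name),
				 "n_diff_pfx%02lu", i);

			/* start with the first column name, then append
			",col" for each additional column in the prefix */
			snprintf(stat_description, sizeof(stat_description),
				 "%s", fields[0]);

			for (j = 2; j <= i; j++) {
				size_t	len = strlen(stat_description);

				snprintf(stat_description + len,
					 sizeof(stat_description) - len,
					 ",%s", fields[j - 1]);
			}

			/* prints: n_diff_pfx01 a
				   n_diff_pfx02 a,b
				   n_diff_pfx03 a,b,c */
			printf("%s\t%s\n", stat_name, stat_description);
		}

		return(0);
	}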
+
+/*********************************************************************//**
+Called for the row that is selected by
+SELECT ... FROM innodb.table_stats WHERE table='...'
+The second argument is a pointer to the table and the fetched stats are
+written to it.
+dict_stats_fetch_table_stats_step() @{
+@return non-NULL dummy */
+static
+void*
+dict_stats_fetch_table_stats_step(
+/*==============================*/
+	void*	node_void,	/*!< in: select node */
+	void*	table_void)	/*!< out: table */
+{
+	sel_node_t*	node = (sel_node_t*) node_void;
+	dict_table_t*	table = (dict_table_t*) table_void;
+	que_common_t*	cnode;
+	int		i;
+
+	/* this should loop exactly 3 times - for
+	n_rows,clustered_index_size,sum_of_other_index_sizes */
+	for (cnode = node->select_list, i = 0;
+	     cnode != NULL;
+	     cnode = que_node_get_next(cnode), i++) {
+
+		dfield_t*	dfield = que_node_get_val(cnode);
+		dtype_t*	type = dfield_get_type(dfield);
+		ulint		len = dfield_get_len(dfield);
+		void*		data = dfield_get_data(dfield);
+
+		switch (i) {
+		case 0: /* innodb.table_stats.n_rows */
+
+			ut_a(dtype_get_mtype(type) == DATA_INT);
+			ut_a(len == 8);
+
+			table->stat_n_rows = mach_read_ull(data);
+
+			break;
+
+		case 1: /* innodb.table_stats.clustered_index_size */
+
+			ut_a(dtype_get_mtype(type) == DATA_INT);
+			ut_a(len == 8);
+
+			table->stat_clustered_index_size
+				= (ulint) mach_read_ull(data);
+
+			break;
+
+		case 2: /* innodb.table_stats.sum_of_other_index_sizes */
+
+			ut_a(dtype_get_mtype(type) == DATA_INT);
+			ut_a(len == 8);
+
+			table->stat_sum_of_other_index_sizes
+				= (ulint) mach_read_ull(data);
+
+			break;
+
+		default:
+
+			/* someone changed SELECT
+			n_rows,clustered_index_size,sum_of_other_index_sizes
+			to select more columns from table_stats without
+			adjusting here */
+			ut_error;
+		}
+	}
+
+	/* if i < 3 this means someone changed the
+	SELECT n_rows,clustered_index_size,sum_of_other_index_sizes
+	to select fewer columns from table_stats without adjusting here;
+	if i > 3 we would have ut_error'ed earlier */
+	ut_a(i == 3 /* n_rows,clustered_index_size,sum_of_other_index_sizes */);
+
+	/* XXX this is not used but returning non-NULL is necessary */
+	return((void*) 1);
+}
+/* @} */
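
The asserts above rely on the stats columns arriving as 8-byte InnoDB
integers. As an aside (not part of the patch), mach_read_ull() is assumed
here to decode the usual InnoDB most-significant-byte-first format; a
minimal standalone sketch of such a decoder:

	#include <stdio.h>

	/* decode an 8-byte most-significant-byte-first integer, the
	format InnoDB uses for its on-disk integer columns */
	static unsigned long long
	read_uint64_msb_first(const unsigned char* b)
	{
		unsigned long long	v = 0;
		int			i;

		for (i = 0; i < 8; i++) {
			v = (v << 8) | b[i];
		}

		return(v);
	}

	int
	main(void)
	{
		/* 111 stored big-endian in 8 bytes */
		const unsigned char	buf[8] = {0, 0, 0, 0, 0, 0, 0, 111};

		printf("%llu\n", read_uint64_msb_first(buf)); /* 111 */

		return(0);
	}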
+
+/*********************************************************************//**
+Called for the rows that are selected by
+SELECT ... FROM innodb.index_stats WHERE table='...'
+The second argument is a pointer to the table and the fetched stats are
+written to its indexes.
+Let a table have N indexes and let index i have Ui unique columns; then
+innodb.index_stats will have Ui + 2 rows for index i (its n_diff_pfx
+stats plus 'size' and 'n_leaf_pages'), so this function will be called
+SUM(Ui + 2) times for that table. Each call searches table->indexes
+linearly for the currently fetched index, since this list is not
+sorted. Thus, overall, fetching all indexes' stats from
+innodb.index_stats is O(N^2) in the number of indexes. This could be
+improved by sorting table->indexes into a temporary area just once and
+then searching that sorted list, bringing the complexity down to
+O(N*log(N)); a sketch of that approach follows this function. We assume
+a table will not have more than 100 indexes, so we go with the simpler
+O(N^2) algorithm.
+dict_stats_fetch_index_stats_step() @{
+@return non-NULL dummy */
+static
+void*
+dict_stats_fetch_index_stats_step(
+/*==============================*/
+	void*	node_void,	/*!< in: select node */
+	void*	table_void)	/*!< out: table */
+{
+	sel_node_t*	node = (sel_node_t*) node_void;
+	dict_table_t*	table = (dict_table_t*) table_void;
+	dict_index_t*	index = NULL;
+	que_common_t*	cnode;
+	const char*	stat_name = NULL;
+	ulint		stat_name_len = (ulint) -1;
+	ib_uint64_t	stat_value = (ib_uint64_t) -1;
+	ib_uint64_t	sample_size = (ib_uint64_t) -1;
+	int		i;
+
+	/* this should loop exactly 4 times - for the columns that
+	were selected: index_name,stat_name,stat_value,sample_size */
+	for (cnode = node->select_list, i = 0;
+	     cnode != NULL;
+	     cnode = que_node_get_next(cnode), i++) {
+
+		dfield_t*	dfield = que_node_get_val(cnode);
+		dtype_t*	type = dfield_get_type(dfield);
+		ulint		len = dfield_get_len(dfield);
+		void*		data = dfield_get_data(dfield);
+
+		switch (i) {
+		case 0: /* innodb.index_stats.index_name */
+
+			ut_a(dtype_get_mtype(type) == DATA_VARCHAR);
+
+			/* search for index in table's indexes whose name
+			matches data; the fetched index name is in data,
+			has no terminating '\0' and has length len */
+			for (index = dict_table_get_first_index(table);
+			     index != NULL;
+			     index = dict_table_get_next_index(index)) {
+
+				if (strncasecmp(index->name, data, len) == 0) {
+					/* the corresponding index was found */
+					break;
+				}
+			}
+
+			/* if index is NULL here this means that
+			innodb.index_stats contains more rows than the number
+			of indexes in the table; this is ok, we just return
+			ignoring those extra rows; in other words
+			dict_stats_fetch_index_stats_step() has been called
+			for a row from index_stats with unknown index_name
+			column */
+			if (index == NULL) {
+
+				return((void*) 1);
+			}
+
+			break;
+
+		case 1: /* innodb.index_stats.stat_name */
+
+			ut_a(dtype_get_mtype(type) == DATA_VARCHAR);
+
+			ut_a(index != NULL);
+
+			stat_name = (const char*) data;
+			stat_name_len = len;
+
+			break;
+
+		case 2: /* innodb.index_stats.stat_value */
+
+			ut_a(dtype_get_mtype(type) == DATA_INT);
+			ut_a(len == 8);
+
+			ut_a(index != NULL);
+			ut_a(stat_name != NULL);
+			ut_a(stat_name_len != (ulint) -1);
+
+			stat_value = (ib_uint64_t) mach_read_ull(data);
+
+			break;
+
+		case 3: /* innodb.index_stats.sample_size */
+
+			ut_a(dtype_get_mtype(type) == DATA_INT);
+			ut_a(len == 8 || len == UNIV_SQL_NULL);
+
+			ut_a(index != NULL);
+			ut_a(stat_name != NULL);
+			ut_a(stat_name_len != (ulint) -1);
+			ut_a(stat_value != (ib_uint64_t) -1);
+
+			if (len == UNIV_SQL_NULL) {
+				break;
+			}
+			/* else */
+
+			sample_size = (ib_uint64_t) mach_read_ull(data);
+
+			break;
+
+		default:
+
+			/* someone changed
+			SELECT index_name,stat_name,stat_value,sample_size
+			to select more columns from index_stats without
+			adjusting here */
+			ut_error;
+		}
+	}
+
+	/* if i < 4 this means someone changed the
+	SELECT index_name,stat_name,stat_value,sample_size
+	to select fewer columns from index_stats without adjusting here;
+	if i > 4 we would have ut_error'ed earlier */
+	ut_a(i == 4 /* index_name,stat_name,stat_value,sample_size */);
+
+	ut_a(index != NULL);
+	ut_a(stat_name != NULL);
+	ut_a(stat_name_len != (ulint) -1);
+	ut_a(stat_value != (ib_uint64_t) -1);
+	/* sample_size could be (ib_uint64_t) -1 here, if it is NULL */
+
+#define PFX	"n_diff_pfx"
+
+	if (strncasecmp("size", stat_name, stat_name_len) == 0) {
+		index->stat_index_size = stat_value;
+	} else if (strncasecmp("n_leaf_pages", stat_name, stat_name_len) == 0) {
+		index->stat_n_leaf_pages = stat_value;
+	} else if (strncasecmp(PFX, stat_name,
+			       ut_min(strlen(PFX), stat_name_len)) == 0) {
+
+		const char*	num_ptr;
+		unsigned long	n_pfx;
+
+		/* point num_ptr at the "12" in "n_diff_pfx12..." */
+		num_ptr = stat_name + strlen(PFX);
+
+		/* stat_name should have exactly 2 chars appended to PFX
+		and they should be digits */
+		if (stat_name_len != strlen(PFX) + 2
+		    || num_ptr[0] < '0' || num_ptr[0] > '9'
+		    || num_ptr[1] < '0' || num_ptr[1] > '9') {
+
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				" InnoDB: Ignoring strange row from "
+				"innodb.index_stats WHERE "
+				"database_name = '%.*s' AND "
+				"table_name = '%s' AND "
+				"index_name = '%s' AND "
+				"stat_name = '%.*s'; because stat_name "
+				"is malformed\n",
+				(int) dict_get_db_name_len(table->name),
+				table->name,
+				dict_remove_db_name(table->name),
+				index->name,
+				(int) stat_name_len,
+				stat_name);
+			return((void*) 1);
+		}
+		/* else */
+
+		/* extract the 12 from "n_diff_pfx12..." into n_pfx;
+		note that stat_name does not have a terminating '\0' */
+		n_pfx = (num_ptr[0] - '0') * 10 + (num_ptr[1] - '0');
+
+		if (n_pfx == 0 || n_pfx > dict_index_get_n_unique(index)) {
+
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				" InnoDB: Ignoring strange row from "
+				"innodb.index_stats WHERE "
+				"database_name = '%.*s' AND "
+				"table_name = '%s' AND "
+				"index_name = '%s' AND "
+				"stat_name = '%.*s'; because stat_name is "
+				"out of range, the index has %lu unique "
+				"columns\n",
+				(int) dict_get_db_name_len(table->name),
+				table->name,
+				dict_remove_db_name(table->name),
+				index->name,
+				(int) stat_name_len,
+				stat_name,
+				dict_index_get_n_unique(index));
+			return((void*) 1);
+		}
+		/* else */
+
+		index->stat_n_diff_key_vals[n_pfx] = stat_value;
+
+		if (sample_size != (ib_uint64_t) -1) {
+			index->stat_n_sample_sizes[n_pfx] = sample_size;
+		} else {
+			/* strange: the user must have manually UPDATEd
+			the table and SET sample_size = NULL */
+			index->stat_n_sample_sizes[n_pfx] = 0;
+		}
+	} else {
+		/* silently ignore rows with unknown stat_name, the
+		user may have developed her own stats */
+	}
+
+	/* XXX this is not used but returning non-NULL is necessary */
+	return((void*) 1);
+}
+/* @} */
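
As the comment above this function notes, the linear search could be
replaced by sorting the index list once into a scratch array and then
binary-searching it for each fetched row. A minimal standalone sketch of
that alternative (not part of the patch; the names here are made up, and
a real version would use strncasecmp() with an explicit length because
the fetched index names are not NUL-terminated):

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	typedef struct {
		const char*	name;	/* index name */
		void*		index;	/* would be a dict_index_t* */
	} index_ref_t;

	static int
	index_ref_cmp(const void* a, const void* b)
	{
		return(strcmp(((const index_ref_t*) a)->name,
			      ((const index_ref_t*) b)->name));
	}

	int
	main(void)
	{
		/* scratch array, built once from table->indexes */
		index_ref_t	refs[] = {
			{"tidx2", NULL},
			{"PRIMARY", NULL},
			{"tidx1", NULL},
		};
		size_t		n = sizeof(refs) / sizeof(refs[0]);
		index_ref_t	key = {"tidx1", NULL};
		index_ref_t*	found;

		/* O(N*log(N)), done once per table */
		qsort(refs, n, sizeof(index_ref_t), index_ref_cmp);

		/* O(log(N)), done once per fetched stats row */
		found = (index_ref_t*) bsearch(&key, refs, n,
					       sizeof(index_ref_t),
					       index_ref_cmp);

		printf("%s\n", found != NULL ? found->name : "not found");

		return(0);
	}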
+
+/*********************************************************************//**
+Read table's statistics from the persistent storage.
+dict_stats_fetch_from_ps() @{
+@return DB_SUCCESS or error code */
+static
+enum db_err
+dict_stats_fetch_from_ps(
+/*=====================*/
+	dict_table_t*	table) /*!< in/out: table */
+{
+	trx_t*		trx;
+	pars_info_t*	pinfo;
+	ulint		ret;
+
+	mutex_enter(&kernel_mutex);
+	trx = trx_create(trx_dummy_sess);
+	mutex_exit(&kernel_mutex);
+
+	trx->op_info = "";
+	trx->isolation_level = TRX_ISO_READ_UNCOMMITTED;
+	trx_start(trx, ULINT_UNDEFINED);
+
+	pinfo = pars_info_create();
+
+	/* we do this because the slash in TABLE_STATS_NAME confuses
+	the parser if used directly inside the SQL */
+	pars_info_add_id(pinfo, "table_stats", TABLE_STATS_NAME);
+	pars_info_add_id(pinfo, "index_stats", INDEX_STATS_NAME);
+
+	pars_info_add_literal(pinfo, "database_name", table->name,
+			      dict_get_db_name_len(table->name),
+			      DATA_VARCHAR, 0);
+
+	pars_info_add_str_literal(pinfo, "table_name",
+				  dict_remove_db_name(table->name));
+
+	pars_info_add_function(pinfo,
+			       "fetch_table_stats_step",
+			       dict_stats_fetch_table_stats_step,
+			       table);
+
+	pars_info_add_function(pinfo,
+			       "fetch_index_stats_step",
+			       dict_stats_fetch_index_stats_step,
+			       table);
+
+	ret = que_eval_sql(pinfo,
+			   "PROCEDURE FETCH_STATS () IS\n"
+			   "found INT;\n"
+			   "DECLARE FUNCTION fetch_table_stats_step;\n"
+			   "DECLARE FUNCTION fetch_index_stats_step;\n"
+			   "DECLARE CURSOR table_stats_cur IS\n"
+			   "  SELECT\n"
+			   /* if you change the selected fields, be
+			   sure to adjust
+			   dict_stats_fetch_table_stats_step() */
+			   "  n_rows,\n"
+			   "  clustered_index_size,\n"
+			   "  sum_of_other_index_sizes\n"
+			   "  FROM $table_stats\n"
+			   "  WHERE\n"
+			   "  database_name = :database_name AND\n"
+			   "  table_name = :table_name;\n"
+			   "DECLARE CURSOR index_stats_cur IS\n"
+			   "  SELECT\n"
+			   /* if you change the selected fields, be
+			   sure to adjust
+			   dict_stats_fetch_index_stats_step() */
+			   "  index_name,\n"
+			   "  stat_name,\n"
+			   "  stat_value,\n"
+			   "  sample_size\n"
+			   "  FROM $index_stats\n"
+			   "  WHERE\n"
+			   "  database_name = :database_name AND\n"
+			   "  table_name = :table_name;\n"
+
+			   "BEGIN\n"
+
+			   "OPEN table_stats_cur;\n"
+			   "FETCH table_stats_cur INTO\n"
+			   "  fetch_table_stats_step();\n"
+			   "IF (SQL % NOTFOUND) THEN\n"
+			   "  CLOSE table_stats_cur;\n"
+			   "  RETURN;\n"
+			   "END IF;\n"
+			   "CLOSE table_stats_cur;\n"
+
+			   "OPEN index_stats_cur;\n"
+			   "found := 1;\n"
+			   "WHILE found = 1 LOOP\n"
+			   "  FETCH index_stats_cur INTO\n"
+			   "    fetch_index_stats_step();\n"
+			   "  IF (SQL % NOTFOUND) THEN\n"
+			   "    found := 0;\n"
+			   "  END IF;\n"
+			   "END LOOP;\n"
+			   "CLOSE index_stats_cur;\n"
+
+			   "END;",
+			   TRUE, trx);
+
+	/* pinfo is freed by que_eval_sql() */
+
+	/* XXX If innodb.index_stats contains fewer rows than the number
+	of indexes in the table, then some of the indexes of the table
+	were left uninitialized. Currently this is ignored and those
+	indexes are left with uninitialized stats until ANALYZE TABLE is
+	run. This condition happens when the user creates a new index
+	on a table. We could return DB_STATS_DO_NOT_EXIST from here,
+	forcing the usage of transient stats until innodb.index_stats
+	is complete. */
+
+	trx_commit_for_mysql(trx);
+
+	mutex_enter(&kernel_mutex);
+	trx_free(trx);
+	mutex_exit(&kernel_mutex);
+
+	return(ret);
+}
+/* @} */
+
+/*********************************************************************//**
+Calculates new estimates for table and index statistics. The statistics
+are used in query optimization.
+dict_stats_update() @{
+@return DB_* error code or DB_SUCCESS */
+UNIV_INTERN
+enum db_err
+dict_stats_update(
+/*==============*/
+	dict_table_t*		table,	/*!< in/out: table */
+	enum dict_stats_upd_how	stats_upd_how)
+					/*!< in: whether to (re)calc
+					the stats or to fetch them from
+					the persistent storage */
+{
+	enum db_err	ret;
+
+	if (table->ibd_file_missing) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			"  InnoDB: cannot calculate statistics for table %s\n"
+			"InnoDB: because the .ibd file is missing.  For help,"
+			" please refer to\n"
+			"InnoDB: " REFMAN "innodb-troubleshooting.html\n",
+			table->name);
+
+		return(DB_TABLESPACE_DELETED);
+	}
+
+	/* If we have set a high innodb_force_recovery level, do not calculate
+	statistics, as a badly corrupted index can cause a crash in it. */
+
+	if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE) {
+
+		return(DB_SUCCESS);
+	}
+
+	switch (stats_upd_how) {
+	case DICT_STATS_UPD_RECALC_PERSISTENT_VERBOSE:
+	case DICT_STATS_UPD_RECALC_PERSISTENT_SILENT:
+		/* Persistent recalculation requested, probably called from
+		ANALYZE TABLE */
+
+		/* check if the persistent storage exists before calling
+		the potentially slow function
+		dict_stats_update_persistent(); its existence is also a
+		prerequisite for dict_stats_save() succeeding */
+		if (dict_stats_persistent_storage_check()) {
+
+			ret = dict_stats_update_persistent(table);
+
+			if (ret == DB_SUCCESS) {
+				ret = dict_stats_save(table);
+			}
+
+		} else {
+			/* Fall back to transient stats since the persistent
+			storage is not present or is corrupted */
+
+			if (stats_upd_how
+			    == DICT_STATS_UPD_RECALC_PERSISTENT_VERBOSE) {
+
+				ut_print_timestamp(stderr);
+				/* XXX add link to the doc about storage
+				creation */
+				fprintf(stderr,
+					" InnoDB: Recalculation of persistent "
+					"statistics requested but the required "
+					"persistent storage is not present "
+					"or is corrupted. Using quick transient "
+					"stats instead.\n");
+			}
+
+			dict_stats_update_transient(table);
+
+			ret = DB_SUCCESS;
+		}
+
+		break;
+
+	case DICT_STATS_UPD_RECALC_TRANSIENT:
+
+		dict_stats_update_transient(table);
+		ret = DB_SUCCESS;
+
+		break;
+
+	case DICT_STATS_UPD_FETCH:
+		/* fetch requested; either fetch from the persistent
+		storage or fall back to the transient method */
+
+		if (dict_stats_persistent_storage_check()) {
+
+			ret = dict_stats_fetch_from_ps(table);
+		} else {
+
+			/* if persistent storage does not exist, then
+			force the following code to calculate the
+			transient stats */
+			ret = DB_STATS_DO_NOT_EXIST;
+		}
+
+		if (ret == DB_STATS_DO_NOT_EXIST) {
+
+			/* either the persistent storage does not exist
+			or stats for this particular table do not exist;
+			calculate the quick transient statistics instead */
+			dict_stats_update_transient(table);
+			ret = DB_SUCCESS;
+		}
+		/* else either success or some other failure, return
+		ret whatever it is */
+
+		break;
+
+	/* no "default:" in order to produce a compilation warning
+	about unhandled enumeration value */
+	}
+
+	return(ret);
+}
+/* @} */
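
For reference, a minimal sketch (not part of the patch) of how a caller
picks stats_upd_how, mirroring the ha_innobase::analyze() change further
below; "persistent" is a hypothetical flag standing in for the
analyze_is_persistent THDVAR:

	enum db_err
	recalc_stats(dict_table_t* table, ibool persistent)
	{
		enum dict_stats_upd_how	upd_how;

		/* a persistent recalc falls back to transient stats by
		itself (with a warning) if the storage is missing */
		upd_how = persistent
			? DICT_STATS_UPD_RECALC_PERSISTENT_VERBOSE
			: DICT_STATS_UPD_RECALC_TRANSIENT;

		return(dict_stats_update(table, upd_how));
	}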
+
+/*********************************************************************//**
+Removes the information for a particular index from the persistent
+storage if it exists and if there is data stored for this index.
+The transaction is not committed, it must not be committed in this
+function because this is the user trx that is running DROP INDEX.
+The transaction will be committed at the very end when dropping an
+index.
+A note from Marko on why we cannot edit user and sys_* tables in one trx:
+marko: The problem is that ibuf merges should be disabled while we are
+rolling back dict transactions.
+marko: If ibuf merges are not disabled, we need to scan the *.ibd files.
+But we shouldn't open *.ibd files before we have rolled back dict
+transactions and opened the SYS_* records for the *.ibd files.
+dict_stats_drop_index() @{ */
+UNIV_INTERN
+void
+dict_stats_drop_index(
+/*==================*/
+	dict_index_t*	index,	/*!< in: index */
+	trx_t*		trx)	/*!< in: transaction to use */
+{
+	pars_info_t*	pinfo;
+	ulint		ret;
+
+	/* skip indexes whose table names do not contain a database name
+	e.g. if we are dropping an index from SYS_TABLES */
+	if (strchr(index->table_name, '/') == NULL) {
+
+		return;
+	}
+
+	/* If the persistent storage does not exist or is corrupted,
+	then do not attempt to DELETE from its tables because the
+	internal parser will crash.
+	There is a small chance that the PS gets dropped after we
+	have checked that it is present and then the internal parser
+	will crash. We are ok with this because we do not want to lock
+	the data dictionary for the whole operation that includes DELETE
+	from user-visible tables and could continue for a very long
+	period if the user has locks on that table.
+It is also harmless if the PS gets created after the check below has
+returned false. At worst, some orphaned rows will be left in the PS. */
+	if (!dict_stats_persistent_storage_check()) {
+
+		return;
+	}
+
+	pinfo = pars_info_create();
+
+	/* we do this because the slash in INDEX_STATS_NAME confuses
+	the parser if used directly inside the SQL */
+	pars_info_add_id(pinfo, "index_stats", INDEX_STATS_NAME);
+
+	pars_info_add_literal(pinfo, "database_name", index->table_name,
+			      dict_get_db_name_len(index->table_name),
+			      DATA_VARCHAR, 0);
+
+	pars_info_add_str_literal(pinfo, "table_name",
+				  dict_remove_db_name(index->table_name));
+
+	pars_info_add_str_literal(pinfo, "index_name", index->name);
+
+	ret = que_eval_sql(pinfo,
+			   "PROCEDURE DROP_INDEX_STATS () IS\n"
+			   "BEGIN\n"
+			   "DELETE FROM $index_stats WHERE\n"
+			   "database_name = :database_name AND\n"
+			   "table_name = :table_name AND\n"
+			   "index_name = :index_name;\n"
+			   "END;\n",
+			   TRUE,
+			   trx);
+
+	/* pinfo is freed by que_eval_sql() */
+
+	/* do not commit here; see this function's comment */
+
+	ut_a(ret == DB_SUCCESS);
+}
+/* @} */
+
+/*********************************************************************//**
+Removes the statistics for a table and all of its indexes from the
+persistent storage if it exists and if there is data stored for the table.
+This function creates its own transaction and commits it.
+dict_stats_drop_table() @{ */
+UNIV_INTERN
+void
+dict_stats_drop_table(
+/*==================*/
+	const char*	table_name)	/*!< in: table name */
+{
+	trx_t*		trx;
+	pars_info_t*	pinfo;
+	ulint		ret;
+
+	/* skip tables that do not contain a database name
+	e.g. if we are dropping SYS_TABLES */
+	if (strchr(table_name, '/') == NULL) {
+
+		return;
+	}
+
+	mutex_enter(&kernel_mutex);
+	trx = trx_create(trx_dummy_sess);
+	mutex_exit(&kernel_mutex);
+
+	trx->op_info = "";
+	trx->isolation_level = TRX_ISO_READ_UNCOMMITTED;
+	trx_start(trx, ULINT_UNDEFINED);
+
+	/* Cannot continue without locking the stats tables
+	because the SQL parser will crash if they disappear after
+	dict_stats_persistent_storage_check(). The tables will be
+	unlocked when the transaction is committed. */
+
+	/* XXX those locks still do not prevent the table from being dropped */
+
+	if (lock_table_by_name(TABLE_STATS_NAME, LOCK_X, trx) != DB_SUCCESS) {
+
+		goto commit_and_return;
+	}
+
+	if (lock_table_by_name(INDEX_STATS_NAME, LOCK_X, trx) != DB_SUCCESS) {
+
+		/* the commit will unlock table_stats */
+		goto commit_and_return;
+	}
+
+	/* If the persistent storage does not exist or is corrupted,
+	then do not attempt to DELETE from its tables because the
+	internal SQL parser will crash. */
+	if (!dict_stats_persistent_storage_check()) {
+
+		/* the commit will unlock the stats tables */
+		goto commit_and_return;
+	}
+
+	pinfo = pars_info_create();
+
+	/* we do this because the slash in TABLE_STATS_NAME confuses
+	the parser if used directly inside the SQL */
+	pars_info_add_id(pinfo, "table_stats", TABLE_STATS_NAME);
+	pars_info_add_id(pinfo, "index_stats", INDEX_STATS_NAME);
+
+	pars_info_add_literal(pinfo, "database_name", table_name,
+			      dict_get_db_name_len(table_name),
+			      DATA_VARCHAR, 0);
+
+	pars_info_add_str_literal(pinfo, "table_name",
+				  dict_remove_db_name(table_name));
+
+	ret = que_eval_sql(pinfo,
+			   "PROCEDURE DROP_TABLE_STATS () IS\n"
+			   "BEGIN\n"
+
+			   "DELETE FROM $index_stats WHERE\n"
+			   "database_name = :database_name AND\n"
+			   "table_name = :table_name;\n"
+
+			   "DELETE FROM $table_stats WHERE\n"
+			   "database_name = :database_name AND\n"
+			   "table_name = :table_name;\n"
+
+			   "END;\n",
+			   TRUE,
+			   trx);
+
+	/* pinfo is freed by que_eval_sql() */
+
+	ut_a(ret == DB_SUCCESS);
+
+commit_and_return:
+
+	trx_commit_for_mysql(trx);
+
+	mutex_enter(&kernel_mutex);
+	trx_free(trx);
+	mutex_exit(&kernel_mutex);
+}
+/* @} */
+
+/* tests @{ */
+#define UNIV_COMPILE_TEST_FUNCS
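+/* note: this #define unconditionally enables the test functions below;
+remove it (keeping only the #ifdef) to exclude them from a production
+build */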
+#ifdef UNIV_COMPILE_TEST_FUNCS
+
+/* test_dict_stats_table_check() @{ */
+void
+test_dict_stats_table_check()
+{
+	/*
+	CREATE TABLE tcheck (
+		c01 VARCHAR(123),
+		c02 INT,
+		c03 INT NOT NULL,
+		c04 INT UNSIGNED,
+		c05 BIGINT,
+		c06 BIGINT UNSIGNED NOT NULL,
+		c07 TIMESTAMP
+	) ENGINE=INNODB;
+	*/
+	/* definition for the table 'test/tcheck' */
+	column_data_t	columns[] = {
+		{"c01", DATA_VARCHAR, 0, 123},
+		{"c02", DATA_INT, 0, 4},
+		{"c03", DATA_INT, DATA_NOT_NULL, 4},
+		{"c04", DATA_INT, DATA_UNSIGNED, 4},
+		{"c05", DATA_INT, 0, 8},
+		{"c06", DATA_INT, DATA_NOT_NULL | DATA_UNSIGNED, 8},
+		{"c07", DATA_INT, 0, 4},
+		{"c_extra", DATA_INT, 0, 4}
+	};
+	table_schema_t	schema = {
+		"test/tcheck",
+		0 /* will be set individually for each test below */,
+		columns
+	};
+
+	/* prevent any data dictionary modifications while we are checking
+	the tables' structure */
+
+	mutex_enter(&(dict_sys->mutex));
+
+	/* check that a valid table is reported as valid */
+	schema.n_cols = 7;
+	if (dict_stats_table_check(&schema)) {
+		printf("OK: test.tcheck ok\n");
+	} else {
+		printf("ERROR: test.tcheck not present or corrupted\n");
+		goto test_dict_stats_table_check_end;
+	}
+
+	/* check columns with wrong length */
+	schema.columns[1].len = 8;
+	if (!dict_stats_table_check(&schema)) {
+		printf("OK: test.tcheck.c02 has different length and is "
+		       "reported as corrupted\n");
+	} else {
+		printf("ERROR: test.tcheck.c02 has different length but is "
+		       "reported as ok\n");
+		goto test_dict_stats_table_check_end;
+	}
+	schema.columns[1].len = 4;
+
+	/* request that c02 be NOT NULL while it actually does not have
+	this flag set */
+	schema.columns[1].prtype_mask |= DATA_NOT_NULL;
+	if (!dict_stats_table_check(&schema)) {
+		printf("OK: test.tcheck.c02 does not have NOT NULL while "
+		       "it should and is reported as corrupted\n");
+	} else {
+		printf("ERROR: test.tcheck.c02 does not have NOT NULL while "
+		       "it should and is not reported as corrupted\n");
+		goto test_dict_stats_table_check_end;
+	}
+	schema.columns[1].prtype_mask &= ~DATA_NOT_NULL;
+
+	/* check a table that contains some extra columns */
+	schema.n_cols = 6;
+	if (dict_stats_table_check(&schema)) {
+		printf("ERROR: test.tcheck has more columns but is "
+		       "reported as ok\n");
+		goto test_dict_stats_table_check_end;
+	} else {
+		printf("OK: test.tcheck has more columns and is "
+		       "reported as corrupted\n");
+	}
+
+	/* check a table that has some columns missing */
+	schema.n_cols = 8;
+	if (!dict_stats_table_check(&schema)) {
+		printf("OK: test.tcheck has missing columns and is "
+		       "reported as corrupted\n");
+	} else {
+		printf("ERROR: test.tcheck has missing columns but is "
+		       "reported as ok\n");
+		goto test_dict_stats_table_check_end;
+	}
+
+	/* check non-existent table */
+	schema.table_name = "test/tcheck_nonexistent";
+	if (!dict_stats_table_check(&schema)) {
+		printf("OK: test.tcheck_nonexistent is not present\n");
+	} else {
+		printf("ERROR: test.tcheck_nonexistent is present!?\n");
+		goto test_dict_stats_table_check_end;
+	}
+
+test_dict_stats_table_check_end:
+
+	mutex_exit(&(dict_sys->mutex));
+}
+/* @} */
+
+/* save/fetch aux macros @{ */
+#define TEST_DATABASE_NAME		"foobardb"
+#define TEST_TABLE_NAME			"test_dict_stats"
+
+#define TEST_N_ROWS			111
+#define TEST_CLUSTERED_INDEX_SIZE	222
+#define TEST_SUM_OF_OTHER_INDEX_SIZES	333
+
+#define TEST_IDX1_NAME			"tidx1"
+#define TEST_IDX1_COL1_NAME		"tidx1_col1"
+#define TEST_IDX1_INDEX_SIZE		123
+#define TEST_IDX1_N_LEAF_PAGES		234
+#define TEST_IDX1_N_DIFF1		50
+#define TEST_IDX1_N_DIFF1_SAMPLE_SIZE	500
+
+#define TEST_IDX2_NAME			"tidx2"
+#define TEST_IDX2_COL1_NAME		"tidx2_col1"
+#define TEST_IDX2_COL2_NAME		"tidx2_col2"
+#define TEST_IDX2_COL3_NAME		"tidx2_col3"
+#define TEST_IDX2_COL4_NAME		"tidx2_col4"
+#define TEST_IDX2_INDEX_SIZE		321
+#define TEST_IDX2_N_LEAF_PAGES		432
+#define TEST_IDX2_N_DIFF1		60
+#define TEST_IDX2_N_DIFF1_SAMPLE_SIZE	600
+#define TEST_IDX2_N_DIFF2		61
+#define TEST_IDX2_N_DIFF2_SAMPLE_SIZE	610
+#define TEST_IDX2_N_DIFF3		62
+#define TEST_IDX2_N_DIFF3_SAMPLE_SIZE	620
+#define TEST_IDX2_N_DIFF4		63
+#define TEST_IDX2_N_DIFF4_SAMPLE_SIZE	630
+/* @} */
+
+/* test_dict_stats_save() @{ */
+void
+test_dict_stats_save()
+{
+	dict_table_t	table;
+	dict_index_t	index1;
+	dict_field_t	index1_fields[1];
+	ib_uint64_t	index1_stat_n_diff_key_vals[2];
+	ib_uint64_t	index1_stat_n_sample_sizes[2];
+	dict_index_t	index2;
+	dict_field_t	index2_fields[4];
+	ib_uint64_t	index2_stat_n_diff_key_vals[5];
+	ib_uint64_t	index2_stat_n_sample_sizes[5];
+	enum db_err	ret;
+
+	/* craft a dummy dict_table_t */
+	table.name = TEST_DATABASE_NAME "/" TEST_TABLE_NAME;
+	table.stat_n_rows = TEST_N_ROWS;
+	table.stat_clustered_index_size = TEST_CLUSTERED_INDEX_SIZE;
+	table.stat_sum_of_other_index_sizes = TEST_SUM_OF_OTHER_INDEX_SIZES;
+	UT_LIST_INIT(table.indexes);
+	UT_LIST_ADD_LAST(indexes, table.indexes, &index1);
+	UT_LIST_ADD_LAST(indexes, table.indexes, &index2);
+#ifdef UNIV_DEBUG
+	table.magic_n = DICT_TABLE_MAGIC_N;
+#endif /* UNIV_DEBUG */
+
+	index1.name = TEST_IDX1_NAME;
+	index1.table = &table;
+#ifdef UNIV_DEBUG
+	index1.magic_n = DICT_INDEX_MAGIC_N;
+#endif /* UNIV_DEBUG */
+	index1.cached = 1;
+	index1.n_uniq = 1;
+	index1.fields = index1_fields;
+	index1.stat_n_diff_key_vals = index1_stat_n_diff_key_vals;
+	index1.stat_n_sample_sizes = index1_stat_n_sample_sizes;
+	index1.stat_index_size = TEST_IDX1_INDEX_SIZE;
+	index1.stat_n_leaf_pages = TEST_IDX1_N_LEAF_PAGES;
+	index1_fields[0].name = TEST_IDX1_COL1_NAME;
+	index1_stat_n_diff_key_vals[0] = 1; /* dummy */
+	index1_stat_n_diff_key_vals[1] = TEST_IDX1_N_DIFF1;
+	index1_stat_n_sample_sizes[0] = 0; /* dummy */
+	index1_stat_n_sample_sizes[1] = TEST_IDX1_N_DIFF1_SAMPLE_SIZE;
+
+	index2.name = TEST_IDX2_NAME;
+	index2.table = &table;
+#ifdef UNIV_DEBUG
+	index2.magic_n = DICT_INDEX_MAGIC_N;
+#endif /* UNIV_DEBUG */
+	index2.cached = 1;
+	index2.n_uniq = 4;
+	index2.fields = index2_fields;
+	index2.stat_n_diff_key_vals = index2_stat_n_diff_key_vals;
+	index2.stat_n_sample_sizes = index2_stat_n_sample_sizes;
+	index2.stat_index_size = TEST_IDX2_INDEX_SIZE;
+	index2.stat_n_leaf_pages = TEST_IDX2_N_LEAF_PAGES;
+	index2_fields[0].name = TEST_IDX2_COL1_NAME;
+	index2_fields[1].name = TEST_IDX2_COL2_NAME;
+	index2_fields[2].name = TEST_IDX2_COL3_NAME;
+	index2_fields[3].name = TEST_IDX2_COL4_NAME;
+	index2_stat_n_diff_key_vals[0] = 1; /* dummy */
+	index2_stat_n_diff_key_vals[1] = TEST_IDX2_N_DIFF1;
+	index2_stat_n_diff_key_vals[2] = TEST_IDX2_N_DIFF2;
+	index2_stat_n_diff_key_vals[3] = TEST_IDX2_N_DIFF3;
+	index2_stat_n_diff_key_vals[4] = TEST_IDX2_N_DIFF4;
+	index2_stat_n_sample_sizes[0] = 0; /* dummy */
+	index2_stat_n_sample_sizes[1] = TEST_IDX2_N_DIFF1_SAMPLE_SIZE;
+	index2_stat_n_sample_sizes[2] = TEST_IDX2_N_DIFF2_SAMPLE_SIZE;
+	index2_stat_n_sample_sizes[3] = TEST_IDX2_N_DIFF3_SAMPLE_SIZE;
+	index2_stat_n_sample_sizes[4] = TEST_IDX2_N_DIFF4_SAMPLE_SIZE;
+
+	ret = dict_stats_save(&table);
+	
+	ut_a(ret == DB_SUCCESS);
+
+	printf("\nOK: stats saved successfully, now go ahead and read "
+	       "what's inside innodb.table_stats and innodb.index_stats:\n\n");
+
+	printf("SELECT COUNT(*) = 1 AS table_stats_saved_successfully\n"
+	       "FROM innodb.table_stats\n"
+	       "WHERE\n"
+	       "database_name = '%s' AND\n"
+	       "table_name = '%s' AND\n"
+	       "n_rows = %d AND\n"
+	       "clustered_index_size = %d AND\n"
+	       "sum_of_other_index_sizes = %d;\n"
+	       "\n",
+	       TEST_DATABASE_NAME,
+	       TEST_TABLE_NAME,
+	       TEST_N_ROWS,
+	       TEST_CLUSTERED_INDEX_SIZE,
+	       TEST_SUM_OF_OTHER_INDEX_SIZES);
+
+	printf("SELECT COUNT(*) = 3 AS tidx1_stats_saved_successfully\n"
+	       "FROM innodb.index_stats\n"
+	       "WHERE\n"
+	       "database_name = '%s' AND\n"
+	       "table_name = '%s' AND\n"
+	       "index_name = '%s' AND\n"
+	       "(\n"
+	       " (stat_name = 'size' AND stat_value = %d AND sample_size IS NULL) OR\n"
+	       " (stat_name = 'n_leaf_pages' AND stat_value = %d AND sample_size IS NULL) OR\n"
+	       " (stat_name = 'n_diff_pfx01' AND stat_value = %d AND sample_size = '%d' AND stat_description = '%s')\n"
+	       ");\n"
+	       "\n",
+	       TEST_DATABASE_NAME,
+	       TEST_TABLE_NAME,
+	       TEST_IDX1_NAME,
+	       TEST_IDX1_INDEX_SIZE,
+	       TEST_IDX1_N_LEAF_PAGES,
+	       TEST_IDX1_N_DIFF1,
+	       TEST_IDX1_N_DIFF1_SAMPLE_SIZE,
+	       TEST_IDX1_COL1_NAME);
+
+	printf("SELECT COUNT(*) = 6 AS tidx2_stats_saved_successfully\n"
+	       "FROM innodb.index_stats\n"
+	       "WHERE\n"
+	       "database_name = '%s' AND\n"
+	       "table_name = '%s' AND\n"
+	       "index_name = '%s' AND\n"
+	       "(\n"
+	       " (stat_name = 'size' AND stat_value = %d AND sample_size IS NULL) OR\n"
+	       " (stat_name = 'n_leaf_pages' AND stat_value = %d AND sample_size IS NULL) OR\n"
+	       " (stat_name = 'n_diff_pfx01' AND stat_value = %d AND sample_size = '%d' AND stat_description = '%s') OR\n"
+	       " (stat_name = 'n_diff_pfx02' AND stat_value = %d AND sample_size = '%d' AND stat_description = '%s,%s') OR\n"
+	       " (stat_name = 'n_diff_pfx03' AND stat_value = %d AND sample_size = '%d' AND stat_description = '%s,%s,%s') OR\n"
+	       " (stat_name = 'n_diff_pfx04' AND stat_value = %d AND sample_size = '%d' AND stat_description = '%s,%s,%s,%s')\n"
+	       ");\n"
+	       "\n",
+	       TEST_DATABASE_NAME,
+	       TEST_TABLE_NAME,
+	       TEST_IDX2_NAME,
+	       TEST_IDX2_INDEX_SIZE,
+	       TEST_IDX2_N_LEAF_PAGES,
+	       TEST_IDX2_N_DIFF1,
+	       TEST_IDX2_N_DIFF1_SAMPLE_SIZE,
+	       TEST_IDX2_COL1_NAME,
+	       TEST_IDX2_N_DIFF2,
+	       TEST_IDX2_N_DIFF2_SAMPLE_SIZE,
+	       TEST_IDX2_COL1_NAME, TEST_IDX2_COL2_NAME,
+	       TEST_IDX2_N_DIFF3,
+	       TEST_IDX2_N_DIFF3_SAMPLE_SIZE,
+	       TEST_IDX2_COL1_NAME, TEST_IDX2_COL2_NAME, TEST_IDX2_COL3_NAME,
+	       TEST_IDX2_N_DIFF4,
+	       TEST_IDX2_N_DIFF4_SAMPLE_SIZE,
+	       TEST_IDX2_COL1_NAME, TEST_IDX2_COL2_NAME, TEST_IDX2_COL3_NAME, TEST_IDX2_COL4_NAME);
+}
+/* @} */
+
+/* test_dict_stats_fetch_from_ps() @{ */
+void
+test_dict_stats_fetch_from_ps()
+{
+	dict_table_t	table;
+	dict_index_t	index1;
+	ib_uint64_t	index1_stat_n_diff_key_vals[2];
+	ib_uint64_t	index1_stat_n_sample_sizes[2];
+	dict_index_t	index2;
+	ib_uint64_t	index2_stat_n_diff_key_vals[5];
+	ib_uint64_t	index2_stat_n_sample_sizes[5];
+	enum db_err	ret;
+
+	/* craft a dummy dict_table_t */
+	table.name = TEST_DATABASE_NAME "/" TEST_TABLE_NAME;
+	UT_LIST_INIT(table.indexes);
+	UT_LIST_ADD_LAST(indexes, table.indexes, &index1);
+	UT_LIST_ADD_LAST(indexes, table.indexes, &index2);
+#ifdef UNIV_DEBUG
+	table.magic_n = DICT_TABLE_MAGIC_N;
+#endif /* UNIV_DEBUG */
+
+	index1.name = TEST_IDX1_NAME;
+#ifdef UNIV_DEBUG
+	index1.magic_n = DICT_INDEX_MAGIC_N;
+#endif /* UNIV_DEBUG */
+	index1.cached = 1;
+	index1.n_uniq = 1;
+	index1.stat_n_diff_key_vals = index1_stat_n_diff_key_vals;
+	index1.stat_n_sample_sizes = index1_stat_n_sample_sizes;
+
+	index2.name = TEST_IDX2_NAME;
+#ifdef UNIV_DEBUG
+	index2.magic_n = DICT_INDEX_MAGIC_N;
+#endif /* UNIV_DEBUG */
+	index2.cached = 1;
+	index2.n_uniq = 4;
+	index2.stat_n_diff_key_vals = index2_stat_n_diff_key_vals;
+	index2.stat_n_sample_sizes = index2_stat_n_sample_sizes;
+
+	ret = dict_stats_fetch_from_ps(&table);
+	
+	ut_a(ret == DB_SUCCESS);
+
+	ut_a(table.stat_n_rows == TEST_N_ROWS);
+	ut_a(table.stat_clustered_index_size == TEST_CLUSTERED_INDEX_SIZE);
+	ut_a(table.stat_sum_of_other_index_sizes == TEST_SUM_OF_OTHER_INDEX_SIZES);
+
+	ut_a(index1.stat_index_size == TEST_IDX1_INDEX_SIZE);
+	ut_a(index1.stat_n_leaf_pages == TEST_IDX1_N_LEAF_PAGES);
+	ut_a(index1_stat_n_diff_key_vals[1] == TEST_IDX1_N_DIFF1);
+	ut_a(index1_stat_n_sample_sizes[1] == TEST_IDX1_N_DIFF1_SAMPLE_SIZE);
+
+	ut_a(index2.stat_index_size == TEST_IDX2_INDEX_SIZE);
+	ut_a(index2.stat_n_leaf_pages == TEST_IDX2_N_LEAF_PAGES);
+	ut_a(index2_stat_n_diff_key_vals[1] == TEST_IDX2_N_DIFF1);
+	ut_a(index2_stat_n_sample_sizes[1] == TEST_IDX2_N_DIFF1_SAMPLE_SIZE);
+	ut_a(index2_stat_n_diff_key_vals[2] == TEST_IDX2_N_DIFF2);
+	ut_a(index2_stat_n_sample_sizes[2] == TEST_IDX2_N_DIFF2_SAMPLE_SIZE);
+	ut_a(index2_stat_n_diff_key_vals[3] == TEST_IDX2_N_DIFF3);
+	ut_a(index2_stat_n_sample_sizes[3] == TEST_IDX2_N_DIFF3_SAMPLE_SIZE);
+	ut_a(index2_stat_n_diff_key_vals[4] == TEST_IDX2_N_DIFF4);
+	ut_a(index2_stat_n_sample_sizes[4] == TEST_IDX2_N_DIFF4_SAMPLE_SIZE);
+
+	printf("OK: fetch successful\n");
+}
+/* @} */
+
+/* test_dict_stats_all() @{ */
+void
+test_dict_stats_all()
+{
+	test_dict_stats_table_check();
+
+	test_dict_stats_save();
+
+	test_dict_stats_fetch_from_ps();
+}
+/* @} */
+
+#endif /* UNIV_COMPILE_TEST_FUNCS */
+/* @} */
+
+#endif /* UNIV_HOTBACKUP */
+
+/* vim: set foldmethod=marker foldmarker=@{,@}: */

=== modified file 'storage/innobase/handler/ha_innodb.cc'
--- a/storage/innobase/handler/ha_innodb.cc	2010-04-19 17:53:16 +0000
+++ b/storage/innobase/handler/ha_innodb.cc	2010-04-20 15:23:28 +0000
@@ -84,9 +84,11 @@ extern "C" {
 #include "row0merge.h"
 #include "thr0loc.h"
 #include "dict0boot.h"
+#include "dict0stats.h"
 #include "ha_prototypes.h"
 #include "ut0mem.h"
 #include "ibuf0ibuf.h"
+#include "dict0dict.h"
 }
 
 #include "ha_innodb.h"
@@ -442,6 +444,12 @@ static MYSQL_THDVAR_BOOL(strict_mode, PL
   "Use strict mode when evaluating create options.",
   NULL, NULL, FALSE);
 
+static MYSQL_THDVAR_BOOL(analyze_is_persistent, PLUGIN_VAR_OPCMDARG,
+  "ANALYZE TABLE in InnoDB uses a more precise (and slow) sampling "
+  "algorithm and saves the results persistently.",
+  /* check_func */ NULL, /* update_func */ NULL,
+  /* default */ FALSE);
+
 static MYSQL_THDVAR_ULONG(lock_wait_timeout, PLUGIN_VAR_RQCMDARG,
   "Timeout in seconds an InnoDB transaction may wait for a lock before being rolled back. Values above 100000000 disable the timeout.",
   NULL, NULL, 50, 1, 1024 * 1024 * 1024, 0);
@@ -7559,12 +7567,19 @@ ha_innobase::read_time(
 
 /*********************************************************************//**
 Returns statistics information of the table to the MySQL interpreter,
-in various fields of the handle object. */
+in various fields of the handle object.
+@return HA_ERR_* error code or 0 */
 UNIV_INTERN
 int
-ha_innobase::info(
-/*==============*/
-	uint flag)	/*!< in: what information MySQL requests */
+ha_innobase::info_low(
+/*==================*/
+					/*!< out: HA_ERR_* error code */
+	uint			flag,	/*!< in: what information MySQL
+					requests */
+	enum dict_stats_upd_how	stats_upd_how)
+					/*!< in: whether to (re)calc
+					the stats or to fetch them from
+					the persistent storage */
 {
 	dict_table_t*	ib_table;
 	dict_index_t*	index;
@@ -7611,10 +7626,16 @@ ha_innobase::info(
 		if (innobase_stats_on_metadata) {
 			/* In sql_show we call with this flag: update
 			the statistics so that they are up-to-date */
+			enum db_err	ret;
 
 			prebuilt->trx->op_info = "updating table statistics";
 
-			dict_update_statistics(ib_table);
+			ret = dict_stats_update(ib_table, stats_upd_how);
+
+			if (ret != DB_SUCCESS) {
+				prebuilt->trx->op_info = "";
+				DBUG_RETURN(HA_ERR_GENERIC);
+			}
 
 			prebuilt->trx->op_info = "returning various info to MySQL";
 		}
@@ -7838,10 +7859,23 @@ ha_innobase::info(
 	DBUG_RETURN(0);
 }
 
+/*********************************************************************//**
+Returns statistics information of the table to the MySQL interpreter,
+in various fields of the handle object.
+@return HA_ERR_* error code or 0 */
+UNIV_INTERN
+int
+ha_innobase::info(
+/*==============*/
+	uint flag)	/*!< in: what information MySQL requests */
+{
+	return(info_low(flag, DICT_STATS_UPD_FETCH));
+}
+
 /**********************************************************************//**
 Updates index cardinalities of the table, based on 8 random dives into
 each index tree. This does NOT calculate exact statistics on the table.
-@return	returns always 0 (success) */
+@return	HA_ADMIN_* error code or HA_ADMIN_OK */
 UNIV_INTERN
 int
 ha_innobase::analyze(
@@ -7849,16 +7883,31 @@ ha_innobase::analyze(
 	THD*		thd,		/*!< in: connection thread handle */
 	HA_CHECK_OPT*	check_opt)	/*!< in: currently ignored */
 {
+	enum dict_stats_upd_how	upd_how;
+	int			ret;
+
+	if (THDVAR(thd, analyze_is_persistent)) {
+		upd_how = DICT_STATS_UPD_RECALC_PERSISTENT_VERBOSE;
+	} else {
+		upd_how = DICT_STATS_UPD_RECALC_TRANSIENT;
+	}
+
 	/* Serialize ANALYZE TABLE inside InnoDB, see
 	Bug#38996 Race condition in ANALYZE TABLE */
 	mysql_mutex_lock(&analyze_mutex);
 
-	/* Simply call ::info() with all the flags */
-	info(HA_STATUS_TIME | HA_STATUS_CONST | HA_STATUS_VARIABLE);
+	/* Simply call ::info_low() with all the flags
+	and request recalculation of the statistics */
+	ret = info_low(HA_STATUS_TIME | HA_STATUS_CONST | HA_STATUS_VARIABLE,
+		       upd_how);
 
 	mysql_mutex_unlock(&analyze_mutex);
 
-	return(0);
+	if (ret != 0) {
+		return(HA_ADMIN_FAILED);
+	}
+
+	return(HA_ADMIN_OK);
 }
 
 /**********************************************************************//**
@@ -10793,11 +10842,25 @@ static MYSQL_SYSVAR_BOOL(stats_on_metada
   "Enable statistics gathering for metadata commands such as SHOW TABLE STATUS (on by default)",
   NULL, NULL, TRUE);
 
-static MYSQL_SYSVAR_ULONGLONG(stats_sample_pages, srv_stats_sample_pages,
+static MYSQL_SYSVAR_ULONGLONG(stats_sample_pages, srv_stats_transient_sample_pages,
   PLUGIN_VAR_RQCMDARG,
-  "The number of index pages to sample when calculating statistics (default 8)",
+  "Deprecated, use innodb_stats_transient_sample_pages instead",
   NULL, NULL, 8, 1, ~0ULL, 0);
 
+static MYSQL_SYSVAR_ULONGLONG(stats_transient_sample_pages,
+  srv_stats_transient_sample_pages,
+  PLUGIN_VAR_RQCMDARG,
+  "The number of leaf index pages to sample when calculating transient "
+  "statistics (if persistent statistics are not found, default 8)",
+  NULL, NULL, 8, 1, ~0ULL, 0);
+
+static MYSQL_SYSVAR_ULONGLONG(stats_persistent_sample_pages,
+  srv_stats_persistent_sample_pages,
+  PLUGIN_VAR_RQCMDARG,
+  "The number of leaf index pages to sample when calculating persistent "
+  "statistics (by ANALYZE, default 20)",
+  NULL, NULL, 20, 1, ~0ULL, 0);
+
 static MYSQL_SYSVAR_BOOL(adaptive_hash_index, btr_search_enabled,
   PLUGIN_VAR_OPCMDARG,
   "Enable InnoDB adaptive hash index (enabled by default).  "
@@ -11002,11 +11065,14 @@ static struct st_mysql_sys_var* innobase
   MYSQL_SYSVAR(rollback_on_timeout),
   MYSQL_SYSVAR(stats_on_metadata),
   MYSQL_SYSVAR(stats_sample_pages),
+  MYSQL_SYSVAR(stats_transient_sample_pages),
+  MYSQL_SYSVAR(stats_persistent_sample_pages),
   MYSQL_SYSVAR(adaptive_hash_index),
   MYSQL_SYSVAR(replication_delay),
   MYSQL_SYSVAR(status_file),
   MYSQL_SYSVAR(strict_mode),
   MYSQL_SYSVAR(support_xa),
+  MYSQL_SYSVAR(analyze_is_persistent),
   MYSQL_SYSVAR(sync_spin_loops),
   MYSQL_SYSVAR(spin_wait_delay),
   MYSQL_SYSVAR(table_locks),

=== modified file 'storage/innobase/handler/ha_innodb.h'
--- a/storage/innobase/handler/ha_innodb.h	2010-02-20 16:45:41 +0000
+++ b/storage/innobase/handler/ha_innodb.h	2010-04-20 15:23:28 +0000
@@ -27,6 +27,8 @@ Place, Suite 330, Boston, MA 02111-1307 
 #pragma interface			/* gcc class implementation */
 #endif
 
+#include "dict0stats.h"
+
 /* Structure defines translation table between mysql index and innodb
 index structures */
 typedef struct innodb_idx_translate_struct {
@@ -109,6 +111,7 @@ class ha_innobase: public handler
 	ulint innobase_update_autoinc(ulonglong	auto_inc);
 	void innobase_initialize_autoinc();
 	dict_index_t* innobase_get_index(uint keynr);
+	int info_low(uint flag, enum dict_stats_upd_how stats_upd_how);
 
 	/* Init values for the class: */
  public:

=== modified file 'storage/innobase/handler/handler0alter.cc'
--- a/storage/innobase/handler/handler0alter.cc	2010-04-13 15:26:27 +0000
+++ b/storage/innobase/handler/handler0alter.cc	2010-04-20 15:23:28 +0000
@@ -27,6 +27,7 @@ Smart ALTER TABLE
 #include <mysql/innodb_priv.h>
 
 extern "C" {
+#include "dict0stats.h"
 #include "log0log.h"
 #include "row0merge.h"
 #include "srv0srv.h"
@@ -1179,6 +1180,29 @@ ha_innobase::final_drop_index(
 		row_merge_lock_table(prebuilt->trx, prebuilt->table, LOCK_X),
 		prebuilt->table->flags, user_thd);
 
+	/* Delete the corresponding rows from the stats table. This
+	operation can take a long time if the user has a long-running
+	transaction that has acquired locks on the stats table. Thus we
+	do this before locking the data dictionary, which should only be
+	locked for short periods of time.
+	Marko advises not to edit both user tables and SYS_* tables in
+	one trx, thus we use prebuilt->trx instead of trx. Because of
+	this, the drop from SYS_* and the drop from the stats table
+	cannot happen in one transaction, and if a crash occurs below,
+	between trx_commit_for_mysql(trx), which drops the indexes from
+	SYS_*, and trx_commit_for_mysql(prebuilt->trx), orphaned rows
+	will be left in the stats table. XXX it would be best if we
+	wiped away those orphaned rows at some point. */
+	for (index = dict_table_get_first_index(prebuilt->table);
+	     index != NULL;
+	     index = dict_table_get_next_index(index)) {
+
+		if (index->to_be_dropped) {
+
+			dict_stats_drop_index(index, prebuilt->trx);
+		}
+	}
+
 	row_mysql_lock_data_dictionary(trx);
 	ut_d(dict_table_check_for_dup_indexes(prebuilt->table, FALSE));
 

=== modified file 'storage/innobase/include/btr0cur.h'
--- a/storage/innobase/include/btr0cur.h	2010-03-17 07:11:11 +0000
+++ b/storage/innobase/include/btr0cur.h	2010-04-20 15:23:28 +0000
@@ -446,7 +446,9 @@ btr_estimate_n_rows_in_range(
 /*******************************************************************//**
 Estimates the number of different key values in a given index, for
 each n-column prefix of the index where n <= dict_index_get_n_unique(index).
-The estimates are stored in the array index->stat_n_diff_key_vals. */
+The estimates are stored in the array index->stat_n_diff_key_vals[] and
+the number of pages that were sampled is saved in
+index->stat_n_sample_sizes[]. */
 UNIV_INTERN
 void
 btr_estimate_number_of_different_key_vals(

=== modified file 'storage/innobase/include/btr0pcur.h'
--- a/storage/innobase/include/btr0pcur.h	2010-03-04 10:15:07 +0000
+++ b/storage/innobase/include/btr0pcur.h	2010-04-20 15:23:28 +0000
@@ -82,9 +82,10 @@ Initializes and opens a persistent curso
 closed with btr_pcur_close. */
 UNIV_INLINE
 void
-btr_pcur_open_func(
-/*===============*/
+btr_pcur_open_low(
+/*==============*/
 	dict_index_t*	index,	/*!< in: index */
+	ulint		level,	/*!< in: level in the btree */
 	const dtuple_t*	tuple,	/*!< in: tuple on which search done */
 	ulint		mode,	/*!< in: PAGE_CUR_L, ...;
 				NOTE that if the search is made using a unique
@@ -98,7 +99,7 @@ btr_pcur_open_func(
 	ulint		line,	/*!< in: line where called */
 	mtr_t*		mtr);	/*!< in: mtr */
 #define btr_pcur_open(i,t,md,l,c,m)				\
-	btr_pcur_open_func(i,t,md,l,c,__FILE__,__LINE__,m)
+	btr_pcur_open_low(i,0,t,md,l,c,__FILE__,__LINE__,m)
 /**************************************************************//**
 Opens a persistent cursor to an index tree without initializing the
 cursor. */

=== modified file 'storage/innobase/include/btr0pcur.ic'
--- a/storage/innobase/include/btr0pcur.ic	2010-03-04 10:15:07 +0000
+++ b/storage/innobase/include/btr0pcur.ic	2010-04-20 15:23:28 +0000
@@ -466,9 +466,10 @@ Initializes and opens a persistent curso
 closed with btr_pcur_close. */
 UNIV_INLINE
 void
-btr_pcur_open_func(
-/*===============*/
+btr_pcur_open_low(
+/*==============*/
 	dict_index_t*	index,	/*!< in: index */
+	ulint		level,	/*!< in: level in the btree */
 	const dtuple_t*	tuple,	/*!< in: tuple on which search done */
 	ulint		mode,	/*!< in: PAGE_CUR_L, ...;
 				NOTE that if the search is made using a unique
@@ -495,7 +496,7 @@ btr_pcur_open_func(
 
 	btr_cursor = btr_pcur_get_btr_cur(cursor);
 
-	btr_cur_search_to_nth_level(index, 0, tuple, mode, latch_mode,
+	btr_cur_search_to_nth_level(index, level, tuple, mode, latch_mode,
 				    btr_cursor, 0, file, line, mtr);
 	cursor->pos_state = BTR_PCUR_IS_POSITIONED;
 

=== modified file 'storage/innobase/include/db0err.h'
--- a/storage/innobase/include/db0err.h	2010-03-27 18:37:58 +0000
+++ b/storage/innobase/include/db0err.h	2010-04-20 15:23:28 +0000
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software

=== modified file 'storage/innobase/include/dict0dict.h'
--- a/storage/innobase/include/dict0dict.h	2010-04-06 12:18:47 +0000
+++ b/storage/innobase/include/dict0dict.h	2010-04-20 15:23:28 +0000
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -1031,24 +1031,6 @@ ulint
 dict_index_calc_min_rec_len(
 /*========================*/
 	const dict_index_t*	index);	/*!< in: index */
-/*********************************************************************//**
-Calculates new estimates for table and index statistics. The statistics
-are used in query optimization. */
-UNIV_INTERN
-void
-dict_update_statistics_low(
-/*=======================*/
-	dict_table_t*	table,		/*!< in/out: table */
-	ibool		has_dict_mutex);/*!< in: TRUE if the caller has the
-					dictionary mutex */
-/*********************************************************************//**
-Calculates new estimates for table and index statistics. The statistics
-are used in query optimization. */
-UNIV_INTERN
-void
-dict_update_statistics(
-/*===================*/
-	dict_table_t*	table);	/*!< in/out: table */
 /********************************************************************//**
 Reserves the dictionary system mutex for MySQL. */
 UNIV_INTERN

=== modified file 'storage/innobase/include/dict0mem.h'
--- a/storage/innobase/include/dict0mem.h	2010-03-18 12:18:25 +0000
+++ b/storage/innobase/include/dict0mem.h	2010-04-20 15:23:28 +0000
@@ -299,13 +299,18 @@ struct dict_index_struct{
 	/*----------------------*/
 	/** Statistics for query optimization */
 	/* @{ */
-	ib_int64_t*	stat_n_diff_key_vals;
+	ib_uint64_t*	stat_n_diff_key_vals;
 				/*!< approximate number of different
 				key values for this index, for each
 				n-column prefix where n <=
 				dict_get_n_unique(index); we
 				periodically calculate new
 				estimates */
+	ib_uint64_t*	stat_n_sample_sizes;
+				/*!< number of pages that were sampled
+				to calculate each of stat_n_diff_key_vals[],
+				e.g. stat_n_sample_sizes[3] pages were sampled
+				to get the number stat_n_diff_key_vals[3]. */
 	ulint		stat_index_size;
 				/*!< approximate index size in
 				database pages */

=== added file 'storage/innobase/include/dict0stats.h'
--- a/storage/innobase/include/dict0stats.h	1970-01-01 00:00:00 +0000
+++ b/storage/innobase/include/dict0stats.h	2010-04-20 15:23:28 +0000
@@ -0,0 +1,91 @@
+/*****************************************************************************
+
+Copyright (c) 2009, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0stats.h
+Code used for calculating and manipulating table statistics.
+
+Created Jan 06, 2010 Vasil Dimov
+*******************************************************/
+
+#ifndef dict0stats_h
+#define dict0stats_h
+
+#include "univ.i"
+
+#include "db0err.h"
+#include "dict0types.h"
+#include "trx0types.h"
+
+enum dict_stats_upd_how {
+	DICT_STATS_UPD_RECALC_PERSISTENT_VERBOSE,/* (re)calculate the
+				statistics using a precise but slow
+				algorithm and save them persistently;
+				if the persistent storage is not
+				present then emit a warning and fall
+				back to transient stats */
+	DICT_STATS_UPD_RECALC_PERSISTENT_SILENT,/* same as
+				DICT_STATS_UPD_RECALC_PERSISTENT_VERBOSE
+				but do not emit a warning */
+	DICT_STATS_UPD_RECALC_TRANSIENT,/* (re)calculate the statistics
+				using a quick but imprecise algorithm,
+				without saving the results
+				persistently */
+	DICT_STATS_UPD_FETCH	/* fetch the statistics from the
+				persistent storage */
+};
+
+/*********************************************************************//**
+Calculates new estimates for table and index statistics. The statistics
+are used in query optimization.
+@return DB_* error code or DB_SUCCESS */
+UNIV_INTERN
+enum db_err
+dict_stats_update(
+/*==============*/
+	dict_table_t*		table,	/*!< in/out: table */
+	enum dict_stats_upd_how	stats_upd_how);
+					/*!< in: whether to (re)calc
+					the stats or to fetch them from
+					the persistent storage */
+
+/*********************************************************************//**
+Removes the information for a particular index from the persistent
+storage, if the storage exists and there is data stored for this index.
+The transaction is not committed here; it must not be committed in this
+function because it is the user trx that is running DROP INDEX.
+The transaction will be committed at the very end, when the index
+drop completes. */
+UNIV_INTERN
+void
+dict_stats_drop_index(
+/*==================*/
+	dict_index_t*	index,	/*!< in: index */
+	trx_t*		trx);	/*!< in: transaction to use */
+
+/*********************************************************************//**
+Removes the statistics for a table and all of its indexes from the
+persistent storage, if the storage exists and holds data for the table.
+This function creates its own transaction and commits it. */
+UNIV_INTERN
+void
+dict_stats_drop_table(
+/*==================*/
+	const char*	table);	/*!< in: table name */
+
+#endif /* dict0stats_h */
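A minimal usage sketch for the API above, not part of the patch: the wrapper names are hypothetical, while dict_stats_update(), the enum values and DB_SUCCESS come from the header.

/* Hypothetical ANALYZE TABLE path: recalculate with the precise
algorithm and persist the result, warning if the persistent storage
is missing and transient stats are used instead. */
static int
sample_analyze(dict_table_t* table)
{
	if (dict_stats_update(
		table, DICT_STATS_UPD_RECALC_PERSISTENT_VERBOSE)
	    != DB_SUCCESS) {
		return(1);	/* map to a handler-level error code */
	}
	return(0);
}

/* Hypothetical table-open path: prefer the stats saved earlier in
the persistent storage over recalculating them. */
static void
sample_open(dict_table_t* table)
{
	(void) dict_stats_update(table, DICT_STATS_UPD_FETCH);
}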

=== modified file 'storage/innobase/include/lock0lock.h'
--- a/storage/innobase/include/lock0lock.h	2010-03-18 12:18:25 +0000
+++ b/storage/innobase/include/lock0lock.h	2010-04-20 15:23:28 +0000
@@ -467,6 +467,20 @@ lock_table(
 	dict_table_t*	table,	/*!< in: database table in dictionary cache */
 	enum lock_mode	mode,	/*!< in: lock mode */
 	que_thr_t*	thr);	/*!< in: query thread */
+/*********************************************************************//**
+Locks a given table in the given lock mode. This function may block
+execution for some time, but it will not wait indefinitely if the table
+is already locked; in that case it may return DB_LOCK_WAIT_TIMEOUT.
+The table is unlocked when the trx is committed.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+enum db_err
+lock_table_by_name(
+/*===============*/
+	const char*	table_name,	/*!< in: table name */
+	enum lock_mode	mode,		/*!< in: lock mode */
+	trx_t*		trx);		/*!< in/out: transaction into which to
+					lock the table */
 /*************************************************************//**
 Removes a granted record lock of a transaction from the queue and grants
 locks to other transactions waiting in the queue if they now are entitled
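A usage sketch, not from the patch: the caller, the "db/table" name format and the error handling are assumptions; lock_table_by_name(), LOCK_X, DB_SUCCESS and DB_LOCK_WAIT_TIMEOUT are real.

/* Hypothetical caller: take an exclusive lock on the persistent
stats table before writing to it, giving up on timeout instead of
blocking the user's statement indefinitely. */
static enum db_err
sample_lock_stats(trx_t* trx)
{
	enum db_err	err;

	err = lock_table_by_name("innodb/index_stats", LOCK_X, trx);

	if (err == DB_LOCK_WAIT_TIMEOUT) {
		/* the table is held by another trx; the caller may
		retry or skip saving the stats */
	}

	/* on DB_SUCCESS the lock is released when trx commits */
	return(err);
}

The implementation in lock0lock.c below builds a dummy sel_node/que_fork pair so that lock waits can be funneled through row_mysql_handle_errors(), reusing the standard wait-retry and timeout machinery instead of duplicating it.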

=== modified file 'storage/innobase/include/srv0srv.h'
--- a/storage/innobase/include/srv0srv.h	2010-04-19 15:44:15 +0000
+++ b/storage/innobase/include/srv0srv.h	2010-04-20 15:23:28 +0000
@@ -190,7 +190,8 @@ extern ulint	srv_fast_shutdown;	 /* If t
 					 transactions). */
 extern ibool	srv_innodb_status;
 
-extern unsigned long long	srv_stats_sample_pages;
+extern unsigned long long	srv_stats_transient_sample_pages;
+extern unsigned long long	srv_stats_persistent_sample_pages;
 
 extern ibool	srv_use_doublewrite_buf;
 extern ibool	srv_use_checksums;

=== modified file 'storage/innobase/lock/lock0lock.c'
--- a/storage/innobase/lock/lock0lock.c	2010-04-05 19:31:35 +0000
+++ b/storage/innobase/lock/lock0lock.c	2010-04-20 15:23:28 +0000
@@ -38,6 +38,11 @@ Created 5/7/1996 Heikki Tuuri
 #include "trx0purge.h"
 #include "dict0mem.h"
 #include "trx0sys.h"
+#include "pars0pars.h" /* pars_complete_graph_for_exec() */
+#include "que0que.h" /* que_node_get_parent() */
+#include "row0mysql.h" /* row_mysql_handle_errors() */
+#include "row0sel.h" /* sel_node_create(), sel_node_struct */
+#include "row0types.h" /* sel_node_t */
 
 /* Restricts the length of search we will do in the waits-for
 graph of transactions */
@@ -3851,6 +3856,91 @@ lock_table(
 }
 
 /*********************************************************************//**
+Locks a given table in the given lock mode. This function may block
+execution for some time, but it will not wait indefinitely if the table
+is already locked; in that case it may return DB_LOCK_WAIT_TIMEOUT.
+The table is unlocked when the trx is committed.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+enum db_err
+lock_table_by_name(
+/*===============*/
+	const char*	table_name,	/*!< in: table name */
+	enum lock_mode	mode,		/*!< in: lock mode */
+	trx_t*		trx)		/*!< in/out: transaction into which to
+					lock the table */
+{
+	mem_heap_t*	heap;
+	sel_node_t*	node;
+	que_fork_t*	graph;
+	que_thr_t*	thr;
+	dict_table_t*	table;
+	ibool		was_lock_wait;
+	ulint		ret;
+
+	trx->op_info = "locking table";
+
+	heap = mem_heap_create(sizeof(sel_node_t)
+			       + sizeof(que_fork_t)
+			       + sizeof(que_thr_t));
+
+	node = sel_node_create(heap);
+
+	graph = que_node_get_parent(
+			pars_complete_graph_for_exec(node, trx, heap));
+
+	graph->state = QUE_FORK_ACTIVE;
+
+	thr = que_fork_get_first_thr(graph);
+
+	que_thr_move_to_run_state_for_mysql(thr, trx);
+
+	do {
+		mutex_enter(&dict_sys->mutex);
+
+		table = dict_table_get_low(table_name);
+
+		if (table == NULL) {
+
+			mutex_exit(&dict_sys->mutex);
+
+			ret = DB_TABLE_NOT_FOUND;
+
+			break;
+		}
+
+		thr->run_node = thr;
+		thr->prev_node = thr->common.parent;
+
+		/* try lock */
+		ret = lock_table(0, table, mode, thr);
+
+		mutex_exit(&dict_sys->mutex);
+
+		trx->error_state = ret;
+
+		if (ret == DB_SUCCESS) {
+
+			que_thr_stop_for_mysql_no_error(thr, trx);
+
+			was_lock_wait = FALSE;
+		} else {
+
+			que_thr_stop_for_mysql(thr);
+
+			was_lock_wait = row_mysql_handle_errors(&ret, trx,
+								thr, NULL);
+		}
+	} while (was_lock_wait);
+
+	mem_heap_free(heap);
+
+	trx->op_info = "";
+
+	return(ret);
+}
+
+/*********************************************************************//**
 Checks if a waiting table lock request still has to wait in a queue.
 @return	TRUE if still has to wait */
 static

=== modified file 'storage/innobase/row/row0mysql.c'
--- a/storage/innobase/row/row0mysql.c	2010-03-04 10:15:07 +0000
+++ b/storage/innobase/row/row0mysql.c	2010-04-20 15:23:28 +0000
@@ -41,6 +41,7 @@ Created 9/17/2000 Heikki Tuuri
 #include "dict0crea.h"
 #include "dict0load.h"
 #include "dict0boot.h"
+#include "dict0stats.h"
 #include "trx0roll.h"
 #include "trx0purge.h"
 #include "trx0rec.h"
@@ -861,7 +862,7 @@ row_update_statistics_if_needed(
 	if (counter > 2000000000
 	    || ((ib_int64_t)counter > 16 + table->stat_n_rows / 16)) {
 
-		dict_update_statistics(table);
+		dict_stats_update(table, DICT_STATS_UPD_FETCH);
 	}
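For scale: the surrounding heuristic (unchanged by this patch) fires once the modification counter exceeds 16 + stat_n_rows / 16, i.e. roughly every time a sixteenth of the table changes. With stat_n_rows = 1,600,000, for example, the stats are refreshed after about 100,016 modified rows; the 2,000,000,000 test caps the interval for very large tables.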
 }
 
@@ -2939,7 +2940,6 @@ next_rec:
 	dict_table_autoinc_lock(table);
 	dict_table_autoinc_initialize(table, 1);
 	dict_table_autoinc_unlock(table);
-	dict_update_statistics(table);
 
 	trx_commit_for_mysql(trx);
 
@@ -2947,6 +2947,10 @@ funct_exit:
 
 	row_mysql_unlock_data_dictionary(trx);
 
+	/* We are supposed to recalc and save the stats only
+	on ANALYZE, but it also makes sense to do so on TRUNCATE */
+	dict_stats_update(table, DICT_STATS_UPD_RECALC_PERSISTENT_SILENT);
+
 	trx->op_info = "";
 
 	srv_wake_master_thread();
@@ -3030,6 +3034,12 @@ row_drop_table_for_mysql(
 		srv_print_innodb_table_monitor = FALSE;
 	}
 
+	/* Remove stats for this table and all of its indexes from the
+	persistent storage if it exists and if there are stats for this
+	table in there. This function creates its own trx and commits
+	it. */
+	dict_stats_drop_table(name);
+
 	/* Serialize data dictionary operations with dictionary mutex:
 	no deadlocks can occur then in these operations */
 

=== added directory 'storage/innobase/scripts'
=== added file 'storage/innobase/scripts/persistent_storage.sql'
--- a/storage/innobase/scripts/persistent_storage.sql	1970-01-01 00:00:00 +0000
+++ b/storage/innobase/scripts/persistent_storage.sql	2010-04-20 15:23:28 +0000
@@ -0,0 +1,32 @@
+DROP DATABASE IF EXISTS innodb;
+CREATE DATABASE innodb;
+
+-- DROP TABLE IF EXISTS innodb.table_stats;
+CREATE TABLE innodb.table_stats (
+	database_name			VARCHAR(512) NOT NULL,
+	table_name			VARCHAR(512) NOT NULL,
+	stats_timestamp			TIMESTAMP NOT NULL,
+	n_rows				BIGINT UNSIGNED NOT NULL,
+	clustered_index_size		BIGINT UNSIGNED NOT NULL,
+	sum_of_other_index_sizes	BIGINT UNSIGNED NOT NULL,
+	PRIMARY KEY (database_name, table_name)
+) ENGINE=INNODB;
+
+-- DROP TABLE IF EXISTS innodb.index_stats;
+CREATE TABLE innodb.index_stats (
+	database_name			VARCHAR(512) NOT NULL,
+	table_name			VARCHAR(512) NOT NULL,
+	index_name			VARCHAR(512) NOT NULL,
+	stat_timestamp			TIMESTAMP NOT NULL,
+	/* there are at least:
+	stat_name='size'
+	stat_name='n_leaf_pages'
+	stat_name='n_diff_pfx%' */
+	stat_name			VARCHAR(64) NOT NULL,
+	stat_value			BIGINT UNSIGNED NOT NULL,
+	sample_size			BIGINT UNSIGNED,
+	stat_description		VARCHAR(1024) NOT NULL,
+	PRIMARY KEY (database_name, table_name, index_name, stat_name),
+	FOREIGN KEY (database_name, table_name)
+	  REFERENCES table_stats (database_name, table_name)
+) ENGINE=INNODB;
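Two schema details worth noting: sample_size is the only NULL-able column, presumably because stats such as 'size' and 'n_leaf_pages' are exact page counts rather than sampled estimates, and the composite FOREIGN KEY ensures that an index_stats row cannot outlive its parent table_stats row.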

=== modified file 'storage/innobase/srv/srv0srv.c'
--- a/storage/innobase/srv/srv0srv.c	2010-04-19 15:44:15 +0000
+++ b/storage/innobase/srv/srv0srv.c	2010-04-20 15:23:28 +0000
@@ -370,8 +370,13 @@ UNIV_INTERN ulint	srv_fast_shutdown	= 0;
 UNIV_INTERN ibool	srv_innodb_status	= FALSE;
 
 /* When estimating number of different key values in an index, sample
-this many index pages */
-UNIV_INTERN unsigned long long	srv_stats_sample_pages = 8;
+this many index pages. There are two ways to calculate statistics:
+* persistent stats, which are calculated by ANALYZE TABLE and saved
+  in the innodb database;
+* quick transient stats, which are used if persistent stats for the
+  given table/index are not found in the innodb database */
+UNIV_INTERN unsigned long long	srv_stats_transient_sample_pages = 8;
+UNIV_INTERN unsigned long long	srv_stats_persistent_sample_pages = 20;
 
 UNIV_INTERN ibool	srv_use_doublewrite_buf	= TRUE;
 UNIV_INTERN ibool	srv_use_checksums = TRUE;


Attachment: [text/bzr-bundle] bzr/vasil.dimov@oracle.com-20100420152328-m3ct5ybua7qpvon1.bundle