Author: mcbrown
Date: 2007-11-29 12:23:14 +0100 (Thu, 29 Nov 2007)
New Revision: 8973
Log:
Adding InnoDB summary table
Removed:
trunk/refman-6.0/se-innodb.xml
Renamed/Moved:
trunk/refman-6.0/se-innodb-core.xml (from rev 8971, trunk/refman-6.0/se-innodb.xml)
Modified:
trunk/refman-6.0/Makefile.depends
trunk/refman-6.0/se-falcon-core.xml
trunk/refman-6.0/storage-engines.xml
Modified: trunk/refman-6.0/Makefile.depends
===================================================================
--- trunk/refman-6.0/Makefile.depends 2007-11-29 11:20:29 UTC (rev 8972)
+++ trunk/refman-6.0/Makefile.depends 2007-11-29 11:23:14 UTC (rev 8973)
Changed blocks: 5, Lines Added: 42, Lines Deleted: 25; 4504 bytes
@@ -751,6 +751,32 @@
dynxml-local-se-falcon-manprepped.xml: $(dynxml_local_se_falcon_SOURCES) $(dynxml_local_se_falcon_IDMAPS)
dynxml-local-se-falcon-remprepped.xml: $(dynxml_local_se_falcon_SOURCES) $(dynxml_local_se_falcon_IDMAPS)
dynxml-local-se-falcon.xml: $(dynxml_local_se_falcon_INCLUDES)
+dynxml_local_se_innodb_INCLUDES = \
+ ../common/fixedchars.ent \
+ ../common/phrases.ent \
+ ../dynamic-docs/command-optvars/mysqld.xml \
+ ../refman-common/urls.ent \
+ all-entities.ent \
+ se-innodb-core.xml \
+ versions.ent
+dynxml_local_se_innodb_IMAGES =
+dynxml_local_se_innodb_SOURCES = dynxml-local-se-innodb.xml $(dynxml_local_se_innodb_INCLUDES)
+dynxml_local_se_innodb_IDMAPS = \
+ metadata/dba-core.idmap \
+ metadata/errors-problems.idmap \
+ metadata/installing-core.idmap \
+ metadata/se-innodb.idmap \
+ metadata/sql-syntax.idmap \
+ metadata/using-mysql-programs.idmap
+dynxml-local-se-innodb.validpure: $(dynxml_local_se_innodb_SOURCES)
+dynxml-local-se-innodb.titles: $(dynxml_local_se_innodb_SOURCES)
+dynxml-local-se-innodb.useless: $(dynxml_local_se_innodb_SOURCES)
+dynxml-local-se-innodb.valid: $(dynxml_local_se_innodb_SOURCES) $(dynxml_local_se_innodb_IDMAPS)
+dynxml-local-se-innodb.validwarn: $(dynxml_local_se_innodb_SOURCES) $(dynxml_local_se_innodb_IDMAPS)
+dynxml-local-se-innodb-prepped.xml: $(dynxml_local_se_innodb_SOURCES) $(dynxml_local_se_innodb_IDMAPS)
+dynxml-local-se-innodb-manprepped.xml: $(dynxml_local_se_innodb_SOURCES) $(dynxml_local_se_innodb_IDMAPS)
+dynxml-local-se-innodb-remprepped.xml: $(dynxml_local_se_innodb_SOURCES) $(dynxml_local_se_innodb_IDMAPS)
+dynxml-local-se-innodb.xml: $(dynxml_local_se_innodb_INCLUDES)
errmsgs_client_INCLUDES =
errmsgs_client_IMAGES =
errmsgs_client_SOURCES = errmsgs-client.xml $(errmsgs_client_INCLUDES)
@@ -1278,6 +1304,7 @@
dynxml-local-news-5.2.xml \
dynxml-local-news-6.0.xml \
dynxml-local-se-falcon.xml \
+ dynxml-local-se-innodb.xml \
errmsgs-client.xml \
errmsgs-server.xml \
errors-problems.xml \
@@ -1328,7 +1355,7 @@
se-example.xml \
se-falcon-core.xml \
se-federated.xml \
- se-innodb.xml \
+ se-innodb-core.xml \
se-memory.xml \
se-merge.xml \
se-myisam.xml \
@@ -2437,29 +2464,18 @@
se-federated-manprepped.xml: $(se_federated_SOURCES) $(se_federated_IDMAPS)
se-federated-remprepped.xml: $(se_federated_SOURCES) $(se_federated_IDMAPS)
-se_innodb_INCLUDES = \
- ../common/fixedchars.ent \
- ../common/phrases.ent \
- ../refman-common/urls.ent \
- all-entities.ent \
- versions.ent
-se_innodb_IMAGES =
-se_innodb_SOURCES = se-innodb.xml $(se_innodb_INCLUDES)
-se_innodb_IDMAPS = \
- metadata/dba-core.idmap \
- metadata/errors-problems.idmap \
- metadata/installing-core.idmap \
- metadata/se-innodb.idmap \
- metadata/sql-syntax.idmap \
- metadata/using-mysql-programs.idmap
-se-innodb.validpure: $(se_innodb_SOURCES)
-se-innodb.titles: $(se_innodb_SOURCES)
-se-innodb.useless: $(se_innodb_SOURCES)
-se-innodb.valid: $(se_innodb_SOURCES) $(se_innodb_IDMAPS)
-se-innodb.validwarn: $(se_innodb_SOURCES) $(se_innodb_IDMAPS)
-se-innodb-prepped.xml: $(se_innodb_SOURCES) $(se_innodb_IDMAPS)
-se-innodb-manprepped.xml: $(se_innodb_SOURCES) $(se_innodb_IDMAPS)
-se-innodb-remprepped.xml: $(se_innodb_SOURCES) $(se_innodb_IDMAPS)
+se_innodb_core_INCLUDES =
+se_innodb_core_IMAGES =
+se_innodb_core_SOURCES = se-innodb-core.xml $(se_innodb_core_INCLUDES)
+se_innodb_core_IDMAPS =
+se-innodb-core.validpure: $(se_innodb_core_SOURCES)
+se-innodb-core.titles: $(se_innodb_core_SOURCES)
+se-innodb-core.useless: $(se_innodb_core_SOURCES)
+se-innodb-core.valid: $(se_innodb_core_SOURCES) $(se_innodb_core_IDMAPS)
+se-innodb-core.validwarn: $(se_innodb_core_SOURCES) $(se_innodb_core_IDMAPS)
+se-innodb-core-prepped.xml: $(se_innodb_core_SOURCES) $(se_innodb_core_IDMAPS)
+se-innodb-core-manprepped.xml: $(se_innodb_core_SOURCES) $(se_innodb_core_IDMAPS)
+se-innodb-core-remprepped.xml: $(se_innodb_core_SOURCES) $(se_innodb_core_IDMAPS)
se_memory_INCLUDES = \
../common/fixedchars.ent \
@@ -2617,13 +2633,14 @@
../refman-common/urls.ent \
all-entities.ent \
dynxml-local-se-falcon.xml \
+ dynxml-local-se-innodb.xml \
se-archive.xml \
se-blackhole.xml \
se-csv.xml \
se-example.xml \
se-falcon-core.xml \
se-federated.xml \
- se-innodb.xml \
+ se-innodb-core.xml \
se-memory.xml \
se-merge.xml \
se-myisam.xml \
Modified: trunk/refman-6.0/se-falcon-core.xml
===================================================================
--- trunk/refman-6.0/se-innodb-core.xml (rev 0)
+++ trunk/refman-6.0/se-innodb-core.xml 2007-11-29 11:23:14 UTC (rev 8973)
Changed blocks: 2, Lines Added: 8355, Lines Deleted: 1; 309634 bytes
@@ -9,6 +9,15 @@
<remark role="dynamic-dependency-list"/>
+ <indexterm>
+ <primary>Falcon storage engine</primary>
+ </indexterm>
+
+ <indexterm>
+ <primary>tables</primary>
+ <secondary>Falcon</secondary>
+ </indexterm>
+
<para>
The Falcon Storage Engine has been designed with modern database
requirements in mind, and particularly for use within high-volume
Copied: trunk/refman-6.0/se-innodb-core.xml (from rev 8971, trunk/refman-6.0/se-innodb.xml)
===================================================================
--- trunk/refman-6.0/se-innodb-core.xml (rev 0)
+++ trunk/refman-6.0/se-innodb-core.xml 2007-11-29 11:23:14 UTC (rev 8973)
@@ -0,0 +1,8345 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!DOCTYPE section PUBLIC "-//OASIS//DTD DocBook XML V4.3//EN" "http://www.oasis-open.org/docbook/xml/4.3/docbookx.dtd" [
+<!ENTITY % all.entities SYSTEM "all-entities.ent">
+ %all.entities;
+]>
+<section id="innodb">
+
+ <title>The <literal>InnoDB</literal> Storage Engine</title>
+
+ <remark role="dynamic-dependency-list"/>
+
+ <indexterm>
+ <primary>InnoDB storage engine</primary>
+ </indexterm>
+
+ <indexterm>
+ <primary>tables</primary>
+ <secondary>InnoDB</secondary>
+ </indexterm>
+
+ <remark role="todo">
+ A number of node names in this chapter are very generic (e.g.,
+ "Backing up") and could apply in non-InnoDB contexts. Might be best
+ to rename a bunch of them.
+ </remark>
+
+ <section id="innodb-overview">
+
+ <title><literal>InnoDB</literal> Overview</title>
+
+ <indexterm>
+ <primary>InnoDB</primary>
+ </indexterm>
+
+ <indexterm>
+ <primary>transactions</primary>
+ <secondary>support</secondary>
+ </indexterm>
+
+ <indexterm>
+ <primary>transaction-safe tables</primary>
+ </indexterm>
+
+ <indexterm>
+ <primary>ACID</primary>
+ </indexterm>
+
+ <para>
+ <literal>InnoDB</literal> provides MySQL with a transaction-safe
+ (<literal>ACID</literal> compliant) storage engine that has
+ commit, rollback, and crash recovery capabilities.
+ <literal>InnoDB</literal> does locking on the row level and also
+ provides an Oracle-style consistent non-locking read in
+ <literal>SELECT</literal> statements. These features increase
+ multi-user concurrency and performance. There is no need for lock
+ escalation in <literal>InnoDB</literal> because row-level locks
+ fit in very little space. <literal>InnoDB</literal> also supports
+ <literal>FOREIGN KEY</literal> constraints. You can freely mix
+ <literal>InnoDB</literal> tables with tables from other MySQL
+ storage engines, even within the same statement.
+ </para>
+
+ <para>
+ <literal>InnoDB</literal> has been designed for maximum
+ performance when processing large data volumes. Its CPU efficiency
+ is probably not matched by any other disk-based relational
+ database engine.
+ </para>
+
+ <para>
+ Fully integrated with MySQL Server, the <literal>InnoDB</literal>
+ storage engine maintains its own buffer pool for caching data and
+ indexes in main memory. <literal>InnoDB</literal> stores its
+ tables and indexes in a tablespace, which may consist of several
+ files (or raw disk partitions). This is different from, for
+ example, <literal>MyISAM</literal> tables where each table is
+ stored using separate files. <literal>InnoDB</literal> tables can
+ be of any size even on operating systems where file size is
+ limited to 2GB.
+ </para>
+
+ <para>
+ <literal>InnoDB</literal> is included in binary distributions by
+ default. The Windows Essentials installer makes
+ <literal>InnoDB</literal> the MySQL default storage engine on
+ Windows.
+ </para>
+
+ <para>
+ <literal>InnoDB</literal> is used in production at numerous large
+ database sites requiring high performance. The famous Internet
+ news site Slashdot.org runs on <literal>InnoDB</literal>. Mytrix,
+ Inc. stores over 1TB of data in <literal>InnoDB</literal>, and
+ another site handles an average load of 800 inserts/updates per
+ second in <literal>InnoDB</literal>.
+ </para>
+
+ <para>
+ <literal>InnoDB</literal> is published under the same GNU GPL
+ License Version 2 (of June 1991) as MySQL. For more information on
+ MySQL licensing, see
+ <ulink url="http://www.mysql.com/company/legal/licensing/"/>.
+ </para>
+
+ <para>
+ <emphasis role="bold">Additional resources</emphasis>
+ </para>
+
+ <itemizedlist>
+
+ <listitem>
+ <para>
+ A forum dedicated to the <literal>InnoDB</literal> storage
+ engine is available at
+ <ulink url="&base-url-forum-list;?22"/>.
+ </para>
+ </listitem>
+
+ </itemizedlist>
+
+ </section>
+
+ <section id="innodb-contact-information">
+
+ <title><literal>InnoDB</literal> Contact Information</title>
+
+ <para>
+ Contact information for Innobase Oy, producer of the
+ <literal>InnoDB</literal> engine:
+ </para>
+
+<programlisting>
+Web site: <ulink url="http://www.innodb.com/"/>
+Email: <email>sales@stripped</email>
+Phone: +358-9-6969 3250 (office)
+ +358-40-5617367 (mobile)
+
+Innobase Oy Inc.
+World Trade Center Helsinki
+Aleksanterinkatu 17
+P.O.Box 800
+00101 Helsinki
+Finland
+</programlisting>
+
+ </section>
+
+ <section id="innodb-configuration">
+
+ <title><literal>InnoDB</literal> Configuration</title>
+
+ <para>
+ The <literal>InnoDB</literal> storage engine is enabled by
+ default. If you don't want to use <literal>InnoDB</literal>
+ tables, you can add the <literal>skip-innodb</literal> option to
+ your MySQL option file.
+ </para>
+
+ <note>
+ <para>
+ <literal>InnoDB</literal> provides MySQL with a transaction-safe
+ (<literal>ACID</literal> compliant) storage engine that has
+ commit, rollback, and crash recovery capabilities.
+ <emphasis role="bold">However, it cannot do so</emphasis> if the
+ underlying operating system or hardware does not work as
+ advertised. Many operating systems or disk subsystems may delay
+ or reorder write operations to improve performance. On some
+ operating systems, the very system call that should wait until
+ all unwritten data for a file has been flushed —
+ <literal>fsync()</literal> — might actually return before
+ the data has been flushed to stable storage. Because of this, an
+ operating system crash or a power outage may destroy recently
+ committed data, or in the worst case, even corrupt the database
+ because of write operations having been reordered. If data
+ integrity is important to you, you should perform some
+ <quote>pull-the-plug</quote> tests before using anything in
+ production. On Mac OS X 10.3 and up, <literal>InnoDB</literal>
+ uses a special <literal>fcntl()</literal> file flush method.
+ Under Linux, it is advisable to <emphasis role="bold">disable
+ the write-back cache</emphasis>.
+ </para>
+ </note>
+
+ <para>
+ On ATAPI hard disks, a command such <literal>hdparm -W0
+ /dev/hda</literal> may work to disable the write-back cache.
+ <emphasis role="bold">Beware that some drives or disk controllers
+ may be unable to disable the write-back cache.</emphasis>
+ </para>
+
+ <para>
+ Two important disk-based resources managed by the
+ <literal>InnoDB</literal> storage engine are its tablespace data
+ files and its log files.
+ </para>
+
+ <note>
+ <para>
+ If you specify no <literal>InnoDB</literal> configuration
+ options, MySQL creates an auto-extending 10MB data file named
+ <filename>ibdata1</filename> and two 5MB log files named
+ <filename>ib_logfile0</filename> and
+ <filename>ib_logfile1</filename> in the MySQL data directory. To
+ get good performance, you should explicitly provide
+ <literal>InnoDB</literal> parameters as discussed in the
+ following examples. Naturally, you should edit the settings to
+ suit your hardware and requirements.
+ </para>
+ </note>
+
+ <indexterm>
+ <primary>NFS</primary>
+ <secondary>InnoDB</secondary>
+ </indexterm>
+
+ <indexterm>
+ <primary>InnoDB</primary>
+ <secondary>NFS</secondary>
+ </indexterm>
+
+ <note>
+ <para>
+ It is not a good idea to configure <literal>InnoDB</literal> to
+ use datafiles or logfiles on NFS volumes. Otherwise, the files
+ might be locked by other processes and become unavailable for
+ use by MySQL.
+ </para>
+ </note>
+
+ <formalpara role="mnmas">
+
+ <title>MySQL Enterprise</title>
+
+ <para>
+ For advice on settings suitable to your specific circumstances,
+ subscribe to the MySQL Enterprise Monitor. For more information
+ see <ulink url="&base-url-enterprise;advisors.html"/>.
+ </para>
+
+ </formalpara>
+
+ <para>
+ The examples shown here are representative. See
+ <xref linkend="innodb-parameters"/> for additional information
+ about <literal>InnoDB</literal>-related configuration parameters.
+ </para>
+
+ <para>
+ To set up the <literal>InnoDB</literal> tablespace files, use the
+ <literal>innodb_data_file_path</literal> option in the
+ <literal>[mysqld]</literal> section of the
+ <filename>my.cnf</filename> option file. On Windows, you can use
+ <filename>my.ini</filename> instead. The value of
+ <literal>innodb_data_file_path</literal> should be a list of one
+ or more data file specifications. If you name more than one data
+ file, separate them by semicolon
+ (<quote><literal>;</literal></quote>) characters:
+ </para>
+
+<programlisting>
+innodb_data_file_path=<replaceable>datafile_spec1</replaceable>[;<replaceable>datafile_spec2</replaceable>]...
+</programlisting>
+
+ <para>
+ For example, a setting that explicitly creates a tablespace having
+ the same characteristics as the default is as follows:
+ </para>
+
+<programlisting>
+[mysqld]
+innodb_data_file_path=ibdata1:10M:autoextend
+</programlisting>
+
+ <para>
+ This setting configures a single 10MB data file named
+ <filename>ibdata1</filename> that is auto-extending. No location
+ for the file is given, so by default, <literal>InnoDB</literal>
+ creates it in the MySQL data directory.
+ </para>
+
+ <para>
+ Sizes are specified using <literal>M</literal> or
+ <literal>G</literal> suffix letters to indicate units of MB or GB.
+ </para>
+
+ <para>
+ A tablespace containing a fixed-size 50MB data file named
+ <filename>ibdata1</filename> and a 50MB auto-extending file named
+ <filename>ibdata2</filename> in the data directory can be
+ configured like this:
+ </para>
+
+<programlisting>
+[mysqld]
+innodb_data_file_path=ibdata1:50M;ibdata2:50M:autoextend
+</programlisting>
+
+ <para>
+ The full syntax for a data file specification includes the
+ filename, its size, and several optional attributes:
+ </para>
+
+<programlisting>
+<replaceable>file_name</replaceable>:<replaceable>file_size</replaceable>[:autoextend[:max:<replaceable>max_file_size</replaceable>]]
+</programlisting>
+
+ <para>
+ The <literal>autoextend</literal> attribute and those following
+ can be used only for the last data file in the
+ <literal>innodb_data_file_path</literal> line.
+ </para>
+
+ <para>
+ If you specify the <literal>autoextend</literal> option for the
+ last data file, <literal>InnoDB</literal> extends the data file if
+ it runs out of free space in the tablespace. The increment is 8MB
+ at a time by default. It can be modified by changing the
+ <literal>innodb_autoextend_increment</literal> system variable.
+ </para>
+
+ <para>
+ If the disk becomes full, you might want to add another data file
+ on another disk. Instructions for reconfiguring an existing
+ tablespace are given in <xref linkend="adding-and-removing"/>.
+ </para>
+
+ <para>
+ <literal>InnoDB</literal> is not aware of the filesystem maximum
+ file size, so be cautious on filesystems where the maximum file
+ size is a small value such as 2GB. To specify a maximum size for
+ an auto-extending data file, use the <literal>max</literal>
+ attribute. The following configuration allows
+ <filename>ibdata1</filename> to grow up to a limit of 500MB:
+ </para>
+
+<programlisting>
+[mysqld]
+innodb_data_file_path=ibdata1:10M:autoextend:max:500M
+</programlisting>
+
+ <para>
+ <literal>InnoDB</literal> creates tablespace files in the MySQL
+ data directory by default. To specify a location explicitly, use
+ the <literal>innodb_data_home_dir</literal> option. For example,
+ to use two files named <filename>ibdata1</filename> and
+ <filename>ibdata2</filename> but create them in the
+ <filename>/ibdata</filename> directory, configure
+ <literal>InnoDB</literal> like this:
+ </para>
+
+<programlisting>
+[mysqld]
+innodb_data_home_dir = /ibdata
+innodb_data_file_path=ibdata1:50M;ibdata2:50M:autoextend
+</programlisting>
+
+ <note>
+ <para>
+ <literal>InnoDB</literal> does not create directories, so make
+ sure that the <filename>/ibdata</filename> directory exists
+ before you start the server. This is also true of any log file
+ directories that you configure. Use the Unix or DOS
+ <literal>mkdir</literal> command to create any necessary
+ directories.
+ </para>
+ </note>
+
+ <para>
+ <literal>InnoDB</literal> forms the directory path for each data
+ file by textually concatenating the value of
+ <literal>innodb_data_home_dir</literal> to the data file name,
+ adding a pathname separator (slash or backslash) between values if
+ necessary. If the <literal>innodb_data_home_dir</literal> option
+ is not mentioned in <filename>my.cnf</filename> at all, the
+ default value is the <quote>dot</quote> directory
+ <filename>./</filename>, which means the MySQL data directory.
+ (The MySQL server changes its current working directory to its
+ data directory when it begins executing.)
+ </para>
+
+ <para>
+ If you specify <literal>innodb_data_home_dir</literal> as an empty
+ string, you can specify absolute paths for the data files listed
+ in the <literal>innodb_data_file_path</literal> value. The
+ following example is equivalent to the preceding one:
+ </para>
+
+<programlisting>
+[mysqld]
+innodb_data_home_dir =
+innodb_data_file_path=/ibdata/ibdata1:50M;/ibdata/ibdata2:50M:autoextend
+</programlisting>
+
+ <para>
+ <emphasis role="bold">A simple <filename>my.cnf</filename>
+ example.</emphasis> Suppose that you have a computer with 128MB
+ RAM and one hard disk. The following example shows possible
+ configuration parameters in <filename>my.cnf</filename> or
+ <filename>my.ini</filename> for <literal>InnoDB</literal>,
+ including the <literal>autoextend</literal> attribute. The example
+ suits most users, both on Unix and Windows, who do not want to
+ distribute <literal>InnoDB</literal> data files and log files onto
+ several disks. It creates an auto-extending data file
+ <filename>ibdata1</filename> and two <literal>InnoDB</literal> log
+ files <filename>ib_logfile0</filename> and
+ <filename>ib_logfile1</filename> in the MySQL data directory.
+ </para>
+
+<programlisting>
+[mysqld]
+# You can write your other MySQL server options here
+# ...
+# Data files must be able to hold your data and indexes.
+# Make sure that you have enough free disk space.
+innodb_data_file_path = ibdata1:10M:autoextend
+#
+# Set buffer pool size to 50-80% of your computer's memory
+innodb_buffer_pool_size=70M
+innodb_additional_mem_pool_size=10M
+#
+# Set the log file size to about 25% of the buffer pool size
+innodb_log_file_size=20M
+innodb_log_buffer_size=8M
+#
+innodb_flush_log_at_trx_commit=1
+</programlisting>
+
+ <para>
+ Make sure that the MySQL server has the proper access rights to
+ create files in the data directory. More generally, the server
+ must have access rights in any directory where it needs to create
+ data files or log files.
+ </para>
+
+ <para>
+ Note that data files must be less than 2GB in some filesystems.
+ The combined size of the log files must be less than 4GB. The
+ combined size of data files must be at least 10MB.
+ </para>
+
+ <para>
+ When you create an <literal>InnoDB</literal> tablespace for the
+ first time, it is best that you start the MySQL server from the
+ command prompt. <literal>InnoDB</literal> then prints the
+ information about the database creation to the screen, so you can
+ see what is happening. For example, on Windows, if
+ <command>mysqld</command> is located in <filename>C:\Program
+ Files\MySQL\MySQL Server ¤t-series;\bin</filename>, you can
+ start it like this:
+ </para>
+
+<programlisting>
+C:\> <userinput>"C:\Program Files\MySQL\MySQL Server ¤t-series;\bin\mysqld" --console</userinput>
+</programlisting>
+
+ <para>
+ If you do not send server output to the screen, check the server's
+ error log to see what <literal>InnoDB</literal> prints during the
+ startup process.
+ </para>
+
+ <para>
+ See <xref linkend="innodb-init"/>, for an example of what the
+ information displayed by <literal>InnoDB</literal> should look
+ like.
+ </para>
+
+ <para>
+ You can place <literal>InnoDB</literal> options in the
+ <literal>[mysqld]</literal> group of any option file that your
+ server reads when it starts. The locations for option files are
+ described in <xref linkend="option-files"/>.
+ </para>
+
+ <para>
+ If you installed MySQL on Windows using the installation and
+ configuration wizards, the option file will be the
+ <filename>my.ini</filename> file located in your MySQL
+ installation directory. See
+ <xref linkend="mysql-config-wizard-file-location"/>.
+ </para>
+
+ <para>
+ If your PC uses a boot loader where the <filename>C:</filename>
+ drive is not the boot drive, your only option is to use the
+ <filename>my.ini</filename> file in your Windows directory
+ (typically <filename>C:\WINDOWS</filename>). You can use the
+ <literal>SET</literal> command at the command prompt in a console
+ window to print the value of <literal>WINDIR</literal>:
+ </para>
+
+<programlisting>
+C:\> <userinput>SET WINDIR</userinput>
+windir=C:\WINDOWS
+</programlisting>
+
+ <para>
+ If you want to make sure that <command>mysqld</command> reads
+ options only from a specific file, you can use the
+ <option>--defaults-file</option> option as the first option on the
+ command line when starting the server:
+ </para>
+
+<programlisting>
+mysqld --defaults-file=<replaceable>your_path_to_my_cnf</replaceable>
+</programlisting>
+
+ <para>
+ <emphasis role="bold">An advanced <filename>my.cnf</filename>
+ example.</emphasis> Suppose that you have a Linux computer with
+ 2GB RAM and three 60GB hard disks at directory paths
+ <filename>/</filename>, <filename>/dr2</filename> and
+ <filename>/dr3</filename>. The following example shows possible
+ configuration parameters in <filename>my.cnf</filename> for
+ <literal>InnoDB</literal>.
+ </para>
+
+<programlisting>
+[mysqld]
+# You can write your other MySQL server options here
+# ...
+innodb_data_home_dir =
+#
+# Data files must be able to hold your data and indexes
+innodb_data_file_path = /db/ibdata1:2000M;/dr2/db/ibdata2:2000M:autoextend
+#
+# Set buffer pool size to 50-80% of your computer's memory,
+# but make sure on Linux x86 total memory usage is < 2GB
+innodb_buffer_pool_size=1G
+innodb_additional_mem_pool_size=20M
+innodb_log_group_home_dir = /dr3/iblogs
+#
+innodb_log_files_in_group = 2
+#
+# Set the log file size to about 25% of the buffer pool size
+innodb_log_file_size=250M
+innodb_log_buffer_size=8M
+#
+innodb_flush_log_at_trx_commit=1
+innodb_lock_wait_timeout=50
+#
+# Uncomment the next lines if you want to use them
+#innodb_thread_concurrency=5
+</programlisting>
+
+ <para>
+ In some cases, database performance improves if all the data is
+ not placed on the same physical disk. Putting log files on a
+ different disk from data is very often beneficial for performance.
+ The example illustrates how to do this. It places the two data
+ files on different disks and places the log files on the third
+ disk. <literal>InnoDB</literal> fills the tablespace beginning
+ with the first data file. You can also use raw disk partitions
+ (raw devices) as <literal>InnoDB</literal> data files, which may
+ speed up I/O. See <xref linkend="innodb-raw-devices"/>.
+ </para>
+
+ <warning>
+ <para>
+ On 32-bit GNU/Linux x86, you must be careful not to set memory
+ usage too high. <literal>glibc</literal> may allow the process
+ heap to grow over thread stacks, which crashes your server. It
+ is a risk if the value of the following expression is close to
+ or exceeds 2GB:
+ </para>
+ </warning>
+
+<programlisting>
+innodb_buffer_pool_size
++ key_buffer_size
++ max_connections*(sort_buffer_size+read_buffer_size+binlog_cache_size)
++ max_connections*2MB
+</programlisting>
+
+ <para>
+ Each thread uses a stack (often 2MB, but only 256KB in MySQL AB
+ binaries) and in the worst case also uses
+ <literal>sort_buffer_size + read_buffer_size</literal> additional
+ memory.
+ </para>
+
+ <para>
+ By compiling MySQL yourself, you can use up to 64GB of physical
+ memory in 32-bit Windows. See the description for
+ <literal>innodb_buffer_pool_awe_mem_mb</literal> in
+ <xref linkend="innodb-parameters"/>.
+ </para>
+
+ <para>
+ <emphasis role="bold">How to tune other <command>mysqld</command>
+ server parameters?</emphasis> The following values are typical and
+ suit most users:
+ </para>
+
+<programlisting>
+[mysqld]
+skip-external-locking
+max_connections=200
+read_buffer_size=1M
+sort_buffer_size=1M
+#
+# Set key_buffer to 5 - 50% of your RAM depending on how much
+# you use MyISAM tables, but keep key_buffer_size + InnoDB
+# buffer pool size < 80% of your RAM
+key_buffer_size=<replaceable>value</replaceable>
+</programlisting>
+
+ <section id="multiple-tablespaces">
+
+ <title>Using Per-Table Tablespaces</title>
+
+ <para>
+ You can store each <literal>InnoDB</literal> table and its
+ indexes in its own file. This feature is called <quote>multiple
+ tablespaces</quote> because in effect each table has its own
+ tablespace.
+ </para>
+
+ <para>
+ Using multiple tablespaces can be beneficial to users who want
+ to move specific tables to separate physical disks or who wish
+ to restore backups of single tables quickly without interrupting
+ the use of the remaining <literal>InnoDB</literal> tables.
+ </para>
+
+ <para>
+ You can enable multiple tablespaces by adding this line to the
+ <literal>[mysqld]</literal> section of
+ <filename>my.cnf</filename>:
+ </para>
+
+<programlisting>
+[mysqld]
+innodb_file_per_table
+</programlisting>
+
+ <para>
+ After restarting the server, <literal>InnoDB</literal> stores
+ each newly created table into its own file
+ <filename><replaceable>tbl_name</replaceable>.ibd</filename> in
+ the database directory where the table belongs. This is similar
+ to what the <literal>MyISAM</literal> storage engine does, but
+ <literal>MyISAM</literal> divides the table into a data file
+ <filename><replaceable>tbl_name</replaceable>.MYD</filename> and
+ the index file
+ <filename><replaceable>tbl_name</replaceable>.MYI</filename>.
+ For <literal>InnoDB</literal>, the data and the indexes are
+ stored together in the <filename>.ibd</filename> file. The
+ <filename><replaceable>tbl_name</replaceable>.frm</filename>
+ file is still created as usual.
+ </para>
+
+ <para>
+ If you remove the <literal>innodb_file_per_table</literal> line
+ from <filename>my.cnf</filename> and restart the server,
+ <literal>InnoDB</literal> creates tables inside the shared
+ tablespace files again.
+ </para>
+
+ <para>
+ <literal>innodb_file_per_table</literal> affects only table
+ creation, not access to existing tables. If you start the server
+ with this option, new tables are created using
+ <filename>.ibd</filename> files, but you can still access tables
+ that exist in the shared tablespace. If you remove the option
+ and restart the server, new tables are created in the shared
+ tablespace, but you can still access any tables that were
+ created using multiple tablespaces.
+ </para>
+
+ <note>
+ <para>
+ <literal>InnoDB</literal> always needs the shared tablespace
+ because it puts its internal data dictionary and undo logs
+ there. The <filename>.ibd</filename> files are not sufficient
+ for <literal>InnoDB</literal> to operate.
+ </para>
+ </note>
+
+ <note>
+ <para>
+ You cannot freely move <filename>.ibd</filename> files between
+ database directories as you can with <literal>MyISAM</literal>
+ table files. This is because the table definition that is
+ stored in the <literal>InnoDB</literal> shared tablespace
+ includes the database name, and because
+ <literal>InnoDB</literal> must preserve the consistency of
+ transaction IDs and log sequence numbers.
+ </para>
+ </note>
+
+ <para>
+ To move an <filename>.ibd</filename> file and the associated
+ table from one database to another, use a <literal>RENAME
+ TABLE</literal> statement:
+ </para>
+
+<programlisting>
+RENAME TABLE <replaceable>db1.tbl_name</replaceable> TO <replaceable>db2.tbl_name</replaceable>;
+</programlisting>
+
+ <indexterm>
+ <primary>DISCARD TABLESPACE</primary>
+ </indexterm>
+
+ <indexterm>
+ <primary>IMPORT TABLESPACE</primary>
+ </indexterm>
+
+ <para>
+ If you have a <quote>clean</quote> backup of an
+ <filename>.ibd</filename> file, you can restore it to the MySQL
+ installation from which it originated as follows:
+ </para>
+
+ <orderedlist>
+
+ <listitem>
+ <para>
+ Issue this <literal>ALTER TABLE</literal> statement:
+ </para>
+
+<programlisting>
+ALTER TABLE <replaceable>tbl_name</replaceable> DISCARD TABLESPACE;
+</programlisting>
+
+ <caution>
+ <para>
+ This statement deletes the current
+ <filename>.ibd</filename> file.
+ </para>
+ </caution>
+ </listitem>
+
+ <listitem>
+ <para>
+ Put the backup <filename>.ibd</filename> file back in the
+ proper database directory.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Issue this <literal>ALTER TABLE</literal> statement:
+ </para>
+
+<programlisting>
+ALTER TABLE <replaceable>tbl_name</replaceable> IMPORT TABLESPACE;
+</programlisting>
+ </listitem>
+
+ </orderedlist>
+
+ <para>
+ In this context, a <quote>clean</quote>
+ <filename>.ibd</filename> file backup means:
+ </para>
+
+ <itemizedlist>
+
+ <listitem>
+ <para>
+ There are no uncommitted modifications by transactions in
+ the <filename>.ibd</filename> file.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ There are no unmerged insert buffer entries in the
+ <filename>.ibd</filename> file.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Purge has removed all delete-marked index records from the
+ <filename>.ibd</filename> file.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <command>mysqld</command> has flushed all modified pages of
+ the <filename>.ibd</filename> file from the buffer pool to
+ the file.
+ </para>
+ </listitem>
+
+ </itemizedlist>
+
+ <para>
+ You can make a clean backup <filename>.ibd</filename> file using
+ the following method:
+ </para>
+
+ <orderedlist>
+
+ <listitem>
+ <para>
+ Stop all activity from the <command>mysqld</command> server
+ and commit all transactions.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Wait until <literal>SHOW ENGINE INNODB STATUS</literal>
+ shows that there are no active transactions in the database,
+ and the main thread status of <literal>InnoDB</literal> is
+ <literal>Waiting for server activity</literal>. Then you can
+ make a copy of the <filename>.ibd</filename> file.
+ </para>
+ </listitem>
+
+ </orderedlist>
+
+ <para>
+ Another method for making a clean copy of an
+ <filename>.ibd</filename> file is to use the commercial
+ <command>InnoDB Hot Backup</command> tool:
+ </para>
+
+ <orderedlist>
+
+ <listitem>
+ <para>
+ Use <command>InnoDB Hot Backup</command> to back up the
+ <literal>InnoDB</literal> installation.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Start a second <command>mysqld</command> server on the
+ backup and let it clean up the <filename>.ibd</filename>
+ files in the backup.
+ </para>
+ </listitem>
+
+ </orderedlist>
+
+ </section>
+
+ <section id="innodb-raw-devices">
+
+ <title>Using Raw Devices for the Shared Tablespace</title>
+
+ <para>
+ You can use raw disk partitions as data files in the shared
+ tablespace. By using a raw disk, you can perform non-buffered
+ I/O on Windows and on some Unix systems without filesystem
+ overhead, which may improve performance.
+ </para>
+
+ <para>
+ When you create a new data file, you must put the keyword
+ <literal>newraw</literal> immediately after the data file size
+ in <literal>innodb_data_file_path</literal>. The partition must
+ be at least as large as the size that you specify. Note that 1MB
+ in <literal>InnoDB</literal> is 1024 × 1024 bytes, whereas
+ 1MB in disk specifications usually means 1,000,000 bytes.
+ </para>
+
+<programlisting>
+[mysqld]
+innodb_data_home_dir=
+innodb_data_file_path=/dev/hdd1:3Gnewraw;/dev/hdd2:2Gnewraw
+</programlisting>
+
+ <para>
+ The next time you start the server, <literal>InnoDB</literal>
+ notices the <literal>newraw</literal> keyword and initializes
+ the new partition. However, do not create or change any
+ <literal>InnoDB</literal> tables yet. Otherwise, when you next
+ restart the server, <literal>InnoDB</literal> reinitializes the
+ partition and your changes are lost. (As a safety measure
+ <literal>InnoDB</literal> prevents users from modifying data
+ when any partition with <literal>newraw</literal> is specified.)
+ </para>
+
+ <para>
+ After <literal>InnoDB</literal> has initialized the new
+ partition, stop the server, change <literal>newraw</literal> in
+ the data file specification to <literal>raw</literal>:
+ </para>
+
+<programlisting>
+[mysqld]
+innodb_data_home_dir=
+innodb_data_file_path=/dev/hdd1:5Graw;/dev/hdd2:2Graw
+</programlisting>
+
+ <para>
+ Then restart the server and <literal>InnoDB</literal> allows
+ changes to be made.
+ </para>
+
+ <para>
+ On Windows, you can allocate a disk partition as a data file
+ like this:
+ </para>
+
+<programlisting>
+[mysqld]
+innodb_data_home_dir=
+innodb_data_file_path=//./D::10Gnewraw
+</programlisting>
+
+ <para>
+ The <filename>//./</filename> corresponds to the Windows syntax
+ of <filename>\\.\</filename> for accessing physical drives.
+ </para>
+
+ <para>
+ When you use raw disk partitions, be sure that they have
+ permissions that allow read and write access by the account used
+ for running the MySQL server.
+ </para>
+
+ </section>
+
+ </section>
+
+ <section id="innodb-parameters">
+
+ <title><literal>InnoDB</literal> Startup Options and System Variables</title>
+
+ <para>
+ This section describes the <literal>InnoDB</literal>-related
+ command options and system variables. System variables that are
+ true or false can be enabled at server startup by naming them, or
+ disabled by using a <literal>skip-</literal> prefix. For example,
+ to enable or disable <literal>InnoDB</literal> checksums, you can
+ use <option>--innodb_checksums</option> or
+ <option>--skip-innodb_checksums</option> on the command line, or
+ <literal>innodb_checksums</literal> or
+ <literal>skip-innodb_checksums</literal> in an option file. System
+ variables that take a numeric value can be specified as
+ <option>--<replaceable>var_name</replaceable>=<replaceable>value</replaceable></option>
+ on the command line or as
+ <literal><replaceable>var_name</replaceable>=<replaceable>value</replaceable></literal>
+ in option files. For more information on specifying options and
+ system variables, see <xref linkend="program-options"/>. Many of
+ the system variables can be changed at runtime (see
+ <xref linkend="dynamic-system-variables"/>).
+ </para>
+
+ <formalpara role="mnmas">
+
+ <title>MySQL Enterprise</title>
+
+ <para>
+ The MySQL Enterprise Monitor provides expert advice on InnoDB
+ start-up options and related system variables. For more
+ information see
+ <ulink url="&base-url-enterprise;advisors.html"/>.
+ </para>
+
+ </formalpara>
+
+ <para>
+ <literal>InnoDB</literal> command options:
+ </para>
+
+ <para condition="dynamic:optvar:fullsummary" role="6.0:mysqld:section-innodb"/>
+
+ <itemizedlist>
+
+ <listitem>
+ <para id="option_mysqld_innodb">
+ <indexterm>
+ <primary>mysqld</primary>
+ <secondary>innodb option</secondary>
+ </indexterm>
+
+ <indexterm>
+ <primary>innodb option</primary>
+ <secondary>mysqld</secondary>
+ </indexterm>
+
+ <option>--innodb</option>
+ </para>
+
+ <para>
+ Enables the <literal>InnoDB</literal> storage engine, if the
+ server was compiled with <literal>InnoDB</literal> support.
+ Use <option>--skip-innodb</option> to disable
+ <literal>InnoDB</literal>.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para id="option_mysqld_innodb_status_file">
+ <indexterm>
+ <primary>mysqld</primary>
+ <secondary>innodb_status_file option</secondary>
+ </indexterm>
+
+ <indexterm>
+ <primary>innodb_status_file option</primary>
+ <secondary>mysqld</secondary>
+ </indexterm>
+
+ <option>--innodb_status_file</option>
+ </para>
+
+ <para>
+ Causes <literal>InnoDB</literal> to create a file named
+ <filename><replaceable><datadir></replaceable>/innodb_status.<replaceable><pid></replaceable></filename>
+ in the MySQL data directory. <literal>InnoDB</literal>
+ periodically writes the output of <literal>SHOW ENGINE INNODB
+ STATUS</literal> to this file.
+ </para>
+ </listitem>
+
+ </itemizedlist>
+
+ <para>
+ <literal>InnoDB</literal> system variables:
+ </para>
+
+ <itemizedlist>
+
+ <listitem>
+ <para id="option_mysqld_innodb_additional_mem_pool_size">
+ <literal>innodb_additional_mem_pool_size</literal>
+ </para>
+
+ <para>
+ The size in bytes of a memory pool <literal>InnoDB</literal>
+ uses to store data dictionary information and other internal
+ data structures. The more tables you have in your application,
+ the more memory you need to allocate here. If
+ <literal>InnoDB</literal> runs out of memory in this pool, it
+ starts to allocate memory from the operating system and writes
+ warning messages to the MySQL error log. The default value is
+ 1MB.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para id="option_mysqld_innodb_autoextend_increment">
+ <literal>innodb_autoextend_increment</literal>
+ </para>
+
+ <para>
+ The increment size (in MB) for extending the size of an
+ auto-extending tablespace when it becomes full. The default
+ value is 8.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para id="option_mysqld_innodb_buffer_pool_awe_mem_mb">
+ <literal>innodb_buffer_pool_awe_mem_mb</literal>
+ </para>
+
+ <para>
+ The size of the buffer pool (in MB), if it is placed in the
+ AWE memory. This is relevant only in 32-bit Windows. If your
+ 32-bit Windows operating system supports more than 4GB memory,
+ using so-called <quote>Address Windowing Extensions,</quote>
+ you can allocate the <literal>InnoDB</literal> buffer pool
+ into the AWE physical memory using this variable. The maximum
+ possible value for this variable is 63000. If it is greater
+ than 0, <literal>innodb_buffer_pool_size</literal> is the
+ window in the 32-bit address space of
+ <command>mysqld</command> where <literal>InnoDB</literal> maps
+ that AWE memory. A good value for
+ <literal>innodb_buffer_pool_size</literal> is 500MB.
+ </para>
+
+ <para>
+ To take advantage of AWE memory, you will need to recompile
+ MySQL yourself. The current project settings needed for doing
+ this can be found in the
+ <filename>storage/innobase/os/os0proj.c</filename> source
+ file.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para id="option_mysqld_innodb_autoinc_lock_mode">
+ <literal>innodb_autoinc_lock_mode</literal>
+ </para>
+
+ <para>
+ The locking mode to use for generating auto-increment values.
+ The allowable values are 0, 1, or 2, for
+ <quote>traditional</quote>, <quote>consecutive</quote>, or
+ <quote>interleaved</quote> lock mode, respectively. The
+ characteristics of these modes are described in
+ <xref linkend="innodb-auto-increment-handling"/>.
+ </para>
+
+ <para>
+ This variable has a default of 1 (<quote>consecutive</quote>
+ lock mode).
+ </para>
+ </listitem>
+
+ <listitem>
+ <para id="option_mysqld_innodb_buffer_pool_size">
+ <literal>innodb_buffer_pool_size</literal>
+ </para>
+
+ <para>
+ The size in bytes of the memory buffer
+ <literal>InnoDB</literal> uses to cache data and indexes of
+ its tables. The larger you set this value, the less disk I/O
+ is needed to access data in tables. On a dedicated database
+ server, you may set this to up to 80% of the machine physical
+ memory size. However, do not set it too large because
+ competition for physical memory might cause paging in the
+ operating system.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para id="option_mysqld_innodb_checksums">
+ <literal>innodb_checksums</literal>
+ </para>
+
+ <para>
+ <literal>InnoDB</literal> can use checksum validation on all
+ pages read from the disk to ensure extra fault tolerance
+ against broken hardware or data files. This validation is
+ enabled by default. However, under some rare circumstances
+ (such as when running benchmarks) this extra safety feature is
+ unneeded and can be disabled with
+ <option>--skip-innodb-checksums</option>.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para id="option_mysqld_innodb_commit_concurrency">
+ <literal>innodb_commit_concurrency</literal>
+ </para>
+
+ <para>
+ The number of threads that can commit at the same time.
+ Setting this parameter to 0 allows any number of transactions
+ to commit simultaneously.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para id="option_mysqld_innodb_concurrency_tickets">
+ <literal>innodb_concurrency_tickets</literal>
+ </para>
+
+ <para>
+ The number of threads that can enter <literal>InnoDB</literal>
+ concurrently is determined by the
+ <literal>innodb_thread_concurrency</literal> variable. A
+ thread is placed in a queue when it tries to enter
+ <literal>InnoDB</literal> if the number of threads has already
+ reached the concurrency limit. When a thread is allowed to
+ enter <literal>InnoDB</literal>, it is given a number of
+ <quote>free tickets</quote> equal to the value of
+ <literal>innodb_concurrency_tickets</literal>, and the thread
+ can enter and leave <literal>InnoDB</literal> freely until it
+ has used up its tickets. After that point, the thread again
+ becomes subject to the concurrency check (and possible
+ queuing) the next time it tries to enter
+ <literal>InnoDB</literal>.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para id="option_mysqld_innodb_data_file_path">
+ <literal>innodb_data_file_path</literal>
+ </para>
+
+ <para>
+ The paths to individual data files and their sizes. The full
+ directory path to each data file is formed by concatenating
+ <literal>innodb_data_home_dir</literal> to each path specified
+ here. The file sizes are specified in MB or GB (1024MB) by
+ appending <literal>M</literal> or <literal>G</literal> to the
+ size value. The sum of the sizes of the files must be at least
+ 10MB. If you do not specify
+ <literal>innodb_data_file_path</literal>, the default behavior
+ is to create a single 10MB auto-extending data file named
+ <filename>ibdata1</filename>. The size limit of individual
+ files is determined by your operating system. You can set the
+ file size to more than 4GB on those operating systems that
+ support big files. You can also use raw disk partitions as
+ data files. See <xref linkend="innodb-raw-devices"/>.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para id="option_mysqld_innodb_data_home_dir">
+ <literal>innodb_data_home_dir</literal>
+ </para>
+
+ <para>
+ The common part of the directory path for all
+ <literal>InnoDB</literal> data files. If you do not set this
+ value, the default is the MySQL data directory. You can
+ specify the value as an empty string, in which case you can
+ use absolute file paths in
+ <literal>innodb_data_file_path</literal>.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para id="option_mysqld_innodb_doublewrite">
+ <literal>innodb_doublewrite</literal>
+ </para>
+
+ <para>
+ By default, <literal>InnoDB</literal> stores all data twice,
+ first to the doublewrite buffer, and then to the actual data
+ files. This variable is enabled by default. It can be turned
+ off with <option>--skip-innodb_doublewrite</option> for
+ benchmarks or cases when top performance is needed rather than
+ concern for data integrity or possible failures.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para id="option_mysqld_innodb_fast_shutdown">
+ <literal>innodb_fast_shutdown</literal>
+ </para>
+
+ <para>
+ If you set this variable to 0, <literal>InnoDB</literal> does
+ a full purge and an insert buffer merge before a shutdown.
+ These operations can take minutes, or even hours in extreme
+ cases. If you set this variable to 1,
+ <literal>InnoDB</literal> skips these operations at shutdown.
+ The default value is 1. If you set it to 2,
+ <literal>InnoDB</literal> will just flush its logs and then
+ shut down cold, as if MySQL had crashed; no committed
+ transaction will be lost, but crash recovery will be done at
+ the next startup. A value of 2 cannot be used on NetWare.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para id="option_mysqld_innodb_file_io_threads">
+ <literal>innodb_file_io_threads</literal>
+ </para>
+
+ <para>
+ The number of file I/O threads in <literal>InnoDB</literal>.
+ Normally, this should be left at the default value of 4, but
+ disk I/O on Windows may benefit from a larger number. On Unix,
+ increasing the number has no effect; <literal>InnoDB</literal>
+ always uses the default value.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para id="option_mysqld_innodb_file_per_table">
+ <literal>innodb_file_per_table</literal>
+ </para>
+
+ <para>
+ If this variable is enabled, <literal>InnoDB</literal> creates
+ each new table using its own <filename>.ibd</filename> file
+ for storing data and indexes, rather than in the shared
+ tablespace. The default is to create tables in the shared
+ tablespace. See <xref linkend="multiple-tablespaces"/>.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para id="option_mysqld_innodb_flush_log_at_trx_commit">
+ <literal>innodb_flush_log_at_trx_commit</literal>
+ </para>
+
+ <para>
+ When <literal>innodb_flush_log_at_trx_commit</literal> is set
+ to 0, the log buffer is written out to the log file once per
+ second and the flush to disk operation is performed on the log
+ file, but nothing is done at a transaction commit. When this
+ value is 1 (the default), the log buffer is written out to the
+ log file at each transaction commit and the flush to disk
+ operation is performed on the log file. When set to 2, the log
+ buffer is written out to the file at each commit, but the
+ flush to disk operation is not performed on it. However, the
+ flushing on the log file takes place once per second also when
+ the value is 2. Note that the once-per-second flushing is not
+ 100% guaranteed to happen every second, due to process
+ scheduling issues.
+ </para>
+
+ <para>
+ The default value of this variable is 1, which is the value
+ that is required for ACID compliance. You can achieve better
+ performance by setting the value different from 1, but then
+ you can lose at most one second worth of transactions in a
+ crash. If you set the value to 0, then any
+ <command>mysqld</command> process crash can erase the last
+ second of transactions. If you set the value to 2, then only
+ an operating system crash or a power outage can erase the last
+ second of transactions. However, <literal>InnoDB</literal>'s
+ crash recovery is not affected and thus crash recovery does
+ work regardless of the value. Note that many operating systems
+ and some disk hardware fool the flush-to-disk operation. They
+ may tell <command>mysqld</command> that the flush has taken
+ place, even though it has not. Then the durability of
+ transactions is not guaranteed even with the setting 1, and in
+ the worst case a power outage can even corrupt the
+ <literal>InnoDB</literal> database. Using a battery-backed
+ disk cache in the SCSI disk controller or in the disk itself
+ speeds up file flushes, and makes the operation safer. You can
+ also try using the Unix command <command>hdparm</command> to
+ disable the caching of disk writes in hardware caches, or use
+ some other command specific to the hardware vendor.
+ </para>
+
+ <para>
+ Note: For the greatest possible durability and consistency in
+ a replication setup using <literal>InnoDB</literal> with
+ transactions, you should use
+ <literal>innodb_flush_log_at_trx_commit=1</literal> and
+ <literal>sync_binlog=1</literal> in your master server
+ <filename>my.cnf</filename> file.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para id="option_mysqld_innodb_flush_method">
+ <literal>innodb_flush_method</literal>
+ </para>
+
+ <para>
+ If set to <literal>fdatasync</literal> (the default),
+ <literal>InnoDB</literal> uses <literal>fsync()</literal> to
+ flush both the data and log files. If set to
+ <literal>O_DSYNC</literal>, <literal>InnoDB</literal> uses
+ <literal>O_SYNC</literal> to open and flush the log files, but
+ uses <literal>fsync()</literal> to flush the data files. If
+ <literal>O_DIRECT</literal> is specified (available on some
+ GNU/Linux versions, FreeBSD and Solaris),
+ <literal>InnoDB</literal> uses <literal>O_DIRECT</literal> (or
+ <literal>directio()</literal> on Solaris) to open the data
+ files, and uses <literal>fsync()</literal> to flush both the
+ data and log files. Note that <literal>InnoDB</literal> uses
+ <literal>fsync()</literal> instead of
+ <literal>fdatasync()</literal>, and it does not use
+ <literal>O_DSYNC</literal> by default because there have been
+ problems with it on many varieties of Unix. This variable is
+ relevant only for Unix. On Windows, the flush method is always
+ <literal>async_unbuffered</literal> and cannot be changed.
+ </para>
+
+ <para>
+ Different values of this variable can have a marked effect on
+ <literal>InnoDB performance</literal>. For example, on some
+ systems where <literal>InnoDB</literal> data and log files are
+ located on a SAN, it has been found that setting
+ <literal>innodb_flush_method</literal> to
+ <literal>O_DIRECT</literal> can degrade performance of simple
+ <literal>SELECT</literal> statements by a factor of three.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para id="option_mysqld_innodb_force_recovery">
+ <literal>innodb_force_recovery</literal>
+ </para>
+
+ <para>
+ The crash recovery mode.
+ </para>
+
+ <warning>
+ <para>
+ This variable should be set greater than 0 only in an
+ emergency situation when you want to dump your tables from a
+ corrupt database! Possible values are from 1 to 6. The
+ meanings of these values are described in
+ <xref linkend="forcing-recovery"/>. As a safety measure,
+ <literal>InnoDB</literal> prevents any changes to its data
+ when this variable is greater than 0.
+ </para>
+ </warning>
+ </listitem>
+
+ <listitem>
+ <para id="option_mysqld_innodb_lock_wait_timeout">
+ <literal>innodb_lock_wait_timeout</literal>
+ </para>
+
+ <para>
+ The timeout in seconds an <literal>InnoDB</literal>
+ transaction may wait for a lock before being rolled back.
+ <literal>InnoDB</literal> automatically detects transaction
+ deadlocks in its own lock table and rolls back the
+ transaction. <literal>InnoDB</literal> notices locks set using
+ the <literal>LOCK TABLES</literal> statement. The default is
+ 50 seconds.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para id="option_mysqld_innodb_locks_unsafe_for_binlog">
+ <literal>innodb_locks_unsafe_for_binlog</literal>
+ </para>
+
+ <para>
+ <indexterm>
+ <primary>next-key lock</primary>
+ </indexterm>
+
+ <indexterm>
+ <primary>gap lock</primary>
+ </indexterm>
+
+ This variable controls next-key locking in
+ <literal>InnoDB</literal> searches and index scans. By
+ default, this variable is 0 (disabled), which means that
+ next-key locking is enabled.
+ </para>
+
+ <para>
+ Normally, <literal>InnoDB</literal> uses an algorithm called
+ <firstterm>next-key locking</firstterm>.
+ <literal>InnoDB</literal> performs row-level locking in such a
+ way that when it searches or scans a table index, it sets
+ shared or exclusive locks on any index records it encounters.
+ Thus, the row-level locks are actually index record locks. The
+ locks that <literal>InnoDB</literal> sets on index records
+ also affect the <quote>gap</quote> preceding that index
+ record. If a user has a shared or exclusive lock on record
+ <emphasis>R</emphasis> in an index, another user cannot insert
+ a new index record immediately before <emphasis>R</emphasis>
+ in the order of the index. Enabling this variable causes
+ <literal>InnoDB</literal> not to use next-key locking in
+ searches or index scans. Next-key locking is still used to
+ ensure foreign key constraints and duplicate key checking.
+ Note that enabling this variable may cause phantom problems:
+ Suppose that you want to read and lock all children from the
+ <literal>child</literal> table with an identifier value larger
+ than 100, with the intention of updating some column in the
+ selected rows later:
+ </para>
+
+<programlisting>
+SELECT * FROM child WHERE id > 100 FOR UPDATE;
+</programlisting>
+
+ <para>
+ Suppose that there is an index on the <literal>id</literal>
+ column. The query scans that index starting from the first
+ record where <literal>id</literal> is greater than 100. If the
+ locks set on the index records do not lock out inserts made in
+ the gaps, another client can insert a new row into the table.
+ If you execute the same <literal>SELECT</literal> within the
+ same transaction, you see a new row in the result set returned
+ by the query. This also means that if new items are added to
+ the database, <literal>InnoDB</literal> does not guarantee
+ serializability. Therefore, if this variable is enabled,
+ <literal>InnoDB</literal> guarantees at most isolation level
+ <literal>READ COMMITTED</literal>. (Conflict serializability
+ is still guaranteed.)
+ </para>
+
+ <para>
+ Enabling this variable has an additional effect:
+ <literal>InnoDB</literal> in an <literal>UPDATE</literal> or a
+ <literal>DELETE</literal> only locks rows that it updates or
+ deletes. This greatly reduces the probability of deadlocks,
+ but they can happen. Note that enabling this variable still
+ does not allow operations such as <literal>UPDATE</literal> to
+ overtake other similar operations (such as another
+ <literal>UPDATE</literal>) even in the case when they affect
+ different rows. Consider the following example, beginning with
+ this table:
+ </para>
+
+<programlisting>
+CREATE TABLE A(A INT NOT NULL, B INT) ENGINE = InnoDB;
+INSERT INTO A VALUES (1,2),(2,3),(3,2),(4,3),(5,2);
+COMMIT;
+</programlisting>
+
+ <para>
+ Suppose that one client executes these statements:
+ </para>
+
+<programlisting>
+SET AUTOCOMMIT = 0;
+UPDATE A SET B = 5 WHERE B = 3;
+</programlisting>
+
+ <para>
+ Then suppose that another client executes these statements
+ following those of the first client:
+ </para>
+
+<programlisting>
+SET AUTOCOMMIT = 0;
+UPDATE A SET B = 4 WHERE B = 2;
+</programlisting>
+
+ <para>
+ In this case, the second <literal>UPDATE</literal> must wait
+ for a commit or rollback of the first
+ <literal>UPDATE</literal>. The first <literal>UPDATE</literal>
+ has an exclusive lock on row (2,3), and the second
+ <literal>UPDATE</literal> while scanning rows also tries to
+ acquire an exclusive lock for the same row, which it cannot
+ have. This is because <literal>UPDATE</literal> two first
+ acquires an exclusive lock on a row and then determines
+ whether the row belongs to the result set. If not, it releases
+ the unnecessary lock, when the
+ <literal>innodb_locks_unsafe_for_binlog</literal> variable is
+ enabled.
+ </para>
+
+ <para>
+ Therefore, <literal>InnoDB</literal> executes
+ <literal>UPDATE</literal> one as follows:
+ </para>
+
+<programlisting>
+x-lock(1,2)
+unlock(1,2)
+x-lock(2,3)
+update(2,3) to (2,5)
+x-lock(3,2)
+unlock(3,2)
+x-lock(4,3)
+update(4,3) to (4,5)
+x-lock(5,2)
+unlock(5,2)
+</programlisting>
+
+ <para>
+ <literal>InnoDB</literal> executes <literal>UPDATE</literal>
+ two as follows:
+ </para>
+
+<programlisting>
+x-lock(1,2)
+update(1,2) to (1,4)
+x-lock(2,3) - wait for query one to commit or rollback
+</programlisting>
+ </listitem>
+
+ <listitem>
+ <para id="option_mysqld_innodb_log_archive">
+ <literal>innodb_log_archive</literal>
+ </para>
+
+ <para>
+ Whether to log <literal>InnoDB</literal> archive files. This
+ variable is present for historical reasons, but is unused.
+ Recovery from a backup is done by MySQL using its own log
+ files, so there is no need to archive
+ <literal>InnoDB</literal> log files. The default for this
+ variable is 0.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para id="option_mysqld_innodb_log_buffer_size">
+ <literal>innodb_log_buffer_size</literal>
+ </para>
+
+ <para>
+ The size in bytes of the buffer that <literal>InnoDB</literal>
+ uses to write to the log files on disk. Sensible values range
+ from 1MB to 8MB. The default is 1MB. A large log buffer allows
+ large transactions to run without a need to write the log to
+ disk before the transactions commit. Thus, if you have big
+ transactions, making the log buffer larger saves disk I/O.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para id="option_mysqld_innodb_log_file_size">
+ <literal>innodb_log_file_size</literal>
+ </para>
+
+ <para>
+ The size in bytes of each log file in a log group. The
+ combined size of log files must be less than 4GB on 32-bit
+ computers. The default is 5MB. Sensible values range from 1MB
+ to 1/<replaceable>N</replaceable>-th of the size of the buffer
+ pool, where <replaceable>N</replaceable> is the number of log
+ files in the group. The larger the value, the less checkpoint
+ flush activity is needed in the buffer pool, saving disk I/O.
+ But larger log files also mean that recovery is slower in case
+ of a crash.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para id="option_mysqld_innodb_log_files_in_group">
+ <literal>innodb_log_files_in_group</literal>
+ </para>
+
+ <para>
+ The number of log files in the log group.
+ <literal>InnoDB</literal> writes to the files in a circular
+ fashion. The default (and recommended) is 2.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para id="option_mysqld_innodb_log_group_home_dir">
+ <literal>innodb_log_group_home_dir</literal>
+ </para>
+
+ <para>
+ The directory path to the <literal>InnoDB</literal> log files.
+ If you do not specify any <literal>InnoDB</literal> log
+ variables, the default is to create two 5MB files names
+ <filename>ib_logfile0</filename> and
+ <filename>ib_logfile1</filename> in the MySQL data directory.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para id="option_mysqld_innodb_max_dirty_pages_pct">
+ <literal>innodb_max_dirty_pages_pct</literal>
+ </para>
+
+ <para>
+ This is an integer in the range from 0 to 100. The default is
+ 90. The main thread in <literal>InnoDB</literal> tries to
+ write pages from the buffer pool so that the percentage of
+ dirty (not yet written) pages will not exceed this value.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para id="option_mysqld_innodb_max_purge_lag">
+ <literal>innodb_max_purge_lag</literal>
+ </para>
+
+ <para>
+ This variable controls how to delay <literal>INSERT</literal>,
+ <literal>UPDATE</literal> and <literal>DELETE</literal>
+ operations when the purge operations are lagging (see
+ <xref linkend="innodb-multi-versioning"/>). The default value
+ of this variable is 0, meaning that there are no delays.
+ </para>
+
+ <para>
+ The <literal>InnoDB</literal> transaction system maintains a
+ list of transactions that have delete-marked index records by
+ <literal>UPDATE</literal> or <literal>DELETE</literal>
+ operations. Let the length of this list be
+ <replaceable>purge_lag</replaceable>. When
+ <replaceable>purge_lag</replaceable> exceeds
+ <literal>innodb_max_purge_lag</literal>, each
+ <literal>INSERT</literal>, <literal>UPDATE</literal> and
+ <literal>DELETE</literal> operation is delayed by
+ ((<replaceable>purge_lag</replaceable>/<literal>innodb_max_purge_lag</literal>)×10)−5
+ milliseconds. The delay is computed in the beginning of a
+ purge batch, every ten seconds. The operations are not delayed
+ if purge cannot run because of an old consistent read view
+ that could see the rows to be purged.
+ </para>
+
+ <para>
+ A typical setting for a problematic workload might be 1
+ million, assuming that our transactions are small, only 100
+ bytes in size, and we can allow 100MB of unpurged rows in our
+ tables.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para id="option_mysqld_innodb_mirrored_log_groups">
+ <literal>innodb_mirrored_log_groups</literal>
+ </para>
+
+ <para>
+ The number of identical copies of log groups to keep for the
+ database. Currently, this should be set to 1.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para id="option_mysqld_innodb_open_files">
+ <literal>innodb_open_files</literal>
+ </para>
+
+ <para>
+ This variable is relevant only if you use multiple tablespaces
+ in <literal>InnoDB</literal>. It specifies the maximum number
+ of <filename>.ibd</filename> files that
+ <literal>InnoDB</literal> can keep open at one time. The
+ minimum value is 10. The default is 300.
+ </para>
+
+ <para>
+ The file descriptors used for <filename>.ibd</filename> files
+ are for <literal>InnoDB</literal> only. They are independent
+ of those specified by the <option>--open-files-limit</option>
+ server option, and do not affect the operation of the table
+ cache.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para id="option_mysqld_innodb_rollback_on_timeout">
+ <literal>innodb_rollback_on_timeout</literal>
+ </para>
+
+ <para>
+ In MySQL ¤t-series;, <literal>InnoDB</literal> rolls
+ back only the last statement on a transaction timeout. If this
+ option is given, a transaction timeout causes
+ <literal>InnoDB</literal> to abort and roll back the entire
+ transaction (the same behavior as in MySQL 4.1).
+ </para>
+ </listitem>
+
+ <listitem>
+ <para id="option_mysqld_innodb_stats_on_metadata">
+ <literal>innodb_stats_on_metadata</literal>
+ </para>
+
+ <para>
+ When this variable is enabled (which is the default, as before
+ the variable was created), <literal>InnoDB</literal> updates
+ statistics during metadata statements such as <literal>SHOW
+ TABLE STATUS</literal> or <literal>SHOW INDEX</literal>, or
+ when accessing the <literal>INFORMATION_SCHEMA</literal>
+ tables <literal>TABLES</literal> or
+ <literal>STATISTICS</literal>. (These updates are similar to
+ what happens for <literal>ANALYZE TABLE</literal>.) When
+ disabled, <literal>InnoDB</literal> does not updates
+ statistics during these operations. Disabling this variable
+ can improve access speed for schemas that have a large number
+ of tables or indexes. It can also improve the stability of
+ execution plans for queries that involve
+ <literal>InnoDB</literal> tables.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para id="option_mysqld_innodb_support_xa">
+ <literal>innodb_support_xa</literal>
+ </para>
+
+ <para>
+ When set to <literal>ON</literal> or 1 (the default), this
+ variable enables <literal>InnoDB</literal> support for
+ two-phase commit in XA transactions. Enabling
+ <literal>innodb_support_xa</literal> causes an extra disk
+ flush for transaction preparation.
+ </para>
+
+ <para>
+ If you don't care about using XA, you can disable this
+ variable by setting it to <literal>OFF</literal> or 0 to
+ reduce the number of disk flushes and get better
+ <literal>InnoDB</literal> performance. If you are using
+ replication and/or the binary log then you should set this
+ variable to <literal>ON</literal> or 1 to ensure that the
+ binary log does not get out of sync compared to the table
+ data.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para id="option_mysqld_innodb_sync_spin_loops">
+ <literal>innodb_sync_spin_loops</literal>
+ </para>
+
+ <para>
+ The number of times a thread waits for an
+ <literal>InnoDB</literal> mutex to be freed before the thread
+ is suspended.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para id="option_mysqld_innodb_table_locks">
+ <literal>innodb_table_locks</literal>
+ </para>
+
+ <para>
+ If <literal>AUTOCOMMIT=0</literal>, <literal>InnoDB</literal>
+ honors <literal>LOCK TABLES</literal>; MySQL does not return
+ from <literal>LOCK TABLE .. WRITE</literal> until all other
+ threads have released all their locks to the table. The
+ default value of <literal>innodb_table_locks</literal> is 1,
+ which means that <literal>LOCK TABLES</literal> causes InnoDB
+ to lock a table internally if <literal>AUTOCOMMIT=0</literal>.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para id="option_mysqld_innodb_thread_concurrency">
+ <literal>innodb_thread_concurrency</literal>
+ </para>
+
+ <para>
+ <literal>InnoDB</literal> tries to keep the number of
+ operating system threads concurrently inside
+ <literal>InnoDB</literal> less than or equal to the limit
+ given by this variable. Once the number of threads reaches
+ this limit, additional threads are placed into a wait state
+ within a FIFO queue for execution. Threads waiting for locks
+ are not counted in the number of concurrently executing
+ threads.
+ </para>
+
+ <para>
+ The correct value for this variable is dependent on
+ environment and workload. You will need to try a range of
+ different values to determine what value works for your
+ application.
+ </para>
+
+ <para>
+ The range of this variable is 0 to 1000. You can disable
+ thread concurrency checking by setting the value to 0, which
+ allows InnoDB to create as many threads as it needs.
+ </para>
+
+ <para>
+ The default value is 8.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para id="option_mysqld_innodb_thread_sleep_delay">
+ <literal>innodb_thread_sleep_delay</literal>
+ </para>
+
+ <para>
+ How long <literal>InnoDB</literal> threads sleep before
+ joining the <literal>InnoDB</literal> queue, in microseconds.
+ The default value is 10,000. A value of 0 disables sleep.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <literal>sync_binlog</literal>
+ </para>
+
+ <para>
+ If the value of this variable is positive, the MySQL server
+ synchronizes its binary log to disk
+ (<literal>fdatasync()</literal>) after every
+ <literal>sync_binlog</literal> writes to this binary log. Note
+ that there is one write to the binary log per statement if in
+ autocommit mode, and otherwise one write per transaction. The
+ default value is 0 which does no synchronizing to disk. A
+ value of 1 is the safest choice, because in the event of a
+ crash you lose at most one statement/transaction from the
+ binary log; however, it is also the slowest choice (unless the
+ disk has a battery-backed cache, which makes synchronization
+ very fast).
+ </para>
+ </listitem>
+
+ </itemizedlist>
+
+ </section>
+
+ <section id="innodb-init">
+
+ <title>Creating the <literal>InnoDB</literal> Tablespace</title>
+
+ <para>
+ Suppose that you have installed MySQL and have edited your option
+ file so that it contains the necessary <literal>InnoDB</literal>
+ configuration parameters. Before starting MySQL, you should verify
+ that the directories you have specified for
+ <literal>InnoDB</literal> data files and log files exist and that
+ the MySQL server has access rights to those directories.
+ <literal>InnoDB</literal> does not create directories, only files.
+ Check also that you have enough disk space for the data and log
+ files.
+ </para>
+
+ <para>
+ It is best to run the MySQL server <command>mysqld</command> from
+ the command prompt when you first start the server with
+ <literal>InnoDB</literal> enabled, not from the
+ <command>mysqld_safe</command> wrapper or as a Windows service.
+ When you run from a command prompt you see what
+ <command>mysqld</command> prints and what is happening. On Unix,
+ just invoke <command>mysqld</command>. On Windows, use the
+ <option>--console</option> option.
+ </para>
+
+ <para>
+ When you start the MySQL server after initially configuring
+ <literal>InnoDB</literal> in your option file,
+ <literal>InnoDB</literal> creates your data files and log files,
+ and prints something like this:
+ </para>
+
+<programlisting>
+InnoDB: The first specified datafile /home/heikki/data/ibdata1
+did not exist:
+InnoDB: a new database to be created!
+InnoDB: Setting file /home/heikki/data/ibdata1 size to 134217728
+InnoDB: Database physically writes the file full: wait...
+InnoDB: datafile /home/heikki/data/ibdata2 did not exist:
+new to be created
+InnoDB: Setting file /home/heikki/data/ibdata2 size to 262144000
+InnoDB: Database physically writes the file full: wait...
+InnoDB: Log file /home/heikki/data/logs/ib_logfile0 did not exist:
+new to be created
+InnoDB: Setting log file /home/heikki/data/logs/ib_logfile0 size
+to 5242880
+InnoDB: Log file /home/heikki/data/logs/ib_logfile1 did not exist:
+new to be created
+InnoDB: Setting log file /home/heikki/data/logs/ib_logfile1 size
+to 5242880
+InnoDB: Doublewrite buffer not found: creating new
+InnoDB: Doublewrite buffer created
+InnoDB: Creating foreign key constraint system tables
+InnoDB: Foreign key constraint system tables created
+InnoDB: Started
+mysqld: ready for connections
+</programlisting>
+
+ <para>
+ At this point <literal>InnoDB</literal> has initialized its
+ tablespace and log files. You can connect to the MySQL server with
+ the usual MySQL client programs like <command>mysql</command>.
+ When you shut down the MySQL server with <command>mysqladmin
+ shutdown</command>, the output is like this:
+ </para>
+
+<programlisting>
+010321 18:33:34 mysqld: Normal shutdown
+010321 18:33:34 mysqld: Shutdown Complete
+InnoDB: Starting shutdown...
+InnoDB: Shutdown completed
+</programlisting>
+
+ <para>
+ You can look at the data file and log directories and you see the
+ files created there. When MySQL is started again, the data files
+ and log files have been created already, so the output is much
+ briefer:
+ </para>
+
+<programlisting>
+InnoDB: Started
+mysqld: ready for connections
+</programlisting>
+
+ <para>
+ If you add the <literal>innodb_file_per_table</literal> option to
+ <filename>my.cnf</filename>, <literal>InnoDB</literal> stores each
+ table in its own <filename>.ibd</filename> file in the same MySQL
+ database directory where the <filename>.frm</filename> file is
+ created. See <xref linkend="multiple-tablespaces"/>.
+ </para>
+
+ <section id="error-creating-innodb">
+
+ <title>Dealing with <literal>InnoDB</literal> Initialization Problems</title>
+
+ <para>
+ If <literal>InnoDB</literal> prints an operating system error
+ during a file operation, usually the problem has one of the
+ following causes:
+ </para>
+
+ <itemizedlist>
+
+ <listitem>
+ <para>
+ You did not create the <literal>InnoDB</literal> data file
+ directory or the <literal>InnoDB</literal> log directory.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <command>mysqld</command> does not have access rights to
+ create files in those directories.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <command>mysqld</command> cannot read the proper
+ <filename>my.cnf</filename> or <filename>my.ini</filename>
+ option file, and consequently does not see the options that
+ you specified.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ The disk is full or a disk quota is exceeded.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ You have created a subdirectory whose name is equal to a
+ data file that you specified, so the name cannot be used as
+ a filename.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ There is a syntax error in the
+ <literal>innodb_data_home_dir</literal> or
+ <literal>innodb_data_file_path</literal> value.
+ </para>
+ </listitem>
+
+ </itemizedlist>
+
+ <para>
+ If something goes wrong when <literal>InnoDB</literal> attempts
+ to initialize its tablespace or its log files, you should delete
+ all files created by <literal>InnoDB</literal>. This means all
+ <filename>ibdata</filename> files and all
+ <filename>ib_logfile</filename> files. In case you have already
+ created some <literal>InnoDB</literal> tables, delete the
+ corresponding <filename>.frm</filename> files for these tables
+ (and any <filename>.ibd</filename> files if you are using
+ multiple tablespaces) from the MySQL database directories as
+ well. Then you can try the <literal>InnoDB</literal> database
+ creation again. It is best to start the MySQL server from a
+ command prompt so that you see what is happening.
+ </para>
+
+ </section>
+
+ </section>
+
+ <section id="using-innodb-tables">
+
+ <title>Creating and Using <literal>InnoDB</literal> Tables</title>
+
+ <para>
+ To create an <literal>InnoDB</literal> table, specify an
+ <literal>ENGINE = InnoDB</literal> option in the <literal>CREATE
+ TABLE</literal> statement:
+ </para>
+
+<programlisting>
+CREATE TABLE customers (a INT, b CHAR (20), INDEX (a)) ENGINE=InnoDB;
+</programlisting>
+
+ <para>
+ The statement creates a table and an index on column
+ <literal>a</literal> in the <literal>InnoDB</literal> tablespace
+ that consists of the data files that you specified in
+ <filename>my.cnf</filename>. In addition, MySQL creates a file
+ <filename>customers.frm</filename> in the
+ <filename>test</filename> directory under the MySQL database
+ directory. Internally, <literal>InnoDB</literal> adds an entry for
+ the table to its own data dictionary. The entry includes the
+ database name. For example, if <literal>test</literal> is the
+ database in which the <literal>customers</literal> table is
+ created, the entry is for <literal>'test/customers'</literal>.
+ This means you can create a table of the same name
+ <literal>customers</literal> in some other database, and the table
+ names do not collide inside <literal>InnoDB</literal>.
+ </para>
+
+ <para>
+ You can query the amount of free space in the
+ <literal>InnoDB</literal> tablespace by issuing a <literal>SHOW
+ TABLE STATUS</literal> statement for any <literal>InnoDB</literal>
+ table. The amount of free space in the tablespace appears in the
+ <literal>Comment</literal> section in the output of <literal>SHOW
+ TABLE STATUS</literal>. For example:
+ </para>
+
+<programlisting>
+SHOW TABLE STATUS FROM test LIKE 'customers'
+</programlisting>
+
+ <para>
+ Note that the statistics <literal>SHOW</literal> displays for
+ <literal>InnoDB</literal> tables are only approximate. They are
+ used in SQL optimization. Table and index reserved sizes in bytes
+ are accurate, though.
+ </para>
+
+ <section id="innodb-transactions-with-different-apis">
+
+ <title>How to Use Transactions in <literal>InnoDB</literal> with Different APIs</title>
+
+ <para>
+ By default, each client that connects to the MySQL server begins
+ with autocommit mode enabled, which automatically commits every
+ SQL statement as you execute it. To use multiple-statement
+ transactions, you can switch autocommit off with the SQL
+ statement <literal>SET AUTOCOMMIT = 0</literal> and use
+ <literal>COMMIT</literal> and <literal>ROLLBACK</literal> to
+ commit or roll back your transaction. If you want to leave
+ autocommit on, you can enclose your transactions within
+ <literal>START TRANSACTION</literal> and either
+ <literal>COMMIT</literal> or <literal>ROLLBACK</literal>. The
+ following example shows two transactions. The first is
+ committed; the second is rolled back.
+ </para>
+
+<programlisting>
+shell> <userinput>mysql test</userinput>
+
+mysql> <userinput>CREATE TABLE CUSTOMER (A INT, B CHAR (20), INDEX (A))</userinput>
+ -> <userinput>ENGINE=InnoDB;</userinput>
+Query OK, 0 rows affected (0.00 sec)
+mysql> <userinput>START TRANSACTION;</userinput>
+Query OK, 0 rows affected (0.00 sec)
+mysql> <userinput>INSERT INTO CUSTOMER VALUES (10, 'Heikki');</userinput>
+Query OK, 1 row affected (0.00 sec)
+mysql> <userinput>COMMIT;</userinput>
+Query OK, 0 rows affected (0.00 sec)
+mysql> <userinput>SET AUTOCOMMIT=0;</userinput>
+Query OK, 0 rows affected (0.00 sec)
+mysql> <userinput>INSERT INTO CUSTOMER VALUES (15, 'John');</userinput>
+Query OK, 1 row affected (0.00 sec)
+mysql> <userinput>ROLLBACK;</userinput>
+Query OK, 0 rows affected (0.00 sec)
+mysql> <userinput>SELECT * FROM CUSTOMER;</userinput>
++------+--------+
+| A | B |
++------+--------+
+| 10 | Heikki |
++------+--------+
+1 row in set (0.00 sec)
+mysql>
+</programlisting>
+
+ <para>
+ In APIs such as PHP, Perl DBI, JDBC, ODBC, or the standard C
+ call interface of MySQL, you can send transaction control
+ statements such as <literal>COMMIT</literal> to the MySQL server
+ as strings just like any other SQL statements such as
+ <literal>SELECT</literal> or <literal>INSERT</literal>. Some
+ APIs also offer separate special transaction commit and rollback
+ functions or methods.
+ </para>
+
+ </section>
+
+ <section id="converting-tables-to-innodb">
+
+ <title>Converting <literal>MyISAM</literal> Tables to <literal>InnoDB</literal></title>
+
+ <para>
+ Important: Do not convert MySQL system tables in the
+ <literal>mysql</literal> database (such as
+ <literal>user</literal> or <literal>host</literal>) to the
+ <literal>InnoDB</literal> type. This is an unsupported
+ operation. The system tables must always be of the
+ <literal>MyISAM</literal> type.
+ </para>
+
+ <para>
+ If you want all your (non-system) tables to be created as
+ <literal>InnoDB</literal> tables, you can simply add the line
+ <literal>default-storage-engine=innodb</literal> to the
+ <literal>[mysqld]</literal> section of your server option file.
+ </para>
+
+ <para>
+ <literal>InnoDB</literal> does not have a special optimization
+ for separate index creation the way the
+ <literal>MyISAM</literal> storage engine does. Therefore, it
+ does not pay to export and import the table and create indexes
+ afterward. The fastest way to alter a table to
+ <literal>InnoDB</literal> is to do the inserts directly to an
+ <literal>InnoDB</literal> table. That is, use <literal>ALTER
+ TABLE ... ENGINE=INNODB</literal>, or create an empty
+ <literal>InnoDB</literal> table with identical definitions and
+ insert the rows with <literal>INSERT INTO ... SELECT * FROM
+ ...</literal>.
+ </para>
+
+ <para>
+ If you have <literal>UNIQUE</literal> constraints on secondary
+ keys, you can speed up a table import by turning off the
+ uniqueness checks temporarily during the import operation:
+ </para>
+
+<programlisting>
+SET UNIQUE_CHECKS=0;
+<replaceable>... import operation ...</replaceable>
+SET UNIQUE_CHECKS=1;
+</programlisting>
+
+ <para>
+ For big tables, this saves a lot of disk I/O because
+ <literal>InnoDB</literal> can then use its insert buffer to
+ write secondary index records as a batch. Be certain that the
+ data contains no duplicate keys.
+ <literal>UNIQUE_CHECKS</literal> allows but does not require
+ storage engines to ignore duplicate keys.
+ </para>
+
+ <para>
+ To get better control over the insertion process, it might be
+ good to insert big tables in pieces:
+ </para>
+
+<programlisting>
+INSERT INTO newtable SELECT * FROM oldtable
+ WHERE yourkey > something AND yourkey <= somethingelse;
+</programlisting>
+
+ <para>
+ After all records have been inserted, you can rename the tables.
+ </para>
+
+ <para>
+ During the conversion of big tables, you should increase the
+ size of the <literal>InnoDB</literal> buffer pool to reduce disk
+ I/O. Do not use more than 80% of the physical memory, though.
+ You can also increase the sizes of the <literal>InnoDB</literal>
+ log files.
+ </para>
+
+ <para>
+ Make sure that you do not fill up the tablespace:
+ <literal>InnoDB</literal> tables require a lot more disk space
+ than <literal>MyISAM</literal> tables. If an <literal>ALTER
+ TABLE</literal> operation runs out of space, it starts a
+ rollback, and that can take hours if it is disk-bound. For
+ inserts, <literal>InnoDB</literal> uses the insert buffer to
+ merge secondary index records to indexes in batches. That saves
+ a lot of disk I/O. For rollback, no such mechanism is used, and
+ the rollback can take 30 times longer than the insertion.
+ </para>
+
+ <para>
+ In the case of a runaway rollback, if you do not have valuable
+ data in your database, it may be advisable to kill the database
+ process rather than wait for millions of disk I/O operations to
+ complete. For the complete procedure, see
+ <xref linkend="forcing-recovery"/>.
+ </para>
+
+ </section>
+
+ <section id="innodb-auto-increment-handling">
+
+ <title>How <literal>AUTO_INCREMENT</literal> Handling Works in
+ <literal>InnoDB</literal></title>
+
+ <para>
+ <literal>InnoDB</literal> provides a locking strategy that
+ significantly improves scalability and performance of SQL
+ statements that add rows to tables with
+ <literal>AUTO_INCREMENT</literal> columns. This section provides
+ background information on the original
+ (<quote>traditional</quote>) implementation of auto-increment
+ locking in <literal>InnoDB</literal>, explains the configurable
+ locking mechanism, documents the parameter for configuring the
+ mechanism, and describes its behavior and interaction with
+ replication.
+ </para>
+
+ <section id="innodb-auto-increment-traditional">
+
+ <title><quote>Traditional</quote> <literal>InnoDB</literal> Auto-Increment
+ Locking</title>
+
+ <para>
+ The original implementation of auto-increment handling in
+ <literal>InnoDB</literal> uses the following strategy to
+ prevent problems when using the binary log for statement-based
+ replication or for certain recovery scenarios.
+ </para>
+
+ <para>
+ If you specify an <literal>AUTO_INCREMENT</literal> column for
+ an <literal>InnoDB</literal> table, the table handle in the
+ <literal>InnoDB</literal> data dictionary contains a special
+ counter called the auto-increment counter that is used in
+ assigning new values for the column. This counter is stored
+ only in main memory, not on disk.
+ </para>
+
+ <para>
+ <literal>InnoDB</literal> uses the following algorithm to
+ initialize the auto-increment counter for a table
+ <literal>t</literal> that contains an
+ <literal>AUTO_INCREMENT</literal> column named
+ <literal>ai_col</literal>: After a server startup, for the
+ first insert into a table <literal>t</literal>,
+ <literal>InnoDB</literal> executes the equivalent of this
+ statement:
+ </para>
+
+<programlisting>
+SELECT MAX(ai_col) FROM t FOR UPDATE;
+</programlisting>
+
+ <para>
+ <literal>InnoDB</literal> increments by one the value
+ retrieved by the statement and assigns it to the column and to
+ the auto-increment counter for the table. If the table is
+ empty, <literal>InnoDB</literal> uses the value
+ <literal>1</literal>. If a user invokes a <literal>SHOW TABLE
+ STATUS</literal> statement that displays output for the table
+ <literal>t</literal> and the auto-increment counter has not
+ been initialized, <literal>InnoDB</literal> initializes but
+ does not increment the value and stores it for use by later
+ inserts. This initialization uses a normal exclusive-locking
+ read on the table and the lock lasts to the end of the
+ transaction.
+ </para>
+
+ <para>
+ <literal>InnoDB</literal> follows the same procedure for
+ initializing the auto-increment counter for a freshly created
+ table.
+ </para>
+
+ <para>
+ After the auto-increment counter has been initialized, if a
+ user does not explicitly specify a value for an
+ <literal>AUTO_INCREMENT</literal> column,
+ <literal>InnoDB</literal> increments the counter by one and
+ assigns the new value to the column. If the user inserts a row
+ that explicitly specifies the column value, and the value is
+ bigger than the current counter value, the counter is set to
+ the specified column value.
+ </para>
+
+ <para>
+ When accessing the auto-increment counter,
+ <literal>InnoDB</literal> uses a special table-level
+ <literal>AUTO-INC</literal> lock that it keeps to the end of
+ the current SQL statement, not to the end of the transaction.
+ The special lock release strategy was introduced to improve
+ concurrency for inserts into a table containing an
+ <literal>AUTO_INCREMENT</literal> column. Nevertheless, two
+ transactions cannot have the <literal>AUTO-INC</literal> lock
+ on the same table simultaneously, which can have a performance
+ impact if the <literal>AUTO-INC</literal> lock is held for a
+ long time. That might be the case for a statement such as
+ <literal>INSERT INTO t1 ... SELECT ... FROM t2</literal> that
+ inserts all rows from one table into another.
+ </para>
+
+ <para>
+ <literal>InnoDB</literal> uses the in-memory auto-increment
+ counter as long as the server runs. When the server is stopped
+ and restarted, <literal>InnoDB</literal> reinitializes the
+ counter for each table for the first <literal>INSERT</literal>
+ to the table, as described earlier.
+ </para>
+
+ <para>
+ You may see gaps in the sequence of values assigned to the
+ <literal>AUTO_INCREMENT</literal> column if you roll back
+ transactions that have generated numbers using the counter.
+ </para>
+
+ <para>
+ If a user specifies <literal>NULL</literal> or
+ <literal>0</literal> for the <literal>AUTO_INCREMENT</literal>
+ column in an <literal>INSERT</literal>,
+ <literal>InnoDB</literal> treats the row as if the value had
+ not been specified and generates a new value for it.
+ </para>
+
+ <para>
+ The behavior of the auto-increment mechanism is not defined if
+ a user assigns a negative value to the column or if the value
+ becomes bigger than the maximum integer that can be stored in
+ the specified integer type.
+ </para>
+
+ <para>
+ An <literal>AUTO_INCREMENT</literal> column must be the first
+ column listed if it is part of a multiple-column index in an
+ <literal>InnoDB</literal> table.
+ </para>
+
+ <para>
+ <literal>InnoDB</literal> supports the <literal>AUTO_INCREMENT
+ = <replaceable>N</replaceable></literal> table option in
+ <literal>CREATE TABLE</literal> and <literal>ALTER
+ TABLE</literal> statements, to set the initial counter value
+ or alter the current counter value. The effect of this option
+ is canceled by a server restart, for reasons discussed earlier
+ in this section.
+ </para>
+
+ </section>
+
+ <section id="innodb-auto-increment-configurable">
+
+ <title>Configurable <literal>InnoDB</literal> Auto-Increment Locking</title>
+
+ <para>
+ As described in the previous section,
+ <literal>InnoDB</literal> uses a special lock called the
+ AUTO-INC table-level lock for inserts into tables with
+ <literal>AUTO_INCREMENT</literal> columns. This lock is
+ normally held to the end of the statement (not to the end of
+ the transaction), to ensure that auto-increment numbers are
+ assigned in a predictable and repeatable order for a given
+ sequence of <literal>INSERT</literal> statements.
+ </para>
+
+ <para>
+ In the case of statement-based replication, this means that
+ when a SQL statement is replicated on a slave server, the same
+ values are used for the auto-increment column as on the master
+ server. The result of execution of multiple
+ <literal>INSERT</literal> statements is deterministic, and the
+ slave reproduces the same data as on the master. If
+ auto-increment values generated by multiple
+ <literal>INSERT</literal> statements were interleaved, the
+ result of two concurrent <literal>INSERT</literal> statements
+ would be non-deterministic, and could not reliably be
+ propagated to a slave server using statement-based
+ replication.
+ </para>
+
+ <para>
+ To make this clear, consider an example that uses this table:
+ </para>
+
+<programlisting>
+CREATE TABLE t1 (
+ c1 INT(11) NOT NULL AUTO_INCREMENT,
+ c2 VARCHAR(10) DEFAULT NULL,
+ PRIMARY KEY (c1)
+) ENGINE=InnoDB;
+</programlisting>
+
+ <para>
+ Suppose that there are two transactions running, each
+ inserting rows into a table with an
+ <literal>AUTO_INCREMENT</literal> column. One transaction is
+ using an <literal>INSERT ... SELECT</literal> statement that
+ inserts 1000 rows, and another is using a simple
+ <literal>INSERT</literal> statement that inserts one row:
+ </para>
+
+<programlisting>
+Tx1: INSERT INTO t1 (c2) SELECT 1000 rows from another table ...
+Tx2: INSERT INTO t1 (c2) VALUES ('xxx');
+</programlisting>
+
+ <para>
+ <literal>InnoDB</literal> cannot tell in advance how many rows
+ will be retrieved from the <literal>SELECT</literal> in the
+ <literal>INSERT</literal> statement in Tx1, and it assigns the
+ auto-increment values one at a time as the statement proceeds.
+ With a table-level lock, held to the end of the statement,
+ only one <literal>INSERT</literal> statement referring to
+ table <literal>t1</literal> can execute at a time, and the
+ generation of auto-increment numbers by different statements
+ is not interleaved. The auto-increment value generated by the
+ Tx1 <literal>INSERT ... SELECT</literal> statement will be
+ consecutive, and the (single) auto-increment value used by the
+ <literal>INSERT</literal> statement in Tx2 will either be
+ smaller or larger than all those used for Tx1, depending on
+ which statement executes first.
+ </para>
+
+ <para>
+ As long as the SQL statements execute in the same order when
+ replayed from the binary log (when using statement-based
+ replication, or in recovery scenarios), the results will be
+ the same as they were when Tx1 and Tx2 first ran. Thus,
+ table-level locks held until the end of a statement make
+ <literal>INSERT</literal> statements using auto-increment safe
+ for use with statement-based replication. However, those locks
+ limit concurrency and scalability when multiple transactions
+ are executing insert statements at the same time.
+ </para>
+
+ <para>
+ In the preceding example, if there were no table-level lock,
+ the value of the auto-increment column used for the
+ <literal>INSERT</literal> in Tx2 depends on precisely when the
+ statement executes. If the <literal>INSERT</literal> of Tx2
+ executes while the <literal>INSERT</literal> of Tx1 is running
+ (rather than before it starts or after it completes), the
+ specific auto-increment values assigned by the two
+ <literal>INSERT</literal> statements are non-deterministic,
+ and may vary from run to run.
+ </para>
+
+ <para>
+ In MySQL ¤t-series;, <literal>InnoDB</literal> can avoid
+ using the table-level AUTO-INC lock for a class of
+ <literal>INSERT</literal> statements where the number of rows
+ is known in advance, and still preserve deterministic
+ execution and safety for statement-based replication. Further,
+ if you are not using the binary log to replay SQL statements
+ as part of recovery or replication, you can entirely eliminate
+ use of the AUTO-INC table-level lock for even greater
+ concurrency and performance—at the cost of permitting
+ gaps in auto-increment numbers assigned by a statement and
+ potentially having the numbers assigned by concurrently
+ executing statements interleaved.
+ </para>
+
+ <para>
+ For <literal>INSERT</literal> statements where the number of
+ rows to be inserted is known at the beginning of processing
+ the statement, <literal>InnoDB</literal> quickly allocates the
+ required number of auto-increment values without taking any
+ lock, but only if there is no concurrent session already
+ holding the table-level lock AUTO-INC lock (because that other
+ statement will be allocating auto-increment values one-by-one
+ as it proceeds). More precisely, such an
+ <literal>INSERT</literal> statement obtains auto-increment
+ values under the control of a mutex (a light-weight lock) that
+ is <emphasis>not</emphasis> held until the statement
+ completes, but only for the duration of the allocation
+ process.
+ </para>
+
+ <para>
+ This new locking scheme allows much greater scalability, but
+ it does introduce some subtle differences in how
+ auto-increment values are assigned compared to the original
+ mechanism. To describe the way auto-increment works in
+ <literal>InnoDB</literal>, the following discussion defines
+ some terms, and explains how <literal>InnoDB</literal> behaves
+ using different settings of the new
+ <literal>innodb_autoinc_lock_mode</literal> configuration
+ parameter. Additional considerations are described following
+ the explanation of auto-increment locking behavior.
+ </para>
+
+ <para>
+ First, some definitions:
+ </para>
+
+ <itemizedlist>
+
+ <listitem>
+ <para>
+ <quote><literal>INSERT</literal>-like</quote> statements
+ </para>
+
+ <para>
+ All statements that generate new rows in a table,
+ including <literal>INSERT</literal>, <literal>INSERT ...
+ SELECT</literal>, <literal>REPLACE</literal>,
+ <literal>REPLACE ... SELECT</literal>, and <literal>LOAD
+ DATA</literal>.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <quote>Simple inserts</quote>
+ </para>
+
+ <para>
+ Statements for which the number of rows to be inserted can
+ be determined in advance (when the statement is initially
+ processed). This includes single-row and multiple-row
+ <literal>INSERT</literal>, <literal>INSERT ... ON
+ DUPLICATE KEY UPDATE</literal>, and
+ <literal>REPLACE</literal> statements that do not have a
+ nested subquery.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <quote>Bulk inserts</quote>
+ </para>
+
+ <para>
+ Statements for which the number of rows to be inserted
+ (and the number of required auto-increment values) is not
+ known in advance. This includes <literal>INSERT ...
+ SELECT</literal>, <literal>REPLACE ... SELECT</literal>,
+ and <literal>LOAD DATA</literal> statements.
+ <literal>InnoDB</literal> will assign new values for the
+ <literal>AUTO_INCREMENT</literal> column one at a time as
+ each row is processed.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <quote>Mixed-mode inserts</quote>
+ </para>
+
+ <para>
+ These are <quote>simple insert</quote> statements that
+ specify the auto-increment value for some (but not all) of
+ the new rows. An example follows, where
+ <literal>c1</literal> is an
+ <literal>AUTO_INCREMENT</literal> column of table
+ <literal>t1</literal>:
+ </para>
+
+<programlisting>
+INSERT INTO t1 (c1,c2) VALUES (1,'a'), (NULL,'b'), (5,'c'), (NULL,'d');
+</programlisting>
+ </listitem>
+
+ </itemizedlist>
+
+ <para>
+ In MySQL ¤t-series;, there is a new configuration
+ parameter that controls how <literal>InnoDB</literal> uses
+ locking when generating values for
+ <literal>AUTO_INCREMENT</literal> columns. This parameter can
+ be set using the <option>--innodb-autoinc-lock-mode</option>
+ option at <command>mysqld</command> startup, or at runtime by
+ setting the global <literal>innodb_autoinc_lock_mode</literal>
+ system variable. Although this variable can be changed at
+ runtime, the behavior is undefined if you do so while
+ transactions that generate auto-increment values are
+ executing.
+ </para>
+
+ <para>
+ In general, if you encounter problems with the way
+ auto-increment works (which will most likely involve
+ replication), you can force use of the original behavior by
+ setting the lock mode to 0.
+ </para>
+
+ <para>
+ There are three possible settings for the
+ <literal>innodb_autoinc_lock_mode</literal> parameter:
+ </para>
+
+ <itemizedlist>
+
+ <listitem>
+ <para>
+ <literal>innodb_autoinc_lock_mode = 0</literal>
+ (<quote>traditional</quote> lock mode)
+ </para>
+
+ <para>
+ This lock mode provides the same behavior as before
+ <literal>innodb_autoinc_lock_mode</literal> existed. For
+ all <quote><literal>INSERT</literal>-like</quote>
+ statements, a special AUTO-INC table-level lock is
+ obtained and held to the end of the statement. This
+ assures that the auto-increment values assigned by any
+ given statement are consecutive (although
+ <quote>gaps</quote> can exist within a table if a
+ transaction that generated auto-increment values is rolled
+ back, as discussed later).
+ </para>
+
+ <para>
+ This lock mode is provided only for backward compatibility
+ and performance testing. There is little reason to use
+ this lock mode unless you use <quote>mixed-mode
+ inserts</quote> and care about the minor difference in
+ semantics described later.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <literal>innodb_autoinc_lock_mode = 1</literal>
+ (<quote>consecutive</quote> lock mode)
+ </para>
+
+ <para>
+ This is the default lock mode. In this mode, <quote>bulk
+ inserts</quote> use the special table-level AUTIONC
+ table-level lock and holds it until the end of the
+ statement. This applies to all <literal>INSERT ...
+ SELECT</literal>, <literal>REPLACE ... SELECT</literal>,
+ and <literal>LOAD DATA</literal> statements. Only one
+ statement holding the AUTO-INC lock can execute at a time.
+ </para>
+
+ <para>
+ With this lock mode, <quote>simple inserts</quote> (only)
+ use a new locking model where a light-weight mutex is used
+ during the allocation of auto-increment values, and no
+ AUTO-INC table-level lock is used, unless an AUTO-INC lock
+ is held by another transaction. If another transaction
+ does hold an AUTO-INC lock, a <quote>simple insert</quote>
+ waits for the AUTO-INC lock, as if it too were a
+ <quote>bulk insert.</quote>
+ </para>
+
+ <para>
+ This lock mode ensures that, in the presence of
+ <literal>INSERT</literal> statements where the number of
+ rows is not known in advance (and where auto-increment
+ numbers are assigned as the statement progresses), all
+ auto-increment values assigned by any
+ <quote><literal>INSERT</literal>-like</quote> statement
+ are consecutive, and operations are safe for
+ statement-based replication.
+ </para>
+
+ <para>
+ Simply put, the important impact of this lock mode is
+ significantly better scalability. This mode is safe for
+ use with statement-based replication. Further, as with
+ <quote>traditional</quote> lock mode, auto-increment
+ numbers assigned by any given statement are
+ <emphasis>consecutive</emphasis>. In this mode, there is
+ <emphasis>no change</emphasis> in semantics compared to
+ <quote>traditional</quote> mode for any statement that
+ uses auto-increment, with one minor exception.
+ </para>
+
+ <para>
+ The exception is for <quote>mixed-mode inserts</quote>,
+ where the user provides explicit values for an
+ <literal>AUTO_INCREMENT</literal> column for some, but not
+ all, rows in a multiple-row <quote>simple insert.</quote>
+ For such inserts, <literal>InnoDB</literal> will allocate
+ more auto-increment values than the number of rows to be
+ inserted. However, all values automatically assigned are
+ consecutively generated (and thus higher than) the
+ auto-increment value generated by the most recently
+ executed previous statement. <quote>Excess</quote> numbers
+ are lost.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <literal>innodb_autoinc_lock_mode = 2</literal>
+ (<quote>interleaved</quote> lock mode)
+ </para>
+
+ <para>
+ In this lock mode, no
+ <quote><literal>INSERT</literal>-like</quote> statements
+ use the table-level AUTO-INC lock, and multiple statements
+ can execute at the same time. This is the fastest and most
+ scalable lock mode, but it is <emphasis>not
+ safe</emphasis> when using statement-based replication or
+ recovery scenarios when SQL statements are replayed from
+ the binary log.
+ </para>
+
+ <para>
+ In this lock mode, auto-increment values are guaranteed to
+ be unique and monotonically increasing across all
+ concurrently executing
+ <quote><literal>INSERT</literal>-like</quote> statements.
+ However, because multiple statements can be generating
+ numbers at the same time (that is, allocation of numbers
+ is <emphasis>interleaved</emphasis> across statements),
+ the values generated for the rows inserted by any given
+ statement may not be consecutive.
+ </para>
+
+ <para>
+ If the only statements executing are <quote>simple
+ inserts</quote> where the number of rows to be inserted is
+ known ahead of time, there will be no gaps in the numbers
+ generated for a single statement, except for
+ <quote>mixed-mode inserts.</quote> However, when
+ <quote>bulk inserts</quote> are executed, there may be
+ gaps in the auto-increment values assigned by any given
+ statement.
+ </para>
+ </listitem>
+
+ </itemizedlist>
+
+ <para>
+ The auto-increment locking modes provided by
+ <literal>innodb_autoinc_lock_mode</literal> have several usage
+ implications:
+ </para>
+
+ <itemizedlist>
+
+ <listitem>
+ <para>
+ Using auto-increment with replication
+ </para>
+
+ <para>
+ If you are using statement-based replication, you should
+ set <literal>innodb_autoinc_lock_mode</literal> to 0 or 1
+ and use the same value on the master and its slaves.
+ Auto-increment values are not ensured to be the same on
+ the slaves as on the master if you use
+ <literal>innodb_autoinc_lock_mode</literal> = 2
+ (<quote>interleaved</quote>) or configurations where the
+ master and slaves do not use the same lock mode.
+ </para>
+
+ <para>
+ If you are using row-based replication, all of the
+ auto-increment lock modes are safe. Row-based replication
+ is not sensitive to the order of execution of the SQL
+ statements.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <quote>Lost</quote> auto-increment values and sequence
+ gaps
+ </para>
+
+ <para>
+ In all lock modes (0, 1, and 2), if a transaction that
+ generated auto-increment values rolls back, those
+ auto-increment values are <quote>lost.</quote> Once a
+ value is generated for an auto-increment column, it cannot
+ be rolled back, whether or not the
+ <quote><literal>INSERT</literal>-like</quote> statement is
+ completed, and whether or not the containing transaction
+ is rolled back. Such lost values are not reused. Thus,
+ there may be gaps in the values stored in an
+ <literal>AUTO_INCREMENT</literal> column of a table.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Auto-increment values assigned by <quote>mixed-mode
+ inserts</quote>
+ </para>
+
+ <para>
+ Consider a <quote>mixed-mode insert,</quote> where a
+ <quote>simple insert</quote> specifies the auto-increment
+ value for some (but not all) resulting rows. Such a
+ statement will behave differently in lock modes 0, 1, and
+ 2. For example, assume <literal>c1</literal> is an
+ <literal>AUTO_INCREMENT</literal> column of table
+ <literal>t1</literal>, and that the most recent
+ automatically generated sequence number is 100. Consider
+ the following <quote>mixed-mode insert</quote> statement:
+ </para>
+
+<programlisting>
+INSERT INTO t1 (c1,c2) VALUES (1,'a'), (NULL,'b'), (5,'c'), (NULL,'d');
+</programlisting>
+
+ <para>
+ With <literal>innodb_autoinc_lock_mode</literal> set to 0
+ (<quote>traditional</quote>), the four new rows will be:
+ </para>
+
+<programlisting>
+<!--
+mysql> DROP TABLE IF EXISTS t1;
+mysql> CREATE TABLE t1 (c1 INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY, c2 CHAR(1));
+mysql> INSERT INTO t1 VALUES(1,'a'),(101,'b'),(5,'c'),(102,'d');
+mysql> SELECT c1, c2 FROM t1;
+-->
++-----+------+
+| c1 | c2 |
++-----+------+
+| 1 | a |
+| 101 | b |
+| 5 | c |
+| 102 | d |
++-----+------+
+</programlisting>
+
+ <para>
+ The next available auto-increment value will be 103
+ because the auto-increment values are allocated one at a
+ time, not all at once at the beginning of statement
+ execution. This result is true whether or not there are
+ concurrently executing
+ <quote><literal>INSERT</literal>-like</quote> statements
+ (of any type).
+ </para>
+
+ <para>
+ With <literal>innodb_autoinc_lock_mode</literal> set to 1
+ (<quote>consecutive</quote>), the four new rows will also
+ be:
+ </para>
+
+<programlisting>
++-----+------+
+| c1 | c2 |
++-----+------+
+| 1 | a |
+| 101 | b |
+| 5 | c |
+| 102 | d |
++-----+------+
+</programlisting>
+
+ <para>
+ However, in this case, the next available auto-increment
+ value will be 105, not 103 because four auto-increment
+ values are allocated at the time the statement is
+ processed, but only two are used. This result is true
+ whether or not there are concurrently executing
+ <quote><literal>INSERT</literal>-like</quote> statements
+ (of any type).
+ </para>
+
+ <para>
+ With <literal>innodb_autoinc_lock_mode</literal> set to
+ mode 2 (<quote>interleaved</quote>), the four new rows
+ will be:
+ </para>
+
+<programlisting>
++-----+------+
+| c1 | c2 |
++-----+------+
+| 1 | a |
+| <replaceable>x</replaceable> | b |
+| 5 | c |
+| <replaceable>y</replaceable> | d |
++-----+------+
+</programlisting>
+
+ <para>
+ The values of <replaceable>x</replaceable> and
+ <replaceable>y</replaceable> will be unique and larger
+ than any previously generated rows. However, the specific
+ values of <replaceable>x</replaceable> and
+ <replaceable>y</replaceable> will depend on the number of
+ auto-increment values generated by concurrently executing
+ statements.
+ </para>
+
+ <para>
+ Finally, consider the following statement, issued when the
+ most-recently generated sequence number was the value 4:
+ </para>
+
+<programlisting>
+INSERT INTO t1 (c1,c2) VALUES (1,'a'), (NULL,'b'), (5,'c'), (NULL,'d');
+</programlisting>
+
+ <para>
+ With any <literal>innodb_autoinc_lock_mode</literal>
+ setting, this statement will generate a duplicate-key
+ error 23000 (<literal>Can't write; duplicate key in
+ table</literal>) because 5 will be allocated for the row
+ <literal>(NULL, 'b')</literal> and insertion of the row
+ <literal>(5, 'c')</literal> will fail.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Gaps in auto-increment values for <quote>bulk
+ inserts</quote>
+ </para>
+
+ <para>
+ With <literal>innodb_autoinc_lock_mode</literal> set to 0
+ (<quote>traditional</quote>) or 1
+ (<quote>consecutive</quote>), the auto-increment values
+ generated by any given statement will be consecutive,
+ without gaps, because the table-level AUTO-INC lock is
+ held until the end of the statement, and only one such
+ statement can execute at a time.
+ </para>
+
+ <para>
+ With <literal>innodb_autoinc_lock_mode</literal> set to 2
+ (<quote>interleaved</quote>), there may be gaps in the
+ auto-increment values generated by <quote>bulk
+ inserts,</quote> but only if there are concurrently
+ executing <quote><literal>INSERT</literal>-like</quote>
+ statements.
+ </para>
+ </listitem>
+
+ </itemizedlist>
+
+ </section>
+
+ </section>
+
+ <section id="innodb-foreign-key-constraints">
+
+ <title><literal>FOREIGN KEY</literal> Constraints</title>
+
+ <remark role="help-category" condition="Data Definition"/>
+
+ <remark role="help-topic" condition="CONSTRAINT"/>
+
+ <remark role="help-keywords">
+ FOREIGN KEY REFERENCES NO ACTION ON DELETE CASCADE SET NULL
+ RESTRICT UPDATE
+ </remark>
+
+ <remark role="help-description-begin"/>
+
+ <para>
+ <literal>InnoDB</literal> also supports foreign key constraints.
+ The syntax for a foreign key constraint definition in
+ <literal>InnoDB</literal> looks like this:
+ </para>
+
+<programlisting>
+[CONSTRAINT <replaceable>symbol</replaceable>] FOREIGN KEY [<replaceable>id</replaceable>] (<replaceable>index_col_name</replaceable>, ...)
+ REFERENCES <replaceable>tbl_name</replaceable> (<replaceable>index_col_name</replaceable>, ...)
+ [ON DELETE {RESTRICT | CASCADE | SET NULL | NO ACTION}]
+ [ON UPDATE {RESTRICT | CASCADE | SET NULL | NO ACTION}]
+</programlisting>
+
+ <remark role="help-description-end"/>
+
+ <para>
+ Foreign keys definitions are subject to the following
+ conditions:
+ </para>
+
+ <itemizedlist>
+
+ <listitem>
+ <para>
+ Both tables must be <literal>InnoDB</literal> tables and
+ they must not be <literal>TEMPORARY</literal> tables.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Corresponding columns in the foreign key and the referenced
+ key must have similar internal data types inside
+ <literal>InnoDB</literal> so that they can be compared
+ without a type conversion. <emphasis>The size and sign of
+ integer types must be the same</emphasis>. The length of
+ string types need not be the same. For non-binary
+ (character) string columns, the character set and collation
+ must be the same.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ In the referencing table, there must be an index where the
+ foreign key columns are listed as the
+ <emphasis>first</emphasis> columns in the same order. Such
+ an index is created on the referencing table automatically
+ if it does not exist.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ In the referenced table, there must be an index where the
+ referenced columns are listed as the
+ <emphasis>first</emphasis> columns in the same order.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Index prefixes on foreign key columns are not supported. One
+ consequence of this is that <literal>BLOB</literal> and
+ <literal>TEXT</literal> columns cannot be included in a
+ foreign key, because indexes on those columns must always
+ include a prefix length.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ If the <literal>CONSTRAINT
+ <replaceable>symbol</replaceable></literal> clause is given,
+ the <replaceable>symbol</replaceable> value must be unique
+ in the database. If the clause is not given,
+ <literal>InnoDB</literal> creates the name automatically.
+ </para>
+ </listitem>
+
+ </itemizedlist>
+
+ <para>
+ <literal>InnoDB</literal> rejects any <literal>INSERT</literal>
+ or <literal>UPDATE</literal> operation that attempts to create a
+ foreign key value in a child table if there is no a matching
+ candidate key value in the parent table. The action
+ <literal>InnoDB</literal> takes for any
+ <literal>UPDATE</literal> or <literal>DELETE</literal> operation
+ that attempts to update or delete a candidate key value in the
+ parent table that has some matching rows in the child table is
+ dependent on the <emphasis>referential action</emphasis>
+ specified using <literal>ON UPDATE</literal> and <literal>ON
+ DELETE</literal> subclauses of the <literal>FOREIGN
+ KEY</literal> clause. When the user attempts to delete or update
+ a row from a parent table, and there are one or more matching
+ rows in the child table, <literal>InnoDB</literal> supports five
+ options regarding the action to be taken:
+ </para>
+
+ <itemizedlist>
+
+ <remark role="todo">
+ Rewrite following paragraph to make clearer.
+ </remark>
+
+ <listitem>
+ <para>
+ <literal>CASCADE</literal>: Delete or update the row from
+ the parent table and automatically delete or update the
+ matching rows in the child table. Both <literal>ON DELETE
+ CASCADE</literal> and <literal>ON UPDATE CASCADE</literal>
+ are supported. Between two tables, you should not define
+ several <literal>ON UPDATE CASCADE</literal> clauses that
+ act on the same column in the parent table or in the child
+ table.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <literal>SET NULL</literal>: Delete or update the row from
+ the parent table and set the foreign key column or columns
+ in the child table to <literal>NULL</literal>. This is valid
+ only if the foreign key columns do not have the <literal>NOT
+ NULL</literal> qualifier specified. Both <literal>ON DELETE
+ SET NULL</literal> and <literal>ON UPDATE SET NULL</literal>
+ clauses are supported.
+ </para>
+
+ <para>
+ If you specify a <literal>SET NULL</literal> action,
+ <emphasis>make sure that you have not declared the columns
+ in the child table as <literal>NOT
+ NULL</literal></emphasis>.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <literal>NO ACTION</literal>: In standard SQL, <literal>NO
+ ACTION</literal> means <emphasis>no action</emphasis> in the
+ sense that an attempt to delete or update a primary key
+ value is not allowed to proceed if there is a related
+ foreign key value in the referenced table.
+ <literal>InnoDB</literal> rejects the delete or update
+ operation for the parent table.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <literal>RESTRICT</literal>: Rejects the delete or update
+ operation for the parent table. <literal>NO ACTION</literal>
+ and <literal>RESTRICT</literal> are the same as omitting the
+ <literal>ON DELETE</literal> or <literal>ON UPDATE</literal>
+ clause. (Some database systems have deferred checks, and
+ <literal>NO ACTION</literal> is a deferred check. In MySQL,
+ foreign key constraints are checked immediately, so
+ <literal>NO ACTION</literal> and <literal>RESTRICT</literal>
+ are the same.)
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <literal>SET DEFAULT</literal>: This action is recognized by
+ the parser, but <literal>InnoDB</literal> rejects table
+ definitions containing <literal>ON DELETE SET
+ DEFAULT</literal> or <literal>ON UPDATE SET
+ DEFAULT</literal> clauses.
+ </para>
+ </listitem>
+
+ </itemizedlist>
+
+ <para>
+ Note that <literal>InnoDB</literal> supports foreign key
+ references within a table. In these cases, <quote>child table
+ records</quote> really refers to dependent records within the
+ same table.
+ </para>
+
+ <para>
+ <literal>InnoDB</literal> requires indexes on foreign keys and
+ referenced keys so that foreign key checks can be fast and not
+ require a table scan. The index on the foreign key is created
+ automatically. This is in contrast to some older versions, in
+ which indexes had to be created explicitly or the creation of
+ foreign key constraints would fail.
+ </para>
+
+ <para>
+ If MySQL reports an error number 1005 from a <literal>CREATE
+ TABLE</literal> statement, and the error message refers to errno
+ 150, table creation failed because a foreign key constraint was
+ not correctly formed. Similarly, if an <literal>ALTER
+ TABLE</literal> fails and it refers to errno 150, that means a
+ foreign key definition would be incorrectly formed for the
+ altered table. You can use <literal>SHOW ENGINE INNODB
+ STATUS</literal> to display a detailed explanation of the most
+ recent <literal>InnoDB</literal> foreign key error in the
+ server.
+ </para>
+
+ <note>
+ <para>
+ <literal>InnoDB</literal> does not check foreign key
+ constraints on those foreign key or referenced key values that
+ contain a <literal>NULL</literal> column.
+ </para>
+ </note>
+
+ <note>
+ <para>
+ Currently, triggers are not activated by cascaded foreign key
+ actions.
+ </para>
+ </note>
+
+ <para>
+ You cannot create a table with a column name that matches the
+ name of an internal InnoDB column (including
+ <literal>DB_ROW_ID</literal>, <literal>DB_TRX_ID</literal>,
+ <literal>DB_ROLL_PTR</literal> and
+ <literal>DB_MIX_ID</literal>). The server will report error 1005
+ and refers to <literal>errno</literal> -1 in the error message.
+ </para>
+
+ <para>
+ <emphasis role="bold">Deviation from SQL standards</emphasis>:
+ If there are several rows in the parent table that have the same
+ referenced key value, <literal>InnoDB</literal> acts in foreign
+ key checks as if the other parent rows with the same key value
+ do not exist. For example, if you have defined a
+ <literal>RESTRICT</literal> type constraint, and there is a
+ child row with several parent rows, <literal>InnoDB</literal>
+ does not allow the deletion of any of those parent rows.
+ </para>
+
+ <para>
+ <literal>InnoDB</literal> performs cascading operations through
+ a depth-first algorithm, based on records in the indexes
+ corresponding to the foreign key constraints.
+ </para>
+
+ <para>
+ <emphasis role="bold">Deviation from SQL standards</emphasis>: A
+ <literal>FOREIGN KEY</literal> constraint that references a
+ non-<literal>UNIQUE</literal> key is not standard SQL. It is an
+ <literal>InnoDB</literal> extension to standard SQL.
+ </para>
+
+ <para>
+ <emphasis role="bold">Deviation from SQL standards</emphasis>:
+ If <literal>ON UPDATE CASCADE</literal> or <literal>ON UPDATE
+ SET NULL</literal> recurses to update the <emphasis>same
+ table</emphasis> it has previously updated during the cascade,
+ it acts like <literal>RESTRICT</literal>. This means that you
+ cannot use self-referential <literal>ON UPDATE CASCADE</literal>
+ or <literal>ON UPDATE SET NULL</literal> operations. This is to
+ prevent infinite loops resulting from cascaded updates. A
+ self-referential <literal>ON DELETE SET NULL</literal>, on the
+ other hand, is possible, as is a self-referential <literal>ON
+ DELETE CASCADE</literal>. Cascading operations may not be nested
+ more than 15 levels deep.
+ </para>
+
+ <para>
+ <emphasis role="bold">Deviation from SQL standards</emphasis>:
+ Like MySQL in general, in an SQL statement that inserts,
+ deletes, or updates many rows, <literal>InnoDB</literal> checks
+ <literal>UNIQUE</literal> and <literal>FOREIGN KEY</literal>
+ constraints row-by-row. According to the SQL standard, the
+ default behavior should be deferred checking. That is,
+ constraints are only checked after the <emphasis>entire SQL
+ statement</emphasis> has been processed. Until
+ <literal>InnoDB</literal> implements deferred constraint
+ checking, some things will be impossible, such as deleting a
+ record that refers to itself via a foreign key.
+ </para>
+
+ <para>
+ Here is a simple example that relates <literal>parent</literal>
+ and <literal>child</literal> tables through a single-column
+ foreign key:
+ </para>
+
+<programlisting>
+CREATE TABLE parent (id INT NOT NULL,
+ PRIMARY KEY (id)
+) ENGINE=INNODB;
+CREATE TABLE child (id INT, parent_id INT,
+ INDEX par_ind (parent_id),
+ FOREIGN KEY (parent_id) REFERENCES parent(id)
+ ON DELETE CASCADE
+) ENGINE=INNODB;
+</programlisting>
+
+ <para>
+ A more complex example in which a
+ <literal>product_order</literal> table has foreign keys for two
+ other tables. One foreign key references a two-column index in
+ the <literal>product</literal> table. The other references a
+ single-column index in the <literal>customer</literal> table:
+ </para>
+
+ <remark role="help-example"/>
+
+<programlisting>
+CREATE TABLE product (category INT NOT NULL, id INT NOT NULL,
+ price DECIMAL,
+ PRIMARY KEY(category, id)) ENGINE=INNODB;
+CREATE TABLE customer (id INT NOT NULL,
+ PRIMARY KEY (id)) ENGINE=INNODB;
+CREATE TABLE product_order (no INT NOT NULL AUTO_INCREMENT,
+ product_category INT NOT NULL,
+ product_id INT NOT NULL,
+ customer_id INT NOT NULL,
+ PRIMARY KEY(no),
+ INDEX (product_category, product_id),
+ FOREIGN KEY (product_category, product_id)
+ REFERENCES product(category, id)
+ ON UPDATE CASCADE ON DELETE RESTRICT,
+ INDEX (customer_id),
+ FOREIGN KEY (customer_id)
+ REFERENCES customer(id)) ENGINE=INNODB;
+</programlisting>
+
+ <para>
+ <literal>InnoDB</literal> allows you to add a new foreign key
+ constraint to a table by using <literal>ALTER TABLE</literal>:
+ </para>
+
+<programlisting>
+ALTER TABLE <replaceable>tbl_name</replaceable>
+ ADD [CONSTRAINT <replaceable>symbol</replaceable>] FOREIGN KEY [<replaceable>id</replaceable>] (<replaceable>index_col_name</replaceable>, ...)
+ REFERENCES <replaceable>tbl_name</replaceable> (<replaceable>index_col_name</replaceable>, ...)
+ [ON DELETE {RESTRICT | CASCADE | SET NULL | NO ACTION}]
+ [ON UPDATE {RESTRICT | CASCADE | SET NULL | NO ACTION}]
+</programlisting>
+
+ <para>
+ <emphasis role="bold">Remember to create the required indexes
+ first</emphasis>. You can also add a self-referential foreign
+ key constraint to a table using <literal>ALTER TABLE</literal>.
+ </para>
+
+ <indexterm>
+ <primary>DROP FOREIGN KEY</primary>
+ </indexterm>
+
+ <indexterm>
+ <primary>deleting</primary>
+ <secondary>foreign key</secondary>
+ </indexterm>
+
+ <indexterm>
+ <primary>foreign key</primary>
+ <secondary>deleting</secondary>
+ </indexterm>
+
+ <para>
+ <literal>InnoDB</literal> also supports the use of
+ <literal>ALTER TABLE</literal> to drop foreign keys:
+ </para>
+
+<programlisting>
+ALTER TABLE <replaceable>tbl_name</replaceable> DROP FOREIGN KEY <replaceable>fk_symbol</replaceable>;
+</programlisting>
+
+ <para>
+ If the <literal>FOREIGN KEY</literal> clause included a
+ <literal>CONSTRAINT</literal> name when you created the foreign
+ key, you can refer to that name to drop the foreign key.
+ Otherwise, the <replaceable>fk_symbol</replaceable> value is
+ internally generated by <literal>InnoDB</literal> when the
+ foreign key is created. To find out the symbol value when you
+ want to drop a foreign key, use the <literal>SHOW CREATE
+ TABLE</literal> statement. For example:
+ </para>
+
+<programlisting>
+mysql> <userinput>SHOW CREATE TABLE ibtest11c\G</userinput>
+*************************** 1. row ***************************
+ Table: ibtest11c
+Create Table: CREATE TABLE `ibtest11c` (
+ `A` int(11) NOT NULL auto_increment,
+ `D` int(11) NOT NULL default '0',
+ `B` varchar(200) NOT NULL default '',
+ `C` varchar(175) default NULL,
+ PRIMARY KEY (`A`,`D`,`B`),
+ KEY `B` (`B`,`C`),
+ KEY `C` (`C`),
+ CONSTRAINT `0_38775` FOREIGN KEY (`A`, `D`)
+REFERENCES `ibtest11a` (`A`, `D`)
+ON DELETE CASCADE ON UPDATE CASCADE,
+ CONSTRAINT `0_38776` FOREIGN KEY (`B`, `C`)
+REFERENCES `ibtest11a` (`B`, `C`)
+ON DELETE CASCADE ON UPDATE CASCADE
+) ENGINE=INNODB CHARSET=latin1
+1 row in set (0.01 sec)
+
+mysql> <userinput>ALTER TABLE ibtest11c DROP FOREIGN KEY `0_38775`;</userinput>
+</programlisting>
+
+ <para>
+ You cannot add a foreign key and drop a foreign key in separate
+ clauses of a single <literal>ALTER TABLE</literal> statement.
+ Separate statements are required.
+ </para>
+
+ <para>
+ If <literal>ALTER TABLE</literal> for an
+ <literal>InnoDB</literal> table results in changes to column
+ values (for example, because a column is truncated),
+ <literal>InnoDB</literal>'s <literal>FOREIGN KEY</literal>
+ constraint checks do not notice possible violations caused by
+ changing the values.
+ </para>
+
+ <para>
+ The <literal>InnoDB</literal> parser allows table and column
+ identifiers in a <literal>FOREIGN KEY ... REFERENCES
+ ...</literal> clause to be quoted within backticks.
+ (Alternatively, double quotes can be used if the
+ <literal>ANSI_QUOTES</literal> SQL mode is enabled.) The
+ <literal>InnoDB</literal> parser also takes into account the
+ setting of the <literal>lower_case_table_names</literal> system
+ variable.
+ </para>
+
+ <para>
+ <literal>InnoDB</literal> returns a table's foreign key
+ definitions as part of the output of the <literal>SHOW CREATE
+ TABLE</literal> statement:
+ </para>
+
+<programlisting>
+SHOW CREATE TABLE <replaceable>tbl_name</replaceable>;
+</programlisting>
+
+ <para>
+ <command>mysqldump</command> also produces correct definitions
+ of tables to the dump file, and does not forget about the
+ foreign keys.
+ </para>
+
+ <para>
+ You can also display the foreign key constraints for a table
+ like this:
+ </para>
+
+<programlisting>
+SHOW TABLE STATUS FROM <replaceable>db_name</replaceable> LIKE '<replaceable>tbl_name</replaceable>';
+</programlisting>
+
+ <para>
+ The foreign key constraints are listed in the
+ <literal>Comment</literal> column of the output.
+ </para>
+
+ <para>
+ When performing foreign key checks, <literal>InnoDB</literal>
+ sets shared row-level locks on child or parent records it has to
+ look at. <literal>InnoDB</literal> checks foreign key
+ constraints immediately; the check is not deferred to
+ transaction commit.
+ </para>
+
+ <para>
+ To make it easier to reload dump files for tables that have
+ foreign key relationships, <command>mysqldump</command>
+ automatically includes a statement in the dump output to set
+ <literal>FOREIGN_KEY_CHECKS</literal> to 0. This avoids problems
+ with tables having to be reloaded in a particular order when the
+ dump is reloaded. It is also possible to set this variable
+ manually:
+ </para>
+
+<programlisting>
+mysql> <userinput>SET FOREIGN_KEY_CHECKS = 0;</userinput>
+mysql> <userinput>SOURCE <replaceable>dump_file_name</replaceable>;</userinput>
+mysql> <userinput>SET FOREIGN_KEY_CHECKS = 1;</userinput>
+</programlisting>
+
+ <para>
+ This allows you to import the tables in any order if the dump
+ file contains tables that are not correctly ordered for foreign
+ keys. It also speeds up the import operation. Setting
+ <literal>FOREIGN_KEY_CHECKS</literal> to 0 can also be useful
+ for ignoring foreign key constraints during <literal>LOAD
+ DATA</literal> and <literal>ALTER TABLE</literal> operations.
+ However, even if <literal>FOREIGN_KEY_CHECKS=0</literal>, InnoDB
+ does not allow the creation of a foreign key constraint where a
+ column references a non-matching column type. Also, if an
+ <literal>InnoDB</literal> table has foreign key constraints,
+ <literal>ALTER TABLE</literal> cannot be used to change the
+ table to use another storage engine. To alter the storage
+ engine, you must drop any foreign key constraints first.
+ </para>
+
+ <para>
+ <literal>InnoDB</literal> does not allow you to drop a table
+ that is referenced by a <literal>FOREIGN KEY</literal>
+ constraint, unless you do <literal>SET
+ FOREIGN_KEY_CHECKS=0</literal>. When you drop a table, the
+ constraints that were defined in its create statement are also
+ dropped.
+ </para>
+
+ <para>
+ If you re-create a table that was dropped, it must have a
+ definition that conforms to the foreign key constraints
+ referencing it. It must have the right column names and types,
+ and it must have indexes on the referenced keys, as stated
+ earlier. If these are not satisfied, MySQL returns error number
+ 1005 and refers to errno 150 in the error message.
+ </para>
+
+ </section>
+
+ <section id="innodb-and-mysql-replication">
+
+ <title><literal>InnoDB</literal> and MySQL Replication</title>
+
+ <para>
+ MySQL replication works for <literal>InnoDB</literal> tables as
+ it does for <literal>MyISAM</literal> tables. It is also
+ possible to use replication in a way where the storage engine on
+ the slave is not the same as the original storage engine on the
+ master. For example, you can replicate modifications to an
+ <literal>InnoDB</literal> table on the master to a
+ <literal>MyISAM</literal> table on the slave.
+ </para>
+
+ <para>
+ To set up a new slave for a master, you have to make a copy of
+ the <literal>InnoDB</literal> tablespace and the log files, as
+ well as the <filename>.frm</filename> files of the
+ <literal>InnoDB</literal> tables, and move the copies to the
+ slave. If the <literal>innodb_file_per_table</literal> variable
+ is enabled, you must also copy the <filename>.ibd</filename>
+ files as well. For the proper procedure to do this, see
+ <xref linkend="innodb-backup"/>.
+ </para>
+
+ <para>
+ If you can shut down the master or an existing slave, you can
+ take a cold backup of the <literal>InnoDB</literal> tablespace
+ and log files and use that to set up a slave. To make a new
+ slave without taking down any server you can also use the
+ non-free (commercial)
+ <ulink url="http://www.innodb.com/hot-backup"><literal>InnoDB
+ Hot Backup</literal> tool</ulink>.
+ </para>
+
+ <para>
+ You cannot set up replication for <literal>InnoDB</literal>
+ using the <literal>LOAD TABLE FROM MASTER</literal> statement,
+ which works only for <literal>MyISAM</literal> tables. There are
+ two possible workarounds:
+ </para>
+
+ <itemizedlist>
+
+ <listitem>
+ <para>
+ Dump the table on the master and import the dump file into
+ the slave.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Use <literal>ALTER TABLE <replaceable>tbl_name</replaceable>
+ ENGINE=MyISAM</literal> on the master before setting up
+ replication with <literal>LOAD TABLE
+ <replaceable>tbl_name</replaceable> FROM MASTER</literal>,
+ and then use <literal>ALTER TABLE</literal> to convert the
+ master table back to <literal>InnoDB</literal> afterward.
+ However, this should not be done for tables that have
+ foreign key definitions because the definitions will be
+ lost.
+ </para>
+ </listitem>
+
+ </itemizedlist>
+
+ <para>
+ Transactions that fail on the master do not affect replication
+ at all. MySQL replication is based on the binary log where MySQL
+ writes SQL statements that modify data. A transaction that fails
+ (for example, because of a foreign key violation, or because it
+ is rolled back) is not written to the binary log, so it is not
+ sent to slaves. See <xref linkend="commit"/>.
+ </para>
+
+ <important>
+ <para>
+ Cascading actions for <literal>InnoDB</literal> tables on the
+ master are <emphasis>not</emphasis> replicated to the slave.
+ For example, suppose you have two tables defined and populated
+ on both the master and the slave as shown here:
+
+<programlisting>
+CREATE TABLE fc1 (
+ i INT PRIMARY KEY,
+ j INT
+) ENGINE = InnoDB;
+
+CREATE TABLE fc2 (
+ m INT PRIMARY KEY,
+ n INT,
+ FOREIGN KEY mi (m) REFERENCES fc1 (i)
+ ON DELETE CASCADE
+) ENGINE = InnoDB;
+
+INSERT INTO fc1 VALUES (1, 1), (2, 2);
+INSERT INTO fc2 VALUES (1, 1), (2, 2), (3, 1);
+</programlisting>
+
+ At this point, on both the master and the slave, table
+ <literal>fc1</literal> contains 2 rows, and table
+ <literal>fc2</literal> contains 3 rows. Now suppose that you
+ perform the following <literal>DELETE</literal> statement on
+ the master:
+
+<programlisting>
+DELETE FROM fc1 WHERE i = 1;
+</programlisting>
+
+ Due to the cascade, table <literal>fc2</literal> on the master
+ now contains only 1 row:
+
+<programlisting>
+mysql> <userinput>SELECT * FROM fc2;</userinput>
++---+---+
+| m | n |
++---+---+
+| 2 | 2 |
++---+---+
+1 row in set (0.00 sec)
+</programlisting>
+
+ However, the cascade does not propagate to the slave. The
+ slave's copy of <literal>fc2</literal> still contains all
+ of the rows that were originally inserted:
+
+<programlisting>
+mysql> <userinput>SELECT * FROM fc2;</userinput>
++---+---+
+| m | n |
++---+---+
+| 1 | 1 |
+| 3 | 1 |
+| 2 | 2 |
++---+---+
+3 rows in set (0.00 sec)
+</programlisting>
+ </para>
+
+ <para>
+ This is true whether you are using row-based replication or
+ statement-based replication.
+ </para>
+ </important>
+
+ </section>
+
+ </section>
+
+ <section id="adding-and-removing">
+
+ <title>Adding and Removing <literal>InnoDB</literal> Data and Log Files</title>
+
+ <para>
+ This section describes what you can do when your
+ <literal>InnoDB</literal> tablespace runs out of room or when you
+ want to change the size of the log files.
+ </para>
+
+ <para>
+ The easiest way to increase the size of the
+ <literal>InnoDB</literal> tablespace is to configure it from the
+ beginning to be auto-extending. Specify the
+ <literal>autoextend</literal> attribute for the last data file in
+ the tablespace definition. Then <literal>InnoDB</literal>
+ increases the size of that file automatically in 8MB increments
+ when it runs out of space. The increment size can be changed by
+ setting the value of the
+ <literal>innodb_autoextend_increment</literal> system variable,
+ which is measured in MB.
+ </para>
+
+ <para>
+ Alternatively, you can increase the size of your tablespace by
+ adding another data file. To do this, you have to shut down the
+ MySQL server, change the tablespace configuration to add a new
+ data file to the end of <literal>innodb_data_file_path</literal>,
+ and start the server again.
+ </para>
+
+ <para>
+ If your last data file was defined with the keyword
+ <literal>autoextend</literal>, the procedure for reconfiguring the
+ tablespace must take into account the size to which the last data
+ file has grown. Obtain the size of the data file, round it down to
+ the closest multiple of 1024 × 1024 bytes (= 1MB), and
+ specify the rounded size explicitly in
+ <literal>innodb_data_file_path</literal>. Then you can add another
+ data file. Remember that only the last data file in the
+ <literal>innodb_data_file_path</literal> can be specified as
+ auto-extending.
+ </para>
+
+ <para>
+ As an example, assume that the tablespace has just one
+ auto-extending data file <filename>ibdata1</filename>:
+ </para>
+
+<programlisting>
+innodb_data_home_dir =
+innodb_data_file_path = /ibdata/ibdata1:10M:autoextend
+</programlisting>
+
+ <para>
+ Suppose that this data file, over time, has grown to 988MB. Here
+ is the configuration line after modifying the original data file
+ to not be auto-extending and adding another auto-extending data
+ file:
+ </para>
+
+<programlisting>
+innodb_data_home_dir =
+innodb_data_file_path = /ibdata/ibdata1:988M;/disk2/ibdata2:50M:autoextend
+</programlisting>
+
+ <para>
+ When you add a new file to the tablespace configuration, make sure
+ that it does not exist. <literal>InnoDB</literal> will create and
+ initialize the file when you restart the server.
+ </para>
+
+ <para>
+ Currently, you cannot remove a data file from the tablespace. To
+ decrease the size of your tablespace, use this procedure:
+ </para>
+
+ <orderedlist>
+
+ <listitem>
+ <para>
+ Use <command>mysqldump</command> to dump all your
+ <literal>InnoDB</literal> tables.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Stop the server.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Remove all the existing tablespace files.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Configure a new tablespace.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Restart the server.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Import the dump files.
+ </para>
+ </listitem>
+
+ </orderedlist>
+
+ <para>
+ If you want to change the number or the size of your
+ <literal>InnoDB</literal> log files, use the following
+ instructions. The procedure to use depends on the value of
+ <literal>innodb_fast_shutdown</literal>:
+ </para>
+
+ <itemizedlist>
+
+ <listitem>
+ <para>
+ If <literal>innodb_fast_shutdown</literal> is not set to 2:
+ You must stop the MySQL server and make sure that it shuts
+ down without errors (to ensure that there is no information
+ for outstanding transactions in the logs). Then copy the old
+ log files into a safe place just in case something went wrong
+ in the shutdown and you need them to recover the tablespace.
+ Delete the old log files from the log file directory, edit
+ <filename>my.cnf</filename> to change the log file
+ configuration, and start the MySQL server again.
+ <command>mysqld</command> sees that no log files exist at
+ startup and tells you that it is creating new ones.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ If <literal>innodb_fast_shutdown</literal> is set to 2: You
+ should shut down the server, set
+ <literal>innodb_fast_shutdown</literal> to 1, and restart the
+ server. The server should be allowed to recover. Then you
+ should shut down the server again and follow the procedure
+ described in the preceding item to change
+ <literal>InnoDB</literal> log file size. Set
+ <literal>innodb_fast_shutdown</literal> back to 2 and restart
+ the server.
+ </para>
+ </listitem>
+
+ </itemizedlist>
+
+ </section>
+
+ <section id="innodb-backup">
+
+ <title>Backing Up and Recovering an <literal>InnoDB</literal> Database</title>
+
+ <para>
+ The key to safe database management is making regular backups.
+ </para>
+
+ <para>
+ <command>InnoDB Hot Backup</command> is an online backup tool you
+ can use to backup your <literal>InnoDB</literal> database while it
+ is running. <command>InnoDB Hot Backup</command> does not require
+ you to shut down your database and it does not set any locks or
+ disturb your normal database processing. <command>InnoDB Hot
+ Backup</command> is a non-free (commercial) add-on tool with an
+ annual license fee of €390 per computer on which the MySQL
+ server is run. See the
+ <ulink url="http://www.innodb.com/hot-backup"><command>InnoDB Hot
+ Backup</command> home page</ulink> for detailed information and
+ screenshots.
+ </para>
+
+ <para>
+ If you are able to shut down your MySQL server, you can make a
+ binary backup that consists of all files used by
+ <literal>InnoDB</literal> to manage its tables. Use the following
+ procedure:
+ </para>
+
+ <orderedlist>
+
+ <listitem>
+ <para>
+ Shut down your MySQL server and make sure that it shuts down
+ without errors.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Copy all your data files (<filename>ibdata</filename> files
+ and <filename>.ibd</filename> files) into a safe place.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Copy all your <filename>ib_logfile</filename> files to a safe
+ place.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Copy your <filename>my.cnf</filename> configuration file or
+ files to a safe place.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Copy all the <filename>.frm</filename> files for your
+ <literal>InnoDB</literal> tables to a safe place.
+ </para>
+ </listitem>
+
+ </orderedlist>
+
+ <para>
+ Replication works with <literal>InnoDB</literal> tables, so you
+ can use MySQL replication capabilities to keep a copy of your
+ database at database sites requiring high availability.
+ </para>
+
+ <para>
+ In addition to making binary backups as just described, you should
+ also regularly make dumps of your tables with
+ <command>mysqldump</command>. The reason for this is that a binary
+ file might be corrupted without you noticing it. Dumped tables are
+ stored into text files that are human-readable, so spotting table
+ corruption becomes easier. Also, because the format is simpler,
+ the chance for serious data corruption is smaller.
+ <command>mysqldump</command> also has a
+ <option>--single-transaction</option> option that you can use to
+ make a consistent snapshot without locking out other clients.
+ </para>
+
+ <para>
+ To be able to recover your <literal>InnoDB</literal> database to
+ the present from the binary backup just described, you have to run
+ your MySQL server with binary logging turned on. Then you can
+ apply the binary log to the backup database to achieve
+ point-in-time recovery:
+ </para>
+
+<programlisting>
+mysqlbinlog <replaceable>yourhostname</replaceable>-bin.123 | mysql
+</programlisting>
+
+ <para>
+ To recover from a crash of your MySQL server, the only requirement
+ is to restart it. <literal>InnoDB</literal> automatically checks
+ the logs and performs a roll-forward of the database to the
+ present. <literal>InnoDB</literal> automatically rolls back
+ uncommitted transactions that were present at the time of the
+ crash. During recovery, <command>mysqld</command> displays output
+ something like this:
+ </para>
+
+<programlisting>
+InnoDB: Database was not shut down normally.
+InnoDB: Starting recovery from log files...
+InnoDB: Starting log scan based on checkpoint at
+InnoDB: log sequence number 0 13674004
+InnoDB: Doing recovery: scanned up to log sequence number 0 13739520
+InnoDB: Doing recovery: scanned up to log sequence number 0 13805056
+InnoDB: Doing recovery: scanned up to log sequence number 0 13870592
+InnoDB: Doing recovery: scanned up to log sequence number 0 13936128
+...
+InnoDB: Doing recovery: scanned up to log sequence number 0 20555264
+InnoDB: Doing recovery: scanned up to log sequence number 0 20620800
+InnoDB: Doing recovery: scanned up to log sequence number 0 20664692
+InnoDB: 1 uncommitted transaction(s) which must be rolled back
+InnoDB: Starting rollback of uncommitted transactions
+InnoDB: Rolling back trx no 16745
+InnoDB: Rolling back of trx no 16745 completed
+InnoDB: Rollback of uncommitted transactions completed
+InnoDB: Starting an apply batch of log records to the database...
+InnoDB: Apply batch completed
+InnoDB: Started
+mysqld: ready for connections
+</programlisting>
+
+ <para>
+ If your database gets corrupted or your disk fails, you have to do
+ the recovery from a backup. In the case of corruption, you should
+ first find a backup that is not corrupted. After restoring the
+ base backup, do the recovery from the binary log files using
+ <command>mysqlbinlog</command> and <command>mysql</command> to
+ restore the changes performed after the backup was made.
+ </para>
+
+ <para>
+ In some cases of database corruption it is enough just to dump,
+ drop, and re-create one or a few corrupt tables. You can use the
+ <literal>CHECK TABLE</literal> SQL statement to check whether a
+ table is corrupt, although <literal>CHECK TABLE</literal>
+ naturally cannot detect every possible kind of corruption. You can
+ use <literal>innodb_tablespace_monitor</literal> to check the
+ integrity of the file space management inside the tablespace
+ files.
+ </para>
+
+ <para>
+ In some cases, apparent database page corruption is actually due
+ to the operating system corrupting its own file cache, and the
+ data on disk may be okay. It is best first to try restarting your
+ computer. Doing so may eliminate errors that appeared to be
+ database page corruption.
+ </para>
+
+ <section id="forcing-recovery">
+
+ <title>Forcing <literal>InnoDB</literal> Recovery</title>
+
+ <para>
+ If there is database page corruption, you may want to dump your
+ tables from the database with <literal>SELECT INTO
+ OUTFILE</literal>. Usually, most of the data obtained in this
+ way is intact. Even so, the corruption may cause <literal>SELECT
+ * FROM <replaceable>tbl_name</replaceable></literal> statements
+ or <literal>InnoDB</literal> background operations to crash or
+ assert, or even to cause <literal>InnoDB</literal> roll-forward
+ recovery to crash. However, you can force the
+ <literal>InnoDB</literal> storage engine to start up while
+ preventing background operations from running, so that you are
+ able to dump your tables. For example, you can add the following
+ line to the <literal>[mysqld]</literal> section of your option
+ file before restarting the server:
+ </para>
+
+<programlisting>
+[mysqld]
+innodb_force_recovery = 4
+</programlisting>
+
+ <para>
+ The allowable non-zero values for
+ <literal>innodb_force_recovery</literal> follow. A larger number
+ includes all precautions of smaller numbers. If you are able to
+ dump your tables with an option value of at most 4, then you are
+ relatively safe that only some data on corrupt individual pages
+ is lost. A value of 6 is more drastic because database pages are
+ left in an obsolete state, which in turn may introduce more
+ corruption into B-trees and other database structures.
+ </para>
+
+ <itemizedlist>
+
+ <listitem>
+ <para>
+ <literal>1</literal>
+ (<literal>SRV_FORCE_IGNORE_CORRUPT</literal>)
+ </para>
+
+ <para>
+ Let the server run even if it detects a corrupt page. Try to
+ make <literal>SELECT * FROM
+ <replaceable>tbl_name</replaceable></literal> jump over
+ corrupt index records and pages, which helps in dumping
+ tables.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <literal>2</literal>
+ (<literal>SRV_FORCE_NO_BACKGROUND</literal>)
+ </para>
+
+ <para>
+ Prevent the main thread from running. If a crash would occur
+ during the purge operation, this recovery value prevents it.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <literal>3</literal>
+ (<literal>SRV_FORCE_NO_TRX_UNDO</literal>)
+ </para>
+
+ <para>
+ Do not run transaction rollbacks after recovery.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <literal>4</literal>
+ (<literal>SRV_FORCE_NO_IBUF_MERGE</literal>)
+ </para>
+
+ <para>
+ Prevent also insert buffer merge operations. If they would
+ cause a crash, do not do them. Do not calculate table
+ statistics.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <literal>5</literal>
+ (<literal>SRV_FORCE_NO_UNDO_LOG_SCAN</literal>)
+ </para>
+
+ <para>
+ Do not look at undo logs when starting the database:
+ <literal>InnoDB</literal> treats even incomplete
+ transactions as committed.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <literal>6</literal>
+ (<literal>SRV_FORCE_NO_LOG_REDO</literal>)
+ </para>
+
+ <para>
+ Do not do the log roll-forward in connection with recovery.
+ </para>
+ </listitem>
+
+ </itemizedlist>
+
+ <para>
+ You can <literal>SELECT</literal> from tables to dump them, or
+ <literal>DROP</literal> or <literal>CREATE</literal> tables even
+ if forced recovery is used. If you know that a given table is
+ causing a crash on rollback, you can drop it. You can also use
+ this to stop a runaway rollback caused by a failing mass import
+ or <literal>ALTER TABLE</literal>. You can kill the
+ <command>mysqld</command> process and set
+ <literal>innodb_force_recovery</literal> to <literal>3</literal>
+ to bring the database up without the rollback, then
+ <literal>DROP</literal> the table that is causing the runaway
+ rollback.
+ </para>
+
+ <para>
+ <emphasis>The database must not otherwise be used with any
+ non-zero value of
+ <literal>innodb_force_recovery</literal></emphasis>. As a safety
+ measure, <literal>InnoDB</literal> prevents users from
+ performing <literal>INSERT</literal>, <literal>UPDATE</literal>,
+ or <literal>DELETE</literal> operations when
+ <literal>innodb_force_recovery</literal> is greater than 0.
+ </para>
+
+ </section>
+
+ <section id="innodb-checkpoints">
+
+ <title>Checkpoints</title>
+
+ <para>
+ <literal>InnoDB</literal> implements a checkpoint mechanism
+ known as <quote>fuzzy</quote> checkpointing.
+ <literal>InnoDB</literal> flushes modified database pages from
+ the buffer pool in small batches. There is no need to flush the
+ buffer pool in one single batch, which would in practice stop
+ processing of user SQL statements during the checkpointing
+ process.
+ </para>
+
+ <para>
+ During crash recovery, <literal>InnoDB</literal> looks for a
+ checkpoint label written to the log files. It knows that all
+ modifications to the database before the label are present in
+ the disk image of the database. Then <literal>InnoDB</literal>
+ scans the log files forward from the checkpoint, applying the
+ logged modifications to the database.
+ </para>
+
+ <para>
+ <literal>InnoDB</literal> writes to its log files on a rotating
+ basis. All committed modifications that make the database pages
+ in the buffer pool different from the images on disk must be
+ available in the log files in case <literal>InnoDB</literal> has
+ to do a recovery. This means that when <literal>InnoDB</literal>
+ starts to reuse a log file, it has to make sure that the
+ database page images on disk contain the modifications logged in
+ the log file that <literal>InnoDB</literal> is going to reuse.
+ In other words, <literal>InnoDB</literal> must create a
+ checkpoint and this often involves flushing of modified database
+ pages to disk.
+ </para>
+
+ <para>
+ The preceding description explains why making your log files
+ very large may save disk I/O in checkpointing. It often makes
+ sense to set the total size of the log files as big as the
+ buffer pool or even bigger. The drawback of using large log
+ files is that crash recovery can take longer because there is
+ more logged information to apply to the database.
+ </para>
+
+ </section>
+
+ </section>
+
+ <section id="moving">
+
+ <title>Moving an <literal>InnoDB</literal> Database to Another Machine</title>
+
+ <para>
+ On Windows, <literal>InnoDB</literal> always stores database and
+ table names internally in lowercase. To move databases in a binary
+ format from Unix to Windows or from Windows to Unix, you should
+ have all table and database names in lowercase. A convenient way
+ to accomplish this is to add the following line to the
+ <literal>[mysqld]</literal> section of your
+ <filename>my.cnf</filename> or <filename>my.ini</filename> file
+ before creating any databases or tables:
+ </para>
+
+<programlisting>
+[mysqld]
+lower_case_table_names=1
+</programlisting>
+
+ <para>
+ Like <literal>MyISAM</literal> data files,
+ <literal>InnoDB</literal> data and log files are binary-compatible
+ on all platforms having the same floating-point number format. You
+ can move an <literal>InnoDB</literal> database simply by copying
+ all the relevant files listed in <xref linkend="innodb-backup"/>.
+ If the floating-point formats differ but you have not used
+ <literal>FLOAT</literal> or <literal>DOUBLE</literal> data types
+ in your tables, then the procedure is the same: simply copy the
+ relevant files. If the formats differ and your tables contain
+ floating-point data, you must use <command>mysqldump</command> to
+ dump your tables on one machine and then import the dump files on
+ the other machine.
+ </para>
+
+ <para>
+ One way to increase performance is to switch off autocommit mode
+ when importing data, assuming that the tablespace has enough space
+ for the big rollback segment that the import transactions
+ generate. Do the commit only after importing a whole table or a
+ segment of a table.
+ </para>
+
+ </section>
+
+ <section id="innodb-transaction-model">
+
+ <title><literal>InnoDB</literal> Transaction Model and Locking</title>
+
+ <para>
+ In the <literal>InnoDB</literal> transaction model, the goal is to
+ combine the best properties of a multi-versioning database with
+ traditional two-phase locking. <literal>InnoDB</literal> does
+ locking on the row level and runs queries as non-locking
+ consistent reads by default, in the style of Oracle. The lock
+ table in <literal>InnoDB</literal> is stored so space-efficiently
+ that lock escalation is not needed: Typically several users are
+ allowed to lock every row in the database, or any random subset of
+ the rows, without <literal>InnoDB</literal> running out of memory.
+ </para>
+
+ <section id="innodb-lock-modes">
+
+ <title><literal>InnoDB</literal> Lock Modes</title>
+
+ <para>
+ <literal>InnoDB</literal> implements standard row-level locking
+ where there are two types of locks:
+ </para>
+
+ <itemizedlist>
+
+ <listitem>
+ <para>
+ A shared (<replaceable>S</replaceable>) lock allows a
+ transaction to read a row (tuple).
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ An exclusive (<replaceable>X</replaceable>) lock allows a
+ transaction to update or delete a row.
+ </para>
+ </listitem>
+
+ </itemizedlist>
+
+ <para>
+ If transaction <literal>T1</literal> holds a shared
+ (<replaceable>S</replaceable>) lock on tuple
+ <literal>t</literal>, then
+ </para>
+
+ <itemizedlist>
+
+ <listitem>
+ <para>
+ A request from some distinct transaction
+ <literal>T2</literal> for an <replaceable>S</replaceable>
+ lock on <literal>t</literal> can be granted immediately. As
+ a result, both <literal>T1</literal> and
+ <literal>T2</literal> hold an <replaceable>S</replaceable>
+ lock on <literal>t</literal>.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ A request from some distinct transaction
+ <literal>T2</literal> for an <replaceable>X</replaceable>
+ lock on <literal>t</literal> cannot be granted immediately.
+ </para>
+ </listitem>
+
+ </itemizedlist>
+
+ <para>
+ If a transaction <literal>T1</literal> holds an exclusive
+ (<replaceable>X</replaceable>) lock on tuple
+ <literal>t</literal>, then a request from some distinct
+ transaction <literal>T2</literal> for a lock of either type on
+ <literal>t</literal> cannot be granted immediately. Instead,
+ transaction <literal>T2</literal> has to wait for transaction
+ <literal>T1</literal> to release its lock on tuple
+ <literal>t</literal>.
+ </para>
+
+ <para>
+ Additionally, <literal>InnoDB</literal> supports
+ <emphasis>multiple granularity locking</emphasis> which allows
+ coexistence of record locks and locks on entire tables. To make
+ locking at multiple granularity levels practical, additional
+ types of locks called <emphasis>intention locks</emphasis> are
+ used. Intention locks are table locks in
+ <literal>InnoDB</literal>. The idea behind intention locks is
+ for a transaction to indicate which type of lock (shared or
+ exclusive) it will require later for a row in that table. There
+ are two types of intention locks used in
+ <literal>InnoDB</literal> (assume that transaction
+ <literal>T</literal> has requested a lock of the indicated type
+ on table <literal>R</literal>):
+ </para>
+
+ <itemizedlist>
+
+ <listitem>
+ <para>
+ Intention shared (<replaceable>IS</replaceable>):
+ Transaction <literal>T</literal> intends to set
+ <replaceable>S</replaceable> locks on individual rows in
+ table <literal>R</literal>.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Intention exclusive (<replaceable>IX</replaceable>):
+ Transaction <literal>T</literal> intends to set
+ <replaceable>X</replaceable> locks on those rows.
+ </para>
+ </listitem>
+
+ </itemizedlist>
+
+ <para>
+ The intention locking protocol is as follows:
+ </para>
+
+ <itemizedlist>
+
+ <listitem>
+ <para>
+ Before a given transaction can acquire an
+ <replaceable>S</replaceable> lock on a given row, it must
+ first acquire an <replaceable>IS</replaceable> or stronger
+ lock on the table containing that row.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Before a given transaction can acquire an
+ <replaceable>X</replaceable> lock on a given row, it must
+ first acquire an <replaceable>IX</replaceable> lock on the
+ table containing that row.
+ </para>
+ </listitem>
+
+ </itemizedlist>
+
+ <para>
+ These rules can be conveniently summarized by means of a
+ <emphasis>lock type compatibility matrix</emphasis>:
+ </para>
+
+ <informaltable>
+ <tgroup cols="5">
+ <colspec colwidth="20*"/>
+ <colspec colwidth="20*"/>
+ <colspec colwidth="20*"/>
+ <colspec colwidth="20*"/>
+ <colspec colwidth="20*"/>
+ <tbody>
+ <row>
+ <entry/>
+ <entry><replaceable>X</replaceable></entry>
+ <entry><replaceable>IX</replaceable></entry>
+ <entry><replaceable>S</replaceable></entry>
+ <entry><replaceable>IS</replaceable></entry>
+ </row>
+ <row>
+ <entry><replaceable>X</replaceable></entry>
+ <entry>Conflict</entry>
+ <entry>Conflict</entry>
+ <entry>Conflict</entry>
+ <entry>Conflict</entry>
+ </row>
+ <row>
+ <entry><replaceable>IX</replaceable></entry>
+ <entry>Conflict</entry>
+ <entry>Compatible</entry>
+ <entry>Conflict</entry>
+ <entry>Compatible</entry>
+ </row>
+ <row>
+ <entry><replaceable>S</replaceable></entry>
+ <entry>Conflict</entry>
+ <entry>Conflict</entry>
+ <entry>Compatible</entry>
+ <entry>Compatible</entry>
+ </row>
+ <row>
+ <entry><replaceable>IS</replaceable></entry>
+ <entry>Conflict</entry>
+ <entry>Compatible</entry>
+ <entry>Compatible</entry>
+ <entry>Compatible</entry>
+ </row>
+ </tbody>
+ </tgroup>
+ </informaltable>
+
+ <para>
+ A lock is granted to a requesting transaction if it is
+ compatible with existing locks. A lock is not granted to a
+ requesting transaction if it conflicts with existing locks. A
+ transaction waits until the conflicting existing lock is
+ released. If a lock request conflicts with an existing lock and
+ cannot be granted because it would cause deadlock, an error
+ occurs.
+ </para>
+
+ <para>
+ Thus, intention locks do not block anything except full table
+ requests (for example, <literal>LOCK TABLES ...
+ WRITE</literal>). The main purpose of
+ <replaceable>IX</replaceable> and <replaceable>IS</replaceable>
+ locks is to show that someone is locking a row, or going to lock
+ a row in the table.
+ </para>
+
+ <para>
+ The following example illustrates how an error can occur when a
+ lock request would cause a deadlock. The example involves two
+ clients, A and B.
+ </para>
+
+ <para>
+ First, client A creates a table containing one row, and then
+ begins a transaction. Within the transaction, A obtains an
+ <replaceable>S</replaceable> lock on the row by selecting it in
+ share mode:
+ </para>
+
+<programlisting>
+<!--
+mysql> DROP TABLE IF EXISTS t;
+Query OK, 0 rows affected (0.26 sec)
+-->
+mysql> <userinput>CREATE TABLE t (i INT) ENGINE = InnoDB;</userinput>
+Query OK, 0 rows affected (1.07 sec)
+
+mysql> <userinput>INSERT INTO t (i) VALUES(1);</userinput>
+Query OK, 1 row affected (0.09 sec)
+
+mysql> <userinput>START TRANSACTION;</userinput>
+Query OK, 0 rows affected (0.00 sec)
+
+mysql> <userinput>SELECT * FROM t WHERE i = 1 LOCK IN SHARE MODE;</userinput>
++------+
+| i |
++------+
+| 1 |
++------+
+1 row in set (0.10 sec)
+</programlisting>
+
+ <para>
+ Next, client B begins a transaction and attempts to delete the
+ row from the table:
+ </para>
+
+<programlisting>
+mysql> <userinput>START TRANSACTION;</userinput>
+Query OK, 0 rows affected (0.00 sec)
+
+mysql> <userinput>DELETE FROM t WHERE i = 1;</userinput>
+</programlisting>
+
+ <para>
+ The delete operation requires an <replaceable>X</replaceable>
+ lock. The lock cannot be granted because it is incompatible with
+ the <replaceable>S</replaceable> lock that client A holds, so
+ the request goes on the queue of lock requests for the row and
+ client B blocks.
+ </para>
+
+ <para>
+ Finally, client A also attempts to delete the row from the
+ table:
+ </para>
+
+<programlisting>
+mysql> <userinput>DELETE FROM t WHERE i = 1;</userinput>
+ERROR 1213 (40001): Deadlock found when trying to get lock;
+try restarting transaction
+</programlisting>
+
+ <para>
+ Deadlock occurs here because client A needs an
+ <replaceable>X</replaceable> lock to delete the row. However,
+ that lock request cannot be granted because client B already has
+ a request for an <replaceable>X</replaceable> lock and is
+ waiting for client A to release its <replaceable>S</replaceable>
+ lock. Nor can the <replaceable>S</replaceable> lock held by A be
+ upgraded to an <replaceable>X</replaceable> lock because of the
+ prior request by B for an <replaceable>X</replaceable> lock. As
+ a result, <literal>InnoDB</literal> generates an error for
+ client A and releases its locks. At that point, the lock request
+ for client B can be granted and B deletes the row from the
+ table.
+ </para>
+
+ </section>
+
+ <section id="innodb-and-autocommit">
+
+ <title><literal>InnoDB</literal> and <literal>AUTOCOMMIT</literal></title>
+
+ <para>
+ In <literal>InnoDB</literal>, all user activity occurs inside a
+ transaction. If the autocommit mode is enabled, each SQL
+ statement forms a single transaction on its own. By default,
+ MySQL starts new connections with autocommit enabled.
+ </para>
+
+ <para>
+ If the autocommit mode is switched off with <literal>SET
+ AUTOCOMMIT = 0</literal>, then we can consider that a user
+ always has a transaction open. An SQL <literal>COMMIT</literal>
+ or <literal>ROLLBACK</literal> statement ends the current
+ transaction and a new one starts. A <literal>COMMIT</literal>
+ means that the changes made in the current transaction are made
+ permanent and become visible to other users. A
+ <literal>ROLLBACK</literal> statement, on the other hand,
+ cancels all modifications made by the current transaction. Both
+ statements release all <literal>InnoDB</literal> locks that were
+ set during the current transaction.
+ </para>
+
+ <para>
+ If the connection has autocommit enabled, the user can still
+ perform a multiple-statement transaction by starting it with an
+ explicit <literal>START TRANSACTION</literal> or
+ <literal>BEGIN</literal> statement and ending it with
+ <literal>COMMIT</literal> or <literal>ROLLBACK</literal>.
+ </para>
+
+ </section>
+
+ <section id="innodb-transaction-isolation">
+
+ <title><literal>InnoDB</literal> and <literal>TRANSACTION ISOLATION
+ LEVEL</literal></title>
+
+ <para>
+ In terms of the SQL:1992 transaction isolation levels, the
+ <literal>InnoDB</literal> default is <literal>REPEATABLE
+ READ</literal>. <literal>InnoDB</literal> offers all four
+ transaction isolation levels described by the SQL standard. You
+ can set the default isolation level for all connections by using
+ the <option>--transaction-isolation</option> option on the
+ command line or in an option file. For example, you can set the
+ option in the <literal>[mysqld]</literal> section of an option
+ file like this:
+ </para>
+
+<programlisting>
+[mysqld]
+transaction-isolation = {READ-UNCOMMITTED | READ-COMMITTED
+ | REPEATABLE-READ | SERIALIZABLE}
+</programlisting>
+
+ <para>
+ A user can change the isolation level for a single session or
+ for all new incoming connections with the <literal>SET
+ TRANSACTION</literal> statement. Its syntax is as follows:
+ </para>
+
+<programlisting>
+SET [SESSION | GLOBAL] TRANSACTION ISOLATION LEVEL
+ {READ UNCOMMITTED | READ COMMITTED
+ | REPEATABLE READ | SERIALIZABLE}
+</programlisting>
+
+ <para>
+ Note that there are hyphens in the level names for the
+ <option>--transaction-isolation</option> option, but not for the
+ <literal>SET TRANSACTION</literal> statement.
+ </para>
+
+ <para>
+ The default behavior is to set the isolation level for the next
+ (not started) transaction. If you use the
+ <literal>GLOBAL</literal> keyword, the statement sets the
+ default transaction level globally for all new connections
+ created from that point on (but not for existing connections).
+ You need the <literal>SUPER</literal> privilege to do this.
+ Using the <literal>SESSION</literal> keyword sets the default
+ transaction level for all future transactions performed on the
+ current connection.
+ </para>
+
+ <para>
+ Any client is free to change the session isolation level (even
+ in the middle of a transaction), or the isolation level for the
+ next transaction.
+ </para>
+
+ <para>
+ You can determine the global and session transaction isolation
+ levels by checking the value of the
+ <literal>tx_isolation</literal> system variable with these
+ statements:
+ </para>
+
+<programlisting>
+SELECT @@global.tx_isolation;
+SELECT @@tx_isolation;
+</programlisting>
+
+ <indexterm>
+ <primary>next-key lock</primary>
+ </indexterm>
+
+ <indexterm>
+ <primary>gap lock</primary>
+ </indexterm>
+
+ <para>
+ In row-level locking, <literal>InnoDB</literal> uses next-key
+ locking. That means that besides index records,
+ <literal>InnoDB</literal> can also lock the <quote>gap</quote>
+ preceding an index record to block insertions by other users
+ immediately before the index record. A next-key lock refers to a
+ lock that locks an index record and the gap before it. A gap
+ lock refers to a lock that only locks a gap before some index
+ record. Next-key locking for searches or index scans can be
+ disabled by enabling the
+ <literal>innodb_locks_unsafe_for_binlog</literal> system
+ variable.
+ </para>
+
+ <para>
+ A detailed description of each isolation level in
+ <literal>InnoDB</literal> follows:
+ </para>
+
+ <itemizedlist>
+
+ <listitem>
+ <para>
+ <literal>READ UNCOMMITTED</literal>
+ </para>
+
+ <para>
+ <literal>SELECT</literal> statements are performed in a
+ non-locking fashion, but a possible earlier version of a
+ record might be used. Thus, using this isolation level, such
+ reads are not consistent. This is also called a <quote>dirty
+ read.</quote> Otherwise, this isolation level works like
+ <literal>READ COMMITTED</literal>.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <literal>READ COMMITTED</literal>
+ </para>
+
+ <para>
+ A somewhat Oracle-like isolation level. All <literal>SELECT
+ ... FOR UPDATE</literal> and <literal>SELECT ... LOCK IN
+ SHARE MODE</literal> statements lock only the index records,
+ not the gaps before them, and thus allow the free insertion
+ of new records next to locked records.
+ <literal>UPDATE</literal> and <literal>DELETE</literal>
+ statements using a unique index with a unique search
+ condition lock only the index record found, not the gap
+ before it. In range-type <literal>UPDATE</literal> and
+ <literal>DELETE</literal> statements,
+ <literal>InnoDB</literal> must set next-key or gap locks and
+ block insertions by other users to the gaps covered by the
+ range. This is necessary because <quote>phantom rows</quote>
+ must be blocked for MySQL replication and recovery to work.
+ </para>
+
+ <para>
+ Consistent reads behave as in Oracle: Each consistent read,
+ even within the same transaction, sets and reads its own
+ fresh snapshot. See
+ <xref linkend="innodb-consistent-read"/>.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <literal>REPEATABLE READ</literal>
+ </para>
+
+ <para>
+ This is the default isolation level of
+ <literal>InnoDB</literal>. <literal>SELECT ... FOR
+ UPDATE</literal>, <literal>SELECT ... LOCK IN SHARE
+ MODE</literal>, <literal>UPDATE</literal>, and
+ <literal>DELETE</literal> statements that use a unique index
+ with a unique search condition lock only the index record
+ found, not the gap before it. With other search conditions,
+ these operations employ next-key locking, locking the index
+ range scanned with next-key or gap locks, and block new
+ insertions by other users.
+ </para>
+
+ <para>
+ In consistent reads, there is an important difference from
+ the <literal>READ COMMITTED</literal> isolation level: All
+ consistent reads within the same transaction read the same
+ snapshot established by the first read. This convention
+ means that if you issue several plain
+ <literal>SELECT</literal> statements within the same
+ transaction, these <literal>SELECT</literal> statements are
+ consistent also with respect to each other. See
+ <xref linkend="innodb-consistent-read"/>.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <literal>SERIALIZABLE</literal>
+ </para>
+
+ <para>
+ This level is like <literal>REPEATABLE READ</literal>, but
+ <literal>InnoDB</literal> implicitly converts all plain
+ <literal>SELECT</literal> statements to <literal>SELECT ...
+ LOCK IN SHARE MODE</literal>.
+ </para>
+ </listitem>
+
+ </itemizedlist>
+
+ <para>
+ In MySQL ¤t-series;, if the <literal>READ
+ COMMITTED</literal> isolation level is used or the
+ <literal>innodb_locks_unsafe_for_binlog</literal> system
+ variable is enabled, there is no <literal>InnoDB</literal> gap
+ locking except in constraint checking. Also, record locks for
+ non-matching rows are released after MySQL has evaluated the
+ <literal>WHERE</literal> condition.
+ </para>
+
+ </section>
+
+ <section id="innodb-consistent-read">
+
+ <title>Consistent Non-Locking Read</title>
+
+ <para>
+ A consistent read means that <literal>InnoDB</literal> uses
+ multi-versioning to present to a query a snapshot of the
+ database at a point in time. The query sees the changes made by
+ those transactions that committed before that point of time, and
+ no changes made by later or uncommitted transactions. The
+ exception to this rule is that the query sees the changes made
+ by earlier statements within the same transaction. Note that the
+ exception to the rule causes the following anomaly: if you
+ update some rows in a table, a <literal>SELECT</literal> will
+ see the latest version of the updated rows, but it might also
+ see older versions of any rows. If other users simultaneously
+ update the same table, the anomaly means that you may see the
+ table in a state that never existed in the database.
+ </para>
+
+ <para>
+ If you are running with the default <literal>REPEATABLE
+ READ</literal> isolation level, all consistent reads within the
+ same transaction read the snapshot established by the first such
+ read in that transaction. You can get a fresher snapshot for
+ your queries by committing the current transaction and after
+ that issuing new queries.
+ </para>
+
+ <para>
+ Consistent read is the default mode in which
+ <literal>InnoDB</literal> processes <literal>SELECT</literal>
+ statements in <literal>READ COMMITTED</literal> and
+ <literal>REPEATABLE READ</literal> isolation levels. A
+ consistent read does not set any locks on the tables it
+ accesses, and therefore other users are free to modify those
+ tables at the same time a consistent read is being performed on
+ the table.
+ </para>
+
+ <para>
+ Note that consistent read does not work over <literal>DROP
+ TABLE</literal> and over <literal>ALTER TABLE</literal>.
+ Consistent read does not work over <literal>DROP TABLE</literal>
+ because MySQL can't use a table that has been dropped and
+ <literal>InnoDB</literal> destroys the table. Consistent read
+ does not work over <literal>ALTER TABLE</literal> because
+ <literal>ALTER TABLE</literal> works by making a temporary copy
+ of the original table and deleting the original table when the
+ temporary copy is built. When you reissue a consistent read
+ within a transaction, rows in the new table are not visible
+ because those rows did not exist when the transaction's snapshot
+ was taken.
+ </para>
+
+ <para>
+ <literal>InnoDB</literal> uses a consistent read for select in
+ clauses like <literal>INSERT INTO ... SELECT</literal> and
+ <literal>UPDATE ... (SELECT)</literal> that do not specify
+ <literal>FOR UPDATE</literal> or <literal>IN SHARE
+ MODE</literal> if the
+ <literal>innodb_locks_unsafe_for_binlog</literal> option is set
+ and the isolation level of the transaction is not set to
+ serializable. Thus no locks are set to rows read from selected
+ table. Otherwise, <literal>InnoDB</literal> uses stronger locks
+ and the <literal>SELECT</literal> part acts like <literal>READ
+ COMMITTED</literal>, where each consistent read, even within the
+ same transaction, sets and reads its own fresh snapshot.
+ </para>
+
+ </section>
+
+ <section id="innodb-locking-reads">
+
+ <title><literal>SELECT ... FOR UPDATE</literal> and <literal>SELECT ... LOCK IN
+ SHARE MODE</literal> Locking Reads</title>
+
+ <para>
+ In some circumstances, a consistent read is not convenient. For
+ example, you might want to add a new row into your table
+ <literal>child</literal>, and make sure that the child has a
+ parent in table <literal>parent</literal>. The following example
+ shows how to implement referential integrity in your application
+ code.
+ </para>
+
+ <para>
+ Suppose that you use a consistent read to read the table
+ <literal>parent</literal> and indeed see the parent of the child
+ in the table. Can you safely add the child row to table
+ <literal>child</literal>? No, because it may happen that
+ meanwhile some other user deletes the parent row from the table
+ <literal>parent</literal> without you being aware of it.
+ </para>
+
+ <para>
+ The solution is to perform the <literal>SELECT</literal> in a
+ locking mode using <literal>LOCK IN SHARE MODE</literal>:
+ </para>
+
+<programlisting>
+SELECT * FROM parent WHERE NAME = 'Jones' LOCK IN SHARE MODE;
+</programlisting>
+
+ <para>
+ Performing a read in share mode means that we read the latest
+ available data, and set a shared mode lock on the rows we read.
+ A shared mode lock prevents others from updating or deleting the
+ row we have read. Also, if the latest data belongs to a yet
+ uncommitted transaction of another client connection, we wait
+ until that transaction commits. After we see that the preceding
+ query returns the parent <literal>'Jones'</literal>, we can
+ safely add the child record to the <literal>child</literal>
+ table and commit our transaction.
+ </para>
+
+ <para>
+ Let us look at another example: We have an integer counter field
+ in a table <literal>child_codes</literal> that we use to assign
+ a unique identifier to each child added to table
+ <literal>child</literal>. Obviously, using a consistent read or
+ a shared mode read to read the present value of the counter is
+ not a good idea because two users of the database may then see
+ the same value for the counter, and a duplicate-key error occurs
+ if two users attempt to add children with the same identifier to
+ the table.
+ </para>
+
+ <para>
+ Here, <literal>LOCK IN SHARE MODE</literal> is not a good
+ solution because if two users read the counter at the same time,
+ at least one of them ends up in deadlock when attempting to
+ update the counter.
+ </para>
+
+ <para>
+ In this case, there are two good ways to implement the reading
+ and incrementing of the counter: (1) update the counter first by
+ incrementing it by 1 and only after that read it, or (2) read
+ the counter first with a lock mode <literal>FOR
+ UPDATE</literal>, and increment after that. The latter approach
+ can be implemented as follows:
+ </para>
+
+<programlisting>
+SELECT counter_field FROM child_codes FOR UPDATE;
+UPDATE child_codes SET counter_field = counter_field + 1;
+</programlisting>
+
+ <para>
+ A <literal>SELECT ... FOR UPDATE</literal> reads the latest
+ available data, setting exclusive locks on each row it reads.
+ Thus, it sets the same locks a searched SQL
+ <literal>UPDATE</literal> would set on the rows.
+ </para>
+
+ <para>
+ The preceding description is merely an example of how
+ <literal>SELECT ... FOR UPDATE</literal> works. In MySQL, the
+ specific task of generating a unique identifier actually can be
+ accomplished using only a single access to the table:
+ </para>
+
+<programlisting>
+UPDATE child_codes SET counter_field = LAST_INSERT_ID(counter_field + 1);
+SELECT LAST_INSERT_ID();
+</programlisting>
+
+ <para>
+ The <literal>SELECT</literal> statement merely retrieves the
+ identifier information (specific to the current connection). It
+ does not access any table.
+ </para>
+
+ <para>
+ Locks set by <literal>IN SHARE MODE</literal> and <literal>FOR
+ UPDATE</literal> reads are released when the transaction is
+ committed or rolled back.
+ </para>
+
+ <note>
+ <para>
+ Locking of rows for update using <literal>SELECT FOR
+ UPDATE</literal> only applies when
+ <literal>AUTOCOMMIT</literal> is switched off. If
+ <literal>AUTOCOMMIT</literal> is on, then the rows matching
+ the specification are not locked.
+ </para>
+ </note>
+
+ </section>
+
+ <section id="innodb-next-key-locking">
+
+ <title>Next-Key Locking: Avoiding the Phantom Problem</title>
+
+ <indexterm>
+ <primary>next-key lock</primary>
+ </indexterm>
+
+ <indexterm>
+ <primary>gap lock</primary>
+ </indexterm>
+
+ <para>
+ In row-level locking, <literal>InnoDB</literal> uses an
+ algorithm called <emphasis>next-key locking</emphasis>.
+ <literal>InnoDB</literal> performs the row-level locking in such
+ a way that when it searches or scans an index of a table, it
+ sets shared or exclusive locks on the index records it
+ encounters. Thus, the row-level locks are actually index record
+ locks.
+ </para>
+
+ <para>
+ The next-key locks that <literal>InnoDB</literal> sets on index
+ records also affect the <quote>gap</quote> before that index
+ record. If a user has a shared or exclusive lock on record
+ <literal>R</literal> in an index, another user cannot insert a
+ new index record immediately before <literal>R</literal> in the
+ index order. (A gap lock refers to a lock that only locks a gap
+ before some index record.)
+ </para>
+
+ <para>
+ This next-key locking of gaps is done to prevent the so-called
+ <quote>phantom problem.</quote> Suppose that you want to read
+ and lock all children from the <literal>child</literal> table
+ having an identifier value greater than 100, with the intention
+ of updating some column in the selected rows later:
+ </para>
+
+<programlisting>
+SELECT * FROM child WHERE id > 100 FOR UPDATE;
+</programlisting>
+
+ <para>
+ Suppose that there is an index on the <literal>id</literal>
+ column. The query scans that index starting from the first
+ record where <literal>id</literal> is bigger than 100. If the
+ locks set on the index records would not lock out inserts made
+ in the gaps, a new row might meanwhile be inserted to the table.
+ If you execute the same <literal>SELECT</literal> within the
+ same transaction, you would see a new row in the result set
+ returned by the query. This is contrary to the isolation
+ principle of transactions: A transaction should be able to run
+ so that the data it has read does not change during the
+ transaction. If we regard a set of rows as a data item, the new
+ <quote>phantom</quote> child would violate this isolation
+ principle.
+ </para>
+
+ <para>
+ When <literal>InnoDB</literal> scans an index, it can also lock
+ the gap after the last record in the index. Just that happens in
+ the previous example: The locks set by <literal>InnoDB</literal>
+ prevent any insert to the table where <literal>id</literal>
+ would be bigger than 100.
+ </para>
+
+ <para>
+ You can use next-key locking to implement a uniqueness check in
+ your application: If you read your data in share mode and do not
+ see a duplicate for a row you are going to insert, then you can
+ safely insert your row and know that the next-key lock set on
+ the successor of your row during the read prevents anyone
+ meanwhile inserting a duplicate for your row. Thus, the next-key
+ locking allows you to <quote>lock</quote> the non-existence of
+ something in your table.
+ </para>
+
+ <para>
+ In MySQL ¤t-series;, if the <literal>READ
+ COMMITTED</literal> isolation level is used or the
+ <literal>innodb_locks_unsafe_for_binlog</literal> system
+ variable is enabled, there is no <literal>InnoDB</literal> gap
+ locking except in constraint checking. Also, record locks for
+ non-matching rows are released after MySQL has evaluated the
+ <literal>WHERE</literal> condition.
+ </para>
+
+ </section>
+
+ <section id="innodb-consistent-read-example">
+
+ <title>An Example of Consistent Read in <literal>InnoDB</literal></title>
+
+ <para>
+ Suppose that you are running in the default <literal>REPEATABLE
+ READ</literal> isolation level. When you issue a consistent read
+ (that is, an ordinary <literal>SELECT</literal> statement),
+ <literal>InnoDB</literal> gives your transaction a timepoint
+ according to which your query sees the database. If another
+ transaction deletes a row and commits after your timepoint was
+ assigned, you do not see the row as having been deleted. Inserts
+ and updates are treated similarly.
+ </para>
+
+ <para>
+ You can advance your timepoint by committing your transaction
+ and then doing another <literal>SELECT</literal>.
+ </para>
+
+ <para>
+ This is called <firstterm>multi-versioned concurrency
+ control</firstterm>.
+ </para>
+
+ <remark role="todo">
+ More ASCII art that needs to be turned into a graphic.
+ </remark>
+
+<programlisting>
+ User A User B
+
+ SET AUTOCOMMIT=0; SET AUTOCOMMIT=0;
+time
+| SELECT * FROM t;
+| empty set
+| INSERT INTO t VALUES (1, 2);
+|
+v SELECT * FROM t;
+ empty set
+ COMMIT;
+
+ SELECT * FROM t;
+ empty set
+
+ COMMIT;
+
+ SELECT * FROM t;
+ ---------------------
+ | 1 | 2 |
+ ---------------------
+ 1 row in set
+</programlisting>
+
+ <para>
+ In this example, user A sees the row inserted by B only when B
+ has committed the insert and A has committed as well, so that
+ the timepoint is advanced past the commit of B.
+ </para>
+
+ <para>
+ If you want to see the <quote>freshest</quote> state of the
+ database, you should use either the <literal>READ
+ COMMITTED</literal> isolation level or a locking read:
+ </para>
+
+<programlisting>
+SELECT * FROM t LOCK IN SHARE MODE;
+</programlisting>
+
+ </section>
+
+ <section id="innodb-locks-set">
+
+ <title>Locks Set by Different SQL Statements in <literal>InnoDB</literal></title>
+
+ <para>
+ A locking read, an <literal>UPDATE</literal>, or a
+ <literal>DELETE</literal> generally set record locks on every
+ index record that is scanned in the processing of the SQL
+ statement. It does not matter if there are
+ <literal>WHERE</literal> conditions in the statement that would
+ exclude the row. <literal>InnoDB</literal> does not remember the
+ exact <literal>WHERE</literal> condition, but only knows which
+ index ranges were scanned. The record locks are normally
+ next-key locks that also block inserts to the <quote>gap</quote>
+ immediately before the record.
+ </para>
+
+ <para>
+ If the locks to be set are exclusive, <literal>InnoDB</literal>
+ always retrieves also the clustered index record and sets a lock
+ on it.
+ </para>
+
+ <para>
+ If you do not have indexes suitable for your statement and MySQL
+ has to scan the whole table to process the statement, every row
+ of the table becomes locked, which in turn blocks all inserts by
+ other users to the table. It is important to create good indexes
+ so that your queries do not unnecessarily need to scan many
+ rows.
+ </para>
+
+ <para>
+ <literal>InnoDB</literal> sets specific types of locks as
+ follows:
+ </para>
+
+ <itemizedlist>
+
+ <listitem>
+ <para>
+ <literal>SELECT ... FROM</literal> is a consistent read,
+ reading a snapshot of the database and setting no locks
+ unless the transaction isolation level is set to
+ <literal>SERIALIZABLE</literal>. For
+ <literal>SERIALIZABLE</literal> level, this sets shared
+ next-key locks on the index records it encounters.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <literal>SELECT ... FROM ... LOCK IN SHARE MODE</literal>
+ sets shared next-key locks on all index records the read
+ encounters.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <literal>SELECT ... FROM ... FOR UPDATE</literal> sets
+ exclusive next-key locks on all index records the read
+ encounters and also on the corresponding clustered index
+ records if a secondary index is used in the search.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <literal>UPDATE ... WHERE ...</literal> sets an exclusive
+ next-key lock on every record the search encounters.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <literal>DELETE FROM ... WHERE ...</literal> sets an
+ exclusive next-key lock on every record the search
+ encounters.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <literal>INSERT INTO ... VALUES (...)</literal> sets an
+ exclusive lock on the inserted row. Note that this lock is
+ not a next-key lock and does not prevent other users from
+ inserting to the gap before the inserted row. If a
+ duplicate-key error occurs, a shared lock on the duplicate
+ index record is set.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <literal>REPLACE</literal> is done like an
+ <literal>INSERT</literal> if there is no collision on a
+ unique key. Otherwise, an exclusive next-key lock is placed
+ on the row that has to be updated.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ While initializing a previously specified
+ <literal>AUTO_INCREMENT</literal> column on a table,
+ <literal>InnoDB</literal> sets an exclusive lock on the end
+ of the index associated with the
+ <literal>AUTO_INCREMENT</literal> column. In accessing the
+ auto-increment counter, <literal>InnoDB</literal> uses a
+ specific table lock mode <literal>AUTO-INC</literal> where
+ the lock lasts only to the end of the current SQL statement,
+ not to the end of the entire transaction. Note that other
+ clients cannot insert into the table while the
+ <literal>AUTO-INC</literal> table lock is held; see
+ <xref linkend="innodb-and-autocommit"/>.
+ </para>
+
+ <para>
+ <literal>InnoDB</literal> fetches the value of a previously
+ initialized <literal>AUTO_INCREMENT</literal> column without
+ setting any locks.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <literal>INSERT INTO T SELECT ... FROM S WHERE ...</literal>
+ sets an exclusive (non-next-key) lock on each row inserted
+ into <literal>T</literal>. <literal>InnoDB</literal> sets
+ shared next-key locks on <literal>S</literal>, unless
+ <literal>innodb_locks_unsafe_for_binlog</literal> is
+ enabled, in which case it does the search on
+ <literal>S</literal> as a consistent read.
+ <literal>InnoDB</literal> has to set locks in the latter
+ case: In roll-forward recovery from a backup, every SQL
+ statement has to be executed in exactly the same way it was
+ done originally.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <literal>CREATE TABLE ... SELECT ...</literal> performs the
+ <literal>SELECT</literal> as a consistent read or with
+ shared locks, as in the previous item.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ If a <literal>FOREIGN KEY</literal> constraint is defined on
+ a table, any insert, update, or delete that requires the
+ constraint condition to be checked sets shared record-level
+ locks on the records that it looks at to check the
+ constraint. <literal>InnoDB</literal> also sets these locks
+ in the case where the constraint fails.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <literal>LOCK TABLES</literal> sets table locks, but it is
+ the higher MySQL layer above the <literal>InnoDB</literal>
+ layer that sets these locks. <literal>InnoDB</literal> is
+ aware of table locks if
+ <literal>innodb_table_locks=1</literal> (the default) and
+ <literal>AUTOCOMMIT=0</literal>, and the MySQL layer above
+ <literal>InnoDB</literal> knows about row-level locks.
+ Otherwise, <literal>InnoDB</literal>'s automatic deadlock
+ detection cannot detect deadlocks where such table locks are
+ involved. Also, because the higher MySQL layer does not know
+ about row-level locks, it is possible to get a table lock on
+ a table where another user currently has row-level locks.
+ However, this does not endanger transaction integrity, as
+ discussed in <xref linkend="innodb-deadlock-detection"/>.
+ See also <xref linkend="innodb-restrictions"/>.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ In MySQL ¤t-series;, if the <literal>READ
+ COMMITTED</literal> isolation level is used or the
+ <literal>innodb_locks_unsafe_for_binlog</literal> system
+ variable is enabled, there is no <literal>InnoDB</literal>
+ gap locking except in constraint checking. Also, record
+ locks for non-matching rows are released after MySQL has
+ evaluated the <literal>WHERE</literal> condition.
+ </para>
+ </listitem>
+
+ </itemizedlist>
+
+ </section>
+
+ <section id="innodb-implicit-commit">
+
+ <title>Implicit Transaction Commit and Rollback</title>
+
+ <para>
+ By default, MySQL begins each client connection with autocommit
+ mode enabled. When autocommit is enabled, MySQL does a commit
+ after each SQL statement if that statement did not return an
+ error. If an SQL statement returns an error, the commit or
+ rollback behavior depends on the error. See
+ <xref linkend="innodb-error-handling"/>.
+ </para>
+
+ <para>
+ If you have the autocommit mode off and close a connection
+ without explicitly committing the final transaction, MySQL rolls
+ back that transaction.
+ </para>
+
+ <para>
+ For details about which statements implicitly end a transaction,
+ as if you had done a <literal>COMMIT</literal> before executing
+ the statement, see <xref linkend="implicit-commit"/>.
+ </para>
+
+ </section>
+
+ <section id="innodb-deadlock-detection">
+
+ <title>Deadlock Detection and Rollback</title>
+
+ <para>
+ <literal>InnoDB</literal> automatically detects a deadlock of
+ transactions and rolls back a transaction or transactions to
+ break the deadlock. <literal>InnoDB</literal> tries to pick
+ small transactions to roll back, where the size of a transaction
+ is determined by the number of rows inserted, updated, or
+ deleted.
+ </para>
+
+ <para>
+ <literal>InnoDB</literal> is aware of table locks if
+ <literal>innodb_table_locks=1</literal> (the default) and
+ <literal>AUTOCOMMIT=0</literal>, and the MySQL layer above it
+ knows about row-level locks. Otherwise,
+ <literal>InnoDB</literal> cannot detect deadlocks where a table
+ lock set by a MySQL <literal>LOCK TABLES</literal> statement or
+ a lock set by a storage engine other than
+ <literal>InnoDB</literal> is involved. You must resolve these
+ situations by setting the value of the
+ <literal>innodb_lock_wait_timeout</literal> system variable.
+ </para>
+
+ <para>
+ When <literal>InnoDB</literal> performs a complete rollback of a
+ transaction, all locks set by the transaction are released.
+ However, if just a single SQL statement is rolled back as a
+ result of an error, some of the locks set by the statement may
+ be preserved. This happens because <literal>InnoDB</literal>
+ stores row locks in a format such that it cannot know afterward
+ which lock was set by which statement.
+ </para>
+
+ </section>
+
+ <section id="innodb-deadlocks">
+
+ <title>How to Cope with Deadlocks</title>
+
+ <para>
+ Deadlocks are a classic problem in transactional databases, but
+ they are not dangerous unless they are so frequent that you
+ cannot run certain transactions at all. Normally, you must write
+ your applications so that they are always prepared to re-issue a
+ transaction if it gets rolled back because of a deadlock.
+ </para>
+
+ <para>
+ <literal>InnoDB</literal> uses automatic row-level locking. You
+ can get deadlocks even in the case of transactions that just
+ insert or delete a single row. That is because these operations
+ are not really <quote>atomic</quote>; they automatically set
+ locks on the (possibly several) index records of the row
+ inserted or deleted.
+ </para>
+
+ <para>
+ You can cope with deadlocks and reduce the likelihood of their
+ occurrence with the following techniques:
+ </para>
+
+ <itemizedlist>
+
+ <listitem>
+ <para>
+ Use <literal>SHOW ENGINE INNODB STATUS</literal> to
+ determine the cause of the latest deadlock. That can help
+ you to tune your application to avoid deadlocks.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Always be prepared to re-issue a transaction if it fails due
+ to deadlock. Deadlocks are not dangerous. Just try again.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Commit your transactions often. Small transactions are less
+ prone to collision.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ If you are using locking reads (<literal>SELECT ... FOR
+ UPDATE</literal> or <literal>... LOCK IN SHARE
+ MODE</literal>), try using a lower isolation level such as
+ <literal>READ COMMITTED</literal>.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Access your tables and rows in a fixed order. Then
+ transactions form well-defined queues and do not deadlock.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Add well-chosen indexes to your tables. Then your queries
+ need to scan fewer index records and consequently set fewer
+ locks. Use <literal>EXPLAIN SELECT</literal> to determine
+ which indexes the MySQL server regards as the most
+ appropriate for your queries.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Use less locking. If you can afford to allow a
+ <literal>SELECT</literal> to return data from an old
+ snapshot, do not add the clause <literal>FOR
+ UPDATE</literal> or <literal>LOCK IN SHARE MODE</literal> to
+ it. Using the <literal>READ COMMITTED</literal> isolation
+ level is good here, because each consistent read within the
+ same transaction reads from its own fresh snapshot.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ If nothing else helps, serialize your transactions with
+ table-level locks. The correct way to use <literal>LOCK
+ TABLES</literal> with transactional tables, such as
+ <literal>InnoDB</literal> tables, is to set
+ <literal>AUTOCOMMIT = 0</literal> and not to call
+ <literal>UNLOCK TABLES</literal> until after you commit the
+ transaction explicitly. For example, if you need to write to
+ table <literal>t1</literal> and read from table
+ <literal>t2</literal>, you can do this:
+ </para>
+
+<programlisting>
+SET AUTOCOMMIT=0;
+LOCK TABLES t1 WRITE, t2 READ, ...;
+<replaceable>... do something with tables t1 and t2 here ...</replaceable>
+COMMIT;
+UNLOCK TABLES;
+</programlisting>
+
+ <para>
+ Table-level locks make your transactions queue nicely, and
+ deadlocks are avoided.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Another way to serialize transactions is to create an
+ auxiliary <quote>semaphore</quote> table that contains just
+ a single row. Have each transaction update that row before
+ accessing other tables. In that way, all transactions happen
+ in a serial fashion. Note that the <literal>InnoDB</literal>
+ instant deadlock detection algorithm also works in this
+ case, because the serializing lock is a row-level lock. With
+ MySQL table-level locks, the timeout method must be used to
+ resolve deadlocks.
+ </para>
+ </listitem>
+
+ </itemizedlist>
+
+ </section>
+
+ </section>
+
+ <section id="innodb-tuning">
+
+ <title><literal>InnoDB</literal> Performance Tuning Tips</title>
+
+ <indexterm>
+ <primary>Solaris x86_64 issues</primary>
+ </indexterm>
+
+ <itemizedlist>
+
+ <listitem>
+ <para>
+ In <literal>InnoDB</literal>, having a long <literal>PRIMARY
+ KEY</literal> wastes a lot of disk space because its value
+ must be stored with every secondary index record. (See
+ <xref linkend="innodb-table-and-index"/>.) Create an
+ <literal>AUTO_INCREMENT</literal> column as the primary key if
+ your primary key is long.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ If the Unix <filename>top</filename> tool or the Windows Task
+ Manager shows that the CPU usage percentage with your workload
+ is less than 70%, your workload is probably disk-bound. Maybe
+ you are making too many transaction commits, or the buffer
+ pool is too small. Making the buffer pool bigger can help, but
+ do not set it equal to more than 80% of physical memory.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Wrap several modifications into one transaction.
+ <literal>InnoDB</literal> must flush the log to disk at each
+ transaction commit if that transaction made modifications to
+ the database. The rotation speed of a disk is typically at
+ most 167 revolutions/second, which constrains the number of
+ commits to the same 167<superscript>th</superscript> of a
+ second if the disk does not <quote>fool</quote> the operating
+ system.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ If you can afford the loss of some of the latest committed
+ transactions if a crash occurs, you can set the
+ <literal>innodb_flush_log_at_trx_commit</literal> parameter to
+ 0. <literal>InnoDB</literal> tries to flush the log once per
+ second anyway, although the flush is not guaranteed. You
+ should also set the value of
+ <literal>innodb_support_xa</literal> to 0 which will reduce
+ the number of disk flushes due to synchronizing on disk data
+ and the binary log.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Make your log files big, even as big as the buffer pool. When
+ <literal>InnoDB</literal> has written the log files full, it
+ has to write the modified contents of the buffer pool to disk
+ in a checkpoint. Small log files cause many unnecessary disk
+ writes. The drawback of big log files is that the recovery
+ time is longer.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Make the log buffer quite large as well (on the order of 8MB).
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Use the <literal>VARCHAR</literal> data type instead of
+ <literal>CHAR</literal> if you are storing variable-length
+ strings or if the column may contain many
+ <literal>NULL</literal> values. A
+ <literal>CHAR(<replaceable>N</replaceable>)</literal> column
+ always takes <replaceable>N</replaceable> characters to store
+ data, even if the string is shorter or its value is
+ <literal>NULL</literal>. Smaller tables fit better in the
+ buffer pool and reduce disk I/O.
+ </para>
+
+ <para>
+ When using <literal>row_format=compact</literal> (the default
+ <literal>InnoDB</literal> record format in MySQL
+ ¤t-series;) and variable-length character sets, such as
+ <literal>utf8</literal> or <literal>sjis</literal>,
+ <literal>CHAR(<replaceable>N</replaceable>)</literal> will
+ occupy a variable amount of space, at least
+ <replaceable>N</replaceable> bytes.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ In some versions of GNU/Linux and Unix, flushing files to disk
+ with the Unix <literal>fsync()</literal> call (which
+ <literal>InnoDB</literal> uses by default) and other similar
+ methods is surprisingly slow. If you are dissatisfied with
+ database write performance, you might try setting the
+ <literal>innodb_flush_method</literal> parameter to
+ <literal>O_DSYNC</literal>. Although
+ <literal>O_DSYNC</literal> seems to be slower on most systems,
+ yours might not be one of them.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ When using the <literal>InnoDB</literal> storage engine on
+ Solaris 10 for x86_64 architecture (AMD Opteron), it is
+ important to mount any filesystems used for storing
+ <literal>InnoDB</literal>-related files using the
+ <literal>forcedirectio</literal> option. (The default on
+ Solaris 10/x86_64 is <emphasis>not</emphasis> to use this
+ option.) Failure to use <literal>forcedirectio</literal>
+ causes a serious degradation of <literal>InnoDB</literal>'s
+ speed and performance on this platform.
+ </para>
+
+ <para>
+ When using the <literal>InnoDB</literal> storage engine with a
+ large <literal>innodb_buffer_pool_size</literal> value on any
+ release of Solaris 2.6 and up and any platform
+ (sparc/x86/x64/amd64), a significant performance gain can be
+ achieved by placing <literal>InnoDB</literal> data files and
+ log files on raw devices or on a separate direct I/O UFS
+ filesystem (using mount option
+ <literal>forcedirectio</literal>; see
+ <literal>mount_ufs(1M)</literal>). Users of the Veritas
+ filesystem VxFS should use the mount option
+ <literal>convosync=direct</literal>.
+ </para>
+
+ <para>
+ Other MySQL data files, such as those for
+ <literal>MyISAM</literal> tables, should not be placed on a
+ direct I/O filesystem. Executables or libraries <emphasis>must
+ not</emphasis> be placed on a direct I/O filesystem.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ When importing data into <literal>InnoDB</literal>, make sure
+ that MySQL does not have autocommit mode enabled because that
+ requires a log flush to disk for every insert. To disable
+ autocommit during your import operation, surround it with
+ <literal>SET AUTOCOMMIT</literal> and
+ <literal>COMMIT</literal> statements:
+ </para>
+
+<programlisting>
+SET AUTOCOMMIT=0;
+<replaceable>... SQL import statements ...</replaceable>
+COMMIT;
+</programlisting>
+
+ <para>
+ If you use the <command>mysqldump</command> option
+ <option>--opt</option>, you get dump files that are fast to
+ import into an <literal>InnoDB</literal> table, even without
+ wrapping them with the <literal>SET AUTOCOMMIT</literal> and
+ <literal>COMMIT</literal> statements.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Beware of big rollbacks of mass inserts:
+ <literal>InnoDB</literal> uses the insert buffer to save disk
+ I/O in inserts, but no such mechanism is used in a
+ corresponding rollback. A disk-bound rollback can take 30
+ times as long to perform as the corresponding insert. Killing
+ the database process does not help because the rollback starts
+ again on server startup. The only way to get rid of a runaway
+ rollback is to increase the buffer pool so that the rollback
+ becomes CPU-bound and runs fast, or to use a special
+ procedure. See <xref linkend="forcing-recovery"/>.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Beware also of other big disk-bound operations. Use
+ <literal>DROP TABLE</literal> and <literal>CREATE
+ TABLE</literal> to empty a table, not <literal>DELETE FROM
+ <replaceable>tbl_name</replaceable></literal>.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Use the multiple-row <literal>INSERT</literal> syntax to
+ reduce communication overhead between the client and the
+ server if you need to insert many rows:
+ </para>
+
+<programlisting>
+INSERT INTO yourtable VALUES (1,2), (5,5), ...;
+</programlisting>
+
+ <para>
+ This tip is valid for inserts into any table, not just
+ <literal>InnoDB</literal> tables.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ If you have <literal>UNIQUE</literal> constraints on secondary
+ keys, you can speed up table imports by temporarily turning
+ off the uniqueness checks during the import session:
+ </para>
+
+<programlisting>
+SET UNIQUE_CHECKS=0;
+<replaceable>... import operation ...</replaceable>
+SET UNIQUE_CHECKS=1;
+</programlisting>
+
+ <para>
+ For big tables, this saves a lot of disk I/O because
+ <literal>InnoDB</literal> can use its insert buffer to write
+ secondary index records in a batch. Be certain that the data
+ contains no duplicate keys. <literal>UNIQUE_CHECKS</literal>
+ allows but does not require storage engines to ignore
+ duplicate keys.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ If you have <literal>FOREIGN KEY</literal> constraints in your
+ tables, you can speed up table imports by turning the foreign
+ key checks off for the duration of the import session:
+ </para>
+
+<programlisting>
+SET FOREIGN_KEY_CHECKS=0;
+<replaceable>... import operation ...</replaceable>
+SET FOREIGN_KEY_CHECKS=1;
+</programlisting>
+
+ <para>
+ For big tables, this can save a lot of disk I/O.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ If you often have recurring queries for tables that are not
+ updated frequently, use the query cache:
+ </para>
+
+<programlisting>
+[mysqld]
+query_cache_type = ON
+query_cache_size = 10M
+</programlisting>
+ </listitem>
+
+ <listitem>
+ <para>
+ Unlike <literal>MyISAM</literal>, <literal>InnoDB</literal>
+ does not store an index cardinality value in its tables.
+ Instead, <literal>InnoDB</literal> computes a cardinality for
+ a table the first time it accesses it after startup. With a
+ large number of tables, this might take significant time. It
+ is the initial table open operation that is important, so to
+ <quote>warm up</quote> a table for later use, you might want
+ to use it immediately after start up by issuing a statement
+ such as <literal>SELECT 1 FROM
+ <replaceable>tbl_name</replaceable> LIMIT 1</literal>.
+ </para>
+ </listitem>
+
+ </itemizedlist>
+
+ <formalpara role="mnmas">
+
+ <title>MySQL Enterprise</title>
+
+ <para>
+ For optimization recommendations geared to your specific
+ circumstances subscribe to the MySQL Enterprise Monitor. For
+ more information see
+ <ulink url="&base-url-enterprise;advisors.html"/>.
+ </para>
+
+ </formalpara>
+
+ <section id="innodb-monitor">
+
+ <title><literal>SHOW ENGINE INNODB STATUS</literal> and the
+ <literal>InnoDB</literal> Monitors</title>
+
+ <para>
+ <literal>InnoDB</literal> includes <literal>InnoDB</literal>
+ Monitors that print information about the
+ <literal>InnoDB</literal> internal state. You can use the
+ <literal>SHOW ENGINE INNODB STATUS</literal> SQL statement at
+ any time to fetch the output of the standard
+ <literal>InnoDB</literal> Monitor to your SQL client. This
+ information is useful in performance tuning. (If you are using
+ the <command>mysql</command> interactive SQL client, the output
+ is more readable if you replace the usual semicolon statement
+ terminator with <literal>\G</literal>.) For a discussion of
+ <literal>InnoDB</literal> lock modes, see
+ <xref linkend="innodb-lock-modes"/>.
+ </para>
+
+<programlisting>
+mysql> <userinput>SHOW ENGINE INNODB STATUS\G</userinput>
+</programlisting>
+
+ <para>
+ Another way to use <literal>InnoDB</literal> Monitors is to let
+ them periodically write data to the standard output of the
+ <command>mysqld</command> server. In this case, no output is
+ sent to clients. When switched on, <literal>InnoDB</literal>
+ Monitors print data about every 15 seconds. Server output
+ usually is directed to the <filename>.err</filename> log in the
+ MySQL data directory. This data is useful in performance tuning.
+ On Windows, you must start the server from a command prompt in a
+ console window with the <option>--console</option> option if you
+ want to direct the output to the window rather than to the error
+ log.
+ </para>
+
+ <para>
+ Monitor output includes the following types of information:
+ </para>
+
+ <itemizedlist>
+
+ <listitem>
+ <para>
+ Table and record locks held by each active transaction
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Lock waits of a transactions
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Semaphore waits of threads
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Pending file I/O requests
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Buffer pool statistics
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Purge and insert buffer merge activity of the main
+ <literal>InnoDB</literal> thread
+ </para>
+ </listitem>
+
+ </itemizedlist>
+
+ <para>
+ To cause the standard <literal>InnoDB</literal> Monitor to write
+ to the standard output of <command>mysqld</command>, use the
+ following SQL statement:
+ </para>
+
+<programlisting>
+CREATE TABLE innodb_monitor (a INT) ENGINE=INNODB;
+</programlisting>
+
+ <para>
+ The monitor can be stopped by issuing the following statement:
+ </para>
+
+<programlisting>
+DROP TABLE innodb_monitor;
+</programlisting>
+
+ <para>
+ The <literal>CREATE TABLE</literal> syntax is just a way to pass
+ a command to the <literal>InnoDB</literal> engine through
+ MySQL's SQL parser: The only things that matter are the table
+ name <literal>innodb_monitor</literal> and that it be an
+ <literal>InnoDB</literal> table. The structure of the table is
+ not relevant at all for the <literal>InnoDB</literal> Monitor.
+ If you shut down the server, the monitor does not restart
+ automatically when you restart the server. You must drop the
+ monitor table and issue a new <literal>CREATE TABLE</literal>
+ statement to start the monitor. (This syntax may change in a
+ future release.)
+ </para>
+
+ <para>
+ You can use <literal>innodb_lock_monitor</literal> in a similar
+ fashion. This is the same as <literal>innodb_monitor</literal>,
+ except that it also provides a great deal of lock information. A
+ separate <literal>innodb_tablespace_monitor</literal> prints a
+ list of created file segments existing in the tablespace and
+ validates the tablespace allocation data structures. In
+ addition, there is <literal>innodb_table_monitor</literal> with
+ which you can print the contents of the
+ <literal>InnoDB</literal> internal data dictionary.
+ </para>
+
+ <para>
+ A sample of <literal>InnoDB</literal> Monitor output:
+ </para>
+
+<programlisting>
+mysql> <userinput>SHOW ENGINE INNODB STATUS\G</userinput>
+*************************** 1. row ***************************
+Status:
+=====================================
+030709 13:00:59 INNODB MONITOR OUTPUT
+=====================================
+Per second averages calculated from the last 18 seconds
+----------
+SEMAPHORES
+----------
+OS WAIT ARRAY INFO: reservation count 413452, signal count 378357
+--Thread 32782 has waited at btr0sea.c line 1477 for 0.00 seconds the
+semaphore: X-lock on RW-latch at 41a28668 created in file btr0sea.c line 135
+a writer (thread id 32782) has reserved it in mode wait exclusive
+number of readers 1, waiters flag 1
+Last time read locked in file btr0sea.c line 731
+Last time write locked in file btr0sea.c line 1347
+Mutex spin waits 0, rounds 0, OS waits 0
+RW-shared spins 108462, OS waits 37964; RW-excl spins 681824, OS waits
+375485
+------------------------
+LATEST FOREIGN KEY ERROR
+------------------------
+030709 13:00:59 Transaction:
+TRANSACTION 0 290328284, ACTIVE 0 sec, process no 3195, OS thread id 34831
+inserting
+15 lock struct(s), heap size 2496, undo log entries 9
+MySQL thread id 25, query id 4668733 localhost heikki update
+insert into ibtest11a (D, B, C) values (5, 'khDk' ,'khDk')
+Foreign key constraint fails for table test/ibtest11a:
+,
+ CONSTRAINT `0_219242` FOREIGN KEY (`A`, `D`) REFERENCES `ibtest11b` (`A`,
+ `D`) ON DELETE CASCADE ON UPDATE CASCADE
+Trying to add in child table, in index PRIMARY tuple:
+ 0: len 4; hex 80000101; asc ....;; 1: len 4; hex 80000005; asc ....;; 2:
+ len 4; hex 6b68446b; asc khDk;; 3: len 6; hex 0000114e0edc; asc ...N..;; 4:
+ len 7; hex 00000000c3e0a7; asc .......;; 5: len 4; hex 6b68446b; asc khDk;;
+But in parent table test/ibtest11b, in index PRIMARY,
+the closest match we can find is record:
+RECORD: info bits 0 0: len 4; hex 8000015b; asc ...[;; 1: len 4; hex
+80000005; asc ....;; 2: len 3; hex 6b6864; asc khd;; 3: len 6; hex
+0000111ef3eb; asc ......;; 4: len 7; hex 800001001e0084; asc .......;; 5:
+len 3; hex 6b6864; asc khd;;
+------------------------
+LATEST DETECTED DEADLOCK
+------------------------
+030709 12:59:58
+*** (1) TRANSACTION:
+TRANSACTION 0 290252780, ACTIVE 1 sec, process no 3185, OS thread id 30733
+inserting
+LOCK WAIT 3 lock struct(s), heap size 320, undo log entries 146
+MySQL thread id 21, query id 4553379 localhost heikki update
+INSERT INTO alex1 VALUES(86, 86, 794,'aA35818','bb','c79166','d4766t',
+'e187358f','g84586','h794',date_format('2001-04-03 12:54:22','%Y-%m-%d
+%H:%i'),7
+*** (1) WAITING FOR THIS LOCK TO BE GRANTED:
+RECORD LOCKS space id 0 page no 48310 n bits 568 table test/alex1 index
+symbole trx id 0 290252780 lock mode S waiting
+Record lock, heap no 324 RECORD: info bits 0 0: len 7; hex 61613335383138;
+asc aa35818;; 1:
+*** (2) TRANSACTION:
+TRANSACTION 0 290251546, ACTIVE 2 sec, process no 3190, OS thread id 32782
+inserting
+130 lock struct(s), heap size 11584, undo log entries 437
+MySQL thread id 23, query id 4554396 localhost heikki update
+REPLACE INTO alex1 VALUES(NULL, 32, NULL,'aa3572','','c3572','d6012t','',
+NULL,'h396', NULL, NULL, 7.31,7.31,7.31,200)
+*** (2) HOLDS THE LOCK(S):
+RECORD LOCKS space id 0 page no 48310 n bits 568 table test/alex1 index
+symbole trx id 0 290251546 lock_mode X locks rec but not gap
+Record lock, heap no 324 RECORD: info bits 0 0: len 7; hex 61613335383138;
+asc aa35818;; 1:
+*** (2) WAITING FOR THIS LOCK TO BE GRANTED:
+RECORD LOCKS space id 0 page no 48310 n bits 568 table test/alex1 index
+symbole trx id 0 290251546 lock_mode X locks gap before rec insert intention
+waiting
+Record lock, heap no 82 RECORD: info bits 0 0: len 7; hex 61613335373230;
+asc aa35720;; 1:
+*** WE ROLL BACK TRANSACTION (1)
+------------
+TRANSACTIONS
+------------
+Trx id counter 0 290328385
+Purge done for trx's n:o < 0 290315608 undo n:o < 0 17
+Total number of lock structs in row lock hash table 70
+LIST OF TRANSACTIONS FOR EACH SESSION:
+---TRANSACTION 0 0, not started, process no 3491, OS thread id 42002
+MySQL thread id 32, query id 4668737 localhost heikki
+show innodb status
+---TRANSACTION 0 290328384, ACTIVE 0 sec, process no 3205, OS thread id
+38929 inserting
+1 lock struct(s), heap size 320
+MySQL thread id 29, query id 4668736 localhost heikki update
+insert into speedc values (1519229,1, 'hgjhjgghggjgjgjgjgjggjgjgjgjgjgggjgjg
+jlhhgghggggghhjhghgggggghjhghghghghghhhhghghghjhhjghjghjkghjghjghjghjfhjfh
+---TRANSACTION 0 290328383, ACTIVE 0 sec, process no 3180, OS thread id
+28684 committing
+1 lock struct(s), heap size 320, undo log entries 1
+MySQL thread id 19, query id 4668734 localhost heikki update
+insert into speedcm values (1603393,1, 'hgjhjgghggjgjgjgjgjggjgjgjgjgjgggjgj
+gjlhhgghggggghhjhghgggggghjhghghghghghhhhghghghjhhjghjghjkghjghjghjghjfhjf
+---TRANSACTION 0 290328327, ACTIVE 0 sec, process no 3200, OS thread id
+36880 starting index read
+LOCK WAIT 2 lock struct(s), heap size 320
+MySQL thread id 27, query id 4668644 localhost heikki Searching rows for
+update
+update ibtest11a set B = 'kHdkkkk' where A = 89572
+------- TRX HAS BEEN WAITING 0 SEC FOR THIS LOCK TO BE GRANTED:
+RECORD LOCKS space id 0 page no 65556 n bits 232 table test/ibtest11a index
+PRIMARY trx id 0 290328327 lock_mode X waiting
+Record lock, heap no 1 RECORD: info bits 0 0: len 9; hex 73757072656d756d00;
+asc supremum.;;
+------------------
+---TRANSACTION 0 290328284, ACTIVE 0 sec, process no 3195, OS thread id
+34831 rollback of SQL statement
+ROLLING BACK 14 lock struct(s), heap size 2496, undo log entries 9
+MySQL thread id 25, query id 4668733 localhost heikki update
+insert into ibtest11a (D, B, C) values (5, 'khDk' ,'khDk')
+---TRANSACTION 0 290327208, ACTIVE 1 sec, process no 3190, OS thread id
+32782
+58 lock struct(s), heap size 5504, undo log entries 159
+MySQL thread id 23, query id 4668732 localhost heikki update
+REPLACE INTO alex1 VALUES(86, 46, 538,'aa95666','bb','c95666','d9486t',
+'e200498f','g86814','h538',date_format('2001-04-03 12:54:22','%Y-%m-%d
+%H:%i'),
+---TRANSACTION 0 290323325, ACTIVE 3 sec, process no 3185, OS thread id
+30733 inserting
+4 lock struct(s), heap size 1024, undo log entries 165
+MySQL thread id 21, query id 4668735 localhost heikki update
+INSERT INTO alex1 VALUES(NULL, 49, NULL,'aa42837','','c56319','d1719t','',
+NULL,'h321', NULL, NULL, 7.31,7.31,7.31,200)
+--------
+FILE I/O
+--------
+I/O thread 0 state: waiting for i/o request (insert buffer thread)
+I/O thread 1 state: waiting for i/o request (log thread)
+I/O thread 2 state: waiting for i/o request (read thread)
+I/O thread 3 state: waiting for i/o request (write thread)
+Pending normal aio reads: 0, aio writes: 0,
+ ibuf aio reads: 0, log i/o's: 0, sync i/o's: 0
+Pending flushes (fsync) log: 0; buffer pool: 0
+151671 OS file reads, 94747 OS file writes, 8750 OS fsyncs
+25.44 reads/s, 18494 avg bytes/read, 17.55 writes/s, 2.33 fsyncs/s
+-------------------------------------
+INSERT BUFFER AND ADAPTIVE HASH INDEX
+-------------------------------------
+Ibuf for space 0: size 1, free list len 19, seg size 21,
+85004 inserts, 85004 merged recs, 26669 merges
+Hash table size 207619, used cells 14461, node heap has 16 buffer(s)
+1877.67 hash searches/s, 5121.10 non-hash searches/s
+---
+LOG
+---
+Log sequence number 18 1212842764
+Log flushed up to 18 1212665295
+Last checkpoint at 18 1135877290
+0 pending log writes, 0 pending chkp writes
+4341 log i/o's done, 1.22 log i/o's/second
+----------------------
+BUFFER POOL AND MEMORY
+----------------------
+Total memory allocated 84966343; in additional pool allocated 1402624
+Buffer pool size 3200
+Free buffers 110
+Database pages 3074
+Modified db pages 2674
+Pending reads 0
+Pending writes: LRU 0, flush list 0, single page 0
+Pages read 171380, created 51968, written 194688
+28.72 reads/s, 20.72 creates/s, 47.55 writes/s
+Buffer pool hit rate 999 / 1000
+--------------
+ROW OPERATIONS
+--------------
+0 queries inside InnoDB, 0 queries in queue
+Main thread process no. 3004, id 7176, state: purging
+Number of rows inserted 3738558, updated 127415, deleted 33707, read 755779
+1586.13 inserts/s, 50.89 updates/s, 28.44 deletes/s, 107.88 reads/s
+----------------------------
+END OF INNODB MONITOR OUTPUT
+============================
+</programlisting>
+
+ <para>
+ Some notes on the output:
+ </para>
+
+ <itemizedlist>
+
+ <listitem>
+ <para>
+ If the <literal>TRANSACTIONS</literal> section reports lock
+ waits, your applications may have lock contention. The
+ output can also help to trace the reasons for transaction
+ deadlocks.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ The <literal>SEMAPHORES</literal> section reports threads
+ waiting for a semaphore and statistics on how many times
+ threads have needed a spin or a wait on a mutex or a rw-lock
+ semaphore. A large number of threads waiting for semaphores
+ may be a result of disk I/O, or contention problems inside
+ <literal>InnoDB</literal>. Contention can be due to heavy
+ parallelism of queries or problems in operating system
+ thread scheduling. Setting
+ <literal>innodb_thread_concurrency</literal> smaller than
+ the default value can help in such situations.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ The <literal>BUFFER POOL AND MEMORY</literal> section gives
+ you statistics on pages read and written. You can calculate
+ from these numbers how many data file I/O operations your
+ queries currently are doing.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ The <literal>ROW OPERATIONS</literal> section shows what the
+ main thread is doing.
+ </para>
+ </listitem>
+
+ </itemizedlist>
+
+ <para>
+ <literal>InnoDB</literal> sends diagnostic output to
+ <literal>stderr</literal> or to files rather than to
+ <literal>stdout</literal> or fixed-size memory buffers, to avoid
+ potential buffer overflows. As a side effect, the output of
+ <literal>SHOW ENGINE INNODB STATUS</literal> is written to a
+ status file in the MySQL data directory every fifteen seconds.
+ The name of the file is
+ <filename>innodb_status.<replaceable>pid</replaceable></filename>,
+ where <replaceable>pid</replaceable> is the server process ID.
+ <literal>InnoDB</literal> removes the file for a normal
+ shutdown. If abnormal shutdowns have occurred, instances of
+ these status files may be present and must be removed manually.
+ Before removing them, you might want to examine them to see
+ whether they contain useful information about the cause of
+ abnormal shutdowns. The
+ <filename>innodb_status.<replaceable>pid</replaceable></filename>
+ file is created only if the configuration option
+ <literal>innodb_status_file=1</literal> is set.
+ </para>
+
+ </section>
+
+ </section>
+
+ <section id="innodb-multi-versioning">
+
+ <title>Implementation of Multi-Versioning</title>
+
+ <para>
+ Because <literal>InnoDB</literal> is a multi-versioned storage
+ engine, it must keep information about old versions of rows in the
+ tablespace. This information is stored in a data structure called
+ a <firstterm>rollback segment</firstterm> (after an analogous data
+ structure in Oracle).
+ </para>
+
+ <para>
+ Internally, <literal>InnoDB</literal> adds two fields to each row
+ stored in the database. A 6-byte field indicates the transaction
+ identifier for the last transaction that inserted or updated the
+ row. Also, a deletion is treated internally as an update where a
+ special bit in the row is set to mark it as deleted. Each row also
+ contains a 7-byte field called the roll pointer. The roll pointer
+ points to an undo log record written to the rollback segment. If
+ the row was updated, the undo log record contains the information
+ necessary to rebuild the content of the row before it was updated.
+ </para>
+
+ <para>
+ <literal>InnoDB</literal> uses the information in the rollback
+ segment to perform the undo operations needed in a transaction
+ rollback. It also uses the information to build earlier versions
+ of a row for a consistent read.
+ </para>
+
+ <para>
+ Undo logs in the rollback segment are divided into insert and
+ update undo logs. Insert undo logs are needed only in transaction
+ rollback and can be discarded as soon as the transaction commits.
+ Update undo logs are used also in consistent reads, but they can
+ be discarded only after there is no transaction present for which
+ <literal>InnoDB</literal> has assigned a snapshot that in a
+ consistent read could need the information in the update undo log
+ to build an earlier version of a database row.
+ </para>
+
+ <para>
+ You must remember to commit your transactions regularly, including
+ those transactions that issue only consistent reads. Otherwise,
+ <literal>InnoDB</literal> cannot discard data from the update undo
+ logs, and the rollback segment may grow too big, filling up your
+ tablespace.
+ </para>
+
+ <para>
+ The physical size of an undo log record in the rollback segment is
+ typically smaller than the corresponding inserted or updated row.
+ You can use this information to calculate the space need for your
+ rollback segment.
+ </para>
+
+ <para>
+ In the <literal>InnoDB</literal> multi-versioning scheme, a row is
+ not physically removed from the database immediately when you
+ delete it with an SQL statement. Only when
+ <literal>InnoDB</literal> can discard the update undo log record
+ written for the deletion can it also physically remove the
+ corresponding row and its index records from the database. This
+ removal operation is called a purge, and it is quite fast, usually
+ taking the same order of time as the SQL statement that did the
+ deletion.
+ </para>
+
+ <para>
+ In a scenario where the user inserts and deletes rows in smallish
+ batches at about the same rate in the table, it is possible that
+ the purge thread starts to lag behind, and the table grows bigger
+ and bigger, making everything disk-bound and very slow. Even if
+ the table carries just 10MB of useful data, it may grow to occupy
+ 10GB with all the <quote>dead</quote> rows. In such a case, it
+ would be good to throttle new row operations, and allocate more
+ resources to the purge thread. The
+ <literal>innodb_max_purge_lag</literal> system variable exists for
+ exactly this purpose. See <xref linkend="innodb-parameters"/>, for
+ more information.
+ </para>
+
+ </section>
+
+ <section id="innodb-table-and-index">
+
+ <title><literal>InnoDB</literal> Table and Index Structures</title>
+
+ <para>
+ MySQL stores its data dictionary information for tables in
+ <filename>.frm</filename> files in database directories. This is
+ true for all MySQL storage engines. But every
+ <literal>InnoDB</literal> table also has its own entry in the
+ <literal>InnoDB</literal> internal data dictionary inside the
+ tablespace. When MySQL drops a table or a database, it has to
+ delete both an <filename>.frm</filename> file or files, and the
+ corresponding entries inside the <literal>InnoDB</literal> data
+ dictionary. This is the reason why you cannot move
+ <literal>InnoDB</literal> tables between databases simply by
+ moving the <filename>.frm</filename> files.
+ </para>
+
+ <para>
+ Every <literal>InnoDB</literal> table has a special index called
+ the <firstterm>clustered index</firstterm> where the data for the
+ rows is stored. If you define a <literal>PRIMARY KEY</literal> on
+ your table, the index of the primary key is the clustered index.
+ </para>
+
+ <para>
+ If you do not define a <literal>PRIMARY KEY</literal> for your
+ table, MySQL picks the first <literal>UNIQUE</literal> index that
+ has only <literal>NOT NULL</literal> columns as the primary key
+ and <literal>InnoDB</literal> uses it as the clustered index. If
+ there is no such index in the table, <literal>InnoDB</literal>
+ internally generates a clustered index where the rows are ordered
+ by the row ID that <literal>InnoDB</literal> assigns to the rows
+ in such a table. The row ID is a 6-byte field that increases
+ monotonically as new rows are inserted. Thus, the rows ordered by
+ the row ID are physically in insertion order.
+ </para>
+
+ <para>
+ Accessing a row through the clustered index is fast because the
+ row data is on the same page where the index search leads. If a
+ table is large, the clustered index architecture often saves a
+ disk I/O when compared to the traditional solution. (In many
+ database systems, data storage uses a different page from the
+ index record.)
+ </para>
+
+ <para>
+ In <literal>InnoDB</literal>, the records in non-clustered indexes
+ (also called secondary indexes) contain the primary key value for
+ the row. <literal>InnoDB</literal> uses this primary key value to
+ search for the row from the clustered index. Note that if the
+ primary key is long, the secondary indexes use more space.
+ </para>
+
+ <para>
+ <literal>InnoDB</literal> compares <literal>CHAR</literal> and
+ <literal>VARCHAR</literal> strings of different lengths such that
+ the remaining length in the shorter string is treated as if padded
+ with spaces.
+ </para>
+
+ <section id="innodb-physical-structure">
+
+ <title>Physical Structure of an Index</title>
+
+ <para>
+ All <literal>InnoDB</literal> indexes are B-trees where the
+ index records are stored in the leaf pages of the tree. The
+ default size of an index page is 16KB. When new records are
+ inserted, <literal>InnoDB</literal> tries to leave 1/16 of the
+ page free for future insertions and updates of the index
+ records.
+ </para>
+
+ <para>
+ If index records are inserted in a sequential order (ascending
+ or descending), the resulting index pages are about 15/16 full.
+ If records are inserted in a random order, the pages are from
+ 1/2 to 15/16 full. If the fill factor of an index page drops
+ below 1/2, <literal>InnoDB</literal> tries to contract the index
+ tree to free the page.
+ </para>
+
+ </section>
+
+ <section id="innodb-insert-buffering">
+
+ <title>Insert Buffering</title>
+
+ <para>
+ It is a common situation in database applications that the
+ primary key is a unique identifier and new rows are inserted in
+ the ascending order of the primary key. Thus, the insertions to
+ the clustered index do not require random reads from a disk.
+ </para>
+
+ <para>
+ On the other hand, secondary indexes are usually non-unique, and
+ insertions into secondary indexes happen in a relatively random
+ order. This would cause a lot of random disk I/O operations
+ without a special mechanism used in <literal>InnoDB</literal>.
+ </para>
+
+ <para>
+ If an index record should be inserted to a non-unique secondary
+ index, <literal>InnoDB</literal> checks whether the secondary
+ index page is in the buffer pool. If that is the case,
+ <literal>InnoDB</literal> does the insertion directly to the
+ index page. If the index page is not found in the buffer pool,
+ <literal>InnoDB</literal> inserts the record to a special insert
+ buffer structure. The insert buffer is kept so small that it
+ fits entirely in the buffer pool, and insertions can be done
+ very fast.
+ </para>
+
+ <para>
+ Periodically, the insert buffer is merged into the secondary
+ index trees in the database. Often it is possible to merge
+ several insertions to the same page of the index tree, saving
+ disk I/O operations. It has been measured that the insert buffer
+ can speed up insertions into a table up to 15 times.
+ </para>
+
+ <para>
+ The insert buffer merging may continue to happen
+ <emphasis>after</emphasis> the inserting transaction has been
+ committed. In fact, it may continue to happen after a server
+ shutdown and restart (see <xref linkend="forcing-recovery"/>).
+ </para>
+
+ <para>
+ The insert buffer merging may take many hours, when many
+ secondary indexes must be updated, and many rows have been
+ inserted. During this time, disk I/O will be increased, which
+ can cause significant slowdown on disk-bound queries. Another
+ significant background I/O operation is the purge thread (see
+ <xref linkend="innodb-multi-versioning"/>).
+ </para>
+
+ </section>
+
+ <section id="innodb-adaptive-hash">
+
+ <title>Adaptive Hash Indexes</title>
+
+ <para>
+ If a table fits almost entirely in main memory, the fastest way
+ to perform queries on it is to use hash indexes.
+ <literal>InnoDB</literal> has a mechanism that monitors index
+ searches made to the indexes defined for a table. If
+ <literal>InnoDB</literal> notices that queries could benefit
+ from building a hash index, it does so automatically.
+ </para>
+
+ <para>
+ Note that the hash index is always built based on an existing
+ B-tree index on the table. <literal>InnoDB</literal> can build a
+ hash index on a prefix of any length of the key defined for the
+ B-tree, depending on the pattern of searches that
+ <literal>InnoDB</literal> observes for the B-tree index. A hash
+ index can be partial: It is not required that the whole B-tree
+ index is cached in the buffer pool. <literal>InnoDB</literal>
+ builds hash indexes on demand for those pages of the index that
+ are often accessed.
+ </para>
+
+ <para>
+ In a sense, <literal>InnoDB</literal> tailors itself through the
+ adaptive hash index mechanism to ample main memory, coming
+ closer to the architecture of main-memory databases.
+ </para>
+
+ </section>
+
+ <section id="innodb-physical-record">
+
+ <title>Physical Row Structure</title>
+
+ <para>
+ The physical record structure for InnoDB tables is dependent on
+ the row format specified when the table was created. By default
+ InnoDB uses the <literal>COMPACT</literal> format, but the
+ <literal>REDUNDANT</literal> format is available to retain
+ compatibility with older versions of MySQL.
+ </para>
+
+ <para>
+ Records in InnoDB <literal>ROW_FORMAT=REDUNDANT</literal> tables
+ have the following characteristics:
+ </para>
+
+ <itemizedlist>
+
+ <listitem>
+ <para>
+ Each index record contains a six-byte header. The header is
+ used to link together consecutive records, and also in
+ row-level locking.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Records in the clustered index contain fields for all
+ user-defined columns. In addition, there is a six-byte field
+ for the transaction ID and a seven-byte field for the roll
+ pointer.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ If no primary key was defined for a table, each clustered
+ index record also contains a six-byte row ID field.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Each secondary index record contains also all the fields
+ defined for the clustered index key.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ A record contains also a pointer to each field of the
+ record. If the total length of the fields in a record is
+ less than 128 bytes, the pointer is one byte; otherwise, two
+ bytes. The array of these pointers is called the record
+ directory. The area where these pointers point is called the
+ data part of the record.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Internally, InnoDB stores fixed-length character columns
+ such as <literal>CHAR(10)</literal> in a fixed-length
+ format. InnoDB truncates trailing spaces from
+ <literal>VARCHAR</literal> columns.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ An SQL <literal>NULL</literal> value reserves 1 or 2 bytes
+ in the record directory. Besides that, an SQL
+ <literal>NULL</literal> value reserves zero bytes in the
+ data part of the record if stored in a variable length
+ column. In a fixed-length column, it reserves the fixed
+ length of the column in the data part of the record. The
+ motivation behind reserving the fixed space for
+ <literal>NULL</literal> values is that it enables an update
+ of the column from <literal>NULL</literal> to a
+ non-<literal>NULL</literal> value to be done in place
+ without causing fragmentation of the index page.
+ </para>
+ </listitem>
+
+ </itemizedlist>
+
+ <para>
+ Records in InnoDB <literal>ROW_FORMAT=COMPACT</literal> tables
+ have the following characteristics:
+ </para>
+
+ <itemizedlist>
+
+ <listitem>
+ <para>
+ Each index record contains a five-byte header that may be
+ preceded by a variable-length header. The header is used to
+ link together consecutive records, and also in row-level
+ locking.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ The record header contains a bit vector for indicating
+ <literal>NULL</literal> columns. The bit vector occupies
+ (<literal>n_nullable</literal>+7)/8 bytes. Columns that are
+ <literal>NULL</literal> will not occupy other space than the
+ bit in this vector.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ For each non-<literal>NULL</literal> variable-length field,
+ the record header contains the length of the column in one
+ or two bytes. Two bytes will only be needed if part of the
+ column is stored externally or the maximum length exceeds
+ 255 bytes and the actual length exceeds 127 bytes.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ The record header is followed by the data contents of the
+ columns. Columns that are <literal>NULL</literal> are
+ omitted.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Records in the clustered index contain fields for all
+ user-defined columns. In addition, there is a six-byte field
+ for the transaction ID and a seven-byte field for the roll
+ pointer.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ If no primary key was defined for a table, each clustered
+ index record also contains a six-byte row ID field.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Each secondary index record contains also all the fields
+ defined for the clustered index key.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Internally, InnoDB stores fixed-length, fixed-width
+ character columns such as <literal>CHAR(10)</literal> in a
+ fixed-length format. InnoDB truncates trailing spaces from
+ <literal>VARCHAR</literal> columns.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Internally, InnoDB attempts to store UTF-8
+ <literal>CHAR(<literal>n</literal>)</literal> columns in
+ <literal>n</literal> bytes by trimming trailing spaces. In
+ <literal>ROW_FORMAT=REDUNDANT</literal>, such columns occupy
+ 3*<literal>n</literal> bytes. The motivation behind
+ reserving the minimum space <literal>n</literal> is that it
+ in many cases enables an update of the column to be done in
+ place without causing fragmentation of the index page.
+ </para>
+ </listitem>
+
+ </itemizedlist>
+
+ <para>
+ The presence of the compact row format decreases row storage
+ space by about 20% at the cost of increasing CPU use for some
+ operations. If your workload is a typical one that is limited by
+ cache hit rates and disk speed it is likely to be faster. If it
+ is a rare case that is limited by CPU speed, it might be slower.
+ </para>
+
+ </section>
+
+ </section>
+
+ <section id="file-space-management">
+
+ <title><literal>InnoDB</literal> File Space Management and Disk I/O</title>
+
+ <section id="innodb-disk-io">
+
+ <title><literal>InnoDB</literal> Disk I/O</title>
+
+ <para>
+ <literal>InnoDB</literal> uses simulated asynchronous disk I/O:
+ <literal>InnoDB</literal> creates a number of threads to take
+ care of I/O operations, such as read-ahead.
+ </para>
+
+ <para>
+ There are two read-ahead heuristics in
+ <literal>InnoDB</literal>:
+ </para>
+
+ <itemizedlist>
+
+ <listitem>
+ <para>
+ In sequential read-ahead, if <literal>InnoDB</literal>
+ notices that the access pattern to a segment in the
+ tablespace is sequential, it posts in advance a batch of
+ reads of database pages to the I/O system.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ In random read-ahead, if <literal>InnoDB</literal> notices
+ that some area in a tablespace seems to be in the process of
+ being fully read into the buffer pool, it posts the
+ remaining reads to the I/O system.
+ </para>
+ </listitem>
+
+ </itemizedlist>
+
+ <para>
+ <literal>InnoDB</literal> uses a novel file flush technique
+ called <firstterm>doublewrite</firstterm>. It adds safety to
+ recovery following an operating system crash or a power outage,
+ and improves performance on most varieties of Unix by reducing
+ the need for <literal>fsync()</literal> operations.
+ </para>
+
+ <para>
+ Doublewrite means that before writing pages to a data file,
+ <literal>InnoDB</literal> first writes them to a contiguous
+ tablespace area called the doublewrite buffer. Only after the
+ write and the flush to the doublewrite buffer has completed does
+ <literal>InnoDB</literal> write the pages to their proper
+ positions in the data file. If the operating system crashes in
+ the middle of a page write, <literal>InnoDB</literal> can later
+ find a good copy of the page from the doublewrite buffer during
+ recovery.
+ </para>
+
+ </section>
+
+ <section id="innodb-file-space">
+
+ <title>File Space Management</title>
+
+ <para>
+ The data files that you define in the configuration file form
+ the tablespace of <literal>InnoDB</literal>. The files are
+ simply concatenated to form the tablespace. There is no striping
+ in use. Currently, you cannot define where within the tablespace
+ your tables are allocated. However, in a newly created
+ tablespace, <literal>InnoDB</literal> allocates space starting
+ from the first data file.
+ </para>
+
+ <para>
+ The tablespace consists of database pages with a default size of
+ 16KB. The pages are grouped into extents of 64 consecutive
+ pages. The <quote>files</quote> inside a tablespace are called
+ <firstterm>segments</firstterm> in <literal>InnoDB</literal>.
+ The term <quote>rollback segment</quote> is somewhat confusing
+ because it actually contains many tablespace segments.
+ </para>
+
+ <para>
+ Two segments are allocated for each index in
+ <literal>InnoDB</literal>. One is for non-leaf nodes of the
+ B-tree, the other is for the leaf nodes. The idea here is to
+ achieve better sequentiality for the leaf nodes, which contain
+ the data.
+ </para>
+
+ <para>
+ When a segment grows inside the tablespace,
+ <literal>InnoDB</literal> allocates the first 32 pages to it
+ individually. After that <literal>InnoDB</literal> starts to
+ allocate whole extents to the segment. <literal>InnoDB</literal>
+ can add to a large segment up to 4 extents at a time to ensure
+ good sequentiality of data.
+ </para>
+
+ <para>
+ Some pages in the tablespace contain bitmaps of other pages, and
+ therefore a few extents in an <literal>InnoDB</literal>
+ tablespace cannot be allocated to segments as a whole, but only
+ as individual pages.
+ </para>
+
+ <para>
+ When you ask for available free space in the tablespace by
+ issuing a <literal>SHOW TABLE STATUS</literal> statement,
+ <literal>InnoDB</literal> reports the extents that are
+ definitely free in the tablespace. <literal>InnoDB</literal>
+ always reserves some extents for cleanup and other internal
+ purposes; these reserved extents are not included in the free
+ space.
+ </para>
+
+ <para>
+ When you delete data from a table, <literal>InnoDB</literal>
+ contracts the corresponding B-tree indexes. Whether the freed
+ space becomes available for other users depends on whether the
+ pattern of deletes frees individual pages or extents to the
+ tablespace. Dropping a table or deleting all rows from it is
+ guaranteed to release the space to other users, but remember
+ that deleted rows are physically removed only in an (automatic)
+ purge operation after they are no longer needed for transaction
+ rollbacks or consistent reads. (See
+ <xref linkend="innodb-multi-versioning"/>.)
+ </para>
+
+ </section>
+
+ <section id="innodb-file-defragmenting">
+
+ <title>Defragmenting a Table</title>
+
+ <para>
+ If there are random insertions into or deletions from the
+ indexes of a table, the indexes may become fragmented.
+ Fragmentation means that the physical ordering of the index
+ pages on the disk is not close to the index ordering of the
+ records on the pages, or that there are many unused pages in the
+ 64-page blocks that were allocated to the index.
+ </para>
+
+ <para>
+ A symptom of fragmentation is that a table takes more space than
+ it <quote>should</quote> take. How much that is exactly, is
+ difficult to determine. All <literal>InnoDB</literal> data and
+ indexes are stored in B-trees, and their fill factor may vary
+ from 50% to 100%. Another symptom of fragmentation is that a
+ table scan such as this takes more time than it
+ <quote>should</quote> take:
+ </para>
+
+<programlisting>
+SELECT COUNT(*) FROM t WHERE a_non_indexed_column <> 12345;
+</programlisting>
+
+ <para>
+ (In the preceding query, we are <quote>fooling</quote> the SQL
+ optimizer into scanning the clustered index, rather than a
+ secondary index.) Most disks can read 10 to 50MB/s, which can be
+ used to estimate how fast a table scan should run.
+ </para>
+
+ <para>
+ It can speed up index scans if you periodically perform a
+ <quote>null</quote> <literal>ALTER TABLE</literal> operation:
+ </para>
+
+<programlisting>
+ALTER TABLE <replaceable>tbl_name</replaceable> ENGINE=INNODB
+</programlisting>
+
+ <para>
+ That causes MySQL to rebuild the table. Another way to perform a
+ defragmentation operation is to use <command>mysqldump</command>
+ to dump the table to a text file, drop the table, and reload it
+ from the dump file.
+ </para>
+
+ <para>
+ If the insertions to an index are always ascending and records
+ are deleted only from the end, the <literal>InnoDB</literal>
+ filespace management algorithm guarantees that fragmentation in
+ the index does not occur.
+ </para>
+
+ </section>
+
+ </section>
+
+ <section id="innodb-error-handling">
+
+ <title><literal>InnoDB</literal> Error Handling</title>
+
+ <para>
+ Error handling in <literal>InnoDB</literal> is not always the same
+ as specified in the SQL standard. According to the standard, any
+ error during an SQL statement should cause the rollback of that
+ statement. <literal>InnoDB</literal> sometimes rolls back only
+ part of the statement, or the whole transaction. The following
+ items describe how <literal>InnoDB</literal> performs error
+ handling:
+ </para>
+
+ <itemizedlist>
+
+ <listitem>
+ <para>
+ If you run out of file space in the tablespace, a MySQL
+ <literal>Table is full</literal> error occurs and
+ <literal>InnoDB</literal> rolls back the SQL statement.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ A transaction deadlock causes <literal>InnoDB</literal> to
+ roll back the entire transaction. In the case of a lock wait
+ timeout, <literal>InnoDB</literal> rolls back only the most
+ recent SQL statement.
+ </para>
+
+ <para>
+ When a transaction rollback occurs due to a deadlock or lock
+ wait timeout, it cancels the effect of the statements within
+ the transaction. But if the start-transaction statement was
+ <literal>START TRANSACTION</literal> or
+ <literal>BEGIN</literal> statement, rollback does not cancel
+ that statement. Further SQL statements become part of the
+ transaction until the occurrence of <literal>COMMIT</literal>,
+ <literal>ROLLBACK</literal>, or some SQL statement that causes
+ an implicit commit.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ A duplicate-key error rolls back the SQL statement, if you
+ have not specified the <literal>IGNORE</literal> option in
+ your statement.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ A <literal>row too long error</literal> rolls back the SQL
+ statement.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Other errors are mostly detected by the MySQL layer of code
+ (above the <literal>InnoDB</literal> storage engine level),
+ and they roll back the corresponding SQL statement. Locks are
+ not released in a rollback of a single SQL statement.
+ </para>
+ </listitem>
+
+ </itemizedlist>
+
+ <para>
+ During implicit rollbacks, as well as during the execution of an
+ explicit <literal>ROLLBACK</literal> SQL statement, <literal>SHOW
+ PROCESSLIST</literal> displays <literal>Rolling back</literal> in
+ the <literal>State</literal> column for the relevant connection.
+ </para>
+
+ <section id="innodb-error-codes">
+
+ <title><literal>InnoDB</literal> Error Codes</title>
+
+ <para>
+ The following is a non-exhaustive list of common
+ <literal>InnoDB</literal>-specific errors that you may
+ encounter, with information about why each occurs and how to
+ resolve the problem.
+ </para>
+
+ <itemizedlist>
+
+ <listitem>
+ <para>
+ <literal>1005 (ER_CANT_CREATE_TABLE)</literal>
+ </para>
+
+ <para>
+ Cannot create table. If the error message refers to
+ <literal>errno</literal> 150, table creation failed because
+ a foreign key constraint was not correctly formed. If the
+ error message refers to <literal>errno</literal> -1, table
+ creation probably failed because the table includes a column
+ name that matched the name of an internal InnoDB table.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <literal>1016 (ER_CANT_OPEN_FILE)</literal>
+ </para>
+
+ <para>
+ Cannot find the <literal>InnoDB</literal> table from the
+ <literal>InnoDB</literal> data files, although the
+ <filename>.frm</filename> file for the table exists. See
+ <xref linkend="innodb-troubleshooting-datadict"/>.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <literal>1114 (ER_RECORD_FILE_FULL)</literal>
+ </para>
+
+ <para>
+ <literal>InnoDB</literal> has run out of free space in the
+ tablespace. You should reconfigure the tablespace to add a
+ new data file.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <literal>1205 (ER_LOCK_WAIT_TIMEOUT)</literal>
+ </para>
+
+ <para>
+ Lock wait timeout expired. Transaction was rolled back.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <literal>1213 (ER_LOCK_DEADLOCK)</literal>
+ </para>
+
+ <para>
+ Transaction deadlock. You should rerun the transaction.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <literal>1216 (ER_NO_REFERENCED_ROW)</literal>
+ </para>
+
+ <para>
+ You are trying to add a row but there is no parent row, and
+ a foreign key constraint fails. You should add the parent
+ row first.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <literal>1217 (ER_ROW_IS_REFERENCED)</literal>
+ </para>
+
+ <para>
+ You are trying to delete a parent row that has children, and
+ a foreign key constraint fails. You should delete the
+ children first.
+ </para>
+ </listitem>
+
+ </itemizedlist>
+
+ </section>
+
+ <section id="operating-system-error-codes">
+
+ <title>Operating System Error Codes</title>
+
+ <para>
+ To print the meaning of an operating system error number, use
+ the <command>perror</command> program that comes with the MySQL
+ distribution.
+ </para>
+
+ <remark role="todo">
+ CHECK URL
+ </remark>
+
+ <para>
+ The following table provides a list of some common Linux system
+ error codes. For a more complete list, see
+ <ulink url="http://www.iglu.org.il/lxr/source/include/asm-i386/errno.h">Linux
+ source code</ulink>.
+ </para>
+
+ <itemizedlist>
+
+ <listitem>
+ <para>
+ <literal>1 (EPERM)</literal>
+ </para>
+
+ <para>
+ Operation not permitted
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <literal>2 (ENOENT)</literal>
+ </para>
+
+ <para>
+ No such file or directory
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <literal>3 (ESRCH)</literal>
+ </para>
+
+ <para>
+ No such process
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <literal>4 (EINTR)</literal>
+ </para>
+
+ <para>
+ Interrupted system call
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <literal>5 (EIO)</literal>
+ </para>
+
+ <para>
+ I/O error
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <literal>6 (ENXIO)</literal>
+ </para>
+
+ <para>
+ No such device or address
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <literal>7 (E2BIG)</literal>
+ </para>
+
+ <para>
+ Arg list too long
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <literal>8 (ENOEXEC)</literal>
+ </para>
+
+ <para>
+ Exec format error
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <literal>9 (EBADF)</literal>
+ </para>
+
+ <para>
+ Bad file number
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <literal>10 (ECHILD)</literal>
+ </para>
+
+ <para>
+ No child processes
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <literal>11 (EAGAIN)</literal>
+ </para>
+
+ <para>
+ Try again
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <literal>12 (ENOMEM)</literal>
+ </para>
+
+ <para>
+ Out of memory
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <literal>13 (EACCES)</literal>
+ </para>
+
+ <para>
+ Permission denied
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <literal>14 (EFAULT)</literal>
+ </para>
+
+ <para>
+ Bad address
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <literal>15 (ENOTBLK)</literal>
+ </para>
+
+ <para>
+ Block device required
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <literal>16 (EBUSY)</literal>
+ </para>
+
+ <para>
+ Device or resource busy
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <literal>17 (EEXIST)</literal>
+ </para>
+
+ <para>
+ File exists
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <literal>18 (EXDEV)</literal>
+ </para>
+
+ <para>
+ Cross-device link
+ </para>
+ </listitem>
+