Handler-calling-functions. More...

#include "binlog.h"
#include "sql_priv.h"
#include "unireg.h"
#include "rpl_handler.h"
#include "sql_cache.h"
#include "key.h"
#include "sql_table.h"
#include "sql_parse.h"
#include "sql_acl.h"
#include "sql_base.h"
#include "discover.h"
#include "log_event.h"
#include "rpl_filter.h"
#include <myisampack.h>
#include "transaction.h"
#include <errno.h>
#include "probes_mysql.h"
#include <mysql/psi/mysql_table.h>
#include "debug_sync.h"
#include <my_bit.h>
#include <list>

Classes
struct	st_sys_tbl_chk_params
struct	xahton_st
struct	xarecover_st
struct	Ha_delete_table_error_handler
struct	st_discover_args
struct	st_find_files_args
struct	st_table_exists_in_engine_args
struct	st_make_pushed_join_args
struct	hton_list_st
struct	binlog_func_st
Defines
#define	BITMAP_STACKBUF_SIZE (128/8)
#define	SETMSG(nr, msg) handler_errmsgs[(nr) - HA_ERR_FIRST]= (msg)
#define	AUTO_INC_DEFAULT_NB_ROWS 1
#define	AUTO_INC_DEFAULT_NB_MAX_BITS 16
#define	AUTO_INC_DEFAULT_NB_MAX ((1 << AUTO_INC_DEFAULT_NB_MAX_BITS) - 1)
#define	MAX_HTON_LIST_ST 63
Typedefs
typedef bool	Log_func (THD *, TABLE *, bool, const uchar *, const uchar *)
Functions
double	log2 (double x)
const char *	ha_legacy_type_name (legacy_db_type legacy_type)
handlerton *	ha_default_handlerton (THD *thd)
	Return the default storage engine handlerton used for non-temp tables for thread.
handlerton *	ha_default_temp_handlerton (THD *thd)
	Return the default storage engine handlerton used for explicitly created temp tables for a thread.
plugin_ref	ha_resolve_by_name (THD *thd, const LEX_STRING *name, bool is_temp_table)
	Return the storage engine handlerton for the supplied name.
plugin_ref	ha_lock_engine (THD thd, const handlerton hton)
handlerton *	ha_resolve_by_legacy_type (THD *thd, enum legacy_db_type db_type)
handlerton *	ha_checktype (THD *thd, enum legacy_db_type database_type, bool no_substitute, bool report_error)
handler *	get_new_handler (TABLE_SHARE share, MEM_ROOT alloc, handlerton *db_type)
C_MODE_END int	ha_init_errors (void)
int	ha_finalize_handlerton (st_plugin_int *plugin)
int	ha_initialize_handlerton (st_plugin_int *plugin)
int	ha_init ()
int	ha_end ()
void	ha_drop_database (char *path)
void	ha_close_connection (THD *thd)
void	trans_register_ha (THD *thd, bool all, handlerton *ht_arg)
int	ha_prepare (THD *thd)
int	ha_commit_trans (THD *thd, bool all, bool ignore_global_read_lock)
int	ha_commit_low (THD *thd, bool all, bool run_after_commit)
int	ha_rollback_low (THD *thd, bool all)
int	ha_rollback_trans (THD *thd, bool all)
int	ha_commit_or_rollback_by_xid (THD thd, XID xid, bool commit)
int	ha_recover (HASH *commit_list)
bool	mysql_xa_recover (THD *thd)
int	ha_release_temporary_latches (THD *thd)
bool	ha_rollback_to_savepoint_can_release_mdl (THD *thd)
int	ha_rollback_to_savepoint (THD thd, SAVEPOINT sv)
int	ha_prepare_low (THD *thd, bool all)
int	ha_savepoint (THD *thd, SAVEPOINT *sv)
int	ha_release_savepoint (THD thd, SAVEPOINT sv)
int	ha_start_consistent_snapshot (THD *thd)
bool	ha_flush_logs (handlerton *db_type)
const char *	get_canonical_filename (handler *file, const char *path, char *tmp_path)
	make canonical filename
int	ha_delete_table (THD thd, handlerton table_type, const char path, const char db, const char *alias, bool generate_warning)
	This should return ENOENT if the file doesn't exist. The .frm file will be deleted only if we return 0 or ENOENT.
ulonglong	compute_next_insert_id (ulonglong nr, struct system_variables *variables)
ulonglong	prev_insert_id (ulonglong nr, struct system_variables *variables)
	Computes the largest number X:
void	print_keydup_error (TABLE *table, KEY *key, const char *msg, myf errflag)
void	print_keydup_error (TABLE *table, KEY *key, myf errflag)
int	ha_enable_transaction (THD *thd, bool on)
int	ha_create_table (THD *thd, const char *path, const char *db, const char *table_name, HA_CREATE_INFO *create_info, bool update_create_info, bool is_temp_table)
int	ha_create_table_from_engine (THD *thd, const char *db, const char *name)
bool	ha_check_if_table_exists (THD *thd, const char *db, const char *name, bool *exists)
bool	ha_check_if_supported_system_table (handlerton *hton, const char *db, const char *table_name)
	Check if a given table is a system table.
int	ha_init_key_cache (const char *name, KEY_CACHE *key_cache)
int	ha_resize_key_cache (KEY_CACHE *key_cache)
int	ha_change_key_cache_param (KEY_CACHE *key_cache)
int	ha_change_key_cache (KEY_CACHE *old_key_cache, KEY_CACHE *new_key_cache)
int	ha_discover (THD thd, const char db, const char name, uchar frmblob, size_t frmlen)
int	ha_find_files (THD thd, const char db, const char path, const char wild, bool dir, List< LEX_STRING > *files)
int	ha_table_exists_in_engine (THD thd, const char db, const char *name)
int	ha_make_pushed_joins (THD thd, const AQP::Join_plan plan)
int	ha_binlog_end (THD *thd)
bool	key_uses_partial_cols (TABLE *table, uint keyno)
void	get_sweep_read_cost (TABLE *table, ha_rows nrows, bool interrupted, Cost_estimate *cost)
TYPELIB *	ha_known_exts ()
bool	ha_show_status (THD thd, handlerton db_type, enum ha_stat_type stat)
int	binlog_log_row (TABLE table, const uchar before_record, const uchar after_record, Log_func log_func)
void	signal_log_not_needed (struct handlerton, char *log_file)
	Dummy function which accepts information about log files which is not needed by handlers.
Variables
st_plugin_int *	hton2plugin [MAX_HA]
KEY_CREATE_INFO	default_key_create_info
ulong	total_ha = 0
ulong	total_ha_2pc = 0
ulong	savepoint_alloc_size = 0
const char *	ha_row_type []
const char *	tx_isolation_names []
TYPELIB	tx_isolation_typelib
const char *	mysqld_system_database = "mysql"
st_system_tablename	mysqld_system_tables []

Detailed Description

Handler-calling-functions.

Define Documentation

#define AUTO_INC_DEFAULT_NB_ROWS 1

Update the auto_increment field if necessary.

Updates columns with type NEXT_NUMBER if:

If column value is set to NULL (in which case auto_increment_field_not_null is 0)
If column is set to 0 and (sql_mode & MODE_NO_AUTO_VALUE_ON_ZERO) is not set. In the future we will only set NEXT_NUMBER fields if one sets them to NULL (or they are not included in the insert list).

In those cases, we check if the currently reserved interval still has values we have not used. If yes, we pick the smallest one and use it. Otherwise:

If a list of intervals has been provided to the statement via SET INSERT_ID or via an Intvar_log_event (in a replication slave), we pick the first unused interval from this list, consider it as reserved.

Otherwise we set the column for the first row to the value next_insert_id(get_auto_increment(column))) which is usually max-used-column-value+1. We call get_auto_increment() for the first row in a multi-row statement. get_auto_increment() will tell us the interval of values it reserved for us.

In both cases, for the following rows we use those reserved values without calling the handler again (we just progress in the interval, computing each new value from the previous one). Until we have exhausted them, then we either take the next provided interval or call get_auto_increment() again to reserve a new interval.

In both cases, the reserved intervals are remembered in thd->auto_inc_intervals_in_cur_stmt_for_binlog if statement-based binlogging; the last reserved interval is remembered in auto_inc_interval_for_cur_row. The number of reserved intervals is remembered in auto_inc_intervals_count. It differs from the number of elements in thd->auto_inc_intervals_in_cur_stmt_for_binlog() because the latter list is cumulative over all statements forming one binlog event (when stored functions and triggers are used), and collapses two contiguous intervals in one (see its append() method).

The idea is that generated auto_increment values are predictable and independent of the column values in the table. This is needed to be able to replicate into a table that already has rows with a higher auto-increment value than the one that is inserted.

After we have already generated an auto-increment number and the user inserts a column with a higher value than the last used one, we will start counting from the inserted value.

This function's "outputs" are: the table's auto_increment field is filled with a value, thd->next_insert_id is filled with the value to use for the next row, if a value was autogenerated for the current row it is stored in thd->insert_id_for_cur_row, if get_auto_increment() was called thd->auto_inc_interval_for_cur_row is modified, if that interval is not present in thd->auto_inc_intervals_in_cur_stmt_for_binlog it is added to this list.

Todo:: Replace all references to "next number" or NEXT_NUMBER to "auto_increment", everywhere (see below: there is table->auto_increment_field_not_null, and there also exists table->next_number_field, it's not consistent).

Return values:

0	ok
HA_ERR_AUTOINC_READ_FAILED	get_auto_increment() was called and returned ~(ulonglong) 0
HA_ERR_AUTOINC_ERANGE	storing value in field caused strict mode failure.

Function Documentation

ulonglong compute_next_insert_id	(	ulonglong	nr,
		struct system_variables *	variables
	)		`[inline]`

Generate the next auto-increment number based on increment and offset. Computes the lowest number

strictly greater than "nr"
of the form: auto_increment_offset + N * auto_increment_increment

If overflow happened then return MAX_ULONGLONG value as an indication of overflow. In most cases increment= offset= 1, in which case we get:
```
1,2,3,4,5,... 
```
If increment=10 and offset=5 and previous number is 1, we get:
```
1,5,15,25,35,... 
```

const char* get_canonical_filename	(	handler *	file,
		const char *	path,
		char *	tmp_path
	)

make canonical filename

Parameters:

[in]	file	table handler
[in]	path	original path
[out]	tmp_path	buffer for canonized path

Lower case db name and table name path parts for non file based tables when lower_case_table_names is 2 (store as is, compare in lower case). Filesystem path prefix (mysql_data_home or tmpdir) is left intact.

Note:: tmp_path may be left intact if no conversion was performed.

Return values:

canonized path

Todo:: This may be done more efficiently when table path gets built. Convert this function to something like ASSERT_CANONICAL_FILENAME.

void get_sweep_read_cost	(	TABLE *	table,
		ha_rows	nrows,
		bool	interrupted,
		Cost_estimate *	cost
	)

Get cost of reading nrows table records in a "disk sweep"

A disk sweep read is a sequence of handler->rnd_pos(rowid) calls that are made for an ordered sequence of rowids.

We assume hard disk IO. The read is performed as follows:

1. The disk head is moved to the needed cylinder
2. The controller waits for the plate to rotate
3. The data is transferred

Time to do #3 is insignificant compared to #2+#1.

Time to move the disk head is proportional to head travel distance.

Time to wait for the plate to rotate depends on whether the disk head was moved or not.

If disk head wasn't moved, the wait time is proportional to distance between the previous block and the block we're reading.

If the head was moved, we don't know how much we'll need to wait for the plate to rotate. We assume the wait time to be a variate with a mean of 0.5 of full rotation time.

Our cost units are "random disk seeks". The cost of random disk seek is actually not a constant, it depends on the range of cylinders we're going to access. We make it constant by introducing a fuzzy concept of "typical datafile length" (it's fuzzy as it's hard to tell whether it should include index file, temp.tables etc). Then random seek cost is:

1 = half_rotation_cost + move_cost * 1/3 * typical_data_file_length

We define half_rotation_cost as DISK_SEEK_BASE_COST=0.9.

Parameters:

table	Table to be accessed
nrows	Number of rows to retrieve
interrupted	TRUE <=> Assume that the disk sweep will be interrupted by other disk IO. FALSE - otherwise.
cost	OUT The cost.

int ha_change_key_cache	(	KEY_CACHE *	old_key_cache,
		KEY_CACHE *	new_key_cache
	)

Move all tables from one key cache to another one.

int ha_change_key_cache_param ( KEY_CACHE * key_cache )

Change parameters for key cache (like size)

bool ha_check_if_supported_system_table	(	handlerton *	hton,
		const char *	db,
		const char *	table_name
	)

Check if a given table is a system table.

The primary purpose of introducing this function is to stop system tables from being created in or moved to undesired storage engines.

Todo:: There is another function called is_system_table_name() used by get_table_category(), which is used to set TABLE_SHARE table_category. It checks only a subset of table name like proc, event and time*. We cannot use below function in get_table_category(), as that affects locking mechanism. If we need to unify these functions, we need to fix locking issues generated.

Parameters:

hton	Handlerton of new engine.
db	Database name.
table_name	Table name to be checked.

Returns:: Operation status

Return values:

true	If the table name is a valid system table or if it's a valid user table.
false	If the table name is a system table name and does not belong to engine specified in the command.

bool ha_check_if_table_exists	(	THD *	thd,
		const char *	db,
		const char *	name,
		bool *	exists
	)

Try to find a table in a storage engine.

Parameters:

	db	Normalized table schema name
	name	Normalized table name.
[out]	exists	Only valid if the function succeeded.

Return values:

TRUE	An error is found
FALSE	Success, check *exists

handlerton* ha_checktype	(	THD *	thd,
		enum legacy_db_type	database_type,
		bool	no_substitute,
		bool	report_error
	)

Use other database handler if database handler is not compiled in.

void ha_close_connection ( THD * thd )

Note:: don't bother to rollback here, it's done already

int ha_commit_low	(	THD *	thd,
		bool	all,
		bool	run_after_commit
	)

Commit the session's outstanding transaction.

Precondition:: thd->transaction.flags.commit_low == true

Postcondition:: thd->transaction.flags.commit_low == false

Note:: This function does not care about global read lock; the caller should.

Parameters:

[in]	all	Is set in case of explicit commit (COMMIT statement), or implicit commit issued by DDL. Is not set when called at the end of statement, even if autocommit=1.
[in]	run_after_commit	True by default, otherwise, does not execute the after_commit hook in the function.

int ha_commit_trans	(	THD *	thd,
		bool	all,
		bool	ignore_global_read_lock
	)

Parameters:

[in] ignore_global_read_lock Allow commit to complete even if a global read lock is active. This can be used to allow changes to internal tables (e.g. slave status tables).

Return values:

0	ok
1	transaction was rolled back
2	error during commit, data may be inconsistent

Todo:: Since we don't support nested statement transactions in 5.0, we can't commit or rollback stmt transactions while we are inside stored functions or triggers. So we simply do nothing now. TODO: This should be fixed in later ( >= 5.1) releases.

int ha_create_table	(	THD *	thd,
		const char *	path,
		const char *	db,
		const char *	table_name,
		HA_CREATE_INFO *	create_info,
		bool	update_create_info,
		bool	is_temp_table
	)

Initiates table-file and calls appropriate database-creator.

Return values:

0	ok
1	error

int ha_create_table_from_engine	(	THD *	thd,
		const char *	db,
		const char *	name
	)

Try to discover table from engine.

Note:: If found, write the frm file to disk.

Return values:

-1	Table did not exist
0	Table created ok
> 0	Error, table existed but could not be created

handlerton* ha_default_handlerton ( THD * thd )

Return the default storage engine handlerton used for non-temp tables for thread.

SYNOPSIS ha_default_handlerton(thd) thd current thread

RETURN pointer to handlerton

handlerton* ha_default_temp_handlerton ( THD * thd )

Return the default storage engine handlerton used for explicitly created temp tables for a thread.

SYNOPSIS ha_default_temp_handlerton(thd) thd current thread

RETURN pointer to handlerton

int ha_enable_transaction	(	THD *	thd,
		bool	on
	)

Tell the storage engine that it is allowed to "disable transaction" in the handler. It is a hint that ACID is not required - it is used in NDB for ALTER TABLE, for example, when data are copied to temporary table. A storage engine may treat this hint any way it likes. NDB for example starts to commit every now and then automatically. This hint can be safely ignored.

C_MODE_END int ha_init_errors ( void )

Register handler error messages for use with my_error().

Return values:

0	OK
!=0	Error

int ha_init_key_cache	(	const char *	name,
		KEY_CACHE *	key_cache
	)

Init a key cache if it has not been initialized before.

int ha_prepare ( THD * thd )

Return values:

0	ok
1	error, transaction was rolled back

int ha_release_temporary_latches ( THD * thd )

This function should be called when MySQL sends rows of a SELECT result set or the EOF mark to the client. It releases a possible adaptive hash index S-latch held by thd in InnoDB and also releases a possible InnoDB query FIFO ticket to enter InnoDB. To save CPU time, InnoDB allows a thd to keep them over several calls of the InnoDB handler interface when a join is executed. But when we let the control to pass to the client they have to be released because if the application program uses mysql_use_result(), it may deadlock on the S-latch if the application on another connection performs another SQL query. In MySQL-4.1 this is even more important because there a connection can have several SELECT queries open at the same time.

Parameters:

thd	the thread handle of the current connection

Returns:: always 0

int ha_resize_key_cache ( KEY_CACHE * key_cache )

Resize key cache.

plugin_ref ha_resolve_by_name	(	THD *	thd,
		const LEX_STRING *	name,
		bool	is_temp_table
	)

Return the storage engine handlerton for the supplied name.

SYNOPSIS ha_resolve_by_name(thd, name) thd current thread name name of storage engine

RETURN pointer to storage engine plugin handle

bool ha_rollback_to_savepoint_can_release_mdl ( THD * thd )

Check if all storage engines used in transaction agree that after rollback to savepoint it is safe to release MDL locks acquired after savepoint creation.

Parameters:

thd	The client thread that executes the transaction.

Returns:: true - It is safe to release MDL locks. false - If it is not.

Checking whether it is safe to release metadata locks after rollback to savepoint in all the storage engines that are part of the transaction.

int ha_savepoint	(	THD *	thd,
		SAVEPOINT *	sv
	)

Note:: according to the sql standard (ISO/IEC 9075-2:2003) section "4.33.4 SQL-statements and transaction states", SAVEPOINT is *not* transaction-initiating SQL-statement

bool key_uses_partial_cols	(	TABLE *	table,
		uint	keyno
	)

Check if key has partially-covered columns

We can't use DS-MRR to perform range scans when the ranges are over partially-covered keys, because we'll not have full key part values (we'll have their prefixes from the index) and will not be able to check if we've reached the end of the range.

Parameters:

keyno Key to check

Todo:: Allow use of DS-MRR in cases where the index has partially-covered components but they are not used for scanning.

Return values:

TRUE	Yes
FALSE	No

bool mysql_xa_recover ( THD * thd )

Return the list of XID's to a client, the same way SHOW commands do.

Note:: I didn't find in XA specs that an RM cannot return the same XID twice, so mysql_xa_recover does not filter XID's to ensure uniqueness. It can be easily fixed later, if necessary.

ulonglong prev_insert_id	(	ulonglong	nr,
		struct system_variables *	variables
	)		`[inline]`

Computes the largest number X:

smaller than or equal to "nr"
of the form: auto_increment_offset + N * auto_increment_increment where N>=0.

SYNOPSIS prev_insert_id nr Number to "round down" variables variables struct containing auto_increment_increment and auto_increment_offset

RETURN The number X if it exists, "nr" otherwise.

void print_keydup_error	(	TABLE *	table,
		KEY *	key,
		const char *	msg,
		myf	errflag
	)

Construct and emit duplicate key error message using information from table's record buffer.

Parameters:

table	TABLE object which record buffer should be used as source for column values.
key	Key description.
msg	Error message template to which key value should be added.
errflag	Flags for my_error() call.

void print_keydup_error	(	TABLE *	table,
		KEY *	key,
		myf	errflag
	)

Construct and emit duplicate key error message using information from table's record buffer.

See also:: print_keydup_error(table, key, msg, errflag).

void trans_register_ha	(	THD *	thd,
		bool	all,
		handlerton *	ht_arg
	)

Transaction handling in the server ==================================

In each client connection, MySQL maintains two transactional states:

a statement transaction,
a standard, also called normal transaction.

Historical note --------------- "Statement transaction" is a non-standard term that comes from the times when MySQL supported BerkeleyDB storage engine.

First of all, it should be said that in BerkeleyDB auto-commit mode auto-commits operations that are atomic to the storage engine itself, such as a write of a record, and are too high-granular to be atomic from the application perspective (MySQL). One SQL statement could involve many BerkeleyDB auto-committed operations and thus BerkeleyDB auto-commit was of little use to MySQL.

Secondly, instead of SQL standard savepoints, BerkeleyDB provided the concept of "nested transactions". In a nutshell, transactions could be arbitrarily nested, but when the parent transaction was committed or aborted, all its child (nested) transactions were handled committed or aborted as well. Commit of a nested transaction, in turn, made its changes visible, but not durable: it destroyed the nested transaction, all its changes would become available to the parent and currently active nested transactions of this parent.

So the mechanism of nested transactions was employed to provide "all or nothing" guarantee of SQL statements required by the standard. A nested transaction would be created at start of each SQL statement, and destroyed (committed or aborted) at statement end. Such nested transaction was internally referred to as a "statement transaction" and gave birth to the term.

(Historical note ends)

Since then a statement transaction is started for each statement that accesses transactional tables or uses the binary log. If the statement succeeds, the statement transaction is committed. If the statement fails, the transaction is rolled back. Commits of statement transactions are not durable -- each such transaction is nested in the normal transaction, and if the normal transaction is rolled back, the effects of all enclosed statement transactions are undone as well. Technically, a statement transaction can be viewed as a savepoint which is maintained automatically in order to make effects of one statement atomic.

The normal transaction is started by the user and is ended usually upon a user request as well. The normal transaction encloses transactions of all statements issued between its beginning and its end. In autocommit mode, the normal transaction is equivalent to the statement transaction.

Since MySQL supports PSEA (pluggable storage engine architecture), more than one transactional engine can be active at a time. Hence transactions, from the server point of view, are always distributed. In particular, transactional state is maintained independently for each engine. In order to commit a transaction the two phase commit protocol is employed.

Not all statements are executed in context of a transaction. Administrative and status information statements do not modify engine data, and thus do not start a statement transaction and also have no effect on the normal transaction. Examples of such statements are SHOW STATUS and RESET SLAVE.

Similarly DDL statements are not transactional, and therefore a transaction is [almost] never started for a DDL statement. The difference between a DDL statement and a purely administrative statement though is that a DDL statement always commits the current transaction before proceeding, if there is any.

At last, SQL statements that work with non-transactional engines also have no effect on the transaction state of the connection. Even though they are written to the binary log, and the binary log is, overall, transactional, the writes are done in "write-through" mode, directly to the binlog file, followed with a OS cache sync, in other words, bypassing the binlog undo log (translog). They do not commit the current normal transaction. A failure of a statement that uses non-transactional tables would cause a rollback of the statement transaction, but in case there no non-transactional tables are used, no statement transaction is started.

Data layout -----------

The server stores its transaction-related data in thd->transaction. This structure has two members of type THD_TRANS. These members correspond to the statement and normal transactions respectively:

thd->transaction.stmt contains a list of engines that are participating in the given statement
thd->transaction.all contains a list of engines that have participated in any of the statement transactions started within the context of the normal transaction. Each element of the list contains a pointer to the storage engine, engine-specific transactional data, and engine-specific transaction flags.

In autocommit mode thd->transaction.all is empty. Instead, data of thd->transaction.stmt is used to commit/rollback the normal transaction.

The list of registered engines has a few important properties:

no engine is registered in the list twice
engines are present in the list in a reverse temporal order -- new participants are always added to the beginning of the list.

Transaction life cycle ----------------------

When a new connection is established, thd->transaction members are initialized to an empty state. If a statement uses any tables, all affected engines are registered in the statement engine list. In non-autocommit mode, the same engines are registered in the normal transaction list. At the end of the statement, the server issues a commit or a roll back for all engines in the statement list. At this point transaction flags of an engine, if any, are propagated from the statement list to the list of the normal transaction. When commit/rollback is finished, the statement list is cleared. It will be filled in again by the next statement, and emptied again at the next statement's end.

The normal transaction is committed in a similar way (by going over all engines in thd->transaction.all list) but at different times:

when a COMMIT SQL statement is issued by the user
implicitly, by the server, at the beginning of a DDL statement or SET AUTOCOMMIT={0|1} statement.

The normal transaction can be rolled back as well:

if the user has requested so, by issuing ROLLBACK SQL statement
if one of the storage engines requested a rollback by setting thd->transaction_rollback_request. This may happen in case, e.g., when the transaction in the engine was chosen a victim of the internal deadlock resolution algorithm and rolled back internally. When such a situation happens, there is little the server can do and the only option is to rollback transactions in all other participating engines. In this case the rollback is accompanied by an error sent to the user.

As follows from the use cases above, the normal transaction is never committed when there is an outstanding statement transaction. In most cases there is no conflict, since commits of the normal transaction are issued by a stand-alone administrative or DDL statement, thus no outstanding statement transaction of the previous statement exists. Besides, all statements that manipulate with the normal transaction are prohibited in stored functions and triggers, therefore no conflicting situation can occur in a sub-statement either. The remaining rare cases when the server explicitly has to commit the statement transaction prior to committing the normal one cover error-handling scenarios (see for example SQLCOM_LOCK_TABLES).

When committing a statement or a normal transaction, the server either uses the two-phase commit protocol, or issues a commit in each engine independently. The two-phase commit protocol is used only if:

all participating engines support two-phase commit (provide handlerton::prepare PSEA API call) and
transactions in at least two engines modify data (i.e. are not read-only).

Note that the two phase commit is used for statement transactions, even though they are not durable anyway. This is done to ensure logical consistency of data in a multiple- engine transaction. For example, imagine that some day MySQL supports unique constraint checks deferred till the end of statement. In such case a commit in one of the engines may yield ER_DUP_KEY, and MySQL should be able to gracefully abort statement transactions of other participants.

After the normal transaction has been committed, thd->transaction.all list is cleared.

When a connection is closed, the current normal transaction, if any, is rolled back.

Roles and responsibilities --------------------------

The server has no way to know that an engine participates in the statement and a transaction has been started in it unless the engine says so. Thus, in order to be a part of a transaction, the engine must "register" itself. This is done by invoking trans_register_ha() server call. Normally the engine registers itself whenever handler::external_lock() is called. trans_register_ha() can be invoked many times: if an engine is already registered, the call does nothing. In case autocommit is not set, the engine must register itself twice -- both in the statement list and in the normal transaction list. In which list to register is a parameter of trans_register_ha().

Note, that although the registration interface in itself is fairly clear, the current usage practice often leads to undesired effects. E.g. since a call to trans_register_ha() in most engines is embedded into implementation of handler::external_lock(), some DDL statements start a transaction (at least from the server point of view) even though they are not expected to. E.g. CREATE TABLE does not start a transaction, since handler::external_lock() is never called during CREATE TABLE. But CREATE TABLE ... SELECT does, since handler::external_lock() is called for the table that is being selected from. This has no practical effects currently, but must be kept in mind nevertheless.

Once an engine is registered, the server will do the rest of the work.

During statement execution, whenever any of data-modifying PSEA API methods is used, e.g. handler::write_row() or handler::update_row(), the read-write flag is raised in the statement transaction for the involved engine. Currently All PSEA calls are "traced", and the data can not be changed in a way other than issuing a PSEA call. Important: unless this invariant is preserved the server will not know that a transaction in a given engine is read-write and will not involve the two-phase commit protocol!

At the end of a statement, server call trans_commit_stmt is invoked. This call in turn invokes handlerton::prepare() for every involved engine. Prepare is followed by a call to handlerton::commit_one_phase() If a one-phase commit will suffice, handlerton::prepare() is not invoked and the server only calls handlerton::commit_one_phase(). At statement commit, the statement-related read-write engine flag is propagated to the corresponding flag in the normal transaction. When the commit is complete, the list of registered engines is cleared.

Rollback is handled in a similar fashion.

Additional notes on DDL and the normal transaction. ---------------------------------------------------

DDLs and operations with non-transactional engines do not "register" in thd->transaction lists, and thus do not modify the transaction state. Besides, each DDL in MySQL is prefixed with an implicit normal transaction commit (a call to trans_commit_implicit()), and thus leaves nothing to modify. However, as it has been pointed out with CREATE TABLE .. SELECT, some DDL statements can start a *new* transaction.

Behaviour of the server in this case is currently badly defined. DDL statements use a form of "semantic" logging to maintain atomicity: if CREATE TABLE .. SELECT failed, the newly created table is deleted. In addition, some DDL statements issue interim transaction commits: e.g. ALTER TABLE issues a commit after data is copied from the original table to the internal temporary table. Other statements, e.g. CREATE TABLE ... SELECT do not always commit after itself. And finally there is a group of DDL statements such as RENAME/DROP TABLE that doesn't start a new transaction and doesn't commit.

This diversity makes it hard to say what will happen if by chance a stored function is invoked during a DDL -- whether any modifications it makes will be committed or not is not clear. Fortunately, SQL grammar of few DDLs allows invocation of a stored function.

A consistent behaviour is perhaps to always commit the normal transaction after all DDLs, just like the statement transaction is always committed at the end of all statements.

Register a storage engine for a transaction.

Every storage engine MUST call this function when it starts a transaction or a statement (that is it must be called both for the "beginning of transaction" and "beginning of statement"). Only storage engines registered for the transaction/statement will know when to commit/rollback it.

Note:: trans_register_ha is idempotent - storage engine may register many times per transaction.

Variable Documentation

KEY_CREATE_INFO default_key_create_info

Initial value:

  { HA_KEY_ALG_UNDEF, 0, {NullS, 0}, {NullS, 0}, true }

const char* ha_row_type[]

Initial value:

 {
  "", "FIXED", "DYNAMIC", "COMPRESSED", "REDUNDANT", "COMPACT",
   "?",
  "?","?","?"
}

const char* mysqld_system_database = "mysql"

Database name that holds most of the mysqld system tables. Current code assumes that there exists only one specific "database name" designated as the system database.

st_system_tablename mysqld_system_tables[]

Initial value:

 {
  {mysqld_system_database, "db"},
  {mysqld_system_database, "user"},
  {mysqld_system_database, "host"},
  {mysqld_system_database, "func"},
  {mysqld_system_database, "proc"},
  {mysqld_system_database, "event"},
  {mysqld_system_database, "plugin"},
  {mysqld_system_database, "servers"},
  {mysqld_system_database, "procs_priv"},
  {mysqld_system_database, "tables_priv"},
  {mysqld_system_database, "proxies_priv"},
  {mysqld_system_database, "columns_priv"},
  {mysqld_system_database, "time_zone"},
  {mysqld_system_database, "time_zone_name"},
  {mysqld_system_database, "time_zone_leap_second"},
  {mysqld_system_database, "time_zone_transition"},
  {mysqld_system_database, "time_zone_transition_type"},
  {mysqld_system_database, "help_category"},
  {mysqld_system_database, "help_keyword"},
  {mysqld_system_database, "help_relation"},
  {mysqld_system_database, "help_topic"},
  {(const char *)NULL, (const char *)NULL} 
}

const char* tx_isolation_names[]

Initial value:

{ "READ-UNCOMMITTED", "READ-COMMITTED", "REPEATABLE-READ", "SERIALIZABLE",
  NullS}

TYPELIB tx_isolation_typelib

Initial value:

 {array_elements(tx_isolation_names)-1,"",
                               tx_isolation_names, NULL}

Classes

Defines

Typedefs

Functions

Variables

Detailed Description

Define Documentation

Function Documentation

Variable Documentation