mysql5/mysql-5.7.27/sql/partitioning/partition_handler.cc

/*
   Copyright (c) 2005, 2018, Oracle and/or its affiliates. All rights reserved.

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License
   as published by the Free Software Foundation; version 2 of
   the License.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA
*/

#include "table.h"                           // TABLE_SHARE
#include "partition_info.h"                  // NOT_A_PARTITION_ID
#include "sql_partition.h"          // LIST_PART_ENTRY, part_id_range
#include "partition_handler.h"
#include "log.h"                             // sql_print_error
#include "key.h"                             // key_rec_cmp
#include "sql_class.h"                       // THD
#include "myisam.h"                          // MI_MAX_MSG_BUF

// In sql_class.cc:
extern "C" int thd_binlog_format(const MYSQL_THD thd);

/**
  Operation names for the enum_part_operation.
  NOTE(review): presumably indexed by the enum value, so the order of the
  strings must follow the enum declaration - confirm against the header.
*/
static const char *opt_op_name[]= {"optimize", "analyze", "check", "repair",
                                   "assign_to_keycache", "preload_keys"};

/*
  Memory instrumentation keys used for allocations made in this file.
  They are declared unconditionally, but only listed for registration
  (all_partitioning_memory) when built with HAVE_PSI_INTERFACE.
*/
static PSI_memory_key key_memory_Partition_share;
static PSI_memory_key key_memory_partition_sort_buffer;
static PSI_memory_key key_memory_Partition_admin;
#ifdef HAVE_PSI_INTERFACE
/* Instrumentation key passed to mysql_mutex_init() for auto_inc_mutex. */
PSI_mutex_key key_partition_auto_inc_mutex;
/* Tables handed to the registration calls in partitioning_init(). */
static PSI_memory_info all_partitioning_memory[]=
{ { &key_memory_Partition_share, "Partition_share", 0},
  { &key_memory_partition_sort_buffer, "partition_sort_buffer", 0},
  { &key_memory_Partition_admin, "Partition_admin", 0} };
/*
  NOTE(review): the instrument name below is spelled "Partiton_share".
  It is a runtime-visible string, so it is intentionally left unchanged here.
*/
static PSI_mutex_info all_partitioning_mutex[]=
{ { &key_partition_auto_inc_mutex, "Partiton_share::auto_inc_mutex", 0} };
#endif

/**
  Register the partitioning memory and mutex instrumentation keys under
  the "sql" category.

  Does nothing when built without HAVE_PSI_INTERFACE.
*/
void partitioning_init()
{
#ifdef HAVE_PSI_INTERFACE
  int num_keys;

  /* Memory keys first ... */
  num_keys= array_elements(all_partitioning_memory);
  mysql_memory_register("sql", all_partitioning_memory, num_keys);

  /* ... then the mutex keys. */
  num_keys= array_elements(all_partitioning_mutex);
  mysql_mutex_register("sql", all_partitioning_mutex, num_keys);
#endif
}


/*
  Implementation of Partition_share class.
*/

/**
  Create an empty share: no auto increment mutex, no cached auto increment
  value and no partition name hash. These are set up later by
  init_auto_inc_mutex() and populate_partition_name_hash().
*/
Partition_share::Partition_share()
  : auto_inc_initialized(false),
    auto_inc_mutex(NULL),
    next_auto_inc_val(0),
    partition_name_hash_initialized(false),
    partition_names(NULL)
{
}

/**
  Release what the share owns: the auto increment mutex, the array of
  partition name pointers and the partition name hash.
*/
Partition_share::~Partition_share()
{
  /* The mutex is heap allocated in init_auto_inc_mutex(). */
  if (auto_inc_mutex != NULL)
  {
    mysql_mutex_destroy(auto_inc_mutex);
    my_free(auto_inc_mutex);
  }

  /* Only the pointer array is released here, not the names themselves. */
  if (partition_names != NULL)
  {
    my_free(partition_names);
  }

  /* The hash is only valid once it has been fully populated. */
  if (partition_name_hash_initialized)
  {
    my_hash_free(&partition_name_hash);
  }
}


/**
  Allocate and initialize the auto increment mutex of the share.

  Must only be called while no mutex exists yet. For non-temporary tables
  debug builds verify that the caller owns TABLE_SHARE::LOCK_ha_data.

  @param table_share  Table share the partition share belongs to.

  @return Operation status.
    @retval true  Failure (out of memory).
    @retval false Success.
*/

bool Partition_share::init_auto_inc_mutex(TABLE_SHARE *table_share)
{
  DBUG_ENTER("Partition_share::init_auto_inc_mutex");
  DBUG_ASSERT(!auto_inc_mutex);
#ifndef DBUG_OFF
  if (table_share->tmp_table == NO_TMP_TABLE)
  {
    mysql_mutex_assert_owner(&table_share->LOCK_ha_data);
  }
#endif /* DBUG_OFF */

  void *mutex_mem= my_malloc(key_memory_Partition_share,
                             sizeof(*auto_inc_mutex),
                             MYF(MY_WME));
  auto_inc_mutex= static_cast<mysql_mutex_t*>(mutex_mem);
  if (auto_inc_mutex == NULL)
  {
    DBUG_RETURN(true);
  }

  mysql_mutex_init(key_partition_auto_inc_mutex,
                   auto_inc_mutex,
                   MY_MUTEX_INIT_FAST);
  DBUG_RETURN(false);
}


/**
  Give back reserved auto increment values that were never used.

  The shared next_auto_inc_val is lowered to next_insert_id only when all
  of the conditions checked below hold; otherwise nothing is changed.

  @param thd             Thread.
  @param table_share     Table Share
  @param next_insert_id  Next insert id (first non used auto inc value).
  @param max_reserved    End of reserved auto inc range.
*/
void
Partition_share::release_auto_inc_if_possible(THD *thd, TABLE_SHARE *table_share,
                                              const ulonglong next_insert_id,
                                              const ulonglong max_reserved)
{
  DBUG_ASSERT(auto_inc_mutex);

#ifndef DBUG_OFF
  if (table_share->tmp_table == NO_TMP_TABLE)
  {
    mysql_mutex_assert_owner(auto_inc_mutex);
  }
#endif /* DBUG_OFF */

  /* (1) Nothing to release unless the current value is below the shared one. */
  if (next_insert_id >= next_auto_inc_val)
  {
    return;
  }

  /* (2) The shared value must lie within the range reserved by this thread. */
  if (max_reserved < next_auto_inc_val)
  {
    return;
  }

  /*
    (3) Forced/non generated values from 'SET INSERT_ID = forced_val'
    prevent lowering the value.
  */
  if (thd->auto_inc_intervals_forced.maximum() >= next_insert_id)
  {
    return;
  }

  next_auto_inc_val= next_insert_id;
}


/**
  Key extraction callback for the partition name hash.

  @param       part      Hash entry holding the name and its length
  @param[out]  length    Length of the name
  @param       not_used  Unused

  @return Partition name
*/

static uchar *get_part_name_from_def(PART_NAME_DEF *part,
                                     size_t *length,
                                     my_bool not_used MY_ATTRIBUTE((unused)))
{
  uchar *key= part->partition_name;
  *length= part->length;
  return key;
}


/**
  Populate the partition_name_hash in part_share.

  Every partition name is inserted, and for subpartitioned tables every
  subpartition name as well. After that partition_names[] is filled so
  that it maps a partition id to the name of the lowest level
  (sub)partition with that id.

  TABLE_SHARE::LOCK_ha_data must be held by the caller (verified in debug
  builds for non-temporary tables).

  @param part_info  Partition info to read the names from.

  @return Operation status.
    @retval true   Failure (out of memory or hash insert failure).
    @retval false  Success, including when the hash was already populated.
*/

bool Partition_share::populate_partition_name_hash(partition_info *part_info)
{
  uint tot_names;
  uint num_subparts= part_info->num_subparts;
  DBUG_ENTER("Partition_share::populate_partition_name_hash");
  DBUG_ASSERT(!part_info->is_sub_partitioned() || num_subparts);

  if (num_subparts == 0)
  {
    num_subparts= 1;
  }

  /*
    TABLE_SHARE::LOCK_ha_data must have been locked before calling this
    function. This ensures only one thread/table instance will execute this.
  */

#ifndef DBUG_OFF
  if (part_info->table->s->tmp_table == NO_TMP_TABLE)
  {
    mysql_mutex_assert_owner(&part_info->table->s->LOCK_ha_data);
  }
#endif
  if (partition_name_hash_initialized)
  {
    DBUG_RETURN(false);
  }

  /* Expected number of hash entries; used as the initial hash size. */
  tot_names= part_info->num_parts;
  if (part_info->is_sub_partitioned())
  {
    tot_names+= part_info->num_parts * num_subparts;
  }

  /*
    Zero-filled, so that an id whose name was not inserted below yields a
    NULL name instead of an uninitialized pointer.
  */
  partition_names= static_cast<const uchar**>(my_malloc(
                                              key_memory_Partition_share,
                                              part_info->get_tot_partitions() *
                                                sizeof(*partition_names),
                                              MYF(MY_WME | MY_ZEROFILL)));
  if (!partition_names)
  {
    DBUG_RETURN(true);
  }
  if (my_hash_init(&partition_name_hash,
                   system_charset_info, tot_names, 0, 0,
                   (my_hash_get_key) get_part_name_from_def,
                   my_free, HASH_UNIQUE,
                   key_memory_Partition_share))
  {
    my_free(partition_names);
    partition_names= NULL;
    DBUG_RETURN(true);
  }

  List_iterator<partition_element> part_it(part_info->partitions);
  uint i= 0;
  do
  {
    partition_element *part_elem= part_it++;
    DBUG_ASSERT(part_elem->part_state == PART_NORMAL);
    if (part_elem->part_state == PART_NORMAL)
    {
      /* The partition itself is stored with the id of its first subpart. */
      if (insert_partition_name_in_hash(part_elem->partition_name,
                                        i * num_subparts,
                                        false))
        goto err;
      if (part_info->is_sub_partitioned())
      {
        List_iterator<partition_element>
                                    subpart_it(part_elem->subpartitions);
        partition_element *sub_elem;
        uint j= 0;
        do
        {
          sub_elem= subpart_it++;
          if (insert_partition_name_in_hash(sub_elem->partition_name,
                                            i * num_subparts + j, true))
            goto err;

        } while (++j < num_subparts);
      }
    }
  } while (++i < part_info->num_parts);

  /*
    Walk the entries that were really inserted. Using the hash's own record
    count instead of tot_names avoids reading past the populated entries
    if a partition was skipped above (non PART_NORMAL in a release build).
  */
  for (ulong idx= 0; idx < partition_name_hash.records; idx++)
  {
    PART_NAME_DEF *part_def;
    part_def= reinterpret_cast<PART_NAME_DEF*>(
                              my_hash_element(&partition_name_hash, idx));
    /* Only names on the lowest partitioning level are indexed by id. */
    if (part_def->is_subpart == part_info->is_sub_partitioned())
    {
      partition_names[part_def->part_id]= part_def->partition_name;
    }
  }
  partition_name_hash_initialized= true;

  DBUG_RETURN(false);
err:
  /* The hash was initialized above, so it must be released here. */
  my_hash_free(&partition_name_hash);
  my_free(partition_names);
  partition_names= NULL;

  DBUG_RETURN(true);
}


/**
  Insert a partition name in the partition_name_hash.

  @param name        Name of partition
  @param part_id     Partition id (number)
  @param is_subpart  Set if the name belongs to a subpartition

  @return Operation status
    @retval true   Failure
    @retval false  Success
*/

bool Partition_share::insert_partition_name_in_hash(const char *name,
                                                    uint part_id,
                                                    bool is_subpart)
{
  PART_NAME_DEF *part_def;
  uchar *part_name;
  uint part_name_length;
  DBUG_ENTER("Partition_share::insert_partition_name_in_hash");
  /*
    Calculate and store the length here, to avoid doing it when
    searching the hash.
  */
  part_name_length= static_cast<uint>(strlen(name));
  /*
    Must use memory that lives as long as table_share.
    Freed in the Partition_share destructor.
    Since we use my_multi_malloc, then my_free(part_def) will also free
    part_name, as a part of my_hash_free.
  */
  if (!my_multi_malloc(key_memory_Partition_share,
                       MY_WME,
                       &part_def, sizeof(PART_NAME_DEF),
                       &part_name, part_name_length + 1,
                       NULL))
  {
    DBUG_RETURN(true);
  }
  memcpy(part_name, name, part_name_length + 1);
  part_def->partition_name= part_name;
  part_def->length= part_name_length;
  part_def->part_id= part_id;
  part_def->is_subpart= is_subpart;
  if (my_hash_insert(&partition_name_hash, (uchar *) part_def))
  {
    my_free(part_def);
    DBUG_RETURN(true);
  }
  DBUG_RETURN(false);
}


/**
  Look up the name stored for a partition id.

  @param part_id  Partition id; it is used as an index without any range
                  check, so the caller must pass a valid id.

  @return Partition name, or NULL if the name array has not been set up.
*/
const char *Partition_share::get_partition_name(size_t part_id) const
{
  const uchar *found_name= NULL;
  if (partition_names != NULL)
  {
    found_name= partition_names[part_id];
  }
  return reinterpret_cast<const char*>(found_name);
}
/*
  Implementation of Partition_helper class.
*/
/**
  Create a helper bound to the given handler.

  Only the handler pointer is stored; every other member listed below is
  value-initialized. Members not listed here are assigned later, for
  instance in open_partitioning().

  @param main_handler  Handler this helper works on behalf of.
*/
Partition_helper::Partition_helper(handler *main_handler)
  :
  m_handler(main_handler),
  m_part_info(),
  m_tot_parts(),
  m_last_part(),
  m_err_rec(),
  m_ordered(),
  m_ordered_scan_ongoing(),
  m_ordered_rec_buffer(),
  m_queue()
{}


/**
  Destructor. Releases nothing itself; debug builds only verify that the
  ordered record buffer and the key-not-found bitmap are no longer set.
*/
Partition_helper::~Partition_helper()
{
  DBUG_ASSERT(!m_ordered_rec_buffer);
  DBUG_ASSERT(!m_key_not_found_partitions.bitmap);
}


/**
  Remember which partition info the helper works on.

  To be called from Partition_handler.

  @param  part_info  Partition info to use.
  @param  early      True if called when part_info only created and parsed,
                     but not setup, checked or fixed.
*/
void Partition_helper::set_part_info_low(partition_info *part_info,
                                         bool early)
{
  /*
    ha_partition sets m_tot_parts from the .par file while creating the new
    handler, and this call can come before partition_default_handling(),
    in which case get_tot_partitions() may return zero. So the count is
    only taken from part_info while it is still unset, and then only on the
    first call or on a non-early call.
  */
  const bool count_unset= (m_tot_parts == 0);
  const bool may_read_count= (m_part_info == NULL || !early);
  if (count_unset && may_read_count)
  {
    m_tot_parts= part_info->get_tot_partitions();
  }

  m_part_info= part_info;
  m_is_sub_partitioned= part_info->is_sub_partitioned();
}

/**
  Prepare the partitioning helper for use once the table has been opened.

  Sets up the key-not-found bitmap and resets the scan, auto increment and
  ordering state to its defaults.

  @param part_share  Partitioning share (used for auto increment).

  @return Operation status.
    @retval false  Success.
    @retval true   Failure (the bitmap could not be initialized).
*/

bool Partition_helper::open_partitioning(Partition_share *part_share)
{
  m_table= get_table();
  DBUG_ASSERT(m_part_info == m_table->part_info);
  m_part_share= part_share;
  m_tot_parts= m_part_info->get_tot_partitions();

  /* One bit per partition. */
  if (bitmap_init(&m_key_not_found_partitions, NULL, m_tot_parts, false))
  {
    return true;
  }
  bitmap_clear_all(&m_key_not_found_partitions);
  m_key_not_found= false;

  m_is_sub_partitioned= m_part_info->is_sub_partitioned();

  /* Auto increment state. */
  m_auto_increment_lock= false;
  m_auto_increment_safe_stmt_log_lock= false;

  m_pkey_is_clustered= m_handler->primary_key_is_clustered();

  /* No scan is active yet. */
  m_part_spec.start_part= NOT_A_PARTITION_ID;
  m_part_spec.end_part= NOT_A_PARTITION_ID;
  m_index_scan_type= PARTITION_NO_INDEX_SCAN;
  m_start_key.key= NULL;
  m_start_key.length= 0;
  m_scan_value= 3;
  m_reverse_order= false;
  for (uint key_no= 0; key_no < 3; key_no++)
  {
    m_curr_key_info[key_no]= NULL;
  }
  m_top_entry= NO_CURRENT_PART_ID;
  m_ref_usage= REF_NOT_USED;

  /* The HEAP engine uses the record buffer length as record length. */
  legacy_db_type db_type= ha_legacy_type(m_part_info->default_engine_type);
  m_rec_length= (db_type == DB_TYPE_HEAP) ? m_table->s->rec_buff_length
                                          : m_table->s->reclength;
  DBUG_ASSERT(db_type != DB_TYPE_UNKNOWN);
  return false;
}


void Partition_helper::close_partitioning()
{
  bitmap_free(&m_key_not_found_partitions);
  DBUG_ASSERT(!m_ordered_rec_buffer);
  destroy_record_priority_queue();
}

/****************************************************************************
                MODULE change record
****************************************************************************/

/**
  Insert a row to the partitioned table.

  The steps are: update the auto increment value in the record (if the
  table has an active auto increment field), calculate the partition id
  from the record, verify that the partition is locked, write the row into
  that partition and finally raise the shared auto increment value if
  needed. The session sql_mode and the table's auto_increment_field_not_null
  flag are saved on entry and restored at the exit label.

  @param buf The row in MySQL Row Format. Must be record[0] of the table
             (asserted in debug builds).

  @return Operation status.
    @retval    0 Success
    @retval != 0 Error code
*/

int Partition_helper::ph_write_row(uchar *buf)
{
  uint32 part_id;
  int error;
  longlong func_value;
  bool have_auto_increment= m_table->next_number_field &&
                            buf == m_table->record[0];
  THD *thd= get_thd();
  /* Saved here, restored at the exit label. */
  sql_mode_t saved_sql_mode= thd->variables.sql_mode;
  bool saved_auto_inc_field_not_null= m_table->auto_increment_field_not_null;
#ifndef DBUG_OFF
  my_bitmap_map *old_map;
#endif /* DBUG_OFF */
  DBUG_ENTER("Partition_helper::ph_write_row");
  DBUG_ASSERT(buf == m_table->record[0]);

  /*
    If we have an auto_increment column and we are writing a changed row
    or a new row, then update the auto_increment value in the record.
  */
  if (have_auto_increment)
  {
    error= m_handler->update_auto_increment();

    /*
      If we have failed to set the auto-increment value for this row,
      it is highly likely that we will not be able to insert it into
      the correct partition. We must check and fail if necessary.
      Nothing has been modified yet, so a direct return is safe here.
    */
    if (error)
      DBUG_RETURN(error);

    /*
      Don't allow generation of auto_increment value by the partition's
      handler. If a partition's handler would change the value, then it
      might not match the partition any longer.
      This can occur if 'SET INSERT_ID = 0; INSERT (NULL)',
      So allow this by adding 'MODE_NO_AUTO_VALUE_ON_ZERO' to sql_mode.
      The partition's handler::next_insert_id must always be 0. Otherwise
      we need to forward release_auto_increment, or reset it for all
      partitions.
    */
    if (m_table->next_number_field->val_int() == 0)
    {
      m_table->auto_increment_field_not_null= TRUE;
      thd->variables.sql_mode|= MODE_NO_AUTO_VALUE_ON_ZERO;
    }
  }

#ifndef DBUG_OFF
  /* Temporary mark the partitioning fields as readable. */
  old_map= dbug_tmp_use_all_columns(m_table, m_table->read_set);
#endif /* DBUG_OFF */

  /* Find out which partition the row belongs to. */
  error= m_part_info->get_partition_id(m_part_info, &part_id, &func_value);

#ifndef DBUG_OFF
  dbug_tmp_restore_column_map(m_table->read_set, old_map);
#endif /* DBUG_OFF */

  if (unlikely(error))
  {
    /* Keep the offending function value available for error reporting. */
    m_part_info->err_value= func_value;
    goto exit;
  }
  if (!m_part_info->is_partition_locked(part_id))
  {
    DBUG_PRINT("info", ("Write to non-locked partition %u (func_value: %ld)",
                        part_id, (long) func_value));
    error= HA_ERR_NOT_IN_LOCK_PARTITIONS;
    goto exit;
  }
  m_last_part= part_id;
  DBUG_PRINT("info", ("Insert in partition %d", part_id));

  error= write_row_in_part(part_id, buf);

  /*
    Done regardless of the write result, but not when the auto increment
    field is a secondary key part (next_number_keypart != 0).
  */
  if (have_auto_increment && !m_table->s->next_number_keypart)
  {
    set_auto_increment_if_higher();
  }
exit:
  /* Undo the temporary changes made for the auto increment handling. */
  thd->variables.sql_mode= saved_sql_mode;
  m_table->auto_increment_field_not_null= saved_auto_inc_field_not_null;
  DBUG_RETURN(error);
}


/**
  Update an existing row in the partitioned table.

  old_data holds the previous version of the row and new_data the new
  version.
  Keep in mind that the server can do updates based on ordering if an
  ORDER BY clause was used. Consecutive ordering is not guaranteed.

  If the new record belongs to a different partition than the old record
  then it will be inserted into the new partition and deleted from the old.

  new_data is always record[0]
  old_data is always record[1]

  On HA_ERR_ROW_IN_WRONG_PARTITION, m_err_rec is set to old_data; it is
  reset to NULL at the start of every call.

  @param old_data  The old record in MySQL Row Format.
  @param new_data  The new record in MySQL Row Format.

  @return Operation status.
    @retval    0 Success
    @retval != 0 Error code
*/

int Partition_helper::ph_update_row(const uchar *old_data, uchar *new_data)
{
  uint32 new_part_id, old_part_id;
  int error= 0;
  longlong func_value;
  DBUG_ENTER("Partition_helper::ph_update_row");
  m_err_rec= NULL;

  // Need to read partition-related columns, to locate the row's partition:
  DBUG_ASSERT(bitmap_is_subset(&m_part_info->full_part_field_set,
                               m_table->read_set));
  /* Calculate the partition of both the old and the new row version. */
  if ((error= get_parts_for_update(old_data, new_data, m_table->record[0],
                                   m_part_info, &old_part_id, &new_part_id,
                                   &func_value)))
  {
    DBUG_RETURN(error);
  }
  /* The partition receiving the new version must be locked. */
  if (!bitmap_is_set(&(m_part_info->lock_partitions), new_part_id))
  {
    error= HA_ERR_NOT_IN_LOCK_PARTITIONS;
    DBUG_RETURN(error);
  }

  /*
    The protocol for updating a row is:
    1) position the handler (cursor) on the row to be updated,
       either through the last read row (rnd or index) or by rnd_pos.
    2) call update_row with both old and new full records as arguments.

    This means that m_last_part should already be set to actual partition
    where the row was read from. And if that is not the same as the
    calculated part_id we found a misplaced row, we return an error to
    notify the user that something is broken in the row distribution
    between partitions! Since we don't check all rows on read, we return an
    error instead of correcting m_last_part, to make the user aware of the
    problem!

    Notice that HA_READ_BEFORE_WRITE_REMOVAL does not require this protocol,
    so this is not supported for this engine.
  */
  if (old_part_id != m_last_part)
  {
    m_err_rec= old_data;
    DBUG_RETURN(HA_ERR_ROW_IN_WRONG_PARTITION);
  }

  m_last_part= new_part_id;
  if (new_part_id == old_part_id)
  {
    /* The row stays in its partition: plain update. */
    DBUG_PRINT("info", ("Update in partition %d", new_part_id));
    error= update_row_in_part(new_part_id, old_data, new_data);
  }
  else
  {
    /* The row moves: write it to the new partition, delete it from the old. */
    Field *saved_next_number_field= m_table->next_number_field;
    /*
      Don't allow generation of auto_increment value for update.
      table->next_number_field is never set on UPDATE.
      But is set for INSERT ... ON DUPLICATE KEY UPDATE,
      and since update_row() does not generate or update an auto_inc value,
      we cannot have next_number_field set when moving a row
      to another partition with write_row(), since that could
      generate/update the auto_inc value.
      This gives the same behavior for partitioned vs non partitioned tables.
    */
    m_table->next_number_field= NULL;
    DBUG_PRINT("info", ("Update from partition %d to partition %d",
                        old_part_id, new_part_id));
    error= write_row_in_part(new_part_id, new_data);
    m_table->next_number_field= saved_next_number_field;
    /* Only remove the old version once the new one has been written. */
    if (!error)
    {
      error= delete_row_in_part(old_part_id, old_data);
    }
  }

  /*
    if updating an auto_increment column, update
    m_part_share->next_auto_inc_val if needed.
    (not to be used if auto_increment on secondary field in a multi-column
    index)
    mysql_update does not set table->next_number_field, so we use
    table->found_next_number_field instead.
    Also checking that the field is marked in the write set.
    Note that this is done regardless of the value of error.
  */
  if (m_table->found_next_number_field &&
      new_data == m_table->record[0] &&
      !m_table->s->next_number_keypart &&
      bitmap_is_set(m_table->write_set,
                    m_table->found_next_number_field->field_index))
  {
    set_auto_increment_if_higher();
  }
  DBUG_RETURN(error);
}


/**
  Delete an existing row in the partitioned table.

  This will delete a row. buf will contain a copy of the row to be deleted.
  The server will call this right after the current row has been read
  (from either a previous rnd_xxx() or index_xxx() call).
  If you keep a pointer to the last row or can access a primary key it will
  make doing the deletion quite a bit easier.
  Keep in mind that the server does no guarentee consecutive deletions.
  ORDER BY clauses can be used.

  buf is either record[0] or record[1]

  @param buf  The record in MySQL Row Format.

  @return Operation status.
    @retval    0 Success
    @retval != 0 Error code
*/

int Partition_helper::ph_delete_row(const uchar *buf)
{
  int error;
  uint part_id;
  DBUG_ENTER("Partition_helper::ph_delete_row");
  m_err_rec= NULL;

  DBUG_ASSERT(bitmap_is_subset(&m_part_info->full_part_field_set,
                               m_table->read_set));
  if ((error= get_part_for_delete(buf,
                                  m_table->record[0],
                                  m_part_info,
                                  &part_id)))
  {
    DBUG_RETURN(error);
  }
  if (!m_part_info->is_partition_locked(part_id))
  {
    DBUG_RETURN(HA_ERR_NOT_IN_LOCK_PARTITIONS);
  }

  /*
    The protocol for deleting a row is:
    1) position the handler (cursor) on the row to be deleted,
       either through the last read row (rnd or index) or by rnd_pos.
    2) call delete_row with the full record as argument.

    This means that m_last_part should already be set to actual partition
    where the row was read from. And if that is not the same as the
    calculated part_id we found a misplaced row, we return an error to
    notify the user that something is broken in the row distribution
    between partitions! Since we don't check all rows on read, we return an
    error instead of forwarding the delete to the correct (m_last_part)
    partition!

    Notice that HA_READ_BEFORE_WRITE_REMOVAL does not require this protocol,
    so this is not supported for this engine.

    TODO: change the assert in InnoDB into an error instead and make this one
    an assert instead and remove the get_part_for_delete()!
  */
  if (part_id != m_last_part)
  {
    m_err_rec= buf;
    DBUG_RETURN(HA_ERR_ROW_IN_WRONG_PARTITION);
  }
  /* Should never call delete_row on a partition which is not read */
  DBUG_ASSERT(m_part_info->is_partition_used(part_id));

  m_last_part= part_id;
  error= delete_row_in_part(part_id, buf);
  DBUG_RETURN(error);
}


/**
  Reserve a range of auto increment values from the partition share.

  Can only be used if the auto increment field is the first field in an
  index (asserted in debug builds).

  The range starts at part_share->next_auto_inc_val, which is initialized
  first if that has not been done yet. The shared value is then advanced
  by nb_desired_values * increment, saturating at ULLONG_MAX on overflow.
  All of this happens while holding the auto increment lock.

  @param[in]   increment           Increment value.
  @param[in]   nb_desired_values   Number of desired values.
  @param[out]  first_value         First auto inc value reserved.
  @param[out]  nb_reserved_values  Number of values reserved.
*/

void Partition_helper
::get_auto_increment_first_field(ulonglong increment,
                                 ulonglong nb_desired_values,
                                 ulonglong *first_value,
                                 ulonglong *nb_reserved_values)
{
  THD *thd= get_thd();
  DBUG_ENTER("Partition_helper::get_auto_increment_first_field");
  DBUG_PRINT("info", ("inc: %lu desired_values: %lu first_value: %lu",
                      (ulong) increment,
                      (ulong) nb_desired_values,
                      (ulong) *first_value));
  DBUG_ASSERT(increment && nb_desired_values);
  /*
    next_number_keypart is != 0 if the auto_increment column is a secondary
    column in the index (it is allowed in MyISAM)
  */
  DBUG_ASSERT(m_table->s->next_number_keypart == 0);
  *first_value= 0;

  /*
    Take the auto_increment lock of the part_share, so that two concurrent
    statements cannot get the same number.
  */
  lock_auto_increment();

  /* Initialize if not already done. */
  if (!m_part_share->auto_inc_initialized)
  {
    initialize_auto_increment(false);
  }

  /*
    In a multi-row insert statement like INSERT SELECT and LOAD DATA,
    where the number of candidate rows is not known in advance, the
    generator must be held for the whole statement under statement based
    replication: the statement based binary log only contains the first
    generated value, and slaves assume all other values generated by the
    statement are consecutive to it.
  */
  const int binlog_format= thd_binlog_format(thd);
  const bool stmt_based_logging= !(binlog_format == BINLOG_FORMAT_UNSPEC ||
                                   binlog_format == BINLOG_FORMAT_ROW);
  if (!m_auto_increment_safe_stmt_log_lock &&
      thd->lex->sql_command != SQLCOM_INSERT &&
      stmt_based_logging)
  {
    DBUG_PRINT("info", ("locking auto_increment_safe_stmt_log_lock"));
    m_auto_increment_safe_stmt_log_lock= true;
  }

  /* this gets corrected (for offset/increment) in update_auto_increment */
  const ulonglong range_start= m_part_share->next_auto_inc_val;
  *first_value= range_start;
  m_part_share->next_auto_inc_val= range_start +
                                   nb_desired_values * increment;
  if (m_part_share->next_auto_inc_val < range_start)
  {
    /* Overflow, set to max. */
    m_part_share->next_auto_inc_val= ULLONG_MAX;
  }

  unlock_auto_increment();
  DBUG_PRINT("info", ("*first_value: %lu", (ulong) *first_value));
  *nb_reserved_values= nb_desired_values;
  DBUG_VOID_RETURN;
}


inline void Partition_helper::set_auto_increment_if_higher()
{
  Field_num *field= static_cast<Field_num*>(m_table->found_next_number_field);
  ulonglong nr= (field->unsigned_flag || field->val_int() > 0)
                 ? field->val_int() : 0;
  lock_auto_increment();
  if (!m_part_share->auto_inc_initialized)
  {
    initialize_auto_increment(false);
  }
  /* must hold the mutex when looking/changing m_part_share. */
  if (nr >= m_part_share->next_auto_inc_val)
  {
    m_part_share->next_auto_inc_val= nr + 1;
  }
  unlock_auto_increment();
  save_auto_increment(nr);
}


void Partition_helper::ph_release_auto_increment()
{
  DBUG_ENTER("Partition_helper::ph_release_auto_increment");

  if (m_table->s->next_number_keypart)
  {
    release_auto_increment_all_parts();
  }
  else if (m_handler->next_insert_id)
  {
    ulonglong max_reserved= m_handler->auto_inc_interval_for_cur_row.maximum();
    lock_auto_increment();
    m_part_share->release_auto_inc_if_possible(get_thd(), m_table->s,
                                               m_handler->next_insert_id,
                                               max_reserved);
    DBUG_PRINT("info", ("part_share->next_auto_inc_val: %lu",
                        (ulong) m_part_share->next_auto_inc_val));

    /* Unlock the multi row statement lock taken in get_auto_increment */
    if (m_auto_increment_safe_stmt_log_lock)
    {
      m_auto_increment_safe_stmt_log_lock= FALSE;
      DBUG_PRINT("info", ("unlocking auto_increment_safe_stmt_log_lock"));
    }

    unlock_auto_increment();
  }
  DBUG_VOID_RETURN;
}


/**
  Calculate key hash value from a null terminated array of fields.
  Support function for KEY partitioning.

  The fields are hashed one after the other into the same running hash
  state. When the table uses KEY_ALGORITHM_51, some field types are hashed
  with a fixed character set instead of the field's own hash function, to
  stay compatible with the hashing of 5.1.

  @param field_array   An array of the fields in KEY partitioning. The
                       first element is dereferenced unconditionally, so
                       the array must contain at least one field.

  @return hash_value calculated

  @note Uses the hash function on the character set of the field.
  Integer and floating point fields use the binary character set by default.
*/

uint32 Partition_helper::ph_calculate_key_hash_value(Field **field_array)
{
  /* Running hash state, shared by all fields. */
  ulong nr1= 1;
  ulong nr2= 4;
  bool use_51_hash;
  use_51_hash= MY_TEST((*field_array)->table->part_info->key_algorithm ==
                       partition_info::KEY_ALGORITHM_51);

  do
  {
    Field *field= *field_array;
    if (use_51_hash)
    {
      /*
        Note: 'continue' inside this switch jumps to the loop condition,
        i.e. it moves on to the next field; 'break' leaves the switch and
        falls through to field->hash() below.
      */
      switch (field->real_type()) {
      case MYSQL_TYPE_TINY:
      case MYSQL_TYPE_SHORT:
      case MYSQL_TYPE_LONG:
      case MYSQL_TYPE_FLOAT:
      case MYSQL_TYPE_DOUBLE:
      case MYSQL_TYPE_NEWDECIMAL:
      case MYSQL_TYPE_TIMESTAMP:
      case MYSQL_TYPE_LONGLONG:
      case MYSQL_TYPE_INT24:
      case MYSQL_TYPE_TIME:
      case MYSQL_TYPE_DATETIME:
      case MYSQL_TYPE_YEAR:
      case MYSQL_TYPE_NEWDATE:
        {
          if (field->is_null())
          {
            /* NULL only mixes the first hash value, no data is read. */
            nr1^= (nr1 << 1) | 1;
            continue;
          }
          /* Force this to my_hash_sort_bin, which was used in 5.1! */
          uint len= field->pack_length();
          my_charset_bin.coll->hash_sort(&my_charset_bin, field->ptr, len,
                                         &nr1, &nr2);
          /* Done with this field, continue with next one. */
          continue;
        }
      case MYSQL_TYPE_STRING:
      case MYSQL_TYPE_VARCHAR:
      case MYSQL_TYPE_BIT:
        /* Not affected, same in 5.1 and 5.5 */
        break;
      /*
        ENUM/SET uses my_hash_sort_simple in 5.1 (i.e. my_charset_latin1)
        and my_hash_sort_bin in 5.5!
      */
      case MYSQL_TYPE_ENUM:
      case MYSQL_TYPE_SET:
        {
          if (field->is_null())
          {
            /* Same NULL handling as for the numeric types above. */
            nr1^= (nr1 << 1) | 1;
            continue;
          }
          /* Force this to the my_charset_latin1 hashing used in 5.1! */
          uint len= field->pack_length();
          my_charset_latin1.coll->hash_sort(&my_charset_latin1, field->ptr,
                                            len, &nr1, &nr2);
          /* Done with this field, continue with next one. */
          continue;
        }
      /* New types in mysql-5.6. */
      case MYSQL_TYPE_DATETIME2:
      case MYSQL_TYPE_TIME2:
      case MYSQL_TYPE_TIMESTAMP2:
        /* Not affected, 5.6+ only! */
        break;

      /* These types should not be allowed for partitioning! */
      case MYSQL_TYPE_NULL:
      case MYSQL_TYPE_DECIMAL:
      case MYSQL_TYPE_DATE:
      case MYSQL_TYPE_TINY_BLOB:
      case MYSQL_TYPE_MEDIUM_BLOB:
      case MYSQL_TYPE_LONG_BLOB:
      case MYSQL_TYPE_BLOB:
      case MYSQL_TYPE_VAR_STRING:
      case MYSQL_TYPE_GEOMETRY:
        /* fall through. */
      default:
        DBUG_ASSERT(0);                    // New type?
        /* Fall through for default hashing (5.5). */
      }
      /* fall through, use collation based hashing. */
    }
    /* Default: let the field hash itself into the running state. */
    field->hash(&nr1, &nr2);
  } while (*(++field_array));
  /* Only the first hash value is used; it is truncated to 32 bits. */
  return (uint32) nr1;
}


bool Partition_helper::print_partition_error(int error, myf errflag)
{
  THD *thd= get_thd();
  DBUG_ENTER("Partition_helper::print_partition_error");

  /* Should probably look for my own errors first */
  DBUG_PRINT("enter", ("error: %d", error));

  if ((error == HA_ERR_NO_PARTITION_FOUND) &&
      ! (thd->lex->alter_info.flags & Alter_info::ALTER_TRUNCATE_PARTITION))
  {
    m_part_info->print_no_partition_found(m_table);
    // print_no_partition_found() reports an error, so we can just return here.
    DBUG_RETURN(false);
  }
  else if (error == HA_ERR_ROW_IN_WRONG_PARTITION)
  {
    /*
      Should only happen on DELETE or UPDATE!
      Or in ALTER TABLE REBUILD/REORGANIZE where there are a misplaced
      row that needed to move to an old partition (not in the given set).
    */
    DBUG_ASSERT(thd_sql_command(thd) == SQLCOM_DELETE ||
                thd_sql_command(thd) == SQLCOM_DELETE_MULTI ||
                thd_sql_command(thd) == SQLCOM_UPDATE ||
                thd_sql_command(thd) == SQLCOM_UPDATE_MULTI ||
                thd_sql_command(thd) == SQLCOM_ALTER_TABLE);
    DBUG_ASSERT(m_err_rec);
    if (m_err_rec)
    {
      size_t max_length;
      char buf[MAX_KEY_LENGTH];
      String str(buf,sizeof(buf),system_charset_info);
      uint32 part_id;
      DBUG_ASSERT(m_last_part < m_tot_parts);
      str.length(0);
      if (thd_sql_command(thd) == SQLCOM_ALTER_TABLE)
      {
        str.append("from REBUILD/REORGANIZED partition: ");
        str.append_ulonglong(m_last_part);
        str.append(" to non included partition (new definition): ");
      }
      else
      {
        str.append_ulonglong(m_last_part);
        str.append(". Correct is ");
      }
      if (get_part_for_delete(m_err_rec,
                              m_table->record[0],
                              m_part_info,
                              &part_id))
      {
        str.append("?");
      }
      else
      {
        str.append_ulonglong(part_id);
      }
      append_row_to_str(str, m_err_rec, m_table);

      /* Log this error, so the DBA can notice it and fix it! */
      sql_print_error("Table '%-192s' corrupted: row in wrong partition: %s\n"
                      "Please REPAIR the table!",
                      m_table->s->table_name.str,
                      str.c_ptr_safe());

      max_length= (MYSQL_ERRMSG_SIZE - strlen(ER(ER_ROW_IN_WRONG_PARTITION)));
      if (str.length() >= max_length)
      {
        str.length(max_length-4);
        str.append(STRING_WITH_LEN("..."));
      }
      my_error(ER_ROW_IN_WRONG_PARTITION, MYF(0), str.c_ptr_safe());
      m_err_rec= NULL;
      DBUG_RETURN(false);
    }
  }

  DBUG_RETURN(true);
}

/**
  Implement the partition changes defined by ALTER TABLE of partitions.

  Add and copy if needed a number of partitions, during this operation
  only read operation is ongoing in the server. This is used by
  ADD PARTITION all types as well as by REORGANIZE PARTITION. For
  one-phased implementations it is used also by DROP and COALESCE
  PARTITIONs.
  One-phased implementation needs the new frm file, other handlers will
  get zero length and a NULL reference here.

  Any failure, including a failure to prepare for the new partitions,
  is reported through handler::print_error() and the new partitions are
  closed before returning.

  @param[in]  create_info       HA_CREATE_INFO object describing all
                                fields and indexes in table
  @param[in]  path              Complete path of db and table name
  @param[out] copied            Output parameter where number of copied
                                records are added
  @param[out] deleted           Output parameter where number of deleted
                                records are added

  @return Operation status
    @retval    0 Success
    @retval != 0 Failure
*/

int Partition_helper::change_partitions(HA_CREATE_INFO *create_info,
                                        const char *path,
                                        ulonglong * const copied,
                                        ulonglong * const deleted)
{
  List_iterator<partition_element> part_it(m_part_info->partitions);
  List_iterator <partition_element> t_it(m_part_info->temp_partitions);
  char part_name_buff[FN_REFLEN];
  /*
    Table level values, restored into create_info after every created
    partition (see "Reset create_info to table level values" below).
  */
  const char *table_level_data_file_name= create_info->data_file_name;
  const char *table_level_index_file_name= create_info->index_file_name;
  const char *table_level_tablespace_name= create_info->tablespace;
  uint num_parts= m_part_info->partitions.elements;
  uint num_subparts= m_part_info->num_subparts;
  uint i= 0;
  uint num_remain_partitions;
  uint num_reorged_parts;
  int error= 1;
  bool first;
  uint temp_partitions= m_part_info->temp_partitions.elements;
  THD *thd= get_thd();
  DBUG_ENTER("Partition_helper::change_partitions");

  /*
    Use the read_partitions bitmap for reorganized partitions,
    i.e. what to copy.
  */
  bitmap_clear_all(&m_part_info->read_partitions);

  /*
    Assert that it works without HA_FILE_BASED and lower_case_table_name = 2.
  */
  DBUG_ASSERT(!strcmp(path, get_canonical_filename(m_handler, path,
                                                   part_name_buff)));
  num_reorged_parts= 0;
  /* Without subpartitioning every partition counts as one "subpartition". */
  if (!m_part_info->is_sub_partitioned())
    num_subparts= 1;

  /*
    Step 1:
      Calculate number of reorganized partitions.
  */
  if (temp_partitions)
  {
    num_reorged_parts= temp_partitions * num_subparts;
  }
  else
  {
    do
    {
      partition_element *part_elem= part_it++;
      if (part_elem->part_state == PART_CHANGED ||
          part_elem->part_state == PART_REORGED_DROPPED)
      {
        num_reorged_parts+= num_subparts;
      }
    } while (++i < num_parts);
  }

  /*
    Step 2:
      Calculate number of partitions after change.
  */
  num_remain_partitions= 0;
  if (temp_partitions)
  {
    num_remain_partitions= num_parts * num_subparts;
  }
  else
  {
    part_it.rewind();
    i= 0;
    do
    {
      partition_element *part_elem= part_it++;
      if (part_elem->part_state == PART_NORMAL ||
          part_elem->part_state == PART_TO_BE_ADDED ||
          part_elem->part_state == PART_CHANGED)
      {
        num_remain_partitions+= num_subparts;
      }
    } while (++i < num_parts);
  }

  /*
    Step 3:
      Set the read_partition bit for all partitions to be copied.
  */
  if (num_reorged_parts)
  {
    i= 0;
    first= true;
    part_it.rewind();
    do
    {
      partition_element *part_elem= part_it++;
      if (part_elem->part_state == PART_CHANGED ||
          part_elem->part_state == PART_REORGED_DROPPED)
      {
        for (uint sp = 0; sp < num_subparts; sp++)
        {
          bitmap_set_bit(&m_part_info->read_partitions, i * num_subparts + sp);
        }
        DBUG_ASSERT(first);
      }
      else if (first && temp_partitions &&
               part_elem->part_state == PART_TO_BE_ADDED)
      {
        /*
          When doing an ALTER TABLE REORGANIZE PARTITION a number of
          partitions is to be reorganized into a set of new partitions.
          The reorganized partitions are in this case in the temp_partitions
          list. We mark all of them in one batch and thus we only do this
          until we find the first partition with state PART_TO_BE_ADDED
          since this is where the new partitions go in and where the old
          ones used to be.
        */
        first= false;
        DBUG_ASSERT(((i*num_subparts) + num_reorged_parts) <= m_tot_parts);
        for (uint sp = 0; sp < num_reorged_parts; sp++)
        {
          bitmap_set_bit(&m_part_info->read_partitions, i * num_subparts + sp);
        }
      }
    } while (++i < num_parts);
  }

  /*
    Step 4:
      Create the new partitions and also open, lock and call
      external_lock on them (if needed) to prepare them for copy phase
      and also for later close calls.
      No need to create PART_NORMAL partitions since they must not
      be written to!
      Only PART_CHANGED and PART_TO_BE_ADDED should be written to!
  */

  error= prepare_for_new_partitions(num_remain_partitions,
                                    num_reorged_parts == 0);
  if (error)
  {
    /*
      Do not try to create partitions when the preparation failed; the
      error would otherwise be silently overwritten below. The common
      exit path reports the error and closes whatever was set up.
    */
    goto err;
  }

  i= 0;
  part_it.rewind();
  do
  {
    partition_element *part_elem= part_it++;
    DBUG_ASSERT(part_elem->part_state >= PART_NORMAL &&
                part_elem->part_state <= PART_CHANGED);
    if (part_elem->part_state == PART_TO_BE_ADDED ||
        part_elem->part_state == PART_CHANGED)
    {
      /*
        A new partition needs to be created PART_TO_BE_ADDED means an
        entirely new partition and PART_CHANGED means a changed partition
        that will still exist with either more or less data in it.
      */
      uint name_variant= NORMAL_PART_NAME;
      if (part_elem->part_state == PART_CHANGED ||
          (part_elem->part_state == PART_TO_BE_ADDED && temp_partitions))
        name_variant= TEMP_PART_NAME;
      if (m_part_info->is_sub_partitioned())
      {
        List_iterator<partition_element> sub_it(part_elem->subpartitions);
        uint j= 0, part;
        do
        {
          partition_element *sub_elem= sub_it++;
          create_subpartition_name(part_name_buff, path,
                                   part_elem->partition_name,
                                   sub_elem->partition_name,
                                   name_variant);
          /* Position of this subpartition among all [sub]partitions. */
          part= i * num_subparts + j;
          DBUG_PRINT("info", ("Add subpartition %s", part_name_buff));
          /*
            update_create_info was called previously in
            mysql_prepare_alter_table. Which may have set data/index_file_name
            for the partitions to the full partition name, including
            '#P#<part_name>[#SP#<subpart_name>] suffix. Remove that suffix
            if it exists.
          */
          truncate_partition_filename(&m_table->mem_root,
                                      &sub_elem->data_file_name);
          truncate_partition_filename(&m_table->mem_root,
                                      &sub_elem->index_file_name);
          /* Notice that sub_elem is already based on part_elem's defaults. */
          error= set_up_table_before_create(thd,
                                            m_table->s,
                                            part_name_buff,
                                            create_info,
                                            sub_elem);
          if (error)
          {
            goto err;
          }
          if ((error= create_new_partition(m_table,
                                           create_info,
                                           part_name_buff,
                                           part,
                                           sub_elem)))
          {
            goto err;
          }
          /* Reset create_info to table level values. */
          create_info->data_file_name= table_level_data_file_name;
          create_info->index_file_name= table_level_index_file_name;
          create_info->tablespace= table_level_tablespace_name;
        } while (++j < num_subparts);
      }
      else
      {
        create_partition_name(part_name_buff, path,
                              part_elem->partition_name, name_variant,
                              true);
        DBUG_PRINT("info", ("Add partition %s", part_name_buff));
        /* See comment in subpartition branch above! */
        truncate_partition_filename(&m_table->mem_root,
                                    &part_elem->data_file_name);
        truncate_partition_filename(&m_table->mem_root,
                                    &part_elem->index_file_name);
        error= set_up_table_before_create(thd,
                                          m_table->s,
                                          part_name_buff,
                                          create_info,
                                          part_elem);
        if (error)
        {
          goto err;
        }
        if ((error= create_new_partition(m_table,
                                         create_info,
                                         (const char *)part_name_buff,
                                         i,
                                         part_elem)))
        {
          goto err;
        }
        /* Reset create_info to table level values. */
        create_info->data_file_name= table_level_data_file_name;
        create_info->index_file_name= table_level_index_file_name;
        create_info->tablespace= table_level_tablespace_name;
      }
    }
  } while (++i < num_parts);

  /*
    Step 5:
      State update to prepare for next write of the frm file.
  */
  i= 0;
  part_it.rewind();
  do
  {
    partition_element *part_elem= part_it++;
    if (part_elem->part_state == PART_TO_BE_ADDED)
      part_elem->part_state= PART_IS_ADDED;
    else if (part_elem->part_state == PART_CHANGED)
      part_elem->part_state= PART_IS_CHANGED;
    else if (part_elem->part_state == PART_REORGED_DROPPED)
      part_elem->part_state= PART_TO_BE_DROPPED;
  } while (++i < num_parts);
  for (i= 0; i < temp_partitions; i++)
  {
    partition_element *part_elem= t_it++;
    DBUG_ASSERT(part_elem->part_state == PART_TO_BE_REORGED);
    part_elem->part_state= PART_TO_BE_DROPPED;
  }
  /* Everything is set up; move the rows into the new partitions. */
  error= copy_partitions(copied, deleted);
err:
  if (error)
  {
    m_handler->print_error(error,
                           MYF(error != ER_OUTOFMEMORY ? 0 : ME_FATALERROR));
  }
  /*
    Close and unlock the new temporary partitions.
    They will later be deleted or renamed through the ddl-log.
  */
  close_new_partitions();
  DBUG_RETURN(error);
}

/**
  Copy partitions as part of ALTER TABLE of partitions.

  change_partitions has done all the preparations, now it is time to
  actually copy the data from the reorganized partitions to the new
  partitions.

  Both counters are incremented, not reset, so the caller decides their
  starting values.

  @param[out] copied   Incremented for every record written to a new
                       partition.
  @param[out] deleted  Incremented for every record that no longer fits
                       into any partition and therefore is dropped.

  @return Operation status
    @retval  0  Success
    @retval >0  Error code
*/

int Partition_helper::copy_partitions(ulonglong * const copied,
                                      ulonglong * const deleted)
{
  uint new_part= 0;
  int result= 0;
  longlong func_value;
  DBUG_ENTER("Partition_helper::copy_partitions");

  if (m_part_info->linear_hash_ind)
  {
    if (m_part_info->part_type == HASH_PARTITION)
      set_linear_hash_mask(m_part_info, m_part_info->num_parts);
    else
      set_linear_hash_mask(m_part_info, m_part_info->num_subparts);
  }

  /*
    m_part_info->read_partitions bitmap is setup for all the reorganized
    partitions to be copied. So we can use the normal handler rnd interface
    for reading.
  */
  if ((result= m_handler->ha_rnd_init(1)))
  {
    DBUG_RETURN(result);
  }
  while (true)
  {
    if ((result= m_handler->ha_rnd_next(m_table->record[0])))
    {
      if (result == HA_ERR_RECORD_DELETED)
        continue;                              //Probably MyISAM
      if (result != HA_ERR_END_OF_FILE)
        goto error;
      /*
        End-of-file reached, break out to end the copy process.
      */
      break;
    }
    /* Found record to insert into new handler */
    if (m_part_info->get_partition_id(m_part_info, &new_part,
                                      &func_value))
    {
      /*
        This record is in the original table but will not be in the new
        table since it doesn't fit into any partition any longer due to
        changed partitioning ranges or list values.
      */
      (*deleted)++;
    }
    else
    {
      if ((result= write_row_in_new_part(new_part)))
      {
        goto error;
      }
      /* Only rows that really reached a new partition are counted. */
      (*copied)++;
    }
  }
  m_handler->ha_rnd_end();
  DBUG_RETURN(0);
error:
  m_handler->ha_rnd_end();
  DBUG_RETURN(result);
}


/**
  Check/fix misplaced rows.

  Scans one partition and computes, for every row, the partition it
  belongs to. Without repair the misplaced rows are only reported. With
  repair every misplaced row is inserted into its correct partition and
  then deleted from the scanned one.

  @param read_part_id  Partition to check/fix.
  @param repair        If true, move misplaced rows to correct partition.

  @return Operation status.
    @retval    0  Success
    @retval != 0  Error
*/

int Partition_helper::check_misplaced_rows(uint read_part_id, bool repair)
{
  int result= 0;
  THD *thd= get_thd();
  bool ignore= thd->lex->is_ignore();
  uint32 correct_part_id;
  longlong func_value;
  ha_rows num_misplaced_rows= 0;
  ha_rows num_deleted_rows= 0;

  DBUG_ENTER("Partition_helper::check_misplaced_rows");

  if (repair)
  {
    /* We must read the full row, if we need to move it! */
    bitmap_set_all(m_table->read_set);
    bitmap_set_all(m_table->write_set);
  }
  else
  {
    /* Only need to read the partitioning fields. */
    bitmap_union(m_table->read_set, &m_part_info->full_part_field_set);
    /* Fill the base columns of virtual generated columns if necessary */
    for (Field **ptr= m_part_info->full_part_field_array; *ptr; ptr++)
    {
      if ((*ptr)->is_virtual_gcol())
        m_table->mark_gcol_in_maps(*ptr);
    }
  }

  if ((result= rnd_init_in_part(read_part_id, true)))
    DBUG_RETURN(result);

  while (true)
  {
    if ((result= ph_rnd_next_in_part(read_part_id, m_table->record[0])))
    {
      if (result == HA_ERR_RECORD_DELETED)
        continue;
      if (result != HA_ERR_END_OF_FILE)
        break;

      /* End of partition: print the summary line, if there is one. */
      if (num_misplaced_rows > 0)
      {
        if (repair)
        {
          if (num_deleted_rows > 0)
          {
            print_admin_msg(thd, MI_MAX_MSG_BUF, "warning",
                            m_table->s->db.str, m_table->alias,
                            opt_op_name[REPAIR_PARTS],
                            "Moved %lld misplaced rows, deleted %lld rows",
                            num_misplaced_rows - num_deleted_rows,
                            num_deleted_rows);
          }
          else
          {
            print_admin_msg(thd, MI_MAX_MSG_BUF, "warning",
                            m_table->s->db.str, m_table->alias,
                            opt_op_name[REPAIR_PARTS],
                            "Moved %lld misplaced rows",
                            num_misplaced_rows);
          }
        }
        else
        {
          print_admin_msg(thd, MI_MAX_MSG_BUF, "error",
                          m_table->s->db.str, m_table->alias,
                          opt_op_name[CHECK_PARTS],
                          "Found %lld misplaced rows in partition %u",
                          num_misplaced_rows,
                          read_part_id);
        }
      }
      /* End-of-file reached, all rows are now OK, reset result and break. */
      result= 0;
      break;
    }

    result= m_part_info->get_partition_id(m_part_info, &correct_part_id,
                                          &func_value);
    // TODO: Add code to delete rows not matching any partition.
    if (result)
      break;

    if (correct_part_id != read_part_id)
    {
      num_misplaced_rows++;
      /* The row to print is passed as NULL to append_row_to_str() below. */
      m_err_rec= NULL;
      if (!repair)
      {
        /* Check. */
        result= HA_ADMIN_NEEDS_UPGRADE;
        char buf[MAX_KEY_LENGTH];
        String str(buf,sizeof(buf),system_charset_info);
        str.length(0);
        append_row_to_str(str, m_err_rec, m_table);
        print_admin_msg(thd, MI_MAX_MSG_BUF, "error",
                        m_table->s->db.str, m_table->alias,
                        opt_op_name[CHECK_PARTS],
                        "Found a misplaced row"
                        " in part %d should be in part %d:\n%s",
                        read_part_id,
                        correct_part_id,
                        str.c_ptr_safe());
        /* Break on first misplaced row, unless ignore is given! */
        if (!ignore)
          break;
      }
      else
      {
        DBUG_PRINT("info", ("Moving row from partition %d to %d",
                            read_part_id, correct_part_id));

        /*
          Insert row into correct partition. Notice that there are no commit
          for every N row, so the repair will be one large transaction!
        */
        if ((result= write_row_in_part(correct_part_id, m_table->record[0])))
        {
          /*
            We have failed to insert a row, it might have been a duplicate!
          */
          char buf[MAX_KEY_LENGTH];
          String str(buf,sizeof(buf),system_charset_info);
          str.length(0);
          if (result == HA_ERR_FOUND_DUPP_KEY)
          {
            if (ignore)
            {
              str.append("Duplicate key found, deleting the record:\n");
              num_deleted_rows++;
            }
            else
            {
              str.append("Duplicate key found, "
                         "please update or delete the record:\n");
              result= HA_ADMIN_CORRUPT;
            }
          }
          append_row_to_str(str, m_err_rec, m_table);

          /*
            If the engine supports transactions, the failure will be
            rollbacked.
          */
          if (!m_handler->has_transactions() ||
              ignore || result == HA_ADMIN_CORRUPT)
          {
            /* Log this error, so the DBA can notice it and fix it! */
            sql_print_error("Table '%-192s' failed to move/insert a row"
                            " from part %d into part %d:\n%s",
                            m_table->s->table_name.str,
                            read_part_id,
                            correct_part_id,
                            str.c_ptr_safe());
          }
          print_admin_msg(thd, MI_MAX_MSG_BUF, "error",
                          m_table->s->db.str, m_table->alias,
                          opt_op_name[REPAIR_PARTS],
                          "Failed to move/insert a row"
                          " from part %d into part %d:\n%s",
                          read_part_id,
                          correct_part_id,
                          str.c_ptr_safe());
          /*
            Only an ignored duplicate continues to the delete below, which
            removes the misplaced copy of the row.
          */
          if (!ignore || result != HA_ERR_FOUND_DUPP_KEY)
            break;
        }

        /* Delete row from wrong partition. */
        if ((result= delete_row_in_part(read_part_id, m_table->record[0])))
        {
          /*
            Keep the real error code for the log message; the caller gets
            HA_ADMIN_CORRUPT.
          */
          const int delete_error= result;
          result= HA_ADMIN_CORRUPT;
          if (m_handler->has_transactions())
            break;
          /*
            We have introduced a duplicate, since we failed to remove it
            from the wrong partition.
          */
          char buf[MAX_KEY_LENGTH];
          String str(buf,sizeof(buf),system_charset_info);
          str.length(0);
          append_row_to_str(str, m_err_rec, m_table);

          /* Log this error, so the DBA can notice it and fix it! */
          sql_print_error("Table '%-192s': Delete from part %d failed with"
                          " error %d. But it was already inserted into"
                          " part %d, when moving the misplaced row!"
                          "\nPlease manually fix the duplicate row:\n%s",
                          m_table->s->table_name.str,
                          read_part_id,
                          delete_error,
                          correct_part_id,
                          str.c_ptr_safe());
          break;
        }
      }
    }
  }

  /* Always end the scan; an earlier error takes precedence over its result. */
  int tmp_result= rnd_end_in_part(read_part_id, true);
  DBUG_RETURN(result ? result : tmp_result);
}

/**
  Read next row during full partition scan (scan in random row order).

  This function can evaluate the virtual generated columns. If virtual
  generated columns are involved, you should not call rnd_next_in_part
  directly but this one.

  @param         part_id  Partition to read from.
  @param[in,out] buf      buffer that should be filled with data.

  @return Operation status.
    @retval    0  Success
    @retval != 0  Error code
*/

int Partition_helper::ph_rnd_next_in_part(uint part_id, uchar *buf)
{
  /* Fetch the row from the partition; any failure is returned as is. */
  const int read_error= rnd_next_in_part(part_id, buf);
  if (read_error)
    return read_error;

  /* Nothing more to do for tables without generated columns. */
  if (!m_table->has_gcol())
    return 0;

  /* Compute the generated columns for the row that was just read. */
  return update_generated_read_fields(buf, m_table);
}


/** Set used partitions bitmap from Alter_info.

  @return false if success else true.
*/

bool Partition_helper::set_altered_partitions()
{
  Alter_info *alter_info= &get_thd()->lex->alter_info;
  const bool admin_on_partitions=
    (alter_info->flags & Alter_info::ALTER_ADMIN_PARTITION) != 0;
  const bool on_all_partitions=
    (alter_info->flags & Alter_info::ALTER_ALL_PARTITION) != 0;

  /* ALTER TABLE t <cmd> PARTITION <partition list>: use only the list. */
  if (admin_on_partitions && !on_all_partitions)
    return m_part_info->set_read_partitions(&alter_info->partition_names);

  /*
    Full table command. All partitions are already set, so do nothing.
  */
  return false;
}

/**
  Print a message row formatted for ANALYZE/CHECK/OPTIMIZE/REPAIR TABLE.

  Modeled after mi_check_print_msg.

  @param thd         Thread context.
  @param len         Needed length for message buffer.
  @param msg_type    Message type.
  @param db_name     Database name.
  @param table_name  Table name.
  @param op_name     Operation name.
  @param fmt         Message (in printf format with additional arguments).

  @return Operation status.
    @retval false for success else true.
*/

bool Partition_helper::print_admin_msg(THD* thd,
                                       uint len,
                                       const char *msg_type,
                                       const char *db_name,
                                       const char *table_name,
                                       const char *op_name,
                                       const char *fmt,
                                       ...)
{
  Protocol *protocol= thd->get_protocol();
  char full_name[NAME_LEN*2+2];
  bool failed= true;

  char *text= (char*) my_malloc(key_memory_Partition_admin, len, MYF(0));
  if (!text)
    return true;

  /* Format the message into the buffer of the requested size. */
  va_list args;
  va_start(args, fmt);
  const size_t text_length= my_vsnprintf(text, len, fmt, args);
  va_end(args);

  /* A message that fills the whole buffer is treated as a failure. */
  if (text_length < (len - 1))
  {
    text[len - 1] = 0; // healthy paranoia

    if (!thd->get_protocol()->connection_alive())
    {
      /* No client to send the row to; use the error log instead. */
      sql_print_error("%s", text);
    }
    else
    {
      /* The first column is "<db>.<table>". */
      const uint name_length=
        (uint) (strxmov(full_name, db_name, ".", table_name,NullS) -
                full_name);
      /*
         TODO: switch from protocol to push_warning here. The main reason we
         didn't it yet is parallel repair. Due to following trace:
         mi_check_print_msg/push_warning/sql_alloc/my_pthread_getspecific_ptr.

         Also we likely need to lock mutex here (in both cases with protocol
         and push_warning).
      */
      DBUG_PRINT("info",("print_admin_msg:  %s, %s, %s, %s", full_name,
                         op_name, msg_type, text));
      protocol->start_row();
      protocol->store(full_name, name_length, system_charset_info);
      protocol->store(op_name, system_charset_info);
      protocol->store(msg_type, system_charset_info);
      protocol->store(text, text_length, system_charset_info);
      if (protocol->end_row())
      {
        sql_print_error("Failed on my_net_write, writing to stderr instead: %s\n",
                        text);
      }
      else
      {
        failed= false;
      }
    }
  }

  my_free(text);
  return failed;
}


/**
  Set table->read_set taking partitioning expressions into account.

  @param[in]	rnd_init	True if called from rnd_init (else index_init).
*/

inline
void Partition_helper::set_partition_read_set()
{
  /*
    For operations that may need to change data, we may need to extend
    read_set.
  */
  if (m_handler->get_lock_type() == F_WRLCK)
  {
    /*
      If write_set contains any of the fields used in partition and
      subpartition expression, we need to set all bits in read_set because
      the row may need to be inserted in a different [sub]partition. In
      other words update_row() can be converted into write_row(), which
      requires a complete record.
    */
    if (bitmap_is_overlapping(&m_part_info->full_part_field_set,
                              m_table->write_set))
    {
      bitmap_set_all(m_table->read_set);
    }
    else
    {
      /*
        Some handlers only read fields as specified by the bitmap for the
        read set. For partitioned handlers we always require that the
        fields of the partition functions are read such that we can
        calculate the partition id to place updated and deleted records.
      */
      bitmap_union(m_table->read_set, &m_part_info->full_part_field_set);
      /* Fill the base columns of virtual generated columns if necessary */
      for (Field **ptr= m_part_info->full_part_field_array; *ptr; ptr++)
      {
        if ((*ptr)->is_virtual_gcol())
          m_table->mark_gcol_in_maps(*ptr);
      }
    }
    // Mark virtual generated columns writable
    for (Field **vf= m_table->vfield; vf && *vf; vf++)
    {
      if (bitmap_is_set(m_table->read_set, (*vf)->field_index))
        bitmap_set_bit(m_table->write_set, (*vf)->field_index);
    }
  }
}


/****************************************************************************
                MODULE full table scan
****************************************************************************/

/**
  Initialize engine for random reads.

  rnd_init() is called when the server wants the storage engine to do a
  table scan or when the server wants to access data through rnd_pos.

  When scan is used we will scan one handler partition at a time.
  When preparing for rnd_pos we will initialize all handler partitions.
  No extra cache handling is needed when scanning is not performed.

  Before initializing we will call rnd_end to ensure that we clean up from
  any previous incarnation of a table scan.

  @param scan  false for initialize for random reads through rnd_pos()
               true for initialize for random scan through rnd_next().

  @return Operation status.
    @retval    0  Success
    @retval != 0  Error code
*/

int Partition_helper::ph_rnd_init(bool scan)
{
  int error;
  /*
    Loop cursor for the rnd_pos (non-scan) initialization below. It stays 0
    on the scan path, so the cleanup loop at 'err' does nothing there.
  */
  uint i= 0;
  uint part_id;
  DBUG_ENTER("Partition_helper::ph_rnd_init");

  set_partition_read_set();

  /* Now we see what the index of our first important partition is */
  DBUG_PRINT("info", ("m_part_info->read_partitions: 0x%lx",
                      (long) m_part_info->read_partitions.bitmap));
  part_id= m_part_info->get_first_used_partition();
  DBUG_PRINT("info", ("m_part_spec.start_part %d", part_id));

  /* No partition is used at all: not an error, but there is nothing to scan. */
  if (MY_BIT_NONE == part_id)
  {
    error= 0;
    goto err1;
  }

  /*
    m_scan_value as set in this function and in ph_rnd_end():
    0 = initialized for rnd_pos, 1 = initialized for scan,
    2 = set on the err/err1 paths below, 3 = set by ph_rnd_end().
  */
  DBUG_PRINT("info", ("rnd_init on partition %d", part_id));
  if (scan)
  {
    /* A scan can be restarted without rnd_end() in between! */
    /*
      NOTE(review): this compares against NOT_A_PARTITION_ID while the rest
      of the scan code uses NO_CURRENT_PART_ID as the "no partition" marker;
      confirm both constants have the same value.
    */
    if (m_scan_value == 1 && m_part_spec.start_part != NOT_A_PARTITION_ID)
    {
      /* End previous scan on partition before restart. */
      if ((error= rnd_end_in_part(m_part_spec.start_part, scan)))
      {
        /* Returns directly: m_scan_value and start_part are left as is. */
        DBUG_RETURN(error);
      }
    }
    m_scan_value= 1;
    /* A scan only initializes the first used partition here. */
    if ((error= rnd_init_in_part(part_id, scan)))
      goto err;
  }
  else
  {
    m_scan_value= 0;
    /* For rnd_pos every used partition is initialized. */
    for (i= part_id;
         i < MY_BIT_NONE;
         i= m_part_info->get_next_used_partition(i))
    {
      if ((error= rnd_init_in_part(i, scan)))
        goto err;
    }
  }
  m_part_spec.start_part= part_id;
  m_part_spec.end_part= m_tot_parts - 1;
  DBUG_PRINT("info", ("m_scan_value=%d", m_scan_value));
  DBUG_RETURN(0);

err:
  /* Call rnd_end for all previously initialized partitions. */
  /* Walks the used partitions from the first one up to, not including, i. */
  for (;
       part_id < i;
       part_id= m_part_info->get_next_used_partition(part_id))
  {
    rnd_end_in_part(part_id, scan);
  }
err1:
  /* Shared exit: mark that there is no current partition to read from. */
  m_scan_value= 2;
  m_part_spec.start_part= NO_CURRENT_PART_ID;
  DBUG_RETURN(error);
}


/**
  End of a table scan.

  @return Operation status.
    @retval    0  Success
    @retval != 0  Error code
*/

int Partition_helper::ph_rnd_end()
{
  int error= 0;
  DBUG_ENTER("Partition_helper::ph_rnd_end");
  switch (m_scan_value) {
  case 3:                                       // Error
    DBUG_ASSERT(0);
    /* fall through. */
  case 2:                                       // Error
    break;
  case 1:
    if (NO_CURRENT_PART_ID != m_part_spec.start_part)         // Table scan
    {
      error= rnd_end_in_part(m_part_spec.start_part, true);
    }
    break;
  case 0:
    uint i;
    for (i= m_part_info->get_first_used_partition();
         i < MY_BIT_NONE;
         i= m_part_info->get_next_used_partition(i))
    {
      int part_error;
      part_error= rnd_end_in_part(i, false);
      if (part_error && !error) {
        error= part_error;
      }
    }
    break;
  }
  m_scan_value= 3;
  m_part_spec.start_part= NO_CURRENT_PART_ID;
  DBUG_RETURN(error);
}


/**
  Read next row during full table scan (scan in random row order).

  This is called for each row of the table scan. When you run out of records
  you should return HA_ERR_END_OF_FILE.
  The Field structure for the table is the key to getting data into buf
  in a manner that will allow the server to understand it.

  @param[out] buf  buffer that should be filled with data.

  @return Operation status.
    @retval    0  Success
    @retval != 0  Error code
*/

int Partition_helper::ph_rnd_next(uchar *buf)
{
  int error= HA_ERR_END_OF_FILE;
  uint cur_part= m_part_spec.start_part;
  /*
    Stays true unless rnd_next_in_part() fails with a real error; in that
    case m_part_spec.start_part must keep pointing at the failing partition.
  */
  bool forget_current_part= true;
  DBUG_ENTER("Partition_helper::ph_rnd_next");

  /*
    If there is no current partition, the set of partitions to scan was
    empty and HA_ERR_END_OF_FILE is reported right away.
  */
  if (cur_part != NO_CURRENT_PART_ID)
  {
    DBUG_ASSERT(m_scan_value == 1);

    for (;;)
    {
      error= rnd_next_in_part(cur_part, buf);
      if (error == 0)
      {
        /* Got a row; remember which partition delivered it. */
        m_last_part= cur_part;
        m_part_spec.start_part= cur_part;
        m_table->status= 0;
        DBUG_RETURN(0);
      }

      /* Deleted row: simply try the same partition again. */
      if (error == HA_ERR_RECORD_DELETED)
        continue;                             // Probably MyISAM

      if (error != HA_ERR_END_OF_FILE)
      {
        /* Real error: return it without resetting the current partition. */
        forget_current_part= false;
        break;
      }

      /* The current partition is exhausted, end its scan. */
      DBUG_PRINT("info", ("rnd_end on partition %d", cur_part));
      error= rnd_end_in_part(cur_part, true);
      if (error)
        break;

      /* Move on to the next used partition, if there is one. */
      cur_part= m_part_info->get_next_used_partition(cur_part);
      if (cur_part >= m_tot_parts)
      {
        error= HA_ERR_END_OF_FILE;
        break;
      }
      m_last_part= cur_part;
      m_part_spec.start_part= cur_part;
      DBUG_PRINT("info", ("rnd_init on partition %d", cur_part));
      error= rnd_init_in_part(cur_part, true);
      if (error)
        break;
    }
  }

  if (forget_current_part)
    m_part_spec.start_part= NO_CURRENT_PART_ID;
  m_table->status= STATUS_NOT_FOUND;
  DBUG_RETURN(error);
}


/**
  Save position of current row.

  position() is called after each call to rnd_next() if the data needs
  to be ordered or accessed later.

  The server uses ref to store data. ref_length in the above case is
  the size needed to store current_position. ref is just a byte array
  that the server will maintain. If you are using offsets to mark rows, then
  current_position should be the offset. If it is a primary key like in
  InnoDB, then it needs to be a primary key.

  @param record  Current record in MySQL Row Format.
*/

void Partition_helper::ph_position(const uchar *record)
{
  DBUG_ASSERT(m_part_info->is_partition_used(m_last_part));
  DBUG_ENTER("Partition_helper::ph_position");
  DBUG_PRINT("info", ("record: %p", record));
  DBUG_DUMP("record", record, m_rec_length);

  /*
    During an ordered scan with m_ref_usage set, the ref has already been
    stored in the priority queue (m_queue), in front of the record.
  */
  const bool ref_in_queue= (m_ref_usage != REF_NOT_USED &&
                            m_ordered_scan_ongoing);
  if (!ref_in_queue)
  {
    /*
      Build the ref here: the partition id first, followed by the position
      produced for the last used partition.
    */
    DBUG_PRINT("info", ("m_last_part: %u", m_last_part));
    int2store(m_handler->ref, m_last_part);
    position_in_last_part(m_handler->ref + PARTITION_BYTES_IN_POS, record);
  }
  else
  {
    DBUG_ASSERT(!m_queue->empty());
    DBUG_ASSERT(m_ordered_rec_buffer);
    DBUG_ASSERT(!m_curr_key_info[1]);
    DBUG_ASSERT(uint2korr(m_queue->top()) == m_last_part);
    /* The top of the queue already holds both the part id and the ref. */
    memcpy(m_handler->ref, m_queue->top(), m_handler->ref_length);
  }
  DBUG_DUMP("ref_out", m_handler->ref, m_handler->ref_length);

  DBUG_VOID_RETURN;
}


/**
  Read row using position.

  This is like rnd_next, but you are given a position to use to determine
  the row. The position will be pointing to data of length handler::ref_length
  that handler::ref was set by position(record). Tables clustered on primary
  key usually use the full primary key as reference (like InnoDB). Heap based
  tables usually returns offset in heap file (like MyISAM).

  @param[out] buf  buffer that should be filled with record in MySQL format.
  @param[in]  pos  position given as handler::ref when position() was called.

  @return Operation status.
    @retval    0  Success
    @retval != 0  Error code
*/

int Partition_helper::ph_rnd_pos(uchar *buf, uchar *pos)
{
  DBUG_ENTER("Partition_helper::ph_rnd_pos");

  /*
    The position starts with the partition id; the partition's own
    position follows after PARTITION_BYTES_IN_POS bytes.
  */
  const uint target_part= uint2korr(pos);
  uchar *pos_in_part= pos + PARTITION_BYTES_IN_POS;
  DBUG_ASSERT(target_part < m_tot_parts);
  DBUG_ASSERT(m_part_info->is_partition_used(target_part));

  m_last_part= target_part;
  DBUG_RETURN(rnd_pos_in_part(target_part, buf, pos_in_part));
}

/****************************************************************************
                MODULE index scan
****************************************************************************/
/*
  Positions an index cursor to the index specified in the handle. Fetches the
  row if available. If the key value is null, begin at the first key of the
  index.

  There are loads of optimizations possible here for the partition handler.
  The same optimizations can also be checked for full table scan although
  only through conditions and not from index ranges.
  Phase one optimizations:
    Check if the fields of the partition function are bound. If so only use
    the single partition it becomes bound to.
  Phase two optimizations:
    If it can be deduced through range or list partitioning that only a
    subset of the partitions are used, then only use those partitions.
*/

/**
  Setup the ordered record buffer and the priority queue.

  Call destroy_record_priority_queue() to deallocate or clean-up
  from failure.

  @return 0 on success, else a non-zero error code
          (HA_ERR_OUT_OF_MEM if an allocation fails).
*/

int Partition_helper::init_record_priority_queue()
{
  uint used_parts= m_part_info->num_partitions_used();
  DBUG_ENTER("Partition_helper::init_record_priority_queue");
  DBUG_ASSERT(!m_ordered_rec_buffer);
  DBUG_ASSERT(!m_queue);
  /* Initialize the priority queue. */
  // TODO: Create test to see the cost of allocating when needed vs
  // allocate once and keep between statements. Also test on NUMA
  // machines to see the difference (I guess that allocating when needed
  // will allocate on 'correct' NUMA node and be faster.)
  if (!m_queue)
  {
    m_queue= new (std::nothrow) Prio_queue(Key_rec_less(m_curr_key_info));
    if (!m_queue)
    {
      DBUG_RETURN(HA_ERR_OUT_OF_MEM);
    }
  }
  /* Initialize the ordered record buffer.  */
  if (!m_ordered_rec_buffer)
  {
    uint alloc_len;
    /*
      Allocate record buffer for each used partition.
      If PK is clustered index, it is either the primary sort key or is
      added as secondary sort. So we only need to allocate for part id
      and a full record per partition.
      Otherwise if the clustered index was generated, we might need to
      do a secondary sort by rowid (handler::ref) and must allocate for
      ref (includes part id) and full record per partition. We don't
      know yet if we need to do secondary sort by rowid, so we must
      allocate space for it.
      TODO: enhance ha_index_init() for HA_EXTRA_SECONDARY_SORT_ROWID to
      avoid allocating space for handler::ref when not needed.
      When enhancing ha_index_init() care must be taken on ph_position(),
      so InnoDB's row_id is correctly handled (taken from m_last_part).
    */
    if (m_pkey_is_clustered && m_table->s->primary_key != MAX_KEY)
    {
      m_rec_offset= PARTITION_BYTES_IN_POS;
      m_ref_usage= REF_NOT_USED;
    }
    else
    {
      m_rec_offset= m_handler->ref_length;
      m_ref_usage= REF_STORED_IN_PQ;
    }
    alloc_len= used_parts * (m_rec_offset + m_rec_length);
    /* Allocate a key for temporary use when setting up the scan. */
    alloc_len+= m_table->s->max_key_length;

    m_ordered_rec_buffer= static_cast<uchar*>(
                            my_malloc(key_memory_partition_sort_buffer,
                                      alloc_len,
                                      MYF(MY_WME)));
    if (!m_ordered_rec_buffer)
    {
      DBUG_RETURN(HA_ERR_OUT_OF_MEM);
    }

    /*
      We set-up one record per partition and each record has 2 bytes in
      front where the partition id is written. This is used by ordered
      index_read.
      If we need to also sort by rowid (handler::ref), then m_curr_key_info[1]
      is NULL and we add the rowid before the record.
      We also set-up a reference to the first record for temporary use in
      setting up the scan.
    */
    char *ptr= (char*) m_ordered_rec_buffer;
    uint i;
    for (i= m_part_info->get_first_used_partition();
         i < MY_BIT_NONE;
         i= m_part_info->get_next_used_partition(i))
    {
      DBUG_PRINT("info", ("init rec-buf for part %u", i));
      int2store(ptr, i);
      ptr+= m_rec_offset + m_rec_length;
    }
    m_start_key.key= (const uchar*)ptr;
    /*
      Initialize priority queue, initialized to reading forward.
      Start by only sort by KEY, HA_EXTRA_SECONDARY_SORT_ROWID
      will be given if we should sort by handler::ref too.
    */
    m_queue->m_rec_offset= m_rec_offset;
    if (m_queue->reserve(used_parts))
    {
      DBUG_RETURN(HA_ERR_OUT_OF_MEM);
    }
  }
  DBUG_RETURN(init_record_priority_queue_for_parts(used_parts));
}


/**
  Destroy the ordered record buffer and the priority queue.
*/

void Partition_helper::destroy_record_priority_queue()
{
  DBUG_ENTER("Partition_helper::destroy_record_priority_queue");

  /* The per-partition clean-up is done first. */
  destroy_record_priority_queue_for_parts();

  /* Free the ordered record buffer, if one was allocated. */
  if (m_ordered_rec_buffer != NULL)
  {
    my_free(m_ordered_rec_buffer);
    m_ordered_rec_buffer= NULL;
  }

  /* Empty and delete the priority queue, if one was created. */
  if (m_queue != NULL)
  {
    m_queue->clear();
    delete m_queue;
    m_queue= NULL;
  }

  /* Back to the "no ordered scan" state. */
  m_ordered_scan_ongoing= false;
  m_ref_usage= REF_NOT_USED;
  DBUG_VOID_RETURN;
}


/**
  Common setup for index_init.

  Set up variables and initialize the record priority queue.

  @param inx     Index to be used.
  @param sorted  True if the rows must be returned in index order.

  @return Operation status.
    @retval    0  Success
    @retval != 0  Error code
*/

int Partition_helper::ph_index_init_setup(uint inx, bool sorted)
{
  DBUG_ENTER("Partition_helper::ph_index_init_setup");

  DBUG_ASSERT(inx != MAX_KEY);
  DBUG_PRINT("info", ("inx %u sorted %u", inx, sorted));

  /* Reset the scan state: no current partition, no start key, no ref use. */
  m_part_spec.start_part= NO_CURRENT_PART_ID;
  m_start_key.length= 0;
  m_ordered= sorted;
  m_ref_usage= REF_NOT_USED;

  /* Primary sort key is the chosen index; no secondary sort key by default. */
  m_curr_key_info[0]= m_table->key_info+inx;
  m_curr_key_info[1]= NULL;
  /*
    There are two cases where it is not enough to only sort on the key:
    1) For clustered indexes, the optimizer assumes that all keys
       have the rest of the PK columns appended to the KEY, so it will
       sort by PK as secondary sort key.
    2) Rowid-Order-Retrieval access methods, like index_merge_intersect
       and index_merge_union. These methods require the index to be sorted
       on rowid (handler::ref) as secondary sort key.
    Only case 1) is set up here.
  */
  if (m_pkey_is_clustered && m_table->s->primary_key != MAX_KEY &&
      inx != m_table->s->primary_key)
  {
    /*
      If the PK is clustered, then the key comparison must use the PK to
      differentiate between equal keys in the given index.
    */
    DBUG_PRINT("info", ("Clustered pk, using pk as secondary cmp"));
    m_curr_key_info[1]= m_table->key_info+m_table->s->primary_key;
  }

  /*
    Some handlers only read fields as specified by the bitmap for the
    read set. For partitioned handlers we always require that the
    fields of the partition functions are read such that we can
    calculate the partition id to place updated and deleted records.
    This is only done when the handler holds a write lock.
  */
  if (m_handler->get_lock_type() == F_WRLCK)
    bitmap_union(m_table->read_set, &m_part_info->full_part_field_set);

  /* This function has no failure path; it always reports success. */
  DBUG_RETURN(0);
}


/**
  Initialize handler before start of index scan.

  index_init is always called before starting index scans (except when
  starting through index_read_idx and using read_range variants).

  @param inx     Index number.
  @param sorted  True if rows are to be returned in sorted order.

  @return Operation status
    @retval    0  Success
    @retval != 0  Error code
*/

int Partition_helper::ph_index_init(uint inx, bool sorted)
{
  int error;
  uint part_id= m_part_info->get_first_used_partition();
  DBUG_ENTER("Partition_helper::ph_index_init");
  /* The active index is recorded even when there is no partition to scan. */
  m_handler->active_index= inx;

  if (part_id == MY_BIT_NONE)
  {
    /* No partition is marked as used, so there is nothing to initialize. */
    DBUG_RETURN(0);
  }

  if ((error= ph_index_init_setup(inx, sorted)))
  {
    DBUG_RETURN(error);
  }
  if ((error= init_record_priority_queue()))
  {
    /* Release whatever init_record_priority_queue() managed to set up. */
    destroy_record_priority_queue();
    DBUG_RETURN(error);
  }

  /* Initialize the index in each used partition; stop at the first failure. */
  for (/* part_id already set. */;
       part_id < MY_BIT_NONE;
       part_id= m_part_info->get_next_used_partition(part_id))
  {
    if ((error= index_init_in_part(part_id, inx, sorted)))
      goto err;

    /*
      Error injection under the "partition_fail_index_init" DBUG keyword.
      part_id is incremented so that the clean-up loop below (j < part_id)
      also ends the partition that was just initialized successfully.
    */
    DBUG_EXECUTE_IF("partition_fail_index_init", {
      part_id++;
      error= HA_ERR_NO_PARTITION_FOUND;
      goto err;
    });
  }
err:
  /* Also reached on success; error is then 0 and nothing is undone. */
  if (error)
  {
    /*
      End the previously initialized indexes, i.e. the used partitions
      before part_id. Errors from index_end_in_part() are ignored; the
      original error is the one returned.
    */
    uint j;
    for (j= m_part_info->get_first_used_partition();
         j < part_id;
         j= m_part_info->get_next_used_partition(j))
    {
      (void) index_end_in_part(j);
    }
    destroy_record_priority_queue();
  }
  DBUG_RETURN(error);
}


/**
  End of index scan.

  index_end is called at the end of an index scan to clean up any
  things needed to clean up.

  @return Operation status.
    @retval    0  Success
    @retval != 0  Error code
*/

int Partition_helper::ph_index_end()
{
  int error= 0;
  uint i;
  DBUG_ENTER("Partition_helper::ph_index_end");

  m_part_spec.start_part= NO_CURRENT_PART_ID;
  m_ref_usage= REF_NOT_USED;
  for (i= m_part_info->get_first_used_partition();
       i < MY_BIT_NONE;
       i= m_part_info->get_next_used_partition(i))
  {
    int tmp;
    if ((tmp= index_end_in_part(i)))
      error= tmp;
  }
  destroy_record_priority_queue();
  m_handler->active_index= MAX_KEY;
  DBUG_RETURN(error);
}


/**
  Read one record in an index scan and start an index scan.

  index_read_map starts a new index scan using a start key. The MySQL Server
  will check the end key on its own. Thus to function properly the
  partitioned handler need to ensure that it delivers records in the sort
  order of the MySQL Server.
  index_read_map can be restarted without calling index_end on the previous
  index scan and without calling index_init. In this case the index_read_map
  is on the same index as the previous index_scan. This is particularly
  used in conjunction with multi read ranges.

  @param[out] buf          Read row in MySQL Row Format
  @param[in]  key          Key parts in consecutive order
  @param[in]  keypart_map  Which part of key is used
  @param[in]  find_flag    What type of key condition is used

  @return Operation status.
    @retval    0  Success
    @retval != 0  Error code
*/

int Partition_helper::ph_index_read_map(uchar *buf,
                                     const uchar *key,
                                     key_part_map keypart_map,
                                     enum ha_rkey_function find_flag)
{
  DBUG_ENTER("Partition_helper::ph_index_read_map");

  /*
    Record the start key; common_index_read() computes its length from
    keypart_map and uses find_flag to decide the scan direction.
  */
  m_index_scan_type= PARTITION_INDEX_READ;
  m_start_key.key= key;
  m_start_key.keypart_map= keypart_map;
  m_start_key.flag= find_flag;
  DBUG_RETURN(common_index_read(buf, true));
}


/**
  Common routine for a number of index_read variants.

  @param[out] buf             Buffer where the record should be returned.
  @param[in]  have_start_key  TRUE <=> the left endpoint is available, i.e.
                              we're in index_read call or in read_range_first
                              call and the range has left endpoint.
                              FALSE <=> there is no left endpoint (we're in
                              read_range_first() call and the range has no left
                              endpoint).

  @return Operation status
    @retval 0                    OK
    @retval HA_ERR_END_OF_FILE   Whole index scanned, without finding the record.
    @retval HA_ERR_KEY_NOT_FOUND Record not found, but index cursor positioned.
    @retval other                Error code.

  @details
    Start scanning the range (when invoked from read_range_first()) or doing
    an index lookup (when invoked from index_read_XXX):
     - If possible, perform partition selection
     - Find the set of partitions we're going to use
     - Depending on whether we need ordering:
        NO:  Get the first record from first used partition (see
             handle_unordered_scan_next_partition)
        YES: Fill the priority queue and get the record that is the first in
             the ordering
*/

int Partition_helper::common_index_read(uchar *buf, bool have_start_key)
{
  int error;
  m_reverse_order= false;
  DBUG_ENTER("Partition_helper::common_index_read");

  DBUG_PRINT("info", ("m_ordered %u m_ordered_scan_ong %u",
                      m_ordered, m_ordered_scan_ongoing));

  if (have_start_key)
  {
    /* The key length follows from the active index and the keypart map. */
    m_start_key.length= calculate_key_len(m_table,
                                          m_handler->active_index,
                                          m_start_key.keypart_map);
    DBUG_PRINT("info", ("have_start_key map %lu find_flag %u len %u",
                        m_start_key.keypart_map, m_start_key.flag,
                        m_start_key.length));
    DBUG_ASSERT(m_start_key.length);
  }

  /* Find out which partitions to scan and whether ordering is needed. */
  error= partition_scan_set_up(buf, have_start_key);
  if (error)
    DBUG_RETURN(error);

  if (have_start_key)
  {
    /* These search modes read backwards and always use the ordered scan. */
    switch (m_start_key.flag) {
    case HA_READ_KEY_OR_PREV:
    case HA_READ_PREFIX_LAST:
    case HA_READ_PREFIX_LAST_OR_PREV:
    case HA_READ_BEFORE_KEY:
      m_reverse_order= true;
      m_ordered_scan_ongoing= true;
      break;
    default:
      break;
    }
  }
  DBUG_PRINT("info", ("m_ordered %u m_o_scan_ong %u have_start_key %u",
                      m_ordered, m_ordered_scan_ongoing, have_start_key));

  if (m_ordered_scan_ongoing)
  {
    /*
      Ordered index scan over the partition set created by the
      get_partition_set method.
    */
    error= handle_ordered_index_scan(buf);
  }
  else
  {
    /*
      Unordered index scan: used when read_range is used and the flag is set
      to not use ordered, and also when only one partition is to be scanned.
      It uses the partition set created.
    */
    DBUG_PRINT("info", ("doing unordered scan"));
    error= handle_unordered_scan_next_partition(buf);
  }
  DBUG_RETURN(error);
}


/**
  Start an index scan from leftmost record and return first record.

  index_first() asks for the first key in the index.
  This is similar to index_read except that there is no start key since
  the scan starts from the leftmost entry and proceeds forward with
  index_next.

  @param[out] buf  Read row in MySQL Row Format.

  @return Operation status.
    @retval    0  Success
    @retval != 0  Error code
*/

int Partition_helper::ph_index_first(uchar *buf)
{
  DBUG_ENTER("Partition_helper::ph_index_first");

  /* Forward scan, starting without any start key. */
  m_reverse_order= false;
  m_index_scan_type= PARTITION_INDEX_FIRST;

  const int error= common_first_last(buf);
  DBUG_RETURN(error);
}


/**
  Start an index scan from rightmost record and return first record.

  index_last() asks for the last key in the index.
  This is similar to index_read except that there is no start key since
  the scan starts from the rightmost entry and proceeds backward with
  index_prev.

  @param[out] buf  Read row in MySQL Row Format.

  @return Operation status.
    @retval    0  Success
    @retval != 0  Error code
*/

int Partition_helper::ph_index_last(uchar *buf)
{
  DBUG_ENTER("Partition_helper::ph_index_last");

  if (m_part_info->get_first_used_partition() == MY_BIT_NONE)
  {
    /*
      No partition to scan. The table status is set the same way as in the
      other "no partition" exits (ph_read_range_first and
      partition_scan_set_up).
    */
    m_table->status= STATUS_NOT_FOUND;
    DBUG_RETURN(HA_ERR_END_OF_FILE);
  }

  /* Reverse scan starting from the last index entry. */
  m_index_scan_type= PARTITION_INDEX_LAST;
  m_reverse_order= true;
  DBUG_RETURN(common_first_last(buf));
}


/**
  Common routine for index_first/index_last.

  @param[out] buf  Read row in MySQL Row Format.

  @return Operation status.
    @retval    0  Success
    @retval != 0  Error code
*/

int Partition_helper::common_first_last(uchar *buf)
{
  DBUG_ENTER("Partition_helper::common_first_last");

  /* There is no start key for index_first/index_last. */
  const int error= partition_scan_set_up(buf, false);
  if (error)
    DBUG_RETURN(error);

  /* index_last always takes the ordered path, even if ordering is off. */
  const bool use_ordered_scan= (m_ordered_scan_ongoing ||
                                m_index_scan_type == PARTITION_INDEX_LAST);
  if (use_ordered_scan)
    DBUG_RETURN(handle_ordered_index_scan(buf));

  DBUG_RETURN(handle_unordered_scan_next_partition(buf));
}


/**
  Read last using key.

  This is used in join_read_last_key to optimize away an ORDER BY.
  Can only be used on indexes supporting HA_READ_ORDER.

  @param[out] buf          Read row in MySQL Row Format
  @param[in]  key          Key
  @param[in]  keypart_map  Which part of key is used

  @return Operation status.
    @retval    0  Success
    @retval != 0  Error code
*/

int Partition_helper::ph_index_read_last_map(uchar *buf,
                                          const uchar *key,
                                          key_part_map keypart_map)
{
  DBUG_ENTER("Partition_helper::ph_index_read_last_map");

  /* Set up the start key as a HA_READ_PREFIX_LAST lookup. */
  m_start_key.flag= HA_READ_PREFIX_LAST;
  m_start_key.keypart_map= keypart_map;
  m_start_key.key= key;

  m_index_scan_type= PARTITION_INDEX_READ_LAST;
  m_ordered= true;                              // Safety measure

  const int error= common_index_read(buf, true);
  DBUG_RETURN(error);
}


/**
  Read index by key and keymap.

  Positions an index cursor to the index specified.
  Fetches the row if available. If the key value is null,
  begin at first key of the index.

  Optimization of the default implementation to take advantage of dynamic
  partition pruning.

  @param[out] buf          Read row in MySQL Row Format
  @param[in]  index        Index to read from
  @param[in]  key          Key
  @param[in]  keypart_map  Which part of key is used
  @param[in]  find_flag    Direction/how to search.

  @return Operation status.
    @retval    0  Success
    @retval != 0  Error code
*/
int Partition_helper::ph_index_read_idx_map(uchar *buf,
                                         uint index,
                                         const uchar *key,
                                         key_part_map keypart_map,
                                         enum ha_rkey_function find_flag)
{
  int error= HA_ERR_KEY_NOT_FOUND;
  uint cur_part;
  DBUG_ENTER("Partition_helper::ph_index_read_idx_map");

  if (find_flag != HA_READ_KEY_EXACT)
  {
    /*
      Only HA_READ_KEY_EXACT is expected here. If other find_flag values
      start being used, we should investigate whether it is possible to
      optimize for them as well.
    */
    DBUG_ASSERT(0);
    DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
  }

  /* Describe the lookup key and let it select the partition set. */
  m_start_key.key= key;
  m_start_key.keypart_map= keypart_map;
  m_start_key.flag= find_flag;
  m_start_key.length= calculate_key_len(m_table,
                                        index,
                                        m_start_key.keypart_map);

  get_partition_set(m_table, buf, index, &m_start_key, &m_part_spec);

  /*
    Either exactly one partition was found (start_part == end_part)
    or there is no matching partition (start_part > end_part).
  */
  DBUG_ASSERT(m_part_spec.start_part >= m_part_spec.end_part);
  /* The start partition must be marked as used. */
  DBUG_ASSERT(m_part_spec.start_part > m_part_spec.end_part ||
              m_part_info->is_partition_used(m_part_spec.start_part));

  for (cur_part= m_part_spec.start_part;
       cur_part <= m_part_spec.end_part;
       cur_part= m_part_info->get_next_used_partition(cur_part))
  {
    error= index_read_idx_map_in_part(cur_part,
                                      buf,
                                      index,
                                      key,
                                      keypart_map,
                                      find_flag);
    /* Keep looking only while the key is simply not found. */
    const bool not_found= (error == HA_ERR_KEY_NOT_FOUND ||
                           error == HA_ERR_END_OF_FILE);
    if (!not_found)
      break;
  }

  /* Still inside the range means the loop stopped on this partition. */
  if (cur_part <= m_part_spec.end_part)
    m_last_part= cur_part;

  DBUG_RETURN(error);
}


/**
  Read next record in a forward index scan.

  Used to read forward through the index (left to right, low to high).

  @param[out] buf  Read row in MySQL Row Format.

  @return Operation status.
    @retval    0  Success
    @retval != 0  Error code
*/

int Partition_helper::ph_index_next(uchar *buf)
{
  DBUG_ENTER("Partition_helper::ph_index_next");

  /*
    TODO(low priority):
    For partitioning to work with the HANDLER commands, a sequence such as
    index_last() -> index_prev() -> index_next() must be supported: when
    the direction changes, those partitions must be stepped back in the
    record queue so that no value is returned from the wrong direction.
  */
  DBUG_ASSERT(m_index_scan_type != PARTITION_INDEX_LAST ||
              m_table->open_by_handler);

  /* Not a "next same" read, hence false as second argument. */
  if (m_ordered_scan_ongoing)
    DBUG_RETURN(handle_ordered_next(buf, false));

  DBUG_RETURN(handle_unordered_next(buf, false));
}


/**
  Read next same record.

  This routine is used to read the next but only if the key is the same
  as supplied in the call.

  @param[out] buf     Read row in MySQL Row Format.
  @param[in]  key     Key.
  @param[in]  keylen  Length of key.

  @return Operation status.
    @retval    0  Success
    @retval != 0  Error code
*/

int Partition_helper::ph_index_next_same(uchar *buf, const uchar *key, uint keylen)
{
  DBUG_ENTER("Partition_helper::ph_index_next_same");

  /* The key is expected to be the one the scan was started with. */
  DBUG_ASSERT(keylen == m_start_key.length);
  DBUG_ASSERT(m_index_scan_type != PARTITION_INDEX_LAST);

  if (m_ordered_scan_ongoing)
  {
    DBUG_RETURN(handle_ordered_next(buf, true));
  }
  DBUG_RETURN(handle_unordered_next(buf, true));
}


/**
  Read next record when performing index scan backwards.

  Used to read backwards through the index (right to left, high to low).

  @param[out] buf  Read row in MySQL Row Format.

  @return Operation status.
    @retval    0  Success
    @retval != 0  Error code
*/

int Partition_helper::ph_index_prev(uchar *buf)
{
  DBUG_ENTER("Partition_helper::ph_index_prev");

  /* TODO: see the comment about changing direction in ph_index_next(). */
  DBUG_ASSERT(m_index_scan_type != PARTITION_INDEX_FIRST ||
              m_table->open_by_handler);

  /* Reading backwards always goes through the ordered path. */
  const int error= handle_ordered_prev(buf);
  DBUG_RETURN(error);
}


/**
  Start a read of one range with start and end key.

  We re-implement read_range_first since we don't want the compare_key
  check at the end. This is already performed in the partition handler.
  read_range_next is very much different due to that we need to scan
  all underlying handlers.

  @param start_key     Specification of start key.
  @param end_key       Specification of end key.
  @param eq_range_arg  Is it equal range.
  @param sorted        Should records be returned in sorted order.

  @return Operation status.
    @retval    0  Success
    @retval != 0  Error code
*/

int Partition_helper::ph_read_range_first(const key_range *start_key,
                                       const key_range *end_key,
                                       bool eq_range_arg,
                                       bool sorted)
{
  int error= HA_ERR_END_OF_FILE;
  bool have_start_key= (start_key != NULL);
  uint part_id= m_part_info->get_first_used_partition();
  DBUG_ENTER("Partition_helper::ph_read_range_first");

  if (part_id == MY_BIT_NONE)
  {
    /* No partition to scan. */
    m_table->status= STATUS_NOT_FOUND;
    DBUG_RETURN(error);
  }

  m_ordered= sorted;
  set_eq_range(eq_range_arg);
  m_handler->set_end_range(end_key, handler::RANGE_SCAN_ASC);

  set_range_key_part(m_curr_key_info[0]->key_part);
  if (have_start_key)
    m_start_key= *start_key;
  else
    m_start_key.key= NULL;

  m_index_scan_type= PARTITION_READ_RANGE;
  error= common_index_read(m_table->record[0], have_start_key);
  DBUG_RETURN(error);
}


/**
  Read next record in read of a range with start and end key.

  @return Operation status.
    @retval    0  Success
    @retval != 0  Error code
*/

int Partition_helper::ph_read_range_next()
{
  DBUG_ENTER("Partition_helper::ph_read_range_next");

  /* Rows are always returned in record[0]; pick the scan flavour in use. */
  if (!m_ordered_scan_ongoing)
  {
    DBUG_RETURN(handle_unordered_next(m_table->record[0], get_eq_range()));
  }
  DBUG_RETURN(handle_ordered_next(m_table->record[0], get_eq_range()));
}


/**
  Common routine to set up index scans.

  Find out which partitions we'll need to read when scanning the specified
  range.

  If we need to scan only one partition, set m_ordered_scan_ongoing=FALSE
  as we will not need to do merge ordering.

  @param buf            Buffer to later return record in (this function
                        needs it to calculate partitioning function values)

  @param idx_read_flag  TRUE <=> m_start_key has range start endpoint which
                        probably can be used to determine the set of
                        partitions to scan.
                        FALSE <=> there is no start endpoint.

  @return Operation status.
    @retval   0  Success
    @retval !=0  Error code
*/

int Partition_helper::partition_scan_set_up(uchar * buf, bool idx_read_flag)
{
  DBUG_ENTER("Partition_helper::partition_scan_set_up");

  if (!idx_read_flag)
  {
    /* No start key: begin with the full range of partitions. */
    // TODO: set to get_first_used_part() instead!
    m_part_spec.start_part= 0;
    // TODO: Implement bitmap_get_last_set() and use that here!
    m_part_spec.end_part= m_tot_parts - 1;
  }
  else
  {
    /* Let the start key narrow down the set of partitions. */
    get_partition_set(m_table,
                      buf,
                      m_handler->active_index,
                      &m_start_key,
                      &m_part_spec);
  }

  if (m_part_spec.start_part > m_part_spec.end_part)
  {
    /*
      A partition set was discovered, but it is empty, so key not found
      is reported.
    */
    DBUG_PRINT("info", ("scan with no partition to scan"));
    m_table->status= STATUS_NOT_FOUND;
    DBUG_RETURN(HA_ERR_END_OF_FILE);
  }

  if (m_part_spec.start_part != m_part_spec.end_part)
  {
    /*
      More than one candidate partition: the scan is ordered if the caller
      asked for it (m_ordered).
      Only exact partitions are discovered atm by get_partition_set.
      The used-partitions bitmap must also have at least one bit set,
      otherwise the result from this table is the empty set.
    */
    const uint first_used= m_part_info->get_first_used_partition();
    if (first_used == MY_BIT_NONE)
    {
      DBUG_PRINT("info", ("scan with no partition to scan"));
      m_table->status= STATUS_NOT_FOUND;
      DBUG_RETURN(HA_ERR_END_OF_FILE);
    }
    /* Never start before the first used partition. */
    if (m_part_spec.start_part < first_used)
      m_part_spec.start_part= first_used;
    m_ordered_scan_ongoing= m_ordered;
  }
  else
  {
    /*
      Exactly one partition to scan; this never needs the ordered
      index scan.
    */
    DBUG_PRINT("info", ("index scan using the single partition %d",
                        m_part_spec.start_part));
    m_ordered_scan_ongoing= FALSE;
  }

  DBUG_ASSERT(m_part_spec.start_part < m_tot_parts);
  DBUG_ASSERT(m_part_spec.end_part < m_tot_parts);
  DBUG_RETURN(0);
}


/**
  Common routine to handle index_next with unordered results.

  These routines are used to scan partitions without considering order.
  This is performed in two situations.
  1) In read_multi_range this is the normal case
  2) When performing any type of index_read, index_first, index_last where
  all fields in the partition function is bound. In this case the index
  scan is performed on only one partition and thus it isn't necessary to
  perform any sort.

  @param[out] buf        Read row in MySQL Row Format.
  @param[in]  is_next_same  TRUE when called from index_next_same, or from
                            read_range_next with get_eq_range() set.

  @return Operation status.
    @retval HA_ERR_END_OF_FILE  End of scan
    @retval 0                   Success
    @retval other               Error code
*/

int Partition_helper::handle_unordered_next(uchar *buf, bool is_next_same)
{
  int result;
  DBUG_ENTER("Partition_helper::handle_unordered_next");

  /* A start_part beyond the last partition means there is nothing to scan. */
  if (m_part_spec.start_part >= m_tot_parts)
  {
    /* Should only happen with SQL HANDLER! */
    DBUG_ASSERT(m_table->open_by_handler);
    DBUG_RETURN(HA_ERR_END_OF_FILE);
  }

  /*
    Fetch the next row from the partition currently being scanned, using
    the call that matches how the scan was started.
  */
  if (is_next_same)
  {
    result= index_next_same_in_part(m_part_spec.start_part,
                                    buf,
                                    m_start_key.key,
                                    m_start_key.length);
  }
  else if (m_index_scan_type != PARTITION_READ_RANGE)
  {
    result= index_next_in_part(m_part_spec.start_part, buf);
  }
  else
  {
    /* Range reads are called without a buffer and expect buf == record[0]. */
    DBUG_ASSERT(buf == m_table->record[0]);
    result= read_range_next_in_part(m_part_spec.start_part, NULL);
  }

  if (result != HA_ERR_END_OF_FILE)
  {
    /* Row found (or a real error): remember which partition served it. */
    m_last_part= m_part_spec.start_part;
    DBUG_RETURN(result);
  }

  /* This partition is exhausted, continue with the next one. */
  m_part_spec.start_part++;
  result= handle_unordered_scan_next_partition(buf);
  DBUG_RETURN(result);
}


/**
  Handle index_next when changing to new partition.

  This routine is used to start the index scan on the next partition.
  Both initial start and after completing scan on one partition.

  @param[out] buf  Read row in MySQL Row Format

  @return Operation status.
    @retval HA_ERR_END_OF_FILE  End of scan
    @retval 0                   Success
    @retval other               Error code
*/

int Partition_helper::handle_unordered_scan_next_partition(uchar * buf)
{
  int saved_error= HA_ERR_END_OF_FILE;
  uint part;
  DBUG_ENTER("Partition_helper::handle_unordered_scan_next_partition");

  /*
    Pick the partition to start with. Asking for the used partition that
    follows (start_part - 1) lets start_part itself be selected; for
    start_part == 0 there is no preceding id, so ask for the first used one.
  */
  if (m_part_spec.start_part == 0)
    part= m_part_info->get_first_used_partition();
  else
    part= m_part_info->get_next_used_partition(m_part_spec.start_part - 1);

  while (part <= m_part_spec.end_part)
  {
    int error;
    m_part_spec.start_part= part;

    /* Start the scan on this partition according to the scan type. */
    switch (m_index_scan_type) {
    case PARTITION_READ_RANGE:
      DBUG_ASSERT(buf == m_table->record[0]);
      DBUG_PRINT("info", ("read_range_first on partition %d", part));
      error= read_range_first_in_part(part,
                                      NULL,
                                      m_start_key.key ? &m_start_key : NULL,
                                      m_handler->end_range,
                                      get_eq_range(),
                                      false);
      break;
    case PARTITION_INDEX_READ:
      DBUG_PRINT("info", ("index_read on partition %d", part));
      error= index_read_map_in_part(part,
                                    buf,
                                    m_start_key.key,
                                    m_start_key.keypart_map,
                                    m_start_key.flag);
      break;
    case PARTITION_INDEX_FIRST:
      DBUG_PRINT("info", ("index_first on partition %d", part));
      error= index_first_in_part(part, buf);
      break;
    case PARTITION_INDEX_FIRST_UNORDERED:
      /* When is this ever used? */
      DBUG_ASSERT(0);
      /*
        Unsorted scan: index_first is avoided here since not every handler
        supports it and there is no need to impose a sort order.
      */
      DBUG_PRINT("info", ("read_range_first on partition %d", part));
      DBUG_ASSERT(buf == m_table->record[0]);
      error= read_range_first_in_part(part,
                                      NULL,
                                      NULL,
                                      m_handler->end_range,
                                      get_eq_range(),
                                      false);
      break;
    default:
      DBUG_ASSERT(0);
      DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
    }

    if (error == 0)
    {
      /* Got a row; this partition is now the current one. */
      m_last_part= part;
      DBUG_RETURN(0);
    }
    if (!(error == HA_ERR_END_OF_FILE || error == HA_ERR_KEY_NOT_FOUND))
      DBUG_RETURN(error);

    /*
      Once any partition has reported HA_ERR_KEY_NOT_FOUND that is the error
      to return (instead of HA_ERR_END_OF_FILE), so the search can continue.
    */
    if (error == HA_ERR_KEY_NOT_FOUND)
      saved_error= HA_ERR_KEY_NOT_FOUND;
    DBUG_PRINT("info", ("END_OF_FILE/KEY_NOT_FOUND on partition %d", part));

    part= m_part_info->get_next_used_partition(part);
  }

  /* Every partition hit end-of-file: mark that no partition is current. */
  if (saved_error == HA_ERR_END_OF_FILE)
    m_part_spec.start_part= NO_CURRENT_PART_ID;
  DBUG_RETURN(saved_error);
}


/**
  Common routine to start index scan with ordered results.

  @param[out] buf  Read row in MySQL Row Format

  @return Operation status
    @retval HA_ERR_END_OF_FILE    End of scan
    @retval HA_ERR_KEY_NOT_FOUND  End of scan
    @retval 0                     Success
    @retval other                 Error code

  @details
    This part contains the logic to handle index scans that require ordered
    output. This includes all except those started by read_range_first with
    the flag ordered set to FALSE. Thus most direct index_read and all
    index_first and index_last.

    We implement ordering by keeping one record plus a key buffer for each
    partition. Every time a new entry is requested we will fetch a new
    entry from the partition that is currently not filled with an entry.
    Then the entry is put into its proper sort position.

    Returning a record is done by getting the top record, copying the
    record to the request buffer and setting the partition as empty on
    entries.
*/

int Partition_helper::handle_ordered_index_scan(uchar *buf)
{
  uint i;
  std::vector<uchar*> parts;
  bool found= false;
  uchar *part_rec_buf_ptr= m_ordered_rec_buffer;
  int saved_error= HA_ERR_END_OF_FILE;
  DBUG_ENTER("Partition_helper::handle_ordered_index_scan");
  DBUG_ASSERT(part_rec_buf_ptr);

  /* Drop HA_ERR_KEY_NOT_FOUND bookkeeping left over from an earlier scan. */
  if (m_key_not_found)
  {
    m_key_not_found= false;
    bitmap_clear_all(&m_key_not_found_partitions);
    DBUG_PRINT("info", ("Cleared m_key_not_found_partitions"));
  }
  /* Start with an empty queue; found rows are first collected in 'parts'. */
  m_top_entry= NO_CURRENT_PART_ID;
  m_queue->clear();
  parts.reserve(m_queue->capacity());
  DBUG_ASSERT(m_part_info->is_partition_used(m_part_spec.start_part));

  /*
    Position part_rec_buf_ptr to point to the first used partition >=
    start_part. There may be partitions that are marked as used but lie
    before start_part. Those partitions have allocated record buffers but
    are dynamically pruned, so their buffers must be skipped.
  */
  for (i= m_part_info->get_first_used_partition();
       i < m_part_spec.start_part;
       i= m_part_info->get_next_used_partition(i))
  {
    part_rec_buf_ptr+= m_rec_offset + m_rec_length;
  }
  DBUG_PRINT("info", ("m_part_spec.start_part %u first_used_part %u",
                      m_part_spec.start_part, i));

  /* Read the first row from every remaining used partition in range. */
  for (/* continue from above */ ;
       i <= m_part_spec.end_part;
       i= m_part_info->get_next_used_partition(i))
  {
    DBUG_PRINT("info", ("reading from part %u (scan_type: %u inx: %u)",
                        i, m_index_scan_type, m_handler->active_index));
    /* The slot for a partition starts with that partition's id. */
    DBUG_ASSERT(i == uint2korr(part_rec_buf_ptr));
    uchar *rec_buf_ptr= part_rec_buf_ptr + m_rec_offset;
    uchar *read_buf;
    int error;
    DBUG_PRINT("info", ("part %u, scan_type %d", i, m_index_scan_type));

    /* ICP relies on Item evaluation, which expects the row in record[0]. */
    if (m_handler->pushed_idx_cond)
      read_buf= m_table->record[0];
    else
      read_buf= rec_buf_ptr;

    switch (m_index_scan_type) {
    case PARTITION_INDEX_READ:
      error= index_read_map_in_part(i,
                                    read_buf,
                                    m_start_key.key,
                                    m_start_key.keypart_map,
                                    m_start_key.flag);
      break;
    case PARTITION_INDEX_FIRST:
      error= index_first_in_part(i, read_buf);
      break;
    case PARTITION_INDEX_LAST:
      error= index_last_in_part(i, read_buf);
      break;
    case PARTITION_INDEX_READ_LAST:
      error= index_read_last_map_in_part(i,
                                         read_buf,
                                         m_start_key.key,
                                         m_start_key.keypart_map);
      break;
    case PARTITION_READ_RANGE:
    {
      /*
        To enable optimization in derived engines, we provide a read buffer
        pointer if we want to read into something different than table->record[0]
        (which read_range_* always uses).
      */
      error= read_range_first_in_part(i,
                                      read_buf == m_table->record[0]
                                        ? NULL : read_buf,
                                      m_start_key.key ? &m_start_key : NULL,
                                      m_handler->end_range,
                                      get_eq_range(),
                                      true);
      break;
    }
    default:
      /*
        Unknown scan type. Report an internal error, as
        handle_unordered_scan_next_partition() does, instead of end-of-file,
        which in non-debug builds would look like a valid empty result.
      */
      DBUG_ASSERT(false);
      DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
    }
    DBUG_PRINT("info", ("error %d from partition %u", error, i));
    if (!error)
    {
      /*
        When using ICP, copy record[0] to the priority queue for sorting.
        This is only done for a successful read; after a failed read the
        slot is not queued, so there is nothing worth copying.
      */
      if (m_handler->pushed_idx_cond)
        memcpy(rec_buf_ptr, read_buf, m_rec_length);
      found= true;
      if (m_ref_usage != REF_NOT_USED)
      {
        /* position_in_last_part needs m_last_part set. */
        m_last_part= i;
        position_in_last_part(part_rec_buf_ptr + PARTITION_BYTES_IN_POS,
                              rec_buf_ptr);
      }
      /*
        Save for later insertion in queue;
      */
      parts.push_back(part_rec_buf_ptr);
      DBUG_DUMP("row", read_buf, m_rec_length);
    }
    else if (error == HA_ERR_KEY_NOT_FOUND)
    {
      /*
        Remember the partition so that a following index_next/prev can
        include it (see handle_ordered_index_scan_key_not_found()).
      */
      DBUG_PRINT("info", ("HA_ERR_KEY_NOT_FOUND from partition %u", i));
      bitmap_set_bit(&m_key_not_found_partitions, i);
      m_key_not_found= true;
      saved_error= error;
    }
    else if (error != HA_ERR_END_OF_FILE)
    {
      /* A real error: abort the scan. */
      DBUG_RETURN(error);
    }
    /* HA_ERR_END_OF_FILE: this partition simply contributes no row. */
    part_rec_buf_ptr+= m_rec_offset + m_rec_length;
  }
  if (found)
  {
    /*
      We found at least one partition with data, now sort all entries and
      after that read the first entry and copy it to the buffer to return in.
    */
    m_queue->m_max_at_top= m_reverse_order;
    m_queue->m_keys= m_curr_key_info;
    DBUG_ASSERT(m_queue->empty());
    /*
      If PK, we should not sort by rowid, since that is already done
      through the KEY setup.
    */
    DBUG_ASSERT(!m_curr_key_info[1] || m_ref_usage == REF_NOT_USED);
    m_queue->assign(parts);
    return_top_record(buf);
    m_table->status= 0;
    DBUG_PRINT("info", ("Record returned from partition %d", m_top_entry));
    DBUG_RETURN(0);
  }
  /* No row anywhere: HA_ERR_KEY_NOT_FOUND if any partition said so, else EOF. */
  DBUG_RETURN(saved_error);
}


/**
  Return the top record in sort order.

  @param[out] buf  Row returned in MySQL Row Format.
*/

void Partition_helper::return_top_record(uchar *buf)
{
  /* The queue entry begins with the partition id; the row follows at m_rec_offset. */
  uchar *top_entry= m_queue->top();
  const uint top_part= uint2korr(top_entry);

  copy_cached_row(buf, top_entry + m_rec_offset);
  DBUG_PRINT("info", ("from part_id %u", top_part));
  DBUG_DUMP("returned_row", buf, m_table->s->reclength);

  /* The returned row's partition is both the last-used and the top one. */
  m_last_part= top_part;
  m_top_entry= top_part;
}


/**
  Add index_next/prev results from partitions without exact match.

  If there were any partitions that returned HA_ERR_KEY_NOT_FOUND when
  ha_index_read_map was done, those partitions must be included in the
  following index_next/prev call.
*/

int Partition_helper::handle_ordered_index_scan_key_not_found()
{
  /*
    For every partition flagged in m_key_not_found_partitions, fetch one row
    with index_prev/index_next (depending on m_reverse_order) and push it
    onto m_queue. Returns 0, or the first error that is neither
    HA_ERR_END_OF_FILE nor HA_ERR_KEY_NOT_FOUND.
  */
  int error;
  uint i;
  /* Queue size on entry, used below to detect whether anything was added. */
  size_t old_elements= m_queue->size();
  uchar *part_buf= m_ordered_rec_buffer;
  uchar *curr_rec_buf= NULL;
  DBUG_ENTER("Partition_helper::handle_ordered_index_scan_key_not_found");
  DBUG_ASSERT(m_key_not_found);
  DBUG_ASSERT(part_buf);
  /*
    Loop over all used partitions to get the correct offset
    into m_ordered_rec_buffer.
  */
  for (i= m_part_info->get_first_used_partition();
       i < MY_BIT_NONE;
       i= m_part_info->get_next_used_partition(i))
  {
    if (bitmap_is_set(&m_key_not_found_partitions, i))
    {
      /*
        This partition is used and did return HA_ERR_KEY_NOT_FOUND
        in index_read_map.
      */
      uchar *read_buf;
      curr_rec_buf= part_buf + m_rec_offset;
      /* ICP relies on Item evaluation, which expects the row in record[0]. */
      if (m_handler->pushed_idx_cond)
        read_buf= m_table->record[0];
      else
        read_buf= curr_rec_buf;

      /* Step in the direction of the ongoing scan. */
      if (m_reverse_order)
        error= index_prev_in_part(i, read_buf);
      else
        error= index_next_in_part(i, read_buf);
      /* HA_ERR_KEY_NOT_FOUND is not allowed from index_next! */
      DBUG_ASSERT(error != HA_ERR_KEY_NOT_FOUND);
      DBUG_PRINT("info", ("Filling from partition %u reverse %u error %d",
                         i, m_reverse_order, error));
      if (!error)
      {
        /* When using ICP, copy record[0] to the priority queue for sorting. */
        if (m_handler->pushed_idx_cond)
          memcpy(curr_rec_buf, read_buf, m_rec_length);
        if (m_ref_usage != REF_NOT_USED)
        {
          /* position_in_last_part needs m_last_part set. */
          m_last_part= i;
          position_in_last_part(part_buf + PARTITION_BYTES_IN_POS,
                                curr_rec_buf);
        }
        /* The partition now has a buffered row; add its slot to the queue. */
        m_queue->push(part_buf);
      }
      /*
        End-of-file (and, in non-debug builds, key-not-found) just means the
        partition contributes no row. Any other error aborts here; note that
        on this early return the key-not-found state is left set.
      */
      else if (error != HA_ERR_END_OF_FILE && error != HA_ERR_KEY_NOT_FOUND)
        DBUG_RETURN(error);
    }
    /* Advance one slot for every used partition, flagged or not. */
    part_buf+= m_rec_offset + m_rec_length;
  }
  /* At least one flagged partition must have been visited. */
  DBUG_ASSERT(curr_rec_buf);
  bitmap_clear_all(&m_key_not_found_partitions);
  m_key_not_found= false;

  if (m_queue->size() > old_elements)
  {
    /* Update m_top_entry, which may have changed. */
    uchar *key_buffer= m_queue->top();
    m_top_entry= uint2korr(key_buffer);
  }
  DBUG_RETURN(0);
}


/**
  Common routine to handle index_next with ordered results.

  @param[out] buf        Read row in MySQL Row Format.
  @param[in]  is_next_same  Called from index_next_same.

  @return Operation status.
    @retval HA_ERR_END_OF_FILE  End of scan
    @retval 0                   Success
    @retval other               Error code
*/

int Partition_helper::handle_ordered_next(uchar *buf, bool is_next_same)
{
  int rc;
  /*
    Snapshot the current top partition and its row buffer up front; the
    key-not-found handling below may change the queue top.
  */
  const uint top_part= m_top_entry;
  uchar *top_rec= NULL;
  uchar *fetch_buf;
  DBUG_ENTER("Partition_helper::handle_ordered_next");

  if (!m_queue->empty())
    top_rec= m_queue->top() + m_rec_offset;

  if (m_reverse_order)
  {
    /*
      TODO: To support change of direction (index_prev -> index_next,
      index_read_map(HA_READ_KEY_EXACT) -> index_prev etc.)
      We would need to:
      - Step back all cursors for which we have a buffered row from a
        previous next/prev call (i.e. for all partitions we previously
        called index_prev on, we must call index_next and skip that row).
      - Empty the priority queue and initialize it again with reverse
        ordering.
    */
    DBUG_ASSERT(m_table->open_by_handler);
    DBUG_RETURN(HA_ERR_WRONG_COMMAND);
  }

  if (m_key_not_found && is_next_same)
  {
    /* Only rows matching the key are wanted: forget the other partitions. */
    m_key_not_found= false;
    bitmap_clear_all(&m_key_not_found_partitions);
  }
  else if (m_key_not_found)
  {
    /* Some partitions are not yet represented in the index record queue. */
    const size_t queued_before= m_queue->size();
    rc= handle_ordered_index_scan_key_not_found();
    if (rc)
      DBUG_RETURN(rc);
    /*
      If a partition that previously gave HA_ERR_KEY_NOT_FOUND now holds the
      top row, return that row right away. Otherwise continue below and
      replace the old top row with its successor.
    */
    if (top_part != m_top_entry && queued_before != m_queue->size())
    {
      return_top_record(buf);
      DBUG_PRINT("info", ("Returning row from part %u (prev KEY_NOT_FOUND)",
                          m_top_entry));
      DBUG_RETURN(0);
    }
  }

  /* No current partition: the scan is already finished. */
  if (top_part >= m_tot_parts)
    DBUG_RETURN(HA_ERR_END_OF_FILE);

  DBUG_PRINT("info", ("next row from part %u (inx %u)",
                      top_part, m_handler->active_index));

  /* Assert that buffer for fetch is not NULL */
  DBUG_ASSERT(top_rec);

  /* ICP relies on Item evaluation, which expects the row in record[0]. */
  if (m_handler->pushed_idx_cond)
    fetch_buf= m_table->record[0];
  else
    fetch_buf= top_rec;

  /* Advance the cursor of the partition that supplied the previous row. */
  if (is_next_same)
  {
    rc= index_next_same_in_part(top_part,
                                fetch_buf,
                                m_start_key.key,
                                m_start_key.length);
  }
  else if (m_index_scan_type == PARTITION_READ_RANGE)
  {
    rc= read_range_next_in_part(top_part,
                                fetch_buf == m_table->record[0]
                                  ? NULL : fetch_buf);
  }
  else
  {
    rc= index_next_in_part(top_part, fetch_buf);
  }

  if (rc == HA_ERR_END_OF_FILE)
  {
    /* That partition is exhausted: drop it and use the next buffered row. */
    if (!m_queue->empty())
      m_queue->pop();
    if (!m_queue->empty())
    {
      return_top_record(buf);
      DBUG_PRINT("info", ("Record returned from partition %u (2)",
                  m_top_entry));
      m_table->status= 0;
      DBUG_RETURN(0);
    }
    /*
      The priority queue is empty, so all partitions are finished. Reset
      the current partition to NONE, which implies HA_ERR_END_OF_FILE for
      all future calls.
    */
    m_top_entry= NO_CURRENT_PART_ID;
    DBUG_RETURN(HA_ERR_END_OF_FILE);
  }
  if (rc)
    DBUG_RETURN(rc);

  /* When using ICP, copy record[0] to the priority queue for sorting. */
  if (m_handler->pushed_idx_cond)
    memcpy(top_rec, fetch_buf, m_rec_length);
  if (m_ref_usage != REF_NOT_USED)
  {
    /* position_in_last_part needs m_last_part set. */
    m_last_part= top_part;
    position_in_last_part(top_rec - m_rec_offset + PARTITION_BYTES_IN_POS,
                          top_rec);
  }
  DBUG_DUMP("rec_buf", top_rec, m_rec_length);

  /* The top entry got a new row: re-sort it and hand out the new top. */
  m_queue->update_top();
  return_top_record(buf);
  DBUG_PRINT("info", ("Record returned from partition %u", m_top_entry));
  DBUG_RETURN(0);
}


/**
  Common routine to handle index_prev with ordered results.

  @param[out] buf  Read row in MySQL Row Format.

  @return Operation status.
    @retval HA_ERR_END_OF_FILE  End of scan
    @retval 0                   Success
    @retval other               Error code
*/

int Partition_helper::handle_ordered_prev(uchar *buf)
{
  int error;
  /*
    Current top partition and its row buffer, captured before the
    key-not-found handling below, which may change the queue top.
  */
  uint part_id= m_top_entry;
  uchar *rec_buf= m_queue->empty() ? NULL : m_queue->top() + m_rec_offset;
  uchar *read_buf;
  DBUG_ENTER("Partition_helper::handle_ordered_prev");

  /* index_prev is only valid on a scan that was set up in reverse order. */
  if (!m_reverse_order)
  {
    /* TODO: See comment in handle_ordered_next(). */
    DBUG_ASSERT(m_table->open_by_handler);
    DBUG_RETURN(HA_ERR_WRONG_COMMAND);
  }

  if (m_key_not_found)
  {
    /* There are partitions not included in the index record queue. */
    size_t old_elements= m_queue->size();
    if ((error= handle_ordered_index_scan_key_not_found()))
      DBUG_RETURN(error);
    if (old_elements != m_queue->size() && part_id != m_top_entry)
    {
      /*
        Should only be possible for when HA_READ_KEY_EXACT was previously used,
        which is not supported to have a subsequent call for PREV.
        I.e. HA_READ_KEY_EXACT is considered to not have reverse order!
      */
      DBUG_ASSERT(0);
      /*
        The queue top changed, i.e. one of the partitions that gave
        HA_ERR_KEY_NOT_FOUND in index_read_map now supplies the top row.
        In non-debug builds that row is returned here. Otherwise execution
        continues below and the old top row is replaced through a call to
        index_prev.
      */
      return_top_record(buf);
      DBUG_RETURN(0);
    }
  }

  if (part_id >= m_tot_parts)
  {
    /* This should never happen, except for SQL HANDLER calls! */
    DBUG_ASSERT(m_table->open_by_handler);
    DBUG_RETURN(HA_ERR_END_OF_FILE);
  }

  /* Assert that buffer for fetch is not NULL */
  DBUG_ASSERT(rec_buf);

  /* ICP relies on Item evaluation, which expects the row in record[0]. */
  if (m_handler->pushed_idx_cond)
    read_buf= m_table->record[0];
  else
    read_buf= rec_buf;

  /* Step back the cursor of the partition that supplied the previous row. */
  if ((error= index_prev_in_part(part_id, read_buf)))
  {
    if (error == HA_ERR_END_OF_FILE)
    {
      /* That partition is exhausted: drop it and use the next buffered row. */
      if (!m_queue->empty())
        m_queue->pop();
      if (m_queue->empty())
      {
        /*
          If priority queue is empty, we have finished fetching rows from all
          partitions. Reset the value of next partition to NONE. This would
          imply HA_ERR_END_OF_FILE for all future calls.
        */
        m_top_entry= NO_CURRENT_PART_ID;
      }
      else
      {
        return_top_record(buf);
        DBUG_PRINT("info", ("Record returned from partition %d (2)",
                            m_top_entry));
        error= 0;
        m_table->status= 0;
      }
    }
    /* 0 if a buffered row was returned above, otherwise the error itself. */
    DBUG_RETURN(error);
  }
  /* When using ICP, copy record[0] to the priority queue for sorting. */
  if (m_handler->pushed_idx_cond)
    memcpy(rec_buf, read_buf, m_rec_length);

  if (m_ref_usage != REF_NOT_USED)
  {
    /* position_in_last_part needs m_last_part set. */
    m_last_part= part_id;
    position_in_last_part(rec_buf - m_rec_offset + PARTITION_BYTES_IN_POS,
                          rec_buf);
  }
  /* The top entry got a new row: re-sort it and hand out the new top. */
  m_queue->update_top();
  return_top_record(buf);
  DBUG_PRINT("info", ("Record returned from partition %d", m_top_entry));
  DBUG_RETURN(0);
}

/**
  Get statistics from a specific partition.

  @param[out] stat_info  Area to report values into.
  @param[out] check_sum  Check sum of partition.
  @param[in]  part_id    Partition to report from.
*/
void
Partition_helper::get_dynamic_partition_info_low(ha_statistics *stat_info,
                                                 ha_checksum *check_sum,
                                                 uint part_id)
{
  const ha_statistics &handler_stats= m_handler->stats;
  const uint info_flags= HA_STATUS_TIME |
                         HA_STATUS_VARIABLE |
                         HA_STATUS_VARIABLE_EXTRA |
                         HA_STATUS_NO_LOCK;

  /* The partition must be readable, and read/lock sets must be identical. */
  DBUG_ASSERT(bitmap_is_set(&m_part_info->read_partitions, part_id));
  DBUG_ASSERT(bitmap_is_subset(&m_part_info->read_partitions,
                               &m_part_info->lock_partitions));
  DBUG_ASSERT(bitmap_is_subset(&m_part_info->lock_partitions,
                               &m_part_info->read_partitions));

  /* Temporarily restrict the read set to the requested partition only. */
  bitmap_clear_all(&m_part_info->read_partitions);
  bitmap_set_bit(&m_part_info->read_partitions, part_id);
  m_handler->info(info_flags);

  /* Copy out the figures that info() left in the handler's stats. */
  stat_info->records=              handler_stats.records;
  stat_info->mean_rec_length=      handler_stats.mean_rec_length;
  stat_info->data_file_length=     handler_stats.data_file_length;
  stat_info->max_data_file_length= handler_stats.max_data_file_length;
  stat_info->index_file_length=    handler_stats.index_file_length;
  stat_info->delete_length=        handler_stats.delete_length;
  stat_info->create_time=          handler_stats.create_time;
  stat_info->update_time=          handler_stats.update_time;
  stat_info->check_time=           handler_stats.check_time;

  /* Without HA_HAS_CHECKSUM, *check_sum is left untouched. */
  if (m_handler->ha_table_flags() & HA_HAS_CHECKSUM)
    *check_sum= checksum_in_part(part_id);

  /* Restore the read set (asserted above to equal the lock set). */
  bitmap_copy(&m_part_info->read_partitions, &m_part_info->lock_partitions);
}


/**
  Get checksum for table.

  @return Checksum, or 0 if not supported (note that 0 may also be a valid checksum).
*/

ha_checksum Partition_helper::ph_checksum() const
{
  /* Engines without checksum support report 0. */
  if (!(m_handler->ha_table_flags() & HA_HAS_CHECKSUM))
    return 0;

  /* The table checksum is the sum of the checksums of all partitions. */
  ha_checksum total= 0;
  for (uint part= 0; part < m_tot_parts; part++)
    total+= checksum_in_part(part);
  return total;
}