From: vasil.dimov Date: August 16 2010 2:19pm Subject: bzr commit into mysql-5.5-innodb branch (vasil.dimov:3156) Bug#53761 List-Archive: http://lists.mysql.com/commits/115810 X-Bug: 53761 Message-Id: <20100816141942.526B32E0AA@mail.v5d.org> MIME-Version: 1.0 Content-Type: multipart/mixed; boundary="===============6359443650819450856==" --===============6359443650819450856== MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Content-Disposition: inline #At file:///usr/local/devel/bzrroot/server/mysql-5.5-innodb/ based on revid:inaam.rana@stripped 3156 Vasil Dimov 2010-08-16 Fix Bug#53761 RANGE estimation for matched rows may be 200 times different Improve the range estimation algorithm. Previously: For a given level the algo knows the number of pages in the requested range and the number of records on the leftmost and the rightmost page. Then it assumes all pages in between contain the average between the two border pages and multiplies this average number by the number of intermediate pages. With this change: Same idea, but peek a few (10) of the intermediate pages to get a better estimate of the average number of records per page. If there are less than 10 intermediate pages then all of them will be scanned and the result will be precise, not an estimation. In the bug report one of the examples has a btree with a snippet of the leaf level like this: page1(899 records), page2(1 record), page3(1 record), page4(1 record) so when trying to estimate, the previous algo, assumed there are average (899+1)/2=450 records per page which went terribly wrong. With this change page2 and page3 will be read and the exact number of records will be returned. modified: storage/innobase/btr/btr0cur.c storage/innobase/include/btr0cur.h === modified file 'storage/innobase/btr/btr0cur.c' --- a/storage/innobase/btr/btr0cur.c revid:inaam.rana@stripped +++ b/storage/innobase/btr/btr0cur.c revid:vasil.dimov@stripped @@ -3153,6 +3153,7 @@ btr_cur_add_path_info( { btr_path_t* slot; rec_t* rec; + page_t* page; ut_a(cursor->path_arr); @@ -3175,8 +3176,155 @@ btr_cur_add_path_info( slot = cursor->path_arr + (root_height - height); + page = page_align(rec); + slot->nth_rec = page_rec_get_n_recs_before(rec); - slot->n_recs = page_get_n_recs(page_align(rec)); + slot->n_recs = page_get_n_recs(page); + slot->page_no = page_get_page_no(page); + slot->page_level = btr_page_get_level_low(page); +} + +/*******************************************************************//** +Estimate the number of rows between slot1 and slot2 for any level on a +B-tree. This function starts from slot1->page and reads a few pages to +the right, counting their records. If we reach slot2->page quickly then +we know exactly how many records there are between slot1 and slot2 and +we set is_n_rows_exact to TRUE. If we cannot reach slot2->page quickly +then we calculate the average number of records in the pages scanned +so far and assume that all pages that we did not scan up to slot2->page +contain the same number of records, then we multiply that average to +the number of pages between slot1->page and slot2->page (which is +n_rows_on_prev_level). In this case we set is_n_rows_exact to FALSE. +@return number of rows (exact or estimated) */ +static +ib_int64_t +btr_estimate_n_rows_in_range_on_level( +/*==================================*/ + dict_index_t* index, /*!< in: index */ + btr_path_t* slot1, /*!< in: left border */ + btr_path_t* slot2, /*!< in: right border */ + ib_int64_t n_rows_on_prev_level, /*!< in: number of rows + on the previous level for the + same descend paths; used to + determine the numbe of pages + on this level */ + ibool* is_n_rows_exact) /*!< out: TRUE if the returned + value is exact i.e. not an + estimation */ +{ + ulint space; + ib_int64_t n_rows; + ulint n_pages_read; + ulint page_no; + ulint zip_size; + ulint level; + + space = dict_index_get_space(index); + + n_rows = 0; + n_pages_read = 0; + + /* Assume by default that we will scan all pages between + slot1->page_no and slot2->page_no */ + *is_n_rows_exact = TRUE; + + /* add records from slot1->page_no which are to the right of + the record which serves as a left border of the range, if any */ + if (slot1->nth_rec < slot1->n_recs) { + n_rows += slot1->n_recs - slot1->nth_rec; + } + + /* add records from slot2->page_no which are to the left of + the record which servers as a right border of the range, if any */ + if (slot2->nth_rec > 1) { + n_rows += slot2->nth_rec - 1; + } + + /* count the records in the pages between slot1->page_no and + slot2->page_no (non inclusive), if any */ + + zip_size = fil_space_get_zip_size(space); + + /* Do not read more than this number of pages in order not to hurt + performance with this code which is just an estimation. If we read + this many pages before reaching slot2->page_no then we estimate the + average from the pages scanned so far */ + #define N_PAGES_READ_LIMIT 10 + + page_no = slot1->page_no; + level = slot1->page_level; + + do { + mtr_t mtr; + page_t* page; + buf_block_t* block; + + mtr_start(&mtr); + + /* fetch the page */ + block = buf_page_get(space, zip_size, page_no, RW_S_LATCH, + &mtr); + + page = buf_block_get_frame(block); + + /* It is possible that the tree has been reorganized in the + meantime and this is a different page. If this happens the + calculated estimate will be bogus, which is not fatal as + this is only an estimate. We are sure that a page with + page_no exists because InnoDB never frees pages, only + reuses them. */ + if (fil_page_get_type(page) != FIL_PAGE_INDEX + || btr_page_get_index_id(page) != index->id + || btr_page_get_level_low(page) != level) { + + /* The page got reused for something else */ + goto inexact; + } + + n_pages_read++; + + if (page_no != slot1->page_no) { + /* Do not count the records on slot1->page_no, + we already counted them before this loop. */ + n_rows += page_get_n_recs(page); + } + + page_no = btr_page_get_next(page, &mtr); + + mtr_commit(&mtr); + + if (n_pages_read == N_PAGES_READ_LIMIT + || page_no == FIL_NULL) { + /* Either we read too many pages or + we reached the end of the level without passing + through slot2->page_no, the tree must have changed + in the meantime */ + goto inexact; + } + + } while (page_no != slot2->page_no); + + return(n_rows); + +inexact: + + *is_n_rows_exact = FALSE; + + /* We did interrupt before reaching slot2->page */ + + if (n_pages_read > 0) { + /* The number of pages on this level is + n_rows_on_prev_level, multiply it by the + average number of recs per page so far */ + n_rows = n_rows_on_prev_level + * n_rows / n_pages_read; + } else { + /* The tree changed before we could even + start with slot1->page_no */ + n_rows = 10; + } + + return(n_rows); } /*******************************************************************//** @@ -3201,6 +3349,7 @@ btr_estimate_n_rows_in_range( ibool diverged_lot; ulint divergence_level; ib_int64_t n_rows; + ibool is_n_rows_exact; ulint i; mtr_t mtr; @@ -3243,6 +3392,7 @@ btr_estimate_n_rows_in_range( /* We have the path information for the range in path1 and path2 */ n_rows = 1; + is_n_rows_exact = TRUE; diverged = FALSE; /* This becomes true when the path is not the same any more */ diverged_lot = FALSE; /* This becomes true when the paths are @@ -3258,7 +3408,7 @@ btr_estimate_n_rows_in_range( if (slot1->nth_rec == ULINT_UNDEFINED || slot2->nth_rec == ULINT_UNDEFINED) { - if (i > divergence_level + 1) { + if (i > divergence_level + 1 && !is_n_rows_exact) { /* In trees whose height is > 1 our algorithm tends to underestimate: multiply the estimate by 2: */ @@ -3270,7 +3420,9 @@ btr_estimate_n_rows_in_range( to over 1 / 2 of the estimated rows in the whole table */ - if (n_rows > index->table->stat_n_rows / 2) { + if (n_rows > index->table->stat_n_rows / 2 + && !is_n_rows_exact) { + n_rows = index->table->stat_n_rows / 2; /* If there are just 0 or 1 rows in the table, @@ -3296,10 +3448,15 @@ btr_estimate_n_rows_in_range( divergence_level = i; } } else { - /* Maybe the tree has changed between - searches */ - - return(10); + /* It is possible that + slot1->nth_rec >= slot2->nth_rec + if, for example, we have a single page + tree which contains (inf, 5, 6, supr) + and we select where x > 20 and x < 30; + in this case slot1->nth_rec will point + to the supr record and slot2->nth_rec + will point to 6 */ + n_rows = 0; } } else if (diverged && !diverged_lot) { @@ -3323,8 +3480,9 @@ btr_estimate_n_rows_in_range( } } else if (diverged_lot) { - n_rows = (n_rows * (slot1->n_recs + slot2->n_recs)) - / 2; + n_rows = btr_estimate_n_rows_in_range_on_level( + index, slot1, slot2, n_rows, + &is_n_rows_exact); } } } === modified file 'storage/innobase/include/btr0cur.h' --- a/storage/innobase/include/btr0cur.h revid:inaam.rana@stripped +++ b/storage/innobase/include/btr0cur.h revid:vasil.dimov@stripped @@ -615,6 +615,11 @@ struct btr_path_struct{ order); value ULINT_UNDEFINED denotes array end */ ulint n_recs; /*!< number of records on the page */ + ulint page_no; /*!< no of the page containing the record */ + ulint page_level; /*!< level of the page, if later we fetch + the page under page_no and it is no different + level then we know that the tree has been + reorganized */ }; #define BTR_PATH_ARRAY_N_SLOTS 250 /*!< size of path array (in slots) */ --===============6359443650819450856== MIME-Version: 1.0 Content-Type: text/bzr-bundle; charset="us-ascii"; name="bzr/vasil.dimov@stripped" Content-Transfer-Encoding: 7bit Content-Disposition: inline # Bazaar merge directive format 2 (Bazaar 0.90) # revision_id: vasil.dimov@stripped # target_branch: file:///usr/local/devel/bzrroot/server/mysql-5.5-\ # innodb/ # testament_sha1: 6425809477bb044aa81c8be6aef8c509ed1ec9f3 # timestamp: 2010-08-16 17:19:42 +0300 # source_branch: file:///usr/local/devel/bzrroot/server/mysql-5.5-\ # bugfixing/ # base_revision_id: inaam.rana@stripped\ # 6so7vug3ykcay5n2 # # Begin bundle IyBCYXphYXIgcmV2aXNpb24gYnVuZGxlIHY0CiMKQlpoOTFBWSZTWUp3XvAABiX/gHAwAAF7f/// /+dfwL////5gDxw3feengrTznu93HKoBtbdJUK73e8oNSL23ajIkoDWjtgxRqZIZCeQmA0mQwSG0 TJo0ABkNNBoCUECaaNEyE0KDxJiajZJ6magMDSYQ9GgjDUwKaTJ6p6agA0AA0DQAAAAA0Ak1ISCe kJ6k9kjQantU2oYhoAA0aPUA0ABzCaA0Bo0YRoMRpiZMTQYRoGQDJgJEhNAIAJo0JpknqnjUYTSb TUDagAAaepbJJEGEAwQnLEhTGIoxBFVeTs9N1BvUWXEsilpVSgVisEGYfVNvxYuaKKERBiiNVQpF gs0D+LZLNNJQLLqlCKsiLGKaTq0+fXu1d3ir7n65/hH2EPs79v2NbHh4pu33b/j9kumFRcq8WZxl IKo5dzd71bJB3IhCJwNDqJCBoErJMExcYTyXyBrCRWNZ0BzgX0b9kBA7H5SglEUjHyWdffW9q2IL pnV+Rv6+WNu9XzWTa+sLeeTiu1ilisXQ8kuRFirGJrbnDmhMy0NJTDLvRyNat2hPkpxYRVVI2Ho1 EnUBrrmuNwo0+FFwJnGCoQ7k763748KPHuObRVcm7GydDPMUd4VgLOvLnnpJk8vC67GMXx4Bw9GZ pab8fJxOgPNZD0GREi0tO6eCUtWNShop0qhju+F/EhvlYZ6fygyT2QgJa1eD9jRcY4DmCVgK+vD4 ZwJJNFXt0yNF7ldbfFQZ5osXXvUx6sGaNzdoMSfl4JRwyNrFNojqYRyUII6/nP4n0oyKSDPf/4pN SNllmxQFFwbNdpXosMFLYY93yemRpPO8PrcjWHmk0wu2mRm51a2tx6bZXTBc7O4qZyiz4hpjMVU8 QZdunHEr0w154xbNXYMbzhIwwa2jJFM6YQzykV40PhOKHDjvPr6x7ZbVUJ2ViaYBkRcQpTgzkQUN r4R1kKUSODDcLSqCqu7NEnVTQPci/JuWT4MyO467njfUM828yEwF2XSLTqTplita9LxPsOh29Mwr knLaCTSzXnd1XsFJ4nDfjHzd8o0Z3ZbFGZeptZjNOMMay4Veq58Au4b82eNIvSPBm2bEZNXZYTnJ JOHFUjZRashYrQaGomR2exQUaYurlEUwRUFBFhIb0wpM7yhiAmIFdKdHSqkwwSZ3pkZBBFgeex7Z 6NLGRUU1n18pdM4jgJ3E+vxTbyUtF/YPHz1AB6Bx8/IZRWDjzDntroDvPgGEWd2WG3pneUiNATsv HnqTpoCKEZQ9nO6ifK/+UYZary2dpOk1vwCOWHO690ukLs9ol0tAxqCf5UQd4D39H9XTPXpb7r66 eb+zno2hE8q4VbCCNbPrtxZUvGayeSE/omec2gZ58beKxiJVUtUHBBJKu2QHLCi2ANfJ0wtUdWDc 1kNJDfY8i0WbOcsCd6R8ZLFdqSOk7Q9bAO7xPnp6Spr46Dq9g6BzaS692+hMnCFSEJ/UKXq7qUAn 4WnnUStmkR6wauvhlxI6BWUtPgKoXtVdLoUc4fgfYtc/TMQdMvhZmgn9F7sO88lILnuQjMTXoVee nOypxDywAU3Nbv5c35JjUDrIef1iWHmf3WgG/a6Ve5PQYTfkNziVSBu6+ng2yaaSsOaFneYEE9Ul gStOvyJSdZjIpiOHTVq3tUmthNoIePsVRV9Yh1mhQo54JgakbS3GimZZAs0+A9Q9FSEcWiCmLFdr OymkmpCnvAvY/gkLybTRDF7OIZqQn6lHezORsPFarVMkxToItNYSUrSGzw6PPtrCynXU9fvocb31 U0ENDrgPjoJ840TqpGHEoJFYYnkSVt+IqgbXwewEU6mJRie8jQ46APxrOxhvcK0LMJySbYRXB14u 7DcEMRIu/b4qJsI09emFi26WUbXULVOZkcDY9KPhkIvAwNxM3RBZXOqYCq9hbAeM5aS11mUNeZqY Aek3faRITUsimiicts5ktxsiBDQsQXyAdfZ7v8n8Av13F55XgPkt0lu4RYFmsEohOi0EgmnZPPwz /irxkYPt4pL9qLRO1bEy5hO4JWFM0jzNj+BK5StSIT05BAzFjcP9cpX4Zg9DHyJcb7x+/x8iYvt7 kIKRB2jWUQKLw+57B3eibtibgCSlb5Zu7tDH7d/ekhOLDFllZQ4CHqfHFPZqan6e0qhQUIQS9D2o K8RUk0GwIqS2lKlbEGsMs4krKmzWYWjkf82/E8l4iPC/lZPQjal68Fp8svDwpgqUXPbGZELFMhIC +8KUqKQxUF+AEIiISSmfeDVlh7lcZZXFTcCp93v95YLznw9gq/H7rhgaxfTLgbRWoewS1QCxDtCF 7oCqgxQDmYBUGSE0Q2mU4fZUO57U1d518hr2GndnUqsp5m4dSAs7G2Smxn4lQ1lAtVOxaJi2JZKK MZaf91pFlDOy9yOnXR249KRXTKwwxxaDUdjFxefWyeA853TzzsgjQsTzFTvFbisWHyYoSTAtN+j8 X2k6MaPL3o+THLoYVGNi6j52xRJMJD3i8Y8TM2LWLaWHAVZY5eCIU/SuIuCWPoaH9LfUaTt3AVIi rxCU1j8YdGmQJWBZuEuXKXFlwIaMyBWgNgajVXA4yYMYxavXq0pqyZMxa7TLPaKqCIMbnFg6RNk4 v1ebj6+J0l9DALQd5NG3l+gUyWsyU/HeIrKLeDECYGgtkQ1FHCSE+WsQbE79ushAWsU8TRMKWdsm 1dyf0YTyhdJoEtCM4mwHRYm5KYOyEGRbLHPYqa0rFZIK5Na+Geu4FomBmk1DobAMoKyZRhEmuDUM Fyj46MUQZsLd0KWaGs5zTKHVrtWTU2is8NFHRYT4zAJnY0dsOCtsMxLrvQooIsuIRDVBoSz6tKS1 sreQBN7mkKe615OK62i+MlbGTQdMETv7uPhutzmKqE+A4ahjntzRA+OPW9/7oKI29xapzcDcQKEc Z8dmlamrUwOGs1ziBJliZO+OMrAJh0KyWdBRKRkdE4CN0I0tUfmYjGKWHXy2B4j72Gy/AzgFpMsN lUNJiyAV3KQFSqDFUPKBWJyCNiL+879iRDaSG/iiyV4uR1gentJHlL1nJaA6fMWi2imGItsf7OZB by1Cg30bN9QuFqwbGCsyOYsrAWEEUhtCQxjIhAp1BUwR5ATrLoUvUcd3L8zgBTvtRsO8sBSeEiUG kKARkh86CyQUNtyiW8VAMg2ElmQwmqFc0Jvrdg381pK5E5FO9BLbvLT0M4/W0ZRwvwAmj7aiwRYZ AuZmxbgL0g++1IvDnjCjGAzOK9yl9iHwtrIqsvRwmVjVqXDV/Gene9yXYZq/FY922eIYMYepcwPK F5y+Dim0igoIsfQb7D2H7Vwzh5Mp3OmfAPq0OamnPneDSMdizcTAoio68hDxA5YkSVViHgJIJpTZ 9KJEBgikIJH1FtpElo9h7XiJTCtrDDsU7lHSzrbiwumcyb5yVqggzTbxAZwyuIIYFLEy6pmkF28i 99x7Y13BVQTvaqP2XRReCCGiMwCB0TalslROne6ufSSK6CgK6qZ7e8MQmG4EscqKmNXYdnZkcYdU GGKsyKo7lInNsKHRMPYqhCMPEZjIJnqOgdFFI6mqqUTBR1SMlapkZOQbYRJtoUkSYjB58k5Vmr2g lURHU+Mi4VCFrpFQVrbBXmcghrkrNjIrNZVMTWFrB2gNZbEuhoJORtMSb/GnFBQRnU08soN2zyGL g4TgwybRDuwenfmkLs7TEmFBK1mmrLYh+4cJcRR6rYKhoxMqYuzw2Qw/qJWouSuKNQhUPOzibIDY WplDWKjMxCM5Gd6NS787gVLpu4kkoYI5X2WgtwtR0C9AlCId8BDpELjzgiuLOBlLelpl2WbLKkCx OFCDQ6p1122ZZ2wZzBvnB2T0LPAKGN1NppwsPwAyliMiHDG2gIaMspkjgpsLxMUjSOAVBKAGjj7V yxDUUETKZyhdbQ2wTEYx56k3Oy4QoWZyZ6JqS8LA6uBCdwv4cONFdoGcs5omiYakaNAeeuPN3Ekh jBFqtu1EnF4sJklK+XCyIlza6Wk3SfdBfUvpdQmVZNDc6kElA4CrVGpk4ZbIkSOp32QUadiclKKx FZ6GCgQwmK67u0CmdF+RF0im1kMg9D1l5aCdKorQWBmzgrBtcuUJSYmwk0Q0Va4nyUTusIjr1HE0 fNrCTc61u7saL1tI2I1xsCVVyuYMayfS7Zz6VdBmXkdXPm3dYWUf4TAec3tC1NioP2qZCyIONCZb z1iFxLYJVUSBqeDWFZkobCTLQa20CeQpkJFGpTjpbUxtpDk4yWAyLySczDoSG8SWYRZkzSwJtNki LDQ9eIleV3kUWlmSjbsqgvGtDvRcu5eAMTGAyYUl2UyFSuZMNWpgqcOg7PgbA+vMP0+1bJm8jilw 47OC2AEBPVtFaETUrhRyRggNli9WVEn+supQ6wiRADTiHYOsoO6pxHLKQCrxcwFuYsB80chZz7ZO plIU7pEnjQnBJZyBB7i2RhJSQxiQ9wEJcwOJpT0+bjrmUByGSKKI/Us1fTFBr79EZ+gCSW1r8ltV gps/+LuSKcKEglO694A= --===============6359443650819450856==--