List:General Discussion« Previous MessageNext Message »
From:Tom Cunningham Date:December 23 2005 4:29am
Subject:MERGE tables considered harmful for data warehouse fact tables
View as plain text  
The script to prove it follows.

Splitting a million-row fact table into a 5-part merge table makes
3-second queries take 9  seconds.

The basic problem is this: fact tables are generally referenced by
unique combinations of dimensions, and though often only one
underlying table needs to be referenced, mysql doesn't know this, so
every single underlying table is polled for each
dimension-combination.

Practical alternatives: (1) leave your fact table as a big one; (2)
split the table up, but put logic in the client-app so it knows which
table to address; (3) use a *union* of queries instead of a merge
table, then mysql could look at each underlying table one at a time.

If anyone has ideas for other alternatives, or improvements on the
script, please tell me.

Thanks.

Tom.



#################################

#---------------- MERGE FACT TABLE TEST -------------
#-------------tom.cunningham-@stripped


   USE sandbox;

CREATE TABLE big_table LIKE mysql.help_keyword;
 ALTER TABLE big_table ENGINE=MERGE UNION=(mysql.help_keyword);

  DROP TABLE IF EXISTS dimension_1;
CREATE TABLE dimension_1 (
       key_1       INT PRIMARY KEY NOT NULL,
       attribute_1 VARCHAR(255) NOT NULL,
       INDEX attribute_1 (attribute_1(10))
);
   SET @A:=1;
INSERT INTO dimension_1
SELECT @A:=@A+1,
       SHA(RAND())
  FROM big_table b1, big_table b2, big_table b3
 LIMIT 100000;

  DROP TABLE IF EXISTS dimension_2;
CREATE TABLE dimension_2 (
       key_2       INT PRIMARY KEY NOT NULL,
       attribute_2 VARCHAR(255) NOT NULL,
       INDEX attribute_1 (attribute_2(10))
);
   SET @A:=1;
INSERT INTO dimension_2
SELECT @A:=@A+1,
       SHA(RAND())
  FROM big_table b1, big_table b2, big_table b3
 LIMIT 100000;


  DROP TABLE IF EXISTS facts;
CREATE TABLE facts (
       key_1 INT UNSIGNED,
       key_2 INT UNSIGNED,
       fact_1 INT UNSIGNED,
       fact_2 INT UNSIGNED,
       PRIMARY KEY key_1_key_2 (key_1,key_2)
);
INSERT IGNORE INTO facts
SELECT CEIL(RAND()*100000),
       CEIL(RAND()*100000),
       CEIL(RAND()*1000000),
       CEIL(RAND()*1000000)
  FROM big_table b1, big_table b2, big_table b3
 LIMIT 1000000;



###### Typical query: (takes about 1.5seconds)
SELECT SQL_NO_CACHE attribute_1, sum(fact_1), avg(fact_2)
  FROM facts
  JOIN dimension_1 ON dimension_1.key_1=facts.key_1
  JOIN dimension_2 ON dimension_2.key_2=facts.key_2
 WHERE attribute_1 LIKE 'ff%'
   AND attribute_2 LIKE 'a3%'
 GROUP BY attribute_1
 LIMIT 1;


###### Variation 1: Unionised fact table;

DROP TABLES f1, f2, f3, f4, f5;
CREATE TABLE f1 LIKE facts; INSERT INTO f1 SELECT * FROM facts WHERE
key_1 BETWEEN 0 AND 20000;
CREATE TABLE f2 LIKE facts; INSERT INTO f2 SELECT * FROM facts WHERE
key_1 BETWEEN 20001 AND 40000;
CREATE TABLE f3 LIKE facts; INSERT INTO f3 SELECT * FROM facts WHERE
key_1 BETWEEN 40001 AND 60000;
CREATE TABLE f4 LIKE facts; INSERT INTO f4 SELECT * FROM facts WHERE
key_1 BETWEEN 60001 AND 80000;
CREATE TABLE f5 LIKE facts; INSERT INTO f5 SELECT * FROM facts WHERE
key_1 BETWEEN 80001 AND 100000;
RENAME TABLE facts TO facts_old;
CREATE TABLE facts LIKE facts_old;
ALTER TABLE facts ENGINE=MERGE UNION=(f1,f2,f3,f4,f5);

# (now try the above same query again)
Thread
MERGE tables considered harmful for data warehouse fact tablesTom Cunningham23 Dec
  • Re: MERGE tables considered harmful for data warehouse fact tablessheeri kritzer13 Jan