From: Date: December 10 2006 2:45pm Subject: RE: load XML data List-Archive: http://lists.mysql.com/internals/34169 Message-Id: <6287242.1165758323135.JavaMail.?@fh1063.dia.cp.net> MIME-Version: 1.0 Content-Type: multipart/mixed; boundary="----=_Part_2387_20464045.1165758323126" ------=_Part_2387_20464045.1165758323126 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: quoted-printable Hi, A first version of the LOAD XML is now available. It is based on the=20 LOAD DATA command, and has the following syntax: LOAD XML [LOW_PRIORITY | CONCURRENT] [LOCAL] INFILE 'file_name' [REPLACE | IGNORE] INTO TABLE tbl_name ROWS IDENTIFIED BY 'row_tag' [IGNORE number LINES] [(col_name_or_user_var,...)] [SET col_name =3D expr,...)] Basically this is the same syntax as LOAD DATA, and the xml import=20 also works the same way. The main difference is that the LINES and=20 FIELDS clauses are not supported.=20 Instead a ROWS IDENTIFIED BY clause is used. For every tag with this=20 tag name the command will try to create a record. It will match all=20 tags and attributes inside the row tag and all tags and attributes on a=20 higher level with the columns in the target table and fill in values=20 accordingly. The command supports three different xml formats: field values as attributes: =E2=80=9C=E2=80=9D field values as tags:=20 valuevalue2 the format used by other MySQL tools, where the field name is an=20 attribute, and the value is in the tag: value The three formats can be used in the same xml file, the import will=20 automatically detect them. The tags are matched based on tag/attribute=20 name and column name.=20 Changes are in attached patch file. Four files are affected: lex.h (only add symbol XML_SYM) sql_class.h (add enum_filetype and use in sql_exchange) sql_yacc.yy (definition of command LOAD XML) sql_load.cc(the implementation) Erik Wetterberg ------=_Part_2387_20464045.1165758323126 Content-Type: TEXT/X-PATCH; name=loadxml.patch Content-Transfer-Encoding: 7bit Content-Disposition: attachment; filename=loadxml.patch; size=19578 diff -ur mysql-5.1.12-beta/sql/lex.h work/sql/lex.h --- mysql-5.1.12-beta/sql/lex.h 2006-10-26 14:01:59.000000000 +0200 +++ work/sql/lex.h 2006-12-07 11:01:28.000000000 +0100 @@ -580,6 +580,7 @@ { "X509", SYM(X509_SYM)}, { "XOR", SYM(XOR)}, { "XA", SYM(XA_SYM)}, + { "XML", SYM(XML_SYM)}, /* LOAD XML Arnold/Erik */ { "YEAR", SYM(YEAR_SYM)}, { "YEAR_MONTH", SYM(YEAR_MONTH_SYM)}, { "ZEROFILL", SYM(ZEROFILL)}, diff -ur mysql-5.1.12-beta/sql/sql_class.h work/sql/sql_class.h --- mysql-5.1.12-beta/sql/sql_class.h 2006-10-26 14:01:59.000000000 +0200 +++ work/sql/sql_class.h 2006-12-10 08:26:55.000000000 +0100 @@ -36,14 +36,16 @@ enum enum_ha_read_modes { RFIRST, RNEXT, RPREV, RLAST, RKEY, RNEXT_SAME }; enum enum_duplicates { DUP_ERROR, DUP_REPLACE, DUP_UPDATE }; enum enum_delay_key_write { DELAY_KEY_WRITE_NONE, DELAY_KEY_WRITE_ON, - DELAY_KEY_WRITE_ALL }; -enum enum_check_fields -{ CHECK_FIELD_IGNORE, CHECK_FIELD_WARN, CHECK_FIELD_ERROR_FOR_NULL }; -enum enum_mark_columns -{ MARK_COLUMNS_NONE, MARK_COLUMNS_READ, MARK_COLUMNS_WRITE}; + DELAY_KEY_WRITE_ALL }; +enum enum_check_fields +{ CHECK_FIELD_IGNORE, CHECK_FIELD_WARN, CHECK_FIELD_ERROR_FOR_NULL }; +enum enum_mark_columns +{ MARK_COLUMNS_NONE, MARK_COLUMNS_READ, MARK_COLUMNS_WRITE}; +enum enum_filetype { FILETYPE_CSV, FILETYPE_XML }; /* LOAD XML Added by +Arnold & Erik */ -extern char internal_table_name[2]; -extern char empty_c_string[1]; +extern char internal_table_name[2]; +extern char empty_c_string[1]; extern const char **errmesg; #define TC_LOG_PAGE_SIZE 8192 @@ -1641,6 +1643,7 @@ class sql_exchange :public Sql_alloc { public: + enum enum_filetype filetype; /* load XML, Added by Arnold & Erik */ char *file_name; String *field_term,*enclosed,*line_term,*line_start,*escaped; bool opt_enclosed; diff -ur mysql-5.1.12-beta/sql/sql_load.cc work/sql/sql_load.cc --- mysql-5.1.12-beta/sql/sql_load.cc 2006-10-26 14:01:58.000000000 +0200 +++ work/sql/sql_load.cc 2006-12-10 07:49:14.000000000 +0100 @@ -16,6 +16,7 @@ /* Copy data from a textfile to table */ +/* 2006-12 Erik Wetterberg : LOAD XML added */ #include "mysql_priv.h" #include @@ -24,6 +25,21 @@ #include "sp_head.h" #include "sql_trigger.h" +class XML_TAG { +public: + int level; + String field; + String value; + + XML_TAG(int l, String f, String v); +}; +XML_TAG::XML_TAG(int l, String f, String v) +{ + level = l; + field.append(f); + value.append(v); +} + class READ_INFO { File file; byte *buffer, /* Buffer for read text */ @@ -38,6 +54,8 @@ bool need_end_io_cache; IO_CACHE cache; NET *io_net; + /* load xml */ + int level; public: bool error,line_cuted,found_null,enclosed; @@ -55,6 +73,12 @@ char unescape(char chr); int terminator(char *ptr,uint length); bool find_start_of_fields(); + /* load xml */ + List taglist; + int read_value(int delim, String *val); + int read_xml(); + int clear_level(int level); + /* We need to force cache close before destructor is invoked to log the last read block @@ -86,6 +110,12 @@ static bool write_execute_load_query_log_event(THD *thd, bool duplicates, bool ignore, bool transactional_table); + /* load xml */ +static int read_xml_field(THD *thd, COPY_INFO &info, TABLE_LIST *table_list, + List &fields_vars, List &set_fields, + List &set_values, READ_INFO &read_info, + String &enclosed, ulong skip_lines, + bool ignore_check_option_errors); /* @@ -372,7 +402,11 @@ (MODE_STRICT_TRANS_TABLES | MODE_STRICT_ALL_TABLES))); - if (!field_term->length() && !enclosed->length()) + if( ex->filetype == FILETYPE_XML ) /* load xml */ + error= read_xml_field(thd, info, table_list, fields_vars, + set_fields, set_values, read_info, + *(ex->line_term), skip_lines, ignore); + else if (!field_term->length() && !enclosed->length()) error= read_fixed_length(thd, info, table_list, fields_vars, set_fields, set_values, read_info, skip_lines, ignore); @@ -816,6 +850,168 @@ } DBUG_RETURN(test(read_info.error)); } +/* load xml */ +/**************************************************************************** +** Read rows in xml format +****************************************************************************/ +static int +read_xml_field(THD *thd, COPY_INFO &info, TABLE_LIST *table_list, + List &fields_vars, List &set_fields, + List &set_values, READ_INFO &read_info, + String &row_tag, ulong skip_lines, + bool ignore_check_option_errors) +{ + List_iterator_fast it(fields_vars); + Item *item; + TABLE *table= table_list->table; + uint enclosed_length; + ulonglong id; + bool no_trans_update; + DBUG_ENTER("read_xml_field"); + + id= 0; + no_trans_update= !table->file->has_transactions(); + + for (;;it.rewind()) + { + if (thd->killed) + { + thd->send_kill_message(); + DBUG_RETURN(1); + } + //read row tag and save values into tag list + if (read_info.read_xml()) + break; + List_iterator_fast xmlit(read_info.taglist); + xmlit.rewind(); + XML_TAG *tag = NULL; + + while(tag = xmlit++) + { + DBUG_PRINT("read_xml_field", + ("got tag:%i %s %s", + tag->level,tag->field.c_ptr(),tag->value.c_ptr())); + } + + restore_record(table, s->default_values); + + while ((item= it++)) + { + /* If this line is to be skipped we don't want to fill field or var */ + if (skip_lines) + continue; + + /* find field in tag list */ + xmlit.rewind(); + tag = xmlit++; + + while(tag && strcmp(tag->field.c_ptr(), item->name)!= 0) + { + tag = xmlit++; + } + if (!tag) // found null + { + if (item->type() == Item::FIELD_ITEM) + { + Field *field= ((Item_field *)item)->field; + field->reset(); + field->set_null(); + if (field == table->next_number_field) + table->auto_increment_field_not_null= TRUE; + if (!field->maybe_null()) + { + if (field->type() == FIELD_TYPE_TIMESTAMP) + ((Field_timestamp*) field)->set_time(); + else if (field != table->next_number_field) + field->set_warning(MYSQL_ERROR::WARN_LEVEL_WARN, + ER_WARN_NULL_TO_NOTNULL, 1); + } + } + else + ((Item_user_var_as_out_param *)item)->set_null_value( + read_info.read_charset); + continue; + } + + if (item->type() == Item::FIELD_ITEM) + { + + Field *field= ((Item_field *)item)->field; + field->set_notnull(); + if (field == table->next_number_field) + table->auto_increment_field_not_null= TRUE; + field->store((char*) tag->value.c_ptr(), tag->value.length(), read_info.read_charset); + } + else + ((Item_user_var_as_out_param *)item)->set_value((char*) tag->value.c_ptr(), + tag->value.length(),read_info.read_charset); + } + + if (read_info.error) + break; + if (skip_lines) + { + skip_lines--; + continue; + } + if (item) + { + /* Have not read any field, thus input file is simply ended */ + if (item == fields_vars.head()) + break; + for (; item ; item= it++) + { + if (item->type() == Item::FIELD_ITEM) + { + /* + QQ: We probably should not throw warning for each field. + But how about intention to always have the same number + of warnings in THD::cuted_fields (and get rid of cuted_fields + in the end ?) + */ + thd->cuted_fields++; + push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN, + ER_WARN_TOO_FEW_RECORDS, + ER(ER_WARN_TOO_FEW_RECORDS), thd->row_count); + } + else + ((Item_user_var_as_out_param *)item)->set_null_value( + read_info.read_charset); + } + } + + if (thd->killed || + fill_record_n_invoke_before_triggers(thd, set_fields, set_values, + ignore_check_option_errors, + table->triggers, + TRG_EVENT_INSERT)) + DBUG_RETURN(1); + + switch (table_list->view_check_option(thd, + ignore_check_option_errors)) { + case VIEW_CHECK_SKIP: + read_info.next_line(); + goto continue_loop; + case VIEW_CHECK_ERROR: + DBUG_RETURN(-1); + } + + + if (write_record(thd, table, &info)) + DBUG_RETURN(1); + /* + We don't need to reset auto-increment field since we are restoring + its default value at the beginning of each loop iteration. + */ + thd->no_trans_update= no_trans_update; + thd->row_count++; + continue_loop:; + } + DBUG_RETURN(test(read_info.error)); +} + + +/* load xml end */ /* Unescape all escape characters, mark \N as null */ @@ -855,6 +1051,8 @@ field_term_length= field_term.length(); line_term_ptr=(char*) line_term.ptr(); line_term_length= line_term.length(); + /* load xml */ + level = 0; if (line_start.length() == 0) { line_start_ptr=0; @@ -1262,3 +1460,234 @@ } return 0; } +/* + Clear taglist from tags with a specified level +*/ +int READ_INFO::clear_level(int level) +{ + DBUG_ENTER("READ_INFO::read_xml clear_level"); + List_iterator xmlit(taglist); + xmlit.rewind(); + XML_TAG *tag = NULL; + + while(tag = xmlit++) + { + if(tag->level >= level) + { + xmlit.remove(); + delete tag; + } + } + DBUG_RETURN(0); +} +/* + Read an xml value: handle multibyte and xml escape + */ +int READ_INFO::read_value(int delim, String *val) +{ + int chr; + String tmp; + chr = GET; + while(chr != delim && chr != my_b_EOF) + { +#ifdef USE_MB + if (my_mbcharlen(read_charset, chr) > 1) + { + DBUG_PRINT("read_xml",("multi byte")); + int ml = my_mbcharlen(read_charset, chr); + int i; + for (i=1; iappend(chr); + chr = GET; + if (chr == my_b_EOF) + return chr; + } + } +#endif + if(my_isspace(read_charset, chr)) /* convert newline, tab etc to space */ + val->append(' '); + else if(chr == '&') + { + tmp.length(0); + while(chr != ';' && chr != my_b_EOF) + { + tmp.append(chr); + chr= GET; + } + if (chr == my_b_EOF) + return chr; + if(!memcmp(tmp.c_ptr(),STRING_WITH_LEN(">"))) + val->append('>'); + else if(!memcmp(tmp.c_ptr(),STRING_WITH_LEN("<"))) + val->append('<'); + else if(!memcmp(tmp.c_ptr(),STRING_WITH_LEN("&"))) + val->append('&'); + else if(!memcmp(tmp.c_ptr(),STRING_WITH_LEN("""))) + val->append('"'); + else if(!memcmp(tmp.c_ptr(),STRING_WITH_LEN("&apos"))) + val->append('\''); + else + val->append(tmp); + } + else + val->append(chr); + chr = GET; + } + return chr; +} + +/* + Read a record in xml format + tags and attributes are stored in taglist + when tag set in ROWS IDENTIFIED BY is closed, we are ready and return +*/ +int READ_INFO::read_xml() +{ + DBUG_ENTER("READ_INFO::read_xml"); + int chr, chr2, chr3; + int delim = 0; + String tag, attribute, value; + bool in_tag = false; + tag.length(0); + attribute.length(0); + value.length(0); + chr=GET; + while(chr != my_b_EOF) + { + switch(chr){ + case '<': /* read tag */ + /* TODO: check if this is a comment */ + chr = GET; + if(chr == '!') + { + chr2 = GET; chr3 = GET; + if(chr2 == '-' && chr3 == '-') + { + chr2 = 0; chr3 = 0; + chr = GET; + while(chr != '>' || chr2 != '-' || chr3 != '-') + { + if(chr == '-') + { + chr3 = chr2; chr2 = chr; + } + else if (chr2 == '-') + { + chr2 = 0; chr3 = 0; + } + chr = GET; + if (chr == my_b_EOF) goto found_eof; + } + break; + } + } + tag.length(0); + while(chr != '>' && chr != ' ' && chr != '/' && chr != my_b_EOF) + { + if(chr != delim) /* fix for the '' - stored in line_term + if((tag.length() == line_term_length -2) && + (strncmp(tag.c_ptr(),line_term_ptr+1,tag.length())==0)) + { + DBUG_PRINT("read_xml", + ("start-of-row: %i %s %s",level,tag.c_ptr(),line_term_ptr)); + } + if(chr == ' ' || chr == '>') + { + level++; + clear_level(level+1); + } + if(chr == ' ') + in_tag = true; + else + in_tag = false; + break; + case ' ': /* read attribute */ + while(chr == ' ') chr=GET; /* skip blanks */ + if(!in_tag) + break; + while(chr != '=' && chr != '/' && chr != '>' && chr != my_b_EOF) + { + attribute.append(chr); + chr = GET; + } + break; + case '>': /* end tag - read tag value */ + in_tag = false; + chr = read_value('<',&value); + if(chr == my_b_EOF) goto found_eof; + /* save value to list */ + if(tag.length() > 0 && value.length() > 0) + { + DBUG_PRINT("read_xml", + ("lev:%i tag:%s val:%s\n",level,tag.c_ptr(),value.c_ptr())); + taglist.push_front( new XML_TAG(level, tag, value)); + } + tag.length(0);value.length(0);attribute.length(0); + break; + case '/': /* close tag */ + level--; + chr = GET; + if(chr != '>') /* if this is an empty tag */ + tag.length(0); /* we should keep tag value */ + while(chr != '>' && chr != my_b_EOF) + { + tag.append(chr); + chr = GET; + } + if((tag.length() == line_term_length -2) && + (strncmp(tag.c_ptr(),line_term_ptr+1,tag.length())==0)) + { + DBUG_PRINT("read_xml", ("found end-of-row %i %s", + level,tag.c_ptr())); + DBUG_RETURN(0); //normal return + } + chr=GET; + break; + case '=': /* attribute name end - read the value */ + //check for tag field and attribute name + if(!memcmp(tag.c_ptr(),STRING_WITH_LEN("field")) && !memcmp(attribute.c_ptr(),STRING_WITH_LEN("name"))) + { + // this is format xx where actual fieldname is in attribute + delim = GET; + tag.length(0);attribute.length(0); + chr = '<'; /* we pretend that it is a tag */ + level--; + break; + } + //check for " or ' + chr = GET; + if(chr == my_b_EOF) goto found_eof; + if(chr == '"' || chr == '\'') + { + delim = chr; + } + else + { + delim = ' ';/* no delimiter, use space */ + PUSH(chr); + } + chr = read_value(delim, &value); + if(attribute.length() > 0 && value.length() > 0) + { + DBUG_PRINT("read_xml", + ("lev:%i att:%s val:%s\n",level+1,attribute.c_ptr(),value.c_ptr())); + taglist.push_front( new XML_TAG(level+1, attribute, value)); + } + attribute.length(0);value.length(0); + if (chr != ' ') + chr = GET; + break; + default: + chr = GET; + } /* end switch */ + } /* end while */ +found_eof: + DBUG_PRINT("read_xml",("Found eof")); + eof=1; + DBUG_RETURN(1); +} diff -ur mysql-5.1.12-beta/sql/sql_yacc.yy work/sql/sql_yacc.yy --- mysql-5.1.12-beta/sql/sql_yacc.yy 2006-10-26 14:01:57.000000000 +0200 +++ work/sql/sql_yacc.yy 2006-12-08 18:15:53.000000000 +0100 @@ -721,6 +721,7 @@ %token WRITE_SYM %token X509_SYM %token XA_SYM +%token XML_SYM /* Load XML Arnold/Erik */ %token XOR %token YEARWEEK %token YEAR_MONTH_SYM @@ -867,7 +868,7 @@ get_select_lex %type comp_op - +/* load xml Arnold/Erik: Added load_xml and opt_xml_rows_identified_by rules */ %type query verb_clause create change select do drop insert replace insert2 insert_values update delete truncate rename @@ -904,6 +905,7 @@ statement sp_suid sp_c_chistics sp_a_chistics sp_chistic sp_c_chistic xa load_data opt_field_or_var_spec fields_or_vars opt_load_data_set_spec + load_xml opt_xml_rows_identified_by definer view_replace_or_algorithm view_replace view_algorithm_opt view_algorithm view_or_trigger_or_sp_or_event view_or_trigger_or_sp_or_event_tail @@ -8616,7 +8618,7 @@ }; /* import, export of files */ - +/* Load xml Arnold/Erik: Added LOAD XML_SYM part */ load: LOAD DATA_SYM { LEX *lex=Lex; @@ -8630,6 +8632,19 @@ load_data {} | + LOAD XML_SYM + { + LEX *lex=Lex; + if (lex->sphead) + { + my_error(ER_SP_BADSTATEMENT, MYF(0), "LOAD DATA"); + YYABORT; + } + lex->fname_start= lex->ptr; + } + load_xml + {} + | LOAD TABLE_SYM table_ident FROM MASTER_SYM { LEX *lex=Lex; @@ -8656,6 +8671,7 @@ lex->ignore= 0; if (!(lex->exchange= new sql_exchange($4.str, 0))) YYABORT; + lex->exchange->filetype = FILETYPE_CSV; } opt_duplicate INTO { @@ -8683,6 +8699,40 @@ "mysqldump or future " "BACKUP/RESTORE DATABASE facility"); }; +/* Load XML Added by Arnold/Erik -- */ +load_xml: + load_data_lock opt_local INFILE TEXT_STRING_filesystem + { + LEX *lex=Lex; + lex->sql_command= SQLCOM_LOAD; + lex->lock_option= $1; + lex->local_file= $2; + lex->duplicates= DUP_ERROR; + lex->ignore= 0; + if (!(lex->exchange= new sql_exchange($4.str, 0))) + YYABORT; + lex->exchange->filetype = FILETYPE_XML; + } + opt_duplicate INTO + { + LEX *lex=Lex; + lex->fname_end= lex->ptr; + } + TABLE_SYM table_ident + { + LEX *lex=Lex; + if (!Select->add_table_to_list(YYTHD, $10, NULL, TL_OPTION_UPDATING, + lex->lock_option)) + YYABORT; + lex->field_list.empty(); + lex->update_list.empty(); + lex->value_list.empty(); + } + opt_xml_rows_identified_by opt_ignore_lines opt_field_or_var_spec + opt_load_data_set_spec + {}; + +/* Load XML Added by Arnold/Erik END */ opt_local: /* empty */ { $$=0;} @@ -8789,6 +8839,13 @@ opt_load_data_set_spec: /* empty */ { } | SET insert_update_list { }; +/* LOAD XML Added by Arnold/Erik */ + +opt_xml_rows_identified_by: + /* empty */ { } + | ROWS_SYM IDENTIFIED_SYM BY text_string { Lex->exchange->line_term = $4; }; + +/* --------------------- */ /* Common definitions */ ------=_Part_2387_20464045.1165758323126--