From: Date: December 10 2006 2:45pm
Subject: RE: load XML data
List-Archive: http://lists.mysql.com/internals/34169
Message-Id: <6287242.1165758323135.JavaMail.?@fh1063.dia.cp.net>
MIME-Version: 1.0
Content-Type: multipart/mixed;
boundary="----=_Part_2387_20464045.1165758323126"
------=_Part_2387_20464045.1165758323126
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: quoted-printable
Hi,
A first version of the LOAD XML is now available. It is based on the=20
LOAD DATA command, and has the following syntax:
LOAD XML [LOW_PRIORITY | CONCURRENT] [LOCAL] INFILE 'file_name'
[REPLACE | IGNORE]
INTO TABLE tbl_name
ROWS IDENTIFIED BY 'row_tag'
[IGNORE number LINES]
[(col_name_or_user_var,...)]
[SET col_name =3D expr,...)]
Basically this is the same syntax as LOAD DATA, and the xml import=20
also works the same way. The main difference is that the LINES and=20
FIELDS clauses are not supported.=20
Instead a ROWS IDENTIFIED BY clause is used. For every tag with this=20
tag name the command will try to create a record. It will match all=20
tags and attributes inside the row tag and all tags and attributes on a=20
higher level with the columns in the target table and fill in values=20
accordingly.
The command supports three different xml formats:
field values as attributes: =E2=80=9C
=E2=80=9D
field values as tags:=20
valuevalue2
the format used by other MySQL tools, where the field name is an=20
attribute, and the value is in the tag:
value
The three formats can be used in the same xml file, the import will=20
automatically detect them. The tags are matched based on tag/attribute=20
name and column name.=20
Changes are in attached patch file. Four files are affected:
lex.h (only add symbol XML_SYM)
sql_class.h (add enum_filetype and use in sql_exchange)
sql_yacc.yy (definition of command LOAD XML)
sql_load.cc(the implementation)
Erik Wetterberg
------=_Part_2387_20464045.1165758323126
Content-Type: TEXT/X-PATCH; name=loadxml.patch
Content-Transfer-Encoding: 7bit
Content-Disposition: attachment; filename=loadxml.patch; size=19578
diff -ur mysql-5.1.12-beta/sql/lex.h work/sql/lex.h
--- mysql-5.1.12-beta/sql/lex.h 2006-10-26 14:01:59.000000000 +0200
+++ work/sql/lex.h 2006-12-07 11:01:28.000000000 +0100
@@ -580,6 +580,7 @@
{ "X509", SYM(X509_SYM)},
{ "XOR", SYM(XOR)},
{ "XA", SYM(XA_SYM)},
+ { "XML", SYM(XML_SYM)}, /* LOAD XML Arnold/Erik */
{ "YEAR", SYM(YEAR_SYM)},
{ "YEAR_MONTH", SYM(YEAR_MONTH_SYM)},
{ "ZEROFILL", SYM(ZEROFILL)},
diff -ur mysql-5.1.12-beta/sql/sql_class.h work/sql/sql_class.h
--- mysql-5.1.12-beta/sql/sql_class.h 2006-10-26 14:01:59.000000000 +0200
+++ work/sql/sql_class.h 2006-12-10 08:26:55.000000000 +0100
@@ -36,14 +36,16 @@
enum enum_ha_read_modes { RFIRST, RNEXT, RPREV, RLAST, RKEY, RNEXT_SAME };
enum enum_duplicates { DUP_ERROR, DUP_REPLACE, DUP_UPDATE };
enum enum_delay_key_write { DELAY_KEY_WRITE_NONE, DELAY_KEY_WRITE_ON,
- DELAY_KEY_WRITE_ALL };
-enum enum_check_fields
-{ CHECK_FIELD_IGNORE, CHECK_FIELD_WARN, CHECK_FIELD_ERROR_FOR_NULL };
-enum enum_mark_columns
-{ MARK_COLUMNS_NONE, MARK_COLUMNS_READ, MARK_COLUMNS_WRITE};
+ DELAY_KEY_WRITE_ALL };
+enum enum_check_fields
+{ CHECK_FIELD_IGNORE, CHECK_FIELD_WARN, CHECK_FIELD_ERROR_FOR_NULL };
+enum enum_mark_columns
+{ MARK_COLUMNS_NONE, MARK_COLUMNS_READ, MARK_COLUMNS_WRITE};
+enum enum_filetype { FILETYPE_CSV, FILETYPE_XML }; /* LOAD XML Added by
+Arnold & Erik */
-extern char internal_table_name[2];
-extern char empty_c_string[1];
+extern char internal_table_name[2];
+extern char empty_c_string[1];
extern const char **errmesg;
#define TC_LOG_PAGE_SIZE 8192
@@ -1641,6 +1643,7 @@
class sql_exchange :public Sql_alloc
{
public:
+ enum enum_filetype filetype; /* load XML, Added by Arnold & Erik */
char *file_name;
String *field_term,*enclosed,*line_term,*line_start,*escaped;
bool opt_enclosed;
diff -ur mysql-5.1.12-beta/sql/sql_load.cc work/sql/sql_load.cc
--- mysql-5.1.12-beta/sql/sql_load.cc 2006-10-26 14:01:58.000000000 +0200
+++ work/sql/sql_load.cc 2006-12-10 07:49:14.000000000 +0100
@@ -16,6 +16,7 @@
/* Copy data from a textfile to table */
+/* 2006-12 Erik Wetterberg : LOAD XML added */
#include "mysql_priv.h"
#include
@@ -24,6 +25,21 @@
#include "sp_head.h"
#include "sql_trigger.h"
+class XML_TAG {
+public:
+ int level;
+ String field;
+ String value;
+
+ XML_TAG(int l, String f, String v);
+};
+XML_TAG::XML_TAG(int l, String f, String v)
+{
+ level = l;
+ field.append(f);
+ value.append(v);
+}
+
class READ_INFO {
File file;
byte *buffer, /* Buffer for read text */
@@ -38,6 +54,8 @@
bool need_end_io_cache;
IO_CACHE cache;
NET *io_net;
+ /* load xml */
+ int level;
public:
bool error,line_cuted,found_null,enclosed;
@@ -55,6 +73,12 @@
char unescape(char chr);
int terminator(char *ptr,uint length);
bool find_start_of_fields();
+ /* load xml */
+ List taglist;
+ int read_value(int delim, String *val);
+ int read_xml();
+ int clear_level(int level);
+
/*
We need to force cache close before destructor is invoked to log
the last read block
@@ -86,6 +110,12 @@
static bool write_execute_load_query_log_event(THD *thd,
bool duplicates, bool ignore,
bool transactional_table);
+ /* load xml */
+static int read_xml_field(THD *thd, COPY_INFO &info, TABLE_LIST *table_list,
+ List- &fields_vars, List
- &set_fields,
+ List
- &set_values, READ_INFO &read_info,
+ String &enclosed, ulong skip_lines,
+ bool ignore_check_option_errors);
/*
@@ -372,7 +402,11 @@
(MODE_STRICT_TRANS_TABLES |
MODE_STRICT_ALL_TABLES)));
- if (!field_term->length() && !enclosed->length())
+ if( ex->filetype == FILETYPE_XML ) /* load xml */
+ error= read_xml_field(thd, info, table_list, fields_vars,
+ set_fields, set_values, read_info,
+ *(ex->line_term), skip_lines, ignore);
+ else if (!field_term->length() && !enclosed->length())
error= read_fixed_length(thd, info, table_list, fields_vars,
set_fields, set_values, read_info,
skip_lines, ignore);
@@ -816,6 +850,168 @@
}
DBUG_RETURN(test(read_info.error));
}
+/* load xml */
+/****************************************************************************
+** Read rows in xml format
+****************************************************************************/
+static int
+read_xml_field(THD *thd, COPY_INFO &info, TABLE_LIST *table_list,
+ List
- &fields_vars, List
- &set_fields,
+ List
- &set_values, READ_INFO &read_info,
+ String &row_tag, ulong skip_lines,
+ bool ignore_check_option_errors)
+{
+ List_iterator_fast
- it(fields_vars);
+ Item *item;
+ TABLE *table= table_list->table;
+ uint enclosed_length;
+ ulonglong id;
+ bool no_trans_update;
+ DBUG_ENTER("read_xml_field");
+
+ id= 0;
+ no_trans_update= !table->file->has_transactions();
+
+ for (;;it.rewind())
+ {
+ if (thd->killed)
+ {
+ thd->send_kill_message();
+ DBUG_RETURN(1);
+ }
+ //read row tag and save values into tag list
+ if (read_info.read_xml())
+ break;
+ List_iterator_fast xmlit(read_info.taglist);
+ xmlit.rewind();
+ XML_TAG *tag = NULL;
+
+ while(tag = xmlit++)
+ {
+ DBUG_PRINT("read_xml_field",
+ ("got tag:%i %s %s",
+ tag->level,tag->field.c_ptr(),tag->value.c_ptr()));
+ }
+
+ restore_record(table, s->default_values);
+
+ while ((item= it++))
+ {
+ /* If this line is to be skipped we don't want to fill field or var */
+ if (skip_lines)
+ continue;
+
+ /* find field in tag list */
+ xmlit.rewind();
+ tag = xmlit++;
+
+ while(tag && strcmp(tag->field.c_ptr(), item->name)!= 0)
+ {
+ tag = xmlit++;
+ }
+ if (!tag) // found null
+ {
+ if (item->type() == Item::FIELD_ITEM)
+ {
+ Field *field= ((Item_field *)item)->field;
+ field->reset();
+ field->set_null();
+ if (field == table->next_number_field)
+ table->auto_increment_field_not_null= TRUE;
+ if (!field->maybe_null())
+ {
+ if (field->type() == FIELD_TYPE_TIMESTAMP)
+ ((Field_timestamp*) field)->set_time();
+ else if (field != table->next_number_field)
+ field->set_warning(MYSQL_ERROR::WARN_LEVEL_WARN,
+ ER_WARN_NULL_TO_NOTNULL, 1);
+ }
+ }
+ else
+ ((Item_user_var_as_out_param *)item)->set_null_value(
+ read_info.read_charset);
+ continue;
+ }
+
+ if (item->type() == Item::FIELD_ITEM)
+ {
+
+ Field *field= ((Item_field *)item)->field;
+ field->set_notnull();
+ if (field == table->next_number_field)
+ table->auto_increment_field_not_null= TRUE;
+ field->store((char*) tag->value.c_ptr(), tag->value.length(), read_info.read_charset);
+ }
+ else
+ ((Item_user_var_as_out_param *)item)->set_value((char*) tag->value.c_ptr(),
+ tag->value.length(),read_info.read_charset);
+ }
+
+ if (read_info.error)
+ break;
+ if (skip_lines)
+ {
+ skip_lines--;
+ continue;
+ }
+ if (item)
+ {
+ /* Have not read any field, thus input file is simply ended */
+ if (item == fields_vars.head())
+ break;
+ for (; item ; item= it++)
+ {
+ if (item->type() == Item::FIELD_ITEM)
+ {
+ /*
+ QQ: We probably should not throw warning for each field.
+ But how about intention to always have the same number
+ of warnings in THD::cuted_fields (and get rid of cuted_fields
+ in the end ?)
+ */
+ thd->cuted_fields++;
+ push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+ ER_WARN_TOO_FEW_RECORDS,
+ ER(ER_WARN_TOO_FEW_RECORDS), thd->row_count);
+ }
+ else
+ ((Item_user_var_as_out_param *)item)->set_null_value(
+ read_info.read_charset);
+ }
+ }
+
+ if (thd->killed ||
+ fill_record_n_invoke_before_triggers(thd, set_fields, set_values,
+ ignore_check_option_errors,
+ table->triggers,
+ TRG_EVENT_INSERT))
+ DBUG_RETURN(1);
+
+ switch (table_list->view_check_option(thd,
+ ignore_check_option_errors)) {
+ case VIEW_CHECK_SKIP:
+ read_info.next_line();
+ goto continue_loop;
+ case VIEW_CHECK_ERROR:
+ DBUG_RETURN(-1);
+ }
+
+
+ if (write_record(thd, table, &info))
+ DBUG_RETURN(1);
+ /*
+ We don't need to reset auto-increment field since we are restoring
+ its default value at the beginning of each loop iteration.
+ */
+ thd->no_trans_update= no_trans_update;
+ thd->row_count++;
+ continue_loop:;
+ }
+ DBUG_RETURN(test(read_info.error));
+}
+
+
+/* load xml end */
/* Unescape all escape characters, mark \N as null */
@@ -855,6 +1051,8 @@
field_term_length= field_term.length();
line_term_ptr=(char*) line_term.ptr();
line_term_length= line_term.length();
+ /* load xml */
+ level = 0;
if (line_start.length() == 0)
{
line_start_ptr=0;
@@ -1262,3 +1460,234 @@
}
return 0;
}
+/*
+ Clear taglist from tags with a specified level
+*/
+int READ_INFO::clear_level(int level)
+{
+ DBUG_ENTER("READ_INFO::read_xml clear_level");
+ List_iterator xmlit(taglist);
+ xmlit.rewind();
+ XML_TAG *tag = NULL;
+
+ while(tag = xmlit++)
+ {
+ if(tag->level >= level)
+ {
+ xmlit.remove();
+ delete tag;
+ }
+ }
+ DBUG_RETURN(0);
+}
+/*
+ Read an xml value: handle multibyte and xml escape
+ */
+int READ_INFO::read_value(int delim, String *val)
+{
+ int chr;
+ String tmp;
+ chr = GET;
+ while(chr != delim && chr != my_b_EOF)
+ {
+#ifdef USE_MB
+ if (my_mbcharlen(read_charset, chr) > 1)
+ {
+ DBUG_PRINT("read_xml",("multi byte"));
+ int ml = my_mbcharlen(read_charset, chr);
+ int i;
+ for (i=1; iappend(chr);
+ chr = GET;
+ if (chr == my_b_EOF)
+ return chr;
+ }
+ }
+#endif
+ if(my_isspace(read_charset, chr)) /* convert newline, tab etc to space */
+ val->append(' ');
+ else if(chr == '&')
+ {
+ tmp.length(0);
+ while(chr != ';' && chr != my_b_EOF)
+ {
+ tmp.append(chr);
+ chr= GET;
+ }
+ if (chr == my_b_EOF)
+ return chr;
+ if(!memcmp(tmp.c_ptr(),STRING_WITH_LEN(">")))
+ val->append('>');
+ else if(!memcmp(tmp.c_ptr(),STRING_WITH_LEN("<")))
+ val->append('<');
+ else if(!memcmp(tmp.c_ptr(),STRING_WITH_LEN("&")))
+ val->append('&');
+ else if(!memcmp(tmp.c_ptr(),STRING_WITH_LEN(""")))
+ val->append('"');
+ else if(!memcmp(tmp.c_ptr(),STRING_WITH_LEN("&apos")))
+ val->append('\'');
+ else
+ val->append(tmp);
+ }
+ else
+ val->append(chr);
+ chr = GET;
+ }
+ return chr;
+}
+
+/*
+ Read a record in xml format
+ tags and attributes are stored in taglist
+ when tag set in ROWS IDENTIFIED BY is closed, we are ready and return
+*/
+int READ_INFO::read_xml()
+{
+ DBUG_ENTER("READ_INFO::read_xml");
+ int chr, chr2, chr3;
+ int delim = 0;
+ String tag, attribute, value;
+ bool in_tag = false;
+ tag.length(0);
+ attribute.length(0);
+ value.length(0);
+ chr=GET;
+ while(chr != my_b_EOF)
+ {
+ switch(chr){
+ case '<': /* read tag */
+ /* TODO: check if this is a comment */
+ chr = GET;
+ if(chr == '!')
+ {
+ chr2 = GET; chr3 = GET;
+ if(chr2 == '-' && chr3 == '-')
+ {
+ chr2 = 0; chr3 = 0;
+ chr = GET;
+ while(chr != '>' || chr2 != '-' || chr3 != '-')
+ {
+ if(chr == '-')
+ {
+ chr3 = chr2; chr2 = chr;
+ }
+ else if (chr2 == '-')
+ {
+ chr2 = 0; chr3 = 0;
+ }
+ chr = GET;
+ if (chr == my_b_EOF) goto found_eof;
+ }
+ break;
+ }
+ }
+ tag.length(0);
+ while(chr != '>' && chr != ' ' && chr != '/' && chr != my_b_EOF)
+ {
+ if(chr != delim) /* fix for the '' - stored in line_term
+ if((tag.length() == line_term_length -2) &&
+ (strncmp(tag.c_ptr(),line_term_ptr+1,tag.length())==0))
+ {
+ DBUG_PRINT("read_xml",
+ ("start-of-row: %i %s %s",level,tag.c_ptr(),line_term_ptr));
+ }
+ if(chr == ' ' || chr == '>')
+ {
+ level++;
+ clear_level(level+1);
+ }
+ if(chr == ' ')
+ in_tag = true;
+ else
+ in_tag = false;
+ break;
+ case ' ': /* read attribute */
+ while(chr == ' ') chr=GET; /* skip blanks */
+ if(!in_tag)
+ break;
+ while(chr != '=' && chr != '/' && chr != '>' && chr != my_b_EOF)
+ {
+ attribute.append(chr);
+ chr = GET;
+ }
+ break;
+ case '>': /* end tag - read tag value */
+ in_tag = false;
+ chr = read_value('<',&value);
+ if(chr == my_b_EOF) goto found_eof;
+ /* save value to list */
+ if(tag.length() > 0 && value.length() > 0)
+ {
+ DBUG_PRINT("read_xml",
+ ("lev:%i tag:%s val:%s\n",level,tag.c_ptr(),value.c_ptr()));
+ taglist.push_front( new XML_TAG(level, tag, value));
+ }
+ tag.length(0);value.length(0);attribute.length(0);
+ break;
+ case '/': /* close tag */
+ level--;
+ chr = GET;
+ if(chr != '>') /* if this is an empty tag */
+ tag.length(0); /* we should keep tag value */
+ while(chr != '>' && chr != my_b_EOF)
+ {
+ tag.append(chr);
+ chr = GET;
+ }
+ if((tag.length() == line_term_length -2) &&
+ (strncmp(tag.c_ptr(),line_term_ptr+1,tag.length())==0))
+ {
+ DBUG_PRINT("read_xml", ("found end-of-row %i %s",
+ level,tag.c_ptr()));
+ DBUG_RETURN(0); //normal return
+ }
+ chr=GET;
+ break;
+ case '=': /* attribute name end - read the value */
+ //check for tag field and attribute name
+ if(!memcmp(tag.c_ptr(),STRING_WITH_LEN("field")) && !memcmp(attribute.c_ptr(),STRING_WITH_LEN("name")))
+ {
+ // this is format xx where actual fieldname is in attribute
+ delim = GET;
+ tag.length(0);attribute.length(0);
+ chr = '<'; /* we pretend that it is a tag */
+ level--;
+ break;
+ }
+ //check for " or '
+ chr = GET;
+ if(chr == my_b_EOF) goto found_eof;
+ if(chr == '"' || chr == '\'')
+ {
+ delim = chr;
+ }
+ else
+ {
+ delim = ' ';/* no delimiter, use space */
+ PUSH(chr);
+ }
+ chr = read_value(delim, &value);
+ if(attribute.length() > 0 && value.length() > 0)
+ {
+ DBUG_PRINT("read_xml",
+ ("lev:%i att:%s val:%s\n",level+1,attribute.c_ptr(),value.c_ptr()));
+ taglist.push_front( new XML_TAG(level+1, attribute, value));
+ }
+ attribute.length(0);value.length(0);
+ if (chr != ' ')
+ chr = GET;
+ break;
+ default:
+ chr = GET;
+ } /* end switch */
+ } /* end while */
+found_eof:
+ DBUG_PRINT("read_xml",("Found eof"));
+ eof=1;
+ DBUG_RETURN(1);
+}
diff -ur mysql-5.1.12-beta/sql/sql_yacc.yy work/sql/sql_yacc.yy
--- mysql-5.1.12-beta/sql/sql_yacc.yy 2006-10-26 14:01:57.000000000 +0200
+++ work/sql/sql_yacc.yy 2006-12-08 18:15:53.000000000 +0100
@@ -721,6 +721,7 @@
%token WRITE_SYM
%token X509_SYM
%token XA_SYM
+%token XML_SYM /* Load XML Arnold/Erik */
%token XOR
%token YEARWEEK
%token YEAR_MONTH_SYM
@@ -867,7 +868,7 @@
get_select_lex
%type comp_op
-
+/* load xml Arnold/Erik: Added load_xml and opt_xml_rows_identified_by rules */
%type
query verb_clause create change select do drop insert replace insert2
insert_values update delete truncate rename
@@ -904,6 +905,7 @@
statement sp_suid
sp_c_chistics sp_a_chistics sp_chistic sp_c_chistic xa
load_data opt_field_or_var_spec fields_or_vars opt_load_data_set_spec
+ load_xml opt_xml_rows_identified_by
definer view_replace_or_algorithm view_replace view_algorithm_opt
view_algorithm view_or_trigger_or_sp_or_event
view_or_trigger_or_sp_or_event_tail
@@ -8616,7 +8618,7 @@
};
/* import, export of files */
-
+/* Load xml Arnold/Erik: Added LOAD XML_SYM part */
load: LOAD DATA_SYM
{
LEX *lex=Lex;
@@ -8630,6 +8632,19 @@
load_data
{}
|
+ LOAD XML_SYM
+ {
+ LEX *lex=Lex;
+ if (lex->sphead)
+ {
+ my_error(ER_SP_BADSTATEMENT, MYF(0), "LOAD DATA");
+ YYABORT;
+ }
+ lex->fname_start= lex->ptr;
+ }
+ load_xml
+ {}
+ |
LOAD TABLE_SYM table_ident FROM MASTER_SYM
{
LEX *lex=Lex;
@@ -8656,6 +8671,7 @@
lex->ignore= 0;
if (!(lex->exchange= new sql_exchange($4.str, 0)))
YYABORT;
+ lex->exchange->filetype = FILETYPE_CSV;
}
opt_duplicate INTO
{
@@ -8683,6 +8699,40 @@
"mysqldump or future "
"BACKUP/RESTORE DATABASE facility");
};
+/* Load XML Added by Arnold/Erik -- */
+load_xml:
+ load_data_lock opt_local INFILE TEXT_STRING_filesystem
+ {
+ LEX *lex=Lex;
+ lex->sql_command= SQLCOM_LOAD;
+ lex->lock_option= $1;
+ lex->local_file= $2;
+ lex->duplicates= DUP_ERROR;
+ lex->ignore= 0;
+ if (!(lex->exchange= new sql_exchange($4.str, 0)))
+ YYABORT;
+ lex->exchange->filetype = FILETYPE_XML;
+ }
+ opt_duplicate INTO
+ {
+ LEX *lex=Lex;
+ lex->fname_end= lex->ptr;
+ }
+ TABLE_SYM table_ident
+ {
+ LEX *lex=Lex;
+ if (!Select->add_table_to_list(YYTHD, $10, NULL, TL_OPTION_UPDATING,
+ lex->lock_option))
+ YYABORT;
+ lex->field_list.empty();
+ lex->update_list.empty();
+ lex->value_list.empty();
+ }
+ opt_xml_rows_identified_by opt_ignore_lines opt_field_or_var_spec
+ opt_load_data_set_spec
+ {};
+
+/* Load XML Added by Arnold/Erik END */
opt_local:
/* empty */ { $$=0;}
@@ -8789,6 +8839,13 @@
opt_load_data_set_spec:
/* empty */ { }
| SET insert_update_list { };
+/* LOAD XML Added by Arnold/Erik */
+
+opt_xml_rows_identified_by:
+ /* empty */ { }
+ | ROWS_SYM IDENTIFIED_SYM BY text_string { Lex->exchange->line_term = $4; };
+
+/* --------------------- */
/* Common definitions */
------=_Part_2387_20464045.1165758323126--