/* Copyright (c) 2006, 2011, Oracle and/or its affiliates. All rights reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; version 2 of the License. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ /* #include #include #include */ #include #include #include typedef struct my_ctype_name_st { const char *name; int val; } MY_CTYPE_NAME_ST; static MY_CTYPE_NAME_ST my_ctype_name[]= { {"Lu", _MY_U}, /* Letter, Uppercase */ {"Ll", _MY_L}, /* Letter, Lowercase */ {"Lt", _MY_U}, /* Letter, Titlecase */ {"Lm", _MY_L}, /* Letter, Modifier */ {"Lo", _MY_L}, /* Letter, other */ {"Nd", _MY_NMR}, /* Number, Decimal Digit */ {"Nl", _MY_NMR|_MY_U|_MY_L}, /* Number, Letter */ {"No", _MY_NMR|_MY_PNT}, /* Number, Other */ {"Mn", _MY_L|_MY_PNT}, /* Mark, Nonspacing */ {"Mc", _MY_L|_MY_PNT}, /* Mark, Spacing Combining */ {"Me", _MY_L|_MY_PNT}, /* Mark, Enclosing */ {"Pc", _MY_PNT}, /* Punctuation, Connector */ {"Pd", _MY_PNT}, /* Punctuation, Dash */ {"Ps", _MY_PNT}, /* Punctuation, Open */ {"Pe", _MY_PNT}, /* Punctuation, Close */ {"Pi", _MY_PNT}, /* Punctuation, Initial quote */ {"Pf", _MY_PNT}, /* Punctuation, Final quote */ {"Po", _MY_PNT}, /* Punctuation, Other */ {"Sm", _MY_PNT}, /* Symbol, Math */ {"Sc", _MY_PNT}, /* Symbol, Currency */ {"Sk", _MY_PNT}, /* Symbol, Modifier */ {"So", _MY_PNT}, /* Symbol, Other */ {"Zs", _MY_SPC}, /* Separator, Space */ {"Zl", _MY_SPC}, /* Separator, Line */ {"Zp", _MY_SPC}, /* Separator, Paragraph */ {"Cc", _MY_CTR}, /* Other, Control */ {"Cf", _MY_CTR}, /* Other, Format */ {"Cs", _MY_CTR}, /* Other, Surrogate */ {"Co", _MY_CTR}, /* Other, Private Use */ {"Cn", _MY_CTR}, /* Other, Not Assigned */ {NULL, 0} }; static int ctypestr2num(const char *tok) { MY_CTYPE_NAME_ST *p; for (p= my_ctype_name; p->name; p++) { if (!strncasecmp(p->name, tok, 2)) return p->val; } return 0; } #define MAX_CHAR 0x10FFFF #define MAX_DECOMPOSITION_LENGTH 2 typedef struct { uint code; char *name; char general_category[3]; int combining_class; int bidirectional_category; uint decomposition_mapping[MAX_DECOMPOSITION_LENGTH]; uint decimal_digit_value; /* 0-9 */ uint digit_value; /* 0-9 */ char *numeric_value; /* Examples: 0, 1, 10, 100, 1000, 1/2, 5/2 */ my_bool mirrored; /* Y or N */ char *unicode_1_0_name; char *iso10646_comment_field; uint uppercase_mapping; uint lowercase_mapping; uint titlecase_mapping; int mysql_ctype; /* ctype in MySQL format */ } MY_UNIDATA_CHAR; typedef struct { int maxchar; int debug; int ctype; int decomp; const char *fname; const char *varname; } MY_UNIDATA_PARAM; static void unidata_param_init(MY_UNIDATA_PARAM *p) { p->maxchar= MAX_CHAR; p->debug= 0; p->ctype= 1; p->decomp= 1; p->fname= NULL; p->varname= ""; } static void load_unidata(MY_UNIDATA_PARAM *prm, MY_UNIDATA_CHAR *chr) { char str[1024]; FILE *f= prm->fname ? fopen(prm->fname, "r") : stdin; if (!f) { fprintf(stderr, "Can't open file %s\n", prm->fname); exit(1); } while (fgets(str, sizeof(str), f)) { size_t n; char *s, *e; MY_UNIDATA_CHAR ch; memset(&ch, 0, sizeof(ch)); for(n= 0, s= str; s; n++) { char *end, tok[1024]= ""; if((e= strchr(s, ';'))) { strncpy(tok, s, (unsigned int) (e - s)); tok[e - s]= 0; } else { strcpy(tok, s); } end= tok + strlen(tok); switch(n) { case 0: ch.code= strtol(tok, &end, 16); break; case 1: break; /* Character name */ case 2: /* General category */ ch.general_category[0]= tok[0]; ch.general_category[1]= tok[1]; ch.general_category[2]= '\0'; ch.mysql_ctype= ctypestr2num(tok); break; case 3: /* Canonical Combining Class */ ch.combining_class= atoi(tok); /* if (ch.combining_class) printf("YYY[%04X]=%d\n", ch.code, ch.combining_class); */ break; case 4: break; /* Bidirectional Category */ case 5: /* Character Decomposition Mapping */ if (*tok != '<') { size_t i; char *dec, *endptr; for (dec= strtok_r(tok, " \t", &endptr), i= 0; dec; dec= strtok_r(NULL, " \t", &endptr), i++) { if (i >= MAX_DECOMPOSITION_LENGTH) { fprintf(stderr, "Decomposition length is too long for character %04X\n", ch.code); exit(1); } ch.decomposition_mapping[i]= strtol(dec, NULL, 16); } } break; case 6: /* Decimal digit value */ ch.decimal_digit_value= atoi(tok); break; case 7: /* Digit value */ ch.digit_value= atoi(tok); break; case 8: /* Numeric value */ break; case 9: break; /* Mirrored */ case 10: break; /* Unicode 1.0 Name */ case 11: break; /* 10646 comment field */ case 12: break; /* Uppercase */ case 13: break; /* Lowecase */ case 14: break; /* Titlecase */ } s= e ? e + 1 : e; } if(ch.code <= prm->maxchar) chr[ch.code]= ch; } } static void unidata_char_set_cjk(MY_UNIDATA_CHAR *unidata, int max_char, int cur_char) { if (cur_char < max_char) { MY_UNIDATA_CHAR *ch= &unidata[cur_char]; ch->mysql_ctype= _MY_L | _MY_U; strcpy(ch->general_category, "Lo"); } } static void fill_implicit_ctype(MY_UNIDATA_PARAM *prm, MY_UNIDATA_CHAR *unidata) { int i; /* Fill digits */ for (i= '0'; i <= '9'; i++) unidata[i].mysql_ctype= _MY_NMR; /* Fill hex digits */ for (i= 'a'; i <= 'z'; i++) unidata[i].mysql_ctype|= _MY_X; for (i= 'A'; i <= 'Z'; i++) unidata[i].mysql_ctype|= _MY_X; /* Fill ideographs */ /* CJK Ideographs Extension A (U+3400 - U+4DB5) */ for(i= 0x3400; i <= 0x4DB5; i++) unidata_char_set_cjk(unidata, prm->maxchar, i); /* CJK Ideographs (U+4E00 - U+9FA5) */ for(i= 0x4E00; i <= 0x9FA5; i++) /* 9FCB in 5.2.0 */ unidata_char_set_cjk(unidata, prm->maxchar, i); /* Hangul Syllables (U+AC00 - U+D7A3) */ for(i= 0xAC00; i <= 0xD7A3; i++) unidata_char_set_cjk(unidata, prm->maxchar, i); /* 20000;;Lo;0;L;;;;;N;;;;; 2A6D6;;Lo;0;L;;;;;N;;;;; */ for (i= 0x20000; i <= 0x2A6D6; i++) unidata_char_set_cjk(unidata, prm->maxchar, i); /* 2A700;;Lo;0;L;;;;;N;;;;; 2B734;;Lo;0;L;;;;;N;;;;; */ for (i= 0x2A700; i <= 0x2B734; i++) unidata_char_set_cjk(unidata, prm->maxchar, i); /* TODO: D800;;Cs;0;L;;;;;N;;;;; DB7F;;Cs;0;L;;;;;N;;;;; DB80;;Cs;0;L;;;;;N;;;;; DBFF;;Cs;0;L;;;;;N;;;;; DC00;;Cs;0;L;;;;;N;;;;; DFFF;;Cs;0;L;;;;;N;;;;; E000;;Co;0;L;;;;;N;;;;; F8FF;;Co;0;L;;;;;N;;;;; F0000;;Co;0;L;;;;;N;;;;; FFFFD;;Co;0;L;;;;;N;;;;; 100000;;Co;0;L;;;;;N;;;;; 10FFFD;;Co;0;L;;;;;N;;;;;0 */ } /* Check if ctype for the entire page consisting of "nchars" characters is the same. Return -1 otherwise. */ static int page_ctype(MY_UNIDATA_CHAR *data, size_t nchars) { size_t i; for (i= 1; i < nchars; i++) { if (data[i].mysql_ctype != data->mysql_ctype) return -1; } return data->mysql_ctype; } static void dump_ctype(MY_UNIDATA_PARAM *prm, MY_UNIDATA_CHAR *unidata) { int page, max_page= (prm->maxchar + 255) / 256; printf("/*\n"); printf(" Unicode ctype data\n"); printf(" Generated from %s\n", prm->fname ? prm->fname : "stdin"); printf("*/\n"); /* Dump planes with mixed ctype */ for(page= 0; page < max_page; page++) { if (page_ctype(unidata + page * 256, 256) < 0) { size_t charnum, num; printf("static unsigned char uctype%s_page%02X[256]=\n{\n", prm->varname, page); for(num= 0, charnum=0; charnum < 256; charnum++) { printf(" %2d%s", unidata[page * 256 + charnum].mysql_ctype, charnum < 255 ? "," : ""); if(++num == 16) { printf("\n"); num= 0; } } printf("};\n\n"); } } /* Dump ctype page index */ printf("MY_UNI_CTYPE my_uni_ctype%s[%d]={\n", prm->varname, max_page); for(page= 0; page < max_page; page++) { char page_name[128]="NULL"; int ctype; if ((ctype= page_ctype(unidata + page * 256, 256)) < 0) { sprintf(page_name,"uctype%s_page%02X", prm->varname, page); ctype= 0; } printf("\t{%d,%s}%s\n", ctype, page_name, page < max_page - 1 ? "," : ""); } printf("};\n\n\n"); } /* static int decomposition_length(MY_UNIDATA_CHAR *ch) { if (ch->decomposition_mapping[1]) return 2; if (ch->decomposition_mapping[0]) return 1; return 0; } */ static void dump_decomposition_page(MY_UNIDATA_PARAM *prm, MY_UNIDATA_CHAR *unidata, uint pageno, uint nchars) { uint i, ofs= pageno * 256; printf("static MY_UNI_DECOMPOSITION decomp%s_p%02X[256]= {\n", prm->varname, pageno); for (i= 0; i < nchars; i++) { MY_UNIDATA_CHAR *ch= &unidata[ofs + i]; printf("/* %04X */ {0x%04X,0x%04X},", ofs + i, ch->decomposition_mapping[0], ch->decomposition_mapping[1]); if (ch->decomposition_mapping[0]) printf(" %s/* [%s-%s][%d-%d] */", ch->decomposition_mapping[0] < 0x10000 ? " " : "", unidata[ch->decomposition_mapping[0]].general_category, unidata[ch->decomposition_mapping[1]].general_category, unidata[ch->decomposition_mapping[0]].combining_class, unidata[ch->decomposition_mapping[1]].combining_class); printf("\n"); } printf("};\n\n\n"); } static size_t calc_decompositions(MY_UNIDATA_CHAR *unidata, size_t nchars) { size_t i, n; for (n= i= 0; i < nchars; i++) { if (unidata[i].decomposition_mapping[0]) n++; } return n; } static void dump_decomposition(MY_UNIDATA_PARAM *prm, MY_UNIDATA_CHAR *unidata) { int i, npages= (prm->maxchar + 255) / 256; printf("/*\n"); printf(" Unicode canonical decomposition data\n"); printf(" Generated from %s\n", prm->fname ? prm->fname : "stdin"); printf("*/\n"); /* Dump pages */ for (i= 0; i < npages; i++) { MY_UNIDATA_CHAR *page= unidata + i * 256; if (calc_decompositions(page, 256)) dump_decomposition_page(prm, unidata, i, 256); } /* Dump decompositions */ printf("static MY_UNI_DECOMPOSITION *my_uni_decomp%s[%d]=\n{\n", prm->varname, npages); for (i= 0; i < npages; i++) { MY_UNIDATA_CHAR *page= unidata + i * 256; if (calc_decompositions(page, 256)) printf("decom%s_p%02X,", prm->varname, i); else printf("NULL,"); if ((i % 8) == 7) printf("\n"); } printf("};\n"); } static void usage(FILE *f, int rc) { exit(rc); } static int get_int_option(const char *str, const char *name, int *num) { size_t namelen= strlen(name); if (!strncmp(str, name, namelen)) { const char *val= str + namelen; if (val[0] == '0' && val[1] == 'x') { *num= strtol(val, NULL, 16); } else { *num= atoi(val); if (*num == 0 && *val !='0') { fprintf(stderr, "\nBad numeric option value: %s\n\n", str); usage(stderr, 1); } } return 1; } return 0; } static int get_const_str_option(const char *str, const char *name, const char **val) { size_t namelen= strlen(name); if (!strncmp(str, name, namelen)) { *val= str + namelen; return 1; } return 0; } static void process_options(MY_UNIDATA_PARAM *prm, int ac, char **av) { int i; unidata_param_init(prm); for (i= 1; i < ac ; i++) { /* printf("[%d]=%s\n", i, av[i]); */ if (av[i][0] != '-' || av[i][1] != '-') break; if (!get_const_str_option(av[i], "--name=", &prm->varname) && !get_int_option(av[i], "--maxchar=", &prm->maxchar) && !get_int_option(av[i], "--ctype=", &prm->ctype) && !get_int_option(av[i], "--decomp=", &prm->decomp) && !get_int_option(av[i], "--debug=", &prm->debug)) { fprintf(stderr, "\nUnknown option: %s\n\n", av[i]); usage(stderr, 1); } } prm->fname= av[i]; } int main(int ac, char ** av) { MY_UNIDATA_PARAM prm; static MY_UNIDATA_CHAR unidata[MAX_CHAR + 1]; process_options(&prm, ac, av); memset(unidata, 0, sizeof(unidata)); fill_implicit_ctype(&prm, unidata); load_unidata(&prm, unidata); if (prm.ctype) dump_ctype(&prm, unidata); if (prm.decomp) dump_decomposition(&prm, unidata); return 0; }