mysql5/mysql-5.7.27/strings/uctypedump.c

551 lines
14 KiB
C

/* Copyright (c) 2006, 2011, Oracle and/or its affiliates. All rights reserved.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
/*
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
*/
#include <my_global.h>
#include <m_string.h>
#include <m_ctype.h>
typedef struct my_ctype_name_st
{
const char *name;
int val;
} MY_CTYPE_NAME_ST;
/*
  Lookup table translating Unicode General Category abbreviations into
  MySQL ctype flag combinations. The list is terminated by an entry
  whose name is NULL; ctypestr2num() searches it linearly.
*/
static MY_CTYPE_NAME_ST my_ctype_name[]=
{
{"Lu", _MY_U}, /* Letter, Uppercase */
{"Ll", _MY_L}, /* Letter, Lowercase */
{"Lt", _MY_U}, /* Letter, Titlecase */
{"Lm", _MY_L}, /* Letter, Modifier */
{"Lo", _MY_L}, /* Letter, other */
{"Nd", _MY_NMR}, /* Number, Decimal Digit */
{"Nl", _MY_NMR|_MY_U|_MY_L}, /* Number, Letter */
{"No", _MY_NMR|_MY_PNT}, /* Number, Other */
{"Mn", _MY_L|_MY_PNT}, /* Mark, Nonspacing */
{"Mc", _MY_L|_MY_PNT}, /* Mark, Spacing Combining */
{"Me", _MY_L|_MY_PNT}, /* Mark, Enclosing */
{"Pc", _MY_PNT}, /* Punctuation, Connector */
{"Pd", _MY_PNT}, /* Punctuation, Dash */
{"Ps", _MY_PNT}, /* Punctuation, Open */
{"Pe", _MY_PNT}, /* Punctuation, Close */
{"Pi", _MY_PNT}, /* Punctuation, Initial quote */
{"Pf", _MY_PNT}, /* Punctuation, Final quote */
{"Po", _MY_PNT}, /* Punctuation, Other */
{"Sm", _MY_PNT}, /* Symbol, Math */
{"Sc", _MY_PNT}, /* Symbol, Currency */
{"Sk", _MY_PNT}, /* Symbol, Modifier */
{"So", _MY_PNT}, /* Symbol, Other */
{"Zs", _MY_SPC}, /* Separator, Space */
{"Zl", _MY_SPC}, /* Separator, Line */
{"Zp", _MY_SPC}, /* Separator, Paragraph */
{"Cc", _MY_CTR}, /* Other, Control */
{"Cf", _MY_CTR}, /* Other, Format */
{"Cs", _MY_CTR}, /* Other, Surrogate */
{"Co", _MY_CTR}, /* Other, Private Use */
{"Cn", _MY_CTR}, /* Other, Not Assigned */
/* End-of-table marker */
{NULL, 0}
};
/*
  Translate a General Category abbreviation into MySQL ctype flags.

  Only the first two characters of "tok" are compared, ignoring case,
  against the names in my_ctype_name[].

  @param tok  General Category field text
  @return     Flags of the first matching table entry, or 0 if none matches
*/
static int
ctypestr2num(const char *tok)
{
  size_t idx;

  for (idx= 0; my_ctype_name[idx].name != NULL; idx++)
  {
    if (strncasecmp(my_ctype_name[idx].name, tok, 2) == 0)
      return my_ctype_name[idx].val;
  }
  return 0;
}
/* Highest code point handled; main() allocates MAX_CHAR + 1 table entries */
#define MAX_CHAR 0x10FFFF
/* Maximum number of code points stored for one decomposition mapping */
#define MAX_DECOMPOSITION_LENGTH 2
/*
  Properties of a single character.

  load_unidata() fills code, general_category, combining_class,
  decomposition_mapping, decimal_digit_value, digit_value and
  mysql_ctype; the remaining members are not assigned by it and stay
  zeroed.
*/
typedef struct
{
uint code;
char *name;
char general_category[3];
int combining_class;
int bidirectional_category;
uint decomposition_mapping[MAX_DECOMPOSITION_LENGTH];
uint decimal_digit_value; /* 0-9 */
uint digit_value; /* 0-9 */
char *numeric_value; /* Examples: 0, 1, 10, 100, 1000, 1/2, 5/2 */
my_bool mirrored; /* Y or N */
char *unicode_1_0_name;
char *iso10646_comment_field;
uint uppercase_mapping;
uint lowercase_mapping;
uint titlecase_mapping;
int mysql_ctype; /* ctype in MySQL format */
} MY_UNIDATA_CHAR;
/*
  Run parameters; defaults are set by unidata_param_init() and
  overridden from the command line by process_options().
*/
typedef struct
{
/* --maxchar=N: code points above this value are not loaded */
int maxchar;
/* --debug=N */
int debug;
/* --ctype=N: non-zero makes main() call dump_ctype() */
int ctype;
/* --decomp=N: non-zero makes main() call dump_decomposition() */
int decomp;
/* Input file name; NULL means stdin */
const char *fname;
/* --name=S: text inserted into the names of the generated C arrays */
const char *varname;
} MY_UNIDATA_PARAM;
/*
  Reset run parameters to their defaults.

  @param p  Parameter block to initialize
*/
static void
unidata_param_init(MY_UNIDATA_PARAM *p)
{
  /* No input file given yet (stdin is used) and no identifier suffix */
  p->fname= NULL;
  p->varname= "";

  /* Both kinds of tables are dumped unless switched off */
  p->ctype= 1;
  p->decomp= 1;

  /* Cover the whole Unicode code point range, debugging off */
  p->maxchar= MAX_CHAR;
  p->debug= 0;
}
/*
  Load character properties from a file in UnicodeData.txt format.

  Each input line holds ';'-separated fields; the field number selects
  the property being parsed:
    0 code point (hex), 2 General Category, 3 Canonical Combining Class,
    5 decomposition mapping, 6 decimal digit value, 7 digit value.
  All other fields are ignored.

  @param prm  prm->fname names the input file (stdin is read when it is
              NULL); only code points <= prm->maxchar are stored.
  @param chr  Table indexed by code point. The entry of every loaded
              character is replaced as a whole, so values stored in it
              earlier are lost.

  Lines whose first field does not start with a hexadecimal number
  (e.g. blank lines) are skipped.

  Exits with status 1 if the file cannot be opened or a decomposition
  has more than MAX_DECOMPOSITION_LENGTH code points.
*/
static void
load_unidata(MY_UNIDATA_PARAM *prm, MY_UNIDATA_CHAR *chr)
{
  char str[1024];
  FILE *f= prm->fname ? fopen(prm->fname, "r") : stdin;

  if (!f)
  {
    fprintf(stderr, "Can't open file %s\n", prm->fname);
    exit(1);
  }

  while (fgets(str, sizeof(str), f))
  {
    size_t n;
    char *s, *e;
    int has_code= 0;                /* Set once field 0 parsed as a number */
    MY_UNIDATA_CHAR ch;

    memset(&ch, 0, sizeof(ch));

    /* Walk over the ';'-separated fields; n is the field number */
    for (n= 0, s= str; s; n++)
    {
      char *end, tok[1024]= "";

      /* tok is as large as str, so a single field always fits */
      if ((e= strchr(s, ';')))
      {
        strncpy(tok, s, (size_t) (e - s));
        tok[e - s]= 0;
      }
      else
      {
        strcpy(tok, s);             /* Last field on the line */
      }

      switch (n)
      {
      case 0:                       /* Code point */
        ch.code= strtol(tok, &end, 16);
        has_code= (end != tok);
        break;
      case 1: break;                /* Character name */
      case 2:                       /* General category */
        ch.general_category[0]= tok[0];
        ch.general_category[1]= tok[1];
        ch.general_category[2]= '\0';
        ch.mysql_ctype= ctypestr2num(tok);
        break;
      case 3:                       /* Canonical Combining Class */
        ch.combining_class= atoi(tok);
        break;
      case 4: break;                /* Bidirectional Category */
      case 5:                       /* Character Decomposition Mapping */
        /* Mappings starting with a "<tag>" are not stored */
        if (*tok != '<')
        {
          size_t i;
          char *dec, *endptr;
          for (dec= strtok_r(tok, " \t", &endptr), i= 0;
               dec;
               dec= strtok_r(NULL, " \t", &endptr), i++)
          {
            if (i >= MAX_DECOMPOSITION_LENGTH)
            {
              fprintf(stderr, "Decomposition length is too long for character %04X\n", ch.code);
              exit(1);
            }
            ch.decomposition_mapping[i]= strtol(dec, NULL, 16);
          }
        }
        break;
      case 6:                       /* Decimal digit value */
        ch.decimal_digit_value= atoi(tok);
        break;
      case 7:                       /* Digit value */
        ch.digit_value= atoi(tok);
        break;
      case 8: break;                /* Numeric value */
      case 9: break;                /* Mirrored */
      case 10: break;               /* Unicode 1.0 Name */
      case 11: break;               /* 10646 comment field */
      case 12: break;               /* Uppercase */
      case 13: break;               /* Lowercase */
      case 14: break;               /* Titlecase */
      default: break;               /* Any further fields are ignored */
      }
      s= e ? e + 1 : e;
    }

    /* Do not let a line without a code point overwrite entry 0 */
    if (has_code && ch.code <= prm->maxchar)
      chr[ch.code]= ch;
  }

  /* Release the input file; stdin is not ours to close */
  if (f != stdin)
    fclose(f);
}
/*
  Mark one code point as an ideograph-like letter: General Category
  "Lo" and ctype _MY_L | _MY_U.

  @param unidata   Table indexed by code point
  @param max_char  Code points >= this value are left untouched
  @param cur_char  Code point to mark

  NOTE(review): cur_char == max_char is excluded here, whereas
  load_unidata() stores code points <= maxchar - confirm which bound
  is intended.
*/
static void
unidata_char_set_cjk(MY_UNIDATA_CHAR *unidata, int max_char, int cur_char)
{
  if (cur_char >= max_char)
    return;

  unidata[cur_char].mysql_ctype= _MY_L | _MY_U;
  strcpy(unidata[cur_char].general_category, "Lo");
}
/*
  Pre-fill ctype for characters that are not described by their own
  line in the input: ASCII digits and hex digits, and the ideograph /
  Hangul ranges listed below.

  main() calls this before load_unidata(), which replaces the whole
  entry of every character found in the input file.

  @param prm      prm->maxchar limits the range filling
                  (see unidata_char_set_cjk())
  @param unidata  Table indexed by code point
*/
static void
fill_implicit_ctype(MY_UNIDATA_PARAM *prm, MY_UNIDATA_CHAR *unidata)
{
  int i;

  /* Fill digits */
  for (i= '0'; i <= '9'; i++)
    unidata[i].mysql_ctype= _MY_NMR;

  /* Fill hex digits: only the letters a-f and A-F are hex digits */
  for (i= 'a'; i <= 'f'; i++)
    unidata[i].mysql_ctype|= _MY_X;
  for (i= 'A'; i <= 'F'; i++)
    unidata[i].mysql_ctype|= _MY_X;

  /* Fill ideographs */

  /* CJK Ideographs Extension A (U+3400 - U+4DB5) */
  for (i= 0x3400; i <= 0x4DB5; i++)
    unidata_char_set_cjk(unidata, prm->maxchar, i);

  /* CJK Ideographs (U+4E00 - U+9FA5) */
  for (i= 0x4E00; i <= 0x9FA5; i++) /* 9FCB in 5.2.0 */
    unidata_char_set_cjk(unidata, prm->maxchar, i);

  /* Hangul Syllables (U+AC00 - U+D7A3) */
  for (i= 0xAC00; i <= 0xD7A3; i++)
    unidata_char_set_cjk(unidata, prm->maxchar, i);

  /*
    20000;<CJK Ideograph Extension B, First>;Lo;0;L;;;;;N;;;;;
    2A6D6;<CJK Ideograph Extension B, Last>;Lo;0;L;;;;;N;;;;;
  */
  for (i= 0x20000; i <= 0x2A6D6; i++)
    unidata_char_set_cjk(unidata, prm->maxchar, i);

  /*
    2A700;<CJK Ideograph Extension C, First>;Lo;0;L;;;;;N;;;;;
    2B734;<CJK Ideograph Extension C, Last>;Lo;0;L;;;;;N;;;;;
  */
  for (i= 0x2A700; i <= 0x2B734; i++)
    unidata_char_set_cjk(unidata, prm->maxchar, i);

  /*
    TODO:
    D800;<Non Private Use High Surrogate, First>;Cs;0;L;;;;;N;;;;;
    DB7F;<Non Private Use High Surrogate, Last>;Cs;0;L;;;;;N;;;;;
    DB80;<Private Use High Surrogate, First>;Cs;0;L;;;;;N;;;;;
    DBFF;<Private Use High Surrogate, Last>;Cs;0;L;;;;;N;;;;;
    DC00;<Low Surrogate, First>;Cs;0;L;;;;;N;;;;;
    DFFF;<Low Surrogate, Last>;Cs;0;L;;;;;N;;;;;
    E000;<Private Use, First>;Co;0;L;;;;;N;;;;;
    F8FF;<Private Use, Last>;Co;0;L;;;;;N;;;;;
    F0000;<Plane 15 Private Use, First>;Co;0;L;;;;;N;;;;;
    FFFFD;<Plane 15 Private Use, Last>;Co;0;L;;;;;N;;;;;
    100000;<Plane 16 Private Use, First>;Co;0;L;;;;;N;;;;;
    10FFFD;<Plane 16 Private Use, Last>;Co;0;L;;;;;N;;;;;
  */
}
/*
  Tell whether all "nchars" characters of a page share one ctype.

  @param data    First character of the page
  @param nchars  Number of characters in the page
  @return        The ctype of the first character if every character
                 of the page has that same ctype, -1 otherwise
*/
static int
page_ctype(MY_UNIDATA_CHAR *data, size_t nchars)
{
  const int first= data->mysql_ctype;
  size_t pos= 1;

  while (pos < nchars)
  {
    if (data[pos].mysql_ctype != first)
      return -1;
    pos++;
  }
  return first;
}
/*
  Print the ctype tables as C source on stdout.

  The code point range is split into pages of 256 characters. A page
  whose characters do not all share one ctype is printed as its own
  array "uctype<varname>_pageXX"; afterwards the page index
  "my_uni_ctype<varname>" is printed with one {ctype, array} entry per
  page: {common ctype, NULL} for uniform pages, {0, array name} for
  mixed ones.

  @param prm      prm->maxchar determines the number of pages,
                  prm->varname is inserted into the generated names,
                  prm->fname is quoted in the header comment
  @param unidata  Table indexed by code point
*/
static void
dump_ctype(MY_UNIDATA_PARAM *prm, MY_UNIDATA_CHAR *unidata)
{
  int page, max_page= (prm->maxchar + 255) / 256;

  printf("/*\n");
  printf(" Unicode ctype data\n");
  printf(" Generated from %s\n", prm->fname ? prm->fname : "stdin");
  printf("*/\n");

  /* Dump planes with mixed ctype */
  for (page= 0; page < max_page; page++)
  {
    if (page_ctype(unidata + page * 256, 256) < 0)
    {
      size_t charnum, num;
      printf("static unsigned char uctype%s_page%02X[256]=\n{\n",
             prm->varname, page);
      for (num= 0, charnum= 0; charnum < 256; charnum++)
      {
        printf(" %2d%s", unidata[page * 256 + charnum].mysql_ctype,
               charnum < 255 ? "," : "");
        /* 16 values per output line */
        if (++num == 16)
        {
          printf("\n");
          num= 0;
        }
      }
      printf("};\n\n");
    }
  }

  /* Dump ctype page index */
  printf("MY_UNI_CTYPE my_uni_ctype%s[%d]={\n", prm->varname, max_page);
  for (page= 0; page < max_page; page++)
  {
    const char *sep= page < max_page - 1 ? "," : "";
    int ctype= page_ctype(unidata + page * 256, 256);

    /*
      The array name is printed directly instead of being formatted
      into a fixed-size buffer first: prm->varname comes from the
      command line and may be arbitrarily long.
    */
    if (ctype < 0)
      printf("\t{0,uctype%s_page%02X}%s\n", prm->varname, page, sep);
    else
      printf("\t{%d,NULL}%s\n", ctype, sep);
  }
  printf("};\n\n\n");
}
/*
static int
decomposition_length(MY_UNIDATA_CHAR *ch)
{
if (ch->decomposition_mapping[1])
return 2;
if (ch->decomposition_mapping[0])
return 1;
return 0;
}
*/
/*
  Print the decomposition array "decomp<varname>_pXX" of one page.

  Every character produces one {first, second} initializer line. When
  the character has a decomposition, a trailing comment shows the
  General Category and combining class of both target code points.

  @param prm      prm->varname is inserted into the array name
  @param unidata  Table indexed by code point (start of the whole table,
                  not of the page)
  @param pageno   Page number; the page starts at code point pageno * 256
  @param nchars   Number of characters to print

  NOTE(review): the decomposition code points are used as indexes into
  "unidata" without a range check - confirm that the input can be trusted.
*/
static void
dump_decomposition_page(MY_UNIDATA_PARAM *prm, MY_UNIDATA_CHAR *unidata,
                        uint pageno, uint nchars)
{
  const uint base= pageno * 256;
  uint pos;

  printf("static MY_UNI_DECOMPOSITION decomp%s_p%02X[256]= {\n",
         prm->varname, pageno);

  for (pos= 0; pos < nchars; pos++)
  {
    const uint code= base + pos;
    const uint first= unidata[code].decomposition_mapping[0];
    const uint second= unidata[code].decomposition_mapping[1];

    printf("/* %04X */ {0x%04X,0x%04X},", code, first, second);
    if (first)
    {
      const MY_UNIDATA_CHAR *a= &unidata[first];
      const MY_UNIDATA_CHAR *b= &unidata[second];

      printf(" %s/* [%s-%s][%d-%d] */",
             first < 0x10000 ? " " : "",
             a->general_category,
             b->general_category,
             a->combining_class,
             b->combining_class);
    }
    printf("\n");
  }

  printf("};\n\n\n");
}
/*
  Count the characters that have a decomposition mapping.

  @param unidata  First character to examine
  @param nchars   Number of characters to examine
  @return         Number of characters whose first decomposition
                  code point is non-zero
*/
static size_t
calc_decompositions(MY_UNIDATA_CHAR *unidata, size_t nchars)
{
  const MY_UNIDATA_CHAR *cur= unidata;
  const MY_UNIDATA_CHAR *stop= unidata + nchars;
  size_t count= 0;

  for ( ; cur < stop; cur++)
  {
    if (cur->decomposition_mapping[0])
      count++;
  }
  return count;
}
/*
  Print the canonical decomposition tables as C source on stdout.

  Every 256-character page containing at least one decomposition is
  printed by dump_decomposition_page() as "decomp<varname>_pXX". The
  page index "my_uni_decomp<varname>" follows, holding that array name
  for such pages and NULL for all others.

  @param prm      prm->maxchar determines the number of pages,
                  prm->varname is inserted into the generated names,
                  prm->fname is quoted in the header comment
  @param unidata  Table indexed by code point
*/
static void
dump_decomposition(MY_UNIDATA_PARAM *prm, MY_UNIDATA_CHAR *unidata)
{
  int i, npages= (prm->maxchar + 255) / 256;

  printf("/*\n");
  printf(" Unicode canonical decomposition data\n");
  printf(" Generated from %s\n", prm->fname ? prm->fname : "stdin");
  printf("*/\n");

  /* Dump pages */
  for (i= 0; i < npages; i++)
  {
    MY_UNIDATA_CHAR *page= unidata + i * 256;
    if (calc_decompositions(page, 256))
      dump_decomposition_page(prm, unidata, i, 256);
  }

  /* Dump decompositions */
  printf("static MY_UNI_DECOMPOSITION *my_uni_decomp%s[%d]=\n{\n",
         prm->varname, npages);
  for (i= 0; i < npages; i++)
  {
    MY_UNIDATA_CHAR *page= unidata + i * 256;
    /* Must match the array name printed by dump_decomposition_page() */
    if (calc_decompositions(page, 256))
      printf("decomp%s_p%02X,", prm->varname, i);
    else
      printf("NULL,");
    /* 8 entries per output line */
    if ((i % 8) == 7)
      printf("\n");
  }
  printf("};\n");
}
/*
  Print a summary of the command line syntax and terminate the program.

  @param f   Stream the summary is written to
  @param rc  Process exit status
*/
static void
usage(FILE *f, int rc)
{
  fprintf(f,
          "Usage: uctypedump [OPTIONS] [FILE]\n"
          "Reads UnicodeData.txt-style data from FILE (stdin if omitted)\n"
          "and prints C tables to stdout.\n"
          "Options (must precede FILE):\n"
          "  --name=SUFFIX  Text inserted into generated identifiers (default: empty)\n"
          "  --maxchar=N    Highest code point to load, decimal or 0x-prefixed hex\n"
          "                 (default: 0x10FFFF)\n"
          "  --ctype=N      Dump ctype tables if N is non-zero (default: 1)\n"
          "  --decomp=N     Dump decomposition tables if N is non-zero (default: 1)\n"
          "  --debug=N      Debug flag (default: 0)\n");
  exit(rc);
}
/*
  Parse a numeric command line option of the form "<name><value>".

  The value is read as hexadecimal when it starts with "0x" or "0X" and
  as decimal otherwise. An empty value, or one followed by characters
  that are not part of the number, is reported as an error and the
  program is terminated through usage(). Values outside the range of
  "int" are not detected.

  @param str   Command line argument, e.g. "--maxchar=0xFFFF"
  @param name  Option prefix including the '=', e.g. "--maxchar="
  @param num   Receives the parsed value when the prefix matches

  @return 1 if "str" starts with "name", 0 otherwise (*num is untouched)
*/
static int
get_int_option(const char *str, const char *name, int *num)
{
  size_t namelen= strlen(name);
  const char *val;
  char *end;
  int base;

  if (strncmp(str, name, namelen))
    return 0;

  val= str + namelen;
  base= (val[0] == '0' && (val[1] == 'x' || val[1] == 'X')) ? 16 : 10;
  *num= (int) strtol(val, &end, base);

  /* end == val: no digits at all; *end != 0: trailing garbage */
  if (end == val || *end != '\0')
  {
    fprintf(stderr, "\nBad numeric option value: %s\n\n", str);
    usage(stderr, 1);
  }
  return 1;
}
/*
  Parse a string command line option of the form "<name><value>".

  @param str   Command line argument, e.g. "--name=_520"
  @param name  Option prefix including the '=', e.g. "--name="
  @param val   Receives a pointer to the value part inside "str"
               when the prefix matches

  @return 1 if "str" starts with "name", 0 otherwise (*val is untouched)
*/
static int
get_const_str_option(const char *str, const char *name, const char **val)
{
  const size_t prefix_len= strlen(name);

  if (strncmp(str, name, prefix_len) != 0)
    return 0;

  *val= str + prefix_len;
  return 1;
}
/*
  Initialize "prm" with defaults and apply the command line.

  Arguments are consumed while they start with "--". Each of them must
  be one of the known "--option=value" settings; anything else is
  reported and the program is terminated through usage(). The first
  argument not starting with "--" becomes the input file name; if there
  is none, prm->fname is set to av[ac].

  @param prm  Parameter block to fill
  @param ac   Argument count as passed to main()
  @param av   Argument vector as passed to main()
*/
static void
process_options(MY_UNIDATA_PARAM *prm, int ac, char **av)
{
  int i;

  unidata_param_init(prm);

  for (i= 1; i < ac; i++)
  {
    const char *arg= av[i];

    /* Options end at the first argument without a leading "--" */
    if (arg[0] != '-' || arg[1] != '-')
      break;

    /* The first handler that recognizes the prefix consumes the argument */
    if (get_const_str_option(arg, "--name=", &prm->varname) ||
        get_int_option(arg, "--maxchar=", &prm->maxchar) ||
        get_int_option(arg, "--ctype=", &prm->ctype) ||
        get_int_option(arg, "--decomp=", &prm->decomp) ||
        get_int_option(arg, "--debug=", &prm->debug))
      continue;

    fprintf(stderr, "\nUnknown option: %s\n\n", arg);
    usage(stderr, 1);
  }

  prm->fname= av[i];
}
int main(int ac, char ** av)
{
MY_UNIDATA_PARAM prm;
static MY_UNIDATA_CHAR unidata[MAX_CHAR + 1];
process_options(&prm, ac, av);
memset(unidata, 0, sizeof(unidata));
fill_implicit_ctype(&prm, unidata);
load_unidata(&prm, unidata);
if (prm.ctype)
dump_ctype(&prm, unidata);
if (prm.decomp)
dump_decomposition(&prm, unidata);
return 0;
}