diff -Nru boilerpipe-1.2.0/pom.xml boilerpipe-1.2.0-gil/pom.xml --- boilerpipe-1.2.0/pom.xml 2013-10-11 11:54:23.418310128 +0200 +++ boilerpipe-1.2.0-gil/pom.xml 2013-10-11 11:51:51.334701196 +0200 @@ -32,4 +32,13 @@ Christian Kohlschütter + + + + net.sourceforge.nekohtml + nekohtml + 1.9.14 + + + diff -Nru boilerpipe-1.2.0/src/main/org/cyberneko/html/HTMLElements.java boilerpipe-1.2.0-gil/src/main/org/cyberneko/html/HTMLElements.java --- boilerpipe-1.2.0/src/main/org/cyberneko/html/HTMLElements.java 2010-12-16 11:30:06.000000000 +0100 +++ boilerpipe-1.2.0-gil/src/main/org/cyberneko/html/HTMLElements.java 1970-01-01 01:00:00.000000000 +0100 @@ -1,794 +0,0 @@ -/* - * Copyright 2002-2009 Andy Clark, Marc Guillemot - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.cyberneko.html; - -/** - * Collection of HTML element information. - * - * @author Andy Clark - * @author Ahmed Ashour - * @author Marc Guillemot - * - * @version $Id: HTMLElements.java,v 1.12 2005/02/14 07:16:59 andyc Exp $ - */ -public class HTMLElements { - - // - // Constants - // - - // element codes - - // NOTE: The element codes *must* start with 0 and increment in - // sequence. The parent and closes references depends on - // this assumption. -Ac - - public static final short A = 0; - public static final short ABBR = A+1; - public static final short ACRONYM = ABBR+1; - public static final short ADDRESS = ACRONYM+1; - public static final short APPLET = ADDRESS+1; - public static final short AREA = APPLET+1; - public static final short B = AREA+1; - public static final short BASE = B+1; - public static final short BASEFONT = BASE+1; - public static final short BDO = BASEFONT+1; - public static final short BGSOUND = BDO+1; - public static final short BIG = BGSOUND+1; - public static final short BLINK = BIG+1; - public static final short BLOCKQUOTE = BLINK+1; - public static final short BODY = BLOCKQUOTE+1; - public static final short BR = BODY+1; - public static final short BUTTON = BR+1; - public static final short CAPTION = BUTTON+1; - public static final short CENTER = CAPTION+1; - public static final short CITE = CENTER+1; - public static final short CODE = CITE+1; - public static final short COL = CODE+1; - public static final short COLGROUP = COL+1; - public static final short COMMENT = COLGROUP+1; - public static final short DEL = COMMENT+1; - public static final short DFN = DEL+1; - public static final short DIR = DFN+1; - public static final short DIV = DIR+1; - public static final short DD = DIV+1; - public static final short DL = DD+1; - public static final short DT = DL+1; - public static final short EM = DT+1; - public static final short EMBED = EM+1; - public static final short FIELDSET = EMBED+1; - public static final short FONT = FIELDSET+1; - public static final short FORM = FONT+1; - public static final short FRAME = FORM+1; - public static final short FRAMESET = FRAME+1; - public static final short H1 = FRAMESET+1; - public static final short H2 = H1+1; - public static final short H3 = H2+1; - public static final short H4 = H3+1; - public static final short H5 = H4+1; - public static final short H6 = H5+1; - public static final short HEAD = H6+1; - public static final short HR = HEAD+1; - public static final short HTML = HR+1; - public static final short I = HTML+1; - public static final short IFRAME = I+1; - public static final short ILAYER = IFRAME+1; - public static final short IMG = ILAYER+1; - public static final short INPUT = IMG+1; - public static final short INS = INPUT+1; - public static final short ISINDEX = INS+1; - public static final short KBD = ISINDEX+1; - public static final short KEYGEN = KBD+1; - public static final short LABEL = KEYGEN+1; - public static final short LAYER = LABEL+1; - public static final short LEGEND = LAYER+1; - public static final short LI = LEGEND+1; - public static final short LINK = LI+1; - public static final short LISTING = LINK+1; - public static final short MAP = LISTING+1; - public static final short MARQUEE = MAP+1; - public static final short MENU = MARQUEE+1; - public static final short META = MENU+1; - public static final short MULTICOL = META+1; - public static final short NEXTID = MULTICOL+1; - public static final short NOBR = NEXTID+1; - public static final short NOEMBED = NOBR+1; - public static final short NOFRAMES = NOEMBED+1; - public static final short NOLAYER = NOFRAMES+1; - public static final short NOSCRIPT = NOLAYER+1; - public static final short OBJECT = NOSCRIPT+1; - public static final short OL = OBJECT+1; - public static final short OPTION = OL+1; - public static final short OPTGROUP = OPTION+1; - public static final short P = OPTGROUP+1; - public static final short PARAM = P+1; - public static final short PLAINTEXT = PARAM+1; - public static final short PRE = PLAINTEXT+1; - public static final short Q = PRE+1; - public static final short RB = Q+1; - public static final short RBC = RB+1; - public static final short RP = RBC+1; - public static final short RT = RP+1; - public static final short RTC = RT+1; - public static final short RUBY = RTC+1; - public static final short S = RUBY+1; - public static final short SAMP = S+1; - public static final short SCRIPT = SAMP+1; - public static final short SELECT = SCRIPT+1; - public static final short SMALL = SELECT+1; - public static final short SOUND = SMALL+1; - public static final short SPACER = SOUND+1; - public static final short SPAN = SPACER+1; - public static final short STRIKE = SPAN+1; - public static final short STRONG = STRIKE+1; - public static final short STYLE = STRONG+1; - public static final short SUB = STYLE+1; - public static final short SUP = SUB+1; - public static final short TABLE = SUP+1; - public static final short TBODY = TABLE+1; - public static final short TD = TBODY+1; - public static final short TEXTAREA = TD+1; - public static final short TFOOT = TEXTAREA+1; - public static final short TH = TFOOT+1; - public static final short THEAD = TH+1; - public static final short TITLE = THEAD+1; - public static final short TR = TITLE+1; - public static final short TT = TR+1; - public static final short U = TT+1; - public static final short UL = U+1; - public static final short VAR = UL+1; - public static final short WBR = VAR+1; - public static final short XML = WBR+1; - public static final short XMP = XML+1; - public static final short UNKNOWN = XMP+1; - - // information - - /** Element information organized by first letter. */ - protected static final Element[][] ELEMENTS_ARRAY = new Element[26][]; - - /** Element information as a contiguous list. */ - protected static final ElementList ELEMENTS = new ElementList(); - - /** No such element. */ - public static final Element NO_SUCH_ELEMENT = new Element(UNKNOWN, "", Element.CONTAINER, new short[]{BODY,HEAD}/*HTML*/, null); - - // - // Static initializer - // - - /** - * Initializes the element information. - *

- * Note: - * The getElement method requires that the HTML elements - * are added to the list in alphabetical order. If new elements are - * added, then they must be inserted in alphabetical order. - */ - static { - // - // - // - // - // - // - // - // - - // initialize array of element information - ELEMENTS_ARRAY['A'-'A'] = new Element[] { - // A - - (%inline;)* -(A) - new Element(A, "A", Element.INLINE, BODY, new short[] {A}), - // ABBR - - (%inline;)* - new Element(ABBR, "ABBR", Element.INLINE, BODY, null), - // ACRONYM - - (%inline;)* - new Element(ACRONYM, "ACRONYM", Element.INLINE, BODY, null), - // ADDRESS - - (%inline;)* - new Element(ADDRESS, "ADDRESS", Element.BLOCK, BODY, null), - // APPLET - new Element(APPLET, "APPLET", 0, BODY, null), - // AREA - O EMPTY - new Element(AREA, "AREA", Element.EMPTY, MAP, null), - }; - ELEMENTS_ARRAY['B'-'A'] = new Element[] { - // B - - (%inline;)* - new Element(B, "B", Element.INLINE, BODY, null), - // BASE - O EMPTY - new Element(BASE, "BASE", Element.EMPTY, HEAD, null), - // BASEFONT - new Element(BASEFONT, "BASEFONT", 0, HEAD, null), - // BDO - - (%inline;)* - new Element(BDO, "BDO", Element.INLINE, BODY, null), - // BGSOUND - new Element(BGSOUND, "BGSOUND", Element.EMPTY, HEAD, null), - // BIG - - (%inline;)* - new Element(BIG, "BIG", Element.INLINE, BODY, null), - // BLINK - new Element(BLINK, "BLINK", Element.INLINE, BODY, null), - // BLOCKQUOTE - - (%block;|SCRIPT)+ - new Element(BLOCKQUOTE, "BLOCKQUOTE", Element.BLOCK, BODY, new short[]{P}), - // BODY O O (%block;|SCRIPT)+ +(INS|DEL) - new Element(BODY, "BODY", Element.CONTAINER, HTML, new short[]{HEAD}), - // BR - O EMPTY - new Element(BR, "BR", Element.EMPTY, BODY, null), - // BUTTON - - (%flow;)* -(A|%formctrl;|FORM|FIELDSET) - new Element(BUTTON, "BUTTON", 0, BODY, null), - }; - ELEMENTS_ARRAY['C'-'A'] = new Element[] { - // CAPTION - - (%inline;)* - new Element(CAPTION, "CAPTION", Element.INLINE, TABLE, null), - // CENTER, - new Element(CENTER, "CENTER", 0, BODY, null), - // CITE - - (%inline;)* - new Element(CITE, "CITE", Element.INLINE, BODY, null), - // CODE - - (%inline;)* - new Element(CODE, "CODE", Element.INLINE, BODY, null), - // COL - O EMPTY - new Element(COL, "COL", Element.EMPTY, TABLE, null), - // COLGROUP - O (COL)* - new Element(COLGROUP, "COLGROUP", 0, TABLE, new short[]{COL,COLGROUP}), - // COMMENT - new Element(COMMENT, "COMMENT", Element.SPECIAL, HTML, null), - }; - ELEMENTS_ARRAY['D'-'A'] = new Element[] { - // DEL - - (%flow;)* - new Element(DEL, "DEL", 0, BODY, null), - // DFN - - (%inline;)* - new Element(DFN, "DFN", Element.INLINE, BODY, null), - // DIR - new Element(DIR, "DIR", 0, BODY, null), - // DIV - - (%flow;)* - new Element(DIV, "DIV", Element.BLOCK, BODY, new short[]{P}), - // DD - O (%flow;)* - new Element(DD, "DD", 0, DL, new short[]{DT,DD}), - // DL - - (DT|DD)+ - new Element(DL, "DL", Element.BLOCK, BODY, null), - // DT - O (%inline;)* - new Element(DT, "DT", 0, DL, new short[]{DT,DD}), - }; - ELEMENTS_ARRAY['E'-'A'] = new Element[] { - // EM - - (%inline;)* - new Element(EM, "EM", Element.INLINE, BODY, null), - // EMBED - new Element(EMBED, "EMBED", 0, BODY, null), - }; - ELEMENTS_ARRAY['F'-'A'] = new Element[] { - // FIELDSET - - (#PCDATA,LEGEND,(%flow;)*) - new Element(FIELDSET, "FIELDSET", 0, BODY, null), - // FONT - new Element(FONT, "FONT", Element.CONTAINER, BODY, null), - // FORM - - (%block;|SCRIPT)+ -(FORM) - new Element(FORM, "FORM", Element.CONTAINER, new short[]{BODY,TD,DIV}, new short[]{BUTTON,P}), - // FRAME - O EMPTY - new Element(FRAME, "FRAME", Element.EMPTY, FRAMESET, null), - // FRAMESET - - ((FRAMESET|FRAME)+ & NOFRAMES?) - new Element(FRAMESET, "FRAMESET", 0, HTML, null), - }; - ELEMENTS_ARRAY['H'-'A'] = new Element[] { - // (H1|H2|H3|H4|H5|H6) - - (%inline;)* - new Element(H1, "H1", Element.BLOCK, new short[]{BODY,A}, new short[]{H1,H2,H3,H4,H5,H6,P}), - new Element(H2, "H2", Element.BLOCK, new short[]{BODY,A}, new short[]{H1,H2,H3,H4,H5,H6,P}), - new Element(H3, "H3", Element.BLOCK, new short[]{BODY,A}, new short[]{H1,H2,H3,H4,H5,H6,P}), - new Element(H4, "H4", Element.BLOCK, new short[]{BODY,A}, new short[]{H1,H2,H3,H4,H5,H6,P}), - new Element(H5, "H5", Element.BLOCK, new short[]{BODY,A}, new short[]{H1,H2,H3,H4,H5,H6,P}), - new Element(H6, "H6", Element.BLOCK, new short[]{BODY,A}, new short[]{H1,H2,H3,H4,H5,H6,P}), - // HEAD O O (%head.content;) +(%head.misc;) - new Element(HEAD, "HEAD", 0, HTML, null), - // HR - O EMPTY - new Element(HR, "HR", Element.EMPTY, BODY, new short[]{P}), - // HTML O O (%html.content;) - new Element(HTML, "HTML", 0, null, null), - }; - ELEMENTS_ARRAY['I'-'A'] = new Element[] { - // I - - (%inline;)* - new Element(I, "I", Element.INLINE, BODY, null), - // IFRAME - new Element(IFRAME, "IFRAME", Element.BLOCK, BODY, null), - // ILAYER - new Element(ILAYER, "ILAYER", Element.BLOCK, BODY, null), - // IMG - O EMPTY - new Element(IMG, "IMG", Element.EMPTY, BODY, null), - // INPUT - O EMPTY - new Element(INPUT, "INPUT", Element.EMPTY, BODY, null), - // INS - - (%flow;)* - new Element(INS, "INS", 0, BODY, null), - // ISINDEX - new Element(ISINDEX, "ISINDEX", 0, HEAD, null), - }; - ELEMENTS_ARRAY['K'-'A'] = new Element[] { - // KBD - - (%inline;)* - new Element(KBD, "KBD", Element.INLINE, BODY, null), - // KEYGEN - new Element(KEYGEN, "KEYGEN", 0, BODY, null), - }; - ELEMENTS_ARRAY['L'-'A'] = new Element[] { - // LABEL - - (%inline;)* -(LABEL) - new Element(LABEL, "LABEL", 0, BODY, null), - // LAYER - new Element(LAYER, "LAYER", Element.BLOCK, BODY, null), - // LEGEND - - (%inline;)* - new Element(LEGEND, "LEGEND", Element.INLINE, FIELDSET, null), - // LI - O (%flow;)* - new Element(LI, "LI", 0, new short[]{BODY,UL,OL}, new short[]{LI}), - // LINK - O EMPTY - new Element(LINK, "LINK", Element.EMPTY, HEAD, null), - // LISTING - new Element(LISTING, "LISTING", 0, BODY, null), - }; - ELEMENTS_ARRAY['M'-'A'] = new Element[] { - // MAP - - ((%block;) | AREA)+ - new Element(MAP, "MAP", Element.INLINE, BODY, null), - // MARQUEE - new Element(MARQUEE, "MARQUEE", 0, BODY, null), - // MENU - new Element(MENU, "MENU", 0, BODY, null), - // META - O EMPTY - new Element(META, "META", Element.EMPTY, HEAD, new short[]{STYLE,TITLE}), - // MULTICOL - new Element(MULTICOL, "MULTICOL", 0, BODY, null), - }; - ELEMENTS_ARRAY['N'-'A'] = new Element[] { - // NEXTID - new Element(NEXTID, "NEXTID", Element.EMPTY, BODY, null), - // NOBR - new Element(NOBR, "NOBR", Element.INLINE, BODY, null), - // NOEMBED - new Element(NOEMBED, "NOEMBED", 0, BODY, null), - // NOFRAMES - - (BODY) -(NOFRAMES) - new Element(NOFRAMES, "NOFRAMES", 0, FRAMESET, null), - // NOLAYER - new Element(NOLAYER, "NOLAYER", 0, BODY, null), - // NOSCRIPT - - (%block;)+ - new Element(NOSCRIPT, "NOSCRIPT", 0, new short[]{BODY}, null), - }; - ELEMENTS_ARRAY['O'-'A'] = new Element[] { - // OBJECT - - (PARAM | %flow;)* - new Element(OBJECT, "OBJECT", 0, BODY, null), - // OL - - (LI)+ - new Element(OL, "OL", Element.BLOCK, BODY, null), - // OPTGROUP - - (OPTION)+ - new Element(OPTGROUP, "OPTGROUP", 0, SELECT, new short[]{OPTION}), - // OPTION - O (#PCDATA) - new Element(OPTION, "OPTION", 0, SELECT, new short[]{OPTION}), - }; - ELEMENTS_ARRAY['P'-'A'] = new Element[] { - // P - O (%inline;)* - new Element(P, "P", Element.CONTAINER, BODY, new short[]{P}), - // PARAM - O EMPTY - new Element(PARAM, "PARAM", Element.EMPTY, new short[]{OBJECT,APPLET}, null), - // PLAINTEXT - new Element(PLAINTEXT, "PLAINTEXT", Element.SPECIAL, BODY, null), - // PRE - - (%inline;)* -(%pre.exclusion;) - new Element(PRE, "PRE", 0, BODY, null), - }; - ELEMENTS_ARRAY['Q'-'A'] = new Element[] { - // Q - - (%inline;)* - new Element(Q, "Q", Element.INLINE, BODY, null), - }; - ELEMENTS_ARRAY['R'-'A'] = new Element[] { - // RB - new Element(RB, "RB", Element.INLINE, RUBY, new short[]{RB}), - // RBC - new Element(RBC, "RBC", 0, RUBY, null), - // RP - new Element(RP, "RP", Element.INLINE, RUBY, new short[]{RB}), - // RT - new Element(RT, "RT", Element.INLINE, RUBY, new short[]{RB,RP}), - // RTC - new Element(RTC, "RTC", 0, RUBY, new short[]{RBC}), - // RUBY - new Element(RUBY, "RUBY", 0, BODY, new short[]{RUBY}), - }; - ELEMENTS_ARRAY['S'-'A'] = new Element[] { - // S - new Element(S, "S", 0, BODY, null), - // SAMP - - (%inline;)* - new Element(SAMP, "SAMP", Element.INLINE, BODY, null), - // SCRIPT - - %Script; - new Element(SCRIPT, "SCRIPT", Element.SPECIAL, new short[]{HEAD,BODY}, null), - // SELECT - - (OPTGROUP|OPTION)+ - new Element(SELECT, "SELECT", Element.CONTAINER, BODY, new short[]{SELECT}), - // SMALL - - (%inline;)* - new Element(SMALL, "SMALL", Element.INLINE, BODY, null), - // SOUND - new Element(SOUND, "SOUND", Element.EMPTY, HEAD, null), - // SPACER - new Element(SPACER, "SPACER", Element.EMPTY, BODY, null), - // SPAN - - (%inline;)* - new Element(SPAN, "SPAN", Element.CONTAINER, BODY, null), - // STRIKE - new Element(STRIKE, "STRIKE", Element.INLINE, BODY, null), - // STRONG - - (%inline;)* - new Element(STRONG, "STRONG", Element.INLINE, BODY, null), - // STYLE - - %StyleSheet; - new Element(STYLE, "STYLE", Element.SPECIAL, new short[]{HEAD,BODY}, new short[]{STYLE,TITLE,META}), - // SUB - - (%inline;)* - new Element(SUB, "SUB", Element.INLINE, BODY, null), - // SUP - - (%inline;)* - new Element(SUP, "SUP", Element.INLINE, BODY, null), - }; - ELEMENTS_ARRAY['T'-'A'] = new Element[] { - // TABLE - - (CAPTION?, (COL*|COLGROUP*), THEAD?, TFOOT?, TBODY+) - new Element(TABLE, "TABLE", Element.BLOCK|Element.CONTAINER, BODY, null), - // TBODY O O (TR)+ - new Element(TBODY, "TBODY", 0, TABLE, new short[]{THEAD,TD,TH,TR,COLGROUP}), - // TD - O (%flow;)* - new Element(TD, "TD", Element.CONTAINER, TR, TABLE, new short[]{TD,TH}), - // TEXTAREA - - (#PCDATA) - new Element(TEXTAREA, "TEXTAREA", Element.SPECIAL, BODY, null), - // TFOOT - O (TR)+ - new Element(TFOOT, "TFOOT", 0, TABLE, new short[]{THEAD,TBODY,TD,TH,TR}), - // TH - O (%flow;)* - new Element(TH, "TH", Element.CONTAINER, TR, TABLE, new short[]{TD,TH}), - // THEAD - O (TR)+ - new Element(THEAD, "THEAD", 0, TABLE, new short[]{COLGROUP}), - // TITLE - - (#PCDATA) -(%head.misc;) - new Element(TITLE, "TITLE", Element.SPECIAL, new short[]{HEAD,BODY}, null), - // TR - O (TH|TD)+ - new Element(TR, "TR", Element.BLOCK, new short[]{TBODY, THEAD, TFOOT}, TABLE, new short[]{TD,TH,TR,COLGROUP}), - // TT - - (%inline;)* - new Element(TT, "TT", Element.INLINE, BODY, null), - }; - ELEMENTS_ARRAY['U'-'A'] = new Element[] { - // U, - new Element(U, "U", Element.INLINE, BODY, null), - // UL - - (LI)+ - new Element(UL, "UL", Element.BLOCK, BODY, null), - }; - ELEMENTS_ARRAY['V'-'A'] = new Element[] { - // VAR - - (%inline;)* - new Element(VAR, "VAR", Element.INLINE, BODY, null), - }; - ELEMENTS_ARRAY['W'-'A'] = new Element[] { - // WBR - new Element(WBR, "WBR", Element.EMPTY, BODY, null), - }; - ELEMENTS_ARRAY['X'-'A'] = new Element[] { - // XML - new Element(XML, "XML", 0, BODY, null), - // XMP - new Element(XMP, "XMP", Element.SPECIAL, BODY, null), - }; - - // keep contiguous list of elements for lookups by code - for (int i = 0; i < ELEMENTS_ARRAY.length; i++) { - Element[] elements = ELEMENTS_ARRAY[i]; - if (elements != null) { - for (int j = 0; j < elements.length; j++) { - Element element = elements[j]; - ELEMENTS.addElement(element); - } - } - } - ELEMENTS.addElement(NO_SUCH_ELEMENT); - - // initialize cross references to parent elements - for (int i = 0; i < ELEMENTS.size; i++) { - Element element = ELEMENTS.data[i]; - if (element.parentCodes != null) { - element.parent = new Element[element.parentCodes.length]; - for (int j = 0; j < element.parentCodes.length; j++) { - element.parent[j] = ELEMENTS.data[element.parentCodes[j]]; - } - element.parentCodes = null; - } - } - - } // () - - // - // Public static methods - // - - /** - * Returns the element information for the specified element code. - * - * @param code The element code. - */ - public static final Element getElement(short code) { - return ELEMENTS.data[code]; - } // getElement(short):Element - - /** - * Returns the element information for the specified element name. - * - * @param ename The element name. - */ - public static final Element getElement(String ename) { - return getElement(ename, NO_SUCH_ELEMENT); - } // getElement(String):Element - - /** - * Returns the element information for the specified element name. - * - * @param ename The element name. - * @param element The default element to return if not found. - */ - public static final Element getElement(String ename, Element element) { - - if (ename.length() > 0) { - int c = ename.charAt(0); - if (c >= 'a' && c <= 'z') { - c = 'A' + c - 'a'; - } - if (c >= 'A' && c <= 'Z') { - Element[] elements = ELEMENTS_ARRAY[c - 'A']; - if (elements != null) { - for (int i = 0; i < elements.length; i++) { - Element elem = elements[i]; - if (elem.name.equalsIgnoreCase(ename)) { - return elem; - } - } - } - } - } - return element; - - } // getElement(String):Element - - // - // Classes - // - - /** - * Element information. - * - * @author Andy Clark - */ - public static class Element { - - // - // Constants - // - - /** Inline element. */ - public static final int INLINE = 0x01; - - /** Block element. */ - public static final int BLOCK = 0x02; - - /** Empty element. */ - public static final int EMPTY = 0x04; - - /** Container element. */ - public static final int CONTAINER = 0x08; - - /** Special element. */ - public static final int SPECIAL = 0x10; - - // - // Data - // - - /** The element code. */ - public short code; - - /** The element name. */ - public String name; - - /** Informational flags. */ - public int flags; - - /** Parent elements. */ - public short[] parentCodes; - - /** Parent elements. */ - public Element[] parent; - - /** The bounding element code. */ - public short bounds; - - /** List of elements this element can close. */ - public short[] closes; - - /** If set to true, then this element may not be nested, example: "A" **/ - boolean nestable = true; - - // - // Constructors - // - - /** - * Constructs an element object. - * - * @param code The element code. - * @param name The element name. - * @param flags Informational flags - * @param parent Natural closing parent name. - * @param closes List of elements this element can close. - */ - public Element(short code, String name, int flags, - short parent, short[] closes) { - this(code, name, flags, new short[]{parent}, (short)-1, closes); - } // (short,String,int,short,short[]); - - /** - * Constructs an element object. - * - * @param code The element code. - * @param name The element name. - * @param flags Informational flags - * @param parent Natural closing parent name. - * @param closes List of elements this element can close. - */ - public Element(short code, String name, int flags, - short parent, short bounds, short[] closes) { - this(code, name, flags, new short[]{parent}, bounds, closes); - } // (short,String,int,short,short,short[]) - - /** - * Constructs an element object. - * - * @param code The element code. - * @param name The element name. - * @param flags Informational flags - * @param parents Natural closing parent names. - * @param closes List of elements this element can close. - */ - public Element(short code, String name, int flags, - short[] parents, short[] closes) { - this(code, name, flags, parents, (short)-1, closes); - } // (short,String,int,short[],short[]) - - /** - * Constructs an element object. - * - * @param code The element code. - * @param name The element name. - * @param flags Informational flags - * @param parents Natural closing parent names. - * @param closes List of elements this element can close. - */ - public Element(short code, String name, int flags, - short[] parents, short bounds, short[] closes) { - this.code = code; - this.name = name; - this.flags = flags; - this.parentCodes = parents; - this.parent = null; - this.bounds = bounds; - this.closes = closes; - if(closes != null) { - for(int i=0;i(short,String,int,short[],short,short[]) - - // - // Public methods - // - - /** Returns true if this element is an inline element. */ - public final boolean isInline() { - return (flags & INLINE) != 0; - } // isInline():boolean - - /** Returns true if this element is a block element. */ - public final boolean isBlock() { - return (flags & BLOCK) != 0; - } // isBlock():boolean - - /** Returns true if this element is an empty element. */ - public final boolean isEmpty() { - return (flags & EMPTY) != 0; - } // isEmpty():boolean - - /** Returns true if this element is a container element. */ - public final boolean isContainer() { - return (flags & CONTAINER) != 0; - } // isContainer():boolean - - /** - * Returns true if this element is special -- if its content - * should be parsed ignoring markup. - */ - public final boolean isSpecial() { - return (flags & SPECIAL) != 0; - } // isSpecial():boolean - - /** - * Returns true if this element can close the specified Element. - * - * @param tag The element. - */ - public boolean closes(short tag) { - - if (closes != null) { - for (int i = 0; i < closes.length; i++) { - if (closes[i] == tag) { - return true; - } - } - } - return false; - - } // closes(short):boolean - - // - // Object methods - // - - /** Returns a hash code for this object. */ - public int hashCode() { - return name.hashCode(); - } // hashCode():int - - /** Returns true if the objects are equal. */ - public boolean equals(Object o) { - return name.equals(o); - } // equals(Object):boolean - - /** - * Provides a simple representation to make debugging easier - */ - public String toString() { - return super.toString() + "(name=" + name + ")"; - } - - /** - * Indicates if the provided element is an accepted parent of current element - * @param element the element to test for "paternity" - * @return true if element belongs to the {@link #parent} - */ - public boolean isParent(final Element element) { - if (parent == null) - return false; - else { - for (int i=0; i - *

  • add missing parent elements; - *
  • automatically close elements with optional end tags; and - *
  • handle mis-matched inline element tags. - * - *

    - * This component recognizes the following features: - *

    - *

    - * This component recognizes the following properties: - *

    - * - * @see HTMLElements - * - * @author Andy Clark - * @author Marc Guillemot - * - * @version $Id: HTMLTagBalancer.java,v 1.20 2005/02/14 04:06:22 andyc Exp $ - */ -public class HTMLTagBalancer - implements XMLDocumentFilter, HTMLComponent { - - // - // Constants - // - - // features - - /** Namespaces. */ - protected static final String NAMESPACES = "http://xml.org/sax/features/namespaces"; - - /** Include infoset augmentations. */ - protected static final String AUGMENTATIONS = "http://cyberneko.org/html/features/augmentations"; - - /** Report errors. */ - protected static final String REPORT_ERRORS = "http://cyberneko.org/html/features/report-errors"; - - /** Document fragment balancing only (deprecated). */ - protected static final String DOCUMENT_FRAGMENT_DEPRECATED = "http://cyberneko.org/html/features/document-fragment"; - - /** Document fragment balancing only. */ - protected static final String DOCUMENT_FRAGMENT = "http://cyberneko.org/html/features/balance-tags/document-fragment"; - - /** Ignore outside content. */ - protected static final String IGNORE_OUTSIDE_CONTENT = "http://cyberneko.org/html/features/balance-tags/ignore-outside-content"; - - /** Recognized features. */ - private static final String[] RECOGNIZED_FEATURES = { - NAMESPACES, - AUGMENTATIONS, - REPORT_ERRORS, - DOCUMENT_FRAGMENT_DEPRECATED, - DOCUMENT_FRAGMENT, - IGNORE_OUTSIDE_CONTENT, - }; - - /** Recognized features defaults. */ - private static final Boolean[] RECOGNIZED_FEATURES_DEFAULTS = { - null, - null, - null, - null, - Boolean.FALSE, - Boolean.FALSE, - }; - - // properties - - /** Modify HTML element names: { "upper", "lower", "default" }. */ - protected static final String NAMES_ELEMS = "http://cyberneko.org/html/properties/names/elems"; - - /** Modify HTML attribute names: { "upper", "lower", "default" }. */ - protected static final String NAMES_ATTRS = "http://cyberneko.org/html/properties/names/attrs"; - - /** Error reporter. */ - protected static final String ERROR_REPORTER = "http://cyberneko.org/html/properties/error-reporter"; - - /** - * EXPERIMENTAL: may change in next release
    - * Name of the property holding the stack of elements in which context a document fragment should be parsed. - **/ - public static final String FRAGMENT_CONTEXT_STACK = "http://cyberneko.org/html/properties/balance-tags/fragment-context-stack"; - - /** Recognized properties. */ - private static final String[] RECOGNIZED_PROPERTIES = { - NAMES_ELEMS, - NAMES_ATTRS, - ERROR_REPORTER, - FRAGMENT_CONTEXT_STACK, - }; - - /** Recognized properties defaults. */ - private static final Object[] RECOGNIZED_PROPERTIES_DEFAULTS = { - null, - null, - null, - null, - }; - - // modify HTML names - - /** Don't modify HTML names. */ - protected static final short NAMES_NO_CHANGE = 0; - - /** Match HTML element names. */ - protected static final short NAMES_MATCH = 0; - - /** Uppercase HTML names. */ - protected static final short NAMES_UPPERCASE = 1; - - /** Lowercase HTML names. */ - protected static final short NAMES_LOWERCASE = 2; - - // static vars - - /** Synthesized event info item. */ - protected static final HTMLEventInfo SYNTHESIZED_ITEM = - new HTMLEventInfo.SynthesizedItem(); - - // - // Data - // - - // features - - /** Namespaces. */ - protected boolean fNamespaces; - - /** Include infoset augmentations. */ - protected boolean fAugmentations; - - /** Report errors. */ - protected boolean fReportErrors; - - /** Document fragment balancing only. */ - protected boolean fDocumentFragment; - - /** Ignore outside content. */ - protected boolean fIgnoreOutsideContent; - - // properties - - /** Modify HTML element names. */ - protected short fNamesElems; - - /** Modify HTML attribute names. */ - protected short fNamesAttrs; - - /** Error reporter. */ - protected HTMLErrorReporter fErrorReporter; - - // connections - - /** The document source. */ - protected XMLDocumentSource fDocumentSource; - - /** The document handler. */ - protected XMLDocumentHandler fDocumentHandler; - - // state - - /** The element stack. */ - protected final InfoStack fElementStack = new InfoStack(); - - /** The inline stack. */ - protected final InfoStack fInlineStack = new InfoStack(); - - /** True if seen anything. Important for xml declaration. */ - protected boolean fSeenAnything; - - /** True if root element has been seen. */ - protected boolean fSeenDoctype; - - /** True if root element has been seen. */ - protected boolean fSeenRootElement; - - /** - * True if seen the end of the document element. In other words, - * this variable is set to false until the end </HTML> - * tag is seen (or synthesized). This is used to ensure that - * extraneous events after the end of the document element do not - * make the document stream ill-formed. - */ - protected boolean fSeenRootElementEnd; - - /** True if seen <head< element. */ - protected boolean fSeenHeadElement; - - /** True if seen <body< element. */ - protected boolean fSeenBodyElement; - - /** True if a form is in the stack (allow to discard opening of nested forms) */ - protected boolean fOpenedForm; - - // temp vars - - /** A qualified name. */ - private final QName fQName = new QName(); - - /** Empty attributes. */ - private final XMLAttributes fEmptyAttrs = new XMLAttributesImpl(); - - /** Augmentations. */ - private final HTMLAugmentations fInfosetAugs = new HTMLAugmentations(); - - protected HTMLTagBalancingListener tagBalancingListener; - private LostText lostText_ = new LostText(); - - private boolean forcedStartElement_ = false; - private boolean forcedEndElement_ = false; - - /** - * Stack of elements determining the context in which a document fragment should be parsed - */ - private QName[] fragmentContextStack_ = null; - private int fragmentContextStackSize_ = 0; // not 0 only when a fragment is parsed and fragmentContextStack_ is set - - private List/*ElementEntry*/ endElementsBuffer_ = new ArrayList(); - - // - // HTMLComponent methods - // - - /** Returns the default state for a feature. */ - public Boolean getFeatureDefault(String featureId) { - int length = RECOGNIZED_FEATURES != null ? RECOGNIZED_FEATURES.length : 0; - for (int i = 0; i < length; i++) { - if (RECOGNIZED_FEATURES[i].equals(featureId)) { - return RECOGNIZED_FEATURES_DEFAULTS[i]; - } - } - return null; - } // getFeatureDefault(String):Boolean - - /** Returns the default state for a property. */ - public Object getPropertyDefault(String propertyId) { - int length = RECOGNIZED_PROPERTIES != null ? RECOGNIZED_PROPERTIES.length : 0; - for (int i = 0; i < length; i++) { - if (RECOGNIZED_PROPERTIES[i].equals(propertyId)) { - return RECOGNIZED_PROPERTIES_DEFAULTS[i]; - } - } - return null; - } // getPropertyDefault(String):Object - - // - // XMLComponent methods - // - - /** Returns recognized features. */ - public String[] getRecognizedFeatures() { - return RECOGNIZED_FEATURES; - } // getRecognizedFeatures():String[] - - /** Returns recognized properties. */ - public String[] getRecognizedProperties() { - return RECOGNIZED_PROPERTIES; - } // getRecognizedProperties():String[] - - /** Resets the component. */ - public void reset(XMLComponentManager manager) - throws XMLConfigurationException { - - // get features - fNamespaces = manager.getFeature(NAMESPACES); - fAugmentations = manager.getFeature(AUGMENTATIONS); - fReportErrors = manager.getFeature(REPORT_ERRORS); - fDocumentFragment = manager.getFeature(DOCUMENT_FRAGMENT) || - manager.getFeature(DOCUMENT_FRAGMENT_DEPRECATED); - fIgnoreOutsideContent = manager.getFeature(IGNORE_OUTSIDE_CONTENT); - - // get properties - fNamesElems = getNamesValue(String.valueOf(manager.getProperty(NAMES_ELEMS))); - fNamesAttrs = getNamesValue(String.valueOf(manager.getProperty(NAMES_ATTRS))); - fErrorReporter = (HTMLErrorReporter)manager.getProperty(ERROR_REPORTER); - - fragmentContextStack_ = (QName[]) manager.getProperty(FRAGMENT_CONTEXT_STACK); - - } // reset(XMLComponentManager) - - /** Sets a feature. */ - public void setFeature(String featureId, boolean state) - throws XMLConfigurationException { - - if (featureId.equals(AUGMENTATIONS)) { - fAugmentations = state; - return; - } - if (featureId.equals(REPORT_ERRORS)) { - fReportErrors = state; - return; - } - if (featureId.equals(IGNORE_OUTSIDE_CONTENT)) { - fIgnoreOutsideContent = state; - return; - } - - } // setFeature(String,boolean) - - /** Sets a property. */ - public void setProperty(String propertyId, Object value) - throws XMLConfigurationException { - - if (propertyId.equals(NAMES_ELEMS)) { - fNamesElems = getNamesValue(String.valueOf(value)); - return; - } - - if (propertyId.equals(NAMES_ATTRS)) { - fNamesAttrs = getNamesValue(String.valueOf(value)); - return; - } - - } // setProperty(String,Object) - - // - // XMLDocumentSource methods - // - - /** Sets the document handler. */ - public void setDocumentHandler(XMLDocumentHandler handler) { - fDocumentHandler = handler; - } // setDocumentHandler(XMLDocumentHandler) - - // @since Xerces 2.1.0 - - /** Returns the document handler. */ - public XMLDocumentHandler getDocumentHandler() { - return fDocumentHandler; - } // getDocumentHandler():XMLDocumentHandler - - // - // XMLDocumentHandler methods - // - - // since Xerces-J 2.2.0 - - /** Start document. */ - public void startDocument(XMLLocator locator, String encoding, - NamespaceContext nscontext, Augmentations augs) - throws XNIException { - - // reset state - fElementStack.top = 0; - if (fragmentContextStack_ != null) { - fragmentContextStackSize_ = fragmentContextStack_.length; - for (int i=0; i and have been buffered to consider outside content - fIgnoreOutsideContent = true; // endElement should not ignore the elements passed from buffer - consumeBufferedEndElements(); - - // handle empty document - if (!fSeenRootElement && !fDocumentFragment) { - if (fReportErrors) { - fErrorReporter.reportError("HTML2000", null); - } - if (fDocumentHandler != null) { - fSeenRootElementEnd = false; - forceStartBody(); // will force and - final String body = modifyName("body", fNamesElems); - fQName.setValues(null, body, body, null); - callEndElement(fQName, synthesizedAugs()); - - final String ename = modifyName("html", fNamesElems); - fQName.setValues(null, ename, ename, null); - callEndElement(fQName, synthesizedAugs()); - } - } - - // pop all remaining elements - else { - int length = fElementStack.top - fragmentContextStackSize_; - for (int i = 0; i < length; i++) { - Info info = fElementStack.pop(); - if (fReportErrors) { - String ename = info.qname.rawname; - fErrorReporter.reportWarning("HTML2001", new Object[]{ename}); - } - if (fDocumentHandler != null) { - callEndElement(info.qname, synthesizedAugs()); - } - } - } - - // call handler - if (fDocumentHandler != null) { - fDocumentHandler.endDocument(augs); - } - - } // endDocument(Augmentations) - - /** - * Consume elements that have been buffered, like that are first consumed - * at the end of document - */ - private void consumeBufferedEndElements() { - final List toConsume = new ArrayList(endElementsBuffer_); - endElementsBuffer_.clear(); - for (int i=0; i (if any) has been buffered - } - else if (elementCode == HTMLElements.BODY) { - // create if none was present - if (!fSeenHeadElement) { - final QName head = createQName("head"); - forceStartElement(head, null, synthesizedAugs()); - endElement(head, synthesizedAugs()); - } - consumeBufferedEndElements(); // (if any) has been buffered - - if (fSeenBodyElement) { - notifyDiscardedStartElement(elem, attrs, augs); - return; - } - fSeenBodyElement = true; - } - else if (elementCode == HTMLElements.FORM) { - if (fOpenedForm) { - notifyDiscardedStartElement(elem, attrs, augs); - return; - } - fOpenedForm = true; - } - else if (elementCode == HTMLElements.UNKNOWN) { - consumeBufferedEndElements(); - } - - // check proper parent - if (element.parent != null) { - if (!fSeenRootElement && !fDocumentFragment) { - String pname = element.parent[0].name; - pname = modifyName(pname, fNamesElems); - if (fReportErrors) { - String ename = elem.rawname; - fErrorReporter.reportWarning("HTML2002", new Object[]{ename,pname}); - } - final QName qname = new QName(null, pname, pname, null); - final boolean parentCreated = forceStartElement(qname, null, synthesizedAugs()); - if (!parentCreated) { - if (!isForcedCreation) { - notifyDiscardedStartElement(elem, attrs, augs); - } - return; - } - } - else { - HTMLElements.Element preferedParent = element.parent[0]; - if (preferedParent.code != HTMLElements.HEAD || (!fSeenBodyElement && !fDocumentFragment)) { - int depth = getParentDepth(element.parent, element.bounds); - if (depth == -1) { // no parent found - final String pname = modifyName(preferedParent.name, fNamesElems); - final QName qname = new QName(null, pname, pname, null); - if (fReportErrors) { - String ename = elem.rawname; - fErrorReporter.reportWarning("HTML2004", new Object[]{ename,pname}); - } - final boolean parentCreated = forceStartElement(qname, null, synthesizedAugs()); - if (!parentCreated) { - if (!isForcedCreation) { - notifyDiscardedStartElement(elem, attrs, augs); - } - return; - } - } - } - } - } - - // if block element, save immediate parent inline elements - int depth = 0; - if (element.flags == 0) { - int length = fElementStack.top; - fInlineStack.top = 0; - for (int i = length - 1; i >= 0; i--) { - Info info = fElementStack.data[i]; - if (!info.element.isInline()) { - break; - } - fInlineStack.push(info); - endElement(info.qname, synthesizedAugs()); - } - depth = fInlineStack.top; - } - - // close previous elements - // all elements close a