diff --git a/boilerpipe-1.2.0-libdir-patch b/boilerpipe-1.2.0-libdir-patch new file mode 100644 index 0000000..d4244ab --- /dev/null +++ b/boilerpipe-1.2.0-libdir-patch @@ -0,0 +1,21 @@ +--- build.xml 2011-05-28 16:56:41.000000000 +0200 ++++ build.xml-gil 2011-08-15 17:57:57.279492364 +0200 +@@ -53,7 +53,7 @@ + + + +- ++ + + + +@@ -67,7 +67,8 @@ + + + +- ++ ++ + + + diff --git a/boilerpipe-1.2.0-nekohtml-patch b/boilerpipe-1.2.0-nekohtml-patch new file mode 100644 index 0000000..5918988 --- /dev/null +++ b/boilerpipe-1.2.0-nekohtml-patch @@ -0,0 +1,2228 @@ +diff -Nru boilerpipe-1.2.0/pom.xml boilerpipe-1.2.0-gil/pom.xml +--- boilerpipe-1.2.0/pom.xml 2013-10-11 11:54:23.418310128 +0200 ++++ boilerpipe-1.2.0-gil/pom.xml 2013-10-11 11:51:51.334701196 +0200 +@@ -32,4 +32,13 @@ + Christian Kohlschütter + + ++ ++ ++ ++ net.sourceforge.nekohtml ++ nekohtml ++ 1.9.14 ++ ++ ++ + +diff -Nru boilerpipe-1.2.0/src/main/org/cyberneko/html/HTMLElements.java boilerpipe-1.2.0-gil/src/main/org/cyberneko/html/HTMLElements.java +--- boilerpipe-1.2.0/src/main/org/cyberneko/html/HTMLElements.java 2010-12-16 11:30:06.000000000 +0100 ++++ boilerpipe-1.2.0-gil/src/main/org/cyberneko/html/HTMLElements.java 1970-01-01 01:00:00.000000000 +0100 +@@ -1,794 +0,0 @@ +-/* +- * Copyright 2002-2009 Andy Clark, Marc Guillemot +- * +- * Licensed under the Apache License, Version 2.0 (the "License"); +- * you may not use this file except in compliance with the License. +- * You may obtain a copy of the License at +- * +- * http://www.apache.org/licenses/LICENSE-2.0 +- * +- * Unless required by applicable law or agreed to in writing, software +- * distributed under the License is distributed on an "AS IS" BASIS, +- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +- * See the License for the specific language governing permissions and +- * limitations under the License. +- */ +- +-package org.cyberneko.html; +- +-/** +- * Collection of HTML element information. +- * +- * @author Andy Clark +- * @author Ahmed Ashour +- * @author Marc Guillemot +- * +- * @version $Id: HTMLElements.java,v 1.12 2005/02/14 07:16:59 andyc Exp $ +- */ +-public class HTMLElements { +- +- // +- // Constants +- // +- +- // element codes +- +- // NOTE: The element codes *must* start with 0 and increment in +- // sequence. The parent and closes references depends on +- // this assumption. -Ac +- +- public static final short A = 0; +- public static final short ABBR = A+1; +- public static final short ACRONYM = ABBR+1; +- public static final short ADDRESS = ACRONYM+1; +- public static final short APPLET = ADDRESS+1; +- public static final short AREA = APPLET+1; +- public static final short B = AREA+1; +- public static final short BASE = B+1; +- public static final short BASEFONT = BASE+1; +- public static final short BDO = BASEFONT+1; +- public static final short BGSOUND = BDO+1; +- public static final short BIG = BGSOUND+1; +- public static final short BLINK = BIG+1; +- public static final short BLOCKQUOTE = BLINK+1; +- public static final short BODY = BLOCKQUOTE+1; +- public static final short BR = BODY+1; +- public static final short BUTTON = BR+1; +- public static final short CAPTION = BUTTON+1; +- public static final short CENTER = CAPTION+1; +- public static final short CITE = CENTER+1; +- public static final short CODE = CITE+1; +- public static final short COL = CODE+1; +- public static final short COLGROUP = COL+1; +- public static final short COMMENT = COLGROUP+1; +- public static final short DEL = COMMENT+1; +- public static final short DFN = DEL+1; +- public static final short DIR = DFN+1; +- public static final short DIV = DIR+1; +- public static final short DD = DIV+1; +- public static final short DL = DD+1; +- public static final short DT = DL+1; +- public static final short EM = DT+1; +- public static final short EMBED = EM+1; +- public static final short FIELDSET = EMBED+1; +- public static final short FONT = FIELDSET+1; +- public static final short FORM = FONT+1; +- public static final short FRAME = FORM+1; +- public static final short FRAMESET = FRAME+1; +- public static final short H1 = FRAMESET+1; +- public static final short H2 = H1+1; +- public static final short H3 = H2+1; +- public static final short H4 = H3+1; +- public static final short H5 = H4+1; +- public static final short H6 = H5+1; +- public static final short HEAD = H6+1; +- public static final short HR = HEAD+1; +- public static final short HTML = HR+1; +- public static final short I = HTML+1; +- public static final short IFRAME = I+1; +- public static final short ILAYER = IFRAME+1; +- public static final short IMG = ILAYER+1; +- public static final short INPUT = IMG+1; +- public static final short INS = INPUT+1; +- public static final short ISINDEX = INS+1; +- public static final short KBD = ISINDEX+1; +- public static final short KEYGEN = KBD+1; +- public static final short LABEL = KEYGEN+1; +- public static final short LAYER = LABEL+1; +- public static final short LEGEND = LAYER+1; +- public static final short LI = LEGEND+1; +- public static final short LINK = LI+1; +- public static final short LISTING = LINK+1; +- public static final short MAP = LISTING+1; +- public static final short MARQUEE = MAP+1; +- public static final short MENU = MARQUEE+1; +- public static final short META = MENU+1; +- public static final short MULTICOL = META+1; +- public static final short NEXTID = MULTICOL+1; +- public static final short NOBR = NEXTID+1; +- public static final short NOEMBED = NOBR+1; +- public static final short NOFRAMES = NOEMBED+1; +- public static final short NOLAYER = NOFRAMES+1; +- public static final short NOSCRIPT = NOLAYER+1; +- public static final short OBJECT = NOSCRIPT+1; +- public static final short OL = OBJECT+1; +- public static final short OPTION = OL+1; +- public static final short OPTGROUP = OPTION+1; +- public static final short P = OPTGROUP+1; +- public static final short PARAM = P+1; +- public static final short PLAINTEXT = PARAM+1; +- public static final short PRE = PLAINTEXT+1; +- public static final short Q = PRE+1; +- public static final short RB = Q+1; +- public static final short RBC = RB+1; +- public static final short RP = RBC+1; +- public static final short RT = RP+1; +- public static final short RTC = RT+1; +- public static final short RUBY = RTC+1; +- public static final short S = RUBY+1; +- public static final short SAMP = S+1; +- public static final short SCRIPT = SAMP+1; +- public static final short SELECT = SCRIPT+1; +- public static final short SMALL = SELECT+1; +- public static final short SOUND = SMALL+1; +- public static final short SPACER = SOUND+1; +- public static final short SPAN = SPACER+1; +- public static final short STRIKE = SPAN+1; +- public static final short STRONG = STRIKE+1; +- public static final short STYLE = STRONG+1; +- public static final short SUB = STYLE+1; +- public static final short SUP = SUB+1; +- public static final short TABLE = SUP+1; +- public static final short TBODY = TABLE+1; +- public static final short TD = TBODY+1; +- public static final short TEXTAREA = TD+1; +- public static final short TFOOT = TEXTAREA+1; +- public static final short TH = TFOOT+1; +- public static final short THEAD = TH+1; +- public static final short TITLE = THEAD+1; +- public static final short TR = TITLE+1; +- public static final short TT = TR+1; +- public static final short U = TT+1; +- public static final short UL = U+1; +- public static final short VAR = UL+1; +- public static final short WBR = VAR+1; +- public static final short XML = WBR+1; +- public static final short XMP = XML+1; +- public static final short UNKNOWN = XMP+1; +- +- // information +- +- /** Element information organized by first letter. */ +- protected static final Element[][] ELEMENTS_ARRAY = new Element[26][]; +- +- /** Element information as a contiguous list. */ +- protected static final ElementList ELEMENTS = new ElementList(); +- +- /** No such element. */ +- public static final Element NO_SUCH_ELEMENT = new Element(UNKNOWN, "", Element.CONTAINER, new short[]{BODY,HEAD}/*HTML*/, null); +- +- // +- // Static initializer +- // +- +- /** +- * Initializes the element information. +- *

+- * Note: +- * The getElement method requires that the HTML elements +- * are added to the list in alphabetical order. If new elements are +- * added, then they must be inserted in alphabetical order. +- */ +- static { +- // +- // +- // +- // +- // +- // +- // +- // +- +- // initialize array of element information +- ELEMENTS_ARRAY['A'-'A'] = new Element[] { +- // A - - (%inline;)* -(A) +- new Element(A, "A", Element.INLINE, BODY, new short[] {A}), +- // ABBR - - (%inline;)* +- new Element(ABBR, "ABBR", Element.INLINE, BODY, null), +- // ACRONYM - - (%inline;)* +- new Element(ACRONYM, "ACRONYM", Element.INLINE, BODY, null), +- // ADDRESS - - (%inline;)* +- new Element(ADDRESS, "ADDRESS", Element.BLOCK, BODY, null), +- // APPLET +- new Element(APPLET, "APPLET", 0, BODY, null), +- // AREA - O EMPTY +- new Element(AREA, "AREA", Element.EMPTY, MAP, null), +- }; +- ELEMENTS_ARRAY['B'-'A'] = new Element[] { +- // B - - (%inline;)* +- new Element(B, "B", Element.INLINE, BODY, null), +- // BASE - O EMPTY +- new Element(BASE, "BASE", Element.EMPTY, HEAD, null), +- // BASEFONT +- new Element(BASEFONT, "BASEFONT", 0, HEAD, null), +- // BDO - - (%inline;)* +- new Element(BDO, "BDO", Element.INLINE, BODY, null), +- // BGSOUND +- new Element(BGSOUND, "BGSOUND", Element.EMPTY, HEAD, null), +- // BIG - - (%inline;)* +- new Element(BIG, "BIG", Element.INLINE, BODY, null), +- // BLINK +- new Element(BLINK, "BLINK", Element.INLINE, BODY, null), +- // BLOCKQUOTE - - (%block;|SCRIPT)+ +- new Element(BLOCKQUOTE, "BLOCKQUOTE", Element.BLOCK, BODY, new short[]{P}), +- // BODY O O (%block;|SCRIPT)+ +(INS|DEL) +- new Element(BODY, "BODY", Element.CONTAINER, HTML, new short[]{HEAD}), +- // BR - O EMPTY +- new Element(BR, "BR", Element.EMPTY, BODY, null), +- // BUTTON - - (%flow;)* -(A|%formctrl;|FORM|FIELDSET) +- new Element(BUTTON, "BUTTON", 0, BODY, null), +- }; +- ELEMENTS_ARRAY['C'-'A'] = new Element[] { +- // CAPTION - - (%inline;)* +- new Element(CAPTION, "CAPTION", Element.INLINE, TABLE, null), +- // CENTER, +- new Element(CENTER, "CENTER", 0, BODY, null), +- // CITE - - (%inline;)* +- new Element(CITE, "CITE", Element.INLINE, BODY, null), +- // CODE - - (%inline;)* +- new Element(CODE, "CODE", Element.INLINE, BODY, null), +- // COL - O EMPTY +- new Element(COL, "COL", Element.EMPTY, TABLE, null), +- // COLGROUP - O (COL)* +- new Element(COLGROUP, "COLGROUP", 0, TABLE, new short[]{COL,COLGROUP}), +- // COMMENT +- new Element(COMMENT, "COMMENT", Element.SPECIAL, HTML, null), +- }; +- ELEMENTS_ARRAY['D'-'A'] = new Element[] { +- // DEL - - (%flow;)* +- new Element(DEL, "DEL", 0, BODY, null), +- // DFN - - (%inline;)* +- new Element(DFN, "DFN", Element.INLINE, BODY, null), +- // DIR +- new Element(DIR, "DIR", 0, BODY, null), +- // DIV - - (%flow;)* +- new Element(DIV, "DIV", Element.BLOCK, BODY, new short[]{P}), +- // DD - O (%flow;)* +- new Element(DD, "DD", 0, DL, new short[]{DT,DD}), +- // DL - - (DT|DD)+ +- new Element(DL, "DL", Element.BLOCK, BODY, null), +- // DT - O (%inline;)* +- new Element(DT, "DT", 0, DL, new short[]{DT,DD}), +- }; +- ELEMENTS_ARRAY['E'-'A'] = new Element[] { +- // EM - - (%inline;)* +- new Element(EM, "EM", Element.INLINE, BODY, null), +- // EMBED +- new Element(EMBED, "EMBED", 0, BODY, null), +- }; +- ELEMENTS_ARRAY['F'-'A'] = new Element[] { +- // FIELDSET - - (#PCDATA,LEGEND,(%flow;)*) +- new Element(FIELDSET, "FIELDSET", 0, BODY, null), +- // FONT +- new Element(FONT, "FONT", Element.CONTAINER, BODY, null), +- // FORM - - (%block;|SCRIPT)+ -(FORM) +- new Element(FORM, "FORM", Element.CONTAINER, new short[]{BODY,TD,DIV}, new short[]{BUTTON,P}), +- // FRAME - O EMPTY +- new Element(FRAME, "FRAME", Element.EMPTY, FRAMESET, null), +- // FRAMESET - - ((FRAMESET|FRAME)+ & NOFRAMES?) +- new Element(FRAMESET, "FRAMESET", 0, HTML, null), +- }; +- ELEMENTS_ARRAY['H'-'A'] = new Element[] { +- // (H1|H2|H3|H4|H5|H6) - - (%inline;)* +- new Element(H1, "H1", Element.BLOCK, new short[]{BODY,A}, new short[]{H1,H2,H3,H4,H5,H6,P}), +- new Element(H2, "H2", Element.BLOCK, new short[]{BODY,A}, new short[]{H1,H2,H3,H4,H5,H6,P}), +- new Element(H3, "H3", Element.BLOCK, new short[]{BODY,A}, new short[]{H1,H2,H3,H4,H5,H6,P}), +- new Element(H4, "H4", Element.BLOCK, new short[]{BODY,A}, new short[]{H1,H2,H3,H4,H5,H6,P}), +- new Element(H5, "H5", Element.BLOCK, new short[]{BODY,A}, new short[]{H1,H2,H3,H4,H5,H6,P}), +- new Element(H6, "H6", Element.BLOCK, new short[]{BODY,A}, new short[]{H1,H2,H3,H4,H5,H6,P}), +- // HEAD O O (%head.content;) +(%head.misc;) +- new Element(HEAD, "HEAD", 0, HTML, null), +- // HR - O EMPTY +- new Element(HR, "HR", Element.EMPTY, BODY, new short[]{P}), +- // HTML O O (%html.content;) +- new Element(HTML, "HTML", 0, null, null), +- }; +- ELEMENTS_ARRAY['I'-'A'] = new Element[] { +- // I - - (%inline;)* +- new Element(I, "I", Element.INLINE, BODY, null), +- // IFRAME +- new Element(IFRAME, "IFRAME", Element.BLOCK, BODY, null), +- // ILAYER +- new Element(ILAYER, "ILAYER", Element.BLOCK, BODY, null), +- // IMG - O EMPTY +- new Element(IMG, "IMG", Element.EMPTY, BODY, null), +- // INPUT - O EMPTY +- new Element(INPUT, "INPUT", Element.EMPTY, BODY, null), +- // INS - - (%flow;)* +- new Element(INS, "INS", 0, BODY, null), +- // ISINDEX +- new Element(ISINDEX, "ISINDEX", 0, HEAD, null), +- }; +- ELEMENTS_ARRAY['K'-'A'] = new Element[] { +- // KBD - - (%inline;)* +- new Element(KBD, "KBD", Element.INLINE, BODY, null), +- // KEYGEN +- new Element(KEYGEN, "KEYGEN", 0, BODY, null), +- }; +- ELEMENTS_ARRAY['L'-'A'] = new Element[] { +- // LABEL - - (%inline;)* -(LABEL) +- new Element(LABEL, "LABEL", 0, BODY, null), +- // LAYER +- new Element(LAYER, "LAYER", Element.BLOCK, BODY, null), +- // LEGEND - - (%inline;)* +- new Element(LEGEND, "LEGEND", Element.INLINE, FIELDSET, null), +- // LI - O (%flow;)* +- new Element(LI, "LI", 0, new short[]{BODY,UL,OL}, new short[]{LI}), +- // LINK - O EMPTY +- new Element(LINK, "LINK", Element.EMPTY, HEAD, null), +- // LISTING +- new Element(LISTING, "LISTING", 0, BODY, null), +- }; +- ELEMENTS_ARRAY['M'-'A'] = new Element[] { +- // MAP - - ((%block;) | AREA)+ +- new Element(MAP, "MAP", Element.INLINE, BODY, null), +- // MARQUEE +- new Element(MARQUEE, "MARQUEE", 0, BODY, null), +- // MENU +- new Element(MENU, "MENU", 0, BODY, null), +- // META - O EMPTY +- new Element(META, "META", Element.EMPTY, HEAD, new short[]{STYLE,TITLE}), +- // MULTICOL +- new Element(MULTICOL, "MULTICOL", 0, BODY, null), +- }; +- ELEMENTS_ARRAY['N'-'A'] = new Element[] { +- // NEXTID +- new Element(NEXTID, "NEXTID", Element.EMPTY, BODY, null), +- // NOBR +- new Element(NOBR, "NOBR", Element.INLINE, BODY, null), +- // NOEMBED +- new Element(NOEMBED, "NOEMBED", 0, BODY, null), +- // NOFRAMES - - (BODY) -(NOFRAMES) +- new Element(NOFRAMES, "NOFRAMES", 0, FRAMESET, null), +- // NOLAYER +- new Element(NOLAYER, "NOLAYER", 0, BODY, null), +- // NOSCRIPT - - (%block;)+ +- new Element(NOSCRIPT, "NOSCRIPT", 0, new short[]{BODY}, null), +- }; +- ELEMENTS_ARRAY['O'-'A'] = new Element[] { +- // OBJECT - - (PARAM | %flow;)* +- new Element(OBJECT, "OBJECT", 0, BODY, null), +- // OL - - (LI)+ +- new Element(OL, "OL", Element.BLOCK, BODY, null), +- // OPTGROUP - - (OPTION)+ +- new Element(OPTGROUP, "OPTGROUP", 0, SELECT, new short[]{OPTION}), +- // OPTION - O (#PCDATA) +- new Element(OPTION, "OPTION", 0, SELECT, new short[]{OPTION}), +- }; +- ELEMENTS_ARRAY['P'-'A'] = new Element[] { +- // P - O (%inline;)* +- new Element(P, "P", Element.CONTAINER, BODY, new short[]{P}), +- // PARAM - O EMPTY +- new Element(PARAM, "PARAM", Element.EMPTY, new short[]{OBJECT,APPLET}, null), +- // PLAINTEXT +- new Element(PLAINTEXT, "PLAINTEXT", Element.SPECIAL, BODY, null), +- // PRE - - (%inline;)* -(%pre.exclusion;) +- new Element(PRE, "PRE", 0, BODY, null), +- }; +- ELEMENTS_ARRAY['Q'-'A'] = new Element[] { +- // Q - - (%inline;)* +- new Element(Q, "Q", Element.INLINE, BODY, null), +- }; +- ELEMENTS_ARRAY['R'-'A'] = new Element[] { +- // RB +- new Element(RB, "RB", Element.INLINE, RUBY, new short[]{RB}), +- // RBC +- new Element(RBC, "RBC", 0, RUBY, null), +- // RP +- new Element(RP, "RP", Element.INLINE, RUBY, new short[]{RB}), +- // RT +- new Element(RT, "RT", Element.INLINE, RUBY, new short[]{RB,RP}), +- // RTC +- new Element(RTC, "RTC", 0, RUBY, new short[]{RBC}), +- // RUBY +- new Element(RUBY, "RUBY", 0, BODY, new short[]{RUBY}), +- }; +- ELEMENTS_ARRAY['S'-'A'] = new Element[] { +- // S +- new Element(S, "S", 0, BODY, null), +- // SAMP - - (%inline;)* +- new Element(SAMP, "SAMP", Element.INLINE, BODY, null), +- // SCRIPT - - %Script; +- new Element(SCRIPT, "SCRIPT", Element.SPECIAL, new short[]{HEAD,BODY}, null), +- // SELECT - - (OPTGROUP|OPTION)+ +- new Element(SELECT, "SELECT", Element.CONTAINER, BODY, new short[]{SELECT}), +- // SMALL - - (%inline;)* +- new Element(SMALL, "SMALL", Element.INLINE, BODY, null), +- // SOUND +- new Element(SOUND, "SOUND", Element.EMPTY, HEAD, null), +- // SPACER +- new Element(SPACER, "SPACER", Element.EMPTY, BODY, null), +- // SPAN - - (%inline;)* +- new Element(SPAN, "SPAN", Element.CONTAINER, BODY, null), +- // STRIKE +- new Element(STRIKE, "STRIKE", Element.INLINE, BODY, null), +- // STRONG - - (%inline;)* +- new Element(STRONG, "STRONG", Element.INLINE, BODY, null), +- // STYLE - - %StyleSheet; +- new Element(STYLE, "STYLE", Element.SPECIAL, new short[]{HEAD,BODY}, new short[]{STYLE,TITLE,META}), +- // SUB - - (%inline;)* +- new Element(SUB, "SUB", Element.INLINE, BODY, null), +- // SUP - - (%inline;)* +- new Element(SUP, "SUP", Element.INLINE, BODY, null), +- }; +- ELEMENTS_ARRAY['T'-'A'] = new Element[] { +- // TABLE - - (CAPTION?, (COL*|COLGROUP*), THEAD?, TFOOT?, TBODY+) +- new Element(TABLE, "TABLE", Element.BLOCK|Element.CONTAINER, BODY, null), +- // TBODY O O (TR)+ +- new Element(TBODY, "TBODY", 0, TABLE, new short[]{THEAD,TD,TH,TR,COLGROUP}), +- // TD - O (%flow;)* +- new Element(TD, "TD", Element.CONTAINER, TR, TABLE, new short[]{TD,TH}), +- // TEXTAREA - - (#PCDATA) +- new Element(TEXTAREA, "TEXTAREA", Element.SPECIAL, BODY, null), +- // TFOOT - O (TR)+ +- new Element(TFOOT, "TFOOT", 0, TABLE, new short[]{THEAD,TBODY,TD,TH,TR}), +- // TH - O (%flow;)* +- new Element(TH, "TH", Element.CONTAINER, TR, TABLE, new short[]{TD,TH}), +- // THEAD - O (TR)+ +- new Element(THEAD, "THEAD", 0, TABLE, new short[]{COLGROUP}), +- // TITLE - - (#PCDATA) -(%head.misc;) +- new Element(TITLE, "TITLE", Element.SPECIAL, new short[]{HEAD,BODY}, null), +- // TR - O (TH|TD)+ +- new Element(TR, "TR", Element.BLOCK, new short[]{TBODY, THEAD, TFOOT}, TABLE, new short[]{TD,TH,TR,COLGROUP}), +- // TT - - (%inline;)* +- new Element(TT, "TT", Element.INLINE, BODY, null), +- }; +- ELEMENTS_ARRAY['U'-'A'] = new Element[] { +- // U, +- new Element(U, "U", Element.INLINE, BODY, null), +- // UL - - (LI)+ +- new Element(UL, "UL", Element.BLOCK, BODY, null), +- }; +- ELEMENTS_ARRAY['V'-'A'] = new Element[] { +- // VAR - - (%inline;)* +- new Element(VAR, "VAR", Element.INLINE, BODY, null), +- }; +- ELEMENTS_ARRAY['W'-'A'] = new Element[] { +- // WBR +- new Element(WBR, "WBR", Element.EMPTY, BODY, null), +- }; +- ELEMENTS_ARRAY['X'-'A'] = new Element[] { +- // XML +- new Element(XML, "XML", 0, BODY, null), +- // XMP +- new Element(XMP, "XMP", Element.SPECIAL, BODY, null), +- }; +- +- // keep contiguous list of elements for lookups by code +- for (int i = 0; i < ELEMENTS_ARRAY.length; i++) { +- Element[] elements = ELEMENTS_ARRAY[i]; +- if (elements != null) { +- for (int j = 0; j < elements.length; j++) { +- Element element = elements[j]; +- ELEMENTS.addElement(element); +- } +- } +- } +- ELEMENTS.addElement(NO_SUCH_ELEMENT); +- +- // initialize cross references to parent elements +- for (int i = 0; i < ELEMENTS.size; i++) { +- Element element = ELEMENTS.data[i]; +- if (element.parentCodes != null) { +- element.parent = new Element[element.parentCodes.length]; +- for (int j = 0; j < element.parentCodes.length; j++) { +- element.parent[j] = ELEMENTS.data[element.parentCodes[j]]; +- } +- element.parentCodes = null; +- } +- } +- +- } // () +- +- // +- // Public static methods +- // +- +- /** +- * Returns the element information for the specified element code. +- * +- * @param code The element code. +- */ +- public static final Element getElement(short code) { +- return ELEMENTS.data[code]; +- } // getElement(short):Element +- +- /** +- * Returns the element information for the specified element name. +- * +- * @param ename The element name. +- */ +- public static final Element getElement(String ename) { +- return getElement(ename, NO_SUCH_ELEMENT); +- } // getElement(String):Element +- +- /** +- * Returns the element information for the specified element name. +- * +- * @param ename The element name. +- * @param element The default element to return if not found. +- */ +- public static final Element getElement(String ename, Element element) { +- +- if (ename.length() > 0) { +- int c = ename.charAt(0); +- if (c >= 'a' && c <= 'z') { +- c = 'A' + c - 'a'; +- } +- if (c >= 'A' && c <= 'Z') { +- Element[] elements = ELEMENTS_ARRAY[c - 'A']; +- if (elements != null) { +- for (int i = 0; i < elements.length; i++) { +- Element elem = elements[i]; +- if (elem.name.equalsIgnoreCase(ename)) { +- return elem; +- } +- } +- } +- } +- } +- return element; +- +- } // getElement(String):Element +- +- // +- // Classes +- // +- +- /** +- * Element information. +- * +- * @author Andy Clark +- */ +- public static class Element { +- +- // +- // Constants +- // +- +- /** Inline element. */ +- public static final int INLINE = 0x01; +- +- /** Block element. */ +- public static final int BLOCK = 0x02; +- +- /** Empty element. */ +- public static final int EMPTY = 0x04; +- +- /** Container element. */ +- public static final int CONTAINER = 0x08; +- +- /** Special element. */ +- public static final int SPECIAL = 0x10; +- +- // +- // Data +- // +- +- /** The element code. */ +- public short code; +- +- /** The element name. */ +- public String name; +- +- /** Informational flags. */ +- public int flags; +- +- /** Parent elements. */ +- public short[] parentCodes; +- +- /** Parent elements. */ +- public Element[] parent; +- +- /** The bounding element code. */ +- public short bounds; +- +- /** List of elements this element can close. */ +- public short[] closes; +- +- /** If set to true, then this element may not be nested, example: "A" **/ +- boolean nestable = true; +- +- // +- // Constructors +- // +- +- /** +- * Constructs an element object. +- * +- * @param code The element code. +- * @param name The element name. +- * @param flags Informational flags +- * @param parent Natural closing parent name. +- * @param closes List of elements this element can close. +- */ +- public Element(short code, String name, int flags, +- short parent, short[] closes) { +- this(code, name, flags, new short[]{parent}, (short)-1, closes); +- } // (short,String,int,short,short[]); +- +- /** +- * Constructs an element object. +- * +- * @param code The element code. +- * @param name The element name. +- * @param flags Informational flags +- * @param parent Natural closing parent name. +- * @param closes List of elements this element can close. +- */ +- public Element(short code, String name, int flags, +- short parent, short bounds, short[] closes) { +- this(code, name, flags, new short[]{parent}, bounds, closes); +- } // (short,String,int,short,short,short[]) +- +- /** +- * Constructs an element object. +- * +- * @param code The element code. +- * @param name The element name. +- * @param flags Informational flags +- * @param parents Natural closing parent names. +- * @param closes List of elements this element can close. +- */ +- public Element(short code, String name, int flags, +- short[] parents, short[] closes) { +- this(code, name, flags, parents, (short)-1, closes); +- } // (short,String,int,short[],short[]) +- +- /** +- * Constructs an element object. +- * +- * @param code The element code. +- * @param name The element name. +- * @param flags Informational flags +- * @param parents Natural closing parent names. +- * @param closes List of elements this element can close. +- */ +- public Element(short code, String name, int flags, +- short[] parents, short bounds, short[] closes) { +- this.code = code; +- this.name = name; +- this.flags = flags; +- this.parentCodes = parents; +- this.parent = null; +- this.bounds = bounds; +- this.closes = closes; +- if(closes != null) { +- for(int i=0;i(short,String,int,short[],short,short[]) +- +- // +- // Public methods +- // +- +- /** Returns true if this element is an inline element. */ +- public final boolean isInline() { +- return (flags & INLINE) != 0; +- } // isInline():boolean +- +- /** Returns true if this element is a block element. */ +- public final boolean isBlock() { +- return (flags & BLOCK) != 0; +- } // isBlock():boolean +- +- /** Returns true if this element is an empty element. */ +- public final boolean isEmpty() { +- return (flags & EMPTY) != 0; +- } // isEmpty():boolean +- +- /** Returns true if this element is a container element. */ +- public final boolean isContainer() { +- return (flags & CONTAINER) != 0; +- } // isContainer():boolean +- +- /** +- * Returns true if this element is special -- if its content +- * should be parsed ignoring markup. +- */ +- public final boolean isSpecial() { +- return (flags & SPECIAL) != 0; +- } // isSpecial():boolean +- +- /** +- * Returns true if this element can close the specified Element. +- * +- * @param tag The element. +- */ +- public boolean closes(short tag) { +- +- if (closes != null) { +- for (int i = 0; i < closes.length; i++) { +- if (closes[i] == tag) { +- return true; +- } +- } +- } +- return false; +- +- } // closes(short):boolean +- +- // +- // Object methods +- // +- +- /** Returns a hash code for this object. */ +- public int hashCode() { +- return name.hashCode(); +- } // hashCode():int +- +- /** Returns true if the objects are equal. */ +- public boolean equals(Object o) { +- return name.equals(o); +- } // equals(Object):boolean +- +- /** +- * Provides a simple representation to make debugging easier +- */ +- public String toString() { +- return super.toString() + "(name=" + name + ")"; +- } +- +- /** +- * Indicates if the provided element is an accepted parent of current element +- * @param element the element to test for "paternity" +- * @return true if element belongs to the {@link #parent} +- */ +- public boolean isParent(final Element element) { +- if (parent == null) +- return false; +- else { +- for (int i=0; i +- *

  • add missing parent elements; +- *
  • automatically close elements with optional end tags; and +- *
  • handle mis-matched inline element tags. +- * +- *

    +- * This component recognizes the following features: +- *

      +- *
    • http://cyberneko.org/html/features/augmentations +- *
    • http://cyberneko.org/html/features/report-errors +- *
    • http://cyberneko.org/html/features/balance-tags/document-fragment +- *
    • http://cyberneko.org/html/features/balance-tags/ignore-outside-content +- *
    +- *

    +- * This component recognizes the following properties: +- *

      +- *
    • http://cyberneko.org/html/properties/names/elems +- *
    • http://cyberneko.org/html/properties/names/attrs +- *
    • http://cyberneko.org/html/properties/error-reporter +- *
    • http://cyberneko.org/html/properties/balance-tags/current-stack +- *
    +- * +- * @see HTMLElements +- * +- * @author Andy Clark +- * @author Marc Guillemot +- * +- * @version $Id: HTMLTagBalancer.java,v 1.20 2005/02/14 04:06:22 andyc Exp $ +- */ +-public class HTMLTagBalancer +- implements XMLDocumentFilter, HTMLComponent { +- +- // +- // Constants +- // +- +- // features +- +- /** Namespaces. */ +- protected static final String NAMESPACES = "http://xml.org/sax/features/namespaces"; +- +- /** Include infoset augmentations. */ +- protected static final String AUGMENTATIONS = "http://cyberneko.org/html/features/augmentations"; +- +- /** Report errors. */ +- protected static final String REPORT_ERRORS = "http://cyberneko.org/html/features/report-errors"; +- +- /** Document fragment balancing only (deprecated). */ +- protected static final String DOCUMENT_FRAGMENT_DEPRECATED = "http://cyberneko.org/html/features/document-fragment"; +- +- /** Document fragment balancing only. */ +- protected static final String DOCUMENT_FRAGMENT = "http://cyberneko.org/html/features/balance-tags/document-fragment"; +- +- /** Ignore outside content. */ +- protected static final String IGNORE_OUTSIDE_CONTENT = "http://cyberneko.org/html/features/balance-tags/ignore-outside-content"; +- +- /** Recognized features. */ +- private static final String[] RECOGNIZED_FEATURES = { +- NAMESPACES, +- AUGMENTATIONS, +- REPORT_ERRORS, +- DOCUMENT_FRAGMENT_DEPRECATED, +- DOCUMENT_FRAGMENT, +- IGNORE_OUTSIDE_CONTENT, +- }; +- +- /** Recognized features defaults. */ +- private static final Boolean[] RECOGNIZED_FEATURES_DEFAULTS = { +- null, +- null, +- null, +- null, +- Boolean.FALSE, +- Boolean.FALSE, +- }; +- +- // properties +- +- /** Modify HTML element names: { "upper", "lower", "default" }. */ +- protected static final String NAMES_ELEMS = "http://cyberneko.org/html/properties/names/elems"; +- +- /** Modify HTML attribute names: { "upper", "lower", "default" }. */ +- protected static final String NAMES_ATTRS = "http://cyberneko.org/html/properties/names/attrs"; +- +- /** Error reporter. */ +- protected static final String ERROR_REPORTER = "http://cyberneko.org/html/properties/error-reporter"; +- +- /** +- * EXPERIMENTAL: may change in next release
    +- * Name of the property holding the stack of elements in which context a document fragment should be parsed. +- **/ +- public static final String FRAGMENT_CONTEXT_STACK = "http://cyberneko.org/html/properties/balance-tags/fragment-context-stack"; +- +- /** Recognized properties. */ +- private static final String[] RECOGNIZED_PROPERTIES = { +- NAMES_ELEMS, +- NAMES_ATTRS, +- ERROR_REPORTER, +- FRAGMENT_CONTEXT_STACK, +- }; +- +- /** Recognized properties defaults. */ +- private static final Object[] RECOGNIZED_PROPERTIES_DEFAULTS = { +- null, +- null, +- null, +- null, +- }; +- +- // modify HTML names +- +- /** Don't modify HTML names. */ +- protected static final short NAMES_NO_CHANGE = 0; +- +- /** Match HTML element names. */ +- protected static final short NAMES_MATCH = 0; +- +- /** Uppercase HTML names. */ +- protected static final short NAMES_UPPERCASE = 1; +- +- /** Lowercase HTML names. */ +- protected static final short NAMES_LOWERCASE = 2; +- +- // static vars +- +- /** Synthesized event info item. */ +- protected static final HTMLEventInfo SYNTHESIZED_ITEM = +- new HTMLEventInfo.SynthesizedItem(); +- +- // +- // Data +- // +- +- // features +- +- /** Namespaces. */ +- protected boolean fNamespaces; +- +- /** Include infoset augmentations. */ +- protected boolean fAugmentations; +- +- /** Report errors. */ +- protected boolean fReportErrors; +- +- /** Document fragment balancing only. */ +- protected boolean fDocumentFragment; +- +- /** Ignore outside content. */ +- protected boolean fIgnoreOutsideContent; +- +- // properties +- +- /** Modify HTML element names. */ +- protected short fNamesElems; +- +- /** Modify HTML attribute names. */ +- protected short fNamesAttrs; +- +- /** Error reporter. */ +- protected HTMLErrorReporter fErrorReporter; +- +- // connections +- +- /** The document source. */ +- protected XMLDocumentSource fDocumentSource; +- +- /** The document handler. */ +- protected XMLDocumentHandler fDocumentHandler; +- +- // state +- +- /** The element stack. */ +- protected final InfoStack fElementStack = new InfoStack(); +- +- /** The inline stack. */ +- protected final InfoStack fInlineStack = new InfoStack(); +- +- /** True if seen anything. Important for xml declaration. */ +- protected boolean fSeenAnything; +- +- /** True if root element has been seen. */ +- protected boolean fSeenDoctype; +- +- /** True if root element has been seen. */ +- protected boolean fSeenRootElement; +- +- /** +- * True if seen the end of the document element. In other words, +- * this variable is set to false until the end </HTML> +- * tag is seen (or synthesized). This is used to ensure that +- * extraneous events after the end of the document element do not +- * make the document stream ill-formed. +- */ +- protected boolean fSeenRootElementEnd; +- +- /** True if seen <head< element. */ +- protected boolean fSeenHeadElement; +- +- /** True if seen <body< element. */ +- protected boolean fSeenBodyElement; +- +- /** True if a form is in the stack (allow to discard opening of nested forms) */ +- protected boolean fOpenedForm; +- +- // temp vars +- +- /** A qualified name. */ +- private final QName fQName = new QName(); +- +- /** Empty attributes. */ +- private final XMLAttributes fEmptyAttrs = new XMLAttributesImpl(); +- +- /** Augmentations. */ +- private final HTMLAugmentations fInfosetAugs = new HTMLAugmentations(); +- +- protected HTMLTagBalancingListener tagBalancingListener; +- private LostText lostText_ = new LostText(); +- +- private boolean forcedStartElement_ = false; +- private boolean forcedEndElement_ = false; +- +- /** +- * Stack of elements determining the context in which a document fragment should be parsed +- */ +- private QName[] fragmentContextStack_ = null; +- private int fragmentContextStackSize_ = 0; // not 0 only when a fragment is parsed and fragmentContextStack_ is set +- +- private List/*ElementEntry*/ endElementsBuffer_ = new ArrayList(); +- +- // +- // HTMLComponent methods +- // +- +- /** Returns the default state for a feature. */ +- public Boolean getFeatureDefault(String featureId) { +- int length = RECOGNIZED_FEATURES != null ? RECOGNIZED_FEATURES.length : 0; +- for (int i = 0; i < length; i++) { +- if (RECOGNIZED_FEATURES[i].equals(featureId)) { +- return RECOGNIZED_FEATURES_DEFAULTS[i]; +- } +- } +- return null; +- } // getFeatureDefault(String):Boolean +- +- /** Returns the default state for a property. */ +- public Object getPropertyDefault(String propertyId) { +- int length = RECOGNIZED_PROPERTIES != null ? RECOGNIZED_PROPERTIES.length : 0; +- for (int i = 0; i < length; i++) { +- if (RECOGNIZED_PROPERTIES[i].equals(propertyId)) { +- return RECOGNIZED_PROPERTIES_DEFAULTS[i]; +- } +- } +- return null; +- } // getPropertyDefault(String):Object +- +- // +- // XMLComponent methods +- // +- +- /** Returns recognized features. */ +- public String[] getRecognizedFeatures() { +- return RECOGNIZED_FEATURES; +- } // getRecognizedFeatures():String[] +- +- /** Returns recognized properties. */ +- public String[] getRecognizedProperties() { +- return RECOGNIZED_PROPERTIES; +- } // getRecognizedProperties():String[] +- +- /** Resets the component. */ +- public void reset(XMLComponentManager manager) +- throws XMLConfigurationException { +- +- // get features +- fNamespaces = manager.getFeature(NAMESPACES); +- fAugmentations = manager.getFeature(AUGMENTATIONS); +- fReportErrors = manager.getFeature(REPORT_ERRORS); +- fDocumentFragment = manager.getFeature(DOCUMENT_FRAGMENT) || +- manager.getFeature(DOCUMENT_FRAGMENT_DEPRECATED); +- fIgnoreOutsideContent = manager.getFeature(IGNORE_OUTSIDE_CONTENT); +- +- // get properties +- fNamesElems = getNamesValue(String.valueOf(manager.getProperty(NAMES_ELEMS))); +- fNamesAttrs = getNamesValue(String.valueOf(manager.getProperty(NAMES_ATTRS))); +- fErrorReporter = (HTMLErrorReporter)manager.getProperty(ERROR_REPORTER); +- +- fragmentContextStack_ = (QName[]) manager.getProperty(FRAGMENT_CONTEXT_STACK); +- +- } // reset(XMLComponentManager) +- +- /** Sets a feature. */ +- public void setFeature(String featureId, boolean state) +- throws XMLConfigurationException { +- +- if (featureId.equals(AUGMENTATIONS)) { +- fAugmentations = state; +- return; +- } +- if (featureId.equals(REPORT_ERRORS)) { +- fReportErrors = state; +- return; +- } +- if (featureId.equals(IGNORE_OUTSIDE_CONTENT)) { +- fIgnoreOutsideContent = state; +- return; +- } +- +- } // setFeature(String,boolean) +- +- /** Sets a property. */ +- public void setProperty(String propertyId, Object value) +- throws XMLConfigurationException { +- +- if (propertyId.equals(NAMES_ELEMS)) { +- fNamesElems = getNamesValue(String.valueOf(value)); +- return; +- } +- +- if (propertyId.equals(NAMES_ATTRS)) { +- fNamesAttrs = getNamesValue(String.valueOf(value)); +- return; +- } +- +- } // setProperty(String,Object) +- +- // +- // XMLDocumentSource methods +- // +- +- /** Sets the document handler. */ +- public void setDocumentHandler(XMLDocumentHandler handler) { +- fDocumentHandler = handler; +- } // setDocumentHandler(XMLDocumentHandler) +- +- // @since Xerces 2.1.0 +- +- /** Returns the document handler. */ +- public XMLDocumentHandler getDocumentHandler() { +- return fDocumentHandler; +- } // getDocumentHandler():XMLDocumentHandler +- +- // +- // XMLDocumentHandler methods +- // +- +- // since Xerces-J 2.2.0 +- +- /** Start document. */ +- public void startDocument(XMLLocator locator, String encoding, +- NamespaceContext nscontext, Augmentations augs) +- throws XNIException { +- +- // reset state +- fElementStack.top = 0; +- if (fragmentContextStack_ != null) { +- fragmentContextStackSize_ = fragmentContextStack_.length; +- for (int i=0; i and have been buffered to consider outside content +- fIgnoreOutsideContent = true; // endElement should not ignore the elements passed from buffer +- consumeBufferedEndElements(); +- +- // handle empty document +- if (!fSeenRootElement && !fDocumentFragment) { +- if (fReportErrors) { +- fErrorReporter.reportError("HTML2000", null); +- } +- if (fDocumentHandler != null) { +- fSeenRootElementEnd = false; +- forceStartBody(); // will force and +- final String body = modifyName("body", fNamesElems); +- fQName.setValues(null, body, body, null); +- callEndElement(fQName, synthesizedAugs()); +- +- final String ename = modifyName("html", fNamesElems); +- fQName.setValues(null, ename, ename, null); +- callEndElement(fQName, synthesizedAugs()); +- } +- } +- +- // pop all remaining elements +- else { +- int length = fElementStack.top - fragmentContextStackSize_; +- for (int i = 0; i < length; i++) { +- Info info = fElementStack.pop(); +- if (fReportErrors) { +- String ename = info.qname.rawname; +- fErrorReporter.reportWarning("HTML2001", new Object[]{ename}); +- } +- if (fDocumentHandler != null) { +- callEndElement(info.qname, synthesizedAugs()); +- } +- } +- } +- +- // call handler +- if (fDocumentHandler != null) { +- fDocumentHandler.endDocument(augs); +- } +- +- } // endDocument(Augmentations) +- +- /** +- * Consume elements that have been buffered, like that are first consumed +- * at the end of document +- */ +- private void consumeBufferedEndElements() { +- final List toConsume = new ArrayList(endElementsBuffer_); +- endElementsBuffer_.clear(); +- for (int i=0; i (if any) has been buffered +- } +- else if (elementCode == HTMLElements.BODY) { +- // create if none was present +- if (!fSeenHeadElement) { +- final QName head = createQName("head"); +- forceStartElement(head, null, synthesizedAugs()); +- endElement(head, synthesizedAugs()); +- } +- consumeBufferedEndElements(); // (if any) has been buffered +- +- if (fSeenBodyElement) { +- notifyDiscardedStartElement(elem, attrs, augs); +- return; +- } +- fSeenBodyElement = true; +- } +- else if (elementCode == HTMLElements.FORM) { +- if (fOpenedForm) { +- notifyDiscardedStartElement(elem, attrs, augs); +- return; +- } +- fOpenedForm = true; +- } +- else if (elementCode == HTMLElements.UNKNOWN) { +- consumeBufferedEndElements(); +- } +- +- // check proper parent +- if (element.parent != null) { +- if (!fSeenRootElement && !fDocumentFragment) { +- String pname = element.parent[0].name; +- pname = modifyName(pname, fNamesElems); +- if (fReportErrors) { +- String ename = elem.rawname; +- fErrorReporter.reportWarning("HTML2002", new Object[]{ename,pname}); +- } +- final QName qname = new QName(null, pname, pname, null); +- final boolean parentCreated = forceStartElement(qname, null, synthesizedAugs()); +- if (!parentCreated) { +- if (!isForcedCreation) { +- notifyDiscardedStartElement(elem, attrs, augs); +- } +- return; +- } +- } +- else { +- HTMLElements.Element preferedParent = element.parent[0]; +- if (preferedParent.code != HTMLElements.HEAD || (!fSeenBodyElement && !fDocumentFragment)) { +- int depth = getParentDepth(element.parent, element.bounds); +- if (depth == -1) { // no parent found +- final String pname = modifyName(preferedParent.name, fNamesElems); +- final QName qname = new QName(null, pname, pname, null); +- if (fReportErrors) { +- String ename = elem.rawname; +- fErrorReporter.reportWarning("HTML2004", new Object[]{ename,pname}); +- } +- final boolean parentCreated = forceStartElement(qname, null, synthesizedAugs()); +- if (!parentCreated) { +- if (!isForcedCreation) { +- notifyDiscardedStartElement(elem, attrs, augs); +- } +- return; +- } +- } +- } +- } +- } +- +- // if block element, save immediate parent inline elements +- int depth = 0; +- if (element.flags == 0) { +- int length = fElementStack.top; +- fInlineStack.top = 0; +- for (int i = length - 1; i >= 0; i--) { +- Info info = fElementStack.data[i]; +- if (!info.element.isInline()) { +- break; +- } +- fInlineStack.push(info); +- endElement(info.qname, synthesizedAugs()); +- } +- depth = fInlineStack.top; +- } +- +- // close previous elements +- // all elements close a