2229 lines
81 KiB
Plaintext
2229 lines
81 KiB
Plaintext
diff -Nru boilerpipe-1.2.0/pom.xml boilerpipe-1.2.0-gil/pom.xml
|
||
--- boilerpipe-1.2.0/pom.xml 2013-10-11 11:54:23.418310128 +0200
|
||
+++ boilerpipe-1.2.0-gil/pom.xml 2013-10-11 11:51:51.334701196 +0200
|
||
@@ -32,4 +32,13 @@
|
||
<name>Christian Kohlschütter</name>
|
||
</developer>
|
||
</developers>
|
||
+
|
||
+ <dependencies>
|
||
+ <dependency>
|
||
+ <groupId>net.sourceforge.nekohtml</groupId>
|
||
+ <artifactId>nekohtml</artifactId>
|
||
+ <version>1.9.14</version>
|
||
+ </dependency>
|
||
+ </dependencies>
|
||
+
|
||
</project>
|
||
diff -Nru boilerpipe-1.2.0/src/main/org/cyberneko/html/HTMLElements.java boilerpipe-1.2.0-gil/src/main/org/cyberneko/html/HTMLElements.java
|
||
--- boilerpipe-1.2.0/src/main/org/cyberneko/html/HTMLElements.java 2010-12-16 11:30:06.000000000 +0100
|
||
+++ boilerpipe-1.2.0-gil/src/main/org/cyberneko/html/HTMLElements.java 1970-01-01 01:00:00.000000000 +0100
|
||
@@ -1,794 +0,0 @@
|
||
-/*
|
||
- * Copyright 2002-2009 Andy Clark, Marc Guillemot
|
||
- *
|
||
- * Licensed under the Apache License, Version 2.0 (the "License");
|
||
- * you may not use this file except in compliance with the License.
|
||
- * You may obtain a copy of the License at
|
||
- *
|
||
- * http://www.apache.org/licenses/LICENSE-2.0
|
||
- *
|
||
- * Unless required by applicable law or agreed to in writing, software
|
||
- * distributed under the License is distributed on an "AS IS" BASIS,
|
||
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
- * See the License for the specific language governing permissions and
|
||
- * limitations under the License.
|
||
- */
|
||
-
|
||
-package org.cyberneko.html;
|
||
-
|
||
-/**
|
||
- * Collection of HTML element information.
|
||
- *
|
||
- * @author Andy Clark
|
||
- * @author Ahmed Ashour
|
||
- * @author Marc Guillemot
|
||
- *
|
||
- * @version $Id: HTMLElements.java,v 1.12 2005/02/14 07:16:59 andyc Exp $
|
||
- */
|
||
-public class HTMLElements {
|
||
-
|
||
- //
|
||
- // Constants
|
||
- //
|
||
-
|
||
- // element codes
|
||
-
|
||
- // NOTE: The element codes *must* start with 0 and increment in
|
||
- // sequence. The parent and closes references depends on
|
||
- // this assumption. -Ac
|
||
-
|
||
- public static final short A = 0;
|
||
- public static final short ABBR = A+1;
|
||
- public static final short ACRONYM = ABBR+1;
|
||
- public static final short ADDRESS = ACRONYM+1;
|
||
- public static final short APPLET = ADDRESS+1;
|
||
- public static final short AREA = APPLET+1;
|
||
- public static final short B = AREA+1;
|
||
- public static final short BASE = B+1;
|
||
- public static final short BASEFONT = BASE+1;
|
||
- public static final short BDO = BASEFONT+1;
|
||
- public static final short BGSOUND = BDO+1;
|
||
- public static final short BIG = BGSOUND+1;
|
||
- public static final short BLINK = BIG+1;
|
||
- public static final short BLOCKQUOTE = BLINK+1;
|
||
- public static final short BODY = BLOCKQUOTE+1;
|
||
- public static final short BR = BODY+1;
|
||
- public static final short BUTTON = BR+1;
|
||
- public static final short CAPTION = BUTTON+1;
|
||
- public static final short CENTER = CAPTION+1;
|
||
- public static final short CITE = CENTER+1;
|
||
- public static final short CODE = CITE+1;
|
||
- public static final short COL = CODE+1;
|
||
- public static final short COLGROUP = COL+1;
|
||
- public static final short COMMENT = COLGROUP+1;
|
||
- public static final short DEL = COMMENT+1;
|
||
- public static final short DFN = DEL+1;
|
||
- public static final short DIR = DFN+1;
|
||
- public static final short DIV = DIR+1;
|
||
- public static final short DD = DIV+1;
|
||
- public static final short DL = DD+1;
|
||
- public static final short DT = DL+1;
|
||
- public static final short EM = DT+1;
|
||
- public static final short EMBED = EM+1;
|
||
- public static final short FIELDSET = EMBED+1;
|
||
- public static final short FONT = FIELDSET+1;
|
||
- public static final short FORM = FONT+1;
|
||
- public static final short FRAME = FORM+1;
|
||
- public static final short FRAMESET = FRAME+1;
|
||
- public static final short H1 = FRAMESET+1;
|
||
- public static final short H2 = H1+1;
|
||
- public static final short H3 = H2+1;
|
||
- public static final short H4 = H3+1;
|
||
- public static final short H5 = H4+1;
|
||
- public static final short H6 = H5+1;
|
||
- public static final short HEAD = H6+1;
|
||
- public static final short HR = HEAD+1;
|
||
- public static final short HTML = HR+1;
|
||
- public static final short I = HTML+1;
|
||
- public static final short IFRAME = I+1;
|
||
- public static final short ILAYER = IFRAME+1;
|
||
- public static final short IMG = ILAYER+1;
|
||
- public static final short INPUT = IMG+1;
|
||
- public static final short INS = INPUT+1;
|
||
- public static final short ISINDEX = INS+1;
|
||
- public static final short KBD = ISINDEX+1;
|
||
- public static final short KEYGEN = KBD+1;
|
||
- public static final short LABEL = KEYGEN+1;
|
||
- public static final short LAYER = LABEL+1;
|
||
- public static final short LEGEND = LAYER+1;
|
||
- public static final short LI = LEGEND+1;
|
||
- public static final short LINK = LI+1;
|
||
- public static final short LISTING = LINK+1;
|
||
- public static final short MAP = LISTING+1;
|
||
- public static final short MARQUEE = MAP+1;
|
||
- public static final short MENU = MARQUEE+1;
|
||
- public static final short META = MENU+1;
|
||
- public static final short MULTICOL = META+1;
|
||
- public static final short NEXTID = MULTICOL+1;
|
||
- public static final short NOBR = NEXTID+1;
|
||
- public static final short NOEMBED = NOBR+1;
|
||
- public static final short NOFRAMES = NOEMBED+1;
|
||
- public static final short NOLAYER = NOFRAMES+1;
|
||
- public static final short NOSCRIPT = NOLAYER+1;
|
||
- public static final short OBJECT = NOSCRIPT+1;
|
||
- public static final short OL = OBJECT+1;
|
||
- public static final short OPTION = OL+1;
|
||
- public static final short OPTGROUP = OPTION+1;
|
||
- public static final short P = OPTGROUP+1;
|
||
- public static final short PARAM = P+1;
|
||
- public static final short PLAINTEXT = PARAM+1;
|
||
- public static final short PRE = PLAINTEXT+1;
|
||
- public static final short Q = PRE+1;
|
||
- public static final short RB = Q+1;
|
||
- public static final short RBC = RB+1;
|
||
- public static final short RP = RBC+1;
|
||
- public static final short RT = RP+1;
|
||
- public static final short RTC = RT+1;
|
||
- public static final short RUBY = RTC+1;
|
||
- public static final short S = RUBY+1;
|
||
- public static final short SAMP = S+1;
|
||
- public static final short SCRIPT = SAMP+1;
|
||
- public static final short SELECT = SCRIPT+1;
|
||
- public static final short SMALL = SELECT+1;
|
||
- public static final short SOUND = SMALL+1;
|
||
- public static final short SPACER = SOUND+1;
|
||
- public static final short SPAN = SPACER+1;
|
||
- public static final short STRIKE = SPAN+1;
|
||
- public static final short STRONG = STRIKE+1;
|
||
- public static final short STYLE = STRONG+1;
|
||
- public static final short SUB = STYLE+1;
|
||
- public static final short SUP = SUB+1;
|
||
- public static final short TABLE = SUP+1;
|
||
- public static final short TBODY = TABLE+1;
|
||
- public static final short TD = TBODY+1;
|
||
- public static final short TEXTAREA = TD+1;
|
||
- public static final short TFOOT = TEXTAREA+1;
|
||
- public static final short TH = TFOOT+1;
|
||
- public static final short THEAD = TH+1;
|
||
- public static final short TITLE = THEAD+1;
|
||
- public static final short TR = TITLE+1;
|
||
- public static final short TT = TR+1;
|
||
- public static final short U = TT+1;
|
||
- public static final short UL = U+1;
|
||
- public static final short VAR = UL+1;
|
||
- public static final short WBR = VAR+1;
|
||
- public static final short XML = WBR+1;
|
||
- public static final short XMP = XML+1;
|
||
- public static final short UNKNOWN = XMP+1;
|
||
-
|
||
- // information
|
||
-
|
||
- /** Element information organized by first letter. */
|
||
- protected static final Element[][] ELEMENTS_ARRAY = new Element[26][];
|
||
-
|
||
- /** Element information as a contiguous list. */
|
||
- protected static final ElementList ELEMENTS = new ElementList();
|
||
-
|
||
- /** No such element. */
|
||
- public static final Element NO_SUCH_ELEMENT = new Element(UNKNOWN, "", Element.CONTAINER, new short[]{BODY,HEAD}/*HTML*/, null);
|
||
-
|
||
- //
|
||
- // Static initializer
|
||
- //
|
||
-
|
||
- /**
|
||
- * Initializes the element information.
|
||
- * <p>
|
||
- * <strong>Note:</strong>
|
||
- * The <code>getElement</code> method requires that the HTML elements
|
||
- * are added to the list in alphabetical order. If new elements are
|
||
- * added, then they <em>must</em> be inserted in alphabetical order.
|
||
- */
|
||
- static {
|
||
- // <!ENTITY % heading "H1|H2|H3|H4|H5|H6">
|
||
- // <!ENTITY % fontstyle "TT | I | B | BIG | SMALL">
|
||
- // <!ENTITY % phrase "EM | STRONG | DFN | CODE | SAMP | KBD | VAR | CITE | ABBR | ACRONYM" >
|
||
- // <!ENTITY % special "A | IMG | OBJECT | BR | SCRIPT | MAP | Q | SUB | SUP | SPAN | BDO">
|
||
- // <!ENTITY % formctrl "INPUT | SELECT | TEXTAREA | LABEL | BUTTON">
|
||
- // <!ENTITY % inline "#PCDATA | %fontstyle; | %phrase; | %special; | %formctrl;">
|
||
- // <!ENTITY % block "P | %heading; | %list; | %preformatted; | DL | DIV | NOSCRIPT | BLOCKQUOTE | FORM | HR | TABLE | FIELDSET | ADDRESS">
|
||
- // <!ENTITY % flow "%block; | %inline;">
|
||
-
|
||
- // initialize array of element information
|
||
- ELEMENTS_ARRAY['A'-'A'] = new Element[] {
|
||
- // A - - (%inline;)* -(A)
|
||
- new Element(A, "A", Element.INLINE, BODY, new short[] {A}),
|
||
- // ABBR - - (%inline;)*
|
||
- new Element(ABBR, "ABBR", Element.INLINE, BODY, null),
|
||
- // ACRONYM - - (%inline;)*
|
||
- new Element(ACRONYM, "ACRONYM", Element.INLINE, BODY, null),
|
||
- // ADDRESS - - (%inline;)*
|
||
- new Element(ADDRESS, "ADDRESS", Element.BLOCK, BODY, null),
|
||
- // APPLET
|
||
- new Element(APPLET, "APPLET", 0, BODY, null),
|
||
- // AREA - O EMPTY
|
||
- new Element(AREA, "AREA", Element.EMPTY, MAP, null),
|
||
- };
|
||
- ELEMENTS_ARRAY['B'-'A'] = new Element[] {
|
||
- // B - - (%inline;)*
|
||
- new Element(B, "B", Element.INLINE, BODY, null),
|
||
- // BASE - O EMPTY
|
||
- new Element(BASE, "BASE", Element.EMPTY, HEAD, null),
|
||
- // BASEFONT
|
||
- new Element(BASEFONT, "BASEFONT", 0, HEAD, null),
|
||
- // BDO - - (%inline;)*
|
||
- new Element(BDO, "BDO", Element.INLINE, BODY, null),
|
||
- // BGSOUND
|
||
- new Element(BGSOUND, "BGSOUND", Element.EMPTY, HEAD, null),
|
||
- // BIG - - (%inline;)*
|
||
- new Element(BIG, "BIG", Element.INLINE, BODY, null),
|
||
- // BLINK
|
||
- new Element(BLINK, "BLINK", Element.INLINE, BODY, null),
|
||
- // BLOCKQUOTE - - (%block;|SCRIPT)+
|
||
- new Element(BLOCKQUOTE, "BLOCKQUOTE", Element.BLOCK, BODY, new short[]{P}),
|
||
- // BODY O O (%block;|SCRIPT)+ +(INS|DEL)
|
||
- new Element(BODY, "BODY", Element.CONTAINER, HTML, new short[]{HEAD}),
|
||
- // BR - O EMPTY
|
||
- new Element(BR, "BR", Element.EMPTY, BODY, null),
|
||
- // BUTTON - - (%flow;)* -(A|%formctrl;|FORM|FIELDSET)
|
||
- new Element(BUTTON, "BUTTON", 0, BODY, null),
|
||
- };
|
||
- ELEMENTS_ARRAY['C'-'A'] = new Element[] {
|
||
- // CAPTION - - (%inline;)*
|
||
- new Element(CAPTION, "CAPTION", Element.INLINE, TABLE, null),
|
||
- // CENTER,
|
||
- new Element(CENTER, "CENTER", 0, BODY, null),
|
||
- // CITE - - (%inline;)*
|
||
- new Element(CITE, "CITE", Element.INLINE, BODY, null),
|
||
- // CODE - - (%inline;)*
|
||
- new Element(CODE, "CODE", Element.INLINE, BODY, null),
|
||
- // COL - O EMPTY
|
||
- new Element(COL, "COL", Element.EMPTY, TABLE, null),
|
||
- // COLGROUP - O (COL)*
|
||
- new Element(COLGROUP, "COLGROUP", 0, TABLE, new short[]{COL,COLGROUP}),
|
||
- // COMMENT
|
||
- new Element(COMMENT, "COMMENT", Element.SPECIAL, HTML, null),
|
||
- };
|
||
- ELEMENTS_ARRAY['D'-'A'] = new Element[] {
|
||
- // DEL - - (%flow;)*
|
||
- new Element(DEL, "DEL", 0, BODY, null),
|
||
- // DFN - - (%inline;)*
|
||
- new Element(DFN, "DFN", Element.INLINE, BODY, null),
|
||
- // DIR
|
||
- new Element(DIR, "DIR", 0, BODY, null),
|
||
- // DIV - - (%flow;)*
|
||
- new Element(DIV, "DIV", Element.BLOCK, BODY, new short[]{P}),
|
||
- // DD - O (%flow;)*
|
||
- new Element(DD, "DD", 0, DL, new short[]{DT,DD}),
|
||
- // DL - - (DT|DD)+
|
||
- new Element(DL, "DL", Element.BLOCK, BODY, null),
|
||
- // DT - O (%inline;)*
|
||
- new Element(DT, "DT", 0, DL, new short[]{DT,DD}),
|
||
- };
|
||
- ELEMENTS_ARRAY['E'-'A'] = new Element[] {
|
||
- // EM - - (%inline;)*
|
||
- new Element(EM, "EM", Element.INLINE, BODY, null),
|
||
- // EMBED
|
||
- new Element(EMBED, "EMBED", 0, BODY, null),
|
||
- };
|
||
- ELEMENTS_ARRAY['F'-'A'] = new Element[] {
|
||
- // FIELDSET - - (#PCDATA,LEGEND,(%flow;)*)
|
||
- new Element(FIELDSET, "FIELDSET", 0, BODY, null),
|
||
- // FONT
|
||
- new Element(FONT, "FONT", Element.CONTAINER, BODY, null),
|
||
- // FORM - - (%block;|SCRIPT)+ -(FORM)
|
||
- new Element(FORM, "FORM", Element.CONTAINER, new short[]{BODY,TD,DIV}, new short[]{BUTTON,P}),
|
||
- // FRAME - O EMPTY
|
||
- new Element(FRAME, "FRAME", Element.EMPTY, FRAMESET, null),
|
||
- // FRAMESET - - ((FRAMESET|FRAME)+ & NOFRAMES?)
|
||
- new Element(FRAMESET, "FRAMESET", 0, HTML, null),
|
||
- };
|
||
- ELEMENTS_ARRAY['H'-'A'] = new Element[] {
|
||
- // (H1|H2|H3|H4|H5|H6) - - (%inline;)*
|
||
- new Element(H1, "H1", Element.BLOCK, new short[]{BODY,A}, new short[]{H1,H2,H3,H4,H5,H6,P}),
|
||
- new Element(H2, "H2", Element.BLOCK, new short[]{BODY,A}, new short[]{H1,H2,H3,H4,H5,H6,P}),
|
||
- new Element(H3, "H3", Element.BLOCK, new short[]{BODY,A}, new short[]{H1,H2,H3,H4,H5,H6,P}),
|
||
- new Element(H4, "H4", Element.BLOCK, new short[]{BODY,A}, new short[]{H1,H2,H3,H4,H5,H6,P}),
|
||
- new Element(H5, "H5", Element.BLOCK, new short[]{BODY,A}, new short[]{H1,H2,H3,H4,H5,H6,P}),
|
||
- new Element(H6, "H6", Element.BLOCK, new short[]{BODY,A}, new short[]{H1,H2,H3,H4,H5,H6,P}),
|
||
- // HEAD O O (%head.content;) +(%head.misc;)
|
||
- new Element(HEAD, "HEAD", 0, HTML, null),
|
||
- // HR - O EMPTY
|
||
- new Element(HR, "HR", Element.EMPTY, BODY, new short[]{P}),
|
||
- // HTML O O (%html.content;)
|
||
- new Element(HTML, "HTML", 0, null, null),
|
||
- };
|
||
- ELEMENTS_ARRAY['I'-'A'] = new Element[] {
|
||
- // I - - (%inline;)*
|
||
- new Element(I, "I", Element.INLINE, BODY, null),
|
||
- // IFRAME
|
||
- new Element(IFRAME, "IFRAME", Element.BLOCK, BODY, null),
|
||
- // ILAYER
|
||
- new Element(ILAYER, "ILAYER", Element.BLOCK, BODY, null),
|
||
- // IMG - O EMPTY
|
||
- new Element(IMG, "IMG", Element.EMPTY, BODY, null),
|
||
- // INPUT - O EMPTY
|
||
- new Element(INPUT, "INPUT", Element.EMPTY, BODY, null),
|
||
- // INS - - (%flow;)*
|
||
- new Element(INS, "INS", 0, BODY, null),
|
||
- // ISINDEX
|
||
- new Element(ISINDEX, "ISINDEX", 0, HEAD, null),
|
||
- };
|
||
- ELEMENTS_ARRAY['K'-'A'] = new Element[] {
|
||
- // KBD - - (%inline;)*
|
||
- new Element(KBD, "KBD", Element.INLINE, BODY, null),
|
||
- // KEYGEN
|
||
- new Element(KEYGEN, "KEYGEN", 0, BODY, null),
|
||
- };
|
||
- ELEMENTS_ARRAY['L'-'A'] = new Element[] {
|
||
- // LABEL - - (%inline;)* -(LABEL)
|
||
- new Element(LABEL, "LABEL", 0, BODY, null),
|
||
- // LAYER
|
||
- new Element(LAYER, "LAYER", Element.BLOCK, BODY, null),
|
||
- // LEGEND - - (%inline;)*
|
||
- new Element(LEGEND, "LEGEND", Element.INLINE, FIELDSET, null),
|
||
- // LI - O (%flow;)*
|
||
- new Element(LI, "LI", 0, new short[]{BODY,UL,OL}, new short[]{LI}),
|
||
- // LINK - O EMPTY
|
||
- new Element(LINK, "LINK", Element.EMPTY, HEAD, null),
|
||
- // LISTING
|
||
- new Element(LISTING, "LISTING", 0, BODY, null),
|
||
- };
|
||
- ELEMENTS_ARRAY['M'-'A'] = new Element[] {
|
||
- // MAP - - ((%block;) | AREA)+
|
||
- new Element(MAP, "MAP", Element.INLINE, BODY, null),
|
||
- // MARQUEE
|
||
- new Element(MARQUEE, "MARQUEE", 0, BODY, null),
|
||
- // MENU
|
||
- new Element(MENU, "MENU", 0, BODY, null),
|
||
- // META - O EMPTY
|
||
- new Element(META, "META", Element.EMPTY, HEAD, new short[]{STYLE,TITLE}),
|
||
- // MULTICOL
|
||
- new Element(MULTICOL, "MULTICOL", 0, BODY, null),
|
||
- };
|
||
- ELEMENTS_ARRAY['N'-'A'] = new Element[] {
|
||
- // NEXTID
|
||
- new Element(NEXTID, "NEXTID", Element.EMPTY, BODY, null),
|
||
- // NOBR
|
||
- new Element(NOBR, "NOBR", Element.INLINE, BODY, null),
|
||
- // NOEMBED
|
||
- new Element(NOEMBED, "NOEMBED", 0, BODY, null),
|
||
- // NOFRAMES - - (BODY) -(NOFRAMES)
|
||
- new Element(NOFRAMES, "NOFRAMES", 0, FRAMESET, null),
|
||
- // NOLAYER
|
||
- new Element(NOLAYER, "NOLAYER", 0, BODY, null),
|
||
- // NOSCRIPT - - (%block;)+
|
||
- new Element(NOSCRIPT, "NOSCRIPT", 0, new short[]{BODY}, null),
|
||
- };
|
||
- ELEMENTS_ARRAY['O'-'A'] = new Element[] {
|
||
- // OBJECT - - (PARAM | %flow;)*
|
||
- new Element(OBJECT, "OBJECT", 0, BODY, null),
|
||
- // OL - - (LI)+
|
||
- new Element(OL, "OL", Element.BLOCK, BODY, null),
|
||
- // OPTGROUP - - (OPTION)+
|
||
- new Element(OPTGROUP, "OPTGROUP", 0, SELECT, new short[]{OPTION}),
|
||
- // OPTION - O (#PCDATA)
|
||
- new Element(OPTION, "OPTION", 0, SELECT, new short[]{OPTION}),
|
||
- };
|
||
- ELEMENTS_ARRAY['P'-'A'] = new Element[] {
|
||
- // P - O (%inline;)*
|
||
- new Element(P, "P", Element.CONTAINER, BODY, new short[]{P}),
|
||
- // PARAM - O EMPTY
|
||
- new Element(PARAM, "PARAM", Element.EMPTY, new short[]{OBJECT,APPLET}, null),
|
||
- // PLAINTEXT
|
||
- new Element(PLAINTEXT, "PLAINTEXT", Element.SPECIAL, BODY, null),
|
||
- // PRE - - (%inline;)* -(%pre.exclusion;)
|
||
- new Element(PRE, "PRE", 0, BODY, null),
|
||
- };
|
||
- ELEMENTS_ARRAY['Q'-'A'] = new Element[] {
|
||
- // Q - - (%inline;)*
|
||
- new Element(Q, "Q", Element.INLINE, BODY, null),
|
||
- };
|
||
- ELEMENTS_ARRAY['R'-'A'] = new Element[] {
|
||
- // RB
|
||
- new Element(RB, "RB", Element.INLINE, RUBY, new short[]{RB}),
|
||
- // RBC
|
||
- new Element(RBC, "RBC", 0, RUBY, null),
|
||
- // RP
|
||
- new Element(RP, "RP", Element.INLINE, RUBY, new short[]{RB}),
|
||
- // RT
|
||
- new Element(RT, "RT", Element.INLINE, RUBY, new short[]{RB,RP}),
|
||
- // RTC
|
||
- new Element(RTC, "RTC", 0, RUBY, new short[]{RBC}),
|
||
- // RUBY
|
||
- new Element(RUBY, "RUBY", 0, BODY, new short[]{RUBY}),
|
||
- };
|
||
- ELEMENTS_ARRAY['S'-'A'] = new Element[] {
|
||
- // S
|
||
- new Element(S, "S", 0, BODY, null),
|
||
- // SAMP - - (%inline;)*
|
||
- new Element(SAMP, "SAMP", Element.INLINE, BODY, null),
|
||
- // SCRIPT - - %Script;
|
||
- new Element(SCRIPT, "SCRIPT", Element.SPECIAL, new short[]{HEAD,BODY}, null),
|
||
- // SELECT - - (OPTGROUP|OPTION)+
|
||
- new Element(SELECT, "SELECT", Element.CONTAINER, BODY, new short[]{SELECT}),
|
||
- // SMALL - - (%inline;)*
|
||
- new Element(SMALL, "SMALL", Element.INLINE, BODY, null),
|
||
- // SOUND
|
||
- new Element(SOUND, "SOUND", Element.EMPTY, HEAD, null),
|
||
- // SPACER
|
||
- new Element(SPACER, "SPACER", Element.EMPTY, BODY, null),
|
||
- // SPAN - - (%inline;)*
|
||
- new Element(SPAN, "SPAN", Element.CONTAINER, BODY, null),
|
||
- // STRIKE
|
||
- new Element(STRIKE, "STRIKE", Element.INLINE, BODY, null),
|
||
- // STRONG - - (%inline;)*
|
||
- new Element(STRONG, "STRONG", Element.INLINE, BODY, null),
|
||
- // STYLE - - %StyleSheet;
|
||
- new Element(STYLE, "STYLE", Element.SPECIAL, new short[]{HEAD,BODY}, new short[]{STYLE,TITLE,META}),
|
||
- // SUB - - (%inline;)*
|
||
- new Element(SUB, "SUB", Element.INLINE, BODY, null),
|
||
- // SUP - - (%inline;)*
|
||
- new Element(SUP, "SUP", Element.INLINE, BODY, null),
|
||
- };
|
||
- ELEMENTS_ARRAY['T'-'A'] = new Element[] {
|
||
- // TABLE - - (CAPTION?, (COL*|COLGROUP*), THEAD?, TFOOT?, TBODY+)
|
||
- new Element(TABLE, "TABLE", Element.BLOCK|Element.CONTAINER, BODY, null),
|
||
- // TBODY O O (TR)+
|
||
- new Element(TBODY, "TBODY", 0, TABLE, new short[]{THEAD,TD,TH,TR,COLGROUP}),
|
||
- // TD - O (%flow;)*
|
||
- new Element(TD, "TD", Element.CONTAINER, TR, TABLE, new short[]{TD,TH}),
|
||
- // TEXTAREA - - (#PCDATA)
|
||
- new Element(TEXTAREA, "TEXTAREA", Element.SPECIAL, BODY, null),
|
||
- // TFOOT - O (TR)+
|
||
- new Element(TFOOT, "TFOOT", 0, TABLE, new short[]{THEAD,TBODY,TD,TH,TR}),
|
||
- // TH - O (%flow;)*
|
||
- new Element(TH, "TH", Element.CONTAINER, TR, TABLE, new short[]{TD,TH}),
|
||
- // THEAD - O (TR)+
|
||
- new Element(THEAD, "THEAD", 0, TABLE, new short[]{COLGROUP}),
|
||
- // TITLE - - (#PCDATA) -(%head.misc;)
|
||
- new Element(TITLE, "TITLE", Element.SPECIAL, new short[]{HEAD,BODY}, null),
|
||
- // TR - O (TH|TD)+
|
||
- new Element(TR, "TR", Element.BLOCK, new short[]{TBODY, THEAD, TFOOT}, TABLE, new short[]{TD,TH,TR,COLGROUP}),
|
||
- // TT - - (%inline;)*
|
||
- new Element(TT, "TT", Element.INLINE, BODY, null),
|
||
- };
|
||
- ELEMENTS_ARRAY['U'-'A'] = new Element[] {
|
||
- // U,
|
||
- new Element(U, "U", Element.INLINE, BODY, null),
|
||
- // UL - - (LI)+
|
||
- new Element(UL, "UL", Element.BLOCK, BODY, null),
|
||
- };
|
||
- ELEMENTS_ARRAY['V'-'A'] = new Element[] {
|
||
- // VAR - - (%inline;)*
|
||
- new Element(VAR, "VAR", Element.INLINE, BODY, null),
|
||
- };
|
||
- ELEMENTS_ARRAY['W'-'A'] = new Element[] {
|
||
- // WBR
|
||
- new Element(WBR, "WBR", Element.EMPTY, BODY, null),
|
||
- };
|
||
- ELEMENTS_ARRAY['X'-'A'] = new Element[] {
|
||
- // XML
|
||
- new Element(XML, "XML", 0, BODY, null),
|
||
- // XMP
|
||
- new Element(XMP, "XMP", Element.SPECIAL, BODY, null),
|
||
- };
|
||
-
|
||
- // keep contiguous list of elements for lookups by code
|
||
- for (int i = 0; i < ELEMENTS_ARRAY.length; i++) {
|
||
- Element[] elements = ELEMENTS_ARRAY[i];
|
||
- if (elements != null) {
|
||
- for (int j = 0; j < elements.length; j++) {
|
||
- Element element = elements[j];
|
||
- ELEMENTS.addElement(element);
|
||
- }
|
||
- }
|
||
- }
|
||
- ELEMENTS.addElement(NO_SUCH_ELEMENT);
|
||
-
|
||
- // initialize cross references to parent elements
|
||
- for (int i = 0; i < ELEMENTS.size; i++) {
|
||
- Element element = ELEMENTS.data[i];
|
||
- if (element.parentCodes != null) {
|
||
- element.parent = new Element[element.parentCodes.length];
|
||
- for (int j = 0; j < element.parentCodes.length; j++) {
|
||
- element.parent[j] = ELEMENTS.data[element.parentCodes[j]];
|
||
- }
|
||
- element.parentCodes = null;
|
||
- }
|
||
- }
|
||
-
|
||
- } // <clinit>()
|
||
-
|
||
- //
|
||
- // Public static methods
|
||
- //
|
||
-
|
||
- /**
|
||
- * Returns the element information for the specified element code.
|
||
- *
|
||
- * @param code The element code.
|
||
- */
|
||
- public static final Element getElement(short code) {
|
||
- return ELEMENTS.data[code];
|
||
- } // getElement(short):Element
|
||
-
|
||
- /**
|
||
- * Returns the element information for the specified element name.
|
||
- *
|
||
- * @param ename The element name.
|
||
- */
|
||
- public static final Element getElement(String ename) {
|
||
- return getElement(ename, NO_SUCH_ELEMENT);
|
||
- } // getElement(String):Element
|
||
-
|
||
- /**
|
||
- * Returns the element information for the specified element name.
|
||
- *
|
||
- * @param ename The element name.
|
||
- * @param element The default element to return if not found.
|
||
- */
|
||
- public static final Element getElement(String ename, Element element) {
|
||
-
|
||
- if (ename.length() > 0) {
|
||
- int c = ename.charAt(0);
|
||
- if (c >= 'a' && c <= 'z') {
|
||
- c = 'A' + c - 'a';
|
||
- }
|
||
- if (c >= 'A' && c <= 'Z') {
|
||
- Element[] elements = ELEMENTS_ARRAY[c - 'A'];
|
||
- if (elements != null) {
|
||
- for (int i = 0; i < elements.length; i++) {
|
||
- Element elem = elements[i];
|
||
- if (elem.name.equalsIgnoreCase(ename)) {
|
||
- return elem;
|
||
- }
|
||
- }
|
||
- }
|
||
- }
|
||
- }
|
||
- return element;
|
||
-
|
||
- } // getElement(String):Element
|
||
-
|
||
- //
|
||
- // Classes
|
||
- //
|
||
-
|
||
- /**
|
||
- * Element information.
|
||
- *
|
||
- * @author Andy Clark
|
||
- */
|
||
- public static class Element {
|
||
-
|
||
- //
|
||
- // Constants
|
||
- //
|
||
-
|
||
- /** Inline element. */
|
||
- public static final int INLINE = 0x01;
|
||
-
|
||
- /** Block element. */
|
||
- public static final int BLOCK = 0x02;
|
||
-
|
||
- /** Empty element. */
|
||
- public static final int EMPTY = 0x04;
|
||
-
|
||
- /** Container element. */
|
||
- public static final int CONTAINER = 0x08;
|
||
-
|
||
- /** Special element. */
|
||
- public static final int SPECIAL = 0x10;
|
||
-
|
||
- //
|
||
- // Data
|
||
- //
|
||
-
|
||
- /** The element code. */
|
||
- public short code;
|
||
-
|
||
- /** The element name. */
|
||
- public String name;
|
||
-
|
||
- /** Informational flags. */
|
||
- public int flags;
|
||
-
|
||
- /** Parent elements. */
|
||
- public short[] parentCodes;
|
||
-
|
||
- /** Parent elements. */
|
||
- public Element[] parent;
|
||
-
|
||
- /** The bounding element code. */
|
||
- public short bounds;
|
||
-
|
||
- /** List of elements this element can close. */
|
||
- public short[] closes;
|
||
-
|
||
- /** If set to true, then this element may not be nested, example: "A" **/
|
||
- boolean nestable = true;
|
||
-
|
||
- //
|
||
- // Constructors
|
||
- //
|
||
-
|
||
- /**
|
||
- * Constructs an element object.
|
||
- *
|
||
- * @param code The element code.
|
||
- * @param name The element name.
|
||
- * @param flags Informational flags
|
||
- * @param parent Natural closing parent name.
|
||
- * @param closes List of elements this element can close.
|
||
- */
|
||
- public Element(short code, String name, int flags,
|
||
- short parent, short[] closes) {
|
||
- this(code, name, flags, new short[]{parent}, (short)-1, closes);
|
||
- } // <init>(short,String,int,short,short[]);
|
||
-
|
||
- /**
|
||
- * Constructs an element object.
|
||
- *
|
||
- * @param code The element code.
|
||
- * @param name The element name.
|
||
- * @param flags Informational flags
|
||
- * @param parent Natural closing parent name.
|
||
- * @param closes List of elements this element can close.
|
||
- */
|
||
- public Element(short code, String name, int flags,
|
||
- short parent, short bounds, short[] closes) {
|
||
- this(code, name, flags, new short[]{parent}, bounds, closes);
|
||
- } // <init>(short,String,int,short,short,short[])
|
||
-
|
||
- /**
|
||
- * Constructs an element object.
|
||
- *
|
||
- * @param code The element code.
|
||
- * @param name The element name.
|
||
- * @param flags Informational flags
|
||
- * @param parents Natural closing parent names.
|
||
- * @param closes List of elements this element can close.
|
||
- */
|
||
- public Element(short code, String name, int flags,
|
||
- short[] parents, short[] closes) {
|
||
- this(code, name, flags, parents, (short)-1, closes);
|
||
- } // <init>(short,String,int,short[],short[])
|
||
-
|
||
- /**
|
||
- * Constructs an element object.
|
||
- *
|
||
- * @param code The element code.
|
||
- * @param name The element name.
|
||
- * @param flags Informational flags
|
||
- * @param parents Natural closing parent names.
|
||
- * @param closes List of elements this element can close.
|
||
- */
|
||
- public Element(short code, String name, int flags,
|
||
- short[] parents, short bounds, short[] closes) {
|
||
- this.code = code;
|
||
- this.name = name;
|
||
- this.flags = flags;
|
||
- this.parentCodes = parents;
|
||
- this.parent = null;
|
||
- this.bounds = bounds;
|
||
- this.closes = closes;
|
||
- if(closes != null) {
|
||
- for(int i=0;i<closes.length;i++) {
|
||
- if(closes[i] == code) {
|
||
- this.nestable = false;
|
||
- break;
|
||
- }
|
||
- }
|
||
- }
|
||
- } // <init>(short,String,int,short[],short,short[])
|
||
-
|
||
- //
|
||
- // Public methods
|
||
- //
|
||
-
|
||
- /** Returns true if this element is an inline element. */
|
||
- public final boolean isInline() {
|
||
- return (flags & INLINE) != 0;
|
||
- } // isInline():boolean
|
||
-
|
||
- /** Returns true if this element is a block element. */
|
||
- public final boolean isBlock() {
|
||
- return (flags & BLOCK) != 0;
|
||
- } // isBlock():boolean
|
||
-
|
||
- /** Returns true if this element is an empty element. */
|
||
- public final boolean isEmpty() {
|
||
- return (flags & EMPTY) != 0;
|
||
- } // isEmpty():boolean
|
||
-
|
||
- /** Returns true if this element is a container element. */
|
||
- public final boolean isContainer() {
|
||
- return (flags & CONTAINER) != 0;
|
||
- } // isContainer():boolean
|
||
-
|
||
- /**
|
||
- * Returns true if this element is special -- if its content
|
||
- * should be parsed ignoring markup.
|
||
- */
|
||
- public final boolean isSpecial() {
|
||
- return (flags & SPECIAL) != 0;
|
||
- } // isSpecial():boolean
|
||
-
|
||
- /**
|
||
- * Returns true if this element can close the specified Element.
|
||
- *
|
||
- * @param tag The element.
|
||
- */
|
||
- public boolean closes(short tag) {
|
||
-
|
||
- if (closes != null) {
|
||
- for (int i = 0; i < closes.length; i++) {
|
||
- if (closes[i] == tag) {
|
||
- return true;
|
||
- }
|
||
- }
|
||
- }
|
||
- return false;
|
||
-
|
||
- } // closes(short):boolean
|
||
-
|
||
- //
|
||
- // Object methods
|
||
- //
|
||
-
|
||
- /** Returns a hash code for this object. */
|
||
- public int hashCode() {
|
||
- return name.hashCode();
|
||
- } // hashCode():int
|
||
-
|
||
- /** Returns true if the objects are equal. */
|
||
- public boolean equals(Object o) {
|
||
- return name.equals(o);
|
||
- } // equals(Object):boolean
|
||
-
|
||
- /**
|
||
- * Provides a simple representation to make debugging easier
|
||
- */
|
||
- public String toString() {
|
||
- return super.toString() + "(name=" + name + ")";
|
||
- }
|
||
-
|
||
- /**
|
||
- * Indicates if the provided element is an accepted parent of current element
|
||
- * @param element the element to test for "paternity"
|
||
- * @return <code>true</code> if <code>element</code> belongs to the {@link #parent}
|
||
- */
|
||
- public boolean isParent(final Element element) {
|
||
- if (parent == null)
|
||
- return false;
|
||
- else {
|
||
- for (int i=0; i<parent.length; ++i) {
|
||
- if (element.code == parent[i].code)
|
||
- return true;
|
||
- }
|
||
- }
|
||
- return false;
|
||
- }
|
||
- } // class Element
|
||
-
|
||
- /** Unsynchronized list of elements. */
|
||
- public static class ElementList {
|
||
-
|
||
- //
|
||
- // Data
|
||
- //
|
||
-
|
||
- /** The size of the list. */
|
||
- public int size;
|
||
-
|
||
- /** The data in the list. */
|
||
- public Element[] data = new Element[120];
|
||
-
|
||
- //
|
||
- // Public methods
|
||
- //
|
||
-
|
||
- /** Adds an element to list, resizing if necessary. */
|
||
- public void addElement(Element element) {
|
||
- if (size == data.length) {
|
||
- Element[] newarray = new Element[size + 20];
|
||
- System.arraycopy(data, 0, newarray, 0, size);
|
||
- data = newarray;
|
||
- }
|
||
- data[size++] = element;
|
||
- } // addElement(Element)
|
||
-
|
||
- } // class Element
|
||
-
|
||
-} // class HTMLElements
|
||
diff -Nru boilerpipe-1.2.0/src/main/org/cyberneko/html/HTMLTagBalancer.java boilerpipe-1.2.0-gil/src/main/org/cyberneko/html/HTMLTagBalancer.java
|
||
--- boilerpipe-1.2.0/src/main/org/cyberneko/html/HTMLTagBalancer.java 2010-12-16 11:30:06.000000000 +0100
|
||
+++ boilerpipe-1.2.0-gil/src/main/org/cyberneko/html/HTMLTagBalancer.java 1970-01-01 01:00:00.000000000 +0100
|
||
@@ -1,1409 +0,0 @@
|
||
-/*
|
||
- * Copyright 2002-2009 Andy Clark, Marc Guillemot
|
||
- *
|
||
- * Licensed under the Apache License, Version 2.0 (the "License");
|
||
- * you may not use this file except in compliance with the License.
|
||
- * You may obtain a copy of the License at
|
||
- *
|
||
- * http://www.apache.org/licenses/LICENSE-2.0
|
||
- *
|
||
- * Unless required by applicable law or agreed to in writing, software
|
||
- * distributed under the License is distributed on an "AS IS" BASIS,
|
||
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
- * See the License for the specific language governing permissions and
|
||
- * limitations under the License.
|
||
- */
|
||
-
|
||
-package org.cyberneko.html;
|
||
-
|
||
-import java.util.ArrayList;
|
||
-import java.util.List;
|
||
-import org.apache.xerces.util.XMLAttributesImpl;
|
||
-import org.apache.xerces.xni.Augmentations;
|
||
-import org.apache.xerces.xni.NamespaceContext;
|
||
-import org.apache.xerces.xni.QName;
|
||
-import org.apache.xerces.xni.XMLAttributes;
|
||
-import org.apache.xerces.xni.XMLDocumentHandler;
|
||
-import org.apache.xerces.xni.XMLLocator;
|
||
-import org.apache.xerces.xni.XMLResourceIdentifier;
|
||
-import org.apache.xerces.xni.XMLString;
|
||
-import org.apache.xerces.xni.XNIException;
|
||
-import org.apache.xerces.xni.parser.XMLComponentManager;
|
||
-import org.apache.xerces.xni.parser.XMLConfigurationException;
|
||
-import org.apache.xerces.xni.parser.XMLDocumentFilter;
|
||
-import org.apache.xerces.xni.parser.XMLDocumentSource;
|
||
-import org.cyberneko.html.HTMLElements.Element;
|
||
-import org.cyberneko.html.filters.NamespaceBinder;
|
||
-import org.cyberneko.html.xercesbridge.XercesBridge;
|
||
-
|
||
-/**
|
||
- * Balances tags in an HTML document. This component receives document events
|
||
- * and tries to correct many common mistakes that human (and computer) HTML
|
||
- * document authors make. This tag balancer can:
|
||
- * <ul>
|
||
- * <li>add missing parent elements;
|
||
- * <li>automatically close elements with optional end tags; and
|
||
- * <li>handle mis-matched inline element tags.
|
||
- * </ul>
|
||
- * <p>
|
||
- * This component recognizes the following features:
|
||
- * <ul>
|
||
- * <li>http://cyberneko.org/html/features/augmentations
|
||
- * <li>http://cyberneko.org/html/features/report-errors
|
||
- * <li>http://cyberneko.org/html/features/balance-tags/document-fragment
|
||
- * <li>http://cyberneko.org/html/features/balance-tags/ignore-outside-content
|
||
- * </ul>
|
||
- * <p>
|
||
- * This component recognizes the following properties:
|
||
- * <ul>
|
||
- * <li>http://cyberneko.org/html/properties/names/elems
|
||
- * <li>http://cyberneko.org/html/properties/names/attrs
|
||
- * <li>http://cyberneko.org/html/properties/error-reporter
|
||
- * <li>http://cyberneko.org/html/properties/balance-tags/current-stack
|
||
- * </ul>
|
||
- *
|
||
- * @see HTMLElements
|
||
- *
|
||
- * @author Andy Clark
|
||
- * @author Marc Guillemot
|
||
- *
|
||
- * @version $Id: HTMLTagBalancer.java,v 1.20 2005/02/14 04:06:22 andyc Exp $
|
||
- */
|
||
-public class HTMLTagBalancer
|
||
- implements XMLDocumentFilter, HTMLComponent {
|
||
-
|
||
- //
|
||
- // Constants
|
||
- //
|
||
-
|
||
- // features
|
||
-
|
||
- /** Namespaces. */
|
||
- protected static final String NAMESPACES = "http://xml.org/sax/features/namespaces";
|
||
-
|
||
- /** Include infoset augmentations. */
|
||
- protected static final String AUGMENTATIONS = "http://cyberneko.org/html/features/augmentations";
|
||
-
|
||
- /** Report errors. */
|
||
- protected static final String REPORT_ERRORS = "http://cyberneko.org/html/features/report-errors";
|
||
-
|
||
- /** Document fragment balancing only (deprecated). */
|
||
- protected static final String DOCUMENT_FRAGMENT_DEPRECATED = "http://cyberneko.org/html/features/document-fragment";
|
||
-
|
||
- /** Document fragment balancing only. */
|
||
- protected static final String DOCUMENT_FRAGMENT = "http://cyberneko.org/html/features/balance-tags/document-fragment";
|
||
-
|
||
- /** Ignore outside content. */
|
||
- protected static final String IGNORE_OUTSIDE_CONTENT = "http://cyberneko.org/html/features/balance-tags/ignore-outside-content";
|
||
-
|
||
- /** Recognized features. */
|
||
- private static final String[] RECOGNIZED_FEATURES = {
|
||
- NAMESPACES,
|
||
- AUGMENTATIONS,
|
||
- REPORT_ERRORS,
|
||
- DOCUMENT_FRAGMENT_DEPRECATED,
|
||
- DOCUMENT_FRAGMENT,
|
||
- IGNORE_OUTSIDE_CONTENT,
|
||
- };
|
||
-
|
||
- /** Recognized features defaults. */
|
||
- private static final Boolean[] RECOGNIZED_FEATURES_DEFAULTS = {
|
||
- null,
|
||
- null,
|
||
- null,
|
||
- null,
|
||
- Boolean.FALSE,
|
||
- Boolean.FALSE,
|
||
- };
|
||
-
|
||
- // properties
|
||
-
|
||
- /** Modify HTML element names: { "upper", "lower", "default" }. */
|
||
- protected static final String NAMES_ELEMS = "http://cyberneko.org/html/properties/names/elems";
|
||
-
|
||
- /** Modify HTML attribute names: { "upper", "lower", "default" }. */
|
||
- protected static final String NAMES_ATTRS = "http://cyberneko.org/html/properties/names/attrs";
|
||
-
|
||
- /** Error reporter. */
|
||
- protected static final String ERROR_REPORTER = "http://cyberneko.org/html/properties/error-reporter";
|
||
-
|
||
- /**
|
||
- * <font color="red">EXPERIMENTAL: may change in next release</font><br/>
|
||
- * Name of the property holding the stack of elements in which context a document fragment should be parsed.
|
||
- **/
|
||
- public static final String FRAGMENT_CONTEXT_STACK = "http://cyberneko.org/html/properties/balance-tags/fragment-context-stack";
|
||
-
|
||
- /** Recognized properties. */
|
||
- private static final String[] RECOGNIZED_PROPERTIES = {
|
||
- NAMES_ELEMS,
|
||
- NAMES_ATTRS,
|
||
- ERROR_REPORTER,
|
||
- FRAGMENT_CONTEXT_STACK,
|
||
- };
|
||
-
|
||
- /** Recognized properties defaults. */
|
||
- private static final Object[] RECOGNIZED_PROPERTIES_DEFAULTS = {
|
||
- null,
|
||
- null,
|
||
- null,
|
||
- null,
|
||
- };
|
||
-
|
||
- // modify HTML names
|
||
-
|
||
- /** Don't modify HTML names. */
|
||
- protected static final short NAMES_NO_CHANGE = 0;
|
||
-
|
||
- /** Match HTML element names. */
|
||
- protected static final short NAMES_MATCH = 0;
|
||
-
|
||
- /** Uppercase HTML names. */
|
||
- protected static final short NAMES_UPPERCASE = 1;
|
||
-
|
||
- /** Lowercase HTML names. */
|
||
- protected static final short NAMES_LOWERCASE = 2;
|
||
-
|
||
- // static vars
|
||
-
|
||
- /** Synthesized event info item. */
|
||
- protected static final HTMLEventInfo SYNTHESIZED_ITEM =
|
||
- new HTMLEventInfo.SynthesizedItem();
|
||
-
|
||
- //
|
||
- // Data
|
||
- //
|
||
-
|
||
- // features
|
||
-
|
||
- /** Namespaces. */
|
||
- protected boolean fNamespaces;
|
||
-
|
||
- /** Include infoset augmentations. */
|
||
- protected boolean fAugmentations;
|
||
-
|
||
- /** Report errors. */
|
||
- protected boolean fReportErrors;
|
||
-
|
||
- /** Document fragment balancing only. */
|
||
- protected boolean fDocumentFragment;
|
||
-
|
||
- /** Ignore outside content. */
|
||
- protected boolean fIgnoreOutsideContent;
|
||
-
|
||
- // properties
|
||
-
|
||
- /** Modify HTML element names. */
|
||
- protected short fNamesElems;
|
||
-
|
||
- /** Modify HTML attribute names. */
|
||
- protected short fNamesAttrs;
|
||
-
|
||
- /** Error reporter. */
|
||
- protected HTMLErrorReporter fErrorReporter;
|
||
-
|
||
- // connections
|
||
-
|
||
- /** The document source. */
|
||
- protected XMLDocumentSource fDocumentSource;
|
||
-
|
||
- /** The document handler. */
|
||
- protected XMLDocumentHandler fDocumentHandler;
|
||
-
|
||
- // state
|
||
-
|
||
- /** The element stack. */
|
||
- protected final InfoStack fElementStack = new InfoStack();
|
||
-
|
||
- /** The inline stack. */
|
||
- protected final InfoStack fInlineStack = new InfoStack();
|
||
-
|
||
- /** True if seen anything. Important for xml declaration. */
|
||
- protected boolean fSeenAnything;
|
||
-
|
||
- /** True if root element has been seen. */
|
||
- protected boolean fSeenDoctype;
|
||
-
|
||
- /** True if root element has been seen. */
|
||
- protected boolean fSeenRootElement;
|
||
-
|
||
- /**
|
||
- * True if seen the end of the document element. In other words,
|
||
- * this variable is set to false <em>until</em> the end </HTML>
|
||
- * tag is seen (or synthesized). This is used to ensure that
|
||
- * extraneous events after the end of the document element do not
|
||
- * make the document stream ill-formed.
|
||
- */
|
||
- protected boolean fSeenRootElementEnd;
|
||
-
|
||
- /** True if seen <head< element. */
|
||
- protected boolean fSeenHeadElement;
|
||
-
|
||
- /** True if seen <body< element. */
|
||
- protected boolean fSeenBodyElement;
|
||
-
|
||
- /** True if a form is in the stack (allow to discard opening of nested forms) */
|
||
- protected boolean fOpenedForm;
|
||
-
|
||
- // temp vars
|
||
-
|
||
- /** A qualified name. */
|
||
- private final QName fQName = new QName();
|
||
-
|
||
- /** Empty attributes. */
|
||
- private final XMLAttributes fEmptyAttrs = new XMLAttributesImpl();
|
||
-
|
||
- /** Augmentations. */
|
||
- private final HTMLAugmentations fInfosetAugs = new HTMLAugmentations();
|
||
-
|
||
- protected HTMLTagBalancingListener tagBalancingListener;
|
||
- private LostText lostText_ = new LostText();
|
||
-
|
||
- private boolean forcedStartElement_ = false;
|
||
- private boolean forcedEndElement_ = false;
|
||
-
|
||
- /**
|
||
- * Stack of elements determining the context in which a document fragment should be parsed
|
||
- */
|
||
- private QName[] fragmentContextStack_ = null;
|
||
- private int fragmentContextStackSize_ = 0; // not 0 only when a fragment is parsed and fragmentContextStack_ is set
|
||
-
|
||
- private List/*ElementEntry*/ endElementsBuffer_ = new ArrayList();
|
||
-
|
||
- //
|
||
- // HTMLComponent methods
|
||
- //
|
||
-
|
||
- /** Returns the default state for a feature. */
|
||
- public Boolean getFeatureDefault(String featureId) {
|
||
- int length = RECOGNIZED_FEATURES != null ? RECOGNIZED_FEATURES.length : 0;
|
||
- for (int i = 0; i < length; i++) {
|
||
- if (RECOGNIZED_FEATURES[i].equals(featureId)) {
|
||
- return RECOGNIZED_FEATURES_DEFAULTS[i];
|
||
- }
|
||
- }
|
||
- return null;
|
||
- } // getFeatureDefault(String):Boolean
|
||
-
|
||
- /** Returns the default state for a property. */
|
||
- public Object getPropertyDefault(String propertyId) {
|
||
- int length = RECOGNIZED_PROPERTIES != null ? RECOGNIZED_PROPERTIES.length : 0;
|
||
- for (int i = 0; i < length; i++) {
|
||
- if (RECOGNIZED_PROPERTIES[i].equals(propertyId)) {
|
||
- return RECOGNIZED_PROPERTIES_DEFAULTS[i];
|
||
- }
|
||
- }
|
||
- return null;
|
||
- } // getPropertyDefault(String):Object
|
||
-
|
||
- //
|
||
- // XMLComponent methods
|
||
- //
|
||
-
|
||
- /** Returns recognized features. */
|
||
- public String[] getRecognizedFeatures() {
|
||
- return RECOGNIZED_FEATURES;
|
||
- } // getRecognizedFeatures():String[]
|
||
-
|
||
- /** Returns recognized properties. */
|
||
- public String[] getRecognizedProperties() {
|
||
- return RECOGNIZED_PROPERTIES;
|
||
- } // getRecognizedProperties():String[]
|
||
-
|
||
- /** Resets the component. */
|
||
- public void reset(XMLComponentManager manager)
|
||
- throws XMLConfigurationException {
|
||
-
|
||
- // get features
|
||
- fNamespaces = manager.getFeature(NAMESPACES);
|
||
- fAugmentations = manager.getFeature(AUGMENTATIONS);
|
||
- fReportErrors = manager.getFeature(REPORT_ERRORS);
|
||
- fDocumentFragment = manager.getFeature(DOCUMENT_FRAGMENT) ||
|
||
- manager.getFeature(DOCUMENT_FRAGMENT_DEPRECATED);
|
||
- fIgnoreOutsideContent = manager.getFeature(IGNORE_OUTSIDE_CONTENT);
|
||
-
|
||
- // get properties
|
||
- fNamesElems = getNamesValue(String.valueOf(manager.getProperty(NAMES_ELEMS)));
|
||
- fNamesAttrs = getNamesValue(String.valueOf(manager.getProperty(NAMES_ATTRS)));
|
||
- fErrorReporter = (HTMLErrorReporter)manager.getProperty(ERROR_REPORTER);
|
||
-
|
||
- fragmentContextStack_ = (QName[]) manager.getProperty(FRAGMENT_CONTEXT_STACK);
|
||
-
|
||
- } // reset(XMLComponentManager)
|
||
-
|
||
- /** Sets a feature. */
|
||
- public void setFeature(String featureId, boolean state)
|
||
- throws XMLConfigurationException {
|
||
-
|
||
- if (featureId.equals(AUGMENTATIONS)) {
|
||
- fAugmentations = state;
|
||
- return;
|
||
- }
|
||
- if (featureId.equals(REPORT_ERRORS)) {
|
||
- fReportErrors = state;
|
||
- return;
|
||
- }
|
||
- if (featureId.equals(IGNORE_OUTSIDE_CONTENT)) {
|
||
- fIgnoreOutsideContent = state;
|
||
- return;
|
||
- }
|
||
-
|
||
- } // setFeature(String,boolean)
|
||
-
|
||
- /** Sets a property. */
|
||
- public void setProperty(String propertyId, Object value)
|
||
- throws XMLConfigurationException {
|
||
-
|
||
- if (propertyId.equals(NAMES_ELEMS)) {
|
||
- fNamesElems = getNamesValue(String.valueOf(value));
|
||
- return;
|
||
- }
|
||
-
|
||
- if (propertyId.equals(NAMES_ATTRS)) {
|
||
- fNamesAttrs = getNamesValue(String.valueOf(value));
|
||
- return;
|
||
- }
|
||
-
|
||
- } // setProperty(String,Object)
|
||
-
|
||
- //
|
||
- // XMLDocumentSource methods
|
||
- //
|
||
-
|
||
- /** Sets the document handler. */
|
||
- public void setDocumentHandler(XMLDocumentHandler handler) {
|
||
- fDocumentHandler = handler;
|
||
- } // setDocumentHandler(XMLDocumentHandler)
|
||
-
|
||
- // @since Xerces 2.1.0
|
||
-
|
||
- /** Returns the document handler. */
|
||
- public XMLDocumentHandler getDocumentHandler() {
|
||
- return fDocumentHandler;
|
||
- } // getDocumentHandler():XMLDocumentHandler
|
||
-
|
||
- //
|
||
- // XMLDocumentHandler methods
|
||
- //
|
||
-
|
||
- // since Xerces-J 2.2.0
|
||
-
|
||
- /** Start document. */
|
||
- public void startDocument(XMLLocator locator, String encoding,
|
||
- NamespaceContext nscontext, Augmentations augs)
|
||
- throws XNIException {
|
||
-
|
||
- // reset state
|
||
- fElementStack.top = 0;
|
||
- if (fragmentContextStack_ != null) {
|
||
- fragmentContextStackSize_ = fragmentContextStack_.length;
|
||
- for (int i=0; i<fragmentContextStack_.length; ++i) {
|
||
- final QName name = fragmentContextStack_[i];
|
||
- final Element elt = HTMLElements.getElement(name.localpart);
|
||
- fElementStack.push(new Info(elt, name));
|
||
- }
|
||
-
|
||
- }
|
||
- else {
|
||
- fragmentContextStackSize_ = 0;
|
||
- }
|
||
- fSeenAnything = false;
|
||
- fSeenDoctype = false;
|
||
- fSeenRootElement = false;
|
||
- fSeenRootElementEnd = false;
|
||
- fSeenHeadElement = false;
|
||
- fSeenBodyElement = false;
|
||
-
|
||
-
|
||
- // pass on event
|
||
- if (fDocumentHandler != null) {
|
||
- XercesBridge.getInstance().XMLDocumentHandler_startDocument(fDocumentHandler, locator, encoding, nscontext, augs);
|
||
- }
|
||
-
|
||
- } // startDocument(XMLLocator,String,Augmentations)
|
||
-
|
||
- // old methods
|
||
-
|
||
- /** XML declaration. */
|
||
- public void xmlDecl(String version, String encoding, String standalone,
|
||
- Augmentations augs) throws XNIException {
|
||
- if (!fSeenAnything && fDocumentHandler != null) {
|
||
- fDocumentHandler.xmlDecl(version, encoding, standalone, augs);
|
||
- }
|
||
- } // xmlDecl(String,String,String,Augmentations)
|
||
-
|
||
- /** Doctype declaration. */
|
||
- public void doctypeDecl(String rootElementName, String publicId, String systemId,
|
||
- Augmentations augs) throws XNIException {
|
||
- fSeenAnything = true;
|
||
- if (fReportErrors) {
|
||
- if (fSeenRootElement) {
|
||
- fErrorReporter.reportError("HTML2010", null);
|
||
- }
|
||
- else if (fSeenDoctype) {
|
||
- fErrorReporter.reportError("HTML2011", null);
|
||
- }
|
||
- }
|
||
- if (!fSeenRootElement && !fSeenDoctype) {
|
||
- fSeenDoctype = true;
|
||
- if (fDocumentHandler != null) {
|
||
- fDocumentHandler.doctypeDecl(rootElementName, publicId, systemId, augs);
|
||
- }
|
||
- }
|
||
- } // doctypeDecl(String,String,String,Augmentations)
|
||
-
|
||
- /** End document. */
|
||
- public void endDocument(Augmentations augs) throws XNIException {
|
||
-
|
||
- // </body> and </html> have been buffered to consider outside content
|
||
- fIgnoreOutsideContent = true; // endElement should not ignore the elements passed from buffer
|
||
- consumeBufferedEndElements();
|
||
-
|
||
- // handle empty document
|
||
- if (!fSeenRootElement && !fDocumentFragment) {
|
||
- if (fReportErrors) {
|
||
- fErrorReporter.reportError("HTML2000", null);
|
||
- }
|
||
- if (fDocumentHandler != null) {
|
||
- fSeenRootElementEnd = false;
|
||
- forceStartBody(); // will force <html> and <head></head>
|
||
- final String body = modifyName("body", fNamesElems);
|
||
- fQName.setValues(null, body, body, null);
|
||
- callEndElement(fQName, synthesizedAugs());
|
||
-
|
||
- final String ename = modifyName("html", fNamesElems);
|
||
- fQName.setValues(null, ename, ename, null);
|
||
- callEndElement(fQName, synthesizedAugs());
|
||
- }
|
||
- }
|
||
-
|
||
- // pop all remaining elements
|
||
- else {
|
||
- int length = fElementStack.top - fragmentContextStackSize_;
|
||
- for (int i = 0; i < length; i++) {
|
||
- Info info = fElementStack.pop();
|
||
- if (fReportErrors) {
|
||
- String ename = info.qname.rawname;
|
||
- fErrorReporter.reportWarning("HTML2001", new Object[]{ename});
|
||
- }
|
||
- if (fDocumentHandler != null) {
|
||
- callEndElement(info.qname, synthesizedAugs());
|
||
- }
|
||
- }
|
||
- }
|
||
-
|
||
- // call handler
|
||
- if (fDocumentHandler != null) {
|
||
- fDocumentHandler.endDocument(augs);
|
||
- }
|
||
-
|
||
- } // endDocument(Augmentations)
|
||
-
|
||
- /**
|
||
- * Consume elements that have been buffered, like </body></html> that are first consumed
|
||
- * at the end of document
|
||
- */
|
||
- private void consumeBufferedEndElements() {
|
||
- final List toConsume = new ArrayList(endElementsBuffer_);
|
||
- endElementsBuffer_.clear();
|
||
- for (int i=0; i<toConsume.size(); ++i) {
|
||
- final ElementEntry entry = (ElementEntry) toConsume.get(i);
|
||
- forcedEndElement_ = true;
|
||
- endElement(entry.name_, entry.augs_);
|
||
- }
|
||
- endElementsBuffer_.clear();
|
||
- }
|
||
-
|
||
- /** Comment. */
|
||
- public void comment(XMLString text, Augmentations augs) throws XNIException {
|
||
- fSeenAnything = true;
|
||
- consumeEarlyTextIfNeeded();
|
||
- if (fDocumentHandler != null) {
|
||
- fDocumentHandler.comment(text, augs);
|
||
- }
|
||
- } // comment(XMLString,Augmentations)
|
||
-
|
||
- private void consumeEarlyTextIfNeeded() {
|
||
- if (!lostText_.isEmpty()) {
|
||
- if (!fSeenBodyElement) {
|
||
- forceStartBody();
|
||
- }
|
||
- lostText_.refeed(this);
|
||
- }
|
||
- }
|
||
-
|
||
- /** Processing instruction. */
|
||
- public void processingInstruction(String target, XMLString data,
|
||
- Augmentations augs) throws XNIException {
|
||
- fSeenAnything = true;
|
||
- consumeEarlyTextIfNeeded();
|
||
- if (fDocumentHandler != null) {
|
||
- fDocumentHandler.processingInstruction(target, data, augs);
|
||
- }
|
||
- } // processingInstruction(String,XMLString,Augmentations)
|
||
-
|
||
- /** Start element. */
|
||
- public void startElement(final QName elem, XMLAttributes attrs, final Augmentations augs)
|
||
- throws XNIException {
|
||
- fSeenAnything = true;
|
||
-
|
||
- final boolean isForcedCreation = forcedStartElement_;
|
||
- forcedStartElement_ = false;
|
||
-
|
||
- // check for end of document
|
||
- if (fSeenRootElementEnd) {
|
||
- notifyDiscardedStartElement(elem, attrs, augs);
|
||
- return;
|
||
- }
|
||
-
|
||
- // get element information
|
||
- final HTMLElements.Element element = getElement(elem);
|
||
- final short elementCode = element.code;
|
||
-
|
||
- // the creation of some elements like TABLE or SELECT can't be forced. Any others?
|
||
- if (isForcedCreation && (elementCode == HTMLElements.TABLE || elementCode == HTMLElements.SELECT)) {
|
||
- return; // don't accept creation
|
||
- }
|
||
-
|
||
- // ignore multiple html, head, body elements
|
||
- if (fSeenRootElement && elementCode == HTMLElements.HTML) {
|
||
- notifyDiscardedStartElement(elem, attrs, augs);
|
||
- return;
|
||
- }
|
||
- if (elementCode == HTMLElements.HEAD) {
|
||
- if (fSeenHeadElement) {
|
||
- notifyDiscardedStartElement(elem, attrs, augs);
|
||
- return;
|
||
- }
|
||
- fSeenHeadElement = true;
|
||
- }
|
||
- else if (elementCode == HTMLElements.FRAMESET) {
|
||
- consumeBufferedEndElements(); // </head> (if any) has been buffered
|
||
- }
|
||
- else if (elementCode == HTMLElements.BODY) {
|
||
- // create <head></head> if none was present
|
||
- if (!fSeenHeadElement) {
|
||
- final QName head = createQName("head");
|
||
- forceStartElement(head, null, synthesizedAugs());
|
||
- endElement(head, synthesizedAugs());
|
||
- }
|
||
- consumeBufferedEndElements(); // </head> (if any) has been buffered
|
||
-
|
||
- if (fSeenBodyElement) {
|
||
- notifyDiscardedStartElement(elem, attrs, augs);
|
||
- return;
|
||
- }
|
||
- fSeenBodyElement = true;
|
||
- }
|
||
- else if (elementCode == HTMLElements.FORM) {
|
||
- if (fOpenedForm) {
|
||
- notifyDiscardedStartElement(elem, attrs, augs);
|
||
- return;
|
||
- }
|
||
- fOpenedForm = true;
|
||
- }
|
||
- else if (elementCode == HTMLElements.UNKNOWN) {
|
||
- consumeBufferedEndElements();
|
||
- }
|
||
-
|
||
- // check proper parent
|
||
- if (element.parent != null) {
|
||
- if (!fSeenRootElement && !fDocumentFragment) {
|
||
- String pname = element.parent[0].name;
|
||
- pname = modifyName(pname, fNamesElems);
|
||
- if (fReportErrors) {
|
||
- String ename = elem.rawname;
|
||
- fErrorReporter.reportWarning("HTML2002", new Object[]{ename,pname});
|
||
- }
|
||
- final QName qname = new QName(null, pname, pname, null);
|
||
- final boolean parentCreated = forceStartElement(qname, null, synthesizedAugs());
|
||
- if (!parentCreated) {
|
||
- if (!isForcedCreation) {
|
||
- notifyDiscardedStartElement(elem, attrs, augs);
|
||
- }
|
||
- return;
|
||
- }
|
||
- }
|
||
- else {
|
||
- HTMLElements.Element preferedParent = element.parent[0];
|
||
- if (preferedParent.code != HTMLElements.HEAD || (!fSeenBodyElement && !fDocumentFragment)) {
|
||
- int depth = getParentDepth(element.parent, element.bounds);
|
||
- if (depth == -1) { // no parent found
|
||
- final String pname = modifyName(preferedParent.name, fNamesElems);
|
||
- final QName qname = new QName(null, pname, pname, null);
|
||
- if (fReportErrors) {
|
||
- String ename = elem.rawname;
|
||
- fErrorReporter.reportWarning("HTML2004", new Object[]{ename,pname});
|
||
- }
|
||
- final boolean parentCreated = forceStartElement(qname, null, synthesizedAugs());
|
||
- if (!parentCreated) {
|
||
- if (!isForcedCreation) {
|
||
- notifyDiscardedStartElement(elem, attrs, augs);
|
||
- }
|
||
- return;
|
||
- }
|
||
- }
|
||
- }
|
||
- }
|
||
- }
|
||
-
|
||
- // if block element, save immediate parent inline elements
|
||
- int depth = 0;
|
||
- if (element.flags == 0) {
|
||
- int length = fElementStack.top;
|
||
- fInlineStack.top = 0;
|
||
- for (int i = length - 1; i >= 0; i--) {
|
||
- Info info = fElementStack.data[i];
|
||
- if (!info.element.isInline()) {
|
||
- break;
|
||
- }
|
||
- fInlineStack.push(info);
|
||
- endElement(info.qname, synthesizedAugs());
|
||
- }
|
||
- depth = fInlineStack.top;
|
||
- }
|
||
-
|
||
- // close previous elements
|
||
- // all elements close a <script>
|
||
- // in head, no element has children
|
||
- if ((fElementStack.top > 1
|
||
- && (fElementStack.peek().element.code == HTMLElements.SCRIPT))
|
||
- || fElementStack.top > 2 && fElementStack.data[fElementStack.top-2].element.code == HTMLElements.HEAD) {
|
||
- final Info info = fElementStack.pop();
|
||
- if (fDocumentHandler != null) {
|
||
- callEndElement(info.qname, synthesizedAugs());
|
||
- }
|
||
- }
|
||
- if (element.closes != null) {
|
||
- int length = fElementStack.top;
|
||
- for (int i = length - 1; i >= 0; i--) {
|
||
- Info info = fElementStack.data[i];
|
||
-
|
||
- // does it close the element we're looking at?
|
||
- if (element.closes(info.element.code)) {
|
||
- if (fReportErrors) {
|
||
- String ename = elem.rawname;
|
||
- String iname = info.qname.rawname;
|
||
- fErrorReporter.reportWarning("HTML2005", new Object[]{ename,iname});
|
||
- }
|
||
- for (int j = length - 1; j >= i; j--) {
|
||
- info = fElementStack.pop();
|
||
- if (fDocumentHandler != null) {
|
||
- // PATCH: Marc-Andr<64> Morissette
|
||
- callEndElement(info.qname, synthesizedAugs());
|
||
- }
|
||
- }
|
||
- length = i;
|
||
- continue;
|
||
- }
|
||
-
|
||
- // should we stop searching?
|
||
- if(element.nestable) {
|
||
- if (info.element.isBlock() || element.isParent(info.element)) {
|
||
- break;
|
||
- }
|
||
- }
|
||
- }
|
||
- }
|
||
- // TODO: investigate if only table is special here
|
||
- // table closes all opened inline elements
|
||
- else if (elementCode == HTMLElements.TABLE) {
|
||
- for (int i=fElementStack.top-1; i >= 0; i--) {
|
||
- final Info info = fElementStack.data[i];
|
||
- if (!info.element.isInline()) {
|
||
- break;
|
||
- }
|
||
- endElement(info.qname, synthesizedAugs());
|
||
- }
|
||
- }
|
||
-
|
||
- // call handler
|
||
- fSeenRootElement = true;
|
||
- if (element != null && element.isEmpty()) {
|
||
- if (attrs == null) {
|
||
- attrs = emptyAttributes();
|
||
- }
|
||
- if (fDocumentHandler != null) {
|
||
- fDocumentHandler.emptyElement(elem, attrs, augs);
|
||
- }
|
||
- }
|
||
- else {
|
||
- boolean inline = element != null && element.isInline();
|
||
- fElementStack.push(new Info(element, elem, inline ? attrs : null));
|
||
- if (attrs == null) {
|
||
- attrs = emptyAttributes();
|
||
- }
|
||
- if (fDocumentHandler != null) {
|
||
- callStartElement(elem, attrs, augs);
|
||
- }
|
||
- }
|
||
-
|
||
- // re-open inline elements
|
||
- for (int i = 0; i < depth; i++) {
|
||
- Info info = fInlineStack.pop();
|
||
- forceStartElement(info.qname, info.attributes, synthesizedAugs());
|
||
- }
|
||
-
|
||
- if (elementCode == HTMLElements.BODY) {
|
||
- lostText_.refeed(this);
|
||
- }
|
||
- } // startElement(QName,XMLAttributes,Augmentations)
|
||
-
|
||
- /**
|
||
- * Forces an element start, taking care to set the information to allow startElement to "see" that's
|
||
- * the element has been forced.
|
||
- * @return <code>true</code> if creation could be done (TABLE's creation for instance can't be forced)
|
||
- */
|
||
- private boolean forceStartElement(final QName elem, XMLAttributes attrs, final Augmentations augs)
|
||
- throws XNIException {
|
||
-
|
||
- forcedStartElement_ = true;
|
||
- startElement(elem, attrs, augs);
|
||
-
|
||
- return fElementStack.top > 0 && elem.equals(fElementStack.peek().qname);
|
||
- }
|
||
-
|
||
- private QName createQName(String tagName) {
|
||
- tagName = modifyName(tagName, fNamesElems);
|
||
- return new QName(null, tagName, tagName, NamespaceBinder.XHTML_1_0_URI);
|
||
- }
|
||
-
|
||
- /** Empty element. */
|
||
- public void emptyElement(final QName element, XMLAttributes attrs, Augmentations augs)
|
||
- throws XNIException {
|
||
- startElement(element, attrs, augs);
|
||
- // browser ignore the closing indication for non empty tags like <form .../> but not for unknown element
|
||
- final HTMLElements.Element elem = getElement(element);
|
||
- if (elem.isEmpty() || elem.code == HTMLElements.UNKNOWN) {
|
||
- endElement(element, augs);
|
||
- }
|
||
- } // emptyElement(QName,XMLAttributes,Augmentations)
|
||
-
|
||
- /** Start entity. */
|
||
- public void startGeneralEntity(String name,
|
||
- XMLResourceIdentifier id,
|
||
- String encoding,
|
||
- Augmentations augs) throws XNIException {
|
||
- fSeenAnything = true;
|
||
-
|
||
- // check for end of document
|
||
- if (fSeenRootElementEnd) {
|
||
- return;
|
||
- }
|
||
-
|
||
- // insert body, if needed
|
||
- if (!fDocumentFragment) {
|
||
- boolean insertBody = !fSeenRootElement;
|
||
- if (!insertBody) {
|
||
- Info info = fElementStack.peek();
|
||
- if (info.element.code == HTMLElements.HEAD ||
|
||
- info.element.code == HTMLElements.HTML) {
|
||
- String hname = modifyName("head", fNamesElems);
|
||
- String bname = modifyName("body", fNamesElems);
|
||
- if (fReportErrors) {
|
||
- fErrorReporter.reportWarning("HTML2009", new Object[]{hname,bname});
|
||
- }
|
||
- fQName.setValues(null, hname, hname, null);
|
||
- endElement(fQName, synthesizedAugs());
|
||
- insertBody = true;
|
||
- }
|
||
- }
|
||
- if (insertBody) {
|
||
- forceStartBody();
|
||
- }
|
||
- }
|
||
-
|
||
- // call handler
|
||
- if (fDocumentHandler != null) {
|
||
- fDocumentHandler.startGeneralEntity(name, id, encoding, augs);
|
||
- }
|
||
-
|
||
- } // startGeneralEntity(String,XMLResourceIdentifier,String,Augmentations)
|
||
-
|
||
- /**
|
||
- * Generates a missing <body> (which creates missing <head> when needed)
|
||
- */
|
||
- private void forceStartBody() {
|
||
- final QName body = createQName("body");
|
||
- if (fReportErrors) {
|
||
- fErrorReporter.reportWarning("HTML2006", new Object[]{body.localpart});
|
||
- }
|
||
- forceStartElement(body, null, synthesizedAugs());
|
||
- }
|
||
-
|
||
- /** Text declaration. */
|
||
- public void textDecl(String version, String encoding, Augmentations augs)
|
||
- throws XNIException {
|
||
- fSeenAnything = true;
|
||
-
|
||
- // check for end of document
|
||
- if (fSeenRootElementEnd) {
|
||
- return;
|
||
- }
|
||
-
|
||
- // call handler
|
||
- if (fDocumentHandler != null) {
|
||
- fDocumentHandler.textDecl(version, encoding, augs);
|
||
- }
|
||
-
|
||
- } // textDecl(String,String,Augmentations)
|
||
-
|
||
- /** End entity. */
|
||
- public void endGeneralEntity(String name, Augmentations augs) throws XNIException {
|
||
-
|
||
- // check for end of document
|
||
- if (fSeenRootElementEnd) {
|
||
- return;
|
||
- }
|
||
-
|
||
- // call handler
|
||
- if (fDocumentHandler != null) {
|
||
- fDocumentHandler.endGeneralEntity(name, augs);
|
||
- }
|
||
-
|
||
- } // endGeneralEntity(String,Augmentations)
|
||
-
|
||
- /** Start CDATA section. */
|
||
- public void startCDATA(Augmentations augs) throws XNIException {
|
||
- fSeenAnything = true;
|
||
-
|
||
- consumeEarlyTextIfNeeded();
|
||
-
|
||
- // check for end of document
|
||
- if (fSeenRootElementEnd) {
|
||
- return;
|
||
- }
|
||
-
|
||
- // call handler
|
||
- if (fDocumentHandler != null) {
|
||
- fDocumentHandler.startCDATA(augs);
|
||
- }
|
||
-
|
||
- } // startCDATA(Augmentations)
|
||
-
|
||
- /** End CDATA section. */
|
||
- public void endCDATA(Augmentations augs) throws XNIException {
|
||
-
|
||
- // check for end of document
|
||
- if (fSeenRootElementEnd) {
|
||
- return;
|
||
- }
|
||
-
|
||
- // call handler
|
||
- if (fDocumentHandler != null) {
|
||
- fDocumentHandler.endCDATA(augs);
|
||
- }
|
||
-
|
||
- } // endCDATA(Augmentations)
|
||
-
|
||
- /** Characters. */
|
||
- public void characters(final XMLString text, final Augmentations augs) throws XNIException {
|
||
- // check for end of document
|
||
- if (fSeenRootElementEnd) {
|
||
- return;
|
||
- }
|
||
-
|
||
- if (fElementStack.top == 0 && !fDocumentFragment) {
|
||
- // character before first opening tag
|
||
- lostText_.add(text, augs);
|
||
- return;
|
||
- }
|
||
-
|
||
- // is this text whitespace?
|
||
- boolean whitespace = true;
|
||
- for (int i = 0; i < text.length; i++) {
|
||
- if (!Character.isWhitespace(text.ch[text.offset + i])) {
|
||
- whitespace = false;
|
||
- break;
|
||
- }
|
||
- }
|
||
-
|
||
- if (!fDocumentFragment) {
|
||
- // handle bare characters
|
||
- if (!fSeenRootElement) {
|
||
- if (whitespace) {
|
||
- return;
|
||
- }
|
||
- forceStartBody();
|
||
- }
|
||
-
|
||
- if (whitespace && (fElementStack.top < 2 || endElementsBuffer_.size() == 1)) {
|
||
- // ignore spaces directly within <html>
|
||
- return;
|
||
- }
|
||
-
|
||
- // handle character content in head
|
||
- // NOTE: This frequently happens when the document looks like:
|
||
- // <title>Title</title>
|
||
- // And here's some text.
|
||
- else if (!whitespace) {
|
||
- Info info = fElementStack.peek();
|
||
- if (info.element.code == HTMLElements.HEAD ||
|
||
- info.element.code == HTMLElements.HTML) {
|
||
- String hname = modifyName("head", fNamesElems);
|
||
- String bname = modifyName("body", fNamesElems);
|
||
- if (fReportErrors) {
|
||
- fErrorReporter.reportWarning("HTML2009", new Object[]{hname,bname});
|
||
- }
|
||
- forceStartBody();
|
||
- }
|
||
- }
|
||
- }
|
||
-
|
||
- // call handler
|
||
- if (fDocumentHandler != null) {
|
||
- fDocumentHandler.characters(text, augs);
|
||
- }
|
||
-
|
||
- } // characters(XMLString,Augmentations)
|
||
-
|
||
- /** Ignorable whitespace. */
|
||
- public void ignorableWhitespace(XMLString text, Augmentations augs)
|
||
- throws XNIException {
|
||
- characters(text, augs);
|
||
- } // ignorableWhitespace(XMLString,Augmentations)
|
||
-
|
||
- /** End element. */
|
||
- public void endElement(final QName element, final Augmentations augs) throws XNIException {
|
||
- final boolean forcedEndElement = forcedEndElement_;
|
||
- // is there anything to do?
|
||
- if (fSeenRootElementEnd) {
|
||
- notifyDiscardedEndElement(element, augs);
|
||
- return;
|
||
- }
|
||
-
|
||
- // get element information
|
||
- HTMLElements.Element elem = getElement(element);
|
||
-
|
||
- // if we consider outside content, just buffer </body> and </html> to consider them at the very end
|
||
- if (!fIgnoreOutsideContent &&
|
||
- (elem.code == HTMLElements.BODY || elem.code == HTMLElements.HTML)) {
|
||
- endElementsBuffer_.add(new ElementEntry(element, augs));
|
||
- return;
|
||
- }
|
||
-
|
||
- // check for end of document
|
||
- if (elem.code == HTMLElements.HTML) {
|
||
- fSeenRootElementEnd = true;
|
||
- }
|
||
- else if (elem.code == HTMLElements.FORM) {
|
||
- fOpenedForm = false;
|
||
- }
|
||
- else if (elem.code == HTMLElements.HEAD && !forcedEndElement) {
|
||
- // consume </head> first when <body> is reached to retrieve content lost between </head> and <body>
|
||
- endElementsBuffer_.add(new ElementEntry(element, augs));
|
||
- return;
|
||
- }
|
||
-
|
||
-
|
||
- // empty element
|
||
- int depth = getElementDepth(elem);
|
||
- if (depth == -1) {
|
||
- if (elem.code == HTMLElements.P) {
|
||
- forceStartElement(element, emptyAttributes(), synthesizedAugs());
|
||
- endElement(element, augs);
|
||
- }
|
||
- else if (!elem.isEmpty()) {
|
||
- notifyDiscardedEndElement(element, augs);
|
||
- }
|
||
- return;
|
||
- }
|
||
-
|
||
- // find unbalanced inline elements
|
||
- if (depth > 1 && elem.isInline()) {
|
||
- final int size = fElementStack.top;
|
||
- fInlineStack.top = 0;
|
||
- for (int i = 0; i < depth - 1; i++) {
|
||
- final Info info = fElementStack.data[size - i - 1];
|
||
- final HTMLElements.Element pelem = info.element;
|
||
-
|
||
- if (pelem.isInline() || pelem.code == HTMLElements.FONT) { // TODO: investigate if only FONT
|
||
- // NOTE: I don't have to make a copy of the info because
|
||
- // it will just be popped off of the element stack
|
||
- // as soon as we close it, anyway.
|
||
- fInlineStack.push(info);
|
||
- }
|
||
- }
|
||
- }
|
||
-
|
||
- // close children up to appropriate element
|
||
- for (int i = 0; i < depth; i++) {
|
||
- Info info = fElementStack.pop();
|
||
-
|
||
- if (fReportErrors && i < depth - 1) {
|
||
- String ename = modifyName(element.rawname, fNamesElems);
|
||
- String iname = info.qname.rawname;
|
||
- fErrorReporter.reportWarning("HTML2007", new Object[]{ename,iname});
|
||
- }
|
||
- if (fDocumentHandler != null) {
|
||
- // PATCH: Marc-Andr\u00e8 Morissette
|
||
- callEndElement(info.qname, i < depth - 1 ? synthesizedAugs() : augs);
|
||
- }
|
||
- }
|
||
-
|
||
- // re-open inline elements
|
||
- if (depth > 1) {
|
||
- int size = fInlineStack.top;
|
||
- for (int i = 0; i < size; i++) {
|
||
- Info info = (Info)fInlineStack.pop();
|
||
- XMLAttributes attributes = info.attributes;
|
||
- if (fReportErrors) {
|
||
- String iname = info.qname.rawname;
|
||
- fErrorReporter.reportWarning("HTML2008", new Object[]{iname});
|
||
- }
|
||
- forceStartElement(info.qname, attributes, synthesizedAugs());
|
||
- }
|
||
- }
|
||
-
|
||
- } // endElement(QName,Augmentations)
|
||
-
|
||
- // @since Xerces 2.1.0
|
||
-
|
||
- /** Sets the document source. */
|
||
- public void setDocumentSource(XMLDocumentSource source) {
|
||
- fDocumentSource = source;
|
||
- } // setDocumentSource(XMLDocumentSource)
|
||
-
|
||
- /** Returns the document source. */
|
||
- public XMLDocumentSource getDocumentSource() {
|
||
- return fDocumentSource;
|
||
- } // getDocumentSource():XMLDocumentSource
|
||
-
|
||
- // removed since Xerces-J 2.3.0
|
||
-
|
||
- /** Start document. */
|
||
- public void startDocument(XMLLocator locator, String encoding, Augmentations augs)
|
||
- throws XNIException {
|
||
- startDocument(locator, encoding, null, augs);
|
||
- } // startDocument(XMLLocator,String,Augmentations)
|
||
-
|
||
- /** Start prefix mapping. */
|
||
- public void startPrefixMapping(String prefix, String uri, Augmentations augs)
|
||
- throws XNIException {
|
||
-
|
||
- // check for end of document
|
||
- if (fSeenRootElementEnd) {
|
||
- return;
|
||
- }
|
||
-
|
||
- // call handler
|
||
- if (fDocumentHandler != null) {
|
||
- XercesBridge.getInstance().XMLDocumentHandler_startPrefixMapping(fDocumentHandler, prefix, uri, augs);
|
||
- }
|
||
-
|
||
- } // startPrefixMapping(String,String,Augmentations)
|
||
-
|
||
- /** End prefix mapping. */
|
||
- public void endPrefixMapping(String prefix, Augmentations augs)
|
||
- throws XNIException {
|
||
-
|
||
- // check for end of document
|
||
- if (fSeenRootElementEnd) {
|
||
- return;
|
||
- }
|
||
-
|
||
- // call handler
|
||
- if (fDocumentHandler != null) {
|
||
- XercesBridge.getInstance().XMLDocumentHandler_endPrefixMapping(fDocumentHandler, prefix, augs);
|
||
- }
|
||
-
|
||
- } // endPrefixMapping(String,Augmentations)
|
||
-
|
||
- //
|
||
- // Protected methods
|
||
- //
|
||
-
|
||
- /** Returns an HTML element. */
|
||
- protected HTMLElements.Element getElement(final QName elementName) {
|
||
- String name = elementName.rawname;
|
||
- if (fNamespaces && NamespaceBinder.XHTML_1_0_URI.equals(elementName.uri)) {
|
||
- int index = name.indexOf(':');
|
||
- if (index != -1) {
|
||
- name = name.substring(index+1);
|
||
- }
|
||
- }
|
||
- return HTMLElements.getElement(name);
|
||
- } // getElement(String):HTMLElements.Element
|
||
-
|
||
- /** Call document handler start element. */
|
||
- protected final void callStartElement(QName element, XMLAttributes attrs,
|
||
- Augmentations augs)
|
||
- throws XNIException {
|
||
- fDocumentHandler.startElement(element, attrs, augs);
|
||
- } // callStartElement(QName,XMLAttributes,Augmentations)
|
||
-
|
||
- /** Call document handler end element. */
|
||
- protected final void callEndElement(QName element, Augmentations augs)
|
||
- throws XNIException {
|
||
- fDocumentHandler.endElement(element, augs);
|
||
- } // callEndElement(QName,Augmentations)
|
||
-
|
||
- /**
|
||
- * Returns the depth of the open tag associated with the specified
|
||
- * element name or -1 if no matching element is found.
|
||
- *
|
||
- * @param element The element.
|
||
- */
|
||
- protected final int getElementDepth(HTMLElements.Element element) {
|
||
- final boolean container = element.isContainer();
|
||
- int depth = -1;
|
||
- for (int i = fElementStack.top - 1; i >=fragmentContextStackSize_; i--) {
|
||
- Info info = fElementStack.data[i];
|
||
- if (info.element.code == element.code) {
|
||
- depth = fElementStack.top - i;
|
||
- break;
|
||
- }
|
||
- if (!container && (element.nestable && info.element.isBlock())) {
|
||
- break;
|
||
- }
|
||
- }
|
||
- return depth;
|
||
- } // getElementDepth(HTMLElements.Element)
|
||
-
|
||
- /**
|
||
- * Returns the depth of the open tag associated with the specified
|
||
- * element parent names or -1 if no matching element is found.
|
||
- *
|
||
- * @param parents The parent elements.
|
||
- */
|
||
- protected int getParentDepth(HTMLElements.Element[] parents, short bounds) {
|
||
- if (parents != null) {
|
||
- for (int i = fElementStack.top - 1; i >= 0; i--) {
|
||
- Info info = fElementStack.data[i];
|
||
- if (info.element.code == bounds) {
|
||
- break;
|
||
- }
|
||
- for (int j = 0; j < parents.length; j++) {
|
||
- if (info.element.code == parents[j].code) {
|
||
- return fElementStack.top - i;
|
||
- }
|
||
- }
|
||
- }
|
||
- }
|
||
- return -1;
|
||
- } // getParentDepth(HTMLElements.Element[],short):int
|
||
-
|
||
- /** Returns a set of empty attributes. */
|
||
- protected final XMLAttributes emptyAttributes() {
|
||
- fEmptyAttrs.removeAllAttributes();
|
||
- return fEmptyAttrs;
|
||
- } // emptyAttributes():XMLAttributes
|
||
-
|
||
- /** Returns an augmentations object with a synthesized item added. */
|
||
- protected final Augmentations synthesizedAugs() {
|
||
- HTMLAugmentations augs = null;
|
||
- if (fAugmentations) {
|
||
- augs = fInfosetAugs;
|
||
- augs.removeAllItems();
|
||
- augs.putItem(AUGMENTATIONS, SYNTHESIZED_ITEM);
|
||
- }
|
||
- return augs;
|
||
- } // synthesizedAugs():Augmentations
|
||
-
|
||
- //
|
||
- // Protected static methods
|
||
- //
|
||
-
|
||
- /** Modifies the given name based on the specified mode. */
|
||
- protected static final String modifyName(String name, short mode) {
|
||
- switch (mode) {
|
||
- case NAMES_UPPERCASE: return name.toUpperCase();
|
||
- case NAMES_LOWERCASE: return name.toLowerCase();
|
||
- }
|
||
- return name;
|
||
- } // modifyName(String,short):String
|
||
-
|
||
- /**
|
||
- * Converts HTML names string value to constant value.
|
||
- *
|
||
- * @see #NAMES_NO_CHANGE
|
||
- * @see #NAMES_LOWERCASE
|
||
- * @see #NAMES_UPPERCASE
|
||
- */
|
||
- protected static final short getNamesValue(String value) {
|
||
- if (value.equals("lower")) {
|
||
- return NAMES_LOWERCASE;
|
||
- }
|
||
- if (value.equals("upper")) {
|
||
- return NAMES_UPPERCASE;
|
||
- }
|
||
- return NAMES_NO_CHANGE;
|
||
- } // getNamesValue(String):short
|
||
-
|
||
- //
|
||
- // Classes
|
||
- //
|
||
-
|
||
- /**
|
||
- * Element info for each start element. This information is used when
|
||
- * closing unbalanced inline elements. For example:
|
||
- * <pre>
|
||
- * <i>unbalanced <b>HTML</i> content</b>
|
||
- * </pre>
|
||
- * <p>
|
||
- * It seems that it is a waste of processing and memory to copy the
|
||
- * attributes for every start element even if there are no unbalanced
|
||
- * inline elements in the document. However, if the attributes are
|
||
- * <em>not</em> saved, then important attributes such as style
|
||
- * information would be lost.
|
||
- *
|
||
- * @author Andy Clark
|
||
- */
|
||
- public static class Info {
|
||
-
|
||
- //
|
||
- // Data
|
||
- //
|
||
-
|
||
- /** The element. */
|
||
- public HTMLElements.Element element;
|
||
-
|
||
- /** The element qualified name. */
|
||
- public QName qname;
|
||
-
|
||
- /** The element attributes. */
|
||
- public XMLAttributes attributes;
|
||
-
|
||
- //
|
||
- // Constructors
|
||
- //
|
||
-
|
||
- /**
|
||
- * Creates an element information object.
|
||
- * <p>
|
||
- * <strong>Note:</strong>
|
||
- * This constructor makes a copy of the element information.
|
||
- *
|
||
- * @param element The element qualified name.
|
||
- */
|
||
- public Info(HTMLElements.Element element, QName qname) {
|
||
- this(element, qname, null);
|
||
- } // <init>(HTMLElements.Element,QName)
|
||
-
|
||
- /**
|
||
- * Creates an element information object.
|
||
- * <p>
|
||
- * <strong>Note:</strong>
|
||
- * This constructor makes a copy of the element information.
|
||
- *
|
||
- * @param element The element qualified name.
|
||
- * @param attributes The element attributes.
|
||
- */
|
||
- public Info(HTMLElements.Element element,
|
||
- QName qname, XMLAttributes attributes) {
|
||
- this.element = element;
|
||
- this.qname = new QName(qname);
|
||
- if (attributes != null) {
|
||
- int length = attributes.getLength();
|
||
- if (length > 0) {
|
||
- QName aqname = new QName();
|
||
- XMLAttributes newattrs = new XMLAttributesImpl();
|
||
- for (int i = 0; i < length; i++) {
|
||
- attributes.getName(i, aqname);
|
||
- String type = attributes.getType(i);
|
||
- String value = attributes.getValue(i);
|
||
- String nonNormalizedValue = attributes.getNonNormalizedValue(i);
|
||
- boolean specified = attributes.isSpecified(i);
|
||
- newattrs.addAttribute(aqname, type, value);
|
||
- newattrs.setNonNormalizedValue(i, nonNormalizedValue);
|
||
- newattrs.setSpecified(i, specified);
|
||
- }
|
||
- this.attributes = newattrs;
|
||
- }
|
||
- }
|
||
- } // <init>(HTMLElements.Element,QName,XMLAttributes)
|
||
-
|
||
- /**
|
||
- * Simple representation to make debugging easier
|
||
- */
|
||
- public String toString() {
|
||
- return super.toString() + qname;
|
||
- }
|
||
- } // class Info
|
||
-
|
||
- /** Unsynchronized stack of element information. */
|
||
- public static class InfoStack {
|
||
-
|
||
- //
|
||
- // Data
|
||
- //
|
||
-
|
||
- /** The top of the stack. */
|
||
- public int top;
|
||
-
|
||
- /** The stack data. */
|
||
- public Info[] data = new Info[10];
|
||
-
|
||
- //
|
||
- // Public methods
|
||
- //
|
||
-
|
||
- /** Pushes element information onto the stack. */
|
||
- public void push(Info info) {
|
||
- if (top == data.length) {
|
||
- Info[] newarray = new Info[top + 10];
|
||
- System.arraycopy(data, 0, newarray, 0, top);
|
||
- data = newarray;
|
||
- }
|
||
- data[top++] = info;
|
||
- } // push(Info)
|
||
-
|
||
- /** Peeks at the top of the stack. */
|
||
- public Info peek() {
|
||
- return data[top-1];
|
||
- } // peek():Info
|
||
-
|
||
- /** Pops the top item off of the stack. */
|
||
- public Info pop() {
|
||
- return data[--top];
|
||
- } // pop():Info
|
||
-
|
||
- /**
|
||
- * Simple representation to make debugging easier
|
||
- */
|
||
- public String toString() {
|
||
- final StringBuffer sb = new StringBuffer("InfoStack(");
|
||
- for (int i=top-1; i>=0; --i) {
|
||
- sb.append(data[i]);
|
||
- if (i != 0)
|
||
- sb.append(", ");
|
||
- }
|
||
- sb.append(")");
|
||
- return sb.toString();
|
||
- }
|
||
-
|
||
-
|
||
- } // class InfoStack
|
||
-
|
||
- void setTagBalancingListener(final HTMLTagBalancingListener tagBalancingListener) {
|
||
- this.tagBalancingListener = tagBalancingListener;
|
||
- }
|
||
-
|
||
- /**
|
||
- * Notifies the tagBalancingListener (if any) of an ignored start element
|
||
- */
|
||
- private void notifyDiscardedStartElement(final QName elem, final XMLAttributes attrs,
|
||
- final Augmentations augs) {
|
||
- if (tagBalancingListener != null)
|
||
- tagBalancingListener.ignoredStartElement(elem, attrs, augs);
|
||
- }
|
||
-
|
||
- /**
|
||
- * Notifies the tagBalancingListener (if any) of an ignored end element
|
||
- */
|
||
- private void notifyDiscardedEndElement(final QName element, final Augmentations augs) {
|
||
- if (tagBalancingListener != null)
|
||
- tagBalancingListener.ignoredEndElement(element, augs);
|
||
- }
|
||
-
|
||
- /**
|
||
- * Structure to hold information about an element placed in buffer to be comsumed later
|
||
- */
|
||
- static class ElementEntry {
|
||
- private final QName name_;
|
||
- private final Augmentations augs_;
|
||
- ElementEntry(final QName element, final Augmentations augs) {
|
||
- name_ = new QName(element);
|
||
- augs_ = (augs == null) ? null : new HTMLAugmentations(augs);
|
||
- }
|
||
- }
|
||
-} // class HTMLTagBalancer
|