// Copyright 2000-2005 the Contributors, as shown in the revision logs.
// Licensed under the GNU General Public License version 2 ("the License").
// You may not use this file except in compliance with the License.

package org.ibex.graphics;

import java.util.*;
import java.net.*;
import java.io.*;
import org.ibex.js.*;
import org.ibex.util.*;

/*
 * While entities are limited to a subset of Unicode characters,
 * numeric character references can specify any character. Numeric
 * character references may be given in decimal or hexadecimal, though
 * browser support is stronger for decimal references. Decimal
 * references are of the form &#number; while hexadecimal references
 * take the case-insensitive form &#xnumber;. Examples of numeric
 * character references include &#169; or &#xA9; for the copyright
 * symbol, &#913; or &#x391; for the Greek capital letter alpha, and
 * &#1575; or &#x627; for the Arabic letter ALEF.
 *
 * http://www.htmlhelp.com/reference/html40/entities/special.html
 * http://www.htmlhelp.com/reference/html40/entities/symbols.html
 * http://www.htmlhelp.com/reference/html40/entities/latin1.html
 */

/**
 *   This class parses an InputStream containing HTML and returns it
 *   as an XWT DOM tree. Each HTML Element is returned as a struct,
 *   with the following members:
 *
 *   Since HTML may have multiple top level elements (unlike XML),
 *   this class will search all top level elements for one with a tag
 *   name 'html'. If such a node is found, only it is returned. If no
 *   top-level element has the tag name 'html', such a node is
 *   fabricated, and all top level elements become the children of
 *   that node, which is then returned.
*/ public class HTML { private final static String[] noEndTag = new String[] { "area", "base", "basefont", "br", "col", "frame", "hr", "img", "input", "isindex", "link", "meta", "param" }; /** we keep a char[] around for use by removeRedundantWhitespace() */ private static char[] cbuf = null; /** we keep a StringBuffer around for use by removeRedundantWhitespace() */ private static StringBuffer sbuf = null; /** true iff we have encountered an LI more recently than the last OL/UL */ private static boolean withinLI = false; // FEATURE: This is ugly private static class JS extends org.ibex.js.JS.Obj { public void put(String key, Object value) throws JSExn { if(value instanceof String) put(JSU.S(key),JSU.S((String)value)); else if(value instanceof Number) put(JSU.S(key), JSU.N((Number)value)); else if(value == null) put(JSU.S(key), (JS)null); else throw new Error("FIXME"); } public Object _get(String key) throws JSExn { org.ibex.js.JS js = get(JSU.S(key)); if(JSU.isInt(js)) return new Integer(JSU.toInt(js)); return JSU.toString(js); } } public static synchronized JS parseReader(Reader r) throws IOException, JSExn { CharStream cs = new CharStream(r); JS h = new JS(); withinLI = false; h.put("$name", "html"); try { while (true) parseBody(cs, h, null); } catch (EOFException e) { // continue until we get an EOFException } /* FIXME Object[] ids = h.keys(); for(int i=0; ih. The * CharStream should be positioned immediately after the * open bracket. * * If a close tag not matching this open tag is found, the * tagname on the close tag will be returned in order to * facilitate correcting broken HTML. Otherwise, this returns * null. 
*/ private static String parseElement(CharStream cs, JS h) throws IOException, JSExn { // scan element name while(Character.isSpace(cs.peek())) cs.get(); String elementName = parseElementName(cs); boolean saveWithinLI = withinLI; if (elementName.equals("li")) { if (withinLI) { cs.unread(new char[] { '<', 'l', 'i', ' ' }); return "li"; } else { withinLI = true; } } else if (elementName.equals("ol") || elementName.equals("ul")) { withinLI = false; } h.put("$name", elementName); if (elementName.equals("!--")) { h.put("0", parseComment(cs)); h.put("$numchildren", new Integer(0)); return null; } // scan attributes while (cs.peek() != '>') { String name = parseAttributeName(cs); if (name.equals("")) break; String value = expandEntities(parseAttributeValue(cs)); h.put(name, value); } // eat the close-angle bracket cs.get(); // bodyless tags return here for(int i=0; i 0) { h.put(String.valueOf(length), expanded); h.put("$numchildren", new Integer(++length)); } cdata = ""; } catch (EOFException e) { String expanded = removeRedundantWhitespace(expandEntities(cdata)); if (expanded.length() > 0) { h.put(String.valueOf(length), expanded); h.put("$numchildren", new Integer(++length)); } throw e; } try { // scan subelement if (cs.peek() != '/') { JS kid = new JS(); closetag = parseElement(cs, kid); h.put(String.valueOf(length), kid); h.put("$numchildren", new Integer(++length)); // scan close-tag } else { cs.get(); // drop the slash closetag = parseElementName(cs); while(cs.get() != '>'); } } catch (EOFException e) { throw e; } if (closetag != null) return closetag.equals(elementName) ? null : closetag; } } /** Parses an element name and returns it. The CharStream should * be positioned at the first character of the name. */ private static String parseElementName(CharStream cs) throws IOException, JSExn { String ret = ""; while (cs.peek() != '>' && !Character.isSpace(cs.peek())) ret += cs.get(); return ret.toLowerCase(); } /** Parses an attribute name and returns it. 
The CharStream should * be positioned at the first character of the name, possibly * with intervening whitespace. */ private static String parseAttributeName(CharStream cs) throws IOException, JSExn { while(Character.isSpace(cs.peek())) cs.get(); String ret = ""; while(!Character.isSpace(cs.peek()) && cs.peek() != '=' && cs.peek() != '>') ret += cs.get(); return ret.toLowerCase(); } /** Parses an attribute value and returns it. The CharStream * should be positioned at the equals sign, possibly with * intervening whitespace. */ private static String parseAttributeValue(CharStream cs) throws IOException, JSExn { // eat whitespace and equals sign while(Character.isSpace(cs.peek())) cs.get(); if (cs.peek() != '=') return ""; cs.get(); while(Character.isSpace(cs.peek())) cs.get(); boolean doublequoted = false; boolean singlequoted = false; String ret = ""; if (cs.peek() == '\"') { doublequoted = true; cs.get(); } else if (cs.peek() == '\'') { singlequoted = true; cs.get(); } while(true) { char c = cs.peek(); if (!doublequoted && !singlequoted && (Character.isSpace(c) || c == '>')) break; if (singlequoted && c == '\'') { cs.get(); break; } if (doublequoted && c == '\"') { cs.get(); break; } ret += cs.get(); } return ret; } /** Parses a comment and returns its body. The CharStream should * be positioned immediately after the <!-- */ private static String parseComment(CharStream cs) throws IOException, JSExn { int dashes = 0; String ret = ""; while(true) { char c = cs.get(); if (c == '>' && dashes == 2) return ret.substring(0, ret.length() - 2); if (c == '-') dashes++; else dashes = 0; ret += c; } } /** Expands all SGML entities in string s */ public static String expandEntities(String s) throws IOException, JSExn { if (s.indexOf('&') == -1) return s; StringBuffer sb = new StringBuffer(); int i=0; int nextamp = 0; while(nextamp != -1) { nextamp = s.indexOf('&', i); sb.append(nextamp == -1 ? 
s.substring(i) : s.substring(i, nextamp)); if (nextamp == -1) break; if (s.regionMatches(nextamp, "&", 0, 5)) { sb.append("&"); i = nextamp + 5; } else if (s.regionMatches(nextamp, ">", 0, 4)) { sb.append(">"); i = nextamp + 4; } else if (s.regionMatches(nextamp, "<", 0, 4)) { sb.append("<"); i = nextamp + 4; } else if (s.regionMatches(nextamp, """, 0, 6)) { sb.append("\""); i = nextamp + 6; } else if (s.regionMatches(nextamp, " ", 0, 6)) { // FEATURE: perhaps we should distinguish this somehow sb.append(" "); i = nextamp + 6; } else { sb.append("&"); i = nextamp + 1; } } return sb.toString(); } /** removes all redundant whitespace */ private static String removeRedundantWhitespace(String s) throws JSExn { if (s.indexOf(' ') == -1 && s.indexOf('\n') == -1 && s.indexOf('\t') == -1 && s.indexOf('\r') == -1) return s; int len = s.length(); if (cbuf == null || cbuf.length < len) { cbuf = new char[len * 2]; sbuf = new StringBuffer(len * 2); } sbuf.setLength(0); s.getChars(0, len, cbuf, 0); int last = 0; boolean lastWasWhitespace = false; for(int i=0; i