updates to processing of Web pages:

1) adopted the code from Rational for HTML handling
    i) extended it to integrate better source mapping into IR
    ii) integrated fixes from the old HTML processing to collect more info on forms
    iii) added some copyright comments
2) updated version of Jericho
3) added support for nu.validator for HTML5
4) added script to fetch HTML parser jars, and removed binary jar
  

git-svn-id: https://wala.svn.sourceforge.net/svnroot/wala/trunk@4091 f5eafffb-2e1d-0410-98e4-8ec43c5233c4
This commit is contained in:
dolby-oss 2011-04-04 15:25:53 +00:00
parent cdd42a68b6
commit b0849e3ab6
3 changed files with 0 additions and 486 deletions

View File

@ -1,143 +0,0 @@
/******************************************************************************
* Copyright (c) 2002 - 2006 IBM Corporation.
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Public License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/legal/epl-v10.html
*
* Contributors:
* IBM Corporation - initial API and implementation
*****************************************************************************/
package com.ibm.wala.cast.js.util;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import com.ibm.wala.cast.js.html.IHtmlCallback;
import com.ibm.wala.cast.js.html.IHtmlParser;
import com.ibm.wala.cast.js.html.jericho.JerichoHtmlParser;
/**
 * Converts an HTML page into a single JavaScript "driver" file.
 *
 * <p>The page is parsed with a {@link JerichoHtmlParser}; the callback produced by the
 * configured {@link CallbackFactory} writes the DOM tree, the embedded scripts and the
 * entry points into three temporary files ({@link #temp1}, {@link #temp2}, {@link #temp3},
 * all relative to the current working directory). Those files are then concatenated, in a
 * fixed order, into the requested output file.
 */
public class Generator {
  public static final String preamble = "preamble.js", temp1 = "temp1.js", temp2 = "temp2.js", temp3 = "temp3.js";

  /** Creates the parser callback that receives the three temporary output files. */
  public static interface CallbackFactory {
    IHtmlCallback createCallback(URL input, FileWriter domTreeFile, FileWriter embeddedScriptFile, FileWriter entrypointFile);
  }

  /** Default factory; produces plain {@link HTMLCallback} instances. */
  public static class HTMLCallbackFactory implements CallbackFactory {
    public IHtmlCallback createCallback(URL input, FileWriter domTreeFile, FileWriter embeddedScriptFile, FileWriter entrypointFile) {
      return new HTMLCallback(input, domTreeFile, embeddedScriptFile, entrypointFile);
    }
  }

  public static final CallbackFactory defaultCallbackFactory = new HTMLCallbackFactory();

  private CallbackFactory callbackFactory;

  // Stored by the constructor; nothing in this class reads it.
  private boolean ignoreCharset;

  /**
   * @param ignoreCharset recorded only; not consulted by this class
   * @param factory factory used to create the parser callback for each generated page
   */
  public Generator(boolean ignoreCharset, CallbackFactory factory) {
    this.ignoreCharset = ignoreCharset;
    this.callbackFactory = factory;
  }

  /** Equivalent to {@code new Generator(true, defaultCallbackFactory)}. */
  public Generator() {
    this(true, defaultCallbackFactory);
  }

  /**
   * Opens the given URL for reading with connection caching switched off.
   * The reader uses the platform default charset.
   */
  private InputStreamReader getStream(URL url) throws IOException {
    URLConnection conn = url.openConnection();
    conn.setDefaultUseCaches(false);
    conn.setUseCaches(false);
    return new InputStreamReader(conn.getInputStream());
  }

  /**
   * Command-line entry point: {@code args[0]} is the input HTML file, {@code args[1]} the
   * output JavaScript file. Nothing is generated when the input file does not exist.
   */
  public static void main(String args[]) throws IOException {
    if (args.length < 2) {
      System.err.println("usage: Generator <input.html> <output.js>");
      return;
    }
    Generator g = new Generator();
    File inputFile = new File(args[0]);
    if (inputFile.exists()) {
      g.generate(inputFile.toURI().toURL(), new File(args[1]));
    }
  }

  /**
   * Copies the content of {@code tempFileName} line by line into {@code out}, bracketed by
   * "Region Begins"/"Region Ends" comment lines naming {@code region}.
   */
  private void writeRegion(FileWriter out, String region, String tempFileName) throws IOException {
    BufferedReader tempIn = new BufferedReader(new FileReader(tempFileName));
    try {
      out.write("// " + region + " Region Begins\n");
      String line;
      while ((line = tempIn.readLine()) != null) {
        out.write(line + "\n");
      }
      out.write("// " + region + " Region Ends\n\n\n");
    } finally {
      // Release the temp file handle even when writing to 'out' fails.
      tempIn.close();
    }
  }

  /**
   * Parses {@code input} and fills the three temporary files through the callback.
   * Every stream opened here is closed before returning, whether or not parsing succeeds.
   *
   * @return the callback that handled the parse, for use by {@link #generateTrailer}
   */
  private IHtmlCallback parseToTempFiles(URL input) throws IOException {
    InputStreamReader page = getStream(input);
    try {
      FileWriter domTree = new FileWriter(temp1);
      try {
        FileWriter scripts = new FileWriter(temp2);
        try {
          FileWriter entrypoints = new FileWriter(temp3);
          try {
            IHtmlParser parser = new JerichoHtmlParser();
            IHtmlCallback parseHandler = callbackFactory.createCallback(input, domTree, scripts, entrypoints);
            parser.parse(page, parseHandler, input.getFile());
            return parseHandler;
          } finally {
            // close() flushes, so no separate flush() is needed.
            entrypoints.close();
          }
        } finally {
          scripts.close();
        }
      } finally {
        domTree.close();
      }
    } finally {
      page.close();
    }
  }

  /**
   * Generates the JavaScript driver for the page at {@code input} and writes it to
   * {@code outFile}.
   *
   * <p>Output layout: the {@code document.URL} assignment, the embedded scripts, the DOM
   * tree construction code, an endless {@code while (true)} loop containing the entry
   * points, and finally the trailer.
   *
   * @param input URL of the HTML page to process
   * @param outFile file that receives the generated script (overwritten)
   * @throws IOException if the page cannot be read or any file cannot be written
   */
  public void generate(URL input, File outFile) throws IOException {
    IHtmlCallback parseHandler = parseToTempFiles(input);

    FileWriter out = new FileWriter(outFile);
    try {
      // No preamble is emitted; the 'preamble' constant is kept for callers that reference it.
      out.write("\n\ndocument.URL = new String('" + input + "');\n");
      out.write("note_url(document.URL);\n\n");
      writeRegion(out, "Embedded Script", temp2);
      writeRegion(out, "DOM Tree", temp1);
      out.write("while (true) {\n\n");
      writeRegion(out, "Entrypoints", temp3);
      out.write("\n}\n\n");
      generateTrailer(out, parseHandler);
    } finally {
      out.close();
    }
  }

  /**
   * Writes the trailer section of the driver. This implementation emits only the two
   * marker comments; subclasses may override it to add content.
   */
  protected void generateTrailer(FileWriter out, IHtmlCallback parseHandler) throws IOException {
    out.write("//Trailer Begin\n");
    out.write("//Trailer End\n");
  }
}

View File

@ -1,257 +0,0 @@
/******************************************************************************
* Copyright (c) 2002 - 2006 IBM Corporation.
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Public License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/legal/epl-v10.html
*
* Contributors:
* IBM Corporation - initial API and implementation
*****************************************************************************/
package com.ibm.wala.cast.js.util;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.Stack;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.ibm.wala.cast.js.html.IHtmlCallback;
import com.ibm.wala.cast.js.html.ITag;
import com.ibm.wala.util.collections.HashMapFactory;
import com.ibm.wala.util.collections.Pair;
import com.ibm.wala.util.debug.Assertions;
/**
 * Parser callback that translates HTML tags into JavaScript source text.
 *
 * <p>Three outputs are produced:
 * <ul>
 * <li>the DOM tree file receives one {@code make_<var>} constructor function per element,
 *     nested to mirror the tag nesting;</li>
 * <li>the embedded script file receives the body of every non-VB {@code SCRIPT} tag
 *     (fetched from its {@code src} when present, otherwise its inline text);</li>
 * <li>the entry point file receives a call for every {@code on*} attribute and the code of
 *     every attribute value beginning with {@code javascript:}.</li>
 * </ul>
 */
public class HTMLCallback implements IHtmlCallback {
  /** Matches control characters other than whitespace. */
  private static final Pattern CONTROL_CHARS = Pattern.compile("[\\p{Cntrl}&&[^\\p{Space}]]");

  private final URL input;
  private final FileWriter domTreeFile, embeddedScriptFile, entrypointFile;

  /** Source of unique suffixes for the generated "nodeN" variable names. */
  private int counter=0;

  /** Upper-cased tag name to JavaScript constructor name; unlisted tags use DOMHTMLElement. */
  private final HashMap<String, String> constructors = HashMapFactory.make();

  /** Variable names of the currently open elements, innermost on top. */
  protected final Stack<String> stack;

  /** Currently open FORM tags, innermost on top. */
  private final Stack<ITag> forms = new Stack<ITag>();

  /** (form, name) pairs whose radio-button array has already been declared. */
  private final Set<Pair<ITag,String>> sets = new HashSet<Pair<ITag,String>>();

  /**
   * @param input URL of the page being parsed; relative script {@code src} values are
   *        resolved against it
   * @param out destination for the DOM tree construction code
   * @param out2 destination for embedded script text
   * @param entrypointFile destination for event-handler invocations
   */
  public HTMLCallback(URL input, FileWriter out, FileWriter out2, FileWriter entrypointFile) {
    this.input = input;
    this.domTreeFile = out;
    this.embeddedScriptFile = out2;
    this.entrypointFile = entrypointFile;
    stack = new Stack<String>();
    constructors.put("FORM", "DOMHTMLFormElement");
    constructors.put("TABLE", "DOMHTMLTableElement");
  }

  /** Writes one space to the DOM tree file per currently open element. */
  private void indent() throws IOException {
    for(int i = 0; i < stack.size(); i++) {
      domTreeFile.write(" ");
    }
  }

  /** Writes the first {@code length} characters of {@code text} to the script file. */
  private void writeEmbeddedScript(char[] text, int length) throws IOException {
    embeddedScriptFile.write(text, 0, length);
  }

  /** Writes {@code text} to the script file with non-whitespace control characters blanked. */
  private void writeEmbeddedScript(String text) throws IOException {
    Matcher m = CONTROL_CHARS.matcher(text);
    embeddedScriptFile.write(m.replaceAll(" "));
  }

  /**
   * Downloads the script referenced by {@code src} (resolved against the page URL) and
   * appends it to the script file. Failures are reported on stdout and otherwise ignored.
   */
  private void copyExternalScript(String src) {
    try {
      URL scriptSrc = new URL(input, src);
      InputStreamReader scriptReader =
        new InputStreamReader(scriptSrc.openConnection().getInputStream());
      try {
        int read;
        char[] buffer = new char[ 1024 ];
        while ( (read = scriptReader.read(buffer)) != -1 ) {
          writeEmbeddedScript(buffer, read);
        }
        embeddedScriptFile.flush();
      } finally {
        // Close the connection's stream even when copying fails part-way.
        scriptReader.close();
      }
    } catch (IOException e) {
      System.out.println("bad input script " + src);
    }
  }

  /** Appends the inline body text of a script tag to the script file. */
  private void copyInlineScript(ITag tag) {
    String content = tag.getBodyText().snd;
    try {
      writeEmbeddedScript(content);
    } catch (IOException e) {
      System.err.println("Cannot write embedded script " + content);
    }
  }

  /**
   * Emits the code for one start tag: script content (for non-VB SCRIPT tags) and the
   * opening part of the element's constructor function.
   *
   * <p>An I/O failure while writing the DOM tree file terminates the JVM.
   *
   * @return the JavaScript variable name chosen for the element
   */
  protected String createElement(ITag tag) {
    if(tag.getName().equalsIgnoreCase("SCRIPT")) {
      String lang = tag.getAttributeByName("language");
      if (lang == null || lang.toUpperCase().indexOf("VB") < 0) {
        String src = tag.getAttributeByName("src");
        if (src != null) {
          // script is out-of-line
          copyExternalScript(src);
        } else {
          // script is inline
          copyInlineScript(tag);
        }
      }
    }

    String varName = getVarNameForTag(tag);
    String cons = constructors.get(tag.getName().toUpperCase());
    if(cons == null) cons = "DOMHTMLElement";

    try {
      writeElement(tag, cons, varName);
      domTreeFile.write("\n");
    } catch (IOException e) {
      System.out.println("Error writing to file");
      System.exit(1);
    }
    return varName;
  }

  /**
   * Chooses the variable name for a tag: the value of its first {@code id} attribute when
   * that value contains no '-', otherwise a fresh "nodeN" name.
   */
  private String getVarNameForTag(ITag tag) {
    String varName = null;
    for (Map.Entry<String, String> e : tag.getAllAttributes().entrySet()){
      String attr = e.getKey();
      String value = e.getValue();
      if (attr.equalsIgnoreCase("id")) {
        if (value.indexOf('-') == -1) {
          varName = value;
        }
        // Only the first id attribute is considered, usable or not.
        break;
      }
    }
    if (varName == null) {
      varName = "node" + (counter++);
    }
    return varName;
  }

  /** Returns the attribute under {@code upper}, falling back to {@code lower} when absent. */
  private static String attributeEitherCase(ITag tag, String upper, String lower) {
    String value = tag.getAttributeByName(upper);
    if (value == null) {
      value = tag.getAttributeByName(lower);
    }
    return value;
  }

  /**
   * Writes the opening of the {@code make_<varName>} constructor function: the call to the
   * DOM constructor, one statement per attribute, form bookkeeping for FORM and INPUT
   * tags, and registration of the node with its parent. The function is closed later,
   * when the matching end tag is handled.
   */
  protected void writeElement(ITag tag, String cons, String varName) throws IOException {
    indent(); domTreeFile.write("function make_" + varName + "(parent) {\n");
    indent(); domTreeFile.write(" this.temp = " + cons + ";\n");
    indent(); domTreeFile.write(" this.temp('" + tag.getName() + "');\n");

    for (Map.Entry<String, String> e : tag.getAllAttributes().entrySet()){
      String attr = e.getKey();
      String value = e.getValue();
      domTreeFile.write(" ");
      writeAttribute(tag, attr, value, "this", varName);
    }

    if (tag.getName().equalsIgnoreCase("FORM")) {
      forms.push(tag);
      indent(); domTreeFile.write(" var currentForm = this;\n");
    }
    if (tag.getName().equalsIgnoreCase("INPUT")) {
      String prop = attributeEitherCase(tag, "NAME", "name");
      String type = attributeEitherCase(tag, "TYPE", "type");

      if (type != null && prop != null) {
        if (type.equalsIgnoreCase("RADIO")) {
          // Radio buttons sharing a name are collected in an array on the form.
          // forms.peek() throws EmptyStackException when no FORM is open, so a radio
          // button outside any form is left unregistered.
          if (! forms.isEmpty()) {
            Pair<ITag,String> group = Pair.make(forms.peek(), prop);
            if (sets.add(group)) {
              // First button of this group within this form: declare the array.
              indent(); domTreeFile.write(" currentForm." + prop + " = new Array();\n");
              indent(); domTreeFile.write(" currentForm." + prop + "Counter = 0;\n");
            }
            indent(); domTreeFile.write(" currentForm." + prop + "[currentForm." + prop + "Counter++] = this;\n");
          }
        } else {
          indent(); domTreeFile.write(" currentForm." + prop + " = this;\n");
        }
      }
    }

    indent(); domTreeFile.write(" " + varName + " = this;\n");
    indent(); domTreeFile.write(" dom_nodes." + varName + " = this;\n");
    indent(); domTreeFile.write(" parent.appendChild(this);\n");
  }

  /**
   * Writes the code for a single attribute: portlet handling first, then event/plain
   * attribute handling.
   *
   * @param varName JavaScript expression denoting the element inside its constructor
   * @param varName2 the element's global variable name
   */
  protected void writeAttribute(ITag tag, String attr, String value, String varName, String varName2) throws IOException {
    writePortletAttribute(tag, attr, value, varName);
    writeEventAttribute(tag, attr, value, varName, varName2);
  }

  /**
   * Attributes whose name starts with "on" become handler functions and get an
   * invocation in the entry point file; every other attribute with a non-null value
   * becomes a property assignment with its value written as a single-quoted string.
   */
  protected void writeEventAttribute(ITag tag, String attr, String value, String varName, String varName2) throws IOException {
    // startsWith is safe for names shorter than two characters, unlike substring(0,2).
    if(attr.startsWith("on")) {
      indent(); domTreeFile.write(varName + "." + attr + " = function " + attr + "_" + varName2 + "(event) {" + value + "};\n");
      entrypointFile.write("\n\n " + varName2 + "." + attr + "(null);\n\n");
    } else if (value != null) {
      // Escape unconditionally so a quote or newline in first position is handled too.
      value = value.replace("'", "\\'");
      value = value.replace("\n", "\\n");

      // Names written entirely in upper case are normalised to lower case.
      if (attr.equals(attr.toUpperCase())) {
        attr = attr.toLowerCase();
      }

      indent(); domTreeFile.write(varName + "['" + attr + "'] = '" + value + "';\n");
    }
  }

  /**
   * For a {@code portletid} attribute whose value ends in "vice" or "root", writes a small
   * function that captures the element as {@code contextVice} / {@code contextRoot}.
   * Other attributes, and null values, are ignored.
   */
  protected void writePortletAttribute(ITag tag, String attr, String value, String varName) throws IOException {
    if(attr.equals("portletid") && value != null) {
      // endsWith copes with values shorter than four characters.
      if(value.endsWith("vice")) {
        indent(); domTreeFile.write("\n\nfunction cVice() { var contextVice = " + varName + "; }\ncVice();\n\n");
      } else if(value.endsWith("root")) {
        indent(); domTreeFile.write("\n\nfunction cRoot() { var contextRoot = " + varName + "; }\ncRoot();\n\n");
      }
    }
  }

  /**
   * Closes the constructor function of the element named {@code name} and writes the
   * {@code new make_<name>(...)} call that instantiates it. The parent is {@code document}
   * when no element remains open, otherwise {@code this}. An I/O failure terminates the JVM.
   */
  private void endElement(String name) {
    try {
      indent(); domTreeFile.write("};\n");
      indent();
      if (stack.isEmpty()) {
        domTreeFile.write("new make_" + name + "(document);\n\n\n");
      } else {
        domTreeFile.write("new make_" + name + "(this);\n\n");
      }
    } catch (IOException e) {
      System.exit(-1);
    }
  }

  /**
   * Handles an end tag: closes the innermost open element, leaves the current form when
   * the tag is a FORM, and copies the code of any {@code javascript:} attribute value to
   * the entry point file.
   */
  public void handleEndTag(ITag tag) {
    endElement(stack.pop());
    if (tag.getName().equalsIgnoreCase("FORM")) {
      forms.pop();
    }
    for(String v : tag.getAllAttributes().values()) {
      if (v != null && v.startsWith("javascript:")) {
        try {
          // Strip the "javascript:" prefix.
          entrypointFile.write( v.substring("javascript:".length()) );
        } catch (IOException e) {
          Assertions.UNREACHABLE(e.toString());
        }
      }
    }
  }

  /** Handles a start tag: emits the element and records it as the innermost open element. */
  public void handleStartTag(ITag tag) {
    String varName = createElement(tag);
    stack.push(varName);
  }
}

View File

@ -1,86 +0,0 @@
/******************************************************************************
* Copyright (c) 2002 - 2006 IBM Corporation.
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Public License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/legal/epl-v10.html
*
* Contributors:
* IBM Corporation - initial API and implementation
*****************************************************************************/
package com.ibm.wala.cast.js.util;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import com.ibm.wala.classLoader.SourceFileModule;
import com.ibm.wala.util.debug.Assertions;
/**
 * Helpers that turn an HTML page into a generated JavaScript driver file placed in the
 * directory named by the {@code java.io.tmpdir} system property.
 */
public class WebUtil {
  /** Temp directory path, always ending with the platform file separator. */
  private static final String outputDir;

  private static final Generator defaultGenerator = new Generator();

  static {
    String dir = System.getProperty("java.io.tmpdir");
    if (! dir.endsWith(File.separator))
      dir = dir + File.separator;
    outputDir = dir;
  }

  /**
   * Extracts the script of the page named by {@code url} using the default generator.
   * The string is prefixed with "file://" unless it already starts with it.
   */
  public static SourceFileModule extractScriptFromHTML(String url) {
    try {
      if (! url.startsWith("file://")) {
        url = "file://" + url;
      }
      return extractScriptFromHTML(new URL(url), defaultGenerator);
    } catch (MalformedURLException e) {
      Assertions.UNREACHABLE( e.toString() );
      return null;
    }
  }

  /** Extracts the script of the page at {@code url} using the default generator. */
  public static SourceFileModule extractScriptFromHTML(URL url) {
    return extractScriptFromHTML(url, defaultGenerator);
  }

  /**
   * Derives a plain file name (no leading '/') for the driver generated from {@code url}:
   * the text after the last '/' of the URL's file part when that slash is not the first
   * character and something follows it; otherwise the host name followed by ".html".
   */
  private static String urlName(URL url) {
    String urlFile = url.getFile();
    int lastSlash = urlFile.lastIndexOf('/');
    // A trailing slash would give an empty name, which resolves to the temp directory
    // itself, so that case uses the host-based fallback as well.
    if (lastSlash > 0 && lastSlash < urlFile.length() - 1) {
      return urlFile.substring(lastSlash + 1);
    }
    return url.getHost() + ".html";
  }

  /**
   * Generates the driver file for {@code url} in the temp directory, replacing any
   * existing file of the same name.
   *
   * @return the generated file
   */
  public static File extractScriptFileFromHTML(URL url, Generator generator) {
    try {
      File driverFile = new File(outputDir + urlName(url));
      System.err.println(("making driver at " + driverFile + " " + outputDir));
      if (driverFile.exists()) driverFile.delete();
      generator.generate(url, driverFile);
      return driverFile;
    } catch (IOException e) {
      Assertions.UNREACHABLE("error processing " + url + ": " + e);
      return null;
    }
  }

  /**
   * Generates the driver file for {@code url} and wraps it in a {@link SourceFileModule}
   * named after the generated file.
   */
  public static SourceFileModule extractScriptFromHTML(URL url, Generator generator) {
    // urlName already has no leading slash, so it is used as the module name unchanged;
    // stripping a character here would truncate the host-based fallback name.
    String moduleName = urlName(url);
    File driverFile = extractScriptFileFromHTML(url, generator);
    return new SourceFileModule(driverFile, moduleName);
  }

  /** Command-line entry point: {@code args[0]} is the URL of the page to process. */
  public static void main(String[] args) throws MalformedURLException {
    if (args.length < 1) {
      System.err.println("usage: WebUtil <url>");
      return;
    }
    System.err.println(extractScriptFromHTML(new URL(args[0]), defaultGenerator));
  }
}