1) handle files with byte order marks

2) ignore code in comment blocks
2014-05-28 13:33:49 -04:00 · 2014-05-28 13:33:49 -04:00 · 50373e122d
parent a5a80e7848
commit 50373e122d
1 changed files with 18 additions and 8 deletions
--- a/com.ibm.wala.cast.js/source/com/ibm/wala/cast/js/html/DomLessSourceExtractor.java
+++ b/com.ibm.wala.cast.js/source/com/ibm/wala/cast/js/html/DomLessSourceExtractor.java
@ -25,6 +25,9 @@ import java.util.Map.Entry;
 import java.util.Set;
 import java.util.regex.Pattern;

+import org.apache.commons.io.ByteOrderMark;
+import org.apache.commons.io.input.BOMInputStream;
+
 import com.ibm.wala.cast.ir.translator.TranslatorToCAst.Error;
 import com.ibm.wala.cast.js.html.jericho.JerichoHtmlParser;
 import com.ibm.wala.cast.tree.CAstSourcePositionMap.Position;
@ -55,6 +58,7 @@ public class DomLessSourceExtractor extends JSSourceExtractor {
    protected final SourceRegion entrypointRegion;
    
    private ITag currentScriptTag;
+    private ITag currentCommentTag;
    
    private int nodeCounter = 0;
    private int scriptNodeCounter = 0;
@ -86,12 +90,15 @@ public class DomLessSourceExtractor extends JSSourceExtractor {
      if (tag.getName().equalsIgnoreCase("script")) {
        assert currentScriptTag != null;
        currentScriptTag = null;
+      } else if (currentScriptTag != null && tag.getName().equals("!--")) {
+        assert currentCommentTag != null;
+        currentCommentTag = null;
      }
    }

    @Override
    public void handleText(Position p, String text) {
-      if (currentScriptTag != null) {
+      if (currentScriptTag != null && currentCommentTag == null) {
        if (text.startsWith("<![CDATA[")) {
         assert text.endsWith("]]>");
         text = text.substring(9, text.length()-11);
@ -116,6 +123,8 @@ public class DomLessSourceExtractor extends JSSourceExtractor {
        assert currentScriptTag == null;
        currentScriptTag = tag;
        scriptNodeCounter++;
+      } else if (currentScriptTag != null && tag.getName().equals("!--")){
+        currentCommentTag = tag;
      }
      handleDOM(tag);
    }
@ -226,16 +235,17 @@ public class DomLessSourceExtractor extends JSSourceExtractor {
    }

    private void getScriptFromUrl(String urlAsString, ITag scriptTag) throws IOException, MalformedURLException {
-//      URL absoluteUrl = UrlManipulator.relativeToAbsoluteUrl(urlAsString, this.entrypointUrl);
-//      URL scriptSrc = urlResolver.resolve(absoluteUrl);
      URL scriptSrc = new URL(entrypointUrl, urlAsString);
-      if (scriptSrc == null) { //Error resolving URL
-        return;
-      }
-
      Reader scriptInputStream;
      try {
-         scriptInputStream = new InputStreamReader(scriptSrc.openConnection().getInputStream());
+        BOMInputStream bs = new BOMInputStream(scriptSrc.openConnection().getInputStream(), false, 
+            ByteOrderMark.UTF_8, 
+            ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE,
+            ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE);
+        if (bs.hasBOM()) {
+          System.err.println("removing BOM " + bs.getBOM());
+        }
+        scriptInputStream = new InputStreamReader(bs);
      } catch (Exception e) {
        //it looks like this happens when we can't resolve the url?
        if (DEBUG) {