diff --git a/ExtensionCrawler/db.py b/ExtensionCrawler/db.py index 41d5794..0ff6bca 100644 --- a/ExtensionCrawler/db.py +++ b/ExtensionCrawler/db.py @@ -30,6 +30,7 @@ import os import glob import datetime import hashlib +from jsmin import jsmin def get_etag(ext_id, datepath, con): @@ -214,16 +215,6 @@ def parse_and_insert_crx(ext_id, datepath, con): # Trying a different encoding, manifests are weird... content = raw_content.decode("latin1") - # Attempt to remove JavaScript-style comments from json - comment_regex = re.compile(r'\s*//.*') - multiline_comment_regex = re.compile(r'\s*/\\*.*\\*/\s*') - lines = content.splitlines() - for index, line in enumerate(lines): - if comment_regex.fullmatch( - line) or multiline_comment_regex.fullmatch(line): - lines[index] = "" - content = "\n".join(lines) - con.insert( "crx", crx_etag=etag, @@ -232,7 +223,7 @@ def parse_and_insert_crx(ext_id, datepath, con): manifest=content, publickey=public_key) - manifest = json.loads(content, strict=False) + manifest = json.loads(jsmin(content), strict=False) if "permissions" in manifest: for permission in manifest["permissions"]: con.insert( diff --git a/ExtensionCrawler/file_identifiers.py b/ExtensionCrawler/file_identifiers.py index 31e0262..d6b8828 100644 --- a/ExtensionCrawler/file_identifiers.py +++ b/ExtensionCrawler/file_identifiers.py @@ -62,6 +62,9 @@ def get_features(s): def get_simhash(encoding, data): """Compute simhash of text.""" if encoding is not None: + # VISCII is not supported by Python; UTF-8 decodes at least the parts that are important for us + if encoding == "VISCII": + encoding = "UTF-8" str_data = data.decode(encoding=encoding, errors="replace") else: str_data = str(data) diff --git a/requirements.txt b/requirements.txt index 109177f..0512212 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,3 +13,4 @@ GitPython==2.1.5 python_magic==0.4.13 jsbeautifier==1.7.3 pebble==4.3.7 +jsmin==2.2.2 \ No newline at end of file