From 26e50670420bac3fca5c82c16ea726c38802049a Mon Sep 17 00:00:00 2001 From: Michael Herzberg Date: Wed, 15 May 2019 21:59:59 +0100 Subject: [PATCH 1/7] Added option to handle more than one extension per sharc job. --- sge/create-db.sh | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/sge/create-db.sh b/sge/create-db.sh index e1d06d5..9ffce44 100755 --- a/sge/create-db.sh +++ b/sge/create-db.sh @@ -17,9 +17,10 @@ usage() { echo " -s \"\" (add qsub arguments, default: ${SGE_EXTRA_ARGS})" echo " -p \"\" (add python script arguments, default: ${PY_EXTRA_ARGS})" echo " -e (set path to extension id list, default: crawl from archive)" + echo " -l (limit number of sharc tasks, default: number of extensions)" } -while getopts ":a:t:s:p:m:e:" o; do +while getopts ":a:t:s:p:m:e:l:" o; do case "${o}" in a) REMOTE_ARCHIVE=${OPTARG} @@ -39,6 +40,9 @@ while getopts ":a:t:s:p:m:e:" o; do e) EXTENSION_IDS="${OPTARG}" ;; + l) + MAX_TASKS="${OPTARG}" + ;; *) usage exit 1 @@ -59,13 +63,19 @@ echo "Pushing sge script ..." scp "$BASEDIR/sge/create-db.sge" sharc.shef.ac.uk:"$TARGETDIR/create-db.sge" echo "Building image..." -if [ -f "$BASEDIR/singularity/create-db.img" ]; then - rm -f "$BASEDIR/singularity/create-db.img" +if [ -f "$BASEDIR/scripts/singularity/create-db.img" ]; then + rm -f "$BASEDIR/scripts/singularity/create-db.img" fi -sudo singularity build "$BASEDIR/singularity/create-db.img" "$BASEDIR/singularity/ExtensionCrawler-dev.def" +( + cd "$BASEDIR/scripts/singularity" + if [[ "$(docker images -q singularitybuilder-arch 2> /dev/null)" == "" ]]; then + docker build --tag=singularitybuilder -f singularitybuilder-arch.Dockerfile . + fi + docker run -it -v "$(pwd):$(pwd)" -w "$(pwd)" --privileged singularitybuilder-arch:latest singularity build create-db.img ExtensionCrawler.def +) echo "Pushing image..." -scp "$BASEDIR/singularity/create-db.img" sharc.shef.ac.uk:"$TARGETDIR/create-db.img" +scp "$BASEDIR/scripts/singularity/create-db.img" sharc.shef.ac.uk:"$TARGETDIR/create-db.img" if [[ -z $EXTENSION_IDS ]]; then @@ -86,8 +96,12 @@ fi echo "Pushing extension IDs..." scp ${TEMP_FOLDER}/extension.ids sharc.shef.ac.uk:$TARGETDIR/ -NO_BATCH_JOBS=$(((NO_IDS+1)/75000+1)) -JOBS_PER_BATCH=$((NO_IDS/NO_BATCH_JOBS+1)) +if [[ ! -v MAX_TASKS ]]; then + MAX_TASKS=NO_IDS +fi + +NO_BATCH_JOBS=$(((MAX_TASKS+1)/75000+1)) +JOBS_PER_BATCH=$((MAX_TASKS/NO_BATCH_JOBS+1)) for run_no in $(seq 1 $NO_BATCH_JOBS); do FIRST_ID=$(((run_no-1) * $JOBS_PER_BATCH + 1)) @@ -100,5 +114,5 @@ for run_no in $(seq 1 $NO_BATCH_JOBS); do -wd "$TARGETDIR" \ -o "$TARGETDIR/logs" \ ${SGE_EXTRA_ARGS} \ - "$TARGETDIR/create-db.sge" -a "$REMOTE_ARCHIVE" -e "${TARGETDIR}/extension.ids" -N $NO_IDS ${PY_EXTRA_ARGS}) + "$TARGETDIR/create-db.sge" -a "$REMOTE_ARCHIVE" -e "${TARGETDIR}/extension.ids" -N $MAX_TASKS ${PY_EXTRA_ARGS}) done From ecb0734505768ed02eb6dc1af10e44a0b7ee81ac Mon Sep 17 00:00:00 2001 From: Michael Herzberg Date: Wed, 15 May 2019 22:00:40 +0100 Subject: [PATCH 2/7] Added pycharm folders to gitignore. --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 8192df9..756b5cd 100644 --- a/.gitignore +++ b/.gitignore @@ -68,3 +68,6 @@ archive .ropeproject ExtensionCrawler.img ExtensionCrawler-cdnjs.img + +.idea +venv From 15f478ee78f01a62468b7515eabf9bb0fc4e4fd2 Mon Sep 17 00:00:00 2001 From: Michael Herzberg Date: Wed, 15 May 2019 22:45:52 +0100 Subject: [PATCH 3/7] Use ast parser to parse ETag. --- PermissionAnalysis/grep-unused-permissions | 9 ++++----- extgrep | 9 ++++----- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/PermissionAnalysis/grep-unused-permissions b/PermissionAnalysis/grep-unused-permissions index 582f83e..3e7d749 100644 --- a/PermissionAnalysis/grep-unused-permissions +++ b/PermissionAnalysis/grep-unused-permissions @@ -25,6 +25,7 @@ import json import sys import csv from jsmin import jsmin +import ast from zipfile import ZipFile @@ -34,11 +35,9 @@ from ExtensionCrawler.js_mincer import mince_js def get_etag(headers_content): - headers_content = headers_content.replace( - '"', '\\"').replace("'", '"') - headers_json = json.loads(headers_content) - if "ETag" in headers_json: - return headers_json["ETag"] + d = ast.literal_eval(headers_content) + if "ETag" in d: + return d["ETag"] def get_name_and_version(overview_contents): diff --git a/extgrep b/extgrep index 9a54b4e..709eee8 100755 --- a/extgrep +++ b/extgrep @@ -26,6 +26,7 @@ import sys import importlib.util import csv import math +import ast from zipfile import ZipFile @@ -63,11 +64,9 @@ def import_regexs(path): def get_etag(headers_content): - headers_content = headers_content.replace( - '"', '\\"').replace("'", '"') - headers_json = json.loads(headers_content) - if "ETag" in headers_json: - return headers_json["ETag"] + d = ast.literal_eval(headers_content) + if "ETag" in d: + return d["ETag"] def get_name_and_version(overview_contents): From 67d14bb7fbb4b7f3b85d7f19d67ae5a43b206f30 Mon Sep 17 00:00:00 2001 From: Michael Herzberg Date: Wed, 15 May 2019 22:47:44 +0100 Subject: [PATCH 4/7] Use ast parser to parse ETag, also in db.py. --- ExtensionCrawler/db.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ExtensionCrawler/db.py b/ExtensionCrawler/db.py index 0ff6bca..351589d 100644 --- a/ExtensionCrawler/db.py +++ b/ExtensionCrawler/db.py @@ -31,6 +31,7 @@ import glob import datetime import hashlib from jsmin import jsmin +import ast def get_etag(ext_id, datepath, con): @@ -48,7 +49,7 @@ def get_etag(ext_id, datepath, con): with open(headerpath) as f: content = f.read() try: - headers = eval(content) + headers = ast.literal_eval(content) if "ETag" in headers: return headers["ETag"] except Exception: From 55af0df4144b55d6824967b70f0f1c91c9b81d20 Mon Sep 17 00:00:00 2001 From: Michael Herzberg Date: Fri, 17 May 2019 09:47:07 +0100 Subject: [PATCH 5/7] Don't crash grep on exceptions. --- PermissionAnalysis/grep-unused-permissions | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/PermissionAnalysis/grep-unused-permissions b/PermissionAnalysis/grep-unused-permissions index 3e7d749..337f67a 100644 --- a/PermissionAnalysis/grep-unused-permissions +++ b/PermissionAnalysis/grep-unused-permissions @@ -157,7 +157,10 @@ def main(conf): csvwriter.writerow(["extid", "still_in_store", "most_recent_crx_etag", "date", "crx_etag", "name", "version"] + sorted(list(permission_map.keys()))) for extid in [l.strip() for l in f.readlines()]: - handle_extid(conf, extid, permission_map, csvwriter) + try: + handle_extid(conf, extid, permission_map, csvwriter) + except Exception as e: + logging.exception(f"Fatal error when handling extension '{extid}'") def build_parser(): From 441c8b377911215425d65e8535da7e686ca408fd Mon Sep 17 00:00:00 2001 From: Michael Herzberg Date: Tue, 21 May 2019 22:29:05 +0100 Subject: [PATCH 6/7] Print downloads and versions, deleted permissions.json. --- PermissionAnalysis/grep-unused-permissions | 21 +- PermissionAnalysis/permissions.json | 524 --------------------- 2 files changed, 16 insertions(+), 529 deletions(-) delete mode 100644 PermissionAnalysis/permissions.json diff --git a/PermissionAnalysis/grep-unused-permissions b/PermissionAnalysis/grep-unused-permissions index 337f67a..017c6c4 100644 --- a/PermissionAnalysis/grep-unused-permissions +++ b/PermissionAnalysis/grep-unused-permissions @@ -40,7 +40,7 @@ def get_etag(headers_content): return d["ETag"] -def get_name_and_version(overview_contents): +def get_metadata(overview_contents): # Extract extension name match = re.search("""""", overview_contents) @@ -50,8 +50,19 @@ def get_name_and_version(overview_contents): match = re.search( """""", overview_contents) version = match.group(1) if match else None + + # Extracts extension categories + match = re.search( + """Attribute name="category">(.+?)""", overview_contents) + categories = match.group(1).split(",") if match else None - return name, version + # Extracts the number of downloads + match = re.search( + """.getAudioState", - ".setAudioMuted", - ".isAudioMuted", - ".captureVisibleRegion", - ".addContentScripts", - ".back", - ".canGoBack", - ".canGoForward", - ".clearData", - ".executeScript", - ".find", - ".forward", - ".getProcessId", - ".getUserAgent", - ".getZoom", - ".getZoomMode", - ".go", - ".insertCSS", - ".isUserAgentOverridden", - ".print", - ".reload", - ".removeContentScripts", - ".setUserAgentOverride", - ".setZoom", - ".setZoomMode", - ".stop", - ".stopFinding", - ".loadDataWithBaseUrl", - ".setSpatialNavigationEnabled", - ".isSpatialNavigationEnabled", - ".terminate" - ], - "system.network": [ - "chrome.system.network.getNetworkInterfaces" - ], - "hid": [ - "chrome.hid.getDevices", - "chrome.hid.getUserSelectedDevices", - "chrome.hid.connect", - "chrome.hid.disconnect", - "chrome.hid.receive", - "chrome.hid.send", - "chrome.hid.receiveFeatureReport", - "chrome.hid.sendFeatureReport" - ], - "enterprise.platformKeys": [ - "chrome.enterprise.platformKeys.getTokens", - "chrome.enterprise.platformKeys.getCertificates", - "chrome.enterprise.platformKeys.importCertificate", - "chrome.enterprise.platformKeys.removeCertificate", - "chrome.enterprise.platformKeys.challengeMachineKey", - "chrome.enterprise.platformKeys.challengeUserKey" - ], - "documentScan": [ - "chrome.documentScan.scan" - ], - "serial": [ - "chrome.serial.getDevices", - "chrome.serial.connect", - "chrome.serial.update", - "chrome.serial.disconnect", - "chrome.serial.setPaused", - "chrome.serial.getInfo", - "chrome.serial.getConnections", - "chrome.serial.send", - "chrome.serial.flush", - "chrome.serial.getControlSignals", - " chrome.serial.setControlSignals", - "chrome.serial.setBreak", - "chrome.serial.clearBreak" - ], - "pkcs11": [ - "pkcs11.getModuleSlots", - "pkcs11.installModule", - "pkcs11.isModuleInstalled", - "pkcs11.uninstallModule" - ], - "browsingData": [ - "chrome.browsingData.settings", - "chrome.browsingData.remove", - "chrome.browsingData.removeAppcache", - "chrome.browsingData.removeCache", - "chrome.browsingData.removeCacheStorage", - "chrome.browsingData.removeCookies", - "chrome.browsingData.removeDownloads", - "chrome.browsingData.removeFileSystems", - "chrome.browsingData.removeFormData", - "chrome.browsingData.removeHistory", - "chrome.browsingData.removeIndexedDB", - "chrome.browsingData.removeLocalStorage", - "chrome.browsingData.removePluginData", - "chrome.browsingData.removePasswords", - "chrome.browsingData.removeServiceWorkers", - "chrome.browsingData.removeWebSQL" - ], - "management": [ - "chrome.management.getAll", - "chrome.management.get", - "chrome.management.getSelf", - "chrome.management.getPermissionWarningsById", - "chrome.management.getPermissionWarningsByManifest", - "chrome.management.setEnabled", - "chrome.management.uninstall", - "chrome.management.uninstallSelf", - "chrome.management.launchApp", - "chrome.management.createAppShortcut", - "chrome.management.setLaunchType", - "chrome.management.generateAppForLink" - ], - "tabCapture": [ - "chrome.tabCapture.capture", - "chrome.tabCapture.getCapturedTabs", - "chrome.tabCapture.captureOffscreenTab", - "chrome.tabCapture.getMediaStreamId" - ], - "declarativeContent": [ - "chrome.declarativeContent" - ], - "privacy": [ - "chrome.privacy" - ], - "input": [ - "chrome.input.ime.setComposition", - "chrome.input.ime.clearComposition", - "chrome.input.ime.commitText", - "chrome.input.ime.sendKeyEvents", - "chrome.input.ime.hideInputView", - "chrome.input.ime.setCandidateWindowProperties", - "chrome.input.ime.setCandidates", - "chrome.input.ime.setCursorPosition", - "chrome.input.ime.setMenuItems", - "chrome.input.ime.updateMenuItems", - "chrome.input.ime.deleteSurroundingText", - "chrome.input.ime.keyEventHandled", - "chrome.input.ime.createWindow", - "chrome.input.ime.showWindow", - "chrome.input.ime.hideWindow", - "chrome.input.ime.activate", - "chrome.input.ime.deactivate" - ], - "fontSettings": [ - "chrome.fontSettings.clearFont", - "chrome.fontSettings.getFont", - "chrome.fontSettings.setFont", - "chrome.fontSettings.getFontList", - "chrome.fontSettings.clearDefaultFontSize", - "chrome.fontSettings.getDefaultFontSize", - "chrome.fontSettings.setDefaultFontSize", - "chrome.fontSettings.clearDefaultFixedFontSize", - "chrome.fontSettings.getDefaultFixedFontSize", - "chrome.fontSettings.setDefaultFixedFontSize", - "chrome.fontSettings.clearMinimumFontSize", - "chrome.fontSettings.getMinimumFontSize", - "chrome.fontSettings.setMinimumFontSize" - ], - "signedInDevices": [ - "chrome.signedInDevices.get" - ], - "clipboardRead": [ - "document.execCommand(paste)" - ], - "storage": [ - "chrome.storage" - ], - "vpnProvider": [ - "chrome.vpnProvider.createConfig", - "chrome.vpnProvider.destroyConfig", - "chrome.vpnProvider.setParameters", - "chrome.vpnProvider.sendPacket", - "chrome.vpnProvider.notifyConnectionStateChanged" - ], - "debugger": [ - "chrome.debugger.attach", - "chrome.debugger.detach", - "chrome.debugger.sendCommand", - "chrome.debugger.getTargets" - ], - "gcm": [ - "chrome.gcm.register", - "chrome.gcm.unregister", - "chrome.gcm.send", - "chrome.instanceID.getID", - "chrome.instanceID.getCreationTime", - "chrome.instanceID.getToken", - "chrome.instanceID.deleteToken", - "chrome.instanceID.deleteID" - ], - "webRequest": [ - "chrome.webRequest.handlerBehaviorChanged", - "chrome.webRequest" - ], - "tabs": [ - "chrome.tabs.getCurrent", - "chrome.tabs.connect", - "chrome.tabs.query", - "chrome.tabs.sendMessage", - "chrome.tabs.get", - "chrome.tabs.sendRequest", - "chrome.tabs.getSelected", - "chrome.tabs.getAllInWindow", - "chrome.tabs.create", - "chrome.tabs.duplicate", - "chrome.tabs.highlight", - "chrome.tabs.update", - "chrome.tabs.move", - "chrome.tabs.reload", - "chrome.tabs.remove", - "chrome.tabs.detectLanguage", - "chrome.tabs.captureVisibleTab", - "chrome.tabs.executeScript", - "chrome.tabs.insertCSS", - "chrome.tabs.setZoom", - "chrome.tabs.getZoom", - "chrome.tabs.setZoomSettings", - "chrome.tabs.getZoomSettings", - "chrome.tabs.discard", - "chrome.tabs.goForward", - "chrome.tabs.goBack" - ], - "syncFileSystem": [ - "chrome.syncFileSystem.requestFileSystem", - "chrome.syncFileSystem.setConflictResolutionPolicy", - "hrome.syncFileSystem.getConflictResolutionPolicy", - "chrome.syncFileSystem.getUsageAndQuota", - "chrome.syncFileSystem.getFileStatus", - "chrome.syncFileSystem.getFileStatuses", - "chrome.syncFileSystem.getServiceStatus" - ], - "virtualKeyboard": [ - "chrome.virtualKeyboard.restrictFeatures" - ], - "pageCapture": [ - "chrome.pageCapture.saveAsMHTML" - ], - "contextMenus": [ - "chrome.contextMenus.create", - "chrome.contextMenus.update", - "chrome.contextMenus.remove", - "chrome.contextMenus.removeAll" - ], - "cookies": [ - "chrome.cookies.get", - "chrome.cookies.getAll", - "chrome.cookies.set", - "chrome.cookies.remove", - "chrome.cookies.getAllCookieStores" - ], - "power": [ - "chrome.power.requestKeepAwake", - "chrome.power.releaseKeepAwake" - ], - "bookmarks": [ - "chrome.bookmarks.get", - "chrome.bookmarks.getChildren", - "chrome.bookmarks.getRecent", - "chrome.bookmarks.getTree", - "chrome.bookmarks.getSubTree", - "chrome.bookmarks.search", - "chrome.bookmarks.create", - "chrome.bookmarks.move", - "chrome.bookmarks.update", - "chrome.bookmarks.remove", - "chrome.bookmarks.removeTree" - ], - "fileSystemProvider": [ - "chrome.fileSystemProvider.mount", - "chrome.fileSystemProvider.unmount", - "chrome.fileSystemProvider.getAll", - "chrome.fileSystemProvider.get", - "chrome.fileSystemProvider.notify" - ], - "networking.onc": [ - "chrome.networking.onc.getProperties", - "chrome.networking.onc.getManagedProperties", - "chrome.networking.onc.getState", - "chrome.networking.onc.setProperties", - "chrome.networking.onc.createNetwork", - "chrome.networking.onc.forgetNetwork", - "chrome.networking.onc.getNetworks", - "chrome.networking.onc.getDeviceStates", - "chrome.networking.onc.enableNetworkType", - "chrome.networking.onc.disableNetworkType", - "chrome.networking.onc.requestNetworkScan", - "chrome.networking.onc.startConnect", - "chrome.networking.onc.startDisconnect", - "chrome.networking.onc.getCaptivePortalStatus", - "chrome.networking.onc.getGlobalPolicy" - ], - "fileBrowserHandler": [ - "chrome.fileBrowserHandler.selectFile" - ], - "webNavigation": [ - "chrome.webNavigation.getFrame", - "chrome.webNavigation.getAllFrames" - ], - "system.storage": [ - "chrome.system.storage.getInfo", - "chrome.system.storage.ejectDevice", - "chrome.system.storage.getAvailableCapacity" - ], - "system.cpu": [ - "chrome.system.cpu.getInfo" - ], - "accessibilityFeatures.read": [ - "accessibilityFeatures.read" - ], - "sessions": [ - "chrome.sessions.getRecentlyClosed", - "chrome.sessions.getDevices", - "chrome.sessions.restore" - ], - "pointerLock": [ - "requestPointerLock" - ], - "alarms": [ - "chrome.alarms.create", - "chrome.alarms.get", - "chrome.alarms.getAll", - "chrome.alarms.clear", - "chrome.alarms.clearAll" - ], - "wallpaper": [ - "chrome.wallpaper.setWallpaper" - ], - "tts": [ - "chrome.tts.speak", - "chrome.tts.stop", - "chrome.tts.pause", - "chrome.tts.resume", - "chrome.tts.isSpeaking", - "chrome.tts.getVoices" - ], - "topSites": [ - "chrome.topSites.get" - ], - "platformKeys": [ - "chrome.platformKeys.selectClientCertificates", - "chrome.platformKeys.getKeyPair", - "chrome.platformKeys.subtleCrypto", - "chrome.platformKeys.verifyTLSServerCertificate" - ], - "notifications": [ - "chrome.notifications.create", - "chrome.notifications.update", - "chrome.notifications.clear", - "chrome.notifications.getAll", - "chrome.notifications.getPermissionLevel" - ], - "audio": [ - "chrome.audio.getDevices", - "chrome.audio.setActiveDevices", - "chrome.audio.setProperties", - "chrome.audio.getMute", - "chrome.audio.setMute" - ], - "desktopCapture": [ - "chrome.desktopCapture.chooseDesktopMedia", - "chrome.desktopCapture.cancelChooseDesktopMedia" - ], - "identity": [ - "chrome.identity.getAccounts", - "chrome.identity.getAuthToken", - "chrome.identity.getProfileUserInfo", - "chrome.identity.removeCachedAuthToken", - "chrome.identity.launchWebAuthFlow", - "chrome.identity.getRedirectURL" - ], - "clipboardWrite": [ - "document.execCommand(\"copy\")", - "document.execCommand(\"cut\")" - ], - "system.display": [ - "chrome.system.display.getInfo", - "chrome.system.display.getDisplayLayout", - "chrome.system.display.setDisplayProperties", - "chrome.system.display.setDisplayLayout", - "chrome.system.display.enableUnifiedDesktop", - "chrome.system.display.overscanCalibrationStart", - "chrome.system.display.overscanCalibrationAdjust", - "chrome.system.display.overscanCalibrationReset", - "chrome.system.display.overscanCalibrationComplete", - "chrome.system.display.showNativeTouchCalibration", - "chrome.system.display.startCustomTouchCalibration", - "chrome.system.display.completeCustomTouchCalibration", - "chrome.system.display.clearTouchCalibration", - "chrome.system.display.setMirrorMode" - ] -} From 40dfa3f5935fb21df998119f146ab1647b20b5c2 Mon Sep 17 00:00:00 2001 From: Michael Herzberg Date: Tue, 21 May 2019 22:31:04 +0100 Subject: [PATCH 7/7] Don't crash on invalid categories. --- PermissionAnalysis/grep-unused-permissions | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PermissionAnalysis/grep-unused-permissions b/PermissionAnalysis/grep-unused-permissions index 017c6c4..e8ac674 100644 --- a/PermissionAnalysis/grep-unused-permissions +++ b/PermissionAnalysis/grep-unused-permissions @@ -54,7 +54,7 @@ def get_metadata(overview_contents): # Extracts extension categories match = re.search( """Attribute name="category">(.+?)""", overview_contents) - categories = match.group(1).split(",") if match else None + categories = match.group(1).split(",") if match else [] # Extracts the number of downloads match = re.search(